1
2
3 """New CSS Tokenizer (a generator)
4 """
5 __all__ = ['Tokenizer', 'CSSProductions']
6 __docformat__ = 'restructuredtext'
7 __version__ = '$Id: tokenize2.py 1350 2008-07-10 20:37:21Z cthedot $'
8
9 import os
10 import re
11 import string
12 import xml.dom
13 import cssutils
14 import util
15 from cssproductions import *
16
18 """
19 generates a list of Token tuples:
20 (Tokenname, value, startline, startcolumn)
21 """
22 _atkeywords = {
23 u'@font-face': 'FONT_FACE_SYM',
24 u'@import': 'IMPORT_SYM',
25 u'@media': 'MEDIA_SYM',
26 u'@namespace': 'NAMESPACE_SYM',
27 u'@page': 'PAGE_SYM'
28 }
29 _linesep = u'\n'
30
def __init__(self, macros=None, productions=None):
    """Init tokenizer with given macros and productions.

    macros
        mapping of macro name -> regex fragment; defaults to cssutils
        own ``MACROS``
    productions
        ordered ``(name, regex)`` pairs; defaults to cssutils own
        ``PRODUCTIONS``
    """
    self.log = cssutils.log
    if not macros:
        macros = MACROS
    if not productions:
        productions = PRODUCTIONS
    # compile (name, matchfunc) pairs, macro references expanded
    self.tokenmatches = self._compile_productions(
        self._expand_macros(macros, productions))
    # matchers reused by ``tokenize`` to complete partial tokens
    self.commentmatcher = [x[1] for x in self.tokenmatches
                           if x[0] == 'COMMENT'][0]
    self.urimatcher = [x[1] for x in self.tokenmatches
                       if x[0] == 'URI'][0]
    # CSS unicode escape: backslash + 1..6 hex digits, optionally
    # terminated by \r\n or ONE whitespace character.
    # FIX: the previous character class ``[\t|\r|\n|\f|\x20]`` also
    # treated a literal ``|`` as an escape terminator, which CSS 2.1
    # does not allow; it would silently swallow a ``|`` after an escape.
    self.unicodesub = re.compile(
        r'\\[0-9a-fA-F]{1,6}(?:\r\n|[\t\r\n\f\x20])?').sub
47
49 """returns macro expanded productions, order of productions is kept"""
50 def macro_value(m):
51 return '(?:%s)' % macros[m.groupdict()['macro']]
52 expanded = []
53 for key, value in productions:
54 while re.search(r'{[a-zA-Z][a-zA-Z0-9-]*}', value):
55 value = re.sub(r'{(?P<macro>[a-zA-Z][a-zA-Z0-9-]*)}',
56 macro_value, value)
57 expanded.append((key, value))
58 return expanded
59
61 """compile productions into callable match objects, order is kept"""
62 compiled = []
63 for key, value in expanded_productions:
64 compiled.append((key, re.compile('^(?:%s)' % value, re.U).match))
65 return compiled
66
def tokenize(self, text, fullsheet=False):
    """Generator: tokenize ``text`` and yield tokens, each token a tuple
    of::

        (name, value, line, col)

    The token value will contain a normal string, meaning CSS unicode
    escapes have been resolved to normal characters. The serializer
    escapes needed characters back to unicode escapes depending on
    the stylesheet target encoding.

    text
        to be tokenized
    fullsheet
        if ``True`` appends EOF token as last one and completes
        incomplete COMMENT or ``url(`` tokens
    """
    def repl(m):
        "used by unicodesub: resolve one unicode escape to its character"
        num = int(m.group(0)[1:], 16)
        if num < 0x10000:
            return unichr(num)
        else:
            # out of BMP range: keep the escape untouched
            return m.group(0)

    def normalize(value):
        "normalize and do unicodesub"
        return util.Base._normalize(self.unicodesub(repl, value))

    line = col = 1

    # the first production is the BOM, tried only once at the very start
    (BOM, matcher), productions = (self.tokenmatches[0],
                                   self.tokenmatches[1:])
    match = matcher(text)
    if match:
        found = match.group(0)
        yield (BOM, found, line, col)
        text = text[len(found):]

    while text:
        # single delimiter characters bypass the regex productions
        c = text[0]
        if c in '{}:;,':
            yield ('CHAR', c, line, col)
            col += 1
            text = text[1:]
        else:
            for name, matcher in productions:
                # a full sheet may end inside an unclosed comment:
                # try to complete it before CHAR would eat the '/'
                if fullsheet and name == 'CHAR' and text.startswith(u'/*'):
                    possiblecomment = u'%s*/' % text
                    match = self.commentmatcher(possiblecomment)
                    if match:
                        yield ('COMMENT', possiblecomment, line, col)
                        text = None  # everything consumed
                        break

                match = matcher(text)
                if match:
                    found = match.group(0)

                    if fullsheet:
                        # complete an unclosed string (INVALID covering
                        # the whole rest of the sheet) by repeating its
                        # opening quote
                        if 'INVALID' == name and text == found:
                            name, found = 'STRING', '%s%s' % (found,
                                                              found[0])
                        # complete an unclosed url(...) by trying the
                        # possible closings in order
                        elif 'FUNCTION' == name and\
                             u'url(' == normalize(found):
                            for end in (u"')", u'")', u')'):
                                possibleuri = '%s%s' % (text, end)
                                match = self.urimatcher(possibleuri)
                                if match:
                                    name, found = 'URI', match.group(0)
                                    break

                    if name in ('DIMENSION', 'IDENT', 'STRING', 'URI',
                                'HASH', 'COMMENT', 'FUNCTION', 'INVALID'):
                        # resolve unicode escapes in the token value
                        value = self.unicodesub(repl, found)
                    else:
                        if 'ATKEYWORD' == name:
                            # known at-keywords get their own token name
                            name = self._atkeywords.get(normalize(found),
                                                        'ATKEYWORD')
                        value = found

                    yield (name, value, line, col)

                    # advance text and keep line/column bookkeeping
                    text = text[len(found):]
                    nls = found.count(self._linesep)
                    line += nls
                    if nls:
                        # column restarts after the last newline found
                        col = len(found[found.rfind(self._linesep):])
                    else:
                        col += len(found)
                    break

    if fullsheet:
        yield ('EOF', u'', line, col)
170