3 """New CSS Tokenizer (a generator)
4 """
5 __all__ = ['Tokenizer', 'CSSProductions']
6 __docformat__ = 'restructuredtext'
7 __version__ = '$Id: tokenize2.py 1116 2008-03-05 13:52:23Z cthedot $'
8 import os
9 import re
10 import string
11 import xml.dom
12 import cssutils
13 import util
14 from cssproductions import *
15
17 """
18 generates a list of Token tuples:
19 (Tokenname, value, startline, startcolumn)
20 """
21 _linesep = u'\n'
22
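    # A sketch of the yielded tuples (the exact token names come from the
    # PRODUCTIONS in cssproductions): tokenizing u'a {' should give
    # ('IDENT', u'a', 1, 1), ('S', u' ', 1, 2) and ('CHAR', u'{', 1, 3).
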
24 """returns macro expanded productions, order of productions is kept"""
25 def macro_value(m):
26 return '(?:%s)' % macros[m.groupdict()['macro']]
27 expanded = []
28 for key, value in productions:
29 while re.search(r'{[a-zA-Z][a-zA-Z0-9-]*}', value):
30 value = re.sub(r'{(?P<macro>[a-zA-Z][a-zA-Z0-9-]*)}',
31 macro_value, value)
32 expanded.append((key, value))
33 return expanded
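
    # For example, with a hypothetical macro {'num': r'[0-9]+'} a
    # production ('NUMBER', '{num}') expands to ('NUMBER', '(?:[0-9]+)').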

    def _compile_productions(self, expanded_productions):
        """compile productions into callable match objects, order is kept"""
        compiled = []
        for key, value in expanded_productions:
            compiled.append((key, re.compile('^(?:%s)' % value, re.U).match))
        return compiled

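    # A sketch of the result: each entry is (name, matcher) where matcher
    # is the bound match method of a pattern anchored with ^, so for a
    # hypothetical production ('NUMBER', '[0-9]+') calling matcher(u'12px')
    # matches u'12' while matcher(u'px12') returns None.
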
    def __init__(self, macros=None, productions=None):
        """
        inits tokenizer with given macros and productions which default to
        cssutils' own MACROS and PRODUCTIONS
        """
        self.log = cssutils.log
        if not macros:
            macros = MACROS
        if not productions:
            productions = PRODUCTIONS
        self.tokenmatches = self._compile_productions(
            self._expand_macros(macros, productions))
        self.commentmatcher = [x[1] for x in self.tokenmatches
                               if x[0] == 'COMMENT'][0]
        self.urimatcher = [x[1] for x in self.tokenmatches
                           if x[0] == 'URI'][0]
        self.unicodesub = re.compile(RE_UNICODE).sub

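    # Usage sketch: ``Tokenizer()`` compiles the default CSS grammar once;
    # custom ``macros`` and ``productions`` shaped like MACROS and
    # PRODUCTIONS swap in another token grammar.
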
    def tokenize(self, text, fullsheet=False):
        """
        generator: tokenizes text and yields tokens, each token is a tuple
        of::

            (tokenname, tokenvalue, line, col)

        The tokenvalue is a plain unicode string, meaning CSS unicode
        escapes have already been resolved to normal characters. The
        serializer escapes needed characters back to unicode escapes
        depending on the stylesheet target encoding.

        text
            the text to be tokenized
        fullsheet
            if ``True`` appends an EOF token as the last one and completes
            incomplete COMMENT, INVALID (to STRING) and URI tokens
        """
        line = col = 1

        while text:
            for name, matcher in self.tokenmatches:
                if fullsheet and name == 'CHAR' and text.startswith(u'/*'):
                    # all productions except CHAR have been tried already,
                    # so this is an unclosed comment: try to complete it
                    possiblecomment = u'%s*/' % text
                    match = self.commentmatcher(possiblecomment)
                    if match:
                        yield ('COMMENT', possiblecomment, line, col)
                        text = None
                        break

                match = matcher(text)
                if match:
                    found = match.group(0)

                    if fullsheet:
                        # check if the token may be completed
                        if 'INVALID' == name and text == found:
                            # complete INVALID to STRING with the same quote
                            name = 'STRING'
                            found = '%s%s' % (found, found[0])

                        elif ('FUNCTION' == name and
                              u'url(' == util.Base._normalize(found)):
                            # an unclosed url( ends up as FUNCTION, try to
                            # complete it to a URI (closing quote first)
                            for end in (u"')", u'")', u')'):
                                possibleuri = '%s%s' % (text, end)
                                match = self.urimatcher(possibleuri)
                                if match:
                                    name = 'URI'
                                    found = match.group(0)
                                    break

                    if name in ('URI', 'FUNCTION', 'ATKEYWORD', 'IDENT',
                                'STRING', 'INVALID', 'HASH', 'DIMENSION',
                                'COMMENT'):
                        # these tokens may contain unicode escapes which
                        # are resolved here, e.g. u'\\72' becomes u'r'
                        def repl(m):
                            num = int(m.group(0)[1:], 16)
                            if num < 0x10000:
                                return unichr(num)
                            else:
                                # keep escapes outside the BMP as they are
                                return m.group(0)
                        value = self.unicodesub(repl, found)
                    else:
                        # all other tokens contain no unicode escapes
                        value = found

                    yield (name, value, line, col)

                    # cut off the found token and update line and col to
                    # point at the start of the next token
                    text = text[len(found):]
                    nls = found.count(self._linesep)
                    line += nls
                    if nls:
                        col = len(found[found.rfind(self._linesep):])
                    else:
                        col += len(found)
                    break

            else:
                # no production matched at all, abort tokenizing
                raise xml.dom.SyntaxErr('no token match "%s(...)"' % text[:10])

        if fullsheet:
            yield ('EOF', u'', line, col)
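
# Minimal usage sketch (runs inside the cssutils package this module
# belongs to; token names come from cssproductions):
#
#     tokenizer = Tokenizer()
#     for name, value, line, col in tokenizer.tokenize(
#             u'a { color: red }', fullsheet=True):
#         print name, repr(value), line, col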