"""New CSS Tokenizer (a generator)
"""
__all__ = ['Tokenizer', 'CSSProductions']
__docformat__ = 'restructuredtext'
__author__ = '$LastChangedBy: cthedot $'
__date__ = '$LastChangedDate: 2007-09-01 15:55:42 +0200 (Sa, 01 Sep 2007) $'
__version__ = '$LastChangedRevision: 300 $'

import os
import re
import string
import xml.dom
import cssutils
import util
from cssproductions import *

class Tokenizer(object):
    """
    generates a list of Token tuples:
        (Tokenname, value, startline, startcolumn)
    """
    _linesep = u'\n'

    def _expand_macros(self, macros, productions):
        """returns macro expanded productions, order of productions is kept"""
        def macro_value(m):
            return '(?:%s)' % macros[m.groupdict()['macro']]
        expanded = []
        for key, value in productions:
            # replace {macro} references; repeat as macros may reference
            # other macros
            while re.search(r'{[a-zA-Z][a-zA-Z0-9-]*}', value):
                value = re.sub(r'{(?P<macro>[a-zA-Z][a-zA-Z0-9-]*)}',
                               macro_value, value)
            expanded.append((key, value))
        return expanded
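
    # Expansion sketch (illustrative values only, not necessarily the entries
    # in cssproductions.MACROS or PRODUCTIONS): with
    #     macros = {'num': r'[0-9]+|[0-9]*\.[0-9]+'}
    #     productions = [('PERCENTAGE', r'{num}%')]
    # _expand_macros returns [('PERCENTAGE', r'(?:[0-9]+|[0-9]*\.[0-9]+)%')].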

    def _compile_productions(self, expanded_productions):
        """compile productions into callable match objects, order is kept"""
        compiled = []
        for key, value in expanded_productions:
            # anchor each production at the start of the remaining text
            compiled.append((key, re.compile('^(?:%s)' % value, re.U).match))
        return compiled
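
    # Result sketch: the expanded pairs become an ordered list such as
    #     [('COMMENT', <match>), ('URI', <match>), ...]
    # (names indicative only); each <match> is the bound ``match`` method of a
    # pattern wrapped in '^(?:...)', so it matches only at the very start of
    # the text passed to it.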

    def __init__(self, macros=None, productions=None):
        """
        inits tokenizer with given macros and productions, which default to
        cssutils' own macros and productions
        """
        self.log = cssutils.log
        if not macros:
            macros = MACROS
        if not productions:
            productions = PRODUCTIONS
        self.tokenmatches = self._compile_productions(
            self._expand_macros(macros, productions))
        self.commentmatcher = [x[1] for x in self.tokenmatches if x[0] == 'COMMENT'][0]
        self.urimatcher = [x[1] for x in self.tokenmatches if x[0] == 'URI'][0]
        self.unicodesub = re.compile(RE_UNICODE).sub

    def tokenize(self, text, fullsheet=False):
        """
        generator: tokenizes text and yields tokens, each token is a tuple of::

            (tokenname, tokenvalue, line, col)

        The tokenvalue will contain a normal string, meaning CSS unicode
        escapes have been resolved to normal characters. The serializer
        escapes needed characters back to unicode escapes depending on
        the stylesheet target encoding.

        text
            the text to be tokenized
        fullsheet
            if ``True`` appends an EOF token as the last one and completes
            incomplete COMMENT tokens
        """
        line = col = 1

        tokens = []
        while text:
            for name, matcher in self.tokenmatches:

                # fullsheet only: an unclosed comment would otherwise be
                # consumed as CHAR tokens, so try to complete it first
                if fullsheet and name == 'CHAR' and text.startswith(u'/*'):
                    # close the comment and test the COMMENT production
                    possiblecomment = u'%s*/' % text
                    match = self.commentmatcher(possiblecomment)
                    if match:
                        yield ('COMMENT', possiblecomment, line, col)
                        text = None
                        break

                # default: try the production at the start of the remaining text
                match = matcher(text)
                if match:
                    found = match.group(0)

                    if fullsheet:
                        # at the end of a full sheet some incomplete tokens
                        # may be completed
                        if 'INVALID' == name and text == found:
                            # complete an unclosed STRING (matched as INVALID)
                            name = 'STRING'
                            found = '%s%s' % (found, found[0])

                        elif 'FUNCTION' == name and \
                             u'url(' == util.Base._normalize(found):
                            # an unclosed url(...) matched FUNCTION only,
                            # try to complete it to a URI token
                            for end in (u"')", u'")', u')'):
                                possibleuri = '%s%s' % (text, end)
                                match = self.urimatcher(possibleuri)
                                if match:
                                    name = 'URI'
                                    found = match.group(0)
                                    break

                    if name in ('URI', 'FUNCTION', 'ATKEYWORD', 'IDENT', 'STRING',
                                'INVALID', 'HASH', 'DIMENSION', 'COMMENT'):
                        # these token types may contain unicode escapes,
                        # resolve them to actual characters (BMP only,
                        # others keep their escape)
                        def repl(m):
                            num = int(m.group(0)[1:], 16)
                            if num < 0x10000:
                                return unichr(num)
                            else:
                                return m.group(0)
                        value = self.unicodesub(repl, found)
                    else:
                        # token value is used as is
                        value = found

                    yield (name, value, line, col)
                    text = text[len(found):]
                    # update line and column for the next token
                    nls = found.count(self._linesep)
                    line += nls
                    if nls:
                        col = len(found[found.rfind(self._linesep):])
                    else:
                        col += len(found)
                    break

            else:
                # no production matched at all, should not happen
                raise xml.dom.SyntaxErr('no token match "%s(...)"' % text[:10])
                text = text[1:]

        if fullsheet:
            yield ('EOF', u'', line, col)
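
if __name__ == '__main__':
    # Minimal usage sketch assuming the default MACROS and PRODUCTIONS from
    # cssproductions; the exact token names and splitting depend on those
    # definitions, so the tokens shown below are indicative only.
    tokenizer = Tokenizer()
    for token in tokenizer.tokenize(u'a { color: red }', fullsheet=True):
        # e.g. ('IDENT', u'a', 1, 1), ('S', u' ', 1, 2), ('CHAR', u'{', 1, 3), ...
        print token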