"""CSS Tokenizer
"""
__docformat__ = 'restructuredtext'
__author__ = '$LastChangedBy: doerwalter $'
__date__ = '$LastChangedDate: 2007-08-02 22:58:23 +0200 (Do, 02 Aug 2007) $'
__version__ = '0.9.2b2 $LastChangedRevision: 160 $'

import string
import xml.dom

from token import Token, Tokenre
import cssutils


tokenregex = Tokenre()
tokentype = Token


class Tokenizer(object):
    """
    generates a list of Token objects from CSS source text
    """
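    # illustrative use (the field layout of the resulting Token objects is
    # assumed, see token.Token):
    #   Tokenizer().tokenize(u'a { color: red }')
    #   -> [IDENT u'a', S u' ', LBRACE u'{', S u' ', IDENT u'color', ...]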
    WS = ' \t\r\n\f'
    ttypes = tokentype

    _typelist = [
        (lambda t: t == u';', tokentype.SEMICOLON),
        (lambda t: t == u'{', tokentype.LBRACE),
        (lambda t: t == u'}', tokentype.RBRACE),
        (lambda t: t == u'[', tokentype.LBRACKET),
        (lambda t: t == u']', tokentype.RBRACKET),
        (lambda t: t == u'(', tokentype.LPARANTHESIS),
        (lambda t: t == u')', tokentype.RPARANTHESIS),
        (lambda t: t == u',', tokentype.COMMA),
        (lambda t: t == u'.', tokentype.CLASS),
        (tokenregex.w, tokentype.S),
        (tokenregex.num, tokentype.NUMBER),
        (tokenregex.atkeyword, tokentype.ATKEYWORD),
        (tokenregex.HASH, tokentype.HASH),
        (tokenregex.DIMENSION, tokentype.DIMENSION),
        (tokenregex.ident, tokentype.IDENT),
        (tokenregex.string, tokentype.STRING)
        ]
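    # getttype() evaluates these checks in list order and returns the first
    # match; a value that matches none of them is typed as DELIM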
    _delimmap = {
        u'*': tokentype.UNIVERSAL,
        u'.': tokentype.CLASS,
        u'>': tokentype.GREATER,
        u'+': tokentype.PLUS,
        u'~': tokentype.TILDE
        }
    _attmap = {
        u'~=': tokentype.INCLUDES,
        u'|=': tokentype.DASHMATCH,
        u'^=': tokentype.PREFIXMATCH,
        u'$=': tokentype.SUFFIXMATCH,
        u'*=': tokentype.SUBSTRINGMATCH
        }
    _atkeywordmap = {
        u'charset': tokentype.CHARSET_SYM,
        u'import': tokentype.IMPORT_SYM,
        u'media': tokentype.MEDIA_SYM,
        u'namespace': tokentype.NAMESPACE_SYM,
        u'page': tokentype.PAGE_SYM
        }

    def __init__(self):
        # assumed initializer: the methods below expect a log object and the
        # column-correction flag to be present on the instance
        self.log = cssutils.log
        self._sub1ofcol = False

    def getttype(self, t):
        """
        checks the type of t which may be a string or a list of characters
        and returns the matching ttype, DELIM being the fallback
        """
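        # e.g. getttype(u'{') -> LBRACE, getttype(u'@media') -> ATKEYWORD,
        # getttype(u'red') -> IDENT (illustrative; exact results depend on
        # the Tokenre expressions)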
        if isinstance(t, list): t = u''.join(t)

        for check, result in Tokenizer._typelist:
            if check(t): return result

        return tokentype.DELIM

    def addtoken(self, value, ttype=None):
        """
        adds a new Token to self.tokens, combining it with the preceding
        token(s) where they form a compound token like ATKEYWORD, DIMENSION,
        HASH, FUNCTION, PERCENTAGE, IMPORTANT_SYM or URI
        """
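        # e.g. if a NUMBER u'5' has just been added and addtoken(u'px') is
        # called next, the two are merged into a single DIMENSION token
        # u'5px' instead of appending a second token (illustrative example)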
        if isinstance(value, list): value = u''.join(value)
        if not value: return

        if not ttype: ttype = self.getttype(value)

        # the last and second to last token added so far (dummy Tokens if
        # there are not enough tokens yet)
        if len(self.tokens) > 0: last = self.tokens[-1]
        else: last = Token()
        if len(self.tokens) > 1: last2 = self.tokens[-2]
        else: last2 = Token()

        # todo: True if a new Token still has to be appended at the end
        todo = False

        # whitespace is always appended as an S token
        if ttype == tokentype.S:
            todo = True
        # a complete @keyword: map known keywords to their specific type
        elif ttype == tokentype.ATKEYWORD:
            normkeyword = value[1:].lower().replace(u'\\', u'')
            ttype = Tokenizer._atkeywordmap.get(
                normkeyword, tokentype.ATKEYWORD)
            todo = True

        # a preceding lone "@" followed by an IDENT: combine into an @keyword
        elif u'@' == last.value and ttype == tokentype.IDENT:
            keyword = value.lower()
            normkeyword = keyword.replace(u'\\', u'')
            last.type = Tokenizer._atkeywordmap.get(
                normkeyword, tokentype.ATKEYWORD)
            last.value = u'@%s' % keyword

        # "@", "-" and an IDENT are combined into a single "@-..." keyword
        elif u'@' == last2.value and u'-' == last.value and \
             ttype == tokentype.IDENT:
            keyword = value.lower()
            normkeyword = keyword.replace(u'\\', u'')
            last2.type = Tokenizer._atkeywordmap.get(
                normkeyword, tokentype.ATKEYWORD)
            last2.value = u'@-%s' % keyword
            self.tokens.pop(-1)
        # a "-" DELIM followed by an IDENT, NUMBER or DIMENSION: combine
        # into a single negative or hyphenated token
        elif u'-' == last.value and (ttype in (
                tokentype.IDENT, tokentype.NUMBER, tokentype.DIMENSION)):
            last.type = ttype
            last.value = u'-%s' % value.lower()

        # a NUMBER followed by an IDENT forms a DIMENSION like "5px"
        elif last.type == tokentype.NUMBER and\
             ttype == tokentype.IDENT:
            last.type = tokentype.DIMENSION
            last.value = u'%s%s' % (last.value, value.lower())

        # the value completes a HASH started by the preceding token
        elif self.getttype(last.value + value) == tokentype.HASH:
            last.type = tokentype.HASH
            last.value += value

        # an IDENT followed by "(" becomes a FUNCTION like "url("
        elif last.type == tokentype.IDENT and u'(' == value:
            last.type = tokentype.FUNCTION
            last.value = u'%s(' % last.value.lower()

        # a NUMBER followed by "%" becomes a PERCENTAGE
        elif last.type == tokentype.NUMBER and u'%' == value:
            last.type = tokentype.PERCENTAGE
            last.value = u'%s%%' % last.value
        # "!important": find the preceding "!" DELIM and merge both (plus any
        # whitespace in between) into a single IMPORTANT_SYM token
        elif u'important' == value.lower().replace(u'\\', u''):
            for i in range(len(self.tokens), 0, -1):
                _t = self.tokens[i - 1]

                if u'!' == _t.value:
                    _t.type = tokentype.IMPORTANT_SYM
                    _t.value = u'!%s' % value.lower()
                    del self.tokens[i:]
                    break

                elif _t.type != tokentype.S:
                    # no "!" before "important": keep it as a normal token
                    self.tokens.append(
                        Token(self.line, self.col, ttype, value))
                    break

        # ")" or EOF may close an open "url(": try to reassemble a URI token
        elif u')' == value or ttype == tokentype.EOF:
            # find the last FUNCTION token and check whether it is "url("
            _uriindex = -1
            for i in range(len(self.tokens), 0, -1):
                _t = self.tokens[i-1]
                if tokentype.FUNCTION == _t.type and u'url(' == _t.value:
                    _uriindex = i - 1
                    break
                elif tokentype.FUNCTION == _t.type:
                    # a different FUNCTION is open, keep ")" as a normal token
                    todo = True
                    break

            if _uriindex > -1:
                # collect everything between "url(" and ")"
                _uricontent = u''
                for i in range(_uriindex+1, len(self.tokens)):
                    _t = self.tokens[i]
                    if _t.type == tokentype.S and\
                       ((i == _uriindex+1) or (i == len(self.tokens)-1)):
                        # leading or trailing whitespace inside url() is ignored
                        continue
                    else:
                        _uricontent += _t.value
                if _uricontent:
                    # replace "url(" and the collected tokens with one URI token
                    _uri = u'url(%s)' % _uricontent
                    if tokenregex.URI(_uri):
                        _urit = self.tokens[_uriindex]
                        _urit.type = tokentype.URI
                        _urit.value = _uri
                        del self.tokens[_uriindex + 1:]
                    else:
                        todo = True
                else:
                    todo = True

            else:
                todo = True
        # append a new token unless the value was merged into a previous one
        if todo or ttype == tokentype.EOF:
            self.tokens.append(Token(self.line, self.col, ttype, value))

        # track line and column of the next token
        cols = len(value)
        if self._sub1ofcol:
            # a leading "." was replaced by "0." so one column less was read
            cols -= 1
            self._sub1ofcol = False
        if value.find('\n') != -1:
            # a newline: only the characters after the last newline count
            self.col = 1
            cols = len(value[value.rfind('\n')+1:])

        self.line += value.count('\n')
        self.col += cols

    def getescape(self):
        """
        http://www.w3.org/TR/2004/CR-CSS21-20040225/syndata.html#q6

        Third, backslash escapes allow authors to refer to characters
        they can't easily put in a document. In this case, the
        backslash is followed by at most six hexadecimal digits
        (0..9A..F), which stand for the ISO 10646 ([ISO10646])
        character with that number, which must not be zero. If a
        character in the range [0-9a-fA-F] follows the hexadecimal
        number, the end of the number needs to be made clear. There
        are two ways to do that:

        1. with a space (or other whitespace character): "\26 B"
           ("&B"). In this case, user agents should treat a "CR/LF"
           pair (U+000D/U+000A) as a single whitespace character.
        2. by providing exactly 6 hexadecimal digits: "\000026B"
           ("&B")

        In fact, these two methods may be combined. Only one
        whitespace character is ignored after a hexadecimal escape.
        Note that this means that a "real" space after the escape
        sequence must itself either be escaped or doubled.
        """
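        # worked example of the code below: with self.text == list(u'26 B3')
        # after the backslash was already consumed by tokenize(), the loop
        # gathers u'26', the single terminating space is removed from
        # self.text and u'\\26 ' is returned; u'B3' remains in self.text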
        escape = u'\\'
        MAX = 6
        i = 0
        actual = 0
        while self.text and i < MAX:
            i += 1
            c, c2, c3 = self.text[0], u''.join(self.text[:1]),\
                        u''.join(self.text[1:2])
            if c in string.hexdigits:
                escape += c
                del self.text[0]
            else:
                # a non-hex character ends the escape after "actual" digits
                actual = i
                i = MAX
        else:
            if int(escape[1:], 16) <= 0:
                self.log.error(
                    u'Tokenizer: Syntax Error, ESCAPE SEQUENCE with value 0.')
                escape = ''
            elif actual < MAX + 1 and c2 in self.WS:
                # remove the single terminating whitespace character
                del self.text[0]
                if u'\r' == c2 and u'\n' == c3:
                    # a CR/LF pair counts as one whitespace character
                    del self.text[0]
                # keep a normalized single space as the escape terminator
                escape += u' '

        return escape

    def tokenize(self, text, _fullSheet=False):
        """
        tokenizes the given text and returns the resulting list of Tokens
        """
        if not text:
            return []

        self.text = list(text)
        self.tokens = []
        self.line = 1
        self.col = 1

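        # the nested helper below decides whether a character continues the
        # currently collected characters t or starts a new token; the main
        # loop afterwards handles multi-character constructs (comments,
        # strings, CDO/CDC, attribute combinators, escapes) explicitly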
        def addifnewtoken(t, c):
            """
            checks if c starts a new token; if it does, the characters
            collected in t are added as a token, otherwise c is appended
            to t; returns the new list of collected characters
            """
            if len(t) == 0:
                return [c]

            tt, ct = self.getttype(t), self.getttype(c)

            if tt in (tokentype.ATKEYWORD, tokentype.IDENT)\
               or (t and t[-1] == u'-')\
               and ct in (tokentype.IDENT, tokentype.NUMBER):
                # an identifier, @keyword or "-" prefix continues,
                # e.g. u'-' followed by u'5'
                t.append(c)

            elif (t[-1] == u'.' or tt == tokentype.NUMBER)\
                 and ct == tokentype.NUMBER:
                # a float: a leading "." is normalized to "0." and the
                # column correction flag is set
                if t[0] == u'.':
                    t[0] = '0.'
                    self._sub1ofcol = True
                t.append(c)

            elif tt == tokentype.NUMBER and c == u'.':
                # a float like "1." continues
                t.append(c)

            elif ct == tokentype.DELIM:
                # a DELIM always ends the collected token
                self.addtoken(t)
                self.addtoken(c)
                t = []

            elif tt != ct:
                # the token type changes: emit t and start collecting with c
                self.addtoken(t)
                t = [c]

            else:
                # same token type: keep collecting
                t.append(c)

            return t

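        # e.g. addifnewtoken([u'.'], u'5') rewrites the leading u'.' to u'0.'
        # and returns [u'0.', u'5'], which addtoken() later adds as the
        # NUMBER u'0.5' (illustrative example)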
        t = []
        while self.text:
            c, c2 = self.text.pop(0), u''.join(self.text[:1])

            if c in self.WS:
                # collect a whole run of whitespace into one S token
                self.addtoken(t)
                t = [c]
                try:
                    while self.text[0] in self.WS:
                        t.append(self.text.pop(0))
                except IndexError:
                    pass
                self.addtoken(t, tokentype.S)
                t = []

            elif u'/' == c and u'*' == c2:
                # a comment "/* ... */"
                self.addtoken(t)
                del self.text[0]
                self.dostrorcomment(
                    [u'/*'], u'*/', tokentype.COMMENT, _fullSheet)
                t = []

            elif c in '"\'':
                # a string up to the matching quote
                self.addtoken(t)
                self.dostrorcomment(
                    [c], c, tokentype.STRING, _fullSheet)
                t = []

            elif c in u';{}[](),':
                # simple one-character tokens
                self.addtoken(t)
                self.addtoken(c)
                t = []

            elif c == u'.' and c2 in tuple(u'0123456789'):
                # start of a number like ".5"
                t = addifnewtoken(t, c)

            elif u'::' == c + c2:
                # pseudo element selector "::"
                self.addtoken(t)
                self.addtoken(u'::', tokentype.PSEUDO_ELEMENT)
                del self.text[0]
                t = []

            elif c in u'~|^$*' and u'=' == c2:
                # attribute selector combinators "~=", "|=", "^=", "$=", "*="
                self.addtoken(t)
                _t = c + c2
                self.addtoken(_t, Tokenizer._attmap[_t])
                del self.text[0]
                t = []

            elif c == u'<' and u''.join(self.text[:3]) == u'!--':
                # CDO "<!--"
                self.addtoken(t)
                del self.text[:3]
                self.addtoken(u'<!--', tokentype.CDO)
                t = []

            elif c == u'-' and u''.join(self.text[:2]) == u'->':
                # CDC "-->"
                self.addtoken(t)
                del self.text[:2]
                self.addtoken(u'-->', tokentype.CDC)
                t = []

            elif c in u'.=~|*+>#!%:&$':
                # single character DELIMs and combinators
                self.addtoken(t)
                self.addtoken(
                    c, Tokenizer._delimmap.get(c, tokentype.DELIM))
                t = []

            elif u'\\' == c and c2 not in string.hexdigits:
                # an escaped literal character like "\}": keep both characters
                t.append(c)
                t.append(c2)
                del self.text[0]

            elif u'\\' == c and c2:
                # a hexadecimal escape sequence
                escape = self.getescape()
                t = addifnewtoken(t, escape)

            else:
                # any other character is simply collected
                t = addifnewtoken(t, c)
        else:
            # add anything still collected when the text is exhausted
            self.addtoken(t)

        if _fullSheet:
            # mark the end of a complete stylesheet with an explicit EOF token
            self.addtoken(u'EOF', tokentype.EOF)

        return self.tokens


if __name__ == '__main__':
    """
    NOT LIKE SPEC:
    between ! and important only WS is allowed, no comments; this should
    be needed very seldom anyway

    TODO:
    Tokenizer:

    parser:
    - filter CDO/CDC
    - lengths: % px pt pc em ex in cm mm

    CSS2 parses a number immediately followed by an identifier as a
    DIMEN token (i.e., an unknown unit), CSS1 parsed it as a number
    and an identifier. That means that in CSS1, the declaration
    'font: 10pt/1.2serif' was correct, as was 'font: 10pt/12pt serif';
    in CSS2, a space is required before "serif". (Some UAs accepted
    the first example, but not the second.)
    """
    css = u'''5px -5px'''

    tokens = Tokenizer().tokenize(css)
    import pprint
    pprint.pprint(tokens)
    print 40 * '-'

    sheet = cssutils.parseString(css)
    print sheet.cssText
    print 40 * '-'