"""CSS Tokenizer
"""
__docformat__ = 'restructuredtext'
__author__ = '$LastChangedBy: cthedot $'
__date__ = '$LastChangedDate: 2007-09-01 15:55:42 +0200 (Sa, 01 Sep 2007) $'
__version__ = '0.9.2b2 $LastChangedRevision: 300 $'

import string
import xml.dom

from token import Token, Tokenre
import cssutils


tokenregex = Tokenre()
tokentype = Token


class Tokenizer(object):
    """
    generates a list of Token objects
    """
    WS = ' \t\r\n\f'
    ttypes = tokentype

    _typelist = [
        (lambda t: t == u';', tokentype.SEMICOLON),
        (lambda t: t == u'{', tokentype.LBRACE),
        (lambda t: t == u'}', tokentype.RBRACE),
        (lambda t: t == u'[', tokentype.LBRACKET),
        (lambda t: t == u']', tokentype.RBRACKET),
        (lambda t: t == u'(', tokentype.LPARANTHESIS),
        (lambda t: t == u')', tokentype.RPARANTHESIS),
        (lambda t: t == u',', tokentype.COMMA),
        (lambda t: t == u'.', tokentype.CLASS),
        (tokenregex.w, tokentype.S),
        (tokenregex.num, tokentype.NUMBER),
        (tokenregex.atkeyword, tokentype.ATKEYWORD),
        (tokenregex.HASH, tokentype.HASH),
        (tokenregex.DIMENSION, tokentype.DIMENSION),
        (tokenregex.ident, tokentype.IDENT),
        (tokenregex.string, tokentype.STRING)
        ]
    _delimmap = {
        u'*': tokentype.UNIVERSAL,
        u'.': tokentype.CLASS,
        u'>': tokentype.GREATER,
        u'+': tokentype.PLUS,
        u'~': tokentype.TILDE
        }
    _attmap = {
        u'~=': tokentype.INCLUDES,
        u'|=': tokentype.DASHMATCH,
        u'^=': tokentype.PREFIXMATCH,
        u'$=': tokentype.SUFFIXMATCH,
        u'*=': tokentype.SUBSTRINGMATCH
        }
    _atkeywordmap = {
        u'charset': tokentype.CHARSET_SYM,
        u'import': tokentype.IMPORT_SYM,
        u'media': tokentype.MEDIA_SYM,
        u'namespace': tokentype.NAMESPACE_SYM,
        u'page': tokentype.PAGE_SYM
        }
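
    def __init__(self):
        # NOTE: the original __init__ is missing from this copy of the
        # file; this is a minimal reconstruction assuming it only binds
        # the shared cssutils log and initializes the column adjustment
        # flag used by addtoken and tokenize.
        self.log = cssutils.log
        self._sub1ofcol = False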

    def getttype(self, t):
        """
        check type of tokentype in t which may be string or list
        returns ttype
        """
        if isinstance(t, list): t = u''.join(t)

        for check, result in Tokenizer._typelist:
            if check(t): return result

        return tokentype.DELIM
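
    # Example results of getttype (illustrative, not from the original
    # source): u';' -> tokentype.SEMICOLON, u'.' -> tokentype.CLASS, and a
    # value no check matches, e.g. u'?', falls back to tokentype.DELIM.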

    def addtoken(self, value, ttype=None):
        """
        adds a new Token to self.tokens
        """
        if isinstance(value, list): value = u''.join(value)
        if not value: return

        if not ttype: ttype = self.getttype(value)

        # the last two tokens added so far (dummies if not present)
        if len(self.tokens) > 0: last = self.tokens[-1]
        else: last = Token()
        if len(self.tokens) > 1: last2 = self.tokens[-2]
        else: last2 = Token()

        # whether value still has to be appended as a new token
        todo = False

        # S (whitespace) is always appended as its own token
        if ttype == tokentype.S:
            todo = True

        # value is already a complete at-keyword like u'@media'
        elif ttype == tokentype.ATKEYWORD:
            normkeyword = value[1:].lower().replace(u'\\', u'')
            ttype = Tokenizer._atkeywordmap.get(
                normkeyword, tokentype.ATKEYWORD)
            todo = True

        # IDENT directly after a lone u'@': merge into an at-keyword
        elif u'@' == last.value and ttype == tokentype.IDENT:
            keyword = value.lower()
            normkeyword = keyword.replace(u'\\', u'')
            last.type = Tokenizer._atkeywordmap.get(
                normkeyword, tokentype.ATKEYWORD)
            last.value = u'@%s' % keyword

        # IDENT after u'@' and u'-': merge all three into one at-keyword
        elif u'@' == last2.value and u'-' == last.value and \
                ttype == tokentype.IDENT:
            keyword = value.lower()
            normkeyword = keyword.replace(u'\\', u'')
            last2.type = Tokenizer._atkeywordmap.get(
                normkeyword, tokentype.ATKEYWORD)
            last2.value = u'@-%s' % keyword
            self.tokens.pop(-1)

        # negative value: merge u'-' with a following IDENT/NUMBER/DIMENSION
        elif u'-' == last.value and (ttype in (
                tokentype.IDENT, tokentype.NUMBER, tokentype.DIMENSION)):
            last.type = ttype
            last.value = u'-%s' % value.lower()

        # NUMBER followed by IDENT: a DIMENSION like u'5px'
        elif last.type == tokentype.NUMBER and\
                ttype == tokentype.IDENT:
            last.type = tokentype.DIMENSION
            last.value = u'%s%s' % (last.value, value.lower())

        # u'#' DELIM followed by a name: merge into a HASH like u'#fff'
        elif self.getttype(last.value + value) == tokentype.HASH:
            last.type = tokentype.HASH
            last.value += value

        # IDENT followed by u'(': a FUNCTION like u'url('
        elif last.type == tokentype.IDENT and u'(' == value:
            last.type = tokentype.FUNCTION
            last.value = u'%s(' % last.value.lower()

        # NUMBER followed by u'%': a PERCENTAGE
        elif last.type == tokentype.NUMBER and u'%' == value:
            last.type = tokentype.PERCENTAGE
            last.value = u'%s%%' % last.value

        # u'important' after a u'!' (only S may be in between): IMPORTANT_SYM
        elif u'important' == value.lower().replace(u'\\', u''):
            for i in range(len(self.tokens), 0, -1):
                _t = self.tokens[i - 1]

                if u'!' == _t.value:
                    _t.type = tokentype.IMPORTANT_SYM
                    _t.value = u'!%s' % value.lower()
                    del self.tokens[i:]
                    break

                elif _t.type != tokentype.S:
                    self.tokens.append(
                        Token(self.line, self.col, ttype, value))
                    break

        # u')' (or EOF): possibly closes an open u'url(' FUNCTION -> URI
        elif u')' == value or ttype == tokentype.EOF:
            # find the last open FUNCTION token
            _uriindex = -1
            for i in range(len(self.tokens), 0, -1):
                _t = self.tokens[i-1]
                if tokentype.FUNCTION == _t.type and u'url(' == _t.value:
                    _uriindex = i - 1
                    break
                elif tokentype.FUNCTION == _t.type:
                    # another function: u')' simply closes it
                    todo = True
                    break

            if _uriindex > -1:
                # collect everything between u'url(' and u')'
                _uricontent = u''
                for i in range(_uriindex+1, len(self.tokens)):
                    _t = self.tokens[i]
                    if _t.type == tokentype.S and\
                       ((i == _uriindex+1) or (i == len(self.tokens)-1)):
                        # S directly after u'url(' or before u')' is dropped
                        continue
                    else:
                        # anything else is part of the URI value
                        _uricontent += _t.value
                if _uricontent:
                    # replace the FUNCTION token with a single URI token
                    _uri = u'url(%s)' % _uricontent
                    if tokenregex.URI(_uri):
                        _urit = self.tokens[_uriindex]
                        _urit.type = tokentype.URI
                        _urit.value = _uri
                        del self.tokens[_uriindex + 1:]
                    else:
                        todo = True
                else:
                    # empty URI: u'url()'
                    _urit = self.tokens[_uriindex]
                    _urit.type = tokentype.URI
                    _urit.value = u'url()'
            else:
                todo = True

        else:
            todo = True

        # append a new token unless value was merged into a previous one
        if todo or ttype == tokentype.EOF:
            self.tokens.append(Token(self.line, self.col, ttype, value))

        # update line and column position
        cols = len(value)
        if self._sub1ofcol:
            # a u'.' was expanded to u'0.' in tokenize, do not count the 0
            cols -= 1
            self._sub1ofcol = False
        if value.find('\n') != -1:
            # value contains a line break: restart column counting
            self.col = 1
            cols = len(value[value.rfind('\n')+1:])

        self.line += value.count('\n')
        self.col += cols
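
        # Illustrative example (not from the original source): adding a
        # COMMENT token with value u'/*\n*/' increases self.line by one and
        # leaves self.col at 3, just past the closing u'*/'.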

    def getescape(self):
        """
        http://www.w3.org/TR/2004/CR-CSS21-20040225/syndata.html#q6

        Third, backslash escapes allow authors to refer to characters
        they can't easily put in a document. In this case, the
        backslash is followed by at most six hexadecimal digits
        (0..9A..F), which stand for the ISO 10646 ([ISO10646])
        character with that number, which must not be zero. If a
        character in the range [0-9a-fA-F] follows the hexadecimal
        number, the end of the number needs to be made clear. There
        are two ways to do that:

        1. with a space (or other whitespace character): "\26 B"
           ("&B"). In this case, user agents should treat a "CR/LF"
           pair (U+000D/U+000A) as a single whitespace character.
        2. by providing exactly 6 hexadecimal digits: "\000026B"
           ("&B")

        In fact, these two methods may be combined. Only one
        whitespace character is ignored after a hexadecimal escape.
        Note that this means that a "real" space after the escape
        sequence must itself either be escaped or doubled.
        """
        escape = u'\\'
        MAX = 6
        i = 0
        actual = 0
        while self.text and i < MAX:
            i += 1
            c, c2, c3 = self.text[0], u''.join(self.text[:1]),\
                u''.join(self.text[1:2])
            if c in string.hexdigits:
                escape += c
                del self.text[0]
            else:
                # a non-hexdigit ends the escape
                actual = i
                i = MAX
        else:
            if int(escape[1:], 16) <= 0:
                self.log.error(
                    u'Tokenizer: Syntax Error, ESCAPE SEQUENCE with value 0.')
                escape = ''
            elif actual < MAX + 1 and c2 in self.WS:
                # a single whitespace character ending the escape is consumed
                del self.text[0]
                if u'\r' == c2 and u'\n' == c3:
                    # CR/LF counts as one whitespace character
                    del self.text[0]

            # normalize: the escape always ends in exactly one space
            escape += u' '

        return escape
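
        # Illustrative trace (not from the original source): with self.text
        # set to list(u'26 B') this returns u'\\26 ' (the escape normalized
        # to end in exactly one space) and leaves u'B' in self.text.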
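
    def dostrorcomment(self, t, end, ttype, _fullSheet=False):
        """
        NOTE: the body of this method is missing from this copy of the
        file; the sketch below is a minimal reconstruction based only on
        how tokenize calls it (parameter names are assumptions). It
        consumes self.text until ``end`` is found and adds a single
        COMMENT or STRING token. The original version additionally
        handled escaped quotes inside strings, unexpected newlines and,
        via _fullSheet, constructs left unclosed at the end of a sheet.
        """
        start = u''.join(t)
        while self.text:
            t.append(self.text.pop(0))
            v = u''.join(t)
            # complete once the closing delimiter follows the opening one
            if len(v) >= len(start) + len(end) and v.endswith(end):
                self.addtoken(v, ttype)
                return
        # text ended before the construct was closed
        self.log.error(u'Tokenizer: Syntax Error, unclosed %r.' % u''.join(t))
        self.addtoken(t, ttype)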

    def tokenize(self, text, _fullSheet=False):
        """
        tokenizes text and returns tokens
        """
        if not text:
            return []

        self.text = list(text)
        self.tokens = []
        self.line = 1
        self.col = 1

        def addifnewtoken(t, c):
            """
            checks if c starts a new token and adds last t as token
            """
            if len(t) == 0:
                return [c]

            tt, ct = self.getttype(t), self.getttype(c)

            # c continues an at-keyword, ident or hyphenated name
            if tt in (tokentype.ATKEYWORD, tokentype.IDENT)\
               or (t and t[-1] == u'-')\
               and ct in (tokentype.IDENT, tokentype.NUMBER):
                t.append(c)

            # a digit continues a number (a leading u'.' becomes u'0.')
            elif (t[-1] == u'.' or tt == tokentype.NUMBER)\
                 and ct == tokentype.NUMBER:
                if t[0] == u'.':
                    t[0] = '0.'
                    self._sub1ofcol = True
                t.append(c)

            # a u'.' continues a number
            elif tt == tokentype.NUMBER and c == u'.':
                t.append(c)

            # a delimiter ends the current token and is added itself
            elif ct == tokentype.DELIM:
                self.addtoken(t)
                self.addtoken(c)
                t = []

            # differing types: add t and start a new token with c
            elif tt != ct:
                self.addtoken(t)
                t = [c]

            # same type: c continues t
            else:
                t.append(c)

            return t
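
        # Illustrative example (not from the original source): for
        # t == [u'.'] and c == u'5' the second branch rewrites t[0] to
        # u'0.' and appends c, so the source u'.5' is later added as
        # NUMBER u'0.5'.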

        t = []
        while self.text:
            # c is the current character, c2 the following one (may be empty)
            c, c2 = self.text.pop(0), u''.join(self.text[:1])

            if c in self.WS:
                # add the pending token, collect all following WS into one S
                self.addtoken(t)
                t = [c]
                try:
                    while self.text[0] in self.WS:
                        t.append(self.text.pop(0))
                except IndexError:
                    pass
                self.addtoken(t, tokentype.S)
                t = []

            elif u'/' == c and u'*' == c2:
                # comment
                self.addtoken(t)
                del self.text[0]
                self.dostrorcomment(
                    [u'/*'], u'*/', tokentype.COMMENT, _fullSheet)
                t = []

            elif c in '"\'':
                # string
                self.addtoken(t)
                self.dostrorcomment(
                    [c], c, tokentype.STRING, _fullSheet)
                t = []

            elif c in u';{}[](),':
                # simple one-character tokens
                self.addtoken(t)
                self.addtoken(c)
                t = []

            elif c == u'.' and c2 in tuple(u'0123456789'):
                # a number starting with u'.'
                t = addifnewtoken(t, c)

            elif u'::' == c + c2:
                # pseudo element
                self.addtoken(t)
                self.addtoken(u'::', tokentype.PSEUDO_ELEMENT)
                del self.text[0]
                t = []

            elif c in u'~|^$*' and u'=' == c2:
                # attribute selector operators like u'~='
                self.addtoken(t)
                _t = c + c2
                self.addtoken(_t, Tokenizer._attmap[_t])
                del self.text[0]
                t = []

            elif c == u'<' and u''.join(self.text[:3]) == u'!--':
                # CDO
                self.addtoken(t)
                del self.text[:3]
                self.addtoken(u'<!--', tokentype.CDO)
                t = []
            elif c == u'-' and u''.join(self.text[:2]) == u'->':
                # CDC
                self.addtoken(t)
                del self.text[:2]
                self.addtoken(u'-->', tokentype.CDC)
                t = []

            elif c in u'.=~|*+>#!%:&$':
                # a delimiter character
                self.addtoken(t)
                self.addtoken(
                    c, Tokenizer._delimmap.get(c, tokentype.DELIM))
                t = []

            elif u'\\' == c and c2 not in string.hexdigits:
                # an escaped character that is not a hexadecimal escape
                t.append(c)
                t.append(c2)
                del self.text[0]
            elif u'\\' == c and c2:
                # hexadecimal character escape
                escape = self.getescape()
                t = addifnewtoken(t, escape)

            else:
                # any other character
                t = addifnewtoken(t, c)
        else:
            # all text consumed: add any remaining token
            self.addtoken(t)

        if _fullSheet:
            # mark the end of a complete style sheet
            self.addtoken(u'EOF', tokentype.EOF)

        return [t for t in self.tokens
                if t.type not in (tokentype.CDO, tokentype.CDC)]
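
    # Example (illustrative, not from the original source):
    #   Tokenizer().tokenize(u'a { color: #FFF }')
    # yields roughly IDENT u'a', S, LBRACE, S, IDENT u'color', DELIM u':',
    # S, HASH u'#FFF', S, RBRACE; each Token also carries its line and
    # column position.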


if __name__ == '__main__':
    """
    NOT LIKE SPEC:
    between ! and important only WS is allowed, no comments; this should
    be used very seldom anyway

    TODO:
    Tokenizer:

    parser:
    - filter CDO/CDC
    - lengths: % px pt pc em ex in cm mm

    CSS2 parses a number immediately followed by an identifier as a
    DIMEN token (i.e., an unknown unit), CSS1 parsed it as a number
    and an identifier. That means that in CSS1, the declaration
    'font: 10pt/1.2serif' was correct, as was 'font: 10pt/12pt serif';
    in CSS2, a space is required before "serif". (Some UAs accepted
    the first example, but not the second.)
    """
    css = u'''5px -5px'''

    tokens = Tokenizer().tokenize(css)
    import pprint
    pprint.pprint(tokens)
    print 40 * '-'

    sheet = cssutils.parseString(css)
    print sheet.cssText
    print 40 * '-'