
Source Code for Module cssutils.tokenize

#!/usr/bin/env python
"""CSS Tokenizer
"""
__docformat__ = 'restructuredtext'
__author__ = '$LastChangedBy: doerwalter $'
__date__ = '$LastChangedDate: 2007-08-02 22:58:23 +0200 (Do, 02 Aug 2007) $'
__version__ = '0.9.2b2 $LastChangedRevision: 160 $'

import string
import xml.dom

from token import Token, Tokenre
import cssutils


tokenregex = Tokenre()
tokentype = Token

class Tokenizer(object):
    """
    generates a list of Token objects
    """
    WS = ' \t\r\n\f'
    ttypes = tokentype

    _typelist = [
        (lambda t: t == u';', tokentype.SEMICOLON),
        (lambda t: t == u'{', tokentype.LBRACE),
        (lambda t: t == u'}', tokentype.RBRACE),
        (lambda t: t == u'[', tokentype.LBRACKET),
        (lambda t: t == u']', tokentype.RBRACKET),
        (lambda t: t == u'(', tokentype.LPARANTHESIS),
        (lambda t: t == u')', tokentype.RPARANTHESIS),
        (lambda t: t == u',', tokentype.COMMA),
        (lambda t: t == u'.', tokentype.CLASS),
        (tokenregex.w, tokentype.S),
        (tokenregex.num, tokentype.NUMBER),
        (tokenregex.atkeyword, tokentype.ATKEYWORD),
        (tokenregex.HASH, tokentype.HASH),
        (tokenregex.DIMENSION, tokentype.DIMENSION),
        (tokenregex.ident, tokentype.IDENT),
        (tokenregex.string, tokentype.STRING)
        ]
    _delimmap = {
        u'*': tokentype.UNIVERSAL,
        u'.': tokentype.CLASS,
        u'>': tokentype.GREATER,
        u'+': tokentype.PLUS,
        u'~': tokentype.TILDE
        }
    _attmap = {
        u'~=': tokentype.INCLUDES,
        u'|=': tokentype.DASHMATCH,
        u'^=': tokentype.PREFIXMATCH,
        u'$=': tokentype.SUFFIXMATCH,
        u'*=': tokentype.SUBSTRINGMATCH
        }
    _atkeywordmap = {
        u'charset': tokentype.CHARSET_SYM,
        u'import': tokentype.IMPORT_SYM,
        u'media': tokentype.MEDIA_SYM,
        u'namespace': tokentype.NAMESPACE_SYM,
        u'page': tokentype.PAGE_SYM
        }

    def __init__(self):
        self.log = cssutils.log
        self._sub1ofcol = False

    def getttype(self, t):
        """
        checks the token type of t, which may be a string or a list of
        characters, and returns the matching ttype (DELIM if nothing matches)
        """
        if isinstance(t, list): t = u''.join(t)

        for check, result in Tokenizer._typelist:
            if check(t): return result

        return tokentype.DELIM

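    # A hedged illustration (not part of the original source) of how getttype
    # classifies values via the _typelist table above; the results are
    # assumed from that table, not taken from a test run:
    #   getttype(u'{')       -> tokentype.LBRACE
    #   getttype(u'12')      -> tokentype.NUMBER     (tokenregex.num)
    #   getttype(u'@media')  -> tokentype.ATKEYWORD  (tokenregex.atkeyword)
    #   getttype(u'$')       -> tokentype.DELIM      (fallback, nothing matches)
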
    def addtoken(self, value, ttype=None):
        """
        adds a new Token to self.tokens
        """
        # convert list of tokens to string
        if isinstance(value, list): value = u''.join(value)
        if not value: return

        if not ttype: ttype = self.getttype(value)

        # last two tokens; if there are none simply use an empty Token
        if len(self.tokens) > 0: last = self.tokens[-1]
        else: last = Token()
        if len(self.tokens) > 1: last2 = self.tokens[-2]
        else: last2 = Token()

        # marker: True if the token still has to be appended below
        todo = False

        # WS, simply add later
        if ttype == tokentype.S:
            todo = True

        # ATKEYWORD: standard, need to adjust type
        elif ttype == tokentype.ATKEYWORD:
            normkeyword = value[1:].lower().replace(u'\\', u'')
            ttype = Tokenizer._atkeywordmap.get(
                normkeyword, tokentype.ATKEYWORD)
            todo = True

        # ATKEYWORD: replace last token if @xxx
        elif u'@' == last.value and ttype == tokentype.IDENT:
            keyword = value.lower()
            normkeyword = keyword.replace(u'\\', u'')
            last.type = Tokenizer._atkeywordmap.get(
                normkeyword, tokentype.ATKEYWORD)
            last.value = u'@%s' % keyword # replace @

        # @-ATKEYWORD: replace last2 if @-xxx and remove last
        # probably vendor specific
        elif u'@' == last2.value and u'-' == last.value and \
             ttype == tokentype.IDENT:
            keyword = value.lower()
            normkeyword = keyword.replace(u'\\', u'')
            last2.type = Tokenizer._atkeywordmap.get(
                normkeyword, tokentype.ATKEYWORD)
            last2.value = u'@-%s' % keyword # replace @
            self.tokens.pop(-1) # remove -

        # IDENT, NUMBER or DIMENSION with -, replace last token
        # -IDENT probably vendor specific
        elif u'-' == last.value and (ttype in (
             tokentype.IDENT, tokentype.NUMBER, tokentype.DIMENSION)):
            last.type = ttype
            last.value = u'-%s' % value.lower()

        # DIMENSION: replace last token with num + ident
        elif last.type == tokentype.NUMBER and\
             ttype == tokentype.IDENT:
            last.type = tokentype.DIMENSION
            last.value = u'%s%s' % (last.value, value.lower())
            ## check if before was a -?

        # HASH: replace last token with # + name
        elif self.getttype(last.value + value) == tokentype.HASH:
            last.type = tokentype.HASH
            last.value += value # last value starts with # anyway

        # FUNCTION: replace last token with last.value(
        elif last.type == tokentype.IDENT and u'(' == value:
            last.type = tokentype.FUNCTION
            last.value = u'%s(' % last.value.lower()

        # PERCENTAGE: replace last token with NUMBER%
        elif last.type == tokentype.NUMBER and u'%' == value:
            last.type = tokentype.PERCENTAGE
            last.value = u'%s%%' % last.value

        # IMPORTANT_SYM: combine with preceding "!" if only WS in between
        # No comments in between!
        elif u'important' == value.lower().replace(u'\\', u''):
            for i in range(len(self.tokens), 0, -1):
                _t = self.tokens[i - 1]
                # preceding "!" => !important, delete the tokens after it
                if u'!' == _t.value:
                    _t.type = tokentype.IMPORTANT_SYM
                    _t.value = u'!%s' % value.lower() # keep im\portant?
                    del self.tokens[i:]
                    break
                # anything other than S means no !important => add
                elif _t.type != tokentype.S:
                    self.tokens.append(
                        Token(self.line, self.col, ttype, value))
                    break

        # URI: possibly combine if closed URI or EOF
        elif u')' == value or ttype == tokentype.EOF:
            # find opening {ident}(
            _uriindex = -1
            for i in range(len(self.tokens), 0, -1):
                _t = self.tokens[i-1]
                if tokentype.FUNCTION == _t.type and u'url(' == _t.value:
                    _uriindex = i - 1
                    break
                elif tokentype.FUNCTION == _t.type:
                    # no url( found but another function, so stop searching
                    todo = True # add )
                    break

            if _uriindex > -1:
                # check content between "url(" and ")"
                _uricontent = u''
                for i in range(_uriindex+1, len(self.tokens)):
                    _t = self.tokens[i]
                    if _t.type == tokentype.S and\
                       ((i == _uriindex+1) or (i == len(self.tokens)-1)):
                        # 1st or last WS ok
                        continue
                    else:
                        # found other content
                        _uricontent += _t.value
                if _uricontent:
                    # check if valid URI and save if yes
                    _uri = u'url(%s)' % _uricontent
                    if tokenregex.URI(_uri):
                        _urit = self.tokens[_uriindex]
                        _urit.type = tokentype.URI
                        _urit.value = _uri
                        del self.tokens[_uriindex + 1:]
                    else:
                        todo = True # add )
                else:
                    todo = True # add )

        else:
            todo = True

        # add if not two WS nodes after another
        if todo or ttype == tokentype.EOF:
            self.tokens.append(Token(self.line, self.col, ttype, value))

        # adjust x,y position
        cols = len(value)
        if self._sub1ofcol:
            # a "0" was prepended to ".1" -> "0.1", so count one column less
            cols -= 1
            self._sub1ofcol = False
        if value.find('\n') != -1:
            # end of a line, start anew but count present chars
            self.col = 1
            cols = len(value[value.rfind('\n')+1:])

        self.line += value.count('\n')
        self.col += cols

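    # A hedged walk-through (not part of the original source) of the merging
    # branches above, assuming the values arrive one call at a time on an
    # initially empty token list:
    #   addtoken(u'5'); addtoken(u'px')         -> one DIMENSION token u'5px'
    #   addtoken(u'5'); addtoken(u'%')          -> one PERCENTAGE token u'5%'
    #   addtoken(u'!'); addtoken(u'important')  -> one IMPORTANT_SYM token u'!important'
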
    def getescape(self):
        """
        http://www.w3.org/TR/2004/CR-CSS21-20040225/syndata.html#q6

        Third, backslash escapes allow authors to refer to characters
        they can't easily put in a document. In this case, the
        backslash is followed by at most six hexadecimal digits
        (0..9A..F), which stand for the ISO 10646 ([ISO10646])
        character with that number, which must not be zero. If a
        character in the range [0-9a-fA-F] follows the hexadecimal
        number, the end of the number needs to be made clear. There
        are two ways to do that:

        1. with a space (or other whitespace character): "\26 B"
           ("&B"). In this case, user agents should treat a "CR/LF"
           pair (U+000D/U+000A) as a single whitespace character.
        2. by providing exactly 6 hexadecimal digits: "\000026B"
           ("&B")

        In fact, these two methods may be combined. Only one
        whitespace character is ignored after a hexadecimal escape.
        Note that this means that a "real" space after the escape
        sequence must itself either be escaped or doubled.
        """
        escape = u'\\'
        MAX = 6
        i = 0
        actual = 0
        while self.text and i < MAX:
            i += 1
            c, c2, c3 = self.text[0], u''.join(self.text[:1]),\
                        u''.join(self.text[1:2])
            if c in string.hexdigits:
                escape += c
                del self.text[0]
            else:
                actual = i
                i = MAX # end while and go to its else clause (break would skip it)
        else:
            if int(escape[1:], 16) <= 0:
                self.log.error(
                    u'Tokenizer: Syntax Error, ESCAPE SEQUENCE with value 0.')
                escape = ''
            elif actual < MAX + 1 and c2 in self.WS:
                # remove separating WS
                del self.text[0]
                if u'\r' == c2 and u'\n' == c3:
                    # remove combined WS \r\n as one WS
                    del self.text[0]

                # add explicit SPACE to end the ESCAPE, needed as it is
                # shorter than MAX hex digits
                escape += u' '

        return escape

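    # A hedged illustration (not part of the original source), following the
    # CSS 2.1 rules quoted in the docstring above: once the caller has
    # consumed the backslash of u'\\26 B' and self.text is list(u'26 B'),
    # getescape is expected to return u'\\26 ' (the hex digits plus the
    # single terminating space) and to leave [u'B'] in self.text, i.e. the
    # escape stands for "&" followed by the literal letter B.
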
    def dostrorcomment(self, t=[], end=None, ttype=None, _fullSheet=False):
        """
        handles
        strings: "..." or '...'
        comment: /*...*/

        t
            initial token to start the result with
        end
            string at which to end
        ttype
            ttype of the token to be found (STRING or COMMENT)
        _fullSheet
            if no more text is left, complete the found token anyway
        """
        if ttype == tokentype.STRING:
            isstring = True
            kind = 'string'
        else:
            isstring = False
            kind = 'comment'

        while self.text:
            # c is removed from self.text, c2 may be removed too later here
            c, c2 = self.text.pop(0), u''.join(self.text[:1])

            if (isstring and c == end) or\
               (not isstring and c + c2 == end):
                # check if end and add
                t.append(end)
                self.addtoken(t, ttype)
                if not isstring:
                    del self.text[0] # remove ending / (c2)
                break

            elif isstring and u'\\' == c and c2 in u'\n\r\f':
                # in a STRING ignore and remove a combination of \ + nl
                if u'\r' == c2 and \
                   len(self.text) > 2 and u'\n' == self.text[1]:
                    # \r\n!
                    del self.text[0] # remove c2 = \r
                    del self.text[0] # remove "c3" = \n
                else:
                    del self.text[0] # remove c2 \r or \n or \f

            elif isstring and c in '\n\r\f':
                # nl in a string makes it invalid
                t.append(c)
                self.addtoken(t, tokentype.INVALID)
                break

            elif not isstring and u'\\' == c:
                # escapes do not work in comments, simply keep the character
                t.append(c)

            elif u'\\' == c and c2 and c2 not in string.hexdigits:
                # simple escape
                t.append(c)
                t.append(c2)
                del self.text[0] # remove c2

            elif u'\\' == c and c2:
                # character escape sequence
                # sequence end character/s will be stripped!
                escape = self.getescape()
                t.append(escape)

            else:
                # save
                t.append(c)

        else:
            # EOF but complete string or comment
            if _fullSheet:
                t.append(end)
                self.addtoken(t, ttype)
            else:
                # not complete:
                value = ''.join(t)
                lines = value.count('\n')
                cols = len(value)
                if value.endswith('\n'):
                    cols = -self.col + 1
                token = Token(self.line, self.col, None, value)
                self.line += lines
                self.col += cols
                self.log.error(
                    u'Tokenizer: Syntax Error, incomplete %s.' % kind,
                    token, xml.dom.SyntaxErr)

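    # A hedged illustration (not part of the original source): tokenize calls
    # this as dostrorcomment([c], c, tokentype.STRING, ...) after an opening
    # quote and as dostrorcomment([u'/*'], u'*/', tokentype.COMMENT, ...)
    # after "/*". An unescaped newline before the closing quote yields an
    # INVALID token, and running out of text logs
    # "Tokenizer: Syntax Error, incomplete string." (or "... comment.")
    # unless _fullSheet closes the token instead.
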
    def tokenize(self, text, _fullSheet=False):
        """
        tokenizes text and returns tokens
        """
        if not text:
            return []

        self.text = list(text)
        self.tokens = []
        self.line = 1
        self.col = 1

        def addifnewtoken(t, c):
            """
            checks if c starts a new token and adds the last t as a token
            """
            if len(t) == 0:
                return [c] # new t

            tt, ct = self.getttype(t), self.getttype(c)
            ## print '"%s": (%s)\t %s: (%s)' % (c, ct, t, tt),

            if tt in (tokentype.ATKEYWORD, tokentype.IDENT)\
               or (t and t[-1] == u'-')\
               and ct in (tokentype.IDENT, tokentype.NUMBER):
                # @keyword or a number starting with -
                # wait for new token "x1..."
                t.append(c)

            # . is always followed by a number here as the calling code
            # checks this!
            elif (t[-1] == u'.' or tt == tokentype.NUMBER)\
                 and ct == tokentype.NUMBER:
                # start of number which may be 1 OR 1. OR .
                if t[0] == u'.':
                    t[0] = '0.' # add 0 in any case
                    self._sub1ofcol = True
                t.append(c)

            elif tt == tokentype.NUMBER and c == u'.':
                # start of number which may be 1 OR 1. OR .
                t.append(c)

            elif ct == tokentype.DELIM:
                # an escape is always alone
                # . not followed by a number is always alone
                self.addtoken(t)
                self.addtoken(c)
                t = []

            elif tt != ct:
                # finish old and start new token with c
                self.addtoken(t)
                t = [c]

            else:
                # wait for new token or end
                t.append(c)

            ## print '"%s": (%s)\t %s: (%s)\n' % (c, ct, t, tt)
            ## print '----',self.tokens
            return t

        t = []
        while self.text:
            # next two chars
            c, c2 = self.text.pop(0), u''.join(self.text[:1])

            if c in self.WS:
                # WhiteSpace
                self.addtoken(t) # add saved
                t = [c] # initial add WS
                try:
                    while self.text[0] in self.WS:
                        t.append(self.text.pop(0))
                except IndexError: # end of CSS
                    pass
                self.addtoken(t, tokentype.S) # add WS
                t = [] # reset

            elif u'/' == c and u'*' == c2:
                # Comment
                self.addtoken(t) # add saved
                del self.text[0] # remove *
                self.dostrorcomment(
                    [u'/*'], u'*/', tokentype.COMMENT, _fullSheet)
                t = []

            elif c in '"\'':
                # strings
                self.addtoken(t) # add saved
                self.dostrorcomment(
                    [c], c, tokentype.STRING, _fullSheet)
                t = []

            elif c in u';{}[](),':
                # reserved chars, type will be handled above
                self.addtoken(t) # add saved
                self.addtoken(c)
                t = []

            elif c == u'.' and c2 in tuple(u'0123456789'):
                # possible num
                t = addifnewtoken(t, c)

            elif u'::' == c + c2:
                # CSS3 pseudo
                self.addtoken(t) # add saved
                self.addtoken(u'::', tokentype.PSEUDO_ELEMENT)
                del self.text[0] # remove c2
                t = []

            elif c in u'~|^$*' and u'=' == c2:
                # INCLUDES ~= or DASHMATCH |= + CSS3 Selectors
                self.addtoken(t) # add saved
                _t = c + c2
                self.addtoken(_t, Tokenizer._attmap[_t])
                del self.text[0] # remove c2
                t = []

            elif c == u'<' and u''.join(self.text[:3]) == u'!--':
                # CDO
                self.addtoken(t) # add saved
                del self.text[:3]
                self.addtoken(u'<!--', tokentype.CDO)
                t = []

            elif c == u'-' and u''.join(self.text[:2]) == u'->':
                # CDC
                self.addtoken(t) # add saved
                del self.text[:2]
                self.addtoken(u'-->', tokentype.CDC)
                t = []

            elif c in u'.=~|*+>#!%:&$':
                # DELIM reserved chars, possibly combined later
                self.addtoken(t) # add saved
                self.addtoken(
                    c, Tokenizer._delimmap.get(c, tokentype.DELIM))
                t = []

            elif u'\\' == c and c2 not in string.hexdigits:
                # simple escape
                t.append(c)
                t.append(c2)
                del self.text[0]

            elif u'\\' == c and c2:
                # character escape sequence
                escape = self.getescape()
                t = addifnewtoken(t, escape)

            else:
                # save
                t = addifnewtoken(t, c)
        else:
            # add remaining
            self.addtoken(t)

        if _fullSheet:
            # add EOF token if from parse or CSSStyleSheet.cssText
            self.addtoken(u'EOF', tokentype.EOF)

        return self.tokens

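
# A hedged usage sketch (not part of the original module). Token objects are
# read back via their ``type`` and ``value`` attributes elsewhere in this
# module, so this helper assumes those attribute names are public.
def _dumptokens(css):
    """Tokenize ``css`` and return a list of (type, value) pairs."""
    return [(t.type, t.value) for t in Tokenizer().tokenize(css)]

# Expected results (assumed from the merging rules in addtoken, not taken
# from a test run):
#   _dumptokens(u'5px') -> [(Token.DIMENSION, u'5px')]
#   _dumptokens(u'5%')  -> [(Token.PERCENTAGE, u'5%')]
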

if __name__ == '__main__':
    """
    NOT LIKE SPEC:
    between ! and important only WS is allowed, no comments; this should
    be very seldom used anyway

    TODO:
    Tokenizer:

    parser:
    - filter CDO/CDC
    - lengths: % px pt pc em ex in cm mm

    CSS2 parses a number immediately followed by an identifier as a
    DIMEN token (i.e., an unknown unit), CSS1 parsed it as a number
    and an identifier. That means that in CSS1, the declaration
    'font: 10pt/1.2serif' was correct, as was 'font: 10pt/12pt serif';
    in CSS2, a space is required before "serif". (Some UAs accepted
    the first example, but not the second.)
    """
    css = u'''5px -5px'''

    tokens = Tokenizer().tokenize(css)
    import pprint
    pprint.pprint(tokens)
    print 40 * '-'

    sheet = cssutils.parseString(css)
    print sheet.cssText
    print 40 * '-'