Package cssutils :: Module tokenize

Source Code for Module cssutils.tokenize

  1  #!/usr/bin/env python 
  2  """CSS Tokenizer 
  3  """ 
  4  __docformat__ = 'restructuredtext' 
  5  __author__ = '$LastChangedBy: cthedot $' 
  6  __date__ = '$LastChangedDate: 2007-09-01 15:55:42 +0200 (Sa, 01 Sep 2007) $' 
  7  __version__ = '0.9.2b2 $LastChangedRevision: 300 $' 
  8   
  9  import string 
 10  import xml.dom 
 11   
 12  from token import Token, Tokenre 
 13  import cssutils 
 14   
 15   
 16  tokenregex = Tokenre() 
 17  tokentype = Token 
 18   
 19   
 20  class Tokenizer(object):
 21      """
 22      generates a list of Token objects
 23      """
 24      WS = ' \t\r\n\f'
 25      ttypes = tokentype
 26  
 27      _typelist = [
 28          (lambda t: t == u';', tokentype.SEMICOLON),
 29          (lambda t: t == u'{', tokentype.LBRACE),
 30          (lambda t: t == u'}', tokentype.RBRACE),
 31          (lambda t: t == u'[', tokentype.LBRACKET),
 32          (lambda t: t == u']', tokentype.RBRACKET),
 33          (lambda t: t == u'(', tokentype.LPARANTHESIS),
 34          (lambda t: t == u')', tokentype.RPARANTHESIS),
 35          (lambda t: t == u',', tokentype.COMMA),
 36          (lambda t: t == u'.', tokentype.CLASS),
 37          (tokenregex.w, tokentype.S),
 38          (tokenregex.num, tokentype.NUMBER),
 39          (tokenregex.atkeyword, tokentype.ATKEYWORD),
 40          (tokenregex.HASH, tokentype.HASH),
 41          (tokenregex.DIMENSION, tokentype.DIMENSION),
 42          (tokenregex.ident, tokentype.IDENT),
 43          (tokenregex.string, tokentype.STRING)
 44          ]
 45      _delimmap = {
 46          u'*': tokentype.UNIVERSAL,
 47          u'.': tokentype.CLASS,
 48          u'>': tokentype.GREATER,
 49          u'+': tokentype.PLUS,
 50          u'~': tokentype.TILDE
 51          }
 52      _attmap = {
 53          u'~=': tokentype.INCLUDES,
 54          u'|=': tokentype.DASHMATCH,
 55          u'^=': tokentype.PREFIXMATCH,
 56          u'$=': tokentype.SUFFIXMATCH,
 57          u'*=': tokentype.SUBSTRINGMATCH
 58          }
 59      _atkeywordmap = {
 60          u'charset': tokentype.CHARSET_SYM,
 61          u'import': tokentype.IMPORT_SYM,
 62          u'media': tokentype.MEDIA_SYM,
 63          u'namespace': tokentype.NAMESPACE_SYM,
 64          u'page': tokentype.PAGE_SYM
 65          }
 66  
 67      def __init__(self):
 68          self.log = cssutils.log
 69          self._sub1ofcol = False
70
 71      def getttype(self, t):
 72          """
 73          check type of tokentype in t which may be string or list
 74          returns ttype
 75          """
 76          if isinstance(t, list): t = u''.join(t)
 77  
 78          for check, result in Tokenizer._typelist:
 79              if check(t): return result
 80  
 81          return tokentype.DELIM
 82  
 83  
 84      def addtoken(self, value, ttype=None):
 85          """
 86          adds a new Token to self.tokens
 87          """
 88          # convert list of tokens to string
 89          if isinstance(value, list): value = u''.join(value)
 90          if not value: return
 91  
 92          if not ttype: ttype = self.getttype(value)
 93  
 94          # last two tokens, if none simple use empty Token
 95          if len(self.tokens) > 0: last = self.tokens[-1]
 96          else: last = Token()
 97          if len(self.tokens) > 1: last2 = self.tokens[-2]
 98          else: last2 = Token()
 99  
100          # marker if token already added
101          todo = False
102  
103          # WS, simply add later
104          if ttype == tokentype.S:
105              todo = True
106  
107          # ATKEYWORD: standard, need to adjust type
108          elif ttype == tokentype.ATKEYWORD:
109              normkeyword = value[1:].lower().replace(u'\\', u'')
110              ttype = Tokenizer._atkeywordmap.get(
111                  normkeyword, tokentype.ATKEYWORD)
112              todo = True
113  
114          # ATKEYWORD: replace last token if @xxx
115          elif u'@' == last.value and ttype == tokentype.IDENT:
116              keyword = value.lower()
117              normkeyword = keyword.replace(u'\\', u'')
118              last.type = Tokenizer._atkeywordmap.get(
119                  normkeyword, tokentype.ATKEYWORD)
120              last.value = u'@%s' % keyword # replace @
121  
122          # @-ATKEYWORD: replace last2 if @-xxx and remove last
123          # probably vendor specific
124          elif u'@' == last2.value and u'-' == last.value and \
125                  ttype == tokentype.IDENT:
126              keyword = value.lower()
127              normkeyword = keyword.replace(u'\\', u'')
128              last2.type = Tokenizer._atkeywordmap.get(
129                  normkeyword, tokentype.ATKEYWORD)
130              last2.value = u'@-%s' % keyword # replace @
131              self.tokens.pop(-1) # remove -
132  
133          # IDENT, NUMBER or DIMENSION with -, replace last token
134          # -IDENT probably vendor specific
135          elif u'-' == last.value and (ttype in (
136                  tokentype.IDENT, tokentype.NUMBER, tokentype.DIMENSION)):
137              last.type = ttype
138              last.value = u'-%s' % value.lower()
139  
140          # DIMENSION: replace last token with num + ident
141          elif last.type == tokentype.NUMBER and\
142                  ttype == tokentype.IDENT:
143              last.type = tokentype.DIMENSION
144              last.value = u'%s%s' % (last.value, value.lower())
145              ## check if before was a -?
146  
147          # HASH: replace last token with # + name
148          elif self.getttype(last.value + value) == tokentype.HASH:
149              last.type = tokentype.HASH
150              last.value += value # last value starts with # anyway
151  
152          # FUNCTION: replace last token with last.value(
153          elif last.type == tokentype.IDENT and u'(' == value:
154              last.type = tokentype.FUNCTION
155              last.value = u'%s(' % last.value.lower()
156  
157          # PERCENTAGE: replace last token with NUMBER%
158          elif last.type == tokentype.NUMBER and u'%' == value:
159              last.type = tokentype.PERCENTAGE
160              last.value = u'%s%%' % last.value
161  
162          # IMPORTANT_SYM: combine with preceding "!" if only WS in between
163          # No comments in between!
164          elif u'important' == value.lower().replace(u'\\', u''):
165              for i in range(len(self.tokens), 0, -1):
166                  _t = self.tokens[i - 1]
167                  # check if preceding was "!" => !important and delete nexts
168                  if u'!' == _t.value:
169                      _t.type = tokentype.IMPORTANT_SYM
170                      _t.value = u'!%s' % value.lower() # keep im\portant?
171                      del self.tokens[i:]
172                      break
173                  # other than S means no !important => add
174                  elif _t.type != tokentype.S:
175                      self.tokens.append(
176                          Token(self.line, self.col, ttype, value))
177                      break
178  
179          # URI: possible combine if closed URI or EOF
180          elif u')' == value or ttype == tokentype.EOF:
181              # find opening {ident}(
182              _uriindex = -1
183              for i in range(len(self.tokens), 0, -1):
184                  _t = self.tokens[i-1]
185                  if tokentype.FUNCTION == _t.type and u'url(' == _t.value:
186                      _uriindex = i - 1
187                      break
188                  elif tokentype.FUNCTION == _t.type:
189                      # no url( found but other so stop searching
190                      todo = True # add )
191                      break
192  
193              if _uriindex > -1:
194                  # check content between "url(" and ")"
195                  _uricontent = u''
196                  for i in range(_uriindex+1, len(self.tokens)):
197                      _t = self.tokens[i]
198                      if _t.type == tokentype.S and\
199                              ((i == _uriindex+1) or (i == len(self.tokens)-1)):
200                          # 1st or last WS ok
201                          continue
202                      else:
203                          # found other content
204                          _uricontent += _t.value
205                  if _uricontent:
206                      # check if valid URI and save if yes
207                      _uri = u'url(%s)' % _uricontent
208                      if tokenregex.URI(_uri):
209                          _urit = self.tokens[_uriindex]
210                          _urit.type = tokentype.URI
211                          _urit.value = _uri
212                          del self.tokens[_uriindex + 1:]
213                      else:
214                          todo = True # add )
215                  else:
216                      # empty url()
217                      _urit = self.tokens[_uriindex]
218                      _urit.type = tokentype.URI
219                      _urit.value = u'url()'
220              else:
221                  todo = True # add )
222  
223          else:
224              todo = True
225  
226          # add if not two WS nodes after another
227          if todo or ttype == tokentype.EOF:
228              self.tokens.append(Token(self.line, self.col, ttype, value))
229  
230          # adjust x,y position
231          cols = len(value)
232          if self._sub1ofcol:
233              # added a "0" to ".1" -> "0.1" so PLUS 1
234              cols -= 1
235              self._sub1ofcol = False
236          if value.find('\n') != -1:
237              # end of a line, start anew but count present chars
238              self.col = 1
239              cols = len(value[value.rfind('\n')+1:])
240  
241          self.line += value.count('\n')
242          self.col += cols
243  
244  
245      def getescape(self):
246          """
247          http://www.w3.org/TR/2004/CR-CSS21-20040225/syndata.html#q6
248  
249          Third, backslash escapes allow authors to refer to characters
250          they can't easily put in a document. In this case, the
251          backslash is followed by at most six hexadecimal digits
252          (0..9A..F), which stand for the ISO 10646 ([ISO10646])
253          character with that number, which must not be zero. If a
254          character in the range [0-9a-fA-F] follows the hexadecimal
255          number, the end of the number needs to be made clear. There
256          are two ways to do that:
257  
258          1. with a space (or other whitespace character): "\26 B"
259             ("&B"). In this case, user agents should treat a "CR/LF"
260             pair (U+000D/U+000A) as a single whitespace character.
261          2. by providing exactly 6 hexadecimal digits: "\000026B"
262             ("&B")
263  
264          In fact, these two methods may be combined. Only one
265          whitespace character is ignored after a hexadecimal escape.
266          Note that this means that a "real" space after the escape
267          sequence must itself either be escaped or doubled.
268          """
269          escape = u'\\'
270          MAX = 6
271          i = 0
272          actual = 0
273          while self.text and i < MAX:
274              i += 1
275              c, c2, c3 = self.text[0], u''.join(self.text[:1]),\
276                  u''.join(self.text[1:2])
277              if c in string.hexdigits:
278                  escape += c
279                  del self.text[0]
280              else:
281                  actual = i
282                  i = MAX # end while and goto else (break would not work)
283          else:
284              if int(escape[1:], 16) <= 0:
285                  self.log.error(
286                      u'Tokenizer: Syntax Error, ESCAPE SEQUENCE with value 0.')
287                  escape = ''
288              elif actual < MAX + 1 and c2 in self.WS:
289                  # remove separating WS
290                  del self.text[0]
291                  if u'\r' == c2 and u'\n' == c3:
292                      # remove combined WS \r\n as one WS
293                      del self.text[0]
294  
295                  # add explicit SPACE to end ESCAPE needed as not MAX len!
296                  escape += u' '
297  
298          return escape
299  
300  
301      def dostrorcomment(self, t=[], end=None, ttype=None, _fullSheet=False):
302          """
303          handles
304          strings: "..." or '...'
305          comment: /*...*/
306  
307          t
308              initial token to start result with
309          end
310              string at which to end
311          ttype
312              str description of token to be found
313          _fullSheet
314              if no more tokens complete found tokens
315          """
316          if ttype == tokentype.STRING:
317              isstring = True
318              kind = 'string'
319          else:
320              isstring = False
321              kind = 'comment'
322  
323          while self.text:
324              # c is removed from self.text, c2 may be removed too later here
325              c, c2 = self.text.pop(0), u''.join(self.text[:1])
326  
327              if (isstring and c == end) or\
328                      (not isstring and c + c2 == end):
329                  # check if end and add
330                  t.append(end)
331                  self.addtoken(t, ttype)
332                  if not isstring:
333                      del self.text[0] # remove ending / (c2)
334                  break
335  
336              elif isstring and u'\\' == c and c2 in u'\n\r\f':
337                  # in STRING ignore and remove a combi of \ + nl
338                  if u'\r' == c2 and \
339                          len(self.text) > 2 and u'\n' == self.text[1]:
340                      #\r\n!
341                      del self.text[0] # remove c2 = \r
342                      del self.text[0] # remove "c3" = \n
343                  else:
344                      del self.text[0] #remove c2 \r or \n or \f
345  
346              elif isstring and c in '\n\r\f':
347                  # nl in String makes it invalid
348                  t.append(c)
349                  self.addtoken(t, tokentype.INVALID)
350                  break
351  
352              elif not isstring and u'\\' == c:
353                  # escape in comment does not work
354                  # simply keep
355                  t.append(c)
356  
357              elif u'\\' == c and c2 and c2 not in string.hexdigits:
358                  # simple escape
359                  t.append(c)
360                  t.append(c2)
361                  del self.text[0] # remove c2
362  
363              elif u'\\' == c and c2:
364                  # character escape sequence
365                  # sequence end character/s will be stripped!
366                  escape = self.getescape()
367                  t.append(escape)
368  
369              else:
370                  # save
371                  t.append(c)
372  
373          else:
374              # EOF but complete string or comment
375              if _fullSheet:
376                  t.append(end)
377                  self.addtoken(t, ttype)
378              else:
379                  # not complete:
380                  value = ''.join(t)
381                  lines = value.count('\n')
382                  cols = len(value)
383                  if value.endswith('\n'):
384                      cols = -self.col + 1;
385                  token = Token(self.line, self.col, None, value)
386                  self.line += lines
387                  self.col += cols
388                  self.log.error(
389                      u'Tokenizer: Syntax Error, incomplete %s.' % kind,
390                      token, xml.dom.SyntaxErr)
391  
392  
393      def tokenize(self, text, _fullSheet=False):
394          """
395          tokenizes text and returns tokens
396          """
397          if not text:
398              return []
399  
400          self.text = list(text)
401          self.tokens = []
402          self.line = 1
403          self.col = 1
404  
405          def addifnewtoken(t, c):
406              """
407              checks if c starts a new token and adds last t as token
408              """
409              if len(t) == 0:
410                  return [c] # new t
411  
412              tt, ct = self.getttype(t), self.getttype(c)
413              ## print '"%s": (%s)\t %s: (%s)' % (c, ct, t, tt),
414  
415              if tt in (tokentype.ATKEYWORD, tokentype.IDENT)\
416                 or (t and t[-1] == u'-')\
417                 and ct in (tokentype.IDENT, tokentype.NUMBER):
418                  # @keyword or a number starting with -
419                  # wait for new token "x1..."
420                  t.append(c)
421  
422              # . is always followed by number here as calling function
423              # checks this!
424              elif (t[-1] == u'.' or tt == tokentype.NUMBER)\
425                      and ct == tokentype.NUMBER:
426                  # start of number which may be 1 OR 1. OR .
427                  if t[0] == u'.':
428                      t[0] = '0.' # add 0 in any case
429                      self._sub1ofcol = True
430                  t.append(c)
431  
432              elif tt == tokentype.NUMBER and c == u'.':
433                  # start of number which may be 1 OR 1. OR .
434                  t.append(c)
435  
436              elif ct == tokentype.DELIM:
437                  # escape always alone
438                  # . not with number always alone
439                  self.addtoken(t)
440                  self.addtoken(c)
441                  t = []
442  
443              elif tt != ct:
444                  # finish old and start new token with c
445                  self.addtoken(t)
446                  t = [c]
447  
448              else:
449                  # wait for new token or end
450                  t.append(c)
451  
452              ## print '"%s": (%s)\t %s: (%s)\n' % (c, ct, t, tt)
453              ## print '----',self.tokens
454              return t
455  
456          t = []
457          while self.text:
458              # next two chars
459              c, c2 = self.text.pop(0), u''.join(self.text[:1])
460  
461              if c in self.WS:
462                  # WhiteSpace
463                  self.addtoken(t) # add saved
464                  t = [c] # initial add WS
465                  try:
466                      while self.text[0] in self.WS:
467                          t.append(self.text.pop(0))
468                  except IndexError: # end of CSS
469                      pass
470                  self.addtoken(t, tokentype.S) # add WS
471                  t = [] # reset
472  
473              elif u'/' == c and u'*' == c2:
474                  # Comment
475                  self.addtoken(t) # add saved
476                  del self.text[0] # remove *
477                  self.dostrorcomment(
478                      [u'/*'], u'*/', tokentype.COMMENT, _fullSheet)
479                  t = []
480  
481              elif c in '"\'':
482                  # strings
483                  self.addtoken(t) # add saved
484                  self.dostrorcomment(
485                      [c], c, tokentype.STRING, _fullSheet)
486                  t = []
487  
488              elif c in u';{}[](),':
489                  # reservedchars, type will be handled above
490                  self.addtoken(t) # add saved
491                  self.addtoken(c)
492                  t = []
493  
494              elif c == u'.' and c2 in tuple(u'0123456789'):
495                  # possible num
496                  t = addifnewtoken(t, c)
497  
498              elif u'::' == c + c2:
499                  # CSS3 pseudo
500                  self.addtoken(t) # add saved
501                  self.addtoken(u'::', tokentype.PSEUDO_ELEMENT)
502                  del self.text[0] # remove c2
503                  t = []
504  
505              elif c in u'~|^$*' and u'=' == c2:
506                  # INCLUDES ~= or DASHMATCH |= + CSS3 Selectors
507                  self.addtoken(t) # add saved
508                  _t = c + c2
509                  self.addtoken(_t, Tokenizer._attmap[_t])
510                  del self.text[0] # remove c2
511                  t = []
512  
513              elif c == u'<' and u''.join(self.text[:3]) == u'!--':
514                  # CDO
515                  self.addtoken(t) # add saved
516                  del self.text[:3]
517                  self.addtoken(u'<!--', tokentype.CDO)
518                  t = []
519              elif c == u'-' and u''.join(self.text[:2]) == u'->':
520                  # CDC
521                  self.addtoken(t) # add saved
522                  del self.text[:2]
523                  self.addtoken(u'-->', tokentype.CDC)
524                  t = []
525  
526              elif c in u'.=~|*+>#!%:&$':
527                  # DELIM reservedchars, possibly combined later
528                  self.addtoken(t) # add saved
529                  self.addtoken(
530                      c, Tokenizer._delimmap.get(c, tokentype.DELIM))
531                  t = []
532  
533              elif u'\\' == c and c2 not in string.hexdigits:
534                  # simple escape
535                  t.append(c)
536                  t.append(c2)
537                  del self.text[0]
538              elif u'\\' == c and c2:
539                  # character escape sequence
540                  escape = self.getescape()
541                  t = addifnewtoken(t, escape)
542  
543              else:
544                  # save
545                  t = addifnewtoken(t, c)
546          else:
547              # add remaining
548              self.addtoken(t)
549  
550          if _fullSheet:
551              # add EOF token if from parse or CSSStyleSheet.cssText
552              self.addtoken(u'EOF', tokentype.EOF)
553  
554          return [t for t in self.tokens
555                  if t.type not in (tokentype.CDO, tokentype.CDC)]
556  
557  
558  if __name__ == '__main__':
559      """
560      NOT LIKE SPEC:
561      between ! and important only WS is allowed, no comments, this should
562      be very seldomly used anyway
563  
564      TODO:
565          Tokenizer:
566  
567          parser:
568              - filter CDO/CDC
569              - lengths: % px pt pc em ex in cm mm
570  
571      CSS2 parses a number immediately followed by an identifier as a
572      DIMEN token (i.e., an unknown unit), CSS1 parsed it as a number
573      and an identifier. That means that in CSS1, the declaration
574      'font: 10pt/1.2serif' was correct, as was 'font: 10pt/12pt serif';
575      in CSS2, a space is required before "serif". (Some UAs accepted
576      the first example, but not the second.)
577      """
578      css = u'''5px -5px'''
579  
580      tokens = Tokenizer().tokenize(css)
581      import pprint
582      pprint.pprint(tokens)
583      print 40* '-'
584  
585      sheet = cssutils.parseString(css)
586      print sheet.cssText
587      print 40* '-'
588  
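
The __main__ block above already shows the module driving itself. As a further orientation aid, the snippet below is a minimal usage sketch, not part of the listed source: it assumes only what the listing shows (a Tokenizer class whose tokenize() returns Token objects carrying line, col, type and value), that the module is importable as cssutils.tokenize as this page's title suggests, and an arbitrary example CSS string.

    # Minimal sketch, assuming the cssutils 0.9.2b2 layout shown on this page.
    from cssutils.tokenize import Tokenizer

    tokens = Tokenizer().tokenize(u'a { color: #fff !important }')
    for tok in tokens:
        # Each Token records its source position plus a type constant and value;
        # addtoken() above combines e.g. '#' + 'fff' into HASH and '!' + 'important'
        # into IMPORTANT_SYM before the list is returned.
        print tok.line, tok.col, tok.type, tok.value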