Package cssutils :: Module tokenize2

Source Code for Module cssutils.tokenize2

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""New CSS Tokenizer (a generator)
"""
__all__ = ['Tokenizer', 'CSSProductions']
__docformat__ = 'restructuredtext'
__version__ = '$Id: tokenize2.py 1116 2008-03-05 13:52:23Z cthedot $'
import os
import re
import string
import xml.dom
import cssutils
import util
from cssproductions import *

class Tokenizer(object):
    """
    generates a list of Token tuples:
        (Tokenname, value, startline, startcolumn)
    """
    _linesep = u'\n'

    def _expand_macros(self, macros, productions):
        """returns macro expanded productions, order of productions is kept"""
        def macro_value(m):
            return '(?:%s)' % macros[m.groupdict()['macro']]
        expanded = []
        for key, value in productions:
            while re.search(r'{[a-zA-Z][a-zA-Z0-9-]*}', value):
                value = re.sub(r'{(?P<macro>[a-zA-Z][a-zA-Z0-9-]*)}',
                               macro_value, value)
            expanded.append((key, value))
        return expanded
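    # Illustration (added commentary, not original source): with a
    # hypothetical macro table {'h': '[0-9a-f]'} and the production
    # ('HASH', '#{h}+'), _expand_macros keeps substituting {name}
    # references until none remain and returns [('HASH', '#(?:[0-9a-f])+')].
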
    def _compile_productions(self, expanded_productions):
        """compile productions into callable match objects, order is kept"""
        compiled = []
        for key, value in expanded_productions:
            compiled.append((key, re.compile('^(?:%s)' % value, re.U).match))
        return compiled
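    # Note (added commentary, not original source): each compiled entry is
    # (name, match) where match is the bound match method of a pattern
    # anchored at the start of the input, so a production can only ever
    # match a token at the current position of the text being tokenized.
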
    def __init__(self, macros=None, productions=None):
        """
        inits tokenizer with given macros and productions which default to
        cssutils' own macros and productions
        """
        self.log = cssutils.log
        if not macros:
            macros = MACROS
        if not productions:
            productions = PRODUCTIONS
        self.tokenmatches = self._compile_productions(
            self._expand_macros(macros, productions))
        self.commentmatcher = [x[1] for x in self.tokenmatches if x[0] == 'COMMENT'][0]
        self.urimatcher = [x[1] for x in self.tokenmatches if x[0] == 'URI'][0]
        self.unicodesub = re.compile(RE_UNICODE).sub
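    # Note (added commentary, not original source): commentmatcher and
    # urimatcher are the compiled COMMENT and URI matchers looked up in
    # tokenmatches; tokenize() reuses them in fullsheet mode to complete
    # an unclosed comment or an open url( value.
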
    def tokenize(self, text, fullsheet=False):
        """
        generator: tokenizes text and yields tokens, each token is a tuple of::

            (tokenname, tokenvalue, line, col)

        The tokenvalue will contain a normal string, meaning CSS unicode
        escapes have been resolved to normal characters. The serializer
        escapes needed characters back to unicode escapes depending on
        the stylesheet target encoding.

        text
            to be tokenized
        fullsheet
            if ``True``, appends an EOF token as the last one and completes
            incomplete COMMENT tokens
        """
        line = col = 1

        tokens = []
        while text:
            for name, matcher in self.tokenmatches:
                if fullsheet and name == 'CHAR' and text.startswith(u'/*'):
                    # after all tokens except CHAR have been tested
                    # test for incomplete comment
                    possiblecomment = u'%s*/' % text
                    match = self.commentmatcher(possiblecomment)
                    if match:
                        yield ('COMMENT', possiblecomment, line, col)
                        text = None
                        break

                # default
                match = matcher(text)
                if match:
                    found = match.group(0)

                    if fullsheet:
                        # check if tokens may be completed
                        if 'INVALID' == name and text == found:
                            # complete INVALID to STRING
                            name = 'STRING'
                            found = '%s%s' % (found, found[0])

                        elif 'FUNCTION' == name and\
                             u'url(' == util.Base._normalize(found):
                            # FUNCTION url( is fixed to URI if fullsheet
                            # FUNCTION production MUST BE after URI production!
                            for end in (u"')", u'")', u')'):
                                possibleuri = '%s%s' % (text, end)
                                match = self.urimatcher(possibleuri)
                                if match:
                                    name = 'URI'
                                    found = match.group(0)
                                    break

                    if name in ('URI', 'FUNCTION', 'ATKEYWORD', 'IDENT', 'STRING',
                                'INVALID', 'HASH', 'DIMENSION', 'COMMENT'):
                        # may contain unicode escape, replace with normal char
                        def repl(m):
                            num = int(m.group(0)[1:], 16)
                            if num < 0x10000:
                                return unichr(num)
                            else:
                                return m.group(0)
                        value = self.unicodesub(repl, found)
                    else:
                        # should not contain unicodes
                        value = found

                    yield (name, value, line, col)
                    text = text[len(found):]
                    nls = found.count(self._linesep)
                    line += nls
                    if nls:
                        col = len(found[found.rfind(self._linesep):])
                    else:
                        col += len(found)
                    break

            else:
                # should not happen at all
                raise xml.dom.SyntaxErr('no token match "%s(...)"' % text[:10])
                text = text[1:]

        if fullsheet:
            yield ('EOF', u'', line, col)
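
A minimal usage sketch (added for illustration, not part of the module source; it assumes cssutils is importable, relies on the default MACROS and PRODUCTIONS from cssproductions, and runs under Python 2 like the module itself):

    from cssutils.tokenize2 import Tokenizer

    tokenizer = Tokenizer()  # default cssutils macros and productions
    for name, value, line, col in tokenizer.tokenize(u'a { color: red }'):
        print name, repr(value), line, col
    # prints, roughly: IDENT u'a' at 1/1, then S, CHAR u'{', S,
    # IDENT u'color', CHAR u':', S, IDENT u'red', S and CHAR u'}'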