
Source Code for Module cssutils.tokenize2

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""New CSS Tokenizer (a generator)
"""
__all__ = ['Tokenizer', 'CSSProductions']
__docformat__ = 'restructuredtext'
__author__ = '$LastChangedBy: cthedot $'
__date__ = '$LastChangedDate: 2007-09-01 15:55:42 +0200 (Sa, 01 Sep 2007) $'
__version__ = '$LastChangedRevision: 300 $'

import os
import re
import string
import xml.dom
import cssutils
import util
from cssproductions import *


class Tokenizer(object):
    """
    generates a stream of Token tuples:
        (Tokenname, value, startline, startcolumn)
    """
    _linesep = u'\n'

    def _expand_macros(self, macros, productions):
        """returns macro expanded productions, order of productions is kept"""
        def macro_value(m):
            return '(?:%s)' % macros[m.groupdict()['macro']]
        expanded = []
        for key, value in productions:
            while re.search(r'{[a-zA-Z][a-zA-Z0-9-]*}', value):
                value = re.sub(r'{(?P<macro>[a-zA-Z][a-zA-Z0-9-]*)}',
                               macro_value, value)
            expanded.append((key, value))
        return expanded
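    # Illustration (hypothetical macro and production, not necessarily those
    # in cssproductions): with macros = {'h': r'[0-9a-f]'} the production
    # ('HASH', r'#{h}+') expands to ('HASH', '#(?:[0-9a-f])+').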
    def _compile_productions(self, expanded_productions):
        """compile productions into callable match objects, order is kept"""
        compiled = []
        for key, value in expanded_productions:
            compiled.append((key, re.compile('^(?:%s)' % value, re.U).match))
        return compiled
    def __init__(self, macros=None, productions=None):
        """
        inits tokenizer with given macros and productions which default to
        cssutils' own macros and productions
        """
        self.log = cssutils.log
        if not macros:
            macros = MACROS
        if not productions:
            productions = PRODUCTIONS
        self.tokenmatches = self._compile_productions(
            self._expand_macros(macros, productions))
        self.commentmatcher = [x[1] for x in self.tokenmatches if x[0] == 'COMMENT'][0]
        self.urimatcher = [x[1] for x in self.tokenmatches if x[0] == 'URI'][0]
        self.unicodesub = re.compile(RE_UNICODE).sub
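    # Illustration (hypothetical values): Tokenizer(macros={'d': r'[0-9]'},
    # productions=[('NUMBER', r'{d}+'), ('CHAR', r'.')]) would tokenize with
    # just these two rules instead of the cssutils defaults; production order
    # decides which rule wins, so the catch-all CHAR must come last.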
    def tokenize(self, text, fullsheet=False):
        """
        generator: tokenizes text and yields tokens, each token is a tuple of::

            (tokenname, tokenvalue, line, col)

        The tokenvalue will contain a normal string, meaning CSS unicode
        escapes have been resolved to normal characters. The serializer
        escapes needed characters back to unicode escapes depending on
        the stylesheet target encoding.

        text
            to be tokenized
        fullsheet
            if ``True`` appends an EOF token as the last one and completes
            incomplete COMMENT, INVALID (as STRING) and url( FUNCTION
            (as URI) tokens
        """
        line = col = 1

        tokens = []
        while text:
            for name, matcher in self.tokenmatches:

                if fullsheet and name == 'CHAR' and text.startswith(u'/*'):
                    # after all tokens except CHAR have been tested
                    # test for incomplete comment
                    possiblecomment = u'%s*/' % text
                    match = self.commentmatcher(possiblecomment)
                    if match:
                        yield ('COMMENT', possiblecomment, line, col)
                        text = None
                        break

                # default
                match = matcher(text)
                if match:
                    found = match.group(0)

                    if fullsheet:
                        # check if tokens may be completed
                        if 'INVALID' == name and text == found:
                            # complete INVALID to STRING
                            name = 'STRING'
                            found = '%s%s' % (found, found[0])

                        elif 'FUNCTION' == name and\
                             u'url(' == util.Base._normalize(found):
                            # FUNCTION url( is fixed to URI if fullsheet
                            # FUNCTION production MUST BE after URI production!
                            for end in (u"')", u'")', u')'):
                                possibleuri = '%s%s' % (text, end)
                                match = self.urimatcher(possibleuri)
                                if match:
                                    name = 'URI'
                                    found = match.group(0)
                                    break

                    if name in ('URI', 'FUNCTION', 'ATKEYWORD', 'IDENT', 'STRING',
                                'INVALID', 'HASH', 'DIMENSION', 'COMMENT'):
                        # may contain unicode escape, replace with normal char
                        def repl(m):
                            num = int(m.group(0)[1:], 16)
                            if num < 0x10000:
                                return unichr(num)
                            else:
                                return m.group(0)
                        value = self.unicodesub(repl, found)
                    else:
                        # should not contain unicodes
                        value = found

                    yield (name, value, line, col)
                    text = text[len(found):]
                    nls = found.count(self._linesep)
                    line += nls
                    if nls:
                        col = len(found[found.rfind(self._linesep):])
                    else:
                        col += len(found)
                    break

            else:
                # should not happen at all
                raise xml.dom.SyntaxErr('no token match "%s(...)"' % text[:10])
                text = text[1:]

        if fullsheet:
            yield ('EOF', u'', line, col)
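A minimal usage sketch (not part of the module source; it assumes the module's
Python 2 environment, which the u''-literals and unichr above imply, and the
default cssproductions):

    tokenizer = Tokenizer()   # default MACROS and PRODUCTIONS
    for token in tokenizer.tokenize(u'a { color: red }'):
        # each token is (tokenname, tokenvalue, line, col), e.g.
        # ('IDENT', u'a', 1, 1) followed by an S (whitespace) token
        print token

    # With fullsheet=True an open comment is completed and an EOF token is
    # appended: u'/* open' yields ('COMMENT', u'/* open*/', 1, 1) and then
    # ('EOF', u'', 1, 1).
    for token in Tokenizer().tokenize(u'/* open', fullsheet=True):
        print token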