
Source Code for Module cssutils.tokenize2

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""New CSS Tokenizer (a generator)
"""
__all__ = ['Tokenizer', 'CSSProductions']
__docformat__ = 'restructuredtext'
__version__ = '$Id: tokenize2.py 1350 2008-07-10 20:37:21Z cthedot $'

import os
import re
import string
import xml.dom
import cssutils
import util
from cssproductions import *

class Tokenizer(object):
    """
    generates tokens as tuples of:
        (Tokenname, value, startline, startcolumn)
    """
    _atkeywords = {
        u'@font-face': 'FONT_FACE_SYM',
        u'@import': 'IMPORT_SYM',
        u'@media': 'MEDIA_SYM',
        u'@namespace': 'NAMESPACE_SYM',
        u'@page': 'PAGE_SYM'
        }
    _linesep = u'\n'

    def __init__(self, macros=None, productions=None):
        """
        inits tokenizer with given macros and productions which default to
        cssutils' own macros and productions
        """
        self.log = cssutils.log
        if not macros:
            macros = MACROS
        if not productions:
            productions = PRODUCTIONS
        self.tokenmatches = self._compile_productions(
            self._expand_macros(macros, productions))
        self.commentmatcher = [x[1] for x in self.tokenmatches
                               if x[0] == 'COMMENT'][0]
        self.urimatcher = [x[1] for x in self.tokenmatches
                           if x[0] == 'URI'][0]
        # a CSS unicode escape: backslash plus 1-6 hex digits, optionally
        # followed by a single whitespace character
        self.unicodesub = re.compile(
            r'\\[0-9a-fA-F]{1,6}(?:\r\n|[\t\r\n\f\x20])?').sub

    def _expand_macros(self, macros, productions):
        """returns macro expanded productions, order of productions is kept"""
        def macro_value(m):
            return '(?:%s)' % macros[m.groupdict()['macro']]

        expanded = []
        for key, value in productions:
            while re.search(r'{[a-zA-Z][a-zA-Z0-9-]*}', value):
                value = re.sub(r'{(?P<macro>[a-zA-Z][a-zA-Z0-9-]*)}',
                               macro_value, value)
            expanded.append((key, value))
        return expanded
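
    # A minimal illustration of the expansion above, using an assumed example
    # macro (not cssutils' real MACROS): each '{name}' reference in a
    # production value is replaced by the macro's value wrapped in a
    # non-capturing group, repeatedly, until no references remain:
    #
    #   macros      = {'num': r'[0-9]+|[0-9]*\.[0-9]+'}
    #   productions = [('NUMBER', '{num}')]
    #   expanded   --> [('NUMBER', '(?:[0-9]+|[0-9]*\.[0-9]+)')]
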
    def _compile_productions(self, expanded_productions):
        """compile productions into callable match objects, order is kept"""
        compiled = []
        for key, value in expanded_productions:
            compiled.append((key, re.compile('^(?:%s)' % value, re.U).match))
        return compiled
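
    # Each compiled entry is a (name, match) pair where ``match`` is the
    # bound match method of a pattern anchored at the string start.
    # Continuing the assumed NUMBER example from above:
    #
    #   matcher = re.compile(r'^(?:(?:[0-9]+|[0-9]*\.[0-9]+))', re.U).match
    #   matcher(u'12px').group(0)  # --> u'12'
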
    def tokenize(self, text, fullsheet=False):
        """Generator: Tokenize text and yield tokens, each token is a tuple
        of::

            (name, value, line, col)

        The token value will contain a normal string, meaning CSS unicode
        escapes have been resolved to normal characters. The serializer
        escapes needed characters back to unicode escapes depending on
        the stylesheet target encoding.

        text
            to be tokenized
        fullsheet
            if ``True`` appends an EOF token as the last one and completes
            incomplete COMMENT tokens
        """
        def repl(m):
            "used by unicodesub"
            num = int(m.group(0)[1:], 16)
            if num < 0x10000:
                return unichr(num)
            else:
                return m.group(0)

        def normalize(value):
            "normalize and do unicodesub"
            return util.Base._normalize(self.unicodesub(repl, value))

        line = col = 1

        # check for BOM first as there should be at most one at the start
        (BOM, matcher), productions = self.tokenmatches[0], self.tokenmatches[1:]
        match = matcher(text)
        if match:
            found = match.group(0)
            yield (BOM, found, line, col)
            text = text[len(found):]

        while text:
            # speed test for most used CHARs
            c = text[0]
            if c in '{}:;,':
                yield ('CHAR', c, line, col)
                col += 1
                text = text[1:]

            else:
                # check all other productions, at least CHAR must match
                for name, matcher in productions:
                    if fullsheet and name == 'CHAR' and text.startswith(u'/*'):
                        # before CHAR production test for incomplete comment
                        possiblecomment = u'%s*/' % text
                        match = self.commentmatcher(possiblecomment)
                        if match:
                            yield ('COMMENT', possiblecomment, line, col)
                            text = None # eats all remaining text
                            break

                    match = matcher(text) # if no match try next production
                    if match:
                        found = match.group(0) # needed later for line/col
                        if fullsheet:
                            # check if found may be completed into a full token
                            if 'INVALID' == name and text == found:
                                # complete INVALID to STRING with start char " or '
                                name, found = 'STRING', '%s%s' % (found, found[0])

                            elif 'FUNCTION' == name and \
                                 u'url(' == normalize(found):
                                # FUNCTION url( is fixed to URI if fullsheet
                                # FUNCTION production MUST BE after URI production!
                                for end in (u"')", u'")', u')'):
                                    possibleuri = '%s%s' % (text, end)
                                    match = self.urimatcher(possibleuri)
                                    if match:
                                        name, found = 'URI', match.group(0)
                                        break

                        if name in ('DIMENSION', 'IDENT', 'STRING', 'URI',
                                    'HASH', 'COMMENT', 'FUNCTION', 'INVALID'):
                            # may contain unicode escape, replace with normal
                            # char but do not normalize (?)
                            value = self.unicodesub(repl, found)

                        else:
                            if 'ATKEYWORD' == name:
                                # get the actual ATKEYWORD SYM
                                name = self._atkeywords.get(normalize(found),
                                                            'ATKEYWORD')
                            value = found # should not contain unicode escape (?)

                        yield (name, value, line, col)
                        text = text[len(found):]
                        nls = found.count(self._linesep)
                        line += nls
                        if nls:
                            col = len(found[found.rfind(self._linesep):])
                        else:
                            col += len(found)
                        break

        if fullsheet:
            yield ('EOF', u'', line, col)
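
A minimal usage sketch (Python 2, as the module relies on ``unichr`` and
``u''`` literals); the exact token names depend on cssutils' PRODUCTIONS, so
the output shown in the comments is indicative only:

    from cssutils.tokenize2 import Tokenizer

    tokenizer = Tokenizer()
    for name, value, line, col in tokenizer.tokenize(u'a { color: red }'):
        print name, repr(value), line, col

    # indicative output:
    #   IDENT u'a' 1 1
    #   S u' ' 1 2
    #   CHAR u'{' 1 3
    #   ...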