1 """base classes and helper functions for css and stylesheets packages
2 """
3 __all__ = []
4 __docformat__ = 'restructuredtext'
5 __version__ = '$Id: util.py 1321 2008-06-29 20:59:58Z cthedot $'
6
7 import codecs
8 from itertools import ifilter
9 import re
10 import types
11 import urllib2
12 import xml.dom
13 import cssutils
14 from tokenize2 import Tokenizer
15
16 import encutils
17
18 -class Base(object):
19 """
20 Base class for most CSS and StyleSheets classes
21
22 **Superceded by Base2 which is used for new seq handling class.**
23 See cssutils.util.Base2
24
25 Contains helper methods for inheriting classes helping parsing
26
27 ``_normalize`` is static as used by Preferences.
28 """
29 __tokenizer2 = Tokenizer()
30
31 _log = cssutils.log
32 _prods = cssutils.tokenize2.CSSProductions
33
34
35
36
37 _SHORTHANDPROPERTIES = {
38 u'background': [],
39 u'border': [],
40 u'border-left': [],
41 u'border-right': [],
42 u'border-top': [],
43 u'border-bottom': [],
44 u'border-color': [],
45 u'border-style': [],
46 u'border-width': [],
47 u'cue': [],
48 u'font': [],
49
50
51
52
53 u'list-style': [],
54 u'margin': [],
55 u'outline': [],
56 u'padding': [],
57 u'pause': []
58 }
59
60
61 __escapes = re.compile(ur'(\\[^0-9a-fA-F])').sub
62
63 __unicodes = re.compile(ur'\\[0-9a-fA-F]{1,6}[\t|\r|\n|\f|\x20]?').sub
64
65 @staticmethod
67 """
68 normalizes x, namely:
69
70 - remove any \ before non unicode sequences (0-9a-zA-Z) so for
71 x=="c\olor\" return "color" (unicode escape sequences should have
72 been resolved by the tokenizer already)
73 - lowercase
74 """
75 if x:
76 def removeescape(matchobj):
77 return matchobj.group(0)[1:]
78 x = Base.__escapes(removeescape, x)
79 return x.lower()
80 else:
81 return x
82
84 "raises xml.dom.NoModificationAllowedErr if rule/... is readonly"
85 if hasattr(self, '_readonly') and self._readonly:
86 raise xml.dom.NoModificationAllowedErr(
87 u'%s is readonly.' % self.__class__)
88 return True
89 return False
90
92 """
93 returns tuple (text, dict-of-namespaces) or if no namespaces are
94 in cssText returns (cssText, {})
95
96 used in Selector, SelectorList, CSSStyleRule, CSSMediaRule and
97 CSSStyleSheet
98 """
99 if isinstance(text_namespaces_tuple, tuple):
100 return text_namespaces_tuple[0], _SimpleNamespaces(
101 text_namespaces_tuple[1])
102 else:
103 return text_namespaces_tuple, _SimpleNamespaces()
104
106 """
107 returns tokens of textortokens which may already be tokens in which
108 case simply returns input
109 """
110 if not textortokens:
111 return None
112 elif isinstance(textortokens, basestring):
113
114 return self.__tokenizer2.tokenize(
115 textortokens)
116 elif types.GeneratorType == type(textortokens):
117
118 return textortokens
119 elif isinstance(textortokens, tuple):
120
121 return [textortokens]
122 else:
123
124 return (x for x in textortokens)
125
127 "returns next token in generator tokenizer or the default value"
128 try:
129 return tokenizer.next()
130 except (StopIteration, AttributeError):
131 return default
132
134 "returns type of Tokenizer token"
135 if token:
136 return token[0]
137 else:
138 return None
139
141 "returns value of Tokenizer token"
142 if token and normalize:
143 return Base._normalize(token[1])
144 elif token:
145 return token[1]
146 else:
147 return None
148
150 """
151 for STRING returns the actual content without surrounding "" or ''
152 and without respective escapes, e.g.::
153
154 "with \" char" => with " char
155 """
156 if token:
157 value = token[1]
158 return value.replace('\\'+value[0], value[0])[1:-1]
159 else:
160 return None
161
163 """
164 for URI returns the actual content without surrounding url()
165 or url(""), url('') and without respective escapes, e.g.::
166
167 url("\"") => "
168 """
169 if token:
170 value = token[1][4:-1].strip()
171 if (value[0] in '\'"') and (value[0] == value[-1]):
172
173 value = value.replace('\\'+value[0], value[0])[1:-1]
174 return value
175 else:
176 return None
177
178 - def _tokensupto2(self,
179 tokenizer,
180 starttoken=None,
181 blockstartonly=False,
182 blockendonly=False,
183 mediaendonly=False,
184 importmediaqueryendonly=False,
185 mediaqueryendonly=False,
186 semicolon=False,
187 propertynameendonly=False,
188 propertyvalueendonly=False,
189 propertypriorityendonly=False,
190 selectorattendonly=False,
191 funcendonly=False,
192 listseponly=False,
193 separateEnd=False
194 ):
195 """
196 returns tokens upto end of atrule and end index
197 end is defined by parameters, might be ; } ) or other
198
199 default looks for ending "}" and ";"
200 """
201 ends = u';}'
202 endtypes = ()
203 brace = bracket = parant = 0
204
205 if blockstartonly:
206 ends = u'{'
207 brace = -1
208 elif blockendonly:
209 ends = u'}'
210 brace = 1
211 elif mediaendonly:
212 ends = u'}'
213 brace = 1
214 elif importmediaqueryendonly:
215
216 ends = u';'
217 endtypes = ('STRING',)
218 elif mediaqueryendonly:
219
220
221 ends = u'{'
222 brace = -1
223 endtypes = ('STRING',)
224 elif semicolon:
225 ends = u';'
226 elif propertynameendonly:
227 ends = u':;'
228 elif propertyvalueendonly:
229 ends = u';!'
230 elif propertypriorityendonly:
231 ends = u';'
232 elif selectorattendonly:
233 ends = u']'
234 if starttoken and self._tokenvalue(starttoken) == u'[':
235 bracket = 1
236 elif funcendonly:
237 ends = u')'
238 parant = 1
239 elif listseponly:
240 ends = u','
241
242 resulttokens = []
243 if starttoken:
244 resulttokens.append(starttoken)
245 if tokenizer:
246 for token in tokenizer:
247 typ, val, line, col = token
248 if 'EOF' == typ:
249 resulttokens.append(token)
250 break
251 if u'{' == val:
252 brace += 1
253 elif u'}' == val:
254 brace -= 1
255 elif u'[' == val:
256 bracket += 1
257 elif u']' == val:
258 bracket -= 1
259
260 elif u'(' == val or \
261 Base._prods.FUNCTION == typ:
262 parant += 1
263 elif u')' == val:
264 parant -= 1
265
266 resulttokens.append(token)
267
268 if (brace == bracket == parant == 0) and (
269 val in ends or typ in endtypes):
270 break
271 elif mediaqueryendonly and brace == -1 and (
272 bracket == parant == 0) and typ in endtypes:
273
274 break
275
276 if separateEnd:
277
278 if resulttokens:
279 return resulttokens[:-1], resulttokens[-1]
280 else:
281 return resulttokens, None
282 else:
283 return resulttokens
284
286 """
287 returns string value of t (t may be a string, a list of token tuples
288 or a single tuple in format (type, value, line, col).
289 Mainly used to get a string value of t for error messages.
290 """
291 if not t:
292 return u''
293 elif isinstance(t, basestring):
294 return t
295 else:
296 return u''.join([x[1] for x in t])
297
299 """
300 adds default productions if not already present, used by
301 _parse only
302
303 each production should return the next expected token
304 normaly a name like "uri" or "EOF"
305 some have no expectation like S or COMMENT, so simply return
306 the current value of self.__expected
307 """
308 def ATKEYWORD(expected, seq, token, tokenizer=None):
309 "TODO: add default impl for unexpected @rule?"
310 if expected != 'EOF':
311
312 rule = cssutils.css.CSSUnknownRule()
313 rule.cssText = self._tokensupto2(tokenizer, token)
314 if rule.wellformed:
315 seq.append(rule)
316 return expected
317 else:
318 new['wellformed'] = False
319 self._log.error(u'Expected EOF.', token=token)
320 return expected
321
322 def COMMENT(expected, seq, token, tokenizer=None):
323 "default implementation for COMMENT token adds CSSCommentRule"
324 seq.append(cssutils.css.CSSComment([token]))
325 return expected
326
327 def S(expected, seq, token, tokenizer=None):
328 "default implementation for S token, does nothing"
329 return expected
330
331 def EOF(expected=None, seq=None, token=None, tokenizer=None):
332 "default implementation for EOF token"
333 return 'EOF'
334
335 p = {'ATKEYWORD': ATKEYWORD,
336 'COMMENT': COMMENT,
337 'S': S,
338 'EOF': EOF
339 }
340 p.update(productions)
341 return p
342
343 - def _parse(self, expected, seq, tokenizer, productions, default=None,
344 new=None):
345 """
346 puts parsed tokens in seq by calling a production with
347 (seq, tokenizer, token)
348
349 expected
350 a name what token or value is expected next, e.g. 'uri'
351 seq
352 to add rules etc to
353 tokenizer
354 call tokenizer.next() to get next token
355 productions
356 callbacks {tokentype: callback}
357 default
358 default callback if tokentype not in productions
359 new
360 used to init default productions
361
362 returns (wellformed, expected) which the last prod might have set
363 """
364 wellformed = True
365 if tokenizer:
366 prods = self._adddefaultproductions(productions, new)
367 for token in tokenizer:
368 p = prods.get(token[0], default)
369 if p:
370 expected = p(expected, seq, token, tokenizer)
371 else:
372 wellformed = False
373 self._log.error(u'Unexpected token (%s, %s, %s, %s)' % token)
374 return wellformed, expected
375
378 """
379 Base class for new seq handling, used by Selector for now only
380 """
383
385 """
386 sets newseq and makes it readonly
387 """
388 newseq._readonly = True
389 self._seq = newseq
390
391 seq = property(lambda self: self._seq, doc="seq for most classes")
392
394 "get a writeable Seq() which is added later"
395 return Seq(readonly=readonly)
396
398 """
399 adds default productions if not already present, used by
400 _parse only
401
402 each production should return the next expected token
403 normaly a name like "uri" or "EOF"
404 some have no expectation like S or COMMENT, so simply return
405 the current value of self.__expected
406 """
407 def ATKEYWORD(expected, seq, token, tokenizer=None):
408 "default impl for unexpected @rule"
409 if expected != 'EOF':
410
411 rule = cssutils.css.CSSUnknownRule()
412 rule.cssText = self._tokensupto2(tokenizer, token)
413 if rule.wellformed:
414 seq.append(rule, cssutils.css.CSSRule.UNKNOWN_RULE,
415 line=token[2], col=token[3])
416 return expected
417 else:
418 new['wellformed'] = False
419 self._log.error(u'Expected EOF.', token=token)
420 return expected
421
422 def COMMENT(expected, seq, token, tokenizer=None):
423 "default impl, adds CSSCommentRule if not token == EOF"
424 if expected == 'EOF':
425 new['wellformed'] = False
426 self._log.error(u'Expected EOF but found comment.', token=token)
427 seq.append(cssutils.css.CSSComment([token]), 'COMMENT')
428 return expected
429
430 def S(expected, seq, token, tokenizer=None):
431 "default impl, does nothing if not token == EOF"
432 if expected == 'EOF':
433 new['wellformed'] = False
434 self._log.error(u'Expected EOF but found whitespace.', token=token)
435 return expected
436
437 def EOF(expected=None, seq=None, token=None, tokenizer=None):
438 "default implementation for EOF token"
439 return 'EOF'
440
441 defaultproductions = {'ATKEYWORD': ATKEYWORD,
442 'COMMENT': COMMENT,
443 'S': S,
444 'EOF': EOF
445 }
446 defaultproductions.update(productions)
447 return defaultproductions
448
449
450 -class Seq(object):
451 """
452 property seq of Base2 inheriting classes, holds a list of Item objects.
453
454 used only by Selector for now
455
456 is normally readonly, only writable during parsing
457 """
459 """
460 only way to write to a Seq is to initialize it with new items
461 each itemtuple has (value, type, line) where line is optional
462 """
463 self._seq = []
464 self._readonly = readonly
465
468
471
474
476 return iter(self._seq)
477
479 return len(self._seq)
480
def append(self, val, typ, line=None, col=None):
    """
    wraps val, typ, line and col in a new Item() which is added to the
    end of the seq

    raises AttributeError if the seq is readonly
    """
    if not self._readonly:
        self._seq.append(Item(val, typ, line, col))
        return
    raise AttributeError('Seq is readonly.')
487
489 "if not readonly add item which must be an Item"
490 if self._readonly:
491 raise AttributeError('Seq is readonly.')
492 else:
493 self._seq.append(item)
494
def replace(self, index=-1, val=None, typ=None, line=None, col=None):
    """
    puts a new Item() built from val, typ, line and col at position
    index (default is the last position) in place of the current one

    raises AttributeError if the seq is readonly
    """
    if not self._readonly:
        self._seq[index] = Item(val, typ, line, col)
        return
    raise AttributeError('Seq is readonly.')
504
506 "returns a repr same as a list of tuples of (value, type)"
507 return u'cssutils.%s.%s([\n %s])' % (self.__module__,
508 self.__class__.__name__,
509 u',\n '.join([u'(%r, %r)' % (item.type, item.value)
510 for item in self._seq]
511 ))
513 return "<cssutils.%s.%s object length=%r at 0x%x>" % (
514 self.__module__, self.__class__.__name__, len(self), id(self))
515
517 """
518 an item in the seq list of classes (successor to tuple items in old seq)
519
520 each item has attributes:
521
522 type
523 a sematic type like "element", "attribute"
524 value
525 the actual value which may be a string, number etc or an instance
526 of e.g. a CSSComment
527 *line*
528 **NOT IMPLEMENTED YET, may contain the line in the source later**
529 """
530 - def __init__(self, value, type, line=None, col=None):
531 self.__value = value
532 self.__type = type
533 self.__line = line
534 self.__col = col
535
536 type = property(lambda self: self.__type)
537 value = property(lambda self: self.__value)
538 line = property(lambda self: self.__line)
539 col = property(lambda self: self.__col)
540
542 return "%s.%s(value=%r, type=%r, line=%r, col=%r)" % (
543 self.__module__, self.__class__.__name__,
544 self.__value, self.__type, self.__line, self.__col)
545
548 """
549 (EXPERIMENTAL)
550 A base class used for list classes like css.SelectorList or
551 stylesheets.MediaList
552
553 adds list like behaviour running on inhering class' property ``seq``
554
555 - item in x => bool
556 - len(x) => integer
557 - get, set and del x[i]
558 - for item in x
559 - append(item)
560
561 some methods must be overwritten in inheriting class
562 """
565
568
571
573 return self.seq[index]
574
576 def gen():
577 for x in self.seq:
578 yield x
579 return gen()
580
583
585 "must be overwritten"
586 raise NotImplementedError
587
589 "must be overwritten"
590 raise NotImplementedError
591
594 """
595 A dictionary like wrapper for @namespace rules used in a CSSStyleSheet.
596 Works on effective namespaces, so e.g. if::
597
598 @namespace p1 "uri";
599 @namespace p2 "uri";
600
601 only the second rule is effective and kept.
602
603 namespaces
604 a dictionary {prefix: namespaceURI} containing the effective namespaces
605 only. These are the latest set in the CSSStyleSheet.
606 parentStyleSheet
607 the parent CSSStyleSheet
608 """
609 - def __init__(self, parentStyleSheet, *args):
612
615
631
633 try:
634 return self.namespaces[prefix]
635 except KeyError, e:
636 raise xml.dom.NamespaceErr('Prefix %r not found.' % prefix)
637
640
643
659
666
674
675 namespaces = property(__getNamespaces,
676 doc=u'Holds only effective @namespace rules in self.parentStyleSheets'
677 '@namespace rules.')
678
679 - def get(self, prefix, default):
681
684
687
690
692 """
693 returns effective prefix for given namespaceURI or raises IndexError
694 if this cannot be found"""
695 for prefix, uri in self.namespaces.items():
696 if uri == namespaceURI:
697 return prefix
698 raise IndexError(u'NamespaceURI %r not found.' % namespaceURI)
699
701 return u"<cssutils.util.%s object parentStyleSheet=%r at 0x%x>" % (
702 self.__class__.__name__, str(self.parentStyleSheet), id(self))
703
706 """
707 namespaces used in objects like Selector as long as they are not connected
708 to a CSSStyleSheet
709 """
711 self.__namespaces = dict(*args)
712
715
716 namespaces = property(lambda self: self.__namespaces,
717 doc=u'Dict Wrapper for self.sheets @namespace rules.')
718
720 return u"<cssutils.util.%s object namespaces=%r at 0x%x>" % (
721 self.__class__.__name__, self.namespaces, id(self))
722
724 return u"cssutils.util.%s(%r)" % (self.__class__.__name__,
725 self.namespaces)
726
729 """Retrieve data from ``url``. cssutils default implementation of fetch
730 URL function.
731
732 Returns ``(encoding, string)`` or ``None``
733 """
734 try:
735 res = urllib2.urlopen(url)
736 except WindowsError, e:
737
738 cssutils.log.warn(e, error=WindowsError)
739 except (WindowsError, ValueError), e:
740
741 cssutils.log.warn(u'ValueError, %s' % e.message, error=ValueError)
742 except urllib2.HTTPError, e:
743
744 cssutils.log.warn(u'HTTPError opening url=%r: %s %s' %
745 (url, e.code, e.msg), error=e)
746 except urllib2.URLError, e:
747
748 cssutils.log.warn(u'URLError, %s' % e.reason, error=e)
749 else:
750 if res:
751 mimeType, encoding = encutils.getHTTPInfo(res)
752 if mimeType != u'text/css':
753 cssutils.log.error(u'Expected "text/css" mime type for url=%s but found: %r' %
754 (url, mimeType), error=ValueError)
755 return encoding, res.read()
756
757 -def _readUrl(url, fetcher=None, overrideEncoding=None, parentEncoding=None):
758 """
759 Read cssText from url and decode it using all relevant methods (HTTP
760 header, BOM, @charset). Returns encoding (which is needed to set encoding
761 of stylesheet properly) and decoded text
762
763 ``fetcher``
764 see cssutils.registerFetchUrl for details
765 ``overrideEncoding``
766 If given this encoding is used and all other encoding information is
767 ignored (HTTP, BOM etc)
768 ``parentEncoding``
769 Encoding of parent stylesheet (while e.g. reading @import references sheets)
770 or document if available.
771
772 Priority or encoding information
773 --------------------------------
774 **cssutils only**: overrideEncoding
775
776 1. An HTTP "charset" parameter in a "Content-Type" field (or similar parameters in other protocols)
777 2. BOM and/or @charset (see below)
778 3. <link charset=""> or other metadata from the linking mechanism (if any)
779 4. charset of referring style sheet or document (if any)
780 5. Assume UTF-8
781
782 """
783 if not fetcher:
784 fetcher = _defaultFetcher
785 r = fetcher(url)
786 if r and len(r) == 2 and r[1] is not None:
787 httpEncoding, content = r
788 UTF8_BOM = u'\xEF\xBB\xBF'
789
790 if overrideEncoding:
791
792 encoding = overrideEncoding
793 elif httpEncoding:
794
795 encoding = httpEncoding
796 else:
797 try:
798 if content.startswith(u'@charset "utf-8";') or \
799 content.startswith(UTF8_BOM + u'@charset "utf-8";'):
800
801 contentEncoding = 'utf-8'
802 else:
803
804 contentEncoding = False
805 except UnicodeDecodeError, e:
806
807 contentEncoding = False
808
809 if contentEncoding:
810 encoding = contentEncoding
811 else:
812
813 (contentEncoding, explicit) = cssutils.codec.detectencoding_str(content)
814
815 if contentEncoding and explicit:
816
817 encoding = contentEncoding
818 else:
819
820
821 encoding = parentEncoding
822 try:
823
824 decodedContent = codecs.lookup("css")[1](content, encoding=encoding)[0]
825 except UnicodeDecodeError, e:
826 cssutils.log.warn(e, neverraise=True)
827 decodedContent = None
828
829 return encoding, decodedContent
830 else:
831 return None, None
832