Package cssutils :: Module codec
[hide private]
[frames | no frames]

Source Code for Module cssutils.codec

  1  #!/usr/bin/env python 
  2   
  3  """Python codec for CSS.""" 
  4  __docformat__ = 'restructuredtext' 
  5  __author__ = 'Walter Doerwald' 
  6  __version__ = '$Id: util.py 1114 2008-03-05 13:22:59Z cthedot $' 
  7   
  8  import codecs, marshal 
  9   
 10   
 11  # We're using bits to store all possible candidate encodings (or variants, i.e. 
 12  # we have two bits for the variants of UTF-16 and two for the 
 13  # variants of UTF-32). 
 14  # 
 15  # Prefixes for various CSS encodings 
 16  # UTF-8-SIG   xEF  xBB  xBF 
 17  # UTF-16 (LE) xFF  xFE ~x00|~x00 
 18  # UTF-16 (BE) xFE  xFF 
 19  # UTF-16-LE    @   x00   @   x00 
 20  # UTF-16-BE   x00   @ 
 21  # UTF-32 (LE) xFF  xFE  x00  x00 
 22  # UTF-32 (BE) x00  x00  xFE  xFF 
 23  # UTF-32-LE    @   x00  x00  x00 
 24  # UTF-32-BE   x00  x00  x00   @ 
 25  # CHARSET      @    c    h    a  ... 
 26   
 27   
 28   
def detectencoding_str(input, final=False):
    """
    Detect the encoding of the byte string ``input``, which contains the
    beginning of a CSS file.

    Up to four leading bytes are compared with the known byte order marks
    and with the byte patterns a leading ``@`` / ``@cha`` produces in the
    BOM-less UTF-16/UTF-32 variants and in ASCII compatible encodings. If
    ``input`` starts with ``@charset "``, the encoding name is read from
    that rule.

    :param input: the bytes seen so far from the start of the CSS file.
    :param final: true if no more data will be available in later calls.
    :returns: a tuple ``(encoding, explicit)``. ``encoding`` is the detected
        encoding name, or ``None`` if there isn't enough data yet.
        ``explicit`` is true if the encoding was detected explicitly, i.e.
        the bytes start with a BOM or a charset rule. If ``final`` is true,
        ``None`` is never returned as the encoding.
    """

    # A bit for every candidate
    CANDIDATE_UTF_8_SIG    =   1
    CANDIDATE_UTF_16_AS_LE =   2
    CANDIDATE_UTF_16_AS_BE =   4
    CANDIDATE_UTF_16_LE    =   8
    CANDIDATE_UTF_16_BE    =  16
    CANDIDATE_UTF_32_AS_LE =  32
    CANDIDATE_UTF_32_AS_BE =  64
    CANDIDATE_UTF_32_LE    = 128
    CANDIDATE_UTF_32_BE    = 256
    CANDIDATE_CHARSET      = 512

    candidates = 1023 # all candidates

    # Every byte that doesn't match what a candidate requires at that
    # position clears the candidate's bit.
    li = len(input)
    if li >= 1:
        # Check first byte
        c = input[0]
        if c != "\xef":
            candidates &= ~CANDIDATE_UTF_8_SIG
        if c != "\xff":
            candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_16_AS_LE)
        if c != "\xfe":
            candidates &= ~CANDIDATE_UTF_16_AS_BE
        if c != "@":
            candidates &= ~(CANDIDATE_UTF_32_LE|CANDIDATE_UTF_16_LE|CANDIDATE_CHARSET)
        if c != "\x00":
            candidates &= ~(CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_BE|CANDIDATE_UTF_16_BE)
        if li >= 2:
            # Check second byte
            c = input[1]
            if c != "\xbb":
                candidates &= ~CANDIDATE_UTF_8_SIG
            if c != "\xfe":
                candidates &= ~(CANDIDATE_UTF_16_AS_LE|CANDIDATE_UTF_32_AS_LE)
            if c != "\xff":
                candidates &= ~CANDIDATE_UTF_16_AS_BE
            if c != "\x00":
                candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE)
            if c != "@":
                candidates &= ~CANDIDATE_UTF_16_BE
            if c != "c":
                candidates &= ~CANDIDATE_CHARSET
            if li >= 3:
                # Check third byte
                c = input[2]
                if c != "\xbf":
                    candidates &= ~CANDIDATE_UTF_8_SIG
                if c != "c":
                    candidates &= ~CANDIDATE_UTF_16_LE
                if c != "\x00":
                    candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE)
                if c != "\xfe":
                    candidates &= ~CANDIDATE_UTF_32_AS_BE
                if c != "h":
                    candidates &= ~CANDIDATE_CHARSET
                if li >= 4:
                    # Check fourth byte
                    c = input[3]
                    # FF FE followed by 00 00 is the UTF-32 BOM, not UTF-16
                    if input[2:4] == "\x00\x00":
                        candidates &= ~CANDIDATE_UTF_16_AS_LE
                    if c != "\x00":
                        candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE)
                    if c != "\xff":
                        candidates &= ~CANDIDATE_UTF_32_AS_BE
                    if c != "@":
                        candidates &= ~CANDIDATE_UTF_32_BE
                    if c != "a":
                        candidates &= ~CANDIDATE_CHARSET
    if candidates == 0:
        return ("utf-8", False)
    if not (candidates & (candidates-1)): # only one candidate remaining
        if candidates == CANDIDATE_UTF_8_SIG and li >= 3:
            return ("utf-8-sig", True)
        elif candidates == CANDIDATE_UTF_16_AS_LE and li >= 2:
            return ("utf-16", True)
        elif candidates == CANDIDATE_UTF_16_AS_BE and li >= 2:
            return ("utf-16", True)
        elif candidates == CANDIDATE_UTF_16_LE and li >= 4:
            return ("utf-16-le", False)
        elif candidates == CANDIDATE_UTF_16_BE and li >= 2:
            return ("utf-16-be", False)
        elif candidates == CANDIDATE_UTF_32_AS_LE and li >= 4:
            return ("utf-32", True)
        elif candidates == CANDIDATE_UTF_32_AS_BE and li >= 4:
            return ("utf-32", True)
        elif candidates == CANDIDATE_UTF_32_LE and li >= 4:
            return ("utf-32-le", False)
        elif candidates == CANDIDATE_UTF_32_BE and li >= 4:
            return ("utf-32-be", False)
        elif candidates == CANDIDATE_CHARSET and li >= 4:
            prefix = '@charset "'
            if input[:len(prefix)] == prefix:
                pos = input.find('"', len(prefix))
                if pos >= 0:
                    return (input[len(prefix):pos], True)
    if final:
        # FF FE with fewer than four bytes is ambiguous between the UTF-16
        # and UTF-32 little endian BOMs. At the end of the input it can no
        # longer be followed by 00 00, so it must be the UTF-16 BOM.
        if candidates == (CANDIDATE_UTF_16_AS_LE|CANDIDATE_UTF_32_AS_LE) and li >= 2:
            return ("utf-16", True)
        # if this is the last call, and we haven't determined an encoding
        # yet, we default to UTF-8
        return ("utf-8", False)
    return (None, False) # don't know yet
146 147
def detectencoding_unicode(input, final=False):
    """
    Detect the encoding of the unicode string ``input``, which contains the
    beginning of a CSS file. The encoding is detected from the charset rule
    at the beginning of ``input``. If there is no charset rule, ``"utf-8"``
    will be returned.

    :param input: the text seen so far from the start of the CSS file.
    :param final: true if no more data will be available in later calls.
    :returns: a tuple ``(encoding, explicit)``. ``explicit`` is true if the
        name was read from a charset rule. ``encoding`` is ``None`` if the
        encoding can't be detected yet; if ``final`` is true it is never
        ``None``.
    """
    prefix = u'@charset "'
    if input.startswith(prefix):
        pos = input.find(u'"', len(prefix))
        if pos >= 0:
            return (input[len(prefix):pos], True)
        # the encoding name isn't terminated (yet) => decided by ``final``
    elif not prefix.startswith(input):
        # the string definitely doesn't start with prefix
        return ("utf-8", False)
    if final:
        # This is the last call and no complete charset rule was seen
        # (including a rule whose closing quote never arrived), so we
        # default to UTF-8 instead of reporting "don't know".
        return ("utf-8", False)
    return (None, False) # don't know yet
169 170
171 -def _fixencoding(input, encoding, final=False):
172 """ 173 Replace the name of the encoding in the charset rule at the beginning of 174 ``input`` with ``encoding``. If ``input`` doesn't starts with a charset 175 rule, ``input`` will be returned unmodified. 176 177 If the encoding can't be found yet, ``None`` is returned. ``final`` 178 specifies whether more data will be available in later calls or not. 179 If ``final`` is true, ``_fixencoding()`` will never return ``None``. 180 """ 181 prefix = u'@charset "' 182 if len(input) > len(prefix): 183 if input.startswith(prefix): 184 pos = input.find(u'"', len(prefix)) 185 if pos >= 0: 186 if encoding.replace("_", "-").lower() == "utf-8-sig": 187 encoding = u"utf-8" 188 return prefix + encoding + input[pos:] 189 # we haven't seen the end of the encoding name yet => fall through 190 else: 191 return input # doesn't start with prefix, so nothing to fix 192 elif not prefix.startswith(input) or final: 193 # can't turn out to be a @charset rule later (or there is no "later") 194 return input 195 if final: 196 return input 197 return None # don't know yet
198 199
def decode(input, errors="strict", encoding=None, force=True):
    """
    Stateless decoder of the ``css`` codec.

    ``input`` is treated as the complete CSS file. If ``encoding`` is
    ``None``, the encoding is detected from ``input``. If ``force`` is
    false, an explicitly detected encoding (BOM or charset rule) overrides
    the given ``encoding``. The charset rule of the decoded text is
    rewritten to name the encoding that was used.

    Returns ``(text, consumed)``. Raises ``ValueError`` if the detected
    encoding name is ``"css"``.
    """
    must_detect = encoding is None or not force
    if must_detect:
        detected, explicit = detectencoding_str(input, True)
        if detected == "css":
            raise ValueError("css not allowed as encoding name")
        # Take the encoding from the input
        if encoding is None or (explicit and not force):
            encoding = detected
    decoder = codecs.getdecoder(encoding)
    text, consumed = decoder(input, errors)
    fixed = _fixencoding(text, unicode(encoding), True)
    return (fixed, consumed)
209 210
def encode(input, errors="strict", encoding=None):
    """
    Stateless encoder of the ``css`` codec.

    ``input`` is treated as the complete CSS text. If ``encoding`` is given,
    the charset rule in ``input`` is rewritten to name it; otherwise the
    encoding is taken from the charset rule (defaulting to UTF-8), and a
    ``utf-8-sig`` rule is rewritten to ``utf-8``.

    Returns ``(bytes, consumed)`` where ``consumed`` is the length of the
    original ``input``. Raises ``ValueError`` if the encoding name is
    ``"css"``.
    """
    length = len(input)
    if encoding is not None:
        input = _fixencoding(input, unicode(encoding), True)
    else:
        encoding = detectencoding_unicode(input, True)[0]
        normalized = encoding.replace("_", "-").lower()
        if normalized == "utf-8-sig":
            input = _fixencoding(input, u"utf-8", True)
    if encoding == "css":
        raise ValueError("css not allowed as encoding name")
    encoded = codecs.getencoder(encoding)(input, errors)[0]
    return (encoded, length)
223 224
225 -def _bytes2int(bytes):
226 # Helper: convert an 8 bit string into an ``int``. 227 i = 0 228 for byte in bytes: 229 i = (i<<8) + ord(byte) 230 return i
231 232
233 -def _int2bytes(i):
234 # Helper: convert an ``int`` into an 8-bit string. 235 v = [] 236 while i: 237 v.insert(0, chr(i&0xff)) 238 i >>= 8 239 return "".join(v)
if hasattr(codecs, "IncrementalDecoder"):
    class IncrementalDecoder(codecs.IncrementalDecoder):
        """
        Incremental decoder of the ``css`` codec.

        Input is buffered until the encoding is known (given, or detected
        from the input). From then on the work is delegated to the
        incremental decoder of that encoding. Decoded output is held back
        until the charset rule at its start has been rewritten to name the
        encoding that was used.
        """

        def __init__(self, errors="strict", encoding=None, force=True):
            """
            :param errors: error handling scheme passed to the real decoder.
            :param encoding: encoding to use, or ``None`` to detect it.
            :param force: if false, an explicitly detected encoding (BOM or
                charset rule) overrides ``encoding``.
            """
            # ``decoder`` must exist before the base constructor runs,
            # because that assigns ``self.errors`` (see ``_seterrors``).
            self.decoder = None
            self.encoding = encoding
            self.force = force
            codecs.IncrementalDecoder.__init__(self, errors)
            # Store ``errors`` somewhere else,
            # because we have to hide it in a property
            self._errors = errors
            self.buffer = ""
            self.headerfixed = False

        def iterdecode(self, input):
            """
            Decode the chunks of the iterable ``input`` and yield every
            non-empty result, finishing with a final, empty chunk.
            """
            for part in input:
                result = self.decode(part, False)
                if result:
                    yield result
            result = self.decode("", True)
            if result:
                yield result

        def decode(self, input, final=False):
            """
            Decode ``input`` and return the text that can be produced so
            far (``u""`` while the encoding or the charset rule is still
            undecided). Raises ``ValueError`` if the detected encoding name
            is ``"css"``.
            """
            # We're doing basically the same as a ``BufferedIncrementalDecoder``,
            # but since the buffer is only relevant until the encoding has been
            # detected (in which case the buffer of the underlying codec might
            # kick in), we're implementing buffering ourselves to avoid some
            # overhead.
            if self.decoder is None:
                input = self.buffer + input
                # Do we have to detect the encoding from the input?
                if self.encoding is None or not self.force:
                    (encoding, explicit) = detectencoding_str(input, final)
                    if encoding is None: # no encoding determined yet
                        self.buffer = input # retry the complete input on the next call
                        return u"" # no encoding determined yet, so no output
                    elif encoding == "css":
                        raise ValueError("css not allowed as encoding name")
                    if (explicit and not self.force) or self.encoding is None: # Take the encoding from the input
                        self.encoding = encoding
                self.buffer = "" # drop buffer, as the decoder might keep its own
                decoder = codecs.getincrementaldecoder(self.encoding)
                self.decoder = decoder(self._errors)
            if self.headerfixed:
                return self.decoder.decode(input, final)
            # If we haven't fixed the header yet,
            # the content of ``self.buffer`` is a ``unicode`` object
            output = self.buffer + self.decoder.decode(input, final)
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newoutput = _fixencoding(output, unicode(encoding), final)
            if newoutput is None:
                # retry fixing the @charset rule (but keep the decoded stuff)
                self.buffer = output
                return u""
            self.headerfixed = True
            return newoutput

        def reset(self):
            """Forget the real decoder and all buffered data."""
            codecs.IncrementalDecoder.reset(self)
            self.decoder = None
            self.buffer = ""
            self.headerfixed = False

        def _geterrors(self):
            return self._errors

        def _seterrors(self, errors):
            # Setting ``errors`` must be done on the real decoder too
            if self.decoder is not None:
                self.decoder.errors = errors
            self._errors = errors
        errors = property(_geterrors, _seterrors)

        def getstate(self):
            """
            Return ``("", number)`` where ``number`` is the marshalled
            tuple ``(encoding, buffer, headerfixed, hasdecoder,
            decoderstate)`` packed into an ``int``.
            """
            if self.decoder is not None:
                state = (self.encoding, self.buffer, self.headerfixed, True, self.decoder.getstate())
            else:
                state = (self.encoding, self.buffer, self.headerfixed, False, None)
            return ("", _bytes2int(marshal.dumps(state)))

        def setstate(self, state):
            """
            Restore a state returned by ``getstate()``. Only pass values
            produced by ``getstate()``: the number is unmarshalled.
            """
            # Undo ``getstate()`` in reverse order: first turn the number
            # back into the marshalled string, then unmarshal that.
            state = marshal.loads(_int2bytes(state[1])) # ignore buffered input
            self.encoding = state[0]
            self.buffer = state[1]
            self.headerfixed = state[2]
            # ``state[3]`` is the boolean "a real decoder existed" flag
            if state[3]:
                self.decoder = codecs.getincrementaldecoder(self.encoding)(self._errors)
                self.decoder.setstate(state[4])
            else:
                self.decoder = None
if hasattr(codecs, "IncrementalEncoder"):
    class IncrementalEncoder(codecs.IncrementalEncoder):
        """
        Incremental encoder of the ``css`` codec.

        Input is buffered until the encoding is known (given, or read from
        the charset rule at the start of the text) and the charset rule has
        been rewritten where needed. From then on the work is delegated to
        the incremental encoder of that encoding.
        """

        def __init__(self, errors="strict", encoding=None):
            """
            :param errors: error handling scheme passed to the real encoder.
            :param encoding: encoding to use, or ``None`` to take it from
                the charset rule of the text.
            """
            # ``encoder`` must exist before the base constructor runs,
            # because that assigns ``self.errors`` (see ``_seterrors``).
            self.encoder = None
            self.encoding = encoding
            codecs.IncrementalEncoder.__init__(self, errors)
            # Store ``errors`` somewhere else,
            # because we have to hide it in a property
            self._errors = errors
            self.buffer = u""

        def iterencode(self, input):
            """
            Encode the chunks of the iterable ``input`` and yield every
            non-empty result, finishing with a final, empty chunk.
            """
            for part in input:
                result = self.encode(part, False)
                if result:
                    yield result
            result = self.encode(u"", True)
            if result:
                yield result

        def encode(self, input, final=False):
            """
            Encode ``input`` and return the bytes that can be produced so
            far (``""`` while the encoding is still undecided). Raises
            ``ValueError`` if the encoding name is ``"css"``.
            """
            if self.encoder is None:
                input = self.buffer + input
                if self.encoding is not None:
                    # Replace encoding in the @charset rule with the specified one
                    encoding = self.encoding
                    if encoding.replace("_", "-").lower() == "utf-8-sig":
                        encoding = "utf-8"
                    newinput = _fixencoding(input, unicode(encoding), final)
                    if newinput is None: # @charset rule incomplete => Retry next time
                        self.buffer = input
                        return ""
                    input = newinput
                else:
                    # Use encoding from the @charset declaration
                    self.encoding = detectencoding_unicode(input, final)[0]
                if self.encoding is not None:
                    if self.encoding == "css":
                        raise ValueError("css not allowed as encoding name")
                    info = codecs.lookup(self.encoding)
                    if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                        input = _fixencoding(input, u"utf-8", True)
                    self.encoder = info.incrementalencoder(self._errors)
                    self.buffer = u""
                else:
                    self.buffer = input
                    return ""
            return self.encoder.encode(input, final)

        def reset(self):
            """Forget the real encoder and all buffered data."""
            codecs.IncrementalEncoder.reset(self)
            self.encoder = None
            self.buffer = u""

        def _geterrors(self):
            return self._errors

        def _seterrors(self, errors):
            # Setting ``errors`` must be done on the real encoder too
            if self.encoder is not None:
                self.encoder.errors = errors
            self._errors = errors
        errors = property(_geterrors, _seterrors)

        def getstate(self):
            """
            Return the marshalled tuple ``(encoding, buffer, hasencoder,
            encoderstate)`` packed into an ``int``.
            """
            if self.encoder is not None:
                state = (self.encoding, self.buffer, True, self.encoder.getstate())
            else:
                state = (self.encoding, self.buffer, False, None)
            return _bytes2int(marshal.dumps(state))

        def setstate(self, state):
            """
            Restore a state returned by ``getstate()``. Only pass values
            produced by ``getstate()``: the number is unmarshalled.
            """
            # Undo ``getstate()`` in reverse order: first turn the number
            # back into the marshalled string, then unmarshal that.
            state = marshal.loads(_int2bytes(state))
            self.encoding = state[0]
            self.buffer = state[1]
            # ``state[2]`` is the boolean "a real encoder existed" flag and
            # ``state[3]`` the state of that encoder (the tuple has 4 items)
            if state[2]:
                self.encoder = codecs.getincrementalencoder(self.encoding)(self._errors)
                self.encoder.setstate(state[3])
            else:
                self.encoder = None
417 418
class StreamWriter(codecs.StreamWriter):
    """
    Stream writer of the ``css`` codec.

    Text is buffered until the encoding is known (given, or read from the
    charset rule at the start of the text) and the charset rule has been
    rewritten where needed. From then on encoding is delegated to the
    stream writer of that encoding.
    """

    def __init__(self, stream, errors="strict", encoding=None, header=False):
        """
        :param stream: the stream passed on to the real stream writer.
        :param errors: error handling scheme.
        :param encoding: encoding to use, or ``None`` to take it from the
            charset rule of the text.
        :param header: accepted, but not used by this class.
        """
        # These must exist before the base constructor runs: it assigns
        # ``self.errors``, and the property setter below reads
        # ``self.streamwriter``. Otherwise the lookup would fall through to
        # ``codecs.StreamWriter.__getattr__`` and fail on the stream.
        self.streamwriter = None
        self._errors = errors
        codecs.StreamWriter.__init__(self, stream, errors)
        self.encoding = encoding
        self.buffer = u""

    def encode(self, input, errors='strict'):
        """
        Encode ``input`` and return ``(bytes, consumed)``; ``("", 0)`` is
        returned while the encoding is still undecided. Raises
        ``ValueError`` if the encoding name is ``"css"``.
        """
        li = len(input)
        if self.streamwriter is None:
            input = self.buffer + input
            li = len(input)
            if self.encoding is not None:
                # Replace encoding in the @charset rule with the specified one
                encoding = self.encoding
                if encoding.replace("_", "-").lower() == "utf-8-sig":
                    encoding = "utf-8"
                newinput = _fixencoding(input, unicode(encoding), False)
                if newinput is None: # @charset rule incomplete => Retry next time
                    self.buffer = input
                    return ("", 0)
                input = newinput
            else:
                # Use encoding from the @charset declaration
                self.encoding = detectencoding_unicode(input, False)[0]
            if self.encoding is not None:
                if self.encoding == "css":
                    raise ValueError("css not allowed as encoding name")
                self.streamwriter = codecs.getwriter(self.encoding)(self.stream, self._errors)
                if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                    input = _fixencoding(input, u"utf-8", True)
                self.buffer = u""
            else:
                self.buffer = input
                return ("", 0)
        return (self.streamwriter.encode(input, errors)[0], li)

    def _geterrors(self):
        return self._errors

    def _seterrors(self, errors):
        # Setting ``errors`` must be done on the streamwriter too
        if self.streamwriter is not None:
            self.streamwriter.errors = errors
        self._errors = errors
    errors = property(_geterrors, _seterrors)
467 468
class StreamReader(codecs.StreamReader):
    """
    Stream reader of the ``css`` codec.

    Nothing is decoded until the encoding is known (given, or detected from
    the input) and the charset rule at the start of the decoded text has
    been rewritten to name the encoding that was used. From then on
    decoding is delegated to the stream reader of that encoding.
    """

    def __init__(self, stream, errors="strict", encoding=None, force=True):
        """
        :param stream: the stream passed on to the real stream reader.
        :param errors: error handling scheme.
        :param encoding: encoding to use, or ``None`` to detect it.
        :param force: if false, an explicitly detected encoding (BOM or
            charset rule) overrides ``encoding``.
        """
        # These must exist before the base constructor runs: it assigns
        # ``self.errors``, and the property setter below reads
        # ``self.streamreader``. Otherwise the lookup would fall through to
        # ``codecs.StreamReader.__getattr__`` and fail on the stream.
        self.streamreader = None
        self._errors = errors
        codecs.StreamReader.__init__(self, stream, errors)
        self.encoding = encoding
        self.force = force

    def decode(self, input, errors='strict'):
        """
        Decode ``input`` and return ``(text, consumed)``; ``(u"", 0)`` is
        returned while the encoding or the charset rule is still undecided.
        Raises ``ValueError`` if the detected encoding name is ``"css"``.
        """
        if self.streamreader is None:
            if self.encoding is None or not self.force:
                (encoding, explicit) = detectencoding_str(input, False)
                if encoding is None: # no encoding determined yet
                    return (u"", 0) # no encoding determined yet, so no output
                elif encoding == "css":
                    raise ValueError("css not allowed as encoding name")
                if (explicit and not self.force) or self.encoding is None: # Take the encoding from the input
                    self.encoding = encoding
            streamreader = codecs.getreader(self.encoding)
            streamreader = streamreader(self.stream, self._errors)
            (output, consumed) = streamreader.decode(input, errors)
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newoutput = _fixencoding(output, unicode(encoding), False)
            if newoutput is not None:
                # only keep the reader once the header has been fixed
                self.streamreader = streamreader
                return (newoutput, consumed)
            return (u"", 0) # we will create a new streamreader on the next call
        return self.streamreader.decode(input, errors)

    def _geterrors(self):
        return self._errors

    def _seterrors(self, errors):
        # Setting ``errors`` must be done on the streamreader too
        if self.streamreader is not None:
            self.streamreader.errors = errors
        self._errors = errors
    errors = property(_geterrors, _seterrors)
if hasattr(codecs, "CodecInfo"):
    # Python 2.5 or better: a codec is described by a ``CodecInfo`` object
    def search_function(name):
        """Codec search function: knows the ``css`` codec only."""
        if name != "css":
            return None
        return codecs.CodecInfo(
            name="css",
            encode=encode,
            decode=decode,
            incrementalencoder=IncrementalEncoder,
            incrementaldecoder=IncrementalDecoder,
            streamwriter=StreamWriter,
            streamreader=StreamReader,
        )
else:
    # Python 2.4: the utf-8-sig codec has to be defined here as well
    def utf8sig_encode(input, errors='strict'):
        """Encode ``input`` as UTF-8 with a leading BOM."""
        payload = codecs.utf_8_encode(input, errors)[0]
        return (codecs.BOM_UTF8 + payload, len(input))

    def utf8sig_decode(input, errors='strict'):
        """Decode UTF-8 ``input``, skipping a leading BOM if present."""
        skipped = 0
        if input[:3] == codecs.BOM_UTF8:
            skipped = 3
            input = input[3:]
        (text, used) = codecs.utf_8_decode(input, errors, True)
        return (text, used + skipped)

    class UTF8SigStreamWriter(codecs.StreamWriter):
        """Stream writer that emits the BOM with the first chunk only."""

        def reset(self):
            codecs.StreamWriter.reset(self)
            # drop the per-instance override, so the BOM is written again
            try:
                del self.encode
            except AttributeError:
                pass

        def encode(self, input, errors='strict'):
            # all later calls on this instance go straight to plain UTF-8
            self.encode = codecs.utf_8_encode
            return utf8sig_encode(input, errors)

    class UTF8SigStreamReader(codecs.StreamReader):
        """Stream reader that skips a BOM at the start of the data."""

        def reset(self):
            codecs.StreamReader.reset(self)
            # drop the per-instance override, so a BOM is looked for again
            try:
                del self.decode
            except AttributeError:
                pass

        def decode(self, input, errors='strict'):
            if len(input) < 3 and codecs.BOM_UTF8.startswith(input):
                # not enough data to decide if this is a BOM
                # => try again on the next call
                return (u"", 0)
            # all later calls on this instance go straight to plain UTF-8
            self.decode = codecs.utf_8_decode
            return utf8sig_decode(input, errors)

    def search_function(name):
        """Codec search function: knows ``css`` and ``utf_8_sig``."""
        import encodings
        canonical = encodings.normalize_encoding(name)
        if canonical == "utf_8_sig":
            return (utf8sig_encode, utf8sig_decode, UTF8SigStreamReader, UTF8SigStreamWriter)
        if canonical == "css":
            return (encode, decode, StreamReader, StreamWriter)
        return None


codecs.register(search_function)


# Error handler for CSS escaping

def cssescape(exc):
    """
    Codec error handler that replaces unencodable characters with CSS
    escapes: a backslash followed by the code point as six hex digits.

    Returns ``(replacement, position to continue at)``. Raises ``TypeError``
    for anything but a ``UnicodeEncodeError``.
    """
    if isinstance(exc, UnicodeEncodeError):
        offending = exc.object[exc.start:exc.end]
        escaped = [u"\\%06x" % ord(char) for char in offending]
        return (u"".join(escaped), exc.end)
    raise TypeError("don't know how to handle %r" % exc)

codecs.register_error("cssescape", cssescape)
