Package cssutils :: Module codec
[hide private]
[frames | no frames]

Source Code for Module cssutils.codec

  1  #!/usr/bin/env python 
  2   
  3  """ 
  4  Python codec for CSS. 
  5  """ 
  6   
  7  __docformat__ = 'restructuredtext' 
  8  __author__ = '$LastChangedBy: doerwalter $' 
  9  __date__ = '$LastChangedDate: 2008-01-29 15:03:50 +0100 (Di, 29 Jan 2008) $' 
 10  __version__ = '$LastChangedRevision: 958 $' 
 11   
 12   
 13  import codecs, marshal 
 14   
 15   
 16  # We're using bits to store all possible candidate encodings (or variants, i.e. 
 17  # we have two bits for the variants of UTF-16 and two for the 
 18  # variants of UTF-32). 
 19  # 
 20  # Prefixes for various CSS encodings 
 21  # UTF-8-SIG   xEF  xBB  xBF 
 22  # UTF-16 (LE) xFF  xFE ~x00|~x00 
 23  # UTF-16 (BE) xFE  xFF 
 24  # UTF-16-LE    @   x00   @   x00 
 25  # UTF-16-BE   x00   @ 
 26  # UTF-32 (LE) xFF  xFE  x00  x00 
 27  # UTF-32 (BE) x00  x00  xFE  xFF 
 28  # UTF-32-LE    @   x00  x00  x00 
 29  # UTF-32-BE   x00  x00  x00   @ 
 30  # CHARSET      @    c    h    a  ... 
 31   
 32   
 33   
34 -def _detectencoding_str(input, final=False):
35 """ 36 Detect the encoding of the byte string ``input``, which contains the 37 beginning of a CSS file. To detect the encoding the first few bytes are 38 used (or if ``input`` is ASCII compatible and starts with a charset rule 39 the encoding name from the rule). 40 41 If the encoding can't be detected yet, ``None`` is returned. ``final`` 42 specifies whether more data is available in later calls or not. If ``final`` 43 is true, ``_detectencoding_str()`` will never return ``None``. 44 """ 45 46 # A bit for every candidate 47 CANDIDATE_UTF_8_SIG = 1 48 CANDIDATE_UTF_16_AS_LE = 2 49 CANDIDATE_UTF_16_AS_BE = 4 50 CANDIDATE_UTF_16_LE = 8 51 CANDIDATE_UTF_16_BE = 16 52 CANDIDATE_UTF_32_AS_LE = 32 53 CANDIDATE_UTF_32_AS_BE = 64 54 CANDIDATE_UTF_32_LE = 128 55 CANDIDATE_UTF_32_BE = 256 56 CANDIDATE_CHARSET = 512 57 58 candidates = 1023 # all candidates 59 60 li = len(input) 61 if li>=1: 62 # Check first byte 63 c = input[0] 64 if c != "\xef": 65 candidates &= ~CANDIDATE_UTF_8_SIG 66 if c != "\xff": 67 candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_16_AS_LE) 68 if c != "\xfe": 69 candidates &= ~CANDIDATE_UTF_16_AS_BE 70 if c != "@": 71 candidates &= ~(CANDIDATE_UTF_32_LE|CANDIDATE_UTF_16_LE|CANDIDATE_CHARSET) 72 if c != "\x00": 73 candidates &= ~(CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_BE|CANDIDATE_UTF_16_BE) 74 if li>=2: 75 # Check second byte 76 c = input[1] 77 if c != "\xbb": 78 candidates &= ~CANDIDATE_UTF_8_SIG 79 if c != "\xfe": 80 candidates &= ~(CANDIDATE_UTF_16_AS_LE|CANDIDATE_UTF_32_AS_LE) 81 if c != "\xff": 82 candidates &= ~CANDIDATE_UTF_16_AS_BE 83 if c != "\x00": 84 candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE) 85 if c != "@": 86 candidates &= ~CANDIDATE_UTF_16_BE 87 if c != "c": 88 candidates &= ~CANDIDATE_CHARSET 89 if li>=3: 90 # Check third byte 91 c = input[2] 92 if c != "\xbf": 93 candidates &= ~CANDIDATE_UTF_8_SIG 94 if c != "c": 95 candidates &= ~CANDIDATE_UTF_16_LE 96 if c != "\x00": 97 
candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE) 98 if c != "\xfe": 99 candidates &= ~CANDIDATE_UTF_32_AS_BE 100 if c != "h": 101 candidates &= ~CANDIDATE_CHARSET 102 if li>=4: 103 # Check fourth byte 104 c = input[3] 105 if input[2:4] == "\x00\x00": 106 candidates &= ~CANDIDATE_UTF_16_AS_LE 107 if c != "\x00": 108 candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE) 109 if c != "\xff": 110 candidates &= ~CANDIDATE_UTF_32_AS_BE 111 if c != "@": 112 candidates &= ~CANDIDATE_UTF_32_BE 113 if c != "a": 114 candidates &= ~CANDIDATE_CHARSET 115 if candidates == 0: 116 return "utf-8" 117 if not (candidates & (candidates-1)): # only one candidate remaining 118 if candidates == CANDIDATE_UTF_8_SIG and li >= 3: 119 return "utf-8-sig" 120 elif candidates == CANDIDATE_UTF_16_AS_LE and li >= 2: 121 return "utf-16" 122 elif candidates == CANDIDATE_UTF_16_AS_BE and li >= 2: 123 return "utf-16" 124 elif candidates == CANDIDATE_UTF_16_LE and li >= 4: 125 return "utf-16-le" 126 elif candidates == CANDIDATE_UTF_16_BE and li >= 2: 127 return "utf-16-be" 128 elif candidates == CANDIDATE_UTF_32_AS_LE and li >= 4: 129 return "utf-32" 130 elif candidates == CANDIDATE_UTF_32_AS_BE and li >= 4: 131 return "utf-32" 132 elif candidates == CANDIDATE_UTF_32_LE and li >= 4: 133 return "utf-32-le" 134 elif candidates == CANDIDATE_UTF_32_BE and li >= 4: 135 return "utf-32-be" 136 elif candidates == CANDIDATE_CHARSET and li >= 4: 137 prefix = '@charset "' 138 if input.startswith(prefix): 139 pos = input.find('"', len(prefix)) 140 if pos >= 0: 141 return input[len(prefix):pos] 142 # if this is the last call, and we haven't determined an encoding yet, 143 # we default to UTF-8 144 if final: 145 return "utf-8" 146 return None # dont' know yet
147 148
149 -def _detectencoding_unicode(input, final=False):
150 """ 151 Detect the encoding of the unicode string ``input``, which contains the 152 beginning of a CSS file. The encoding is detected from the charset rule 153 at the beginning of ``input``. If there is no charset rule, ``"utf-8"`` 154 will be returned. 155 156 If the encoding can't be detected yet, ``None`` is returned. ``final`` 157 specifies whether more data will be available in later calls or not. If 158 ``final`` is true, ``_detectencoding_unicode()`` will never return ``None``. 159 """ 160 prefix = u'@charset "' 161 if input.startswith(prefix): 162 pos = input.find(u'"', len(prefix)) 163 if pos >= 0: 164 return input[len(prefix):pos] 165 elif final or not prefix.startswith(input): 166 # if this is the last call, and we haven't determined an encoding yet, 167 # (or the string definitely doesn't start with prefix) we default to UTF-8 168 return "utf-8" 169 return None # don't know yet
170 171
172 -def _fixencoding(input, encoding, final=False):
173 """ 174 Replace the name of the encoding in the charset rule at the beginning of 175 ``input`` with ``encoding``. If ``input`` doesn't starts with a charset 176 rule, ``input`` will be returned unmodified. 177 178 If the encoding can't be found yet, ``None`` is returned. ``final`` 179 specifies whether more data will be available in later calls or not. 180 If ``final`` is true, ``_fixencoding()`` will never return ``None``. 181 """ 182 prefix = u'@charset "' 183 if len(input) > len(prefix): 184 if input.startswith(prefix): 185 pos = input.find(u'"', len(prefix)) 186 if pos >= 0: 187 if encoding.replace("_", "-").lower() == "utf-8-sig": 188 encoding = u"utf-8" 189 return prefix + encoding + input[pos:] 190 # we haven't seen the end of the encoding name yet => fall through 191 else: 192 return input # doesn't start with prefix, so nothing to fix 193 elif not prefix.startswith(input) or final: 194 # can't turn out to be a @charset rule later (or there is no "later") 195 return input 196 if final: 197 return input 198 return None # don't know yet
199 200
def decode(input, errors="strict", encoding=None):
    """
    Stateless decoder of the ``css`` codec.

    If ``encoding`` is ``None`` it is detected from ``input`` (treating
    ``input`` as the complete data). ``input`` is decoded with that
    encoding and the charset rule at the start of the result, if any, is
    rewritten to name it. Returns a tuple ``(text, consumed)``.

    Raises ``ValueError`` if the encoding turns out to be ``css`` itself.
    """
    if encoding is None:
        encoding = _detectencoding_str(input, True)
    if encoding == "css":
        raise ValueError("css not allowed as encoding name")
    realdecoder = codecs.getdecoder(encoding)
    (text, consumed) = realdecoder(input, errors)
    fixed = _fixencoding(text, unicode(encoding), True)
    return (fixed, consumed)
208 209
def encode(input, errors="strict", encoding=None):
    """
    Stateless encoder of the ``css`` codec.

    If ``encoding`` is ``None`` it is taken from the charset rule at the
    start of ``input`` (treating ``input`` as the complete data). The
    charset rule, if any, is rewritten to name the encoding (``utf-8-sig``
    is written as ``utf-8``) and the result is encoded. Returns a tuple
    ``(data, consumed)`` where ``consumed`` is the length of the original
    ``input``.

    Raises ``ValueError`` if the encoding turns out to be ``css`` itself.
    """
    length = len(input)
    if encoding is None:
        encoding = _detectencoding_unicode(input, True)

    # name to put into the @charset rule
    if encoding.replace("_", "-").lower() == "utf-8-sig":
        declared = u"utf-8"
    else:
        declared = unicode(encoding)
    input = _fixencoding(input, declared, True)

    if encoding == "css":
        raise ValueError("css not allowed as encoding name")
    realencoder = codecs.getencoder(encoding)
    data = realencoder(input, errors)[0]
    return (data, length)
222 223
224 -def _bytes2int(bytes):
225 # Helper: convert an 8 bit string into an ``int``. 226 i = 0 227 for byte in bytes: 228 i = (i<<8) + ord(byte) 229 return i
230 231
232 -def _int2bytes(i):
233 # Helper: convert an ``int`` into an 8-bit string. 234 v = [] 235 while i: 236 v.insert(0, chr(i&0xff)) 237 i >>= 8 238 return "".join(v)
if hasattr(codecs, "IncrementalDecoder"):
    class IncrementalDecoder(codecs.IncrementalDecoder):
        """
        Incremental decoder of the ``css`` codec.

        The encoding is detected from the first bytes fed to ``decode()``;
        the real work is then delegated to the incremental decoder of that
        encoding. The charset rule at the start of the decoded text, if
        any, is rewritten to name the detected encoding (``utf-8-sig`` is
        written as ``utf-8``).
        """

        def __init__(self, errors="strict", encoding=None):
            # ``self.decoder`` must exist before the base class constructor
            # runs, because that assigns ``errors`` and therefore calls
            # ``_seterrors()``.
            self.decoder = None
            # Note that ``decode()`` overwrites this with the detected
            # encoding as long as no real decoder has been created.
            self.encoding = encoding
            codecs.IncrementalDecoder.__init__(self, errors)
            # Store ``errors`` somewhere else,
            # because we have to hide it in a property
            self._errors = errors
            # Undecoded input until the encoding is known; afterwards
            # decoded text until the charset rule has been fixed.
            self.buffer = ""
            self.headerfixed = False

        def iterdecode(self, input):
            """
            Decode the chunks produced by the iterable ``input`` and yield
            the non-empty results; a final empty chunk flushes the decoder.
            """
            for part in input:
                result = self.decode(part, False)
                if result:
                    yield result
            result = self.decode("", True)
            if result:
                yield result

        def decode(self, input, final=False):
            """
            Decode ``input`` and return the decoded text. An empty unicode
            string is returned as long as the encoding can't be detected or
            the charset rule is incomplete. ``final`` must be true on the
            last call.

            Raises ``ValueError`` if the detected encoding is ``css``.
            """
            # We're doing basically the same as a ``BufferedIncrementalDecoder``,
            # but since the buffer is only relevant until the encoding has been
            # detected (in which case the buffer of the underlying codec might
            # kick in), we're implementing buffering ourselves to avoid some
            # overhead.
            if self.decoder is None:
                input = self.buffer + input
                self.encoding = _detectencoding_str(input, final)
                if self.encoding is None:
                    self.buffer = input  # retry the complete input on the next call
                    return u""  # no encoding determined yet, so no output
                if self.encoding == "css":
                    raise ValueError("css not allowed as encoding name")
                self.buffer = ""  # drop buffer, as the decoder might keep its own
                decoder = codecs.getincrementaldecoder(self.encoding)
                self.decoder = decoder(self._errors)
            if self.headerfixed:
                return self.decoder.decode(input, final)
            # If we haven't fixed the header yet,
            # the content of ``self.buffer`` is a ``unicode`` object
            output = self.buffer + self.decoder.decode(input, final)
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newoutput = _fixencoding(output, unicode(encoding), final)
            if newoutput is None:
                # retry fixing the @charset rule (but keep the decoded stuff)
                self.buffer = output
                return u""
            self.headerfixed = True
            return newoutput

        def reset(self):
            """Forget the real decoder and everything buffered so far."""
            codecs.IncrementalDecoder.reset(self)
            self.decoder = None
            self.buffer = ""
            self.headerfixed = False

        def _geterrors(self):
            return self._errors

        def _seterrors(self, errors):
            # Setting ``errors`` must be done on the real decoder too
            if self.decoder is not None:
                self.decoder.errors = errors
            self._errors = errors
        errors = property(_geterrors, _seterrors)

        def getstate(self):
            """
            Return the decoder state as a tuple ``("", number)``. The number
            is the marshalled tuple ``(encoding, buffer, headerfixed,
            hasdecoder, decoderstate)`` converted to an ``int``.
            """
            if self.decoder is not None:
                state = (self.encoding, self.buffer, self.headerfixed, True, self.decoder.getstate())
            else:
                state = (self.encoding, self.buffer, self.headerfixed, False, None)
            return ("", _bytes2int(marshal.dumps(state)))

        def setstate(self, state):
            """
            Restore a state returned by ``getstate()``.
            """
            # ``getstate()`` marshals first and converts to an ``int``
            # afterwards, so the conversion must be undone before
            # unmarshalling (``state[0]``, the buffered input, is ignored).
            state = marshal.loads(_int2bytes(state[1]))
            self.encoding = state[0]
            self.buffer = state[1]
            self.headerfixed = state[2]
            # ``state[3]`` is a flag (never ``None``) telling whether a real
            # decoder existed when the state was taken.
            if state[3]:
                self.decoder = codecs.getincrementaldecoder(self.encoding)(self._errors)
                self.decoder.setstate(state[4])
            else:
                self.decoder = None
if hasattr(codecs, "IncrementalEncoder"):
    class IncrementalEncoder(codecs.IncrementalEncoder):
        """
        Incremental encoder of the ``css`` codec.

        If an ``encoding`` is given, the charset rule at the start of the
        text, if any, is rewritten to name it (``utf-8-sig`` is written as
        ``utf-8``); otherwise the encoding is taken from the charset rule.
        The real work is delegated to the incremental encoder of that
        encoding.
        """

        def __init__(self, errors="strict", encoding=None):
            # ``self.encoder`` must exist before the base class constructor
            # runs, because that assigns ``errors`` and therefore calls
            # ``_seterrors()``.
            self.encoder = None
            self.encoding = encoding
            codecs.IncrementalEncoder.__init__(self, errors)
            # Store ``errors`` somewhere else,
            # because we have to hide it in a property
            self._errors = errors
            # text held back until the charset rule is complete
            self.buffer = u""

        def iterencode(self, input):
            """
            Encode the chunks produced by the iterable ``input`` and yield
            the non-empty results; a final empty chunk flushes the encoder.
            """
            for part in input:
                result = self.encode(part, False)
                if result:
                    yield result
            result = self.encode(u"", True)
            if result:
                yield result

        def encode(self, input, final=False):
            """
            Encode ``input`` and return the encoded data. An empty string is
            returned as long as the charset rule is incomplete. ``final``
            must be true on the last call.

            Raises ``ValueError`` if the encoding turns out to be ``css``.
            """
            if self.encoder is None:
                input = self.buffer + input
                if self.encoding is not None:
                    # Replace encoding in the @charset rule with the specified one
                    encoding = self.encoding
                    if encoding.replace("_", "-").lower() == "utf-8-sig":
                        encoding = "utf-8"
                    newinput = _fixencoding(input, unicode(encoding), final)
                    if newinput is None:  # @charset rule incomplete => Retry next time
                        self.buffer = input
                        return ""
                    input = newinput
                else:
                    # Use encoding from the @charset declaration
                    self.encoding = _detectencoding_unicode(input, final)
                if self.encoding is not None:
                    if self.encoding == "css":
                        raise ValueError("css not allowed as encoding name")
                    info = codecs.lookup(self.encoding)
                    if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                        input = _fixencoding(input, u"utf-8", True)
                    self.encoder = info.incrementalencoder(self._errors)
                    self.buffer = u""
                else:
                    self.buffer = input
                    return ""
            return self.encoder.encode(input, final)

        def reset(self):
            """Forget the real encoder and everything buffered so far."""
            codecs.IncrementalEncoder.reset(self)
            self.encoder = None
            self.buffer = u""

        def _geterrors(self):
            return self._errors

        def _seterrors(self, errors):
            # Setting ``errors`` must be done on the real encoder too
            if self.encoder is not None:
                self.encoder.errors = errors
            self._errors = errors
        errors = property(_geterrors, _seterrors)

        def getstate(self):
            """
            Return the encoder state as a number: the marshalled tuple
            ``(encoding, buffer, hasencoder, encoderstate)`` converted to an
            ``int``.
            """
            if self.encoder is not None:
                state = (self.encoding, self.buffer, True, self.encoder.getstate())
            else:
                state = (self.encoding, self.buffer, False, None)
            return _bytes2int(marshal.dumps(state))

        def setstate(self, state):
            """
            Restore a state returned by ``getstate()``.
            """
            # ``getstate()`` marshals first and converts to an ``int``
            # afterwards, so the conversion must be undone before
            # unmarshalling.
            state = marshal.loads(_int2bytes(state))
            self.encoding = state[0]
            self.buffer = state[1]
            # ``state[2]`` is a flag (never ``None``) telling whether a real
            # encoder existed when the state was taken; its state is the
            # fourth (and last) item of the tuple.
            if state[2]:
                self.encoder = codecs.getincrementalencoder(self.encoding)(self._errors)
                self.encoder.setstate(state[3])
            else:
                self.encoder = None
411 412
class StreamWriter(codecs.StreamWriter):
    """
    Stream writer of the ``css`` codec.

    If an ``encoding`` is given, the charset rule at the start of the text,
    if any, is rewritten to name it (``utf-8-sig`` is written as
    ``utf-8``); otherwise the encoding is taken from the charset rule. The
    real work is delegated to the stream writer of that encoding.
    """

    def __init__(self, stream, errors="strict", encoding=None, header=False):
        # ``header`` is accepted but not used by this class.
        #
        # ``self.streamwriter`` and ``self._errors`` are set before calling
        # the base class constructor: that one assigns ``errors``, which is
        # a property here whose setter reads ``self.streamwriter`` (the
        # incremental codec classes of this module do the same).
        self.streamwriter = None
        self._errors = errors
        codecs.StreamWriter.__init__(self, stream, errors)
        self.encoding = encoding
        # text held back until the charset rule is complete
        self.buffer = u""

    def encode(self, input, errors='strict'):
        """
        Encode ``input`` and return a tuple ``(data, consumed)``.
        ``("", 0)`` is returned as long as the charset rule is incomplete;
        once the real stream writer has been created on this call,
        ``consumed`` also counts the text buffered by earlier calls.

        Raises ``ValueError`` if the encoding turns out to be ``css``.
        """
        li = len(input)
        if self.streamwriter is None:
            input = self.buffer + input
            li = len(input)
            if self.encoding is not None:
                # Replace encoding in the @charset rule with the specified one
                encoding = self.encoding
                if encoding.replace("_", "-").lower() == "utf-8-sig":
                    encoding = "utf-8"
                newinput = _fixencoding(input, unicode(encoding), False)
                if newinput is None:  # @charset rule incomplete => Retry next time
                    self.buffer = input
                    return ("", 0)
                input = newinput
            else:
                # Use encoding from the @charset declaration
                self.encoding = _detectencoding_unicode(input, False)
            if self.encoding is not None:
                if self.encoding == "css":
                    raise ValueError("css not allowed as encoding name")
                self.streamwriter = codecs.getwriter(self.encoding)(self.stream, self._errors)
                if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                    input = _fixencoding(input, u"utf-8", True)
                self.buffer = u""
            else:
                self.buffer = input
                return ("", 0)
        return (self.streamwriter.encode(input, errors)[0], li)

    def _geterrors(self):
        return self._errors

    def _seterrors(self, errors):
        # Setting ``errors`` must be done on the streamwriter too
        if self.streamwriter is not None:
            self.streamwriter.errors = errors
        self._errors = errors
    errors = property(_geterrors, _seterrors)
461 462
class StreamReader(codecs.StreamReader):
    """
    Stream reader of the ``css`` codec.

    The encoding is detected from the first bytes passed to ``decode()``;
    the real work is then delegated to the stream reader of that encoding.
    The charset rule at the start of the decoded text, if any, is rewritten
    to name the detected encoding (``utf-8-sig`` is written as ``utf-8``).
    """

    def __init__(self, stream, errors="strict", encoding=None):
        # ``self.streamreader`` and ``self._errors`` are set before calling
        # the base class constructor: that one assigns ``errors``, which is
        # a property here whose setter reads ``self.streamreader`` (the
        # incremental codec classes of this module do the same).
        self.streamreader = None
        self._errors = errors
        codecs.StreamReader.__init__(self, stream, errors)
        # Note that ``decode()`` overwrites this with the detected encoding
        # as long as no real stream reader has been chosen.
        self.encoding = encoding

    def decode(self, input, errors='strict'):
        """
        Decode ``input`` and return a tuple ``(text, consumed)``.
        ``(u"", 0)`` is returned as long as the encoding can't be detected
        or the charset rule is incomplete, so that the same input is
        presented again together with more data.

        Raises ``ValueError`` if the detected encoding is ``css``.
        """
        if self.streamreader is None:
            self.encoding = _detectencoding_str(input, False)
            if self.encoding is None:
                return (u"", 0)  # no encoding determined yet, so no output
            if self.encoding == "css":
                raise ValueError("css not allowed as encoding name")
            streamreader = codecs.getreader(self.encoding)
            streamreader = streamreader(self.stream, self._errors)
            (output, consumed) = streamreader.decode(input, errors)
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newoutput = _fixencoding(output, unicode(encoding), False)
            if newoutput is not None:
                self.streamreader = streamreader
                return (newoutput, consumed)
            return (u"", 0)  # we will create a new streamreader on the next call
        return self.streamreader.decode(input, errors)

    def _geterrors(self):
        return self._errors

    def _seterrors(self, errors):
        # Setting ``errors`` must be done on the streamreader too
        if self.streamreader is not None:
            self.streamreader.errors = errors
        self._errors = errors
    errors = property(_geterrors, _seterrors)
if hasattr(codecs, "CodecInfo"):
    # We're running on Python 2.5 or better

    def search_function(name):
        """
        Codec search function: return the ``CodecInfo`` of the ``css``
        codec if ``name`` is ``"css"`` and ``None`` for any other name.
        """
        if name != "css":
            return None
        return codecs.CodecInfo(
            name="css",
            encode=encode,
            decode=decode,
            incrementalencoder=IncrementalEncoder,
            incrementaldecoder=IncrementalDecoder,
            streamwriter=StreamWriter,
            streamreader=StreamReader,
        )
else:
    # If we're running on Python 2.4, define the utf-8-sig codec here

    def utf8sig_encode(input, errors='strict'):
        """Encode ``input`` as UTF-8, preceded by the UTF-8 BOM."""
        data = codecs.utf_8_encode(input, errors)[0]
        return (codecs.BOM_UTF8 + data, len(input))

    def utf8sig_decode(input, errors='strict'):
        """Decode ``input`` as UTF-8, skipping a leading UTF-8 BOM."""
        skipped = 0
        if input[:3] == codecs.BOM_UTF8:
            skipped = 3
            input = input[3:]
        (output, consumed) = codecs.utf_8_decode(input, errors, True)
        return (output, skipped + consumed)

    class UTF8SigStreamWriter(codecs.StreamWriter):
        """Stream writer that puts the BOM in front of the first chunk."""

        def reset(self):
            codecs.StreamWriter.reset(self)
            # Drop the instance attribute set by ``encode()`` (if any), so
            # that the next chunk gets a BOM again.
            self.__dict__.pop("encode", None)

        def encode(self, input, errors='strict'):
            # Only the first chunk gets the BOM: from now on the plain
            # UTF-8 encoder hides this method.
            self.encode = codecs.utf_8_encode
            return utf8sig_encode(input, errors)

    class UTF8SigStreamReader(codecs.StreamReader):
        """Stream reader that skips a BOM at the start of the data."""

        def reset(self):
            codecs.StreamReader.reset(self)
            # Drop the instance attribute set by ``decode()`` (if any), so
            # that a BOM is looked for again.
            self.__dict__.pop("decode", None)

        def decode(self, input, errors='strict'):
            if len(input) < 3 and codecs.BOM_UTF8.startswith(input):
                # not enough data to decide if this is a BOM
                # => try again on the next call
                return (u"", 0)
            # From now on the plain UTF-8 decoder hides this method.
            self.decode = codecs.utf_8_decode
            return utf8sig_decode(input, errors)

    def search_function(name):
        """
        Codec search function: return the codec tuple of the ``css`` or the
        ``utf_8_sig`` codec for the (normalized) ``name`` and ``None`` for
        any other name.
        """
        import encodings
        name = encodings.normalize_encoding(name)
        if name == "css":
            return (encode, decode, StreamReader, StreamWriter)
        if name == "utf_8_sig":
            return (utf8sig_encode, utf8sig_decode, UTF8SigStreamReader, UTF8SigStreamWriter)
        return None


codecs.register(search_function)


# Error handler for CSS escaping
def cssescape(exc):
    """
    Codec error handler that replaces every unencodable character with a
    backslash followed by its code point as six hexadecimal digits.

    Returns a tuple of the replacement text and the position at which
    encoding continues. Raises ``TypeError`` if ``exc`` is not a
    ``UnicodeEncodeError``.
    """
    if not isinstance(exc, UnicodeEncodeError):
        raise TypeError("don't know how to handle %r" % exc)
    unencodable = exc.object[exc.start:exc.end]
    escapes = [u"\\%06x" % ord(char) for char in unencodable]
    return (u"".join(escapes), exc.end)


codecs.register_error("cssescape", cssescape)