1
2
3 """
4 Python codec for CSS.
5 """
6
7 __docformat__ = 'restructuredtext'
8 __author__ = '$LastChangedBy: doerwalter $'
9 __date__ = '$LastChangedDate: 2007-11-05 14:52:02 +0100 (Mo, 05 Nov 2007) $'
10 __version__ = '$LastChangedRevision: 645 $'
11
12
13 import codecs, marshal
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def _detectencoding_str(input, final=False):
    """
    Detect the encoding of the byte string ``input``, which contains the
    beginning of a CSS file. To detect the encoding the first few bytes are
    used (or, if ``input`` is ASCII compatible and starts with a charset rule,
    the encoding name from the rule).

    If the encoding can't be detected yet, ``None`` is returned. ``final``
    specifies whether more data is available in later calls or not. If ``final``
    is true, ``_detectencoding_str()`` will never return ``None``.
    """
    # One bit per candidate. Every candidate is matched against a fixed
    # byte prefix; a bit is cleared as soon as a byte contradicts it:
    #
    #   UTF_8_SIG       \xef \xbb \xbf
    #   UTF_16_AS_LE    \xff \xfe  (not followed by \x00 \x00)
    #   UTF_16_AS_BE    \xfe \xff
    #   UTF_16_LE        @   \x00   c   \x00
    #   UTF_16_BE       \x00  @
    #   UTF_32_AS_LE    \xff \xfe \x00 \x00
    #   UTF_32_AS_BE    \x00 \x00 \xfe \xff
    #   UTF_32_LE        @   \x00 \x00 \x00
    #   UTF_32_BE       \x00 \x00 \x00  @
    #   CHARSET          @    c    h    a
    CANDIDATE_UTF_8_SIG = 1
    CANDIDATE_UTF_16_AS_LE = 2
    CANDIDATE_UTF_16_AS_BE = 4
    CANDIDATE_UTF_16_LE = 8
    CANDIDATE_UTF_16_BE = 16
    CANDIDATE_UTF_32_AS_LE = 32
    CANDIDATE_UTF_32_AS_BE = 64
    CANDIDATE_UTF_32_LE = 128
    CANDIDATE_UTF_32_BE = 256
    CANDIDATE_CHARSET = 512

    candidates = 1023  # start with every candidate bit set

    li = len(input)
    if li >= 1:
        # first byte
        c = input[0]
        if c != "\xef":
            candidates &= ~CANDIDATE_UTF_8_SIG
        if c != "\xff":
            candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_16_AS_LE)
        if c != "\xfe":
            candidates &= ~CANDIDATE_UTF_16_AS_BE
        if c != "@":
            candidates &= ~(CANDIDATE_UTF_32_LE|CANDIDATE_UTF_16_LE|CANDIDATE_CHARSET)
        if c != "\x00":
            candidates &= ~(CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_BE|CANDIDATE_UTF_16_BE)
        if li >= 2:
            # second byte
            c = input[1]
            if c != "\xbb":
                candidates &= ~CANDIDATE_UTF_8_SIG
            if c != "\xfe":
                candidates &= ~(CANDIDATE_UTF_16_AS_LE|CANDIDATE_UTF_32_AS_LE)
            if c != "\xff":
                candidates &= ~CANDIDATE_UTF_16_AS_BE
            if c != "\x00":
                candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE)
            if c != "@":
                candidates &= ~CANDIDATE_UTF_16_BE
            if c != "c":
                candidates &= ~CANDIDATE_CHARSET
            if li >= 3:
                # third byte
                c = input[2]
                if c != "\xbf":
                    candidates &= ~CANDIDATE_UTF_8_SIG
                if c != "c":
                    candidates &= ~CANDIDATE_UTF_16_LE
                if c != "\x00":
                    candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE)
                if c != "\xfe":
                    candidates &= ~CANDIDATE_UTF_32_AS_BE
                if c != "h":
                    candidates &= ~CANDIDATE_CHARSET
                if li >= 4:
                    # fourth byte
                    c = input[3]
                    # \xff \xfe \x00 \x00 is the UTF-32 BOM, not UTF-16
                    if input[2:4] == "\x00\x00":
                        candidates &= ~CANDIDATE_UTF_16_AS_LE
                    if c != "\x00":
                        candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE)
                    if c != "\xff":
                        candidates &= ~CANDIDATE_UTF_32_AS_BE
                    if c != "@":
                        candidates &= ~CANDIDATE_UTF_32_BE
                    if c != "a":
                        candidates &= ~CANDIDATE_CHARSET
    if candidates == 0:
        # Neither a BOM nor the start of a charset rule: use the default
        return "utf-8"
    if not (candidates & (candidates-1)):
        # Exactly one candidate is left; accept it once enough bytes have
        # been seen to rule out every longer prefix.
        if candidates == CANDIDATE_UTF_8_SIG and li >= 3:
            return "utf-8-sig"
        elif candidates == CANDIDATE_UTF_16_AS_LE and li >= 2:
            return "utf-16"
        elif candidates == CANDIDATE_UTF_16_AS_BE and li >= 2:
            return "utf-16"
        elif candidates == CANDIDATE_UTF_16_LE and li >= 4:
            return "utf-16-le"
        elif candidates == CANDIDATE_UTF_16_BE and li >= 2:
            return "utf-16-be"
        elif candidates == CANDIDATE_UTF_32_AS_LE and li >= 4:
            return "utf-32"
        elif candidates == CANDIDATE_UTF_32_AS_BE and li >= 4:
            return "utf-32"
        elif candidates == CANDIDATE_UTF_32_LE and li >= 4:
            return "utf-32-le"
        elif candidates == CANDIDATE_UTF_32_BE and li >= 4:
            return "utf-32-be"
        elif candidates == CANDIDATE_CHARSET and li >= 4:
            prefix = '@charset "'
            if input.startswith(prefix):
                pos = input.find('"', len(prefix))
                if pos >= 0:
                    return input[len(prefix):pos]
                # closing quote not seen yet => wait for more data
            elif not prefix.startswith(input):
                # Starts with "@cha" but can no longer turn into a charset
                # rule, so there is nothing to wait for.
                return "utf-8"
    # Last call and still undecided: fall back to the default
    if final:
        return "utf-8"
    return None
147
148
def _detectencoding_unicode(input, final=False):
    """
    Detect the encoding of the unicode string ``input``, which contains the
    beginning of a CSS file. The encoding is detected from the charset rule
    at the beginning of ``input``. If there is no charset rule, ``"utf-8"``
    will be returned.

    If the encoding can't be detected yet, ``None`` is returned. ``final``
    specifies whether more data will be available in later calls or not. If
    ``final`` is true, ``_detectencoding_unicode()`` will never return ``None``.
    """
    prefix = u'@charset "'
    if input.startswith(prefix):
        pos = input.find(u'"', len(prefix))
        if pos >= 0:
            return input[len(prefix):pos]
        # The encoding name isn't terminated yet => fall through
    elif not prefix.startswith(input):
        # The string definitely doesn't start with a charset rule
        return "utf-8"
    # Undecided (including an unterminated charset rule): on the last call
    # the default applies, otherwise wait for more data.
    if final:
        return "utf-8"
    return None
170
171
def _fixencoding(input, encoding, final=False):
    """
    Replace the name of the encoding in the charset rule at the beginning of
    ``input`` with ``encoding``. If ``input`` doesn't start with a charset
    rule, ``input`` will be returned unmodified.

    If the encoding can't be found yet, ``None`` is returned. ``final``
    specifies whether more data will be available in later calls or not.
    If ``final`` is true, ``_fixencoding()`` will never return ``None``.
    """
    prefix = u'@charset "'
    if len(input) > len(prefix):
        if input.startswith(prefix):
            pos = input.find(u'"', len(prefix))
            if pos >= 0:
                # The BOM variant is never written into the rule itself
                if encoding.replace("_", "-").lower() == "utf-8-sig":
                    encoding = u"utf-8"
                return prefix + encoding + input[pos:]
            # The end of the encoding name hasn't been seen => fall through
        else:
            # Doesn't start with a charset rule, so there is nothing to fix
            return input
    elif not prefix.startswith(input) or final:
        # Can't turn into a charset rule later (or there is no "later")
        return input
    if final:
        return input
    return None
199
200
201 -def decode(input, errors="strict", encoding=None):
208
209
210 -def encode(input, errors="strict", encoding=None):
222
223
225
def _bytes2int(bytes):
    """
    Return the integer whose big-endian base-256 digits are the characters
    of the byte string ``bytes``. An empty string gives ``0``.
    """
    i = 0
    for byte in bytes:
        # shift the value so far one byte to the left and append the next one
        i = (i<<8) + ord(byte)
    return i
230
231
233
def _int2bytes(i):
    """
    Return the byte string made of the big-endian base-256 digits of the
    non-negative integer ``i``. ``0`` gives the empty string, and leading
    zero bytes are never produced.

    Raises ``ValueError`` for a negative ``i``, which has no such
    representation (shifting it right would never reach zero).
    """
    if i < 0:
        raise ValueError("can't convert negative int to bytes")
    v = []
    while i:
        # collect the least significant byte first ...
        v.append(chr(i&0xff))
        i >>= 8
    # ... and put the most significant byte first at the end
    v.reverse()
    return "".join(v)
239
240
241 if hasattr(codecs, "IncrementalDecoder"):
243 - def __init__(self, errors="strict", encoding=None):
252
254 for part in input:
255 result = self.decode(part, False)
256 if result:
257 yield result
258 result = self.decode("", True)
259 if result:
260 yield result
261
def decode(self, input, final=False):
    """
    Decode the next chunk ``input`` of a CSS byte stream.

    As long as ``_detectencoding_str()`` can't name the encoding, the bytes
    are kept in ``self.buffer`` and an empty string is returned. Afterwards
    decoding is delegated to the incremental decoder for that encoding, and
    the first output is passed through ``_fixencoding()`` so that a leading
    charset rule names the encoding that was used.

    ``final`` must be true for the last chunk. Raises ``ValueError`` if the
    detected encoding is ``css`` itself.
    """
    # The buffer is only relevant until the encoding has been detected
    # (afterwards the wrapped decoder may do its own buffering).
    if self.decoder is None:
        input = self.buffer + input
        self.encoding = _detectencoding_str(input, final)
        if self.encoding is None:
            # retry with the complete input on the next call
            self.buffer = input
            return u""
        # Codec names are looked up case-insensitively, so "CSS" would
        # refer back to this codec just like "css".
        if self.encoding.lower() == "css":
            raise ValueError("css not allowed as encoding name")
        self.buffer = ""
        decoder = codecs.getincrementaldecoder(self.encoding)
        self.decoder = decoder(self._errors)
    if self.headerfixed:
        return self.decoder.decode(input, final)
    # The header hasn't been fixed yet; from here on ``self.buffer`` holds
    # already decoded text that is waiting for the end of the charset rule.
    output = self.buffer + self.decoder.decode(input, final)
    encoding = self.encoding
    if encoding.replace("_", "-").lower() == "utf-8-sig":
        encoding = "utf-8"
    newoutput = _fixencoding(output, unicode(encoding), final)
    if newoutput is None:
        # charset rule still incomplete: keep the decoded text and retry
        self.buffer = output
        return u""
    self.headerfixed = True
    return newoutput
294
def reset(self):
    """
    Return the decoder to its initial state: the wrapped decoder is
    dropped, the buffer is emptied and the header counts as not fixed.
    """
    codecs.IncrementalDecoder.reset(self)
    self.decoder = None
    self.buffer = ""
    self.headerfixed = False
300
303
305
def _seterrors(self, errors):
    """
    Store the error handling scheme ``errors`` on this object and, if a
    wrapped decoder has already been created, on that decoder too.
    """
    if self.decoder is not None:
        self.decoder.errors = errors
    self._errors = errors
309 errors = property(_geterrors, _seterrors)
310
def getstate(self):
    """
    Return the decoder state as ``(buffered_input, number)``.

    The first item is always ``""``. The number is the marshalled tuple
    ``(encoding, buffer, headerfixed, has_decoder, decoder_state)``
    converted to an integer with ``_bytes2int()``; ``decoder_state`` is
    ``None`` when no wrapped decoder exists yet.
    """
    if self.decoder is not None:
        state = (self.encoding, self.buffer, self.headerfixed, True, self.decoder.getstate())
    else:
        state = (self.encoding, self.buffer, self.headerfixed, False, None)
    return ("", _bytes2int(marshal.dumps(state)))
317
def setstate(self, state):
    """
    Restore a state produced by ``getstate()``.

    ``state`` is a ``(buffered_input, number)`` pair. The buffered input is
    ignored; the number is converted back with ``_int2bytes()`` and
    unmarshalled into ``(encoding, buffer, headerfixed, has_decoder,
    decoder_state)``.
    """
    # The integer has to become a byte string again *before* it can be
    # unmarshalled.
    state = marshal.loads(_int2bytes(state[1]))
    self.encoding = state[0]
    self.buffer = state[1]
    self.headerfixed = state[2]
    # state[3] is a plain True/False flag, so test its truth value
    if state[3]:
        self.decoder = codecs.getincrementaldecoder(self.encoding)(self._errors)
        self.decoder.setstate(state[4])
    else:
        self.decoder = None
328
329
330 if hasattr(codecs, "IncrementalEncoder"):
332 - def __init__(self, errors="strict", encoding=None):
340
342 for part in input:
343 result = self.encode(part, False)
344 if result:
345 yield result
346 result = self.encode(u"", True)
347 if result:
348 yield result
349
def encode(self, input, final=False):
    """
    Encode the next chunk ``input`` of a unicode CSS stream.

    Until the wrapped encoder exists, input is collected in
    ``self.buffer``. If ``self.encoding`` is already set, the name in a
    leading charset rule is replaced with it via ``_fixencoding()``;
    otherwise the encoding is taken from the charset rule via
    ``_detectencoding_unicode()``. While that can't be decided, an empty
    string is returned.

    ``final`` must be true for the last chunk. Raises ``ValueError`` if the
    encoding turns out to be ``css`` itself.
    """
    if self.encoder is None:
        input = self.buffer + input
        if self.encoding is not None:
            # Make the charset rule agree with the encoding already chosen
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newinput = _fixencoding(input, unicode(encoding), final)
            if newinput is None:
                # charset rule incomplete => retry on the next call
                self.buffer = input
                return ""
            input = newinput
        else:
            # Take the encoding from the charset rule
            self.encoding = _detectencoding_unicode(input, final)
        if self.encoding is not None:
            # Codec names are looked up case-insensitively, so "CSS" would
            # refer back to this codec just like "css".
            if self.encoding.lower() == "css":
                raise ValueError("css not allowed as encoding name")
            info = codecs.lookup(self.encoding)
            if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                input = _fixencoding(input, u"utf-8", True)
            self.encoder = info.incrementalencoder(self._errors)
            self.buffer = u""
        else:
            self.buffer = input
            return ""
    return self.encoder.encode(input, final)
379
384
387
389
def _seterrors(self, errors):
    """
    Store the error handling scheme ``errors`` on this object and, if a
    wrapped encoder has already been created, on that encoder too.
    """
    if self.encoder is not None:
        self.encoder.errors = errors
    self._errors = errors
393 errors = property(_geterrors, _seterrors)
394
def getstate(self):
    """
    Return the encoder state as an integer.

    The integer is the marshalled tuple ``(encoding, buffer, has_encoder,
    encoder_state)`` converted with ``_bytes2int()``; ``encoder_state`` is
    ``None`` when no wrapped encoder exists yet.
    """
    if self.encoder is not None:
        state = (self.encoding, self.buffer, True, self.encoder.getstate())
    else:
        state = (self.encoding, self.buffer, False, None)
    return _bytes2int(marshal.dumps(state))
401
def setstate(self, state):
    """
    Restore a state produced by ``getstate()``.

    ``state`` is the integer returned there. It is converted back with
    ``_int2bytes()`` and unmarshalled into ``(encoding, buffer,
    has_encoder, encoder_state)``.
    """
    # The integer has to become a byte string again *before* it can be
    # unmarshalled.
    state = marshal.loads(_int2bytes(state))
    self.encoding = state[0]
    self.buffer = state[1]
    # state[2] is a plain True/False flag, so test its truth value
    if state[2]:
        self.encoder = codecs.getincrementalencoder(self.encoding)(self._errors)
        # the wrapped encoder's state is the fourth (last) item of the tuple
        self.encoder.setstate(state[3])
    else:
        self.encoder = None
411
412
414 - def __init__(self, stream, errors="strict", encoding=None, header=False):
420
def encode(self, input, errors='strict'):
    """
    Encode ``input`` and return ``(output, length_consumed)``.

    Until the wrapped stream writer exists, input is collected in
    ``self.buffer`` and ``("", 0)`` is returned. If ``self.encoding`` is
    already set, the name in a leading charset rule is replaced with it via
    ``_fixencoding()``; otherwise the encoding is taken from the charset
    rule via ``_detectencoding_unicode()``. Once the encoding is known, a
    stream writer for it is created on ``self.stream`` and used for this
    and all later calls.

    Raises ``ValueError`` if the encoding turns out to be ``css`` itself.
    """
    li = len(input)
    if self.streamwriter is None:
        input = self.buffer + input
        # the buffered text is consumed together with the new input
        li = len(input)
        if self.encoding is not None:
            # Make the charset rule agree with the encoding already chosen
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newinput = _fixencoding(input, unicode(encoding), False)
            if newinput is None:
                # charset rule incomplete => retry on the next call
                self.buffer = input
                return ("", 0)
            input = newinput
        else:
            # Take the encoding from the charset rule
            self.encoding = _detectencoding_unicode(input, False)
        if self.encoding is not None:
            # Codec names are looked up case-insensitively, so "CSS" would
            # refer back to this codec just like "css".
            if self.encoding.lower() == "css":
                raise ValueError("css not allowed as encoding name")
            self.streamwriter = codecs.getwriter(self.encoding)(self.stream, self._errors)
            if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                input = _fixencoding(input, u"utf-8", True)
            self.buffer = u""
        else:
            self.buffer = input
            return ("", 0)
    return (self.streamwriter.encode(input, errors)[0], li)
451
454
456
def _seterrors(self, errors):
    """
    Store the error handling scheme ``errors`` on this object and, if a
    wrapped stream writer has already been created, on that writer too.
    """
    if self.streamwriter is not None:
        self.streamwriter.errors = errors
    self._errors = errors
460 errors = property(_geterrors, _seterrors)
461
462
464 - def __init__(self, stream, errors="strict", encoding=None):
469
def decode(self, input, errors='strict'):
    """
    Decode ``input`` and return ``(output, length_consumed)``.

    Until a stream reader has been settled on, the encoding is detected
    from ``input`` with ``_detectencoding_str()`` on every call and
    ``(u"", 0)`` is returned while it is unknown or while a leading charset
    rule is still incomplete. Once ``_fixencoding()`` succeeds, the stream
    reader is kept in ``self.streamreader`` and used for all later calls.

    Raises ``ValueError`` if the detected encoding is ``css`` itself.
    """
    if self.streamreader is None:
        self.encoding = _detectencoding_str(input, False)
        if self.encoding is None:
            # no encoding determined yet, so no output
            return (u"", 0)
        # Codec names are looked up case-insensitively, so "CSS" would
        # refer back to this codec just like "css".
        if self.encoding.lower() == "css":
            raise ValueError("css not allowed as encoding name")
        streamreader = codecs.getreader(self.encoding)
        streamreader = streamreader(self.stream, self._errors)
        (output, consumed) = streamreader.decode(input, errors)
        encoding = self.encoding
        if encoding.replace("_", "-").lower() == "utf-8-sig":
            encoding = "utf-8"
        newoutput = _fixencoding(output, unicode(encoding), False)
        if newoutput is not None:
            self.streamreader = streamreader
            return (newoutput, consumed)
        # Nothing is consumed; a new stream reader is created next time
        return (u"", 0)
    return self.streamreader.decode(input, errors)
489
492
494
def _seterrors(self, errors):
    """
    Store the error handling scheme ``errors`` on this object and, if a
    wrapped stream reader has already been created, on that reader too.
    """
    if self.streamreader is not None:
        self.streamreader.errors = errors
    self._errors = errors
498 errors = property(_geterrors, _seterrors)
499
500
501 if hasattr(codecs, "CodecInfo"):
502
514 else:
515
517 return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
518
def utf8sig_decode(input, errors='strict'):
    """
    Decode the UTF-8 byte string ``input``, skipping a leading UTF-8 byte
    order mark if there is one.

    Returns ``(output, length_consumed)``; the consumed length includes
    the three bytes of a skipped BOM.
    """
    prefix = 0
    if input[:3] == codecs.BOM_UTF8:
        input = input[3:]
        prefix = 3
    (output, consumed) = codecs.utf_8_decode(input, errors, True)
    return (output, consumed+prefix)
526
534
535 - def encode(self, input, errors='strict'):
538
546
def decode(self, input, errors='strict'):
    """
    Decode ``input`` as UTF-8 with an optional leading byte order mark and
    return ``(output, length_consumed)``.

    While ``input`` is shorter than the BOM but could still become one,
    nothing is consumed. After the first real decode the instance switches
    to ``codecs.utf_8_decode``, since a BOM can only occur at the start.
    """
    if len(input) < 3 and codecs.BOM_UTF8.startswith(input):
        # Not enough data to decide whether this is a BOM
        # => try again on the next call
        return (u"", 0)
    self.decode = codecs.utf_8_decode
    return utf8sig_decode(input, errors)
554
562
563
564 codecs.register(search_function)
565
566
567
568
def cssescape(exc):
    """
    Error handler that replaces unencodable characters with CSS escapes.

    ``exc`` must be a ``UnicodeEncodeError``. Every character in the
    offending range ``exc.object[exc.start:exc.end]`` is replaced by a
    backslash followed by its code point as six hex digits. Returns the
    replacement together with the position where encoding continues.

    Raises ``TypeError`` for any other kind of exception.
    """
    if not isinstance(exc, UnicodeEncodeError):
        # wrap in a tuple so that a tuple argument is formatted as a whole
        raise TypeError("don't know how to handle %r" % (exc,))
    unencodable = exc.object[exc.start:exc.end]
    return (u"".join(u"\\%06x" % ord(c) for c in unencodable), exc.end)
573
574 codecs.register_error("cssescape", cssescape)
575