1
2
3 """Python codec for CSS."""
4 __docformat__ = 'restructuredtext'
5 __author__ = 'Walter Doerwald'
6 __version__ = '$Id: util.py 1114 2008-03-05 13:22:59Z cthedot $'
7
8 import codecs, marshal
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
def detectencoding_str(input, final=False):
    """
    Detect the encoding of the byte string ``input``, which contains the
    beginning of a CSS file.

    Returns a tuple ``(encoding, explicit)``. ``encoding`` is the name of the
    detected encoding, or ``None`` if ``input`` doesn't contain enough data
    to decide yet. ``explicit`` is true if the encoding has been detected
    explicitly, i.e. the bytes start with a BOM or a charset rule; it is
    false if the encoding has been guessed from the byte pattern of a leading
    ``@`` or if the default ``"utf-8"`` is returned.

    ``final`` specifies whether more data will be available in later calls
    or not. If ``final`` is true, ``detectencoding_str()`` will never return
    ``None`` as the encoding.
    """
    # One entry per candidate:
    # (leading byte signature, bytes required to commit, encoding, explicit)
    # ``encoding`` is None for the charset rule, whose name is read from the
    # rule itself.
    signatures = (
        ("\xef\xbb\xbf", 3, "utf-8-sig", True),      # UTF-8 signature
        ("\xff\xfe", 2, "utf-16", True),             # UTF-16 BOM (LE)
        ("\xfe\xff", 2, "utf-16", True),             # UTF-16 BOM (BE)
        ("@\x00c\x00", 4, "utf-16-le", False),       # "@c" in UTF-16-LE
        ("\x00@", 2, "utf-16-be", False),            # "@" in UTF-16-BE
        ("\xff\xfe\x00\x00", 4, "utf-32", True),     # UTF-32 BOM (LE)
        ("\x00\x00\xfe\xff", 4, "utf-32", True),     # UTF-32 BOM (BE)
        ("@\x00\x00\x00", 4, "utf-32-le", False),    # "@" in UTF-32-LE
        ("\x00\x00\x00@", 4, "utf-32-be", False),    # "@" in UTF-32-BE
        ("@cha", 4, None, True),                     # ASCII charset rule
    )
    prefix = '@charset "'
    li = len(input)

    candidates = []
    for (signature, needed, encoding, explicit) in signatures:
        # If ``input`` is shorter than the signature it only has to be a
        # prefix of it; otherwise its head must equal the signature.
        if not signature.startswith(input[:len(signature)]):
            continue
        # "\xff\xfe\x00\x00" is the UTF-32 BOM, not a UTF-16 BOM.
        if signature == "\xff\xfe" and input[2:4] == "\x00\x00":
            continue
        # On the last call a candidate that needs more bytes than we have
        # can never be confirmed (e.g. a file consisting only of a UTF-16
        # BOM must not stay ambiguous with the 4-byte UTF-32 BOM).
        if final and li < needed:
            continue
        candidates.append((needed, encoding, explicit))

    if not candidates:
        # Nothing matches => default
        return ("utf-8", False)

    if len(candidates) == 1:
        (needed, encoding, explicit) = candidates[0]
        if li >= needed:
            if encoding is not None:
                return (encoding, explicit)
            # Charset rule: the name is the text between the double quotes
            if input[:len(prefix)] == prefix:
                pos = input.find('"', len(prefix))
                if pos >= 0:
                    return (input[len(prefix):pos], True)
                # closing quote not seen yet => fall through and wait
            elif not prefix.startswith(input):
                # Starts with "@cha" but can't become a charset rule any
                # more, so waiting for more data is pointless.
                return ("utf-8", False)

    # If this is the last call and we haven't determined an encoding yet,
    # we default to UTF-8.
    if final:
        return ("utf-8", False)
    return (None, False)  # don't know yet
146
147
def detectencoding_unicode(input, final=False):
    """
    Detect the encoding of the unicode string ``input``, which contains the
    beginning of a CSS file. The encoding is detected from the charset rule
    at the beginning of ``input``. If there is no charset rule, ``"utf-8"``
    will be returned.

    Returns a tuple ``(encoding, explicit)``; ``explicit`` is true only if
    the name has been taken from a complete charset rule.

    If the encoding can't be detected yet, ``None`` is returned as the
    encoding. ``final`` specifies whether more data will be available in
    later calls or not. If ``final`` is true, ``detectencoding_unicode()``
    will never return ``None``.
    """
    prefix = u'@charset "'
    if input.startswith(prefix):
        pos = input.find(u'"', len(prefix))
        if pos >= 0:
            return (input[len(prefix):pos], True)
        # The closing quote hasn't been seen yet: wait for more data, unless
        # this is the last call (handled below).
    elif not prefix.startswith(input):
        # The string definitely doesn't start with a charset rule
        return ("utf-8", False)
    if final:
        # Last call and still nothing usable (incomplete prefix or
        # unterminated encoding name) => default to UTF-8 instead of
        # breaking the "never None when final" contract.
        return ("utf-8", False)
    return (None, False)  # don't know yet
169
170
172 """
173 Replace the name of the encoding in the charset rule at the beginning of
``input`` with ``encoding``. If ``input`` doesn't start with a charset
175 rule, ``input`` will be returned unmodified.
176
177 If the encoding can't be found yet, ``None`` is returned. ``final``
178 specifies whether more data will be available in later calls or not.
179 If ``final`` is true, ``_fixencoding()`` will never return ``None``.
180 """
181 prefix = u'@charset "'
182 if len(input) > len(prefix):
183 if input.startswith(prefix):
184 pos = input.find(u'"', len(prefix))
185 if pos >= 0:
186 if encoding.replace("_", "-").lower() == "utf-8-sig":
187 encoding = u"utf-8"
188 return prefix + encoding + input[pos:]
189
190 else:
191 return input
192 elif not prefix.startswith(input) or final:
193
194 return input
195 if final:
196 return input
197 return None
198
199
def decode(input, errors="strict", encoding=None, force=True):
    """
    Decode the complete byte string ``input`` in one go.

    The encoding is detected from ``input`` when ``encoding`` is ``None`` or
    when ``force`` is false. The detected encoding replaces ``encoding`` if
    none was given, or if it was detected explicitly and ``force`` is false.

    Returns ``(text, consumed)``; the charset rule in ``text`` (if any) is
    rewritten via ``_fixencoding()``. Raises ``ValueError`` if the detected
    encoding is ``"css"``.
    """
    must_detect = encoding is None or not force
    if must_detect:
        # The whole input is available, so detection is final.
        detected, explicit = detectencoding_str(input, True)
        if detected == "css":
            raise ValueError("css not allowed as encoding name")
        if encoding is None or (explicit and not force):
            encoding = detected
    decoder = codecs.getdecoder(encoding)
    text, consumed = decoder(input, errors)
    fixed = _fixencoding(text, unicode(encoding), True)
    return (fixed, consumed)
209
210
211 -def encode(input, errors="strict", encoding=None):
223
224
226
227 i = 0
228 for byte in bytes:
229 i = (i<<8) + ord(byte)
230 return i
231
232
234
235 v = []
236 while i:
237 v.insert(0, chr(i&0xff))
238 i >>= 8
239 return "".join(v)
240
241
242 if hasattr(codecs, "IncrementalDecoder"):
244 - def __init__(self, errors="strict", encoding=None, force=True):
254
256 for part in input:
257 result = self.decode(part, False)
258 if result:
259 yield result
260 result = self.decode("", True)
261 if result:
262 yield result
263
def decode(self, input, final=False):
    """
    Incrementally decode ``input`` and return the decoded text.

    Until the encoding is known the raw input is kept in ``self.buffer``
    and ``u""`` is returned. Once the underlying decoder exists, decoded
    text is held back in ``self.buffer`` until ``_fixencoding()`` has been
    able to rewrite (or rule out) the charset rule; after that all calls are
    passed straight through to the underlying decoder.

    Raises ``ValueError`` if the detected encoding is ``"css"``.
    """
    if self.decoder is None:
        # No underlying decoder yet: prepend what earlier calls kept back
        input = self.buffer + input
        # Detection is needed if no encoding was given or it may be overridden
        if self.encoding is None or not self.force:
            (encoding, explicit) = detectencoding_str(input, final)
            if encoding is None:
                # Not decidable yet: keep the complete input and retry
                self.buffer = input
                return u""
            elif encoding == "css":
                raise ValueError("css not allowed as encoding name")
            if (explicit and not self.force) or self.encoding is None:
                self.encoding = encoding
        # The byte buffer is consumed below; from now on ``self.buffer``
        # only ever holds decoded text.
        self.buffer = ""
        decoder = codecs.getincrementaldecoder(self.encoding)
        self.decoder = decoder(self._errors)
    if self.headerfixed:
        return self.decoder.decode(input, final)
    # The header hasn't been fixed yet: combine the decoded text kept from
    # earlier calls with the newly decoded text.
    output = self.buffer + self.decoder.decode(input, final)
    encoding = self.encoding
    # The name written into the charset rule is "utf-8", not "utf-8-sig"
    if encoding.replace("_", "-").lower() == "utf-8-sig":
        encoding = "utf-8"
    newoutput = _fixencoding(output, unicode(encoding), final)
    if newoutput is None:
        # Charset rule still incomplete: keep the decoded text and retry
        self.buffer = output
        return u""
    self.headerfixed = True
    return newoutput
300
302 codecs.IncrementalDecoder.reset(self)
303 self.decoder = None
304 self.buffer = ""
305 self.headerfixed = False
306
309
311
312 if self.decoder is not None:
313 self.decoder.errors = errors
314 self._errors = errors
315 errors = property(_geterrors, _seterrors)
316
318 if self.decoder is not None:
319 state = (self.encoding, self.buffer, self.headerfixed, True, self.decoder.getstate())
320 else:
321 state = (self.encoding, self.buffer, self.headerfixed, False, None)
322 return ("", _bytes2int(marshal.dumps(state)))
323
def setstate(self, state):
    """
    Restore the decoder from ``state``, a value returned by ``getstate()``.

    ``getstate()`` returns ``("", n)`` where ``n`` is the integer form
    (``_bytes2int``) of the marshalled tuple
    ``(encoding, buffer, headerfixed, has_decoder, decoder_state)``.
    """
    # Undo getstate() in reverse order: integer -> bytes -> tuple
    state = marshal.loads(_int2bytes(state[1]))
    self.encoding = state[0]
    self.buffer = state[1]
    self.headerfixed = state[2]
    # state[3] is a bool flag (never None), so test its truth value
    if state[3]:
        self.decoder = codecs.getincrementaldecoder(self.encoding)(self._errors)
        self.decoder.setstate(state[4])
    else:
        self.decoder = None
334
335
336 if hasattr(codecs, "IncrementalEncoder"):
338 - def __init__(self, errors="strict", encoding=None):
346
348 for part in input:
349 result = self.encode(part, False)
350 if result:
351 yield result
352 result = self.encode(u"", True)
353 if result:
354 yield result
355
def encode(self, input, final=False):
    """
    Incrementally encode the unicode string ``input``.

    Until the underlying encoder exists the text is collected in
    ``self.buffer`` and ``""`` is returned. If an encoding was given, the
    charset rule in the text is rewritten to it via ``_fixencoding()``;
    otherwise the encoding is taken from the charset rule via
    ``detectencoding_unicode()``. Once the encoder exists, all input is
    passed straight through to it.

    Raises ``ValueError`` if the encoding is ``"css"``.
    """
    if self.encoder is None:
        input = self.buffer + input
        if self.encoding is not None:
            # Replace the encoding in the charset rule with the given one
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newinput = _fixencoding(input, unicode(encoding), final)
            if newinput is None:
                # Charset rule incomplete => retry on the next call
                self.buffer = input
                return ""
            input = newinput
        else:
            # Use the encoding from the charset rule
            self.encoding = detectencoding_unicode(input, final)[0]
            if self.encoding is None and final:
                # There is no next call: buffering now would silently drop
                # the text, so fall back to the default encoding.
                self.encoding = "utf-8"
        if self.encoding is not None:
            if self.encoding == "css":
                raise ValueError("css not allowed as encoding name")
            info = codecs.lookup(self.encoding)
            if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                # The charset rule must name "utf-8", not "utf-8-sig"
                input = _fixencoding(input, u"utf-8", True)
            self.encoder = info.incrementalencoder(self._errors)
            self.buffer = u""
        else:
            # Encoding not known yet => keep the text and retry
            self.buffer = input
            return ""
    return self.encoder.encode(input, final)
385
390
393
395
396 if self.encoder is not None:
397 self.encoder.errors = errors
398 self._errors = errors
399 errors = property(_geterrors, _seterrors)
400
402 if self.encoder is not None:
403 state = (self.encoding, self.buffer, True, self.encoder.getstate())
404 else:
405 state = (self.encoding, self.buffer, False, None)
406 return _bytes2int(marshal.dumps(state))
407
def setstate(self, state):
    """
    Restore the encoder from ``state``, a value returned by ``getstate()``.

    ``getstate()`` returns the integer form (``_bytes2int``) of the
    marshalled tuple ``(encoding, buffer, has_encoder, encoder_state)``.
    """
    # Undo getstate() in reverse order: integer -> bytes -> tuple
    state = marshal.loads(_int2bytes(state))
    self.encoding = state[0]
    self.buffer = state[1]
    # state[2] is a bool flag (never None), so test its truth value
    if state[2]:
        self.encoder = codecs.getincrementalencoder(self.encoding)(self._errors)
        # The tuple has four items; the encoder state is the last one
        self.encoder.setstate(state[3])
    else:
        self.encoder = None
417
418
420 - def __init__(self, stream, errors="strict", encoding=None, header=False):
426
def encode(self, input, errors='strict'):
    """
    Encode the unicode string ``input`` for writing to the stream.

    Returns ``(bytes, consumed)``. Until the underlying stream writer
    exists the text is collected in ``self.buffer`` and ``("", 0)`` is
    returned. If an encoding was given, the charset rule is rewritten to it
    via ``_fixencoding()``; otherwise the encoding is taken from the charset
    rule via ``detectencoding_unicode()``.

    Raises ``ValueError`` if the encoding is ``"css"``.
    """
    li = len(input)
    if self.streamwriter is None:
        input = self.buffer + input
        # The reported length includes the text kept from earlier calls
        li = len(input)
        if self.encoding is not None:
            # Replace the encoding in the charset rule with the given one
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newinput = _fixencoding(input, unicode(encoding), False)
            if newinput is None:
                # Charset rule incomplete => retry on the next call
                self.buffer = input
                return ("", 0)
            input = newinput
        else:
            # Use the encoding from the charset rule
            self.encoding = detectencoding_unicode(input, False)[0]
        if self.encoding is not None:
            if self.encoding == "css":
                raise ValueError("css not allowed as encoding name")
            self.streamwriter = codecs.getwriter(self.encoding)(self.stream, self._errors)
            if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                # The charset rule must name "utf-8", not "utf-8-sig"
                input = _fixencoding(input, u"utf-8", True)
            self.buffer = u""
        else:
            # Encoding not known yet => keep the text and retry
            self.buffer = input
            return ("", 0)
    return (self.streamwriter.encode(input, errors)[0], li)
457
460
462
463 if self.streamwriter is not None:
464 self.streamwriter.errors = errors
465 self._errors = errors
466 errors = property(_geterrors, _seterrors)
467
468
470 - def __init__(self, stream, errors="strict", encoding=None, force=True):
476
def decode(self, input, errors='strict'):
    """
    Decode the byte string ``input`` read from the stream.

    Returns ``(text, consumed)``. As long as the encoding can't be detected
    or the charset rule can't be fixed yet, ``(u"", 0)`` is returned so that
    nothing is consumed. Once the header has been fixed, the underlying
    stream reader is stored and handles all further calls.

    Raises ``ValueError`` if the detected encoding is ``"css"``.
    """
    # Fast path: the header has been handled already
    if self.streamreader is not None:
        return self.streamreader.decode(input, errors)

    if self.encoding is None or not self.force:
        detected, explicit = detectencoding_str(input, False)
        if detected is None:
            return (u"", 0)
        if detected == "css":
            raise ValueError("css not allowed as encoding name")
        if self.encoding is None or (explicit and not self.force):
            self.encoding = detected

    reader_factory = codecs.getreader(self.encoding)
    reader = reader_factory(self.stream, self._errors)
    output, consumed = reader.decode(input, errors)

    # The charset rule must name "utf-8", not "utf-8-sig"
    name = self.encoding
    if name.replace("_", "-").lower() == "utf-8-sig":
        name = "utf-8"
    fixed = _fixencoding(output, unicode(name), False)
    if fixed is None:
        # Charset rule incomplete => consume nothing and retry
        return (u"", 0)
    self.streamreader = reader
    return (fixed, consumed)
499
502
504
505 if self.streamreader is not None:
506 self.streamreader.errors = errors
507 self._errors = errors
508 errors = property(_geterrors, _seterrors)
509
510
511 if hasattr(codecs, "CodecInfo"):
512
524 else:
525
527 return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
528
530 prefix = 0
531 if input[:3] == codecs.BOM_UTF8:
532 input = input[3:]
533 prefix = 3
534 (output, consumed) = codecs.utf_8_decode(input, errors, True)
535 return (output, consumed+prefix)
536
544
545 - def encode(self, input, errors='strict'):
548
556
def decode(self, input, errors='strict'):
    """
    Decode ``input``, handling a leading UTF-8 BOM via ``utf8sig_decode()``.

    While fewer than three bytes are available and they could still be the
    start of a BOM, ``(u"", 0)`` is returned so the caller retries with more
    data. After the first real decode, the instance's ``decode`` is replaced
    by ``codecs.utf_8_decode``, so this method is not called again.
    """
    undecided = len(input) < 3 and codecs.BOM_UTF8.startswith(input)
    if undecided:
        # Not enough data to tell whether this is a BOM => try again later
        return (u"", 0)
    self.decode = codecs.utf_8_decode
    result = utf8sig_decode(input, errors)
    return result
564
572
573
574 codecs.register(search_function)
575
576
577
578
580 if not isinstance(exc, UnicodeEncodeError):
581 raise TypeError("don't know how to handle %r" % exc)
582 return (u"".join(u"\\%06x" % ord(c) for c in exc.object[exc.start:exc.end]), exc.end)
583
584 codecs.register_error("cssescape", cssescape)
585