1
2 """Python codec for CSS."""
3 __docformat__ = 'restructuredtext'
4 __author__ = 'Walter Doerwald'
5 __version__ = '$Id: util.py 1114 2008-03-05 13:22:59Z cthedot $'
6
7 import codecs, marshal
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
def detectencoding_str(input, final=False):
    """
    Detect the encoding of the byte string ``input``, which contains the
    beginning of a CSS file. This function returns the detected encoding (or
    ``None`` if it hasn't got enough data), and a flag that indicates whether
    the encoding has been detected explicitly or implicitly. To detect the
    encoding the first few bytes are used (or if ``input`` is ASCII compatible
    and starts with a charset rule the encoding name from the rule). "Explicit"
    detection means that the bytes start with a BOM or a charset rule.

    If the encoding can't be detected yet, ``None`` is returned as the encoding.
    ``final`` specifies whether more data is available in later calls or not.
    If ``final`` is true, ``detectencoding_str()`` will never return ``None``
    as the encoding.
    """
    # One bit per byte pattern ``input`` may still turn out to start with.
    # The ``_AS_`` variants are BOMs, the others are the character ``@``
    # (and ``c``) in the respective UTF encoding without a BOM.
    CANDIDATE_UTF_8_SIG = 1
    CANDIDATE_UTF_16_AS_LE = 2
    CANDIDATE_UTF_16_AS_BE = 4
    CANDIDATE_UTF_16_LE = 8
    CANDIDATE_UTF_16_BE = 16
    CANDIDATE_UTF_32_AS_LE = 32
    CANDIDATE_UTF_32_AS_BE = 64
    CANDIDATE_UTF_32_LE = 128
    CANDIDATE_UTF_32_BE = 256
    CANDIDATE_CHARSET = 512

    candidates = 1023  # all ten bits set: nothing has been ruled out yet

    li = len(input)
    if li >= 1:
        # Rule out every pattern whose first byte doesn't match
        c = input[0]
        if c != "\xef":
            candidates &= ~CANDIDATE_UTF_8_SIG
        if c != "\xff":
            candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_16_AS_LE)
        if c != "\xfe":
            candidates &= ~CANDIDATE_UTF_16_AS_BE
        if c != "@":
            candidates &= ~(CANDIDATE_UTF_32_LE|CANDIDATE_UTF_16_LE|CANDIDATE_CHARSET)
        if c != "\x00":
            candidates &= ~(CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_BE|CANDIDATE_UTF_16_BE)
    if li >= 2:
        # Same for the second byte
        c = input[1]
        if c != "\xbb":
            candidates &= ~CANDIDATE_UTF_8_SIG
        if c != "\xfe":
            candidates &= ~(CANDIDATE_UTF_16_AS_LE|CANDIDATE_UTF_32_AS_LE)
        if c != "\xff":
            candidates &= ~CANDIDATE_UTF_16_AS_BE
        if c != "\x00":
            candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_BE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE)
        if c != "@":
            candidates &= ~CANDIDATE_UTF_16_BE
        if c != "c":
            candidates &= ~CANDIDATE_CHARSET
    if li >= 3:
        # Same for the third byte
        c = input[2]
        if c != "\xbf":
            candidates &= ~CANDIDATE_UTF_8_SIG
        if c != "c":
            candidates &= ~CANDIDATE_UTF_16_LE
        if c != "\x00":
            candidates &= ~(CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE|CANDIDATE_UTF_32_BE)
        if c != "\xfe":
            candidates &= ~CANDIDATE_UTF_32_AS_BE
        if c != "h":
            candidates &= ~CANDIDATE_CHARSET
    if li >= 4:
        # Same for the fourth byte
        c = input[3]
        # "\xff\xfe\x00\x00" is the UTF-32 BOM, not a UTF-16 BOM
        if input[2:4] == "\x00\x00":
            candidates &= ~CANDIDATE_UTF_16_AS_LE
        if c != "\x00":
            candidates &= ~(CANDIDATE_UTF_16_LE|CANDIDATE_UTF_32_AS_LE|CANDIDATE_UTF_32_LE)
        if c != "\xff":
            candidates &= ~CANDIDATE_UTF_32_AS_BE
        if c != "@":
            candidates &= ~CANDIDATE_UTF_32_BE
        if c != "a":
            candidates &= ~CANDIDATE_CHARSET
    if candidates == 0:
        # Nothing matches => implicit default
        return ("utf-8", False)
    if not (candidates & (candidates-1)):
        # Exactly one bit is left; accept it once enough bytes were seen
        if candidates == CANDIDATE_UTF_8_SIG and li >= 3:
            return ("utf-8-sig", True)
        elif candidates == CANDIDATE_UTF_16_AS_LE and li >= 2:
            return ("utf-16", True)
        elif candidates == CANDIDATE_UTF_16_AS_BE and li >= 2:
            return ("utf-16", True)
        elif candidates == CANDIDATE_UTF_16_LE and li >= 4:
            return ("utf-16-le", False)
        elif candidates == CANDIDATE_UTF_16_BE and li >= 2:
            return ("utf-16-be", False)
        elif candidates == CANDIDATE_UTF_32_AS_LE and li >= 4:
            return ("utf-32", True)
        elif candidates == CANDIDATE_UTF_32_AS_BE and li >= 4:
            return ("utf-32", True)
        elif candidates == CANDIDATE_UTF_32_LE and li >= 4:
            return ("utf-32-le", False)
        elif candidates == CANDIDATE_UTF_32_BE and li >= 4:
            return ("utf-32-be", False)
        elif candidates == CANDIDATE_CHARSET and li >= 4:
            prefix = '@charset "'
            if input[:len(prefix)] == prefix:
                pos = input.find('"', len(prefix))
                if pos >= 0:
                    return (input[len(prefix):pos], True)
    # Still undecided: default to UTF-8 on the last call, otherwise ask
    # the caller to retry with more data
    if final:
        return ("utf-8", False)
    return (None, False)
143
144
def detectencoding_unicode(input, final=False):
    """
    Detect the encoding of the unicode string ``input``, which contains the
    beginning of a CSS file. The encoding is detected from the charset rule
    at the beginning of ``input``. If there is no charset rule, ``"utf-8"``
    will be returned.

    The result is a tuple ``(encoding, explicit)``; ``explicit`` is true only
    when the name was taken from a charset rule.

    If the encoding can't be detected yet, ``None`` is returned. ``final``
    specifies whether more data will be available in later calls or not. If
    ``final`` is true, ``detectencoding_unicode()`` will never return ``None``.
    """
    prefix = u'@charset "'
    if input.startswith(prefix):
        pos = input.find(u'"', len(prefix))
        if pos >= 0:
            return (input[len(prefix):pos], True)
        # The closing quote hasn't been seen yet => fall through
    elif not prefix.startswith(input):
        # ``input`` can't turn into a charset rule any more
        return ("utf-8", False)
    if final:
        # Last call: this also covers an unterminated charset rule, for
        # which ``None`` must not be returned
        return ("utf-8", False)
    return (None, False)
166
167
169 """
170 Replace the name of the encoding in the charset rule at the beginning of
171 ``input`` with ``encoding``. If ``input`` doesn't starts with a charset
172 rule, ``input`` will be returned unmodified.
173
174 If the encoding can't be found yet, ``None`` is returned. ``final``
175 specifies whether more data will be available in later calls or not.
176 If ``final`` is true, ``_fixencoding()`` will never return ``None``.
177 """
178 prefix = u'@charset "'
179 if len(input) > len(prefix):
180 if input.startswith(prefix):
181 pos = input.find(u'"', len(prefix))
182 if pos >= 0:
183 if encoding.replace("_", "-").lower() == "utf-8-sig":
184 encoding = u"utf-8"
185 return prefix + encoding + input[pos:]
186
187 else:
188 return input
189 elif not prefix.startswith(input) or final:
190
191 return input
192 if final:
193 return input
194 return None
195
196
def decode(input, errors="strict", encoding=None, force=True):
    """
    Decode the complete byte string ``input`` and return a tuple
    ``(text, consumed)``.

    The encoding is detected from ``input`` when ``encoding`` is ``None`` or
    ``force`` is false. A detected encoding replaces ``encoding`` when none
    was given, or when it was detected explicitly and ``force`` is false.
    The decoded text is passed through ``_fixencoding()`` with the encoding
    that was used. Raises ``ValueError`` if the detected name is ``css``.
    """
    must_detect = encoding is None or not force
    if must_detect:
        detected, explicit = detectencoding_str(input, True)
        if detected == "css":
            raise ValueError("css not allowed as encoding name")
        if encoding is None or (explicit and not force):
            encoding = detected
    decoder = codecs.getdecoder(encoding)
    text, consumed = decoder(input, errors)
    return (_fixencoding(text, unicode(encoding), True), consumed)
206
207
208 -def encode(input, errors="strict", encoding=None):
220
221
223
224 i = 0
225 for byte in bytes:
226 i = (i<<8) + ord(byte)
227 return i
228
229
231
232 v = []
233 while i:
234 v.insert(0, chr(i&0xff))
235 i >>= 8
236 return "".join(v)
237
238
239 if hasattr(codecs, "IncrementalDecoder"):
241 - def __init__(self, errors="strict", encoding=None, force=True):
251
253 for part in input:
254 result = self.decode(part, False)
255 if result:
256 yield result
257 result = self.decode("", True)
258 if result:
259 yield result
260
def decode(self, input, final=False):
    """
    Decode the next chunk ``input`` and return the decoded text.

    Until an encoding is known, input is collected in ``self.buffer`` and an
    empty string is returned. Once the underlying decoder exists, decoded
    text is held back in ``self.buffer`` until ``_fixencoding()`` has
    handled the header; afterwards chunks are passed straight through.
    Raises ``ValueError`` if the detected name is ``css``.
    """
    if self.decoder is None:
        input = self.buffer + input
        # Detection is needed when no encoding was given or it isn't forced
        if self.encoding is None or not self.force:
            detected, explicit = detectencoding_str(input, final)
            if detected is None:
                # Not enough data: retry with everything on the next call
                self.buffer = input
                return u""
            if detected == "css":
                raise ValueError("css not allowed as encoding name")
            if self.encoding is None or (explicit and not self.force):
                self.encoding = detected
        self.buffer = ""
        factory = codecs.getincrementaldecoder(self.encoding)
        self.decoder = factory(self._errors)
    if self.headerfixed:
        return self.decoder.decode(input, final)
    # From here on ``self.buffer`` holds already decoded text
    output = self.buffer + self.decoder.decode(input, final)
    name = self.encoding
    if name.replace("_", "-").lower() == "utf-8-sig":
        name = "utf-8"
    fixed = _fixencoding(output, unicode(name), final)
    if fixed is None:
        # Header still incomplete: keep the decoded text and retry later
        self.buffer = output
        return u""
    self.headerfixed = True
    return fixed
297
def reset(self):
    """
    Reset the decoder: run the base class reset, then drop the underlying
    decoder, empty the buffer and clear the ``headerfixed`` flag.
    """
    codecs.IncrementalDecoder.reset(self)
    self.decoder = None
    self.buffer = ""
    self.headerfixed = False
303
306
308
309 if self.decoder is not None:
310 self.decoder.errors = errors
311 self._errors = errors
312 errors = property(_geterrors, _seterrors)
313
def getstate(self):
    """
    Return the decoder state as a tuple ``("", number)``.

    ``number`` is the marshalled tuple ``(encoding, buffer, headerfixed,
    has_decoder, decoder_state)`` converted to an integer with
    ``_bytes2int()``; ``decoder_state`` is ``None`` when no underlying
    decoder exists yet.
    """
    if self.decoder is not None:
        state = (self.encoding, self.buffer, self.headerfixed, True, self.decoder.getstate())
    else:
        state = (self.encoding, self.buffer, self.headerfixed, False, None)
    return ("", _bytes2int(marshal.dumps(state)))
320
def setstate(self, state):
    """
    Restore the decoder from ``state``, a tuple whose second item is a
    number produced by marshalling ``(encoding, buffer, headerfixed,
    has_decoder, decoder_state)`` and passing it through ``_bytes2int()``.
    """
    # The number has to be turned back into the marshalled string first;
    # only then can it be unmarshalled
    state = marshal.loads(_int2bytes(state[1]))
    self.encoding = state[0]
    self.buffer = state[1]
    self.headerfixed = state[2]
    # ``state[3]`` is a boolean flag, so test its truth value (it is
    # never ``None``)
    if state[3]:
        self.decoder = codecs.getincrementaldecoder(self.encoding)(self._errors)
        self.decoder.setstate(state[4])
    else:
        self.decoder = None
331
332
333 if hasattr(codecs, "IncrementalEncoder"):
335 - def __init__(self, errors="strict", encoding=None):
343
345 for part in input:
346 result = self.encode(part, False)
347 if result:
348 yield result
349 result = self.encode(u"", True)
350 if result:
351 yield result
352
def encode(self, input, final=False):
    """
    Encode the next chunk ``input`` and return the encoded bytes.

    Until the underlying encoder exists, input is collected in
    ``self.buffer`` and an empty string is returned. If an encoding was
    specified, the header is rewritten with ``_fixencoding()``; otherwise the
    encoding is taken from ``detectencoding_unicode()``. Raises
    ``ValueError`` if the encoding name is ``css``.
    """
    if self.encoder is None:
        input = self.buffer + input
        if self.encoding is not None:
            # Replace the name in the header with the specified encoding
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newinput = _fixencoding(input, unicode(encoding), final)
            if newinput is None:
                # Header incomplete => retry on the next call
                self.buffer = input
                return ""
            input = newinput
        else:
            # Take the encoding from the header
            self.encoding = detectencoding_unicode(input, final)[0]
            if self.encoding is None and final:
                # There is no next call, so buffering would lose the
                # data; use the default encoding instead
                self.encoding = "utf-8"
        if self.encoding is not None:
            if self.encoding == "css":
                raise ValueError("css not allowed as encoding name")
            info = codecs.lookup(self.encoding)
            if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                input = _fixencoding(input, u"utf-8", True)
            self.encoder = info.incrementalencoder(self._errors)
            self.buffer = u""
        else:
            self.buffer = input
            return ""
    return self.encoder.encode(input, final)
382
387
390
392
393 if self.encoder is not None:
394 self.encoder.errors = errors
395 self._errors = errors
396 errors = property(_geterrors, _seterrors)
397
def getstate(self):
    """
    Return the encoder state as a number.

    The number is the marshalled tuple ``(encoding, buffer, has_encoder,
    encoder_state)`` converted to an integer with ``_bytes2int()``;
    ``encoder_state`` is ``None`` when no underlying encoder exists yet.
    """
    if self.encoder is not None:
        state = (self.encoding, self.buffer, True, self.encoder.getstate())
    else:
        state = (self.encoding, self.buffer, False, None)
    return _bytes2int(marshal.dumps(state))
404
def setstate(self, state):
    """
    Restore the encoder from ``state``, a number produced by marshalling
    ``(encoding, buffer, has_encoder, encoder_state)`` and passing it
    through ``_bytes2int()``.
    """
    # The number has to be turned back into the marshalled string first;
    # only then can it be unmarshalled
    state = marshal.loads(_int2bytes(state))
    self.encoding = state[0]
    self.buffer = state[1]
    # ``state[2]`` is a boolean flag (never ``None``); the state of the
    # underlying encoder is the fourth and last item
    if state[2]:
        self.encoder = codecs.getincrementalencoder(self.encoding)(self._errors)
        self.encoder.setstate(state[3])
    else:
        self.encoder = None
414
415
417 - def __init__(self, stream, errors="strict", encoding=None, header=False):
423
def encode(self, input, errors='strict'):
    """
    Encode ``input`` and return a tuple ``(bytes, length)``.

    Until the underlying stream writer exists, input is collected in
    ``self.buffer`` and ``("", 0)`` is returned. If an encoding was
    specified, the header is rewritten with ``_fixencoding()``; otherwise the
    encoding is taken from ``detectencoding_unicode()``. Raises
    ``ValueError`` if the encoding name is ``css``.
    """
    li = len(input)
    if self.streamwriter is not None:
        return (self.streamwriter.encode(input, errors)[0], li)

    input = self.buffer + input
    li = len(input)
    if self.encoding is not None:
        # Replace the name in the header with the specified encoding
        name = self.encoding
        if name.replace("_", "-").lower() == "utf-8-sig":
            name = "utf-8"
        fixed = _fixencoding(input, unicode(name), False)
        if fixed is None:
            # Header incomplete => retry on the next call
            self.buffer = input
            return ("", 0)
        input = fixed
    else:
        # Take the encoding from the header
        self.encoding = detectencoding_unicode(input, False)[0]

    if self.encoding is None:
        self.buffer = input
        return ("", 0)
    if self.encoding == "css":
        raise ValueError("css not allowed as encoding name")
    self.streamwriter = codecs.getwriter(self.encoding)(self.stream, self._errors)
    if self.encoding.replace("_", "-").lower() == "utf-8-sig":
        input = _fixencoding(input, u"utf-8", True)
    self.buffer = u""
    return (self.streamwriter.encode(input, errors)[0], li)
454
457
459
460 if self.streamwriter is not None:
461 self.streamwriter.errors = errors
462 self._errors = errors
463 errors = property(_geterrors, _seterrors)
464
465
467 - def __init__(self, stream, errors="strict", encoding=None, force=True):
473
def decode(self, input, errors='strict'):
    """
    Decode ``input`` and return a tuple ``(text, consumed)``.

    While no stream reader has been settled on, the encoding is detected
    (unless one is forced), a fresh reader decodes ``input`` and the result
    is passed through ``_fixencoding()``. ``(u"", 0)`` is returned as long
    as the encoding or the header can't be determined. Raises ``ValueError``
    if the detected name is ``css``.
    """
    if self.streamreader is not None:
        return self.streamreader.decode(input, errors)

    if self.encoding is None or not self.force:
        detected, explicit = detectencoding_str(input, False)
        if detected is None:
            # Not enough data to determine the encoding => no output yet
            return (u"", 0)
        if detected == "css":
            raise ValueError("css not allowed as encoding name")
        if self.encoding is None or (explicit and not self.force):
            self.encoding = detected
    reader = codecs.getreader(self.encoding)(self.stream, self._errors)
    output, consumed = reader.decode(input, errors)
    name = self.encoding
    if name.replace("_", "-").lower() == "utf-8-sig":
        name = "utf-8"
    fixed = _fixencoding(output, unicode(name), False)
    if fixed is None:
        # Header incomplete: the reader is discarded, and a new one is
        # created on the next call
        return (u"", 0)
    self.streamreader = reader
    return (fixed, consumed)
496
499
501
502 if self.streamreader is not None:
503 self.streamreader.errors = errors
504 self._errors = errors
505 errors = property(_geterrors, _seterrors)
506
507
508 if hasattr(codecs, "CodecInfo"):
509
521 else:
522
524 return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
525
def utf8sig_decode(input, errors='strict'):
    """
    Decode the UTF-8 byte string ``input``, skipping a leading UTF-8 BOM.

    Returns a tuple ``(text, consumed)``; ``consumed`` includes the three
    bytes of the BOM if there was one.
    """
    prefix = 0
    if input[:3] == codecs.BOM_UTF8:
        input = input[3:]
        prefix = 3
    (output, consumed) = codecs.utf_8_decode(input, errors, True)
    return (output, consumed+prefix)
533
541
542 - def encode(self, input, errors='strict'):
545
553
def decode(self, input, errors='strict'):
    """
    Decode ``input``, skipping a leading UTF-8 BOM on the first real call.

    While ``input`` is shorter than a BOM but could still be the start of
    one, ``(u"", 0)`` is returned. Otherwise this method replaces itself on
    the instance with ``codecs.utf_8_decode`` and hands ``input`` to
    ``utf8sig_decode()``.
    """
    undecided = len(input) < 3 and codecs.BOM_UTF8.startswith(input)
    if undecided:
        # Not enough data to tell whether this is a BOM => retry next call
        return (u"", 0)
    # Later calls go directly to the plain UTF-8 decoder
    self.decode = codecs.utf_8_decode
    return utf8sig_decode(input, errors)
561
569
570
571 codecs.register(search_function)
572
573
574
575
def cssescape(exc):
    """
    Error handler registered under the name ``cssescape``.

    Replaces every character that couldn't be encoded with a backslash
    followed by its code point as six hex digits, and returns that
    replacement together with the position at which encoding continues.
    Raises ``TypeError`` for anything but a ``UnicodeEncodeError``.
    """
    if not isinstance(exc, UnicodeEncodeError):
        raise TypeError("don't know how to handle %r" % exc)
    return (u"".join(u"\\%06x" % ord(c) for c in exc.object[exc.start:exc.end]), exc.end)
580
581 codecs.register_error("cssescape", cssescape)
582