######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
"""
Module containing the UniversalDetector class, which is the primary class a
user of ``chardet`` should use.

:author: Mark Pilgrim (initial port to Python)
:author: Shy Shalom (original C code)
:author: Dan Blanchard (major refactoring for 3.0)
:author: Ian Cordasco
"""


import codecs
import logging
import re

from .charsetgroupprober import CharSetGroupProber
from .enums import InputState, LanguageFilter, ProbingState
from .escprober import EscCharSetProber
from .latin1prober import Latin1Prober
from .mbcsgroupprober import MBCSGroupProber
from .sbcsgroupprober import SBCSGroupProber


class UniversalDetector(object):
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can
    simply run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    # A prober's confidence must exceed this for its answer to be used
    MINIMUM_THRESHOLD = 0.20
    # Any byte outside the 7-bit ASCII range
    HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
    # The ESC character (ISO-2022) or '~{' (HZ) escape sequences
    ESC_DETECTOR = re.compile(b'(\033|~{)')
    # Bytes where Windows code pages place printable characters but the
    # ISO-8859 encodings only have control codes
    WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
    # ISO-8859 encodings mapped to their corresponding Windows code pages
    ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
                   'iso-8859-2': 'Windows-1250',
                   'iso-8859-5': 'Windows-1251',
                   'iso-8859-6': 'Windows-1256',
                   'iso-8859-7': 'Windows-1253',
                   'iso-8859-8': 'Windows-1255',
                   'iso-8859-9': 'Windows-1254',
                   'iso-8859-13': 'Windows-1257'}

    def __init__(self, lang_filter=LanguageFilter.ALL):
        self._esc_charset_prober = None
        self._charset_probers = []
        self.result = None
        self.done = None
        self._got_data = None
        self._input_state = None
        self._last_char = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = None
        self.reset()

    def reset(self):
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states. This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
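
        For example, a single detector can be reused across documents
        (an illustrative sketch; ``first_bytes`` and ``second_bytes`` stand
        in for the contents of two documents):

        .. code::

                detector = UniversalDetector()
                for doc in (first_bytes, second_bytes):
                    detector.reset()
                    detector.feed(doc)
                    detector.close()
                    print(detector.result)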

99 """ 

100 self.result = {'encoding': None, 'confidence': 0.0, 'language': None} 

101 self.done = False 

102 self._got_data = False 

103 self._has_win_bytes = False 

104 self._input_state = InputState.PURE_ASCII 

105 self._last_char = b'' 

106 if self._esc_charset_prober: 

107 self._esc_charset_prober.reset() 

108 for prober in self._charset_probers: 

109 prober.reset() 

    def feed(self, byte_str):
        """
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).

        .. note::
            You should always call ``close`` when you're done feeding in your
            document if ``done`` is not already ``True``.
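
        For example, a document can be fed in arbitrary chunks, stopping
        early once the detector is confident (an illustrative sketch;
        ``some_file`` stands in for any binary file object):

        .. code::

                detector = UniversalDetector()
                for chunk in iter(lambda: some_file.read(4096), b''):
                    detector.feed(chunk)
                    if detector.done:
                        break
                detector.close()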

124 """ 

125 if self.done: 

126 return 

127 

128 if not len(byte_str): 

129 return 

130 

131 if not isinstance(byte_str, bytearray): 

132 byte_str = bytearray(byte_str) 

133 

134 # First check for known BOMs, since these are guaranteed to be correct 

135 if not self._got_data: 

136 # If the data starts with BOM, we know it is UTF 

137 if byte_str.startswith(codecs.BOM_UTF8): 

138 # EF BB BF UTF-8 with BOM 

139 self.result = {'encoding': "UTF-8-SIG", 

140 'confidence': 1.0, 

141 'language': ''} 

142 elif byte_str.startswith((codecs.BOM_UTF32_LE, 

143 codecs.BOM_UTF32_BE)): 

144 # FF FE 00 00 UTF-32, little-endian BOM 

145 # 00 00 FE FF UTF-32, big-endian BOM 

146 self.result = {'encoding': "UTF-32", 

147 'confidence': 1.0, 

148 'language': ''} 

149 elif byte_str.startswith(b'\xFE\xFF\x00\x00'): 

150 # FE FF 00 00 UCS-4, unusual octet order BOM (3412) 

151 self.result = {'encoding': "X-ISO-10646-UCS-4-3412", 

152 'confidence': 1.0, 

153 'language': ''} 

154 elif byte_str.startswith(b'\x00\x00\xFF\xFE'): 

155 # 00 00 FF FE UCS-4, unusual octet order BOM (2143) 

156 self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 

157 'confidence': 1.0, 

158 'language': ''} 

159 elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)): 

160 # FF FE UTF-16, little endian BOM 

161 # FE FF UTF-16, big endian BOM 

162 self.result = {'encoding': "UTF-16", 

163 'confidence': 1.0, 

164 'language': ''} 

165 

166 self._got_data = True 

167 if self.result['encoding'] is not None: 

168 self.done = True 

169 return 

        # If none of those matched and we've only seen ASCII so far, check
        # for high bytes and escape sequences
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
                self._input_state = InputState.HIGH_BYTE
            # Search the last byte of the previous chunk too, in case an
            # escape sequence was split across chunk boundaries
            elif self.ESC_DETECTOR.search(self._last_char + byte_str):
                self._input_state = InputState.ESC_ASCII

        self._last_char = byte_str[-1:]

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {'encoding':
                               self._esc_charset_prober.charset_name,
                               'confidence':
                               self._esc_charset_prober.get_confidence(),
                               'language':
                               self._esc_charset_prober.language}
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left. The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {'encoding': prober.charset_name,
                                   'confidence': prober.get_confidence(),
                                   'language': prober.language}
                    self.done = True
                    break
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True

    def close(self):
        """
        Stop analyzing the current document and come up with a final
        prediction.

        :returns: The ``result`` attribute, a ``dict`` with the keys
                  ``encoding``, ``confidence``, and ``language``.
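
        For example, for a UTF-8 document the returned ``dict`` might look
        like this (illustrative values, not guaranteed output):

        .. code::

                {'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}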

227 """ 

228 # Don't bother with checks if we're already done 

229 if self.done: 

230 return self.result 

231 self.done = True 

232 

233 if not self._got_data: 

234 self.logger.debug('no data received!') 

235 

236 # Default to ASCII if it is all we've seen so far 

237 elif self._input_state == InputState.PURE_ASCII: 

238 self.result = {'encoding': 'ascii', 

239 'confidence': 1.0, 

240 'language': ''} 

241 

242 # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD 

243 elif self._input_state == InputState.HIGH_BYTE: 

244 prober_confidence = None 

245 max_prober_confidence = 0.0 

246 max_prober = None 

247 for prober in self._charset_probers: 

248 if not prober: 

249 continue 

250 prober_confidence = prober.get_confidence() 

251 if prober_confidence > max_prober_confidence: 

252 max_prober_confidence = prober_confidence 

253 max_prober = prober 

254 if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD): 

255 charset_name = max_prober.charset_name 

256 lower_charset_name = max_prober.charset_name.lower() 

257 confidence = max_prober.get_confidence() 

258 # Use Windows encoding name instead of ISO-8859 if we saw any 

259 # extra Windows-specific bytes 

260 if lower_charset_name.startswith('iso-8859'): 

261 if self._has_win_bytes: 

262 charset_name = self.ISO_WIN_MAP.get(lower_charset_name, 

263 charset_name) 

264 self.result = {'encoding': charset_name, 

265 'confidence': confidence, 

266 'language': max_prober.language} 

        # Log all prober confidences if none met MINIMUM_THRESHOLD
        if self.logger.getEffectiveLevel() <= logging.DEBUG:
            if self.result['encoding'] is None:
                self.logger.debug('no probers hit minimum threshold')
                for group_prober in self._charset_probers:
                    if not group_prober:
                        continue
                    if isinstance(group_prober, CharSetGroupProber):
                        for prober in group_prober.probers:
                            self.logger.debug('%s %s confidence = %s',
                                              prober.charset_name,
                                              prober.language,
                                              prober.get_confidence())
                    else:
                        self.logger.debug('%s %s confidence = %s',
                                          group_prober.charset_name,
                                          group_prober.language,
                                          group_prober.get_confidence())
        return self.result
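

# A minimal usage sketch: feed a file in chunks and stop as soon as the
# detector is confident. The path 'mystery.txt' is a hypothetical stand-in;
# run this as a module (``python -m chardet.universaldetector``) so the
# relative imports above resolve.
if __name__ == '__main__':
    detector = UniversalDetector()
    with open('mystery.txt', 'rb') as handle:
        for chunk in iter(lambda: handle.read(4096), b''):
            detector.feed(chunk)
            if detector.done:
                break
    print(detector.close())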