Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

28 

29from collections import namedtuple 

30 

31from .charsetprober import CharSetProber 

32from .enums import CharacterCategory, ProbingState, SequenceLikelihood 

33 

34 

# Immutable description of one single-byte charset/language model, as consumed
# by SingleByteCharSetProber.
SingleByteCharSetModel = namedtuple(
    'SingleByteCharSetModel',
    (
        # Name reported by the prober when it has no auxiliary name prober.
        'charset_name',
        # Language reported by the prober when it has no auxiliary name prober.
        'language',
        # Mapping queried with ``.get(byte, default)`` to turn each input byte
        # into its "order".
        'char_to_order_map',
        # Table indexed by two orders (``model[a][b]``); each entry is the
        # sequence-likelihood category counted for that pair.
        'language_model',
        # Divisor applied to the observed positive-sequence ratio when the
        # confidence is computed.
        'typical_positive_ratio',
        # When false, the prober passes its input through
        # ``filter_international_words`` before analysing it.
        'keep_ascii_letters',
        # Not read by the prober code in this file.
        'alphabet',
    ),
)

43 

44 

class SingleByteCharSetProber(CharSetProber):
    """Prober that scores input against one single-byte charset model.

    Every byte fed in is translated to an "order" through the model's
    ``char_to_order_map``.  Pairs of consecutive orders that are both below
    ``SAMPLE_SIZE`` are looked up in the model's ``language_model`` and the
    resulting category is tallied; ``get_confidence`` turns those tallies
    into a score.
    """

    # Only orders strictly below this value take part in pair statistics.
    SAMPLE_SIZE = 64
    # Number of counted pairs that must be exceeded before feed() attempts a
    # shortcut decision.
    SB_ENOUGH_REL_THRESHOLD = 1024  # 0.25 * SAMPLE_SIZE^2
    # Confidence above which feed() switches the state to FOUND_IT.
    POSITIVE_SHORTCUT_THRESHOLD = 0.95
    # Confidence below which feed() switches the state to NOT_ME.
    NEGATIVE_SHORTCUT_THRESHOLD = 0.05

    def __init__(self, model, reversed=False, name_prober=None):
        """Create a prober for *model*.

        :param model: object exposing the ``SingleByteCharSetModel`` fields
            read by this class.
        :param reversed: when true, the two indices of every
            ``language_model`` lookup are swapped.
        :param name_prober: optional prober whose ``charset_name`` and
            ``language`` are reported instead of the model's own.
        """
        super(SingleByteCharSetProber, self).__init__()
        self._model = model
        # TRUE if we need to reverse every pair in the model lookup
        self._reversed = reversed
        # Optional auxiliary prober for name decision
        self._name_prober = name_prober
        # Declare the counters here; reset() gives them their real values.
        self._last_order = None
        self._seq_counters = None
        self._total_seqs = None
        self._total_char = None
        self._freq_char = None
        self.reset()

    def reset(self):
        """Reset the base prober and clear every statistic gathered so far."""
        super(SingleByteCharSetProber, self).reset()
        # Order of the previously seen character; 255 is >= SAMPLE_SIZE, so
        # the first character fed never completes a pair.
        self._last_order = 255
        # One counter per sequence-likelihood category.
        self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
        # Number of pairs looked up in the language model.
        self._total_seqs = 0
        # Number of characters whose order is below CharacterCategory.CONTROL.
        self._total_char = 0
        # characters that fall in our sampling range
        self._freq_char = 0

    @property
    def charset_name(self):
        """Charset name from the name prober if one is set, else the model."""
        if self._name_prober:
            return self._name_prober.charset_name
        return self._model.charset_name

    @property
    def language(self):
        """Language from the name prober if one is set, else the model."""
        if self._name_prober:
            return self._name_prober.language
        return self._model.language

    def feed(self, byte_str):
        """Fold *byte_str* into the statistics and return the probing state.

        Once more than ``SB_ENOUGH_REL_THRESHOLD`` pairs have been counted
        while the prober is still DETECTING, the current confidence is
        compared with the two shortcut thresholds and the state may move to
        FOUND_IT or NOT_ME.
        """
        model = self._model
        # TODO: Make filter_international_words keep things in self.alphabet
        if not model.keep_ascii_letters:
            byte_str = self.filter_international_words(byte_str)
        if not byte_str:
            return self.state

        order_of = model.char_to_order_map
        pair_categories = model.language_model
        sample_size = self.SAMPLE_SIZE
        for byte in byte_str:
            order = order_of.get(byte, CharacterCategory.UNDEFINED)
            # XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
            #      CharacterCategory.SYMBOL is actually 253, so we use CONTROL
            #      to make it closer to the original intent. The only
            #      difference is whether or not we count digits and control
            #      characters for _total_char purposes.
            if order < CharacterCategory.CONTROL:
                self._total_char += 1
            # TODO: Follow uchardet's lead and discount confidence for frequent
            #       control characters.
            #       See https://github.com/BYVoid/uchardet/commit/55b4f23971db61
            if order < sample_size:
                self._freq_char += 1
                previous = self._last_order
                if previous < sample_size:
                    # Both halves of the pair are in the sampling range.
                    self._total_seqs += 1
                    if self._reversed:
                        category = pair_categories[order][previous]
                    else:
                        category = pair_categories[previous][order]
                    self._seq_counters[category] += 1
            self._last_order = order

        charset_name = model.charset_name
        if (self.state == ProbingState.DETECTING
                and self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD):
            confidence = self.get_confidence()
            if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
                self.logger.debug(
                    '%s confidence = %s, we have a winner',
                    charset_name, confidence)
                self._state = ProbingState.FOUND_IT
            elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
                self.logger.debug(
                    '%s confidence = %s, below negative shortcut '
                    'threshhold %s',
                    charset_name, confidence,
                    self.NEGATIVE_SHORTCUT_THRESHOLD)
                self._state = ProbingState.NOT_ME

        return self.state

    def get_confidence(self):
        """Return the current confidence score.

        With no counted pairs the result is 0.01.  Otherwise it is the share
        of POSITIVE pairs divided by the model's ``typical_positive_ratio``,
        scaled by ``_freq_char / _total_char``; a result of 1.0 or more is
        reported as 0.99.
        """
        if not self._total_seqs > 0:
            return 0.01

        positive = self._seq_counters[SequenceLikelihood.POSITIVE]
        score = (1.0 * positive) / self._total_seqs
        score = score / self._model.typical_positive_ratio
        score = score * self._freq_char / self._total_char
        return 0.99 if score >= 1.0 else score