Coverage for common/bugfix_flashtext.py: 3%

93 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2026-02-05 06:46 -0600

1""" 

2crate_anon/common/bugfix_flashtext.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26THIS FILE, however, is by another author: from 

27https://github.com/vi3k6i5/flashtext/issues/44, by Ihor Bobak; added to 

28Flashtext code; licensed under the MIT License as per 

29https://github.com/vi3k6i5/flashtext/blob/master/LICENSE. 

30 

31Rationale: 

32 

33There is currently a bug in the method :meth:`replace_keywords` in the external 

34module ``flashtext`` in which certain characters provoke an 'index out of 

35range' error when working in case-insensitive mode. This is because some 

36non-ascii characters are larger in their lower-case form. Thanks to Ihor Bobak 

37for this bugfix. 

38 

39Edits for PyCharm linter. 

40""" 

41 

42from flashtext import KeywordProcessor 

43 

44 

45# noinspection PyAbstractClass 

class KeywordProcessorFixed(KeywordProcessor):
    """
    Variant of :class:`flashtext.KeywordProcessor` whose
    :meth:`replace_keywords` keeps the original and the lower-cased text
    index-aligned in case-insensitive mode.

    Bugfix by Ihor Bobak; see
    https://github.com/vi3k6i5/flashtext/issues/44.
    """

    # noinspection PyUnusedLocal
    def replace_keywords(self, a_sentence: str) -> str:
        """
        Return a copy of ``a_sentence`` in which every keyword found in
        ``self.keyword_trie_dict`` is replaced by the value stored for it
        under ``self._keyword``.

        Args:
            a_sentence:
                Text to process. A falsy value (``""`` or ``None``) is
                returned unchanged.

        Returns:
            The text with keywords replaced. Text that is not part of a
            matched keyword is copied from the original (not lower-cased)
            input.
        """
        if not a_sentence:
            # if sentence is empty or none just return the same.
            return a_sentence
        new_sentence = []

        if not self.case_sensitive:
            sentence = a_sentence.lower()
            # by Ihor Bobak:
            # Some letters expand in size when lower() is called, so we
            # preprocess a_sentence to find those letters that lower() to
            # 2 or more symbols.
            # Imagine that X is lowered as yz and the rest are lowered as
            # is (A->a, B->b, C->c). Then for the string ABCXABC we want
            #   ['A', 'B', 'C', 'X', '', 'A', 'B', 'C']
            # which corresponds index-by-index to
            #   ['a', 'b', 'c', 'y', 'z', 'a', 'b', 'c']
            # because the code below walks the indexes of the lowered
            # string and "glues" the original text together using THE
            # SAME indexes.
            orig_sentence = []
            for orig_char in a_sentence:
                for j in range(len(orig_char.lower())):
                    # In most cases this runs for one iteration and adds
                    # the char itself; if X->yz, then '' is added for z.
                    orig_sentence.append(orig_char if j == 0 else "")
        else:
            sentence = a_sentence
            orig_sentence = a_sentence

        current_word = ""
        current_dict = self.keyword_trie_dict
        current_white_space = ""
        sequence_end_pos = 0
        idx = 0
        sentence_len = len(sentence)
        while idx < sentence_len:
            char = sentence[idx]
            current_word += orig_sentence[idx]
            # when we reach whitespace
            if char not in self.non_word_boundaries:
                current_white_space = char
                # if end is present in current_dict
                if self._keyword in current_dict or char in current_dict:
                    # update longest sequence found
                    longest_sequence_found = None
                    is_longer_seq_found = False
                    if self._keyword in current_dict:
                        longest_sequence_found = current_dict[self._keyword]
                        sequence_end_pos = idx

                    # re look for longest_sequence from this position
                    if char in current_dict:
                        current_dict_continued = current_dict[char]
                        current_word_continued = current_word
                        idy = idx + 1
                        while idy < sentence_len:
                            inner_char = sentence[idy]
                            current_word_continued += orig_sentence[idy]
                            if (
                                inner_char not in self.non_word_boundaries
                                and self._keyword in current_dict_continued
                            ):
                                # update longest sequence found
                                current_white_space = inner_char
                                longest_sequence_found = (
                                    current_dict_continued[self._keyword]
                                )
                                sequence_end_pos = idy
                                is_longer_seq_found = True
                            if inner_char in current_dict_continued:
                                current_dict_continued = (
                                    current_dict_continued[inner_char]
                                )
                            else:
                                break
                            idy += 1
                        else:
                            # end of sentence reached.
                            if self._keyword in current_dict_continued:
                                # update longest sequence found
                                current_white_space = ""
                                longest_sequence_found = (
                                    current_dict_continued[self._keyword]
                                )
                                sequence_end_pos = idy
                                is_longer_seq_found = True
                        if is_longer_seq_found:
                            # jump past the longer match and adopt the
                            # text accumulated while looking ahead
                            idx = sequence_end_pos
                            current_word = current_word_continued
                    current_dict = self.keyword_trie_dict
                    if longest_sequence_found:
                        new_sentence.append(longest_sequence_found)
                        new_sentence.append(current_white_space)
                        current_word = ""
                        current_white_space = ""
                    else:
                        new_sentence.append(current_word)
                        current_word = ""
                        current_white_space = ""
                else:
                    # we reset current_dict
                    current_dict = self.keyword_trie_dict
                    new_sentence.append(current_word)
                    current_word = ""
                    current_white_space = ""
            elif char in current_dict:
                # we can continue from this char
                current_dict = current_dict[char]
            else:
                # we reset current_dict
                current_dict = self.keyword_trie_dict
                # skip to end of word
                idy = idx + 1
                while idy < sentence_len:
                    char = sentence[idy]
                    current_word += orig_sentence[idy]
                    if char not in self.non_word_boundaries:
                        break
                    idy += 1
                idx = idy
                new_sentence.append(current_word)
                current_word = ""
                current_white_space = ""
            # if we are end of sentence and have a sequence discovered
            if idx + 1 >= sentence_len:
                if self._keyword in current_dict:
                    sequence_found = current_dict[self._keyword]
                    new_sentence.append(sequence_found)
                else:
                    new_sentence.append(current_word)
            idx += 1
        return "".join(new_sentence)