Coverage for common/bugfix_flashtext.py: 3%
93 statements
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
1"""
2crate_anon/common/bugfix_flashtext.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26THIS FILE, however, is by another author: from
27https://github.com/vi3k6i5/flashtext/issues/44, by Ihor Bobak; added to
28Flashtext code; licensed under the MIT License as per
29https://github.com/vi3k6i5/flashtext/blob/master/LICENSE.
31Rationale:
There is currently a bug in the method :meth:`replace_keywords` in the external
module ``flashtext`` in which certain characters provoke an 'index out of
range' error when working in case-insensitive mode. This is because some
non-ASCII characters become longer when lower-cased (for example, ``İ``,
U+0130, lower-cases to two code points), so indexes into the lower-cased text
no longer line up with the original text. Thanks to Ihor Bobak for this bugfix.
39Edits for PyCharm linter.
40"""
42from flashtext import KeywordProcessor
45# noinspection PyAbstractClass
class KeywordProcessorFixed(KeywordProcessor):
    """
    Subclass of :class:`flashtext.KeywordProcessor` that overrides
    :meth:`replace_keywords` so that case-insensitive replacement does not
    fail on characters whose lower-case form is longer than the original.
    """

    def replace_keywords(self, a_sentence: str) -> str:
        """
        Return ``a_sentence`` with every keyword found in the trie replaced.

        The method walks the (optionally lower-cased) text one character at a
        time, following ``self.keyword_trie_dict``. Whenever a node carrying
        ``self._keyword`` is reached at a word boundary, the value stored
        there is written to the output in place of the matched text; the
        longest match starting at a given position wins. Text that is not
        part of a replaced keyword is copied from the ORIGINAL string, so it
        keeps its case.

        Args:
            a_sentence: text to process.

        Returns:
            The text with keywords replaced. A falsy input (empty string or
            ``None``) is returned unchanged.
        """
        if not a_sentence:
            # if sentence is empty or none just return the same.
            return a_sentence
        new_sentence = []

        if not self.case_sensitive:
            sentence = a_sentence.lower()
            # by Ihor Bobak:
            # Some letters expand when lower() is called, so we preprocess
            # a_sentence to find those letters that lower-case to 2 or more
            # symbols. Imagine that X is lowered as yz and the rest are
            # lowered as-is (A->a, B->b, C->c). Then for the string ABCXABC
            # we want
            #     ['A', 'B', 'C', 'X', '', 'A', 'B', 'C']
            # which lines up with
            #     ['a', 'b', 'c', 'y', 'z', 'a', 'b', 'c']
            # so that the code below, which runs over the indexes of the
            # lowered string, can "glue" the original text back together
            # using THE SAME indexes.
            orig_sentence = []
            for orig_char in a_sentence:
                # Usually one slot holding the character itself; if X->yz,
                # the slot for z holds ''.
                orig_sentence.extend(
                    orig_char if j == 0 else ""
                    for j in range(len(orig_char.lower()))
                )
        else:
            sentence = a_sentence
            orig_sentence = a_sentence

        current_word = ""
        current_dict = self.keyword_trie_dict
        current_white_space = ""
        sequence_end_pos = 0
        idx = 0
        sentence_len = len(sentence)
        while idx < sentence_len:
            char = sentence[idx]
            current_word += orig_sentence[idx]
            # when we reach whitespace (any char that is not a word char)
            if char not in self.non_word_boundaries:
                # Take the boundary character from the original text, not
                # from the lower-cased copy, so that text outside a replaced
                # keyword keeps its case. (This is '' when idx is the extra
                # slot of a character that expanded on lower-casing.)
                current_white_space = orig_sentence[idx]
                # if end is present in current_dict
                if self._keyword in current_dict or char in current_dict:
                    # update longest sequence found
                    longest_sequence_found = None
                    is_longer_seq_found = False
                    if self._keyword in current_dict:
                        longest_sequence_found = current_dict[self._keyword]
                        sequence_end_pos = idx

                    # re look for longest_sequence from this position
                    if char in current_dict:
                        current_dict_continued = current_dict[char]
                        current_word_continued = current_word
                        idy = idx + 1
                        while idy < sentence_len:
                            inner_char = sentence[idy]
                            current_word_continued += orig_sentence[idy]
                            if (
                                inner_char not in self.non_word_boundaries
                                and self._keyword in current_dict_continued
                            ):
                                # update longest sequence found; again keep
                                # the boundary character in its original case
                                current_white_space = orig_sentence[idy]
                                longest_sequence_found = (
                                    current_dict_continued[self._keyword]
                                )
                                sequence_end_pos = idy
                                is_longer_seq_found = True
                            if inner_char in current_dict_continued:
                                current_dict_continued = (
                                    current_dict_continued[inner_char]
                                )
                            else:
                                break
                            idy += 1
                        else:
                            # end of sentence reached (loop ended without
                            # a break).
                            if self._keyword in current_dict_continued:
                                # update longest sequence found
                                current_white_space = ""
                                longest_sequence_found = (
                                    current_dict_continued[self._keyword]
                                )
                                sequence_end_pos = idy
                                is_longer_seq_found = True
                        if is_longer_seq_found:
                            # jump past the longer match
                            idx = sequence_end_pos
                            current_word = current_word_continued
                    current_dict = self.keyword_trie_dict
                    if longest_sequence_found:
                        new_sentence.append(longest_sequence_found)
                        new_sentence.append(current_white_space)
                    else:
                        new_sentence.append(current_word)
                    current_word = ""
                    current_white_space = ""
                else:
                    # we reset current_dict
                    current_dict = self.keyword_trie_dict
                    new_sentence.append(current_word)
                    current_word = ""
                    current_white_space = ""
            elif char in current_dict:
                # we can continue from this char
                current_dict = current_dict[char]
            else:
                # we reset current_dict
                current_dict = self.keyword_trie_dict
                # skip to end of word, copying the original text verbatim
                idy = idx + 1
                while idy < sentence_len:
                    char = sentence[idy]
                    current_word += orig_sentence[idy]
                    if char not in self.non_word_boundaries:
                        break
                    idy += 1
                idx = idy
                new_sentence.append(current_word)
                current_word = ""
                current_white_space = ""
            # if we are end of sentence and have a sequence discovered
            if idx + 1 >= sentence_len:
                if self._keyword in current_dict:
                    sequence_found = current_dict[self._keyword]
                    new_sentence.append(sequence_found)
                else:
                    new_sentence.append(current_word)
            idx += 1
        return "".join(new_sentence)