Coverage for anonymise/tests/scrub_tests.py: 100%

127 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1""" 

2crate_anon/anonymise/tests/scrub_tests.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26Unit testing. 

27 

28""" 

29 

30# ============================================================================= 

31# Imports 

32# ============================================================================= 

33 

34import re 

35import logging 

36import os 

37from tempfile import TemporaryDirectory 

38from typing import List 

39from unittest import TestCase 

40 

41from cardinal_pythonlib.hash import HmacMD5Hasher 

42 

43from crate_anon.anonymise.constants import ScrubMethod 

44from crate_anon.anonymise.scrub import ( 

45 NonspecificScrubber, 

46 PersonalizedScrubber, 

47 WordList, 

48) 

49from crate_anon.common.bugfix_flashtext import KeywordProcessorFixed 

50 

51from faker import Faker 

52 

53log = logging.getLogger(__name__) 

54 

55 

56# ============================================================================= 

57# Constants 

58# ============================================================================= 

59 

60TEST_KEY = "hello" 

61PATIENT_REPLACEMENT = "[XXX]" 

62THIRD_PARTY_REPLACEMENT = "[YYY]" 

63 

64 

65# ============================================================================= 

66# Test hashing 

67# ============================================================================= 

68 

69 

class HashTests(TestCase):
    """
    Checks on the HMAC-MD5 hasher used by the scrubbers.
    """

    def test_str_int_hash_equivalent(self) -> None:
        """
        An integer and the string form of that same integer must hash to the
        same value.
        """
        number = 1234567
        hasher = HmacMD5Hasher(TEST_KEY)
        hash_of_int = hasher.hash(number)
        hash_of_str = hasher.hash(str(number))
        self.assertEqual(
            hash_of_int,
            hash_of_str,
            "Hasher providing different answer for str and int",
        )

84 

85 

86# ============================================================================= 

87# Test WordList 

88# ============================================================================= 

89 

90 

class WordListTests(TestCase):
    """
    Tests of FlashText keyword replacement and of WordList scrubbing (with
    ``regex_method`` both off and on).
    """

    def setUp(self) -> None:
        """
        Create a per-test temporary directory (for the denylist file) and
        arrange for it to be removed when the test finishes.
        """
        self.tempdir = TemporaryDirectory()
        # Explicit cleanup: otherwise the directory is only removed when the
        # TemporaryDirectory object happens to be finalized.
        self.addCleanup(self.tempdir.cleanup)
        self.maxDiff = None  # see full differences upon failure

    def _test_flashtext_word_boundaries(self, target: str) -> None:
        """
        Check that a FlashText keyword is replaced when it stands alone as a
        word (or phrase), but not when it is embedded inside a longer word.

        Args:
            target: the keyword (or multi-word phrase) to register.
        """
        anon_text = PATIENT_REPLACEMENT
        ft = KeywordProcessorFixed(case_sensitive=False)
        ft.add_keyword(target, anon_text)
        self.assertEqual(
            # FlashText will replace at word boundaries:
            ft.replace_keywords(f"x {target} x"),
            f"x {anon_text} x",
        )
        self.assertEqual(
            # But only at word boundaries, so this won't replace:
            ft.replace_keywords(f"x{target}x"),
            f"x{target}x",
        )

    def test_flashtext_word_boundaries(self) -> None:
        """
        Word-boundary behaviour for a single word and for a phrase.
        """
        self._test_flashtext_word_boundaries("daisy")
        self._test_flashtext_word_boundaries("daisy bluebell")

    def _test_wordlist(self, regex_method: bool = False) -> None:
        """
        Write a denylist file, build WordList scrubbers from it (as phrases
        and as individual words), and compare their output against
        expectations built here with case-insensitive regex substitution.
        Also checks a WordList built from explicit words plus suffixes.

        Args:
            regex_method: passed straight through to every WordList.

        Test with e.g.

        .. code-block:: python

            pytest -k test_wordlist --log-cli-level=INFO
        """
        denylist_phrases = ["Alice", "Bob", "Charlie Brown", "Daisy"]
        anon_text = PATIENT_REPLACEMENT
        test_source_text = """
            I met Alice in the street.
            She was walking with Bob.
            Charlie was not with them.
            Their gloves were brown.
            They stopped to inspect a daisy.
            They discussed Charlie Brown cartoons.
            They discussed Charlie Brown cartoons all day long.
            They made comment after comment.
        """
        # The denylist file has a comment line, then one padded phrase per
        # line.
        denylist_text = (
            "\n# comment\n"
            + "\n".join(f" {x} " for x in denylist_phrases)
            + "\n"
        )
        # Flatten the phrases into individual words. str.split() with no
        # arguments never yields empty strings, so no filtering is needed.
        denylist_words = [
            word for phrase in denylist_phrases for word in phrase.split()
        ]  # type: List[str]

        expected_result_phrases = test_source_text
        for element in denylist_phrases:
            # https://stackoverflow.com/questions/919056/case-insensitive-replace  # noqa: E501
            element_re = re.compile(re.escape(element), re.IGNORECASE)
            expected_result_phrases = element_re.sub(
                anon_text, expected_result_phrases
            )
        if regex_method:
            # Regexes handle whitespace flexibly.
            # NOTE(review): as the text reads here, this replacement finds
            # nothing left to replace, because the substitution above has
            # already handled every single-spaced "Charlie Brown".
            # Presumably one sentence was meant to contain irregular
            # whitespace -- confirm against the canonical file.
            expected_result_phrases = expected_result_phrases.replace(
                "Charlie Brown", anon_text
            )

        expected_result_words = test_source_text
        for element in denylist_words:
            element_re = re.compile(re.escape(element), re.IGNORECASE)
            expected_result_words = element_re.sub(
                anon_text, expected_result_words
            )

        filename = os.path.join(self.tempdir.name, "badwords.txt")
        with open(filename, "wt") as f:
            f.write(denylist_text)

        wordlist_phrases = WordList(
            filenames=[filename],
            as_phrases=True,
            replacement_text=anon_text,
            regex_method=regex_method,
        )
        wordlist_words = WordList(
            filenames=[filename],
            as_phrases=False,
            replacement_text=anon_text,
            regex_method=regex_method,
        )

        log.info(f"test_source_text: {test_source_text}")
        log.info(f"denylist_text: {denylist_text}")

        result_words = wordlist_words.scrub(test_source_text)
        log.info(f"denylist_words: {denylist_words}")
        log.info(f"result_words: {result_words}")
        log.info(f"expected_result_words: {expected_result_words}")
        self.assertEqual(result_words, expected_result_words)

        result_phrases = wordlist_phrases.scrub(test_source_text)
        log.info(f"denylist_phrases: {denylist_phrases}")
        log.info(f"result_phrases: {result_phrases}")
        log.info(f"expected_result_phrases: {expected_result_phrases}")
        self.assertEqual(result_phrases, expected_result_phrases)

        # Words with optional suffixes: "one" and "onedog" are both scrubbed
        # whole, but a separate following word "dog" is left alone.
        wordlist_suffixes = WordList(
            words=["one", "two"],
            suffixes=["dog", "cat"],
            replacement_text=anon_text,
            regex_method=regex_method,
        )
        self.assertEqual(
            wordlist_suffixes.scrub("x one x"), f"x {anon_text} x"
        )
        self.assertEqual(
            wordlist_suffixes.scrub("x onedog x"), f"x {anon_text} x"
        )
        self.assertEqual(
            wordlist_suffixes.scrub("x one dog x"), f"x {anon_text} dog x"
        )

    def test_wordlist(self) -> None:
        """
        Run the WordList checks with ``regex_method`` off and then on. Each
        variant is its own subtest, so a failure in the first does not stop
        the second from running.
        """
        for regex_method in (False, True):
            with self.subTest(regex_method=regex_method):
                self._test_wordlist(regex_method=regex_method)

216 

217 

class ScrubberTestCase(TestCase):
    """
    Base class for the scrubber tests: stores the test key and a hasher
    built from it.
    """

    def setUp(self) -> None:
        key = TEST_KEY
        self.key = key
        self.hasher = HmacMD5Hasher(key)

222 

223 

224# ============================================================================= 

225# Test PersonalizedScrubber 

226# ============================================================================= 

227 

228 

class PersonalizedScrubberTests(ScrubberTestCase):
    """
    Tests of PersonalizedScrubber.
    """

    def setUp(self) -> None:
        super().setUp()

        self.anonpatient = PATIENT_REPLACEMENT
        self.anonthird = THIRD_PARTY_REPLACEMENT

    def test_phrase_unless_numeric(self) -> None:
        """
        With ScrubMethod.PHRASE_UNLESS_NUMERIC, a scrub value that is just a
        number should leave the text alone, whereas a value with non-numeric
        content (an address, or "5b") should be replaced by the patient
        replacement text.
        """
        scrubbed = f"blah {self.anonpatient} blah"

        # Expectations shared by several scrub values (source -> result).
        plain_five = {"blah 5 blah": "blah 5 blah"}
        numeric_forms = {
            "blah 5 blah": "blah 5 blah",
            "blah 5. blah": "blah 5. blah",
            "blah 5.0 blah": "blah 5.0 blah",
        }
        address = {
            "blah 5 blah": "blah 5 blah",
            "blah 5 Tree Road blah": scrubbed,
        }

        cases = (
            ("5", plain_five),
            (" 5 ", plain_five),
            (" 5.0 ", numeric_forms),
            (" 5. ", numeric_forms),
            ("5 Tree Road", address),
            (" 5 Tree Road ", address),
            (" 5b ", {"blah 5b blah": scrubbed}),
        )

        for scrubvalue, mapping in cases:
            # A fresh scrubber per value, so that values cannot interact.
            scrubber = PersonalizedScrubber(
                replacement_text_patient=self.anonpatient,
                replacement_text_third_party=self.anonthird,
                hasher=self.hasher,
                min_string_length_to_scrub_with=1,
                debug=True,
            )
            scrubber.add_value(
                scrubvalue, scrub_method=ScrubMethod.PHRASE_UNLESS_NUMERIC
            )
            for source, wanted in mapping.items():
                actual = scrubber.scrub(source)
                failure_msg = (
                    f"Failure for scrubvalue: {scrubvalue!r}; regex elements "
                    f"are {scrubber.re_patient_elements}"
                )
                self.assertEqual(actual, wanted, failure_msg)

290 

291 

class NonspecificScrubberTests(ScrubberTestCase):
    """
    Tests nonspecific scrubbing.
    """

    def setUp(self) -> None:
        """
        Add a Faker instance with a fixed seed, so that the generated test
        data is the same on every run.
        """
        super().setUp()

        self.fake = Faker(["en-GB"])
        self.fake.seed_instance(1234)

    def test_all_dates_scrubbed(self) -> None:
        """
        Check we can remove arbitrary dates. (See also anonregex_tests.py for
        tests of the date detection regexes.)
        """
        date_of_birth_1 = self.fake.date_of_birth()
        date_string_1 = date_of_birth_1.strftime("%d %b %Y")

        date_of_birth_2 = self.fake.date_of_birth()
        date_string_2 = date_of_birth_2.strftime("%d %b %Y")

        # Two dates embedded in generated filler text.
        text = (
            f"{self.fake.text()} {date_string_1} "
            f"{self.fake.text()} {date_string_2}"
        )

        scrubber = NonspecificScrubber(
            self.hasher,
            replacement_text_all_dates="[REDACTED]",
            scrub_all_dates=True,
        )

        scrubbed = scrubber.scrub(text)

        self.assertEqual(scrubbed.count("[REDACTED]"), 2)

    def test_all_dates_in_supported_formats_blurred(self) -> None:
        """
        Check we can blur dates. Each input format is its own subtest, so one
        failing format does not hide the others.
        """
        tests = (
            # Using "%b %Y" format:
            ("01 February 2003", "Feb 2003"),
            ("01 Feb 2003", "Feb 2003"),
            ("01 Feb 00", "Feb 2000"),
            ("01 Feb 69", "Feb 1969"),
            ("01 Feb 99", "Feb 1999"),
            ("4/5/2006", "May 2006"),
            ("4/5/99", "May 1999"),
            ("7/31/2008", "Jul 2008"),
            ("7/31/99", "Jul 1999"),
            ("8th Sept 2010", "Sep 2010"),
            ("8th Sept 99", "Sep 1999"),
            ("2011-12-13", "Dec 2011"),
            ("99-12-13", "Dec 1999"),
            ("20160718", "Jul 2016"),
        )

        scrubber = NonspecificScrubber(
            self.hasher,
            scrub_all_dates=True,
            replacement_text_all_dates="%b %Y",
        )

        for text, expected in tests:
            with self.subTest(text=text):
                self.assertEqual(
                    scrubber.scrub(text), expected, msg=f"test: {text}"
                )

    def test_non_dates_scrubbed(self) -> None:
        """
        Test that non-date things are scrubbed with non-date replacement text,
        even if we have special date replacements configured.
        """
        scrubber = NonspecificScrubber(
            self.hasher,
            scrub_all_uk_postcodes=True,
            scrub_all_dates=True,
            replacement_text="[REDACTED]",
            replacement_text_all_dates="%b %Y",
        )

        self.assertEqual(scrubber.scrub(self.fake.postcode()), "[REDACTED]")

    def test_scrub_all_dates_with_replacement(self) -> None:
        """
        Check that date format directives in the date replacement text are
        filled in from the date being scrubbed, with any surrounding literal
        text kept.
        """
        custom_placeholder_tests = [
            ("[%Y-%m]", "[2022-02]"),
            ("[%B, %Y]", "[February, 2022]"),
            ("[%b '%y]", "[Feb '22]"),
            ("[%Y]", "[2022]"),
            ("[%b %Y]", "[Feb 2022]"),
        ]

        for replacement, expected in custom_placeholder_tests:
            # Subtests identify which replacement text failed.
            with self.subTest(replacement=replacement):
                scrubber = NonspecificScrubber(
                    self.hasher,
                    scrub_all_dates=True,
                    replacement_text_all_dates=replacement,
                )

                self.assertEqual(scrubber.scrub("2022-02-28"), expected)

    def test_raises_for_unsupported_date_formats(self) -> None:
        """
        Check we can detect bad % directives that we will not allow through to
        datetime.date.strftime(). Compare DATE_BLURRING_DIRECTIVES, the stuff
        we do allow.
        """
        bad_formats = [
            "%a",
            "%A",
            "%w",
            "%d",
            "%H",
            "%I",
            "%p",
            "%M",
            "%S",
            "%f",
            "%z",
            "%Z",
            "%j",
            "%U",
            "%W",
            "%c",
            "%x",
            "%X",
            "%G",
            "%u",
            "%V",
            "hello %V world",  # detect not just at the start/end
            "%%",  # "%%" (literal %) currently unsupported
        ]

        for replacement in bad_formats:
            # Without a subtest, a failure here would not say which format
            # was wrongly accepted.
            with self.subTest(replacement=replacement):
                with self.assertRaises(ValueError):
                    NonspecificScrubber(
                        self.hasher,
                        scrub_all_dates=True,
                        replacement_text_all_dates=replacement,
                    )

    def test_email_addresses_scrubbed(self) -> None:
        """
        Test that e-mail addresses are scrubbed.
        """
        scrubber = NonspecificScrubber(
            self.hasher,
            scrub_all_email_addresses=True,
            replacement_text="[REDACTED]",
        )

        self.assertEqual(scrubber.scrub(self.fake.email()), "[REDACTED]")