Coverage for anonymise/tests/scrub_tests.py: 100%
127 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1"""
2crate_anon/anonymise/tests/scrub_tests.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26Unit testing.
28"""
30# =============================================================================
31# Imports
32# =============================================================================
34import re
35import logging
36import os
37from tempfile import TemporaryDirectory
38from typing import List
39from unittest import TestCase
41from cardinal_pythonlib.hash import HmacMD5Hasher
43from crate_anon.anonymise.constants import ScrubMethod
44from crate_anon.anonymise.scrub import (
45 NonspecificScrubber,
46 PersonalizedScrubber,
47 WordList,
48)
49from crate_anon.common.bugfix_flashtext import KeywordProcessorFixed
51from faker import Faker
# Module-level logger, named after this module.
log = logging.getLogger(__name__)


# =============================================================================
# Constants
# =============================================================================

# Key handed to HmacMD5Hasher wherever these tests need a hasher.
TEST_KEY = "hello"
# Text substituted for scrubbed patient information (also used as the generic
# replacement text in the WordList/FlashText tests).
PATIENT_REPLACEMENT = "[XXX]"
# Text substituted for scrubbed third-party information.
THIRD_PARTY_REPLACEMENT = "[YYY]"
65# =============================================================================
66# Test hashing
67# =============================================================================
class HashTests(TestCase):
    """
    Tests for HmacMD5Hasher.
    """

    def test_str_int_hash_equivalent(self) -> None:
        """
        An integer and the string form of that same integer must produce
        identical hashes.
        """
        number = 1234567
        hasher = HmacMD5Hasher(TEST_KEY)
        hash_of_int = hasher.hash(number)
        hash_of_str = hasher.hash(str(number))
        self.assertEqual(
            hash_of_int,
            hash_of_str,
            "Hasher providing different answer for str and int",
        )
86# =============================================================================
87# Test WordList
88# =============================================================================
class WordListTests(TestCase):
    """
    Tests for WordList scrubbing and for FlashText word-boundary handling.
    """

    def setUp(self) -> None:
        """
        Create a scratch directory for denylist files and make sure it is
        removed again when the test finishes.
        """
        self.tempdir = TemporaryDirectory()
        # Without an explicit cleanup the directory is only removed when the
        # object is garbage-collected (with a ResourceWarning).
        self.addCleanup(self.tempdir.cleanup)
        self.maxDiff = None  # see full differences upon failure

    @staticmethod
    def _replace_all_ignoring_case(
        text: str, targets: List[str], replacement: str
    ) -> str:
        """
        Return ``text`` with every literal, case-insensitive occurrence of
        each string in ``targets`` replaced by ``replacement``.

        This is the reference implementation against which WordList output is
        compared.
        """
        for target in targets:
            # https://stackoverflow.com/questions/919056/case-insensitive-replace # noqa: E501
            target_re = re.compile(re.escape(target), re.IGNORECASE)
            text = target_re.sub(replacement, text)
        return text

    def _test_flashtext_word_boundaries(self, target: str) -> None:
        """
        Check that FlashText replaces ``target`` when it stands alone as a
        word (or phrase), but not when it is embedded in a longer word.
        """
        anon_text = PATIENT_REPLACEMENT
        ft = KeywordProcessorFixed(case_sensitive=False)
        ft.add_keyword(target, anon_text)
        self.assertEqual(
            # FlashText will replace at word boundaries:
            ft.replace_keywords(f"x {target} x"),
            f"x {anon_text} x",
        )
        self.assertEqual(
            # But only at word boundaries, so this won't replace:
            ft.replace_keywords(f"x{target}x"),
            f"x{target}x",
        )

    def test_flashtext_word_boundaries(self) -> None:
        """
        Word-boundary behaviour for a single word and for a two-word phrase.
        """
        self._test_flashtext_word_boundaries("daisy")
        self._test_flashtext_word_boundaries("daisy bluebell")

    def _test_wordlist(self, regex_method: bool = False) -> None:
        """
        Exercise WordList in word mode, phrase mode and suffix mode, using
        either the regex method or the default (non-regex) method.

        Test with e.g.

        .. code-block:: python

            pytest -k test_wordlist --log-cli-level=INFO
        """
        denylist_phrases = ["Alice", "Bob", "Charlie Brown", "Daisy"]
        anon_text = PATIENT_REPLACEMENT
        # The second "Charlie  Brown" below contains TWO spaces on purpose:
        # literal phrase matching must leave it alone, whereas the regex
        # method is expected to match it (see the regex_method branch).
        test_source_text = """
            I met Alice in the street.
            She was walking with Bob.
            Charlie was not with them.
            Their gloves were brown.
            They stopped to inspect a daisy.
            They discussed Charlie Brown cartoons.
            They discussed Charlie  Brown cartoons all day long.
            They made comment after comment.
        """
        # Denylist file content: a comment line, then one phrase per line,
        # each padded with whitespace.
        denylist_text = (
            "\n# comment\n"
            + "\n".join(f" {x} " for x in denylist_phrases)
            + "\n"
        )
        # Flatten the phrases into individual words; str.split() with no
        # argument never yields empty strings, so no filtering is needed.
        denylist_words: List[str] = [
            word for phrase in denylist_phrases for word in phrase.split()
        ]

        expected_result_phrases = self._replace_all_ignoring_case(
            test_source_text, denylist_phrases, anon_text
        )
        if regex_method:
            # Regexes handle whitespace flexibly, so the double-spaced
            # variant (untouched by the literal replacement above) should
            # also be scrubbed.
            expected_result_phrases = expected_result_phrases.replace(
                "Charlie  Brown", anon_text
            )

        expected_result_words = self._replace_all_ignoring_case(
            test_source_text, denylist_words, anon_text
        )

        filename = os.path.join(self.tempdir.name, "badwords.txt")
        with open(filename, "wt", encoding="utf-8") as f:
            f.write(denylist_text)

        wordlist_phrases = WordList(
            filenames=[filename],
            as_phrases=True,
            replacement_text=anon_text,
            regex_method=regex_method,
        )
        wordlist_words = WordList(
            filenames=[filename],
            as_phrases=False,
            replacement_text=anon_text,
            regex_method=regex_method,
        )

        log.info("test_source_text: %s", test_source_text)
        log.info("denylist_text: %s", denylist_text)

        result_words = wordlist_words.scrub(test_source_text)
        log.info("denylist_words: %s", denylist_words)
        log.info("result_words: %s", result_words)
        log.info("expected_result_words: %s", expected_result_words)
        self.assertEqual(result_words, expected_result_words)

        result_phrases = wordlist_phrases.scrub(test_source_text)
        log.info("denylist_phrases: %s", denylist_phrases)
        log.info("result_phrases: %s", result_phrases)
        log.info("expected_result_phrases: %s", expected_result_phrases)
        self.assertEqual(result_phrases, expected_result_phrases)

        # Suffix handling: a word, and a word with a suffix attached, are
        # scrubbed; a separate following word is not treated as a suffix.
        wordlist_suffixes = WordList(
            words=["one", "two"],
            suffixes=["dog", "cat"],
            replacement_text=anon_text,
            regex_method=regex_method,
        )
        self.assertEqual(
            wordlist_suffixes.scrub("x one x"), f"x {anon_text} x"
        )
        self.assertEqual(
            wordlist_suffixes.scrub("x onedog x"), f"x {anon_text} x"
        )
        self.assertEqual(
            wordlist_suffixes.scrub("x one dog x"), f"x {anon_text} dog x"
        )

    def test_wordlist(self) -> None:
        """
        Run the WordList checks with both scrubbing methods.
        """
        self._test_wordlist(regex_method=False)
        self._test_wordlist(regex_method=True)
class ScrubberTestCase(TestCase):
    """
    Common base for the scrubber tests: provides ``self.key`` and
    ``self.hasher``.
    """

    def setUp(self) -> None:
        """
        Store the test key and build an HmacMD5Hasher keyed with it.
        """
        key = TEST_KEY
        self.key = key
        self.hasher = HmacMD5Hasher(key)
224# =============================================================================
225# Test PersonalizedScrubber
226# =============================================================================
class PersonalizedScrubberTests(ScrubberTestCase):
    """
    Tests for PersonalizedScrubber.
    """

    def setUp(self) -> None:
        """
        Extend the base fixture with the patient and third-party replacement
        strings.
        """
        super().setUp()

        self.anonpatient = PATIENT_REPLACEMENT
        self.anonthird = THIRD_PARTY_REPLACEMENT

    def test_phrase_unless_numeric(self) -> None:
        """
        With ScrubMethod.PHRASE_UNLESS_NUMERIC, purely numeric scrub values
        must leave the text unchanged, whereas values containing non-numeric
        characters must be replaced by the patient replacement text.
        """
        scrubbed = f"blah {self.anonpatient} blah"

        # Expectation tables shared between several scrub values. Keys are
        # the input text; values are the expected output.
        lone_five_kept = {"blah 5 blah": "blah 5 blah"}
        numeric_variants_kept = {
            "blah 5 blah": "blah 5 blah",
            "blah 5. blah": "blah 5. blah",
            "blah 5.0 blah": "blah 5.0 blah",
        }
        address_removed = {
            "blah 5 blah": "blah 5 blah",
            "blah 5 Tree Road blah": scrubbed,
        }

        cases = (
            ("5", lone_five_kept),
            (" 5 ", lone_five_kept),
            (" 5.0 ", numeric_variants_kept),
            (" 5. ", numeric_variants_kept),
            ("5 Tree Road", address_removed),
            (" 5 Tree Road ", address_removed),
            (" 5b ", {"blah 5b blah": scrubbed}),
        )

        for value_to_scrub, expectations in cases:
            # A fresh scrubber per value, so that values cannot interact.
            scrubber = PersonalizedScrubber(
                replacement_text_patient=self.anonpatient,
                replacement_text_third_party=self.anonthird,
                hasher=self.hasher,
                min_string_length_to_scrub_with=1,
                debug=True,
            )
            scrubber.add_value(
                value_to_scrub, scrub_method=ScrubMethod.PHRASE_UNLESS_NUMERIC
            )
            for source, wanted in expectations.items():
                actual = scrubber.scrub(source)
                failure_message = (
                    f"Failure for scrubvalue: {value_to_scrub!r}; regex "
                    f"elements are {scrubber.re_patient_elements}"
                )
                self.assertEqual(actual, wanted, failure_message)
class NonspecificScrubberTests(ScrubberTestCase):
    """
    Tests nonspecific scrubbing.
    """

    def setUp(self) -> None:
        """
        Extend the base fixture with a seeded en-GB Faker, so that generated
        test data is the same on every run.
        """
        super().setUp()

        self.fake = Faker(["en-GB"])
        self.fake.seed_instance(1234)

    def test_all_dates_scrubbed(self) -> None:
        """
        Check we can remove arbitrary dates. (See also anonregex_tests.py for
        tests of the date detection regexes.)
        """
        date_of_birth_1 = self.fake.date_of_birth()
        date_string_1 = date_of_birth_1.strftime("%d %b %Y")

        date_of_birth_2 = self.fake.date_of_birth()
        date_string_2 = date_of_birth_2.strftime("%d %b %Y")

        # Two dates embedded in otherwise random text.
        text = (
            f"{self.fake.text()} {date_string_1} "
            f"{self.fake.text()} {date_string_2}"
        )

        scrubber = NonspecificScrubber(
            self.hasher,
            replacement_text_all_dates="[REDACTED]",
            scrub_all_dates=True,
        )

        scrubbed = scrubber.scrub(text)

        # Exactly one replacement per embedded date.
        self.assertEqual(scrubbed.count("[REDACTED]"), 2)

    def test_all_dates_in_supported_formats_blurred(self) -> None:
        """
        Check we can blur dates: each input date, in a variety of formats,
        should be replaced by its month and year.
        """
        tests = (
            # Using "%b %Y" format:
            ("01 February 2003", "Feb 2003"),
            ("01 Feb 2003", "Feb 2003"),
            ("01 Feb 00", "Feb 2000"),
            ("01 Feb 69", "Feb 1969"),
            ("01 Feb 99", "Feb 1999"),
            ("4/5/2006", "May 2006"),
            ("4/5/99", "May 1999"),
            ("7/31/2008", "Jul 2008"),
            ("7/31/99", "Jul 1999"),
            ("8th Sept 2010", "Sep 2010"),
            ("8th Sept 99", "Sep 1999"),
            ("2011-12-13", "Dec 2011"),
            ("99-12-13", "Dec 1999"),
            ("20160718", "Jul 2016"),
        )

        scrubber = NonspecificScrubber(
            self.hasher,
            scrub_all_dates=True,
            replacement_text_all_dates="%b %Y",
        )

        for text, expected in tests:
            # subTest: report every failing format, not just the first.
            with self.subTest(text=text):
                self.assertEqual(
                    scrubber.scrub(text), expected, msg=f"test: {text}"
                )

    def test_non_dates_scrubbed(self) -> None:
        """
        Test that non-date things are scrubbed with non-date replacement text,
        even if we have special date replacements configured.
        """
        scrubber = NonspecificScrubber(
            self.hasher,
            scrub_all_uk_postcodes=True,
            scrub_all_dates=True,
            replacement_text="[REDACTED]",
            replacement_text_all_dates="%b %Y",
        )

        self.assertEqual(scrubber.scrub(self.fake.postcode()), "[REDACTED]")

    def test_scrub_all_dates_with_replacement(self) -> None:
        """
        Check that several custom date-blurring formats are applied
        correctly to a fixed date (2022-02-28).
        """
        custom_placeholder_tests = [
            ("[%Y-%m]", "[2022-02]"),
            ("[%B, %Y]", "[February, 2022]"),
            ("[%b '%y]", "[Feb '22]"),
            ("[%Y]", "[2022]"),
            ("[%b %Y]", "[Feb 2022]"),
        ]

        for replacement, expected in custom_placeholder_tests:
            with self.subTest(replacement=replacement):
                scrubber = NonspecificScrubber(
                    self.hasher,
                    scrub_all_dates=True,
                    replacement_text_all_dates=replacement,
                )

                self.assertEqual(scrubber.scrub("2022-02-28"), expected)

    def test_raises_for_unsupported_date_formats(self) -> None:
        """
        Check we can detect bad % directives that we will not allow through to
        datetime.date.strftime(). Compare DATE_BLURRING_DIRECTIVES, the stuff
        we do allow.
        """
        bad_formats = [
            "%a",
            "%A",
            "%w",
            "%d",
            "%H",
            "%I",
            "%p",
            "%M",
            "%S",
            "%f",
            "%z",
            "%Z",
            "%j",
            "%U",
            "%W",
            "%c",
            "%x",
            "%X",
            "%G",
            "%u",
            "%V",
            "hello %V world",  # detect not just at the start/end
            "%%",  # "%%" (literal %) currently unsupported
        ]

        for replacement in bad_formats:
            # Name the offending format on failure; a bare assertRaises would
            # only say "ValueError not raised".
            with self.subTest(replacement=replacement):
                with self.assertRaises(
                    ValueError, msg=f"format accepted: {replacement!r}"
                ):
                    NonspecificScrubber(
                        self.hasher,
                        scrub_all_dates=True,
                        replacement_text_all_dates=replacement,
                    )

    def test_email_addresses_scrubbed(self) -> None:
        """
        Test that e-mail addresses are scrubbed.
        """
        scrubber = NonspecificScrubber(
            self.hasher,
            scrub_all_email_addresses=True,
            replacement_text="[REDACTED]",
        )

        self.assertEqual(scrubber.scrub(self.fake.email()), "[REDACTED]")