Coverage for anonymise/tests/anonregex_tests.py: 73%
179 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1#!/usr/bin/env python
3"""
4crate_anon/anonymise/tests/anonregex_tests.py
6===============================================================================
8 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
9 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
11 This file is part of CRATE.
13 CRATE is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 CRATE is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
26===============================================================================
28Unit testing.
30"""
32# =============================================================================
33# Imports
34# =============================================================================
36from datetime import date
37import dateutil.parser # for unit tests
38import logging
39from typing import List, Tuple
40from unittest import TestCase
42from cardinal_pythonlib.logs import main_only_quicksetup_rootlogger
43import regex
45from crate_anon.anonymise.anonregex import (
46 EMAIL_REGEX_STR,
47 get_anon_fragments_from_string,
48 get_code_regex_elements,
49 get_date_regex_elements,
50 get_generic_date_regex_elements,
51 get_number_of_length_n_regex_elements,
52 get_phrase_regex_elements,
53 get_regex_from_elements,
54 get_regex_string_from_elements,
55 get_string_regex_elements,
56 get_uk_postcode_regex_elements,
57 get_uk_postcode_regex_string,
58 REGEX_COMPILE_FLAGS,
59)
60from crate_anon.common.stringfunc import (
61 get_digit_string_from_vaguely_numeric_string,
62 reduce_to_alphanumeric,
63)
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
68# =============================================================================
69# Test anonymisation regexes
70# =============================================================================
73class AnonRegexTests(TestCase):
74 """
75 Unit tests.
76 """
78 STRING_1 = r"""
79 I was born on 07 Jan 2013, m'lud.
80 It was 7 January 13, or 7/1/13, or 1/7/13, or
81 Jan 7 2013, or 2013/01/07, or 2013-01-07,
82 or 7th January
83 13 (split over a line)
84 or Jan 7th 13
85 or 07.01.13 or 7.1.2013
86 or a host of other variations.
87 And ISO-8601 formats like 20130107T0123, or just 20130107.
89 BUT NOT 8 Jan 2013, or 2013/02/07, or 2013
90 Jan 17, or just a number like 7, or a month
91 like January, or a nonspecific date like
92 Jan 2013 or 7 January. And not ISO-8601-formatted other dates
93 like 20130108T0123, or just 20130108.
95 I am 34 years old. My mother was 348, or 834, or perhaps 8348.
96 Was she 34.6? Don't think so.
98 Her IDs include NHS#123456, or 123 456, or (123) 456, or 123456.
100 I am 34 years old. My mother was 348, or 834, or perhaps 8348.
101 She wasn't my step-mother, or my grandmother, or my mother-in-law.
102 She was my MOTHER!
103 A typo is mther.
105 Unicode apostrophe: the thread’s possession
107 E-mail: bob@pobox.com, mr.jones@somewhere.nhs.uk, blah@place.com
108 Mr.Jones@somewhere.nhs.uk
110 Some numbers by size:
111 1
112 12
113 123
114 1234
115 12345
116 123456
117 1234567
118 12345678
119 123456789
120 1234567890
121 12345678901
122 123456789012
123 1234567890123
124 12345678901234
125 123456789012345
126 Some postcodes (from https://www.mrs.org.uk/pdf/postcodeformat.pdf)
127 M1 1AA
128 M60 1NW
129 CR2 6XH
130 DN55 1PT
131 W1A 1HQ
132 EC1A 1BB
133 """
135 @staticmethod
136 def report(title: str, string: str) -> None:
137 print("=" * 79)
138 print(title)
139 print("=" * 79)
140 print(string)
142 def test_most(self) -> None:
143 s = self.STRING_1
144 testnumber = 34
145 testnumber_as_text = "123456"
146 testdate_str = "7 Jan 2013"
147 testdate = dateutil.parser.parse(testdate_str)
148 teststring = "mother"
149 testphrase = "348 or 834"
150 date_19th_c = "3 Sep 1847"
151 old_testdate = dateutil.parser.parse(date_19th_c)
152 testemail = "mr.jones@somewhere.nhs.uk"
154 regex_date = get_regex_from_elements(get_date_regex_elements(testdate))
155 regex_number = get_regex_from_elements(
156 get_code_regex_elements(str(testnumber))
157 )
158 regex_number_as_text = get_regex_from_elements(
159 get_code_regex_elements(
160 get_digit_string_from_vaguely_numeric_string(
161 testnumber_as_text
162 )
163 )
164 )
165 regex_string = get_regex_from_elements(
166 get_string_regex_elements(teststring)
167 )
168 regex_email = get_regex_from_elements(
169 get_string_regex_elements(testemail)
170 )
171 regex_phrase = get_regex_from_elements(
172 get_phrase_regex_elements(testphrase)
173 )
174 regex_10digit = get_regex_from_elements(
175 get_number_of_length_n_regex_elements(10)
176 )
177 regex_postcode = get_regex_from_elements(
178 get_uk_postcode_regex_elements()
179 )
180 all_elements = (
181 get_date_regex_elements(testdate)
182 + get_code_regex_elements(str(testnumber))
183 + get_code_regex_elements(
184 get_digit_string_from_vaguely_numeric_string(
185 testnumber_as_text
186 )
187 )
188 + get_string_regex_elements(teststring)
189 + get_string_regex_elements(testemail)
190 + get_phrase_regex_elements(testphrase)
191 + get_number_of_length_n_regex_elements(10)
192 + get_uk_postcode_regex_elements()
193 )
194 regex_all = get_regex_from_elements(all_elements)
196 self.report(
197 "Removing date: " + testdate_str, regex_date.sub("DATE_GONE", s)
198 )
199 self.report(
200 f"Removing number: {testnumber}",
201 regex_number.sub("NUMBER_GONE", s),
202 )
203 self.report(
204 "Removing numbers as text: " + testnumber_as_text,
205 regex_number_as_text.sub("NUMBER_AS_TEXT_GONE", s),
206 )
207 self.report(
208 "Removing string: " + teststring,
209 regex_string.sub("STRING_GONE", s),
210 )
211 self.report(
212 "Removing email: " + testemail, regex_email.sub("EMAIL_GONE", s)
213 )
214 self.report(
215 "Removing phrase: " + testphrase,
216 regex_phrase.sub("PHRASE_GONE", s),
217 )
218 self.report(
219 "Removing 10-digit numbers",
220 regex_10digit.sub("TEN_DIGIT_NUMBERS_GONE", s),
221 )
222 self.report(
223 "Removing postcodes", regex_postcode.sub("POSTCODES_GONE", s)
224 )
225 self.report("Removing everything", regex_all.sub("EVERYTHING_GONE", s))
226 self.report(
227 "All-elements regex", get_regex_string_from_elements(all_elements)
228 )
229 self.report(
230 "Date regex",
231 get_regex_string_from_elements(get_date_regex_elements(testdate)),
232 )
233 self.report(
234 "Date regex for 19th century",
235 get_regex_string_from_elements(
236 get_date_regex_elements(old_testdate)
237 ),
238 )
239 self.report(
240 "Phrase regex",
241 get_regex_string_from_elements(
242 get_phrase_regex_elements(testphrase)
243 ),
244 )
245 self.report(
246 "10-digit-number regex",
247 get_regex_string_from_elements(
248 get_number_of_length_n_regex_elements(10)
249 ),
250 )
252 def test_generic_date(self) -> None:
253 # https://stackoverflow.com/questions/51224/regular-expression-to-match-valid-dates # noqa: E501
254 valid = (
255 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
256 # From that StackOverflow set
257 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
258 # Day, month, year
259 "2/11/73",
260 "02/11/1973",
261 "2/1/73",
262 "02/01/73",
263 "31/1/1973",
264 "02/1/1973",
265 "31.1.2011",
266 "31-1-2001",
267 "29/2/1973",
268 "29/02/1976",
269 "03/06/2010",
270 "12/6/90",
271 # month, day, year
272 "02/24/1975",
273 "06/19/66",
274 "03.31.1991",
275 "2.29.2003",
276 "02-29-55",
277 "03-13-55",
278 "03-13-1955",
279 r"12\24\1974",
280 r"12\30\1974",
281 r"1\31\1974",
282 "03/31/2001",
283 "01/21/2001",
284 "12/13/2001",
285 # Match both DMY and MDY
286 "12/12/1978",
287 "6/6/78",
288 "06/6/1978",
289 "6/06/1978",
290 # using whitespace as a delimiter
291 "13 11 2001",
292 "11 13 2001",
293 "11 13 01",
294 "13 11 01",
295 "1 1 01",
296 "1 1 2001",
297 # Year Month Day order
298 "76/02/02",
299 "1976/02/29",
300 "1976/2/13",
301 "76/09/31",
302 # YYYYMMDD sortable format
303 "19741213",
304 "19750101",
305 # Valid dates before Epoch
306 "12/1/10",
307 "12/01/00",
308 "12/01/0000",
309 # Valid date after 2038
310 "01/01/2039",
311 "01/01/39",
312 # Dates with leading or trailing characters (but still word
313 # boundaries)
314 "12/31/21/",
315 "12/10/2016 8:26:00.39",
316 "31/12/1921.10:55",
317 # Dates that runs across two lines
318 "1/12/19\n74",
319 "01/12/19\n74/13/1946",
320 "31/12/20\n08:13",
321 # Odd but accepted
322 "2/12-73",
323 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
324 # Extras with our system supporting month words/ordinals
325 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
326 "2 Sep 1990",
327 "2nd Sep 1990",
328 "2 September 1990",
329 "02 September 90",
330 "2-Sep-90",
331 "1990-Sep-02",
332 "Sep 2 1990",
333 "Sep 2nd 1990",
334 "1st Sep 90",
335 "1st Sept 2000",
336 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
337 # Additional styles from JL
338 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
339 "blah for some name dob 7.3.04 but thing",
340 "x] |D.O.B. |24/02/1973 | |Detail",
341 )
342 suboptimal_but_accepted = (
343 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
344 # From that StackOverflow set
345 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
346 # Invalid, corrupted or nonsense dates
347 "74/2/29", # wasn't a leap year
348 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
349 # Extras with our system supporting month words/ordinals
350 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
351 "1nd Sep 90", # ordinal suffix-to-number mapping not checked
352 )
353 valid_only_without_word_boundaries = (
354 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
355 # Dates with leading or trailing characters (only recognized if
356 # word boundaries not required)
357 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
358 "31/12/1921AD",
359 "wfuwdf12/11/74iuhwf",
360 "fwefew13/11/1974",
361 "01/12/1974vdwdfwe",
362 "01/01/99werwer",
363 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
364 # Additional styles from JL
365 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
366 "x y z DOB23.07.48 questionnaire",
367 )
368 not_currently_valid_perhaps_should_be = (
369 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
370 # Valid dates before Epoch
371 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
372 "12/01/660",
373 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
374 # Valid date beyond the year 9999
375 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
376 "01/01/10000",
377 )
378 invalid = (
379 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
380 # From that StackOverflow set
381 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
382 # Dates with leading or trailing characters that render it garbage
383 "12321301/01/99",
384 # Invalid, corrupted or nonsense dates
385 "00/01/2100",
386 "31/31/2001",
387 "101/12/1974",
388 # Invalid, corrupted or nonsense dates
389 "0/1/2001",
390 "1/0/2001",
391 "01/0/2001",
392 "0101/2001",
393 "01/131/2001",
394 "56/56/56",
395 "00/00/0000",
396 "0/0/1999",
397 "12/01/0",
398 "12/10/-100",
399 "12/32/45",
400 "20/12/194",
401 # Times that look like dates
402 "12:13:56",
403 "13:12:01",
404 "1:12:01PM",
405 "1:12:01 AM",
406 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
407 # Extras with our system supporting month words/ordinals
408 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
409 "1xx Sep 2000",
410 "1st Spt 2000",
411 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
412 # Irrelevant content
413 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
414 "The cat sat on the mat."
415 "He started haloperidol 5mg x7/week in 2009.",
416 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
417 # Additional styles from JL
418 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
419 "x / y z DOB 0804013",
420 )
421 working_valid = valid + suboptimal_but_accepted
422 working_invalid = not_currently_valid_perhaps_should_be + invalid
424 date_regex_wb_elements = get_generic_date_regex_elements(
425 at_word_boundaries_only=True
426 )
427 date_regex_wb_elements_str = "\n".join(date_regex_wb_elements)
428 date_regex_wb = get_regex_from_elements(date_regex_wb_elements)
429 date_regex_no_wb_elements = get_generic_date_regex_elements(
430 at_word_boundaries_only=False
431 )
432 date_regex_no_wb_elements_str = "\n".join(date_regex_no_wb_elements)
433 date_regex_no_wb = get_regex_from_elements(date_regex_no_wb_elements)
435 # match() = at beginning of string
436 # search() = anywhere in string
437 for x in working_valid:
438 self.assertTrue(
439 date_regex_wb.search(x),
440 f"[#1] Should be recognized as a date (with word "
441 f"boundaries) but isn't: {x!r}; "
442 f"regex elements =\n{date_regex_wb_elements_str}",
443 )
444 self.assertTrue(
445 date_regex_no_wb.search(x),
446 f"[#2] Should be recognized as a date (without word "
447 f"boundaries) but isn't: {x!r}; "
448 f"regex elements =\n{date_regex_no_wb_elements_str}",
449 )
450 for x in valid_only_without_word_boundaries:
451 self.assertFalse(
452 date_regex_wb.search(x),
453 f"[#3] Should not be recognized as a date (with word "
454 f"boundaries) but is: {x!r}; "
455 f"regex elements =\n{date_regex_wb_elements_str}",
456 )
457 self.assertTrue(
458 date_regex_no_wb.search(x),
459 f"[#4] Should be recognized as a date (without word "
460 f"boundaries) but isn't: {x!r}; "
461 f"regex elements =\n{date_regex_no_wb_elements_str}",
462 )
463 for x in working_invalid:
464 self.assertFalse(
465 date_regex_wb.search(x),
466 f"[#5] Should not be recognized as a date (with word "
467 f"boundaries) but is: {x!r}; "
468 f"regex elements =\n{date_regex_wb_elements_str}",
469 )
470 self.assertFalse(
471 date_regex_no_wb.search(x),
472 f"[#6] Should not be recognized as a date (without word "
473 f"boundaries) but is: {x!r}; "
474 f"regex elements =\n{date_regex_no_wb_elements_str}",
475 )
def examples_for_paper() -> None:
    """
    Print the regex elements generated for the examples used in
    Cardinal (2017), https://doi.org/10.1186/s12911-017-0437-1.

    Each example prints a ``---`` heading followed by one regex element per
    line.
    """

    def show(heading: str, patterns: List[str]) -> None:
        # Heading first, then each regex element on its own line.
        print(heading)
        for pattern in patterns:
            print(pattern)

    # Settings shared by the word and phrase examples.
    min_string_length_to_scrub_with = 4
    scrub_string_suffixes = []  # type: List[str]
    max_errors = 0
    at_word_boundaries_only = True

    # Words: split into fragments, skipping fragments that are too short.
    testwords = "John Al'Rahem"
    words_regexes = []  # type: List[str]
    for fragment in get_anon_fragments_from_string(testwords):
        if len(fragment) >= min_string_length_to_scrub_with:
            words_regexes += get_string_regex_elements(
                fragment,
                suffixes=scrub_string_suffixes,
                at_word_boundaries_only=at_word_boundaries_only,
                max_errors=max_errors,
            )
    show(f"--- For words {testwords}:", words_regexes)

    # Phrase.
    testphrase = "4 Privet Drive"
    show(
        f"--- For phrase {testphrase}:",
        get_phrase_regex_elements(
            testphrase,
            max_errors=max_errors,
            at_word_boundaries_only=at_word_boundaries_only,
        ),
    )

    # Specific number, reduced to its digits first.
    numbers_at_word_boundaries_only = False
    numbers_at_numeric_boundaries_only = True
    testnumber = "(01223) 123456"
    show(
        f"--- For number {testnumber}:",
        get_code_regex_elements(
            get_digit_string_from_vaguely_numeric_string(testnumber),
            at_word_boundaries_only=numbers_at_word_boundaries_only,
            at_numeric_boundaries_only=numbers_at_numeric_boundaries_only,
        ),
    )

    # Specific alphanumeric code, reduced to alphanumeric characters first.
    codes_at_word_boundaries_only = True
    testcode = "CB12 3DE"
    show(
        f"--- For code {testcode}:",
        get_code_regex_elements(
            reduce_to_alphanumeric(testcode),
            at_word_boundaries_only=codes_at_word_boundaries_only,
        ),
    )

    # Nonspecific: any number of a given length.
    n_digits = 10
    show(
        f"--- NONSPECIFIC: numbers of length {n_digits}:",
        get_number_of_length_n_regex_elements(
            n_digits,
            at_word_boundaries_only=numbers_at_word_boundaries_only,
        ),
    )

    # Nonspecific: any UK postcode.
    show(
        "--- NONSPECIFIC: UK postcodes:",
        get_uk_postcode_regex_elements(
            at_word_boundaries_only=codes_at_word_boundaries_only
        ),
    )

    # Specific date.
    testdate = date(year=2016, month=12, day=31)
    show(f"--- For date {testdate}:", get_date_regex_elements(testdate))
560class MoreAnonRegexTests(TestCase):
561 """
562 More tests of regular expressions for anonymisation.
563 """
565 def _should_match(self, regexes: List[str], string: str) -> None:
566 self.assertTrue(
567 any(
568 # search (match anywhere), not match (match at start)
569 regex.search(pattern, string)
570 for pattern in regexes
571 ),
572 f"Failed to match {string!r} against regexes {regexes}",
573 )
575 def _should_match_all(
576 self, regexes: List[str], strings: List[str]
577 ) -> None:
578 for s in strings:
579 self._should_match(regexes, s)
581 def _should_not_match(self, regexes: List[str], string: str) -> None:
582 self.assertFalse(
583 any(
584 # search (match anywhere), not match (match at start)
585 regex.search(pattern, string)
586 for pattern in regexes
587 ),
588 f"Inappropriately matched {string!r} against regexes {regexes}",
589 )
591 def _should_not_match_any(
592 self, regexes: List[str], strings: List[str]
593 ) -> None:
594 for s in strings:
595 self._should_not_match(regexes, s)
597 def test_fragments(self) -> None:
598 self.assertEqual(
599 get_anon_fragments_from_string("John Smith"), ["John", "Smith"]
600 )
601 self.assertEqual(
602 get_anon_fragments_from_string("John D'Souza"),
603 ["John", "D", "Souza"],
604 )
605 self.assertEqual(
606 get_anon_fragments_from_string(" 42 West Street "),
607 ["42", "West", "Street"],
608 )
610 def test_date(self) -> None:
611 tests = [
612 (
613 date(2021, 12, 31),
614 [
615 # Numeric:
616 "2021-12-31",
617 "31/12/2021",
618 "31/12/21",
619 "31.12.21",
620 "12/31/2021", # American
621 "12/31/21", # American
622 "12.31.21", # American
623 # Partly textual:
624 "31 Dec 2021",
625 "31 December 2021",
626 "31 December, 2021",
627 "December 31 2021",
628 "December 31, 2021",
629 ],
630 ),
631 (
632 date(1980, 5, 6),
633 [
634 # Numeric:
635 "1980-05-06",
636 "6/5/1980",
637 "6/5/80",
638 "6.5.80",
639 "06/05/1980",
640 "5/6/80", # American
641 # Partly textual:
642 "6 May 1980",
643 "May 6, 80",
644 ],
645 ),
646 (
647 date(2004, 3, 7),
648 [
649 "blah for some name dob 7.3.04 but thing",
650 ],
651 ),
652 (
653 date(2001, 4, 8),
654 [
655 "x / y z DOB 0804013",
656 ],
657 ),
658 (
659 date(1948, 7, 23),
660 [
661 "x y z DOB23.07.48 questionnaire",
662 ],
663 ),
664 (
665 date(1973, 2, 24),
666 [
667 "x] |D.O.B. |24/02/1973 | |Detail",
668 ],
669 ),
670 ] # type: List[Tuple[date, List[str]]]
671 for testdate, text_versions in tests:
672 regexes = get_date_regex_elements(testdate)
673 for text in text_versions:
674 self._should_match(regexes, text)
676 def test_code_whitespace(self) -> None:
677 tests = [
678 (
679 "PE123AB",
680 [
681 " PE123AB ",
682 "PE12 3AB",
683 "PE 12 3 AB",
684 ],
685 ),
686 (
687 "PE 12 3AB",
688 [
689 " PE123AB ",
690 "PE12 3AB",
691 "PE 12 3 AB",
692 ],
693 ),
694 ] # type: List[Tuple[str, List[str]]]
695 for testcode, text_versions in tests:
696 regexes = get_code_regex_elements(reduce_to_alphanumeric(testcode))
697 for text in text_versions:
698 self._should_match(regexes, text)
700 def test_code_boundaries(self) -> None:
701 code = "ABC123"
703 word_boundaries = get_code_regex_elements(
704 code,
705 liberal=False,
706 very_liberal=False,
707 at_word_boundaries_only=True,
708 )
709 self._should_match_all(
710 word_boundaries,
711 [
712 f"pq {code} xy",
713 f"pq,{code},xy",
714 f"12 {code} 34",
715 f"12,{code},34",
716 ],
717 )
718 self._should_not_match_any(
719 word_boundaries,
720 [
721 f"pq{code}xy",
722 f"pq{code} xy",
723 f"pq {code}xy",
724 f"12{code}34",
725 f"12{code} 34",
726 f"12 {code}34",
727 ],
728 )
730 number_boundaries = get_code_regex_elements(
731 code,
732 liberal=False,
733 very_liberal=False,
734 at_word_boundaries_only=False,
735 at_numeric_boundaries_only=True,
736 )
737 self._should_match_all(
738 number_boundaries,
739 [
740 f"pq {code} xy",
741 f"pq,{code},xy",
742 f"12 {code} 34",
743 f"12,{code},34",
744 f"pq{code}xy",
745 f"pq{code} xy",
746 f"pq {code}xy",
747 ],
748 )
749 self._should_not_match_any(
750 number_boundaries,
751 [
752 f"12{code}34",
753 f"12{code} 34",
754 f"12 {code}34",
755 ],
756 )
758 anywhere = get_code_regex_elements(
759 code,
760 liberal=False,
761 very_liberal=False,
762 at_word_boundaries_only=False,
763 at_numeric_boundaries_only=False,
764 )
765 self._should_match_all(
766 anywhere,
767 [
768 f"pq {code} xy",
769 f"pq,{code},xy",
770 f"12 {code} 34",
771 f"12,{code},34",
772 f"pq{code}xy",
773 f"pq{code} xy",
774 f"pq {code}xy",
775 f"12{code}34",
776 f"12{code} 34",
777 f"12 {code}34",
778 ],
779 )
781 def test_uk_postcodes(self) -> None:
782 """
783 Ensure we detect postcodes properly.
784 """
785 valid_postcodes = [
786 # from https://www.mrs.org.uk/pdf/postcodeformat.pdf
787 "M1 1AA",
788 "M60 1NW",
789 "CR2 6XH",
790 "DN55 1PT",
791 "W1A 1HQ",
792 "EC1A 1BB",
793 # Some of our institutional postcodes:
794 "CB2 0QQ",
795 ]
796 # See also
797 # https://club.ministryoftesting.com/t/fun-postcodes-to-use-when-testing/10772 # noqa: E501
798 invalid_postcodes = [
799 "ABCDEFG",
800 ]
801 postcode_regex = regex.compile(
802 get_uk_postcode_regex_string(at_word_boundaries_only=False)
803 )
804 for v in valid_postcodes:
805 self.assertTrue(postcode_regex.match(v))
806 for i in invalid_postcodes:
807 self.assertFalse(postcode_regex.match(i))
809 def test_email_addresses(self) -> None:
810 """
811 Ensure we detect e-mail addresses properly.
812 This won't be completely perfect. See https://emailregex.com/.
814 Specimen values:
816 - https://help.xmatters.com/ondemand/trial/valid_email_format.htm
817 """
818 valid_email = [
819 "person@place.com",
820 "r&d@somewhere.nhs.uk",
821 "abc-d@mail.com",
822 "abc.def@mail.com",
823 "abc@mail.com",
824 "abc_def@mail.com",
825 "abc.def@mail.cc",
826 "abc.def@mail-archive.com",
827 "abc.def@mail.org",
828 "abc.def@mail.com",
829 "abc-@mail.com", # xmatters.com thinks wrong but is OK
830 "abc#def@mail.com", # xmatters.com thinks wrong but is OK
831 "abc.def@mail.c", # xmatters.com thinks wrong but ?is OK
832 ]
833 invalid_email = [
834 "person",
835 "person@",
836 "@place.com",
837 "person@place",
838 "abc..def@mail.com",
839 ".abc@mail.com",
840 "abc.def@mail#archive.com",
841 "abc.def@mail",
842 "abc.def@mail..com",
843 ]
844 email_regex = regex.compile(EMAIL_REGEX_STR, flags=REGEX_COMPILE_FLAGS)
845 for v in valid_email:
846 self.assertTrue(
847 email_regex.match(v),
848 f"Should be a valid e-mail address but was not recognized: "
849 f"{v!r}",
850 )
851 for i in invalid_email:
852 self.assertFalse(
853 email_regex.match(i),
854 f"Should not be a valid e-mail address but was accepted: "
855 f"{i!r}",
856 )
if __name__ == "__main__":
    # Run as a script: set up DEBUG-level logging, then print the example
    # regexes. The unit tests themselves are not run from here.
    main_only_quicksetup_rootlogger(level=logging.DEBUG)
    examples_for_paper()