Coverage for anonymise/tests/anonregex_tests.py: 73%

179 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1#!/usr/bin/env python 

2 

3""" 

4crate_anon/anonymise/tests/anonregex_tests.py 

5 

6=============================================================================== 

7 

8 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

9 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

10 

11 This file is part of CRATE. 

12 

13 CRATE is free software: you can redistribute it and/or modify 

14 it under the terms of the GNU General Public License as published by 

15 the Free Software Foundation, either version 3 of the License, or 

16 (at your option) any later version. 

17 

18 CRATE is distributed in the hope that it will be useful, 

19 but WITHOUT ANY WARRANTY; without even the implied warranty of 

20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

21 GNU General Public License for more details. 

22 

23 You should have received a copy of the GNU General Public License 

24 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

25 

26=============================================================================== 

27 

28Unit testing. 

29 

30""" 

31 

32# ============================================================================= 

33# Imports 

34# ============================================================================= 

35 

36from datetime import date 

37import dateutil.parser # for unit tests 

38import logging 

39from typing import List, Tuple 

40from unittest import TestCase 

41 

42from cardinal_pythonlib.logs import main_only_quicksetup_rootlogger 

43import regex 

44 

45from crate_anon.anonymise.anonregex import ( 

46 EMAIL_REGEX_STR, 

47 get_anon_fragments_from_string, 

48 get_code_regex_elements, 

49 get_date_regex_elements, 

50 get_generic_date_regex_elements, 

51 get_number_of_length_n_regex_elements, 

52 get_phrase_regex_elements, 

53 get_regex_from_elements, 

54 get_regex_string_from_elements, 

55 get_string_regex_elements, 

56 get_uk_postcode_regex_elements, 

57 get_uk_postcode_regex_string, 

58 REGEX_COMPILE_FLAGS, 

59) 

60from crate_anon.common.stringfunc import ( 

61 get_digit_string_from_vaguely_numeric_string, 

62 reduce_to_alphanumeric, 

63) 

64 

65log = logging.getLogger(__name__) 

66 

67 

68# ============================================================================= 

69# Test anonymisation regexes 

70# ============================================================================= 

71 

72 

class AnonRegexTests(TestCase):
    """
    Unit tests for the anonymisation regex builders.

    - ``test_most`` is a visual smoke test: it builds one regex per kind of
      scrubber, applies each to ``STRING_1`` and prints the result. It makes
      no assertions; it passes if nothing raises.
    - ``test_generic_date`` asserts which strings the generic (nonspecific)
      date regex does and does not find, with and without the word-boundary
      restriction.
    """

    # Specimen text for test_most(). A raw string, so backslashes (none at
    # present) would be literal. It is only printed, never compared exactly.
    STRING_1 = r"""
        I was born on 07 Jan 2013, m'lud.
        It was 7 January 13, or 7/1/13, or 1/7/13, or
        Jan 7 2013, or 2013/01/07, or 2013-01-07,
        or 7th January
        13 (split over a line)
        or Jan 7th 13
        or 07.01.13 or 7.1.2013
        or a host of other variations.
        And ISO-8601 formats like 20130107T0123, or just 20130107.

        BUT NOT 8 Jan 2013, or 2013/02/07, or 2013
        Jan 17, or just a number like 7, or a month
        like January, or a nonspecific date like
        Jan 2013 or 7 January. And not ISO-8601-formatted other dates
        like 20130108T0123, or just 20130108.

        I am 34 years old. My mother was 348, or 834, or perhaps 8348.
        Was she 34.6? Don't think so.

        Her IDs include NHS#123456, or 123 456, or (123) 456, or 123456.

        I am 34 years old. My mother was 348, or 834, or perhaps 8348.
        She wasn't my step-mother, or my grandmother, or my mother-in-law.
        She was my MOTHER!
        A typo is mther.

        Unicode apostrophe: the thread’s possession

        E-mail: bob@pobox.com, mr.jones@somewhere.nhs.uk, blah@place.com
            Mr.Jones@somewhere.nhs.uk

        Some numbers by size:
            1
            12
            123
            1234
            12345
            123456
            1234567
            12345678
            123456789
            1234567890
            12345678901
            123456789012
            1234567890123
            12345678901234
            123456789012345
        Some postcodes (from https://www.mrs.org.uk/pdf/postcodeformat.pdf)
            M1 1AA
            M60 1NW
            CR2 6XH
            DN55 1PT
            W1A 1HQ
            EC1A 1BB
    """

    @staticmethod
    def report(title: str, string: str) -> None:
        """
        Print ``string`` to stdout beneath a banner containing ``title``.

        Args:
            title: heading shown between two rules of 79 ``=`` characters
            string: the text to print under the heading
        """
        print("=" * 79)
        print(title)
        print("=" * 79)
        print(string)

    def test_most(self) -> None:
        """
        Build each kind of scrubbing regex, apply it to ``STRING_1``, and
        print the results (plus some of the regex strings themselves).

        There are no assertions here; the output is for human inspection.
        """
        s = self.STRING_1
        testnumber = 34
        testnumber_as_text = "123456"
        testdate_str = "7 Jan 2013"
        testdate = dateutil.parser.parse(testdate_str)
        teststring = "mother"
        testphrase = "348 or 834"
        date_19th_c = "3 Sep 1847"
        old_testdate = dateutil.parser.parse(date_19th_c)
        testemail = "mr.jones@somewhere.nhs.uk"

        # One compiled regex per kind of identifier...
        regex_date = get_regex_from_elements(get_date_regex_elements(testdate))
        regex_number = get_regex_from_elements(
            get_code_regex_elements(str(testnumber))
        )
        regex_number_as_text = get_regex_from_elements(
            get_code_regex_elements(
                get_digit_string_from_vaguely_numeric_string(
                    testnumber_as_text
                )
            )
        )
        regex_string = get_regex_from_elements(
            get_string_regex_elements(teststring)
        )
        regex_email = get_regex_from_elements(
            get_string_regex_elements(testemail)
        )
        regex_phrase = get_regex_from_elements(
            get_phrase_regex_elements(testphrase)
        )
        regex_10digit = get_regex_from_elements(
            get_number_of_length_n_regex_elements(10)
        )
        regex_postcode = get_regex_from_elements(
            get_uk_postcode_regex_elements()
        )
        # ... and one regex combining all of the above.
        all_elements = (
            get_date_regex_elements(testdate)
            + get_code_regex_elements(str(testnumber))
            + get_code_regex_elements(
                get_digit_string_from_vaguely_numeric_string(
                    testnumber_as_text
                )
            )
            + get_string_regex_elements(teststring)
            + get_string_regex_elements(testemail)
            + get_phrase_regex_elements(testphrase)
            + get_number_of_length_n_regex_elements(10)
            + get_uk_postcode_regex_elements()
        )
        regex_all = get_regex_from_elements(all_elements)

        # Show the specimen text after each substitution.
        self.report(
            "Removing date: " + testdate_str, regex_date.sub("DATE_GONE", s)
        )
        self.report(
            f"Removing number: {testnumber}",
            regex_number.sub("NUMBER_GONE", s),
        )
        self.report(
            "Removing numbers as text: " + testnumber_as_text,
            regex_number_as_text.sub("NUMBER_AS_TEXT_GONE", s),
        )
        self.report(
            "Removing string: " + teststring,
            regex_string.sub("STRING_GONE", s),
        )
        self.report(
            "Removing email: " + testemail, regex_email.sub("EMAIL_GONE", s)
        )
        self.report(
            "Removing phrase: " + testphrase,
            regex_phrase.sub("PHRASE_GONE", s),
        )
        self.report(
            "Removing 10-digit numbers",
            regex_10digit.sub("TEN_DIGIT_NUMBERS_GONE", s),
        )
        self.report(
            "Removing postcodes", regex_postcode.sub("POSTCODES_GONE", s)
        )
        self.report("Removing everything", regex_all.sub("EVERYTHING_GONE", s))

        # Show some of the regex strings themselves.
        self.report(
            "All-elements regex", get_regex_string_from_elements(all_elements)
        )
        self.report(
            "Date regex",
            get_regex_string_from_elements(get_date_regex_elements(testdate)),
        )
        self.report(
            "Date regex for 19th century",
            get_regex_string_from_elements(
                get_date_regex_elements(old_testdate)
            ),
        )
        self.report(
            "Phrase regex",
            get_regex_string_from_elements(
                get_phrase_regex_elements(testphrase)
            ),
        )
        self.report(
            "10-digit-number regex",
            get_regex_string_from_elements(
                get_number_of_length_n_regex_elements(10)
            ),
        )

    def test_generic_date(self) -> None:
        """
        Check which strings the generic date regex finds.

        Each specimen is searched (not anchored-matched) with two regexes:
        one built with ``at_word_boundaries_only=True`` and one with
        ``at_word_boundaries_only=False``. Specimens are grouped by the
        expected outcome for each of the two regexes.
        """
        # https://stackoverflow.com/questions/51224/regular-expression-to-match-valid-dates  # noqa: E501
        valid = (
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # From that StackOverflow set
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Day, month, year
            "2/11/73",
            "02/11/1973",
            "2/1/73",
            "02/01/73",
            "31/1/1973",
            "02/1/1973",
            "31.1.2011",
            "31-1-2001",
            "29/2/1973",
            "29/02/1976",
            "03/06/2010",
            "12/6/90",
            # month, day, year
            "02/24/1975",
            "06/19/66",
            "03.31.1991",
            "2.29.2003",
            "02-29-55",
            "03-13-55",
            "03-13-1955",
            r"12\24\1974",
            r"12\30\1974",
            r"1\31\1974",
            "03/31/2001",
            "01/21/2001",
            "12/13/2001",
            # Match both DMY and MDY
            "12/12/1978",
            "6/6/78",
            "06/6/1978",
            "6/06/1978",
            # using whitespace as a delimiter
            "13 11 2001",
            "11 13 2001",
            "11 13 01",
            "13 11 01",
            "1 1 01",
            "1 1 2001",
            # Year Month Day order
            "76/02/02",
            "1976/02/29",
            "1976/2/13",
            "76/09/31",
            # YYYYMMDD sortable format
            "19741213",
            "19750101",
            # Valid dates before Epoch
            "12/1/10",
            "12/01/00",
            "12/01/0000",
            # Valid date after 2038
            "01/01/2039",
            "01/01/39",
            # Dates with leading or trailing characters (but still word
            # boundaries)
            "12/31/21/",
            "12/10/2016 8:26:00.39",
            "31/12/1921.10:55",
            # Dates that runs across two lines
            "1/12/19\n74",
            "01/12/19\n74/13/1946",
            "31/12/20\n08:13",
            # Odd but accepted
            "2/12-73",
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Extras with our system supporting month words/ordinals
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            "2 Sep 1990",
            "2nd Sep 1990",
            "2 September 1990",
            "02 September 90",
            "2-Sep-90",
            "1990-Sep-02",
            "Sep 2 1990",
            "Sep 2nd 1990",
            "1st Sep 90",
            "1st Sept 2000",
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Additional styles from JL
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            "blah for some name dob 7.3.04 but thing",
            "x] |D.O.B. |24/02/1973 | |Detail",
        )
        suboptimal_but_accepted = (
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # From that StackOverflow set
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Invalid, corrupted or nonsense dates
            "74/2/29",  # wasn't a leap year
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Extras with our system supporting month words/ordinals
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            "1nd Sep 90",  # ordinal suffix-to-number mapping not checked
        )
        valid_only_without_word_boundaries = (
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Dates with leading or trailing characters (only recognized if
            # word boundaries not required)
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            "31/12/1921AD",
            "wfuwdf12/11/74iuhwf",
            "fwefew13/11/1974",
            "01/12/1974vdwdfwe",
            "01/01/99werwer",
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Additional styles from JL
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            "x y z DOB23.07.48 questionnaire",
        )
        not_currently_valid_perhaps_should_be = (
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Valid dates before Epoch
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            "12/01/660",
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Valid date beyond the year 9999
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            "01/01/10000",
        )
        invalid = (
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # From that StackOverflow set
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Dates with leading or trailing characters that render it garbage
            "12321301/01/99",
            # Invalid, corrupted or nonsense dates
            "00/01/2100",
            "31/31/2001",
            "101/12/1974",
            # Invalid, corrupted or nonsense dates
            "0/1/2001",
            "1/0/2001",
            "01/0/2001",
            "0101/2001",
            "01/131/2001",
            "56/56/56",
            "00/00/0000",
            "0/0/1999",
            "12/01/0",
            "12/10/-100",
            "12/32/45",
            "20/12/194",
            # Times that look like dates
            "12:13:56",
            "13:12:01",
            "1:12:01PM",
            "1:12:01 AM",
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Extras with our system supporting month words/ordinals
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            "1xx Sep 2000",
            "1st Spt 2000",
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Irrelevant content
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Two separate specimens: each needs its own trailing comma, or
            # Python silently joins adjacent string literals into one.
            "The cat sat on the mat.",
            "He started haloperidol 5mg x7/week in 2009.",
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Additional styles from JL
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            "x / y z DOB 0804013",
        )
        working_valid = valid + suboptimal_but_accepted
        working_invalid = not_currently_valid_perhaps_should_be + invalid

        # Build both flavours of the regex; keep the element strings so that
        # failure messages can show exactly what was tried.
        date_regex_wb_elements = get_generic_date_regex_elements(
            at_word_boundaries_only=True
        )
        date_regex_wb_elements_str = "\n".join(date_regex_wb_elements)
        date_regex_wb = get_regex_from_elements(date_regex_wb_elements)
        date_regex_no_wb_elements = get_generic_date_regex_elements(
            at_word_boundaries_only=False
        )
        date_regex_no_wb_elements_str = "\n".join(date_regex_no_wb_elements)
        date_regex_no_wb = get_regex_from_elements(date_regex_no_wb_elements)

        # match() = at beginning of string
        # search() = anywhere in string
        for x in working_valid:
            self.assertTrue(
                date_regex_wb.search(x),
                f"[#1] Should be recognized as a date (with word "
                f"boundaries) but isn't: {x!r}; "
                f"regex elements =\n{date_regex_wb_elements_str}",
            )
            self.assertTrue(
                date_regex_no_wb.search(x),
                f"[#2] Should be recognized as a date (without word "
                f"boundaries) but isn't: {x!r}; "
                f"regex elements =\n{date_regex_no_wb_elements_str}",
            )
        for x in valid_only_without_word_boundaries:
            self.assertFalse(
                date_regex_wb.search(x),
                f"[#3] Should not be recognized as a date (with word "
                f"boundaries) but is: {x!r}; "
                f"regex elements =\n{date_regex_wb_elements_str}",
            )
            self.assertTrue(
                date_regex_no_wb.search(x),
                f"[#4] Should be recognized as a date (without word "
                f"boundaries) but isn't: {x!r}; "
                f"regex elements =\n{date_regex_no_wb_elements_str}",
            )
        for x in working_invalid:
            self.assertFalse(
                date_regex_wb.search(x),
                f"[#5] Should not be recognized as a date (with word "
                f"boundaries) but is: {x!r}; "
                f"regex elements =\n{date_regex_wb_elements_str}",
            )
            self.assertFalse(
                date_regex_no_wb.search(x),
                f"[#6] Should not be recognized as a date (without word "
                f"boundaries) but is: {x!r}; "
                f"regex elements =\n{date_regex_no_wb_elements_str}",
            )

476 

477 

def examples_for_paper() -> None:
    """
    Print the regex elements generated for a set of specimen identifiers.

    These are the examples used in Cardinal (2017),
    https://doi.org/10.1186/s12911-017-0437-1.

    For each specimen (words, phrase, number, code, nonspecific 10-digit
    number, nonspecific UK postcode, date), a heading line is printed,
    followed by one regex element per line.
    """

    def show(heading: str, patterns: List[str]) -> None:
        # Heading first, then each regex element on its own line.
        print(heading)
        for pattern in patterns:
            print(pattern)

    # Settings shared by the word and phrase examples.
    max_errors = 0
    at_word_boundaries_only = True

    # ------------------------------------------------------------------------
    # Words: split into fragments; skip fragments that are too short.
    # ------------------------------------------------------------------------
    testwords = "John Al'Rahem"
    min_string_length_to_scrub_with = 4
    scrub_string_suffixes: List[str] = []
    words_regexes: List[str] = [
        element
        for fragment in get_anon_fragments_from_string(testwords)
        if len(fragment) >= min_string_length_to_scrub_with
        for element in get_string_regex_elements(
            fragment,
            suffixes=scrub_string_suffixes,
            at_word_boundaries_only=at_word_boundaries_only,
            max_errors=max_errors,
        )
    ]
    show(f"--- For words {testwords}:", words_regexes)

    # ------------------------------------------------------------------------
    # Phrase
    # ------------------------------------------------------------------------
    testphrase = "4 Privet Drive"
    show(
        f"--- For phrase {testphrase}:",
        get_phrase_regex_elements(
            testphrase,
            max_errors=max_errors,
            at_word_boundaries_only=at_word_boundaries_only,
        ),
    )

    # ------------------------------------------------------------------------
    # Number: reduced to its digits before the regex is built.
    # ------------------------------------------------------------------------
    testnumber = "(01223) 123456"
    anonymise_numbers_at_word_boundaries_only = False
    anonymise_numbers_at_numeric_boundaries_only = True
    show(
        f"--- For number {testnumber}:",
        get_code_regex_elements(
            get_digit_string_from_vaguely_numeric_string(testnumber),
            at_word_boundaries_only=anonymise_numbers_at_word_boundaries_only,
            at_numeric_boundaries_only=anonymise_numbers_at_numeric_boundaries_only,  # noqa: E501
        ),
    )

    # ------------------------------------------------------------------------
    # Code: reduced to alphanumeric characters before the regex is built.
    # ------------------------------------------------------------------------
    testcode = "CB12 3DE"
    anonymise_codes_at_word_boundaries_only = True
    show(
        f"--- For code {testcode}:",
        get_code_regex_elements(
            reduce_to_alphanumeric(testcode),
            at_word_boundaries_only=anonymise_codes_at_word_boundaries_only,
        ),
    )

    # ------------------------------------------------------------------------
    # Nonspecific: any number with a given count of digits
    # ------------------------------------------------------------------------
    n_digits = 10
    show(
        f"--- NONSPECIFIC: numbers of length {n_digits}:",
        get_number_of_length_n_regex_elements(
            n_digits,
            at_word_boundaries_only=anonymise_numbers_at_word_boundaries_only,
        ),
    )

    # ------------------------------------------------------------------------
    # Nonspecific: any UK postcode
    # ------------------------------------------------------------------------
    show(
        "--- NONSPECIFIC: UK postcodes:",
        get_uk_postcode_regex_elements(
            at_word_boundaries_only=anonymise_codes_at_word_boundaries_only
        ),
    )

    # ------------------------------------------------------------------------
    # Specific date
    # ------------------------------------------------------------------------
    testdate = date(2016, 12, 31)
    show(f"--- For date {testdate}:", get_date_regex_elements(testdate))

558 

559 

class MoreAnonRegexTests(TestCase):
    """
    Further tests of the regular expressions used for anonymisation:
    fragment splitting, specific dates, codes (whitespace tolerance and
    boundary handling), UK postcodes, and e-mail addresses.
    """

    # -------------------------------------------------------------------------
    # Helpers
    # -------------------------------------------------------------------------

    def _should_match(self, regexes: List[str], string: str) -> None:
        """
        Assert that at least one of ``regexes`` is found somewhere in
        ``string``.
        """
        # search (match anywhere), not match (match at start)
        found = any(regex.search(pattern, string) for pattern in regexes)
        self.assertTrue(
            found, f"Failed to match {string!r} against regexes {regexes}"
        )

    def _should_match_all(
        self, regexes: List[str], strings: List[str]
    ) -> None:
        """
        Apply :meth:`_should_match` to each of ``strings`` in turn.
        """
        for candidate in strings:
            self._should_match(regexes, candidate)

    def _should_not_match(self, regexes: List[str], string: str) -> None:
        """
        Assert that none of ``regexes`` is found anywhere in ``string``.
        """
        # search (match anywhere), not match (match at start)
        found = any(regex.search(pattern, string) for pattern in regexes)
        self.assertFalse(
            found,
            f"Inappropriately matched {string!r} against regexes {regexes}",
        )

    def _should_not_match_any(
        self, regexes: List[str], strings: List[str]
    ) -> None:
        """
        Apply :meth:`_should_not_match` to each of ``strings`` in turn.
        """
        for candidate in strings:
            self._should_not_match(regexes, candidate)

    # -------------------------------------------------------------------------
    # Tests
    # -------------------------------------------------------------------------

    def test_fragments(self) -> None:
        """
        Check how strings are split into fragments for scrubbing.
        """
        expectations = [
            ("John Smith", ["John", "Smith"]),
            ("John D'Souza", ["John", "D", "Souza"]),
            (" 42 West Street ", ["42", "West", "Street"]),
        ]  # type: List[Tuple[str, List[str]]]
        for source, fragments in expectations:
            self.assertEqual(get_anon_fragments_from_string(source), fragments)

    def test_date(self) -> None:
        """
        For each specimen date, check that the regexes built for that date
        are found in each of several textual renderings.
        """
        new_years_eve_2021 = [
            # Numeric:
            "2021-12-31",
            "31/12/2021",
            "31/12/21",
            "31.12.21",
            "12/31/2021",  # American
            "12/31/21",  # American
            "12.31.21",  # American
            # Partly textual:
            "31 Dec 2021",
            "31 December 2021",
            "31 December, 2021",
            "December 31 2021",
            "December 31, 2021",
        ]
        sixth_may_1980 = [
            # Numeric:
            "1980-05-06",
            "6/5/1980",
            "6/5/80",
            "6.5.80",
            "06/05/1980",
            "5/6/80",  # American
            # Partly textual:
            "6 May 1980",
            "May 6, 80",
        ]
        tests: List[Tuple[date, List[str]]] = [
            (date(2021, 12, 31), new_years_eve_2021),
            (date(1980, 5, 6), sixth_may_1980),
            # Dates embedded in surrounding free text:
            (date(2004, 3, 7), ["blah for some name dob 7.3.04 but thing"]),
            (date(2001, 4, 8), ["x / y z DOB 0804013"]),
            (date(1948, 7, 23), ["x y z DOB23.07.48 questionnaire"]),
            (date(1973, 2, 24), ["x] |D.O.B. |24/02/1973 | |Detail"]),
        ]
        for testdate, text_versions in tests:
            self._should_match_all(
                get_date_regex_elements(testdate), text_versions
            )

    def test_code_whitespace(self) -> None:
        """
        Check that a code, once reduced to alphanumeric characters, is found
        in text that spaces it differently.
        """
        # Both ways of writing the code are checked against the same texts.
        spaced_variants = [
            " PE123AB ",
            "PE12 3AB",
            "PE 12 3 AB",
        ]
        for testcode in ("PE123AB", "PE 12 3AB"):
            regexes = get_code_regex_elements(reduce_to_alphanumeric(testcode))
            self._should_match_all(regexes, spaced_variants)

    def test_code_boundaries(self) -> None:
        """
        Check the effect of the word-boundary and numeric-boundary options
        on whether a code is found next to letters, digits, or separators.
        """
        code = "ABC123"

        # The code set off by spaces/commas from its neighbours...
        separated = [
            f"pq {code} xy",
            f"pq,{code},xy",
            f"12 {code} 34",
            f"12,{code},34",
        ]
        # ... touching letters on at least one side...
        touching_letters = [
            f"pq{code}xy",
            f"pq{code} xy",
            f"pq {code}xy",
        ]
        # ... or touching digits on at least one side.
        touching_digits = [
            f"12{code}34",
            f"12{code} 34",
            f"12 {code}34",
        ]
        strict = dict(liberal=False, very_liberal=False)

        word_boundaries = get_code_regex_elements(
            code, at_word_boundaries_only=True, **strict
        )
        self._should_match_all(word_boundaries, separated)
        self._should_not_match_any(
            word_boundaries, touching_letters + touching_digits
        )

        number_boundaries = get_code_regex_elements(
            code,
            at_word_boundaries_only=False,
            at_numeric_boundaries_only=True,
            **strict,
        )
        self._should_match_all(number_boundaries, separated + touching_letters)
        self._should_not_match_any(number_boundaries, touching_digits)

        anywhere = get_code_regex_elements(
            code,
            at_word_boundaries_only=False,
            at_numeric_boundaries_only=False,
            **strict,
        )
        self._should_match_all(
            anywhere, separated + touching_letters + touching_digits
        )

    def test_uk_postcodes(self) -> None:
        """
        Ensure we detect postcodes properly.
        """
        valid_postcodes = [
            # from https://www.mrs.org.uk/pdf/postcodeformat.pdf
            "M1 1AA",
            "M60 1NW",
            "CR2 6XH",
            "DN55 1PT",
            "W1A 1HQ",
            "EC1A 1BB",
            # Some of our institutional postcodes:
            "CB2 0QQ",
        ]
        # See also
        # https://club.ministryoftesting.com/t/fun-postcodes-to-use-when-testing/10772  # noqa: E501
        invalid_postcodes = [
            "ABCDEFG",
        ]
        pattern_string = get_uk_postcode_regex_string(
            at_word_boundaries_only=False
        )
        postcode_regex = regex.compile(pattern_string)
        # match(), so the postcode must be at the start of the string.
        for good in valid_postcodes:
            self.assertTrue(postcode_regex.match(good))
        for bad in invalid_postcodes:
            self.assertFalse(postcode_regex.match(bad))

    def test_email_addresses(self) -> None:
        """
        Ensure we detect e-mail addresses properly.
        This won't be completely perfect. See https://emailregex.com/.

        Specimen values:

        - https://help.xmatters.com/ondemand/trial/valid_email_format.htm
        """
        valid_email = [
            "person@place.com",
            "r&d@somewhere.nhs.uk",
            "abc-d@mail.com",
            "abc.def@mail.com",
            "abc@mail.com",
            "abc_def@mail.com",
            "abc.def@mail.cc",
            "abc.def@mail-archive.com",
            "abc.def@mail.org",
            "abc.def@mail.com",
            "abc-@mail.com",  # xmatters.com thinks wrong but is OK
            "abc#def@mail.com",  # xmatters.com thinks wrong but is OK
            "abc.def@mail.c",  # xmatters.com thinks wrong but ?is OK
        ]
        invalid_email = [
            "person",
            "person@",
            "@place.com",
            "person@place",
            "abc..def@mail.com",
            ".abc@mail.com",
            "abc.def@mail#archive.com",
            "abc.def@mail",
            "abc.def@mail..com",
        ]
        email_regex = regex.compile(EMAIL_REGEX_STR, flags=REGEX_COMPILE_FLAGS)
        for address in valid_email:
            recognized = email_regex.match(address)
            self.assertTrue(
                recognized,
                f"Should be a valid e-mail address but was not recognized: "
                f"{address!r}",
            )
        for address in invalid_email:
            recognized = email_regex.match(address)
            self.assertFalse(
                recognized,
                f"Should not be a valid e-mail address but was accepted: "
                f"{address!r}",
            )

857 

858 

if __name__ == "__main__":
    # Script entry point (not used when the tests are run by a test runner):
    # call the logging setup helper at DEBUG level first, then print the
    # example regexes.
    main_only_quicksetup_rootlogger(level=logging.DEBUG)
    examples_for_paper()