Coverage for linkage/tests/fuzzy_id_match_tests.py: 99%

647 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1""" 

2crate_anon/linkage/tests/fuzzy_id_match_tests.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26Unit tests. 

27 

28""" 

29 

30# ============================================================================= 

31# Imports 

32# ============================================================================= 

33 

34import logging 

35import unittest 

36from typing import List, Optional, Tuple, Type 

37 

38from cardinal_pythonlib.probability import probability_from_log_odds 

39from pendulum import Date 

40 

41from crate_anon.linkage.comparison import ( 

42 AdjustLogOddsComparison, 

43 Comparison, 

44 DirectComparison, 

45) 

46from crate_anon.linkage.constants import ( 

47 FuzzyDefaults, 

48 GENDER_FEMALE, 

49 GENDER_MALE, 

50 GENDER_MISSING, 

51 GENDER_OTHER, 

52 INFINITY, 

53 VALID_GENDERS, 

54) 

55from crate_anon.linkage.frequencies import ( 

56 BasicNameFreqInfo, 

57 NameFrequencyInfo, 

58 PostcodeFrequencyInfo, 

59) 

60from crate_anon.linkage.identifiers import ( 

61 DateOfBirth, 

62 DummyLetterIdentifier, 

63 DummyLetterTemporalIdentifier, 

64 Forename, 

65 gen_best_comparisons, 

66 Gender, 

67 Identifier, 

68 PerfectID, 

69 Postcode, 

70 Surname, 

71 SurnameFragment, 

72 TemporalIDHolder, 

73) 

74from crate_anon.linkage.helpers import ( 

75 get_postcode_sector, 

76 is_valid_isoformat_date, 

77 ln, 

78 POSTCODE_REGEX, 

79 remove_redundant_whitespace, 

80 safe_upper, 

81 simplify_punctuation_whitespace, 

82 standardize_name, 

83 standardize_postcode, 

84 surname_alternative_fragments, 

85) 

86from crate_anon.linkage.matchconfig import MatchConfig 

87from crate_anon.linkage.people import DuplicateIDError, People 

88from crate_anon.linkage.person import Person 

89 

90log = logging.getLogger(__name__) 

91 

92 

93# ============================================================================= 

94# Constants 

95# ============================================================================= 

96 

# Strings that must be rejected as ISO-format dates: month 31, a bare year,
# non-date text, and a day that does not exist (30 February).
BAD_DATE_STRINGS = [
    "1950-31-12",
    "1950",
    "blah",
    "2000-02-30",
]

# Strings that must be accepted as ISO-format dates.
GOOD_DATE_STRINGS = [
    "1950-12-31",
    "1890-01-01",
    "2000-01-01",
]

# Strings that must be rejected as postcodes.
BAD_POSTCODES = [
    "99XX99",
    "CB99 9XXY",
    "CB99",
    "CB2",
    "NW19TTTEMP",
    "NW19TT TEMP",
]

# Strings that are good postcodes once standardized, anyway.
GOOD_POSTCODES = [
    "CB99 9XY",
    "CB2 0QQ",
    "ZZ99 3VZ",
    "Z Z 9 9 3 V Z",
    " zz993vz ",
]

# Strings that must be rejected as gender codes.
BAD_GENDERS = [
    "Y",
    "male",
    "female",
    "?",
]

115 

116 

117# ============================================================================= 

118# Rapid creation of a dummy config (without loading actual name/postcode info) 

119# ============================================================================= 

120 

121 

def mk_test_config(**kwargs) -> MatchConfig:
    """
    Build a :class:`MatchConfig` for tests from a small, hard-coded set of
    name frequencies, with empty filenames for all frequency sources.

    Args:
        kwargs: extra keyword arguments passed straight to
            :class:`MatchConfig`.

    Returns:
        the dummy config
    """
    # Column order of every row in the tables below.
    columns = (
        "name",
        "gender",
        "p_name",
        "metaphone",
        "p_metaphone",
        "p_metaphone_not_name",
        "f2c",
        "p_f2c",
        "p_f2c_not_name_metaphone",
    )

    def _infos(rows) -> List[BasicNameFreqInfo]:
        # One (non-synthetic) frequency record per table row.
        return [
            BasicNameFreqInfo(**dict(zip(columns, row)), synthetic=False)
            for row in rows
        ]

    # -------------------------------------------------------------------------
    # Forenames (indexed by name and gender)
    # -------------------------------------------------------------------------
    forename_rows = (
        (
            "ALICE", GENDER_FEMALE, 0.0032597245570899847,
            "ALK", 0.005664202032042135, 0.00240447747495215,
            "AL", 0.027635117202534115, 0.022541989771499515,
        ),
        (
            "BEATRICE", GENDER_FEMALE, 0.0011134697472956023,
            "PTRK", 0.010795171997297154, 0.009681702250001551,
            "BE", 0.020540629656206778, 0.01938862260342886,
        ),
        (
            "BETTY", GENDER_FEMALE, 0.005856056682186572,
            "PT", 0.007567968531021441, 0.0017119118488348687,
            "BE", 0.020540629656206778, 0.014031211254451567,
        ),
        (
            "BOB", GENDER_MALE, 0.0005341749908504777,
            "PP", 0.002569054271327976, 0.0020348792804774983,
            "BO", 0.0035610312205931094, 0.0010918026974037107,
        ),
        (
            "CAROLINE", GENDER_FEMALE, 0.001289812197195456,
            "KRLN", 0.005979308865585442, 0.004689496668389986,
            "CA", 0.033910941860871194, 0.02860674130257904,
        ),
        (
            "CELIA", GENDER_FEMALE, 0.0003141885536034312,
            "KL", 0.016359410337593906, 0.016045221783990475,
            "CE", 0.0030682294813082723, 0.0026663592268114434,
        ),
        (
            "DELILAH", GENDER_FEMALE, 0.00019936172952521078,
            "TLL", 0.000491534931894549, 0.00029217320236933826,
            "DE", 0.02472305974107954, 0.024435022377723725,
        ),
        (
            "DOROTHY", GENDER_FEMALE, 0.006484867451993301,
            "TR0", 0.007164437365410392, 0.0006795699134170908,
            "DO", 0.020044376270378746, 0.01298493890496824,
        ),
        (
            "ELIZABETH", GENDER_FEMALE, 0.009497275400440382,
            "ALSP", 0.010079561736620864, 0.0005822863361804823,
            "EL", 0.02543961854560152, 0.015404362973960957,
        ),
    )
    predefined_forenames = _infos(forename_rows)
    forename_freq_info = NameFrequencyInfo(
        csv_filename="",
        cache_filename="",
        by_gender=True,
        min_frequency=FuzzyDefaults.FORENAME_MIN_FREQ,
    )
    for forename in predefined_forenames:
        key = (forename.name, forename.gender)
        forename_freq_info.name_gender_idx[key] = forename

    # -------------------------------------------------------------------------
    # Surnames (gender left blank)
    # -------------------------------------------------------------------------
    surname_rows = (
        (
            "JONES", "", 0.00621,
            "JNS", 0.0068899999999999986, 0.0006799999999999983,
            "JO", 0.019480000000000268, 0.012984999999999938,
        ),
        (
            "SMITH", "", 0.01006,
            "SM0", 0.010129999999999998, 6.999999999999888e-05,
            "SM", 0.012514999999999967, 0.0023849999999999896,
        ),
    )
    predefined_surnames = _infos(surname_rows)
    surname_freq_info = NameFrequencyInfo(
        csv_filename="",
        cache_filename="",
        by_gender=False,
        min_frequency=FuzzyDefaults.SURNAME_MIN_FREQ,
    )
    for surname in predefined_surnames:
        key = (surname.name, surname.gender)
        surname_freq_info.name_gender_idx[key] = surname

    # -------------------------------------------------------------------------
    # Postcodes, and the config itself
    # -------------------------------------------------------------------------
    postcode_freq_info = PostcodeFrequencyInfo(
        csv_filename="",
        cache_filename="",
    )
    return MatchConfig(
        forename_freq_info=forename_freq_info,
        surname_freq_info=surname_freq_info,
        postcode_freq_info=postcode_freq_info,
        **kwargs,
    )

290 

291 

292# ============================================================================= 

293# Helper class 

294# ============================================================================= 

295 

296 

class TestCondition:
    """
    Two representations of a person and whether they should match.

    Keeps both the plaintext people and hashed copies of them, so that the
    same expectation can be checked against both forms.
    """

    def __init__(
        self,
        cfg: MatchConfig,
        person_a: Person,
        person_b: Person,
        should_match: bool,
        debug: bool = True,
    ) -> None:
        """
        Args:
            cfg: the main :class:`MatchConfig` object
            person_a: one representation of a person (must be plaintext)
            person_b: another representation of a person (must be plaintext)
            should_match: should they be treated as the same person?
            debug: be verbose?
        """
        self.cfg = cfg
        self.person_a = person_a
        self.person_b = person_b
        self.should_match = should_match

        # The inputs must be plaintext, valid as probands, and built only
        # from plaintext identifiers.
        for id_person in (self.person_a, self.person_b):
            assert id_person.is_plaintext()
            id_person.ensure_valid_as_proband()
            for identifier in id_person.debug_gen_identifiers():
                assert identifier.is_plaintext

        log.info("- Making hashed versions for later")
        self.hashed_a = self.person_a.hashed()
        self.hashed_b = self.person_b.hashed()
        # The hashed copies must be the mirror image: hashed, still valid as
        # probands, and with no plaintext identifiers left.
        for h_person in (self.hashed_a, self.hashed_b):
            assert h_person.is_hashed()
            h_person.ensure_valid_as_proband()
            for identifier in h_person.debug_gen_identifiers():
                assert not identifier.is_plaintext
        self.debug = debug

    def log_odds_same_plaintext(self) -> float:
        """
        Compares the plaintext person objects.

        Returns:
            float: the log odds that they are the same person
        """
        return self.person_a.log_odds_same(self.person_b)

    def log_odds_same_hashed(self) -> float:
        """
        Compares the hashed person objects.

        Returns:
            float: the log odds that they are the same person
        """
        return self.hashed_a.log_odds_same(self.hashed_b)

    def matches_plaintext(self) -> Tuple[bool, float]:
        """
        Do the plaintext versions match, by threshold?

        Returns:
            tuple: (matches, log_odds)
        """
        log_odds = self.log_odds_same_plaintext()
        return self.cfg.exceeds_primary_threshold(log_odds), log_odds

    def matches_hashed(self) -> Tuple[bool, float]:
        """
        Do the hashed versions match, by threshold?

        Returns:
            tuple: (matches, log_odds)
        """
        log_odds = self.log_odds_same_hashed()
        return self.cfg.exceeds_primary_threshold(log_odds), log_odds

    def check_comparison_as_expected(self) -> None:
        """
        Asserts that both the plaintext and hashed versions match, or don't
        match, according to ``self.should_match``, and that both comparisons
        give exactly the same log odds.

        Raises:
            AssertionError: if either comparison disagrees with
                ``self.should_match``, or if the plaintext and hashed log
                odds differ
        """
        log.info(
            f"Comparing:\n" f"- {self.person_a!r}\n" f"- {self.person_b!r}"
        )

        # ---------------------------------------------------------------------
        # (1) Plaintext
        # ---------------------------------------------------------------------
        log.info("(1) Comparing plaintext")
        matches_raw, log_odds_plaintext = self.matches_plaintext()
        p_plaintext = probability_from_log_odds(log_odds_plaintext)
        p_plain_str = f"P(match | D) = {p_plaintext}"
        if matches_raw == self.should_match:
            if matches_raw:
                log.info(f"... should and does match: {p_plain_str}")
            else:
                log.info(f"... should not and does not match: {p_plain_str}")
        else:
            log_odds = log_odds_plaintext
            report = self.person_a.debug_comparison_report(
                self.person_b, verbose=False
            )
            raise AssertionError(
                f"Match failure: "
                f"matches_raw = {matches_raw}, "
                f"should_match = {self.should_match}, "
                f"log_odds = {log_odds}, "
                f"min_log_odds_for_match = {self.cfg.min_log_odds_for_match}, "
                f"P(match) = {probability_from_log_odds(log_odds)}, "
                f"person_a = {self.person_a}, "
                f"person_b = {self.person_b}, "
                f"report = {report}"
            )

        # ---------------------------------------------------------------------
        # (2) Hashed
        # ---------------------------------------------------------------------
        log.info(
            f"(2) Comparing hashed:\n"
            f"- {self.hashed_a}\n"
            f"- {self.hashed_b}"
        )
        matches_hashed, log_odds_hashed = self.matches_hashed()
        p_hashed = probability_from_log_odds(log_odds_hashed)
        p_hashed_str = f"P(match | D) = {p_hashed}"
        if matches_hashed == self.should_match:
            if matches_hashed:
                log.info(f"... should and does match: {p_hashed_str}")
            else:
                log.info(f"... should not and does not match: {p_hashed_str}")
        else:
            log_odds = log_odds_hashed
            report = self.hashed_a.debug_comparison_report(
                self.hashed_b, verbose=False
            )
            raise AssertionError(
                f"Match failure: "
                f"matches_hashed = {matches_hashed}, "
                f"should_match = {self.should_match}, "
                f"log_odds = {log_odds}, "
                f"threshold = {self.cfg.min_log_odds_for_match}, "
                f"min_log_odds_for_match = {self.cfg.min_log_odds_for_match}, "
                f"P(match) = {probability_from_log_odds(log_odds)}, "
                f"person_a = {self.person_a}, "
                f"person_b = {self.person_b}, "
                f"hashed_a = {self.hashed_a}, "
                f"hashed_b = {self.hashed_b}, "
                f"report = {report}"
            )

        # ---------------------------------------------------------------------
        # (3) Plaintext versus hashed
        # ---------------------------------------------------------------------
        log.info(
            "(3) Results of plaintext match should equal result of hashed "
            "match"
        )
        # Deliberately an exact float comparison, not an approximate one.
        if log_odds_hashed != log_odds_plaintext:
            raise AssertionError(
                "Plaintext/hashed comparison discrepancy: "
                f"person_a = {self.person_a}, "
                f"person_b = {self.person_b}, "
                # Every interpolating fragment needs its own "f" prefix;
                # without it the placeholder is emitted literally.
                f"log_odds_plaintext = {log_odds_plaintext}, "
                f"log_odds_hashed = {log_odds_hashed}"
            )

456 

457 

458# ============================================================================= 

459# Unit tests 

460# ============================================================================= 

461 

462 

class DummyTemporalIdentifierTests(unittest.TestCase):
    """
    Unit tests for the date-range overlap logic of
    :class:`TemporalIDHolder`.
    """

    def test_overlap(self) -> None:
        """
        Check ``TemporalIDHolder.overlaps()`` for pairs of date ranges that
        should and should not overlap, including ranges with ``None`` as a
        start and/or end date.
        """
        d1 = Date(2000, 1, 1)
        d2 = Date(2000, 1, 2)
        d3 = Date(2000, 1, 3)
        d4 = Date(2000, 1, 4)
        p = "dummypostcode"
        cases = (
            # (start, end) of first; (start, end) of second; overlap expected?
            # -----------------------------------------------------------------
            # Overlaps
            # -----------------------------------------------------------------
            ((d1, d2), (d2, d3), True),  # touching on a shared day
            ((d2, d3), (d1, d2), True),  # the same, other way round
            ((d1, d4), (d2, d3), True),  # one contains the other
            ((d1, None), (None, d4), True),
            ((None, None), (None, None), True),
            # -----------------------------------------------------------------
            # Non-overlaps
            # -----------------------------------------------------------------
            ((d1, d2), (d3, d4), False),
            ((None, d1), (d2, None), False),
        )
        for first_range, second_range, expected in cases:
            first = TemporalIDHolder(p, *first_range)
            second = TemporalIDHolder(p, *second_range)
            self.assertEqual(
                first.overlaps(second),
                expected,
                f"{first_range} versus {second_range}",
            )

514 

515 

516class FuzzyLinkageTests(unittest.TestCase): 

517 """ 

518 Tests of the fuzzy linkage system. 

519 """ 

520 

def __init__(self, *args, **kwargs) -> None:
    """
    Create the shared fixtures: a dummy config, two postcodes, a set of
    plaintext people, their hashed equivalents, and a :class:`People`
    collection for each of the plaintext and hashed sets.
    """
    super().__init__(*args, **kwargs)
    self.cfg = mk_test_config(rounding_sf=None)

    def postcode_2000_to_2010(unit: str) -> Postcode:
        # A postcode held from 2000-01-01 to 2010-01-01.
        return Postcode(
            cfg=self.cfg,
            postcode=unit,
            start_date=Date(2000, 1, 1),
            end_date=Date(2010, 1, 1),
        )

    def person(local_id, forenames, surnames, **optional) -> Person:
        # "optional" carries dob and/or postcodes, only where specified.
        return Person(
            cfg=self.cfg,
            local_id=local_id,
            forenames=forenames,
            surnames=surnames,
            **optional,
        )

    self.p1 = postcode_2000_to_2010("CB2 0QQ")  # Addenbrooke's Hospital
    self.p2 = postcode_2000_to_2010("CB2 3EB")  # Department of Psychology

    # People with a surname absent from the dummy frequency tables, sharing
    # a DOB and the Addenbrooke's postcode:
    self.alice_bcd_rarename_2000_add = person(
        "1",
        ["Alice", "Beatrice", "Celia", "Delilah"],
        ["Rarename"],
        dob="2000-01-01",
        postcodes=[self.p1],
    )
    self.alec_bcd_rarename_2000_add = person(
        "2",
        # Alec: same metaphone as Alice
        ["Alec", "Beatrice", "Celia", "Delilah"],
        ["Rarename"],
        dob="2000-01-01",
        postcodes=[self.p1],
    )
    self.bob_bcd_rarename_2000_add = person(
        "3",
        ["Bob", "Beatrice", "Celia", "Delilah"],
        ["Rarename"],
        dob="2000-01-01",
        postcodes=[self.p1],
    )
    self.alice_bc_rarename_2000_add = person(
        "4",
        ["Alice", "Beatrice", "Celia"],
        ["Rarename"],
        dob="2000-01-01",
        postcodes=[self.p1],
    )
    self.alice_b_rarename_2000_add = person(
        "5",
        ["Alice", "Beatrice"],
        ["Rarename"],
        dob="2000-01-01",
        postcodes=[self.p1],
    )

    # People with surnames present in the dummy frequency tables:
    self.alice_jones_2000_add = person(
        "6",
        ["Alice"],
        ["Jones"],
        dob="2000-01-01",
        postcodes=[self.p1],
    )
    self.bob_smith_1950_psych = person(
        "7",
        ["Bob"],
        ["Smith"],
        dob="1950-05-30",
        postcodes=[self.p2],
    )
    self.alice_smith_1930 = person(
        "8", ["Alice"], ["Smith"], dob="1930-01-01"
    )
    self.alice_smith_2000 = person(
        "9", ["Alice"], ["Smith"], dob="2000-01-01"
    )
    self.alice_smith = person("10", ["Alice"], ["Smith"])
    self.alice_bc_smith = person(
        "11", ["Alice", "Betty", "Caroline"], ["Smith"]
    )
    self.alice_bde_smith = person(
        "12", ["Alice", "Betty", "Dorothy", "Elizabeth"], ["Smith"]
    )

    self.all_people = [
        self.alice_bcd_rarename_2000_add,
        self.alec_bcd_rarename_2000_add,
        self.bob_bcd_rarename_2000_add,
        self.alice_bc_rarename_2000_add,
        self.alice_b_rarename_2000_add,
        self.alice_jones_2000_add,
        self.bob_smith_1950_psych,
        self.alice_smith_1930,
        self.alice_smith_2000,
        self.alice_smith,
        self.alice_bc_smith,
        self.alice_bde_smith,
    ]
    self.all_people_hashed = [
        plain_person.hashed() for plain_person in self.all_people
    ]

    self.people_plaintext = People(cfg=self.cfg)
    self.people_plaintext.add_people(self.all_people)
    self.people_hashed = People(cfg=self.cfg)
    self.people_hashed.add_people(self.all_people_hashed)

644 

645 # ------------------------------------------------------------------------- 

646 # Basic string transformations 

647 # ------------------------------------------------------------------------- 

648 

def test_standardize_name(self) -> None:
    """
    Check :func:`standardize_name` against known name/result pairs.
    """
    # Maps each raw name to its expected standardized version.
    expected_by_name = {
        "Al Jazeera": "ALJAZEERA",
        "Al'Jazeera": "ALJAZEERA",
        "Al'Jazeera'": "ALJAZEERA",
        "Alice": "ALICE",
        "ALJAZEERA": "ALJAZEERA",
        "aljazeera": "ALJAZEERA",
        "D'Souza": "DSOUZA",
        "de Clérambault": "DECLERAMBAULT",
        "Mary Ellen": "MARYELLEN",
        '"Al Jazeera"': "ALJAZEERA",
        "Müller": "MULLER",
        "Straße": "STRASSE",
    }
    for raw_name, expected in expected_by_name.items():
        standardized = standardize_name(raw_name)
        self.assertEqual(standardized, expected)

667 

def test_safe_upper(self) -> None:
    """
    Check :func:`safe_upper` against known input/result pairs, including
    accented text and the German sharp s.
    """
    expected_by_text = {
        "Beethoven": "BEETHOVEN",
        "Clérambault": "CLÉRAMBAULT",
        "Straße": "STRAẞE",
    }
    for text, expected in expected_by_text.items():
        uppercased = safe_upper(text)
        self.assertEqual(uppercased, expected)

676 

def test_remove_redundant_whitespace(self) -> None:
    """
    Check :func:`remove_redundant_whitespace` against a known input/result
    pair.
    """
    cases = [
        # messy input, expected tidy output
        (" van \t \r \n Beethoven ", "van Beethoven"),
    ]
    for messy, expected in cases:
        tidied = remove_redundant_whitespace(messy)
        self.assertEqual(tidied, expected)

681 

def test_simplify_punctuation_whitespace(self) -> None:
    """
    Check :func:`simplify_punctuation_whitespace` against known
    input/result pairs: typographic quotes, assorted dashes, and
    non-space whitespace.
    """
    cases = [
        # input, expected simplified output
        ("\n ‘John said “hello”.’", " 'John said \"hello\".'"),
        ("\t a–b—c−d-e ", " a-b-c-d-e "),
    ]
    for fancy, expected in cases:
        simplified = simplify_punctuation_whitespace(fancy)
        self.assertEqual(simplified, expected)

689 

def test_surname_fragments(self) -> None:
    """
    Check :func:`surname_alternative_fragments`, using the transliteration
    and nonspecific-component settings from the test config.
    """
    # In each expected answer, the original name (standardized) comes
    # first; then alphabetical order of all other variants. Some examples
    # are silly.
    french = [
        (
            "Côte d'Ivoire",
            ["CÔTEDIVOIRE", "COTE", "COTEDIVOIRE", "CÔTE", "IVOIRE"],
        ),
        (
            "de Clérambault",
            [
                "DECLÉRAMBAULT",
                "CLERAMBAULT",
                "CLÉRAMBAULT",
                "DECLERAMBAULT",
            ],
        ),
        (
            "de la Billière",
            ["DELABILLIÈRE", "BILLIERE", "BILLIÈRE", "DELABILLIERE"],
        ),
        ("Façade", ["FAÇADE", "FACADE"]),
        ("Giscard d'Estaing", ["GISCARDDESTAING", "ESTAING", "GISCARD"]),
        ("L'Estrange", ["LESTRANGE", "ESTRANGE"]),
        ("L’Estrange", ["LESTRANGE", "ESTRANGE"]),
    ]
    # Germany (and in Beethoven's case, ancestrally Belgium):
    german = [
        ("Beethoven", ["BEETHOVEN"]),
        ("Mozart Smith", ["MOZARTSMITH", "MOZART", "SMITH"]),
        ("Mozart-Smith", ["MOZARTSMITH", "MOZART", "SMITH"]),
        ("Müller", ["MÜLLER", "MUELLER", "MULLER"]),
        ("Straße", ["STRAẞE", "STRASSE"]),
        ("van Beethoven", ["VANBEETHOVEN", "BEETHOVEN"]),
    ]
    italian = [
        ("Calabrò", ["CALABRÒ", "CALABRO"]),
        ("De Marinis", ["DEMARINIS", "MARINIS"]),
        ("di Bisanzio", ["DIBISANZIO", "BISANZIO"]),
    ]
    swedish = [
        # Hmm. NYSTROEM is a German-style transliteration. Still, OK-ish.
        ("Nyström", ["NYSTRÖM", "NYSTROEM", "NYSTROM"]),
    ]

    config = self.cfg
    transliterations = config.accent_transliterations
    nonspecific = config.nonspecific_name_components
    for surname, expected_fragments in french + german + italian + swedish:
        fragments = surname_alternative_fragments(
            surname=surname,
            accent_transliterations=transliterations,
            nonspecific_name_components=nonspecific,
        )
        self.assertEqual(fragments, expected_fragments)

745 

def test_date_regex(self) -> None:
    """
    Check that :func:`is_valid_isoformat_date` rejects every bad date
    string and accepts every good one.
    """
    checks = (
        # date strings, assertion to apply to the validity result
        (BAD_DATE_STRINGS, self.assertFalse),
        (GOOD_DATE_STRINGS, self.assertTrue),
    )
    for date_strings, check in checks:
        for date_string in date_strings:
            check(is_valid_isoformat_date(date_string))

751 

def test_standardize_postcode(self) -> None:
    """
    Check :func:`standardize_postcode`: differently formatted versions of
    one postcode should all standardize to the same string.
    """
    expected = "CB20QQ"
    variants = [
        "CB20QQ",
        " CB2 0QQ ",
        " CB2-0 QQ ",
        "cb2 0qq",
    ]
    for variant in variants:
        standardized = standardize_postcode(variant)
        self.assertEqual(standardized, expected)

762 

def test_get_postcode_sector(self) -> None:
    """
    Check :func:`get_postcode_sector`: differently formatted versions of
    one postcode should all give the same sector.
    """
    expected_sector = "CB20"
    variants = [
        "CB20QQ",
        " CB2 0QQ ",
        " CB2-0 QQ ",
        "cb2 0qq",
    ]
    for variant in variants:
        sector = get_postcode_sector(variant)
        self.assertEqual(sector, expected_sector)

773 

def test_postcode_regex(self) -> None:
    """
    Check ``POSTCODE_REGEX``: bad postcodes must not match, either raw or
    after standardization; good postcodes must match once standardized.
    """
    for bad in BAD_POSTCODES:
        raw_match = POSTCODE_REGEX.match(bad)
        self.assertIsNone(raw_match, f"Postcode {bad!r} matched but is bad")

        bad_standardized = standardize_postcode(bad)
        standardized_match = POSTCODE_REGEX.match(bad_standardized)
        self.assertIsNone(
            standardized_match,
            f"Postcode {bad!r} matched after standardization to "
            f"{bad_standardized!r} but is bad",
        )

    for good in GOOD_POSTCODES:
        good_standardized = standardize_postcode(good)
        good_match = POSTCODE_REGEX.match(good_standardized)
        self.assertTrue(
            good_match,
            f"Postcode {good_standardized!r} (from {good!r}) "
            f"did not match but is good",
        )

791 

792 # ------------------------------------------------------------------------- 

793 # Frequencies 

794 # ------------------------------------------------------------------------- 

795 

def test_fuzzy_linkage_frequencies_name(self) -> None:
    """
    Check that surname and forename frequency lookups return objects whose
    fields have the expected types, for names both present in and absent
    from the dummy frequency tables.
    """
    config = self.cfg

    expected_field_types = (
        ("name", str),
        ("gender", str),
        ("p_name", float),
        ("metaphone", str),
        ("p_metaphone", float),
        ("p_metaphone_not_name", float),
        ("f2c", str),
        ("p_f2c", float),
        ("p_f2c_not_name_metaphone", float),
    )

    def check_field_types(freq_info) -> None:
        # Every field of the frequency object must have its expected type.
        for field_name, field_type in expected_field_types:
            self.assertIsInstance(getattr(freq_info, field_name), field_type)

    surnames = [
        "Smith",
        "Jones",
        "Blair",
        "Cardinal",
        "XYZ",
        "W",  # no metaphone
    ]
    for surname in surnames:
        info = config.get_surname_freq_info(surname)
        log.info(f"Surname frequency for {surname}: {info}")
        check_field_types(info)

    forenames_genders = [
        ("James", GENDER_MALE),
        ("Rachel", GENDER_FEMALE),
        ("Phoebe", GENDER_FEMALE),
        ("Elizabeth", GENDER_FEMALE),
        ("Elizabeth", GENDER_MALE),
        ("Elizabeth", ""),
        ("Rowan", GENDER_FEMALE),
        ("Rowan", GENDER_MALE),
        ("Rowan", ""),
        ("XYZ", ""),
        ("W", ""),  # no metaphone
    ]
    for forename, gender in forenames_genders:
        info = config.get_forename_freq_info(forename, gender)
        log.info(
            f"Forename frequency for {forename}, gender {gender}: {info}"
        )
        check_field_types(info)

849 

def test_fuzzy_linkage_frequencies_postcode(self) -> None:
    """
    Exercise the postcode unit and sector population lookups. The results
    are only logged, so this just checks that the lookups run without
    raising.
    """
    config = self.cfg

    # Examples are hospitals and colleges in Cambridge (not residential)
    # but it gives a broad idea.
    units = ("CB2 0QQ", "CB2 0SZ", "CB2 3EB", "CB3 9DF")
    for unit in units:
        population = config.debug_postcode_unit_population(unit)
        log.info(
            f"Calculated population for postcode unit {unit}: {population}"
        )

    sectors = ("CB2 0", "CB2 1", "CB2 2", "CB2 3")
    for sector in sectors:
        population = config.debug_postcode_sector_population(sector)
        log.info(
            f"Calculated population for postcode sector {sector}: "
            f"{population}"
        )

863 

864 # ------------------------------------------------------------------------- 

865 # Identifiers 

866 # ------------------------------------------------------------------------- 

867 

def test_identifier_dob(self) -> None:
    """
    Test the :class:`DateOfBirth` identifier: rejection of bad date
    strings, self-matching, and the ordering of log odds for full
    matches, partial matches and mismatches.
    """
    config = self.cfg

    # Bad date strings must be refused at construction.
    for bad in BAD_DATE_STRINGS:
        with self.assertRaises(ValueError):
            DateOfBirth(config, bad)

    # Good dates round-trip as strings and fully match themselves.
    full_match_log_lr = None  # type: Optional[float]
    for good in GOOD_DATE_STRINGS:
        dob = DateOfBirth(config, good)
        self.assertEqual(dob.dob_str, good)
        self.assertEqual(str(dob), good)
        self.assertTrue(dob.fully_matches(dob))
        full_match_log_lr = dob.comparison(dob).posterior_log_odds(0)
        self.assertGreater(full_match_log_lr, 0)

    # Dates differing in exactly one component: partial matches, scoring
    # below the (last) full match.
    one_component_differs = [
        ("2000-01-01", "2007-01-01"),  # year mismatch only
        ("2000-01-01", "2000-07-01"),  # month mismatch only
        ("2000-01-01", "2000-01-07"),  # day mismatch only
    ]
    partial_match_log_lr = None  # type: Optional[float]
    for first_str, second_str in one_component_differs:
        first = DateOfBirth(config, first_str)
        second = DateOfBirth(config, second_str)
        both_directions = ((first, second), (second, first))
        for a, b in both_directions:
            self.assertFalse(a.fully_matches(b))
        for a, b in both_directions:
            self.assertTrue(a.partially_matches(b))
        partial_match_log_lr = first.comparison(second).posterior_log_odds(
            0
        )
        self.assertLess(partial_match_log_lr, full_match_log_lr)

    # Dates sharing only one component: neither full nor partial matches,
    # scoring below zero and below the (last) partial match.
    one_component_shared = [
        ("2000-01-01", "2007-07-01"),  # only day the same
        ("2000-01-01", "2000-07-07"),  # only year the same
        ("2000-01-01", "2007-01-07"),  # only month the same
    ]
    for first_str, second_str in one_component_shared:
        first = DateOfBirth(config, first_str)
        second = DateOfBirth(config, second_str)
        both_directions = ((first, second), (second, first))
        for a, b in both_directions:
            self.assertFalse(a.fully_matches(b))
        for a, b in both_directions:
            self.assertFalse(a.partially_matches(b))
        mismatch_log_lr = first.comparison(second).posterior_log_odds(0)
        self.assertLess(mismatch_log_lr, 0)
        self.assertLess(mismatch_log_lr, partial_match_log_lr)

915 

def test_identifier_postcode(self) -> None:
    """
    Test the :class:`Postcode` identifier: rejection of bad postcodes and
    of reversed date ranges, self-matching, and (under several configs)
    the ordering of log odds for full matches, partial (same-sector)
    matches and mismatches.
    """
    cfg = self.cfg
    configs = [
        cfg,
        # Check extremes of k_postcode:
        mk_test_config(k_postcode=1),
        mk_test_config(k_postcode=1000),
        # Check extremes of p_unknown_or_pseudo_postcode, k_pseudopostcode:
        mk_test_config(
            p_unknown_or_pseudo_postcode=0.00001, k_pseudopostcode=1.2
        ),
        mk_test_config(
            p_unknown_or_pseudo_postcode=0.01, k_pseudopostcode=3
        ),
        # Very low combinations, e.g.
        # p_unknown_or_pseudo_postcode=0.00001, k_pseudopostcode=1.001, may
        # cause an error here. Very high combinations, e.g.
        # p_unknown_or_pseudo_postcode=0.1, k_pseudopostcode=3, may also
        # cause an error.
    ]
    # Any invalid settings are detected by the Postcode identifier class
    # checking that its comparisons are in a sensible order. All
    # identifiers do this, in fact.

    for b in BAD_POSTCODES:
        with self.assertRaises(ValueError):
            _ = Postcode(cfg, b)
    early = Date(2020, 1, 1)
    late = Date(2021, 12, 31)
    for g in GOOD_POSTCODES:  # includes pseudopostcodes
        # A start date after the end date must be refused.
        with self.assertRaises(ValueError):
            _ = Postcode(cfg, g, start_date=late, end_date=early)
        p = Postcode(cfg, g)
        self.assertEqual(p.postcode_unit, standardize_postcode(g))
        self.assertTrue(p.fully_matches(p))

    empty = Postcode(cfg, "")
    self.assertEqual(str(empty), "")

    probe_partial_mismatch = (
        # Each tuple: (1) a postcode; (2) same sector, different unit; (3)
        # different sector.
        ("CB99 9XY", "CB99 9AB", "CB99 7AB"),  # nonsense
        ("CB2 0QQ", "CB2 0SL", "SW1A 2AA"),  # CUH 1, CUH 2, 10 Downing St
        ("ZZ99 3VZ", "ZZ99 3WZ", "ZZ99 1WZ"),  # pseudo: NFA, sea, Orkney
    )
    for probe_str, partial_str, mismatch_str in probe_partial_mismatch:
        for c in configs:
            p1 = Postcode(c, probe_str)
            p2 = Postcode(c, partial_str)
            p3 = Postcode(c, mismatch_str)

            # Everything matches itself.
            self.assertTrue(p1.fully_matches(p1))
            self.assertTrue(p2.fully_matches(p2))
            self.assertTrue(p3.fully_matches(p3))

            # Nothing matches another.
            self.assertFalse(p1.fully_matches(p2))
            self.assertFalse(p1.fully_matches(p3))
            self.assertFalse(p2.fully_matches(p3))

            # The partial match partially matches.
            self.assertTrue(p1.partially_matches(p2))

            # The nonmatch doesn't partially match.
            self.assertFalse(p1.partially_matches(p3))

            full_comp = p1.comparison(p1)
            full_log_lr = full_comp.posterior_log_odds(0)
            partial_comp = p1.comparison(p2)
            partial_log_lr = partial_comp.posterior_log_odds(0)
            nonmatch_comp = p1.comparison(p3)
            nonmatch_log_lr = nonmatch_comp.posterior_log_odds(0)

            # The failure messages report "c", the config these postcodes
            # were actually built with, not the default "cfg".
            self.assertGreater(
                full_log_lr,
                0,
                f"comparing {probe_str!r} to itself, giving {full_comp!r}",
            )
            self.assertLess(
                partial_log_lr,
                full_log_lr,
                f"comparing {probe_str!r} to {partial_str!r} "
                f"(partial match); \ncfg = {c};\n"
                f"p1 = {p1!r};\n"
                f"giving {partial_comp!r}, versus the exact comparison "
                f"{full_comp!r}",
            )
            self.assertLess(
                nonmatch_log_lr,
                partial_log_lr,
                f"comparing {probe_str!r} to {mismatch_str!r} "
                f"(nonmatch); \ncfg = {c};"
                f"\np1 = {p1!r};\n"
                f"giving {nonmatch_comp!r}, versus the previous partial "
                f"comparison {partial_comp!r}",
            )

1014 

def test_identifier_gender(self) -> None:
    """
    Check the Gender identifier: invalid values are rejected, valid values
    round-trip through ``gender_str`` and ``str()``, truthiness reflects
    presence, and self/cross comparisons behave as expected.
    """
    cfg = self.cfg

    # Invalid gender strings must be rejected at construction.
    for b in BAD_GENDERS:
        with self.assertRaises(ValueError):
            _ = Gender(cfg, b)

    for g_str in VALID_GENDERS:
        g = Gender(cfg, g_str)
        # Diagnostic output only, hence debug level.
        log.debug(f"g = {g!r}")
        self.assertEqual(g.gender_str, g_str)
        self.assertEqual(str(g), g_str)
        if not g:
            # A falsy (missing) gender has nothing further to compare.
            continue
        self.assertTrue(g.fully_matches(g))
        comp = g.comparison(g)
        if comp:
            self.assertGreater(comp.posterior_log_odds(0), 0)

    empty = Gender(cfg, GENDER_MISSING)
    m = Gender(cfg, GENDER_MALE)
    f = Gender(cfg, GENDER_FEMALE)
    x = Gender(cfg, GENDER_OTHER)

    empty.ensure_has_freq_info_if_id_present()
    m.ensure_has_freq_info_if_id_present()
    f.ensure_has_freq_info_if_id_present()
    x.ensure_has_freq_info_if_id_present()

    self.assertEqual(str(empty), "")

    self.assertTrue(bool(m))
    self.assertTrue(bool(f))
    self.assertTrue(bool(x))
    self.assertFalse(bool(empty))

    # Each of male and female fully matches itself and is relevant for
    # comparison with itself.
    self.assertTrue(m.fully_matches(m))
    self.assertTrue(m.comparison_relevant(m))

    self.assertTrue(f.fully_matches(f))
    self.assertTrue(f.comparison_relevant(f))

    # Different genders never fully match.
    self.assertFalse(m.fully_matches(f))
    self.assertFalse(m.fully_matches(x))
    self.assertFalse(f.fully_matches(m))
    self.assertFalse(f.fully_matches(x))

    # A match should raise the log odds from 0; a mismatch should lower
    # them.
    f_comp_f = f.comparison(f)
    self.assertIsNotNone(f_comp_f)
    self.assertGreater(f_comp_f.posterior_log_odds(0), 0)
    self.assertLess(f.comparison(m).posterior_log_odds(0), 0)

1064 

def test_identifier_surname_fragment(self) -> None:
    """
    A surname fragment should match itself fully and partially, both in
    plaintext and in hashed form, but a plaintext fragment should never
    match its own hashed version.
    """
    plain = SurnameFragment(self.cfg, name="Smith", gender=GENDER_MALE)
    hashed = plain.hashed()
    # (left, right, whether both kinds of match are expected)
    expectations = (
        (plain, plain, True),
        (plain, hashed, False),
        (hashed, hashed, True),
    )
    for left, right, should_match in expectations:
        check = self.assertTrue if should_match else self.assertFalse
        check(left.fully_matches(right))
        check(left.partially_matches(right))

1075 

def test_identifier_surname(self) -> None:
    """
    Check full, partial, and non-matching behaviour of Surname, including
    double-barrelled names and hashed versions.
    """
    # https://en.wikipedia.org/wiki/Double-barrelled_name
    cfg = self.cfg

    def surname(name: str) -> Surname:
        # All test surnames share the same configuration and gender.
        return Surname(cfg, name=name, gender=GENDER_FEMALE)

    jones = surname("Jones")
    mozart = surname("Mozart")
    mozart_smith_hy = surname("Mozart-Smith")
    mozart_smith_sp = surname("Mozart Smith")
    smith = surname("Smith")
    smythe = surname("Smythe")
    mozart_hashed = mozart.hashed()
    mozart_smith_hashed = mozart_smith_hy.hashed()
    smith_hashed = smith.hashed()
    smythe_hashed = smythe.hashed()

    # Pairs expected to match fully.
    full_pairs = [
        (jones, jones),
        (mozart_smith_hy, mozart),
        (mozart_smith_hy, mozart_smith_hy),
        (mozart_smith_hy, mozart_smith_sp),
        (mozart_smith_hy, smith),
        (mozart_smith_sp, mozart),
        (mozart_smith_sp, mozart_smith_hy),
        (mozart_smith_sp, smith),
        (smith, smith),
        (smythe, smythe),
        (mozart_hashed, mozart_hashed),
        (mozart_smith_hashed, mozart_smith_hashed),
        (smith_hashed, smith_hashed),
        (smythe_hashed, smythe_hashed),
    ]
    # Pairs expected to match partially but not fully.
    partial_pairs = [
        (mozart_smith_hy, smythe),
        (mozart_smith_sp, smythe),
        (smith, smythe),
        (smith_hashed, smythe_hashed),
        (mozart_smith_hashed, smythe_hashed),
    ]
    # Pairs expected not to match at all (including plaintext versus
    # hashed versions of the same name).
    no_match_pairs = [
        (jones, mozart_smith_hy),
        (jones, mozart_smith_sp),
        (smith, jones),
        (smith, mozart),
        (smith, smith_hashed),
        (smythe, smythe_hashed),
    ]

    for left, right in full_pairs:
        self.assertTrue(left.fully_matches(right))
    for left, right in partial_pairs:
        self.assertFalse(left.fully_matches(right))
        self.assertTrue(left.partially_matches(right))
    for left, right in no_match_pairs:
        self.assertFalse(left.fully_matches(right))
        self.assertFalse(left.partially_matches(right))

1129 

1130 # ------------------------------------------------------------------------- 

1131 # Lots of identifiers 

1132 # ------------------------------------------------------------------------- 

1133 

def test_identifier_transformations(self) -> None:
    """
    Convert each kind of identifier to its hashed and plaintext dictionary
    representations, and load an identifier back from each.
    """
    cfg = self.cfg
    identifiable = [
        DateOfBirth(cfg, dob="2000-12-31"),
        Forename(cfg, name="Elizabeth", gender=GENDER_FEMALE),
        Gender(cfg, gender=GENDER_MALE),
        PerfectID(cfg, identifiers={"nhsnum": 1}),
        Postcode(cfg, postcode="CB2 0QQ"),
        Surname(cfg, name="Smith", gender=GENDER_FEMALE),
        SurnameFragment(cfg, name="Smith", gender=GENDER_MALE),
    ]  # type: List[Identifier]
    for original in identifiable:
        self.assertTrue(original.is_plaintext)
        identifier_cls = type(original)  # type: Type[Identifier]

        # Hashed round trip first, then plaintext.
        for use_hash in (True, False):
            as_dict = original.as_dict(
                encrypt=use_hash, include_frequencies=True
            )
            reloaded = identifier_cls.from_dict(
                cfg, as_dict, hashed=use_hash
            )
            if use_hash:
                self.assertFalse(reloaded.is_plaintext)
            else:
                self.assertTrue(reloaded.is_plaintext)
            reloaded.ensure_has_freq_info_if_id_present()

1162 

1163 # ------------------------------------------------------------------------- 

1164 # Person checks 

1165 # ------------------------------------------------------------------------- 

1166 

def test_person_creation(self) -> None:
    """
    Blank, whitespace-only, and ``None`` entries should be dropped when a
    Person is created, leaving only the two real values in each case.
    """
    cfg = self.cfg
    empty = ""
    whitespace = " "

    with_forenames = Person(
        cfg,
        local_id="p1",
        forenames=["A", empty, whitespace, None, "B"],
    )
    self.assertEqual(len(with_forenames.forenames), 2)

    with_surnames = Person(
        cfg,
        local_id="p2",
        surnames=["A", empty, whitespace, None, "B"],
    )
    self.assertEqual(len(with_surnames.surnames), 2)

    first_postcode = GOOD_POSTCODES[0]
    second_postcode = GOOD_POSTCODES[1]
    with_postcodes = Person(
        cfg,
        local_id="p3",
        postcodes=[first_postcode, empty, whitespace, second_postcode],
    )
    self.assertEqual(len(with_postcodes.postcodes), 2)

1186 

def test_person_equality(self) -> None:
    """
    Person objects with different local IDs should compare unequal, those
    with the same local ID should compare equal, and a People collection
    should refuse a second person with an already-present local ID.
    """
    cfg = self.cfg
    hello = Person(cfg, local_id="hello")
    world = Person(cfg, local_id="world")
    world_again = Person(cfg, local_id="world")
    self.assertNotEqual(hello, world)
    self.assertEqual(world, world_again)

    people = People(cfg)
    for newcomer in (hello, world):
        people.add_person(newcomer)
    with self.assertRaises(DuplicateIDError):
        people.add_person(world_again)

1199 

def test_person_copy(self) -> None:
    """
    A copy of a Person should have equal values for every attribute listed
    in ``Person.ALL_PERSON_KEYS``.
    """
    for orig in (self.alice_smith,):
        duplicate = orig.copy()
        for attr in Person.ALL_PERSON_KEYS:
            orig_value = getattr(orig, attr)
            copy_value = getattr(duplicate, attr)
            failure_message = (
                f"mismatch for {attr}:\n"
                f"{orig_value!r}\n!=\n{copy_value!r}"
            )
            self.assertEqual(orig_value, copy_value, failure_message)

1213 

1214 # ------------------------------------------------------------------------- 

1215 # Person comparisons 

1216 # ------------------------------------------------------------------------- 

1217 

def test_fuzzy_linkage_matches(self) -> None:
    """
    Run a series of person-versus-person comparisons and check that each
    gives the expected match/non-match outcome.
    """
    # Each scenario: (person_a, person_b, should_match).
    scenarios = (
        # Very easy match
        (
            self.alice_bcd_rarename_2000_add,
            self.alice_bcd_rarename_2000_add,
            True,
        ),
        # Easy match
        (
            self.alice_bc_rarename_2000_add,
            self.alice_b_rarename_2000_add,
            True,
        ),
        # Easy non-match
        (self.alice_jones_2000_add, self.bob_smith_1950_psych, False),
        # Very ambiguous (1)
        (self.alice_smith, self.alice_smith_1930, False),
        # Very ambiguous (2)
        (self.alice_smith, self.alice_smith_2000, False),
        (
            self.alice_bcd_rarename_2000_add,
            self.alec_bcd_rarename_2000_add,
            True,
        ),
        (
            self.alice_bcd_rarename_2000_add,
            self.bob_bcd_rarename_2000_add,
            True,  # used to be False
        ),
    )
    test_values = [
        TestCondition(
            cfg=self.cfg,
            person_a=first,
            person_b=second,
            should_match=expected,
        )
        for first, second, expected in scenarios
    ]  # type: List[TestCondition]
    log.info("Testing comparisons...")
    for i, test in enumerate(test_values, start=1):
        log.info(f"Comparison {i}...")
        test.check_comparison_as_expected()

1272 

def test_fuzzy_more_complex(self) -> None:
    """
    Search for every test person, as a proband, within the plaintext and
    hashed samples, logging the winner each time. No outcome is asserted
    here; the test checks only that the searches run.
    """
    log.info("Testing proband-versus-sample...")
    for position, proband_plaintext in enumerate(self.all_people):
        log.info(f"Plaintext search with proband: {proband_plaintext}")
        plaintext_winner = self.people_plaintext.get_unique_match(
            proband_plaintext
        )
        log.info(f"... WINNER: {plaintext_winner}")

        log.info(f"Hashed search with proband: {proband_plaintext}\n")
        # The hashed list is in the same order as the plaintext list.
        proband_hashed = self.all_people_hashed[position]
        hashed_winner = self.people_hashed.get_unique_match(proband_hashed)
        log.info(f"... WINNER: {hashed_winner}")

1286 

def test_exact_match(self) -> None:
    """
    Test the exact-match system.
    """
    id_type = "nhsnum"
    id_value = 3
    # Two people with different local IDs who share the same perfect ID
    # and have no other identifiers:
    p1 = Person(
        cfg=self.cfg, local_id="p1", perfect_id={id_type: id_value}
    )
    p2 = Person(
        cfg=self.cfg, local_id="p2", perfect_id={id_type: id_value}
    )
    # Perfect ID comparison is a function of a People object, not Person.
    people = People(cfg=self.cfg, people=[p1])

    # Searching with p1 itself, and then with p2 (same perfect ID), should
    # both find p1 with infinite log odds.
    for proband in (p1, p2):
        outcome = people.get_unique_match_detailed(proband)
        self.assertEqual(outcome.winner, p1)
        self.assertEqual(outcome.best_log_odds, INFINITY)

    # No two people in a People object with the same ID:
    with self.assertRaises(DuplicateIDError):
        people.add_person(p2)

1315 

1316 # ------------------------------------------------------------------------- 

1317 # People checks 

1318 # ------------------------------------------------------------------------- 

1319 # See also test_person_equality() above. 

1320 

def test_shortlist(self) -> None:
    """
    Our shortlisting process typically permits people with completely
    matching or partially matching DOBs, but not those with mismatched DOBs
    (for efficiency). Test that.

    Membership is checked with ``assertIn``/``assertNotIn`` so that a
    failure reports the person concerned and the shortlist contents.
    """
    # Some test people:
    cfg1 = self.cfg
    proband = Person(cfg1, local_id="p1", dob="1950-01-01")
    full_dob_match = [
        # Full DOB match:
        Person(cfg1, local_id="p2", dob="1950-01-01"),
    ]
    partial_dob_match = [
        # Two components of DOB match:
        Person(cfg1, local_id="p3", dob="2000-01-01"),
        Person(cfg1, local_id="p4", dob="1950-12-01"),
        Person(cfg1, local_id="p5", dob="1950-01-12"),
    ]
    dob_mismatch = [
        # One component of DOB matches:
        Person(cfg1, local_id="p6", dob="1950-12-12"),
        Person(cfg1, local_id="p7", dob="2000-01-12"),
        Person(cfg1, local_id="p8", dob="2000-12-01"),
        # No component of DOB matches:
        Person(cfg1, local_id="p9", dob="2000-12-12"),
    ]
    all_people = (
        [proband] + full_dob_match + partial_dob_match + dob_mismatch
    )

    # A setup where we don't shortlist mismatched DOBs:
    self.assertEqual(cfg1.complete_dob_mismatch_allowed, False)
    self.assertEqual(cfg1.partial_dob_mismatch_allowed, True)
    people1 = People(cfg1, people=all_people)
    shortlist1 = list(people1.gen_shortlist(proband))
    self.assertIn(proband, shortlist1)
    for full_p in full_dob_match:
        self.assertIn(full_p, shortlist1)
    for partial_p in partial_dob_match:
        self.assertIn(partial_p, shortlist1)
    for mismatch_p in dob_mismatch:
        self.assertNotIn(mismatch_p, shortlist1)

    # And one where we do:
    cfg2 = mk_test_config(p_en_dob=FuzzyDefaults.P_EN_DOB_TRUE)
    self.assertEqual(cfg2.complete_dob_mismatch_allowed, True)
    self.assertEqual(cfg2.partial_dob_mismatch_allowed, True)
    people2 = People(cfg2, people=all_people)
    shortlist2 = list(people2.gen_shortlist(proband))
    for p in all_people:
        self.assertIn(p, shortlist2)

    # And one where only exact DOB matches are allowed:
    cfg3 = mk_test_config(p_ep_dob=0, p_en_dob=0)
    self.assertEqual(cfg3.complete_dob_mismatch_allowed, False)
    self.assertEqual(cfg3.partial_dob_mismatch_allowed, False)
    people3 = People(cfg3, people=all_people)
    shortlist3 = list(people3.gen_shortlist(proband))
    self.assertIn(proband, shortlist3)
    for full_p in full_dob_match:
        self.assertIn(full_p, shortlist3)
    for partial_p in partial_dob_match:
        self.assertNotIn(partial_p, shortlist3)
    for mismatch_p in dob_mismatch:
        self.assertNotIn(mismatch_p, shortlist3)

1387 

1388 

1389# ------------------------------------------------------------------------- 

1390# Multiple comparison correction checks 

1391# ------------------------------------------------------------------------- 

1392 

1393 

class MultipleComparisonTestBase(unittest.TestCase):
    """
    Shared constants for the multiple-comparison correction tests.
    """

    # Passed as ``p_u`` to gen_best_comparisons() by the ordered tests;
    # presumably the probability of an out-of-order match -- confirm
    # against gen_best_comparisons().
    P_U = 0.1  # arbitrary
    # Complement of P_U.
    P_O = 1 - P_U
    DELTA = 1e-10  # floating-point tolerance

1398 

1399 

class UnorderedMultipleComparisonTests(MultipleComparisonTestBase):
    """
    Tests of gen_best_comparisons() in unordered mode, using dummy letter
    identifiers.
    """

    @staticmethod
    def compare(
        proband_identifiers: List[Identifier],
        candidate_identifiers: List[Identifier],
    ) -> List[Comparison]:
        """
        Run an unordered best-comparison search and return everything it
        generates, as a list.
        """
        generated = gen_best_comparisons(
            proband_identifiers=proband_identifiers,
            candidate_identifiers=candidate_identifiers,
            ordered=False,
        )
        return list(generated)

    def test_same_single_id_returns_one_match_and_no_correction(
        self,
    ) -> None:
        # ---------------------------------------------------------------
        # UNORDERED, one/one identifier
        # ---------------------------------------------------------------
        letter_a = DummyLetterIdentifier("A")

        comparisons = self.compare([letter_a], [letter_a])
        # ... one match, no correction
        self.assertEqual(len(comparisons), 1)

        sole = comparisons[0]
        self.assertIsInstance(sole, DirectComparison)
        self.assertEqual(sole.d_description, "dummy_match:A")

    def test_same_two_ids_returns_two_matches_and_a_correction(
        self,
    ) -> None:
        # ---------------------------------------------------------------
        # Unordered, two/two identifiers
        # ---------------------------------------------------------------
        letter_a = DummyLetterIdentifier("A")
        letter_b = DummyLetterIdentifier("B")
        expected_descriptions = ("dummy_match:A", "dummy_match:B")

        comparisons = self.compare(
            [letter_a, letter_b], [letter_a, letter_b]
        )
        # ... two matches and a correction
        self.assertEqual(len(comparisons), 3)

        for found, description in zip(comparisons, expected_descriptions):
            self.assertIsInstance(found, DirectComparison)
            self.assertEqual(found.d_description, description)

        adjustment = comparisons[-1]
        self.assertIsInstance(adjustment, AdjustLogOddsComparison)
        # Correction should be for 2 hits from 2 comparisons, and a
        # Bonferroni correction:
        self.assertAlmostEqual(
            adjustment.log_likelihood_ratio, -ln(2), delta=self.DELTA
        )

    def test_same_three_ids_returns_three_matches_and_a_correction(
        self,
    ) -> None:
        # ---------------------------------------------------------------
        # Unordered, three/three identifiers
        # ---------------------------------------------------------------
        letter_a = DummyLetterIdentifier("A")
        letter_b = DummyLetterIdentifier("B")
        letter_c = DummyLetterIdentifier("C")
        expected_descriptions = (
            "dummy_match:A",
            "dummy_match:B",
            "dummy_match:C",
        )

        comparisons = self.compare(
            [letter_a, letter_b, letter_c], [letter_a, letter_b, letter_c]
        )
        # ... three matches and a correction
        self.assertEqual(len(comparisons), 4)

        for found, description in zip(comparisons, expected_descriptions):
            self.assertIsInstance(found, DirectComparison)
            self.assertEqual(found.d_description, description)

        adjustment = comparisons[-1]
        self.assertIsInstance(adjustment, AdjustLogOddsComparison)
        # Correction should be for 3 hits from 6 comparisons:
        self.assertAlmostEqual(
            adjustment.log_likelihood_ratio, -ln(6), delta=self.DELTA
        )

    def test_one_out_of_three_ids_returns_three_matches_and_a_correction(
        self,
    ) -> None:
        # ---------------------------------------------------------------
        # Unordered, one/three identifiers
        # ---------------------------------------------------------------
        letter_a = DummyLetterIdentifier("A")
        letter_b = DummyLetterIdentifier("B")
        letter_c = DummyLetterIdentifier("C")

        comparisons = self.compare(
            [letter_a], [letter_a, letter_b, letter_c]
        )
        # ... one match and a correction
        self.assertEqual(len(comparisons), 2)

        match = comparisons[0]
        self.assertIsInstance(match, DirectComparison)
        self.assertEqual(match.d_description, "dummy_match:A")

        adjustment = comparisons[-1]
        self.assertIsInstance(adjustment, AdjustLogOddsComparison)
        # Correction should be for 1 hit from 3 comparisons:
        self.assertAlmostEqual(
            adjustment.log_likelihood_ratio, -ln(3), delta=self.DELTA
        )

    def test_with_incomparable_identifiers(self) -> None:
        """
        Compare two temporal identifiers with the same value but date
        ranges that do not overlap (the kind of pair that isn't allowed to
        be compared). No comparisons should come back. This also exercises
        a branch that the code coverage checks would otherwise miss.

        .. code-block:: bash

            pip install pytest-cov
            pytest --cov --cov-report html
        """
        in_1900 = DummyLetterTemporalIdentifier(
            value="A", start_date="1900-01-01", end_date="1900-12-31"
        )
        in_2000 = DummyLetterTemporalIdentifier(
            value="A", start_date="2000-01-01", end_date="2000-12-31"
        )
        comparisons = self.compare([in_1900], [in_2000])
        # no comparisons
        self.assertEqual(len(comparisons), 0)

1528 

1529 

1530class OrderedMultipleComparisonTests(MultipleComparisonTestBase): 

def compare(
    self,
    proband_identifiers: List[Identifier],
    candidate_identifiers: List[Identifier],
) -> List[Comparison]:
    """
    Run an ordered best-comparison search, passing this class's ``P_U`` as
    ``p_u``, and return everything it generates, as a list.
    """
    generated = gen_best_comparisons(
        proband_identifiers=proband_identifiers,
        candidate_identifiers=candidate_identifiers,
        ordered=True,
        p_u=self.P_U,
    )
    return list(generated)

1544 

def test_same_single_identifier_returns_one_match_and_no_correction(
    self,
) -> None:
    # -------------------------------------------------------------------
    # ORDERED, one/one identifier
    # -------------------------------------------------------------------
    letter_a = DummyLetterIdentifier("A")

    comparisons = self.compare([letter_a], [letter_a])
    # ... one match, no correction
    self.assertEqual(len(comparisons), 1)

    sole = comparisons[0]
    self.assertIsInstance(sole, DirectComparison)
    self.assertEqual(sole.d_description, "dummy_match:A")

1559 

def test_same_two_ids_same_order_returns_two_matches_and_a_correction(
    self,
) -> None:
    # -------------------------------------------------------------------
    # Ordered, two/two identifiers, correct order
    # -------------------------------------------------------------------
    letter_a = DummyLetterIdentifier("A")
    letter_b = DummyLetterIdentifier("B")
    expected_descriptions = ("dummy_match:A", "dummy_match:B")

    comparisons = self.compare([letter_a, letter_b], [letter_a, letter_b])
    # ... two matches and a correction
    self.assertEqual(len(comparisons), 3)

    for found, description in zip(comparisons, expected_descriptions):
        self.assertIsInstance(found, DirectComparison)
        self.assertEqual(found.d_description, description)

    adjustment = comparisons[-1]
    self.assertIsInstance(adjustment, AdjustLogOddsComparison)
    # - P(D|H) correction: +ln(p_o).
    # - P(D|¬H) correction: nothing, i.e. -ln(1) = 0.
    self.assertAlmostEqual(
        adjustment.log_likelihood_ratio, ln(self.P_O), delta=self.DELTA
    )

1586 

def test_same_two_ids_diff_order_returns_two_matches_and_a_correction(
    self,
) -> None:
    # -------------------------------------------------------------------
    # Ordered, two/two identifiers, wrong order
    # -------------------------------------------------------------------
    letter_a = DummyLetterIdentifier("A")
    letter_b = DummyLetterIdentifier("B")
    expected_descriptions = ("dummy_match:A", "dummy_match:B")

    comparisons = self.compare([letter_a, letter_b], [letter_b, letter_a])
    # ... two matches and a correction
    self.assertEqual(len(comparisons), 3)

    for found, description in zip(comparisons, expected_descriptions):
        self.assertIsInstance(found, DirectComparison)
        self.assertEqual(found.d_description, description)

    adjustment = comparisons[-1]
    self.assertIsInstance(adjustment, AdjustLogOddsComparison)
    # - P(D|H) correction: +ln(p_u).
    # - P(D|¬H) correction: Bonferroni for 2 options but minus one for the
    #   ordered option, so nothing.
    self.assertAlmostEqual(
        adjustment.log_likelihood_ratio, ln(self.P_U), delta=self.DELTA
    )

1614 

def test_same_three_ids_same_order_returns_three_matches_and_a_correction(
    self,
) -> None:
    # -------------------------------------------------------------------
    # Ordered, three/three identifiers, correct order
    # -------------------------------------------------------------------
    letter_a = DummyLetterIdentifier("A")
    letter_b = DummyLetterIdentifier("B")
    letter_c = DummyLetterIdentifier("C")
    expected_descriptions = (
        "dummy_match:A",
        "dummy_match:B",
        "dummy_match:C",
    )

    comparisons = self.compare(
        [letter_a, letter_b, letter_c], [letter_a, letter_b, letter_c]
    )
    # ... three matches and a correction
    self.assertEqual(len(comparisons), 4)

    for found, description in zip(comparisons, expected_descriptions):
        self.assertIsInstance(found, DirectComparison)
        self.assertEqual(found.d_description, description)

    adjustment = comparisons[-1]
    self.assertIsInstance(adjustment, AdjustLogOddsComparison)
    # - P(D|H) correction: +ln(p_o).
    # - P(D|¬H) correction: nothing (correct order).
    self.assertAlmostEqual(
        adjustment.log_likelihood_ratio, ln(self.P_O), delta=self.DELTA
    )

1645 

def test_same_three_ids_diff_order_returns_three_matches_and_a_correction(
    self,
) -> None:
    # -------------------------------------------------------------------
    # Ordered, three/three identifiers, wrong order
    # -------------------------------------------------------------------
    letter_a = DummyLetterIdentifier("A")
    letter_b = DummyLetterIdentifier("B")
    letter_c = DummyLetterIdentifier("C")
    # The matches are expected in this order (B, C, A), not proband order.
    expected_descriptions = (
        "dummy_match:B",
        "dummy_match:C",
        "dummy_match:A",
    )

    comparisons = self.compare(
        [letter_a, letter_b, letter_c], [letter_b, letter_c, letter_a]
    )
    # ... three matches and a correction
    self.assertEqual(len(comparisons), 4)

    for found, description in zip(comparisons, expected_descriptions):
        self.assertIsInstance(found, DirectComparison)
        self.assertEqual(found.d_description, description)

    adjustment = comparisons[-1]
    self.assertIsInstance(adjustment, AdjustLogOddsComparison)
    # - P(D|H) correction: +ln(p_u).
    # - P(D|¬H) correction: Bonferroni for 6 options minus the one for the
    #   correct order.
    expected_llr = ln(self.P_U) - ln(5)
    self.assertAlmostEqual(
        adjustment.log_likelihood_ratio,
        expected_llr,
        delta=self.DELTA,
    )

1679 

def test_two_of_three_matching_ids_returns_three_matches_and_a_correction(
    self,
) -> None:
    # -------------------------------------------------------------------
    # Ordered, three/three identifiers, two match, wrong order
    # -------------------------------------------------------------------
    letter_a = DummyLetterIdentifier("A")
    letter_b = DummyLetterIdentifier("B")
    letter_c = DummyLetterIdentifier("C")
    letter_d = DummyLetterIdentifier("D")

    # Comparing proband [a, b, c] to candidate [b, c, d]:
    #
    #   p = proband index
    #   c = candidate index
    #   d = distance
    #   LLR = log likelihood ratio
    #
    #                          p  c  d  LLR
    #   a - b   mismatch   A   0  0  0  -4.5
    #   a - c   mismatch   A   0  1  1  -4.5
    #   a - d   mismatch   A   0  2  4  -4.5
    #   b - b   match      B   1  0  1   3.2
    #   b - c   mismatch   B   1  1  0  -4.5
    #   b - d   mismatch   B   1  2  1  -4.5
    #   c - b   mismatch   C   2  0  4  -4.5
    #   c - c   match      C   2  1  1   3.2
    #   c - d   mismatch   C   2  2  0  -4.5
    #
    # then we sort them by -LLR and distance:
    #
    #                                         returned?
    #   b - b   match      B   1  0  1   3.2  Yes
    #   c - c   match      C   2  1  1   3.2  Yes
    #   a - b   mismatch   A   0  0  0  -4.5  No (c=0 used)
    #   b - c   mismatch   B   1  1  0  -4.5  No (p=1 or c=1 used)
    #   c - d   mismatch   C   2  2  0  -4.5  No (p=2 used)
    #   a - c   mismatch   A   0  1  1  -4.5  No (c=1 used)
    #   b - d   mismatch   B   1  2  1  -4.5  No (p=1 used)
    #   a - d   mismatch   A   0  2  4  -4.5  Yes
    #   c - b   mismatch   C   2  0  4  -4.5  No (p=2 or c=0 used)

    expected_descriptions = (
        "dummy_match:B",
        "dummy_match:C",
        "dummy_mismatch:A",
    )

    comparisons = self.compare(
        [letter_a, letter_b, letter_c], [letter_b, letter_c, letter_d]
    )
    # ... three matches (but one will be bad) and a correction
    self.assertEqual(len(comparisons), 4)

    for found, description in zip(comparisons, expected_descriptions):
        self.assertIsInstance(found, DirectComparison)
        self.assertEqual(found.d_description, description)

    adjustment = comparisons[-1]
    self.assertIsInstance(adjustment, AdjustLogOddsComparison)
    # - P(D|H) correction: +ln(p_u).
    # - P(D|¬H) correction: Bonferroni for 6 options minus the one for the
    #   correct order.
    expected_llr = ln(self.P_U) - ln(5)
    self.assertAlmostEqual(
        adjustment.log_likelihood_ratio,
        expected_llr,
        delta=self.DELTA,
    )

1749 

def test_order_correct_with_duplicate_names_1(self) -> None:
    """
    Compare "A A" to "A A" in ordered fashion.

    Think of this as proband A_P1, A_P2 and candidate A_C1, A_C2.

    Should give a "correctly ordered" match, A_P1:A_C1 and A_P2:A_C2, with
    correction for P_O.

    Should not treat it as an incorrectly ordered match, A_P1:A_C2 and
    A_P2:A_C1, and apply a different correction for P_U etc.

    This might work without the "distance" sort in ComparisonInfo (it does,
    in fact), but that is a safety. See
    test_order_correct_with_duplicate_names_2 for a test that does depend
    on that distance metric.
    """
    letter_a = DummyLetterIdentifier("A")

    comparisons = self.compare([letter_a, letter_a], [letter_a, letter_a])
    self.assertEqual(len(comparisons), 3)

    # Two matches on "A" ...
    for found in comparisons[:2]:
        self.assertIsInstance(found, DirectComparison)
        self.assertEqual(found.d_description, "dummy_match:A")

    # ... followed by the correction for a correctly ordered match.
    adjustment = comparisons[2]
    self.assertIsInstance(adjustment, AdjustLogOddsComparison)
    self.assertAlmostEqual(
        adjustment.log_likelihood_ratio,
        ln(self.P_O),
        delta=self.DELTA,
    )

1783 

def test_order_correct_with_duplicate_names_2(self) -> None:
    """
    Compare "A B" to "B B" in ordered fashion.

    Think of this as proband A_P1, B_P2 and candidate B_C1, B_C2.

    We want this to give A_P1:B_C1 (mismatch) and B_P2:B_C2 (ordered
    match).

    It should not give A_P1:B_C2 (mismatch) and B_P2:B_C1 (unordered
    match).

    This does not work without the "distance" part of the sort in
    ComparisonInfo.
    """
    a = DummyLetterIdentifier("A")
    b = DummyLetterIdentifier("B")

    result = self.compare([a, b], [b, b])
    self.assertEqual(len(result), 3)

    # Matches come first (better LLR), then mismatches:
    expected_descriptions = ("dummy_match:B", "dummy_mismatch:A")
    for found, description in zip(result, expected_descriptions):
        self.assertIsInstance(found, DirectComparison)
        self.assertEqual(found.d_description, description)

    # Then corrections:
    correction = result[2]
    self.assertIsInstance(correction, AdjustLogOddsComparison)
    self.assertAlmostEqual(
        correction.log_likelihood_ratio,
        ln(self.P_O),
        delta=self.DELTA,
    )