Coverage for linkage/identifiers.py: 86%

843 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1r""" 

2crate_anon/linkage/identifiers.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Helper functions for linkage tools.** 

27 

28Represents various types of person identifier (e.g. name, postcode) that may be 

29compared between two people. 

30 

31""" 

32 

33# ============================================================================= 

34# Imports 

35# ============================================================================= 

36 

37from abc import ABC, abstractmethod 

38import logging 

39from typing import ( 

40 Any, 

41 Dict, 

42 Generator, 

43 List, 

44 Optional, 

45 Set, 

46 Tuple, 

47 Type, 

48 Union, 

49) 

50 

51from cardinal_pythonlib.datetimefunc import coerce_to_pendulum_date 

52from cardinal_pythonlib.maths_py import round_sf 

53from cardinal_pythonlib.reprfunc import auto_repr 

54import pendulum 

55from pendulum.parsing.exceptions import ParserError 

56from pendulum import Date 

57 

58from crate_anon.linkage.constants import NONE_TYPE, Switches, VALID_GENDERS 

59from crate_anon.linkage.comparison import ( 

60 AdjustLogOddsComparison, 

61 CertainComparison, 

62 Comparison, 

63 DirectComparison, 

64) 

65from crate_anon.linkage.helpers import ( 

66 get_first_two_char, 

67 get_metaphone, 

68 get_postcode_sector, 

69 getdictprob, 

70 getdictval, 

71 is_valid_isoformat_date, 

72 isoformat_date_or_none, 

73 ln, 

74 mk_blurry_dates, 

75 POSTCODE_REGEX, 

76 standardize_name, 

77 standardize_perfect_id_key, 

78 standardize_perfect_id_value, 

79 standardize_postcode, 

80 surname_alternative_fragments, 

81 validate_uncertain_prob, 

82) 

83from crate_anon.linkage.matchconfig import MatchConfig 

84 

85log = logging.getLogger(__name__) 

86 

87 

88# ============================================================================= 

89# Identifier 

90# ============================================================================= 

91 

92 

class Identifier(ABC):
    """
    Abstract base class: generic nugget of information about a person, in
    identifiable (plaintext) or de-identified (hashed) form. Optionally, may
    convey start/end dates.

    Note:

    - We trust that probabilities from the config have been validated (i.e. are
      in the range 0-1), but we should check values arising from incoming data,
      primarily via :meth:`from_dict`. The
      :func:`crate_anon.linkage.helpers.getdictprob` does this, but more checks
      may be required.

    - A typical comparison operation involves comparing a lot of people to
      each other, so it is usually efficient to cache "derived" information
      (e.g. we should calculate metaphones, etc., from names at creation, not
      at comparison). See :meth:`comparison`.
    """

    SEP = "/"  # separator
    NULL_VALUES_LOWERCASE = ["none", "null", "?"]  # must include "none"
    TEMPORAL_ID_FORMAT_HELP = (
        f"Temporal identifier format: either just IDENTIFIER, or "
        f"IDENTIFIER{SEP}STARTDATE{SEP}ENDDATE, where dates are in YYYY-MM-DD "
        f"format or one of {NULL_VALUES_LOWERCASE} (case-insensitive)."
    )

    KEY_START_DATE = "start_date"
    KEY_END_DATE = "end_date"

    ERR_MISSING_FREQ = "Missing frequency information"

    # -------------------------------------------------------------------------
    # Creation, and representations that support creation
    # -------------------------------------------------------------------------

    def __init__(
        self,
        cfg: Optional[MatchConfig],
        is_plaintext: bool,
        temporal: bool = False,
        start_date: Union[str, Date] = None,
        end_date: Union[str, Date] = None,
    ) -> None:
        """
        Args:
            cfg:
                A configuration object. Can be ``None`` but you have to specify
                that manually.
            is_plaintext:
                Is this an identifiable (plaintext) version? If ``False``, then
                it is a de-identified (hashed) version, whose internal
                structure can be more complex.
            temporal:
                Store start/end dates (which can be ``None``) along with the
                information?
            start_date:
                The start date (first valid date), or ``None``.
            end_date:
                The end date (last valid date), or ``None``.

        Raises:
            ValueError:
                via :meth:`_set_dates`, if a date is unusable or the start
                date is after the end date.
        """
        assert isinstance(cfg, (MatchConfig, NONE_TYPE))
        self.cfg = cfg
        self.is_plaintext = is_plaintext
        self.temporal = temporal
        # Provisional value; _set_dates() below recalculates it from the
        # dates actually supplied.
        self.actually_temporal = temporal
        self.start_date = None  # type: Optional[Date]
        self.end_date = None  # type: Optional[Date]
        self._set_dates(start_date, end_date)

    def __str__(self) -> str:
        """
        A string representation used for CSV files.

        Returns an empty string if there is no information; the plaintext
        core (optionally followed by the start/end dates, joined by
        :attr:`SEP`) for plaintext identifiers; and a fixed placeholder for
        hashed identifiers.

        Raises:
            ValueError:
                if dates must be written but the plaintext core itself
                contains the separator (the result could not be parsed back).
        """
        if not self:
            # No information
            return ""
        if self.is_plaintext:
            # Identifiable
            id_str = self.plaintext_str_core()
            if self.actually_temporal:
                if self.SEP in id_str:
                    raise ValueError(
                        f"Temporal identifier unsuitable: "
                        f"contains {self.SEP!r}"
                    )
                return self.SEP.join(
                    [
                        id_str,
                        str(self.start_date),
                        str(self.end_date),
                    ]
                )
            else:
                return id_str
        return f"hashed_{self.__class__.__name__}"

    @abstractmethod
    def __eq__(self, other: "Identifier") -> bool:
        """
        Check equality with another, primarily for debugging.

        Just because it's an @abstractmethod doesn't mean that you can't call
        it (from derived classes). This base implementation compares the
        type and the start/end dates only.
        """
        return self._eq_check(other, ["start_date", "end_date"])

    def _eq_check(self, other: "Identifier", attrs: List[str]) -> bool:
        """
        Helper function to implement equality checks.

        Args:
            other:
                the object to compare with
            attrs:
                names of the attributes that must all compare equal

        Returns:
            ``True`` if ``other`` is of exactly our type and every named
            attribute is equal; otherwise ``False``.
        """
        if type(self) is not type(other):
            return False
        return all(getattr(self, a) == getattr(other, a) for a in attrs)

    @abstractmethod
    def plaintext_str_core(self) -> str:
        """
        Represents the identifier in plaintext, for CSV. Potentially
        encapsulated within more information by __str__().
        """
        pass

    @classmethod
    @abstractmethod
    def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "Identifier":
        """
        Restore a plaintext version from a string (which has been read from
        CSV). Reverses __str__(), not plaintext_str_core().
        """
        pass

    @abstractmethod
    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        Represents the object in a dictionary suitable for JSON serialization,
        for the de-identified (hashed) version.

        Args:
            encrypt:
                Encrypt the contents as writing, creating a hashed version.
            include_frequencies:
                Include frequency information. If you don't, this makes the
                resulting file suitable for use as a sample, but not as a
                proband file.
        """
        pass

    @classmethod
    @abstractmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "Identifier":
        """
        Restore a hashed or plaintext version from a dictionary (which will
        have been read from JSON).
        """
        pass

    # -------------------------------------------------------------------------
    # Internal methods to support creation
    # -------------------------------------------------------------------------

    def _set_dates(
        self,
        start_date: Union[str, Date] = None,
        end_date: Union[str, Date] = None,
    ) -> None:
        """
        Set date information. Should only be called for temporal identifiers.

        Also recalculates ``actually_temporal``.

        Raises:
            ValueError:
                if either value cannot be used as a date, or if the start
                date is later than the end date.
        """
        start_date = coerce_to_pendulum_date(start_date)
        if not isinstance(start_date, (Date, NONE_TYPE)):
            raise ValueError(f"Bad start_date: {start_date!r}")

        end_date = coerce_to_pendulum_date(end_date)
        if not isinstance(end_date, (Date, NONE_TYPE)):
            raise ValueError(f"Bad end_date: {end_date!r}")

        if start_date and end_date:
            if start_date > end_date:
                raise ValueError(
                    f"start_date = {start_date!r} > end_date = {end_date!r}"
                )

        self.start_date = start_date
        self.end_date = end_date
        # Save some time later: this is only a temporal identifier if at least
        # one date is set.
        self.actually_temporal = bool(self.start_date) or bool(self.end_date)

    def _set_dates_from_dict(self, d: Dict[str, Any]) -> None:
        """
        Reads from a (JSON-derived) dictionary and sets our dates.
        Assumes we are a temporal identifier.
        """
        self._set_dates(
            start_date=getdictval(d, self.KEY_START_DATE, str),
            end_date=getdictval(d, self.KEY_END_DATE, str),
        )

    def _write_dates_to_dict(self, d: Dict[str, Any]) -> None:
        """
        For creating JSON dictionaries: write our dates to the dictionary (if
        we are a temporal identifier). Modifies ``d`` in place.
        """
        if self.temporal:
            d[self.KEY_START_DATE] = isoformat_date_or_none(self.start_date)
            d[self.KEY_END_DATE] = isoformat_date_or_none(self.end_date)

    @classmethod
    def _parse_optional_date(cls, date_str: str) -> Optional[Date]:
        """
        Parse one date component of a temporal identifier string.

        Args:
            date_str:
                a date string, or (case-insensitively) one of
                :attr:`NULL_VALUES_LOWERCASE`

        Returns:
            the parsed date, or ``None`` for a null marker

        Raises:
            ValueError:
                if the string is neither a null marker nor parseable
        """
        if date_str.lower() in cls.NULL_VALUES_LOWERCASE:
            return None
        try:
            # noinspection PyTypeChecker
            return pendulum.parse(date_str).date()
        except ParserError as exc:
            # Keep the parser's own error attached as the cause.
            raise ValueError(f"Bad date: {date_str!r}") from exc

    @classmethod
    def _get_temporal_triplet(
        cls, x: str
    ) -> Tuple[str, Optional[Date], Optional[Date]]:
        """
        From a string (e.g. from CSV), split into CONTENTS/START_DATE/END_DATE.
        If it contains no "/", treat it as CONTENTS/None/None.

        Args:
            x:
                String to parse.

        Returns:
            tuple:
                contents, start_date, end_date

        Raises:
            ValueError:
                if the string has neither one nor three components, or if a
                date component cannot be parsed.
        """
        # Extract components of the string
        components = x.split(cls.SEP)

        if len(components) == 1:
            # Separator not present.
            return components[0], None, None

        if len(components) != 3:
            raise ValueError(
                f"Need three components separated by {cls.SEP!r} (or one with "
                f"no {cls.SEP!r}); got {x!r}"
            )

        contents, start_date_str, end_date_str = components
        start_date = cls._parse_optional_date(start_date_str)
        end_date = cls._parse_optional_date(end_date_str)
        return contents, start_date, end_date

    def _round(self, x: Optional[float], encrypt: bool) -> Optional[float]:
        """
        Implements config-defined rounding for frequency representations of
        hashed values.

        Rounds frequencies to a certain number of significant figures. (Don't
        supply exact floating-point numbers for frequencies; may be more
        identifying. Don't use decimal places; we have to deal with some small
        numbers.)

        Args:
            x:
                the frequency, or ``None``
            encrypt:
                are we producing a hashed version? If not, no rounding is
                applied.

        Returns:
            ``None`` if ``x`` is ``None``; ``x`` unchanged if rounding is
            not configured or ``encrypt`` is false; otherwise ``x`` rounded
            to ``cfg.rounding_sf`` significant figures.
        """
        if x is None:
            return None
        sf = self.cfg.rounding_sf
        if sf is None or not encrypt:
            return x
        return round_sf(x, sf)

    # -------------------------------------------------------------------------
    # Python standard representation functions
    # -------------------------------------------------------------------------

    def __repr__(self):
        """
        Standardized Python representation.
        """
        return auto_repr(self, sort_attrs=False)

    # -------------------------------------------------------------------------
    # Basic tests
    # -------------------------------------------------------------------------

    @abstractmethod
    def __bool__(self) -> bool:
        """
        Does this object contain information?
        """
        pass

    # -------------------------------------------------------------------------
    # Validation
    # -------------------------------------------------------------------------

    @abstractmethod
    def ensure_has_freq_info_if_id_present(self) -> None:
        """
        If we have ID information but some frequency information is missing,
        raise :exc:`ValueError`. Used to check validity for probands;
        candidates do not have to fulfil this requirement.
        """
        pass

    # -------------------------------------------------------------------------
    # Comparison
    # -------------------------------------------------------------------------

    def comparison_relevant(self, other: "Identifier") -> bool:
        """
        It's only relevant to compare this identifier to another if both have
        some information, and if they are not specifically excluded by a
        temporal check.
        """
        # "a and b and c" yields one of its operands, not necessarily a bool;
        # coerce so that the return value matches the annotation.
        return bool(self and other and self.overlaps(other))

    @abstractmethod
    def comparison(self, candidate_id: "Identifier") -> Optional[Comparison]:
        """
        Return a comparison odds (embodying the change in log odds) for a
        comparison between the "self" identifier (as the proband) and another,
        the candidate. Frequency information is expected to be on the "self"
        (proband) side.
        """
        pass

    def overlaps(self, other: "Identifier") -> bool:
        """
        Do ``self`` and ``other`` overlap in time?

        Identifiers without any date information are treated as overlapping
        everything; a missing start or end date is treated as unbounded.

        Args:
            other:
                the other :class:`Identifier`

        For similar logic, see
        :meth:`cardinal_pythonlib.interval.Interval.overlaps`.
        """
        if not self.actually_temporal or not other.actually_temporal:
            return True
        return not (
            # This inner test is for non-overlap.
            # (a) self ends before other starts
            (
                self.end_date
                and other.start_date
                and self.end_date < other.start_date
            )
            or
            # (b) other ends before self starts
            (
                other.end_date
                and self.start_date
                and other.end_date < self.start_date
            )
        )

    # -------------------------------------------------------------------------
    # Debugging
    # -------------------------------------------------------------------------

    def hashed(self, include_frequencies: bool = True) -> "Identifier":
        """
        For testing: hash this identifier by itself.

        Serializes via :meth:`as_dict` (encrypting only if we are currently
        plaintext) and rebuilds a hashed object of the same class via
        :meth:`from_dict`.
        """
        encrypt = self.is_plaintext
        d = self.as_dict(
            encrypt=encrypt, include_frequencies=include_frequencies
        )
        cls = type(self)  # type: Type[Identifier]
        return cls.from_dict(self.cfg, d, hashed=True)

477 

478 

479# ============================================================================= 

480# IdentifierTwoState 

481# ============================================================================= 

482 

483 

class IdentifierTwoState(Identifier, ABC):
    """
    Identifier whose comparison has two outcomes: full match or no match.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        # Comparison objects, to be filled in by subclasses.
        self.comparison_full_match = None  # type: Optional[DirectComparison]
        self.comparison_no_match = None  # type: Optional[DirectComparison]

    def _clear_comparisons(self) -> None:
        """
        Set our comparison objects back to ``None``.
        """
        self.comparison_full_match = None  # type: Optional[DirectComparison]
        self.comparison_no_match = None  # type: Optional[DirectComparison]

    @abstractmethod
    def fully_matches(self, other: "IdentifierTwoState") -> bool:
        """
        Is this identifier a full match for the other?

        Implementations may assume that self.comparison_relevant(other) is
        True.
        """
        pass

    def comparison(
        self, candidate_id: "IdentifierTwoState"
    ) -> Optional[Comparison]:
        """
        Compare our identifier to another of the same type. Returns ``None``
        when no conclusion should be drawn (e.g. information is missing, or
        temporally defined identifiers do not overlap).

        Frequency information is assumed to be present on the "self" side
        (the proband); it may be absent on the candidate side.

        This is called at high speed, so anything fixed that needs a
        multi-stage lookup should have been cached beforehand.
        """
        if self.comparison_relevant(candidate_id):
            matched = self.fully_matches(candidate_id)
            return (
                self.comparison_full_match
                if matched
                else self.comparison_no_match
            )
        # Infer no conclusions from absent information.
        return None

    def warn_if_llr_order_unexpected(
        self, full: DirectComparison, partials: List[DirectComparison] = None
    ) -> None:
        """
        Warn the user if the log likelihood ratios of our comparisons are not
        ordered as one might expect.

        Such an ordering is not guaranteed: a partial match can outrank a
        full match (an example is in the validation paper, and in
        other_examples_for_paper.py), and a match of any sort need not give
        better evidence for H than a complete mismatch. It is nonetheless the
        usual expectation, so departures from it are logged as warnings.
        Does nothing unless ``cfg.check_comparison_order`` is set.

        Args:
            full:
                Comparisons for the "full match" condition.
            partials:
                Comparisons for "partial match" conditions.
        """
        if not self.cfg.check_comparison_order:
            return
        partials = partials or []
        class_name = self.__class__.__name__

        # Check 1: no kind of match should score below a mismatch.
        llr_none = self.comparison_no_match.log_likelihood_ratio
        all_matches = [full] + partials
        if any(m.log_likelihood_ratio < llr_none for m in all_matches):
            log.warning(
                f"{class_name}: a match comparison's log "
                f"likelihood ratio is less than the no-match comparison's. "
                f"Object:\n\n{self!r}"
            )

        # Check 2: no partial match should score above the full match.
        llr_full = full.log_likelihood_ratio
        if any(m.log_likelihood_ratio > llr_full for m in partials):
            log.warning(
                f"{class_name}: a partial match comparison's "
                f"log likelihood ratio exceeds the full-match comparison's. "
                f"Object:\n\n{self!r}"
            )

569 

570 

571# ============================================================================= 

572# IdentifierThreeState 

573# ============================================================================= 

574 

575 

class IdentifierThreeState(IdentifierTwoState, ABC):
    """
    Identifier whose comparison has three outcomes: full match, partial
    match, or no match.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        # To be filled in by subclasses.
        self.comparison_partial_match = (
            None
        )  # type: Optional[DirectComparison]

    def _clear_comparisons(self) -> None:
        """
        Set our comparison objects (including the partial-match one) back to
        ``None``.
        """
        super()._clear_comparisons()
        self.comparison_partial_match = (
            None
        )  # type: Optional[DirectComparison]

    @abstractmethod
    def partially_matches(self, other: "IdentifierThreeState") -> bool:
        """
        Is this identifier a partial match for the other?

        Implementations may assume that self.comparison_relevant(other) is
        True.
        """
        pass

    def comparison(
        self, candidate_id: "IdentifierThreeState"
    ) -> Optional[Comparison]:
        """
        See :meth:`IdentifierTwoState.comparison`. A full match is tested
        before a partial match.
        """
        if not self.comparison_relevant(candidate_id):
            # Infer no conclusions from absent information.
            return None
        if self.fully_matches(candidate_id):
            result = self.comparison_full_match
        elif self.partially_matches(candidate_id):
            result = self.comparison_partial_match
        else:
            result = self.comparison_no_match
        return result

620 

621 

622# ============================================================================= 

623# IdentifierFourState 

624# ============================================================================= 

625 

626 

class IdentifierFourState(IdentifierThreeState, ABC):
    """
    Identifier whose comparison has four outcomes: full match, first kind of
    partial match, second kind of partial match, or no match.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        # To be filled in by subclasses.
        self.comparison_partial_match_second = (
            None
        )  # type: Optional[DirectComparison]

    def _clear_comparisons(self) -> None:
        """
        Set all our comparison objects (including the second partial-match
        one) back to ``None``.
        """
        super()._clear_comparisons()
        self.comparison_partial_match_second = (
            None
        )  # type: Optional[DirectComparison]

    @abstractmethod
    def partially_matches_second(self, other: "IdentifierFourState") -> bool:
        """
        Does this identifier partially match the other on the second fuzzy
        identifier?

        Implementations may assume that self.comparison_relevant(other) is
        True.
        """
        pass

    def comparison(
        self, candidate_id: "IdentifierFourState"
    ) -> Optional[Comparison]:
        """
        See :meth:`IdentifierTwoState.comparison`. Tests are applied in the
        order: full match, first partial match, second partial match.
        """
        if not self.comparison_relevant(candidate_id):
            # Infer no conclusions from absent information.
            return None
        if self.fully_matches(candidate_id):
            result = self.comparison_full_match
        elif self.partially_matches(candidate_id):
            result = self.comparison_partial_match
        elif self.partially_matches_second(candidate_id):
            result = self.comparison_partial_match_second
        else:
            result = self.comparison_no_match
        return result

674 

675 

676# ============================================================================= 

677# TemporalIDHolder 

678# ============================================================================= 

679 

680 

class TemporalIDHolder(Identifier):
    """
    Cut-down identifier: no config, just a plain string plus optional
    start/end dates. Used for representing postcodes between a database and
    CSV for validation.
    """

    BAD_METHOD = "Inappropriate function called for TemporalIDHolder"

    def __init__(
        self, identifier: str, start_date: Date = None, end_date: Date = None
    ) -> None:
        """
        Args:
            identifier:
                the string identifier; a false value is stored as ``""``
            start_date:
                the start date, or ``None``
            end_date:
                the end date, or ``None``

        Raises:
            ValueError:
                if ``identifier`` is a true value that is not a string
        """
        super().__init__(
            cfg=None,
            is_plaintext=True,
            temporal=True,
            start_date=start_date,
            end_date=end_date,
        )
        self.identifier = identifier if identifier else ""
        if isinstance(self.identifier, str):
            return
        raise ValueError(f"Bad identifier: {identifier!r}")

    def __eq__(self, other: Identifier) -> bool:
        """
        Equal if the base-class check passes and the identifier strings
        match.
        """
        if not super().__eq__(other):
            return False
        return self._eq_check(other, ["identifier"])

    def plaintext_str_core(self) -> str:
        """
        The stored identifier string itself.
        """
        return self.identifier

    @classmethod
    def from_plaintext_str(
        cls, cfg: MatchConfig, x: str
    ) -> "TemporalIDHolder":
        """
        Build from a CSV string; ``cfg`` is not used.
        """
        ident, start, end = cls._get_temporal_triplet(x)
        return TemporalIDHolder(
            identifier=ident, start_date=start, end_date=end
        )

    # noinspection PyTypeChecker
    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        Not supported for this class; always raises :exc:`AssertionError`.
        """
        raise AssertionError(self.BAD_METHOD)

    # noinspection PyTypeChecker
    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "TemporalIDHolder":
        """
        Not supported for this class; always raises :exc:`AssertionError`.
        """
        raise AssertionError(cls.BAD_METHOD)

    def __bool__(self) -> bool:
        """
        True if the identifier string is non-empty.
        """
        return bool(self.identifier)

    def ensure_has_freq_info_if_id_present(self) -> None:
        """
        No-op for this class.
        """
        pass

    def comparison(self, candidate_id: "Identifier") -> Optional[Comparison]:
        """
        Not supported for this class; always raises :exc:`AssertionError`.
        """
        raise AssertionError(self.BAD_METHOD)

739 

740 

741# ============================================================================= 

742# Postcode 

743# ============================================================================= 

744 

745 

746class Postcode(IdentifierThreeState): 

747 """ 

748 Represents a UK postcode. 

749 

750 Note that we store nationwide frequencies. Final adjustment by k_postcode 

751 is only done at the last moment, allowing k_postcode to vary without having 

752 to change a hashed frequency file. Similarly for the probability of a 

753 postcode being unknown. So stored frequencies may be None. 

754 """ 

755 

756 KEY_POSTCODE_UNIT = "postcode_unit" 

757 KEY_POSTCODE_SECTOR = "postcode_sector" 

758 KEY_UNIT_FREQ = "unit_freq" # national fraction, f_f_postcode 

759 KEY_SECTOR_FREQ = "sector_freq" # national fraction, f_p_postcode 

760 

761 def __init__( 

762 self, 

763 cfg: MatchConfig, 

764 postcode: str = "", 

765 start_date: Union[str, Date] = None, 

766 end_date: Union[str, Date] = None, 

767 ): 

768 """ 

769 Plaintext creation of a postcode. 

770 """ 

771 super().__init__( 

772 cfg=cfg, 

773 is_plaintext=True, 

774 temporal=True, 

775 start_date=start_date, 

776 end_date=end_date, 

777 ) 

778 

779 if not isinstance(postcode, str): 

780 raise ValueError(f"Bad postcode: {postcode!r}") 

781 postcode = standardize_postcode(postcode) 

782 if postcode and not POSTCODE_REGEX.match(postcode): 

783 raise ValueError(f"Bad postcode: {postcode!r}") 

784 

785 if postcode: 

786 self.postcode_unit = postcode 

787 self.postcode_sector = get_postcode_sector( 

788 self.postcode_unit, prestandardized=True 

789 ) 

790 self.unit_freq, self.sector_freq = cfg.postcode_unit_sector_freq( 

791 self.postcode_unit, prestandardized=True 

792 ) 

793 # ... national fractions, f_f_postcode and f_p_postcode 

794 else: 

795 self.postcode_unit = "" 

796 self.postcode_sector = "" 

797 self.unit_freq = None # type: Optional[float] 

798 self.sector_freq = None # type: Optional[float] 

799 

800 # Precalculate comparisons, for speed, but in a way that we can update 

801 # them if we are being created via from_hashed_dict(). 

802 self._set_comparisons() 

803 

804 def _set_comparisons(self) -> None: 

805 if self.postcode_unit: 

806 cfg = self.cfg 

807 

808 # ----------------------------------------------------------------- 

809 # Population probabilities. 

810 # ----------------------------------------------------------------- 

811 # Here we apply any comparison-time adjustments, e.g. for the 

812 # probability of an unknown postcode or pseudopostcode, or the 

813 # potential that our comparison population is a geographic subset 

814 # of the UK. 

815 

816 # Unit probability, p_f 

817 f_f = self.unit_freq # national fraction (full), or None 

818 unit_unknown = f_f is None 

819 if unit_unknown: 

820 # Unknown postcode unit. This has been specified directly. 

821 p_f = cfg.p_unknown_or_pseudo_postcode_unit 

822 else: 

823 # Known postcode 

824 p_f = cfg.k_postcode * f_f * cfg.p_known_postcode 

825 

826 # Total sector probability, p_p 

827 f_p = self.sector_freq # national fraction (partial), or None 

828 sector_unknown = f_p is None 

829 if sector_unknown: 

830 # Unknown sector. This has been specified directly. 

831 p_p = cfg.p_unknown_or_pseudo_postcode_sector 

832 # A sanity check: 

833 assert unit_unknown, ( 

834 "Should be impossible that the postcode unit is known but " 

835 "the sector is not." 

836 ) 

837 else: 

838 # Known sector 

839 p_p = cfg.k_postcode * f_p * cfg.p_known_postcode 

840 # It is possible, though, that the postcode is unknown but the 

841 # sector is known (e.g. a typo in the postcode). 

842 if unit_unknown and p_p < p_f: 

843 log.warning( 

844 f"Unknown postcode unit in known sector and " 

845 f"user-specified unknown unit probability " 

846 f"p_f = {Switches.P_UNKNOWN_OR_PSEUDO_POSTCODE} " 

847 f"exceeds the calculated probability of the known " 

848 f"sector, p_p = k_postcode[{cfg.k_postcode}]" 

849 f" * f_p[{f_p}]" 

850 f" * p_known_postcode[{cfg.p_known_postcode}]" 

851 f" = {p_p}. Adjusting the sector probability up to " 

852 f"the unknown sector probability, " 

853 f"p_p = {cfg.p_unknown_or_pseudo_postcode_sector}, " 

854 f"but this may be a configuration error." 

855 ) 

856 p_p = cfg.p_unknown_or_pseudo_postcode_sector 

857 

858 validate_uncertain_prob( 

859 p_f, 

860 "Postcode p_f = k_postcode * f_f * p_known_postcode", 

861 ) 

862 validate_uncertain_prob( 

863 p_p, "Postcode p_p = k_postcode * f_p * p_known_postcode" 

864 ) 

865 # ... it's not reasonable that a postcode unit or sector is 

866 # impossible or certain. 

867 

868 # Sector-not-unit probability, p_pnf 

869 p_pnf = p_p - p_f 

870 validate_uncertain_prob( 

871 p_pnf, "Postcode p_pnf = p_p[sector] - p_f[unit]" 

872 ) 

873 # ... It is not completely unreasonable for this to be 0, e.g. for 

874 # pseudopostcodes that occupy all of their sector. But it's 

875 # dangerous, because if a partial-not-full match then does occur, 

876 # that will give P(D | ¬H) = 0 and log LR = +∞. We now enforce 

877 # k_pseudopostcode > 1 and thus p_pnf > 0. 

878 

879 # ----------------------------------------------------------------- 

880 # Error probabilities 

881 # ----------------------------------------------------------------- 

882 p_ep = cfg.p_ep_postcode 

883 p_en = cfg.p_en_postcode 

884 

885 # ----------------------------------------------------------------- 

886 # Comparisons 

887 # ----------------------------------------------------------------- 

888 self.comparison_full_match = DirectComparison( 

889 p_d_given_same_person=1 - p_ep, # p_c 

890 p_d_given_diff_person=p_f, 

891 d_description="postcode_full_match", 

892 ) 

893 self.comparison_partial_match = DirectComparison( 

894 p_d_given_same_person=p_ep, 

895 p_d_given_diff_person=p_pnf, 

896 d_description="postcode_partial_not_full_match", 

897 ) 

898 self.comparison_no_match = DirectComparison( 

899 p_d_given_same_person=p_en, 

900 p_d_given_diff_person=1 - p_p, # p_n 

901 d_description="postcode_no_match", 

902 ) 

903 self.warn_if_llr_order_unexpected( 

904 full=self.comparison_full_match, 

905 partials=[self.comparison_partial_match], 

906 ) 

907 else: 

908 self._clear_comparisons() 

909 

910 def __eq__(self, other: Identifier) -> bool: 

911 return super().__eq__(other) and self._eq_check(other, ["postcode"]) 

912 

913 def plaintext_str_core(self) -> str: 

914 """ 

915 For CSV. 

916 """ 

917 return self.postcode_unit 

918 

919 @classmethod 

920 def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "Postcode": 

921 """ 

922 Creation from CSV. 

923 """ 

924 postcode_unit, start_date, end_date = cls._get_temporal_triplet(x) 

925 return Postcode( 

926 cfg=cfg, 

927 postcode=postcode_unit, 

928 start_date=start_date, 

929 end_date=end_date, 

930 ) 

931 

def as_dict(
    self, encrypt: bool = True, include_frequencies: bool = True
) -> Dict[str, Any]:
    """
    Build the dictionary (JSON) representation of this postcode.

    Args:
        encrypt:
            If true and this object is plaintext, the unit and sector are
            passed through the config's hash function.
        include_frequencies:
            If true, the unit and sector frequencies are written as well.

    Returns:
        A dictionary holding the unit and sector (both ``None`` if there is
        no postcode unit), whatever :meth:`_write_dates_to_dict` adds, and
        optionally the frequencies as processed by :meth:`_round`.
    """
    unit = None
    sector = None
    if self.postcode_unit:
        if self.is_plaintext and encrypt:
            hash_fn = self.cfg.hash_fn
            unit = hash_fn(self.postcode_unit)
            sector = hash_fn(self.postcode_sector)
        else:
            # Was already hashed, or keeping plaintext
            unit = self.postcode_unit
            sector = self.postcode_sector

    result = {
        self.KEY_POSTCODE_UNIT: unit,
        self.KEY_POSTCODE_SECTOR: sector,
    }
    self._write_dates_to_dict(result)

    if include_frequencies:
        freq_items = (
            (self.KEY_UNIT_FREQ, self.unit_freq),
            (self.KEY_SECTOR_FREQ, self.sector_freq),
        )
        for key, freq in freq_items:
            result[key] = self._round(freq, encrypt)
    return result

957 

@classmethod
def from_dict(
    cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
) -> "Postcode":
    """
    Create a Postcode from a dictionary (ultimately from JSON).

    Args:
        cfg:
            The config object.
        d:
            Dictionary to read the dates, unit, sector and frequencies
            from.
        hashed:
            Whether the dictionary contents are hashed; the new object's
            ``is_plaintext`` flag is set to the opposite.
    """
    start = getdictval(d, cls.KEY_START_DATE, str)
    end = getdictval(d, cls.KEY_END_DATE, str)
    pc = Postcode(cfg=cfg, start_date=start, end_date=end)

    pc.is_plaintext = not hashed
    pc.postcode_unit = getdictval(d, cls.KEY_POSTCODE_UNIT, str)
    pc.postcode_sector = getdictval(d, cls.KEY_POSTCODE_SECTOR, str)
    # The frequencies are allowed to be None:
    pc.unit_freq = getdictprob(d, cls.KEY_UNIT_FREQ)
    pc.sector_freq = getdictprob(d, cls.KEY_SECTOR_FREQ)

    pc._set_comparisons()
    return pc

977 

def __bool__(self) -> bool:
    """
    True if (and only if) a postcode unit is present.
    """
    return True if self.postcode_unit else False

980 

def ensure_has_freq_info_if_id_present(self) -> None:
    """
    Deliberately does nothing.

    It's fine for frequency information to be missing; that means the
    postcode is unknown or a pseudopostcode. We cope in
    _set_comparisons().
    """
    return None

986 

def fully_matches(self, other: "Postcode") -> bool:
    """
    Full match: both postcodes have the same postcode unit.
    """
    mine = self.postcode_unit
    theirs = other.postcode_unit
    return mine == theirs

989 

def partially_matches(self, other: "Postcode") -> bool:
    """
    Partial match: both postcodes have the same postcode sector.
    """
    mine = self.postcode_sector
    theirs = other.postcode_sector
    return mine == theirs

992 

993 

994# ============================================================================= 

995# DateOfBirth 

996# ============================================================================= 

997 

998 

class DateOfBirth(IdentifierThreeState):
    """
    A date of birth (DOB), with full-match, partial-match and no-match
    comparisons.

    Nothing frequency-related is stored with the hashed form: every
    probability used here is read from the config, because none of them is
    specific to a particular DOB.
    """

    KEY_DOB = "dob"
    KEY_DOB_MD = "dob_md"
    KEY_DOB_YD = "dob_yd"
    KEY_DOB_YM = "dob_ym"

    def __init__(self, cfg: MatchConfig, dob: str = "") -> None:
        """
        Plaintext creation of a DOB.

        Args:
            cfg:
                The config object.
            dob:
                (PLAINTEXT.) The date of birth in ISO-8601 "YYYY-MM-DD"
                string format. A false value is treated as "no DOB".

        Raises:
            ValueError:
                if ``dob`` is not a string, or is a non-empty string that
                fails ``is_valid_isoformat_date``.
        """
        super().__init__(cfg=cfg, is_plaintext=True, temporal=False)

        dob = dob or ""
        if not isinstance(dob, str):
            raise ValueError(f"Bad date: {dob!r}")
        if dob and not is_valid_isoformat_date(dob):
            raise ValueError(f"Bad date: {dob!r}")

        self.dob_str = dob
        # In our validation data, 93.3% of DOB errors were "single component"
        # errors, e.g. year wrong but month/day right. Within that, there was
        # no very dominant pattern. Hence three "blurry" versions, each used
        # for partial matching.
        blurry = mk_blurry_dates(dob) if dob else ("", "", "")
        self.dob_md, self.dob_yd, self.dob_ym = blurry

        # The comparison objects are built once, here, for speed. No separate
        # function is needed, because every probability comes from the
        # config, not from our data.
        self.comparison_full_match = DirectComparison(
            p_d_given_same_person=cfg.p_c_dob,
            p_d_given_diff_person=cfg.p_f_dob,
            d_description="dob_full_match",
        )
        self.comparison_partial_match = DirectComparison(
            p_d_given_same_person=cfg.p_ep_dob,
            p_d_given_diff_person=cfg.p_pnf_dob,
            d_description="dob_partial_not_full_match",
        )
        self.comparison_no_match = DirectComparison(
            p_d_given_same_person=cfg.p_en_dob,
            p_d_given_diff_person=cfg.p_n_dob,
            d_description="dob_no_match",
        )
        self.warn_if_llr_order_unexpected(
            full=self.comparison_full_match,
            partials=[self.comparison_partial_match],
        )

    def __eq__(self, other: Identifier) -> bool:
        """
        Equal if the base-class check passes and :meth:`_eq_check` accepts
        the ``dob_str`` attribute.
        """
        base_result = super().__eq__(other)
        if not base_result:
            # Mirror "a and b": a falsy base result is returned unchanged.
            return base_result
        return self._eq_check(other, ["dob_str"])

    def plaintext_str_core(self) -> str:
        """
        Core of the plaintext (CSV) representation: the DOB string.
        """
        return self.dob_str

    @classmethod
    def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "DateOfBirth":
        """
        Build a DOB from its plaintext (CSV) string form.
        """
        return DateOfBirth(cfg=cfg, dob=x)

    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        Build the dictionary (JSON) representation.

        Args:
            encrypt:
                If true and this object is plaintext, the DOB and its blurry
                versions are passed through the config's hash function.
            include_frequencies:
                Not used here; no frequencies are written for a DOB.

        Returns:
            A dictionary with the DOB and its three blurry versions (all
            empty strings if there is no DOB).
        """
        if not self.dob_str:
            values = ("", "", "", "")
        else:
            values = (self.dob_str, self.dob_md, self.dob_yd, self.dob_ym)
            if self.is_plaintext and encrypt:
                hash_fn = self.cfg.hash_fn
                values = tuple(hash_fn(v) for v in values)
            # ... otherwise: was already hashed, or staying plaintext
        dob, dob_md, dob_yd, dob_ym = values
        return {
            self.KEY_DOB: dob,
            self.KEY_DOB_MD: dob_md,
            self.KEY_DOB_YD: dob_yd,
            self.KEY_DOB_YM: dob_ym,
        }

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "DateOfBirth":
        """
        Create a DOB from a dictionary (ultimately from JSON).

        Args:
            cfg:
                The config object.
            d:
                Dictionary to read the DOB and its blurry versions from.
            hashed:
                Whether the dictionary contents are hashed; the new object's
                ``is_plaintext`` flag is set to the opposite.
        """
        new_dob = DateOfBirth(cfg=cfg)
        new_dob.is_plaintext = not hashed
        new_dob.dob_str = getdictval(d, cls.KEY_DOB, str)
        new_dob.dob_md = getdictval(d, cls.KEY_DOB_MD, str)
        new_dob.dob_yd = getdictval(d, cls.KEY_DOB_YD, str)
        new_dob.dob_ym = getdictval(d, cls.KEY_DOB_YM, str)
        return new_dob

    def __bool__(self) -> bool:
        """
        True if (and only if) a DOB string is present.
        """
        return True if self.dob_str else False

    def ensure_has_freq_info_if_id_present(self) -> None:
        """
        Nothing to check: that information is always in the config, and none
        is stored here.
        """
        return None

    def fully_matches(self, other: "DateOfBirth") -> bool:
        """
        Full match: identical DOB strings.
        """
        return self.dob_str == other.dob_str

    def partially_matches(self, other: "DateOfBirth") -> bool:
        """
        Partial match: at least one of the three blurry versions is
        identical.
        """
        if self.dob_md == other.dob_md:
            return True
        if self.dob_yd == other.dob_yd:
            return True
        return self.dob_ym == other.dob_ym

1141 

1142 

1143# ============================================================================= 

1144# Gender 

1145# ============================================================================= 

1146 

1147 

class Gender(IdentifierTwoState):
    """
    A gender, with match and no-match comparisons.
    """

    KEY_GENDER = "gender"
    KEY_GENDER_FREQ = "gender_freq"

    def __init__(self, cfg: MatchConfig, gender: str = "") -> None:
        """
        Plaintext creation of a gender.

        Args:
            cfg:
                The config object.
            gender:
                (PLAINTEXT.) The gender. A false value is treated as the
                empty string.

        Raises:
            ValueError:
                if the gender is not in ``VALID_GENDERS``.
        """
        super().__init__(cfg=cfg, is_plaintext=True, temporal=False)

        gender = gender or ""
        if gender not in VALID_GENDERS:
            raise ValueError(f"Bad gender: {gender!r}")

        self.gender_str = gender
        # Frequency is looked up from the config only if a gender is given.
        self.gender_freq = (
            cfg.gender_freq(gender) if gender else None
        )  # type: Optional[float]

        self._set_comparisons()

    def _set_comparisons(self) -> None:
        """
        Build the match/no-match comparisons from our gender frequency and
        the config's gender error probability; if we have no (non-zero)
        frequency, clear the comparisons instead.
        """
        if not self.gender_freq:
            self._clear_comparisons()
            return

        p_e = self.cfg.p_e_gender_error
        p_f = self.gender_freq
        self.comparison_full_match = DirectComparison(
            p_d_given_same_person=1 - p_e,
            p_d_given_diff_person=p_f,
            d_description="gender_match",
        )
        self.comparison_no_match = DirectComparison(
            p_d_given_same_person=p_e,
            p_d_given_diff_person=1 - p_f,
            d_description="gender_no_match",
        )
        self.warn_if_llr_order_unexpected(full=self.comparison_full_match)

    def __eq__(self, other: Identifier) -> bool:
        """
        Equal if the base-class check passes and :meth:`_eq_check` accepts
        the ``gender_str`` attribute.
        """
        base_result = super().__eq__(other)
        if not base_result:
            # Mirror "a and b": a falsy base result is returned unchanged.
            return base_result
        return self._eq_check(other, ["gender_str"])

    def plaintext_str_core(self) -> str:
        """
        Core of the plaintext (CSV) representation: the gender string.
        """
        return self.gender_str

    @classmethod
    def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "Gender":
        """
        Build a gender from its plaintext (CSV) string form.
        """
        return Gender(cfg=cfg, gender=x)

    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        Build the dictionary (JSON) representation.

        Args:
            encrypt:
                If true and this object is plaintext, the gender is passed
                through the config's hash function.
            include_frequencies:
                If true, the gender frequency (as processed by
                :meth:`_round`) is written too.
        """
        gender = ""
        if self.gender_str:
            if self.is_plaintext and encrypt:
                gender = self.cfg.hash_fn(self.gender_str)
            else:
                # Was already hashed, or staying plaintext
                gender = self.gender_str

        result = {self.KEY_GENDER: gender}
        if include_frequencies:
            result[self.KEY_GENDER_FREQ] = self._round(
                self.gender_freq, encrypt
            )
        return result

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "Gender":
        """
        Create a gender from a dictionary (ultimately from JSON).

        Args:
            cfg:
                The config object.
            d:
                Dictionary to read the gender and its frequency from.
            hashed:
                Whether the dictionary contents are hashed; the new object's
                ``is_plaintext`` flag is set to the opposite.
        """
        new_gender = Gender(cfg=cfg)
        new_gender.is_plaintext = not hashed
        new_gender.gender_str = getdictval(d, cls.KEY_GENDER, str)
        new_gender.gender_freq = getdictprob(d, cls.KEY_GENDER_FREQ)
        new_gender._set_comparisons()
        return new_gender

    def __bool__(self) -> bool:
        """
        True if (and only if) a gender string is present.
        """
        return True if self.gender_str else False

    def ensure_has_freq_info_if_id_present(self) -> None:
        """
        Raise ``ValueError`` if a gender is present but its frequency is
        ``None``.
        """
        if not self.gender_str:
            return
        if self.gender_freq is None:
            raise ValueError(
                self.ERR_MISSING_FREQ + f" for gender {self.gender_str!r}"
            )

    def fully_matches(self, other: "Gender") -> bool:
        """
        Match: identical gender strings.
        """
        return self.gender_str == other.gender_str

1263 

1264 

1265# ============================================================================= 

1266# BasicName 

1267# ============================================================================= 

1268 

1269 

class BasicName(IdentifierFourState, ABC):
    """
    Base class for names.

    Names are a hard problem in general; see
    https://www.kalzumeus.com/2010/06/17/falsehoods-programmers-believe-about-names/

    Comparisons are preferred in this order: (1) full match, (2) metaphone
    match, (3) first two character (F2C) match, (4) no match. The reasons
    are discussed in the validation paper. The fuzzy representations are
    not subsets/supersets of each other, but overlap, so the frequencies
    have to be explicit about this, e.g. P(F2C match but not metaphone or
    name match).

    Gender matters for both forenames and surnames:

    - UK forename frequency depends on gender.
    - The probability that someone's surname changes depends on gender.

    Gender is not accessible once hashed, so error frequencies are stored
    as well as population frequencies.

    Names can change, so optional start/end dates are supported. If none
    are supplied, this is simply a non-temporal identifier.
    """

    KEY_NAME = "name"
    KEY_METAPHONE = "metaphone"
    KEY_FIRST_TWO_CHAR = "f2c"

    # Terse in the JSON, to save some space:
    KEY_P_F = "p_f"  # name frequency
    KEY_P_P1NF = "p_p1nf"  # metaphone, not name
    KEY_P_P2NP1 = "p_p2np1"  # F2C, not name or metaphone

    KEY_P_C = "p_c"
    KEY_P_EP1 = "p_ep1"
    KEY_P_EP2NP1 = "p_ep2np1"

    def __init__(
        self,
        cfg: MatchConfig,
        name: str = "",
        gender: str = "",
        temporal: bool = False,
        start_date: Union[str, Date] = None,
        end_date: Union[str, Date] = None,
        description: str = "name",
    ) -> None:
        """
        Plaintext creation of a name.

        Args:
            cfg:
                The config object.
            name:
                (PLAINTEXT.) The name.
            gender:
                The gender, passed to :meth:`set_gender`.
            temporal:
                Passed to the base class, as are the dates.
            start_date:
                Optional start date.
            end_date:
                Optional end date.
            description:
                Used internally for verbose comparisons.

        Raises:
            ValueError:
                if ``name`` is not a string, or (via :meth:`set_gender`) if
                the gender is not in ``VALID_GENDERS``.
        """
        if not isinstance(name, str):
            raise ValueError(f"Bad name: {name!r}")

        super().__init__(
            cfg=cfg,
            is_plaintext=True,
            temporal=temporal,
            start_date=start_date,
            end_date=end_date,
        )
        self.description = description

        # Standardize first: needed for frequency lookup and the metaphone.
        standardized = standardize_name(name)
        self.name = standardized
        self.metaphone = get_metaphone(standardized)
        self.f2c = get_first_two_char(standardized)

        # Population frequencies -- to be overridden
        self.p_f = None  # type: Optional[float]
        self.p_p1nf = None  # type: Optional[float]
        self.p_p2np1 = None  # type: Optional[float]

        # Error probabilities -- to be overridden
        self.p_c = None  # type: Optional[float]
        self.p_ep1 = None  # type: Optional[float]
        self.p_ep2np1 = None  # type: Optional[float]

        self.gender = ""  # placeholder; set properly on the next line
        self.set_gender(gender)  # will reset frequencies and comparisons

    def set_gender(self, gender: str) -> None:
        """
        Special operation for identifiable reading: set the gender, then
        recalculate frequencies (and thereby comparisons).

        Raises:
            ValueError:
                if the gender is not in ``VALID_GENDERS``.
        """
        if gender not in VALID_GENDERS:
            raise ValueError(f"Bad gender: {gender!r}")
        self.gender = gender
        self._reset_frequencies_identifiable()  # will set comparisons

    @abstractmethod
    def _reset_frequencies_identifiable(self) -> None:
        """
        Called when gender may have changed. Implementations should update
        any probabilities accordingly, and call self._set_comparisons().
        """
        pass

    def _clear_frequencies(self) -> None:
        """
        Set all population frequencies and error probabilities to None.
        """
        # Population frequencies
        self.p_f = None
        self.p_p1nf = None
        self.p_p2np1 = None

        # Error probabilities
        self.p_c = None
        self.p_ep1 = None
        self.p_ep2np1 = None

    @property
    def p_en(self) -> float:
        """
        For internal use: one minus the three stored error probabilities.
        Only call if frequencies are set up.
        """
        remainder = 1 - self.p_c - self.p_ep1 - self.p_ep2np1
        assert 0 <= remainder <= 1, "Bad error probabilities for a BasicName"
        return remainder

    @property
    def p_n(self) -> float:
        """
        For internal use: one minus the three stored population
        frequencies. Only call if frequencies are set up.
        """
        remainder = 1 - self.p_f - self.p_p1nf - self.p_p2np1
        assert (
            0 <= remainder <= 1
        ), "Bad population probabilities for a BasicName"
        return remainder

    def _set_comparisons(self) -> None:
        """
        If we have identifier information, use error information from `self`
        (unusually), and frequency information from `self`, to create our
        comparisons. Otherwise, call :meth:`_clear_comparisons`.
        """
        if not self.name:
            self._clear_comparisons()
            return

        desc = self.description
        self.comparison_full_match = DirectComparison(
            p_d_given_same_person=self.p_c,
            p_d_given_diff_person=self.p_f,
            d_description=f"{desc}_full_match",
        )
        self.comparison_partial_match = DirectComparison(
            p_d_given_same_person=self.p_ep1,
            p_d_given_diff_person=self.p_p1nf,
            d_description=f"{desc}_partial_match_1_metaphone_not_full",
        )
        self.comparison_partial_match_second = DirectComparison(
            p_d_given_same_person=self.p_ep2np1,
            p_d_given_diff_person=self.p_p2np1,
            d_description=f"{desc}_partial_match_2_f2c_not_name_metaphone",
        )
        self.comparison_no_match = DirectComparison(
            p_d_given_same_person=self.p_en,
            p_d_given_diff_person=self.p_n,
            d_description=f"{desc}_no_match",
        )
        self.warn_if_llr_order_unexpected(
            full=self.comparison_full_match,
            partials=[
                self.comparison_partial_match,
                self.comparison_partial_match_second,
            ],
        )

    def __eq__(self, other: Identifier) -> bool:
        """
        Equal if the base-class check passes and :meth:`_eq_check` accepts
        the ``name`` and ``gender`` attributes.
        """
        base_result = super().__eq__(other)
        if not base_result:
            # Mirror "a and b": a falsy base result is returned unchanged.
            return base_result
        return self._eq_check(other, ["name", "gender"])

    def plaintext_str_core(self) -> str:
        """
        Core of the plaintext (CSV) representation: the name.
        """
        return self.name

    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        Build the dictionary (JSON) representation.

        Args:
            encrypt:
                If true and this object is plaintext, the name, metaphone
                and F2C are passed through the config's hash function.
            include_frequencies:
                If true, the population frequencies and error probabilities
                (as processed by :meth:`_round`) are written too.

        Returns:
            A dictionary with the name, metaphone and F2C (all ``None`` if
            there is no name), whatever :meth:`_write_dates_to_dict` adds,
            and optionally the frequencies.
        """
        if not self.name:
            name = metaphone = f2c = None
        elif self.is_plaintext and encrypt:
            hash_fn = self.cfg.hash_fn
            plain = (self.name, self.metaphone, self.f2c)
            name, metaphone, f2c = (hash_fn(s) for s in plain)
        else:
            # Was already hashed, or staying plaintext
            name, metaphone, f2c = self.name, self.metaphone, self.f2c

        result = {
            self.KEY_NAME: name,
            self.KEY_METAPHONE: metaphone,
            self.KEY_FIRST_TWO_CHAR: f2c,
        }
        self._write_dates_to_dict(result)

        if include_frequencies:
            prob_items = (
                (self.KEY_P_F, self.p_f),
                (self.KEY_P_P1NF, self.p_p1nf),
                (self.KEY_P_P2NP1, self.p_p2np1),
                (self.KEY_P_C, self.p_c),
                (self.KEY_P_EP1, self.p_ep1),
                (self.KEY_P_EP2NP1, self.p_ep2np1),
            )
            for key, prob in prob_items:
                result[key] = self._round(prob, encrypt)
        return result

    def _set_from_json_dict_internal(self, d: Dict[str, Any], hashed: bool):
        """
        Internal function used by derived classes. The object itself is
        created by the derived class; this function reads the dictionary
        into it (dates only if temporal) and then sets up the comparisons.
        """
        self.is_plaintext = not hashed

        if self.temporal:
            self._set_dates_from_dict(d)

        # Identifier strings:
        string_attrs = (
            ("name", self.KEY_NAME),
            ("metaphone", self.KEY_METAPHONE),
            ("f2c", self.KEY_FIRST_TWO_CHAR),
        )
        for attr, key in string_attrs:
            setattr(self, attr, getdictval(d, key, str))

        # Population frequencies, then error probabilities:
        prob_attrs = (
            ("p_f", self.KEY_P_F),
            ("p_p1nf", self.KEY_P_P1NF),
            ("p_p2np1", self.KEY_P_P2NP1),
            ("p_c", self.KEY_P_C),
            ("p_ep1", self.KEY_P_EP1),
            ("p_ep2np1", self.KEY_P_EP2NP1),
        )
        for attr, key in prob_attrs:
            setattr(self, attr, getdictprob(d, key))

        self._set_comparisons()

    def __bool__(self) -> bool:
        """
        True if (and only if) a name is present.
        """
        return True if self.name else False

    def ensure_has_freq_info_if_id_present(self) -> None:
        """
        Raise ``ValueError`` if a name is present but any of its three
        population frequencies is ``None``.
        """
        if not self.name:
            return
        population_freqs = (self.p_f, self.p_p1nf, self.p_p2np1)
        if any(p is None for p in population_freqs):
            raise ValueError(
                self.ERR_MISSING_FREQ + f" for name {self.name!r}"
            )

    def fully_matches(self, other: "BasicName") -> bool:
        """
        Full match: identical names.
        """
        return self.name == other.name

    def partially_matches(self, other: "BasicName") -> bool:
        """
        First kind of partial match: identical metaphones.
        """
        return self.metaphone == other.metaphone

    def partially_matches_second(self, other: "BasicName") -> bool:
        """
        Second kind of partial match: identical first two characters.
        """
        return self.f2c == other.f2c

1535 

1536 

1537# ============================================================================= 

1538# SurnameFragment 

1539# ============================================================================= 

1540 

1541 

class SurnameFragment(BasicName):
    """
    Information about one fragment of a surname. Unlikely to be used
    directly for comparisons; it is used by Surname.

    Dates are not stored here; they are stored with the surname.
    """

    BAD_METHOD = "Inappropriate function called for SurnameFragment"

    # -------------------------------------------------------------------------
    # Creation
    # -------------------------------------------------------------------------

    def __init__(
        self,
        cfg: MatchConfig,
        name: str = "",
        gender: str = "",
    ) -> None:
        """
        Plaintext creation of a surname fragment.

        Args:
            cfg:
                The config object.
            name:
                (PLAINTEXT.) The name fragment.
            gender:
                The gender.
        """
        super().__init__(cfg, name=name, gender=gender, description="surname")
        # ... which will call _reset_frequencies_identifiable()

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "SurnameFragment":
        """
        Create a fragment from a dictionary, via
        :meth:`_set_from_json_dict_internal`.
        """
        fragment = SurnameFragment(cfg)
        fragment._set_from_json_dict_internal(d, hashed)
        return fragment

    # -------------------------------------------------------------------------
    # Creation helper functions
    # -------------------------------------------------------------------------

    def _reset_frequencies_identifiable(self) -> None:
        """
        Refresh population frequencies (from the config's surname frequency
        information) and error probabilities (from the config, by gender),
        or clear them all if there is no name; then set the comparisons.
        """
        if not self.name:
            self._clear_frequencies()
        else:
            cfg = self.cfg
            info = cfg.get_surname_freq_info(self.name, prestandardized=True)
            gender = self.gender

            # Population frequencies
            self.p_f = info.p_name
            self.p_p1nf = info.p_metaphone_not_name
            self.p_p2np1 = info.p_f2c_not_name_metaphone

            # Error probabilities
            self.p_c = cfg.p_c_surname[gender]
            self.p_ep1 = cfg.p_ep1_surname[gender]
            self.p_ep2np1 = cfg.p_ep2np1_surname[gender]
        self._set_comparisons()

    # -------------------------------------------------------------------------
    # Unused methods from Identifier
    # -------------------------------------------------------------------------

    def plaintext_str_core(self) -> str:
        """
        Not applicable to a fragment; always raises ``AssertionError``.
        """
        raise AssertionError(self.BAD_METHOD)

    @classmethod
    def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "SurnameFragment":
        """
        Not applicable to a fragment; always raises ``AssertionError``.
        """
        raise AssertionError(cls.BAD_METHOD)

    # -------------------------------------------------------------------------
    # Sort key functions (named functions rather than lambdas)
    # -------------------------------------------------------------------------

    @staticmethod
    def sort_exact_freq(x: "SurnameFragment") -> float:
        """
        Sort key: the fragment's exact-name frequency.
        """
        return x.p_f

    @staticmethod
    def sort_partial_1_freq(x: "SurnameFragment") -> float:
        """
        Sort key: the fragment's "metaphone, not name" frequency.
        """
        return x.p_p1nf

    @staticmethod
    def sort_partial_2_freq(x: "SurnameFragment") -> float:
        """
        Sort key: the fragment's "F2C, not name or metaphone" frequency.
        """
        return x.p_p2np1

1620 

1621 

1622# ============================================================================= 

1623# Surname 

1624# ============================================================================= 

1625 

1626 

class Surname(Identifier):
    """
    Represents a surname (family name).

    Identifiably, we store the unmodified (unstandardized) name.

    We don't inherit from BasicName, but from Identifier, because surnames
    need to deal with "fragment" problems.

    We need to be able to match on parts. For example, "van Beethoven" should
    match "van Beethoven" but also "Beethoven". What frequency should we use
    for those parts? This has to be the frequency of the part (not the
    composite). For example, if someone is called "Mozart-Smith", then a match
    on "Mozart-Smith" or "Mozart" is less likely in the population, and thus
    more informative, than a match on "Smith". So, we need frequency
    information associated with each part.
    """

    KEY_FRAGMENTS = "fragments"

    # -------------------------------------------------------------------------
    # Creation
    # -------------------------------------------------------------------------

    def __init__(
        self,
        cfg: MatchConfig,
        name: str = "",
        gender: str = "",
        start_date: Union[str, Date] = None,
        end_date: Union[str, Date] = None,
    ) -> None:
        """
        Plaintext creation of a surname.

        Args:
            cfg:
                The config object.
            name:
                (PLAINTEXT.) The surname. Leading/trailing whitespace is
                stripped; case and internal spaces are retained.
            gender:
                The gender, passed to :meth:`set_gender`.
            start_date:
                Optional start date, passed to the base class.
            end_date:
                Optional end date, passed to the base class.

        Raises:
            ValueError:
                (via :meth:`set_gender`) if the gender is not in
                ``VALID_GENDERS``.
        """
        super().__init__(
            cfg,
            is_plaintext=True,
            temporal=True,
            start_date=start_date,
            end_date=end_date,
        )
        self.raw_surname = name.strip()  # but retain case, internal spaces
        # ... because "case" is complex for UTF8 characters.

        # There is some duplication here for speed and to cope with the
        # difference between identifiable and hashed versions. We want a set
        # version for rapid overlap checking, and an ordered list to pick by
        # frequency sometimes.
        self.exact_set = set()  # type: Set[str]
        self.partial_set_metaphone = set()  # type: Set[str]
        self.partial_set_f2c = set()  # type: Set[str]
        self.fragments = []  # type: List[SurnameFragment]
        # ... set properly by _reset_identifiable() and from_dict()
        self.gender = ""  # changed in next step
        self.set_gender(gender)  # will reset frequencies/comparisons

    @classmethod
    def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "Surname":
        """
        Creation from CSV.

        The string is split by :meth:`_get_temporal_triplet` into the name
        and start/end dates.
        """
        name, start_date, end_date = cls._get_temporal_triplet(x)
        # Pass the parsed name part, not the unparsed input string.
        return Surname(
            cfg=cfg, name=name, start_date=start_date, end_date=end_date
        )

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "Surname":
        """
        Creation of a surname from a dictionary, ultimately from JSON.

        Args:
            cfg:
                The config object.
            d:
                Dictionary to read the dates and the fragment list from.
            hashed:
                Whether the dictionary contents are hashed; the new object's
                ``is_plaintext`` flag is set to the opposite.
        """
        n = Surname(cfg=cfg)
        n.is_plaintext = not hashed
        n._set_dates_from_dict(d)
        fragments_json_list = getdictval(d, cls.KEY_FRAGMENTS, list)
        n.fragments = [
            SurnameFragment.from_dict(cfg, fragment_dict, hashed)
            for fragment_dict in fragments_json_list
        ]
        n._reset_name_sets()
        return n

    def __eq__(self, other: Identifier) -> bool:
        """
        Equal if the base-class check passes and :meth:`_eq_check` accepts
        the ``gender`` and ``fragments`` attributes.
        """
        return super().__eq__(other) and self._eq_check(
            other, ["gender", "fragments"]
        )

    # -------------------------------------------------------------------------
    # Representation
    # -------------------------------------------------------------------------

    def plaintext_str_core(self) -> str:
        """
        For CSV: the raw surname.
        """
        return self.raw_surname

    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        For JSON: a list of the fragments' dictionaries (each produced with
        the same ``encrypt``/``include_frequencies`` settings), plus whatever
        :meth:`_write_dates_to_dict` adds.
        """
        fragments = [
            f.as_dict(encrypt=encrypt, include_frequencies=include_frequencies)
            for f in self.fragments
        ]
        d = {self.KEY_FRAGMENTS: fragments}
        self._write_dates_to_dict(d)
        return d

    # -------------------------------------------------------------------------
    # Methods to support creation
    # -------------------------------------------------------------------------

    def set_gender(self, gender: str) -> None:
        """
        Special operation for identifiable reading.

        Raises:
            ValueError:
                if the gender is not in ``VALID_GENDERS``.
        """
        if gender not in VALID_GENDERS:
            raise ValueError(f"Bad gender: {gender!r}")
        self.gender = gender
        self._reset_identifiable()  # will set comparisons

    def _reset_identifiable(self) -> None:
        """
        If the name or gender has changed, in an identifiable copy, reset our
        fragment information (with their comparisons), and the name fragment
        sets for fast comparison.
        """
        cfg = self.cfg
        self.fragments = []
        if self.raw_surname:
            # The first of these is the full name.
            self.fragments = [
                SurnameFragment(cfg=cfg, name=exact, gender=self.gender)
                for exact in surname_alternative_fragments(
                    surname=self.raw_surname,
                    accent_transliterations=cfg.accent_transliterations,
                    nonspecific_name_components=cfg.nonspecific_name_components,
                )
            ]
        # The fast-comparison sets are rebuilt from scratch from the
        # fragments, so there is no need to populate them here as well.
        self._reset_name_sets()

    def _reset_name_sets(self) -> None:
        """
        Reset our fast comparison sets from the name fragments.
        """
        self.exact_set = {f.name for f in self.fragments}
        self.partial_set_metaphone = {f.metaphone for f in self.fragments}
        self.partial_set_f2c = {f.f2c for f in self.fragments}

    # -------------------------------------------------------------------------
    # Basic tests
    # -------------------------------------------------------------------------

    def __bool__(self) -> bool:
        """
        True if (and only if) there is at least one fragment.
        """
        return bool(self.fragments)

    def ensure_has_freq_info_if_id_present(self) -> None:
        """
        Apply the same check to every fragment.
        """
        for f in self.fragments:
            f.ensure_has_freq_info_if_id_present()

    # -------------------------------------------------------------------------
    # Comparison
    # -------------------------------------------------------------------------

    def fully_matches(self, other: "Surname") -> bool:
        """
        Primarily for debugging; :meth:`comparison` is used for real work.
        """
        return bool(self.exact_set.intersection(other.exact_set))

    def partially_matches(self, other: "Surname") -> bool:
        """
        Primarily for debugging; :meth:`comparison` is used for real work.
        """
        return bool(
            self.partial_set_metaphone.intersection(
                other.partial_set_metaphone
            )
        )

    def partially_matches_second(self, other: "Surname") -> bool:
        """
        Primarily for debugging; :meth:`comparison` is used for real work.
        """
        return bool(self.partial_set_f2c.intersection(other.partial_set_f2c))

    def comparison(self, candidate_id: "Surname") -> Optional[Comparison]:
        """
        Specialized version for surname.

        Tries, in order, an exact fragment overlap, a metaphone overlap and
        an F2C overlap with the candidate. Where several of our fragments
        qualify, the one with the lowest relevant frequency (the most
        informative) supplies the comparison. With no overlap at all, the
        "no match" comparison of our first fragment is returned.

        Returns:
            ``None`` if :meth:`comparison_relevant` says the comparison is
            not relevant; otherwise one of our fragments' comparisons.
        """
        if not self.comparison_relevant(candidate_id):
            # Infer no conclusions from absent information.
            return None

        overlap_exact = self.exact_set.intersection(candidate_id.exact_set)
        if overlap_exact:
            # Exact match. But possibly >1, e.g. "Mozart-Smith" has matched
            # "Mozart-Smith", "Mozart", and "Smith". Reasonable to pick the
            # most informative (rarest) version. min() returns the first of
            # any tied fragments, as a stable sort followed by [0] would.
            best = min(
                (f for f in self.fragments if f.name in overlap_exact),
                key=SurnameFragment.sort_exact_freq,
            )
            return best.comparison_full_match

        overlap_partial_1 = self.partial_set_metaphone.intersection(
            candidate_id.partial_set_metaphone
        )
        if overlap_partial_1:
            # Similarly, the rarest metaphone match:
            best = min(
                (f for f in self.fragments if f.metaphone in overlap_partial_1),
                key=SurnameFragment.sort_partial_1_freq,
            )
            return best.comparison_partial_match

        overlap_partial_2 = self.partial_set_f2c.intersection(
            candidate_id.partial_set_f2c
        )
        if overlap_partial_2:
            # Similarly, the rarest F2C match:
            best = min(
                (f for f in self.fragments if f.f2c in overlap_partial_2),
                key=SurnameFragment.sort_partial_2_freq,
            )
            return best.comparison_partial_match_second

        # For "no match", we use the whole original name and its frequencies:
        return self.fragments[0].comparison_no_match

1864 

1865 

1866# ============================================================================= 

1867# Forename 

1868# ============================================================================= 

1869 

1870 

class Forename(BasicName):
    """
    Represents a forename (given name).
    """

    def __init__(
        self,
        cfg: MatchConfig,
        name: str = "",
        gender: str = "",
        start_date: Union[str, Date] = None,
        end_date: Union[str, Date] = None,
    ) -> None:
        """
        Plaintext creation of a forename (always created as temporal).

        Args:
            cfg:
                The config object.
            name:
                (PLAINTEXT.) The forename.
            gender:
                The gender.
            start_date:
                Optional start date.
            end_date:
                Optional end date.
        """
        super().__init__(
            cfg=cfg,
            name=name,
            gender=gender,
            temporal=True,
            start_date=start_date,
            end_date=end_date,
            description="forename",
        )
        # ... will call _reset_frequencies_identifiable()

    def _reset_frequencies_identifiable(self) -> None:
        """
        Refresh population frequencies (from the config's forename frequency
        information, by gender) and error probabilities (from the config, by
        gender), or clear them all if there is no name; then set the
        comparisons.
        """
        if self.name:
            cfg = self.cfg
            g = self.gender
            f = cfg.get_forename_freq_info(self.name, g, prestandardized=True)

            # Population frequencies
            self.p_f = f.p_name
            self.p_p1nf = f.p_metaphone_not_name
            self.p_p2np1 = f.p_f2c_not_name_metaphone

            # Error probabilities
            self.p_c = cfg.p_c_forename[g]
            self.p_ep1 = cfg.p_ep1_forename[g]
            self.p_ep2np1 = cfg.p_ep2np1_forename[g]
        else:
            self._clear_frequencies()
        self._set_comparisons()

    @classmethod
    def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "Forename":
        """
        Creation from CSV.

        The string is split by :meth:`_get_temporal_triplet` into the name
        and start/end dates.
        """
        name, start_date, end_date = cls._get_temporal_triplet(x)
        # Pass the parsed name part, not the unparsed input string.
        return Forename(
            cfg=cfg, name=name, start_date=start_date, end_date=end_date
        )

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "Forename":
        """
        Creation of a forename from a dictionary, ultimately from JSON, via
        :meth:`_set_from_json_dict_internal`.
        """
        n = Forename(cfg=cfg)
        n._set_from_json_dict_internal(d, hashed)
        return n

1932 

1933 

1934# ============================================================================= 

1935# PerfectID 

1936# ============================================================================= 

1937 

1938 

class PerfectID(IdentifierTwoState):
    """
    For comparing people based on one or more perfect ID values.

    Identifiers are held as a dictionary mapping a (standardized) key, naming
    the type of ID, to a (standardized) string value. Two PerfectID objects
    match if they share at least one key for which the values are equal.
    """

    def __init__(
        self, cfg: MatchConfig, identifiers: Dict[str, Any] = None
    ) -> None:
        """
        The identifier values will be converted to strings, if they aren't
        already.

        Args:
            cfg:
                The main configuration object.
            identifiers:
                Optional dictionary of key/value pairs.
        """
        super().__init__(cfg=cfg, is_plaintext=True, temporal=False)
        self.comparison_full_match = CertainComparison()

        self.identifiers = {}  # type: Dict[str, str]
        self.key_set = set()  # type: Set[str]
        if identifiers:
            self._set_identifiers(identifiers)

    def _set_identifiers(self, identifiers: Dict[str, str] = None) -> None:
        """
        Adds the supplied key/value pairs (after standardizing both keys and
        values) to our identifiers, and refreshes the cached key set.
        """
        identifiers = identifiers or {}
        for k, v in identifiers.items():
            self.identifiers[standardize_perfect_id_key(k)] = (
                standardize_perfect_id_value(v)
            )
        self.key_set = set(self.identifiers.keys())

    @classmethod
    def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "PerfectID":
        """
        Creation from a string of the form ``key1:value1;key2:value2``.

        Raises:
            ValueError:
                if any semicolon-separated part does not contain exactly one
                colon.
        """
        d = {}  # type: Dict[str, str]
        pair_strings = x.split(";")
        for pair_str in pair_strings:
            if pair_str.count(":") != 1:
                raise ValueError(f"Bad PerfectID string {x!r}")
            k, v = pair_str.split(":")
            d[k] = v
        return PerfectID(cfg=cfg, identifiers=d)

    def __eq__(self, other: Identifier) -> bool:
        return super().__eq__(other) and self._eq_check(other, ["identifiers"])

    def plaintext_str_core(self) -> str:
        """
        Returns the identifiers as ``key1:value1;key2:value2``, which is the
        format that :meth:`from_plaintext_str` reads.
        """
        # Iterate over items (key/value pairs), not over the dictionary
        # itself (which would yield keys only). Use ":" between key and
        # value, as from_plaintext_str() requires.
        return ";".join(f"{k}:{v}" for k, v in self.identifiers.items())

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "PerfectID":
        """
        Creation from a dictionary of key/value pairs (ultimately from JSON).

        Args:
            cfg:
                The main configuration object.
            d:
                Dictionary of key/value pairs.
            hashed:
                Are the values in the dictionary hashed?
        """
        p = PerfectID(cfg=cfg)
        p.is_plaintext = not hashed
        p._set_identifiers(d)
        return p

    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        For JSON. Returns a new dictionary of key/value pairs; values are
        hashed if this object is plaintext and ``encrypt`` is true.
        ``include_frequencies`` is not used here.
        """
        if not self.is_plaintext or not encrypt:
            # Was already hashed, or staying plaintext.
            # Return a copy, so that callers cannot alter our internal state
            # (which would also desynchronize key_set).
            return dict(self.identifiers)
        hash_fn = self.cfg.hash_fn
        return {k: hash_fn(v) for k, v in self.identifiers.items()}

    def __bool__(self) -> bool:
        return bool(self.identifiers)

    def ensure_has_freq_info_if_id_present(self) -> None:
        # No frequency information is used by this identifier.
        pass

    def fully_matches(self, other: "PerfectID") -> bool:
        """
        Do we share at least one key with ``other`` for which the values are
        identical?
        """
        return any(
            self.identifiers[k] == other.identifiers[k]
            for k in self.key_set.intersection(other.key_set)
        )

    def comparison(self, candidate_id: "PerfectID") -> Optional[Comparison]:
        """
        Returns the "certain" full-match comparison if the identifiers match,
        or ``None`` (no information) otherwise.
        """
        return (
            self.comparison_full_match
            if self.fully_matches(candidate_id)
            else None
        )

2021 

2022 

2023# ============================================================================= 

2024# DummyLetterTemporalIdentifier

2025# ============================================================================= 

2026 

2027 

class DummyLetterTemporalIdentifier(IdentifierTwoState):
    """
    Represents identifiers {A, B, ... Z}, each with probability 1/26, allowing
    exact matching only. For testing multiple comparison algorithms. Allows a
    temporal component.
    """

    Q = 1 / 26  # true
    P_ERROR = 0.01  # arbitrary
    KEY_VALUE = "value"

    def __init__(
        self,
        value: str,
        cfg: Optional[MatchConfig] = None,
        temporal: bool = False,
        start_date: Union[str, Date] = None,
        end_date: Union[str, Date] = None,
    ) -> None:
        """
        Plaintext creation of a dummy identifier.

        Args:
            value:
                A single upper-case letter, A-Z.
            cfg:
                Optional configuration object.
            temporal:
                Store start/end dates?
            start_date:
                Optional start date.
            end_date:
                Optional end date.
        """
        super().__init__(
            cfg=cfg,
            is_plaintext=True,
            temporal=temporal,
            start_date=start_date,
            end_date=end_date,
        )
        assert (
            isinstance(value, str)
            and len(value) == 1
            and ord("A") <= ord(value) <= ord("Z")
        )
        self.value = value
        self._set_comparisons()

    def _set_comparisons(self) -> None:
        """
        Builds the match/mismatch comparison objects from the class-level
        error probability and letter frequency.
        """
        p_e = self.P_ERROR
        p_f = self.Q
        self.comparison_full_match = DirectComparison(
            p_d_given_same_person=1 - p_e,
            p_d_given_diff_person=p_f,
            d_description=f"dummy_match:{self.value}",
        )
        self.comparison_no_match = DirectComparison(
            p_d_given_same_person=p_e,
            p_d_given_diff_person=1 - p_f,
            d_description=f"dummy_mismatch:{self.value}",
        )

    def __eq__(self, other: "Identifier") -> bool:
        return super().__eq__(other) and self._eq_check(other, ["value"])

    def plaintext_str_core(self) -> str:
        return self.value

    @classmethod
    def from_plaintext_str(
        cls, cfg: MatchConfig, x: str
    ) -> "DummyLetterTemporalIdentifier":
        """
        Creation from a string that is split by ``_get_temporal_triplet()``
        into a value, a start date, and an end date.
        """
        value, start_date, end_date = cls._get_temporal_triplet(x)
        # Use the extracted value, not the raw string (which may include
        # date information and would then fail the single-letter check).
        return DummyLetterTemporalIdentifier(
            cfg=cfg,
            value=value,
            start_date=start_date,
            end_date=end_date,
            temporal=True,
        )

    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        For JSON. The value is hashed if this object is plaintext and
        ``encrypt`` is true. ``include_frequencies`` is not used here.
        """
        if self.is_plaintext and encrypt:
            value = self.cfg.hash_fn(self.value)
        else:
            # Was already hashed, or staying plaintext
            value = self.value
        d = {self.KEY_VALUE: value}
        self._write_dates_to_dict(d)
        return d

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "DummyLetterTemporalIdentifier":
        """
        Creation from a dictionary (ultimately from JSON).
        """
        # NOTE(review): as_dict() writes dates via _write_dates_to_dict(),
        # but nothing here reads them back -- confirm whether that is
        # intended.
        i = DummyLetterTemporalIdentifier(cfg=cfg, value="A")
        # ... value is a dummy, overwritten
        i.is_plaintext = not hashed
        i.value = getdictval(d, cls.KEY_VALUE, str)
        i._set_comparisons()
        return i

    def __bool__(self) -> bool:
        return bool(self.value)

    def ensure_has_freq_info_if_id_present(self) -> None:
        # Frequencies are fixed class-level constants; nothing to check.
        pass

    def fully_matches(self, other: "DummyLetterTemporalIdentifier") -> bool:
        """
        Exact match on value?
        """
        return self.value == other.value

2132 

2133 

2134# ============================================================================= 

2135# DummyLetterIdentifier 

2136# ============================================================================= 

2137 

2138 

class DummyLetterIdentifier(DummyLetterTemporalIdentifier):
    """
    Non-temporal version of the dummy letter identifier: one of {A, B, ... Z},
    each with probability 1/26, matched exactly or not at all. For testing
    multiple comparison algorithms.
    """

    def __init__(self, value: str, cfg: Optional[MatchConfig] = None) -> None:
        """
        Plaintext creation of a dummy identifier with no temporal component.
        """
        super().__init__(value=value, cfg=cfg, temporal=False)

    @classmethod
    def from_plaintext_str(
        cls, cfg: MatchConfig, x: str
    ) -> "DummyLetterIdentifier":
        """
        Creation from a plain string, used directly as the value.
        """
        return DummyLetterIdentifier(value=x, cfg=cfg)

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "DummyLetterIdentifier":
        """
        Creation from a dictionary (ultimately from JSON).
        """
        # Build with a placeholder letter, then replace it with the real
        # (possibly hashed) value and rebuild the comparisons.
        ident = DummyLetterIdentifier(value="A", cfg=cfg)
        ident.is_plaintext = not hashed
        ident.value = getdictval(d, cls.KEY_VALUE, str)
        ident._set_comparisons()
        return ident

2167 

2168 

2169# ============================================================================= 

2170# Comparison of multiple potentially jumbled similar identifiers 

2171# ============================================================================= 

2172 

2173NOTES_MULTIPLE_COMPARISONS = """ 

2174 

2175What can be compared? 

2176--------------------- 

2177 

2178Identifiers that are explicitly time-stamped cannot be compared with explicitly 

2179non-overlapping identifiers. (But un-time-stamped identifiers can be compared 

2180with anything.) And only information that is "present" is used for comparison. 

2181These checks are implemented by each identifier in their `comparison` method. 

2182 

2183 

2184What is a good match? 

2185--------------------- 

2186 

2187Implicitly, we prefer full > partial > no match (and similarly for comparisons 

2188with more or fewer than 3 options). But this is implemented more explicitly by 

2189log likelihood ratio: we prefer higher values. 

2190 

2191 

2192No re-use 

2193--------- 

2194 

2195No identifier can be used for >1 comparison simultaneously. "Surplus" 

2196identifiers therefore provide no evidence. For example, if candidate_identifiers

2197= [A, B, C] and proband_identifiers = [A, B], then C will be ignored (the 

2198comparisons will likely be A/A, B/B). But [A, B, C] versus [A, B, D] will 

2199likely lead to comparisons A/A, B/B, C/D. 

2200 

2201Suppose our proband has n identifiers, and our candidate has m. Then we can 

2202make c = min(n, m) comparisons. 

2203 

2204 

2205Unordered comparisons: picking the best involves implicit comparison 

2206-------------------------------------------------------------------- 

2207 

2208In unordered comparisons, if we pick the best, we have implicitly made many 

2209more comparisons. We need to adjust for that. 

2210 

2211To illustrate, suppose the population of all names is {A, B, ..., Z}, giving a 

2212set of size s = 26, and that every name is equiprobable in the population with 

2213frequency q = 1/s = 1/26. 

2214 

2215PROBABILITY OF A POPULATION (RANDOM PERSON) MATCH FOR MULTIPLE IDENTIFIERS. If 

2216we have a proband with names [A] and a candidate with a single name such as [A] 

2217or [Z], then we will declare a match if the candidate is named [A] and P(D | 

2218¬H) = P(match | randomly selected other person) = 1/26. If our candidate has 

2219two unordered names, then we would declare a match regardless of whether the 

2220candidate was [A, B] or [B, A], and so would declare a match with a random 

2221candidate with probability 1/26 + 1/26 - 1/(26 ^ 2), or more generally 2/s - 

22221/(s ^ 2) = 2q - q^2. The subtracted component is for a candidate named [A, A], 

2223who would otherwise be counted twice for [A, *] and [*, A]. More generally, for 

2224a proband with one name and a candidate with m names, the match probability is 

22251 - (1 - q) ^ m. That is, the probability of no match for each is (1 - q), and 

2226it takes m failures to match for an overall failure to match. By the Bonferroni 

2227approximation or Boole's inequality [1], this is approximately (and never more 

2228than) m * q. So mq is a slightly conservative correction for multiple 

2229comparisons. 

2230 

2231For a proband with n <= m names, we can work sequentially: the first proband 

2232name is matched by the candidate with approximately P = m * q_1; then, having

2233used up one candidate name, the second proband name is matched by the candidate 

2234with approximately P = (m - 1) * q_2, and so on. 

2235 

2236If n > m, we simply stop the process. 

2237 

2238No correction is required for P(D | H), since (ignoring identifier errors) the 

2239probability of an unordered match given H is 1. 

2240 

2241This does NOT apply to "non-match" comparisons, where we have not gone 

2242"fishing" for the best order. 

2243 

2244[1] https://en.wikipedia.org/wiki/Boole%27s_inequality 

2245 

2246 

2247Implementing via the Bayesian log-odds system 

2248--------------------------------------------- 

2249 

2250Using this approximation makes things straightforward. The posterior log odds 

2251is the prior log odds plus the log likelihood ratio. The log likelihood ratio 

2252(LLR) for a match is ln(p_c) - ln(match probability), where p_c is the 

2253probability of a correct match given the hypothesis that the proband and 

2254candidate are the same person. 

2255 

2256So if we were using LLR = ln(p_c) - ln(q), but we actually wanted to multiply 

2257the probability q by some factor f to give LLR = ln(p_c) - ln(fq), then since 

2258ln(fq) = ln(f) + ln(q), we can simply add -ln(f) to the running total. 

2259 

2260We can therefore keep track of f = m * (m - 1) * ..., as above, and add that as 

2261a "dummy" comparison. 

2262 

2263 

2264Asymmetry 

2265--------- 

2266 

2267The method above implies asymmetry, in that the unordered comparisons 

2268 

2269 - proband = [A] 

2270 - candidate = [A, B] 

2271 

2272or 

2273 

2274 - proband = [A] 

2275 - candidate = [B, A] 

2276 

2277would be less likely than 

2278 

2279 - proband = [A, B] 

2280 - candidate = [A] 

2281 

2282because the correction (which increases the probability of a population match 

2283by chance and therefore decreases the chance of the proband/candidate being the 

2284same) relates to the number of candidate identifiers available. 

2285 

2286This is probably fine and is a defence against a "cuckoo" candidate (cf. 

2287"keyword stuffing" on web sites for search engines). For example, in our A-Z 

2288situation, a candidate called [A, B, C, ..., X, Y, Z] is "trying" to be a good 

2289match for everyone and perhaps shouldn't get the same probability of matching 

2290[A] as a candidate simply named [A]. 

2291 

2292Note that there are other asymmetries already, though less obvious ones; for 

2293example, using a very common surname and a rarer early example from the US name 

2294database: 

2295 

2296 - proband = Alice SMITH, same gender/DOB/postcode 

2297 - candidate = Alice ABADILLA, same gender/DOB/postcode 

2298 

2299 ... surname P(D|¬H) = 0.987 = P(no match | candidate not proband) 

2300 ... log odds 12.455 

2301 

2302 - proband = Alice ABADILLA, same gender/DOB/postcode 

2303 - candidate = Alice SMITH, same gender/DOB/postcode 

2304 

2305 ... surname P(D|¬H) = 0.996 = P(no match | candidate not proband) 

2306 ... log odds 12.447 

2307 

2308... because it's rarer for a randomly selected candidate to match ABADILLA than 

2309SMITH, so P(D | ¬H) for a no-match is higher for proband ABADILLA, and that 

2310provides slightly less evidence for a match when ABADILLA is the proband. 

2311 

2312We use this unordered comparison for postcodes and surnames. So this multiple 

2313comparisons correction is equivalent to saying "be a little bit more careful 

2314about declaring a match against people with multiple postcodes and multiple 

2315surnames, because they have a higher chance of appearing to match other people 

2316at random". 

2317 

2318 

2319Ordered comparisons 

2320------------------- 

2321 

2322Consider a proband such as [A, B, C] (n = 3) and a candidate such as [A, B] (m 

2323= 2), where we wish to use the information that an ordered match is superior to 

2324an unordered match. A simple way is as follows. 

2325 

2326- Establish the "best" set of comparisons (highest LLR) following our standard 

2327 rules. (In this case, that would be A/A, B/B, for c = 2.) 

2328 

2329- Establish if that best match was strictly ordered. There should only be one 

2330 way (for this method) that is defined as "strictly ordered", and we will 

2331 define this as that the indexes of the comparisons, 1 ... c, exactly match 

2332 the contributing indices of the proband (1 ... n) and the candidate (1 ... 

2333 m). That is: strict order, no gaps. 

2334 

2335- For a first draft, declare a probability p_o, the probability that if the 

2336 proband/candidate are the same (H is true), the identifiers are correct and 

2337 in same strict order, and a probability p_u that they are correct but 

2338 unordered (not in strict order), and a probability p_e that they are wrong, 

2339 such that p_o + p_u + p_e = 1. 

2340 

2341 Then if there is an ordered match, 

2342 

2343 - P(D | H) = p_o 

2344 - P(D | ¬H) = P(random ordered match) 

2345 

2346 and if there is an unordered match, 

2347 

2348 - P(D | H) = p_u 

2349 - P(D | ¬H) = P(random unordered match) - P(random ordered match)

2350 

2351 and if no match, 

2352 

2353 - P(D | H) = p_e 

2354 - P(D | ¬H) = 1 - P(random unordered match)

2355 

2356- Then, to superimpose that on identifier comparisons that are themselves 

2357 fuzzy, we note that much of those (e.g. p_e) are already dealt with. So 

2358 if we restrict p_o and p_u to situations where there is a match (full or 

2359 partial) involving two or more identifiers, and we continue to use the 

2360 Bonferroni correction, it becomes straightforward. 

2361 

2362""" 

2363 

2364 

class ComparisonInfo:
    """
    Used by :func:`gen_best_comparisons`.

    Holds one proband/candidate comparison together with the indexes of the
    two identifiers compared, plus precalculated values used for sorting.
    """

    def __init__(
        self, proband_idx: int, candidate_idx: int, comparison: Comparison
    ) -> None:
        """
        Args:
            proband_idx:
                Index of the proband's identifier.
            candidate_idx:
                Index of the candidate's identifier.
            comparison:
                The comparison between those two identifiers.
        """
        self.proband_idx = proband_idx
        self.candidate_idx = candidate_idx
        self.comparison = comparison

        # Cached up front, because sort_asc_best_to_worst() is called for
        # every element when sorting and should be cheap:
        self.log_likelihood_ratio = comparison.log_likelihood_ratio
        index_gap = proband_idx - candidate_idx
        self._distance = index_gap * index_gap

    @staticmethod
    def sort_asc_best_to_worst(x: "ComparisonInfo") -> Tuple[float, int]:
        """
        Sort key: when used with a standard ASCENDING sort (reverse=False),
        the result is ordered best-to-worst.

        - First element: the negative log likelihood ratio. A higher log
          likelihood ratio is better, so a higher negated value is worse.

        - Second element (tie-breaker for equal NLLR): the squared distance
          between the proband and candidate indexes. Identical indexes
          (distance 0) are preferred, so higher values are worse. Thus, when
          comparing Alice Alice SMITH to Alice Alice SMITH on first names,
          index pairs (1, 1) and (2, 2) are chosen, not (1, 2) and (2, 1).
        """
        return (-x.log_likelihood_ratio, x._distance)

2400 

2401 

def gen_best_comparisons(
    proband_identifiers: List[Identifier],
    candidate_identifiers: List[Identifier],
    ordered: bool = False,
    p_u: Optional[float] = None,
) -> Generator[Comparison, None, None]:
    """
    Generates comparisons for two sequences of identifiers (one from the
    proband, one from the candidate), being indifferent to their order. The
    method -- which needs to be fast -- is as described above in
    NOTES_MULTIPLE_COMPARISONS.

    Every proband identifier is compared with every candidate identifier.
    The resulting comparisons are then taken from best to worst (by log
    likelihood ratio), using each identifier at most once. Finally, a
    correction (for order and/or multiple comparisons) may be yielded as an
    extra comparison.

    Args:

        proband_identifiers:
            List of identifiers from the proband.
        candidate_identifiers:
            List of comparable identifiers from the candidate.
        ordered:
            Treat the comparison as an ordered one?
        p_u:
            (Applicable if ordered is True.) The probability of being
            "unordered", and the complement of p_o, where p_o is the
            probability, given the hypothesis H (proband and candidate are the
            same person) and that c > 1 identifiers are being compared, that
            the candidate identifiers will be in exactly the right order (that
            is, for all matches, the index of the candidate's identifier is the
            same as the index of the proband's identifier).

    Yields:
        the chosen comparisons, best first, followed by at most one
        adjustment comparison.

    Raises:
        TypeError:
            if ``ordered`` is true but ``p_u`` was not specified (raised once
            the individual comparisons have been yielded, and only if there
            was at least one comparison).
    """
    # Compare all pairs.
    ci_list = []  # type: List[ComparisonInfo]
    for p_idx, proband_id in enumerate(proband_identifiers):
        for c_idx, candidate_id in enumerate(candidate_identifiers):
            ci = proband_id.comparison(candidate_id)
            if ci is None:
                # This will happen if either is missing information, or if the
                # identifiers explicitly do not overlap temporally.
                continue
            ci_list.append(
                ComparisonInfo(
                    proband_idx=p_idx,
                    candidate_idx=c_idx,
                    comparison=ci,
                )
            )
    if not ci_list:
        # No comparisons. Abort before we do something silly with a correction
        # procedure.
        return

    # Iterate through comparisons in descending order of log likelihood ratio,
    # i.e. best to worst. See ComparisonInfo.sort_asc_best_to_worst().
    ci_list.sort(key=ComparisonInfo.sort_asc_best_to_worst)
    candidate_indexes_used = set()  # type: Set[int]
    proband_indexes_used = set()  # type: Set[int]
    n_candidates_available = n_candidates = len(candidate_identifiers)
    n_positive = 0
    n_implicit_comparisons = 1
    # ... at least one (because ci_list is not empty). This is a multiplicative
    # value; we will multiply it by the number of available candidates used for
    # each comparison.
    correct_order = True
    for ci in ci_list:
        if (
            ci.proband_idx in proband_indexes_used
            or ci.candidate_idx in candidate_indexes_used
        ):
            # Each identifier can be used as part of only one comparison.
            continue
        yield ci.comparison
        if ci.log_likelihood_ratio > 0:
            # This was some form of match, so we apply our correction.
            n_implicit_comparisons *= n_candidates_available
            n_positive += 1
            if ordered and ci.proband_idx != ci.candidate_idx:
                # Note that the index of ci itself is irrelevant; that will
                # vary depending on the frequency of the identifiers, e.g. John
                # Zachariah versus Zachariah John.
                correct_order = False
        # Whether or not it was a positive match, it was a comparison; we have
        # "used up" the identifiers being compared, and have one fewer
        # candidate available.
        candidate_indexes_used.add(ci.candidate_idx)
        proband_indexes_used.add(ci.proband_idx)
        n_candidates_available -= 1

    # Any corrections required.
    if ordered:
        # Ordered comparison requested.
        # - To follow this, look at the simpler "unordered" alternative first.
        # - Action only required if there is an ordering to be considered.
        if p_u is None:
            # Fail with a clear message, rather than an obscure "unsupported
            # operand" error from the subtraction below.
            raise TypeError(
                "gen_best_comparisons: p_u must be specified if ordered=True"
            )
        p_o = 1 - p_u  # p_o ordered, p_u unordered
        if n_positive > 0 and n_candidates > 1:
            # There was a "hit", and there was a choice of candidate
            # identifiers, so there is an order to think about. ASSUMING unique
            # identifiers (within proband, within candidate):
            if correct_order:
                # - Adjust P(D | H) by p_o.
                # - No adjustment to P(D | ¬H) required.
                yield AdjustLogOddsComparison(
                    log_odds_delta=ln(p_o),
                    description=(
                        f"order match: adjust P(D|H) by "
                        f"P(correct order) = {p_o}"
                    ),
                )
            else:
                # - Adjust P(D | H) by p_u = 1 - p_o.
                # - Adjust P(D | ¬H) by the number of unordered possibilities
                #   considered (n_implicit_comparisons), minus the one (the
                #   correctly ordered option) that by definition we are not
                #   considering here. This uses a Bonferroni approximation, as
                #   above.
                n_unordered_possibilities = n_implicit_comparisons - 1
                description = (
                    f"order mismatch: "
                    f"adjust P(D|H) by P(incorrect order) = {p_u}"
                )
                if n_unordered_possibilities > 1:
                    description += (
                        f", and P(D|¬H) for {n_positive} hits from "
                        f"{n_unordered_possibilities} comparisons"
                    )
                yield AdjustLogOddsComparison(
                    log_odds_delta=ln(p_u) - ln(n_unordered_possibilities),
                    description=description,
                )

    else:
        # Unordered comparison requested.
        # - No adjustment is required to P(D | H). See paper.
        # - If n_implicit_comparisons is 1, that isn't multiple comparisons,
        #   so no further adjustment is required. We could still use this
        #   process, which would add -ln(1) = 0, but it would do nothing and be
        #   a waste of time.
        # - But if n_implicit_comparisons > 1, then we adjust P(D | ¬H),
        #   using the Bonferroni correction. See paper for working.
        if n_implicit_comparisons > 1:
            # - Correct P(D | ¬H) for the fact that we would have considered
            #   any order acceptable, and we made multiple comparisons to pick
            #   the best. This uses a Bonferroni approximation, as above.
            #   We add a negative log odds value. See paper for detail.
            yield AdjustLogOddsComparison(
                log_odds_delta=-ln(n_implicit_comparisons),
                description=(
                    f"unordered: adjust P(D|¬H) for {n_positive} "
                    f"hits from {n_implicit_comparisons} comparisons"
                ),
            )