Coverage for linkage/person.py: 71%

303 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1r""" 

2crate_anon/linkage/person.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Person representations for fuzzy matching.** 

27 

28""" 

29 

30# ============================================================================= 

31# Imports 

32# ============================================================================= 

33 

34import json 

35import logging 

36import random 

37from typing import ( 

38 Any, 

39 Dict, 

40 Generator, 

41 List, 

42 Optional, 

43 Union, 

44) 

45 

46from cardinal_pythonlib.reprfunc import auto_repr 

47 

48from crate_anon.linkage.comparison import bayes_compare, Comparison 

49from crate_anon.linkage.helpers import ( 

50 getdictval, 

51 mutate_name, 

52 mutate_postcode, 

53) 

54from crate_anon.linkage.identifiers import ( 

55 DateOfBirth, 

56 Forename, 

57 gen_best_comparisons, 

58 Gender, 

59 Identifier, 

60 PerfectID, 

61 Postcode, 

62 Surname, 

63 TemporalIDHolder, 

64) 

65from crate_anon.linkage.matchconfig import MatchConfig 

66 

67log = logging.getLogger(__name__) 

68 

69 

70# ============================================================================= 

71# Person 

72# ============================================================================= 

73 

74 

75class Person: 

76 """ 

77 A proper representation of a person that can do hashing and comparisons. 

78 The information may be incomplete or slightly wrong. 

79 Includes frequency information and requires a config. 

80 """ 

81 

82 # ------------------------------------------------------------------------- 

83 # Class attributes 

84 # ------------------------------------------------------------------------- 

85 

class PersonKey:
    # Names of the per-person fields; used as CSV column names, JSON keys
    # and Person attribute names.
    LOCAL_ID = "local_id"  # person ID within relevant DB (proband/sample)
    FORENAMES = "forenames"
    SURNAMES = "surnames"
    DOB = "dob"
    GENDER = "gender"
    POSTCODES = "postcodes"
    PERFECT_ID = "perfect_id"
    OTHER_INFO = "other_info"  # anything the user may want to attach


# All public PersonKey values, in definition order.
# Only the OUTERMOST iterable of a comprehension is evaluated in the class
# scope, so PersonKey may be mentioned there but nowhere else inside the
# comprehension (see https://stackoverflow.com/questions/13905741).
ALL_PERSON_KEYS = [
    key_value
    for attr_name, key_value in vars(PersonKey).items()
    if not attr_name.startswith("_")
]  # type: List[str]

# Fields whose values are in TemporalIdentifier format:
TEMPORAL_IDENTIFIERS = [
    PersonKey.FORENAMES,
    PersonKey.SURNAMES,
    PersonKey.POSTCODES,
]
# For reading CSV: fields that may hold several semicolon-separated values.
SEMICOLON_DELIMIT = TEMPORAL_IDENTIFIERS + [PersonKey.PERFECT_ID]

# User-facing help text describing the plaintext CSV input format. The five
# numbered points are joined by single spaces.
PLAINTEXT_CSV_FORMAT_HELP = " ".join(
    (
        f"(1) CSV format with header row. Columns: {ALL_PERSON_KEYS}.",
        f"(2) Semicolon-separated values are allowed within "
        f"{SEMICOLON_DELIMIT}.",
        f"(3) The fields {TEMPORAL_IDENTIFIERS} are in TemporalIdentifier "
        f"format. {Identifier.TEMPORAL_ID_FORMAT_HELP}",
        f"(4) {PersonKey.PERFECT_ID}, if specified, contains one or more "
        f"perfect person identifiers as key:value pairs, e.g. "
        f"'nhs:12345;ni:AB6789XY'. The keys will be forced to lower case; "
        f"values will be forced to upper case.",
        f"(5) {PersonKey.OTHER_INFO!r} is an arbitrary string for you to "
        f"use (e.g. for validation).",
    )
)
# User-facing help text describing the hashed JSON Lines format.
HASHED_JSONLINES_FORMAT_HELP = (
    "File created by CRATE in JSON Lines (.jsonl) format. "
    "(You could use the 'jq' tool to inspect these.)"
)

135 

136 # ------------------------------------------------------------------------- 

137 # Creation 

138 # ------------------------------------------------------------------------- 

139 

def __init__(
    self,
    cfg: MatchConfig,
    local_id: str = "",
    other_info: str = "",
    forenames: List[Union[None, str, TemporalIDHolder, Forename]] = None,
    surnames: List[Union[None, str, TemporalIDHolder, Surname]] = None,
    dob: Union[None, str, DateOfBirth] = "",
    gender: Union[None, str, Gender] = "",
    postcodes: List[Union[None, str, TemporalIDHolder, Postcode]] = None,
    perfect_id: Union[None, Dict[str, Any], PerfectID] = None,
) -> None:
    """
    Build a person from raw values and/or ready-made identifier objects.

    Args:
        cfg:
            The config object (must be a MatchConfig).
        local_id:
            Identifier within this person's local database (e.g. proband
            ID or sample ID). Typically a research pseudonym, not itself
            identifying. Converted with ``str()``; must not be blank.
        other_info:
            String containing any other attributes the user may wish to
            remember (e.g. in JSON). Only used for validation research
            (e.g. ensuring linkage is not biased by ethnicity).
        forenames:
            The person's forenames (given names, first/middle names), as
            strings, TemporalIDHolder objects, or Forename objects.
            Blank/false entries are skipped.
        surnames:
            The person's surname(s), as strings, TemporalIDHolder objects,
            or Surname objects. Blank/false entries are skipped.
        dob:
            The date of birth, in ISO 8601 "YYYY-MM-DD" string format,
            or as a DateOfBirth object, or None, or ''.
        gender:
            The gender: 'M', 'F', 'X', or '', or None, or a Gender object.
        postcodes:
            Any UK postcodes for this person, with optional associated
            dates; as strings, TemporalIDHolder objects, or Postcode
            objects. Blank/false entries are skipped.
        perfect_id:
            Any named person-unique identifiers (e.g. UK NHS numbers, UK
            National Insurance numbers), for non-fuzzy matching. Dictionary
            keys will be forced to lower case, and dictionary values to
            upper case.

    Raises:
        ValueError:
            for a blank ``local_id``, a non-string ``other_info``, a
            non-list names/postcodes argument, an element of unsupported
            type, or a mixture of plaintext and hashed identifiers.
    """
    # Set by the first identifier seen; all later ones must agree.
    self._is_plaintext = None  # type: Optional[bool]

    def chk_plaintext(new_identifier: Identifier) -> None:
        """
        Ensure we don't mix plaintext and hashed data.
        """
        incoming = new_identifier.is_plaintext
        if self._is_plaintext is None:
            self._is_plaintext = incoming
            return
        if incoming == self._is_plaintext:
            return
        new = self.plain_or_hashed_txt(incoming)
        old = self.plain_or_hashed_txt(self._is_plaintext)
        raise ValueError(
            f"Trying to add {new} information to a Person containing "
            f"only {old} information; new data was "
            f"{new_identifier!r}; current is {self!r}"
        )

    assert isinstance(cfg, MatchConfig)
    self.cfg = cfg
    # Cached on the instance for speed:
    self.baseline_log_odds_same_person = (
        self.cfg.baseline_log_odds_same_person
    )

    # local_id
    self.local_id = None if local_id is None else str(local_id)
    if not self.local_id:
        raise ValueError(f"Bad local_id: {local_id!r}")

    # other_info
    self.other_info = other_info or ""
    if not isinstance(self.other_info, str):
        raise ValueError(f"Bad other_info: {self.other_info!r}")

    # gender
    # Test against None explicitly; do NOT use ``gender or ""``, because
    # bool(Gender(cfg, gender="")) == False.
    if gender is None:
        gender = ""
    if not isinstance(gender, Gender):
        gender = Gender(cfg=cfg, gender=gender)
    self.gender = gender
    chk_plaintext(self.gender)

    # forenames
    forenames = forenames or []
    if not isinstance(forenames, list):
        raise ValueError(f"Bad forenames: {forenames!r}")
    self.forenames = []  # type: List[Forename]
    for raw_forename in forenames:
        if not raw_forename:  # None or ""
            continue
        if isinstance(raw_forename, str):
            forename = Forename(
                cfg=cfg, name=raw_forename, gender=self.gender.gender_str
            )
        elif isinstance(raw_forename, TemporalIDHolder):
            # NOTE(review): unlike the plain-string branch, no gender is
            # passed here -- confirm whether that is intentional.
            forename = Forename(
                cfg=cfg,
                name=raw_forename.identifier,
                start_date=raw_forename.start_date,
                end_date=raw_forename.end_date,
            )
        elif isinstance(raw_forename, Forename):
            forename = raw_forename
        else:
            raise ValueError(f"Bad forename: {raw_forename!r}")
        if forename:  # skip blank names not detected above
            chk_plaintext(forename)
            self.forenames.append(forename)

    # surnames
    surnames = surnames or []
    if not isinstance(surnames, list):
        raise ValueError(f"Bad surnames: {surnames!r}")
    self.surnames = []  # type: List[Surname]
    for raw_surname in surnames:
        if not raw_surname:
            continue
        if isinstance(raw_surname, str):
            surname = Surname(
                cfg=cfg, name=raw_surname, gender=self.gender.gender_str
            )
        elif isinstance(raw_surname, TemporalIDHolder):
            # NOTE(review): no gender passed here either -- confirm.
            surname = Surname(
                cfg=cfg,
                name=raw_surname.identifier,
                start_date=raw_surname.start_date,
                end_date=raw_surname.end_date,
            )
        elif isinstance(raw_surname, Surname):
            surname = raw_surname
        else:
            raise ValueError(f"Bad surname: {raw_surname!r}")
        if surname:  # skip blank names not detected above
            chk_plaintext(surname)
            self.surnames.append(surname)

    # dob (NB highly desirable for real work, but not mandatory, and we
    # also want to be able to create Person objects without a DOB for
    # testing)
    if dob is None:
        dob = ""
    if not isinstance(dob, DateOfBirth):
        dob = DateOfBirth(cfg=cfg, dob=dob or "")
    self.dob = dob
    chk_plaintext(self.dob)

    # postcodes
    postcodes = postcodes or []
    if not isinstance(postcodes, list):
        raise ValueError(f"Bad postcodes: {postcodes!r}")
    self.postcodes = []  # type: List[Postcode]
    for raw_postcode in postcodes:
        if not raw_postcode:  # None or ""
            continue
        if isinstance(raw_postcode, str):
            postcode = Postcode(cfg=cfg, postcode=raw_postcode)
        elif isinstance(raw_postcode, TemporalIDHolder):
            postcode = Postcode(
                cfg=cfg,
                postcode=raw_postcode.identifier,
                start_date=raw_postcode.start_date,
                end_date=raw_postcode.end_date,
            )
        elif isinstance(raw_postcode, Postcode):
            postcode = raw_postcode
        else:
            raise ValueError(
                f"Bad data structure for postcode: {raw_postcode!r}"
            )
        if postcode:  # skip blanks not detected above
            chk_plaintext(postcode)
            self.postcodes.append(postcode)

    # perfect_id
    if not isinstance(perfect_id, PerfectID):
        perfect_id = PerfectID(cfg=cfg, identifiers=perfect_id)
    self.perfect_id = perfect_id
    chk_plaintext(self.perfect_id)

317 

318 @staticmethod 

319 def plain_or_hashed_txt(plaintext: bool) -> str: 

320 """ 

321 Used for error messages. 

322 """ 

323 return "plaintext" if plaintext else "hashed" 

324 

@classmethod
def from_plaintext_csv(
    cls, cfg: MatchConfig, rowdict: Dict[str, str]
) -> "Person":
    """
    Returns a :class:`Person` object from a CSV row.

    Every column named in ``ALL_PERSON_KEYS`` is read from the row; those
    listed in ``SEMICOLON_DELIMIT`` are first parsed into their structured
    form.

    Args:
        cfg: a configuration object
        rowdict: a CSV row, read via :class:`csv.DictReader`.

    Returns:
        an instance of ``cls`` (so subclasses get instances of
        themselves).

    Raises:
        KeyError: if the row lacks one of the expected columns.
    """
    kwargs = {}  # type: Dict[str, Any]
    for attr in cls.ALL_PERSON_KEYS:
        value = rowdict[attr]
        if attr in cls.SEMICOLON_DELIMIT:
            value = cls._get_semicolon_delimited_value(cfg, attr, value)
        kwargs[attr] = value
    # Construct via cls, not a hard-coded class name, as befits an
    # alternate constructor.
    return cls(cfg=cfg, **kwargs)

343 

344 @classmethod 

345 def _get_semicolon_delimited_value( 

346 cls, cfg: MatchConfig, attr: str, value: Any 

347 ) -> Union[PerfectID, list[TemporalIDHolder]]: 

348 if attr == cls.PersonKey.PERFECT_ID: 

349 return PerfectID.from_plaintext_str(cfg, value) 

350 

351 assert attr in cls.TEMPORAL_IDENTIFIERS 

352 

353 return cls._get_temporal_id_holder_list(cfg, value) 

354 

355 @classmethod 

356 def _get_temporal_id_holder_list( 

357 cls, cfg: MatchConfig, value: Any 

358 ) -> list[TemporalIDHolder]: 

359 temp_id_values = [v.strip() for v in value.split(";") if v] 

360 return [ 

361 TemporalIDHolder.from_plaintext_str(cfg, v) for v in temp_id_values 

362 ] 

363 

@classmethod
def from_json_dict(
    cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool = True
) -> "Person":
    """
    Restore a hashed or plaintext version from a dictionary (which has been
    read from JSONL).

    Args:
        cfg: a configuration object
        d: the dictionary, keyed by the ``PersonKey`` values
        hashed: passed on to each identifier's ``from_dict()``

    Returns:
        an instance of ``cls`` (so subclasses get instances of
        themselves).

    Raises:
        ValueError: if an element of a names/postcodes list is not a dict.
    """
    pk = cls.PersonKey

    def load_list(key: str, identifier_class: Any) -> List[Any]:
        # Convert the list stored under ``key`` into identifier objects,
        # insisting that every element is a dict.
        loaded = []  # type: List[Any]
        for element in getdictval(d, key, list):
            if not isinstance(element, dict):
                raise ValueError(
                    f"{key} contains something that is not a dict: "
                    f"{element!r}"
                )
            loaded.append(identifier_class.from_dict(cfg, element, hashed))
        return loaded

    forenames = load_list(pk.FORENAMES, Forename)
    surnames = load_list(pk.SURNAMES, Surname)
    postcodes = load_list(pk.POSTCODES, Postcode)
    # Construct via cls, not a hard-coded class name, as befits an
    # alternate constructor.
    return cls(
        cfg=cfg,
        local_id=getdictval(d, pk.LOCAL_ID, str),
        other_info=getdictval(d, pk.OTHER_INFO, str, mandatory=False),
        forenames=forenames,
        surnames=surnames,
        dob=DateOfBirth.from_dict(
            cfg, getdictval(d, pk.DOB, dict), hashed
        ),
        gender=Gender.from_dict(
            cfg, getdictval(d, pk.GENDER, dict), hashed
        ),
        postcodes=postcodes,
        perfect_id=PerfectID.from_dict(
            cfg, getdictval(d, pk.PERFECT_ID, dict), hashed
        ),
    )

409 

@classmethod
def from_json_str(cls, cfg: MatchConfig, s: str) -> "Person":
    """
    Restore a hashed version from a string representing JSON.

    The string is decoded and handed to :meth:`from_json_dict` with its
    default ``hashed`` setting.
    """
    return cls.from_json_dict(cfg, json.loads(s))

417 

418 # ------------------------------------------------------------------------- 

419 # Equality, hashing -- local_id should be unique 

420 # ------------------------------------------------------------------------- 

421 # Be careful: 

422 # - https://inventwithpython.com/blog/2019/02/01/hashable-objects-must-be-immutable/ # noqa: E501 

423 # - https://docs.python.org/3/glossary.html [re "hashable"] 

424 # Here, we define equality based on local_id, which will not change. In 

425 # practice, nothing else will either. 

426 

427 def __eq__(self, other: "Person") -> bool: 

428 return self.local_id == other.local_id 

429 

430 def __hash__(self) -> int: 

431 return hash(self.local_id) 

432 

433 # ------------------------------------------------------------------------- 

434 # Representation: string 

435 # ------------------------------------------------------------------------- 

436 

def __repr__(self):
    """
    Debugging representation, as generated by ``auto_repr``.
    """
    representation = auto_repr(self)
    return representation

439 

440 def __str__(self) -> str: 

441 if self.is_hashed(): 

442 return f"Person<HASHED, local_id={self.local_id!r}>" 

443 names = " ".join( 

444 [str(f) for f in self.forenames] + [str(s) for s in self.surnames] 

445 ) 

446 postcodes = " - ".join(str(x) for x in self.postcodes) 

447 k = self.PersonKey 

448 details = ", ".join( 

449 [ 

450 f"{k.LOCAL_ID}={self.local_id}", 

451 f"{k.PERFECT_ID}={self.perfect_id}", 

452 f"name={names}", 

453 f"{k.GENDER}={self.gender}", 

454 f"{k.DOB}={self.dob}", 

455 f"{k.POSTCODES}={postcodes}", 

456 f"{k.OTHER_INFO}={self.other_info!r}", 

457 ] 

458 ) 

459 return f"Person<{details}>" 

460 

461 # ------------------------------------------------------------------------- 

462 # Representation: CSV 

463 # ------------------------------------------------------------------------- 

464 

465 @classmethod 

466 def plaintext_csv_columns(cls) -> List[str]: 

467 """ 

468 CSV column names -- including user-specified "other" information. 

469 """ 

470 return cls.ALL_PERSON_KEYS 

471 

472 def plaintext_csv_dict(self) -> Dict[str, str]: 

473 """ 

474 Returns a dictionary suitable for :class:`csv.DictWriter`. 

475 This is for writing identifiable content. 

476 """ 

477 d = {} # type: Dict[str, str] 

478 for k in self.ALL_PERSON_KEYS: 

479 a = getattr(self, k) 

480 if k in self.SEMICOLON_DELIMIT and k != self.PersonKey.PERFECT_ID: 

481 v = ";".join(str(x) for x in a) 

482 else: 

483 v = str(a) 

484 d[k] = v 

485 return d 

486 

487 # ------------------------------------------------------------------------- 

488 # Representation: JSON 

489 # ------------------------------------------------------------------------- 

490 

491 def as_dict( 

492 self, 

493 hashed: bool = True, 

494 include_frequencies: bool = True, 

495 include_other_info: bool = False, 

496 ) -> Dict[str, Any]: 

497 """ 

498 For JSON. 

499 

500 Args: 

501 hashed: 

502 Create a hashed/encrypted version? 

503 include_frequencies: 

504 Include frequency information. If you don't, this makes the 

505 resulting file suitable for use as a sample, but not as a 

506 proband file. 

507 include_other_info: 

508 include the (potentially identifying) ``other_info`` data? 

509 Usually ``False``; may be ``True`` for validation. 

510 """ 

511 pk = self.PersonKey 

512 

513 # This could be terser, but to be clear: 

514 if hashed: 

515 if self._is_plaintext: 

516 encrypt = True 

517 local_id = self.cfg.local_id_hash_fn(self.local_id) 

518 else: 

519 encrypt = False # already encrypted; don't do it twice 

520 local_id = self.local_id 

521 else: 

522 if self._is_plaintext: 

523 encrypt = False 

524 local_id = self.local_id 

525 else: 

526 raise AssertionError( 

527 "Can't create plaintext from hashed Person" 

528 ) 

529 

530 d = { 

531 pk.LOCAL_ID: local_id, 

532 pk.FORENAMES: [ 

533 f.as_dict(encrypt, include_frequencies) for f in self.forenames 

534 ], 

535 pk.SURNAMES: [ 

536 s.as_dict(encrypt, include_frequencies) for s in self.surnames 

537 ], 

538 pk.DOB: self.dob.as_dict(encrypt, include_frequencies), 

539 pk.GENDER: self.gender.as_dict(encrypt, include_frequencies), 

540 pk.POSTCODES: [ 

541 p.as_dict(encrypt, include_frequencies) for p in self.postcodes 

542 ], 

543 pk.PERFECT_ID: self.perfect_id.as_dict(encrypt), 

544 } 

545 if include_other_info: 

546 d[pk.OTHER_INFO] = self.other_info 

547 return d 

548 

549 # ------------------------------------------------------------------------- 

550 # Copying 

551 # ------------------------------------------------------------------------- 

552 

def copy(self) -> "Person":
    """
    Returns a copy of this object.

    - :func:`copy.deepcopy` is incredibly slow, yet :func:`copy.copy` isn't
      enough when we want to mutate this object.
    - We did do it quasi-manually, copying attributes but using
      ``[copy.copy(x) for x in value]`` if the value was a list.
    - However, since we have functions to convert to/from a dict
      representation, we may as well use them: the copy is made by a
      round trip through :meth:`as_dict` and :meth:`from_json_dict`,
      keeping frequencies and ``other_info`` and keeping the
      plaintext/hashed state as it is.
    """
    currently_hashed = self.is_hashed()
    snapshot = self.as_dict(
        hashed=currently_hashed,
        include_frequencies=True,
        include_other_info=True,
    )
    return self.from_json_dict(self.cfg, snapshot, hashed=currently_hashed)

574 

575 # ------------------------------------------------------------------------- 

576 # Create hashed version 

577 # ------------------------------------------------------------------------- 

578 

def hashed(
    self,
    include_frequencies: bool = True,
    include_other_info: bool = False,
) -> "Person":
    """
    Returns a :class:`Person` object but with all the elements hashed (if
    they are not blank).

    Note that you do NOT need to do this just to write a hashed version to
    disk. This function is primarily for comparing an entire sample of
    hashed people to plaintext people, or vice versa; we hash the plaintext
    version first.

    Args:
        include_frequencies:
            Include frequency information. If you don't, this makes the
            resulting file suitable for use as a sample, but not as a
            proband file.
        include_other_info:
            include the (potentially identifying) ``other_info`` data?
            Usually ``False``; may be ``True`` for validation.
    """
    # Round trip: hashed dictionary out, new Person back in.
    return self.from_json_dict(
        self.cfg,
        self.as_dict(
            hashed=True,
            include_frequencies=include_frequencies,
            include_other_info=include_other_info,
        ),
    )

608 

609 # ------------------------------------------------------------------------- 

610 # Main comparison function 

611 # ------------------------------------------------------------------------- 

612 

def log_odds_same(self, candidate: "Person") -> float:
    """
    Returns the log odds that ``self`` (the proband) and ``candidate`` are
    the same person.

    Starts from the baseline log odds and applies every comparison
    generated for this pair via ``bayes_compare``.

    Args:
        candidate: another :class:`Person` object

    Returns:
        float: the log odds they're the same person
    """
    # High speed function.
    prior = self.baseline_log_odds_same_person
    comparisons = self._gen_comparisons(candidate)
    return bayes_compare(log_odds=prior, comparisons=comparisons)

629 

630 # ------------------------------------------------------------------------- 

631 # Comparison helper functions 

632 # ------------------------------------------------------------------------- 

633 

def _gen_comparisons(
    self, candidate: "Person"
) -> Generator[Optional[Comparison], None, None]:
    """
    Generates all relevant comparisons between ``self`` (the proband) and
    ``candidate``, in the order: surnames, forenames, DOB, gender,
    postcodes.

    Args:
        candidate: another :class:`Person` object.

    **Note**

    In general, frequency information is associated with the proband,
    not the candidate, so use ``self.thing.comparison(candidate.thing)``.

    """
    # A perfect match would already have been tested for. The shortlisting
    # process may already have ensured a DOB partial match, or maybe not.
    # Regardless, there are no identifiers that will cause a complete
    # disqualification if they mismatch, so order here becomes unimportant
    # for speed.
    proband = self

    # Surnames: compared as unordered sets.
    yield from gen_best_comparisons(
        proband_identifiers=proband.surnames,
        candidate_identifiers=candidate.surnames,
        ordered=False,
    )

    # Forenames: order matters, with the configured ``p_u_forename``.
    yield from gen_best_comparisons(
        proband_identifiers=proband.forenames,
        candidate_identifiers=candidate.forenames,
        ordered=True,
        p_u=proband.cfg.p_u_forename,
    )

    # DOB (see above).
    # There is no special treatment of 29 Feb (since this DOB is
    # approximately 4 times less common than other birthdays, in principle
    # it does merit special treatment, but we ignore that).
    dob_comparison = proband.dob.comparison(candidate.dob)
    yield dob_comparison

    # Gender
    gender_comparison = proband.gender.comparison(candidate.gender)
    yield gender_comparison

    # Postcodes: compared as unordered sets.
    yield from gen_best_comparisons(
        proband_identifiers=proband.postcodes,
        candidate_identifiers=candidate.postcodes,
        ordered=False,
    )

685 

686 # ------------------------------------------------------------------------- 

687 # Info functions 

688 # ------------------------------------------------------------------------- 

689 

690 def is_plaintext(self) -> bool: 

691 """ 

692 Is this a plaintext (identifiable) Person? 

693 """ 

694 return self._is_plaintext 

695 

696 def is_hashed(self) -> bool: 

697 """ 

698 Is this a hashed (de-identified) Person? 

699 """ 

700 return not self.is_plaintext() 

701 

702 def n_forenames(self) -> int: 

703 """ 

704 Number of forenames 

705 """ 

706 return len(self.forenames) 

707 

708 def has_dob(self) -> bool: 

709 """ 

710 Do we have a DOB? 

711 """ 

712 return bool(self.dob) 

713 

714 def n_postcodes(self) -> int: 

715 """ 

716 How many postcodes does this person have? 

717 """ 

718 return len(self.postcodes) 

719 

720 # ------------------------------------------------------------------------- 

721 # Validation 

722 # ------------------------------------------------------------------------- 

723 

724 def ensure_valid_as_proband(self) -> None: 

725 """ 

726 Ensures this person has sufficient information to act as a proband, or 

727 raises :exc:`ValueError`. 

728 

729 We previously required a DOB unless debugging, but no longer. 

730 """ 

731 for f in self.forenames: 

732 f.ensure_has_freq_info_if_id_present() 

733 for s in self.surnames: 

734 s.ensure_has_freq_info_if_id_present() 

735 self.dob.ensure_has_freq_info_if_id_present() 

736 self.gender.ensure_has_freq_info_if_id_present() 

737 for p in self.postcodes: 

738 p.ensure_has_freq_info_if_id_present() 

739 

740 def ensure_valid_as_candidate(self) -> None: 

741 """ 

742 Ensures this person has sufficient information to act as a candidate, 

743 or raises :exc:`AssertionError`. 

744 

745 We previously required a DOB unless debugging, but no longer. 

746 """ 

747 pass 

748 

749 # ------------------------------------------------------------------------- 

750 # Debugging functions to check this object 

751 # ------------------------------------------------------------------------- 

752 

753 def debug_gen_identifiers(self) -> Generator[Identifier, None, None]: 

754 """ 

755 Yield all identifiers. 

756 """ 

757 yield from self.forenames 

758 yield from self.surnames 

759 if self.dob: 

760 yield self.dob 

761 if self.gender: 

762 yield self.gender 

763 yield from self.postcodes 

764 

def debug_comparison_report(
    self, candidate: "Person", verbose: bool = True
) -> str:
    """
    Compare a person with another, log every step of the way, and return
    the result as a string.

    The report shows both people (and, if ``verbose``, each of their
    identifiers), their dictionary representations, and then the running
    log odds after the baseline and after each non-null comparison.

    Each person's dictionary is produced in that person's own
    plaintext/hashed state, so the report also works for hashed people
    (asking a hashed person for a plaintext dictionary raises
    :exc:`AssertionError`).

    Args:
        candidate: the person to compare ``self`` (the proband) against
        verbose: list every identifier of both people?

    Returns:
        the report, with blank entries removed, joined by newlines
    """
    lines = []  # type: List[str]

    def report(msg_: str) -> None:
        # Reads log_odds (assigned below) at call time.
        lines.append(f"{msg_} -> log_odds = {log_odds}")

    if verbose:
        spacer = " - "
        self_id = (
            "\n".join(
                spacer + repr(i) for i in self.debug_gen_identifiers()
            )
            + "\n"
        )
        candidate_id = (
            "\n".join(
                spacer + repr(i) for i in candidate.debug_gen_identifiers()
            )
            + "\n"
        )
    else:
        self_id = ""
        candidate_id = ""

    # Keep each person's current state: plaintext stays plaintext, and
    # hashed stays hashed (it is not hashed a second time).
    self_dict = self.as_dict(hashed=self.is_hashed())
    candidate_dict = candidate.as_dict(hashed=candidate.is_hashed())

    lines.append("VERBOSE COMPARISON:")
    lines.append(f"- self (proband) = {self}")
    lines.append(self_id)
    lines.append(f"- candidate = {candidate}")
    lines.append(candidate_id)
    lines.append(f"- self dict = {self_dict}")
    lines.append(self_id)
    lines.append(f"- candidate dict = {candidate_dict}")
    lines.append(candidate_id)

    log_odds = self.cfg.baseline_log_odds_same_person
    report("Baseline")
    for comp in self._gen_comparisons(candidate=candidate):
        if not comp:
            continue
        log_odds = comp.posterior_log_odds(log_odds)
        report(str(comp))

    # Drop empty entries (e.g. the identifier listings when not verbose).
    return "\n".join(filter(None, lines))

813 

def debug_compare(self, candidate: "Person", verbose: bool = True) -> None:
    """
    Compare a person with another, and log every step of the way: the
    comparison report is written to the module log at INFO level.
    """
    report_text = self.debug_comparison_report(candidate, verbose=verbose)
    log.info(report_text)

819 

820 # ------------------------------------------------------------------------- 

821 # Debugging functions to mutate this object 

822 # ------------------------------------------------------------------------- 

823 

824 def debug_delete_something(self) -> None: 

825 """ 

826 Randomly delete one of: a forename, or a postcode. 

827 """ 

828 n_forenames = self.n_forenames() 

829 n_postcodes = self.n_postcodes() 

830 n_possibilities = n_forenames + n_postcodes 

831 if n_possibilities == 0: 

832 log.warning(f"Unable to delete info from {self}") 

833 return 

834 which = random.randint(0, n_possibilities - 1) 

835 

836 if which < n_forenames: 

837 del self.forenames[which] 

838 return 

839 which -= n_forenames 

840 

841 del self.postcodes[which] 

842 

def debug_mutate_something(self) -> None:
    """
    Randomly mutate one of: a forename, or a postcode.

    Every forename and postcode is equally likely to be chosen; the chosen
    one (which must be plaintext) is replaced by a new object built from a
    mutated copy of its value. If there are none, a warning is logged and
    nothing is changed.
    """
    forename_count = self.n_forenames()
    postcode_count = self.n_postcodes()
    total = forename_count + postcode_count
    if total == 0:
        log.warning(f"Unable to mutate info from {self}")
        return

    # One index across the forenames followed by the postcodes.
    pick = random.randrange(total)
    cfg = self.cfg

    if pick >= forename_count:
        postcode_index = pick - forename_count
        old_postcode = self.postcodes[postcode_index]
        assert old_postcode.is_plaintext
        mutated_unit = mutate_postcode(old_postcode.postcode_unit, cfg)
        self.postcodes[postcode_index] = Postcode(
            cfg, postcode=mutated_unit
        )
        return

    old_forename = self.forenames[pick]
    assert old_forename.is_plaintext
    mutated_name = mutate_name(old_forename.name)
    self.forenames[pick] = Forename(
        cfg, name=mutated_name, gender=old_forename.gender
    )