Coverage for linkage/person.py: 71%
303 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1r"""
2crate_anon/linkage/person.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Person representations for fuzzy matching.**
28"""
30# =============================================================================
31# Imports
32# =============================================================================
34import json
35import logging
36import random
37from typing import (
38 Any,
39 Dict,
40 Generator,
41 List,
42 Optional,
43 Union,
44)
46from cardinal_pythonlib.reprfunc import auto_repr
48from crate_anon.linkage.comparison import bayes_compare, Comparison
49from crate_anon.linkage.helpers import (
50 getdictval,
51 mutate_name,
52 mutate_postcode,
53)
54from crate_anon.linkage.identifiers import (
55 DateOfBirth,
56 Forename,
57 gen_best_comparisons,
58 Gender,
59 Identifier,
60 PerfectID,
61 Postcode,
62 Surname,
63 TemporalIDHolder,
64)
65from crate_anon.linkage.matchconfig import MatchConfig
67log = logging.getLogger(__name__)
70# =============================================================================
71# Person
72# =============================================================================
75class Person:
76 """
77 A proper representation of a person that can do hashing and comparisons.
78 The information may be incomplete or slightly wrong.
79 Includes frequency information and requires a config.
80 """
82 # -------------------------------------------------------------------------
83 # Class attributes
84 # -------------------------------------------------------------------------
class PersonKey:
    # Attribute names for a person. Each value is used both as a column/JSON
    # key and as the matching keyword argument/attribute name on Person.
    # NB: definition order matters, because ALL_PERSON_KEYS is built by
    # walking this class's namespace in order.

    # Person ID within the relevant database (proband or sample):
    LOCAL_ID = "local_id"
    FORENAMES = "forenames"
    SURNAMES = "surnames"
    DOB = "dob"
    GENDER = "gender"
    POSTCODES = "postcodes"
    PERFECT_ID = "perfect_id"
    # Free-form extra information that the user may want to attach:
    OTHER_INFO = "other_info"
# The *body* of a comprehension cannot see class-scope names such as
# PersonKey (see https://stackoverflow.com/questions/13905741), so
# [getattr(PersonKey, x) for x in vars(PersonKey)...] fails here. However,
# the outermost iterable of a comprehension IS evaluated in the enclosing
# (class) scope, so iterating over vars(PersonKey).items() and touching
# only the loop variables works, and leaves no temporary name behind.
ALL_PERSON_KEYS = [
    key_value
    for attr_name, key_value in vars(PersonKey).items()
    if not attr_name.startswith("_")
]  # type: List[str]

# Fields holding identifiers that may carry associated dates:
TEMPORAL_IDENTIFIERS = [
    PersonKey.FORENAMES,
    PersonKey.SURNAMES,
    PersonKey.POSTCODES,
]
# For reading CSV: fields that may hold several semicolon-separated values.
SEMICOLON_DELIMIT = TEMPORAL_IDENTIFIERS + [PersonKey.PERFECT_ID]

# User-facing help text describing the plaintext CSV input format.
PLAINTEXT_CSV_FORMAT_HELP = (
    f"(1) CSV format with header row. Columns: {ALL_PERSON_KEYS}. "
    f"(2) Semicolon-separated values are allowed within "
    f"{SEMICOLON_DELIMIT}. "
    f"(3) The fields {TEMPORAL_IDENTIFIERS} are in TemporalIdentifier "
    f"format. {Identifier.TEMPORAL_ID_FORMAT_HELP} "
    f"(4) {PersonKey.PERFECT_ID}, if specified, contains one or more "
    f"perfect person identifiers as key:value pairs, e.g. "
    f"'nhs:12345;ni:AB6789XY'. The keys will be forced to lower case; "
    f"values will be forced to upper case. "
    f"(5) {PersonKey.OTHER_INFO!r} is an arbitrary string for you to use "
    f"(e.g. for validation)."
)
# User-facing help text describing the hashed JSON Lines format.
HASHED_JSONLINES_FORMAT_HELP = (
    "File created by CRATE in JSON Lines (.jsonl) format. (You could use "
    "the 'jq' tool to inspect these.)"
)
136 # -------------------------------------------------------------------------
137 # Creation
138 # -------------------------------------------------------------------------
def __init__(
    self,
    cfg: MatchConfig,
    local_id: str = "",
    other_info: str = "",
    forenames: List[Union[None, str, TemporalIDHolder, Forename]] = None,
    surnames: List[Union[None, str, TemporalIDHolder, Surname]] = None,
    dob: Union[None, str, DateOfBirth] = "",
    gender: Union[None, str, Gender] = "",
    postcodes: List[Union[None, str, TemporalIDHolder, Postcode]] = None,
    perfect_id: Union[None, Dict[str, Any], PerfectID] = None,
) -> None:
    """
    Build a person from raw values and/or ready-made identifier objects.

    Args:
        cfg:
            The config object (must be a :class:`MatchConfig`).
        local_id:
            Identifier within this person's local database (e.g. proband ID
            or sample ID). Typically a research pseudonym, not itself
            identifying. Must not be empty.
        other_info:
            String containing any other attributes the user may wish to
            remember (e.g. in JSON). Only used for validation research
            (e.g. ensuring linkage is not biased by ethnicity).
        forenames:
            The person's forenames (given names, first/middle names), as
            strings or Forename or TemporalIDHolder objects.
        surnames:
            The person's surname(s), as strings or Surname or
            TemporalIDHolder objects.
        dob:
            The date of birth, in ISO 8601 "YYYY-MM-DD" string format,
            or as a DateOfBirth object, or None, or ''.
        gender:
            The gender: 'M', 'F', 'X', or '', or None, or a Gender object.
        postcodes:
            Any UK postcodes for this person, with optional associated
            dates.
        perfect_id:
            Any named person-unique identifiers (e.g. UK NHS numbers, UK
            National Insurance numbers), for non-fuzzy matching. Dictionary
            keys will be forced to lower case, and dictionary values to
            upper case.

    Raises:
        ValueError:
            for an empty ``local_id``, a non-string ``other_info``, a
            non-list ``forenames``/``surnames``/``postcodes``, an element
            of unsupported type, or a mixture of plaintext and hashed
            identifiers.
    """
    # Unknown until the first identifier has been registered.
    self._is_plaintext = None  # type: Optional[bool]

    def chk_plaintext(new_identifier: Identifier) -> None:
        """
        Ensure we don't mix plaintext and hashed data. The first
        identifier seen decides which kind this Person holds.
        """
        incoming = new_identifier.is_plaintext
        if self._is_plaintext is None:
            self._is_plaintext = incoming
            return
        if incoming == self._is_plaintext:
            return
        new = self.plain_or_hashed_txt(incoming)
        old = self.plain_or_hashed_txt(self._is_plaintext)
        raise ValueError(
            f"Trying to add {new} information to a Person containing "
            f"only {old} information; new data was "
            f"{new_identifier!r}; current is {self!r}"
        )

    assert isinstance(cfg, MatchConfig)
    self.cfg = cfg
    # Cached on the instance for speed.
    self.baseline_log_odds_same_person = (
        self.cfg.baseline_log_odds_same_person
    )

    # ---- local_id (mandatory) ----
    self.local_id = None if local_id is None else str(local_id)
    if not self.local_id:
        raise ValueError(f"Bad local_id: {local_id!r}")

    # ---- other_info ----
    self.other_info = other_info or ""
    if not isinstance(self.other_info, str):
        raise ValueError(f"Bad other_info: {self.other_info!r}")

    # ---- gender (processed first: the name constructors below use it) ----
    # Only None is replaced. DO NOT write "gender = gender or ''":
    # bool(Gender(cfg, gender="")) == False, so that would discard a
    # Gender object.
    if gender is None:
        gender = ""
    self.gender = (
        gender if isinstance(gender, Gender) else Gender(cfg=cfg, gender=gender)
    )
    chk_plaintext(self.gender)

    # ---- forenames ----
    forenames = forenames or []
    if not isinstance(forenames, list):
        raise ValueError(f"Bad forenames: {forenames!r}")
    self.forenames = []  # type: List[Forename]
    for raw_forename in forenames:
        if not raw_forename:  # None or ""
            continue
        if isinstance(raw_forename, str):
            forename = Forename(
                cfg=cfg, name=raw_forename, gender=self.gender.gender_str
            )
        elif isinstance(raw_forename, TemporalIDHolder):
            # NOTE(review): unlike the str branch, no gender is passed
            # here -- confirm that this is intended.
            forename = Forename(
                cfg=cfg,
                name=raw_forename.identifier,
                start_date=raw_forename.start_date,
                end_date=raw_forename.end_date,
            )
        elif isinstance(raw_forename, Forename):
            forename = raw_forename
        else:
            raise ValueError(f"Bad forename: {raw_forename!r}")
        if forename:  # skip blank names not detected above
            chk_plaintext(forename)
            self.forenames.append(forename)

    # ---- surnames ----
    surnames = surnames or []
    if not isinstance(surnames, list):
        raise ValueError(f"Bad surnames: {surnames!r}")
    self.surnames = []  # type: List[Surname]
    for raw_surname in surnames:
        if not raw_surname:
            continue
        if isinstance(raw_surname, str):
            surname = Surname(
                cfg=cfg, name=raw_surname, gender=self.gender.gender_str
            )
        elif isinstance(raw_surname, TemporalIDHolder):
            # NOTE(review): as for forenames, no gender is passed here.
            surname = Surname(
                cfg=cfg,
                name=raw_surname.identifier,
                start_date=raw_surname.start_date,
                end_date=raw_surname.end_date,
            )
        elif isinstance(raw_surname, Surname):
            surname = raw_surname
        else:
            raise ValueError(f"Bad surname: {raw_surname!r}")
        if surname:  # skip blank names not detected above
            chk_plaintext(surname)
            self.surnames.append(surname)

    # ---- dob ----
    # Highly desirable for real work, but not mandatory; we also want to
    # be able to create Person objects without a DOB for testing.
    if dob is None:
        dob = ""
    if isinstance(dob, DateOfBirth):
        self.dob = dob
    else:
        self.dob = DateOfBirth(cfg=cfg, dob=dob or "")
    chk_plaintext(self.dob)

    # ---- postcodes ----
    postcodes = postcodes or []
    if not isinstance(postcodes, list):
        raise ValueError(f"Bad postcodes: {postcodes!r}")
    self.postcodes = []  # type: List[Postcode]
    for raw_postcode in postcodes:
        if not raw_postcode:  # None or ""
            continue
        if isinstance(raw_postcode, str):
            postcode = Postcode(cfg=cfg, postcode=raw_postcode)
        elif isinstance(raw_postcode, TemporalIDHolder):
            postcode = Postcode(
                cfg=cfg,
                postcode=raw_postcode.identifier,
                start_date=raw_postcode.start_date,
                end_date=raw_postcode.end_date,
            )
        elif isinstance(raw_postcode, Postcode):
            postcode = raw_postcode
        else:
            raise ValueError(
                f"Bad data structure for postcode: {raw_postcode!r}"
            )
        if postcode:  # skip blanks not detected above
            chk_plaintext(postcode)
            self.postcodes.append(postcode)

    # ---- perfect_id ----
    if isinstance(perfect_id, PerfectID):
        self.perfect_id = perfect_id
    else:
        self.perfect_id = PerfectID(cfg=cfg, identifiers=perfect_id)
    chk_plaintext(self.perfect_id)
318 @staticmethod
319 def plain_or_hashed_txt(plaintext: bool) -> str:
320 """
321 Used for error messages.
322 """
323 return "plaintext" if plaintext else "hashed"
@classmethod
def from_plaintext_csv(
    cls, cfg: MatchConfig, rowdict: Dict[str, str]
) -> "Person":
    """
    Returns a person object from a CSV row.

    The instance is created via ``cls``, so calling this on a subclass
    yields an instance of that subclass.

    Args:
        cfg: a configuration object
        rowdict: a CSV row, read via :class:`csv.DictReader`. It must
            contain every column named in ``ALL_PERSON_KEYS``.

    Raises:
        KeyError: if a column from ``ALL_PERSON_KEYS`` is missing.
    """
    kwargs = {}  # type: Dict[str, Any]
    for attr in cls.ALL_PERSON_KEYS:
        value = rowdict[attr]
        if attr in cls.SEMICOLON_DELIMIT:
            # Multi-valued field: parse into identifier object(s).
            value = cls._get_semicolon_delimited_value(cfg, attr, value)
        # Column names double as the constructor's keyword names.
        kwargs[attr] = value
    return cls(cfg=cfg, **kwargs)
@classmethod
def _get_semicolon_delimited_value(
    cls, cfg: MatchConfig, attr: str, value: Any
) -> Union[PerfectID, list[TemporalIDHolder]]:
    """
    Parse the raw CSV text of one semicolon-delimited field.

    Args:
        cfg: a configuration object
        attr: the field name (one of ``SEMICOLON_DELIMIT``)
        value: the raw text of the field

    Returns:
        a :class:`PerfectID` for the perfect-ID field; otherwise a list of
        :class:`TemporalIDHolder` objects.
    """
    if attr != cls.PersonKey.PERFECT_ID:
        # Every other semicolon-delimited field must be a temporal one.
        assert attr in cls.TEMPORAL_IDENTIFIERS
        return cls._get_temporal_id_holder_list(cfg, value)

    return PerfectID.from_plaintext_str(cfg, value)
@classmethod
def _get_temporal_id_holder_list(
    cls, cfg: MatchConfig, value: Any
) -> list[TemporalIDHolder]:
    """
    Split a semicolon-delimited string into temporal identifier holders.

    Elements are stripped of surrounding whitespace *before* blank ones are
    dropped, so input such as ``"Alice; ;Bob"`` or ``"Alice; "`` does not
    produce an empty identifier. A missing value (``None``, as
    :class:`csv.DictReader` supplies for short rows) or an empty string
    gives an empty list.

    Args:
        cfg: a configuration object
        value: the raw text of the field, or ``None``

    Returns:
        one :class:`TemporalIDHolder` per non-blank element, in order.
    """
    if not value:
        return []
    stripped = (element.strip() for element in value.split(";"))
    return [
        TemporalIDHolder.from_plaintext_str(cfg, element)
        for element in stripped
        if element
    ]
@classmethod
def from_json_dict(
    cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool = True
) -> "Person":
    """
    Restore a hashed or plaintext version from a dictionary (which has been
    read from JSONL).

    The instance is created via ``cls``, so calling this on (an instance
    of) a subclass yields an instance of that subclass.

    Args:
        cfg: a configuration object
        d: the dictionary, keyed by the ``PersonKey`` names
        hashed: passed to each identifier's ``from_dict()``; does the
            dictionary hold hashed data?

    Raises:
        ValueError: if a forename/surname/postcode list contains an item
            that is not a dictionary.
    """
    pk = cls.PersonKey

    def restore_list(key: str, identifier_class: Any) -> List[Any]:
        """
        Convert the list of dictionaries stored under ``key`` into
        identifier objects, validating each item first.
        """
        restored = []  # type: List[Any]
        for item in getdictval(d, key, list):
            if not isinstance(item, dict):
                raise ValueError(
                    f"{key} contains something that is not a dict: {item!r}"
                )
            restored.append(identifier_class.from_dict(cfg, item, hashed))
        return restored

    forenames = restore_list(pk.FORENAMES, Forename)
    surnames = restore_list(pk.SURNAMES, Surname)
    postcodes = restore_list(pk.POSTCODES, Postcode)
    return cls(
        cfg=cfg,
        local_id=getdictval(d, pk.LOCAL_ID, str),
        # other_info is optional: it is only written on request.
        other_info=getdictval(d, pk.OTHER_INFO, str, mandatory=False),
        forenames=forenames,
        surnames=surnames,
        dob=DateOfBirth.from_dict(
            cfg, getdictval(d, pk.DOB, dict), hashed
        ),
        gender=Gender.from_dict(
            cfg, getdictval(d, pk.GENDER, dict), hashed
        ),
        postcodes=postcodes,
        perfect_id=PerfectID.from_dict(
            cfg, getdictval(d, pk.PERFECT_ID, dict), hashed
        ),
    )
@classmethod
def from_json_str(cls, cfg: MatchConfig, s: str) -> "Person":
    """
    Restore a hashed version from a string representing JSON.

    Args:
        cfg: a configuration object
        s: JSON text; decoded and handed to :meth:`from_json_dict` with
            its default settings
    """
    return cls.from_json_dict(cfg, json.loads(s))
418 # -------------------------------------------------------------------------
419 # Equality, hashing -- local_id should be unique
420 # -------------------------------------------------------------------------
421 # Be careful:
422 # - https://inventwithpython.com/blog/2019/02/01/hashable-objects-must-be-immutable/ # noqa: E501
423 # - https://docs.python.org/3/glossary.html [re "hashable"]
424 # Here, we define equality based on local_id, which will not change. In
425 # practice, nothing else will either.
def __eq__(self, other: object) -> bool:
    """
    Two people are equal if they share a ``local_id``.

    Comparison with anything that is not a Person returns
    ``NotImplemented`` (so Python falls back to its default handling and
    ``person == 5`` is simply ``False``) rather than raising
    ``AttributeError``.
    """
    if not isinstance(other, Person):
        return NotImplemented
    return self.local_id == other.local_id
def __hash__(self) -> int:
    """
    Hash on ``local_id`` alone, the same attribute that equality uses.
    """
    identity = self.local_id
    return hash(identity)
433 # -------------------------------------------------------------------------
434 # Representation: string
435 # -------------------------------------------------------------------------
def __repr__(self):
    """
    Debugging representation, delegated to ``auto_repr``.
    """
    representation = auto_repr(self)
    return representation
def __str__(self) -> str:
    """
    Human-readable summary. A hashed person shows only its ``local_id``;
    a plaintext person shows all its details.
    """
    if self.is_hashed():
        return f"Person<HASHED, local_id={self.local_id!r}>"

    pk = self.PersonKey
    # Forenames first, then surnames, space-separated.
    names = " ".join(map(str, [*self.forenames, *self.surnames]))
    postcodes = " - ".join(map(str, self.postcodes))
    parts = (
        f"{pk.LOCAL_ID}={self.local_id}",
        f"{pk.PERFECT_ID}={self.perfect_id}",
        f"name={names}",
        f"{pk.GENDER}={self.gender}",
        f"{pk.DOB}={self.dob}",
        f"{pk.POSTCODES}={postcodes}",
        f"{pk.OTHER_INFO}={self.other_info!r}",
    )
    details = ", ".join(parts)
    return f"Person<{details}>"
461 # -------------------------------------------------------------------------
462 # Representation: CSV
463 # -------------------------------------------------------------------------
@classmethod
def plaintext_csv_columns(cls) -> List[str]:
    """
    CSV column names -- including user-specified "other" information.

    Returns:
        the class's ``ALL_PERSON_KEYS`` list itself (not a copy).
    """
    columns = cls.ALL_PERSON_KEYS
    return columns
def plaintext_csv_dict(self) -> Dict[str, str]:
    """
    Returns a dictionary suitable for :class:`csv.DictWriter`.
    This is for writing identifiable content.

    Multi-valued fields are written as semicolon-joined strings, except
    the perfect-ID field, which is written via its own ``str()``.
    """
    perfect_id_key = self.PersonKey.PERFECT_ID
    row = {}  # type: Dict[str, str]
    for key in self.ALL_PERSON_KEYS:
        attr_value = getattr(self, key)
        join_items = key != perfect_id_key and key in self.SEMICOLON_DELIMIT
        if join_items:
            row[key] = ";".join(map(str, attr_value))
        else:
            row[key] = str(attr_value)
    return row
487 # -------------------------------------------------------------------------
488 # Representation: JSON
489 # -------------------------------------------------------------------------
def as_dict(
    self,
    hashed: bool = True,
    include_frequencies: bool = True,
    include_other_info: bool = False,
) -> Dict[str, Any]:
    """
    Dictionary representation, for JSON.

    Args:
        hashed:
            Create a hashed/encrypted version?
        include_frequencies:
            Include frequency information. If you don't, this makes the
            resulting file suitable for use as a sample, but not as a
            proband file.
        include_other_info:
            include the (potentially identifying) ``other_info`` data?
            Usually ``False``; may be ``True`` for validation.

    Raises:
        AssertionError:
            if a plaintext version is requested from a hashed Person.
    """
    pk = self.PersonKey
    holds_plaintext = self._is_plaintext

    if not hashed and not holds_plaintext:
        raise AssertionError("Can't create plaintext from hashed Person")

    # Encrypt only when going from plaintext to hashed. Data that is
    # already hashed must not be hashed twice, and a plaintext request
    # leaves everything as it is.
    encrypt = bool(hashed and holds_plaintext)
    if encrypt:
        local_id = self.cfg.local_id_hash_fn(self.local_id)
    else:
        local_id = self.local_id

    def many(identifiers: Any) -> List[Any]:
        return [i.as_dict(encrypt, include_frequencies) for i in identifiers]

    result = {
        pk.LOCAL_ID: local_id,
        pk.FORENAMES: many(self.forenames),
        pk.SURNAMES: many(self.surnames),
        pk.DOB: self.dob.as_dict(encrypt, include_frequencies),
        pk.GENDER: self.gender.as_dict(encrypt, include_frequencies),
        pk.POSTCODES: many(self.postcodes),
        pk.PERFECT_ID: self.perfect_id.as_dict(encrypt),
    }
    if include_other_info:
        result[pk.OTHER_INFO] = self.other_info
    return result
549 # -------------------------------------------------------------------------
550 # Copying
551 # -------------------------------------------------------------------------
def copy(self) -> "Person":
    """
    Returns a copy of this object.

    - :func:`copy.deepcopy` is incredibly slow, yet :func:`copy.copy` isn't
      enough when we want to mutate this object.
    - An earlier approach copied attributes quasi-manually, using
      ``[copy.copy(x) for x in value]`` for list values.
    - Since we can already convert to/from a dictionary representation,
      the copy is made by a round trip through that, keeping frequencies
      and ``other_info`` and preserving the hashed/plaintext state.
    """
    currently_hashed = self.is_hashed()
    snapshot = self.as_dict(
        hashed=currently_hashed,
        include_frequencies=True,
        include_other_info=True,
    )
    return self.from_json_dict(self.cfg, snapshot, hashed=currently_hashed)
575 # -------------------------------------------------------------------------
576 # Created hashed version
577 # -------------------------------------------------------------------------
def hashed(
    self,
    include_frequencies: bool = True,
    include_other_info: bool = False,
) -> "Person":
    """
    Returns a person object but with all the elements hashed (if they are
    not blank).

    Note that you do NOT need to do this just to write a hashed version to
    disk. This function is primarily for comparing an entire sample of
    hashed people to plaintext people, or vice versa; we hash the plaintext
    version first.

    Args:
        include_frequencies:
            Include frequency information. If you don't, this makes the
            resulting file suitable for use as a sample, but not as a
            proband file.
        include_other_info:
            include the (potentially identifying) ``other_info`` data?
            Usually ``False``; may be ``True`` for validation.
    """
    # Round trip: hashed dictionary out, new object back in.
    hashed_dict = self.as_dict(
        hashed=True,
        include_frequencies=include_frequencies,
        include_other_info=include_other_info,
    )
    return self.from_json_dict(self.cfg, hashed_dict)
609 # -------------------------------------------------------------------------
610 # Main comparison function
611 # -------------------------------------------------------------------------
def log_odds_same(self, candidate: "Person") -> float:
    """
    Returns the log odds that ``self`` (the proband) and ``candidate`` are
    the same person.

    Starts from the cached baseline log odds and feeds every comparison
    from :meth:`_gen_comparisons` to ``bayes_compare``.

    Args:
        candidate: another person object

    Returns:
        float: the log odds they're the same person
    """
    # High speed function: the comparisons are passed on as a generator.
    comparisons = self._gen_comparisons(candidate)
    return bayes_compare(
        log_odds=self.baseline_log_odds_same_person,
        comparisons=comparisons,
    )
630 # -------------------------------------------------------------------------
631 # Comparison helper functions
632 # -------------------------------------------------------------------------
def _gen_comparisons(
    self, candidate: "Person"
) -> Generator[Optional[Comparison], None, None]:
    """
    Generates all relevant comparisons, in the order: surnames, forenames,
    DOB, gender, postcodes.

    Args:
        candidate: another person object.

    **Note**

    In general, frequency information is associated with the proband,
    not the candidate, so use ``self.thing.comparison(candidate.thing)``.
    """
    # A perfect match would already have been tested for. The shortlisting
    # process may or may not have ensured a DOB partial match. Either way,
    # no identifier causes complete disqualification on mismatch, so the
    # order here is unimportant for speed.
    proband = self

    # Surnames: order is not significant.
    yield from gen_best_comparisons(
        proband_identifiers=proband.surnames,
        candidate_identifiers=candidate.surnames,
        ordered=False,
    )

    # Forenames: order is significant.
    yield from gen_best_comparisons(
        proband_identifiers=proband.forenames,
        candidate_identifiers=candidate.forenames,
        ordered=True,
        p_u=proband.cfg.p_u_forename,
    )

    # DOB. There is no special treatment of 29 Feb (this DOB is roughly 4
    # times less common than other birthdays, so in principle it merits
    # special treatment, but we ignore that).
    yield proband.dob.comparison(candidate.dob)

    # Gender.
    yield proband.gender.comparison(candidate.gender)

    # Postcodes: order is not significant.
    yield from gen_best_comparisons(
        proband_identifiers=proband.postcodes,
        candidate_identifiers=candidate.postcodes,
        ordered=False,
    )
686 # -------------------------------------------------------------------------
687 # Info functions
688 # -------------------------------------------------------------------------
def is_plaintext(self) -> bool:
    """
    Is this a plaintext (identifiable) Person?

    Returns the flag recorded when the identifiers were registered.
    """
    plaintext_flag = self._is_plaintext
    return plaintext_flag
def is_hashed(self) -> bool:
    """
    Is this a hashed (de-identified) Person? The logical opposite of
    :meth:`is_plaintext`.
    """
    if self.is_plaintext():
        return False
    return True
def n_forenames(self) -> int:
    """
    How many forenames does this person have?
    """
    forename_count = len(self.forenames)
    return forename_count
def has_dob(self) -> bool:
    """
    Do we have a DOB? (Judged by the truthiness of the DOB identifier.)
    """
    return True if self.dob else False
def n_postcodes(self) -> int:
    """
    Number of postcodes held for this person.
    """
    postcode_count = len(self.postcodes)
    return postcode_count
720 # -------------------------------------------------------------------------
721 # Validation
722 # -------------------------------------------------------------------------
def ensure_valid_as_proband(self) -> None:
    """
    Ensures this person has sufficient information to act as a proband, or
    raises :exc:`ValueError`.

    Each forename, surname, the DOB, the gender, and each postcode is asked
    (in that order) to check that it has frequency information if it holds
    an identifier.

    We previously required a DOB unless debugging, but no longer.
    """
    identifiers_in_order = (
        *self.forenames,
        *self.surnames,
        self.dob,
        self.gender,
        *self.postcodes,
    )
    for identifier in identifiers_in_order:
        identifier.ensure_has_freq_info_if_id_present()
def ensure_valid_as_candidate(self) -> None:
    """
    Ensures this person has sufficient information to act as a candidate,
    or raises :exc:`AssertionError`.

    Currently nothing is checked: we previously required a DOB unless
    debugging, but no longer.
    """
    return None
749 # -------------------------------------------------------------------------
750 # Debugging functions to check this object
751 # -------------------------------------------------------------------------
def debug_gen_identifiers(self) -> Generator[Identifier, None, None]:
    """
    Yield all identifiers: forenames, surnames, DOB (if truthy), gender (if
    truthy), then postcodes.
    """
    yield from self.forenames
    yield from self.surnames
    # Single-valued identifiers are skipped when blank.
    for single_identifier in (self.dob, self.gender):
        if single_identifier:
            yield single_identifier
    yield from self.postcodes
def debug_comparison_report(
    self, candidate: "Person", verbose: bool = True
) -> str:
    """
    Compare a person with another, log every step of the way, and return
    the result as a string.

    Each person's dictionary is produced in that person's own state
    (hashed or plaintext), so the report also works for hashed people;
    asking a hashed person for a plaintext dictionary would raise
    :exc:`AssertionError`.

    Args:
        candidate: the person to compare with ``self`` (the proband)
        verbose: also list the ``repr()`` of every identifier of both
            people?

    Returns:
        the report, one entry per line, with blank entries removed.
    """
    lines = []  # type: List[str]

    def report(msg_: str) -> None:
        # Reads log_odds from the enclosing scope at call time, so each
        # entry shows the running value.
        lines.append(f"{msg_} -> log_odds = {log_odds}")

    def identifier_listing(person: "Person") -> str:
        # One line per identifier when verbose; otherwise blank (and so
        # dropped from the final report).
        if not verbose:
            return ""
        spacer = " - "
        return (
            "\n".join(
                spacer + repr(i) for i in person.debug_gen_identifiers()
            )
            + "\n"
        )

    self_id = identifier_listing(self)
    candidate_id = identifier_listing(candidate)
    # Never request plaintext from a hashed person (that would raise).
    self_dict = self.as_dict(hashed=self.is_hashed())
    candidate_dict = candidate.as_dict(hashed=candidate.is_hashed())

    lines.append("VERBOSE COMPARISON:")
    lines.append(f"- self (proband) = {self}")
    lines.append(self_id)
    lines.append(f"- candidate = {candidate}")
    lines.append(candidate_id)
    lines.append(f"- self dict = {self_dict}")
    lines.append(self_id)
    lines.append(f"- candidate dict = {candidate_dict}")
    lines.append(candidate_id)

    log_odds = self.cfg.baseline_log_odds_same_person
    report("Baseline")
    for comp in self._gen_comparisons(candidate=candidate):
        if not comp:
            continue  # no comparison was possible for this identifier
        log_odds = comp.posterior_log_odds(log_odds)
        report(str(comp))

    return "\n".join(filter(None, lines))
def debug_compare(self, candidate: "Person", verbose: bool = True) -> None:
    """
    Compare a person with another, and log every step of the way: the
    report from :meth:`debug_comparison_report` is logged at INFO level.
    """
    comparison_report = self.debug_comparison_report(
        candidate, verbose=verbose
    )
    log.info(comparison_report)
820 # -------------------------------------------------------------------------
821 # Debugging functions to mutate this object
822 # -------------------------------------------------------------------------
def debug_delete_something(self) -> None:
    """
    Randomly delete one of: a forename, or a postcode.

    Logs a warning and changes nothing if there are none of either.
    """
    forename_count = self.n_forenames()
    total = forename_count + self.n_postcodes()
    if total == 0:
        log.warning(f"Unable to delete info from {self}")
        return

    # One index across forenames followed by postcodes.
    which = random.randint(0, total - 1)
    if which < forename_count:
        del self.forenames[which]
    else:
        del self.postcodes[which - forename_count]
def debug_mutate_something(self) -> None:
    """
    Randomly mutate one of: a forename, or a postcode.

    Logs a warning and changes nothing if there are none of either. The
    chosen identifier must be plaintext; it is replaced by a newly built
    identifier holding the mutated value.
    """
    forename_count = self.n_forenames()
    total = forename_count + self.n_postcodes()
    if total == 0:
        log.warning(f"Unable to mutate info from {self}")
        return

    cfg = self.cfg
    # One index across forenames followed by postcodes.
    which = random.randrange(total)

    if which >= forename_count:
        postcode_index = which - forename_count
        old_postcode = self.postcodes[postcode_index]
        assert old_postcode.is_plaintext
        self.postcodes[postcode_index] = Postcode(
            cfg, postcode=mutate_postcode(old_postcode.postcode_unit, cfg)
        )
        return

    old_forename = self.forenames[which]
    assert old_forename.is_plaintext
    self.forenames[which] = Forename(
        cfg, name=mutate_name(old_forename.name), gender=old_forename.gender
    )