Coverage for linkage/identifiers.py: 86%
843 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1r"""
2crate_anon/linkage/identifiers.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Helper functions for linkage tools.**
28Represents various types of person identifier (e.g. name, postcode) that may be
29compared between two people.
31"""
33# =============================================================================
34# Imports
35# =============================================================================
37from abc import ABC, abstractmethod
38import logging
39from typing import (
40 Any,
41 Dict,
42 Generator,
43 List,
44 Optional,
45 Set,
46 Tuple,
47 Type,
48 Union,
49)
51from cardinal_pythonlib.datetimefunc import coerce_to_pendulum_date
52from cardinal_pythonlib.maths_py import round_sf
53from cardinal_pythonlib.reprfunc import auto_repr
54import pendulum
55from pendulum.parsing.exceptions import ParserError
56from pendulum import Date
58from crate_anon.linkage.constants import NONE_TYPE, Switches, VALID_GENDERS
59from crate_anon.linkage.comparison import (
60 AdjustLogOddsComparison,
61 CertainComparison,
62 Comparison,
63 DirectComparison,
64)
65from crate_anon.linkage.helpers import (
66 get_first_two_char,
67 get_metaphone,
68 get_postcode_sector,
69 getdictprob,
70 getdictval,
71 is_valid_isoformat_date,
72 isoformat_date_or_none,
73 ln,
74 mk_blurry_dates,
75 POSTCODE_REGEX,
76 standardize_name,
77 standardize_perfect_id_key,
78 standardize_perfect_id_value,
79 standardize_postcode,
80 surname_alternative_fragments,
81 validate_uncertain_prob,
82)
83from crate_anon.linkage.matchconfig import MatchConfig
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
88# =============================================================================
89# Identifier
90# =============================================================================
class Identifier(ABC):
    """
    Abstract base class: generic nugget of information about a person, in
    identifiable (plaintext) or de-identified (hashed) form. Optionally, may
    convey start/end dates.

    Note:

    - We trust that probabilities from the config have been validated (i.e. are
      in the range 0-1), but we should check values arising from incoming data,
      primarily via :meth:`from_dict`. The
      :func:`crate_anon.linkage.helpers.getdictprob` does this, but more checks
      may be required.

    - A typical comparison operation involves comparing a lot of people to
      each other, so it is usually efficient to cache "derived" information
      (e.g. we should calculate metaphones, etc., from names at creation, not
      at comparison). See :meth:`comparison`.
    """

    SEP = "/"  # separator
    NULL_VALUES_LOWERCASE = ["none", "null", "?"]  # must include "none"
    TEMPORAL_ID_FORMAT_HELP = (
        f"Temporal identifier format: either just IDENTIFIER, or "
        f"IDENTIFIER{SEP}STARTDATE{SEP}ENDDATE, where dates are in YYYY-MM-DD "
        f"format or one of {NULL_VALUES_LOWERCASE} (case-insensitive)."
    )

    KEY_START_DATE = "start_date"
    KEY_END_DATE = "end_date"

    ERR_MISSING_FREQ = "Missing frequency information"

    # -------------------------------------------------------------------------
    # Creation, and representations that support creation
    # -------------------------------------------------------------------------

    def __init__(
        self,
        cfg: Optional[MatchConfig],
        is_plaintext: bool,
        temporal: bool = False,
        start_date: Union[str, Date] = None,
        end_date: Union[str, Date] = None,
    ) -> None:
        """
        Args:
            cfg:
                A configuration object. Can be ``None`` but you have to specify
                that manually.
            is_plaintext:
                Is this an identifiable (plaintext) version? If ``False``, then
                it is a de-identified (hashed) version, whose internal
                structure can be more complex.
            temporal:
                Store start/end dates (which can be ``None``) along with the
                information?
            start_date:
                The start date (first valid date), or ``None``.
            end_date:
                The end date (last valid date), or ``None``.

        Raises:
            ValueError:
                via :meth:`_set_dates`, if a date is bad or the dates are in
                the wrong order.
        """
        assert isinstance(cfg, (MatchConfig, NONE_TYPE))
        self.cfg = cfg
        self.is_plaintext = is_plaintext
        self.temporal = temporal
        # Provisional value; _set_dates() below overwrites it according to
        # whether any date was actually supplied.
        self.actually_temporal = temporal
        self.start_date = None  # type: Optional[Date]
        self.end_date = None  # type: Optional[Date]
        self._set_dates(start_date, end_date)

    def __str__(self) -> str:
        """
        A string representation used for CSV files.

        Returns an empty string if we hold no information. Plaintext
        identifiers are written as their core string, followed (only if at
        least one date is set) by the start and end dates, separated by
        :attr:`SEP`. A missing date is written as ``str(None)``, which is why
        :attr:`NULL_VALUES_LOWERCASE` must include ``"none"``. Hashed
        identifiers are represented by a fixed placeholder.

        Raises:
            ValueError:
                if a temporal identifier's core string contains the separator
                (it could not be parsed back unambiguously).
        """
        if not self:
            # No information
            return ""
        if self.is_plaintext:
            # Identifiable
            id_str = self.plaintext_str_core()
            if self.actually_temporal:
                if self.SEP in id_str:
                    raise ValueError(
                        f"Temporal identifier unsuitable: "
                        f"contains {self.SEP!r}"
                    )
                return self.SEP.join(
                    [
                        id_str,
                        str(self.start_date),
                        str(self.end_date),
                    ]
                )
            else:
                return id_str
        return f"hashed_{self.__class__.__name__}"

    @abstractmethod
    def __eq__(self, other: "Identifier") -> bool:
        """
        Check equality with another, primarily for debugging.

        Just because it's an @abstractmethod doesn't mean that you can't call
        it (from derived classes). This base implementation compares the
        types and the start/end dates only.
        """
        return self._eq_check(other, ["start_date", "end_date"])

    def _eq_check(self, other: "Identifier", attrs: List[str]) -> bool:
        """
        Helper function to implement equality checks.

        Args:
            other:
                The object to compare with.
            attrs:
                Names of attributes that must be equal on both objects.

        Returns:
            ``True`` if both objects are of exactly the same class and all the
            named attributes compare equal.
        """
        if type(self) is not type(other):
            return False
        return all(getattr(self, a) == getattr(other, a) for a in attrs)

    @abstractmethod
    def plaintext_str_core(self) -> str:
        """
        Represents the identifier in plaintext, for CSV. Potentially
        encapsulated within more information by __str__().
        """
        pass

    @classmethod
    @abstractmethod
    def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "Identifier":
        """
        Restore a plaintext version from a string (which has been read from
        CSV). Reverses __str__(), not plaintext_str_core().
        """
        pass

    @abstractmethod
    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        Represents the object in a dictionary suitable for JSON serialization,
        for the de-identified (hashed) version.

        Args:
            encrypt:
                Encrypt the contents as writing, creating a hashed version.
            include_frequencies:
                Include frequency information. If you don't, this makes the
                resulting file suitable for use as a sample, but not as a
                proband file.
        """
        pass

    @classmethod
    @abstractmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "Identifier":
        """
        Restore a hashed or plaintext version from a dictionary (which will
        have been read from JSON).
        """
        pass

    # -------------------------------------------------------------------------
    # Internal methods to support creation
    # -------------------------------------------------------------------------

    def _set_dates(
        self,
        start_date: Union[str, Date] = None,
        end_date: Union[str, Date] = None,
    ) -> None:
        """
        Validate and store date information. Called by the constructor for
        every identifier (possibly with both dates ``None``).

        Also sets ``actually_temporal``, which is true only if at least one
        date is set.

        Raises:
            ValueError:
                if either date cannot be coerced to a date (or ``None``), or
                if the start date is after the end date.
        """
        start_date = coerce_to_pendulum_date(start_date)
        if not isinstance(start_date, (Date, NONE_TYPE)):
            raise ValueError(f"Bad start_date: {start_date!r}")

        end_date = coerce_to_pendulum_date(end_date)
        if not isinstance(end_date, (Date, NONE_TYPE)):
            raise ValueError(f"Bad end_date: {end_date!r}")

        if start_date and end_date:
            if start_date > end_date:
                raise ValueError(
                    f"start_date = {start_date!r} > end_date = {end_date!r}"
                )

        self.start_date = start_date
        self.end_date = end_date
        # Save some time later: this is only a temporal identifier if at least
        # one date is set.
        self.actually_temporal = bool(self.start_date) or bool(self.end_date)

    def _set_dates_from_dict(self, d: Dict[str, Any]) -> None:
        """
        Reads from a (JSON-derived) dictionary and sets our dates.
        Assumes we are a temporal identifier.
        """
        self._set_dates(
            start_date=getdictval(d, self.KEY_START_DATE, str),
            end_date=getdictval(d, self.KEY_END_DATE, str),
        )

    def _write_dates_to_dict(self, d: Dict[str, Any]) -> None:
        """
        For creating JSON dictionaries: write our dates to the dictionary (if
        we are a temporal identifier). Modifies ``d`` in place.
        """
        if self.temporal:
            d[self.KEY_START_DATE] = isoformat_date_or_none(self.start_date)
            d[self.KEY_END_DATE] = isoformat_date_or_none(self.end_date)

    @classmethod
    def _parse_optional_date(cls, date_str: str) -> Optional[Date]:
        """
        Parse one date component of a temporal identifier string.

        Args:
            date_str:
                Either a date string, or one of the null markers in
                :attr:`NULL_VALUES_LOWERCASE` (case-insensitive).

        Returns:
            The date, or ``None`` for a null marker.

        Raises:
            ValueError:
                if the string is not a null marker and cannot be parsed.
        """
        if date_str.lower() in cls.NULL_VALUES_LOWERCASE:
            return None
        try:
            # noinspection PyTypeChecker
            return pendulum.parse(date_str).date()
        except ParserError as exc:
            # Keep the parser's error as the explicit cause, for debugging.
            raise ValueError(f"Bad date: {date_str!r}") from exc

    @classmethod
    def _get_temporal_triplet(
        cls, x: str
    ) -> Tuple[str, Optional[Date], Optional[Date]]:
        """
        From a string (e.g. from CSV), split into CONTENTS/START_DATE/END_DATE.
        If it contains no "/", treat it as CONTENTS/None/None.

        Args:
            x:
                String to parse.

        Returns:
            tuple:
                contents, start_date, end_date

        Raises:
            ValueError:
                if the string has a number of components other than one or
                three, or if a date component is bad.
        """
        # Extract components of the string
        components = x.split(cls.SEP)

        if len(components) == 1:
            # Separator not present.
            contents = components[0]
            return contents, None, None

        if len(components) != 3:
            raise ValueError(
                f"Need three components separated by {cls.SEP!r} (or one with "
                f"no {cls.SEP!r}); got {x!r}"
            )

        contents, start_date_str, end_date_str = components
        start_date = cls._parse_optional_date(start_date_str)
        end_date = cls._parse_optional_date(end_date_str)
        return contents, start_date, end_date

    def _round(self, x: Optional[float], encrypt: bool) -> Optional[float]:
        """
        Implements config-defined rounding for frequency representations of
        hashed values.

        Rounds frequencies to a certain number of significant figures. (Don't
        supply exact floating-point numbers for frequencies; may be more
        identifying. Don't use decimal places; we have to deal with some small
        numbers.)

        Args:
            x:
                The frequency, or ``None``.
            encrypt:
                Are we writing a hashed version? If not, no rounding is
                applied.

        Returns:
            ``None`` if ``x`` is ``None``; ``x`` unchanged if we are not
            encrypting or the config specifies no rounding; otherwise the
            rounded value.
        """
        if x is None:
            return None
        sf = self.cfg.rounding_sf
        if sf is None or not encrypt:
            return x
        return round_sf(x, sf)

    # -------------------------------------------------------------------------
    # Python standard representation functions
    # -------------------------------------------------------------------------

    def __repr__(self):
        """
        Standardized Python representation.
        """
        return auto_repr(self, sort_attrs=False)

    # -------------------------------------------------------------------------
    # Basic tests
    # -------------------------------------------------------------------------

    @abstractmethod
    def __bool__(self) -> bool:
        """
        Does this object contain information?
        """
        pass

    # -------------------------------------------------------------------------
    # Validation
    # -------------------------------------------------------------------------

    @abstractmethod
    def ensure_has_freq_info_if_id_present(self) -> None:
        """
        If we have ID information but some frequency information is missing,
        raise :exc:`ValueError`. Used to check validity for probands;
        candidates do not have to fulfil this requirement.
        """
        pass

    # -------------------------------------------------------------------------
    # Comparison
    # -------------------------------------------------------------------------

    def comparison_relevant(self, other: "Identifier") -> bool:
        """
        It's only relevant to compare this identifier to another if both have
        some information, and if they are not specifically excluded by a
        temporal check.
        """
        # Plain "self and other and ..." would return one of the operands
        # (an Identifier) when that operand is falsy; coerce so that we
        # honour the declared bool return type.
        return bool(self) and bool(other) and self.overlaps(other)

    @abstractmethod
    def comparison(self, candidate_id: "Identifier") -> Optional[Comparison]:
        """
        Return a comparison odds (embodying the change in log odds) for a
        comparison between the "self" identifier (as the proband) and another,
        the candidate. Frequency information is expected to be on the "self"
        (proband) side.
        """
        pass

    def overlaps(self, other: "Identifier") -> bool:
        """
        Do ``self`` and ``other`` overlap in time?

        If either identifier has no dates set, they are treated as
        overlapping. A missing start or end date is treated as open-ended.

        Args:
            other:
                the other :class:`Identifier`

        For similar logic, see
        :meth:`cardinal_pythonlib.interval.Interval.overlaps`.
        """
        if not self.actually_temporal or not other.actually_temporal:
            return True
        return not (
            # This inner test is for non-overlap.
            # (a) self ends before other starts
            (
                self.end_date
                and other.start_date
                and self.end_date < other.start_date
            )
            or
            # (b) other ends before self starts
            (
                other.end_date
                and self.start_date
                and other.end_date < self.start_date
            )
        )

    # -------------------------------------------------------------------------
    # Debugging
    # -------------------------------------------------------------------------

    def hashed(self, include_frequencies: bool = True) -> "Identifier":
        """
        For testing: hash this identifier by itself.

        Args:
            include_frequencies:
                Include frequency information in the hashed copy?

        Returns:
            A new object of the same class, built via :meth:`as_dict` and
            :meth:`from_dict`.
        """
        # Only encrypt if we are currently plaintext; an identifier that is
        # already hashed is passed through unchanged.
        encrypt = self.is_plaintext
        d = self.as_dict(
            encrypt=encrypt, include_frequencies=include_frequencies
        )
        cls = type(self)  # type: Type[Identifier]
        return cls.from_dict(self.cfg, d, hashed=True)
479# =============================================================================
480# IdentifierTwoState
481# =============================================================================
class IdentifierTwoState(Identifier, ABC):
    """
    Identifier whose comparison has two outcomes: full match or no match.
    """

    def __init__(self, *args, **kwargs) -> None:
        """
        Pass all arguments to the parent constructor, then start with no
        precalculated comparison objects.
        """
        super().__init__(*args, **kwargs)

        self.comparison_full_match = None  # type: Optional[DirectComparison]
        self.comparison_no_match = None  # type: Optional[DirectComparison]

    def _clear_comparisons(self) -> None:
        """
        Forget any precalculated comparison objects.
        """
        self.comparison_full_match = None  # type: Optional[DirectComparison]
        self.comparison_no_match = None  # type: Optional[DirectComparison]

    @abstractmethod
    def fully_matches(self, other: "IdentifierTwoState") -> bool:
        """
        Is the other identifier a full match for this one?

        Implementations may take it as given that
        self.comparison_relevant(other) is True.
        """
        pass

    def comparison(
        self, candidate_id: "IdentifierTwoState"
    ) -> Optional[Comparison]:
        """
        Compare this identifier (the proband's) with another of the same type
        (the candidate's).

        Returns None when no conclusion should be drawn (e.g. information is
        missing, or temporally defined identifiers do not overlap). Otherwise
        returns the precalculated full-match or no-match comparison.

        Frequency information must be present on the "self" side (this should
        be the proband); it may be absent from the candidate.

        This function is called at high speed, so anything fixed that needs a
        multi-stage lookup should have been cached beforehand.
        """
        if not self.comparison_relevant(candidate_id):
            # Absent information tells us nothing.
            return None
        return (
            self.comparison_full_match
            if self.fully_matches(candidate_id)
            else self.comparison_no_match
        )

    def warn_if_llr_order_unexpected(
        self, full: DirectComparison, partials: List[DirectComparison] = None
    ) -> None:
        """
        Warn the user if the log likelihood ratios of our comparisons are not
        in the order one would naively expect.

        There is no guarantee that partial/full comparisons are ordered as you
        might expect; an example is in the validation paper (and in
        other_examples_for_paper.py). Nor is every partial/full match
        guaranteed to be better evidence for H than a complete mismatch.
        Still, that is the usual expectation, so we log a warning when it
        fails. Nothing is checked unless the config asks for it.

        Args:
            full:
                Comparisons for the "full match" condition.
            partials:
                Comparisons for "partial match" conditions.
        """
        if not self.cfg.check_comparison_order:
            return
        partial_list = partials or []

        # Check 1: no kind of match should be worse evidence than a mismatch.
        mismatch_llr = self.comparison_no_match.log_likelihood_ratio
        some_match_below_mismatch = any(
            match.log_likelihood_ratio < mismatch_llr
            for match in [full] + partial_list
        )
        if some_match_below_mismatch:
            log.warning(
                f"{self.__class__.__name__}: a match comparison's log "
                f"likelihood ratio is less than the no-match comparison's. "
                f"Object:\n\n{self!r}"
            )

        # Check 2: no partial match should be better evidence than a full one.
        best_llr = full.log_likelihood_ratio
        some_partial_above_full = any(
            partial.log_likelihood_ratio > best_llr
            for partial in partial_list
        )
        if some_partial_above_full:
            log.warning(
                f"{self.__class__.__name__}: a partial match comparison's "
                f"log likelihood ratio exceeds the full-match comparison's. "
                f"Object:\n\n{self!r}"
            )
571# =============================================================================
572# IdentifierThreeState
573# =============================================================================
class IdentifierThreeState(IdentifierTwoState, ABC):
    """
    Identifier whose comparison has three outcomes: full match, partial
    match, or no match.
    """

    def __init__(self, *args, **kwargs) -> None:
        """
        As for the parent class, plus an (initially absent) partial-match
        comparison object.
        """
        super().__init__(*args, **kwargs)

        self.comparison_partial_match = (
            None
        )  # type: Optional[DirectComparison]

    def _clear_comparisons(self) -> None:
        """
        Forget any precalculated comparison objects, including the
        partial-match one.
        """
        super()._clear_comparisons()
        self.comparison_partial_match = (
            None
        )  # type: Optional[DirectComparison]

    @abstractmethod
    def partially_matches(self, other: "IdentifierThreeState") -> bool:
        """
        Is the other identifier a partial match for this one?

        Implementations may take it as given that
        self.comparison_relevant(other) is True.
        """
        pass

    def comparison(
        self, candidate_id: "IdentifierThreeState"
    ) -> Optional[Comparison]:
        """
        See :meth:`IdentifierTwoState.comparison`. The full match is tested
        first, then the partial match; failing both, it is a mismatch.
        """
        if not self.comparison_relevant(candidate_id):
            # Absent information tells us nothing.
            result = None
        elif self.fully_matches(candidate_id):
            result = self.comparison_full_match
        elif self.partially_matches(candidate_id):
            result = self.comparison_partial_match
        else:
            result = self.comparison_no_match
        return result
622# =============================================================================
623# IdentifierFourState
624# =============================================================================
class IdentifierFourState(IdentifierThreeState, ABC):
    """
    Identifier whose comparison has four outcomes: full match, first kind of
    partial match, second kind of partial match, or no match.
    """

    def __init__(self, *args, **kwargs) -> None:
        """
        As for the parent class, plus an (initially absent) comparison object
        for the second kind of partial match.
        """
        super().__init__(*args, **kwargs)

        self.comparison_partial_match_second = (
            None
        )  # type: Optional[DirectComparison]

    def _clear_comparisons(self) -> None:
        """
        Forget any precalculated comparison objects, including the one for
        the second kind of partial match.
        """
        super()._clear_comparisons()
        self.comparison_partial_match_second = (
            None
        )  # type: Optional[DirectComparison]

    @abstractmethod
    def partially_matches_second(self, other: "IdentifierFourState") -> bool:
        """
        Is the other identifier a partial match for this one on the second
        fuzzy identifier?

        Implementations may take it as given that
        self.comparison_relevant(other) is True.
        """
        pass

    def comparison(
        self, candidate_id: "IdentifierFourState"
    ) -> Optional[Comparison]:
        """
        See :meth:`IdentifierTwoState.comparison`. Tests are applied in order:
        full match, first partial match, second partial match; failing all of
        those, it is a mismatch.
        """
        if not self.comparison_relevant(candidate_id):
            # Absent information tells us nothing.
            result = None
        elif self.fully_matches(candidate_id):
            result = self.comparison_full_match
        elif self.partially_matches(candidate_id):
            result = self.comparison_partial_match
        elif self.partially_matches_second(candidate_id):
            result = self.comparison_partial_match_second
        else:
            result = self.comparison_no_match
        return result
676# =============================================================================
677# TemporalIDHolder
678# =============================================================================
class TemporalIDHolder(Identifier):
    """
    Minimal identifier that has no config and just holds a plain string, with
    optional start/end dates. Used for representing postcodes between a
    database and CSV for validation.

    It cannot be converted to or from a dictionary, and cannot be compared;
    those methods raise :exc:`AssertionError`.
    """

    BAD_METHOD = "Inappropriate function called for TemporalIDHolder"

    def __init__(
        self, identifier: str, start_date: Date = None, end_date: Date = None
    ) -> None:
        """
        Args:
            identifier:
                The string to hold. Anything falsy is stored as "".
            start_date:
                The start date, or ``None``.
            end_date:
                The end date, or ``None``.

        Raises:
            ValueError:
                if the (truthy) identifier is not a string.
        """
        super().__init__(
            cfg=None,
            is_plaintext=True,
            temporal=True,
            start_date=start_date,
            end_date=end_date,
        )
        stored = identifier or ""
        self.identifier = stored
        if not isinstance(stored, str):
            raise ValueError(f"Bad identifier: {identifier!r}")

    def __eq__(self, other: Identifier) -> bool:
        """
        Equal if the base class says so and the identifier strings match.
        """
        if not super().__eq__(other):
            return False
        return self._eq_check(other, ["identifier"])

    def plaintext_str_core(self) -> str:
        """
        The identifier string itself.
        """
        return self.identifier

    @classmethod
    def from_plaintext_str(
        cls, cfg: MatchConfig, x: str
    ) -> "TemporalIDHolder":
        """
        Create from a string of the form described by
        :meth:`Identifier._get_temporal_triplet`. The config is ignored.
        """
        text, first_day, last_day = cls._get_temporal_triplet(x)
        return TemporalIDHolder(
            identifier=text, start_date=first_day, end_date=last_day
        )

    # noinspection PyTypeChecker
    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        Not supported for this class.
        """
        raise AssertionError(self.BAD_METHOD)

    # noinspection PyTypeChecker
    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "TemporalIDHolder":
        """
        Not supported for this class.
        """
        raise AssertionError(cls.BAD_METHOD)

    def __bool__(self) -> bool:
        """
        True if the identifier string is non-empty.
        """
        return bool(self.identifier)

    def ensure_has_freq_info_if_id_present(self) -> None:
        """
        Nothing to check: this class holds no frequency information.
        """
        pass

    def comparison(self, candidate_id: "Identifier") -> Optional[Comparison]:
        """
        Not supported for this class.
        """
        raise AssertionError(self.BAD_METHOD)
741# =============================================================================
742# Postcode
743# =============================================================================
class Postcode(IdentifierThreeState):
    """
    Represents a UK postcode.

    Note that we store nationwide frequencies. Final adjustment by k_postcode
    is only done at the last moment, allowing k_postcode to vary without having
    to change a hashed frequency file. Similarly for the probability of a
    postcode being unknown. So stored frequencies may be None.

    A full match is a match on the postcode unit; a partial match is a match
    on the postcode sector.
    """

    KEY_POSTCODE_UNIT = "postcode_unit"
    KEY_POSTCODE_SECTOR = "postcode_sector"
    KEY_UNIT_FREQ = "unit_freq"  # national fraction, f_f_postcode
    KEY_SECTOR_FREQ = "sector_freq"  # national fraction, f_p_postcode

    def __init__(
        self,
        cfg: MatchConfig,
        postcode: str = "",
        start_date: Union[str, Date] = None,
        end_date: Union[str, Date] = None,
    ) -> None:
        """
        Plaintext creation of a postcode.

        Args:
            cfg:
                The config object.
            postcode:
                (PLAINTEXT.) The postcode, or "" for none. It is standardized
                before being validated and stored.
            start_date:
                The start date (first valid date), or ``None``.
            end_date:
                The end date (last valid date), or ``None``.

        Raises:
            ValueError:
                if the postcode is not a string, or is non-empty but does not
                match the postcode regex once standardized.
        """
        super().__init__(
            cfg=cfg,
            is_plaintext=True,
            temporal=True,
            start_date=start_date,
            end_date=end_date,
        )

        if not isinstance(postcode, str):
            raise ValueError(f"Bad postcode: {postcode!r}")
        postcode = standardize_postcode(postcode)
        if postcode and not POSTCODE_REGEX.match(postcode):
            raise ValueError(f"Bad postcode: {postcode!r}")

        if postcode:
            self.postcode_unit = postcode
            self.postcode_sector = get_postcode_sector(
                self.postcode_unit, prestandardized=True
            )
            self.unit_freq, self.sector_freq = cfg.postcode_unit_sector_freq(
                self.postcode_unit, prestandardized=True
            )
            # ... national fractions, f_f_postcode and f_p_postcode
        else:
            self.postcode_unit = ""
            self.postcode_sector = ""
            self.unit_freq = None  # type: Optional[float]
            self.sector_freq = None  # type: Optional[float]

        # Precalculate comparisons, for speed, but in a way that we can update
        # them if we are being created via from_dict().
        self._set_comparisons()

    def _set_comparisons(self) -> None:
        """
        (Re)calculate our comparison objects from our current postcode and
        frequency information. If we have no postcode, clear them instead.
        """
        if self.postcode_unit:
            cfg = self.cfg

            # -----------------------------------------------------------------
            # Population probabilities.
            # -----------------------------------------------------------------
            # Here we apply any comparison-time adjustments, e.g. for the
            # probability of an unknown postcode or pseudopostcode, or the
            # potential that our comparison population is a geographic subset
            # of the UK.

            # Unit probability, p_f
            f_f = self.unit_freq  # national fraction (full), or None
            unit_unknown = f_f is None
            if unit_unknown:
                # Unknown postcode unit. This has been specified directly.
                p_f = cfg.p_unknown_or_pseudo_postcode_unit
            else:
                # Known postcode
                p_f = cfg.k_postcode * f_f * cfg.p_known_postcode

            # Total sector probability, p_p
            f_p = self.sector_freq  # national fraction (partial), or None
            sector_unknown = f_p is None
            if sector_unknown:
                # Unknown sector. This has been specified directly.
                p_p = cfg.p_unknown_or_pseudo_postcode_sector
                # A sanity check:
                assert unit_unknown, (
                    "Should be impossible that the postcode unit is known but "
                    "the sector is not."
                )
            else:
                # Known sector
                p_p = cfg.k_postcode * f_p * cfg.p_known_postcode
                # It is possible, though, that the postcode is unknown but the
                # sector is known (e.g. a typo in the postcode).
                if unit_unknown and p_p < p_f:
                    log.warning(
                        f"Unknown postcode unit in known sector and "
                        f"user-specified unknown unit probability "
                        f"p_f = {Switches.P_UNKNOWN_OR_PSEUDO_POSTCODE} "
                        f"exceeds the calculated probability of the known "
                        f"sector, p_p = k_postcode[{cfg.k_postcode}]"
                        f" * f_p[{f_p}]"
                        f" * p_known_postcode[{cfg.p_known_postcode}]"
                        f" = {p_p}. Adjusting the sector probability up to "
                        f"the unknown sector probability, "
                        f"p_p = {cfg.p_unknown_or_pseudo_postcode_sector}, "
                        f"but this may be a configuration error."
                    )
                    p_p = cfg.p_unknown_or_pseudo_postcode_sector

            validate_uncertain_prob(
                p_f,
                "Postcode p_f = k_postcode * f_f * p_known_postcode",
            )
            validate_uncertain_prob(
                p_p, "Postcode p_p = k_postcode * f_p * p_known_postcode"
            )
            # ... it's not reasonable that a postcode unit or sector is
            # impossible or certain.

            # Sector-not-unit probability, p_pnf
            p_pnf = p_p - p_f
            validate_uncertain_prob(
                p_pnf, "Postcode p_pnf = p_p[sector] - p_f[unit]"
            )
            # ... It is not completely unreasonable for this to be 0, e.g. for
            # pseudopostcodes that occupy all of their sector. But it's
            # dangerous, because if a partial-not-full match then does occur,
            # that will give P(D | ¬H) = 0 and log LR = +∞. We now enforce
            # k_pseudopostcode > 1 and thus p_pnf > 0.

            # -----------------------------------------------------------------
            # Error probabilities
            # -----------------------------------------------------------------
            p_ep = cfg.p_ep_postcode
            p_en = cfg.p_en_postcode

            # -----------------------------------------------------------------
            # Comparisons
            # -----------------------------------------------------------------
            self.comparison_full_match = DirectComparison(
                p_d_given_same_person=1 - p_ep,  # p_c
                p_d_given_diff_person=p_f,
                d_description="postcode_full_match",
            )
            self.comparison_partial_match = DirectComparison(
                p_d_given_same_person=p_ep,
                p_d_given_diff_person=p_pnf,
                d_description="postcode_partial_not_full_match",
            )
            self.comparison_no_match = DirectComparison(
                p_d_given_same_person=p_en,
                p_d_given_diff_person=1 - p_p,  # p_n
                d_description="postcode_no_match",
            )
            self.warn_if_llr_order_unexpected(
                full=self.comparison_full_match,
                partials=[self.comparison_partial_match],
            )
        else:
            self._clear_comparisons()

    def __eq__(self, other: Identifier) -> bool:
        """
        Equal if the base class says so (same class, same dates) and the
        postcode units match.
        """
        # The attribute is "postcode_unit"; there is no "postcode" attribute,
        # so checking that name would raise AttributeError.
        return super().__eq__(other) and self._eq_check(
            other, ["postcode_unit"]
        )

    def plaintext_str_core(self) -> str:
        """
        For CSV.
        """
        return self.postcode_unit

    @classmethod
    def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "Postcode":
        """
        Creation from CSV.
        """
        postcode_unit, start_date, end_date = cls._get_temporal_triplet(x)
        return Postcode(
            cfg=cfg,
            postcode=postcode_unit,
            start_date=start_date,
            end_date=end_date,
        )

    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        For JSON.

        Args:
            encrypt:
                Hash the postcode unit and sector (if we are plaintext) and
                round the frequencies?
            include_frequencies:
                Include the unit and sector frequencies?

        Returns:
            A dictionary with the unit and sector (``None`` if we have no
            postcode), the dates, and optionally the frequencies.
        """
        if not self.postcode_unit:
            postcode_unit = None
            postcode_sector = None
        elif self.is_plaintext and encrypt:
            postcode_unit = self.cfg.hash_fn(self.postcode_unit)
            postcode_sector = self.cfg.hash_fn(self.postcode_sector)
        else:
            # Was already hashed, or keeping plaintext
            postcode_unit = self.postcode_unit
            postcode_sector = self.postcode_sector
        d = {
            self.KEY_POSTCODE_UNIT: postcode_unit,
            self.KEY_POSTCODE_SECTOR: postcode_sector,
        }
        self._write_dates_to_dict(d)
        if include_frequencies:
            d[self.KEY_UNIT_FREQ] = self._round(self.unit_freq, encrypt)
            d[self.KEY_SECTOR_FREQ] = self._round(self.sector_freq, encrypt)
        return d

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "Postcode":
        """
        Creation of a hashed (or plaintext) postcode, ultimately from JSON.

        An empty postcode is created first (with the dates), and its fields
        are then overwritten directly from the dictionary, bypassing the
        plaintext validation in the constructor. The comparisons are then
        recalculated.
        """
        p = Postcode(
            cfg=cfg,
            start_date=getdictval(d, cls.KEY_START_DATE, str),
            end_date=getdictval(d, cls.KEY_END_DATE, str),
        )
        p.is_plaintext = not hashed
        p.postcode_unit = getdictval(d, cls.KEY_POSTCODE_UNIT, str)
        p.postcode_sector = getdictval(d, cls.KEY_POSTCODE_SECTOR, str)
        p.unit_freq = getdictprob(d, cls.KEY_UNIT_FREQ)  # permits None
        p.sector_freq = getdictprob(d, cls.KEY_SECTOR_FREQ)  # permits None
        p._set_comparisons()
        return p

    def __bool__(self) -> bool:
        """
        True if we have a postcode unit.
        """
        return bool(self.postcode_unit)

    def ensure_has_freq_info_if_id_present(self) -> None:
        """
        No check is made for postcodes.
        """
        pass
        # It's fine for frequency information to be missing; that means the
        # postcode is unknown or a pseudopostcode. We cope in
        # _set_comparisons().

    def fully_matches(self, other: "Postcode") -> bool:
        """
        Full match: same postcode unit.
        """
        return self.postcode_unit == other.postcode_unit

    def partially_matches(self, other: "Postcode") -> bool:
        """
        Partial match: same postcode sector.
        """
        return self.postcode_sector == other.postcode_sector
994# =============================================================================
995# DateOfBirth
996# =============================================================================
class DateOfBirth(IdentifierThreeState):
    """
    Represents a date of birth (DOB).

    No frequencies are stored with the hashed version: they are all
    obtainable from the config, and are not specific to a particular DOB.
    """

    KEY_DOB = "dob"
    KEY_DOB_MD = "dob_md"
    KEY_DOB_YD = "dob_yd"
    KEY_DOB_YM = "dob_ym"

    def __init__(self, cfg: MatchConfig, dob: str = "") -> None:
        """
        Plaintext creation of a DOB.

        Args:
            cfg:
                The config object.
            dob:
                (PLAINTEXT.) The date of birth in ISO-8601 "YYYY-MM-DD" string
                format. Any false value is treated as "no DOB".

        Raises:
            ValueError:
                if ``dob`` is not a string, or is a non-empty string that
                fails :func:`is_valid_isoformat_date`.
        """
        super().__init__(cfg=cfg, is_plaintext=True, temporal=False)

        dob = dob or ""
        if not isinstance(dob, str) or (
            dob and not is_valid_isoformat_date(dob)
        ):
            raise ValueError(f"Bad date: {dob!r}")
        self.dob_str = dob

        # In our validation data, 93.3% of DOB errors were "single component"
        # errors, e.g. year wrong but month/day right. Within that, there was
        # no very dominant pattern. Hence three "blurry" versions, each
        # omitting one component; all blank if there is no DOB.
        blurry = mk_blurry_dates(dob) if dob else ("", "", "")
        self.dob_md, self.dob_yd, self.dob_ym = blurry

        # Precalculate our comparison objects, for speed. No separate
        # function is needed, because these probabilities all come from the
        # config, not from our data.
        full = DirectComparison(
            p_d_given_same_person=cfg.p_c_dob,
            p_d_given_diff_person=cfg.p_f_dob,
            d_description="dob_full_match",
        )
        partial = DirectComparison(
            p_d_given_same_person=cfg.p_ep_dob,
            p_d_given_diff_person=cfg.p_pnf_dob,
            d_description="dob_partial_not_full_match",
        )
        nomatch = DirectComparison(
            p_d_given_same_person=cfg.p_en_dob,
            p_d_given_diff_person=cfg.p_n_dob,
            d_description="dob_no_match",
        )
        self.comparison_full_match = full
        self.comparison_partial_match = partial
        self.comparison_no_match = nomatch
        self.warn_if_llr_order_unexpected(full=full, partials=[partial])

    def __eq__(self, other: Identifier) -> bool:
        base_equal = super().__eq__(other)
        return base_equal and self._eq_check(other, ["dob_str"])

    def plaintext_str_core(self) -> str:
        """
        For CSV: the DOB string itself.
        """
        return self.dob_str

    @classmethod
    def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "DateOfBirth":
        """
        Creation from CSV.
        """
        return DateOfBirth(cfg=cfg, dob=x)

    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        For JSON.

        Args:
            encrypt:
                Hash the values, if this object currently holds plaintext?
            include_frequencies:
                Not used here; no frequencies are stored for a DOB.

        Returns:
            A dictionary with the full DOB and its three blurry versions; all
            are empty strings if there is no DOB.
        """
        if not self.dob_str:
            values = ["", "", "", ""]
        else:
            values = [self.dob_str, self.dob_md, self.dob_yd, self.dob_ym]
            if self.is_plaintext and encrypt:
                hash_fn = self.cfg.hash_fn
                values = [hash_fn(v) for v in values]
            # Otherwise: was already hashed, or staying plaintext.
        keys = (
            self.KEY_DOB,
            self.KEY_DOB_MD,
            self.KEY_DOB_YD,
            self.KEY_DOB_YM,
        )
        return dict(zip(keys, values))

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "DateOfBirth":
        """
        Creation of a DOB from a dictionary (ultimately from JSON); the
        result is marked as plaintext only if ``hashed`` is false.
        """
        dob = DateOfBirth(cfg=cfg)
        dob.is_plaintext = not hashed
        attr_key_pairs = (
            ("dob_str", cls.KEY_DOB),
            ("dob_md", cls.KEY_DOB_MD),
            ("dob_yd", cls.KEY_DOB_YD),
            ("dob_ym", cls.KEY_DOB_YM),
        )
        for attr, key in attr_key_pairs:
            setattr(dob, attr, getdictval(d, key, str))
        return dob

    def __bool__(self) -> bool:
        present = self.dob_str
        return bool(present)

    def ensure_has_freq_info_if_id_present(self) -> None:
        """
        Nothing to check: that information is always in the config; none is
        stored here.
        """
        return None

    def fully_matches(self, other: "DateOfBirth") -> bool:
        """
        Full match: identical DOB strings.
        """
        return self.dob_str == other.dob_str

    def partially_matches(self, other: "DateOfBirth") -> bool:
        """
        Partial match: at least one of the blurry versions (month/day,
        year/day, year/month) is identical.
        """
        if self.dob_md == other.dob_md:
            return True
        if self.dob_yd == other.dob_yd:
            return True
        return self.dob_ym == other.dob_ym
1143# =============================================================================
1144# Gender
1145# =============================================================================
class Gender(IdentifierTwoState):
    """
    Represents a gender.
    """

    KEY_GENDER = "gender"
    KEY_GENDER_FREQ = "gender_freq"

    def __init__(self, cfg: MatchConfig, gender: str = "") -> None:
        """
        Plaintext creation of a gender.

        Args:
            cfg:
                The config object.
            gender:
                (PLAINTEXT.) The gender. Any false value is treated as "".

        Raises:
            ValueError:
                if the gender is not in ``VALID_GENDERS``.
        """
        super().__init__(
            cfg=cfg,
            is_plaintext=True,
            temporal=False,
        )

        gender = gender or ""
        if gender not in VALID_GENDERS:
            raise ValueError(f"Bad gender: {gender!r}")
        self.gender_str = gender

        # The frequency comes from the config, if we have a gender at all.
        self.gender_freq = (
            cfg.gender_freq(gender) if gender else None
        )  # type: Optional[float]

        self._set_comparisons()

    def _set_comparisons(self) -> None:
        """
        Build our comparison objects from the gender frequency and the
        config's gender error probability; clear them if we have no
        frequency.
        """
        if not self.gender_freq:
            self._clear_comparisons()
            return

        p_error = self.cfg.p_e_gender_error
        p_freq = self.gender_freq
        self.comparison_full_match = DirectComparison(
            p_d_given_same_person=1 - p_error,
            p_d_given_diff_person=p_freq,
            d_description="gender_match",
        )
        self.comparison_no_match = DirectComparison(
            p_d_given_same_person=p_error,
            p_d_given_diff_person=1 - p_freq,
            d_description="gender_no_match",
        )
        self.warn_if_llr_order_unexpected(full=self.comparison_full_match)

    def __eq__(self, other: Identifier) -> bool:
        base_equal = super().__eq__(other)
        return base_equal and self._eq_check(other, ["gender_str"])

    def plaintext_str_core(self) -> str:
        """
        For CSV: the gender string itself.
        """
        return self.gender_str

    @classmethod
    def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "Gender":
        """
        Creation from CSV.
        """
        return Gender(cfg=cfg, gender=x)

    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        For JSON.

        Args:
            encrypt:
                Hash the gender, if this object currently holds plaintext?
            include_frequencies:
                Also write the gender frequency?
        """
        raw = self.gender_str
        if not raw:
            gender = ""
        elif self.is_plaintext and encrypt:
            gender = self.cfg.hash_fn(raw)
        else:
            # Was already hashed, or staying plaintext
            gender = raw

        result = {self.KEY_GENDER: gender}
        if include_frequencies:
            result[self.KEY_GENDER_FREQ] = self._round(
                self.gender_freq, encrypt
            )
        return result

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "Gender":
        """
        Creation of a gender from a dictionary (ultimately from JSON); the
        result is marked as plaintext only if ``hashed`` is false.
        """
        gender = Gender(cfg=cfg)
        gender.is_plaintext = not hashed
        gender.gender_str = getdictval(d, cls.KEY_GENDER, str)
        gender.gender_freq = getdictprob(d, cls.KEY_GENDER_FREQ)
        # The frequency has changed, so the comparisons must be rebuilt.
        gender._set_comparisons()
        return gender

    def __bool__(self) -> bool:
        present = self.gender_str
        return bool(present)

    def ensure_has_freq_info_if_id_present(self) -> None:
        """
        Raises:
            ValueError:
                if we have a gender but no frequency for it.
        """
        if not self.gender_str or self.gender_freq is not None:
            return
        raise ValueError(
            self.ERR_MISSING_FREQ + f" for gender {self.gender_str!r}"
        )

    def fully_matches(self, other: "Gender") -> bool:
        """
        Full match: identical gender strings.
        """
        return self.gender_str == other.gender_str
1265# =============================================================================
1266# BasicName
1267# =============================================================================
class BasicName(IdentifierFourState, ABC):
    """
    Base class for names.

    Note that this is a pretty difficult generic problem. See
    https://www.kalzumeus.com/2010/06/17/falsehoods-programmers-believe-about-names/

    The sequence of preferences is (1) full match, (2) metaphone match, (3)
    first two character (F2C) match, (4) no match. Reasons are discussed in
    the validation paper. Frequency representations here are slightly more
    complex as the fuzzy representations are not subsets/supersets of each
    other, but overlap, so we need to represent explicitly e.g. P(F2C match
    but not metaphone or name match).

    We will need some special gender features for both forenames and
    surnames:

    - UK forename frequency depends on gender.
    - The probability that someone's surname changes depends on gender.

    As a result, because we can't access gender once hashed, we need to store
    error frequencies as well as population frequencies.

    Since names can change, we also support optional start/end dates. If none
    are supplied, it simply becomes a non-temporal identifier.
    """

    KEY_NAME = "name"
    KEY_METAPHONE = "metaphone"
    KEY_FIRST_TWO_CHAR = "f2c"

    # Terse in the JSON, to save some space:
    KEY_P_F = "p_f"  # name frequency
    KEY_P_P1NF = "p_p1nf"  # metaphone, not name
    KEY_P_P2NP1 = "p_p2np1"  # F2C, not name or metaphone

    KEY_P_C = "p_c"
    KEY_P_EP1 = "p_ep1"
    KEY_P_EP2NP1 = "p_ep2np1"

    def __init__(
        self,
        cfg: MatchConfig,
        name: str = "",
        gender: str = "",
        temporal: bool = False,
        start_date: Union[str, Date] = None,
        end_date: Union[str, Date] = None,
        description: str = "name",
    ) -> None:
        """
        Plaintext creation of a name.

        Args:
            cfg:
                The config object.
            name:
                (PLAINTEXT.) The name.
            gender:
                The gender; must be one of ``VALID_GENDERS``.
            temporal:
                Passed to the superclass.
            start_date:
                Passed to the superclass.
            end_date:
                Passed to the superclass.
            description:
                Used internally for verbose comparisons.

        Raises:
            ValueError:
                if the name is not a string, or the gender is invalid.
        """
        if not isinstance(name, str):
            raise ValueError(f"Bad name: {name!r}")

        super().__init__(
            cfg=cfg,
            is_plaintext=True,
            temporal=temporal,
            start_date=start_date,
            end_date=end_date,
        )
        self.description = description

        # Standardization necessary for freq. lookup and metaphone.
        standardized = standardize_name(name)
        self.name = standardized
        self.metaphone = get_metaphone(standardized)
        self.f2c = get_first_two_char(standardized)

        # Population frequencies -- to be overridden
        self.p_f = None  # type: Optional[float]
        self.p_p1nf = None  # type: Optional[float]
        self.p_p2np1 = None  # type: Optional[float]

        # Error probabilities -- to be overridden
        self.p_c = None  # type: Optional[float]
        self.p_ep1 = None  # type: Optional[float]
        self.p_ep2np1 = None  # type: Optional[float]

        self.gender = ""  # changed in next step
        self.set_gender(gender)  # will reset frequencies and comparisons

    def set_gender(self, gender: str) -> None:
        """
        Special operation for identifiable reading: set the gender, then
        refresh the frequencies (and thereby the comparisons).

        Raises:
            ValueError:
                if the gender is not in ``VALID_GENDERS``.
        """
        if gender not in VALID_GENDERS:
            raise ValueError(f"Bad gender: {gender!r}")
        self.gender = gender
        self._reset_frequencies_identifiable()  # will set comparisons

    @abstractmethod
    def _reset_frequencies_identifiable(self) -> None:
        """
        Gender may have changed. Update any probabilities accordingly,
        and call self._set_comparisons().
        """
        pass

    def _clear_frequencies(self) -> None:
        """
        Clear our population/error frequencies.
        """
        # Population frequencies
        self.p_f = self.p_p1nf = self.p_p2np1 = None
        # Error probabilities
        self.p_c = self.p_ep1 = self.p_ep2np1 = None

    @property
    def p_en(self) -> float:
        """
        For internal use. Only call if frequencies are set up.

        The error probability left over once the correct, metaphone-only and
        F2C-only probabilities have been subtracted from 1.
        """
        remainder = 1 - self.p_c - self.p_ep1 - self.p_ep2np1
        assert 0 <= remainder <= 1, "Bad error probabilities for a BasicName"
        return remainder

    @property
    def p_n(self) -> float:
        """
        For internal use. Only call if frequencies are set up.

        The population probability left over once the name, metaphone-only
        and F2C-only frequencies have been subtracted from 1.
        """
        remainder = 1 - self.p_f - self.p_p1nf - self.p_p2np1
        assert (
            0 <= remainder <= 1
        ), "Bad population probabilities for a BasicName"
        return remainder

    def _set_comparisons(self) -> None:
        """
        If we have identifier information, use error information from `self`
        (unusually), and frequency information from `self`, to create our
        comparisons. Otherwise, call :meth:`_clear_comparisons`.
        """
        if not self.name:
            self._clear_comparisons()
            return

        desc = self.description
        full = DirectComparison(
            p_d_given_same_person=self.p_c,
            p_d_given_diff_person=self.p_f,
            d_description=f"{desc}_full_match",
        )
        self.comparison_full_match = full
        partial_1 = DirectComparison(
            p_d_given_same_person=self.p_ep1,
            p_d_given_diff_person=self.p_p1nf,
            d_description=f"{desc}_partial_match_1_metaphone_not_full",
        )
        self.comparison_partial_match = partial_1
        partial_2 = DirectComparison(
            p_d_given_same_person=self.p_ep2np1,
            p_d_given_diff_person=self.p_p2np1,
            d_description=f"{desc}_partial_match_2_f2c_not_name_metaphone",
        )
        self.comparison_partial_match_second = partial_2
        self.comparison_no_match = DirectComparison(
            p_d_given_same_person=self.p_en,
            p_d_given_diff_person=self.p_n,
            d_description=f"{desc}_no_match",
        )
        self.warn_if_llr_order_unexpected(
            full=full, partials=[partial_1, partial_2]
        )

    def __eq__(self, other: Identifier) -> bool:
        base_equal = super().__eq__(other)
        return base_equal and self._eq_check(other, ["name", "gender"])

    def plaintext_str_core(self) -> str:
        """
        For CSV: the (standardized) name.
        """
        return self.name

    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        For JSON.

        Args:
            encrypt:
                Hash the name and its fuzzy versions, if this object
                currently holds plaintext?
            include_frequencies:
                Also write the population and error probabilities?
        """
        if not self.name:
            name = metaphone = f2c = None
        elif self.is_plaintext and encrypt:
            hash_fn = self.cfg.hash_fn
            name = hash_fn(self.name)
            metaphone = hash_fn(self.metaphone)
            f2c = hash_fn(self.f2c)
        else:
            # Was already hashed, or staying plaintext
            name, metaphone, f2c = self.name, self.metaphone, self.f2c

        result = {
            self.KEY_NAME: name,
            self.KEY_METAPHONE: metaphone,
            self.KEY_FIRST_TWO_CHAR: f2c,
        }
        self._write_dates_to_dict(result)
        if include_frequencies:
            key_value_pairs = (
                (self.KEY_P_F, self.p_f),
                (self.KEY_P_P1NF, self.p_p1nf),
                (self.KEY_P_P2NP1, self.p_p2np1),
                (self.KEY_P_C, self.p_c),
                (self.KEY_P_EP1, self.p_ep1),
                (self.KEY_P_EP2NP1, self.p_ep2np1),
            )
            for key, probability in key_value_pairs:
                result[key] = self._round(probability, encrypt)
        return result

    def _set_from_json_dict_internal(self, d: Dict[str, Any], hashed: bool):
        """
        Internal function used by derived classes. Doesn't create the object,
        which is specialized to the derived class, but does the reading from
        the hashed dictionary and sets up the comparisons.
        """
        self.is_plaintext = not hashed

        if self.temporal:
            self._set_dates_from_dict(d)

        # The name and its fuzzy versions are read as strings...
        string_attrs = (
            ("name", self.KEY_NAME),
            ("metaphone", self.KEY_METAPHONE),
            ("f2c", self.KEY_FIRST_TWO_CHAR),
        )
        for attr, key in string_attrs:
            setattr(self, attr, getdictval(d, key, str))

        # ... and the population frequencies, then the error probabilities,
        # as probabilities.
        probability_attrs = (
            ("p_f", self.KEY_P_F),
            ("p_p1nf", self.KEY_P_P1NF),
            ("p_p2np1", self.KEY_P_P2NP1),
            ("p_c", self.KEY_P_C),
            ("p_ep1", self.KEY_P_EP1),
            ("p_ep2np1", self.KEY_P_EP2NP1),
        )
        for attr, key in probability_attrs:
            setattr(self, attr, getdictprob(d, key))

        self._set_comparisons()

    def __bool__(self) -> bool:
        present = self.name
        return bool(present)

    def ensure_has_freq_info_if_id_present(self) -> None:
        """
        Raises:
            ValueError:
                if we have a name but any of its three population
                frequencies is missing.
        """
        if not self.name:
            return
        population_freqs = (self.p_f, self.p_p1nf, self.p_p2np1)
        if any(p is None for p in population_freqs):
            raise ValueError(
                self.ERR_MISSING_FREQ + f" for name {self.name!r}"
            )

    def fully_matches(self, other: "BasicName") -> bool:
        """
        Full match: identical names.
        """
        return self.name == other.name

    def partially_matches(self, other: "BasicName") -> bool:
        """
        First kind of partial match: identical metaphones.
        """
        return self.metaphone == other.metaphone

    def partially_matches_second(self, other: "BasicName") -> bool:
        """
        Second kind of partial match: identical first two characters.
        """
        return self.f2c == other.f2c
1537# =============================================================================
1538# SurnameFragment
1539# =============================================================================
class SurnameFragment(BasicName):
    """
    Collate information about a name fragment. This identifier is unlikely to
    be used directly for comparisons, but is used by Surname.

    We don't store dates; they are stored with the surname.
    """

    BAD_METHOD = "Inappropriate function called for SurnameFragment"

    # -------------------------------------------------------------------------
    # Creation
    # -------------------------------------------------------------------------

    def __init__(
        self,
        cfg: MatchConfig,
        name: str = "",
        gender: str = "",
    ) -> None:
        """
        Plaintext creation of a surname fragment.

        Args:
            cfg:
                The config object.
            name:
                (PLAINTEXT.) The fragment.
            gender:
                The gender.
        """
        super().__init__(cfg, name=name, gender=gender, description="surname")
        # ... will call _reset_frequencies_identifiable()

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "SurnameFragment":
        """
        Creation of a fragment from a dictionary (ultimately from JSON).
        """
        fragment = SurnameFragment(cfg)
        fragment._set_from_json_dict_internal(d, hashed)
        return fragment

    # -------------------------------------------------------------------------
    # Creation helper functions
    # -------------------------------------------------------------------------

    def _reset_frequencies_identifiable(self) -> None:
        """
        Refresh population frequencies (from the config's surname frequency
        information) and error probabilities (from the config, by gender),
        or clear them if there is no name; then rebuild the comparisons.
        """
        if not self.name:
            self._clear_frequencies()
        else:
            cfg = self.cfg
            freq_info = cfg.get_surname_freq_info(
                self.name, prestandardized=True
            )
            gender = self.gender

            self.p_f = freq_info.p_name
            self.p_p1nf = freq_info.p_metaphone_not_name
            self.p_p2np1 = freq_info.p_f2c_not_name_metaphone

            self.p_c = cfg.p_c_surname[gender]
            self.p_ep1 = cfg.p_ep1_surname[gender]
            self.p_ep2np1 = cfg.p_ep2np1_surname[gender]
        self._set_comparisons()

    # -------------------------------------------------------------------------
    # Unused methods from Identifier
    # -------------------------------------------------------------------------

    def plaintext_str_core(self) -> str:
        """
        Not appropriate for a fragment; always raises AssertionError.
        """
        raise AssertionError(self.BAD_METHOD)

    @classmethod
    def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "SurnameFragment":
        """
        Not appropriate for a fragment; always raises AssertionError.
        """
        raise AssertionError(cls.BAD_METHOD)

    # -------------------------------------------------------------------------
    # Sorting methods, to use the linter
    # -------------------------------------------------------------------------

    @staticmethod
    def sort_exact_freq(x: "SurnameFragment") -> float:
        """
        Sort key: frequency of the exact name.
        """
        return x.p_f

    @staticmethod
    def sort_partial_1_freq(x: "SurnameFragment") -> float:
        """
        Sort key: frequency of a metaphone-but-not-name match.
        """
        return x.p_p1nf

    @staticmethod
    def sort_partial_2_freq(x: "SurnameFragment") -> float:
        """
        Sort key: frequency of an F2C-but-not-name-or-metaphone match.
        """
        return x.p_p2np1
1622# =============================================================================
1623# Surname
1624# =============================================================================
class Surname(Identifier):
    """
    Represents a surname (family name).

    Identifiably, we store the unmodified (unstandardized) name.

    We don't inherit from BasicName, but from Identifier, because surnames
    need to deal with "fragment" problems.

    We need to be able to match on parts. For example, "van Beethoven" should
    match "van Beethoven" but also "Beethoven". What frequency should we use
    for those parts? This has to be the frequency of the part (not the
    composite). For example, if someone is called "Mozart-Smith", then a match
    on "Mozart-Smith" or "Mozart" is less likely in the population, and thus
    more informative, than a match on "Smith". So, we need frequency
    information associated with each part.
    """

    KEY_FRAGMENTS = "fragments"

    # -------------------------------------------------------------------------
    # Creation
    # -------------------------------------------------------------------------

    def __init__(
        self,
        cfg: MatchConfig,
        name: str = "",
        gender: str = "",
        start_date: Union[str, Date] = None,
        end_date: Union[str, Date] = None,
    ) -> None:
        """
        Plaintext creation of a surname.

        Args:
            cfg:
                The config object.
            name:
                (PLAINTEXT.) The surname.
            gender:
                The gender; must be one of ``VALID_GENDERS``.
            start_date:
                Passed to the superclass.
            end_date:
                Passed to the superclass.

        Raises:
            ValueError:
                if the gender is invalid.
        """
        super().__init__(
            cfg,
            is_plaintext=True,
            temporal=True,
            start_date=start_date,
            end_date=end_date,
        )
        self.raw_surname = name.strip()  # but retain case, internal spaces
        # ... because "case" is complex for UTF8 characters.

        # There is some duplication here for speed and to cope with the
        # difference between identifiable and hashed versions. We want a set
        # version for rapid overlap checking, and an ordered list to pick by
        # frequency sometimes.
        self.exact_set = set()  # type: Set[str]
        self.partial_set_metaphone = set()  # type: Set[str]
        self.partial_set_f2c = set()  # type: Set[str]
        self.fragments = []  # type: List[SurnameFragment]
        # ... set properly by _reset_identifiable() and from_dict()
        self.gender = ""  # changed in next step
        self.set_gender(gender)  # will reset frequencies/comparisons

    @classmethod
    def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "Surname":
        """
        Creation from CSV.

        The string is split by ``_get_temporal_triplet`` into the name and
        its start/end dates.
        """
        name, start_date, end_date = cls._get_temporal_triplet(x)
        # Use the parsed name, not the raw string "x": the raw string may
        # still carry the date information.
        return Surname(
            cfg=cfg, name=name, start_date=start_date, end_date=end_date
        )

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "Surname":
        """
        Creation of a hashed name, ultimately from JSON.
        """
        n = Surname(cfg=cfg)
        n.is_plaintext = not hashed
        n._set_dates_from_dict(d)
        fragments_json_list = getdictval(d, cls.KEY_FRAGMENTS, list)
        n.fragments = [
            SurnameFragment.from_dict(cfg, fragment_dict, hashed)
            for fragment_dict in fragments_json_list
        ]
        n._reset_name_sets()
        return n

    def __eq__(self, other: Identifier) -> bool:
        return super().__eq__(other) and self._eq_check(
            other, ["gender", "fragments"]
        )

    # -------------------------------------------------------------------------
    # Representation
    # -------------------------------------------------------------------------

    def plaintext_str_core(self) -> str:
        """
        For CSV: the raw (unstandardized) surname.
        """
        return self.raw_surname

    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        For JSON: a list of per-fragment dictionaries, plus our dates. Both
        arguments are passed on to each fragment.
        """
        fragments = [
            f.as_dict(encrypt=encrypt, include_frequencies=include_frequencies)
            for f in self.fragments
        ]
        d = {self.KEY_FRAGMENTS: fragments}
        self._write_dates_to_dict(d)
        return d

    # -------------------------------------------------------------------------
    # Methods to support creation
    # -------------------------------------------------------------------------

    def set_gender(self, gender: str) -> None:
        """
        Special operation for identifiable reading.

        Raises:
            ValueError:
                if the gender is not in ``VALID_GENDERS``.
        """
        if gender not in VALID_GENDERS:
            raise ValueError(f"Bad gender: {gender!r}")
        self.gender = gender
        self._reset_identifiable()  # will set comparisons

    def _reset_identifiable(self) -> None:
        """
        If the name or gender has changed, in an identifiable copy, reset our
        fragment information (with their comparisons), and the name fragment
        sets for fast comparison.
        """
        cfg = self.cfg
        if self.raw_surname:
            # The first of these is the full name.
            self.fragments = [
                SurnameFragment(cfg=cfg, name=exact, gender=self.gender)
                for exact in surname_alternative_fragments(
                    surname=self.raw_surname,
                    accent_transliterations=cfg.accent_transliterations,
                    nonspecific_name_components=(
                        cfg.nonspecific_name_components
                    ),
                )
            ]
        else:
            self.fragments = []
        # The fast-comparison sets are derived entirely from the fragments,
        # so rebuild them in one place rather than also filling them here.
        self._reset_name_sets()

    def _reset_name_sets(self) -> None:
        """
        Reset our fast comparison sets from the name fragments.
        """
        self.exact_set = {f.name for f in self.fragments}
        self.partial_set_metaphone = {f.metaphone for f in self.fragments}
        self.partial_set_f2c = {f.f2c for f in self.fragments}

    # -------------------------------------------------------------------------
    # Basic tests
    # -------------------------------------------------------------------------

    def __bool__(self) -> bool:
        return bool(self.fragments)

    def ensure_has_freq_info_if_id_present(self) -> None:
        """
        Delegates the check to every fragment.
        """
        for f in self.fragments:
            f.ensure_has_freq_info_if_id_present()

    # -------------------------------------------------------------------------
    # Comparison
    # -------------------------------------------------------------------------

    def fully_matches(self, other: "Surname") -> bool:
        """
        Primarily for debugging; :meth:`comparison` is used for real work.
        """
        return bool(self.exact_set.intersection(other.exact_set))

    def partially_matches(self, other: "Surname") -> bool:
        """
        Primarily for debugging; :meth:`comparison` is used for real work.
        """
        return bool(
            self.partial_set_metaphone.intersection(
                other.partial_set_metaphone
            )
        )

    def partially_matches_second(self, other: "Surname") -> bool:
        """
        Primarily for debugging; :meth:`comparison` is used for real work.
        """
        return bool(self.partial_set_f2c.intersection(other.partial_set_f2c))

    def comparison(self, candidate_id: "Surname") -> Optional[Comparison]:
        """
        Specialized version for surname.

        Tries, in order, an exact fragment match, a metaphone match, and an
        F2C match. Within each, several fragments may match (e.g.
        "Mozart-Smith" may have matched "Mozart-Smith", "Mozart", and
        "Smith"); we pick the most informative (lowest-frequency) of our own
        matching fragments, the earliest fragment winning any tie.

        Returns:
            The chosen fragment's comparison object; the "no match"
            comparison of the first fragment (the whole name) if nothing
            matches; or ``None`` if the comparison is not relevant.
        """
        if not self.comparison_relevant(candidate_id):
            # Infer no conclusions from absent information.
            return None

        # The sets are built from self.fragments by _reset_name_sets(), so a
        # non-empty overlap implies at least one matching fragment below.
        overlap_exact = self.exact_set.intersection(candidate_id.exact_set)
        if overlap_exact:
            rarest = min(
                (f for f in self.fragments if f.name in overlap_exact),
                key=SurnameFragment.sort_exact_freq,
            )
            return rarest.comparison_full_match

        overlap_partial_1 = self.partial_set_metaphone.intersection(
            candidate_id.partial_set_metaphone
        )
        if overlap_partial_1:
            rarest = min(
                (f for f in self.fragments if f.metaphone in overlap_partial_1),
                key=SurnameFragment.sort_partial_1_freq,
            )
            return rarest.comparison_partial_match

        overlap_partial_2 = self.partial_set_f2c.intersection(
            candidate_id.partial_set_f2c
        )
        if overlap_partial_2:
            rarest = min(
                (f for f in self.fragments if f.f2c in overlap_partial_2),
                key=SurnameFragment.sort_partial_2_freq,
            )
            return rarest.comparison_partial_match_second

        # For "no match", we use the whole original name and its frequencies:
        return self.fragments[0].comparison_no_match
1866# =============================================================================
1867# Forename
1868# =============================================================================
class Forename(BasicName):
    """
    Represents a forename (given name).
    """

    def __init__(
        self,
        cfg: MatchConfig,
        name: str = "",
        gender: str = "",
        start_date: Union[str, Date] = None,
        end_date: Union[str, Date] = None,
    ) -> None:
        """
        Plaintext creation of a forename.

        Args:
            cfg:
                The config object.
            name:
                (PLAINTEXT.) The forename.
            gender:
                The gender.
            start_date:
                Passed to the superclass.
            end_date:
                Passed to the superclass.
        """
        super().__init__(
            cfg=cfg,
            name=name,
            gender=gender,
            temporal=True,
            start_date=start_date,
            end_date=end_date,
            description="forename",
        )
        # ... will call _reset_frequencies_identifiable()

    def _reset_frequencies_identifiable(self) -> None:
        """
        Refresh population frequencies (from the config's forename frequency
        information, which takes the gender) and error probabilities (from
        the config, by gender), or clear them if there is no name; then
        rebuild the comparisons.
        """
        if self.name:
            cfg = self.cfg
            g = self.gender
            f = cfg.get_forename_freq_info(self.name, g, prestandardized=True)

            self.p_f = f.p_name
            self.p_p1nf = f.p_metaphone_not_name
            self.p_p2np1 = f.p_f2c_not_name_metaphone

            self.p_c = cfg.p_c_forename[g]
            self.p_ep1 = cfg.p_ep1_forename[g]
            self.p_ep2np1 = cfg.p_ep2np1_forename[g]
        else:
            self._clear_frequencies()
        self._set_comparisons()

    @classmethod
    def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "Forename":
        """
        Creation from CSV.

        The string is split by ``_get_temporal_triplet`` into the name and
        its start/end dates.
        """
        name, start_date, end_date = cls._get_temporal_triplet(x)
        # Use the parsed name, not the raw string "x": the raw string may
        # still carry the date information.
        return Forename(
            cfg=cfg, name=name, start_date=start_date, end_date=end_date
        )

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "Forename":
        """
        Creation of a hashed name, ultimately from JSON.
        """
        n = Forename(cfg=cfg)
        n._set_from_json_dict_internal(d, hashed)
        return n
1934# =============================================================================
1935# PerfectID
1936# =============================================================================
class PerfectID(IdentifierTwoState):
    """
    For comparing people based on one or more perfect ID values.
    """

    def __init__(
        self, cfg: MatchConfig, identifiers: Dict[str, Any] = None
    ) -> None:
        """
        The identifier values will be converted to strings, if they aren't
        already.

        Args:
            cfg:
                The config object.
            identifiers:
                Optional dictionary mapping identifier names to values.
        """
        super().__init__(cfg=cfg, is_plaintext=True, temporal=False)
        self.comparison_full_match = CertainComparison()

        self.identifiers = {}  # type: Dict[str, str]
        self.key_set = set()  # type: Set[str]
        if identifiers:
            self._set_identifiers(identifiers)

    def _set_identifiers(self, identifiers: Dict[str, str] = None) -> None:
        """
        Add the supplied key/value pairs (standardizing both key and value)
        to our identifiers, and refresh the key set.
        """
        identifiers = identifiers or {}
        for k, v in identifiers.items():
            self.identifiers[standardize_perfect_id_key(k)] = (
                standardize_perfect_id_value(v)
            )
        self.key_set = set(self.identifiers.keys())

    @classmethod
    def from_plaintext_str(cls, cfg: MatchConfig, x: str) -> "PerfectID":
        """
        Creation from a string of semicolon-separated ``key:value`` pairs.

        Raises:
            ValueError:
                if any pair does not contain exactly one colon.
        """
        d = {}  # type: Dict[str, str]
        pair_strings = x.split(";")
        for pair_str in pair_strings:
            if pair_str.count(":") != 1:
                raise ValueError(f"Bad PerfectID string {x!r}")
            k, v = pair_str.split(":")
            d[k] = v
        return PerfectID(cfg=cfg, identifiers=d)

    def __eq__(self, other: Identifier) -> bool:
        return super().__eq__(other) and self._eq_check(other, ["identifiers"])

    def plaintext_str_core(self) -> str:
        """
        String of semicolon-separated ``key:value`` pairs, i.e. the format
        that :meth:`from_plaintext_str` parses.
        """
        # Iterate over items(): iterating the dictionary itself yields only
        # the keys, which cannot be unpacked into (key, value).
        return ";".join(f"{k}:{v}" for k, v in self.identifiers.items())

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "PerfectID":
        """
        Creation from a dictionary of identifiers (ultimately from JSON); the
        result is marked as plaintext only if ``hashed`` is false.
        """
        p = PerfectID(cfg=cfg)
        p.is_plaintext = not hashed
        p._set_identifiers(d)
        return p

    def as_dict(
        self, encrypt: bool = True, include_frequencies: bool = True
    ) -> Dict[str, Any]:
        """
        For JSON: a new dictionary of our identifiers, with hashed values if
        we hold plaintext and ``encrypt`` is true. ``include_frequencies`` is
        not used.
        """
        if not self.is_plaintext or not encrypt:
            # Was already hashed, or staying plaintext. Return a copy, so
            # that callers cannot alter our internal state by accident.
            return dict(self.identifiers)
        hash_fn = self.cfg.hash_fn
        return {k: hash_fn(v) for k, v in self.identifiers.items()}

    def __bool__(self) -> bool:
        return bool(self.identifiers)

    def ensure_has_freq_info_if_id_present(self) -> None:
        """
        Nothing to check; no frequency information is stored here.
        """
        pass

    def fully_matches(self, other: "PerfectID") -> bool:
        """
        Do the two share at least one key for which the values are equal?
        """
        return any(
            self.identifiers[k] == other.identifiers[k]
            for k in self.key_set.intersection(other.key_set)
        )

    def comparison(self, candidate_id: "PerfectID") -> Optional[Comparison]:
        """
        Returns our (certain) full-match comparison if the two match, and
        ``None`` otherwise.
        """
        return (
            self.comparison_full_match
            if self.fully_matches(candidate_id)
            else None
        )
2023# =============================================================================
2024# DummyLetterTemporalIdentifier
2025# =============================================================================
2028class DummyLetterTemporalIdentifier(IdentifierTwoState):
2029 """
2030 Represents identifiers {A, B, ... Z}, each with probability 1/26, allowing
2031 exact matching only. For testing multiple comparison algorithms. Allows a
2032 temporal component.
2033 """
2035 Q = 1 / 26 # true
2036 P_ERROR = 0.01 # arbitrary
2037 KEY_VALUE = "value"
def __init__(
    self,
    value: str,
    cfg: Optional[MatchConfig] = None,
    temporal: bool = False,
    start_date: Union[str, Date] = None,
    end_date: Union[str, Date] = None,
) -> None:
    """
    Plaintext creation of a dummy identifier.

    Args:
        value:
            A single capital letter, "A" to "Z" (checked by assertion).
        cfg:
            The config object, if any; passed to the superclass.
        temporal:
            Passed to the superclass.
        start_date:
            Passed to the superclass.
        end_date:
            Passed to the superclass.
    """
    super().__init__(
        cfg=cfg,
        is_plaintext=True,
        temporal=temporal,
        start_date=start_date,
        end_date=end_date,
    )
    # For a one-character string, comparing the strings is the same as
    # comparing the characters' code points.
    assert (
        isinstance(value, str) and len(value) == 1 and "A" <= value <= "Z"
    )
    self.value = value
    self._set_comparisons()
2065 def _set_comparisons(self) -> None:
2066 p_e = self.P_ERROR
2067 p_f = self.Q
2068 self.comparison_full_match = DirectComparison(
2069 p_d_given_same_person=1 - p_e,
2070 p_d_given_diff_person=p_f,
2071 d_description=f"dummy_match:{self.value}",
2072 )
2073 self.comparison_no_match = DirectComparison(
2074 p_d_given_same_person=p_e,
2075 p_d_given_diff_person=1 - p_f,
2076 d_description=f"dummy_mismatch:{self.value}",
2077 )
2079 def __eq__(self, other: "Identifier") -> bool:
2080 return super().__eq__(other) and self._eq_check(other, ["value"])
2082 def plaintext_str_core(self) -> str:
2083 return self.value
2085 @classmethod
2086 def from_plaintext_str(
2087 cls, cfg: MatchConfig, x: str
2088 ) -> "DummyLetterTemporalIdentifier":
2089 value, start_date, end_date = cls._get_temporal_triplet(x)
2090 return DummyLetterTemporalIdentifier(
2091 cfg=cfg,
2092 value=x,
2093 start_date=start_date,
2094 end_date=end_date,
2095 temporal=True,
2096 )
2098 def as_dict(
2099 self, encrypt: bool = True, include_frequencies: bool = True
2100 ) -> Dict[str, Any]:
2101 """
2102 For JSON.
2103 """
2104 if self.is_plaintext and encrypt:
2105 value = self.cfg.hash_fn(self.value)
2106 else:
2107 # Was already hashed, or staying plaintext
2108 value = self.value
2109 d = {self.KEY_VALUE: value}
2110 self._write_dates_to_dict(d)
2111 return d
2113 @classmethod
2114 def from_dict(
2115 cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
2116 ) -> "DummyLetterTemporalIdentifier":
2117 i = DummyLetterTemporalIdentifier(cfg=cfg, value="A")
2118 # ... value is a dummy, overwritten
2119 i.is_plaintext = not hashed
2120 i.value = getdictval(d, cls.KEY_VALUE, str)
2121 i._set_comparisons()
2122 return i
2124 def __bool__(self) -> bool:
2125 return bool(self.value)
2127 def ensure_has_freq_info_if_id_present(self) -> None:
2128 pass
2130 def fully_matches(self, other: "DummyLetterIdentifier") -> bool:
2131 return self.value == other.value
2134# =============================================================================
2135# DummyLetterIdentifier
2136# =============================================================================
class DummyLetterIdentifier(DummyLetterTemporalIdentifier):
    """
    Non-temporal version of the dummy letter identifier: one of {A, B, ... Z},
    each with probability 1/26, and exact matching only. Used for testing
    multiple comparison algorithms.
    """

    def __init__(self, value: str, cfg: Optional[MatchConfig] = None) -> None:
        """
        Plaintext creation of a dummy identifier, with the temporal
        component switched off.

        Args:
            value:
                The letter, passed to the parent class.
            cfg:
                Matching configuration, passed to the parent class.
        """
        super().__init__(value=value, cfg=cfg, temporal=False)

    @classmethod
    def from_plaintext_str(
        cls, cfg: MatchConfig, x: str
    ) -> "DummyLetterIdentifier":
        """
        Create from a plaintext string, which is used directly as the value.
        """
        return DummyLetterIdentifier(value=x, cfg=cfg)

    @classmethod
    def from_dict(
        cls, cfg: MatchConfig, d: Dict[str, Any], hashed: bool
    ) -> "DummyLetterIdentifier":
        """
        Restore from a dictionary, reading the value from ``KEY_VALUE``.
        """
        # Create with a placeholder letter; the real value follows.
        identifier = DummyLetterIdentifier(cfg=cfg, value="A")
        identifier.is_plaintext = not hashed
        identifier.value = getdictval(d, cls.KEY_VALUE, str)
        # Rebuild the comparisons, whose descriptions include the value.
        identifier._set_comparisons()
        return identifier
2169# =============================================================================
2170# Comparison of multiple potentially jumbled similar identifiers
2171# =============================================================================
2173NOTES_MULTIPLE_COMPARISONS = """
2175What can be compared?
2176---------------------
2178Identifiers that are explicitly time-stamped cannot be compared with explicitly
2179non-overlapping identifiers. (But un-time-stamped identifiers can be compared
2180with anything.) And only information that is "present" is used for comparison.
2181These checks are implemented by each identifier in their `comparison` method.
2184What is a good match?
2185---------------------
2187Implicitly, we prefer full > partial > no match (and similarly for comparisons
2188with more or fewer than 3 options). But this is implemented more explicitly by
2189log likelihood ratio: we prefer higher values.
2192No re-use
2193---------
2195No identifier can be used for >1 comparison simultaneously. "Surplus"
2196identifiers therefore provide no evidence. For example, if candidate_identifiers
2197= [A, B, C] and proband_identifiers = [A, B], then C will be ignored (the
2198comparisons will likely be A/A, B/B). But [A, B, C] versus [A, B, D] will
2199likely lead to comparisons A/A, B/B, C/D.
2201Suppose our proband has n identifiers, and our candidate has m. Then we can
2202make c = min(n, m) comparisons.
2205Unordered comparisons: picking the best involves implicit comparison
2206--------------------------------------------------------------------
2208In unordered comparisons, if we pick the best, we have implicitly made many
2209more comparisons. We need to adjust for that.
2211To illustrate, suppose the population of all names is {A, B, ..., Z}, giving a
2212set of size s = 26, and that every name is equiprobable in the population with
2213frequency q = 1/s = 1/26.
2215PROBABILITY OF A POPULATION (RANDOM PERSON) MATCH FOR MULTIPLE IDENTIFIERS. If
2216we have a proband with names [A] and a candidate with a single name such as [A]
2217or [Z], then we will declare a match if the candidate is named [A] and P(D |
2218¬H) = P(match | randomly selected other person) = 1/26. If our candidate has
2219two unordered names, then we would declare a match regardless of whether the
2220candidate was [A, B] or [B, A], and so would declare a match with a random
2221candidate with probability 1/26 + 1/26 - 1/(26 ^ 2), or more generally 2/s -
22221/(s ^ 2) = 2q - q^2. The subtracted component is for a candidate named [A, A],
2223who would otherwise be counted twice for [A, *] and [*, A]. More generally, for
2224a proband with one name and a candidate with m names, the match probability is
22251 - (1 - q) ^ m. That is, the probability of no match for each is (1 - q), and
2226it takes m failures to match for an overall failure to match. By the Bonferroni
2227approximation or Boole's inequality [1], this is approximately (and never more
2228than) m * q. So mq is a slightly conservative correction for multiple
2229comparisons.
2231For a proband with n <= m names, we can work sequentially: the first proband
2232name is matched by the candidate with approximately P = m * q_1; then, having
2233used up one candidate name, the second proband name is matched by the candidate
2234with approximately P = (m - 1) * q_2, and so on.
2236If n > m, we simply stop the process.
2238No correction is required for P(D | H), since (ignoring identifier errors) the
2239probability of an unordered match given H is 1.
2241This does NOT apply to "non-match" comparisons, where we have not gone
2242"fishing" for the best order.
2244[1] https://en.wikipedia.org/wiki/Boole%27s_inequality
2247Implementing via the Bayesian log-odds system
2248---------------------------------------------
2250Using this approximation makes things straightforward. The posterior log odds
2251is the prior log odds plus the log likelihood ratio. The log likelihood ratio
2252(LLR) for a match is ln(p_c) - ln(match probability), where p_c is the
2253probability of a correct match given the hypothesis that the proband and
2254candidate are the same person.
2256So if we were using LLR = ln(p_c) - ln(q), but we actually wanted to multiply
2257the probability q by some factor f to give LLR = ln(p_c) - ln(fq), then since
2258ln(fq) = ln(f) + ln(q), we can simply add -ln(f) to the running total.
2260We can therefore keep track of f = m * (m - 1) * ..., as above, and add that as
2261a "dummy" comparison.
2264Asymmetry
2265---------
2267The method above implies asymmetry, in that the unordered comparisons
2269 - proband = [A]
2270 - candidate = [A, B]
2272or
2274 - proband = [A]
2275 - candidate = [B, A]
2277would be less likely than
2279 - proband = [A, B]
2280 - candidate = [A]
2282because the correction (which increases the probability of a population match
2283by chance and therefore decreases the chance of the proband/candidate being the
2284same) relates to the number of candidate identifiers available.
2286This is probably fine and is a defence against a "cuckoo" candidate (cf.
2287"keyword stuffing" on web sites for search engines). For example, in our A-Z
2288situation, a candidate called [A, B, C, ..., X, Y, Z] is "trying" to be a good
2289match for everyone and perhaps shouldn't get the same probability of matching
2290[A] as a candidate simply named [A].
2292Note that there are other asymmetries already, though less obvious ones; for
2293example, using a very common surname and a rarer early example from the US name
2294database:
2296 - proband = Alice SMITH, same gender/DOB/postcode
2297 - candidate = Alice ABADILLA, same gender/DOB/postcode
2299 ... surname P(D|¬H) = 0.987 = P(no match | candidate not proband)
2300 ... log odds 12.455
2302 - proband = Alice ABADILLA, same gender/DOB/postcode
2303 - candidate = Alice SMITH, same gender/DOB/postcode
2305 ... surname P(D|¬H) = 0.996 = P(no match | candidate not proband)
2306 ... log odds 12.447
2308... because it's rarer for a randomly selected candidate to match ABADILLA than
2309SMITH, so P(D | ¬H) for a no-match is higher for proband ABADILLA, and that
2310provides slightly less evidence for a match when ABADILLA is the proband.
2312We use this unordered comparison for postcodes and surnames. So this multiple
2313comparisons correction is equivalent to saying "be a little bit more careful
2314about declaring a match against people with multiple postcodes and multiple
2315surnames, because they have a higher chance of appearing to match other people
2316at random".
2319Ordered comparisons
2320-------------------
2322Consider a proband such as [A, B, C] (n = 3) and a candidate such as [A, B] (m
2323= 2), where we wish to use the information that an ordered match is superior to
2324an unordered match. A simple way is as follows.
2326- Establish the "best" set of comparisons (highest LLR) following our standard
2327 rules. (In this case, that would be A/A, B/B, for c = 2.)
2329- Establish if that best match was strictly ordered. There should only be one
2330 way (for this method) that is defined as "strictly ordered", and we will
2331 define this as that the indexes of the comparisons, 1 ... c, exactly match
2332 the contributing indices of the proband (1 ... n) and the candidate (1 ...
2333 m). That is: strict order, no gaps.
2335- For a first draft, declare a probability p_o, the probability that if the
2336 proband/candidate are the same (H is true), the identifiers are correct and
2337 in same strict order, and a probability p_u that they are correct but
2338 unordered (not in strict order), and a probability p_e that they are wrong,
2339 such that p_o + p_u + p_e = 1.
2341 Then if there is an ordered match,
2343 - P(D | H) = p_o
2344 - P(D | ¬H) = P(random ordered match)
2346 and if there is an unordered match,
2348 - P(D | H) = p_u
2349 - P(D | ¬H) = P(random unordered match) - P(random ordered match)
2351 and if no match,
2353 - P(D | H) = p_e
2354 - P(D | ¬H) = 1 - P(random unordered match)
2356- Then, to superimpose that on identifier comparisons that are themselves
2357 fuzzy, we note that much of those (e.g. p_e) are already dealt with. So
2358 if we restrict p_o and p_u to situations where there is a match (full or
2359 partial) involving two or more identifiers, and we continue to use the
2360 Bonferroni correction, it becomes straightforward.
2362"""
class ComparisonInfo:
    """
    A comparison together with the positions (within their respective lists)
    of the proband and candidate identifiers that produced it. Used by
    :func:`gen_best_comparisons`.
    """

    def __init__(
        self, proband_idx: int, candidate_idx: int, comparison: Comparison
    ) -> None:
        """
        Args:
            proband_idx:
                Index of the proband's identifier.
            candidate_idx:
                Index of the candidate's identifier.
            comparison:
                The comparison between those two identifiers.
        """
        self.comparison = comparison
        self.proband_idx = proband_idx
        self.candidate_idx = candidate_idx

        # Cached so that the sort key (see sort_asc_best_to_worst) is cheap:
        index_gap = proband_idx - candidate_idx
        self._distance = index_gap * index_gap
        self.log_likelihood_ratio = comparison.log_likelihood_ratio

    @staticmethod
    def sort_asc_best_to_worst(x: "ComparisonInfo") -> Tuple[float, int]:
        """
        Sort key giving best-to-worst order under a normal ASCENDING sort
        (reverse=False).

        - First element: the negated log likelihood ratio. Higher log
          likelihood ratios are better, so they must sort first, i.e. have
          the lower key.

        - Second element (tie-breaker for equal log likelihood ratios): the
          squared distance between the proband and candidate indexes.
          Identical indexes (distance 0) are preferred, so that comparing
          Alice Alice SMITH to Alice Alice SMITH on first names picks index
          pairs (1, 1) and (2, 2) rather than (1, 2) and (2, 1).
        """
        negative_llr = -x.log_likelihood_ratio
        return negative_llr, x._distance
def gen_best_comparisons(
    proband_identifiers: List[Identifier],
    candidate_identifiers: List[Identifier],
    ordered: bool = False,
    p_u: Optional[float] = None,
) -> Generator[Comparison, None, None]:
    """
    Generates comparisons for two sequences of identifiers (one from the
    proband, one from the candidate), picking the best pairings regardless of
    position, and then (optionally) taking account of their order. The
    method -- which needs to be fast -- is as described above in
    NOTES_MULTIPLE_COMPARISONS.

    Args:

        proband_identifiers:
            List of identifiers from the proband.
        candidate_identifiers:
            List of comparable identifiers from the candidate.
        ordered:
            Treat the comparison as an ordered one?
        p_u:
            (Applicable, and required, if ordered is True.) The probability
            of being "unordered", and the complement of p_o, where p_o is the
            probability, given the hypothesis H (proband and candidate are
            the same person) and that c > 1 identifiers are being compared,
            that the candidate identifiers will be in exactly the right order
            (that is, for all matches, the index of the candidate's
            identifier is the same as the index of the proband's identifier).

    Yields:
        The chosen pairwise comparisons, best first, with each identifier
        used at most once; then, if applicable, a single
        :class:`AdjustLogOddsComparison` correcting for order and/or for
        multiple comparisons.

    Raises:
        TypeError:
            If ``ordered`` is True but ``p_u`` was not supplied. As this is
            a generator, the error is raised during iteration, once the
            pairwise comparisons have been yielded (and not at all if there
            were no comparisons to make).
    """
    # Compare all pairs.
    ci_list: List[ComparisonInfo] = []
    for p_idx, proband_id in enumerate(proband_identifiers):
        for c_idx, candidate_id in enumerate(candidate_identifiers):
            comparison = proband_id.comparison(candidate_id)
            if comparison is None:
                # This will happen if either is missing information, or if the
                # identifiers explicitly do not overlap temporally.
                continue
            ci_list.append(
                ComparisonInfo(
                    proband_idx=p_idx,
                    candidate_idx=c_idx,
                    comparison=comparison,
                )
            )
    if not ci_list:
        # No comparisons. Abort before we do something silly with a correction
        # procedure.
        return

    # Iterate through comparisons in descending order of log likelihood ratio,
    # i.e. best to worst. See ComparisonInfo.sort_asc_best_to_worst().
    ci_list.sort(key=ComparisonInfo.sort_asc_best_to_worst)
    candidate_indexes_used: Set[int] = set()
    proband_indexes_used: Set[int] = set()
    n_candidates_available = n_candidates = len(candidate_identifiers)
    n_positive = 0
    n_implicit_comparisons = 1
    # ... at least one (because ci_list is not empty). This is a multiplicative
    # value; we will multiply it by the number of available candidates used for
    # each comparison.
    correct_order = True
    for ci in ci_list:
        if (
            ci.proband_idx in proband_indexes_used
            or ci.candidate_idx in candidate_indexes_used
        ):
            # Each identifier can be used as part of only one comparison.
            continue
        yield ci.comparison
        if ci.log_likelihood_ratio > 0:
            # This was some form of match, so we apply our correction.
            n_implicit_comparisons *= n_candidates_available
            n_positive += 1
            if ordered and ci.proband_idx != ci.candidate_idx:
                # Note that the index of ci itself is irrelevant; that will
                # vary depending on the frequency of the identifiers, e.g. John
                # Zachariah versus Zachariah John.
                correct_order = False
        # Whether or not it was a positive match, it was a comparison; we have
        # "used up" the identifiers being compared, and have one fewer
        # candidate available.
        candidate_indexes_used.add(ci.candidate_idx)
        proband_indexes_used.add(ci.proband_idx)
        n_candidates_available -= 1

    # Any corrections required.
    if ordered:
        # Ordered comparison requested.
        # - To follow this, look at the simpler "unordered" alternative first.
        # - Action only required if there is an ordering to be considered.
        if p_u is None:
            # Fail with an explicit message, rather than with the opaque
            # "unsupported operand type(s)" error that "1 - None" gives.
            raise TypeError(
                "gen_best_comparisons: p_u must be specified if ordered is "
                "True"
            )
        p_o = 1 - p_u  # p_o ordered, p_u unordered
        if n_positive > 0 and n_candidates > 1:
            # There was a "hit", and there was a choice of candidate
            # identifiers, so there is an order to think about. ASSUMING unique
            # identifiers (within proband, within candidate):
            if correct_order:
                # - Adjust P(D | H) by p_o.
                # - No adjustment to P(D | ¬H) required.
                yield AdjustLogOddsComparison(
                    log_odds_delta=ln(p_o),
                    description=(
                        f"order match: adjust P(D|H) by "
                        f"P(correct order) = {p_o}"
                    ),
                )
            else:
                # - Adjust P(D | H) by p_u = 1 - p_o.
                # - Adjust P(D | ¬H) by the number of unordered possibilities
                #   considered (n_implicit_comparisons), minus the one (the
                #   correctly ordered option) that by definition we are not
                #   considering here. This uses a Bonferroni approximation, as
                #   above.
                n_unordered_possibilities = n_implicit_comparisons - 1
                description = (
                    f"order mismatch: "
                    f"adjust P(D|H) by P(incorrect order) = {p_u}"
                )
                if n_unordered_possibilities > 1:
                    description += (
                        f", and P(D|¬H) for {n_positive} hits from "
                        f"{n_unordered_possibilities} comparisons"
                    )
                yield AdjustLogOddsComparison(
                    log_odds_delta=ln(p_u) - ln(n_unordered_possibilities),
                    description=description,
                )

    else:
        # Unordered comparison requested.
        # - No adjustment is required to P(D | H). See paper.
        # - If n_implicit_comparisons is 1, that isn't multiple comparisons,
        #   so no further adjustment is required. We could still use this
        #   process, which would add -ln(1) = 0, but it would do nothing and be
        #   a waste of time.
        # - But if n_implicit_comparisons > 1, then we adjust P(D | ¬H),
        #   using the Bonferroni correction. See paper for working.
        if n_implicit_comparisons > 1:
            # - Correct P(D | ¬H) for the fact that we would have considered
            #   any order acceptable, and we made multiple comparisons to pick
            #   the best. This uses a Bonferroni approximation, as above.
            #   We add a negative log odds value. See paper for detail.
            yield AdjustLogOddsComparison(
                log_odds_delta=-ln(n_implicit_comparisons),
                description=(
                    f"unordered: adjust P(D|¬H) for {n_positive} "
                    f"hits from {n_implicit_comparisons} comparisons"
                ),
            )
2550 )