Coverage for linkage/matchconfig.py: 86%
188 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1r"""
2crate_anon/linkage/matchconfig.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Helper functions for linkage tools.**
28"""
30# =============================================================================
31# Imports
32# =============================================================================
34import logging
35from typing import Any, Dict, NoReturn, Optional, Set, Tuple, Union
37from cardinal_pythonlib.hash import make_hasher
38from cardinal_pythonlib.maths_py import round_sf, normal_round_int
39from cardinal_pythonlib.probability import log_odds_from_1_in_n
40from cardinal_pythonlib.reprfunc import auto_repr
42from crate_anon.linkage.constants import (
43 DAYS_PER_MONTH,
44 DAYS_PER_YEAR,
45 FuzzyDefaults,
46 GENDER_FEMALE,
47 GENDER_MALE,
48 GENDER_MISSING,
49 GENDER_OTHER,
50 MONTHS_PER_YEAR,
51 Switches,
52 UK_POPULATION_2017,
53 VALID_GENDERS,
54)
55from crate_anon.linkage.frequencies import (
56 BasicNameFreqInfo,
57 NameFrequencyInfo,
58 PostcodeFrequencyInfo,
59)
60from crate_anon.linkage.helpers import (
61 dict_from_str,
62 safe_upper,
63 standardize_name,
64 standardize_perfect_id_key,
65 standardize_perfect_id_value,
66)
68log = logging.getLogger(__name__)
71# =============================================================================
72# Main configuration class, supporting frequency-based probability calculations
73# =============================================================================
class MatchConfig:
    """
    Master config class. It's more convenient to pass one of these round than
    lots of its components.

    On construction, the object validates its arguments, builds the hash
    functions, creates (or accepts) the name/postcode frequency objects, and
    precalculates a number of derived probabilities that are stored as
    attributes.

    Default arguments are there for testing.
    """

    def __init__(
        self,
        hash_key: str = FuzzyDefaults.HASH_KEY,
        hash_method: str = FuzzyDefaults.HASH_METHOD,
        rounding_sf: Optional[int] = FuzzyDefaults.ROUNDING_SF,
        local_id_hash_key: Optional[str] = None,
        population_size: int = FuzzyDefaults.POPULATION_SIZE,
        forename_sex_csv_filename: str = FuzzyDefaults.FORENAME_SEX_FREQ_CSV,
        forename_cache_filename: str = FuzzyDefaults.FORENAME_CACHE_FILENAME,
        forename_freq_info: Optional[NameFrequencyInfo] = None,
        forename_min_frequency: float = FuzzyDefaults.FORENAME_MIN_FREQ,
        surname_csv_filename: str = FuzzyDefaults.SURNAME_FREQ_CSV,
        surname_cache_filename: str = FuzzyDefaults.SURNAME_CACHE_FILENAME,
        surname_freq_info: Optional[NameFrequencyInfo] = None,
        surname_min_frequency: float = FuzzyDefaults.SURNAME_MIN_FREQ,
        accent_transliterations_csv: str = (
            FuzzyDefaults.ACCENT_TRANSLITERATIONS_SLASH_CSV
        ),
        nonspecific_name_components_csv: str = (
            FuzzyDefaults.NONSPECIFIC_NAME_COMPONENTS_CSV
        ),
        birth_year_pseudo_range: float = FuzzyDefaults.BIRTH_YEAR_PSEUDO_RANGE,
        p_not_male_or_female: float = FuzzyDefaults.P_NOT_MALE_OR_FEMALE,
        p_female_given_male_or_female: float = (
            FuzzyDefaults.P_FEMALE_GIVEN_MALE_OR_FEMALE
        ),
        postcode_csv_filename: str = FuzzyDefaults.POSTCODES_CSV,
        postcode_cache_filename: str = FuzzyDefaults.POSTCODE_CACHE_FILENAME,
        postcode_freq_info: Optional[PostcodeFrequencyInfo] = None,
        k_postcode: Optional[float] = FuzzyDefaults.K_POSTCODE,
        p_unknown_or_pseudo_postcode: float = (
            FuzzyDefaults.P_UNKNOWN_OR_PSEUDO_POSTCODE
        ),
        k_pseudopostcode: float = FuzzyDefaults.K_PSEUDOPOSTCODE,
        p_ep1_forename: str = FuzzyDefaults.P_EP1_FORENAME_CSV,
        p_ep2np1_forename: str = FuzzyDefaults.P_EP2NP1_FORENAME_CSV,
        p_u_forename: float = FuzzyDefaults.P_U_FORENAME,
        p_en_forename: str = FuzzyDefaults.P_EN_FORENAME_CSV,
        p_ep1_surname: str = FuzzyDefaults.P_EP1_SURNAME_CSV,
        p_ep2np1_surname: str = FuzzyDefaults.P_EP2NP1_SURNAME_CSV,
        p_en_surname: str = FuzzyDefaults.P_EN_SURNAME_CSV,
        p_ep_dob: float = FuzzyDefaults.P_EP_DOB,
        p_en_dob: float = FuzzyDefaults.P_EN_DOB,
        p_e_gender: float = FuzzyDefaults.P_E_GENDER,
        p_ep_postcode: float = FuzzyDefaults.P_EP_POSTCODE,
        p_en_postcode: float = FuzzyDefaults.P_EN_POSTCODE,
        min_log_odds_for_match: float = FuzzyDefaults.MIN_LOG_ODDS_FOR_MATCH,
        exceeds_next_best_log_odds: float = (
            FuzzyDefaults.EXCEEDS_NEXT_BEST_LOG_ODDS
        ),
        perfect_id_translation: Union[
            Dict[str, str], str
        ] = FuzzyDefaults.PERFECT_ID_TRANSLATION,
        extra_validation_output: bool = False,
        check_comparison_order: bool = FuzzyDefaults.CHECK_COMPARISON_ORDER,
        report_every: int = FuzzyDefaults.REPORT_EVERY,
        min_probands_for_parallel: int = (
            FuzzyDefaults.MIN_PROBANDS_FOR_PARALLEL
        ),
        n_workers: int = FuzzyDefaults.N_PROCESSES,
        verbose: bool = False,
    ) -> None:
        """
        Args:
            hash_key:
                Key (passphrase) for hasher.
            hash_method:
                Method to use for hashing.
            rounding_sf:
                Number of significant figures to use when rounding frequency
                information in hashed copies. Use ``None`` for no rounding.
            local_id_hash_key:
                If specified, then for hash operations, the local_id values
                will also be hashed, using this key. If not specified, local
                IDs are merely converted to strings.

            population_size:
                The size of the entire population (not our sample). See
                docstrings above.

            forename_sex_csv_filename:
                Forename frequencies. CSV file, with no header, of "name,
                frequency" pairs.
            forename_cache_filename:
                File in which to cache forename information for faster loading.
            forename_freq_info:
                Debugging option: overrides forename_sex_csv_filename by
                providing a NameFrequencyInfo object directly.
            forename_min_frequency:
                Minimum frequency for forenames.

            surname_csv_filename:
                Surname frequencies. CSV file, with no header, of "name,
                frequency" pairs.
            surname_cache_filename:
                File in which to cache surname information for faster loading.
            surname_freq_info:
                Debugging option: overrides surname_csv_filename by
                providing a NameFrequencyInfo object directly.
            surname_min_frequency:
                Minimum frequency for surnames.
            accent_transliterations_csv:
                Accent transliteration map. String of the form "Ä/AE,Ö/OE" --
                comma-separated pairs, with slashes separating each pair.
            nonspecific_name_components_csv:
                CSV-separated list of nonspecific name components (e.g.
                nobiliary particles), which will be avoided as equivalent name
                fragments.

            birth_year_pseudo_range:
                b, such that P(two people share a DOB) = 1/(365.25 * b).
                Must be >= 1.

            p_not_male_or_female:
                Probability that a person in the population has gender 'X'.
            p_female_given_male_or_female:
                Probability that a person in the population is female, given
                that they are either male or female.

            postcode_csv_filename:
                Postcode mapping. CSV (or ZIP) file. Special format; see
                :class:`PostcodeFrequencyInfo`.
            postcode_cache_filename:
                File in which to cache postcode information for faster loading.
            postcode_freq_info:
                Debugging option: overrides postcode_csv_filename by
                providing a PostcodeFrequencyInfo object directly.
            k_postcode:
                Multiple applied to postcode unit/sector frequencies, such that
                p_f_postcode = k_postcode * f_f_postcode and p_p_postcode =
                k_postcode * f_p_postcode. If None, defaults to
                UK_POPULATION_2017 / population_size, appropriate if the
                population under consideration is geographically constrained
                (rather than sampled from across the UK).
            p_unknown_or_pseudo_postcode:
                Probability that a random person will have a pseudo-postcode,
                e.g. ZZ99 3VZ (no fixed abode) or a postcode not known to our
                database. Specifically, P(each pseudopostcode or unknown
                postcode unit | ¬H).
            k_pseudopostcode:
                Probability multiple: P(pseudopostcode sector or unknown
                postcode sector match | ¬H) = k_pseudopostcode *
                p_unknown_or_pseudo_postcode. Must strictly be >=1 and we
                enforce >1; see paper.

            p_ep1_forename:
                Error probability that a forename fails a full match but passes
                a partial 1 (metaphone) match. [GPD]
            p_ep2np1_forename:
                Error probability that a forename fails a full match and a
                partial 1 match but passes a partial 2 (F2C) match. [GPD]
            p_u_forename:
                A forename-related probability, validated to lie in [0, 1]
                and stored as ``self.p_u_forename``. (NOTE(review): its
                precise meaning is not established in this file; see the
                help for ``Switches.P_U_FORENAME``.)
            p_en_forename:
                Error probability that a forename yields no match at all. [GPD]
            p_ep1_surname:
                Error probability that a surname fails a full match but passes
                a partial 1 (metaphone) match. [GPD]
            p_ep2np1_surname:
                Error probability that a surname fails a full match and a
                partial 1 match but passes a partial 2 (F2C) match. [GPD]
            p_en_surname:
                Error probability that a surname yields no match at all. [GPD]
            p_ep_dob:
                Error probability that a DOB fails a full (YMD) match but
                passes a partial (YM, MD, or YD) match.
            p_en_dob:
                Error probability that a DOB produces no match at all.
            p_e_gender:
                Error probability of no gender match.
            p_ep_postcode:
                Probability that a postcode fails a full (unit) match but
                passes a partial (sector) match (due to error or a move within
                a sector).
            p_en_postcode:
                Probability that a postcode gives no match at all.
            min_log_odds_for_match:
                minimum log odds of a match, to consider two people a match
            exceeds_next_best_log_odds:
                In a multi-person comparison, the log odds of the best match
                must exceed those of the next-best match by this much for the
                best to be considered a unique winner.
            perfect_id_translation:
                Optional dictionary (or its string representation, or
                ``None``) mapping the perfect ID names in the proband
                to the equivalents in the sample, e.g. {"nhsnum": "nhsnumber"}.

            extra_validation_output:
                Add extra columns to the output for validation purposes?
            check_comparison_order:
                Check that comparisons follow the general rule "no match ≤
                partial(s) ≤ full" and warn if not.
            report_every:
                Report progress every n probands.
            min_probands_for_parallel:
                Minimum number of probands for which we will bother to use
                parallel processing.
            n_workers:
                Number of parallel processes to use, if parallel processing
                is used.
            verbose:
                Be verbose on creation?

        Raises:
            ValueError:
                if an argument fails validation.
            AssertionError:
                if derived probabilities fall outside [0, 1].

        - [GPD] In ``{gender:p, ...}`` dict-as-string format.

        - F2C = First two characters.
        """
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Input validation
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        def raise_bad(x_: Any, name_: str) -> NoReturn:
            """
            Raise an informative ValueError.
            """
            raise ValueError(f"Bad {name_}: {x_!r}")

        def check_prob(
            p_: float, name_: str, not_certain: bool = False
        ) -> float:
            """
            Ensure that something is a probability, and return it.

            If ``not_certain`` is set, the value must lie in the open interval
            (0, 1); otherwise, in the closed interval [0, 1].
            """
            if not_certain:
                if not 0 < p_ < 1:
                    raise_bad(p_, name_ + " [must be in range (0, 1)]")
            else:
                if not 0 <= p_ <= 1:
                    raise_bad(p_, name_)
            return p_

        def mk_gender_p_dict(csv_: str, name_: str) -> Dict[str, float]:
            """
            Transform a comma-separated list of ``gender:p`` values into
            a corresponding dictionary, and fill in the blanks.

            Female and male values are compulsory. Values for "other" and
            "missing" genders, if absent, default to the mean of the female
            and male values, weighted by P(female | male or female) and
            P(male | male or female). This therefore relies on
            ``self.p_female_given_m_or_f`` and ``self.p_male_given_m_or_f``
            having been set already.
            """
            d = {}  # type: Dict[str, float]
            for gender_p_str in csv_.split(","):
                g_p_components = gender_p_str.split(":")
                if len(g_p_components) != 2:
                    raise ValueError(f"Bad {name_}: {csv_!r}")
                g = g_p_components[0].strip()
                try:
                    p = check_prob(float(g_p_components[1].strip()), name_)
                except (ValueError, TypeError) as err:
                    # Covers both a non-numeric string and an out-of-range
                    # probability; keep the underlying cause for debugging.
                    raise ValueError(
                        f"Bad probability in {name_}: {csv_!r}"
                    ) from err
                d[g] = p
            if GENDER_FEMALE not in d:
                raise ValueError(
                    f"Gender {GENDER_FEMALE} not specified in {name_}"
                )
            if GENDER_MALE not in d:
                raise ValueError(
                    f"Gender {GENDER_MALE} not specified in {name_}"
                )
            weighted_mean_m_f = (
                self.p_female_given_m_or_f * d[GENDER_FEMALE]
                + self.p_male_given_m_or_f * d[GENDER_MALE]
            )
            d.setdefault(GENDER_OTHER, weighted_mean_m_f)
            d.setdefault(GENDER_MISSING, weighted_mean_m_f)
            # Anything left over that isn't a valid gender is an error:
            if set(d.keys()) != set(VALID_GENDERS):
                raise ValueError(
                    f"Missing or bad genders in {name_}: {csv_!r} -- genders "
                    f"should be {VALID_GENDERS}"
                )
            return d

        def mk_p_c_dict(
            p_ep1_: Dict[str, float],
            p_ep2np1_: Dict[str, float],
            p_en_: Dict[str, float],
        ) -> Dict[str, float]:
            """
            Calculates p_c = 1 - p_ep1 - p_ep2np1 - p_en, for each gender.
            """
            d = {}  # type: Dict[str, float]
            for g in VALID_GENDERS:
                p_c_ = 1 - p_ep1_[g] - p_ep2np1_[g] - p_en_[g]
                assert 0 <= p_c_ <= 1
                d[g] = p_c_
            return d

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Basic creation
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        if verbose:
            log.debug("Building MatchConfig...")

        # Hash information

        self.hash_fn = make_hasher(hash_method=hash_method, key=hash_key).hash
        if not (rounding_sf is None or 1 <= rounding_sf):
            raise_bad(rounding_sf, Switches.ROUNDING_SF)
        self.rounding_sf = rounding_sf
        if local_id_hash_key:
            self.local_id_hash_fn = make_hasher(
                hash_method=hash_method, key=local_id_hash_key
            ).hash
        else:
            # Convert to string if necessary; otherwise, an identity function:
            self.local_id_hash_fn = str

        # Overall population

        if not (population_size > 0):
            raise_bad(population_size, Switches.POPULATION_SIZE)
        self.population_size = population_size
        # Precalculate this, for access speed:
        self.baseline_log_odds_same_person = log_odds_from_1_in_n(
            self.population_size
        )

        # Name handling: generic

        accent_dict = {}  # type: Dict[str, str]
        for accent_pair in accent_transliterations_csv.split(","):
            accent_components = accent_pair.split("/")
            if len(accent_components) != 2:
                raise ValueError(
                    f"Bad accent_transliterations_csv: "
                    f"{accent_transliterations_csv!r}"
                )
            accented = safe_upper(accent_components[0].strip())
            plain = safe_upper(accent_components[1].strip())
            # str.maketrans() with a dict requires single-character keys:
            if len(accented) != 1:
                raise ValueError(
                    f"Bad accent_transliterations_csv: "
                    f"{accent_transliterations_csv!r} -- contains accented "
                    f"character {accented!r}, which should be of length 1"
                )
            accent_dict[accented] = plain
        self.accent_transliterations = str.maketrans(accent_dict)
        self.nonspecific_name_components = set()  # type: Set[str]
        for nonspec in nonspecific_name_components_csv.split(","):
            self.nonspecific_name_components.add(nonspec.strip().upper())

        # Name handling: forenames

        self.forename_freq_info = forename_freq_info or NameFrequencyInfo(
            csv_filename=forename_sex_csv_filename,
            cache_filename=forename_cache_filename,
            min_frequency=check_prob(
                forename_min_frequency, Switches.FORENAME_MIN_FREQUENCY
            ),
            by_gender=True,
        )
        if not isinstance(self.forename_freq_info, NameFrequencyInfo):
            raise ValueError("Bad forename_freq_info")

        # Name handling: surnames

        self.surname_freq_info = surname_freq_info or NameFrequencyInfo(
            csv_filename=surname_csv_filename,
            cache_filename=surname_cache_filename,
            min_frequency=check_prob(
                surname_min_frequency, Switches.SURNAME_MIN_FREQUENCY
            ),
            by_gender=False,
        )
        if not isinstance(self.surname_freq_info, NameFrequencyInfo):
            raise ValueError("Bad surname_freq_info")

        # Population frequencies: DOB

        self.birth_year_pseudo_range = birth_year_pseudo_range
        if not (birth_year_pseudo_range >= 1):
            raise_bad(
                birth_year_pseudo_range, Switches.BIRTH_YEAR_PSEUDO_RANGE
            )

        # Population frequencies: sex/gender

        # ... Check this before using mk_gender_p_dict:
        self.p_female_given_m_or_f = check_prob(
            p_female_given_male_or_female,
            Switches.P_FEMALE_GIVEN_MALE_OR_FEMALE,
        )
        self.p_male_given_m_or_f = 1 - self.p_female_given_m_or_f
        self.p_not_male_or_female = check_prob(
            p_not_male_or_female, Switches.P_NOT_MALE_OR_FEMALE
        )
        p_male_or_female = 1 - p_not_male_or_female
        self.p_female = p_female_given_male_or_female * p_male_or_female
        self.p_male = p_male_or_female - self.p_female

        # Population frequencies: postcode

        self.postcode_freq_info = postcode_freq_info or PostcodeFrequencyInfo(
            csv_filename=postcode_csv_filename,
            cache_filename=postcode_cache_filename,
        )
        if not isinstance(self.postcode_freq_info, PostcodeFrequencyInfo):
            raise ValueError("Bad postcode_freq_info")
        self.p_unknown_or_pseudo_postcode_unit = check_prob(
            p_unknown_or_pseudo_postcode,
            Switches.P_UNKNOWN_OR_PSEUDO_POSTCODE,
            not_certain=True,
        )
        if k_pseudopostcode <= 1:
            raise ValueError(f"Bad {Switches.K_PSEUDOPOSTCODE}: must be >1")
        self.k_pseudopostcode = k_pseudopostcode
        self.p_unknown_or_pseudo_postcode_sector = check_prob(
            k_pseudopostcode * p_unknown_or_pseudo_postcode,
            f"P(unknown postcode or pseudopostcode sector | ¬H) = "
            f"{Switches.K_PSEUDOPOSTCODE} * "
            f"{Switches.P_UNKNOWN_OR_PSEUDO_POSTCODE}",
            not_certain=True,
        )
        self.k_postcode = (
            UK_POPULATION_2017 / self.population_size
            if k_postcode is None
            else k_postcode
        )
        self.p_known_postcode = 1 - self.p_unknown_or_pseudo_postcode_sector

        # Error probabilities: forenames

        self.p_ep1_forename = mk_gender_p_dict(
            p_ep1_forename, Switches.P_EP1_FORENAME
        )
        self.p_ep2np1_forename = mk_gender_p_dict(
            p_ep2np1_forename, Switches.P_EP2NP1_FORENAME
        )
        self.p_en_forename = mk_gender_p_dict(
            p_en_forename, Switches.P_EN_FORENAME
        )
        self.p_c_forename = mk_p_c_dict(
            p_ep1_=self.p_ep1_forename,
            p_ep2np1_=self.p_ep2np1_forename,
            p_en_=self.p_en_forename,
        )
        self.p_u_forename = check_prob(p_u_forename, Switches.P_U_FORENAME)

        # Error probabilities: surnames

        self.p_ep1_surname = mk_gender_p_dict(
            p_ep1_surname, Switches.P_EP1_SURNAME
        )
        self.p_ep2np1_surname = mk_gender_p_dict(
            p_ep2np1_surname, Switches.P_EP2NP1_SURNAME
        )
        self.p_en_surname = mk_gender_p_dict(
            p_en_surname, Switches.P_EN_SURNAME
        )
        self.p_c_surname = mk_p_c_dict(
            p_ep1_=self.p_ep1_surname,
            p_ep2np1_=self.p_ep2np1_surname,
            p_en_=self.p_en_surname,
        )

        # Error probabilities: DOB

        self.p_ep_dob = check_prob(p_ep_dob, Switches.P_EP_DOB)
        self.p_en_dob = check_prob(p_en_dob, Switches.P_EN_DOB)

        # Error probabilities: gender

        self.p_e_gender_error = check_prob(
            p_e_gender,
            Switches.P_E_GENDER,
        )

        # Error probabilities: postcode

        self.p_ep_postcode = check_prob(p_ep_postcode, Switches.P_EP_POSTCODE)
        self.p_en_postcode = check_prob(p_en_postcode, Switches.P_EN_POSTCODE)

        # Matching rules

        self.min_log_odds_for_match = min_log_odds_for_match
        self.exceeds_next_best_log_odds = exceeds_next_best_log_odds
        if perfect_id_translation is None:
            perfect_id_xlate_raw = {}
        elif isinstance(perfect_id_translation, dict):
            perfect_id_xlate_raw = perfect_id_translation
        elif isinstance(perfect_id_translation, str):
            perfect_id_xlate_raw = dict_from_str(perfect_id_translation)
        else:
            raise ValueError(
                f"Bad perfect_id_translation: {perfect_id_translation!r}"
            )
        # Iterate over (key, value) pairs. Iterating over .values() would
        # try to unpack each value string into (k, v) and fail (or silently
        # mis-map) for any non-empty translation dictionary.
        self.perfect_id_translation = {
            standardize_perfect_id_key(k): standardize_perfect_id_value(v)
            for k, v in perfect_id_xlate_raw.items()
        }
        if self.perfect_id_translation:
            log.info(
                f"Using proband-to-sample perfect ID translation: "
                f"{self.perfect_id_translation}"
            )

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Some derived frequencies
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        # DOB:

        self.p_c_dob = 1 - self.p_ep_dob - self.p_en_dob
        assert 0 <= self.p_c_dob <= 1
        # These ignore the specialness of 29 February:
        self.p_f_dob = 1 / (DAYS_PER_YEAR * birth_year_pseudo_range)
        p_share_dob_md_not_ymd = (1 / DAYS_PER_YEAR) - self.p_f_dob
        p_share_dob_yd_not_ymd = (
            1 / (DAYS_PER_MONTH * birth_year_pseudo_range)
        ) - self.p_f_dob
        p_share_dob_ym_not_ymd = (
            1 / (MONTHS_PER_YEAR * birth_year_pseudo_range)
        ) - self.p_f_dob
        # These three are mutually exclusive possibilities (e.g. you can't
        # share YM and MD without sharing YMD), so we can just sum:
        self.p_pnf_dob = (
            p_share_dob_md_not_ymd
            + p_share_dob_yd_not_ymd
            + p_share_dob_ym_not_ymd
        )
        self.p_n_dob = 1 - self.p_f_dob - self.p_pnf_dob
        assert 0 <= self.p_f_dob <= 1
        assert 0 <= p_share_dob_md_not_ymd <= 1
        assert 0 <= p_share_dob_yd_not_ymd <= 1
        assert 0 <= p_share_dob_ym_not_ymd <= 1
        assert 0 <= self.p_pnf_dob <= 1
        assert 0 <= self.p_n_dob <= 1

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Technical
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        self.extra_validation_output = extra_validation_output
        self.check_comparison_order = check_comparison_order
        self.report_every = report_every
        self.min_probands_for_parallel = min_probands_for_parallel
        self.n_workers = n_workers

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Reporting
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        self.partial_dob_mismatch_allowed = self.p_c_dob < 1
        self.complete_dob_mismatch_allowed = self.p_en_dob > 0
        if self.complete_dob_mismatch_allowed:
            potential_speedup_factor = round_sf(
                normal_round_int(1 / (1 - self.p_n_dob)),
                n=3,
            )
            log.warning(
                f"You are allowing a person's DOB to be completely different, "
                f"with p = {self.p_en_dob}. That is valid but much less "
                f"efficient computationally (by an estimated factor of about "
                f"{potential_speedup_factor})."
            )
        # Explanatory notes only (an unused string, not executed as code):
        _ = """
        Speedup: for a 90-year range (b = 90), this is a factor of about 252.

        For a single year, it's about 9; if I'm born on 1 Jan, allowing
        single-component errors mean we need to consider 1 Jan, but also all of
        Jan, and all other firsts of the month -- total 42 out of 365 days, or
        1/8.69 of the year.

        For a multi-year range, the speedup increases: if I'm born on 1 Jan
        1950 and we are considering 1900-1999, we'd need to consider 1950-01-01
        (1), ????-01-01 (100), 1950-01-?? (31), 1950-??-01 (12), minus the
        overlaps (3), giving 141 possibilities but out of about 36500, i.e.
        considering only 1/259 of the candidates.

        To find probabilities in terms of b, using Octave:

            pkg load symbolic
            syms b p_f_dob p_pnf_dob p_n_dob speedup_no_mismatch speedup_no_partial second_stage_speedup
            DAYS_PER_YEAR = 365.25
            DAYS_PER_MONTH = 30.4375
            MONTHS_PER_YEAR = 12

            p_f_dob = 1 / (DAYS_PER_YEAR * b)
            # = 4 / (1461⋅b)

            p_pnf_dob = (
                1 / DAYS_PER_YEAR
                + 1 / (DAYS_PER_MONTH * b)
                + 1 / (MONTHS_PER_YEAR * b)
                - 3 / (DAYS_PER_YEAR * b)
            )
            simplify(p_pnf_dob)
            # = (16⋅b + 631) / (5844⋅b)

            p_n_dob = 1 - p_f_dob - p_pnf_dob
            simplify(p_n_dob)

            p_full_or_partial_match = 1 - p_n_dob
            speedup_no_mismatch = 1 / p_full_or_partial_match
            simplify(speedup_no_mismatch)
            # = 5844⋅b / (16⋅b + 647)

            speedup_no_partial = 1 / p_f_dob
            simplify(speedup_no_partial)
            # = 1461⋅b / 4

            second_stage_speedup = speedup_no_partial / speedup_no_mismatch
            simplify(second_stage_speedup)
            # = b + 647 / 16

        """  # noqa: E501

        if verbose:
            log.debug(f"... MatchConfig built. Settings: {self}")
            # log.debug(
            #     f"p_dob_correct = {self.p_dob_correct}, "
            #     f"p_dob_single_component_error = "
            #     f"{self.p_dob_single_component_error}, "
            #     f"p_dob_major_error = {self.p_dob_major_error}"
            # )
            # log.debug(
            #     f"p_two_people_share_dob_ymd = "
            #     f"{self.p_two_people_share_dob_ymd}, "
            #     f"p_share_dob_md_not_ymd = {p_share_dob_md_not_ymd}, "
            #     f"p_share_dob_yd_not_ymd = {p_share_dob_yd_not_ymd}, "
            #     f"p_share_dob_ym_not_ymd = {p_share_dob_ym_not_ymd}, "
            #     f"p_two_people_have_partial_dob_match = "
            #     f"{self.p_two_people_partial_dob_match}, "
            #     f"p_two_people_no_dob_similarity = "
            #     f"{self.p_two_people_no_dob_similarity}"
            # )

    # -------------------------------------------------------------------------
    # String representation
    # -------------------------------------------------------------------------

    def __str__(self) -> str:
        """
        Returns a string representation generated by ``auto_repr``.
        """
        return auto_repr(self)

    # not __repr__(), or it clutters up all the other objects

    # -------------------------------------------------------------------------
    # Identifier frequency information
    # -------------------------------------------------------------------------

    def get_forename_freq_info(
        self, name: str, gender: str, prestandardized: bool = False
    ) -> BasicNameFreqInfo:
        """
        Returns the baseline frequency of a forename.

        For female or male genders, the gender-specific information is
        returned. For any other gender value, the result is the mean of the
        female and male information, weighted by ``self.p_female`` and
        ``self.p_male``.

        Args:
            name: the name to check
            gender: the gender to look up for
            prestandardized: was the name pre-standardized?
        """
        if not prestandardized:
            name = standardize_name(name)
        freq_func = self.forename_freq_info.name_frequency_info
        if gender in (GENDER_FEMALE, GENDER_MALE):
            return freq_func(name, gender, prestandardized=True)
        # Otherwise, take the mean across genders:
        return BasicNameFreqInfo.weighted_mean(
            objects=[
                freq_func(name, GENDER_FEMALE, prestandardized=True),
                freq_func(name, GENDER_MALE, prestandardized=True),
            ],
            weights=[self.p_female, self.p_male],
        )

    def get_surname_freq_info(
        self, name: str, prestandardized: bool = False
    ) -> BasicNameFreqInfo:
        """
        Returns the baseline frequency of a surname.

        Args:
            name: the name to check
            prestandardized: was it pre-standardized?
        """
        return self.surname_freq_info.name_frequency_info(
            name, prestandardized=prestandardized
        )

    def gender_freq(self, gender: str) -> Optional[float]:
        """
        Returns the population frequency of a gender.

        Args:
            gender: the gender to look up

        Returns:
            ``None`` if ``gender`` is empty/missing; the female or male
            frequency for those genders; otherwise, the probability of being
            neither male nor female.
        """
        if not gender:
            return None
        elif gender == GENDER_FEMALE:
            return self.p_female
        elif gender == GENDER_MALE:
            return self.p_male
        else:
            return self.p_not_male_or_female

    def is_valid_postcode(self, postcode_unit: str) -> bool:
        """
        Is this a valid postcode? (Delegates to the postcode frequency
        information object.)
        """
        return self.postcode_freq_info.debug_is_valid_postcode(postcode_unit)

    def postcode_unit_sector_freq(
        self, postcode_unit: str, prestandardized: bool = False
    ) -> Tuple[float, float]:
        """
        Returns the frequency for a full postcode, or postcode unit (the
        proportion of the population who live in that postcode), and the
        corresponding larger-scale postcode sector.

        The underlying function ensures that the sector frequency is at least
        as big as the unit frequency.

        Args:
            postcode_unit: the postcode unit to check
            prestandardized: was the postcode pre-standardized in format?
        """
        return self.postcode_freq_info.postcode_unit_sector_frequency(
            postcode_unit, prestandardized=prestandardized
        )

    def debug_postcode_unit_population(
        self, postcode_unit: str, prestandardized: bool = False
    ) -> float:
        """
        Returns the calculated population of a postcode unit.

        Args:
            postcode_unit: the postcode unit to check
            prestandardized: was the postcode pre-standardized in format?
        """
        return self.postcode_freq_info.debug_postcode_unit_population(
            postcode_unit, prestandardized=prestandardized
        )

    def debug_postcode_sector_population(
        self, postcode_sector: str, prestandardized: bool = False
    ) -> float:
        """
        Returns the calculated population of a postcode sector.

        Args:
            postcode_sector: the postcode sector to check
            prestandardized: was the postcode pre-standardized in format?
        """
        return self.postcode_freq_info.debug_postcode_sector_population(
            postcode_sector, prestandardized=prestandardized
        )

    # -------------------------------------------------------------------------
    # Comparisons
    # -------------------------------------------------------------------------

    def exceeds_primary_threshold(self, log_odds_match: float) -> bool:
        """
        Decides as to whether the log odds, representing P(H | D) from a
        comparison of two :class:`Person` objects, are sufficient for a match,
        based on our threshold. (The threshold itself counts as sufficient.)

        Args:
            log_odds_match: log odds that they're the same person

        Returns:
            bool: binary decision
        """
        return log_odds_match >= self.min_log_odds_for_match

    # -------------------------------------------------------------------------
    # Perfect ID handling
    # -------------------------------------------------------------------------

    def remap_perfect_id_key(self, key: str) -> str:
        """
        Translates a proband perfect ID key to its sample equivalent, via
        ``self.perfect_id_translation``. Keys with no translation are returned
        unchanged.

        Args:
            key:
                the perfect ID key to translate (NOTE(review): presumably
                already standardized, since the stored keys are; confirm
                against callers)
        """
        return self.perfect_id_translation.get(key, key)
840# =============================================================================
841# Dummy config that doesn't load frequency information
842# =============================================================================
def mk_dummy_match_config() -> MatchConfig:
    """
    Creates a dummy :class:`MatchConfig` with empty frequency information.

    Every forename/surname/postcode CSV and cache filename is passed as an
    empty string; all other settings take the constructor's defaults.
    """
    no_file = ""
    blank_sources = {
        "forename_cache_filename": no_file,
        "forename_sex_csv_filename": no_file,
        "surname_cache_filename": no_file,
        "surname_csv_filename": no_file,
        "postcode_cache_filename": no_file,
        "postcode_csv_filename": no_file,
    }
    return MatchConfig(**blank_sources)