Coverage for linkage/matchconfig.py: 86%

188 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1r""" 

2crate_anon/linkage/matchconfig.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Helper functions for linkage tools.** 

27 

28""" 

29 

30# ============================================================================= 

31# Imports 

32# ============================================================================= 

33 

34import logging 

35from typing import Any, Dict, NoReturn, Optional, Set, Tuple, Union 

36 

37from cardinal_pythonlib.hash import make_hasher 

38from cardinal_pythonlib.maths_py import round_sf, normal_round_int 

39from cardinal_pythonlib.probability import log_odds_from_1_in_n 

40from cardinal_pythonlib.reprfunc import auto_repr 

41 

42from crate_anon.linkage.constants import ( 

43 DAYS_PER_MONTH, 

44 DAYS_PER_YEAR, 

45 FuzzyDefaults, 

46 GENDER_FEMALE, 

47 GENDER_MALE, 

48 GENDER_MISSING, 

49 GENDER_OTHER, 

50 MONTHS_PER_YEAR, 

51 Switches, 

52 UK_POPULATION_2017, 

53 VALID_GENDERS, 

54) 

55from crate_anon.linkage.frequencies import ( 

56 BasicNameFreqInfo, 

57 NameFrequencyInfo, 

58 PostcodeFrequencyInfo, 

59) 

60from crate_anon.linkage.helpers import ( 

61 dict_from_str, 

62 safe_upper, 

63 standardize_name, 

64 standardize_perfect_id_key, 

65 standardize_perfect_id_value, 

66) 

67 

68log = logging.getLogger(__name__) 

69 

70 

71# ============================================================================= 

72# Main configuration class, supporting frequency-based probability calculations 

73# ============================================================================= 

74 

75 

76class MatchConfig: 

77 """ 

78 Master config class. It's more convenient to pass one of these round than 

79 lots of its components. 

80 

81 Default arguments are there for testing. 

82 """ 

83 

84 def __init__( 

85 self, 

86 hash_key: str = FuzzyDefaults.HASH_KEY, 

87 hash_method: str = FuzzyDefaults.HASH_METHOD, 

88 rounding_sf: Optional[int] = FuzzyDefaults.ROUNDING_SF, 

89 local_id_hash_key: str = None, 

90 population_size: int = FuzzyDefaults.POPULATION_SIZE, 

91 forename_sex_csv_filename: str = FuzzyDefaults.FORENAME_SEX_FREQ_CSV, 

92 forename_cache_filename: str = FuzzyDefaults.FORENAME_CACHE_FILENAME, 

93 forename_freq_info: Optional[NameFrequencyInfo] = None, 

94 forename_min_frequency: float = FuzzyDefaults.FORENAME_MIN_FREQ, 

95 surname_csv_filename: str = FuzzyDefaults.SURNAME_FREQ_CSV, 

96 surname_cache_filename: str = FuzzyDefaults.SURNAME_CACHE_FILENAME, 

97 surname_freq_info: Optional[NameFrequencyInfo] = None, 

98 surname_min_frequency: float = FuzzyDefaults.SURNAME_MIN_FREQ, 

99 accent_transliterations_csv: str = ( 

100 FuzzyDefaults.ACCENT_TRANSLITERATIONS_SLASH_CSV 

101 ), 

102 nonspecific_name_components_csv: str = ( 

103 FuzzyDefaults.NONSPECIFIC_NAME_COMPONENTS_CSV 

104 ), 

105 birth_year_pseudo_range: float = FuzzyDefaults.BIRTH_YEAR_PSEUDO_RANGE, 

106 p_not_male_or_female: float = FuzzyDefaults.P_NOT_MALE_OR_FEMALE, 

107 p_female_given_male_or_female: float = ( 

108 FuzzyDefaults.P_FEMALE_GIVEN_MALE_OR_FEMALE 

109 ), 

110 postcode_csv_filename: str = FuzzyDefaults.POSTCODES_CSV, 

111 postcode_cache_filename: str = FuzzyDefaults.POSTCODE_CACHE_FILENAME, 

112 postcode_freq_info: Optional[PostcodeFrequencyInfo] = None, 

113 k_postcode: Optional[float] = FuzzyDefaults.K_POSTCODE, 

114 p_unknown_or_pseudo_postcode: float = ( 

115 FuzzyDefaults.P_UNKNOWN_OR_PSEUDO_POSTCODE 

116 ), 

117 k_pseudopostcode: float = FuzzyDefaults.K_PSEUDOPOSTCODE, 

118 p_ep1_forename: str = FuzzyDefaults.P_EP1_FORENAME_CSV, 

119 p_ep2np1_forename: str = FuzzyDefaults.P_EP2NP1_FORENAME_CSV, 

120 p_u_forename: float = FuzzyDefaults.P_U_FORENAME, 

121 p_en_forename: str = FuzzyDefaults.P_EN_FORENAME_CSV, 

122 p_ep1_surname: str = FuzzyDefaults.P_EP1_SURNAME_CSV, 

123 p_ep2np1_surname: str = FuzzyDefaults.P_EP2NP1_SURNAME_CSV, 

124 p_en_surname: str = FuzzyDefaults.P_EN_SURNAME_CSV, 

125 p_ep_dob: float = FuzzyDefaults.P_EP_DOB, 

126 p_en_dob: float = FuzzyDefaults.P_EN_DOB, 

127 p_e_gender: float = FuzzyDefaults.P_E_GENDER, 

128 p_ep_postcode: float = FuzzyDefaults.P_EP_POSTCODE, 

129 p_en_postcode: float = FuzzyDefaults.P_EN_POSTCODE, 

130 min_log_odds_for_match: float = FuzzyDefaults.MIN_LOG_ODDS_FOR_MATCH, 

131 exceeds_next_best_log_odds: float = ( 

132 FuzzyDefaults.EXCEEDS_NEXT_BEST_LOG_ODDS 

133 ), 

134 perfect_id_translation: Union[ 

135 Dict[str, str], str 

136 ] = FuzzyDefaults.PERFECT_ID_TRANSLATION, 

137 extra_validation_output: bool = False, 

138 check_comparison_order: bool = FuzzyDefaults.CHECK_COMPARISON_ORDER, 

139 report_every: int = FuzzyDefaults.REPORT_EVERY, 

140 min_probands_for_parallel: int = ( 

141 FuzzyDefaults.MIN_PROBANDS_FOR_PARALLEL 

142 ), 

143 n_workers: int = FuzzyDefaults.N_PROCESSES, 

144 verbose: bool = False, 

145 ) -> None: 

146 """ 

147 Args: 

148 hash_key: 

149 Key (passphrase) for hasher. 

150 hash_method: 

151 Method to use for hashhing. 

152 rounding_sf: 

153 Number of significant figures to use when rounding frequency 

154 information in hashed copies. Use ``None`` for no rounding. 

155 local_id_hash_key: 

156 If specified, then for hash operations, the local_id values 

157 will also be hashed, using this key. 

158 

159 population_size: 

160 The size of the entire population (not our sample). See 

161 docstrings above. 

162 

163 forename_sex_csv_filename: 

164 Forename frequencies. CSV file, with no header, of "name, 

165 frequency" pairs. 

166 forename_cache_filename: 

167 File in which to cache forename information for faster loading. 

168 forename_freq_info: 

169 Debugging option: overrides forename_sex_csv_filename by 

170 providing a NameFrequencyInfo object directly. 

171 forename_min_frequency: 

172 Minimum frequency for forenames. 

173 

174 surname_csv_filename: 

175 Surname frequencies. CSV file, with no header, of "name, 

176 frequency" pairs. 

177 surname_cache_filename: 

178 File in which to cache forename information for faster loading. 

179 surname_freq_info: 

180 Debugging option: overrides surname_csv_filename by 

181 providing a NameFrequencyInfo object directly. 

182 surname_min_frequency: 

183 Minimum frequency for surnames. 

184 accent_transliterations_csv: 

185 Accent transliteration map. String of the form "Ä/AE,Ö/OE" -- 

186 comma-separated pairs, with slashed separating each pair. 

187 nonspecific_name_components_csv: 

188 CSV-separated list of nonspecific name components (e.g. 

189 nobiliary particles), which will be avoided as equivalent name 

190 fragments. 

191 

192 birth_year_pseudo_range: 

193 b, such that P(two people share a DOB) = 1/(365.25 * b). 

194 

195 p_not_male_or_female: 

196 Probability that a person in the population has gender 'X'. 

197 p_female_given_male_or_female: 

198 Probability that a person in the population is female, given 

199 that they are either male or female. 

200 

201 postcode_csv_filename: 

202 Postcode mapping. CSV (or ZIP) file. Special format; see 

203 :class:`PostcodeFrequencyInfo`. 

204 postcode_cache_filename: 

205 File in which to cache postcode information for faster loading. 

206 postcode_freq_info: 

207 Debugging option: overrides postcode_csv_filename by 

208 providing a PostcodeFrequencyInfo object directly. 

209 k_postcode: 

210 Multiple applied to postcode unit/sector frequencies, such that 

211 p_f_postcode = k_postcode * f_f_postcode and p_p_postcode = 

212 k_postcode * f_p_postcode. If None, defaults to 

213 UK_POPULATION_2017 / population_size, appropriate if the 

214 population under consideration is geographically constrained 

215 (rather than sampled from across the UK). 

216 p_unknown_or_pseudo_postcode: 

217 Probability that a random person will have a pseudo-postcode, 

218 e.g. ZZ99 3VZ (no fixed abode) or a postcode not known to our 

219 database. Specifically, P(each pseudopostcode or unknown 

220 postcode unit | ¬H). 

221 k_pseudopostcode: 

222 Probability multiple: P(pseudopostcode sector or unknown 

223 postcode sector match | ¬H) = k_pseudopostcode * 

224 p_unknown_or_pseudo_postcode. Must strictly be >=1 and we 

225 enforce >1; see paper. 

226 

227 p_ep1_forename: 

228 Error probability that a forename fails a full match but passes 

229 a partial 1 (metaphone) match. [GPD] 

230 p_ep2np1_forename: 

231 Error probability that a forename fails a full match and a 

232 partial 1 match but passes a partial 2 (F2C) match. [GPD] 

233 p_en_forename: 

234 Error probability that a forename yields no match at all. [GPD] 

235 p_ep1_surname: 

236 Error probability that a surname fails a full match but passes 

237 a partial 1 (metaphone) match. [GPD] 

238 p_ep2np1_surname: 

239 Error probability that a surname fails a full match and a 

240 partial 1 match but passes a partial 2 (F2C) match. [GPD] 

241 p_en_surname: 

242 Error probability that a surname yields no match at all. [GPD] 

243 p_ep_dob: 

244 Error probability that a DOB fails a full (YMD) match but 

245 passes a partial (YM, MD, or YD) match. 

246 p_en_dob: 

247 Error probability that a DOB produces no match at all. 

248 p_e_gender: 

249 Error probability of no gender match. 

250 p_ep_postcode: 

251 Probability that a postcode fails a full (unit) match but 

252 passes a partial (sector) match (due to error or a move within 

253 a sector). 

254 p_en_postcode: 

255 Probability that a postcode gives no match at all. 

256 min_log_odds_for_match: 

257 minimum log odds of a match, to consider two people a match 

258 exceeds_next_best_log_odds: 

259 In a multi-person comparison, the log odds of the best match 

260 must exceed those of the next-best match by this much for the 

261 best to be considered a unique winner. 

262 perfect_id_translation: 

263 Option dictionary mapping the perfect ID names in the proband 

264 to the equivalents in the sample, e.g. {"nhsnum": "nhsnumber"}. 

265 

266 extra_validation_output: 

267 Add extra columns to the output for validation purposes? 

268 check_comparison_order: 

269 Check that comparisons follow the general rule "no match ≤ 

270 partial(s) ≤ full" and warn if not. 

271 report_every: 

272 Report progress every n probands. 

273 min_probands_for_parallel: 

274 Minimum number of probands for which we will bother to use 

275 parallel processing. 

276 n_workers: 

277 Number of parallel processes to use, if parallel processing 

278 is used. 

279 verbose: 

280 Be verbose on creation? 

281 

282 - [GPD] In ``{gender:p, ...}`` dict-as-string format. 

283 

284 - F2C = First two characters. 

285 """ 

286 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

287 # Input validation 

288 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

289 

290 def raise_bad(x_: Any, name_: str) -> NoReturn: 

291 """ 

292 Raise an informative ValueError. 

293 """ 

294 raise ValueError(f"Bad {name_}: {x_!r}") 

295 

296 def check_prob( 

297 p_: float, name_: str, not_certain: bool = False 

298 ) -> float: 

299 """ 

300 Ensure that something is a probability, and return it. 

301 """ 

302 if not_certain: 

303 if not 0 < p_ < 1: 

304 raise_bad(p_, name_ + " [must be in range (0, 1)]") 

305 else: 

306 if not 0 <= p_ <= 1: 

307 raise_bad(p_, name_) 

308 return p_ 

309 

310 def mk_gender_p_dict(csv_: str, name_: str) -> Dict[str, float]: 

311 """ 

312 Transform a comma-separated list of ``gender:p`` values into 

313 a corresponding dictionary, and fill in the blanks. 

314 """ 

315 d = {} # type: Dict[str, float] 

316 for gender_p_str in csv_.split(","): 

317 g_p_components = gender_p_str.split(":") 

318 if len(g_p_components) != 2: 

319 raise ValueError(f"Bad {name_}: {csv_!r}") 

320 g = g_p_components[0].strip() 

321 try: 

322 p = check_prob(float(g_p_components[1].strip()), name_) 

323 except (ValueError, TypeError): 

324 raise ValueError(f"Bad probability in {name_}: {csv_!r}") 

325 d[g] = p 

326 if GENDER_FEMALE not in d: 

327 raise ValueError( 

328 f"Gender {GENDER_FEMALE} not specified in {name_}" 

329 ) 

330 if GENDER_MALE not in d: 

331 raise ValueError( 

332 f"Gender {GENDER_MALE} not specified in {name_}" 

333 ) 

334 weighted_mean_m_f = ( 

335 self.p_female_given_m_or_f * d[GENDER_FEMALE] 

336 + self.p_male_given_m_or_f * d[GENDER_MALE] 

337 ) 

338 d.setdefault(GENDER_OTHER, weighted_mean_m_f) 

339 d.setdefault(GENDER_MISSING, weighted_mean_m_f) 

340 if set(d.keys()) != set(VALID_GENDERS): 

341 raise ValueError( 

342 f"Missing or bad genders in {name_}: {csv_!r} -- genders " 

343 f"should be {VALID_GENDERS}" 

344 ) 

345 return d 

346 

347 def mk_p_c_dict( 

348 p_ep1_: Dict[str, float], 

349 p_ep2np1_: Dict[str, float], 

350 p_en_: Dict[str, float], 

351 ) -> Dict[str, float]: 

352 """ 

353 Calculates p_c = 1 - p_ep1 - p_ep2np1 = p_en. 

354 """ 

355 d = {} # type: Dict[str, float] 

356 for g in VALID_GENDERS: 

357 p_c_ = 1 - p_ep1_[g] - p_ep2np1_[g] - p_en_[g] 

358 assert 0 <= p_c_ <= 1 

359 d[g] = p_c_ 

360 return d 

361 

362 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

363 # Basic creation 

364 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

365 

366 if verbose: 

367 log.debug("Building MatchConfig...") 

368 

369 # Hash information 

370 

371 self.hash_fn = make_hasher(hash_method=hash_method, key=hash_key).hash 

372 if not (rounding_sf is None or 1 <= rounding_sf): 

373 raise_bad(rounding_sf, Switches.ROUNDING_SF) 

374 self.rounding_sf = rounding_sf 

375 if local_id_hash_key: 

376 self.local_id_hash_fn = make_hasher( 

377 hash_method=hash_method, key=local_id_hash_key 

378 ).hash 

379 else: 

380 # Convert to string if necessary; otherwise, an identity function: 

381 self.local_id_hash_fn = str 

382 

383 # Overall population 

384 

385 if not (population_size > 0): 

386 raise_bad(population_size, Switches.POPULATION_SIZE) 

387 self.population_size = population_size 

388 # Precalculate this, for access speed: 

389 self.baseline_log_odds_same_person = log_odds_from_1_in_n( 

390 self.population_size 

391 ) 

392 

393 # Name handling: generic 

394 

395 accent_dict = {} # type: Dict[str, str] 

396 for accent_pair in accent_transliterations_csv.split(","): 

397 accent_components = accent_pair.split("/") 

398 if len(accent_components) != 2: 

399 raise ValueError( 

400 f"Bad accent_transliterations_csv: " 

401 f"{accent_transliterations_csv!r}" 

402 ) 

403 accented = safe_upper(accent_components[0].strip()) 

404 plain = safe_upper(accent_components[1].strip()) 

405 if len(accented) != 1: 

406 raise ValueError( 

407 f"Bad accent_transliterations_csv: " 

408 f"{accent_transliterations_csv!r} -- contains accented " 

409 f"character {accented!r}, which should be of length 1" 

410 ) 

411 accent_dict[accented] = plain 

412 self.accent_transliterations = str.maketrans(accent_dict) 

413 self.nonspecific_name_components = set() # type: Set[str] 

414 for nonspec in nonspecific_name_components_csv.split(","): 

415 self.nonspecific_name_components.add(nonspec.strip().upper()) 

416 

417 # Name handling: forenames 

418 

419 self.forename_freq_info = forename_freq_info or NameFrequencyInfo( 

420 csv_filename=forename_sex_csv_filename, 

421 cache_filename=forename_cache_filename, 

422 min_frequency=check_prob( 

423 forename_min_frequency, Switches.FORENAME_MIN_FREQUENCY 

424 ), 

425 by_gender=True, 

426 ) 

427 if not isinstance(self.forename_freq_info, NameFrequencyInfo): 

428 raise ValueError("Bad forename_freq_info") 

429 

430 # Name handling: surnames 

431 

432 self.surname_freq_info = surname_freq_info or NameFrequencyInfo( 

433 csv_filename=surname_csv_filename, 

434 cache_filename=surname_cache_filename, 

435 min_frequency=check_prob( 

436 surname_min_frequency, Switches.SURNAME_MIN_FREQUENCY 

437 ), 

438 by_gender=False, 

439 ) 

440 if not isinstance(self.surname_freq_info, NameFrequencyInfo): 

441 raise ValueError("Bad surname_freq_info") 

442 

443 # Population frequencies: DOB 

444 

445 self.birth_year_pseudo_range = birth_year_pseudo_range 

446 if not (birth_year_pseudo_range >= 1): 

447 raise_bad( 

448 birth_year_pseudo_range, Switches.BIRTH_YEAR_PSEUDO_RANGE 

449 ) 

450 

451 # Population frequencies: sex/gender 

452 

453 # ... Check this before using mk_gender_p_dict: 

454 self.p_female_given_m_or_f = check_prob( 

455 p_female_given_male_or_female, 

456 Switches.P_FEMALE_GIVEN_MALE_OR_FEMALE, 

457 ) 

458 self.p_male_given_m_or_f = 1 - self.p_female_given_m_or_f 

459 self.p_not_male_or_female = check_prob( 

460 p_not_male_or_female, Switches.P_NOT_MALE_OR_FEMALE 

461 ) 

462 p_male_or_female = 1 - p_not_male_or_female 

463 self.p_female = p_female_given_male_or_female * p_male_or_female 

464 self.p_male = p_male_or_female - self.p_female 

465 

466 # Population frequencies: postcode 

467 

468 self.postcode_freq_info = postcode_freq_info or PostcodeFrequencyInfo( 

469 csv_filename=postcode_csv_filename, 

470 cache_filename=postcode_cache_filename, 

471 ) 

472 if not isinstance(self.postcode_freq_info, PostcodeFrequencyInfo): 

473 raise ValueError("Bad postcode_freq_info") 

474 self.p_unknown_or_pseudo_postcode_unit = check_prob( 

475 p_unknown_or_pseudo_postcode, 

476 Switches.P_UNKNOWN_OR_PSEUDO_POSTCODE, 

477 not_certain=True, 

478 ) 

479 if k_pseudopostcode <= 1: 

480 raise ValueError(f"Bad {Switches.K_PSEUDOPOSTCODE}: must be >1") 

481 self.k_pseudopostcode = k_pseudopostcode 

482 self.p_unknown_or_pseudo_postcode_sector = check_prob( 

483 k_pseudopostcode * p_unknown_or_pseudo_postcode, 

484 f"P(unknown postcode or pseudopostcode sector | ¬H) = " 

485 f"{Switches.K_PSEUDOPOSTCODE} * " 

486 f"{Switches.P_UNKNOWN_OR_PSEUDO_POSTCODE}", 

487 not_certain=True, 

488 ) 

489 self.k_postcode = ( 

490 UK_POPULATION_2017 / self.population_size 

491 if k_postcode is None 

492 else k_postcode 

493 ) 

494 self.p_known_postcode = 1 - self.p_unknown_or_pseudo_postcode_sector 

495 

496 # Error probabilities: forenames 

497 

498 self.p_ep1_forename = mk_gender_p_dict( 

499 p_ep1_forename, Switches.P_EP1_FORENAME 

500 ) 

501 self.p_ep2np1_forename = mk_gender_p_dict( 

502 p_ep2np1_forename, Switches.P_EP2NP1_FORENAME 

503 ) 

504 self.p_en_forename = mk_gender_p_dict( 

505 p_en_forename, Switches.P_EN_FORENAME 

506 ) 

507 self.p_c_forename = mk_p_c_dict( 

508 p_ep1_=self.p_ep1_forename, 

509 p_ep2np1_=self.p_ep2np1_forename, 

510 p_en_=self.p_en_forename, 

511 ) 

512 self.p_u_forename = check_prob(p_u_forename, Switches.P_U_FORENAME) 

513 

514 # Error probabilities: surnames 

515 

516 self.p_ep1_surname = mk_gender_p_dict( 

517 p_ep1_surname, Switches.P_EP1_SURNAME 

518 ) 

519 self.p_ep2np1_surname = mk_gender_p_dict( 

520 p_ep2np1_surname, Switches.P_EP2NP1_SURNAME 

521 ) 

522 self.p_en_surname = mk_gender_p_dict( 

523 p_en_surname, Switches.P_EN_SURNAME 

524 ) 

525 self.p_c_surname = mk_p_c_dict( 

526 p_ep1_=self.p_ep1_surname, 

527 p_ep2np1_=self.p_ep2np1_surname, 

528 p_en_=self.p_en_surname, 

529 ) 

530 

531 # Error probabilities: DOB 

532 

533 self.p_ep_dob = check_prob(p_ep_dob, Switches.P_EP_DOB) 

534 self.p_en_dob = check_prob(p_en_dob, Switches.P_EN_DOB) 

535 

536 # Error probabilities: gender 

537 

538 self.p_e_gender_error = check_prob( 

539 p_e_gender, 

540 Switches.P_E_GENDER, 

541 ) 

542 

543 # Error probabilities: postcode 

544 

545 self.p_ep_postcode = check_prob(p_ep_postcode, Switches.P_EP_POSTCODE) 

546 self.p_en_postcode = check_prob(p_en_postcode, Switches.P_EN_POSTCODE) 

547 

548 # Matching rules 

549 

550 self.min_log_odds_for_match = min_log_odds_for_match 

551 self.exceeds_next_best_log_odds = exceeds_next_best_log_odds 

552 if perfect_id_translation is None: 

553 perfect_id_xlate_raw = {} 

554 elif isinstance(perfect_id_translation, dict): 

555 perfect_id_xlate_raw = perfect_id_translation 

556 elif isinstance(perfect_id_translation, str): 

557 perfect_id_xlate_raw = dict_from_str(perfect_id_translation) 

558 else: 

559 raise ValueError( 

560 f"Bad perfect_id_translation: {perfect_id_translation!r}" 

561 ) 

562 self.perfect_id_translation = { 

563 standardize_perfect_id_key(k): standardize_perfect_id_value(v) 

564 for k, v in perfect_id_xlate_raw.values() 

565 } 

566 if self.perfect_id_translation: 

567 log.info( 

568 f"Using proband-to-sample perfect ID translation: " 

569 f"{self.perfect_id_translation}" 

570 ) 

571 

572 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

573 # Some derived frequencies 

574 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

575 

576 # DOB: 

577 

578 self.p_c_dob = 1 - self.p_ep_dob - self.p_en_dob 

579 assert 0 <= self.p_c_dob <= 1 

580 # These ignore the specialness of 29 February: 

581 self.p_f_dob = 1 / (DAYS_PER_YEAR * birth_year_pseudo_range) 

582 p_share_dob_md_not_ymd = (1 / DAYS_PER_YEAR) - self.p_f_dob 

583 p_share_dob_yd_not_ymd = ( 

584 1 / (DAYS_PER_MONTH * birth_year_pseudo_range) 

585 ) - self.p_f_dob 

586 p_share_dob_ym_not_ymd = ( 

587 1 / (MONTHS_PER_YEAR * birth_year_pseudo_range) 

588 ) - self.p_f_dob 

589 # These three are mutually exclusive possibilities (e.g. you can't 

590 # share YM and MD without sharing YMD), so we can just sum: 

591 self.p_pnf_dob = ( 

592 p_share_dob_md_not_ymd 

593 + p_share_dob_yd_not_ymd 

594 + p_share_dob_ym_not_ymd 

595 ) 

596 self.p_n_dob = 1 - self.p_f_dob - self.p_pnf_dob 

597 assert 0 <= self.p_f_dob <= 1 

598 assert 0 <= p_share_dob_md_not_ymd <= 1 

599 assert 0 <= p_share_dob_yd_not_ymd <= 1 

600 assert 0 <= p_share_dob_ym_not_ymd <= 1 

601 assert 0 <= self.p_pnf_dob <= 1 

602 assert 0 <= self.p_n_dob <= 1 

603 

604 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

605 # Technical 

606 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

607 

608 self.extra_validation_output = extra_validation_output 

609 self.check_comparison_order = check_comparison_order 

610 self.report_every = report_every 

611 self.min_probands_for_parallel = min_probands_for_parallel 

612 self.n_workers = n_workers 

613 

614 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

615 # Reporting 

616 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

617 

618 self.partial_dob_mismatch_allowed = self.p_c_dob < 1 

619 self.complete_dob_mismatch_allowed = self.p_en_dob > 0 

620 if self.complete_dob_mismatch_allowed: 

621 potential_speedup_factor = round_sf( 

622 normal_round_int(1 / (1 - self.p_n_dob)), 

623 n=3, 

624 ) 

625 log.warning( 

626 f"You are allowing a person's DOB to be completely different, " 

627 f"with p = {self.p_en_dob}. That is valid but much less " 

628 f"efficient computationally (by an estimated factor of about " 

629 f"{potential_speedup_factor})." 

630 ) 

631 _ = """ 

632 Speedup: for a 90-year range (b = 90), this is a factor of about 252. 

633 

634 For a single year, it's about 9; if I'm born on 1 Jan, allowing 

635 single-component errors mean we need to consider 1 Jan, but also all of 

636 Jan, and all other firsts of the month -- total 42 out of 365 days, or 

637 1/8.69 of the year. 

638 

639 For a multi-year range, the speedup increases: if I'm born on 1 Jan 

640 1950 and we are considering 1900-1999, we'd need to consider 1950-01-01 

641 (1), ????-01-01 (100), 1950-01-?? (31), 1950-??-01 (12), minus the 

642 overlaps (3), giving 141 possibilities but out of about 36500, i.e. 

643 considering only 1/259 of the candidates. 

644 

645 To find probabilities in terms of b, using Octave: 

646 

647 pkg load symbolic 

648 syms b p_f_dob p_pnf_dob p_n_dob speedup_no_mismatch speedup_no_partial second_stage_speedup 

649 DAYS_PER_YEAR = 365.25 

650 DAYS_PER_MONTH = 30.4375 

651 MONTHS_PER_YEAR = 12 

652 

653 p_f_dob = 1 / (DAYS_PER_YEAR * b) 

654 # = 4 / (1461⋅b) 

655 

656 p_pnf_dob = ( 

657 1 / DAYS_PER_YEAR 

658 + 1 / (DAYS_PER_MONTH * b) 

659 + 1 / (MONTHS_PER_YEAR * b) 

660 - 3 / (DAYS_PER_YEAR * b) 

661 ) 

662 simplify(p_pnf_dob) 

663 # = (16⋅b + 631) / (5844⋅b) 

664 

665 p_n_dob = 1 - p_f_dob - p_pnf_dob 

666 simplify(p_n_dob) 

667 

668 p_full_or_partial_match = 1 - p_n_dob 

669 speedup_no_mismatch = 1 / p_full_or_partial_match 

670 simplify(speedup_no_mismatch) 

671 # = 5844⋅b / (16⋅b + 647) 

672 

673 speedup_no_partial = 1 / p_f_dob 

674 simplify(speedup_no_partial) 

675 # = 1461⋅b / 4 

676 

677 second_stage_speedup = speedup_no_partial / speedup_no_mismatch 

678 simplify(second_stage_speedup) 

679 # = b + 647 / 16 

680 

681 """ # noqa: E501 

682 

683 if verbose: 

684 log.debug(f"... MatchConfig built. Settings: {self}") 

685 # log.debug( 

686 # f"p_dob_correct = {self.p_dob_correct}, " 

687 # f"p_dob_single_component_error = " 

688 # f"{self.p_dob_single_component_error}, " 

689 # f"p_dob_major_error = {self.p_dob_major_error}" 

690 # ) 

691 # log.debug( 

692 # f"p_two_people_share_dob_ymd = " 

693 # f"{self.p_two_people_share_dob_ymd}, " 

694 # f"p_share_dob_md_not_ymd = {p_share_dob_md_not_ymd}, " 

695 # f"p_share_dob_yd_not_ymd = {p_share_dob_yd_not_ymd}, " 

696 # f"p_share_dob_ym_not_ymd = {p_share_dob_ym_not_ymd}, " 

697 # f"p_two_people_have_partial_dob_match = " 

698 # f"{self.p_two_people_partial_dob_match}, " 

699 # f"p_two_people_no_dob_similarity = " 

700 # f"{self.p_two_people_no_dob_similarity}" 

701 # ) 

702 

703 # ------------------------------------------------------------------------- 

704 # String representation 

705 # ------------------------------------------------------------------------- 

706 

707 def __str__(self) -> str: 

708 return auto_repr(self) 

709 

710 # not __repr__(), or it clutters up all the other objects 

711 

712 # ------------------------------------------------------------------------- 

713 # Identifier frequency information 

714 # ------------------------------------------------------------------------- 

715 

716 def get_forename_freq_info( 

717 self, name: str, gender: str, prestandardized: bool = False 

718 ) -> BasicNameFreqInfo: 

719 """ 

720 Returns the baseline frequency of a forename. 

721 

722 Args: 

723 name: the name to check 

724 gender: the gender to look up for 

725 prestandardized: was the name pre-standardized? 

726 """ 

727 if not prestandardized: 

728 name = standardize_name(name) 

729 freq_func = self.forename_freq_info.name_frequency_info 

730 if gender in (GENDER_FEMALE, GENDER_MALE): 

731 return freq_func(name, gender, prestandardized=True) 

732 # Otherwise, take the mean across genders: 

733 return BasicNameFreqInfo.weighted_mean( 

734 objects=[ 

735 freq_func(name, GENDER_FEMALE, prestandardized=True), 

736 freq_func(name, GENDER_MALE, prestandardized=True), 

737 ], 

738 weights=[self.p_female, self.p_male], 

739 ) 

740 

741 def get_surname_freq_info( 

742 self, name: str, prestandardized: bool = False 

743 ) -> BasicNameFreqInfo: 

744 """ 

745 Returns the baseline frequency of a surname. 

746 

747 Args: 

748 name: the name to check 

749 prestandardized: was it pre-standardized? 

750 """ 

751 return self.surname_freq_info.name_frequency_info( 

752 name, prestandardized=prestandardized 

753 ) 

754 

755 def gender_freq(self, gender: str) -> Optional[float]: 

756 if not gender: 

757 return None 

758 elif gender == GENDER_FEMALE: 

759 return self.p_female 

760 elif gender == GENDER_MALE: 

761 return self.p_male 

762 else: 

763 return self.p_not_male_or_female 

764 

765 def is_valid_postcode(self, postcode_unit: str) -> bool: 

766 """ 

767 Is this a valid postcode? 

768 """ 

769 return self.postcode_freq_info.debug_is_valid_postcode(postcode_unit) 

770 

771 def postcode_unit_sector_freq( 

772 self, postcode_unit: str, prestandardized: bool = False 

773 ) -> Tuple[float, float]: 

774 """ 

775 Returns the frequency for a full postcode, or postcode unit (the 

776 proportion of the population who live in that postcode), and the 

777 corresponding larger-scale postcode sector. 

778 

779 The underlying function ensures that the sector frequency is as least 

780 as big as the unit frequency. 

781 """ 

782 return self.postcode_freq_info.postcode_unit_sector_frequency( 

783 postcode_unit, prestandardized=prestandardized 

784 ) 

785 

786 def debug_postcode_unit_population( 

787 self, postcode_unit: str, prestandardized: bool = False 

788 ) -> float: 

789 """ 

790 Returns the calculated population of a postcode unit. 

791 

792 Args: 

793 postcode_unit: the postcode unit to check 

794 prestandardized: was the postcode pre-standardized in format? 

795 """ 

796 return self.postcode_freq_info.debug_postcode_unit_population( 

797 postcode_unit, prestandardized=prestandardized 

798 ) 

799 

800 def debug_postcode_sector_population( 

801 self, postcode_sector: str, prestandardized: bool = False 

802 ) -> float: 

803 """ 

804 Returns the calculated population of a postcode sector. 

805 

806 Args: 

807 postcode_sector: the postcode sector to check 

808 prestandardized: was the postcode pre-standardized in format? 

809 """ 

810 return self.postcode_freq_info.debug_postcode_sector_population( 

811 postcode_sector, prestandardized=prestandardized 

812 ) 

813 

814 # ------------------------------------------------------------------------- 

815 # Comparisons 

816 # ------------------------------------------------------------------------- 

817 

818 def exceeds_primary_threshold(self, log_odds_match: float) -> bool: 

819 """ 

820 Decides as to whether the log odds, representing P(H | D) from a 

821 comparison of two :class:`Person` objects, are sufficient for a match, 

822 based on our threshold. 

823 

824 Args: 

825 log_odds_match: log odds that they're the same person 

826 

827 Returns: 

828 bool: binary decision 

829 """ 

830 return log_odds_match >= self.min_log_odds_for_match 

831 

832 # ------------------------------------------------------------------------- 

833 # Perfect ID handling 

834 # ------------------------------------------------------------------------- 

835 

836 def remap_perfect_id_key(self, key: str) -> str: 

837 return self.perfect_id_translation.get(key, key) 

838 

839 

840# ============================================================================= 

841# Dummy config that doesn't load frequency information 

842# ============================================================================= 

843 

844 

def mk_dummy_match_config() -> MatchConfig:
    """
    Builds a dummy :class:`MatchConfig` in which every frequency CSV filename
    and cache filename is blank; all other settings take their defaults.
    """
    blank_filenames = {
        "forename_cache_filename": "",
        "forename_sex_csv_filename": "",
        "surname_cache_filename": "",
        "surname_csv_filename": "",
        "postcode_cache_filename": "",
        "postcode_csv_filename": "",
    }
    return MatchConfig(**blank_filenames)