Coverage for linkage/constants.py: 97%
142 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1r"""
2crate_anon/linkage/constants.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Constants for linkage tools.**
28"""
30# =============================================================================
31# Imports
32# =============================================================================
34import math
35from multiprocessing import cpu_count
36import os
37import platform
38from typing import Dict
40import appdirs
41from cardinal_pythonlib.hash import HashMethods
42from cardinal_pythonlib.probability import probability_from_log_odds
44from crate_anon.common.constants import EnvVar
47# =============================================================================
48# Helper functions
49# =============================================================================
def _mk_dictstr(x: Dict[str, float]) -> str:
    """
    Render a dictionary as a single string of comma-separated ``key:value``
    pairs, in the dictionary's own iteration order.

    For example, ``{"F": 0.5, "M": 0.25}`` becomes ``"F:0.5,M:0.25"``; an
    empty dictionary becomes the empty string.
    """
    pairs = []
    for key, value in x.items():
        pairs.append(f"{key}:{value}")
    return ",".join(pairs)
56# =============================================================================
57# Constants
58# =============================================================================
# CHECK_BASIC_ASSERTIONS_IN_HIGH_SPEED_FUNCTIONS = False  # for debugging only

# Numeric and type sentinels.
INFINITY = math.inf
MINUS_INFINITY = -INFINITY
NONE_TYPE = type(None)

# Calendar conversions. These are averages, not exact values.
DAYS_PER_YEAR = 365.25  # approximately!
MONTHS_PER_YEAR = 12
DAYS_PER_MONTH = DAYS_PER_YEAR / MONTHS_PER_YEAR  # on average

# Absolute path of the directory containing this file.
THIS_DIR = os.path.abspath(os.path.dirname(__file__))

# Population figures.
UK_MEAN_OA_POPULATION_2011 = 309  # not used any more! Left here for interest.
# ... https://www.ons.gov.uk/methodology/geography/ukgeographies/censusgeography  # noqa: E501
UK_POPULATION_2017 = 66_040_000  # 2017 figure, 66.04m
CAMBS_PBORO_POPULATION_2018 = 852_523

# Gender codes: three standard codes, plus the empty string for "missing".
GENDER_MALE = "M"
GENDER_FEMALE = "F"
GENDER_OTHER = "X"
GENDER_MISSING = ""
VALID_GENDERS = [GENDER_MISSING, GENDER_MALE, GENDER_FEMALE, GENDER_OTHER]


# Translation table (for str.translate) that maps whitespace control
# characters to a plain space and "typographic" punctuation to its plain
# ASCII equivalent.
SIMPLIFY_PUNCTUATION_WHITESPACE_TRANS = str.maketrans(
    {
        # tab, linefeed, carriage return -> space
        **dict.fromkeys("\t\n\r", " "),
        # curly left/right double quotes -> straight double quote
        **dict.fromkeys("“”", '"'),
        # curly left/right single quotes -> straight single quote
        **dict.fromkeys("‘’", "'"),
        # en dash, em dash, minus -> hyphen
        **dict.fromkeys("–—−", "-"),
    }
)


# The capital Eszett only dates from 2017; before then, "SS" served as the
# capital form. See https://en.wikipedia.org/wiki/%C3%9F.
ESZETT_LOWER_CASE = "ß"
ESZETT_UPPER_CASE = "ẞ"
# Maps the lower-case Eszett to the (single-character) capital Eszett.
SAFE_UPPER_PRETRANSLATE = str.maketrans({ESZETT_LOWER_CASE: ESZETT_UPPER_CASE})
# Maps either case of Eszett to the two-letter "ss"/"SS" spelling.
MANGLE_PRETRANSLATE = str.maketrans(
    {
        ESZETT_LOWER_CASE: "ss",
        ESZETT_UPPER_CASE: "SS",
    }
)
class Switches:
    """
    Names shared between the argparse command-line options and the
    corresponding MatchConfig parameters. They are defined once here because
    they are used in several places, including error messages.

    Every value is simply the lower-case form of its attribute name.
    """

    # Input/output
    INPUT = "input"
    OUTPUT = "output"
    INCLUDE_OTHER_INFO = "include_other_info"

    # Run-time options
    EXTRA_VALIDATION_OUTPUT = "extra_validation_output"
    CHECK_COMPARISON_ORDER = "check_comparison_order"
    REPORT_EVERY = "report_every"
    MIN_PROBANDS_FOR_PARALLEL = "min_probands_for_parallel"
    N_WORKERS = "n_workers"

    # Hashing, rounding
    KEY = "key"
    ALLOW_DEFAULT_HASH_KEY = "allow_default_hash_key"
    HASH_METHOD = "hash_method"
    ROUNDING_SF = "rounding_sf"
    LOCAL_ID_HASH_KEY = "local_id_hash_key"

    # Population
    POPULATION_SIZE = "population_size"

    # Names
    ACCENT_TRANSLITERATIONS = "accent_transliterations"
    FORENAME_CACHE_FILENAME = "forename_cache_filename"
    FORENAME_SEX_FREQ_CSV = "forename_sex_freq_csv"
    FORENAME_MIN_FREQUENCY = "forename_min_frequency"
    NONSPECIFIC_NAME_COMPONENTS = "nonspecific_name_components"
    SURNAME_CACHE_FILENAME = "surname_cache_filename"
    SURNAME_FREQ_CSV = "surname_freq_csv"
    SURNAME_MIN_FREQUENCY = "surname_min_frequency"

    # Date of birth
    BIRTH_YEAR_PSEUDO_RANGE = "birth_year_pseudo_range"

    # Gender
    P_NOT_MALE_OR_FEMALE = "p_not_male_or_female"
    P_FEMALE_GIVEN_MALE_OR_FEMALE = "p_female_given_male_or_female"

    # Postcodes
    POSTCODE_CACHE_FILENAME = "postcode_cache_filename"
    POSTCODE_CSV_FILENAME = "postcode_csv_filename"
    P_UNKNOWN_OR_PSEUDO_POSTCODE = "p_unknown_or_pseudo_postcode"

    # Forename parameters
    P_EP1_FORENAME = "p_ep1_forename"
    P_EP2NP1_FORENAME = "p_ep2np1_forename"
    P_EN_FORENAME = "p_en_forename"
    P_U_FORENAME = "p_u_forename"

    # Surname parameters
    P_EP1_SURNAME = "p_ep1_surname"
    P_EP2NP1_SURNAME = "p_ep2np1_surname"
    P_EN_SURNAME = "p_en_surname"

    # Date-of-birth parameters
    P_EP_DOB = "p_ep_dob"
    P_EN_DOB = "p_en_dob"

    # Gender parameter
    P_E_GENDER = "p_e_gender"

    # Postcode parameters
    P_EP_POSTCODE = "p_ep_postcode"
    P_EN_POSTCODE = "p_en_postcode"
    K_POSTCODE = "k_postcode"
    K_PSEUDOPOSTCODE = "k_pseudopostcode"

    # Matching process
    MIN_LOG_ODDS_FOR_MATCH = "min_log_odds_for_match"
    EXCEEDS_NEXT_BEST_LOG_ODDS = "exceeds_next_best_log_odds"
    PERFECT_ID_TRANSLATION = "perfect_id_translation"
class FuzzyDefaults:
    """
    Configuration defaults for the linkage tools: file locations, hashing and
    rounding settings, run-time options, population priors, error rates,
    matching thresholds and name-handling tables, followed by some values
    derived from those.
    """

    # -------------------------------------------------------------------------
    # Filenames
    # -------------------------------------------------------------------------
    _appname = "crate"

    # Public data of which we provide a local copy, in a "data" directory
    # alongside this file. If the environment variable named by
    # EnvVar.GENERATING_CRATE_DOCS is present, a fixed placeholder path is
    # used instead.
    _THIS_DIR = os.path.abspath(os.path.dirname(__file__))
    _DATA_DIR = (
        "/path/to/linkage/data/"
        if EnvVar.GENERATING_CRATE_DOCS in os.environ
        else os.path.join(_THIS_DIR, "data")
    )
    FORENAME_SEX_FREQ_CSV = os.path.join(_DATA_DIR, "us_forename_sex_freq.zip")
    SURNAME_FREQ_CSV = os.path.join(_DATA_DIR, "us_surname_freq.zip")
    POSTCODES_CSV = os.path.join(_DATA_DIR, "ONSPD_MAY_2022_UK.zip")

    # Cache directory and default process count; again, fixed placeholder
    # values apply when that environment variable is present.
    if EnvVar.GENERATING_CRATE_DOCS not in os.environ:
        DEFAULT_CACHE_DIR = os.path.join(
            appdirs.user_data_dir(appname=_appname)
        )
        # Under Windows, a single process is usually faster!
        N_PROCESSES = 1 if platform.system() == "Windows" else cpu_count()
    else:
        DEFAULT_CACHE_DIR = "/path/to/crate/user/data"
        N_PROCESSES = 8

    # Caches
    FORENAME_CACHE_FILENAME = os.path.join(
        DEFAULT_CACHE_DIR, "fuzzy_forename_cache.jsonl"
    )
    POSTCODE_CACHE_FILENAME = os.path.join(
        DEFAULT_CACHE_DIR, "fuzzy_postcode_cache.json"
    )
    SURNAME_CACHE_FILENAME = os.path.join(
        DEFAULT_CACHE_DIR, "fuzzy_surname_cache.jsonl"
    )

    # -------------------------------------------------------------------------
    # Hashing, rounding
    # -------------------------------------------------------------------------
    HASH_KEY = "fuzzy_id_match_default_hash_key_DO_NOT_USE_FOR_LIVE_DATA"
    HASH_METHOD = HashMethods.HMAC_SHA256
    ROUNDING_SF = 5
    # ... the number of significant figures used when rounding frequencies.
    # Three may be too few: e.g. surname Smith at 0.01006 and its metaphone
    # SM0 at 0.010129999999999998 would be indistinguishable at 3sf.

    # -------------------------------------------------------------------------
    # Run-time options
    # -------------------------------------------------------------------------

    CHECK_COMPARISON_ORDER = False

    MIN_PROBANDS_FOR_PARALLEL = 1000
    # ... a machine that takes ~30s to set up a basic parallel run (and 107.9s
    # for a 10k-to-10k comparison) processes single results at about 37/s, so
    # break-even is probably at around 1000 -- though that also depends on
    # the sample size.

    # -------------------------------------------------------------------------
    # Population priors
    # -------------------------------------------------------------------------
    # See command-line help.
    # (E) Empirical; see validation paper.
    # (N) From national data.

    POPULATION_SIZE = CAMBS_PBORO_POPULATION_2018  # (N)

    FORENAME_MIN_FREQ = 5e-6
    SURNAME_MIN_FREQ = 5e-6
    # Alternatives tried: (a) a forename minimum frequency of 2.9e-8, since
    # the US forename data have a floor at 2.875e-8 (M), 2.930e-8 (F), i.e.
    # 2.9e-8 to 2sf; and (b) a surname minimum frequency of 1.5e-7, since the
    # US surname data report values below 3e-7 as 0, making 1.5e-7 the
    # midpoint of the low-frequency range. This doesn't (materially) affect
    # the best performance: accuracy etc. are still optimized at theta =
    # delta = 0, MID is still optimized at theta = delta = 15, and the WPM is
    # optimized at theta = 6, delta = 0 (rather than theta = 5, delta = 0).
    # However, these very low values just inflate MID overall and are not
    # very plausible; much below 1/n_p is not very plausible, and likely
    # over-emphasizes matches on unusual/unknown names. So: 5e-6 as
    # originally planned (since the previous US surname data had a floor at
    # 1e-5, and since we will pilot with n_p ~1e6).

    BIRTH_YEAR_PSEUDO_RANGE = 30  # (E) UK-wide ~90, perhaps; 30 empirically.

    P_FEMALE_GIVEN_MALE_OR_FEMALE = 0.51  # (N)
    P_NOT_MALE_OR_FEMALE = 0.004  # (N)

    K_POSTCODE = None  # default is to autocalculate from population; see paper

    # noinspection HttpUrlsUsage
    _ = """

    P_UNKNOWN_OR_PSEUDO_POSTCODE
    ----------------------------

    - Pseudo-postcodes: e.g. ZZ99 3VZ, no fixed abode; ZZ99 3CZ, England/UK
      not otherwise specified [4].
    - These postcodes are not in the ONS Postcode Directory.
    - In Apr-Jun 2019, 11.4% of households in England who were {homeless or
      threatened with homelessness} had no fixed abode [1, Table 2].
    - That table totals 68,180 households, so that probably matches the
      68,170 households in England used as the summary figure on p1 [1].
    - In 2020, there were ~27.8 million households in the UK [2].
    - The mean household size in the UK is 2.4 [2]. (Although the proportion
      who are homeless is likely biased towards single individuals?)
      Yes, "Nearly two-thirds of these were single households (households
      without children)."
    - 0.843 of the UK population live in England
    - So, the fraction of homelessness can be estimated as

        avg_people_per_household = 2.4
        n_people_per_homeless_household = (2 / 3) * 1 + (1 / 3) * avg_people_per_household
        n_people_homeless_england = (11.4 / 100) * 68180 * n_people_per_homeless_household
        n_people_uk = 27.8e6 * 2.4  # 66.7 million, so that's about right
        n_people_england = 0.843 * n_people_uk
        p_homeless = n_people_homeless_england / n_people_england

                   = 0.0002026794
      We'll round: 0.000203
      (So that's about 13.5k people with postcode ZZ99 3VZ, estimated.)

    [1] https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/852953/Statutory_Homelessness_Statistical_Release_Apr-Jun_2019.pdf  # noqa: E501
    [2] https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/families/bulletins/familiesandhouseholds/2020  # noqa: E501
    [3] https://pubmed.ncbi.nlm.nih.gov/35477868/
    [4] http://www.datadictionary.wales.nhs.uk/index.html#!WordDocuments/postcode.htm  # noqa: E501

    However, our empirical rate is 0.00201 for ZZ99 3VZ (SystmOne; see
    empirical_rates.sql).

    K_PSEUDOPOSTCODE
    ----------------

    Distinct postcodes in sector ZZ993, from
    https://files.digital.nhs.uk/assets/ods/current/Look%20Ups.zip:

        ZZ99 3AZ    Eire / Irish Republic / Southern Ireland
        ZZ99 3BZ    Isle of Man
        ZZ99 3CZ    England / Great Britain / United Kingdom (not otherwise stated)
        ZZ99 3EZ    Guernsey / Herm / Jethou Island / Lihou
        ZZ99 3FZ    Jersey
        ZZ99 3GZ    Wales
        ZZ99 3HZ    Channel Islands (not otherwise stated) / Alderney / Brechou / Sark, Little and Great
        ZZ99 3VZ    No fixed abode
        ZZ99 3WZ    At sea / In the air / Inadequately described/specified / Information refused / Not collected / Not known / Not stated/specified

    So there are 9 postcode units in the ZZ993 sector. Our estimate above is
    for homelessness, which is likely overrepresented, but these are also big
    groupings of visitors. No particularly strong evidence to deviate from 9
    at present (acknowledging this is a fairly fuzzy estimate anyway). The most
    important thing is that k_pseudopostcode > 1. It would be invalid for it
    to be <1 and if it is exactly 1, then p_pnf_postcode = 0 (because the
    sector probability will exactly match the unit probability) and any
    inadvertent sector-not-unit match will give a log likelihood of +∞ and a
    certain match.

    However, empirically in SystmOne, ZZ993 / ZZ993VZ = 1.83 (see paper).

    """

    P_UNKNOWN_OR_PSEUDO_POSTCODE = 0.00201  # (E)
    K_PSEUDOPOSTCODE = 1.83  # (E)

    # -------------------------------------------------------------------------
    # Error rates
    # -------------------------------------------------------------------------
    # (E) Empirical; see validation paper.
    # (*) Using the empirical value is much less efficient computationally.
    # The forename and surname rates below are dictionaries keyed by gender.
    P_EP1_FORENAME = {
        GENDER_FEMALE: 0.00894,  # (E)
        GENDER_MALE: 0.00840,  # (E)
    }
    P_EP2NP1_FORENAME = {
        GENDER_FEMALE: 0.00881,  # (E)
        GENDER_MALE: 0.00688,  # (E)
    }
    P_EN_FORENAME = {
        GENDER_FEMALE: 0.00572,  # (E)
        GENDER_MALE: 0.00625,  # (E)
    }

    P_U_FORENAME = 0.00191  # (E)

    P_EP1_SURNAME = {
        GENDER_FEMALE: 0.00551,  # (E)
        GENDER_MALE: 0.00471,  # (E)
    }
    P_EP2NP1_SURNAME = {
        GENDER_FEMALE: 0.00378,  # (E)
        GENDER_MALE: 0.00247,  # (E)
    }
    P_EN_SURNAME = {
        GENDER_FEMALE: 0.0567,  # (E)
        GENDER_MALE: 0.0134,  # (E)
    }

    _P_E_DOB = 0.00492  # DOB not full match (E)
    _P_EP_DOB_GIVEN_P_E_DOB = 0.933  # P(partial | not full); (E)
    P_EP_DOB = _P_E_DOB * _P_EP_DOB_GIVEN_P_E_DOB  # (E)
    P_EN_DOB_TRUE = _P_E_DOB * (1 - _P_EP_DOB_GIVEN_P_E_DOB)  # (E) (*)
    P_EN_DOB = 0  # Much faster (*)

    P_E_GENDER = 0.0033  # (E)

    P_EP_POSTCODE = 0.0097  # (E)
    P_EN_POSTCODE = 0.300  # (E)

    # -------------------------------------------------------------------------
    # Matching process
    # -------------------------------------------------------------------------
    MIN_LOG_ODDS_FOR_MATCH = 5  # theta, in the validation paper
    EXCEEDS_NEXT_BEST_LOG_ODDS = 0  # delta, in the validation paper
    PERFECT_ID_TRANSLATION = ""
    REPORT_EVERY = 100  # cosmetic only

    # -------------------------------------------------------------------------
    # Name handling
    # -------------------------------------------------------------------------

    # Upper-case set of nonspecific name components. The list includes
    # nobiliary particles (https://en.wikipedia.org/wiki/Nobiliary_particle),
    # which typically mean "of", "of the", or "the". See also
    # https://en.wikipedia.org/wiki/List_of_family_name_affixes;
    # https://en.wikipedia.org/wiki/Suffix_(name).
    # Some entries recur under several languages; the set removes duplicates.
    NONSPECIFIC_NAME_COMPONENTS = {
        x.upper()
        for x in (
            # Arabic-speaking countries
            "Al",
            "El",
            # Belgian
            "de",
            "der",
            "van",
            # Danish
            "af",
            # Dutch
            "tot",
            "thoe",
            "van",
            # English, Welsh, Scottish
            "of",
            # French
            "d",  # e.g. Giscard d'Estaing
            "de",
            "des",
            "du",
            "l",  # e.g. L'Estrange
            "la",
            "le",
            # German
            "auf",
            "von",
            "zu",
            # Italian
            "da",
            "dai",
            "dal",
            "dalla",
            "dei",
            "del",
            "dell",
            "della",
            "di",
            # Portuguese,
            "da",
            "das",
            "do",
            "dos",
            # Somali
            "Aw",
            # Spanish
            "de",
            # Swedish
            "af",
            "av",
            "von",
            # Swiss
            "de",
            "von",
            # Thai
            "na",
            "Phra",
            "Sri",
            # USA: seniority
            "Jnr",
            "Jr",
            "Snr",
            "Sr",
            # USA: numbering (not just the USA in theory; e.g. Richard III).
            "I",
            "II",
            "III",
            "IV",
            "V",
            "VI",
            "VII",
            "VIII",
            "IX",
            "X",
        )
    }
    # Pairs of (accented character, plain replacement).
    ACCENT_TRANSLITERATIONS = [
        # Only upper-case versions are required.
        # German: https://en.wikipedia.org/wiki/German_orthography
        ("Ä", "AE"),
        ("Ö", "OE"),
        ("Ü", "UE"),
        (ESZETT_UPPER_CASE, "SS"),
    ]

    # -------------------------------------------------------------------------
    # Derived
    # -------------------------------------------------------------------------

    MIN_P_FOR_MATCH = probability_from_log_odds(MIN_LOG_ODDS_FOR_MATCH)

    # String forms of the per-gender dictionaries above.
    P_EP1_FORENAME_CSV = _mk_dictstr(P_EP1_FORENAME)
    P_EP2NP1_FORENAME_CSV = _mk_dictstr(P_EP2NP1_FORENAME)
    P_EN_FORENAME_CSV = _mk_dictstr(P_EN_FORENAME)

    P_EP1_SURNAME_CSV = _mk_dictstr(P_EP1_SURNAME)
    P_EP2NP1_SURNAME_CSV = _mk_dictstr(P_EP2NP1_SURNAME)
    P_EN_SURNAME_CSV = _mk_dictstr(P_EN_SURNAME)

    # Alphabetically sorted, comma-separated name components.
    NONSPECIFIC_NAME_COMPONENTS_CSV = ",".join(
        sorted(NONSPECIFIC_NAME_COMPONENTS)
    )
    # Comma-separated "accent/plain" pairs.
    ACCENT_TRANSLITERATIONS_SLASH_CSV = ",".join(
        "/".join(pair) for pair in ACCENT_TRANSLITERATIONS
    )
    # Translation table (for str.translate) from accent to plain replacement.
    ACCENT_TRANSLITERATIONS_TRANS = str.maketrans(
        dict(ACCENT_TRANSLITERATIONS)
    )