Coverage for linkage/constants.py: 97%

142 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1r""" 

2crate_anon/linkage/constants.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Constants for linkage tools.** 

27 

28""" 

29 

30# ============================================================================= 

31# Imports 

32# ============================================================================= 

33 

34import math 

35from multiprocessing import cpu_count 

36import os 

37import platform 

38from typing import Dict 

39 

40import appdirs 

41from cardinal_pythonlib.hash import HashMethods 

42from cardinal_pythonlib.probability import probability_from_log_odds 

43 

44from crate_anon.common.constants import EnvVar 

45 

46 

47# ============================================================================= 

48# Helper functions 

49# ============================================================================= 

50 

51 

def _mk_dictstr(x: Dict[str, float]) -> str:
    """
    Render a dictionary as a single comma-separated string of ``key:value``
    pairs, in the dictionary's own iteration order.

    An empty dictionary gives an empty string.
    """
    pairs = [f"{key}:{value}" for key, value in x.items()]
    return ",".join(pairs)

54 

55 

56# ============================================================================= 

57# Constants 

58# ============================================================================= 

59 

60# CHECK_BASIC_ASSERTIONS_IN_HIGH_SPEED_FUNCTIONS = False # for debugging only 

61 

# Numeric sentinels, and the type of the None singleton.
INFINITY = math.inf
MINUS_INFINITY = -INFINITY
NONE_TYPE = type(None)

# Calendar conversion factors.
MONTHS_PER_YEAR = 12
DAYS_PER_YEAR = 365.25  # approximately!
DAYS_PER_MONTH = DAYS_PER_YEAR / MONTHS_PER_YEAR  # on average

# Absolute path of the directory containing this module.
THIS_DIR = os.path.abspath(os.path.dirname(__file__))

# Population figures.
UK_MEAN_OA_POPULATION_2011 = 309  # no longer used; left here for interest.
# ... https://www.ons.gov.uk/methodology/geography/ukgeographies/censusgeography  # noqa: E501
UK_POPULATION_2017 = 66_040_000  # 2017 figure, 66.04m
CAMBS_PBORO_POPULATION_2018 = 852_523

# Gender codes: three standard codes, plus the empty string for "missing".
GENDER_MISSING = ""
GENDER_MALE = "M"
GENDER_FEMALE = "F"
GENDER_OTHER = "X"
VALID_GENDERS = [GENDER_MISSING, GENDER_MALE, GENDER_FEMALE, GENDER_OTHER]

83 

84 

# Translation table (for str.translate) that flattens whitespace variants and
# typographic punctuation to plain equivalents. Each group of source
# characters below maps to one replacement.
SIMPLIFY_PUNCTUATION_WHITESPACE_TRANS = str.maketrans(
    {
        # tab, linefeed, carriage return -> space
        **dict.fromkeys("\t\n\r", " "),
        # curly left/right double quotes -> straight double quote
        **dict.fromkeys("“”", '"'),
        # curly left/right single quotes -> straight single quote
        **dict.fromkeys("‘’", "'"),
        # en dash, em dash, minus -> hyphen
        **dict.fromkeys("–—−", "-"),
    }
)

99 

100 

# The capital Eszett was only introduced in 2017; before that, "SS" served as
# the capital form. See https://en.wikipedia.org/wiki/%C3%9F.
ESZETT_LOWER_CASE = "ß"
ESZETT_UPPER_CASE = "ẞ"

# Table mapping the lower-case Eszett to the capital Eszett.
SAFE_UPPER_PRETRANSLATE = str.maketrans(
    {ESZETT_LOWER_CASE: ESZETT_UPPER_CASE}
)

# Table replacing either Eszett with a double "s" of the matching case.
MANGLE_PRETRANSLATE = str.maketrans(
    {ESZETT_LOWER_CASE: "ss", ESZETT_UPPER_CASE: "SS"}
)

112 

113 

class Switches:
    """
    Names shared between the argparse command-line options and the
    corresponding MatchConfig parameters (where they appear in error
    messages). Every value is the lower-case form of its attribute name.
    """

    # Input/output
    INPUT = "input"
    OUTPUT = "output"
    INCLUDE_OTHER_INFO = "include_other_info"

    # Run-time behaviour
    EXTRA_VALIDATION_OUTPUT = "extra_validation_output"
    CHECK_COMPARISON_ORDER = "check_comparison_order"
    REPORT_EVERY = "report_every"
    MIN_PROBANDS_FOR_PARALLEL = "min_probands_for_parallel"
    N_WORKERS = "n_workers"

    # Hashing and rounding
    KEY = "key"
    ALLOW_DEFAULT_HASH_KEY = "allow_default_hash_key"
    HASH_METHOD = "hash_method"
    ROUNDING_SF = "rounding_sf"
    LOCAL_ID_HASH_KEY = "local_id_hash_key"

    # Population
    POPULATION_SIZE = "population_size"

    # Names
    ACCENT_TRANSLITERATIONS = "accent_transliterations"
    FORENAME_CACHE_FILENAME = "forename_cache_filename"
    FORENAME_SEX_FREQ_CSV = "forename_sex_freq_csv"
    FORENAME_MIN_FREQUENCY = "forename_min_frequency"
    NONSPECIFIC_NAME_COMPONENTS = "nonspecific_name_components"
    SURNAME_CACHE_FILENAME = "surname_cache_filename"
    SURNAME_FREQ_CSV = "surname_freq_csv"
    SURNAME_MIN_FREQUENCY = "surname_min_frequency"

    # Date of birth
    BIRTH_YEAR_PSEUDO_RANGE = "birth_year_pseudo_range"

    # Gender
    P_NOT_MALE_OR_FEMALE = "p_not_male_or_female"
    P_FEMALE_GIVEN_MALE_OR_FEMALE = "p_female_given_male_or_female"

    # Postcodes
    POSTCODE_CACHE_FILENAME = "postcode_cache_filename"
    POSTCODE_CSV_FILENAME = "postcode_csv_filename"
    P_UNKNOWN_OR_PSEUDO_POSTCODE = "p_unknown_or_pseudo_postcode"

    # Error probabilities: forenames
    P_EP1_FORENAME = "p_ep1_forename"
    P_EP2NP1_FORENAME = "p_ep2np1_forename"
    P_EN_FORENAME = "p_en_forename"
    P_U_FORENAME = "p_u_forename"

    # Error probabilities: surnames
    P_EP1_SURNAME = "p_ep1_surname"
    P_EP2NP1_SURNAME = "p_ep2np1_surname"
    P_EN_SURNAME = "p_en_surname"

    # Error probabilities: date of birth
    P_EP_DOB = "p_ep_dob"
    P_EN_DOB = "p_en_dob"

    # Error probabilities: gender
    P_E_GENDER = "p_e_gender"

    # Error probabilities and multipliers: postcodes
    P_EP_POSTCODE = "p_ep_postcode"
    P_EN_POSTCODE = "p_en_postcode"
    K_POSTCODE = "k_postcode"
    K_PSEUDOPOSTCODE = "k_pseudopostcode"

    # Matching thresholds
    MIN_LOG_ODDS_FOR_MATCH = "min_log_odds_for_match"
    EXCEEDS_NEXT_BEST_LOG_ODDS = "exceeds_next_best_log_odds"
    PERFECT_ID_TRANSLATION = "perfect_id_translation"

178 

179 

class FuzzyDefaults:
    """
    Default configuration values for the linkage tools: data/cache filenames,
    hashing and rounding settings, run-time options, population priors, error
    rates, matching thresholds, name-handling tables, and some string forms
    derived from those values.

    When the ``EnvVar.GENERATING_CRATE_DOCS`` environment variable is set,
    machine-specific values (directories, process count) are replaced by fixed
    placeholders.

    Key to the annotations used below:

    - (E) Empirical; see validation paper.
    - (N) From national data.
    - (*) Using the empirical value is much less efficient computationally.
    """

    # -------------------------------------------------------------------------
    # Filenames
    # -------------------------------------------------------------------------
    _appname = "crate"

    # Public data that we provide a local copy of
    _THIS_DIR = os.path.abspath(os.path.dirname(__file__))
    if EnvVar.GENERATING_CRATE_DOCS in os.environ:
        _DATA_DIR = "/path/to/linkage/data/"
    else:
        _DATA_DIR = os.path.join(_THIS_DIR, "data")
    FORENAME_SEX_FREQ_CSV = os.path.join(_DATA_DIR, "us_forename_sex_freq.zip")
    SURNAME_FREQ_CSV = os.path.join(_DATA_DIR, "us_surname_freq.zip")
    POSTCODES_CSV = os.path.join(_DATA_DIR, "ONSPD_MAY_2022_UK.zip")

    # Cache directory and default number of processes. Placeholders are used
    # when generating documentation.
    if EnvVar.GENERATING_CRATE_DOCS in os.environ:
        DEFAULT_CACHE_DIR = "/path/to/crate/user/data"
        N_PROCESSES = 8
    elif platform.system() == "Windows":
        DEFAULT_CACHE_DIR = appdirs.user_data_dir(appname=_appname)
        N_PROCESSES = 1  # usually faster!
    else:
        DEFAULT_CACHE_DIR = appdirs.user_data_dir(appname=_appname)
        N_PROCESSES = cpu_count()

    # Caches
    FORENAME_CACHE_FILENAME = os.path.join(
        DEFAULT_CACHE_DIR, "fuzzy_forename_cache.jsonl"
    )
    POSTCODE_CACHE_FILENAME = os.path.join(
        DEFAULT_CACHE_DIR, "fuzzy_postcode_cache.json"
    )
    SURNAME_CACHE_FILENAME = os.path.join(
        DEFAULT_CACHE_DIR, "fuzzy_surname_cache.jsonl"
    )

    # -------------------------------------------------------------------------
    # Hashing, rounding
    # -------------------------------------------------------------------------
    HASH_KEY = "fuzzy_id_match_default_hash_key_DO_NOT_USE_FOR_LIVE_DATA"
    HASH_METHOD = HashMethods.HMAC_SHA256
    ROUNDING_SF = 5
    # ... number of significant figures for frequency rounding; 3 may be too
    # small, e.g. surname Smith 0.01006, corresponding metaphone SM0
    # 0.010129999999999998 would be the same at 3sf.

    # -------------------------------------------------------------------------
    # Run-time options
    # -------------------------------------------------------------------------

    CHECK_COMPARISON_ORDER = False

    MIN_PROBANDS_FOR_PARALLEL = 1000
    # ... a machine that takes ~30s to set up a basic parallel run (and 107.9s
    # for a 10k-to-10k comparison) processes single results at about 37/s... so
    # the break-even point is probably around 1000. But that does depend on the
    # sample size too.

    # -------------------------------------------------------------------------
    # Population priors
    # -------------------------------------------------------------------------
    # See command-line help.
    # (E) Empirical; see validation paper.
    # (N) From national data.

    POPULATION_SIZE = CAMBS_PBORO_POPULATION_2018  # (N)

    FORENAME_MIN_FREQ = 5e-6
    SURNAME_MIN_FREQ = 5e-6
    # Tried with (a) forename minimum frequency 2.9e-8, on the basis of US
    # forename data giving a floor at 2.875e-8 (M), 2.930e-8 (F), so 2.9e-8 to
    # 2sf; and (b) surname minimum frequency at 1.5e-7, since in the US surname
    # data, values below 3e-7 are reported as 0, so 1.5e-7 is the midpoint of
    # the low-frequency range. This doesn't (materially) affect the best
    # performance: accuracy etc. are still optimized at theta = delta = 0, MID
    # is still optimized at theta = delta = 15, and the WPM is optimized at
    # theta = 6, delta = 0 (rather than theta = 5, delta = 0). However, these
    # very low values just inflate MID overall and are not very plausible; much
    # below 1/n_p is not very plausible, and likely over-emphasizes matches on
    # unusual/unknown names. So: 5e-6 as originally planned (since the previous
    # US surname data had a floor at 1e-5, and since we will pilot with n_p
    # ~1e6).

    BIRTH_YEAR_PSEUDO_RANGE = 30  # (E) UK-wide ~90, perhaps; 30 empirically.

    P_FEMALE_GIVEN_MALE_OR_FEMALE = 0.51  # (N)
    P_NOT_MALE_OR_FEMALE = 0.004  # (N)

    K_POSTCODE = None  # default is to autocalculate from population; see paper

    # Working notes for the two postcode constants that follow, kept as a
    # throwaway string rather than comments.
    # noinspection HttpUrlsUsage
    _ = """

    P_UNKNOWN_OR_PSEUDO_POSTCODE
    ----------------------------

    - Pseudo-postcodes: e.g. ZZ99 3VZ, no fixed abode; ZZ99 3CZ, England/UK
      not otherwise specified [4].
    - These postcodes are not in the ONS Postcode Directory.
    - In Apr-Jun 2019, 11.4% of households in England who were {homeless or
      threatened with homelessness} had no fixed abode [1, Table 2].
    - That table totals 68,180 households, so that probably matches the
      68,170 households in England used as the summary figure on p1 [1].
    - In 2020, there were ~27.8 million households in the UK [2].
    - The mean household size in the UK is 2.4 [2]. (Although the proportion
      who are homeless is likely biased towards single individuals?)
      Yes, "Nearly two-thirds of these were single households (households
      without children)."
    - 0.843 of the UK population live in England
    - So, the fraction of homelessness can be estimated as

        avg_people_per_household = 2.4
        n_people_per_homeless_household = (2 / 3) * 1 + (1 / 3) * avg_people_per_household
        n_people_homeless_england = (11.4 / 100) * 68180 * n_people_per_homeless_household
        n_people_uk = 27.8e6 * 2.4  # 66.7 million, so that's about right
        n_people_england = 0.843 * n_people_uk
        p_homeless = n_people_homeless_england / n_people_england

      = 0.0002026794
      We'll round: 0.000203
      (So that's about 13.5k people with postcode ZZ99 3VZ, estimated.)

    [1] https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/852953/Statutory_Homelessness_Statistical_Release_Apr-Jun_2019.pdf  # noqa: E501
    [2] https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/families/bulletins/familiesandhouseholds/2020  # noqa: E501
    [3] https://pubmed.ncbi.nlm.nih.gov/35477868/
    [4] http://www.datadictionary.wales.nhs.uk/index.html#!WordDocuments/postcode.htm  # noqa: E501

    However, our empirical rate is 0.00201 for ZZ99 3VZ (SystmOne; see
    empirical_rates.sql).

    K_PSEUDOPOSTCODE
    ----------------

    Distinct postcodes in sector ZZ993, from
    https://files.digital.nhs.uk/assets/ods/current/Look%20Ups.zip:

        ZZ99 3AZ Eire / Irish Republic / Southern Ireland
        ZZ99 3BZ Isle of Man
        ZZ99 3CZ England / Great Britain / United Kingdom (not otherwise stated)
        ZZ99 3EZ Guernsey / Herm / Jethou Island / Lihou
        ZZ99 3FZ Jersey
        ZZ99 3GZ Wales
        ZZ99 3HZ Channel Islands (not otherwise stated) / Alderney / Brechou / Sark, Little and Great
        ZZ99 3VZ No fixed abode
        ZZ99 3WZ At sea / In the air / Inadequately described/specified / Information refused / Not collected / Not known / Not stated/specified

    So there are 9 postcode units in the ZZ993 sector. Our estimate above is
    for homelessness, which is likely overrepresented, but these are also big
    groupings of visitors. No particularly strong evidence to deviate from 9
    at present (acknowledging this is a fairly fuzzy estimate anyway). The most
    important thing is that k_pseudopostcode > 1. It would be invalid for it
    to be <1 and if it is exactly 1, then p_pnf_postcode = 0 (because the
    sector probability will exactly match the unit probability) and any
    inadvertent sector-not-unit match will give a log likelihood of +∞ and a
    certain match.

    However, empirically in SystmOne, ZZ993 / ZZ993VZ = 1.83 (see paper).

    """

    P_UNKNOWN_OR_PSEUDO_POSTCODE = 0.00201  # (E)
    K_PSEUDOPOSTCODE = 1.83  # (E)

    # -------------------------------------------------------------------------
    # Error rates
    # -------------------------------------------------------------------------
    # (E) Empirical; see validation paper.
    # (*) Using the empirical value is much less efficient computationally.
    P_EP1_FORENAME = {
        GENDER_FEMALE: 0.00894,  # (E)
        GENDER_MALE: 0.00840,  # (E)
    }
    P_EP2NP1_FORENAME = {
        GENDER_FEMALE: 0.00881,  # (E)
        GENDER_MALE: 0.00688,  # (E)
    }
    P_EN_FORENAME = {
        GENDER_FEMALE: 0.00572,  # (E)
        GENDER_MALE: 0.00625,  # (E)
    }

    P_U_FORENAME = 0.00191  # (E)

    P_EP1_SURNAME = {
        GENDER_FEMALE: 0.00551,  # (E)
        GENDER_MALE: 0.00471,  # (E)
    }
    P_EP2NP1_SURNAME = {
        GENDER_FEMALE: 0.00378,  # (E)
        GENDER_MALE: 0.00247,  # (E)
    }
    P_EN_SURNAME = {
        GENDER_FEMALE: 0.0567,  # (E)
        GENDER_MALE: 0.0134,  # (E)
    }

    _P_E_DOB = 0.00492  # DOB not full match (E)
    _P_EP_DOB_GIVEN_P_E_DOB = 0.933  # P(partial | not full); (E)
    P_EP_DOB = _P_E_DOB * _P_EP_DOB_GIVEN_P_E_DOB  # (E)
    P_EN_DOB_TRUE = _P_E_DOB * (1 - _P_EP_DOB_GIVEN_P_E_DOB)  # (E) (*)
    P_EN_DOB = 0  # Much faster (*)

    P_E_GENDER = 0.0033  # (E)

    P_EP_POSTCODE = 0.0097  # (E)
    P_EN_POSTCODE = 0.300  # (E)

    # -------------------------------------------------------------------------
    # Matching process
    # -------------------------------------------------------------------------
    MIN_LOG_ODDS_FOR_MATCH = 5  # theta, in the validation paper
    EXCEEDS_NEXT_BEST_LOG_ODDS = 0  # delta, in the validation paper
    PERFECT_ID_TRANSLATION = ""
    REPORT_EVERY = 100  # cosmetic only

    # -------------------------------------------------------------------------
    # Name handling
    # -------------------------------------------------------------------------

    # Upper-cased set of nonspecific name components. Some entries are listed
    # under more than one language for documentation; duplicates collapse in
    # the set.
    NONSPECIFIC_NAME_COMPONENTS = {
        # Includes nobiliary particles:
        # https://en.wikipedia.org/wiki/Nobiliary_particle. Typically these
        # mean "of", "of the", or "the". See also
        # https://en.wikipedia.org/wiki/List_of_family_name_affixes;
        # https://en.wikipedia.org/wiki/Suffix_(name).
        x.upper()
        for x in (
            # Arabic-speaking countries
            "Al",
            "El",
            # Belgian
            "de",
            "der",
            "van",
            # Danish
            "af",
            # Dutch
            "tot",
            "thoe",
            "van",
            # English, Welsh, Scottish
            "of",
            # French
            "d",  # e.g. Giscard d'Estaing
            "de",
            "des",
            "du",
            "l",  # e.g. L'Estrange
            "la",
            "le",
            # German
            "auf",
            "von",
            "zu",
            # Italian
            "da",
            "dai",
            "dal",
            "dalla",
            "dei",
            "del",
            "dell",
            "della",
            "di",
            # Portuguese
            "da",
            "das",
            "do",
            "dos",
            # Somali
            "Aw",
            # Spanish
            "de",
            # Swedish
            "af",
            "av",
            "von",
            # Swiss
            "de",
            "von",
            # Thai
            "na",
            "Phra",
            "Sri",
            # USA: seniority
            "Jnr",
            "Jr",
            "Snr",
            "Sr",
            # USA: numbering (not just the USA in theory; e.g. Richard III).
            "I",
            "II",
            "III",
            "IV",
            "V",
            "VI",
            "VII",
            "VIII",
            "IX",
            "X",
        )
    }
    # Pairs of (accented character, plain replacement).
    ACCENT_TRANSLITERATIONS = [
        # Only upper-case versions are required.
        # German: https://en.wikipedia.org/wiki/German_orthography
        ("Ä", "AE"),
        ("Ö", "OE"),
        ("Ü", "UE"),
        (ESZETT_UPPER_CASE, "SS"),
    ]

    # -------------------------------------------------------------------------
    # Derived
    # -------------------------------------------------------------------------

    MIN_P_FOR_MATCH = probability_from_log_odds(MIN_LOG_ODDS_FOR_MATCH)

    # "key:value,key:value" string forms of the per-gender error rates.
    P_EP1_FORENAME_CSV = _mk_dictstr(P_EP1_FORENAME)
    P_EP2NP1_FORENAME_CSV = _mk_dictstr(P_EP2NP1_FORENAME)
    P_EN_FORENAME_CSV = _mk_dictstr(P_EN_FORENAME)

    P_EP1_SURNAME_CSV = _mk_dictstr(P_EP1_SURNAME)
    P_EP2NP1_SURNAME_CSV = _mk_dictstr(P_EP2NP1_SURNAME)
    P_EN_SURNAME_CSV = _mk_dictstr(P_EN_SURNAME)

    # Sorted, so the string is stable regardless of set iteration order.
    NONSPECIFIC_NAME_COMPONENTS_CSV = ",".join(
        sorted(NONSPECIFIC_NAME_COMPONENTS)
    )
    # Comma-separated "accent/plain" pairs.
    ACCENT_TRANSLITERATIONS_SLASH_CSV = ",".join(
        f"{accent}/{plain}" for accent, plain in ACCENT_TRANSLITERATIONS
    )
    # Table for str.translate, built directly from the (accent, plain) pairs.
    ACCENT_TRANSLITERATIONS_TRANS = str.maketrans(
        dict(ACCENT_TRANSLITERATIONS)
    )