Coverage for linkage/people.py: 87%

115 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1r""" 

2crate_anon/linkage/people.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**People representations for fuzzy matching.** 

27 

28""" 

29 

30# ============================================================================= 

31# Imports 

32# ============================================================================= 

33 

34from collections import defaultdict 

35import logging 

36from typing import ( 

37 Dict, 

38 Generator, 

39 Iterable, 

40 List, 

41 Optional, 

42 Set, 

43) 

44 

45from ordered_set import OrderedSet 

46 

47from crate_anon.linkage.constants import INFINITY, MINUS_INFINITY 

48from crate_anon.linkage.matchconfig import MatchConfig 

49from crate_anon.linkage.matchresult import MatchResult 

50from crate_anon.linkage.person import Person 

51 

52log = logging.getLogger(__name__) 

53 

54 

55# ============================================================================= 

56# Exceptions 

57# ============================================================================= 

58 

59 

class DuplicateIDError(Exception):
    """
    Raised when a person is added to a collection that already holds someone
    with the same local ID or the same perfect (person-unique) ID.
    """

62 

63 

64# ============================================================================= 

65# People: a collection of Person objects 

66# ============================================================================= 

67# Try staring at the word "people" for a while and watch it look odd... 

68 

69 

class People:
    """
    Represents a group of people, and implements a shortlist.

    People are kept in insertion order. They are also indexed by perfect
    (person-unique) ID and by full/partial date of birth (DOB), so that a
    shortlist of candidates can be produced for a proband without necessarily
    scanning everyone.
    """

    def __init__(
        self,
        cfg: MatchConfig,
        person: Optional[Person] = None,
        people: Optional[Iterable[Person]] = None,
    ) -> None:
        """
        Creates a collection, optionally populated with some initial people.

        Args:
            cfg:
                the matching configuration
            person:
                optional single person to add
            people:
                optional people to add (after ``person``, if both are given)

        Raises:
            DuplicateIDError:
                if some people have duplicate ``local_id`` values or duplicate
                perfect IDs.
            ValueError:
                if plaintext and hashed people are mixed.
        """
        self.cfg = cfg
        self.people: List[Person] = []
        # ... list is preferable to set, as we may slice it for parallel
        # processing, and it maintains order.

        # These may be plaintext or hashed DOB strings depending on our people:
        self.dob_md_to_people: Dict[str, List[Person]] = defaultdict(list)
        self.dob_yd_to_people: Dict[str, List[Person]] = defaultdict(list)
        self.dob_ym_to_people: Dict[str, List[Person]] = defaultdict(list)
        self.dob_ymd_to_people: Dict[str, List[Person]] = defaultdict(list)

        # Maps perfect ID key (e.g. "nhsnum") -> ID value -> person.
        self.perfect_id_map: Dict[str, Dict[str, Person]] = defaultdict(dict)

        self._known_local_ids: Set[str] = set()
        # None until the first person is added; then True/False.
        self._people_are_plaintext: Optional[bool] = None

        if person:
            self.add_person(person)
        if people:
            self.add_people(people)

    def add_person(self, person: Person) -> None:
        """
        Adds a single person.

        All checks are performed before anything is modified, so a rejected
        person leaves this collection exactly as it was.

        Args:
            person: the :class:`Person` to add

        Raises:
            ValueError:
                if the person is plaintext but existing people are hashed, or
                vice versa.
            DuplicateIDError:
                if the person has a ``local_id`` value, or a perfect ID,
                already in our collection.
        """
        # ---------------------------------------------------------------------
        # Phase 1: validate (no mutation).
        # ---------------------------------------------------------------------

        # Plaintext or hashed? Everyone in the collection must be the same.
        is_plaintext = person.is_plaintext()
        is_first_person = not self.people
        if not is_first_person and is_plaintext != self._people_are_plaintext:
            new = Person.plain_or_hashed_txt(is_plaintext)
            old = Person.plain_or_hashed_txt(self._people_are_plaintext)
            raise ValueError(
                f"Trying to add a {new} person but all existing people "
                f"are {old}"
            )

        # Check local ID not duplicated.
        local_id = person.local_id
        if local_id in self._known_local_ids:
            raise DuplicateIDError(
                f"Person with duplicate local ID {local_id!r}"
            )

        # Check perfect IDs not duplicated.
        identifiers = person.perfect_id.identifiers
        for key, value in identifiers.items():
            # e.g. key = "nhsnum", value = some NHS number as a string, or a
            # hashed equivalent.
            # Use .get() so that merely checking does not create map entries.
            if value in self.perfect_id_map.get(key, ()):
                raise DuplicateIDError(
                    f"Person with duplicate perfect ID {key} = {value!r}"
                )

        # ---------------------------------------------------------------------
        # Phase 2: commit.
        # ---------------------------------------------------------------------
        if is_first_person:
            self._people_are_plaintext = is_plaintext
        self._known_local_ids.add(local_id)
        for key, value in identifiers.items():
            self.perfect_id_map[key][value] = person

        # Add to DOB maps.
        dob = person.dob
        if dob:
            self.dob_md_to_people[dob.dob_md].append(person)
            self.dob_yd_to_people[dob.dob_yd].append(person)
            self.dob_ym_to_people[dob.dob_ym].append(person)
            self.dob_ymd_to_people[dob.dob_str].append(person)
        else:
            # DOB absent.
            # We do need a way to retrieve people with no DOB.
            # We use a blank string key for this, in the full-DOB map only.
            # We do not need to add to the partial DOB maps. See
            # gen_shortlist().
            self.dob_ymd_to_people[""].append(person)

        # Add the person.
        self.people.append(person)

    def add_people(self, people: Iterable[Person]) -> None:
        """
        Adds multiple people, one at a time and in order.

        If an exception is raised part-way through, people preceding the
        offending one will already have been added.

        Args:
            people: the :class:`Person` objects to add

        Raises:
            ValueError:
                if plaintext and hashed people are mixed.
            DuplicateIDError:
                if some people have duplicate ``local_id`` values or perfect
                IDs with respect to those we already know.
        """
        for person in people:
            self.add_person(person)

    def size(self) -> int:
        """
        Returns the number of people in this object.
        """
        return len(self.people)

    def ensure_valid_as_probands(self) -> None:
        """
        Ensures all people have sufficient information to act as a proband,
        or raises :exc:`ValueError`.
        """
        log.info("Validating probands...")
        for p in self.people:
            p.ensure_valid_as_proband()
        log.debug("... OK")

    def ensure_valid_as_sample(self) -> None:
        """
        Ensures all people have sufficient information to act as a candidate
        from a sample, or raises :exc:`ValueError`.
        """
        log.info("Validating sample...")
        for p in self.people:
            p.ensure_valid_as_candidate()
        log.debug("... OK")

    def get_perfect_match(self, proband: Person) -> Optional[Person]:
        """
        Returns the first person who matches on a perfect (person-unique) ID,
        or ``None``.

        The proband's perfect ID keys are translated via
        ``cfg.remap_perfect_id_key()`` before being looked up here.

        Args:
            proband: a :class:`Person`
        """
        remap_key = self.cfg.remap_perfect_id_key
        perfect_id_map = self.perfect_id_map
        for key, value in proband.perfect_id.identifiers.items():
            # Use .get() rather than indexing: indexing a defaultdict would
            # insert an empty dict for every unknown key we are asked about.
            id_to_person = perfect_id_map.get(remap_key(key))
            if not id_to_person:
                continue
            winner = id_to_person.get(value)
            if winner:
                return winner
        return None

    def gen_shortlist(self, proband: Person) -> Generator[Person, None, None]:
        """
        Generates a shortlist of potential candidates for fuzzy matching (e.g.
        by restriction to same/similar dates of birth -- or with no such
        restriction, if preferred).

        Args:
            proband: a :class:`Person`

        Yields:
            candidate :class:`Person` objects from this collection, each at
            most once, in a consistent order
        """
        # A high-speed function.
        cfg = self.cfg
        dob = proband.dob

        # 2023-02-28 update for referees:
        # - Allow comparison where the DOB is missing.
        # - Of necessity, probands with no DOBs must be compared to all
        #   candidates.
        # - Likewise, if we permit a complete DOB mismatch (where DOBs are
        #   present), we must compare to all candidates.
        if cfg.complete_dob_mismatch_allowed or not dob:
            # No shortlisting; everyone's a candidate. Slow.
            # self.people is a list, so order is consistent and matches the
            # input.
            yield from self.people
            return

        # Implement the shortlist by DOB.
        # Most efficient to let set operations determine uniqueness, then
        # iterate through the set.
        # We use an OrderedSet to be sure of consistency; the precise
        # ordering is as below (e.g. people with the same DOB, then those
        # with the partial matches as shown below). Within each category,
        # the ordering will be as the input. (Thus, if configured for
        # duplicate detection, which entails identical DOBs, the earliest
        # winner will always be the first in the input.)
        #
        # Lookups use .get(key, ()) rather than indexing, because indexing a
        # defaultdict inserts an empty list for every DOB we are asked about
        # but do not hold, growing the maps with each unmatched proband.

        # First, exact matches:
        shortlist = OrderedSet(self.dob_ymd_to_people.get(dob.dob_str, ()))

        # Now, we'll slow it all down with partial matches:
        if cfg.partial_dob_mismatch_allowed:
            shortlist.update(self.dob_md_to_people.get(dob.dob_md, ()))
            shortlist.update(self.dob_yd_to_people.get(dob.dob_yd, ()))
            shortlist.update(self.dob_ym_to_people.get(dob.dob_ym, ()))

        # But also, we must include any candidates who have no DOB.
        # (We already know that our proband has a DOB, or we would have
        # returned above.)
        shortlist.update(self.dob_ymd_to_people.get("", ()))

        yield from shortlist

    def get_unique_match_detailed(self, proband: Person) -> MatchResult:
        """
        Finds the best and second-best candidates for the proband, and
        decides whether the best is a winner (as defined by the probability
        settings in ``cfg``).

        A perfect ID match, if there is one, is taken as the best candidate
        with infinite log odds, and no fuzzy comparison is performed.

        Args:
            proband: a :class:`Person`

        Returns:
            a :class:`MatchResult` holding the best and second-best
            candidates and their log odds; its ``winner`` attribute is set to
            the best candidate only if the match criteria are met
        """

        # 2020-04-25: Do this in one pass.
        # A bit like
        # https://www.geeksforgeeks.org/python-program-to-find-second-largest-number-in-a-list/  # noqa: E501
        # ... but modified, as that fails to deal with joint winners
        # ... and it's not a super algorithm anyway.

        # Step 1. Scan everything in a single pass, establishing the best
        # candidate and the runner-up.
        cfg = self.cfg
        best_log_odds = MINUS_INFINITY
        second_best_log_odds = MINUS_INFINITY

        second_best_candidate: Optional[Person] = None
        best_candidate = self.get_perfect_match(proband)
        if best_candidate:
            best_log_odds = INFINITY
        else:
            # Fuzzy matching
            proband_log_odds_same = proband.log_odds_same  # for speed
            for candidate in self.gen_shortlist(proband):
                log_odds = proband_log_odds_same(candidate)
                if log_odds > best_log_odds:
                    # New leader; the old leader becomes the runner-up.
                    second_best_log_odds = best_log_odds
                    second_best_candidate = best_candidate
                    best_log_odds = log_odds
                    best_candidate = candidate
                elif log_odds > second_best_log_odds:
                    second_best_log_odds = log_odds
                    second_best_candidate = candidate
                # If log_odds == best_log_odds, we don't change the winner,
                # i.e. the first-encountered candidate continues in the lead.
                # The shortlist is generated in a consistent order.

        result = MatchResult(
            best_log_odds=best_log_odds,
            second_best_log_odds=second_best_log_odds,
            best_candidate=best_candidate,
            second_best_candidate=second_best_candidate,
            proband=proband,
        )

        # Step 2. Is there a winner?
        # (a) There needs to be a best candidate.
        # (b) The best needs to be good enough.
        # (c) The best must beat the runner-up by a sufficient margin.
        if (
            best_candidate
            and best_log_odds >= cfg.min_log_odds_for_match
            and best_log_odds
            >= (second_best_log_odds + cfg.exceeds_next_best_log_odds)
        ):
            result.winner = best_candidate

        return result

    def get_unique_match(self, proband: Person) -> Optional[Person]:
        """
        Returns a single person matching the proband, or ``None`` if there is
        no match (as defined by the probability settings in ``cfg``).

        Args:
            proband: a :class:`Person`

        Returns:
            the winner (a :class:`Person`) or ``None``
        """
        result = self.get_unique_match_detailed(proband)
        return result.winner

    def hashed(self) -> "People":
        """
        Returns a hashed version of itself: a new collection, with the same
        config, built from the hashed version of each person.
        """
        return People(cfg=self.cfg, people=[p.hashed() for p in self.people])

    def copy(self) -> "People":
        """
        Returns a copy of itself: a new collection, with the same config,
        built from a copy of each person.
        """
        return People(cfg=self.cfg, people=[p.copy() for p in self.people])

362 

363 def copy(self) -> "People": 

364 """ 

365 Returns a copy of itself. 

366 """ 

367 return People(cfg=self.cfg, people=[p.copy() for p in self.people])