Coverage for linkage/people.py: 87%
115 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1r"""
2crate_anon/linkage/people.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**People representations for fuzzy matching.**
28"""
30# =============================================================================
31# Imports
32# =============================================================================
34from collections import defaultdict
35import logging
36from typing import (
37 Dict,
38 Generator,
39 Iterable,
40 List,
41 Optional,
42 Set,
43)
45from ordered_set import OrderedSet
47from crate_anon.linkage.constants import INFINITY, MINUS_INFINITY
48from crate_anon.linkage.matchconfig import MatchConfig
49from crate_anon.linkage.matchresult import MatchResult
50from crate_anon.linkage.person import Person
52log = logging.getLogger(__name__)
55# =============================================================================
56# Exceptions
57# =============================================================================
class DuplicateIDError(Exception):
    """
    Raised when a person being added to a :class:`People` collection has a
    ``local_id``, or a perfect (person-unique) identifier, that is already
    present in that collection.
    """
64# =============================================================================
65# People: a collection of Person objects
66# =============================================================================
67# Try staring at the word "people" for a while and watch it look odd...
class People:
    """
    Represents a group of people, and implements a shortlist.

    Alongside the ordered list of people, this object maintains lookup
    tables keyed by full and partial DOB strings (used by
    :meth:`gen_shortlist`) and by perfect, person-unique identifiers (used by
    :meth:`get_perfect_match`).
    """

    def __init__(
        self,
        cfg: MatchConfig,
        person: Optional[Person] = None,
        people: Optional[Iterable[Person]] = None,
    ) -> None:
        """
        Creates a collection, optionally populating it.

        Args:
            cfg: the matching configuration, stored as ``self.cfg``
            person: optional single person to add
            people: optional iterable of people to add (after ``person``)

        Raises :exc:`DuplicateIDError` if some people have duplicate
        ``local_id`` values or duplicate perfect IDs, and :exc:`ValueError`
        if plaintext and hashed people are mixed.
        """
        self.cfg = cfg
        self.people = []  # type: List[Person]
        # ... list is preferable to set, as we may slice it for parallel
        # processing, and it maintains order.

        # These may be plaintext or hashed DOB strings depending on our people:
        self.dob_md_to_people = defaultdict(
            list
        )  # type: Dict[str, List[Person]]
        self.dob_yd_to_people = defaultdict(
            list
        )  # type: Dict[str, List[Person]]
        self.dob_ym_to_people = defaultdict(
            list
        )  # type: Dict[str, List[Person]]
        self.dob_ymd_to_people = defaultdict(
            list
        )  # type: Dict[str, List[Person]]

        # Maps perfect ID key (e.g. "nhsnum") -> {ID value -> person}.
        self.perfect_id_map = defaultdict(
            dict
        )  # type: Dict[str, Dict[str, Person]]

        self._known_local_ids = set()  # type: Set[str]
        # Set when the first person is added; None while empty.
        self._people_are_plaintext = None  # type: Optional[bool]

        if person:
            self.add_person(person)
        if people:
            self.add_people(people)

    def add_person(self, person: Person) -> None:
        """
        Adds a single person.

        All checks are performed before any internal state is modified, so a
        rejected person leaves the collection exactly as it was.

        Raises:
            ValueError: if the person is plaintext but existing people are
                hashed, or vice versa
            DuplicateIDError: if the person has a ``local_id`` value, or a
                perfect ID value, that is already in our collection
        """
        # ---------------------------------------------------------------------
        # Phase 1: validate (no mutation).
        # ---------------------------------------------------------------------
        # Plaintext or hashed? Only relevant if this is not the first person.
        is_plaintext = person.is_plaintext()
        if self.people and is_plaintext != self._people_are_plaintext:
            new = Person.plain_or_hashed_txt(is_plaintext)
            old = Person.plain_or_hashed_txt(self._people_are_plaintext)
            raise ValueError(
                f"Trying to add a {new} person but all existing people "
                f"are {old}"
            )

        # Check local ID not duplicated.
        local_id = person.local_id
        if local_id in self._known_local_ids:
            raise DuplicateIDError(
                f"Person with duplicate local ID {local_id!r}"
            )

        # Check perfect IDs not duplicated.
        identifiers = person.perfect_id.identifiers
        for key, value in identifiers.items():
            # e.g. key = "nhsnum", value = some NHS number as a string, or a
            # hashed equivalent.
            # Use .get() so that checking does not create map entries.
            if value in self.perfect_id_map.get(key, ()):
                raise DuplicateIDError(
                    f"Person with duplicate perfect ID {key} = {value!r}"
                )

        # ---------------------------------------------------------------------
        # Phase 2: commit.
        # ---------------------------------------------------------------------
        if not self.people:
            # First person: they determine plaintext versus hashed.
            self._people_are_plaintext = is_plaintext
        self._known_local_ids.add(local_id)

        # Build perfect ID map.
        for key, value in identifiers.items():
            self.perfect_id_map[key][value] = person

        # Add to DOB maps.
        dob = person.dob
        if dob:
            self.dob_md_to_people[dob.dob_md].append(person)
            self.dob_yd_to_people[dob.dob_yd].append(person)
            self.dob_ym_to_people[dob.dob_ym].append(person)
            self.dob_ymd_to_people[dob.dob_str].append(person)
        else:
            # DOB absent.
            # We do need a way to retrieve people with no DOB.
            # We use a blank string key for this:
            self.dob_ymd_to_people[""].append(person)
            # We do not need to add to the partial DOB maps. See
            # gen_shortlist().

        # Add the person.
        self.people.append(person)

    def add_people(self, people: Iterable[Person]) -> None:
        """
        Adds multiple people, in order, via :meth:`add_person`.

        Raises :exc:`DuplicateIDError` if some people have duplicate
        ``local_id`` values (or perfect IDs) with respect to those we already
        know. People preceding the offending one remain added.
        """
        for person in people:
            self.add_person(person)

    def size(self) -> int:
        """
        Returns the number of people in this object.
        """
        return len(self.people)

    def ensure_valid_as_probands(self) -> None:
        """
        Ensures all people have sufficient information to act as a proband,
        or raises :exc:`ValueError`.
        """
        log.info("Validating probands...")
        for p in self.people:
            p.ensure_valid_as_proband()
        log.debug("... OK")

    def ensure_valid_as_sample(self) -> None:
        """
        Ensures all people have sufficient information to act as a candidate
        from a sample, or raises :exc:`ValueError`.
        """
        log.info("Validating sample...")
        for p in self.people:
            p.ensure_valid_as_candidate()
        log.debug("... OK")

    def get_perfect_match(self, proband: Person) -> Optional[Person]:
        """
        Returns the first person who matches on a perfect (person-unique) ID,
        or ``None``.

        The proband's identifier keys are remapped via
        ``cfg.remap_perfect_id_key()`` before lookup. This is a read-only
        operation: it does not add entries to ``perfect_id_map``.
        """
        remap = self.cfg.remap_perfect_id_key
        for key, value in proband.perfect_id.identifiers.items():
            # .get() rather than indexing: perfect_id_map is a defaultdict,
            # and indexing an unknown key would insert an empty dict.
            id_to_person = self.perfect_id_map.get(remap(key))
            if not id_to_person:
                continue
            winner = id_to_person.get(value)
            if winner:
                return winner
        return None

    def gen_shortlist(self, proband: Person) -> Generator[Person, None, None]:
        """
        Generates a shortlist of potential candidates for fuzzy matching (e.g.
        by restriction to same/similar dates of birth -- or with no such
        restriction, if preferred).

        Args:
            proband: a :class:`Person`

        Yields:
            candidates (each a :class:`Person`) from this collection, in a
            consistent order
        """
        # A high-speed function.
        cfg = self.cfg
        dob = proband.dob

        # 2023-02-28 update for referees:
        # - Allow comparison where the DOB is missing.
        # - Of necessity, probands with no DOBs must be compared to all
        #   candidates.
        # - Likewise, if we permit a complete DOB mismatch (where DOBs are
        #   present), we must compare to all candidates.
        if cfg.complete_dob_mismatch_allowed or not dob:
            # No shortlisting; everyone's a candidate. Slow.
            # self.people is a list, so order is consistent and matches
            # the input.
            yield from self.people
            return

        # Implement the shortlist by DOB.
        # Most efficient to let set operations determine uniqueness, then
        # iterate through the set.
        # We use an OrderedSet to be sure of consistency; the precise
        # ordering is as below (e.g. people with the same DOB, then those
        # with the partial matches as shown below). Within each category,
        # the ordering will be as the input. (Thus, if configured for
        # duplicate detection, which entails identical DOBs, the earliest
        # winner will always be the first in the input.)
        #
        # The DOB maps are defaultdicts; we use .get() so that lookups for
        # DOBs we have never seen do not insert empty lists into them.
        nobody = ()

        # First, exact matches:
        shortlist = OrderedSet(self.dob_ymd_to_people.get(dob.dob_str, nobody))

        # Now, we'll slow it all down with partial matches:
        if cfg.partial_dob_mismatch_allowed:
            shortlist.update(self.dob_md_to_people.get(dob.dob_md, nobody))
            shortlist.update(self.dob_yd_to_people.get(dob.dob_yd, nobody))
            shortlist.update(self.dob_ym_to_people.get(dob.dob_ym, nobody))

        # But also, we must include any candidates who have no DOB.
        # (We already know that our proband has a DOB, or we would have
        # returned above.)
        shortlist.update(self.dob_ymd_to_people.get("", nobody))

        yield from shortlist

    def get_unique_match_detailed(self, proband: Person) -> MatchResult:
        """
        Finds the best and second-best candidates for the proband, and
        decides whether the best is a winner (as defined by the probability
        settings in ``cfg``).

        Args:
            proband: a :class:`Person`

        Returns:
            a :class:`MatchResult` holding the best/second-best candidates
            and their log odds; its ``winner`` attribute is set to the best
            candidate only if that candidate meets the thresholds
        """

        # 2020-04-25: Do this in one pass.
        # A bit like
        # https://www.geeksforgeeks.org/python-program-to-find-second-largest-number-in-a-list/  # noqa: E501
        # ... but modified, as that fails to deal with joint winners
        # ... and it's not a super algorithm anyway.

        # Step 1. Scan everything in a single pass, establishing the best
        # candidate and the runner-up.
        cfg = self.cfg
        best_log_odds = MINUS_INFINITY
        second_best_log_odds = MINUS_INFINITY

        second_best_candidate = None  # type: Optional[Person]
        best_candidate = self.get_perfect_match(proband)
        if best_candidate:
            # A perfect ID match trumps everything; no fuzzy scan is done.
            best_log_odds = INFINITY
        else:
            # Fuzzy matching
            proband_log_odds_same = proband.log_odds_same  # for speed
            for candidate in self.gen_shortlist(proband):
                log_odds = proband_log_odds_same(candidate)
                if log_odds > best_log_odds:
                    # New leader; the old leader becomes the runner-up.
                    second_best_log_odds = best_log_odds
                    second_best_candidate = best_candidate
                    best_log_odds = log_odds
                    best_candidate = candidate
                elif log_odds > second_best_log_odds:
                    # This branch also handles a tie with the leader: the
                    # tying candidate becomes the runner-up.
                    second_best_log_odds = log_odds
                    second_best_candidate = candidate
                # If log_odds == best_log_odds, we don't change the winner,
                # i.e. the first-encountered candidate continues in the lead.
                # The shortlist is generated in a consistent order.

        result = MatchResult(
            best_log_odds=best_log_odds,
            second_best_log_odds=second_best_log_odds,
            best_candidate=best_candidate,
            second_best_candidate=second_best_candidate,
            proband=proband,
        )

        # Step 2. Is there a winner?
        if (
            best_candidate
            and best_log_odds >= cfg.min_log_odds_for_match
            and best_log_odds
            >= (second_best_log_odds + cfg.exceeds_next_best_log_odds)
        ):
            # (a) There needs to be a best candidate.
            # (b) The best needs to be good enough.
            # (c) The best must beat the runner-up by a sufficient margin.
            result.winner = best_candidate

        return result

    def get_unique_match(self, proband: Person) -> Optional[Person]:
        """
        Returns a single person matching the proband, or ``None`` if there is
        no match (as defined by the probability settings in ``cfg``).

        Args:
            proband: a :class:`Person`

        Returns:
            the winner (a :class:`Person`) or ``None``
        """
        result = self.get_unique_match_detailed(proband)
        return result.winner

    def hashed(self) -> "People":
        """
        Returns a hashed version of itself: a new :class:`People` object,
        with the same config, built from ``p.hashed()`` for each person.
        """
        return People(cfg=self.cfg, people=[p.hashed() for p in self.people])

    def copy(self) -> "People":
        """
        Returns a copy of itself: a new :class:`People` object, with the same
        config, built from ``p.copy()`` for each person.
        """
        return People(cfg=self.cfg, people=[p.copy() for p in self.people])