Coverage for linkage/comparison.py: 63%
155 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1r"""
2crate_anon/linkage/comparison.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Comparison classes for linkage tools.**
28These implement the maths without regard to the kind of identifier being
29compared. Includes classes for full/partial matches, and a function to iterate
30through a bunch of comparisons as part of a Bayesian probability calculation.
31The hypothesis H throughout is that two people being compared are in fact the
32same person.
34"""
36# =============================================================================
37# Imports
38# =============================================================================
40from typing import Iterable, Optional
42from cardinal_pythonlib.reprfunc import auto_repr
44from crate_anon.linkage.helpers import (
45 log_likelihood_ratio_from_p,
46 log_posterior_odds_from_pdh_pdnh,
47)
48from crate_anon.linkage.constants import INFINITY, MINUS_INFINITY
51# =============================================================================
52# Simple person-related probability calculations
53# =============================================================================
class Comparison:
    """
    Abstract base for a single piece of evidence in a person-matching
    calculation.

    A comparison describes the observed data D through P(D | H) and
    P(D | ¬H), where H is the hypothesis that the two people being compared
    are the same person, and converts prior log odds into posterior log odds.

    This code must be fast, so it deliberately carries no extraneous
    parameters.
    """

    def __init__(self) -> None:
        # Nothing to initialise; exists so that subclasses can chain to it.
        pass

    def __str__(self) -> str:
        """
        Returns a brief description: the description of D followed by both
        conditional probabilities.
        """
        description = self.d_description
        p_h = self.p_d_given_h
        p_not_h = self.p_d_given_not_h
        return f"{description} [P(D|H)={p_h}, P(D|¬H)={p_not_h}]"

    def __repr__(self) -> str:
        return auto_repr(self)

    @property
    def d_description(self) -> str:
        """
        A description of D, the data (e.g. "match" or "mismatch").

        Raises:
            NotImplementedError: always, in this base class
        """
        raise NotImplementedError("Implement in derived class!")

    @property
    def p_d_given_h(self) -> float:
        """
        Returns :math:`P(D | H)`, the probability of the observed data given
        the hypothesis of a match.

        Raises:
            NotImplementedError: always, in this base class
        """
        raise NotImplementedError("Implement in derived class!")

    @property
    def p_d_given_not_h(self) -> float:
        r"""
        Returns :math:`P(D | \neg H)`, the probability of the observed data
        given no match.

        Raises:
            NotImplementedError: always, in this base class
        """
        raise NotImplementedError("Implement in derived class!")

    @property
    def log_likelihood_ratio(self) -> float:
        """
        Returns the log likelihood ratio, computed from the two conditional
        probabilities by ``log_likelihood_ratio_from_p``.
        """
        p_h = self.p_d_given_h
        p_not_h = self.p_d_given_not_h
        return log_likelihood_ratio_from_p(p_h, p_not_h)

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        """
        Returns the posterior log odds, given the prior log odds. Often
        overridden in derived classes for a faster version.

        Args:
            prior_log_odds:
                prior log odds that they're the same person

        Returns:
            float: posterior log odds, O(H | D)
        """
        # There is deliberately no Python-level shortcut for P(D | H) == 0
        # (which would give minus infinity): the extra check is slower than
        # simply letting the compiled log deal with it.
        p_h = self.p_d_given_h
        p_not_h = self.p_d_given_not_h
        return log_posterior_odds_from_pdh_pdnh(
            log_prior_odds=prior_log_odds,
            p_d_given_h=p_h,
            p_d_given_not_h=p_not_h,
        )
class ImpossibleComparison(Comparison):
    """
    Marker comparison for impossibility/failure: P(D | H) = 0, so the
    likelihood ratio is 0. Rather than performing the calculation, it reports
    posterior log odds of minus infinity directly.
    """

    @property
    def d_description(self) -> str:
        return "ImpossibleComparison"

    @property
    def p_d_given_h(self) -> float:
        # The defining property: the data cannot arise if H is true.
        return 0

    @property
    def p_d_given_not_h(self) -> float:
        # The value is unimportant; 1 keeps things calculable "in principle".
        return 1

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        # Quick path: the result does not depend on the prior.
        return MINUS_INFINITY
class CertainComparison(Comparison):
    r"""
    Special comparison to denote certainty, i.e. for when
    :math:`P(D | \neg H) = 0` while :math:`P(D | H) > 0`, so that the
    likelihood ratio is infinite. It doesn't bother with the calculations
    involved: the posterior log odds are returned as plus infinity whatever
    the prior.

    This is the counterpart of :class:`ImpossibleComparison`, which covers
    :math:`P(D | H) = 0`.
    """

    @property
    def d_description(self) -> str:
        return "CertainComparison"

    @property
    def p_d_given_h(self) -> float:
        # Unimportant as long as it's not 0.
        return 1

    @property
    def p_d_given_not_h(self) -> float:
        # Zero: the data cannot arise if the people differ. Not used by the
        # shortcut in posterior_log_odds() below.
        return 0

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        # Nice and quick: the result does not depend on the prior.
        return INFINITY
class DirectComparison(Comparison):
    r"""
    A comparison for which the caller provides :math:`P(D | H)` and
    :math:`P(D | \neg H)` directly. This is the fastest real comparison: the
    log likelihood ratio is worked out once, at construction, so the object
    can be re-used cheaply.
    """

    def __init__(
        self,
        p_d_given_same_person: float,
        p_d_given_diff_person: float,
        d_description: str = "?",
    ) -> None:
        r"""
        Args:
            p_d_given_same_person: :math:`P(D | H)`
            p_d_given_diff_person: :math:`P(D | \neg H)`
            d_description: description of D, the data
        """
        super().__init__()
        self._p_d_given_h = p_d_given_same_person
        self._p_d_given_not_h = p_d_given_diff_person
        # Precalculated here so that posterior_log_odds() is a plain addition.
        self._log_likelihood_ratio = log_likelihood_ratio_from_p(
            p_d_given_h=p_d_given_same_person,
            p_d_given_not_h=p_d_given_diff_person,
        )
        self._description = d_description

    def __str__(self) -> str:
        fields = ", ".join(
            (
                f"{self._description}",
                f"P(D|H)={self.p_d_given_h}",
                f"P(D|¬H)={self.p_d_given_not_h}",
                f"log_likelihood_ratio={self._log_likelihood_ratio}",
            )
        )
        return f"DirectComparison[{fields}]"

    @property
    def d_description(self) -> str:
        return self._description

    @property
    def p_d_given_h(self) -> float:
        return self._p_d_given_h

    @property
    def p_d_given_not_h(self) -> float:
        return self._p_d_given_not_h

    @property
    def log_likelihood_ratio(self) -> float:
        # Stored at construction; nothing is recalculated here.
        return self._log_likelihood_ratio

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        # Fast version: add the precalculated log likelihood ratio.
        # (You can't use numba to compile a member function; the only option
        # is numba.jitclass() on the whole class. And making DirectComparison
        # a jitclass actually slowed things down.)
        return prior_log_odds + self._log_likelihood_ratio
class MatchNoMatchComparison(Comparison):
    """
    A comparison whose outcome is simply "match" or "no match".

    This exists to express that choice CLEARLY. Code producing one of these
    could instead produce one of two :class:`DirectComparison` objects,
    depending on ``match``, but this form is often easier to read.

    Not currently used in main code.
    """

    def __init__(
        self,
        match: bool,
        p_match_given_same_person: float,
        p_match_given_diff_person: float,
    ) -> None:
        r"""
        Args:
            match:
                D; is there a match?
            p_match_given_same_person:
                :math:`P(\text{match given same person}) = 1 - p_e`.
                If match, :math:`P(D | H)` is this value;
                if no match, :math:`P(D | H)` is one minus it,
                i.e. :math:`p_e`.
            p_match_given_diff_person:
                :math:`P(\text{match given different person}) = p_f`.
                If match, :math:`P(D | \neg H)` is this value;
                if no match, :math:`P(D | \neg H)` is one minus it,
                i.e. :math:`1 - p_f`.
        """
        super().__init__()
        self.match = match
        self.p_match_given_same_person = p_match_given_same_person
        self.p_match_given_diff_person = p_match_given_diff_person

    @property
    def d_description(self) -> str:
        if self.match:
            return "match"
        return "mismatch"

    @property
    def p_d_given_h(self) -> float:
        # Match: 1 - p_e. Mismatch: p_e.
        return (
            self.p_match_given_same_person
            if self.match
            else 1 - self.p_match_given_same_person
        )

    @property
    def p_d_given_not_h(self) -> float:
        # Match: p_f. Mismatch: 1 - p_f.
        return (
            self.p_match_given_diff_person
            if self.match
            else 1 - self.p_match_given_diff_person
        )
class FullPartialNoMatchComparison(Comparison):
    """
    A comparison that may yield a full match or a partial match. (With
    neither, the hypothesis is rejected.)

    As with :class:`MatchNoMatchComparison`, this is for clarity. Code
    producing one of these could instead produce one of three
    :class:`DirectComparison` objects, depending on ``full_match`` and
    ``partial_match``, but this is generally much clearer.

    Not currently used in main code.
    """

    def __init__(
        self,
        full_match: bool,
        p_f: float,
        p_e: float,
        partial_match: bool,
        p_p: float,
    ) -> None:
        r"""
        Args:
            full_match:
                was there a full match?
            p_f:
                :math:`p_f = P(\text{full match} | \neg H)`
            p_e:
                :math:`p_e = P(\text{partial but not full match} | H)`
            partial_match:
                was there a partial match?
            p_p:
                :math:`p_p = P(\text{partial match} | \neg H)`; must be at
                least ``p_f`` (the probability used for a partial-only match
                under :math:`\neg H` is ``p_p - p_f``)
        """
        super().__init__()
        assert p_f <= p_p, f"p_p={p_p} < p_f={p_f}, but should have p_f <= p_p"
        self.full_match = full_match
        self.p_f = p_f
        self.p_e = p_e
        self.partial_match = partial_match
        self.p_p = p_p

    @property
    def d_description(self) -> str:
        # A full match takes precedence over a partial match.
        if self.full_match:
            return "full match"
        if self.partial_match:
            return "partial match"
        return "mismatch"

    @property
    def p_d_given_h(self) -> float:
        if self.full_match:
            return 1 - self.p_e
        if self.partial_match:
            return self.p_e
        # Neither kind of match: treated as impossible under H.
        return 0

    @property
    def p_d_given_not_h(self) -> float:
        if self.full_match:
            return self.p_f
        if self.partial_match:
            return self.p_p - self.p_f
        # IRRELEVANT in practice, since p_d_given_h is 0 in this case.
        return 1 - self.p_p

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        if self.full_match or self.partial_match:
            return super().posterior_log_odds(prior_log_odds)
        # No match. Shortcut, since p_d_given_h is 0 and therefore the
        # likelihood ratio is 0.
        return MINUS_INFINITY
class AdjustLogOddsComparison(Comparison):
    """
    Applies a fixed adjustment to the log odds, by supplying the log
    likelihood ratio directly instead of deriving it from probabilities. See
    :func:`crate_anon.linkage.identifiers.gen_best_comparisons_unordered`.

    Because no probabilities are supplied, ``p_d_given_h`` and
    ``p_d_given_not_h`` raise :exc:`AssertionError` if accessed.
    """

    BAD_METHOD = "Bad method"

    def __init__(
        self,
        log_odds_delta: float,
        description: str = "?",
    ) -> None:
        """
        Args:
            log_odds_delta:
                amount added to the prior log odds (stored as the log
                likelihood ratio)
            description:
                description of D, the data
        """
        super().__init__()
        self._p_d_given_h = None
        self._p_d_given_not_h = None
        self._log_likelihood_ratio = log_odds_delta
        self._description = description

    def __str__(self) -> str:
        description = self._description
        delta = self._log_likelihood_ratio
        return f"AdjustLogOddsComparison[{description}, log_odds_delta={delta}]"

    @property
    def d_description(self) -> str:
        return self._description

    @property
    def p_d_given_h(self) -> float:
        # This class carries no probabilities.
        raise AssertionError(self.BAD_METHOD)

    @property
    def p_d_given_not_h(self) -> float:
        # This class carries no probabilities.
        raise AssertionError(self.BAD_METHOD)

    @property
    def log_likelihood_ratio(self) -> float:
        return self._log_likelihood_ratio

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        return prior_log_odds + self._log_likelihood_ratio
422# =============================================================================
423# The main Bayesian comparison point
424# =============================================================================
def bayes_compare(
    log_odds: float,
    comparisons: Iterable[Optional[Comparison]],
) -> float:
    """
    Applies a sequence of comparisons in turn, starting from the prior log
    odds, and returns the resulting posterior log odds. Entries that are
    ``None`` are skipped.

    Args:
        log_odds: prior log odds
        comparisons: an iterable of :class:`Comparison` objects (or ``None``)

    Returns:
        float: posterior log odds
    """
    # High speed function. The running value is kept in the "log_odds"
    # parameter itself, because that is fractionally faster than copying it
    # into another variable first.
    usable_comparisons = filter(None, comparisons)
    for comparison in usable_comparisons:
        log_odds = comparison.posterior_log_odds(log_odds)
        if log_odds == MINUS_INFINITY:
            # Stop early: saves time when there is a realistic chance of
            # hitting -∞.
            return MINUS_INFINITY
        # +∞ is not checked for here; that (via PerfectID) is done outside
        # the Bayesian process.
    return log_odds