Coverage for linkage/comparison.py: 63%

155 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1r""" 

2crate_anon/linkage/comparison.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Comparison classes for linkage tools.** 

27 

28These implement the maths without regard to the kind of identifier being 

29compared. Includes classes for full/partial matches, and a function to iterate 

30through a bunch of comparisons as part of a Bayesian probability calculation. 

31The hypothesis H throughout is that two people being compared are in fact the 

32same person. 

33 

34""" 

35 

36# ============================================================================= 

37# Imports 

38# ============================================================================= 

39 

40from typing import Iterable, Optional 

41 

42from cardinal_pythonlib.reprfunc import auto_repr 

43 

44from crate_anon.linkage.helpers import ( 

45 log_likelihood_ratio_from_p, 

46 log_posterior_odds_from_pdh_pdnh, 

47) 

48from crate_anon.linkage.constants import INFINITY, MINUS_INFINITY 

49 

50 

51# ============================================================================= 

52# Simple person-related probability calculations 

53# ============================================================================= 

54 

55 

class Comparison:
    """
    Abstract base for one comparison between two pieces of information, used
    to work out the posterior probability of a person match.

    Derived classes supply the description of D, P(D | H) and P(D | ¬H).

    Speed matters here, so keep extraneous parameters out.
    """

    def __init__(self) -> None:
        pass

    def __repr__(self) -> str:
        return auto_repr(self)

    def __str__(self) -> str:
        """
        Returns a brief description: D, followed by both probabilities.
        """
        description = self.d_description
        p_h = self.p_d_given_h
        p_not_h = self.p_d_given_not_h
        return f"{description} [P(D|H)={p_h}, P(D|¬H)={p_not_h}]"

    @property
    def d_description(self) -> str:
        """
        Describes D, the observed data (e.g. "match" or "mismatch").
        Must be provided by the derived class.
        """
        raise NotImplementedError("Implement in derived class!")

    @property
    def p_d_given_h(self) -> float:
        """
        Returns :math:`P(D | H)`: the probability of the observed data if the
        hypothesis (a match) is true. Must be provided by the derived class.
        """
        raise NotImplementedError("Implement in derived class!")

    @property
    def p_d_given_not_h(self) -> float:
        r"""
        Returns :math:`P(D | \neg H)`: the probability of the observed data
        if there is no match. Must be provided by the derived class.
        """
        raise NotImplementedError("Implement in derived class!")

    @property
    def log_likelihood_ratio(self) -> float:
        r"""
        Returns the log likelihood ratio, as calculated by
        ``log_likelihood_ratio_from_p`` from :math:`P(D | H)` and
        :math:`P(D | \neg H)`.
        """
        p_h = self.p_d_given_h
        p_not_h = self.p_d_given_not_h
        return log_likelihood_ratio_from_p(p_h, p_not_h)

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        """
        Returns the posterior log odds, given the prior log odds. Derived
        classes often override this with a faster version.

        Args:
            prior_log_odds:
                prior log odds that they're the same person

        Returns:
            float: posterior log odds, O(H | D)
        """
        # Deliberately no Python-level shortcut for P(D | H) == 0 (which
        # would give a likelihood ratio of 0 and hence MINUS_INFINITY): a
        # Python shortcut is slower than a compiled log.
        p_h = self.p_d_given_h
        p_not_h = self.p_d_given_not_h
        return log_posterior_odds_from_pdh_pdnh(
            log_prior_odds=prior_log_odds,
            p_d_given_h=p_h,
            p_d_given_not_h=p_not_h,
        )

130 

131 

class ImpossibleComparison(Comparison):
    """
    Marker comparison for impossibility/failure, i.e. P(D | H) = 0. It skips
    the arithmetic that would otherwise be needed to arrive at a likelihood
    ratio of 0.
    """

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        # Quick route: the prior is irrelevant.
        return MINUS_INFINITY

    @property
    def p_d_given_not_h(self) -> float:
        # The value is unimportant; 1 keeps things "in principle" calculable.
        return 1

    @property
    def p_d_given_h(self) -> float:
        return 0

    @property
    def d_description(self) -> str:
        return "ImpossibleComparison"

155 

156 

class CertainComparison(Comparison):
    r"""
    Special comparison to denote certainty, i.e. for when
    :math:`P(D | \neg H) = 0` while :math:`P(D | H) > 0`. It doesn't bother
    with the calculations involved in a likelihood ratio (which would entail
    dividing by zero); :meth:`posterior_log_odds` returns ``INFINITY``
    directly, whatever the prior.

    This is the mirror image of :class:`ImpossibleComparison`.
    """

    @property
    def d_description(self) -> str:
        return "CertainComparison"

    @property
    def p_d_given_h(self) -> float:
        # Unimportant as long as it's not 0.
        return 1

    @property
    def p_d_given_not_h(self) -> float:
        # Not used by posterior_log_odds() below. But zero: the data cannot
        # arise unless H is true.
        return 0

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        # Nice and quick: the prior is ignored.
        return INFINITY

181 

182 

class DirectComparison(Comparison):
    r"""
    A comparison for which the caller provides :math:`P(D | H)` and
    :math:`P(D | \neg H)` directly. This is the fastest real comparison: the
    log likelihood ratio is calculated once, at construction, so the object
    can be re-used quickly.
    """

    def __init__(
        self,
        p_d_given_same_person: float,
        p_d_given_diff_person: float,
        d_description: str = "?",
    ) -> None:
        r"""
        Args:
            p_d_given_same_person: :math:`P(D | H)`
            p_d_given_diff_person: :math:`P(D | \neg H)`
            d_description: description of D, the data
        """
        super().__init__()
        self._description = d_description
        self._p_d_given_h = p_d_given_same_person
        self._p_d_given_not_h = p_d_given_diff_person
        # Precalculated here so that posterior_log_odds() is a plain addition.
        self._log_likelihood_ratio = log_likelihood_ratio_from_p(
            p_d_given_h=p_d_given_same_person,
            p_d_given_not_h=p_d_given_diff_person,
        )

    def __str__(self) -> str:
        details = ", ".join(
            (
                f"{self._description}",
                f"P(D|H)={self.p_d_given_h}",
                f"P(D|¬H)={self.p_d_given_not_h}",
                f"log_likelihood_ratio={self._log_likelihood_ratio}",
            )
        )
        return f"DirectComparison[{details}]"

    @property
    def d_description(self) -> str:
        return self._description

    @property
    def p_d_given_h(self) -> float:
        return self._p_d_given_h

    @property
    def p_d_given_not_h(self) -> float:
        return self._p_d_given_not_h

    @property
    def log_likelihood_ratio(self) -> float:
        # The stored value, not a recalculation.
        return self._log_likelihood_ratio

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        # Fast version: add the precalculated log likelihood ratio.
        # (You can't use numba to compile a member function; the only
        # option is numba.jitclass() on the whole class. And making
        # DirectComparison a jitclass actually slowed things down.)
        return prior_log_odds + self._log_likelihood_ratio

242 

243 

class MatchNoMatchComparison(Comparison):
    """
    A comparison whose outcome is binary: a match, or no match.

    It exists to express that choice CLEARLY. Code creating one of these
    could instead create one of two :class:`DirectComparison` objects,
    depending on ``match``, but this form is often clearer.

    Not currently used in main code.
    """

    def __init__(
        self,
        match: bool,
        p_match_given_same_person: float,
        p_match_given_diff_person: float,
    ) -> None:
        r"""
        Args:
            match:
                D; is there a match?
            p_match_given_same_person:
                If match:
                :math:`P(D | H) = P(\text{match given same person}) = 1 - p_e`.
                If no match:
                :math:`P(D | H) = 1 - P(\text{match given same person}) = p_e`.
            p_match_given_diff_person:
                If match:
                :math:`P(D | \neg H) = P(\text{match given different person}) = p_f`.
                If no match:
                :math:`P(D | \neg H) = 1 - P(\text{match given different person}) = 1 - p_f`.
        """  # noqa: E501
        super().__init__()
        self.match = match
        self.p_match_given_same_person = p_match_given_same_person
        self.p_match_given_diff_person = p_match_given_diff_person

    @property
    def d_description(self) -> str:
        if self.match:
            return "match"
        return "mismatch"

    @property
    def p_d_given_h(self) -> float:
        # Match: 1 - p_e. Mismatch: p_e.
        p_same = self.p_match_given_same_person
        return p_same if self.match else 1 - p_same

    @property
    def p_d_given_not_h(self) -> float:
        # Match: p_f. Mismatch: 1 - p_f.
        p_diff = self.p_match_given_diff_person
        return p_diff if self.match else 1 - p_diff

298 

299 

class FullPartialNoMatchComparison(Comparison):
    """
    A comparison with three outcomes: full match, partial match, or neither.
    (With neither a full nor a partial match, the hypothesis is rejected.)

    As before, this exists for clarity. Code creating one of these could
    instead create one of three :class:`DirectComparison` objects, depending
    on ``full_match`` and ``partial_match``, but this form is generally much
    clearer.

    A full match takes precedence over a partial match if both are set.

    Not currently used in main code.
    """

    def __init__(
        self,
        full_match: bool,
        p_f: float,
        p_e: float,
        partial_match: bool,
        p_p: float,
    ) -> None:
        r"""
        Args:
            full_match:
                was there a full match?
            p_f:
                :math:`p_f = P(\text{full match} | \neg H)`
            p_e:
                :math:`p_e = P(\text{partial but not full match} | H)`
            partial_match:
                was there a partial match?
            p_p:
                :math:`p_p = P(\text{partial match} | \neg H)`
        """
        super().__init__()
        assert p_f <= p_p, f"p_p={p_p} < p_f={p_f}, but should have p_f <= p_p"
        self.full_match = full_match
        self.partial_match = partial_match
        self.p_f = p_f
        self.p_e = p_e
        self.p_p = p_p

    @property
    def d_description(self) -> str:
        if self.full_match:
            return "full match"
        if self.partial_match:
            return "partial match"
        return "mismatch"

    @property
    def p_d_given_h(self) -> float:
        if self.full_match:
            return 1 - self.p_e
        if self.partial_match:
            return self.p_e
        return 0

    @property
    def p_d_given_not_h(self) -> float:
        if self.full_match:
            return self.p_f
        if self.partial_match:
            return self.p_p - self.p_f
        # IRRELEVANT in this case, since p_d_given_h == 0.
        return 1 - self.p_p

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        if self.full_match or self.partial_match:
            return super().posterior_log_odds(prior_log_odds)
        # No match. Shortcut, since p_d_given_h is 0 and therefore the
        # likelihood ratio is 0:
        return MINUS_INFINITY

375 

376 

class AdjustLogOddsComparison(Comparison):
    """
    Applies a direct adjustment to the log odds (via the log likelihood
    ratio), rather than working from probabilities. See
    :func:`crate_anon.linkage.identifiers.gen_best_comparisons_unordered`.

    The probability properties are not meaningful for this class and raise
    :exc:`AssertionError` if accessed.
    """

    BAD_METHOD = "Bad method"

    def __init__(
        self,
        log_odds_delta: float,
        description: str = "?",
    ) -> None:
        """
        Args:
            log_odds_delta: amount added to the prior log odds
            description: description of D, the data
        """
        super().__init__()
        self._description = description
        self._log_likelihood_ratio = log_odds_delta
        # No probabilities are held; only the delta is used.
        self._p_d_given_h = None
        self._p_d_given_not_h = None

    def __str__(self) -> str:
        delta = self._log_likelihood_ratio
        return (
            f"AdjustLogOddsComparison[{self._description}, "
            f"log_odds_delta={delta}]"
        )

    @property
    def d_description(self) -> str:
        return self._description

    @property
    def log_likelihood_ratio(self) -> float:
        return self._log_likelihood_ratio

    @property
    def p_d_given_h(self) -> float:
        raise AssertionError(self.BAD_METHOD)

    @property
    def p_d_given_not_h(self) -> float:
        raise AssertionError(self.BAD_METHOD)

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        # Simple addition of the stored delta.
        return prior_log_odds + self._log_likelihood_ratio

420 

421 

422# ============================================================================= 

423# The main Bayesian comparison point 

424# ============================================================================= 

425 

426 

def bayes_compare(
    log_odds: float,
    comparisons: Iterable[Optional[Comparison]],
) -> float:
    """
    Applies each comparison in turn, updating the log odds as it goes, and
    returns the resulting posterior log odds. Entries that are ``None`` are
    skipped.

    Args:
        log_odds: prior log odds
        comparisons: an iterable of :class:`Comparison` objects (or ``None``)

    Returns:
        float: posterior log odds
    """
    # Speed-critical function. The running value is kept in the incoming
    # "log_odds" parameter on purpose: that is fractionally faster than
    # assigning it to a further variable.
    for item in filter(None, comparisons):
        log_odds = item.posterior_log_odds(log_odds)
        if log_odds == MINUS_INFINITY:
            # Stop early: where hitting -∞ is a realistic prospect, this
            # saves time.
            return MINUS_INFINITY
        # A check for +∞ would be possible too, but that (via PerfectID) is
        # handled outside the Bayesian process.
    return log_odds