Coverage for crateweb/anonymise_api/serializers.py: 98%

114 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1""" 

2crate_anon/crateweb/anonymise_api/serializers.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26Django REST Framework serializer to anonymise the data. 

27 

28""" 

29 

30from collections import OrderedDict 

31from typing import Dict, List, Optional 

32 

33from django.conf import settings 

34 

35from cardinal_pythonlib.hash import GenericHasher, HashMethods, make_hasher 

36from rest_framework.serializers import ( 

37 BooleanField, 

38 CharField, 

39 DictField, 

40 IntegerField, 

41 ListField, 

42 Serializer, 

43 SerializerMethodField, 

44) 

45 

46from crate_anon.anonymise.constants import ( 

47 AnonymiseConfigDefaults as Defaults, 

48 AnonymiseConfigKeys as ConfigKeys, 

49 DATE_BLURRING_DIRECTIVES_CSV, 

50 ScrubMethod, 

51) 

52from crate_anon.anonymise.scrub import ( 

53 NonspecificScrubber, 

54 PersonalizedScrubber, 

55 WordList, 

56) 

57from crate_anon.crateweb.anonymise_api.constants import ( 

58 ApiKeys, 

59 ApiSettingsKeys, 

60) 

61 

62 

63# noinspection PyAbstractClass 

class SpecificSerializer(Serializer):
    """
    Scrubbing information about a specific person or group of people
    (e.g. patient data, third-party data).

    Every field is a list of strings. Each one declares an empty-list default,
    so callers need only supply the categories they have values for.
    """

    # Values to be scrubbed as dates.
    dates = ListField(
        help_text="List of dates to be scrubbed.",
        child=CharField(),
        initial=[],
        default=[],
    )
    # Multi-word phrases.
    phrases = ListField(
        help_text=(
            "List of phrases (words appearing consecutively) to be scrubbed."
        ),
        child=CharField(),
        initial=[],
        default=[],
    )
    # As for "phrases", except that (per the help text) purely numeric
    # phrases are ignored.
    non_numeric_phrases = ListField(
        help_text=(
            "List of phrases (words appearing consecutively) to be "
            "scrubbed. If a phrase is purely numeric it will be ignored."
        ),
        child=CharField(),
        initial=[],
        default=[],
    )
    # Individual words.
    words = ListField(
        help_text="List of words to be scrubbed.",
        child=CharField(),
        initial=[],
        default=[],
    )
    # Numbers are accepted as strings (the child field is a CharField).
    numbers = ListField(
        help_text="List of numbers to be scrubbed.",
        child=CharField(),
        initial=[],
        default=[],
    )
    # Codes, such as postcodes.
    codes = ListField(
        help_text="List of codes (e.g. postcodes) to be scrubbed.",
        child=CharField(),
        initial=[],
        default=[],
    )

113 

114 

115# noinspection PyAbstractClass 

class AllowlistSerializer(Serializer):
    """
    Represents allowlist options: things that should NOT be scrubbed.

    Both fields are lists of strings with an empty-list default.
    """

    # Words given explicitly in the request.
    words = ListField(
        help_text="Do not scrub these specific words.",
        child=CharField(),
        initial=[],
        default=[],
    )
    # Aliases for word files; the aliases themselves are defined on the
    # server (see the help text), not supplied as paths by the caller.
    files = ListField(
        help_text=(
            "Do not scrub words from these filename aliases (defined on "
            "the server)."
        ),
        child=CharField(),
        initial=[],
        default=[],
    )

136 

137 

138# noinspection PyAbstractClass 

class DenylistSerializer(Serializer):
    """
    Represents denylist options: things that should always be scrubbed.

    Both fields are lists of strings with an empty-list default.
    """

    # Words given explicitly in the request.
    words = ListField(
        help_text="Scrub these specific words.",
        child=CharField(),
        initial=[],
        default=[],
    )
    # Aliases for word files; the aliases themselves are defined on the
    # server (see the help text), not supplied as paths by the caller.
    files = ListField(
        help_text=(
            "Scrub words from these filename aliases (defined on the server)."
        ),
        child=CharField(),
        initial=[],
        default=[],
    )

158 

159 

160# noinspection PyAbstractClass 

class ScrubSerializer(Serializer):
    """
    Represents all scrubber settings, including data to be scrubbed and
    scrubber configuration settings.

    Input fields carry the text to scrub, optional patient/third-party
    details, and configuration options whose defaults are taken from
    ``AnonymiseConfigDefaults``. The read-only ``anonymised`` field is
    computed by :meth:`get_anonymised` and returns the scrubbed text.
    """

    # -------------------------------------------------------------------------
    # Input/Output fields
    # -------------------------------------------------------------------------
    # Note: supplying "default" implies required=False.

    text = DictField(
        child=CharField(help_text="Text to be scrubbed."),
        help_text=(
            "The lines of text to be scrubbed, each keyed on a unique "
            "ID supplied by the caller."
        ),
    )
    patient = SpecificSerializer(
        required=False, help_text="Specific patient data to be scrubbed."
    )
    third_party = SpecificSerializer(
        required=False,
        help_text="Third party (e.g. family members') data to be scrubbed.",
    )
    anonymise_codes_at_word_boundaries_only = BooleanField(
        default=Defaults.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY,
        initial=Defaults.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY,
        help_text=(
            "Ensure the codes to be scrubbed begin and end with a word "
            "boundary."
        ),
    )
    anonymise_dates_at_word_boundaries_only = BooleanField(
        default=Defaults.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY,
        initial=Defaults.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY,
        help_text=(
            "Ensure the dates to be scrubbed begin and end with a word "
            "boundary."
        ),
    )
    anonymise_numbers_at_word_boundaries_only = BooleanField(
        default=Defaults.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY,
        initial=Defaults.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY,
        help_text=(
            "Ensure the numbers to be scrubbed begin and end with a "
            "word boundary."
        ),
    )
    anonymise_numbers_at_numeric_boundaries_only = BooleanField(
        default=Defaults.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY,
        initial=Defaults.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY,
        help_text=(
            "Ensure the numbers to be scrubbed begin and end with a "
            "numeric boundary."
        ),
    )
    anonymise_strings_at_word_boundaries_only = BooleanField(
        default=Defaults.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY,
        initial=Defaults.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY,
        help_text=(
            "Ensure the strings to be scrubbed begin and end with a "
            "word boundary."
        ),
    )
    string_max_regex_errors = IntegerField(
        default=Defaults.STRING_MAX_REGEX_ERRORS,
        initial=Defaults.STRING_MAX_REGEX_ERRORS,
        help_text=(
            "The maximum number of typographical insertion/deletion/"
            "substitution errors to permit."
        ),
    )
    min_string_length_for_errors = IntegerField(
        default=Defaults.MIN_STRING_LENGTH_FOR_ERRORS,
        initial=Defaults.MIN_STRING_LENGTH_FOR_ERRORS,
        help_text=(
            "The minimum string length at which typographical "
            "errors will be permitted."
        ),
    )
    min_string_length_to_scrub_with = IntegerField(
        default=Defaults.MIN_STRING_LENGTH_TO_SCRUB_WITH,
        initial=Defaults.MIN_STRING_LENGTH_TO_SCRUB_WITH,
        help_text="Do not scrub strings shorter than this length.",
    )
    scrub_string_suffixes = ListField(
        child=CharField(),
        help_text=(
            'A list of suffixes to permit on strings. e.g. ["s"] '
            "for plural forms."
        ),
        default=[],
        initial=[],
    )
    allowlist = AllowlistSerializer(
        required=False, help_text="Allowlist options."
    )
    denylist = DenylistSerializer(
        required=False, help_text="Denylist options."
    )
    replace_patient_info_with = CharField(
        default=Defaults.REPLACE_PATIENT_INFO_WITH,
        initial=Defaults.REPLACE_PATIENT_INFO_WITH,
        help_text="Replace sensitive patient content with this.",
    )
    replace_third_party_info_with = CharField(
        default=Defaults.REPLACE_THIRD_PARTY_INFO_WITH,
        initial=Defaults.REPLACE_THIRD_PARTY_INFO_WITH,
        help_text=(
            "Replace sensitive third party (e.g. family members') "
            "content with this."
        ),
    )
    replace_nonspecific_info_with = CharField(
        default=Defaults.REPLACE_NONSPECIFIC_INFO_WITH,
        initial=Defaults.REPLACE_NONSPECIFIC_INFO_WITH,
        help_text="Replace any other sensitive content with this.",
    )
    # No default here: if the caller omits it, the key is absent from the
    # data and _get_nonspecific_scrubber() does not pass it on.
    replace_all_dates_with = CharField(
        required=False,
        help_text=(
            "When scrubbing all dates, replace with this text. If the "
            "replacement text includes supported datetime.directives "
            f"({DATE_BLURRING_DIRECTIVES_CSV}), the date is 'blurred' "
            "to include just those components."
        ),
    )
    scrub_all_numbers_of_n_digits = ListField(
        child=IntegerField(),
        help_text=(
            "Scrub all numbers with these lengths "
            "(e.g. [10] for all UK NHS numbers)."
        ),
        default=[],
        initial=[],
    )
    scrub_all_uk_postcodes = BooleanField(
        default=Defaults.SCRUB_ALL_UK_POSTCODES,
        initial=Defaults.SCRUB_ALL_UK_POSTCODES,
        help_text="Scrub all UK postcodes.",
    )
    scrub_all_dates = BooleanField(
        default=Defaults.SCRUB_ALL_DATES,
        initial=Defaults.SCRUB_ALL_DATES,
        help_text=(
            "Scrub all dates. Currently assumes the default locale "
            "for month names and ordinal suffixes."
        ),
    )
    scrub_all_email_addresses = BooleanField(
        default=Defaults.SCRUB_ALL_EMAIL_ADDRESSES,
        initial=Defaults.SCRUB_ALL_EMAIL_ADDRESSES,
        help_text="Scrub all e-mail addresses.",
    )
    alternatives = ListField(
        child=ListField(child=CharField()),
        help_text=(
            "List of alternative words to scrub. "
            'e.g.: [["Street", "St"], ["Road", "Rd"], ["Avenue", "Ave"]]'
        ),
        default=[[]],
        initial=[[]],
    )

    # -------------------------------------------------------------------------
    # Output-only fields
    # -------------------------------------------------------------------------
    # SerializerMethodField is read-only by default.

    anonymised = SerializerMethodField(
        help_text=(
            "The anonymised text, keyed on the unique IDs supplied by "
            "the caller in the 'text' parameter of the request."
        )
    )

    def get_anonymised(self, data: OrderedDict) -> Dict[str, str]:
        """
        Returns the anonymised text keyed on the unique IDs supplied by the
        caller.

        Args:
            data: the serializer data; must contain the ``text`` mapping and
                the settings read by :meth:`_get_personalized_scrubber`.

        Returns:
            A dictionary with the same keys as ``data["text"]``, each mapped
            to the scrubbed version of the corresponding value.
        """
        # Build the scrubber once and reuse it for every piece of text.
        scrubber = self._get_personalized_scrubber(data)

        return {
            key: scrubber.scrub(value)
            for key, value in data[ApiKeys.TEXT].items()
        }

    def _get_personalized_scrubber(
        self, data: OrderedDict
    ) -> PersonalizedScrubber:
        """
        Create a CRATE scrubber representing patient and third-party scrubbing
        settings.

        Args:
            data: the serializer data.

        Returns:
            A :class:`crate_anon.anonymise.scrub.PersonalizedScrubber`
            configured from ``data``, with any patient/third-party values
            present in ``data`` already added to it.
        """
        hasher = make_hasher(
            HashMethods.HMAC_MD5,
            settings.ANONYMISE_API[ApiSettingsKeys.HASH_KEY],
        )

        # These keys are passed straight through, under the same names, as
        # keyword arguments to PersonalizedScrubber.
        options = (
            ConfigKeys.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY,
            ConfigKeys.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY,
            ConfigKeys.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY,
            ConfigKeys.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY,
            ConfigKeys.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY,
            ConfigKeys.STRING_MAX_REGEX_ERRORS,
            ConfigKeys.MIN_STRING_LENGTH_FOR_ERRORS,
            ConfigKeys.MIN_STRING_LENGTH_TO_SCRUB_WITH,
            ConfigKeys.SCRUB_STRING_SUFFIXES,
        )

        kwargs = {k: v for (k, v) in data.items() if k in options}

        replacement_text_patient = data[ConfigKeys.REPLACE_PATIENT_INFO_WITH]
        replacement_text_third_party = data[
            ConfigKeys.REPLACE_THIRD_PARTY_INFO_WITH
        ]

        scrubber = PersonalizedScrubber(
            hasher,
            replacement_text_patient,
            replacement_text_third_party,
            nonspecific_scrubber=self._get_nonspecific_scrubber(data, hasher),
            allowlist=self._get_allowlist(data, hasher),
            alternatives=self._get_alternatives(data),
            **kwargs,
        )

        # "patient" and "third_party" are both optional (required=False), so
        # either may be missing from the data.
        for label in (ApiKeys.PATIENT, ApiKeys.THIRD_PARTY):
            if label in data:
                self._add_values_to_scrubber(scrubber, label, data)

        return scrubber

    @staticmethod
    def _get_alternatives(data: OrderedDict) -> Optional[List[List[str]]]:
        """
        Returns a list of list of equivalents; see
        :func:`crate_anon.anonymise.config.get_word_alternatives` and
        :class:`crate_anon.anonymise.scrub.PersonalizedScrubber`.

        Every word is converted to upper case. Returns ``None`` if ``data``
        has no alternatives entry.
        """
        # Keep the try body minimal: only the lookup can raise KeyError.
        try:
            alternatives = data[ApiKeys.ALTERNATIVES]
        except KeyError:
            return None

        return [[word.upper() for word in words] for words in alternatives]

    @staticmethod
    def _get_filenames(settings_key: str, aliases: List[str]) -> List[str]:
        """
        Translates caller-supplied filename aliases into filenames, using the
        alias-to-filename dictionary stored under ``settings_key`` in
        ``settings.ANONYMISE_API``.

        Args:
            settings_key: key within ``settings.ANONYMISE_API`` whose value is
                the alias-to-filename dictionary; if the key is absent, an
                empty dictionary is used.
            aliases: the aliases requested by the caller.

        Returns:
            The filenames whose alias was requested, in the order in which
            they appear in the settings dictionary. Requested aliases that are
            not configured are ignored.
        """
        filename_lookup = settings.ANONYMISE_API.get(settings_key, {})

        return [
            filename
            for alias, filename in filename_lookup.items()
            if alias in aliases
        ]

    @staticmethod
    def _get_allowlist(
        data: OrderedDict, hasher: GenericHasher
    ) -> Optional[WordList]:
        """
        Returns a :class:`crate_anon.anonymise.scrub.WordList` of words to be
        allowed through, or ``None`` if ``data`` has no allowlist entry.

        Args:
            data: the serializer data.
            hasher: hasher passed on to the ``WordList``.
        """
        try:
            allowlist_data = data[ApiKeys.ALLOWLIST]
        except KeyError:
            return None

        # Only these allowlist keys are forwarded, under the same name, to
        # WordList.
        options = (ApiKeys.WORDS,)

        kwargs = {k: v for (k, v) in allowlist_data.items() if k in options}
        kwargs.update(
            filenames=ScrubSerializer._get_filenames(
                ApiSettingsKeys.ALLOWLIST_FILENAMES,
                allowlist_data[ApiKeys.FILES],
            )
        )

        return WordList(hasher=hasher, **kwargs)

    def _get_nonspecific_scrubber(
        self, data: OrderedDict, hasher: GenericHasher
    ) -> NonspecificScrubber:
        """
        Returns a nonspecific scrubber for the current settings.

        Args:
            data: the serializer data.
            hasher: hasher passed on to the scrubber and to the denylist.
        """
        denylist = self._get_denylist(data, hasher)
        options = (
            # Also kwargs for NonspecificScrubber
            ConfigKeys.SCRUB_ALL_NUMBERS_OF_N_DIGITS,
            ConfigKeys.SCRUB_ALL_UK_POSTCODES,
            ConfigKeys.SCRUB_ALL_DATES,
            ConfigKeys.SCRUB_ALL_EMAIL_ADDRESSES,
            ConfigKeys.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY,
            ConfigKeys.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY,
        )
        kwargs = {k: v for (k, v) in data.items() if k in options}

        # TODO: extra_regexes (might be a security no-no)
        replacement_text = data[ConfigKeys.REPLACE_NONSPECIFIC_INFO_WITH]

        # "replace_all_dates_with" is optional and has no default. Only pass
        # it on (under the scrubber's own parameter name) if it was supplied.
        try:
            kwargs["replacement_text_all_dates"] = data[
                ConfigKeys.REPLACE_ALL_DATES_WITH
            ]
        except KeyError:
            pass

        return NonspecificScrubber(
            hasher,
            replacement_text=replacement_text,
            denylist=denylist,
            **kwargs,
        )

    @staticmethod
    def _get_denylist(
        data: OrderedDict, hasher: GenericHasher
    ) -> Optional[WordList]:
        """
        Returns a :class:`crate_anon.anonymise.scrub.WordList` of words to be
        scrubbed, or ``None`` if ``data`` has no denylist entry.

        Args:
            data: the serializer data.
            hasher: hasher passed on to the ``WordList``.
        """
        try:
            denylist_data = data[ApiKeys.DENYLIST]
        except KeyError:
            return None

        # Only these denylist keys are forwarded, under the same name, to
        # WordList.
        options = (ApiKeys.WORDS,)

        kwargs = {k: v for (k, v) in denylist_data.items() if k in options}
        kwargs["replacement_text"] = data[
            ConfigKeys.REPLACE_NONSPECIFIC_INFO_WITH
        ]
        kwargs.update(
            filenames=ScrubSerializer._get_filenames(
                ApiSettingsKeys.DENYLIST_FILENAMES,
                denylist_data[ApiKeys.FILES],
            )
        )

        # TODO: None of these are currently configurable
        # from crate_anon/anonymise/config.py
        # Do we care about them here?
        # suffixes
        # at_word_boundaries_only (for regex_method=True)
        # max_errors
        # regex_method: True
        return WordList(hasher=hasher, **kwargs)

    @staticmethod
    def _add_values_to_scrubber(
        scrubber: PersonalizedScrubber, label: str, data: OrderedDict
    ) -> None:
        """
        Adds values to be scrubbed to either the patient or the third-party
        component of a scrubber.

        Args:
            scrubber: the scrubber to add values to.
            label: the key within ``data`` holding the values; values are
                added as patient values if this equals ``ApiKeys.PATIENT``,
                and as non-patient values otherwise.
            data: the serializer data; ``data[label]`` maps each category
                name to a list of values.
        """
        # Maps each category name in the request to the scrub method used for
        # its values.
        method_lookup = {
            ApiKeys.DATES: ScrubMethod.DATE,
            ApiKeys.PHRASES: ScrubMethod.PHRASE,
            ApiKeys.NON_NUMERIC_PHRASES: ScrubMethod.PHRASE_UNLESS_NUMERIC,
            ApiKeys.WORDS: ScrubMethod.WORDS,
            ApiKeys.NUMBERS: ScrubMethod.NUMERIC,
            ApiKeys.CODES: ScrubMethod.CODE,
        }

        is_patient = label == ApiKeys.PATIENT

        for name, values in data[label].items():
            method = method_lookup[name]
            for value in values:
                scrubber.add_value(value, method, patient=is_patient)