Coverage for crateweb/anonymise_api/serializers.py: 98%
114 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1"""
2crate_anon/crateweb/anonymise_api/serializers.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26Django REST Framework serializer to anonymise the data.
28"""
30from collections import OrderedDict
31from typing import Dict, List, Optional
33from django.conf import settings
35from cardinal_pythonlib.hash import GenericHasher, HashMethods, make_hasher
36from rest_framework.serializers import (
37 BooleanField,
38 CharField,
39 DictField,
40 IntegerField,
41 ListField,
42 Serializer,
43 SerializerMethodField,
44)
46from crate_anon.anonymise.constants import (
47 AnonymiseConfigDefaults as Defaults,
48 AnonymiseConfigKeys as ConfigKeys,
49 DATE_BLURRING_DIRECTIVES_CSV,
50 ScrubMethod,
51)
52from crate_anon.anonymise.scrub import (
53 NonspecificScrubber,
54 PersonalizedScrubber,
55 WordList,
56)
57from crate_anon.crateweb.anonymise_api.constants import (
58 ApiKeys,
59 ApiSettingsKeys,
60)
def _specific_values_field(help_text: str) -> ListField:
    """
    Build one list-of-strings field for :class:`SpecificSerializer`.

    Each call creates a new ``CharField`` child and new empty-list
    ``default``/``initial`` values; only the help text varies between the
    fields declared on the serializer.
    """
    return ListField(
        child=CharField(),
        help_text=help_text,
        default=[],
        initial=[],
    )


# noinspection PyAbstractClass
class SpecificSerializer(Serializer):
    """
    Scrubbing information that belongs to one specific person or group of
    people (e.g. patient data, third-party data).

    Every field is a list of strings that defaults to an empty list. The
    field names are the categories of value to be scrubbed.
    """

    # Declaration order is kept as-is: dates, phrases, non-numeric phrases,
    # words, numbers, codes.
    dates = _specific_values_field("List of dates to be scrubbed.")
    phrases = _specific_values_field(
        "List of phrases (words appearing consecutively) to be scrubbed."
    )
    non_numeric_phrases = _specific_values_field(
        "List of phrases (words appearing consecutively) to be scrubbed. "
        "If a phrase is purely numeric it will be ignored."
    )
    words = _specific_values_field("List of words to be scrubbed.")
    numbers = _specific_values_field("List of numbers to be scrubbed.")
    codes = _specific_values_field(
        "List of codes (e.g. postcodes) to be scrubbed."
    )
# noinspection PyAbstractClass
class AllowlistSerializer(Serializer):
    """
    Allowlist section of a scrub request: words that must not be scrubbed.

    Both fields are lists of strings and default to an empty list.
    """

    # Words supplied literally by the caller.
    words = ListField(
        default=[],
        initial=[],
        help_text="Do not scrub these specific words.",
        child=CharField(),
    )
    # Aliases that the server maps to word-list files.
    files = ListField(
        default=[],
        initial=[],
        help_text=(
            "Do not scrub words from these filename aliases (defined on the "
            "server)."
        ),
        child=CharField(),
    )
# noinspection PyAbstractClass
class DenylistSerializer(Serializer):
    """
    Denylist section of a scrub request: words that must always be scrubbed.

    Both fields are lists of strings and default to an empty list.
    """

    # Words supplied literally by the caller.
    words = ListField(
        default=[],
        initial=[],
        help_text="Scrub these specific words.",
        child=CharField(),
    )
    # Aliases that the server maps to word-list files.
    files = ListField(
        default=[],
        initial=[],
        help_text=(
            "Scrub words from these filename aliases (defined on the server)."
        ),
        child=CharField(),
    )
# noinspection PyAbstractClass
class ScrubSerializer(Serializer):
    """
    Represents all scrubber settings, including data to be scrubbed and
    scrubber configuration settings.

    The request supplies the text plus optional patient/third-party values
    and scrubber options; the response additionally carries ``anonymised``,
    computed by :meth:`get_anonymised`.
    """

    # -------------------------------------------------------------------------
    # Input/Output fields
    # -------------------------------------------------------------------------
    # default implies required=False
    text = DictField(
        child=CharField(help_text="Text to be scrubbed."),
        help_text=(
            "The lines of text to be scrubbed, each keyed on a unique "
            "ID supplied by the caller."
        ),
    )
    patient = SpecificSerializer(
        required=False, help_text="Specific patient data to be scrubbed."
    )
    third_party = SpecificSerializer(
        required=False,
        help_text="Third party (e.g. family members') data to be scrubbed.",
    )
    anonymise_codes_at_word_boundaries_only = BooleanField(
        default=Defaults.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY,
        initial=Defaults.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY,
        help_text=(
            "Ensure the codes to be scrubbed begin and end with a word "
            "boundary."
        ),
    )
    anonymise_dates_at_word_boundaries_only = BooleanField(
        default=Defaults.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY,
        initial=Defaults.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY,
        # Fixed: this help text previously said "codes" (copy/paste slip).
        help_text=(
            "Ensure the dates to be scrubbed begin and end with a word "
            "boundary."
        ),
    )
    anonymise_numbers_at_word_boundaries_only = BooleanField(
        default=Defaults.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY,
        initial=Defaults.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY,
        help_text=(
            "Ensure the numbers to be scrubbed begin and end with a "
            "word boundary."
        ),
    )
    anonymise_numbers_at_numeric_boundaries_only = BooleanField(
        default=Defaults.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY,
        initial=Defaults.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY,
        help_text=(
            "Ensure the numbers to be scrubbed begin and end with a "
            "numeric boundary."
        ),
    )
    anonymise_strings_at_word_boundaries_only = BooleanField(
        default=Defaults.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY,
        initial=Defaults.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY,
        # Fixed: this help text previously said "numbers" (copy/paste slip).
        help_text=(
            "Ensure the strings to be scrubbed begin and end with a "
            "word boundary."
        ),
    )
    string_max_regex_errors = IntegerField(
        default=Defaults.STRING_MAX_REGEX_ERRORS,
        initial=Defaults.STRING_MAX_REGEX_ERRORS,
        help_text=(
            "The maximum number of typographical insertion/deletion/"
            "substitution errors to permit."
        ),
    )
    min_string_length_for_errors = IntegerField(
        default=Defaults.MIN_STRING_LENGTH_FOR_ERRORS,
        initial=Defaults.MIN_STRING_LENGTH_FOR_ERRORS,
        help_text=(
            "The minimum string length at which typographical "
            "errors will be permitted."
        ),
    )
    min_string_length_to_scrub_with = IntegerField(
        default=Defaults.MIN_STRING_LENGTH_TO_SCRUB_WITH,
        initial=Defaults.MIN_STRING_LENGTH_TO_SCRUB_WITH,
        help_text="Do not scrub strings shorter than this length.",
    )
    scrub_string_suffixes = ListField(
        child=CharField(),
        help_text=(
            'A list of suffixes to permit on strings. e.g. ["s"] '
            "for plural forms."
        ),
        default=[],
        initial=[],
    )
    allowlist = AllowlistSerializer(
        required=False, help_text="Allowlist options."
    )
    denylist = DenylistSerializer(
        required=False, help_text="Denylist options."
    )
    replace_patient_info_with = CharField(
        default=Defaults.REPLACE_PATIENT_INFO_WITH,
        initial=Defaults.REPLACE_PATIENT_INFO_WITH,
        help_text="Replace sensitive patient content with this.",
    )
    replace_third_party_info_with = CharField(
        default=Defaults.REPLACE_THIRD_PARTY_INFO_WITH,
        initial=Defaults.REPLACE_THIRD_PARTY_INFO_WITH,
        help_text=(
            "Replace sensitive third party (e.g. family members') "
            "content with this."
        ),
    )
    replace_nonspecific_info_with = CharField(
        default=Defaults.REPLACE_NONSPECIFIC_INFO_WITH,
        initial=Defaults.REPLACE_NONSPECIFIC_INFO_WITH,
        help_text="Replace any other sensitive content with this.",
    )
    replace_all_dates_with = CharField(
        required=False,
        help_text=(
            "When scrubbing all dates, replace with this text. If the "
            "replacement text includes supported datetime.directives "
            f"({DATE_BLURRING_DIRECTIVES_CSV}), the date is 'blurred' "
            "to include just those components."
        ),
    )
    scrub_all_numbers_of_n_digits = ListField(
        child=IntegerField(),
        help_text=(
            "Scrub all numbers with these lengths "
            "(e.g. [10] for all UK NHS numbers)."
        ),
        default=[],
        initial=[],
    )
    scrub_all_uk_postcodes = BooleanField(
        default=Defaults.SCRUB_ALL_UK_POSTCODES,
        initial=Defaults.SCRUB_ALL_UK_POSTCODES,
        help_text="Scrub all UK postcodes.",
    )
    scrub_all_dates = BooleanField(
        default=Defaults.SCRUB_ALL_DATES,
        initial=Defaults.SCRUB_ALL_DATES,
        help_text=(
            "Scrub all dates. Currently assumes the default locale "
            "for month names and ordinal suffixes."
        ),
    )
    scrub_all_email_addresses = BooleanField(
        default=Defaults.SCRUB_ALL_EMAIL_ADDRESSES,
        initial=Defaults.SCRUB_ALL_EMAIL_ADDRESSES,
        help_text="Scrub all e-mail addresses.",
    )
    alternatives = ListField(
        child=ListField(child=CharField()),
        help_text=(
            "List of alternative words to scrub. "
            'e.g.: [["Street", "St"], ["Road", "Rd"], ["Avenue", "Ave"]]'
        ),
        default=[[]],
        initial=[[]],
    )

    # -------------------------------------------------------------------------
    # Output-only fields
    # -------------------------------------------------------------------------
    # SerializerMethodField is read-only by default
    anonymised = SerializerMethodField(
        help_text=(
            "The anonymised text, keyed on the unique IDs supplied by "
            "the caller in the 'text' parameter of the request."
        )
    )

    def get_anonymised(self, data: OrderedDict) -> Dict[str, str]:
        """
        Returns the anonymised text keyed on the unique IDs supplied by the
        caller.

        One scrubber is built from ``data`` and applied to every value of
        ``data[ApiKeys.TEXT]``; the keys are passed through unchanged.
        """
        scrubber = self._get_personalized_scrubber(data)

        return {
            key: scrubber.scrub(value)
            for key, value in data[ApiKeys.TEXT].items()
        }

    def _get_personalized_scrubber(
        self, data: OrderedDict
    ) -> PersonalizedScrubber:
        """
        Create a CRATE scrubber representing patient and third-party scrubbing
        settings.

        The scrubber is constructed with the nonspecific scrubber, allowlist
        and alternatives derived from ``data``, and then loaded with any
        patient / third-party values present in ``data``.
        """
        hasher = make_hasher(
            HashMethods.HMAC_MD5,
            settings.ANONYMISE_API[ApiSettingsKeys.HASH_KEY],
        )

        # Request keys that are forwarded, under the same name, as keyword
        # arguments to PersonalizedScrubber.
        options = (
            ConfigKeys.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY,
            ConfigKeys.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY,
            ConfigKeys.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY,
            ConfigKeys.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY,
            ConfigKeys.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY,
            ConfigKeys.STRING_MAX_REGEX_ERRORS,
            ConfigKeys.MIN_STRING_LENGTH_FOR_ERRORS,
            ConfigKeys.MIN_STRING_LENGTH_TO_SCRUB_WITH,
            ConfigKeys.SCRUB_STRING_SUFFIXES,
        )

        kwargs = {k: v for (k, v) in data.items() if k in options}

        replacement_text_patient = data[ConfigKeys.REPLACE_PATIENT_INFO_WITH]
        replacement_text_third_party = data[
            ConfigKeys.REPLACE_THIRD_PARTY_INFO_WITH
        ]

        scrubber = PersonalizedScrubber(
            hasher,
            replacement_text_patient,
            replacement_text_third_party,
            nonspecific_scrubber=self._get_nonspecific_scrubber(data, hasher),
            allowlist=self._get_allowlist(data, hasher),
            alternatives=self._get_alternatives(data),
            **kwargs,
        )

        # "patient" and "third_party" are optional request sections.
        for label in (ApiKeys.PATIENT, ApiKeys.THIRD_PARTY):
            if label in data:
                self._add_values_to_scrubber(scrubber, label, data)

        return scrubber

    @staticmethod
    def _get_alternatives(data: OrderedDict) -> Optional[List[List[str]]]:
        """
        Returns a list of list of equivalents; see
        :func:`crate_anon.anonymise.config.get_word_alternatives` and
        :class:`crate_anon.anonymise.scrub.PersonalizedScrubber`.

        Every word is upper-cased. Returns ``None`` if ``data`` has no
        alternatives entry.
        """
        try:
            return [
                [word.upper() for word in words]
                for words in data[ApiKeys.ALTERNATIVES]
            ]
        except KeyError:
            return None

    @staticmethod
    def _get_filenames_for_aliases(
        aliases: List[str], settings_key: str
    ) -> List[str]:
        """
        Returns the server-side filenames whose alias appears in ``aliases``.

        The alias-to-filename mapping is read from
        ``settings.ANONYMISE_API[settings_key]``; a missing setting is treated
        as an empty mapping. Aliases with no entry in the mapping are ignored.
        Results follow the order of the mapping, not the order of ``aliases``.
        """
        filename_lookup = settings.ANONYMISE_API.get(settings_key, {})

        return [
            filename
            for label, filename in filename_lookup.items()
            if label in aliases
        ]

    @staticmethod
    def _get_allowlist(
        data: OrderedDict, hasher: GenericHasher
    ) -> Optional[WordList]:
        """
        Returns a :class:`crate_anon.anonymise.scrub.WordList` of words to be
        allowed through, or ``None`` if ``data`` has no allowlist section.
        """
        try:
            allowlist_data = data[ApiKeys.ALLOWLIST]
        except KeyError:
            return None

        # Only these allowlist keys are forwarded as-is to WordList.
        options = (ApiKeys.WORDS,)

        kwargs = {k: v for (k, v) in allowlist_data.items() if k in options}
        kwargs.update(
            filenames=ScrubSerializer._get_filenames_for_aliases(
                allowlist_data[ApiKeys.FILES],
                ApiSettingsKeys.ALLOWLIST_FILENAMES,
            )
        )

        return WordList(hasher=hasher, **kwargs)

    def _get_nonspecific_scrubber(
        self, data: OrderedDict, hasher: GenericHasher
    ) -> NonspecificScrubber:
        """
        Returns a nonspecific scrubber for the current settings.

        The replacement text for all dates is passed on only if the caller
        supplied it.
        """
        denylist = self._get_denylist(data, hasher)
        options = (
            # Also kwargs for NonspecificScrubber
            ConfigKeys.SCRUB_ALL_NUMBERS_OF_N_DIGITS,
            ConfigKeys.SCRUB_ALL_UK_POSTCODES,
            ConfigKeys.SCRUB_ALL_DATES,
            ConfigKeys.SCRUB_ALL_EMAIL_ADDRESSES,
            ConfigKeys.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY,
            ConfigKeys.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY,
        )
        # NOTE(review): ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY is a request
        # field but is not forwarded here -- confirm whether
        # NonspecificScrubber accepts it and whether the omission is intended.
        kwargs = {k: v for (k, v) in data.items() if k in options}

        # TODO: extra_regexes (might be a security no-no)
        replacement_text = data[ConfigKeys.REPLACE_NONSPECIFIC_INFO_WITH]

        try:
            kwargs["replacement_text_all_dates"] = data[
                ConfigKeys.REPLACE_ALL_DATES_WITH
            ]
        except KeyError:
            # Optional field not supplied: omit the keyword argument.
            pass

        return NonspecificScrubber(
            hasher,
            replacement_text=replacement_text,
            denylist=denylist,
            **kwargs,
        )

    @staticmethod
    def _get_denylist(
        data: OrderedDict, hasher: GenericHasher
    ) -> Optional[WordList]:
        """
        Returns a :class:`crate_anon.anonymise.scrub.WordList` of words to be
        scrubbed, or ``None`` if ``data`` has no denylist section.

        Denylisted words are replaced with the nonspecific replacement text.
        """
        try:
            denylist_data = data[ApiKeys.DENYLIST]
        except KeyError:
            return None

        # Only these denylist keys are forwarded as-is to WordList.
        options = (ApiKeys.WORDS,)

        kwargs = {k: v for (k, v) in denylist_data.items() if k in options}
        kwargs["replacement_text"] = data[
            ConfigKeys.REPLACE_NONSPECIFIC_INFO_WITH
        ]
        kwargs.update(
            filenames=ScrubSerializer._get_filenames_for_aliases(
                denylist_data[ApiKeys.FILES],
                ApiSettingsKeys.DENYLIST_FILENAMES,
            )
        )

        # TODO: None of these are currently configurable
        # from crate_anon/anonymise/config.py
        # Do we care about them here?
        # suffixes
        # at_word_boundaries_only (for regex_method=True)
        # max_errors
        # regex_method: True
        return WordList(hasher=hasher, **kwargs)

    @staticmethod
    def _add_values_to_scrubber(
        scrubber: PersonalizedScrubber, label: str, data: OrderedDict
    ) -> None:
        """
        Adds values to be scrubbed to either the patient or the third-party
        component of a scrubber.

        ``label`` selects the section of ``data`` to read; values are added as
        patient values when ``label`` is ``ApiKeys.PATIENT`` and as
        third-party values otherwise.

        Raises ``KeyError`` if the section contains a key with no
        corresponding scrub method.
        """
        # Request field name -> scrub method used for its values.
        method_lookup = {
            ApiKeys.DATES: ScrubMethod.DATE,
            ApiKeys.PHRASES: ScrubMethod.PHRASE,
            ApiKeys.NON_NUMERIC_PHRASES: ScrubMethod.PHRASE_UNLESS_NUMERIC,
            ApiKeys.WORDS: ScrubMethod.WORDS,
            ApiKeys.NUMBERS: ScrubMethod.NUMERIC,
            ApiKeys.CODES: ScrubMethod.CODE,
        }

        is_patient = label == ApiKeys.PATIENT

        for name, values in data[label].items():
            method = method_lookup[name]
            for value in values:
                scrubber.add_value(value, method, patient=is_patient)