Coverage for testing/providers.py: 44%
153 statements
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
1"""
2crate_anon/testing/providers.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Faker test data providers.**
28There may be some interest in a Faker Medical community provider if we felt it
29was worth the effort.
31https://github.com/joke2k/faker/issues/1142
33"""
35import datetime
36from typing import Any, List
38from cardinal_pythonlib.datetimefunc import pendulum_to_datetime
39from cardinal_pythonlib.nhs import generate_random_nhs_number
40from faker import Faker
41from faker.providers import BaseProvider
42from faker_file.base import StringValue
43from faker_file.providers.docx_file import DocxFileProvider
44from faker_file.providers.odt_file import OdtFileProvider
45from faker_file.providers.pdf_file import PdfFileProvider
46from faker_file.providers.pdf_file.generators.reportlab_generator import (
47 ReportlabPdfGenerator,
48)
49import pendulum
50from pendulum import DateTime as Pendulum
class ChoiceProvider(BaseProvider):
    """
    Faker provider that picks a single item from a list, optionally weighted.
    """

    def random_choice(self, choices: List, **kwargs) -> Any:
        """
        Return one randomly selected element of ``choices``.

        Keyword arguments (for example ``weights``) are forwarded unchanged
        to the generator's ``random.choices()``; only the first element of
        its result is returned.
        """
        picked = self.generator.random.choices(choices, **kwargs)
        return picked[0]
class DateFormatProvider(ChoiceProvider):
    """
    Supply a randomly chosen ``strftime`` date format string.
    """

    def date_format(self) -> str:
        """
        Return one of the known date formats, picked at random.
        """
        # "%Y-%m-%d" appears twice, so it is selected twice as often as each
        # of the other formats.
        known_formats = [
            "%d %b %Y",  # e.g. 24 Jul 2013
            "%d %B %Y",  # e.g. 24 July 2013
            "%Y-%m-%d",  # e.g. 2013-07-24
            "%Y-%m-%d",  # e.g. 2013-07-24 (duplicate entry)
            "%Y%m%d",  # e.g. 20130724
        ]
        return self.random_choice(known_formats)
class AgeProvider(BaseProvider):
    """
    Supply a random age in whole years.
    """

    def age(self) -> int:
        """
        Return a random integer between 0 and 120 (the bounds passed to
        the generator's ``pyint``).
        """
        youngest, oldest = 0, 120
        return self.generator.pyint(min_value=youngest, max_value=oldest)
class SexProvider(ChoiceProvider):
    """
    Supply a random sex code using a weighted distribution.
    """

    def sex(self) -> str:
        """
        Return "M", "F" or "X", weighted 49.8 : 49.8 : 0.4 respectively.
        """
        codes = ["M", "F", "X"]
        relative_weights = [49.8, 49.8, 0.4]
        return self.random_choice(codes, weights=relative_weights)
class ForenameProvider(BaseProvider):
    """
    Supply a forename appropriate to a person's sex.
    """

    def forename(self, sex: str) -> str:
        """
        Return a random forename for the given ``sex``.

        "M" yields a male first name and "F" a female first name. For any
        other value only the first character of a random first name is
        returned.
        """
        fake = self.generator

        if sex == "M":
            name = fake.first_name_male()
        elif sex == "F":
            name = fake.first_name_female()
        else:
            # Neither male nor female: keep just the initial.
            name = fake.first_name()[:1]

        return name
class FormattedDateOfBirthProvider(BaseProvider):
    """
    Supply a random date of birth rendered in a random date format.
    """

    def formatted_date_of_birth(self) -> str:
        """
        Return the generator's ``date_of_birth()`` formatted with a format
        string obtained from its ``date_format()``.
        """
        fake = self.generator
        # Draw the date first and the format second, so the random stream is
        # consumed in a fixed order.
        birth_date = fake.date_of_birth()
        chosen_format = fake.date_format()

        return birth_date.strftime(chosen_format)
# Latest permitted date of birth (09:00 on 1 January 2000): no one is born
# after this.
_max_birth_datetime = Pendulum(year=2000, month=1, day=1, hour=9)
# Module-level cursor read and advanced by
# IncrementingDateProvider.incrementing_date(); it starts at the birth
# cut-off above.
_datetime = _max_birth_datetime
class IncrementingDateProvider(BaseProvider):
    """
    Hand out datetimes that advance by one day on every call.

    The sequence is driven by the module-level ``_datetime`` cursor, so it is
    shared by every instance of this provider in the process.
    """

    def incrementing_date(self) -> datetime.datetime:
        """
        Return the current cursor value as a ``datetime.datetime`` and move
        the cursor forward by one day for the next caller.
        """
        global _datetime

        current = _datetime
        _datetime = current.add(days=1)
        return pendulum_to_datetime(current)
class FormattedIncrementingDateProvider(BaseProvider):
    """
    Returns an incrementing date in a random format.
    """

    def formatted_incrementing_date(self) -> str:
        """
        Return the generator's next ``incrementing_date()`` as a string.

        The format string comes from the generator's ``date_format()``, so
        the result is text produced by ``strftime``, not a datetime object.
        """
        date = self.generator.incrementing_date()
        # Named date_format rather than format, to avoid shadowing the
        # builtin.
        date_format = self.generator.date_format()

        return date.strftime(date_format)
class ConsistentDateOfBirthProvider(BaseProvider):
    """
    Returns a date of birth no greater than 1st January 2000. All patient notes
    are created after this date.

    Faker date_of_birth calculates from the current time so gives different
    results on different days. In our case we don't want the date of birth to
    be greater than the date stamp on the note.
    """

    def consistent_date_of_birth(self) -> datetime.date:
        """
        Return a random date between 1 January 1900 and the module-level
        ``_max_birth_datetime`` cut-off.

        The value comes straight from Faker's ``date_between_dates()``, which
        returns a ``datetime.date`` (not a ``datetime.datetime``).
        """
        return self.generator.date_between_dates(
            date_start=pendulum.date(1900, 1, 1),
            date_end=_max_birth_datetime,
        )
class RelationshipProvider(ChoiceProvider):
    """
    Supply a random description of how one person is related to another.
    """

    def relationship(self) -> str:
        """
        Return one relationship label, chosen uniformly at random.
        """
        # independent of sex for now
        relationships = [
            "child",
            "parent",
            "sibling",
            "spouse",
            "partner",
            "carer",
        ]
        return self.random_choice(relationships)
class AlcoholProvider(ChoiceProvider):
    """
    Supply a random free-text statement about alcohol consumption.
    """

    def alcohol(self) -> str:
        """
        Return one alcohol-related phrase, chosen at random.

        A random number of units (``pyint`` with ``max_value=100``) is drawn
        first and substituted into the phrases that mention a quantity.
        """
        units = self.generator.pyint(max_value=100)
        phrases = [
            f"Alcohol {units} u/w",
            f"EtOH = {units} u/w",
            f"Alcohol (units/week): {units}",
            f"alcohol {units} I.U./week",
            f"Was previously drinking {units} u/w",
            "teetotal",
            "Alcohol: no",
            "Abstinent from alcohol",
            f"Alcohol: presently less than {units} u/w",
        ]
        return self.random_choice(phrases)
class PatientNoteProvider(BaseProvider):
    """
    Generate a free-text clinical note containing patient details.

    Relies on other providers being registered on the same generator:
    ``sex``, ``forename``, ``consistent_date_of_birth``, ``nhs_number``,
    ``incrementing_date``, ``relationship``, ``date_format``,
    ``formatted_date_of_birth`` and ``alcohol``.
    """

    @staticmethod
    def _possessive_pronoun(sex: str) -> str:
        """
        Return the possessive pronoun for ``sex``.

        "M" gives "his" and "F" gives "her". "X" and any other value give
        "their", so that every sex value accepted by ``forename()`` also
        works here instead of raising ``KeyError``.
        """
        possessive_pronouns = {
            "M": "his",
            "F": "her",
            "X": "their",
        }

        return possessive_pronouns.get(sex, "their")

    def patient_note(
        self,
        forename: str = None,
        surname: str = None,
        sex: str = None,
        dob: datetime.datetime = None,
        nhs_number: int = None,
        patient_id: int = None,
        note_datetime: datetime.datetime = None,
        relation_name: str = None,
        relation_relationship: str = None,
        words_per_note: int = 1000,
        pad_paragraph: str = None,
    ) -> str:
        """
        Build a note mentioning the patient, a relative and some clinical
        findings, padded with filler words.

        Every argument left as ``None`` is replaced by a randomly generated
        value.

        Args:
            forename: patient's forename.
            surname: patient's surname.
            sex: patient's sex; selects the forename and possessive pronoun.
            dob: patient's date of birth (anything with ``strftime``).
            nhs_number: patient's NHS number.
            patient_id: patient's identifier.
            note_datetime: date of the note (anything with ``strftime``).
            relation_name: name of the accompanying person.
            relation_relationship: how that person is related to the patient.
            words_per_note: target number of words in the returned note.
            pad_paragraph: filler text from which padding words are taken.

        Returns:
            The note text followed by a space and the padding words. No
            padding is added if the note text alone already has
            ``words_per_note`` words or more.
        """
        # The order of these draws is fixed so that a seeded generator
        # produces a reproducible sequence.
        if sex is None:
            sex = self.generator.sex()

        if forename is None:
            forename = self.generator.forename(sex)

        if surname is None:
            surname = self.generator.last_name()

        if dob is None:
            dob = self.generator.consistent_date_of_birth()

        if nhs_number is None:
            nhs_number = self.generator.nhs_number()

        if patient_id is None:
            patient_id = self.generator.pyint(min_value=1, max_value=100000)

        if note_datetime is None:
            note_datetime = self.generator.incrementing_date()

        if relation_name is None:
            relation_name = self.generator.name()

        if relation_relationship is None:
            relation_relationship = self.generator.relationship()

        if pad_paragraph is None:
            pad_paragraph = self.generator.paragraph(
                # Integer division: nb_sentences is a count. This is way
                # more than we need.
                nb_sentences=words_per_note // 2,
            )

        possessive_pronoun = self._possessive_pronoun(sex)

        other_notes = [
            "Start aspirin 75mg od. Remains on Lipitor 40mg nocte",
            "For haloperidol 2mg po prn max qds",
            "Start amoxicillin 500 mg b.i.d. for 7 days",
            f"{possessive_pronoun.capitalize()} CRP is 10",
            (
                f"{possessive_pronoun.capitalize()} "
                "previous CRP was <13 mg/dl"
            ),
            "Sodium 140",
            "TSH 3.5; urea normal",
            "Height 1.82m, weight 75kg, BMI 22.6. BP 135/82",
            "MMSE 28/30. ACE-R 72, ACE-II 73, ACE 73",
            "ESR 16 (H) mm/h",
            (
                "WBC 9.2; neutrophils 4.3; lymphocytes 2.6; "
                "eosinophils 0.4; monocytes 1.2; basophils 0.6"
            ),
            (
                f"{forename} took venlafaxine 375 M/R od, "
                "and is due to start clozapine 75mg bd"
            ),
        ]

        # Pass the list by keyword: in newer Faker releases the first
        # positional parameter of word() is part_of_speech, not the word
        # list.
        other_note = self.generator.word(ext_word_list=other_notes)

        formatted_dob = dob.strftime(self.generator.date_format())
        note_date_formatted = note_datetime.strftime(
            self.generator.date_format()
        )
        another_date_formatted = self.generator.formatted_date_of_birth()
        alcohol = self.generator.alcohol()

        note_text = (
            f"I saw {forename} {surname} on "
            f"{note_date_formatted} "
            f"(DOB: {formatted_dob}, NHS {nhs_number}, "
            f"Patient id: {patient_id}), "
            f"accompanied by {possessive_pronoun} "
            f"{relation_relationship} {relation_name}. "
            f"{alcohol}. "
            f"Another date: {another_date_formatted}. "
            f"{other_note}."
        )

        # Clamp at zero: a negative slice bound would drop words from the
        # end of the padding rather than yielding no padding at all.
        num_pad_words = max(0, words_per_note - len(note_text.split()))
        pad_words = " ".join(pad_paragraph.split()[:num_pad_words])
        return f"{note_text} {pad_words}"
class PatientFileProvider(BaseProvider):
    """
    Generate a letter-style document (docx, odt or pdf) about a patient.

    The file itself is produced by the faker-file providers registered on the
    same generator.
    """

    def patient_filename(
        self,
        forename: str = None,
        surname: str = None,
        sex: str = None,
        dob: datetime.datetime = None,
        nhs_number: int = None,
        patient_id: int = None,
        pad_paragraph: str = None,
    ) -> str:
        """
        Create a patient document in a randomly chosen format and return the
        ``"filename"`` entry of the generated file's ``data``.

        Every argument left as ``None`` is replaced by a randomly generated
        value.
        """
        fake = self.generator

        if sex is None:
            sex = fake.sex()

        if forename is None:
            forename = fake.forename(sex)

        if surname is None:
            surname = fake.last_name()

        if dob is None:
            dob = fake.consistent_date_of_birth()

        if nhs_number is None:
            nhs_number = fake.nhs_number()

        # patient_id is resolved here but is not passed on to generate_file().
        if patient_id is None:
            patient_id = fake.pyint(min_value=1, max_value=100000)

        if pad_paragraph is None:
            pad_paragraph = fake.paragraph(nb_sentences=50)

        extension = fake.random_choice(["docx", "odt", "pdf"])
        generated = self.generate_file(
            extension,
            forename=forename,
            surname=surname,
            dob=dob,
            nhs_number=nhs_number,
            pad_paragraph=pad_paragraph,
        )

        return generated.data["filename"]

    def generate_file(
        self,
        file_ext: str,
        forename: str,
        surname: str,
        dob: datetime.datetime,
        nhs_number: int,
        pad_paragraph: str,
    ) -> StringValue:
        """
        Compose the letter text and write it out in the requested format.

        ``file_ext`` of "docx" or "odt" selects that format; any other value
        produces a PDF.
        """
        other_name = self.generator.name()
        formatted_dob = dob.strftime(self.generator.date_format())

        content = f"""
Dear {forename} {surname},\n
\n
NHS Number: {nhs_number}.\n
Date of Birth: {formatted_dob},\n
\n
{pad_paragraph}
\n
\n
Yours sincerely,
\n
\n
{other_name}

\n
"""

        writers = {
            "docx": self.generate_docx_file,
            "odt": self.generate_odt_file,
        }
        # Anything that is not docx/odt falls through to PDF.
        write = writers.get(file_ext, self.generate_pdf_file)
        return write(content)

    def generate_docx_file(self, content: str) -> StringValue:
        """
        Write ``content`` to a .docx file via the generator's ``docx_file``.
        """
        return self.generator.docx_file(content=content)

    def generate_odt_file(self, content: str) -> StringValue:
        """
        Write ``content`` to an .odt file via the generator's ``odt_file``.
        """
        return self.generator.odt_file(content=content)

    def generate_pdf_file(self, content: str) -> StringValue:
        """
        Write ``content`` to a PDF via the generator's ``pdf_file``, using
        the ReportLab PDF generator class.
        """
        return self.generator.pdf_file(
            content=content,
            pdf_generator_cls=ReportlabPdfGenerator,
        )
class NhsNumberProvider(BaseProvider):
    """
    Supply a random NHS number.
    """

    def nhs_number(self) -> int:
        """
        Return the result of cardinal_pythonlib's
        ``generate_random_nhs_number()``.

        Annotated as ``int`` to agree with the ``nhs_number: int`` parameters
        used elsewhere in this module.
        """
        return generate_random_nhs_number()
def register_all_providers(fake: Faker) -> None:
    """
    Add every provider defined in this module, plus the faker-file document
    providers, to ``fake``.
    """
    # Our own
    own_providers = (
        AgeProvider,
        AlcoholProvider,
        ChoiceProvider,
        ConsistentDateOfBirthProvider,
        DateFormatProvider,
        ForenameProvider,
        FormattedDateOfBirthProvider,
        FormattedIncrementingDateProvider,
        IncrementingDateProvider,
        NhsNumberProvider,
        PatientFileProvider,
        PatientNoteProvider,
        RelationshipProvider,
        SexProvider,
    )

    # Third party:
    # faker-file
    third_party_providers = (
        DocxFileProvider,
        OdtFileProvider,
        PdfFileProvider,
    )

    # Registration order matches the listing order above.
    for provider in own_providers + third_party_providers:
        fake.add_provider(provider)