Coverage for testing/providers.py: 44%

153 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2026-02-05 06:46 -0600

1""" 

2crate_anon/testing/providers.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Faker test data providers.** 

27 

28There may be some interest in a Faker Medical community provider if we felt it 

29was worth the effort. 

30 

31https://github.com/joke2k/faker/issues/1142 

32 

33""" 

34 

35import datetime 

36from typing import Any, List 

37 

38from cardinal_pythonlib.datetimefunc import pendulum_to_datetime 

39from cardinal_pythonlib.nhs import generate_random_nhs_number 

40from faker import Faker 

41from faker.providers import BaseProvider 

42from faker_file.base import StringValue 

43from faker_file.providers.docx_file import DocxFileProvider 

44from faker_file.providers.odt_file import OdtFileProvider 

45from faker_file.providers.pdf_file import PdfFileProvider 

46from faker_file.providers.pdf_file.generators.reportlab_generator import ( 

47 ReportlabPdfGenerator, 

48) 

49import pendulum 

50from pendulum import DateTime as Pendulum 

51 

52 

class ChoiceProvider(BaseProvider):
    """
    Provider that picks a single item from a list of choices.
    """

    def random_choice(self, choices: List, **kwargs) -> Any:
        """
        Return one value chosen at random from ``choices``.

        Keyword arguments (for example ``weights``) are forwarded unchanged
        to the generator's ``random.choices()``; the first element of its
        result is returned.
        """
        selection = self.generator.random.choices(choices, **kwargs)
        return selection[0]

61 

62 

class DateFormatProvider(ChoiceProvider):
    """
    Provider of randomly chosen ``strftime`` date format strings.
    """

    def date_format(self) -> str:
        """
        Return one of a fixed set of date format strings, chosen at random.
        """
        formats = [
            "%d %b %Y",  # e.g. 24 Jul 2013
            "%d %B %Y",  # e.g. 24 July 2013
            "%Y-%m-%d",  # e.g. 2013-07-24
            # Same ISO format listed a second time, so it is picked twice as
            # often as each of the others.
            "%Y-%m-%d",  # e.g. 2013-07-24
            "%Y%m%d",  # e.g. 20130724
        ]
        return self.random_choice(formats)

78 

79 

class AgeProvider(BaseProvider):
    """
    Provider of random ages.
    """

    def age(self) -> int:
        """
        Return a random integer age, asking the generator's ``pyint`` for a
        value with a minimum of 0 and a maximum of 120.
        """
        youngest, oldest = 0, 120
        return self.generator.pyint(min_value=youngest, max_value=oldest)

83 

84 

class SexProvider(ChoiceProvider):
    """
    Provider of a random sex code, weighted towards "M" and "F".
    """

    def sex(self) -> str:
        """
        Return "M", "F" or "X" using relative weights 49.8 / 49.8 / 0.4.
        """
        codes = ["M", "F", "X"]
        relative_weights = [49.8, 49.8, 0.4]
        return self.random_choice(codes, weights=relative_weights)

92 

93 

class ForenameProvider(BaseProvider):
    """
    Provider of a forename appropriate to a person's sex.
    """

    def forename(self, sex: str) -> str:
        """
        Return a forename for the given sex code.

        "M" gives a male first name and "F" a female first name. Any other
        value gives only the first character of a generic first name.
        """
        fake = self.generator

        if sex == "M":
            name = fake.first_name_male()
        elif sex == "F":
            name = fake.first_name_female()
        else:
            # Only the initial is kept for any other sex code.
            name = fake.first_name()[:1]

        return name

107 

108 

class FormattedDateOfBirthProvider(BaseProvider):
    """
    Provider of a random date of birth rendered in a random date format.
    """

    def formatted_date_of_birth(self) -> str:
        """
        Return the generator's ``date_of_birth()`` formatted with the
        generator's ``date_format()``.
        """
        fake = self.generator
        # The date is drawn before the format, matching the order in which
        # the two random values are consumed.
        birth_date = fake.date_of_birth()
        date_fmt = fake.date_format()
        return birth_date.strftime(date_fmt)

119 

120 

# No one is born after this. Used as the upper bound (date_end) when
# generating consistent dates of birth.
_max_birth_datetime = Pendulum(year=2000, month=1, day=1, hour=9)
# Module-level state: the next value returned by
# IncrementingDateProvider.incrementing_date(), which moves it on by one day
# each time it is called.
_datetime = _max_birth_datetime

124 

125 

class IncrementingDateProvider(BaseProvider):
    """
    Provider of datetimes that advance by one day on every call.

    The first value is 09:00 on 1st January 2000. The sequence is held in the
    module-level ``_datetime`` variable.
    """

    def incrementing_date(self) -> datetime.datetime:
        """
        Return the current date in the sequence and move the sequence on by
        one day.
        """
        global _datetime

        current = _datetime
        _datetime = current.add(days=1)

        return pendulum_to_datetime(current)

137 

138 

class FormattedIncrementingDateProvider(BaseProvider):
    """
    Returns an incrementing date in a random format.
    """

    def formatted_incrementing_date(self) -> str:
        """
        Return the generator's next ``incrementing_date()`` as a string,
        formatted with the generator's ``date_format()``.

        The return annotation is ``str`` because the value returned is the
        result of ``strftime``, not a datetime object.
        """
        date = self.generator.incrementing_date()
        # Named date_fmt rather than "format" so the builtin is not shadowed.
        date_fmt = self.generator.date_format()

        return date.strftime(date_fmt)

149 

150 

class ConsistentDateOfBirthProvider(BaseProvider):
    """
    Returns a date of birth no greater than 1st January 2000. All patient notes
    are created after this date.

    Faker date_of_birth calculates from the current time so gives different
    results on different days. In our case we don't want the date of birth to
    be greater than the date stamp on the note.
    """

    def consistent_date_of_birth(self) -> datetime.date:
        """
        Return a random date between 1st January 1900 and
        ``_max_birth_datetime``.

        The value comes from Faker's ``date_between_dates``, which yields a
        ``datetime.date`` rather than a ``datetime.datetime``; the return
        annotation reflects that.
        """
        return self.generator.date_between_dates(
            date_start=pendulum.date(1900, 1, 1),
            date_end=_max_birth_datetime,
        )

166 

167 

class RelationshipProvider(ChoiceProvider):
    """
    Provider of a random relationship between a patient and another person.
    """

    def relationship(self) -> str:
        """
        Return one relationship name chosen at random.
        """
        # independent of sex for now
        relationships = [
            "child",
            "parent",
            "sibling",
            "spouse",
            "partner",
            "carer",
        ]
        return self.random_choice(relationships)

181 

182 

class AlcoholProvider(ChoiceProvider):
    """
    Provider of a random free-text statement about alcohol consumption.
    """

    def alcohol(self) -> str:
        """
        Return one of several phrasings of alcohol use.

        A random number of units (``pyint`` with a maximum of 100) is drawn
        first and embedded in those phrasings that mention a quantity.
        """
        units = self.generator.pyint(max_value=100)

        phrasings = [
            f"Alcohol {units} u/w",
            f"EtOH = {units} u/w",
            f"Alcohol (units/week): {units}",
            f"alcohol {units} I.U./week",
            f"Was previously drinking {units} u/w",
            "teetotal",
            "Alcohol: no",
            "Abstinent from alcohol",
            f"Alcohol: presently less than {units} u/w",
        ]

        return self.random_choice(phrasings)

201 

202 

class PatientNoteProvider(BaseProvider):
    """
    Return the text of a fake clinical note about a patient.
    """

    @staticmethod
    def _possessive_pronoun(sex: str) -> str:
        """
        Return the possessive pronoun for a sex code.

        "M" gives "his", "F" gives "her" and "X" gives "their". Any other
        value also gives "their" instead of raising ``KeyError``, in line
        with ForenameProvider, which accepts any sex code.
        """
        possessive_pronouns = {
            "M": "his",
            "F": "her",
            "X": "their",
        }

        return possessive_pronouns.get(sex, "their")

    def patient_note(
        self,
        forename: str = None,
        surname: str = None,
        sex: str = None,
        dob: datetime.datetime = None,
        nhs_number: int = None,
        patient_id: int = None,
        note_datetime: datetime.datetime = None,
        relation_name: str = None,
        relation_relationship: str = None,
        words_per_note: int = 1000,
        pad_paragraph: str = None,
    ) -> str:
        """
        Build the text of a patient note.

        Every argument left as ``None`` is filled in from the generator.

        Args:
            forename: patient's forename
            surname: patient's surname
            sex: sex code, used for the forename and the possessive pronoun
            dob: date of birth; formatted with ``strftime``
            nhs_number: NHS number quoted in the note
            patient_id: patient ID quoted in the note
            note_datetime: date of the note; formatted with ``strftime``
            relation_name: name of the accompanying person
            relation_relationship: that person's relationship to the patient
            words_per_note: target length of the note in words; padding is
                added to reach it
            pad_paragraph: text from which padding words are taken

        Returns:
            The fixed opening text, a space, then as many words from
            ``pad_paragraph`` as are needed to reach ``words_per_note``. No
            padding is added if the opening text is already that long.
        """

        # NB the order of the generator calls below determines the sequence
        # of random values consumed, so it is kept stable.
        if sex is None:
            sex = self.generator.sex()

        if forename is None:
            forename = self.generator.forename(sex)

        if surname is None:
            surname = self.generator.last_name()

        if dob is None:
            dob = self.generator.consistent_date_of_birth()

        if nhs_number is None:
            nhs_number = self.generator.nhs_number()

        if patient_id is None:
            patient_id = self.generator.pyint(min_value=1, max_value=100000)

        if note_datetime is None:
            note_datetime = self.generator.incrementing_date()

        if relation_name is None:
            relation_name = self.generator.name()

        if relation_relationship is None:
            relation_relationship = self.generator.relationship()

        if pad_paragraph is None:
            pad_paragraph = self.generator.paragraph(
                nb_sentences=words_per_note / 2,  # way more than we need
            )

        possessive_pronoun = self._possessive_pronoun(sex)
        capitalized_pronoun = possessive_pronoun.capitalize()

        other_notes = [
            "Start aspirin 75mg od. Remains on Lipitor 40mg nocte",
            "For haloperidol 2mg po prn max qds",
            "Start amoxicillin 500 mg b.i.d. for 7 days",
            f"{capitalized_pronoun} CRP is 10",
            f"{capitalized_pronoun} previous CRP was <13 mg/dl",
            "Sodium 140",
            "TSH 3.5; urea normal",
            "Height 1.82m, weight 75kg, BMI 22.6. BP 135/82",
            "MMSE 28/30. ACE-R 72, ACE-II 73, ACE 73",
            "ESR 16 (H) mm/h",
            (
                "WBC 9.2; neutrophils 4.3; lymphocytes 2.6; "
                "eosinophils 0.4; monocytes 1.2; basophils 0.6"
            ),
            (
                f"{forename} took venlafaxine 375 M/R od, "
                "and is due to start clozapine 75mg bd"
            ),
        ]

        # Pass the list by keyword: in recent Faker releases the first
        # positional parameter of word() is part_of_speech, not the word
        # list.
        other_note = self.generator.word(ext_word_list=other_notes)

        formatted_dob = dob.strftime(self.generator.date_format())
        note_date_formatted = note_datetime.strftime(
            self.generator.date_format()
        )
        another_date_formatted = self.generator.formatted_date_of_birth()
        alcohol = self.generator.alcohol()

        note_text = (
            f"I saw {forename} {surname} on "
            f"{note_date_formatted} "
            f"(DOB: {formatted_dob}, NHS {nhs_number}, "
            f"Patient id: {patient_id}), "
            f"accompanied by {possessive_pronoun} "
            f"{relation_relationship} {relation_name}. "
            f"{alcohol}. "
            f"Another date: {another_date_formatted}. "
            f"{other_note}."
        )

        # Clamp at zero: a negative slice bound would mean "all but the last
        # N words" and so add padding when none is wanted.
        num_pad_words = max(0, words_per_note - len(note_text.split()))
        pad_words = " ".join(pad_paragraph.split()[:num_pad_words])
        return f"{note_text} {pad_words}"

311 

312 

class PatientFileProvider(BaseProvider):
    """
    Provider that writes a fake letter to a patient as a docx, odt or pdf
    file.
    """

    def patient_filename(
        self,
        forename: str = None,
        surname: str = None,
        sex: str = None,
        dob: datetime.datetime = None,
        nhs_number: int = None,
        patient_id: int = None,
        pad_paragraph: str = None,
    ) -> str:
        """
        Generate a letter file of a randomly chosen type and return the
        ``filename`` entry of the generated file object's ``data``.

        Every argument left as ``None`` is filled in from the generator.
        """
        fake = self.generator

        if sex is None:
            sex = fake.sex()
        if forename is None:
            forename = fake.forename(sex)
        if surname is None:
            surname = fake.last_name()
        if dob is None:
            dob = fake.consistent_date_of_birth()
        if nhs_number is None:
            nhs_number = fake.nhs_number()
        if patient_id is None:
            # NOTE: patient_id is resolved here but is not passed on to
            # generate_file().
            patient_id = fake.pyint(min_value=1, max_value=100000)
        if pad_paragraph is None:
            pad_paragraph = fake.paragraph(nb_sentences=50)

        extension = fake.random_choice(["docx", "odt", "pdf"])
        generated = self.generate_file(
            extension,
            forename=forename,
            surname=surname,
            dob=dob,
            nhs_number=nhs_number,
            pad_paragraph=pad_paragraph,
        )

        return generated.data["filename"]

    def generate_file(
        self,
        file_ext: str,
        forename: str,
        surname: str,
        dob: datetime.datetime,
        nhs_number: int,
        pad_paragraph: str,
    ) -> StringValue:
        """
        Compose the letter text and write it to a file of the given type.

        ``file_ext`` of "docx" or "odt" selects that format; any other value
        produces a PDF.
        """
        # The signatory is drawn before the date format, matching the order
        # in which the random values are consumed.
        signatory = self.generator.name()
        formatted_dob = dob.strftime(self.generator.date_format())

        content = f"""
Dear {forename} {surname},\n
\n
NHS Number: {nhs_number}.\n
Date of Birth: {formatted_dob},\n
\n
{pad_paragraph}
\n
\n
Yours sincerely,
\n
\n
{signatory}

\n
"""

        writers = {
            "docx": self.generate_docx_file,
            "odt": self.generate_odt_file,
        }
        # Anything that is not docx or odt falls through to PDF.
        write = writers.get(file_ext, self.generate_pdf_file)

        return write(content)

    def generate_docx_file(self, content: str) -> StringValue:
        """
        Write ``content`` to a docx file via the generator's ``docx_file``.
        """
        return self.generator.docx_file(content=content)

    def generate_odt_file(self, content: str) -> StringValue:
        """
        Write ``content`` to an odt file via the generator's ``odt_file``.
        """
        return self.generator.odt_file(content=content)

    def generate_pdf_file(self, content: str) -> StringValue:
        """
        Write ``content`` to a PDF file via the generator's ``pdf_file``,
        using the Reportlab PDF generator class.
        """
        return self.generator.pdf_file(
            content=content,
            pdf_generator_cls=ReportlabPdfGenerator,
        )

406 

407 

class NhsNumberProvider(BaseProvider):
    """
    Provider of random NHS numbers.
    """

    def nhs_number(self) -> int:
        """
        Return the result of cardinal_pythonlib's
        ``generate_random_nhs_number()``.

        Annotated as ``int`` to agree with the ``nhs_number: int`` parameters
        used elsewhere in this module.
        """
        return generate_random_nhs_number()

411 

412 

def register_all_providers(fake: Faker) -> None:
    """
    Add every provider defined in this module, plus the faker-file document
    providers, to the given Faker instance.

    Providers are added in the order listed below.
    """
    providers = (
        # Our own
        AgeProvider,
        AlcoholProvider,
        ChoiceProvider,
        ConsistentDateOfBirthProvider,
        DateFormatProvider,
        ForenameProvider,
        FormattedDateOfBirthProvider,
        FormattedIncrementingDateProvider,
        IncrementingDateProvider,
        NhsNumberProvider,
        PatientFileProvider,
        PatientNoteProvider,
        RelationshipProvider,
        SexProvider,
        # Third party:
        # faker-file
        DocxFileProvider,
        OdtFileProvider,
        PdfFileProvider,
    )

    for provider in providers:
        fake.add_provider(provider)