Coverage for preprocess/tests/text_extractor_tests.py: 100%

189 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2026-02-06 03:32 -0600

1""" 

2crate_anon/preprocess/tests/text_extractor_tests.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Test text extraction from documents.** 

27 

28""" 

29 

30import logging 

31import os 

32from pathlib import Path 

33import shutil 

34import tempfile 

35from unittest import mock 

36 

37from faker.providers import BaseProvider 

38from faker_file.storages.filesystem import FileSystemStorage 

39from sqlalchemy.exc import IntegrityError, MultipleResultsFound, NoResultFound 

40 

41from crate_anon.preprocess.constants import ( 

42 CRATE_COL_PK, 

43 CRATE_IDX_PREFIX, 

44 CRATE_TABLE_EXTRACTED_TEXT, 

45) 

46from crate_anon.preprocess.systmone_ddgen import S1GenericCol, SystmOneContext 

47from crate_anon.preprocess.text_extractor import SystmOneTextExtractor 

48from crate_anon.testing.classes import CrateTestCase 

49 

50 

class RowIdentifierProvider(BaseProvider):
    """
    Faker provider supplying fake row identifiers for the tests below.
    """

    def row_identifier(self) -> int:
        """
        Return an integer obtained from the generator's ``pyint``, called
        with the bounds 1,000,000,000 and 9,000,000,000.
        """
        lowest = 1_000_000_000
        highest = 9_000_000_000

        return self.generator.pyint(lowest, highest)

54 

55 

class DocumentUidProvider(BaseProvider):
    """
    Faker provider supplying fake document UIDs as integers.
    """

    def document_uid(self) -> int:
        """
        Return an integer obtained from the generator's ``pyint``, called
        with the bounds 0x1000000000000000 and 0xFFFFFFFFFFFFFFFF.

        Presumably both bounds are honoured, so the value always needs
        sixteen hexadecimal digits -- confirm against the Faker docs.
        """
        lowest = 0x1000_0000_0000_0000
        highest = 0xFFFF_FFFF_FFFF_FFFF

        return self.generator.pyint(lowest, highest)

61 

62 

class SubfolderProvider(BaseProvider):
    """
    Faker provider supplying fake subfolder numbers.
    """

    def subfolder(self) -> int:
        """
        Return an integer obtained from the generator's ``pyint``, called
        with the bounds 1 and 4.
        """
        lowest, highest = 1, 4

        return self.generator.pyint(lowest, highest)

66 

67 

class IndexProvider(BaseProvider):
    """
    Faker provider supplying fake single-digit index numbers.
    """

    def index(self) -> int:
        """
        Return an integer obtained from the generator's ``pyint``, called
        with the bounds 0 and 9.
        """
        lowest, highest = 0, 9

        return self.generator.pyint(lowest, highest)

71 

72 

class PatientIdProvider(BaseProvider):
    """
    Faker provider supplying fake patient ID numbers.
    """

    def patient_id(self) -> int:
        """
        Return an integer obtained from the generator's ``pyint``, called
        with the bounds 1 and 10,000,000.
        """
        lowest = 1
        highest = 10_000_000

        return self.generator.pyint(lowest, highest)

76 

77 

class SystmOneTextExtractorTests(CrateTestCase):
    """
    Tests for :class:`SystmOneTextExtractor`.

    The database side (engine, connection, result, metadata, tables) is
    replaced by mocks built in :meth:`setUp`; document files are written to
    a temporary directory that is removed again in :meth:`tearDown`.
    """

    def setUp(self) -> None:
        """
        Build the mocks, the temporary document directory and the extractor
        under test, then register the custom Faker providers.
        """
        super().setUp()

        # Mock wiring: engine.connect() returns a context manager whose
        # __enter__ yields the connection; connection.execute() returns the
        # result; result.one is self.mock_one.
        self.mock_one = mock.Mock()
        self.mock_result = mock.Mock(one=self.mock_one)
        self.mock_execute = mock.Mock(return_value=self.mock_result)
        self.mock_connection = mock.Mock(execute=self.mock_execute)
        self.mock_connect_cm = mock.Mock()
        self.mock_connect_cm.__enter__ = mock.Mock(
            return_value=self.mock_connection
        )
        # NOTE(review): a bare Mock returns another (truthy) Mock, so this
        # __exit__ would suppress any exception raised inside a "with" block
        # using this context manager. Confirm that is intended; otherwise
        # use mock.Mock(return_value=False).
        self.mock_connect_cm.__exit__ = mock.Mock()
        self.mock_connect = mock.Mock(return_value=self.mock_connect_cm)
        self.mock_engine = mock.Mock(connect=self.mock_connect)

        # Metadata exposing the source documents table and the extracted
        # text table, whose drop() is tracked separately.
        self.mock_s1_documents_table = mock.Mock()
        self.mock_drop = mock.Mock()
        self.mock_extracted_text_table = mock.Mock(drop=self.mock_drop)
        self.mock_metadata = mock.Mock(
            tables={
                "S1_Documents": self.mock_s1_documents_table,
                CRATE_TABLE_EXTRACTED_TEXT: self.mock_extracted_text_table,
            }
        )
        self.context = SystmOneContext["cpft_dw"]

        # Temporary directory holding the fake documents; removed in
        # tearDown().
        self.root_directory = tempfile.mkdtemp()
        self.storage = FileSystemStorage(
            root_path=self.root_directory,
            rel_path="tmp",
        )

        self.mock_table_class = mock.Mock()

        self.extractor = SystmOneTextExtractor(
            self.mock_engine,
            self.mock_metadata,
            self.context,
            self.root_directory,
        )
        self.mock_select_object = mock.Mock()
        self.mock_select_fn = mock.Mock(return_value=self.mock_select_object)

        self.mock_document_to_text = mock.Mock()
        self.mock_last_extracted = self.fake.past_datetime()

        # insert(...).values(...) and update(...).values(...) stand-ins, so
        # that tests can inspect the keyword arguments passed to values().
        self.mock_insert_values = mock.Mock()
        self.mock_insert_result = mock.Mock(values=self.mock_insert_values)
        self.mock_insert = mock.Mock(return_value=self.mock_insert_result)

        self.mock_update_values = mock.Mock()
        self.mock_update_result = mock.Mock(values=self.mock_update_values)
        self.mock_update = mock.Mock(return_value=self.mock_update_result)

        self.register_providers()

    def register_providers(self) -> None:
        """
        Add the custom Faker providers defined in this module to
        ``self.fake``.
        """
        self.fake.add_provider(RowIdentifierProvider)
        self.fake.add_provider(DocumentUidProvider)
        self.fake.add_provider(SubfolderProvider)
        self.fake.add_provider(IndexProvider)
        self.fake.add_provider(PatientIdProvider)

    def tearDown(self) -> None:
        """
        Remove the temporary document directory, then run the parent
        class's teardown (mirroring the ``super().setUp()`` call in
        :meth:`setUp`).
        """
        try:
            shutil.rmtree(self.root_directory)
        finally:
            # Always chain up, even if removing the directory failed.
            super().tearDown()

    def generate_filename(
        self,
        extension: str,
        row_identifier: int = None,
        document_uid: int = None,
        subfolder: int = None,
        index: int = None,
    ) -> str:
        """
        Build a document filename of the form
        ``<row_identifier>_<document_uid in hex>_<subfolder>_<index>.<ext>``.

        Args:
            extension: file extension, without the leading dot
            row_identifier: row identifier; a fake one is generated if None
            document_uid: document UID (written in lower-case hex); a fake
                one is generated if None
            subfolder: subfolder number; a fake one is generated if None
            index: index number; a fake one is generated if None

        Returns:
            the filename (no directory component)
        """
        if row_identifier is None:
            row_identifier = self.fake.row_identifier()

        if document_uid is None:
            document_uid = self.fake.document_uid()

        if subfolder is None:
            subfolder = self.fake.subfolder()

        if index is None:
            index = self.fake.index()

        return f"{row_identifier}_{document_uid:x}_{subfolder}_{index}.{extension}"  # noqa: E501

    def test_invalid_filename_skipped(self) -> None:
        """
        A file whose name does not follow the generated filename pattern is
        reported at INFO level as completely ignored.
        """
        filename = os.path.join(self.root_directory, "test.txt")
        content = self.fake.paragraph(nb_sentences=10)
        self.storage.write_text(filename, content)

        with self.assertLogs(level=logging.INFO) as logging_cm:
            self.extractor.extract_all()

        self.assert_logged(
            "crate_anon.preprocess.text_extractor",
            logging.INFO,
            f"Completely ignoring {filename}",
            logging_cm,
        )

    def test_unknown_row_identifier_skipped(self) -> None:
        """
        When the row lookup raises ``NoResultFound``, an error naming the
        row identifier is logged.
        """
        content = self.fake.paragraph(nb_sentences=10)
        row_identifier = self.fake.row_identifier()
        filename = os.path.join(
            self.root_directory,
            self.generate_filename("txt", row_identifier=row_identifier),
        )
        self.storage.write_text(filename, content)

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            select=self.mock_select_fn,
        ):
            self.mock_one.side_effect = NoResultFound()
            with self.assertLogs(level=logging.ERROR) as logging_cm:
                self.extractor.extract_all()

        self.assert_logged(
            "crate_anon.preprocess.text_extractor",
            logging.ERROR,
            f"... no row found for RowIdentifier: {row_identifier}",
            logging_cm,
        )

    def test_multiple_results_skipped(self) -> None:
        """
        When the row lookup raises ``MultipleResultsFound``, an error naming
        the row identifier is logged.
        """
        # Not seen in the real world but theoretically possible.
        content = self.fake.paragraph(nb_sentences=10)
        row_identifier = self.fake.row_identifier()
        filename = os.path.join(
            self.root_directory,
            self.generate_filename("txt", row_identifier=row_identifier),
        )
        self.storage.write_text(filename, content)

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            select=self.mock_select_fn,
        ):
            self.mock_one.side_effect = MultipleResultsFound()
            with self.assertLogs(level=logging.ERROR) as logging_cm:
                self.extractor.extract_all()

        self.assert_logged(
            "crate_anon.preprocess.text_extractor",
            logging.ERROR,
            (
                "... multiple rows found with RowIdentifier: "
                f"{row_identifier}"
            ),
            logging_cm,
        )

    def test_row_inserted_into_table(self) -> None:
        """
        For a valid text file with a matching row, ``insert(...).values()``
        is called exactly once with the expected column values.
        """
        content = self.fake.paragraph(nb_sentences=10)
        row_identifier = self.fake.row_identifier()
        document_uid = self.fake.document_uid()
        filename = os.path.join(
            self.root_directory,
            self.generate_filename(
                "txt", row_identifier=row_identifier, document_uid=document_uid
            ),
        )
        self.storage.write_text(filename, content)

        patient_id = self.fake.patient_id()
        self.mock_one.return_value = mock.Mock(
            _mapping={
                S1GenericCol.PATIENT_ID: patient_id,
            }
        )
        self.mock_document_to_text.return_value = content

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            select=self.mock_select_fn,
            document_to_text=self.mock_document_to_text,
            Pendulum=mock.Mock(
                now=mock.Mock(return_value=self.mock_last_extracted)
            ),
            insert=self.mock_insert,
        ):
            self.extractor.extract_all()

        values = dict(
            RowIdentifier=row_identifier,
            DocumentUID=f"{document_uid:x}",
            IDPatient=patient_id,
            # Expected path is the last two components of the full path.
            crate_file_path=str(Path(*Path(filename).parts[-2:])),
            crate_text=content,
            crate_text_last_extracted=self.mock_last_extracted,
        )

        self.mock_insert_values.assert_called_once_with(**values)

    def test_null_text_inserted_when_extension_not_supported(self) -> None:
        """
        For a ".tex" file, the unsupported extension is logged at INFO level
        and the inserted ``crate_text`` value is None.
        """
        content = self.fake.paragraph(nb_sentences=10)
        filename = os.path.join(
            self.root_directory,
            self.generate_filename("tex"),
        )
        self.storage.write_text(filename, content)

        patient_id = self.fake.patient_id()
        self.mock_one.return_value = mock.Mock(
            _mapping={
                S1GenericCol.PATIENT_ID: patient_id,
            }
        )
        self.mock_document_to_text.return_value = content

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            select=self.mock_select_fn,
            document_to_text=self.mock_document_to_text,
            Pendulum=mock.Mock(
                now=mock.Mock(return_value=self.mock_last_extracted)
            ),
            insert=self.mock_insert,
        ):
            with self.assertLogs(level=logging.INFO) as logging_cm:
                self.extractor.extract_all()

        self.assert_logged(
            "crate_anon.preprocess.text_extractor",
            logging.INFO,
            "... unsupported file extension '.tex'.",
            logging_cm,
        )

        # Only the keyword arguments of the values() call matter here.
        _args, kwargs = self.mock_insert_values.call_args
        self.assertIsNone(kwargs["crate_text"])

    def test_row_updated_in_table(self) -> None:
        """
        When the second ``execute()`` call raises ``IntegrityError``,
        ``update(...).values()`` is called exactly once with the expected
        column values.
        """
        content = self.fake.paragraph(nb_sentences=10)
        row_identifier = self.fake.row_identifier()
        document_uid = self.fake.document_uid()
        filename = os.path.join(
            self.root_directory,
            self.generate_filename(
                "txt", row_identifier=row_identifier, document_uid=document_uid
            ),
        )
        self.storage.write_text(filename, content)

        patient_id = self.fake.patient_id()
        self.mock_one.return_value = mock.Mock(
            _mapping={
                S1GenericCol.PATIENT_ID: patient_id,
            }
        )
        self.mock_document_to_text.return_value = content

        # First execute() call succeeds; the second raises IntegrityError.
        self.mock_execute.side_effect = [
            self.mock_result,
            IntegrityError(None, None, None),
        ]

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            select=self.mock_select_fn,
            document_to_text=self.mock_document_to_text,
            Pendulum=mock.Mock(
                now=mock.Mock(return_value=self.mock_last_extracted)
            ),
            insert=self.mock_insert,
            update=self.mock_update,
        ):
            self.extractor.extract_all()

        values = dict(
            RowIdentifier=row_identifier,
            DocumentUID=f"{document_uid:x}",
            IDPatient=patient_id,
            # Expected path is the last two components of the full path.
            crate_file_path=str(Path(*Path(filename).parts[-2:])),
            crate_text=content,
            crate_text_last_extracted=self.mock_last_extracted,
        )

        self.mock_update_values.assert_called_once_with(**values)

    def test_exception_from_text_conversion_handled(self) -> None:
        """
        An exception raised by ``document_to_text`` is logged at ERROR level
        together with its message.
        """
        content = self.fake.paragraph(nb_sentences=10)
        row_identifier = self.fake.row_identifier()
        document_uid = self.fake.document_uid()
        filename = os.path.join(
            self.root_directory,
            self.generate_filename(
                "txt", row_identifier=row_identifier, document_uid=document_uid
            ),
        )
        self.storage.write_text(filename, content)

        patient_id = self.fake.patient_id()
        self.mock_one.return_value = mock.Mock(
            _mapping={
                S1GenericCol.PATIENT_ID: patient_id,
            }
        )
        self.mock_document_to_text.side_effect = Exception(
            "Something bad happened"
        )

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            select=self.mock_select_fn,
            document_to_text=self.mock_document_to_text,
        ):
            with self.assertLogs(level=logging.ERROR) as logging_cm:
                self.extractor.extract_all()

        self.assert_logged(
            "crate_anon.preprocess.text_extractor",
            logging.ERROR,
            (
                "... caught exception from document_to_text: "
                "Something bad happened"
            ),
            logging_cm,
        )

    def test_table_dropped(self) -> None:
        """
        With ``drop_table`` set, the extracted text table's ``drop()`` is
        called exactly once with ``checkfirst=True``.
        """
        self.extractor.drop_table = True
        self.mock_table_class.return_value = self.mock_extracted_text_table
        self.mock_extracted_text_table.columns = []

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            Table=self.mock_table_class,
        ):
            self.extractor.extract_all()
            self.mock_drop.assert_called_once_with(checkfirst=True)

    def test_columns_indexed(self) -> None:
        """
        With ``drop_table`` set, a non-unique index is requested for each of
        the primary key, row ID and patient ID columns, and each resulting
        ``IndexCreationInfo`` is passed to ``add_indexes``.
        """
        self.extractor.drop_table = True
        self.mock_table_class.return_value = self.mock_extracted_text_table

        # "name" cannot be passed to the Mock constructor (it would name the
        # mock itself), so it is assigned as an attribute afterwards.
        mock_pk_column = mock.Mock()
        mock_pk_column.name = CRATE_COL_PK
        mock_row_id_column = mock.Mock()
        mock_row_id_column.name = S1GenericCol.ROW_ID
        mock_patient_id_column = mock.Mock()
        mock_patient_id_column.name = S1GenericCol.PATIENT_ID

        self.mock_extracted_text_table.columns = [
            mock_pk_column,
            mock_row_id_column,
            mock_patient_id_column,
        ]

        mock_pk_info = mock.Mock()
        mock_row_id_info = mock.Mock()
        mock_patient_id_info = mock.Mock()
        mock_add_indexes = mock.Mock()

        # Successive IndexCreationInfo(...) calls return these, in order.
        mock_index_creation_info = mock.Mock(
            side_effect=[
                mock_pk_info,
                mock_row_id_info,
                mock_patient_id_info,
            ]
        )

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            Table=self.mock_table_class,
            add_indexes=mock_add_indexes,
            IndexCreationInfo=mock_index_creation_info,
        ):
            self.extractor.extract_all()
            mock_add_indexes.assert_any_call(
                self.mock_engine,
                self.mock_extracted_text_table,
                [mock_pk_info],
            )
            mock_add_indexes.assert_any_call(
                self.mock_engine,
                self.mock_extracted_text_table,
                [mock_row_id_info],
            )
            mock_add_indexes.assert_any_call(
                self.mock_engine,
                self.mock_extracted_text_table,
                [mock_patient_id_info],
            )

        mock_index_creation_info.assert_any_call(
            index_name=f"{CRATE_IDX_PREFIX}_{CRATE_COL_PK}",
            column=CRATE_COL_PK,
            unique=False,
        )
        mock_index_creation_info.assert_any_call(
            index_name=f"{CRATE_IDX_PREFIX}_{S1GenericCol.ROW_ID}",
            column=S1GenericCol.ROW_ID,
            unique=False,
        )
        mock_index_creation_info.assert_any_call(
            index_name=f"{CRATE_IDX_PREFIX}_{S1GenericCol.PATIENT_ID}",
            column=S1GenericCol.PATIENT_ID,
            unique=False,
        )