Coverage for preprocess/tests/text_extractor_tests.py: 100%
189 statements
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-06 03:32 -0600
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-06 03:32 -0600
1"""
2crate_anon/preprocess/tests/text_extractor_tests.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Test text extraction from documents.**
28"""
30import logging
31import os
32from pathlib import Path
33import shutil
34import tempfile
35from unittest import mock
37from faker.providers import BaseProvider
38from faker_file.storages.filesystem import FileSystemStorage
39from sqlalchemy.exc import IntegrityError, MultipleResultsFound, NoResultFound
41from crate_anon.preprocess.constants import (
42 CRATE_COL_PK,
43 CRATE_IDX_PREFIX,
44 CRATE_TABLE_EXTRACTED_TEXT,
45)
46from crate_anon.preprocess.systmone_ddgen import S1GenericCol, SystmOneContext
47from crate_anon.preprocess.text_extractor import SystmOneTextExtractor
48from crate_anon.testing.classes import CrateTestCase
class RowIdentifierProvider(BaseProvider):
    """Faker provider producing fake ``RowIdentifier`` values."""

    def row_identifier(self) -> int:
        """Return ``pyint(1_000_000_000, 9_000_000_000)`` from the generator."""
        lower_bound, upper_bound = 1_000_000_000, 9_000_000_000
        return self.generator.pyint(lower_bound, upper_bound)
class DocumentUidProvider(BaseProvider):
    """Faker provider producing fake document UIDs as integers."""

    def document_uid(self) -> int:
        """
        Return an integer drawn via the generator's ``pyint``.

        The bounds passed are the smallest and largest integers that need
        exactly 16 hexadecimal digits.
        """
        smallest_uid = 0x1000_0000_0000_0000
        largest_uid = 0xFFFF_FFFF_FFFF_FFFF
        return self.generator.pyint(smallest_uid, largest_uid)
class SubfolderProvider(BaseProvider):
    """Faker provider producing fake subfolder numbers."""

    def subfolder(self) -> int:
        """Return ``pyint(1, 4)`` from the generator."""
        first_subfolder, last_subfolder = 1, 4
        return self.generator.pyint(first_subfolder, last_subfolder)
class IndexProvider(BaseProvider):
    """Faker provider producing fake single-digit index values."""

    def index(self) -> int:
        """Return ``pyint(0, 9)`` from the generator."""
        lowest_index, highest_index = 0, 9
        return self.generator.pyint(lowest_index, highest_index)
class PatientIdProvider(BaseProvider):
    """Faker provider producing fake patient IDs."""

    def patient_id(self) -> int:
        """Return ``pyint(1, 10_000_000)`` from the generator."""
        lowest_id, highest_id = 1, 10_000_000
        return self.generator.pyint(lowest_id, highest_id)
class SystmOneTextExtractorTests(CrateTestCase):
    """
    Tests for :class:`SystmOneTextExtractor`.

    The SQLAlchemy engine, connection, metadata and statement constructors
    are replaced by mocks; documents are real files written beneath a
    temporary directory that is removed again in :meth:`tearDown`.
    """

    def setUp(self) -> None:
        """Build the mocked database layer, temp directory and extractor."""
        super().setUp()

        # Mock chain for:
        #   with engine.connect() as connection:
        #       connection.execute(...).one()
        self.mock_one = mock.Mock()
        self.mock_result = mock.Mock(one=self.mock_one)
        self.mock_execute = mock.Mock(return_value=self.mock_result)
        self.mock_connection = mock.Mock(execute=self.mock_execute)
        self.mock_connect_cm = mock.Mock()
        self.mock_connect_cm.__enter__ = mock.Mock(
            return_value=self.mock_connection
        )
        # NOTE(review): this __exit__ returns a (truthy) Mock, so an
        # exception raised inside the "with" block would be suppressed.
        # Confirm that is intended; mock.Mock(return_value=False) would
        # let exceptions propagate.
        self.mock_connect_cm.__exit__ = mock.Mock()
        self.mock_connect = mock.Mock(return_value=self.mock_connect_cm)
        self.mock_engine = mock.Mock(connect=self.mock_connect)

        # Mock metadata exposing the source and destination tables.
        self.mock_s1_documents_table = mock.Mock()
        self.mock_drop = mock.Mock()
        self.mock_extracted_text_table = mock.Mock(drop=self.mock_drop)
        self.mock_metadata = mock.Mock(
            tables={
                "S1_Documents": self.mock_s1_documents_table,
                CRATE_TABLE_EXTRACTED_TEXT: self.mock_extracted_text_table,
            }
        )
        self.context = SystmOneContext["cpft_dw"]

        # Real files on disk, beneath a throwaway directory.
        self.root_directory = tempfile.mkdtemp()
        self.storage = FileSystemStorage(
            root_path=self.root_directory,
            rel_path="tmp",
        )

        # Stand-in for the Table class, patched in by the table tests.
        self.mock_table_class = mock.Mock()

        self.extractor = SystmOneTextExtractor(
            self.mock_engine,
            self.mock_metadata,
            self.context,
            self.root_directory,
        )
        self.mock_select_object = mock.Mock()
        self.mock_select_fn = mock.Mock(return_value=self.mock_select_object)

        self.mock_document_to_text = mock.Mock()
        self.mock_last_extracted = self.fake.past_datetime()

        # insert(...).values(...) chain.
        self.mock_insert_values = mock.Mock()
        self.mock_insert_result = mock.Mock(values=self.mock_insert_values)
        self.mock_insert = mock.Mock(return_value=self.mock_insert_result)

        # update(...).values(...) chain.
        self.mock_update_values = mock.Mock()
        self.mock_update_result = mock.Mock(values=self.mock_update_values)
        self.mock_update = mock.Mock(return_value=self.mock_update_result)

        self.register_providers()

    def register_providers(self) -> None:
        """Add the providers used to build fake filenames and patient IDs."""
        self.fake.add_provider(RowIdentifierProvider)
        self.fake.add_provider(DocumentUidProvider)
        self.fake.add_provider(SubfolderProvider)
        self.fake.add_provider(IndexProvider)
        self.fake.add_provider(PatientIdProvider)

    def tearDown(self) -> None:
        """Remove the temporary directory, then run the base class teardown."""
        shutil.rmtree(self.root_directory)
        # setUp() chains to the base class, so tearDown() does too.
        super().tearDown()

    def generate_filename(
        self,
        extension: str,
        row_identifier: int = None,
        document_uid: int = None,
        subfolder: int = None,
        index: int = None,
    ) -> str:
        """
        Build a document filename of the form
        ``<row_identifier>_<document_uid as hex>_<subfolder>_<index>.<ext>``.

        Any component left as ``None`` is filled in by the matching fake
        provider. ``extension`` is given without a leading dot.
        """
        if row_identifier is None:
            row_identifier = self.fake.row_identifier()

        if document_uid is None:
            document_uid = self.fake.document_uid()

        if subfolder is None:
            subfolder = self.fake.subfolder()

        if index is None:
            index = self.fake.index()

        return (
            f"{row_identifier}_{document_uid:x}_{subfolder}_{index}"
            f".{extension}"
        )

    def test_invalid_filename_skipped(self) -> None:
        """A file not named in the expected pattern is logged and ignored."""
        filename = os.path.join(self.root_directory, "test.txt")
        content = self.fake.paragraph(nb_sentences=10)
        self.storage.write_text(filename, content)

        with self.assertLogs(level=logging.INFO) as logging_cm:
            self.extractor.extract_all()

        self.assert_logged(
            "crate_anon.preprocess.text_extractor",
            logging.INFO,
            f"Completely ignoring {filename}",
            logging_cm,
        )

    def test_unknown_row_identifier_skipped(self) -> None:
        """An error is logged when the row lookup raises NoResultFound."""
        content = self.fake.paragraph(nb_sentences=10)
        row_identifier = self.fake.row_identifier()
        filename = os.path.join(
            self.root_directory,
            self.generate_filename("txt", row_identifier=row_identifier),
        )
        self.storage.write_text(filename, content)

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            select=self.mock_select_fn,
        ):
            self.mock_one.side_effect = NoResultFound()
            with self.assertLogs(level=logging.ERROR) as logging_cm:
                self.extractor.extract_all()

        self.assert_logged(
            "crate_anon.preprocess.text_extractor",
            logging.ERROR,
            f"... no row found for RowIdentifier: {row_identifier}",
            logging_cm,
        )

    def test_multiple_results_skipped(self) -> None:
        """An error is logged when the lookup raises MultipleResultsFound."""
        # Not seen in the real world but theoretically possible.
        content = self.fake.paragraph(nb_sentences=10)
        row_identifier = self.fake.row_identifier()
        filename = os.path.join(
            self.root_directory,
            self.generate_filename("txt", row_identifier=row_identifier),
        )
        self.storage.write_text(filename, content)

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            select=self.mock_select_fn,
        ):
            self.mock_one.side_effect = MultipleResultsFound()
            with self.assertLogs(level=logging.ERROR) as logging_cm:
                self.extractor.extract_all()

        self.assert_logged(
            "crate_anon.preprocess.text_extractor",
            logging.ERROR,
            (
                "... multiple rows found with RowIdentifier: "
                f"{row_identifier}"
            ),
            logging_cm,
        )

    def test_row_inserted_into_table(self) -> None:
        """The insert receives the IDs, relative path, text and timestamp."""
        content = self.fake.paragraph(nb_sentences=10)
        row_identifier = self.fake.row_identifier()
        document_uid = self.fake.document_uid()
        filename = os.path.join(
            self.root_directory,
            self.generate_filename(
                "txt", row_identifier=row_identifier, document_uid=document_uid
            ),
        )
        self.storage.write_text(filename, content)

        patient_id = self.fake.patient_id()
        self.mock_one.return_value = mock.Mock(
            _mapping={
                S1GenericCol.PATIENT_ID: patient_id,
            }
        )
        self.mock_document_to_text.return_value = content

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            select=self.mock_select_fn,
            document_to_text=self.mock_document_to_text,
            Pendulum=mock.Mock(
                now=mock.Mock(return_value=self.mock_last_extracted)
            ),
            insert=self.mock_insert,
        ):
            self.extractor.extract_all()

        values = dict(
            RowIdentifier=row_identifier,
            DocumentUID=f"{document_uid:x}",
            IDPatient=patient_id,
            # Expected path is the last two components: directory/filename.
            crate_file_path=str(Path(*Path(filename).parts[-2:])),
            crate_text=content,
            crate_text_last_extracted=self.mock_last_extracted,
        )

        self.mock_insert_values.assert_called_once_with(**values)

    def test_null_text_inserted_when_extension_not_supported(self) -> None:
        """An unsupported extension is logged and inserted with null text."""
        content = self.fake.paragraph(nb_sentences=10)
        filename = os.path.join(
            self.root_directory,
            self.generate_filename("tex"),
        )
        self.storage.write_text(filename, content)

        patient_id = self.fake.patient_id()
        self.mock_one.return_value = mock.Mock(
            _mapping={
                S1GenericCol.PATIENT_ID: patient_id,
            }
        )
        self.mock_document_to_text.return_value = content

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            select=self.mock_select_fn,
            document_to_text=self.mock_document_to_text,
            Pendulum=mock.Mock(
                now=mock.Mock(return_value=self.mock_last_extracted)
            ),
            insert=self.mock_insert,
        ):
            with self.assertLogs(level=logging.INFO) as logging_cm:
                self.extractor.extract_all()

        self.assert_logged(
            "crate_anon.preprocess.text_extractor",
            logging.INFO,
            "... unsupported file extension '.tex'.",
            logging_cm,
        )

        # Fail with a clear message (rather than a TypeError from unpacking
        # None) if no insert was attempted.
        self.mock_insert_values.assert_called()
        _, kwargs = self.mock_insert_values.call_args
        self.assertIsNone(kwargs["crate_text"])

    def test_row_updated_in_table(self) -> None:
        """An IntegrityError from execute() leads to an update instead."""
        content = self.fake.paragraph(nb_sentences=10)
        row_identifier = self.fake.row_identifier()
        document_uid = self.fake.document_uid()
        filename = os.path.join(
            self.root_directory,
            self.generate_filename(
                "txt", row_identifier=row_identifier, document_uid=document_uid
            ),
        )
        self.storage.write_text(filename, content)

        patient_id = self.fake.patient_id()
        self.mock_one.return_value = mock.Mock(
            _mapping={
                S1GenericCol.PATIENT_ID: patient_id,
            }
        )
        self.mock_document_to_text.return_value = content

        # First execute() call succeeds; the second raises IntegrityError
        # (presumably the row lookup and the insert respectively).
        self.mock_execute.side_effect = [
            self.mock_result,
            IntegrityError(None, None, None),
        ]

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            select=self.mock_select_fn,
            document_to_text=self.mock_document_to_text,
            Pendulum=mock.Mock(
                now=mock.Mock(return_value=self.mock_last_extracted)
            ),
            insert=self.mock_insert,
            update=self.mock_update,
        ):
            self.extractor.extract_all()

        values = dict(
            RowIdentifier=row_identifier,
            DocumentUID=f"{document_uid:x}",
            IDPatient=patient_id,
            crate_file_path=str(Path(*Path(filename).parts[-2:])),
            crate_text=content,
            crate_text_last_extracted=self.mock_last_extracted,
        )

        self.mock_update_values.assert_called_once_with(**values)

    def test_exception_from_text_conversion_handled(self) -> None:
        """An exception raised by document_to_text is logged as an error."""
        content = self.fake.paragraph(nb_sentences=10)
        row_identifier = self.fake.row_identifier()
        document_uid = self.fake.document_uid()
        filename = os.path.join(
            self.root_directory,
            self.generate_filename(
                "txt", row_identifier=row_identifier, document_uid=document_uid
            ),
        )
        self.storage.write_text(filename, content)

        patient_id = self.fake.patient_id()
        self.mock_one.return_value = mock.Mock(
            _mapping={
                S1GenericCol.PATIENT_ID: patient_id,
            }
        )
        self.mock_document_to_text.side_effect = Exception(
            "Something bad happened"
        )

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            select=self.mock_select_fn,
            document_to_text=self.mock_document_to_text,
        ):
            with self.assertLogs(level=logging.ERROR) as logging_cm:
                self.extractor.extract_all()

        self.assert_logged(
            "crate_anon.preprocess.text_extractor",
            logging.ERROR,
            (
                "... caught exception from document_to_text: "
                "Something bad happened"
            ),
            logging_cm,
        )

    def test_table_dropped(self) -> None:
        """With drop_table set, the table is dropped with checkfirst=True."""
        self.extractor.drop_table = True
        self.mock_table_class.return_value = self.mock_extracted_text_table
        self.mock_extracted_text_table.columns = []

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            Table=self.mock_table_class,
        ):
            self.extractor.extract_all()

        self.mock_drop.assert_called_once_with(checkfirst=True)

    def test_columns_indexed(self) -> None:
        """Non-unique indexes are requested for the three listed columns."""
        self.extractor.drop_table = True
        self.mock_table_class.return_value = self.mock_extracted_text_table

        # "name" is special to the Mock constructor, so set it afterwards.
        mock_pk_column = mock.Mock()
        mock_pk_column.name = CRATE_COL_PK
        mock_row_id_column = mock.Mock()
        mock_row_id_column.name = S1GenericCol.ROW_ID
        mock_patient_id_column = mock.Mock()
        mock_patient_id_column.name = S1GenericCol.PATIENT_ID

        self.mock_extracted_text_table.columns = [
            mock_pk_column,
            mock_row_id_column,
            mock_patient_id_column,
        ]

        mock_pk_info = mock.Mock()
        mock_row_id_info = mock.Mock()
        mock_patient_id_info = mock.Mock()
        mock_add_indexes = mock.Mock()

        # One IndexCreationInfo per call, handed out in call order.
        mock_index_creation_info = mock.Mock(
            side_effect=[
                mock_pk_info,
                mock_row_id_info,
                mock_patient_id_info,
            ]
        )

        with mock.patch.multiple(
            "crate_anon.preprocess.text_extractor",
            Table=self.mock_table_class,
            add_indexes=mock_add_indexes,
            IndexCreationInfo=mock_index_creation_info,
        ):
            self.extractor.extract_all()

        mock_add_indexes.assert_any_call(
            self.mock_engine,
            self.mock_extracted_text_table,
            [mock_pk_info],
        )
        mock_add_indexes.assert_any_call(
            self.mock_engine,
            self.mock_extracted_text_table,
            [mock_row_id_info],
        )
        mock_add_indexes.assert_any_call(
            self.mock_engine,
            self.mock_extracted_text_table,
            [mock_patient_id_info],
        )

        mock_index_creation_info.assert_any_call(
            index_name=f"{CRATE_IDX_PREFIX}_{CRATE_COL_PK}",
            column=CRATE_COL_PK,
            unique=False,
        )
        mock_index_creation_info.assert_any_call(
            index_name=f"{CRATE_IDX_PREFIX}_{S1GenericCol.ROW_ID}",
            column=S1GenericCol.ROW_ID,
            unique=False,
        )
        mock_index_creation_info.assert_any_call(
            index_name=f"{CRATE_IDX_PREFIX}_{S1GenericCol.PATIENT_ID}",
            column=S1GenericCol.PATIENT_ID,
            unique=False,
        )