Coverage for anonymise/tests/anonymise_tests.py: 85%

474 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2026-01-09 10:40 -0600

1""" 

2crate_anon/anonymise/tests/anonymise_tests.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26""" 

27 

28# ============================================================================= 

29# Imports 

30# ============================================================================= 

31 

import logging
from typing import (
    Any,
    Dict,
    Generator,
    List,
    Optional,
    Tuple,
    TYPE_CHECKING,
)
from unittest import mock

from cardinal_pythonlib.hash import HmacMD5Hasher
from cardinal_pythonlib.sqlalchemy.schema import (
    execute_ddl,
    mssql_table_has_ft_index,
)
import factory
import pytest
from sortedcontainers import SortedSet
from sqlalchemy import (
    Boolean,
    Column,
    create_engine,
    DateTime,
    inspect,
    Integer,
    String,
    Text,
)

from crate_anon.anonymise.anonymise import (
    create_indexes,
    gen_opt_out_pids_from_database,
    process_patient_tables,
    process_table,
    validate_optouts,
)
from crate_anon.anonymise.altermethod import AlterMethod
from crate_anon.anonymise.constants import IndexType, ScrubMethod
from crate_anon.anonymise.models import PatientInfo
from crate_anon.anonymise.dd import ScrubSourceFieldInfo
from crate_anon.anonymise.ddr import DataDictionaryRow
from crate_anon.anonymise.tests.factories import PatientInfoFactory
from crate_anon.testing import AnonTestBase, SourceTestBase
from crate_anon.testing.classes import (
    DatabaseTestCase,
    SlowSecretDatabaseTestCase,
)
from crate_anon.testing.factories import (
    AnonTestBaseFactory,
    Fake,
    SourceTestBaseFactory,
)

if TYPE_CHECKING:
    from factory.builder import Resolver
    from sqlalchemy.engine import Engine

81 

82 

83# ============================================================================= 

84# SQLAlchemy test tables 

85# ============================================================================= 

86 

87 

class TestBoolOptOut(SourceTestBase):
    """
    Source-database test table whose opt-out flag is a Boolean column.
    """

    __tablename__ = "test_opt_out_bool"

    pid = Column(Integer, primary_key=True, comment="Patient ID")
    mpid = Column(Integer, comment="Master patient ID")
    opt_out = Column(Boolean, comment="Opt out?")

94 

95 

class TestBoolOptOutFactory(SourceTestBaseFactory):
    """
    Factory for :class:`TestBoolOptOut` rows. ``opt_out`` has no default
    here; the tests in this file pass it explicitly.
    """

    class Meta:
        model = TestBoolOptOut

    # Both IDs are the factory's sequence counter plus one.
    pid = factory.Sequence(lambda n: n + 1)
    mpid = factory.Sequence(lambda n: n + 1)

102 

103 

class TestStringOptOut(SourceTestBase):
    """
    Source-database test table whose opt-out flag is a short string column
    (up to 4 characters).
    """

    __tablename__ = "test_opt_out_string"

    pid = Column(Integer, primary_key=True, comment="Patient ID")
    mpid = Column(Integer, comment="Master patient ID")
    opt_out = Column(String(4), comment="Opt out?")

110 

111 

class TestStringOptOutFactory(SourceTestBaseFactory):
    """
    Factory for :class:`TestStringOptOut` rows. ``opt_out`` has no default
    here; the tests in this file pass it explicitly.
    """

    class Meta:
        model = TestStringOptOut

    # Both IDs are the factory's sequence counter plus one.
    pid = factory.Sequence(lambda n: n + 1)
    mpid = factory.Sequence(lambda n: n + 1)

118 

119 

class TestAnonNote(AnonTestBase):
    """
    Destination (anonymised) database test table with two free-text note
    columns; used by the full-text index tests in this file.
    """

    __tablename__ = "test_anon_note"

    note_id = Column(Integer, primary_key=True, comment="Note ID")
    note1 = Column(Text, comment="Text of note 1")
    note2 = Column(Text, comment="Text of note 2")

126 

127 

class TestPatient(SourceTestBase):
    """
    Source-database test table of patients with a forename and a surname.
    """

    __tablename__ = "test_patient"

    pid = Column(Integer, primary_key=True, comment="Patient ID")
    forename = Column(String(50), comment="Forename")
    surname = Column(String(50), comment="Surname")

    @property
    def name(self) -> str:
        """
        Return the forename and surname separated by a single space.
        """
        return f"{self.forename} {self.surname}"

138 

139 

class TestPatientFactory(SourceTestBaseFactory):
    """
    Factory for :class:`TestPatient` rows with fake en_GB names. No ``pid``
    is declared here.
    """

    class Meta:
        model = TestPatient

    # LazyFunction: the Faker provider is called afresh for each object.
    forename = factory.LazyFunction(Fake.en_gb.first_name)
    surname = factory.LazyFunction(Fake.en_gb.last_name)

146 

147 

class TestPatientWithStringMPID(SourceTestBase):
    """
    Source-database test table of patients whose NHS number is stored in a
    string column (up to 10 characters), not an integer column.
    """

    __tablename__ = "test_patient_with_string_mpid"

    pid = Column(Integer, primary_key=True, comment="Patient ID")
    nhsnum = Column(String(10), comment="NHS Number")

153 

154 

class TestPatientWithStringMPIDFactory(SourceTestBaseFactory):
    """
    Factory for :class:`TestPatientWithStringMPID` rows.
    """

    class Meta:
        model = TestPatientWithStringMPID

    # The factory's sequence counter plus one.
    pid = factory.Sequence(lambda n: n + 1)

    @factory.lazy_attribute
    def nhsnum(obj: "Resolver") -> str:
        """
        Return a fake NHS number converted to a string (the column is a
        string column). The resolver argument is not used.
        """
        return str(Fake.en_gb.nhs_number())

164 

165 

class TestRecord(SourceTestBase):
    """
    Source-database test table of per-patient records. Its primary key is
    ``pk``; ``row_identifier`` is an ordinary (non-key) column here.
    """

    __tablename__ = "test_record"

    pk = Column(Integer, primary_key=True, comment="PK")
    pid = Column(Integer, comment="Patient ID")
    row_identifier = Column(Integer, comment="Row ID")
    third_party_pid = Column(Integer, comment="Third party PID")
    nhsnum = Column(Integer, comment="NHS Number")
    other = Column(String(50), comment="Other column")

175 

176 

class TestRecordFactory(SourceTestBaseFactory):
    """
    Factory for :class:`TestRecord` rows. ``pid`` and ``other`` have no
    defaults here; tests pass them when needed.
    """

    class Meta:
        model = TestRecord

    # Each sequence-based column uses a different offset from the counter.
    pk = factory.Sequence(lambda n: n + 1)
    row_identifier = factory.Sequence(lambda n: n + 10000)
    nhsnum = factory.LazyFunction(Fake.en_gb.nhs_number)
    third_party_pid = factory.Sequence(lambda n: n + 1000)

185 

186 

class TestAnonRecord(AnonTestBase):
    """
    Destination (anonymised) database test table for records. Here
    ``row_identifier`` is the primary key.
    """

    __tablename__ = "test_anon_record"

    row_identifier = Column(Integer, primary_key=True, comment="Row ID")
    nhshash = Column(String(32))
    third_party_pid_hash = Column(String(32))
    other = Column(String(50), comment="Other column")
    # NOTE(review): presumably populated by the anonymiser rather than
    # copied from the source -- confirm against process_table().
    _src_hash = Column(String(32))
    _when_processed_utc = Column(DateTime())

196 

197 

class TestAnonRecordFactory(AnonTestBaseFactory):
    """
    Factory for :class:`TestAnonRecord` rows; declares no field defaults.
    """

    class Meta:
        model = TestAnonRecord

201 

202 

class TestPidAsPkRecord(SourceTestBase):
    """
    Source-database test table in which the patient ID is itself the
    primary key.
    """

    __tablename__ = "test_pid_as_pk_record"

    pid = Column(Integer, primary_key=True, comment="Patient ID")
    other = Column(String(50), comment="Other column")

208 

209 

class TestPidAsPkRecordFactory(SourceTestBaseFactory):
    """
    Factory for :class:`TestPidAsPkRecord` rows; declares no field defaults.
    """

    class Meta:
        model = TestPidAsPkRecord

213 

214 

class TestAnonPidAsPkRecord(AnonTestBase):
    """
    Destination (anonymised) database test table keyed on the research ID.
    """

    __tablename__ = "test_anon_pid_as_pk_record"

    rid = Column(String(32), primary_key=True, comment="Research ID")
    trid = Column(Integer)
    mrid = Column(String(32))
    # NOTE(review): presumably populated by the anonymiser rather than
    # copied from the source -- confirm against process_table().
    _src_hash = Column(String(32))
    _when_processed_utc = Column(DateTime())

223 

224 

class TestAnonPidAsPkRecordFactory(AnonTestBaseFactory):
    """
    Factory for :class:`TestAnonPidAsPkRecord` rows; declares no field
    defaults.
    """

    class Meta:
        model = TestAnonPidAsPkRecord

228 

229 

230# ============================================================================= 

231# Unit tests 

232# ============================================================================= 

class AnonymiseTestMixin:
    """
    Mixin for anonymiser test cases, providing a builder for mock data
    dictionary rows.
    """

    def mock_dd_row(
        self,
        omit: bool = False,
        skip_row_by_value: Optional[mock.Mock] = None,
        primary_pid: bool = False,
        master_pid: bool = False,
        third_party_pid: bool = False,
        alter_methods: "Optional[List[AlterMethod]]" = None,
        add_src_hash: bool = False,
        **kwargs: Any,
    ) -> mock.Mock:
        """
        Return a mock standing in for a data dictionary row.

        Args:
            omit: value of the mock's ``omit`` attribute.
            skip_row_by_value: value of the mock's ``skip_row_by_value``
                attribute; if ``None``, a mock callable returning ``False``
                is used.
            primary_pid: value of the mock's ``primary_pid`` attribute.
            master_pid: value of the mock's ``master_pid`` attribute.
            third_party_pid: value of the mock's ``third_party_pid``
                attribute.
            alter_methods: value of the mock's ``alter_methods`` attribute;
                if ``None``, a new empty list is used for each call.
            add_src_hash: value of the mock's ``add_src_hash`` attribute.
            kwargs: any further attributes to set on the mock (for example
                ``src_field``, ``dest_table``, ``dest_field``).

        Returns:
            the configured :class:`unittest.mock.Mock`.
        """
        if skip_row_by_value is None:
            skip_row_by_value = mock.Mock(return_value=False)

        if alter_methods is None:
            # Built per call, so rows never share one mutable list.
            alter_methods = []

        return mock.Mock(
            omit=omit,
            skip_row_by_value=skip_row_by_value,
            primary_pid=primary_pid,
            master_pid=master_pid,
            third_party_pid=third_party_pid,
            alter_methods=alter_methods,
            add_src_hash=add_src_hash,
            **kwargs,
        )

261 

262 

class GenOptOutPidsFromDatabaseTests(DatabaseTestCase):
    """
    Tests for gen_opt_out_pids_from_database(), run against the boolean and
    string opt-out test tables with a patched anonymiser config.
    """

    def _config_overrides(self, tablename: str) -> Dict[str, Any]:
        """
        Build the ``dd`` and ``sources`` replacements for the anonymiser's
        config, with the mock data dictionary reporting a single opt-out
        defining entry for the named source table.
        """
        # NOTE(review): tuple presumably reads (database, table, opt-out
        # column, PID column, MPID column) -- confirm against
        # get_optout_defining_fields().
        defining_fields = [("db", tablename, "opt_out", "pid", "mpid")]
        source_db = mock.Mock(
            session=self.source_dbsession,
            engine=self.source_engine,
            metadata=SourceTestBase.metadata,
        )
        data_dictionary = mock.Mock(
            get_optout_defining_fields=mock.Mock(return_value=defining_fields)
        )

        return {"dd": data_dictionary, "sources": {"db": source_db}}

    def test_string_in_optout_col_values_ignored_for_boolean_column(
        self,
    ) -> None:
        opted_out = [TestBoolOptOutFactory(opt_out=True) for _ in range(3)]
        not_opted_out = TestBoolOptOutFactory(opt_out=False)
        self.source_dbsession.flush()

        with mock.patch.multiple(
            "crate_anon.anonymise.anonymise.config",
            optout_col_values=[True, 1, "1"],
            **self._config_overrides("test_opt_out_bool"),
        ):
            found_pids = list(gen_opt_out_pids_from_database())

        for row in opted_out:
            self.assertIn(row.pid, found_pids)
        self.assertNotIn(not_opted_out.pid, found_pids)

    def test_invalid_boolean_optout_col_value_logged(
        self,
    ) -> None:
        TestBoolOptOutFactory(opt_out=True)
        self.source_dbsession.flush()

        expected_message = (
            "... ignoring non-boolean value (1), type 'str' "
            "for boolean column 'opt_out'"
        )

        with mock.patch.multiple(
            "crate_anon.anonymise.anonymise.config",
            optout_col_values=["1"],
            **self._config_overrides("test_opt_out_bool"),
        ):
            with self.assertLogs(level=logging.INFO) as logging_cm:
                # Exhaust the generator so that it actually runs.
                list(gen_opt_out_pids_from_database())

        self.assert_logged(
            "crate_anon.anonymise.anonymise",
            logging.INFO,
            expected_message,
            logging_cm,
        )

    def test_string_in_optout_col_values_valid_for_string_column(
        self,
    ) -> None:
        said_yes = TestStringOptOutFactory(opt_out="yes")
        said_one = TestStringOptOutFactory(opt_out="1")
        said_no = TestStringOptOutFactory(opt_out="no")
        said_zero = TestStringOptOutFactory(opt_out="0")
        self.source_dbsession.flush()

        with mock.patch.multiple(
            "crate_anon.anonymise.anonymise.config",
            optout_col_values=["yes", "1"],
            **self._config_overrides("test_opt_out_string"),
        ):
            found_pids = list(gen_opt_out_pids_from_database())

        self.assertIn(said_yes.pid, found_pids)
        self.assertIn(said_one.pid, found_pids)
        self.assertNotIn(said_no.pid, found_pids)
        self.assertNotIn(said_zero.pid, found_pids)

392 

393 

class ValidateOptoutsTests(DatabaseTestCase):
    """
    Tests for validate_optouts().
    """

    def test_error_reported_if_no_valid_optout_fields(self) -> None:
        """
        With a boolean opt-out column and only a float and a string
        configured as opt-out values, validate_optouts() should raise a
        ValueError.
        """
        TestBoolOptOutFactory(opt_out=True)
        TestBoolOptOutFactory(opt_out=False)
        self.source_dbsession.flush()

        defining_fields = [
            ("db", "test_opt_out_bool", "opt_out", "pid", "mpid"),
        ]
        data_dictionary = mock.Mock(
            get_optout_defining_fields=mock.Mock(return_value=defining_fields),
        )
        source_db = mock.Mock(
            session=self.source_dbsession,
            engine=self.source_engine,
            metadata=SourceTestBase.metadata,
        )

        with mock.patch.multiple(
            "crate_anon.anonymise.anonymise.config",
            dd=data_dictionary,
            sources={"db": source_db},
            optout_col_values=[3.14159, "1"],
        ):
            with self.assertRaises(ValueError) as cm:
                validate_optouts()

        message = str(cm.exception)
        self.assertEqual(
            message,
            "No valid opt-out values for column 'opt_out'",
        )

435 

436 

class CreateIndexesTests(DatabaseTestCase):
    """
    Tests that create_indexes() builds full-text indexes on the
    ``test_anon_note`` table, for MySQL and for SQL Server.
    """

    def setUp(self) -> None:
        super().setUp()
        # Created on first use by the engine_outside_transaction property.
        self._engine_outside_transaction = None

    def test_full_text_index_created_with_mysql(self) -> None:
        if self.anon_engine.dialect.name != "mysql":
            pytest.skip("Skipping MySQL-only test")

        # Start from a clean slate if indexes are already present.
        if self._get_mysql_anon_note_table_full_text_indexes():
            self._drop_mysql_full_text_indexes()

        indexes = self._get_mysql_anon_note_table_full_text_indexes()
        self.assertEqual(len(indexes), 0)

        self._make_full_text_index()
        indexes = self._get_mysql_anon_note_table_full_text_indexes()

        self.assertEqual(len(indexes), 2)
        self.assertEqual(indexes["note1"]["type"], "FULLTEXT")
        self.assertEqual(indexes["note2"]["type"], "FULLTEXT")

    def _drop_mysql_full_text_indexes(self) -> None:
        """
        Drop the ``_idxft_note1`` and ``_idxft_note2`` indexes from the
        ``test_anon_note`` table.
        """
        execute_ddl(
            self.anon_engine, sql="DROP INDEX _idxft_note1 ON test_anon_note"
        )
        execute_ddl(
            self.anon_engine, sql="DROP INDEX _idxft_note2 ON test_anon_note"
        )

    def _get_mysql_anon_note_table_full_text_indexes(
        self,
    ) -> Dict[str, Dict[str, Any]]:
        """
        Return the reflected indexes of ``test_anon_note``, as a dictionary
        mapping each index's first column name to that index's description.
        """
        return {
            i["column_names"][0]: i
            for i in inspect(self.anon_engine).get_indexes("test_anon_note")
        }

    def test_full_text_index_created_with_mssql(self) -> None:
        if self.anon_engine.dialect.name != "mssql":
            pytest.skip("Skipping mssql-only test")

        self._drop_mssql_full_text_indexes()

        self.assertFalse(self._mssql_anon_note_table_has_full_text_index())
        self._make_full_text_index()

        self.assertTrue(self._mssql_anon_note_table_has_full_text_index())

    def _mssql_anon_note_table_has_full_text_index(self) -> bool:
        """
        Report whether ``dbo.test_anon_note`` has a full-text index.
        """
        return mssql_table_has_ft_index(
            self.engine_outside_transaction, "test_anon_note", "dbo"
        )

    def _drop_mssql_full_text_indexes(self) -> None:
        """
        Drop the full-text index on ``dbo.test_anon_note`` if one exists.
        """
        # SQL Server only. Need to be outside a transaction to drop indexes
        sql = """
IF EXISTS (
    SELECT fti.object_id FROM sys.fulltext_indexes fti
    WHERE fti.object_id = OBJECT_ID(N'[dbo].[test_anon_note]')
)
    DROP FULLTEXT INDEX ON [dbo].[test_anon_note]
"""
        execute_ddl(self.engine_outside_transaction, sql)

    @property
    def engine_outside_transaction(self) -> "Engine":
        """
        Return a separate autocommit engine on the destination database URL,
        creating it on first access and disposing of it when the test ends.
        """
        if self._engine_outside_transaction is None:
            engine = create_engine(
                self.anon_engine.url,
                connect_args={"autocommit": True},  # for pyodbc
                future=True,
            )
            # Without this the extra engine's connection pool would outlive
            # the test.
            self.addCleanup(engine.dispose)
            self._engine_outside_transaction = engine

        return self._engine_outside_transaction

    def _make_full_text_index(self) -> None:
        """
        Run create_indexes() with the data dictionary patched so that
        ``note1`` and ``note2`` of ``test_anon_note`` both request a
        full-text index.
        """
        # NOTE(review): this name is rebound by the "as" clause below and is
        # only read when index_row_sets() executes. patch.multiple() yields
        # a dict of the mocks it created itself (none here, since explicit
        # values are supplied), not a config object -- confirm that
        # DataDictionaryRow tolerates that.
        mock_config = None

        # noinspection PyUnusedLocal
        def index_row_sets(
            tasknum: int = 0, ntasks: int = 1
        ) -> Generator[Tuple[str, List[DataDictionaryRow]], None, None]:
            rows = []
            for fieldname in ("note1", "note2"):
                row = DataDictionaryRow(mock_config)
                row.dest_field = fieldname
                row.index = IndexType.FULLTEXT
                rows.append(row)

            yield "TestAnonNote", rows

        mock_dd = mock.Mock(
            get_dest_sqla_table=mock.Mock(return_value=TestAnonNote.__table__)
        )
        with mock.patch.multiple(
            "crate_anon.anonymise.anonymise",
            gen_index_row_sets_by_table=index_row_sets,
        ):
            with mock.patch.multiple(
                "crate_anon.anonymise.anonymise.config",
                dd=mock_dd,
                _destination_database_url=self.anon_engine.url,
            ) as mock_config:
                create_indexes()

545 

546 

class ProcessPatientTablesMPidTests(
    SlowSecretDatabaseTestCase, AnonymiseTestMixin
):
    """
    Tests of how process_patient_tables() records patients, and their
    master patient IDs (MPIDs), in the secret database when the source
    holds the MPID in a string column.
    """

    def setUp(self) -> None:
        super().setUp()

        self.mock_admindb = mock.Mock(session=self.secret_dbsession)
        self.mock_sourcedb = mock.Mock(
            session=self.source_dbsession,
            srccfg=mock.Mock(debug_limited_tables=[]),
            engine=self.source_engine,
            metadata=SourceTestBase.metadata,
        )

        # A single scrub-source field: the MPID, read from "nhsnum".
        mpid_fieldinfo = ScrubSourceFieldInfo(
            is_mpid=True,
            is_patient=False,
            recurse=False,
            required_scrubber=False,
            scrub_method=ScrubMethod.NUMERIC,
            signature=None,
            value_fieldname="nhsnum",
        )
        self.mock_get_scrub_from_rows_as_fieldinfo = mock.Mock(
            return_value=[mpid_fieldinfo]
        )
        self.mock_get_scrub_from_db_table_pairs = mock.Mock(
            return_value=[("source1", "test_patient_with_string_mpid")]
        )

        self.mock_get_pid_name = mock.Mock(return_value="pid")
        self.mock_estimate_count_patients = mock.Mock(return_value=1)
        self.mock_opting_out_pid = mock.Mock(return_value=False)

        row_identifier_row = self.mock_dd_row(
            src_field="row_identifier",
            dest_field="row_identifier",
        )

        self.mock_dd = mock.Mock(
            get_scrub_from_db_table_pairs=(
                self.mock_get_scrub_from_db_table_pairs
            ),
            get_scrub_from_rows_as_fieldinfo=(
                self.mock_get_scrub_from_rows_as_fieldinfo
            ),
            get_pid_name=self.mock_get_pid_name,
            get_mandatory_scrubber_sigs=mock.Mock(return_value=set()),
            get_source_databases=mock.Mock(
                return_value=SortedSet(["source1"])
            ),
            get_patient_src_tables_with_active_dest=mock.Mock(
                return_value=SortedSet(["test_record"])
            ),
            get_rows_for_src_table=mock.Mock(
                return_value=[row_identifier_row]
            ),
        )

    def _process_patient_tables(self, pids: List[int]) -> None:
        """
        Call process_patient_tables() for the given PIDs, with the
        anonymiser's module-level functions and config patched to use the
        mocks built in setUp().
        """
        with mock.patch.multiple(
            "crate_anon.anonymise.anonymise",
            estimate_count_patients=self.mock_estimate_count_patients,
            opting_out_pid=self.mock_opting_out_pid,
        ):
            with mock.patch.multiple(
                "crate_anon.anonymise.anonymise.config",
                dd=self.mock_dd,
                _destination_database_url=self.anon_engine.url,
                admindb=self.mock_admindb,
                sources={"source1": self.mock_sourcedb},
            ):
                process_patient_tables(specified_pids=pids)

    def test_patient_saved_in_secret_database(self) -> None:
        patient = TestPatientWithStringMPIDFactory()
        self.source_dbsession.commit()

        self._process_patient_tables([patient.pid])

        saved = self.secret_dbsession.query(PatientInfo).one()
        self.assertEqual(saved.pid, patient.pid)
        self.assertEqual(str(saved.mpid), patient.nhsnum)

    def test_patient_mpid_updated_in_secret_database(self) -> None:
        patient = TestPatientWithStringMPIDFactory()
        self.source_dbsession.commit()

        self.assertIsNone(
            self.secret_dbsession.query(PatientInfo).one_or_none()
        )

        # Pre-existing secret record for this patient, without an MPID.
        PatientInfoFactory(pid=patient.pid, mpid=None)
        self.secret_dbsession.commit()

        self._process_patient_tables([patient.pid])

        saved = self.secret_dbsession.query(PatientInfo).one()
        self.assertEqual(saved.pid, patient.pid)
        self.assertEqual(str(saved.mpid), patient.nhsnum)

    def test_patient_with_invalid_mpid_skipped(self) -> None:
        if self.source_engine.dialect.name == "sqlite":
            pytest.skip(
                "Skipping test because SQLite would allow non-integer values "
                "in an integer field"
            )

        self.assertIsNone(
            self.secret_dbsession.query(PatientInfo).one_or_none()
        )

        patient = TestPatientWithStringMPIDFactory(nhsnum="ABC123")
        self.source_dbsession.commit()

        pid = patient.pid

        with self.assertLogs(level=logging.WARNING) as logging_cm:
            self._process_patient_tables([pid])

        self.assertIsNone(
            self.secret_dbsession.query(PatientInfo).one_or_none()
        )
        self.assert_logged(
            "crate_anon.anonymise.anonymise",
            logging.WARNING,
            (
                f"Skipping patient with PID={pid} because the record could "
                "not be saved to the secret_map table"
            ),
            logging_cm,
        )

    def test_valid_patients_added_when_invalid_mpid_skipped(self) -> None:
        if self.source_engine.dialect.name == "sqlite":
            pytest.skip(
                "Skipping test because SQLite would allow non-integer values "
                "in an integer field"
            )

        self.assertIsNone(
            self.secret_dbsession.query(PatientInfo).one_or_none()
        )

        invalid_patient = TestPatientWithStringMPIDFactory(nhsnum="ABC123")
        self.source_dbsession.commit()
        valid_patient1 = TestPatientWithStringMPIDFactory()
        self.source_dbsession.commit()
        valid_patient2 = TestPatientWithStringMPIDFactory()
        self.source_dbsession.commit()

        # The invalid patient sits between the two valid ones.
        self._process_patient_tables(
            [valid_patient1.pid, invalid_patient.pid, valid_patient2.pid]
        )

        saved_pids = [p.pid for p in self.secret_dbsession.query(PatientInfo)]
        self.assertIn(valid_patient1.pid, saved_pids)
        self.assertIn(valid_patient2.pid, saved_pids)

        # For some reason these end up being a mixture of strings and ints,
        # so compare everything as int.
        saved_nhsnums = [
            int(p.mpid) for p in self.secret_dbsession.query(PatientInfo)
        ]
        self.assertIn(int(valid_patient1.nhsnum), saved_nhsnums)
        self.assertIn(int(valid_patient2.nhsnum), saved_nhsnums)

752 

753 

class ProcessPatientTablesPKTests(DatabaseTestCase, AnonymiseTestMixin):
    """
    Tests of how process_patient_tables() copes with source rows that
    collide on the destination table's primary key.
    """

    def setUp(self) -> None:
        super().setUp()

        self.mock_admindb = mock.Mock(session=self.secret_dbsession)
        self.mock_destdb = mock.Mock(
            session=self.anon_dbsession,
            engine=self.anon_engine,
            metadata=AnonTestBase.metadata,
        )
        self.mock_sourcedb = mock.Mock(
            session=self.source_dbsession,
            srccfg=mock.Mock(debug_limited_tables=[]),
            engine=self.source_engine,
            metadata=SourceTestBase.metadata,
        )

        # A single scrub-source field: the MPID, read from "nhsnum".
        mpid_fieldinfo = ScrubSourceFieldInfo(
            is_mpid=True,
            is_patient=False,
            recurse=False,
            required_scrubber=False,
            scrub_method=ScrubMethod.NUMERIC,
            signature=None,
            value_fieldname="nhsnum",
        )
        self.mock_get_scrub_from_rows_as_fieldinfo = mock.Mock(
            return_value=[mpid_fieldinfo]
        )
        self.mock_get_scrub_from_db_table_pairs = mock.Mock(
            return_value=[("source1", "test_patient_with_string_mpid")]
        )

        self.mock_get_pid_name = mock.Mock(return_value="pid")
        self.mock_estimate_count_patients = mock.Mock(return_value=1)
        self.mock_opting_out_pid = mock.Mock(return_value=False)

        row_identifier_row = self.mock_dd_row(
            src_field="row_identifier",
            dest_field="row_identifier",
        )

        self.mock_dd = mock.Mock(
            get_scrub_from_db_table_pairs=(
                self.mock_get_scrub_from_db_table_pairs
            ),
            get_scrub_from_rows_as_fieldinfo=(
                self.mock_get_scrub_from_rows_as_fieldinfo
            ),
            get_pid_name=self.mock_get_pid_name,
            get_mandatory_scrubber_sigs=mock.Mock(return_value=set()),
            get_source_databases=mock.Mock(
                return_value=SortedSet(["source1"])
            ),
            get_patient_src_tables_with_active_dest=mock.Mock(
                return_value=SortedSet(["test_record"])
            ),
            get_rows_for_src_table=mock.Mock(
                return_value=[row_identifier_row]
            ),
            get_dest_sqla_table=mock.Mock(
                return_value=TestAnonRecord.__table__
            ),
        )

    def test_duplicate_primary_key_skipped(self) -> None:
        # row_identifier is the primary key in the destination
        # database but not in the source

        # MySQL supports ON DUPLICATE KEY UPDATE
        if self.anon_engine.dialect.name == "mysql":
            pytest.skip("Skipping different behaviour for MySQL")

        patient = TestPatientWithStringMPIDFactory()
        first_record = TestRecordFactory(pid=patient.pid)
        # Second source row that reuses the first row's row_identifier.
        TestRecordFactory(
            pid=patient.pid, row_identifier=first_record.row_identifier
        )
        self.source_dbsession.commit()

        config_overrides = dict(
            dd=self.mock_dd,
            _destination_database_url=self.anon_engine.url,
            admindb=self.mock_admindb,
            destdb=self.mock_destdb,
            sources={"source1": self.mock_sourcedb},
            rows_inserted_per_table={("source1", "test_record"): 0},
            timefield=None,
        )

        with mock.patch.multiple(
            "crate_anon.anonymise.anonymise",
            estimate_count_patients=self.mock_estimate_count_patients,
            opting_out_pid=self.mock_opting_out_pid,
        ):
            with mock.patch.multiple(
                "crate_anon.anonymise.anonymise.config",
                **config_overrides,
            ):
                with self.assertLogs(level=logging.WARNING) as logging_cm:
                    process_patient_tables(specified_pids=[patient.pid])

        self.assert_logged(
            "crate_anon.anonymise.anonymise",
            logging.WARNING,
            "Skipping record due to IntegrityError",
            logging_cm,
        )

        n_anon_records = self.anon_dbsession.query(TestAnonRecord).count()
        self.assertEqual(n_anon_records, 1)

865 

866 

867class ProcessTableTests(DatabaseTestCase, AnonymiseTestMixin): 

def setUp(self) -> None:
    """
    Create one source patient, the three hashers, and mock source and
    destination database objects wrapping the test sessions and engines.
    """
    super().setUp()

    self.patient = TestPatientFactory()
    self.source_dbsession.commit()

    # Passphrases match those in get_demo_config()
    self.pid_hasher = HmacMD5Hasher("SOME_PASSPHRASE_REPLACE_ME")
    self.mpid_hasher = HmacMD5Hasher("SOME_OTHER_PASSPHRASE_REPLACE_ME")
    self.change_hasher = HmacMD5Hasher("YETANOTHER")

    self.mock_sourcedb = mock.Mock(
        session=self.source_dbsession,
        srccfg=mock.Mock(debug_limited_tables=[]),
        engine=self.source_engine,
        metadata=SourceTestBase.metadata,
    )
    self.mock_destdb = mock.Mock(
        session=self.anon_dbsession,
        engine=self.anon_engine,
        metadata=AnonTestBase.metadata,
    )

892 

def test_record_anonymised(self) -> None:
    """
    A column with an alter method should arrive in the destination table
    holding the value that the alter method returned.
    """
    TestRecordFactory(pid=self.patient.pid, other="Personal information")
    self.source_dbsession.commit()

    mock_alter_method = mock.Mock(
        alter=mock.Mock(return_value=("ANONYMISED", False))
    )

    def dd_row(fieldname: str, **kwargs: Any) -> mock.Mock:
        # Source and destination field names are the same in this test.
        return self.mock_dd_row(
            src_field=fieldname,
            dest_table="test_anon_record",
            dest_field=fieldname,
            **kwargs,
        )

    mock_rows = [
        dd_row("pk", omit=True),
        dd_row("pid", omit=True),
        dd_row("row_identifier"),
        dd_row("other", alter_methods=[mock_alter_method]),
    ]

    mock_dd = mock.Mock(
        get_rows_for_src_table=mock.Mock(return_value=mock_rows),
        get_dest_sqla_table=mock.Mock(
            return_value=TestAnonRecord.__table__
        ),
    )

    with mock.patch.multiple(
        "crate_anon.anonymise.anonymise.config",
        dd=mock_dd,
        sources={"source": self.mock_sourcedb},
        _destination_database_url=self.anon_engine.url,
        destdb=self.mock_destdb,
        rows_inserted_per_table={("source", "test_record"): 0},
    ):
        process_table("source", "test_record", incremental=True)

    anon_record = self.anon_dbsession.query(TestAnonRecord).one()

    self.assertEqual(anon_record.other, "ANONYMISED")

948 

def test_primary_pid_altered_to_patient_rid(self) -> None:
    """
    A primary PID column should be written to the destination as the
    patient's RID, with the MRID added alongside when the config asks
    for that.
    """
    TestPidAsPkRecordFactory(pid=self.patient.pid, other="Other")
    self.source_dbsession.commit()

    pid_row = self.mock_dd_row(
        src_field="pid",
        primary_pid=True,
        dest_table="test_anon_pid_as_pk_record",
        dest_field="rid",
        add_src_hash=True,
    )
    mock_dd = mock.Mock(
        get_rows_for_src_table=mock.Mock(return_value=[pid_row]),
        get_dest_sqla_table=mock.Mock(
            return_value=TestAnonPidAsPkRecord.__table__
        ),
        get_pid_name=mock.Mock(return_value="pid"),
    )
    mock_patient = mock.Mock(
        pid=self.patient.pid,
        rid="not-a-real-rid",
        trid=123456,
        mrid="not-a-real-mrid",
    )

    config_overrides = dict(
        dd=mock_dd,
        sources={"source": self.mock_sourcedb},
        _destination_database_url=self.anon_engine.url,
        destdb=self.mock_destdb,
        rows_inserted_per_table={("source", "test_pid_as_pk_record"): 0},
        add_mrid_wherever_rid_added=True,
        master_research_id_fieldname="mrid",
    )
    with mock.patch.multiple(
        "crate_anon.anonymise.anonymise.config",
        **config_overrides,
    ):
        process_table(
            "source",
            "test_pid_as_pk_record",
            patient=mock_patient,
        )

    anon_record = self.anon_dbsession.query(TestAnonPidAsPkRecord).one()

    self.assertEqual(anon_record.rid, mock_patient.rid)
    self.assertEqual(anon_record.mrid, mock_patient.mrid)

995 

def test_master_pid_encrypted(self) -> None:
    """
    A master PID column should be written to the destination as the hash
    produced by the MPID hasher.
    """
    source_record = TestRecordFactory(pid=self.patient.pid)
    self.source_dbsession.commit()

    nhsnum_row = self.mock_dd_row(
        src_field="nhsnum",
        dest_table="test_anon_record",
        dest_field="nhshash",
        master_pid=True,
    )
    mock_dd = mock.Mock(
        get_rows_for_src_table=mock.Mock(return_value=[nhsnum_row]),
        get_dest_sqla_table=mock.Mock(
            return_value=TestAnonRecord.__table__
        ),
    )

    with mock.patch.multiple(
        "crate_anon.anonymise.anonymise.config",
        dd=mock_dd,
        sources={"source": self.mock_sourcedb},
        _destination_database_url=self.anon_engine.url,
        destdb=self.mock_destdb,
        rows_inserted_per_table={("source", "test_record"): 0},
    ):
        process_table("source", "test_record")

    anon_record = self.anon_dbsession.query(TestAnonRecord).one()
    expected_hash = self.mpid_hasher.hash(source_record.nhsnum)

    self.assertEqual(anon_record.nhshash, expected_hash)

1031 

def test_third_party_pid_encrypted(self) -> None:
    # A column flagged third_party_pid is expected to be written as
    # self.pid_hasher.hash(<source third_party_pid>).
    source_record = TestRecordFactory(pid=self.patient.pid)
    self.source_dbsession.commit()

    third_party_ddrow = self.mock_dd_row(
        src_field="third_party_pid",
        dest_table="test_anon_record",
        dest_field="third_party_pid_hash",
        third_party_pid=True,
    )

    # Stand-in data dictionary returning the single row defined above.
    dd = mock.Mock()
    dd.get_rows_for_src_table.return_value = [third_party_ddrow]
    dd.get_dest_sqla_table.return_value = TestAnonRecord.__table__

    config_overrides = {
        "dd": dd,
        "sources": {"source": self.mock_sourcedb},
        "_destination_database_url": self.anon_engine.url,
        "destdb": self.mock_destdb,
        "rows_inserted_per_table": {("source", "test_record"): 0},
    }
    with mock.patch.multiple(
        "crate_anon.anonymise.anonymise.config", **config_overrides
    ):
        process_table("source", "test_record")

    stored = self.anon_dbsession.query(TestAnonRecord).one()
    hashed_pid = self.pid_hasher.hash(source_record.third_party_pid)
    self.assertEqual(stored.third_party_pid_hash, hashed_pid)

1067 

def test_row_skipped_by_alter_method(self) -> None:
    # The "other" column carries an alter method whose alter() returns
    # (None, True); the test expects no destination row to be written.
    # NOTE(review): the second tuple element is presumably the "skip this
    # row" flag -- confirm against the alter-method implementation.
    TestRecordFactory(pid=self.patient.pid, other="Personal information")
    self.source_dbsession.commit()

    skipping_alter_method = mock.Mock()
    skipping_alter_method.alter.return_value = (None, True)

    dest_table = "test_anon_record"
    ddrows = [
        self.mock_dd_row(
            omit=True,
            src_field="pk",
            dest_table=dest_table,
            dest_field="pk",
        ),
        self.mock_dd_row(
            omit=True,
            src_field="pid",
            dest_table=dest_table,
            dest_field="pid",
        ),
        self.mock_dd_row(
            src_field="row_identifier",
            dest_table=dest_table,
            dest_field="row_identifier",
        ),
        self.mock_dd_row(
            src_field="other",
            dest_table=dest_table,
            dest_field="other",
            alter_methods=[skipping_alter_method],
        ),
    ]

    # Stand-in data dictionary returning the four rows defined above.
    dd = mock.Mock()
    dd.get_rows_for_src_table.return_value = ddrows
    dd.get_dest_sqla_table.return_value = TestAnonRecord.__table__

    config_overrides = {
        "dd": dd,
        "sources": {"source": self.mock_sourcedb},
        "_destination_database_url": self.anon_engine.url,
        "destdb": self.mock_destdb,
        "rows_inserted_per_table": {("source", "test_record"): 0},
    }
    with mock.patch.multiple(
        "crate_anon.anonymise.anonymise.config", **config_overrides
    ):
        process_table("source", "test_record")

    leftover = self.anon_dbsession.query(TestAnonRecord).one_or_none()
    self.assertIsNone(leftover)

1123 

def test_skipped_by_free_text_limit(self) -> None:
    # The only data dictionary row declares src_textlength=100 while the
    # table is processed with free_text_limit=50; the test expects nothing
    # to reach the destination table.
    TestRecordFactory(pid=self.patient.pid)
    self.source_dbsession.commit()

    long_text_ddrow = self.mock_dd_row(
        src_field="other",
        dest_table="test_anon_record",
        dest_field="other",
        src_textlength=100,
    )

    # Stand-in data dictionary returning the single row defined above.
    dd = mock.Mock()
    dd.get_rows_for_src_table.return_value = [long_text_ddrow]
    dd.get_dest_sqla_table.return_value = TestAnonRecord.__table__

    config_overrides = {
        "dd": dd,
        "sources": {"source": self.mock_sourcedb},
        "_destination_database_url": self.anon_engine.url,
        "destdb": self.mock_destdb,
        "rows_inserted_per_table": {("source", "test_record"): 0},
    }
    with mock.patch.multiple(
        "crate_anon.anonymise.anonymise.config", **config_overrides
    ):
        process_table("source", "test_record", free_text_limit=50)

    leftover = self.anon_dbsession.query(TestAnonRecord).one_or_none()
    self.assertIsNone(leftover)

1158 

def test_skipped_when_scrubbed_excluded(self) -> None:
    # The only data dictionary row is textual and marked being_scrubbed;
    # with exclude_scrubbed_fields=True the test expects nothing to reach
    # the destination table.
    TestRecordFactory(pid=self.patient.pid)
    self.source_dbsession.commit()

    scrubbed_ddrow = self.mock_dd_row(
        src_field="other",
        dest_table="test_anon_record",
        dest_field="other",
        src_is_textual=True,
        being_scrubbed=True,
    )

    # Stand-in data dictionary returning the single row defined above.
    dd = mock.Mock()
    dd.get_rows_for_src_table.return_value = [scrubbed_ddrow]
    dd.get_dest_sqla_table.return_value = TestAnonRecord.__table__

    config_overrides = {
        "dd": dd,
        "sources": {"source": self.mock_sourcedb},
        "_destination_database_url": self.anon_engine.url,
        "destdb": self.mock_destdb,
        "rows_inserted_per_table": {("source", "test_record"): 0},
    }
    with mock.patch.multiple(
        "crate_anon.anonymise.anonymise.config", **config_overrides
    ):
        process_table("source", "test_record", exclude_scrubbed_fields=True)

    leftover = self.anon_dbsession.query(TestAnonRecord).one_or_none()
    self.assertIsNone(leftover)

1194 

def test_unchanged_record_matching_hash_with_plain_rid_skipped(
    self,
) -> None:
    # The destination is seeded with a row whose _src_hash equals
    # change_hasher.hash(repr([row_identifier])) for the source record.
    # An incremental run is then expected to log the "identical by hash"
    # skip message at DEBUG level.
    source_record = TestRecordFactory(pid=self.patient.pid)
    self.source_dbsession.commit()

    identifier = source_record.row_identifier
    unchanged_hash = self.change_hasher.hash(
        repr([source_record.row_identifier])
    )
    TestAnonRecordFactory(row_identifier=identifier, _src_hash=unchanged_hash)
    self.anon_dbsession.commit()

    identifier_ddrow = self.mock_dd_row(
        src_field="row_identifier",
        dest_table="test_anon_record",
        dest_field="row_identifier",
        add_src_hash=True,
    )

    # Stand-in data dictionary returning the single row defined above.
    dd = mock.Mock()
    dd.get_rows_for_src_table.return_value = [identifier_ddrow]
    dd.get_dest_sqla_table.return_value = TestAnonRecord.__table__

    config_overrides = {
        "dd": dd,
        "sources": {"source": self.mock_sourcedb},
        "_destination_database_url": self.anon_engine.url,
        "destdb": self.mock_destdb,
        "rows_inserted_per_table": {("source", "test_record"): 0},
    }
    with mock.patch.multiple(
        "crate_anon.anonymise.anonymise.config", **config_overrides
    ), self.assertLogs(level=logging.DEBUG) as captured_logs:
        process_table("source", "test_record", incremental=True)

    self.assert_logged(
        "crate_anon.anonymise.anonymise",
        logging.DEBUG,
        "... ... skipping unchanged record (identical by hash): ",
        captured_logs,
    )

1239 

def test_unchanged_record_matching_hash_with_hashed_rid_skipped(
    self,
) -> None:
    # Same scenario as a hash-matched skip, but the destination row is
    # keyed on rid = pid_hasher.hash(<patient pid>) and the source column
    # is flagged primary_pid. An incremental run is expected to log the
    # "identical by hash" skip message at DEBUG level.
    source_record = TestPidAsPkRecordFactory(
        pid=self.patient.pid, other="Other"
    )
    self.source_dbsession.commit()

    hashed_rid = self.pid_hasher.hash(self.patient.pid)
    unchanged_hash = self.change_hasher.hash(repr([source_record.pid]))
    TestAnonPidAsPkRecordFactory(rid=hashed_rid, _src_hash=unchanged_hash)
    self.anon_dbsession.commit()

    pid_ddrow = self.mock_dd_row(
        src_field="pid",
        primary_pid=True,
        dest_table="test_anon_pid_as_pk_record",
        dest_field="rid",
        add_src_hash=True,
    )

    # Stand-in data dictionary: only the three lookups below are configured.
    dd = mock.Mock()
    dd.get_rows_for_src_table.return_value = [pid_ddrow]
    dd.get_dest_sqla_table.return_value = TestAnonPidAsPkRecord.__table__
    dd.get_pid_name.return_value = "pid"

    patient = mock.Mock()
    patient.pid = self.patient.pid

    config_overrides = {
        "dd": dd,
        "sources": {"source": self.mock_sourcedb},
        "_destination_database_url": self.anon_engine.url,
        "destdb": self.mock_destdb,
        "rows_inserted_per_table": {
            ("source", "test_pid_as_pk_record"): 0,
        },
    }
    with mock.patch.multiple(
        "crate_anon.anonymise.anonymise.config", **config_overrides
    ), self.assertLogs(level=logging.DEBUG) as captured_logs:
        process_table(
            "source",
            "test_pid_as_pk_record",
            patient=patient,
            incremental=True,
        )

    self.assert_logged(
        "crate_anon.anonymise.anonymise",
        logging.DEBUG,
        "... ... skipping unchanged record (identical by hash): ",
        captured_logs,
    )

1292 

def test_constant_record_matching_pk_skipped(self) -> None:
    # The destination already holds a row with the same row_identifier as
    # the source record, and the data dictionary row is flagged constant.
    # An incremental run is expected to log the "identical by PK and marked
    # as constant" skip message at DEBUG level.
    source_record = TestRecordFactory(pid=self.patient.pid)
    self.source_dbsession.commit()
    TestAnonRecordFactory(row_identifier=source_record.row_identifier)
    self.anon_dbsession.commit()

    constant_ddrow = self.mock_dd_row(
        src_field="row_identifier",
        dest_table="test_anon_record",
        dest_field="row_identifier",
        constant=True,
    )

    # Stand-in data dictionary returning the single row defined above.
    dd = mock.Mock()
    dd.get_rows_for_src_table.return_value = [constant_ddrow]
    dd.get_dest_sqla_table.return_value = TestAnonRecord.__table__

    config_overrides = {
        "dd": dd,
        "sources": {"source": self.mock_sourcedb},
        "_destination_database_url": self.anon_engine.url,
        "destdb": self.mock_destdb,
        "rows_inserted_per_table": {("source", "test_record"): 0},
    }
    with mock.patch.multiple(
        "crate_anon.anonymise.anonymise.config", **config_overrides
    ), self.assertLogs(level=logging.DEBUG) as captured_logs:
        process_table("source", "test_record", incremental=True)

    expected_message = (
        "... ... skipping unchanged record (identical by PK and "
        "marked as constant): "
    )
    self.assert_logged(
        "crate_anon.anonymise.anonymise",
        logging.DEBUG,
        expected_message,
        captured_logs,
    )

1337 

def test_does_nothing_if_all_ddrows_omitted(self) -> None:
    # Every data dictionary row for the table is flagged omit=True; an
    # incremental run is expected to log "all columns omitted" at DEBUG
    # level.
    TestRecordFactory(pid=self.patient.pid)
    self.source_dbsession.commit()

    # Each column maps to a destination field of the same name.
    omitted_ddrows = [
        self.mock_dd_row(
            omit=True,
            src_field=field_name,
            dest_table="test_anon_record",
            dest_field=field_name,
            add_src_hash=True,
        )
        for field_name in ("pk", "pid", "row_identifier")
    ]

    # Stand-in data dictionary returning the three rows built above.
    dd = mock.Mock()
    dd.get_rows_for_src_table.return_value = omitted_ddrows
    dd.get_dest_sqla_table.return_value = TestAnonRecord.__table__

    config_overrides = {
        "dd": dd,
        "sources": {"source": self.mock_sourcedb},
        "_destination_database_url": self.anon_engine.url,
        "destdb": self.mock_destdb,
        "rows_inserted_per_table": {("source", "test_record"): 0},
    }
    with mock.patch.multiple(
        "crate_anon.anonymise.anonymise.config", **config_overrides
    ), self.assertLogs(level=logging.DEBUG) as captured_logs:
        process_table("source", "test_record", incremental=True)

    self.assert_logged(
        "crate_anon.anonymise.anonymise",
        logging.DEBUG,
        "... ... all columns omitted",
        captured_logs,
    )

1390 

def test_row_skipped_by_value(self) -> None:
    # The data dictionary row's skip_row_by_value() is mocked to return
    # True; the test expects the inclusion/exclusion skip message at DEBUG
    # level and no row in the destination table.
    TestRecordFactory(pid=self.patient.pid)
    self.source_dbsession.commit()

    always_skip = mock.Mock(return_value=True)
    skipping_ddrow = self.mock_dd_row(
        src_field="row_identifier",
        dest_table="test_anon_record",
        dest_field="row_identifier",
        skip_row_by_value=always_skip,
    )

    # Stand-in data dictionary returning the single row defined above.
    dd = mock.Mock()
    dd.get_rows_for_src_table.return_value = [skipping_ddrow]
    dd.get_dest_sqla_table.return_value = TestAnonRecord.__table__

    config_overrides = {
        "dd": dd,
        "sources": {"source": self.mock_sourcedb},
        "_destination_database_url": self.anon_engine.url,
        "destdb": self.mock_destdb,
        "rows_inserted_per_table": {("source", "test_record"): 0},
    }
    with mock.patch.multiple(
        "crate_anon.anonymise.anonymise.config", **config_overrides
    ), self.assertLogs(level=logging.DEBUG) as captured_logs:
        process_table("source", "test_record")

    self.assert_logged(
        "crate_anon.anonymise.anonymise",
        logging.DEBUG,
        "... ... skipping row based on inclusion/exclusion values",
        captured_logs,
    )

    leftover = self.anon_dbsession.query(TestAnonRecord).one_or_none()
    self.assertIsNone(leftover)