Coverage for anonymise/constants.py: 100%

255 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1""" 

2crate_anon/anonymise/constants.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Shared constants for CRATE anonymiser.** 

27 

28""" 

29 

30import calendar 

31from enum import unique 

32 

33from sqlalchemy import Integer 

34from cardinal_pythonlib.enumlike import StrEnum 

35 

36from crate_anon.version import CRATE_VERSION, CRATE_VERSION_DATE 

37from crate_anon.nlp_manager.constants import DatabaseConfigKeys 

38 

39 

40# ============================================================================= 

41# Logging 

42# ============================================================================= 

43 

# strftime-style pattern for date/time stamps (this is the "Logging" section).
LOG_DATEFMT = "%Y-%m-%d %H:%M:%S"

# Maps each standard logging level name to a colour specification string.
# NOTE(review): presumably consumed by a colourising log formatter -- confirm
# at the point of use.
LOG_COLORS = {
    "DEBUG": "cyan",
    "INFO": "green",
    "WARNING": "yellow",
    "ERROR": "red",
    "CRITICAL": "red,bg_white",
}

53 

54 

55# ============================================================================= 

56# Cosmetic 

57# ============================================================================= 

58 

# Cosmetic separators: twenty rule characters followed by a single space, so
# that text can be appended directly after them.
BIGSEP = "=" * 20 + " "
SEP = "-" * 20 + " "

61 

62 

63# ============================================================================= 

64# Defaults for command-line options 

65# ============================================================================= 

66 

# Defaults for command-line options (both are 100k).
DEFAULT_REPORT_EVERY = 100_000
DEFAULT_CHUNKSIZE = 100_000

69 

70 

71# ============================================================================= 

72# Environment 

73# ============================================================================= 

74 

# Name of an environment variable (not its value).
# NOTE(review): presumably the variable that points at the anonymiser config
# file -- confirm where it is read.
ANON_CONFIG_ENV_VAR = "CRATE_ANON_CONFIG"

76 

77# ============================================================================= 

78# Data dictionary 

79# ============================================================================= 

80 

# strftime/strptime pattern; "%z" yields a numeric UTC offset without a colon.
DATEFORMAT_ISO8601 = "%Y-%m-%dT%H:%M:%S%z"  # e.g. 2013-07-24T20:04:07+0100
DEFAULT_INDEX_LEN = 20  # for data types where it's mandatory

# Type name as a plain string.
LONGTEXT = "LONGTEXT"

# A string of ten "9" characters, i.e. the largest 10-digit number as text.
MAX_PID_STR = "9" * 10  # e.g. NHS numbers are 10-digit

87 

# Column type used for TRID values: SQLAlchemy's Integer.
TridType = Integer
# 2**31 - 1 == 2147483647, the upper end of the signed INT range.
MAX_TRID = 2**31 - 1

90# https://dev.mysql.com/doc/refman/5.0/en/numeric-type-overview.html 

91# Maximum INT UNSIGNED is 4294967295 == 2 ** 32 - 1. 

92# INT range is -2147483648 == -(2 ** 31) to 

93# +2147483647 == 2 ** 31 - 1 == 2.1 billion 

94# ... note that this is inadequate for 10-digit NHS numbers. 

95# Maximum BIGINT UNSIGNED is 18446744073709551615 == 2 ** 64 - 1. 

96# BIGINT range is -9223372036854775808 == -(2 ** 63) to 

97# +9223372036854775807 == 2 ** 63 - 1 

98 

99 

# When scrub_all_dates is True and the replacement text is a date format
# string, allow these directives.
# https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
DATE_BLURRING_DIRECTIVES = (
    "b",  # Month as locale's abbreviated name
    "B",  # Month as locale's full name
    "m",  # Month as zero-padded decimal number
    "Y",  # Year with century as decimal number
    "y",  # Year without century as zero-padded decimal number
    # Among things that are not currently supported: %% (literal %).
)
# The same directives, each prefixed with "%", as one comma-separated string
# ("%b, %B, %m, %Y, %y").
DATE_BLURRING_DIRECTIVES_CSV = ", ".join(
    f"%{d}" for d in DATE_BLURRING_DIRECTIVES
)

114 

# https://stackoverflow.com/questions/3418050/month-name-to-month-number-and-vice-versa-in-python  # noqa: E501
# Maps the first three characters of each full month name to its 1-based month
# number. calendar.month_name[0] is an empty string, hence the "if month"
# filter. Names come from the locale in force when this module is imported.
MONTH_3_LETTER_INDEX = {
    # See _month_word_regex_fragment() in anonregex.py
    # Assuming this may not be the same as calendar.month_abbr in some locales
    month[:3]: index
    for index, month in enumerate(calendar.month_name)
    if month
}

123 

124 

@unique
class AlterMethodType(StrEnum):
    """
    Identifiers for alteration methods. Each member's value is the string
    form of the method; ``@unique`` guarantees that no two members share a
    value.
    """

    BINARY_TO_TEXT = "binary_to_text"
    FILENAME_FORMAT_TO_TEXT = "filename_format_to_text"  # new in v0.18.18
    FILENAME_TO_TEXT = "filename_to_text"
    HASH = "hash"
    # HTML_ESCAPE = "html_escape"
    HTML_UNESCAPE = "html_unescape"
    HTML_UNTAG = "html_untag"
    SCRUBIN = "scrub"
    SKIP_IF_TEXT_EXTRACT_FAILS = "skip_if_extract_fails"
    TRUNCATEDATE = "truncate_date"

137 

138 

@unique
class Decision(StrEnum):
    """
    The two possible decisions: omit or include.
    """

    # NOTE(review): the two values differ in case ("OMIT" vs "include"). They
    # are kept exactly as they are because other code or files may depend on
    # the literal strings -- confirm before normalising.
    OMIT = "OMIT"
    INCLUDE = "include"

143 

144 

@unique
class IndexType(StrEnum):
    """
    Index kinds, each represented by a one-character code; "no index" is the
    empty string.
    """

    NONE = ""
    NORMAL = "I"
    UNIQUE = "U"
    FULLTEXT = "F"

151 

152 

@unique
class ScrubMethod(StrEnum):
    """
    Identifiers for scrubbing methods. Each member's value is its string form.
    Note that the member ``NUMERIC`` has the value ``"number"``.
    """

    WORDS = "words"
    PHRASE = "phrase"
    PHRASE_UNLESS_NUMERIC = "phrase_unless_numeric"
    NUMERIC = "number"
    DATE = "date"
    CODE = "code"

161 

162 

@unique
class ScrubSrc(StrEnum):
    """
    Identifiers for scrub sources: the patient, a third party, or a third
    party cross-referenced by PID.
    """

    PATIENT = "patient"
    THIRDPARTY = "thirdparty"
    THIRDPARTY_XREF_PID = "thirdparty_xref_pid"

168 

169 

@unique
class SrcFlag(StrEnum):
    """
    Source flags, each encoded as a single character (letters plus ``*`` and
    ``!``).
    """

    PK = "K"
    NOT_NULL = "N"
    ADD_SRC_HASH = "H"
    PRIMARY_PID = "P"
    DEFINES_PRIMARY_PIDS = "*"
    MASTER_PID = "M"
    CONSTANT = "C"
    ADDITION_ONLY = "A"
    OPT_OUT = "!"
    REQUIRED_SCRUBBER = "R"

182 

183 

184# ============================================================================= 

185# Databases 

186# ============================================================================= 

187 

CHARSET = "utf8"
# Keyword arguments carrying MySQL-specific table options; the character set
# is taken from CHARSET above.
TABLE_KWARGS = {
    # MySQL:
    "mysql_charset": CHARSET,
    "mysql_engine": "InnoDB",
}
COMMENT = "comment"

MYSQL_MAX_IDENTIFIER_LENGTH = 64
# MySQL: 64 -- http://dev.mysql.com/doc/refman/5.7/en/identifiers.html
SQLSERVER_MAX_IDENTIFIER_LENGTH = 128
# Microsoft SQL Server: 128 --
# https://docs.microsoft.com/en-us/sql/relational-databases/databases/database-identifiers  # noqa: E501

201 

202 

203# ============================================================================= 

204# Config keys 

205# ============================================================================= 

206 

207 

208class AnonymiseConfigKeys: 

209 # Sections 

210 SECTION_MAIN = "main" 

211 SECTION_EXTRA_REGEXES = "extra_regexes" 

212 

213 # Data dictionary 

214 DATA_DICTIONARY_FILENAME = "data_dictionary_filename" 

215 

216 # Critical field types 

217 SQLATYPE_MPID = "sqlatype_mpid" 

218 SQLATYPE_PID = "sqlatype_pid" 

219 

220 # Encryption phrases/passwords 

221 CHANGE_DETECTION_ENCRYPTION_PHRASE = "change_detection_encryption_phrase" 

222 EXTRA_HASH_CONFIG_SECTIONS = "extra_hash_config_sections" 

223 HASH_METHOD = "hash_method" 

224 MASTER_PATIENT_ID_ENCRYPTION_PHRASE = "master_patient_id_encryption_phrase" 

225 PER_TABLE_PATIENT_ID_ENCRYPTION_PHRASE = ( 

226 "per_table_patient_id_encryption_phrase" 

227 ) 

228 

229 # Text extraction 

230 EXTRACT_TEXT_EXTENSIONS_CASE_SENSITIVE = ( 

231 "extract_text_extensions_case_sensitive" 

232 ) 

233 EXTRACT_TEXT_EXTENSIONS_PERMITTED = "extract_text_extensions_permitted" 

234 EXTRACT_TEXT_EXTENSIONS_PROHIBITED = "extract_text_extensions_prohibited" 

235 EXTRACT_TEXT_PLAIN = "extract_text_plain" 

236 EXTRACT_TEXT_WIDTH = "extract_text_width" 

237 

238 # Anonymisation 

239 ALLOWLIST_FILENAMES = "allowlist_filenames" 

240 ALLOW_NO_PATIENT_INFO = "allow_no_patient_info" 

241 ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY = ( 

242 "anonymise_codes_at_word_boundaries_only" 

243 ) 

244 ANONYMISE_CODES_AT_NUMERIC_BOUNDARIES_ONLY = ( 

245 "anonymise_codes_at_numeric_boundaries_only" 

246 ) 

247 ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY = ( 

248 "anonymise_dates_at_word_boundaries_only" 

249 ) 

250 ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY = ( 

251 "anonymise_numbers_at_numeric_boundaries_only" 

252 ) 

253 ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY = ( 

254 "anonymise_numbers_at_word_boundaries_only" 

255 ) 

256 ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY = ( 

257 "anonymise_strings_at_word_boundaries_only" 

258 ) 

259 DENYLIST_FILENAMES = "denylist_filenames" 

260 DENYLIST_FILES_AS_PHRASES = "denylist_files_as_phrases" 

261 DENYLIST_USE_REGEX = "denylist_use_regex" 

262 DEPRECATED_BLACKLIST_FILENAMES = "blacklist_filenames" 

263 DEPRECATED_WHITELIST_FILENAMES = "whitelist_filenames" 

264 MIN_STRING_LENGTH_FOR_ERRORS = "min_string_length_for_errors" 

265 MIN_STRING_LENGTH_TO_SCRUB_WITH = "min_string_length_to_scrub_with" 

266 NONSPECIFIC_SCRUBBER_FIRST = "nonspecific_scrubber_first" 

267 PHRASE_ALTERNATIVE_WORD_FILENAMES = "phrase_alternative_word_filenames" 

268 REPLACE_ALL_DATES_WITH = "replace_all_dates_with" 

269 REPLACE_NONSPECIFIC_INFO_WITH = "replace_nonspecific_info_with" 

270 REPLACE_PATIENT_INFO_WITH = "replace_patient_info_with" 

271 REPLACE_THIRD_PARTY_INFO_WITH = "replace_third_party_info_with" 

272 SCRUB_ALL_DATES = "scrub_all_dates" 

273 SCRUB_ALL_EMAIL_ADDRESSES = "scrub_all_email_addresses" 

274 SCRUB_ALL_NUMBERS_OF_N_DIGITS = "scrub_all_numbers_of_n_digits" 

275 SCRUB_ALL_UK_POSTCODES = "scrub_all_uk_postcodes" 

276 SCRUB_STRING_SUFFIXES = "scrub_string_suffixes" 

277 STRING_MAX_REGEX_ERRORS = "string_max_regex_errors" 

278 THIRDPARTY_XREF_MAX_DEPTH = "thirdparty_xref_max_depth" 

279 TIMEFIELD_NAME = "timefield_name" 

280 

281 # Output fields and formatting 

282 RESEARCH_ID_FIELDNAME = "research_id_fieldname" 

283 TRID_FIELDNAME = "trid_fieldname" 

284 MASTER_RESEARCH_ID_FIELDNAME = "master_research_id_fieldname" 

285 ADD_MRID_WHEREVER_RID_ADDED = "add_mrid_wherever_rid_added" 

286 SOURCE_HASH_FIELDNAME = "source_hash_fieldname" 

287 

288 # Destination database configuration 

289 MAX_ROWS_BEFORE_COMMIT = "max_rows_before_commit" 

290 MAX_BYTES_BEFORE_COMMIT = "max_bytes_before_commit" 

291 TEMPORARY_TABLENAME = "temporary_tablename" 

292 

293 # Databases 

294 ADMIN_DATABASE = "admin_database" 

295 DESTINATION_DATABASE = "destination_database" 

296 SOURCE_DATABASES = "source_databases" 

297 

298 # Processing options 

299 DEBUG_MAX_N_PATIENTS = "debug_max_n_patients" 

300 DEBUG_PID_LIST = "debug_pid_list" 

301 

302 # Opting out 

303 OPTOUT_COL_VALUES = "optout_col_values" 

304 OPTOUT_MPID_FILENAMES = "optout_mpid_filenames" 

305 OPTOUT_PID_FILENAMES = "optout_pid_filenames" 

306 

307 

308class AnonymiseConfigDefaults: 

309 # Critical field types 

310 SQLATYPE_MPID = "BigInteger" 

311 SQLATYPE_PID = "BigInteger" 

312 

313 # Encryption phrases/passwords 

314 HASH_METHOD = "HMAC_MD5" 

315 

316 # Text extraction 

317 EXTRACT_TEXT_EXTENSIONS_CASE_SENSITIVE = False 

318 EXTRACT_TEXT_PLAIN = True 

319 EXTRACT_TEXT_WIDTH = 80 

320 

321 # Anonymisation 

322 ALLOW_NO_PATIENT_INFO = False 

323 ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY = True 

324 ANONYMISE_CODES_AT_NUMERIC_BOUNDARIES_ONLY = True 

325 ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY = True 

326 ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY = True 

327 ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY = False 

328 ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY = True 

329 DENYLIST_FILES_AS_PHRASES = False 

330 DENYLIST_USE_REGEX = False 

331 MIN_STRING_LENGTH_FOR_ERRORS = 3 

332 MIN_STRING_LENGTH_TO_SCRUB_WITH = 2 

333 NONSPECIFIC_SCRUBBER_FIRST = False 

334 REPLACE_ALL_DATES_WITH = "[~~~]" 

335 REPLACE_NONSPECIFIC_INFO_WITH = "[~~~]" 

336 REPLACE_PATIENT_INFO_WITH = "[__PPP__]" 

337 REPLACE_THIRD_PARTY_INFO_WITH = "[__TTT__]" 

338 SCRUB_ALL_DATES = False 

339 SCRUB_ALL_EMAIL_ADDRESSES = False 

340 SCRUB_ALL_UK_POSTCODES = False 

341 STRING_MAX_REGEX_ERRORS = 0 

342 THIRDPARTY_XREF_MAX_DEPTH = 1 

343 TIMEFIELD_NAME = "_when_processed_utc" 

344 

345 # Output fields and formatting 

346 RESEARCH_ID_FIELDNAME = "rid" 

347 TRID_FIELDNAME = "trid" 

348 MASTER_RESEARCH_ID_FIELDNAME = "mrid" 

349 ADD_MRID_WHEREVER_RID_ADDED = True 

350 SOURCE_HASH_FIELDNAME = "_src_hash" 

351 

352 # Destination database configuration 

353 MAX_ROWS_BEFORE_COMMIT = 1000 

354 MAX_BYTES_BEFORE_COMMIT = 80 * 1024 * 1024 # 80 Mb 

355 TEMPORARY_TABLENAME = "_crate_temp_table" 

356 

357 # Processing options 

358 DEBUG_MAX_N_PATIENTS = 0 

359 

360 

class AnonymiseDatabaseSafeConfigKeys:
    """
    Non-sensitive config keys relating to a specific database.

    Each attribute's value is the literal key string. Attributes prefixed
    ``DEPRECATED_`` hold older key spellings (blacklist/whitelist).
    """

    DDGEN_ADD_PER_TABLE_PIDS_TO_SCRUBBER = (
        "ddgen_add_per_table_pids_to_scrubber"
    )
    DDGEN_ADDITION_ONLY = "ddgen_addition_only"
    DDGEN_ADDITION_ONLY_TABLES = "ddgen_addition_only_tables"
    DDGEN_ALLOW_FULLTEXT_INDEXING = "ddgen_allow_fulltext_indexing"
    DDGEN_APPEND_SOURCE_INFO_TO_COMMENT = "ddgen_append_source_info_to_comment"
    DDGEN_BINARY_TO_TEXT_FIELD_PAIRS = "ddgen_binary_to_text_field_pairs"
    DDGEN_CONSTANT_CONTENT = "ddgen_constant_content"
    DDGEN_CONSTANT_CONTENT_TABLES = "ddgen_constant_content_tables"
    DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE = (
        "ddgen_convert_odd_chars_to_underscore"
    )
    DDGEN_DELETION_POSSIBLE_TABLES = "ddgen_deletion_possible_tables"
    DDGEN_EXTRA_HASH_FIELDS = "ddgen_extra_hash_fields"
    DDGEN_FIELD_ALLOWLIST = "ddgen_field_allowlist"
    DDGEN_FIELD_DENYLIST = "ddgen_field_denylist"
    DDGEN_FILENAME_TO_TEXT_FIELDS = "ddgen_filename_to_text_fields"
    DDGEN_FORCE_LOWER_CASE = "ddgen_force_lower_case"
    DDGEN_FREETEXT_INDEX_MIN_LENGTH = "ddgen_freetext_index_min_length"
    DDGEN_INCLUDE_FIELDS = "ddgen_include_fields"
    DDGEN_INDEX_FIELDS = "ddgen_index_fields"
    DDGEN_MASTER_PID_FIELDNAME = "ddgen_master_pid_fieldname"
    DDGEN_MIN_LENGTH_FOR_SCRUBBING = "ddgen_min_length_for_scrubbing"
    DDGEN_NONCONSTANT_CONTENT_TABLES = "ddgen_nonconstant_content_tables"
    DDGEN_OMIT_BY_DEFAULT = "ddgen_omit_by_default"
    DDGEN_OMIT_FIELDS = "ddgen_omit_fields"
    DDGEN_PATIENT_OPT_OUT_FIELDS = "ddgen_patient_opt_out_fields"
    DDGEN_PER_TABLE_PID_FIELD = "ddgen_per_table_pid_field"
    DDGEN_PID_DEFINING_FIELDNAMES = "ddgen_pid_defining_fieldnames"
    DDGEN_PK_FIELDS = "ddgen_pk_fields"
    DDGEN_PREFER_ORIGINAL_PK = "ddgen_prefer_original_pk"
    DDGEN_RENAME_TABLES_REMOVE_SUFFIXES = "ddgen_rename_tables_remove_suffixes"
    DDGEN_REQUIRED_SCRUBSRC_FIELDS = "ddgen_required_scrubsrc_fields"
    DDGEN_SAFE_FIELDS_EXEMPT_FROM_SCRUBBING = (
        "ddgen_safe_fields_exempt_from_scrubbing"
    )
    DDGEN_SCRUBMETHOD_CODE_FIELDS = "ddgen_scrubmethod_code_fields"
    DDGEN_SCRUBMETHOD_DATE_FIELDS = "ddgen_scrubmethod_date_fields"
    DDGEN_SCRUBMETHOD_NUMBER_FIELDS = "ddgen_scrubmethod_number_fields"
    DDGEN_SCRUBMETHOD_PHRASE_FIELDS = "ddgen_scrubmethod_phrase_fields"
    DDGEN_SCRUBSRC_PATIENT_FIELDS = "ddgen_scrubsrc_patient_fields"
    DDGEN_SCRUBSRC_THIRDPARTY_FIELDS = "ddgen_scrubsrc_thirdparty_fields"
    DDGEN_SCRUBSRC_THIRDPARTY_XREF_PID_FIELDS = (
        "ddgen_scrubsrc_thirdparty_xref_pid_fields"
    )
    DDGEN_SKIP_ROW_IF_EXTRACT_TEXT_FAILS_FIELDS = (
        "ddgen_skip_row_if_extract_text_fails_fields"
    )
    DDGEN_TABLE_ALLOWLIST = "ddgen_table_allowlist"
    DDGEN_TABLE_DEFINES_PIDS = "ddgen_table_defines_pids"
    DDGEN_TABLE_DENYLIST = "ddgen_table_denylist"
    DDGEN_TABLE_REQUIRE_FIELD_ABSOLUTE = "ddgen_table_require_field_absolute"
    DDGEN_TABLE_REQUIRE_FIELD_CONDITIONAL = (
        "ddgen_table_require_field_conditional"
    )
    DDGEN_TRUNCATE_DATE_FIELDS = "ddgen_truncate_date_fields"
    DEBUG_LIMITED_TABLES = "debug_limited_tables"
    DEBUG_ROW_LIMIT = "debug_row_limit"
    DEPRECATED_DDGEN_FIELD_BLACKLIST = "ddgen_field_blacklist"
    DEPRECATED_DDGEN_FIELD_WHITELIST = "ddgen_field_whitelist"
    DEPRECATED_DDGEN_TABLE_BLACKLIST = "ddgen_table_blacklist"
    DEPRECATED_DDGEN_TABLE_WHITELIST = "ddgen_table_whitelist"

429 

430 

class AnonymiseDatabaseSafeConfigDefaults:
    """
    Default values for a subset of the keys in
    AnonymiseDatabaseSafeConfigKeys; attribute names match between the two
    classes.
    """

    DDGEN_ADD_PER_TABLE_PIDS_TO_SCRUBBER = False
    DDGEN_ADDITION_ONLY = False
    DDGEN_ALLOW_FULLTEXT_INDEXING = True
    DDGEN_APPEND_SOURCE_INFO_TO_COMMENT = True
    DDGEN_CONSTANT_CONTENT = False
    DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE = True
    DDGEN_FORCE_LOWER_CASE = False
    DDGEN_FREETEXT_INDEX_MIN_LENGTH = 1000
    DDGEN_MIN_LENGTH_FOR_SCRUBBING = 50
    DDGEN_OMIT_BY_DEFAULT = True
    DDGEN_PREFER_ORIGINAL_PK = False
    DEBUG_ROW_LIMIT = 0

448 

449 

class AnonymiseColumnComments:
    """
    Fixed comment text for columns.
    """

    TIMEFIELD_COMMENT = "Date/time that CRATE processed the source row (UTC)"

452 

453 

class HashConfigKeys:
    """
    Config file keys for defining extra hashers.
    """

    HASH_METHOD = "hash_method"
    SECRET_KEY = "secret_key"

461 

462 

463# ============================================================================= 

464# Demo config 

465# ============================================================================= 

466# This does not need to vary with Docker status. 

467 

# Short aliases for the key/default holder classes, so that the placeholders
# inside DEMO_CONFIG stay compact.
# DEMO_CONFIG is a raw f-string: every {...} placeholder is filled in from
# those classes (and from CRATE_VERSION / CRATE_VERSION_DATE) when this module
# is imported. The @@name@@ markers are plain text as far as this module is
# concerned.
# NOTE(review): presumably the @@name@@ markers are substituted later by
# whatever writes out a concrete config file -- confirm at the call site.
# NOTE(review): the source-database list offers a commented-out "sourcedb2",
# but the second example section is headed "[mysourcedb2]" -- confirm whether
# the two names are meant to match.
468_AK = AnonymiseConfigKeys 

469_DA = AnonymiseConfigDefaults 

470_DK = DatabaseConfigKeys 

471_SK = AnonymiseDatabaseSafeConfigKeys 

472_DS = AnonymiseDatabaseSafeConfigDefaults 

473# noinspection PyPep8 

474DEMO_CONFIG = rf"""# Configuration file for CRATE anonymiser (crate_anonymise). 

475# Version {CRATE_VERSION} ({CRATE_VERSION_DATE}). 

476# 

477# SEE HELP FOR DETAILS. 

478 

479# ============================================================================= 

480# Main settings 

481# ============================================================================= 

482 

483[{_AK.SECTION_MAIN}] 

484 

485# ----------------------------------------------------------------------------- 

486# Data dictionary 

487# ----------------------------------------------------------------------------- 

488 

489{_AK.DATA_DICTIONARY_FILENAME} = @@data_dictionary_filename@@ 

490 

491# ----------------------------------------------------------------------------- 

492# Critical field types 

493# ----------------------------------------------------------------------------- 

494 

495{_AK.SQLATYPE_PID} = 

496{_AK.SQLATYPE_MPID} = 

497 

498# ----------------------------------------------------------------------------- 

499# Encryption phrases/passwords 

500# ----------------------------------------------------------------------------- 

501 

502{_AK.HASH_METHOD} = {_DA.HASH_METHOD} 

503{_AK.PER_TABLE_PATIENT_ID_ENCRYPTION_PHRASE} = @@per_table_patient_id_encryption_phrase@@ 

504{_AK.MASTER_PATIENT_ID_ENCRYPTION_PHRASE} = @@master_patient_id_encryption_phrase@@ 

505{_AK.CHANGE_DETECTION_ENCRYPTION_PHRASE} = @@change_detection_encryption_phrase@@ 

506{_AK.EXTRA_HASH_CONFIG_SECTIONS} = 

507 

508# ----------------------------------------------------------------------------- 

509# Text extraction 

510# ----------------------------------------------------------------------------- 

511 

512{_AK.EXTRACT_TEXT_EXTENSIONS_PERMITTED} = 

513{_AK.EXTRACT_TEXT_EXTENSIONS_PROHIBITED} = 

514{_AK.EXTRACT_TEXT_EXTENSIONS_CASE_SENSITIVE} = {_DA.EXTRACT_TEXT_EXTENSIONS_CASE_SENSITIVE} 

515{_AK.EXTRACT_TEXT_PLAIN} = {_DA.EXTRACT_TEXT_PLAIN} 

516{_AK.EXTRACT_TEXT_WIDTH} = {_DA.EXTRACT_TEXT_WIDTH} 

517 

518# ----------------------------------------------------------------------------- 

519# Anonymisation 

520# ----------------------------------------------------------------------------- 

521 

522{_AK.ALLOW_NO_PATIENT_INFO} = {_DA.ALLOW_NO_PATIENT_INFO} 

523{_AK.REPLACE_ALL_DATES_WITH} = {_DA.REPLACE_ALL_DATES_WITH} 

524{_AK.REPLACE_PATIENT_INFO_WITH} = {_DA.REPLACE_PATIENT_INFO_WITH} 

525{_AK.REPLACE_THIRD_PARTY_INFO_WITH} = {_DA.REPLACE_THIRD_PARTY_INFO_WITH} 

526{_AK.REPLACE_NONSPECIFIC_INFO_WITH} = {_DA.REPLACE_NONSPECIFIC_INFO_WITH} 

527{_AK.THIRDPARTY_XREF_MAX_DEPTH} = {_DA.THIRDPARTY_XREF_MAX_DEPTH} 

528{_AK.SCRUB_STRING_SUFFIXES} = 

529 s 

530{_AK.STRING_MAX_REGEX_ERRORS} = {_DA.STRING_MAX_REGEX_ERRORS} 

531{_AK.MIN_STRING_LENGTH_FOR_ERRORS} = {_DA.MIN_STRING_LENGTH_FOR_ERRORS} 

532{_AK.MIN_STRING_LENGTH_TO_SCRUB_WITH} = {_DA.MIN_STRING_LENGTH_TO_SCRUB_WITH} 

533{_AK.ALLOWLIST_FILENAMES} = 

534{_AK.DENYLIST_FILENAMES} = 

535{_AK.DENYLIST_FILES_AS_PHRASES} = {_DA.DENYLIST_FILES_AS_PHRASES} 

536{_AK.DENYLIST_USE_REGEX} = {_DA.DENYLIST_USE_REGEX} 

537{_AK.PHRASE_ALTERNATIVE_WORD_FILENAMES} = 

538{_AK.SCRUB_ALL_DATES} = {_DA.SCRUB_ALL_DATES} 

539{_AK.SCRUB_ALL_EMAIL_ADDRESSES} = {_DA.SCRUB_ALL_EMAIL_ADDRESSES} 

540{_AK.SCRUB_ALL_NUMBERS_OF_N_DIGITS} = 

541{_AK.SCRUB_ALL_UK_POSTCODES} = {_DA.SCRUB_ALL_UK_POSTCODES} 

542{_AK.NONSPECIFIC_SCRUBBER_FIRST} = {_DA.NONSPECIFIC_SCRUBBER_FIRST} 

543{_AK.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY} = {_DA.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY} 

544{_AK.ANONYMISE_CODES_AT_NUMERIC_BOUNDARIES_ONLY} = {_DA.ANONYMISE_CODES_AT_NUMERIC_BOUNDARIES_ONLY} 

545{_AK.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY} = {_DA.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY} 

546{_AK.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY} = {_DA.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY} 

547{_AK.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY} = {_DA.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY} 

548{_AK.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY} = {_DA.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY} 

549 

550# ----------------------------------------------------------------------------- 

551# Output fields and formatting 

552# ----------------------------------------------------------------------------- 

553 

554{_AK.TIMEFIELD_NAME} = {_DA.TIMEFIELD_NAME} 

555{_AK.RESEARCH_ID_FIELDNAME} = {_DA.RESEARCH_ID_FIELDNAME} 

556{_AK.TRID_FIELDNAME} = {_DA.TRID_FIELDNAME} 

557{_AK.MASTER_RESEARCH_ID_FIELDNAME} = {_DA.MASTER_RESEARCH_ID_FIELDNAME} 

558{_AK.SOURCE_HASH_FIELDNAME} = {_DA.SOURCE_HASH_FIELDNAME} 

559 

560# ----------------------------------------------------------------------------- 

561# Destination database configuration 

562# See the [destination_database] section for connection details. 

563# ----------------------------------------------------------------------------- 

564 

565{_AK.MAX_ROWS_BEFORE_COMMIT} = {_DA.MAX_ROWS_BEFORE_COMMIT} 

566{_AK.MAX_BYTES_BEFORE_COMMIT} = {_DA.MAX_BYTES_BEFORE_COMMIT} 

567{_AK.TEMPORARY_TABLENAME} = {_DA.TEMPORARY_TABLENAME} 

568 

569# ----------------------------------------------------------------------------- 

570# Choose databases (defined in their own sections). 

571# ----------------------------------------------------------------------------- 

572 

573{_AK.SOURCE_DATABASES} = 

574 sourcedb1 

575# sourcedb2 

576{_AK.DESTINATION_DATABASE} = destination_database 

577{_AK.ADMIN_DATABASE} = admin_database 

578 

579# ----------------------------------------------------------------------------- 

580# PROCESSING OPTIONS, TO LIMIT DATA QUANTITY FOR TESTING 

581# ----------------------------------------------------------------------------- 

582 

583{_AK.DEBUG_MAX_N_PATIENTS} = 

584{_AK.DEBUG_PID_LIST} = 

585 

586# ----------------------------------------------------------------------------- 

587# Opting out entirely 

588# ----------------------------------------------------------------------------- 

589 

590{_AK.OPTOUT_PID_FILENAMES} = 

591{_AK.OPTOUT_MPID_FILENAMES} = 

592{_AK.OPTOUT_COL_VALUES} = 

593 

594 

595# ============================================================================= 

596# Extra regular expression patterns you wish to be scrubbed from the text 

597# as nonspecific information. See help. 

598# ============================================================================= 

599 

600[{_AK.SECTION_EXTRA_REGEXES}] 

601 

602 

603# ============================================================================= 

604# Destination database details. User should have WRITE access. 

605# ============================================================================= 

606 

607[destination_database] 

608 

609{_DK.URL} = @@dest_db_url@@ 

610 

611 

612# ============================================================================= 

613# Administrative database. User should have WRITE access. 

614# ============================================================================= 

615 

616[admin_database] 

617 

618{_DK.URL} = @@admin_db_url@@ 

619 

620 

621# ============================================================================= 

622# SOURCE DATABASE DETAILS BELOW HERE. 

623# User should have READ access only for safety. 

624# ============================================================================= 

625 

626# ----------------------------------------------------------------------------- 

627# Source database example 1 

628# ----------------------------------------------------------------------------- 

629 

630[sourcedb1] 

631 

632 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

633 # CONNECTION DETAILS 

634 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

635 

636{_DK.URL} = @@source_db1_url@@ 

637 

638 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

639 # INPUT FIELDS, FOR THE AUTOGENERATION OF DATA DICTIONARIES 

640 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

641 

642{_SK.DDGEN_OMIT_BY_DEFAULT} = {_DS.DDGEN_OMIT_BY_DEFAULT} 

643{_SK.DDGEN_OMIT_FIELDS} = 

644{_SK.DDGEN_INCLUDE_FIELDS} = @@source_db1_ddgen_include_fields@@ 

645{_SK.DDGEN_PER_TABLE_PID_FIELD} = patient_id 

646{_SK.DDGEN_TABLE_DEFINES_PIDS} = patient 

647{_SK.DDGEN_ADD_PER_TABLE_PIDS_TO_SCRUBBER} = {_DS.DDGEN_ADD_PER_TABLE_PIDS_TO_SCRUBBER} 

648{_SK.DDGEN_MASTER_PID_FIELDNAME} = nhsnum 

649{_SK.DDGEN_TABLE_DENYLIST} = 

650{_SK.DDGEN_TABLE_ALLOWLIST} = 

651{_SK.DDGEN_TABLE_REQUIRE_FIELD_ABSOLUTE} = 

652{_SK.DDGEN_TABLE_REQUIRE_FIELD_CONDITIONAL} = 

653{_SK.DDGEN_FIELD_DENYLIST} = 

654{_SK.DDGEN_FIELD_ALLOWLIST} = 

655{_SK.DDGEN_PK_FIELDS} = 

656{_SK.DDGEN_PREFER_ORIGINAL_PK} = {_DS.DDGEN_PREFER_ORIGINAL_PK} 

657{_SK.DDGEN_CONSTANT_CONTENT} = {_DS.DDGEN_CONSTANT_CONTENT} 

658{_SK.DDGEN_CONSTANT_CONTENT_TABLES} = 

659{_SK.DDGEN_NONCONSTANT_CONTENT_TABLES} = 

660{_SK.DDGEN_ADDITION_ONLY} = {_DS.DDGEN_ADDITION_ONLY} 

661{_SK.DDGEN_ADDITION_ONLY_TABLES} = 

662{_SK.DDGEN_DELETION_POSSIBLE_TABLES} = 

663{_SK.DDGEN_PID_DEFINING_FIELDNAMES} = 

664{_SK.DDGEN_SCRUBSRC_PATIENT_FIELDS} = @@source_db1_ddgen_scrubsrc_patient_fields@@ 

665{_SK.DDGEN_SCRUBSRC_THIRDPARTY_FIELDS} = 

666{_SK.DDGEN_SCRUBSRC_THIRDPARTY_XREF_PID_FIELDS} = 

667{_SK.DDGEN_REQUIRED_SCRUBSRC_FIELDS} = 

668{_SK.DDGEN_SCRUBMETHOD_CODE_FIELDS} = 

669{_SK.DDGEN_SCRUBMETHOD_DATE_FIELDS} = 

670{_SK.DDGEN_SCRUBMETHOD_NUMBER_FIELDS} = 

671{_SK.DDGEN_SCRUBMETHOD_PHRASE_FIELDS} = 

672{_SK.DDGEN_SAFE_FIELDS_EXEMPT_FROM_SCRUBBING} = 

673{_SK.DDGEN_MIN_LENGTH_FOR_SCRUBBING} = {_DS.DDGEN_MIN_LENGTH_FOR_SCRUBBING} 

674{_SK.DDGEN_TRUNCATE_DATE_FIELDS} = 

675{_SK.DDGEN_FILENAME_TO_TEXT_FIELDS} = 

676{_SK.DDGEN_BINARY_TO_TEXT_FIELD_PAIRS} = 

677{_SK.DDGEN_SKIP_ROW_IF_EXTRACT_TEXT_FAILS_FIELDS} = 

678{_SK.DDGEN_RENAME_TABLES_REMOVE_SUFFIXES} = 

679{_SK.DDGEN_PATIENT_OPT_OUT_FIELDS} = 

680{_SK.DDGEN_EXTRA_HASH_FIELDS} = 

681 

682 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

683 # DESTINATION INDEXING 

684 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

685 

686{_SK.DDGEN_INDEX_FIELDS} = 

687{_SK.DDGEN_ALLOW_FULLTEXT_INDEXING} = {_DS.DDGEN_ALLOW_FULLTEXT_INDEXING} 

688 

689 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

690 # DATA DICTIONARY MANIPULATION TO DESTINATION TABLE/FIELD NAMES 

691 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

692 

693{_SK.DDGEN_FORCE_LOWER_CASE} = {_DS.DDGEN_FORCE_LOWER_CASE} 

694{_SK.DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE} = {_DS.DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE} 

695 

696 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

697 # PROCESSING OPTIONS, TO LIMIT DATA QUANTITY FOR TESTING 

698 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

699 

700{_SK.DEBUG_ROW_LIMIT} = 

701{_SK.DEBUG_LIMITED_TABLES} = 

702 

703# ----------------------------------------------------------------------------- 

704# Source database example 2 

705# ----------------------------------------------------------------------------- 

706 

707[mysourcedb2] 

708 

709{_DK.URL} = mysql+mysqldb://username:password@127.0.0.1:3306/source2_databasename?charset=utf8 

710 

711{_SK.DDGEN_FORCE_LOWER_CASE} = {_DS.DDGEN_FORCE_LOWER_CASE} 

712{_SK.DDGEN_APPEND_SOURCE_INFO_TO_COMMENT} = {_DS.DDGEN_APPEND_SOURCE_INFO_TO_COMMENT} 

713{_SK.DDGEN_PER_TABLE_PID_FIELD} = patient_id 

714{_SK.DDGEN_MASTER_PID_FIELDNAME} = nhsnum 

715{_SK.DDGEN_TABLE_DENYLIST} = 

716{_SK.DDGEN_FIELD_DENYLIST} = 

717{_SK.DDGEN_TABLE_REQUIRE_FIELD_ABSOLUTE} = 

718{_SK.DDGEN_TABLE_REQUIRE_FIELD_CONDITIONAL} = 

719{_SK.DDGEN_PK_FIELDS} = 

720{_SK.DDGEN_PREFER_ORIGINAL_PK} = {_DS.DDGEN_PREFER_ORIGINAL_PK} 

721{_SK.DDGEN_CONSTANT_CONTENT} = {_DS.DDGEN_CONSTANT_CONTENT} 

722{_SK.DDGEN_SCRUBSRC_PATIENT_FIELDS} = 

723{_SK.DDGEN_SCRUBSRC_THIRDPARTY_FIELDS} = 

724{_SK.DDGEN_SCRUBMETHOD_CODE_FIELDS} = 

725{_SK.DDGEN_SCRUBMETHOD_DATE_FIELDS} = 

726{_SK.DDGEN_SCRUBMETHOD_NUMBER_FIELDS} = 

727{_SK.DDGEN_SCRUBMETHOD_PHRASE_FIELDS} = 

728{_SK.DDGEN_SAFE_FIELDS_EXEMPT_FROM_SCRUBBING} = 

729{_SK.DDGEN_MIN_LENGTH_FOR_SCRUBBING} = {_DS.DDGEN_MIN_LENGTH_FOR_SCRUBBING} 

730{_SK.DDGEN_TRUNCATE_DATE_FIELDS} = 

731{_SK.DDGEN_FILENAME_TO_TEXT_FIELDS} = 

732{_SK.DDGEN_BINARY_TO_TEXT_FIELD_PAIRS} = 

733 

734# ----------------------------------------------------------------------------- 

735# Source database example 3 

736# ----------------------------------------------------------------------------- 

737 

738[camcops] 

739# Example for the CamCOPS anonymisation staging database 

740 

741{_DK.URL} = mysql+mysqldb://username:password@127.0.0.1:3306/camcops_databasename?charset=utf8 

742 

743# FOR EXAMPLE: 

744{_SK.DDGEN_FORCE_LOWER_CASE} = False 

745{_SK.DDGEN_PER_TABLE_PID_FIELD} = _patient_idnum1 

746{_SK.DDGEN_PID_DEFINING_FIELDNAMES} = _patient_idnum1 

747{_SK.DDGEN_MASTER_PID_FIELDNAME} = _patient_idnum2 

748{_SK.DDGEN_TABLE_DENYLIST} = 

749{_SK.DDGEN_FIELD_DENYLIST} = _patient_iddesc1 

750 _patient_idshortdesc1 

751 _patient_iddesc2 

752 _patient_idshortdesc2 

753 _patient_iddesc3 

754 _patient_idshortdesc3 

755 _patient_iddesc4 

756 _patient_idshortdesc4 

757 _patient_iddesc5 

758 _patient_idshortdesc5 

759 _patient_iddesc6 

760 _patient_idshortdesc6 

761 _patient_iddesc7 

762 _patient_idshortdesc7 

763 _patient_iddesc8 

764 _patient_idshortdesc8 

765 id 

766 patient_id 

767 _device 

768 _era 

769 _current 

770 _when_removed_exact 

771 _when_removed_batch_utc 

772 _removing_user 

773 _preserving_user 

774 _forcibly_preserved 

775 _predecessor_pk 

776 _successor_pk 

777 _manually_erased 

778 _manually_erased_at 

779 _manually_erasing_user 

780 _addition_pending 

781 _removal_pending 

782 _move_off_tablet 

783 

784{_SK.DDGEN_TABLE_REQUIRE_FIELD_ABSOLUTE} = 

785{_SK.DDGEN_TABLE_REQUIRE_FIELD_CONDITIONAL} = 

786{_SK.DDGEN_PK_FIELDS} = _pk 

787{_SK.DDGEN_PREFER_ORIGINAL_PK} = {_DS.DDGEN_PREFER_ORIGINAL_PK} 

788{_SK.DDGEN_CONSTANT_CONTENT} = False 

789{_SK.DDGEN_SCRUBSRC_PATIENT_FIELDS} = _patient_forename 

790 _patient_surname 

791 _patient_dob 

792 _patient_idnum1 

793 _patient_idnum2 

794 _patient_idnum3 

795 _patient_idnum4 

796 _patient_idnum5 

797 _patient_idnum6 

798 _patient_idnum7 

799 _patient_idnum8 

800{_SK.DDGEN_SCRUBSRC_THIRDPARTY_FIELDS} = 

801{_SK.DDGEN_SCRUBMETHOD_CODE_FIELDS} = 

802{_SK.DDGEN_SCRUBMETHOD_DATE_FIELDS} = _patient_dob 

803{_SK.DDGEN_SCRUBMETHOD_NUMBER_FIELDS} = 

804{_SK.DDGEN_SCRUBMETHOD_PHRASE_FIELDS} = 

805{_SK.DDGEN_SAFE_FIELDS_EXEMPT_FROM_SCRUBBING} = _device 

806 _era 

807 _when_added_exact 

808 _adding_user 

809 _when_removed_exact 

810 _removing_user 

811 _preserving_user 

812 _manually_erased_at 

813 _manually_erasing_user 

814 when_last_modified 

815 when_created 

816 when_firstexit 

817 clinician_specialty 

818 clinician_name 

819 clinician_post 

820 clinician_professional_registration 

821 clinician_contact_details 

822# ... now some task-specific ones 

823 bdi_scale 

824 pause_start_time 

825 pause_end_time 

826 trial_start_time 

827 cue_start_time 

828 target_start_time 

829 detection_start_time 

830 iti_start_time 

831 iti_end_time 

832 trial_end_time 

833 response_time 

834 target_time 

835 choice_time 

836 discharge_date 

837 discharge_reason_code 

838 diagnosis_psych_1_icd10code 

839 diagnosis_psych_1_description 

840 diagnosis_psych_2_icd10code 

841 diagnosis_psych_2_description 

842 diagnosis_psych_3_icd10code 

843 diagnosis_psych_3_description 

844 diagnosis_psych_4_icd10code 

845 diagnosis_psych_4_description 

846 diagnosis_medical_1 

847 diagnosis_medical_2 

848 diagnosis_medical_3 

849 diagnosis_medical_4 

850 category_start_time 

851 category_response_time 

852 category_chosen 

853 gamble_fixed_option 

854 gamble_lottery_option_p 

855 gamble_lottery_option_q 

856 gamble_start_time 

857 gamble_response_time 

858 likelihood 

859{_SK.DDGEN_MIN_LENGTH_FOR_SCRUBBING} = {_DS.DDGEN_MIN_LENGTH_FOR_SCRUBBING} 

860{_SK.DDGEN_TRUNCATE_DATE_FIELDS} = _patient_dob 

861{_SK.DDGEN_FILENAME_TO_TEXT_FIELDS} = 

862{_SK.DDGEN_BINARY_TO_TEXT_FIELD_PAIRS} = 

863 

864""" # noqa: E501 

865 

866# For the style: 

867# [source_databases] 

868# source1 = blah 

869# source2 = thing 

870# ... you can't have multiple keys with the same name. 

871# https://stackoverflow.com/questions/287757 

872 

873 

class PatientInfoConstants:
    """
    Fixed table and field names, as plain strings.

    NOTE(review): judging by the names, these describe the secret mapping
    table (PID/MPID to RID/MRID/TRID) -- confirm against the code that uses
    them.
    """

    SECRET_MAP_TABLENAME = "secret_map"
    PID_FIELDNAME = "pid"
    MPID_FIELDNAME = "mpid"
    RID_FIELDNAME = "rid"
    MRID_FIELDNAME = "mrid"
    TRID_FIELDNAME = "trid"