Coverage for anonymise/constants.py: 100%
255 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
"""
crate_anon/anonymise/constants.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Shared constants for CRATE anonymiser.**

"""
30import calendar
31from enum import unique
33from sqlalchemy import Integer
34from cardinal_pythonlib.enumlike import StrEnum
36from crate_anon.version import CRATE_VERSION, CRATE_VERSION_DATE
37from crate_anon.nlp_manager.constants import DatabaseConfigKeys
# =============================================================================
# Logging
# =============================================================================

# strftime-style timestamp format for log records.
LOG_DATEFMT = "%Y-%m-%d %H:%M:%S"

# Maps log level names to colour specification strings.
LOG_COLORS = {
    "DEBUG": "cyan",
    "INFO": "green",
    "WARNING": "yellow",
    "ERROR": "red",
    "CRITICAL": "red,bg_white",
}
# =============================================================================
# Cosmetic
# =============================================================================

# Separator prefixes: twenty rule characters followed by a single space.
BIGSEP = "=" * 20 + " "
SEP = "-" * 20 + " "
# =============================================================================
# Defaults for command-line options
# =============================================================================

DEFAULT_REPORT_EVERY = 100_000  # 100k
DEFAULT_CHUNKSIZE = 100_000  # 100k
# =============================================================================
# Environment
# =============================================================================

# Name of an environment variable (presumably pointing at the anonymiser
# config file -- confirm against the code that reads it).
ANON_CONFIG_ENV_VAR = "CRATE_ANON_CONFIG"
# =============================================================================
# Data dictionary
# =============================================================================

DATEFORMAT_ISO8601 = "%Y-%m-%dT%H:%M:%S%z"  # e.g. 2013-07-24T20:04:07+0100
DEFAULT_INDEX_LEN = 20  # for data types where it's mandatory

LONGTEXT = "LONGTEXT"

MAX_PID_STR = "9" * 10  # e.g. NHS numbers are 10-digit
# Column type alias for TRID values: SQLAlchemy's Integer.
TridType = Integer
MAX_TRID = 2**31 - 1
# https://dev.mysql.com/doc/refman/5.0/en/numeric-type-overview.html
# Maximum INT UNSIGNED is              4294967295 == 2 ** 32 - 1.
# INT range is                        -2147483648 == -(2 ** 31) to
#                                     +2147483647 == 2 ** 31 - 1 == 2.1 billion
# ... note that this is inadequate for 10-digit NHS numbers.
# Maximum BIGINT UNSIGNED is 18446744073709551615 == 2 ** 64 - 1.
# BIGINT range is            -9223372036854775808 == -(2 ** 63) to
#                            +9223372036854775807 == 2 ** 63 - 1
# When scrub_all_dates is True and the replacement text is a date format
# string, allow these directives.
# https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
DATE_BLURRING_DIRECTIVES = (
    "b",  # Month as locale's abbreviated name
    "B",  # Month as locale's full name
    "m",  # Month as zero-padded decimal number
    "Y",  # Year with century as decimal number
    "y",  # Year without century as zero-padded decimal number
    # Among things that are not currently supported: %% (literal %).
)
# The same directives, each prefixed with "%", as one comma-separated string.
DATE_BLURRING_DIRECTIVES_CSV = ", ".join(
    f"%{d}" for d in DATE_BLURRING_DIRECTIVES
)

# https://stackoverflow.com/questions/3418050/month-name-to-month-number-and-vice-versa-in-python
MONTH_3_LETTER_INDEX = {
    # See _month_word_regex_fragment() in anonregex.py
    # Assuming this may not be the same as calendar.month_abbr in some locales
    #
    # calendar.month_name[0] is an empty string, which "if month" skips, so
    # the first real month name maps to index 1.
    month[:3]: index
    for index, month in enumerate(calendar.month_name)
    if month
}
125@unique
126class AlterMethodType(StrEnum):
127 BINARY_TO_TEXT = "binary_to_text"
128 FILENAME_FORMAT_TO_TEXT = "filename_format_to_text" # new in v0.18.18
129 FILENAME_TO_TEXT = "filename_to_text"
130 HASH = "hash"
131 # HTML_ESCAPE = "html_escape"
132 HTML_UNESCAPE = "html_unescape"
133 HTML_UNTAG = "html_untag"
134 SCRUBIN = "scrub"
135 SKIP_IF_TEXT_EXTRACT_FAILS = "skip_if_extract_fails"
136 TRUNCATEDATE = "truncate_date"
139@unique
140class Decision(StrEnum):
141 OMIT = "OMIT"
142 INCLUDE = "include"
145@unique
146class IndexType(StrEnum):
147 NONE = ""
148 NORMAL = "I"
149 UNIQUE = "U"
150 FULLTEXT = "F"
153@unique
154class ScrubMethod(StrEnum):
155 WORDS = "words"
156 PHRASE = "phrase"
157 PHRASE_UNLESS_NUMERIC = "phrase_unless_numeric"
158 NUMERIC = "number"
159 DATE = "date"
160 CODE = "code"
163@unique
164class ScrubSrc(StrEnum):
165 PATIENT = "patient"
166 THIRDPARTY = "thirdparty"
167 THIRDPARTY_XREF_PID = "thirdparty_xref_pid"
170@unique
171class SrcFlag(StrEnum):
172 PK = "K"
173 NOT_NULL = "N"
174 ADD_SRC_HASH = "H"
175 PRIMARY_PID = "P"
176 DEFINES_PRIMARY_PIDS = "*"
177 MASTER_PID = "M"
178 CONSTANT = "C"
179 ADDITION_ONLY = "A"
180 OPT_OUT = "!"
181 REQUIRED_SCRUBBER = "R"
# =============================================================================
# Databases
# =============================================================================

CHARSET = "utf8"
TABLE_KWARGS = {
    # MySQL:
    "mysql_charset": CHARSET,
    "mysql_engine": "InnoDB",
}
COMMENT = "comment"

MYSQL_MAX_IDENTIFIER_LENGTH = 64
# MySQL: 64 -- http://dev.mysql.com/doc/refman/5.7/en/identifiers.html
SQLSERVER_MAX_IDENTIFIER_LENGTH = 128
# Microsoft SQL Server: 128 --
# https://docs.microsoft.com/en-us/sql/relational-databases/databases/database-identifiers  # noqa: E501
# =============================================================================
# Config keys
# =============================================================================


class AnonymiseConfigKeys:
    """
    Names of config file sections and keys, as strings.

    Apart from the ``SECTION_*`` and ``DEPRECATED_*`` attributes, each value
    is the lower-case form of its attribute name.
    """

    # Sections
    SECTION_MAIN = "main"
    SECTION_EXTRA_REGEXES = "extra_regexes"

    # Data dictionary
    DATA_DICTIONARY_FILENAME = "data_dictionary_filename"

    # Critical field types
    SQLATYPE_MPID = "sqlatype_mpid"
    SQLATYPE_PID = "sqlatype_pid"

    # Encryption phrases/passwords
    CHANGE_DETECTION_ENCRYPTION_PHRASE = "change_detection_encryption_phrase"
    EXTRA_HASH_CONFIG_SECTIONS = "extra_hash_config_sections"
    HASH_METHOD = "hash_method"
    MASTER_PATIENT_ID_ENCRYPTION_PHRASE = "master_patient_id_encryption_phrase"
    PER_TABLE_PATIENT_ID_ENCRYPTION_PHRASE = (
        "per_table_patient_id_encryption_phrase"
    )

    # Text extraction
    EXTRACT_TEXT_EXTENSIONS_CASE_SENSITIVE = (
        "extract_text_extensions_case_sensitive"
    )
    EXTRACT_TEXT_EXTENSIONS_PERMITTED = "extract_text_extensions_permitted"
    EXTRACT_TEXT_EXTENSIONS_PROHIBITED = "extract_text_extensions_prohibited"
    EXTRACT_TEXT_PLAIN = "extract_text_plain"
    EXTRACT_TEXT_WIDTH = "extract_text_width"

    # Anonymisation
    ALLOWLIST_FILENAMES = "allowlist_filenames"
    ALLOW_NO_PATIENT_INFO = "allow_no_patient_info"
    ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY = (
        "anonymise_codes_at_word_boundaries_only"
    )
    ANONYMISE_CODES_AT_NUMERIC_BOUNDARIES_ONLY = (
        "anonymise_codes_at_numeric_boundaries_only"
    )
    ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY = (
        "anonymise_dates_at_word_boundaries_only"
    )
    ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY = (
        "anonymise_numbers_at_numeric_boundaries_only"
    )
    ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY = (
        "anonymise_numbers_at_word_boundaries_only"
    )
    ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY = (
        "anonymise_strings_at_word_boundaries_only"
    )
    DENYLIST_FILENAMES = "denylist_filenames"
    DENYLIST_FILES_AS_PHRASES = "denylist_files_as_phrases"
    DENYLIST_USE_REGEX = "denylist_use_regex"
    DEPRECATED_BLACKLIST_FILENAMES = "blacklist_filenames"
    DEPRECATED_WHITELIST_FILENAMES = "whitelist_filenames"
    MIN_STRING_LENGTH_FOR_ERRORS = "min_string_length_for_errors"
    MIN_STRING_LENGTH_TO_SCRUB_WITH = "min_string_length_to_scrub_with"
    NONSPECIFIC_SCRUBBER_FIRST = "nonspecific_scrubber_first"
    PHRASE_ALTERNATIVE_WORD_FILENAMES = "phrase_alternative_word_filenames"
    REPLACE_ALL_DATES_WITH = "replace_all_dates_with"
    REPLACE_NONSPECIFIC_INFO_WITH = "replace_nonspecific_info_with"
    REPLACE_PATIENT_INFO_WITH = "replace_patient_info_with"
    REPLACE_THIRD_PARTY_INFO_WITH = "replace_third_party_info_with"
    SCRUB_ALL_DATES = "scrub_all_dates"
    SCRUB_ALL_EMAIL_ADDRESSES = "scrub_all_email_addresses"
    SCRUB_ALL_NUMBERS_OF_N_DIGITS = "scrub_all_numbers_of_n_digits"
    SCRUB_ALL_UK_POSTCODES = "scrub_all_uk_postcodes"
    SCRUB_STRING_SUFFIXES = "scrub_string_suffixes"
    STRING_MAX_REGEX_ERRORS = "string_max_regex_errors"
    THIRDPARTY_XREF_MAX_DEPTH = "thirdparty_xref_max_depth"
    TIMEFIELD_NAME = "timefield_name"

    # Output fields and formatting
    RESEARCH_ID_FIELDNAME = "research_id_fieldname"
    TRID_FIELDNAME = "trid_fieldname"
    MASTER_RESEARCH_ID_FIELDNAME = "master_research_id_fieldname"
    ADD_MRID_WHEREVER_RID_ADDED = "add_mrid_wherever_rid_added"
    SOURCE_HASH_FIELDNAME = "source_hash_fieldname"

    # Destination database configuration
    MAX_ROWS_BEFORE_COMMIT = "max_rows_before_commit"
    MAX_BYTES_BEFORE_COMMIT = "max_bytes_before_commit"
    TEMPORARY_TABLENAME = "temporary_tablename"

    # Databases
    ADMIN_DATABASE = "admin_database"
    DESTINATION_DATABASE = "destination_database"
    SOURCE_DATABASES = "source_databases"

    # Processing options
    DEBUG_MAX_N_PATIENTS = "debug_max_n_patients"
    DEBUG_PID_LIST = "debug_pid_list"

    # Opting out
    OPTOUT_COL_VALUES = "optout_col_values"
    OPTOUT_MPID_FILENAMES = "optout_mpid_filenames"
    OPTOUT_PID_FILENAMES = "optout_pid_filenames"
class AnonymiseConfigDefaults:
    """
    Default values whose attribute names match those in AnonymiseConfigKeys.

    Not every key has a default here.
    """

    # Critical field types
    SQLATYPE_MPID = "BigInteger"
    SQLATYPE_PID = "BigInteger"

    # Encryption phrases/passwords
    HASH_METHOD = "HMAC_MD5"

    # Text extraction
    EXTRACT_TEXT_EXTENSIONS_CASE_SENSITIVE = False
    EXTRACT_TEXT_PLAIN = True
    EXTRACT_TEXT_WIDTH = 80

    # Anonymisation
    ALLOW_NO_PATIENT_INFO = False
    ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY = True
    ANONYMISE_CODES_AT_NUMERIC_BOUNDARIES_ONLY = True
    ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY = True
    ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY = True
    ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY = False
    ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY = True
    DENYLIST_FILES_AS_PHRASES = False
    DENYLIST_USE_REGEX = False
    MIN_STRING_LENGTH_FOR_ERRORS = 3
    MIN_STRING_LENGTH_TO_SCRUB_WITH = 2
    NONSPECIFIC_SCRUBBER_FIRST = False
    REPLACE_ALL_DATES_WITH = "[~~~]"
    REPLACE_NONSPECIFIC_INFO_WITH = "[~~~]"
    REPLACE_PATIENT_INFO_WITH = "[__PPP__]"
    REPLACE_THIRD_PARTY_INFO_WITH = "[__TTT__]"
    SCRUB_ALL_DATES = False
    SCRUB_ALL_EMAIL_ADDRESSES = False
    SCRUB_ALL_UK_POSTCODES = False
    STRING_MAX_REGEX_ERRORS = 0
    THIRDPARTY_XREF_MAX_DEPTH = 1
    TIMEFIELD_NAME = "_when_processed_utc"

    # Output fields and formatting
    RESEARCH_ID_FIELDNAME = "rid"
    TRID_FIELDNAME = "trid"
    MASTER_RESEARCH_ID_FIELDNAME = "mrid"
    ADD_MRID_WHEREVER_RID_ADDED = True
    SOURCE_HASH_FIELDNAME = "_src_hash"

    # Destination database configuration
    MAX_ROWS_BEFORE_COMMIT = 1000
    MAX_BYTES_BEFORE_COMMIT = 80 * 1024 * 1024  # 80 MiB
    TEMPORARY_TABLENAME = "_crate_temp_table"

    # Processing options
    DEBUG_MAX_N_PATIENTS = 0
class AnonymiseDatabaseSafeConfigKeys:
    """
    Non-sensitive config keys relating to a specific database.

    Apart from the ``DEPRECATED_*`` attributes, each value is the lower-case
    form of its attribute name.
    """

    DDGEN_ADD_PER_TABLE_PIDS_TO_SCRUBBER = (
        "ddgen_add_per_table_pids_to_scrubber"
    )
    DDGEN_ADDITION_ONLY = "ddgen_addition_only"
    DDGEN_ADDITION_ONLY_TABLES = "ddgen_addition_only_tables"
    DDGEN_ALLOW_FULLTEXT_INDEXING = "ddgen_allow_fulltext_indexing"
    DDGEN_APPEND_SOURCE_INFO_TO_COMMENT = "ddgen_append_source_info_to_comment"
    DDGEN_BINARY_TO_TEXT_FIELD_PAIRS = "ddgen_binary_to_text_field_pairs"
    DDGEN_CONSTANT_CONTENT = "ddgen_constant_content"
    DDGEN_CONSTANT_CONTENT_TABLES = "ddgen_constant_content_tables"
    DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE = (
        "ddgen_convert_odd_chars_to_underscore"
    )
    DDGEN_DELETION_POSSIBLE_TABLES = "ddgen_deletion_possible_tables"
    DDGEN_EXTRA_HASH_FIELDS = "ddgen_extra_hash_fields"
    DDGEN_FIELD_ALLOWLIST = "ddgen_field_allowlist"
    DDGEN_FIELD_DENYLIST = "ddgen_field_denylist"
    DDGEN_FILENAME_TO_TEXT_FIELDS = "ddgen_filename_to_text_fields"
    DDGEN_FORCE_LOWER_CASE = "ddgen_force_lower_case"
    DDGEN_FREETEXT_INDEX_MIN_LENGTH = "ddgen_freetext_index_min_length"
    DDGEN_INCLUDE_FIELDS = "ddgen_include_fields"
    DDGEN_INDEX_FIELDS = "ddgen_index_fields"
    DDGEN_MASTER_PID_FIELDNAME = "ddgen_master_pid_fieldname"
    DDGEN_MIN_LENGTH_FOR_SCRUBBING = "ddgen_min_length_for_scrubbing"
    DDGEN_NONCONSTANT_CONTENT_TABLES = "ddgen_nonconstant_content_tables"
    DDGEN_OMIT_BY_DEFAULT = "ddgen_omit_by_default"
    DDGEN_OMIT_FIELDS = "ddgen_omit_fields"
    DDGEN_PATIENT_OPT_OUT_FIELDS = "ddgen_patient_opt_out_fields"
    DDGEN_PER_TABLE_PID_FIELD = "ddgen_per_table_pid_field"
    DDGEN_PID_DEFINING_FIELDNAMES = "ddgen_pid_defining_fieldnames"
    DDGEN_PK_FIELDS = "ddgen_pk_fields"
    DDGEN_PREFER_ORIGINAL_PK = "ddgen_prefer_original_pk"
    DDGEN_RENAME_TABLES_REMOVE_SUFFIXES = "ddgen_rename_tables_remove_suffixes"
    DDGEN_REQUIRED_SCRUBSRC_FIELDS = "ddgen_required_scrubsrc_fields"
    DDGEN_SAFE_FIELDS_EXEMPT_FROM_SCRUBBING = (
        "ddgen_safe_fields_exempt_from_scrubbing"
    )
    DDGEN_SCRUBMETHOD_CODE_FIELDS = "ddgen_scrubmethod_code_fields"
    DDGEN_SCRUBMETHOD_DATE_FIELDS = "ddgen_scrubmethod_date_fields"
    DDGEN_SCRUBMETHOD_NUMBER_FIELDS = "ddgen_scrubmethod_number_fields"
    DDGEN_SCRUBMETHOD_PHRASE_FIELDS = "ddgen_scrubmethod_phrase_fields"
    DDGEN_SCRUBSRC_PATIENT_FIELDS = "ddgen_scrubsrc_patient_fields"
    DDGEN_SCRUBSRC_THIRDPARTY_FIELDS = "ddgen_scrubsrc_thirdparty_fields"
    DDGEN_SCRUBSRC_THIRDPARTY_XREF_PID_FIELDS = (
        "ddgen_scrubsrc_thirdparty_xref_pid_fields"
    )
    DDGEN_SKIP_ROW_IF_EXTRACT_TEXT_FAILS_FIELDS = (
        "ddgen_skip_row_if_extract_text_fails_fields"
    )
    DDGEN_TABLE_ALLOWLIST = "ddgen_table_allowlist"
    DDGEN_TABLE_DEFINES_PIDS = "ddgen_table_defines_pids"
    DDGEN_TABLE_DENYLIST = "ddgen_table_denylist"
    DDGEN_TABLE_REQUIRE_FIELD_ABSOLUTE = "ddgen_table_require_field_absolute"
    DDGEN_TABLE_REQUIRE_FIELD_CONDITIONAL = (
        "ddgen_table_require_field_conditional"
    )
    DDGEN_TRUNCATE_DATE_FIELDS = "ddgen_truncate_date_fields"
    DEBUG_LIMITED_TABLES = "debug_limited_tables"
    DEBUG_ROW_LIMIT = "debug_row_limit"
    DEPRECATED_DDGEN_FIELD_BLACKLIST = "ddgen_field_blacklist"
    DEPRECATED_DDGEN_FIELD_WHITELIST = "ddgen_field_whitelist"
    DEPRECATED_DDGEN_TABLE_BLACKLIST = "ddgen_table_blacklist"
    DEPRECATED_DDGEN_TABLE_WHITELIST = "ddgen_table_whitelist"
class AnonymiseDatabaseSafeConfigDefaults:
    """
    Defaults for the keys above
    """

    DDGEN_ADD_PER_TABLE_PIDS_TO_SCRUBBER = False
    DDGEN_ADDITION_ONLY = False
    DDGEN_ALLOW_FULLTEXT_INDEXING = True
    DDGEN_APPEND_SOURCE_INFO_TO_COMMENT = True
    DDGEN_CONSTANT_CONTENT = False
    DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE = True
    DDGEN_FORCE_LOWER_CASE = False
    DDGEN_FREETEXT_INDEX_MIN_LENGTH = 1000
    DDGEN_MIN_LENGTH_FOR_SCRUBBING = 50
    DDGEN_OMIT_BY_DEFAULT = True
    DDGEN_PREFER_ORIGINAL_PK = False
    DEBUG_ROW_LIMIT = 0
class AnonymiseColumnComments:
    """
    Comment text for columns.
    """

    TIMEFIELD_COMMENT = "Date/time that CRATE processed the source row (UTC)"
class HashConfigKeys:
    """
    Config file keys for defining extra hashers.
    """

    HASH_METHOD = "hash_method"
    SECRET_KEY = "secret_key"
# =============================================================================
# Demo config
# =============================================================================
# This does not need to vary with Docker status.

# Short aliases, to keep the template lines below manageable.
_AK = AnonymiseConfigKeys
_DA = AnonymiseConfigDefaults
_DK = DatabaseConfigKeys
_SK = AnonymiseDatabaseSafeConfigKeys
_DS = AnonymiseDatabaseSafeConfigDefaults
# The template is a raw f-string: "{...}" fields are filled in when this
# module is imported, whereas "@@...@@" markers are left in the text.
# NOTE(review): the commented-out "sourcedb2" entry under the source databases
# key does not match the "[mysourcedb2]" section name further down -- confirm
# which is intended.
# noinspection PyPep8
DEMO_CONFIG = rf"""# Configuration file for CRATE anonymiser (crate_anonymise).
# Version {CRATE_VERSION} ({CRATE_VERSION_DATE}).
#
# SEE HELP FOR DETAILS.

# =============================================================================
# Main settings
# =============================================================================

[{_AK.SECTION_MAIN}]

# -----------------------------------------------------------------------------
# Data dictionary
# -----------------------------------------------------------------------------

{_AK.DATA_DICTIONARY_FILENAME} = @@data_dictionary_filename@@

# -----------------------------------------------------------------------------
# Critical field types
# -----------------------------------------------------------------------------

{_AK.SQLATYPE_PID} =
{_AK.SQLATYPE_MPID} =

# -----------------------------------------------------------------------------
# Encryption phrases/passwords
# -----------------------------------------------------------------------------

{_AK.HASH_METHOD} = {_DA.HASH_METHOD}
{_AK.PER_TABLE_PATIENT_ID_ENCRYPTION_PHRASE} = @@per_table_patient_id_encryption_phrase@@
{_AK.MASTER_PATIENT_ID_ENCRYPTION_PHRASE} = @@master_patient_id_encryption_phrase@@
{_AK.CHANGE_DETECTION_ENCRYPTION_PHRASE} = @@change_detection_encryption_phrase@@
{_AK.EXTRA_HASH_CONFIG_SECTIONS} =

# -----------------------------------------------------------------------------
# Text extraction
# -----------------------------------------------------------------------------

{_AK.EXTRACT_TEXT_EXTENSIONS_PERMITTED} =
{_AK.EXTRACT_TEXT_EXTENSIONS_PROHIBITED} =
{_AK.EXTRACT_TEXT_EXTENSIONS_CASE_SENSITIVE} = {_DA.EXTRACT_TEXT_EXTENSIONS_CASE_SENSITIVE}
{_AK.EXTRACT_TEXT_PLAIN} = {_DA.EXTRACT_TEXT_PLAIN}
{_AK.EXTRACT_TEXT_WIDTH} = {_DA.EXTRACT_TEXT_WIDTH}

# -----------------------------------------------------------------------------
# Anonymisation
# -----------------------------------------------------------------------------

{_AK.ALLOW_NO_PATIENT_INFO} = {_DA.ALLOW_NO_PATIENT_INFO}
{_AK.REPLACE_ALL_DATES_WITH} = {_DA.REPLACE_ALL_DATES_WITH}
{_AK.REPLACE_PATIENT_INFO_WITH} = {_DA.REPLACE_PATIENT_INFO_WITH}
{_AK.REPLACE_THIRD_PARTY_INFO_WITH} = {_DA.REPLACE_THIRD_PARTY_INFO_WITH}
{_AK.REPLACE_NONSPECIFIC_INFO_WITH} = {_DA.REPLACE_NONSPECIFIC_INFO_WITH}
{_AK.THIRDPARTY_XREF_MAX_DEPTH} = {_DA.THIRDPARTY_XREF_MAX_DEPTH}
{_AK.SCRUB_STRING_SUFFIXES} =
    s
{_AK.STRING_MAX_REGEX_ERRORS} = {_DA.STRING_MAX_REGEX_ERRORS}
{_AK.MIN_STRING_LENGTH_FOR_ERRORS} = {_DA.MIN_STRING_LENGTH_FOR_ERRORS}
{_AK.MIN_STRING_LENGTH_TO_SCRUB_WITH} = {_DA.MIN_STRING_LENGTH_TO_SCRUB_WITH}
{_AK.ALLOWLIST_FILENAMES} =
{_AK.DENYLIST_FILENAMES} =
{_AK.DENYLIST_FILES_AS_PHRASES} = {_DA.DENYLIST_FILES_AS_PHRASES}
{_AK.DENYLIST_USE_REGEX} = {_DA.DENYLIST_USE_REGEX}
{_AK.PHRASE_ALTERNATIVE_WORD_FILENAMES} =
{_AK.SCRUB_ALL_DATES} = {_DA.SCRUB_ALL_DATES}
{_AK.SCRUB_ALL_EMAIL_ADDRESSES} = {_DA.SCRUB_ALL_EMAIL_ADDRESSES}
{_AK.SCRUB_ALL_NUMBERS_OF_N_DIGITS} =
{_AK.SCRUB_ALL_UK_POSTCODES} = {_DA.SCRUB_ALL_UK_POSTCODES}
{_AK.NONSPECIFIC_SCRUBBER_FIRST} = {_DA.NONSPECIFIC_SCRUBBER_FIRST}
{_AK.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY} = {_DA.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY}
{_AK.ANONYMISE_CODES_AT_NUMERIC_BOUNDARIES_ONLY} = {_DA.ANONYMISE_CODES_AT_NUMERIC_BOUNDARIES_ONLY}
{_AK.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY} = {_DA.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY}
{_AK.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY} = {_DA.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY}
{_AK.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY} = {_DA.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY}
{_AK.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY} = {_DA.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY}

# -----------------------------------------------------------------------------
# Output fields and formatting
# -----------------------------------------------------------------------------

{_AK.TIMEFIELD_NAME} = {_DA.TIMEFIELD_NAME}
{_AK.RESEARCH_ID_FIELDNAME} = {_DA.RESEARCH_ID_FIELDNAME}
{_AK.TRID_FIELDNAME} = {_DA.TRID_FIELDNAME}
{_AK.MASTER_RESEARCH_ID_FIELDNAME} = {_DA.MASTER_RESEARCH_ID_FIELDNAME}
{_AK.SOURCE_HASH_FIELDNAME} = {_DA.SOURCE_HASH_FIELDNAME}

# -----------------------------------------------------------------------------
# Destination database configuration
# See the [destination_database] section for connection details.
# -----------------------------------------------------------------------------

{_AK.MAX_ROWS_BEFORE_COMMIT} = {_DA.MAX_ROWS_BEFORE_COMMIT}
{_AK.MAX_BYTES_BEFORE_COMMIT} = {_DA.MAX_BYTES_BEFORE_COMMIT}
{_AK.TEMPORARY_TABLENAME} = {_DA.TEMPORARY_TABLENAME}

# -----------------------------------------------------------------------------
# Choose databases (defined in their own sections).
# -----------------------------------------------------------------------------

{_AK.SOURCE_DATABASES} =
    sourcedb1
#    sourcedb2
{_AK.DESTINATION_DATABASE} = destination_database
{_AK.ADMIN_DATABASE} = admin_database

# -----------------------------------------------------------------------------
# PROCESSING OPTIONS, TO LIMIT DATA QUANTITY FOR TESTING
# -----------------------------------------------------------------------------

{_AK.DEBUG_MAX_N_PATIENTS} =
{_AK.DEBUG_PID_LIST} =

# -----------------------------------------------------------------------------
# Opting out entirely
# -----------------------------------------------------------------------------

{_AK.OPTOUT_PID_FILENAMES} =
{_AK.OPTOUT_MPID_FILENAMES} =
{_AK.OPTOUT_COL_VALUES} =


# =============================================================================
# Extra regular expression patterns you wish to be scrubbed from the text
# as nonspecific information. See help.
# =============================================================================

[{_AK.SECTION_EXTRA_REGEXES}]


# =============================================================================
# Destination database details. User should have WRITE access.
# =============================================================================

[destination_database]

{_DK.URL} = @@dest_db_url@@


# =============================================================================
# Administrative database. User should have WRITE access.
# =============================================================================

[admin_database]

{_DK.URL} = @@admin_db_url@@


# =============================================================================
# SOURCE DATABASE DETAILS BELOW HERE.
# User should have READ access only for safety.
# =============================================================================

# -----------------------------------------------------------------------------
# Source database example 1
# -----------------------------------------------------------------------------

[sourcedb1]

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # CONNECTION DETAILS
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

{_DK.URL} = @@source_db1_url@@

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # INPUT FIELDS, FOR THE AUTOGENERATION OF DATA DICTIONARIES
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

{_SK.DDGEN_OMIT_BY_DEFAULT} = {_DS.DDGEN_OMIT_BY_DEFAULT}
{_SK.DDGEN_OMIT_FIELDS} =
{_SK.DDGEN_INCLUDE_FIELDS} = @@source_db1_ddgen_include_fields@@
{_SK.DDGEN_PER_TABLE_PID_FIELD} = patient_id
{_SK.DDGEN_TABLE_DEFINES_PIDS} = patient
{_SK.DDGEN_ADD_PER_TABLE_PIDS_TO_SCRUBBER} = {_DS.DDGEN_ADD_PER_TABLE_PIDS_TO_SCRUBBER}
{_SK.DDGEN_MASTER_PID_FIELDNAME} = nhsnum
{_SK.DDGEN_TABLE_DENYLIST} =
{_SK.DDGEN_TABLE_ALLOWLIST} =
{_SK.DDGEN_TABLE_REQUIRE_FIELD_ABSOLUTE} =
{_SK.DDGEN_TABLE_REQUIRE_FIELD_CONDITIONAL} =
{_SK.DDGEN_FIELD_DENYLIST} =
{_SK.DDGEN_FIELD_ALLOWLIST} =
{_SK.DDGEN_PK_FIELDS} =
{_SK.DDGEN_PREFER_ORIGINAL_PK} = {_DS.DDGEN_PREFER_ORIGINAL_PK}
{_SK.DDGEN_CONSTANT_CONTENT} = {_DS.DDGEN_CONSTANT_CONTENT}
{_SK.DDGEN_CONSTANT_CONTENT_TABLES} =
{_SK.DDGEN_NONCONSTANT_CONTENT_TABLES} =
{_SK.DDGEN_ADDITION_ONLY} = {_DS.DDGEN_ADDITION_ONLY}
{_SK.DDGEN_ADDITION_ONLY_TABLES} =
{_SK.DDGEN_DELETION_POSSIBLE_TABLES} =
{_SK.DDGEN_PID_DEFINING_FIELDNAMES} =
{_SK.DDGEN_SCRUBSRC_PATIENT_FIELDS} = @@source_db1_ddgen_scrubsrc_patient_fields@@
{_SK.DDGEN_SCRUBSRC_THIRDPARTY_FIELDS} =
{_SK.DDGEN_SCRUBSRC_THIRDPARTY_XREF_PID_FIELDS} =
{_SK.DDGEN_REQUIRED_SCRUBSRC_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_CODE_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_DATE_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_NUMBER_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_PHRASE_FIELDS} =
{_SK.DDGEN_SAFE_FIELDS_EXEMPT_FROM_SCRUBBING} =
{_SK.DDGEN_MIN_LENGTH_FOR_SCRUBBING} = {_DS.DDGEN_MIN_LENGTH_FOR_SCRUBBING}
{_SK.DDGEN_TRUNCATE_DATE_FIELDS} =
{_SK.DDGEN_FILENAME_TO_TEXT_FIELDS} =
{_SK.DDGEN_BINARY_TO_TEXT_FIELD_PAIRS} =
{_SK.DDGEN_SKIP_ROW_IF_EXTRACT_TEXT_FAILS_FIELDS} =
{_SK.DDGEN_RENAME_TABLES_REMOVE_SUFFIXES} =
{_SK.DDGEN_PATIENT_OPT_OUT_FIELDS} =
{_SK.DDGEN_EXTRA_HASH_FIELDS} =

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # DESTINATION INDEXING
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

{_SK.DDGEN_INDEX_FIELDS} =
{_SK.DDGEN_ALLOW_FULLTEXT_INDEXING} = {_DS.DDGEN_ALLOW_FULLTEXT_INDEXING}

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # DATA DICTIONARY MANIPULATION TO DESTINATION TABLE/FIELD NAMES
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

{_SK.DDGEN_FORCE_LOWER_CASE} = {_DS.DDGEN_FORCE_LOWER_CASE}
{_SK.DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE} = {_DS.DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE}

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # PROCESSING OPTIONS, TO LIMIT DATA QUANTITY FOR TESTING
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

{_SK.DEBUG_ROW_LIMIT} =
{_SK.DEBUG_LIMITED_TABLES} =

# -----------------------------------------------------------------------------
# Source database example 2
# -----------------------------------------------------------------------------

[mysourcedb2]

{_DK.URL} = mysql+mysqldb://username:password@127.0.0.1:3306/source2_databasename?charset=utf8

{_SK.DDGEN_FORCE_LOWER_CASE} = {_DS.DDGEN_FORCE_LOWER_CASE}
{_SK.DDGEN_APPEND_SOURCE_INFO_TO_COMMENT} = {_DS.DDGEN_APPEND_SOURCE_INFO_TO_COMMENT}
{_SK.DDGEN_PER_TABLE_PID_FIELD} = patient_id
{_SK.DDGEN_MASTER_PID_FIELDNAME} = nhsnum
{_SK.DDGEN_TABLE_DENYLIST} =
{_SK.DDGEN_FIELD_DENYLIST} =
{_SK.DDGEN_TABLE_REQUIRE_FIELD_ABSOLUTE} =
{_SK.DDGEN_TABLE_REQUIRE_FIELD_CONDITIONAL} =
{_SK.DDGEN_PK_FIELDS} =
{_SK.DDGEN_PREFER_ORIGINAL_PK} = {_DS.DDGEN_PREFER_ORIGINAL_PK}
{_SK.DDGEN_CONSTANT_CONTENT} = {_DS.DDGEN_CONSTANT_CONTENT}
{_SK.DDGEN_SCRUBSRC_PATIENT_FIELDS} =
{_SK.DDGEN_SCRUBSRC_THIRDPARTY_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_CODE_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_DATE_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_NUMBER_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_PHRASE_FIELDS} =
{_SK.DDGEN_SAFE_FIELDS_EXEMPT_FROM_SCRUBBING} =
{_SK.DDGEN_MIN_LENGTH_FOR_SCRUBBING} = {_DS.DDGEN_MIN_LENGTH_FOR_SCRUBBING}
{_SK.DDGEN_TRUNCATE_DATE_FIELDS} =
{_SK.DDGEN_FILENAME_TO_TEXT_FIELDS} =
{_SK.DDGEN_BINARY_TO_TEXT_FIELD_PAIRS} =

# -----------------------------------------------------------------------------
# Source database example 3
# -----------------------------------------------------------------------------

[camcops]
# Example for the CamCOPS anonymisation staging database

{_DK.URL} = mysql+mysqldb://username:password@127.0.0.1:3306/camcops_databasename?charset=utf8

# FOR EXAMPLE:
{_SK.DDGEN_FORCE_LOWER_CASE} = False
{_SK.DDGEN_PER_TABLE_PID_FIELD} = _patient_idnum1
{_SK.DDGEN_PID_DEFINING_FIELDNAMES} = _patient_idnum1
{_SK.DDGEN_MASTER_PID_FIELDNAME} = _patient_idnum2
{_SK.DDGEN_TABLE_DENYLIST} =
{_SK.DDGEN_FIELD_DENYLIST} = _patient_iddesc1
    _patient_idshortdesc1
    _patient_iddesc2
    _patient_idshortdesc2
    _patient_iddesc3
    _patient_idshortdesc3
    _patient_iddesc4
    _patient_idshortdesc4
    _patient_iddesc5
    _patient_idshortdesc5
    _patient_iddesc6
    _patient_idshortdesc6
    _patient_iddesc7
    _patient_idshortdesc7
    _patient_iddesc8
    _patient_idshortdesc8
    id
    patient_id
    _device
    _era
    _current
    _when_removed_exact
    _when_removed_batch_utc
    _removing_user
    _preserving_user
    _forcibly_preserved
    _predecessor_pk
    _successor_pk
    _manually_erased
    _manually_erased_at
    _manually_erasing_user
    _addition_pending
    _removal_pending
    _move_off_tablet

{_SK.DDGEN_TABLE_REQUIRE_FIELD_ABSOLUTE} =
{_SK.DDGEN_TABLE_REQUIRE_FIELD_CONDITIONAL} =
{_SK.DDGEN_PK_FIELDS} = _pk
{_SK.DDGEN_PREFER_ORIGINAL_PK} = {_DS.DDGEN_PREFER_ORIGINAL_PK}
{_SK.DDGEN_CONSTANT_CONTENT} = False
{_SK.DDGEN_SCRUBSRC_PATIENT_FIELDS} = _patient_forename
    _patient_surname
    _patient_dob
    _patient_idnum1
    _patient_idnum2
    _patient_idnum3
    _patient_idnum4
    _patient_idnum5
    _patient_idnum6
    _patient_idnum7
    _patient_idnum8
{_SK.DDGEN_SCRUBSRC_THIRDPARTY_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_CODE_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_DATE_FIELDS} = _patient_dob
{_SK.DDGEN_SCRUBMETHOD_NUMBER_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_PHRASE_FIELDS} =
{_SK.DDGEN_SAFE_FIELDS_EXEMPT_FROM_SCRUBBING} = _device
    _era
    _when_added_exact
    _adding_user
    _when_removed_exact
    _removing_user
    _preserving_user
    _manually_erased_at
    _manually_erasing_user
    when_last_modified
    when_created
    when_firstexit
    clinician_specialty
    clinician_name
    clinician_post
    clinician_professional_registration
    clinician_contact_details
# ... now some task-specific ones
    bdi_scale
    pause_start_time
    pause_end_time
    trial_start_time
    cue_start_time
    target_start_time
    detection_start_time
    iti_start_time
    iti_end_time
    trial_end_time
    response_time
    target_time
    choice_time
    discharge_date
    discharge_reason_code
    diagnosis_psych_1_icd10code
    diagnosis_psych_1_description
    diagnosis_psych_2_icd10code
    diagnosis_psych_2_description
    diagnosis_psych_3_icd10code
    diagnosis_psych_3_description
    diagnosis_psych_4_icd10code
    diagnosis_psych_4_description
    diagnosis_medical_1
    diagnosis_medical_2
    diagnosis_medical_3
    diagnosis_medical_4
    category_start_time
    category_response_time
    category_chosen
    gamble_fixed_option
    gamble_lottery_option_p
    gamble_lottery_option_q
    gamble_start_time
    gamble_response_time
    likelihood
{_SK.DDGEN_MIN_LENGTH_FOR_SCRUBBING} = {_DS.DDGEN_MIN_LENGTH_FOR_SCRUBBING}
{_SK.DDGEN_TRUNCATE_DATE_FIELDS} = _patient_dob
{_SK.DDGEN_FILENAME_TO_TEXT_FIELDS} =
{_SK.DDGEN_BINARY_TO_TEXT_FIELD_PAIRS} =

"""  # noqa: E501

# For the style:
#       [source_databases]
#       source1 = blah
#       source2 = thing
# ... you can't have multiple keys with the same name.
# https://stackoverflow.com/questions/287757
class PatientInfoConstants:
    """
    Table and field names (as strings) for patient information.
    """

    SECRET_MAP_TABLENAME = "secret_map"
    PID_FIELDNAME = "pid"
    MPID_FIELDNAME = "mpid"
    RID_FIELDNAME = "rid"
    MRID_FIELDNAME = "mrid"
    TRID_FIELDNAME = "trid"