Coverage for cc_modules/cc_anon.py : 17%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#!/usr/bin/env python
3"""
4camcops_server/cc_modules/cc_anon.py
6===============================================================================
8 Copyright (C) 2012-2020 Rudolf Cardinal (rudolf@pobox.com).
10 This file is part of CamCOPS.
12 CamCOPS is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
17 CamCOPS is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with CamCOPS. If not, see <https://www.gnu.org/licenses/>.
25===============================================================================
27**Anonymisation functions.**
29Largely superseded by CRATE (https://doi.org/10.1186%2Fs12911-017-0437-1).
31"""
33from collections import OrderedDict
34import csv
35import sys
36from typing import (
37 Dict, List, Generator, TextIO, Tuple, TYPE_CHECKING, Union,
38)
40from cardinal_pythonlib.sqlalchemy.orm_inspect import coltype_as_typeengine
41from cardinal_pythonlib.sqlalchemy.schema import (
42 convert_sqla_type_for_dialect,
43 does_sqlatype_require_index_len,
44 is_sqlatype_date,
45 is_sqlatype_text_of_length_at_least,
46 RE_COLTYPE_WITH_ONE_PARAM,
47)
48from cardinal_pythonlib.sqlalchemy.session import SQLITE_MEMORY_URL
49# from sqlalchemy.dialects.mssql.base import MSDialect
50from sqlalchemy.dialects.mysql.base import MySQLDialect
51from sqlalchemy.engine import create_engine
52from sqlalchemy.engine.interfaces import Dialect
53from sqlalchemy.orm import Session as SqlASession, sessionmaker
54from sqlalchemy.sql.schema import Column
56from camcops_server.cc_modules.cc_constants import TABLET_ID_FIELD
57from camcops_server.cc_modules.cc_db import FN_PK
58from camcops_server.cc_modules.cc_dump import DumpController
59from camcops_server.cc_modules.cc_patient import Patient
60from camcops_server.cc_modules.cc_patientidnum import (
61 extra_id_colname,
62 EXTRA_IDNUM_FIELD_PREFIX,
63)
64from camcops_server.cc_modules.cc_simpleobjects import TaskExportOptions
65from camcops_server.cc_modules.cc_sqla_coltypes import CamcopsColumn
67if TYPE_CHECKING:
68 from camcops_server.cc_modules.cc_exportrecipientinfo import ExportRecipientInfo # noqa
69 from camcops_server.cc_modules.cc_request import CamcopsRequest
71# =============================================================================
72# Constants
73# =============================================================================
75MIN_STRING_LENGTH_TO_CONSIDER_SCRUBBING = 256
78# =============================================================================
79# Write data dictionaries for anonymisation tools
80# =============================================================================
82def _gen_columns_for_anon_staging_db(req: "CamcopsRequest",
83 recipient: "ExportRecipientInfo") \
84 -> Generator[Union[Column, CamcopsColumn], None, None]:
85 """
86 Generates all columns for an anonymisation staging database.
87 """
88 url = SQLITE_MEMORY_URL
89 engine = create_engine(url, echo=False)
90 session = sessionmaker(bind=engine)() # type: SqlASession
91 export_options = TaskExportOptions(
92 include_blobs=recipient.db_include_blobs,
93 db_patient_id_per_row=recipient.db_patient_id_per_row,
94 db_make_all_tables_even_empty=True,
95 db_include_summaries=recipient.db_add_summaries,
96 )
98 dc = DumpController(dst_engine=engine,
99 dst_session=session,
100 export_options=export_options,
101 req=req)
102 for col in dc.gen_all_dest_columns():
103 yield col
106# -----------------------------------------------------------------------------
107# CRIS
108# -----------------------------------------------------------------------------
110def _get_type_size_as_text_from_sqltype(sqltype: str) -> Tuple[str, str]:
111 """
112 Splits SQL size definitions like ``VARCHAR(10)`` into tuples like
113 ``('VARCHAR', '10')`` If it doesn't fit that format, return
114 ``(sqltype, '')``.
115 """
116 m = RE_COLTYPE_WITH_ONE_PARAM.match(sqltype)
117 if m is not None:
118 finaltype = m.group('type').upper()
119 size = m.group('size').strip().upper()
120 else:
121 size = ""
122 finaltype = sqltype
123 return finaltype, size
126# noinspection PyUnusedLocal
127def _get_cris_dd_row(column: Union[Column, CamcopsColumn, None],
128 recipient: "ExportRecipientInfo",
129 dest_dialect: Dialect = None) -> Dict:
130 """
131 Args:
132 column:
133 A column specification (or ``None`` to create a dummy dictionary).
134 dest_dialect:
135 The SQL dialect of the destination database. If ``None``, then
136 MySQL is used as the default.
138 Returns:
139 An :class:`OrderedDict` with information for a CRIS data dictionary
140 row.
141 """
142 dest_dialect = dest_dialect or MySQLDialect() # MSDialect() for SQL Server
143 valid_values = None
144 if column is None:
145 # Dummy row
146 colname = None
147 tablename = None
148 taskname = None
149 comment = None
150 feft = None
151 security_status = None
152 finaltype = None
153 tlfa = None
154 size = None
155 else:
156 colname = column.name
157 tablename = column.table.name
158 taskname = tablename
159 comment = column.comment
160 coltype = coltype_as_typeengine(column.type)
161 is_free_text = is_sqlatype_text_of_length_at_least(
162 coltype, min_length=MIN_STRING_LENGTH_TO_CONSIDER_SCRUBBING)
163 exempt_from_anonymisation = False
164 identifies_patient = False
166 if isinstance(column, CamcopsColumn):
167 exempt_from_anonymisation = column.exempt_from_anonymisation
168 identifies_patient = column.identifies_patient
169 if column.permitted_value_checker:
170 valid_values = column.permitted_value_checker.permitted_values_csv() # noqa
172 needs_scrubbing = is_free_text and not exempt_from_anonymisation
174 # Tag list - fields anon
175 tlfa = "Y" if needs_scrubbing else ""
177 # Destination SQL type
178 desttype = convert_sqla_type_for_dialect(
179 coltype=coltype,
180 dialect=dest_dialect,
181 strip_collation=True,
182 expand_for_scrubbing=needs_scrubbing,
183 )
184 destsqltype = desttype.compile(dialect=dest_dialect)
185 finaltype, size = _get_type_size_as_text_from_sqltype(destsqltype)
187 # Security status
188 system_id = (
189 colname == TABLET_ID_FIELD or
190 colname.endswith("_id")
191 )
192 patient_idnum_field = colname.startswith(EXTRA_IDNUM_FIELD_PREFIX)
193 internal_field = colname.startswith("_")
194 if identifies_patient and (tablename == Patient.__tablename__ and
195 colname == Patient.dob.name):
196 security_status = 3 # truncate (e.g. DOB, postcode)
197 elif identifies_patient and tablename == Patient.__tablename__:
198 security_status = 2 # use to scrub
199 elif system_id or internal_field or identifies_patient:
200 security_status = 1 # drop (e.g. for pointless internal keys)
201 else:
202 security_status = 4 # bring through
204 # Front end field type
205 if system_id or patient_idnum_field:
206 feft = 34 # patient ID; other internal keys
207 elif is_sqlatype_date(coltype):
208 feft = 4 # dates
209 elif is_free_text:
210 feft = 3 # giant free text, I think
211 elif valid_values is not None:
212 feft = 2 # picklist
213 else:
214 feft = 1 # text, numbers
216 return OrderedDict([
217 ("Tab", "CamCOPS"),
218 ("Form name", taskname),
219 ("CRIS tree label", colname),
220 ("Source system table name", tablename),
221 ("SQL column name", colname),
222 ("Front end field type", feft),
223 ("Valid values", valid_values),
224 ("Result column name", colname),
225 ("Family doc tab name", ""),
226 ("Family doc form name", ""),
227 ("Security status", security_status),
228 ("Exclude", ""),
229 ("End SQL Type", finaltype),
230 ("Header field (Y/N)", ""),
231 ("Header field name", ""),
232 ("Header field active (Y/N)", ""),
233 ("View name", ""),
234 ("Exclude from family doc", ""),
235 ("Tag list - fields anon", tlfa),
236 ("Anon type", ""), # formerly "Additional info"
237 ("Form start date", ""),
238 ("Form end date", ""),
239 ("Source", ""),
240 ("Size", size),
241 ("Header logic", ""),
242 ("Patient/contact", ""),
243 ("Comments", comment),
244 ])
247def write_cris_data_dictionary(req: "CamcopsRequest",
248 recipient: "ExportRecipientInfo",
249 file: TextIO = sys.stdout) -> None:
250 """
251 Generates a draft CRIS data dictionary.
253 CRIS is an anonymisation tool. See
255 - Stewart R, Soremekun M, Perera G, Broadbent M, Callard F, Denis M, Hotopf
256 M, Thornicroft G, Lovestone S (2009).
257 The South London and Maudsley NHS Foundation Trust Biomedical Research
258 Centre (SLAM BRC) case register: development and descriptive data.
259 *BMC Psychiatry* 9: 51.
260 https://www.ncbi.nlm.nih.gov/pubmed/19674459
262 - Fernandes AC, Cloete D, Broadbent MT, Hayes RD, Chang CK, Jackson RG,
263 Roberts A, Tsang J, Soncul M, Liebscher J, Stewart R, Callard F (2013).
264 Development and evaluation of a de-identification procedure for a case
265 register sourced from mental health electronic records.
266 *BMC Med Inform Decis Mak.* 13: 71.
267 https://www.ncbi.nlm.nih.gov/pubmed/23842533
269 Args:
270 req: a :class:`camcops_server.cc_modules.cc_request.CamcopsRequest`
271 recipient: a :class:`camcops_server.cc_modules.cc_exportrecipientinfo.ExportRecipientInfo`
272 file: output file
273 """ # noqa
274 dummy = _get_cris_dd_row(column=None, recipient=recipient)
275 wr = csv.DictWriter(file, fieldnames=list(dummy.keys()))
276 wr.writeheader()
277 for col in _gen_columns_for_anon_staging_db(req, recipient):
278 d = _get_cris_dd_row(column=col, recipient=recipient)
279 wr.writerow(d)
282# -----------------------------------------------------------------------------
283# CRATE
284# -----------------------------------------------------------------------------
286def _get_crate_dd_row(column: Union[Column, CamcopsColumn, None],
287 recipient: "ExportRecipientInfo",
288 dest_dialect: Dialect = None,
289 src_db: str = "camcops",
290 default_indexlen: int = 100) -> Dict:
291 """
292 Args:
293 column:
294 A column specification (or ``None`` to create a dummy dictionary).
295 recipient:
296 a :class:`camcops_server.cc_modules.cc_exportrecipientinfo.ExportRecipientInfo`
297 dest_dialect:
298 The SQL dialect of the destination database. If ``None``, then
299 MySQL is used as the default.
300 src_db:
301 Value to be used for the "src_db" field.
302 default_indexlen:
303 Default index length for fields that require one.
305 Returns:
306 An :class:`OrderedDict` with information for a CRATE data dictionary
307 row.
308 """ # noqa
309 dest_dialect = dest_dialect or MySQLDialect()
310 exempt_from_anonymisation = False
311 identifies_patient = False
312 identifies_respondent = False
313 force_include = False
314 if column is None:
315 # Dummy row
316 colname = None
317 tablename = None
318 comment = None
319 coltype = None
320 needs_scrubbing = False
321 desttype = None
322 destsqltype = None
323 else:
324 colname = column.name
325 tablename = column.table.name
326 comment = column.comment
327 coltype = coltype_as_typeengine(column.type)
328 is_free_text = is_sqlatype_text_of_length_at_least(
329 coltype, min_length=MIN_STRING_LENGTH_TO_CONSIDER_SCRUBBING)
331 if isinstance(column, CamcopsColumn):
332 exempt_from_anonymisation = column.exempt_from_anonymisation
333 identifies_patient = column.identifies_patient
334 force_include = column.include_in_anon_staging_db
336 needs_scrubbing = is_free_text and not exempt_from_anonymisation
337 desttype = convert_sqla_type_for_dialect(
338 coltype=coltype,
339 dialect=dest_dialect,
340 strip_collation=True,
341 expand_for_scrubbing=needs_scrubbing,
342 )
343 destsqltype = desttype.compile(dialect=dest_dialect)
345 # src_flags
346 src_flags = [] # type: List[str]
347 primary_key = colname == FN_PK
348 if primary_key:
349 src_flags.extend(["K", "C"])
350 primary_pid = (
351 recipient.db_patient_id_per_row and # otherwise just in PatientIdNum
352 recipient.primary_idnum and
353 colname == extra_id_colname(recipient.primary_idnum)
354 )
355 if primary_pid:
356 src_flags.append("P")
357 defines_primary_pids = False # no single unique table for this...
358 if defines_primary_pids:
359 src_flags.append("*")
360 master_pid = False # not supported for now
361 if master_pid:
362 src_flags.append("M")
364 # scrub_src
365 if identifies_patient and tablename == Patient.__tablename__:
366 scrub_src = "patient"
367 elif identifies_respondent:
368 scrub_src = "thirdparty"
369 else:
370 scrub_src = None
372 # scrub_method
373 scrub_method = None # default is fine
375 # Include in output?
376 include = (
377 force_include or
378 primary_key or
379 primary_pid or
380 master_pid or
381 not (identifies_patient or identifies_respondent)
382 )
384 # alter_method
385 if needs_scrubbing:
386 alter_method = "scrub"
387 elif tablename == Patient.__tablename__ and colname == Patient.dob.name:
388 alter_method = "truncate_date"
389 else:
390 alter_method = None
392 # Indexing
393 crate_index = None
394 crate_indexlen = None
395 if column is not None and column.index:
396 crate_index = "U" if column.unique else "I"
397 if does_sqlatype_require_index_len(desttype):
398 crate_indexlen = default_indexlen
400 return OrderedDict([
401 ("src_db", src_db),
402 ("src_table", tablename),
403 ("src_field", colname),
404 ("src_datatype", str(coltype)),
405 ("src_flags", "".join(src_flags) if src_flags else None),
406 ("scrub_src", scrub_src),
407 ("scrub_method", scrub_method),
408 ("decision", "include" if include else "OMIT"),
409 ("inclusion_values", None),
410 ("exclusion_values", None),
411 ("alter_method", alter_method),
412 ("dest_table", tablename),
413 ("dest_field", colname),
414 ("dest_datatype", destsqltype),
415 ("index", crate_index),
416 ("indexlen", crate_indexlen),
417 ("comment", comment),
418 ])
421def write_crate_data_dictionary(req: "CamcopsRequest",
422 recipient: "ExportRecipientInfo",
423 file: TextIO = sys.stdout) -> None:
424 """
425 Generates a draft CRATE data dictionary.
427 CRATE is an anonymisation tool. See:
429 - Cardinal RN (2017).
430 Clinical records anonymisation and text extraction (CRATE): an
431 open-source software system.
432 *BMC Medical Informatics and Decision Making* 17: 50.
433 https://www.pubmed.gov/28441940;
434 https://doi.org/10.1186/s12911-017-0437-1.
436 - https://crateanon.readthedocs.io/
438 Args:
439 req: a :class:`camcops_server.cc_modules.cc_request.CamcopsRequest`
440 recipient: a :class:`camcops_server.cc_modules.cc_exportrecipientinfo.ExportRecipientInfo`
441 file: output file
442 """ # noqa
443 dummy = _get_crate_dd_row(column=None, recipient=recipient)
444 wr = csv.DictWriter(file, fieldnames=list(dummy.keys()))
445 wr.writeheader()
446 for col in _gen_columns_for_anon_staging_db(req, recipient):
447 d = _get_crate_dd_row(column=col, recipient=recipient)
448 wr.writerow(d)