Coverage for crateweb/research/research_db_info.py: 27%
481 statements
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
1"""
2crate_anon/crateweb/research/research_db_info.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Represents information about the structure of our research database(s).**
28"""
30from collections import OrderedDict
32import logging
33import re
34from typing import Any, Dict, List, Optional, Set
36from cardinal_pythonlib.dbfunc import dictfetchall
37from cardinal_pythonlib.django.django_constants import ConnectionVendors
38from cardinal_pythonlib.django.function_cache import django_cache_function
39from cardinal_pythonlib.excel import excel_to_bytes
40from cardinal_pythonlib.json_utils.serialize import (
41 METHOD_NO_ARGS,
42 register_for_json,
43)
44from cardinal_pythonlib.logs import BraceStyleAdapter
45from cardinal_pythonlib.reprfunc import auto_repr
46from cardinal_pythonlib.sql.sql_grammar import SqlGrammar
47from cardinal_pythonlib.sql.validation import (
48 SQLTYPES_TEXT,
49 SQLTYPES_WITH_DATE,
50)
51from cardinal_pythonlib.sqlalchemy.dialect import SqlaDialectName
52from cardinal_pythonlib.sqlalchemy.schema import (
53 MSSQL_DEFAULT_SCHEMA,
54 POSTGRES_DEFAULT_SCHEMA,
55)
56from cardinal_pythonlib.tsv import dictlist_to_tsv
57from django.db import connections
58from django.db.backends.base.base import BaseDatabaseWrapper
59from django.conf import settings
60from openpyxl import Workbook
61from requests.structures import CaseInsensitiveDict
63from crate_anon.common.constants import RUNNING_WITHOUT_CONFIG
64from crate_anon.common.sql import (
65 ColumnId,
66 QB_DATATYPE_DATE,
67 QB_DATATYPE_FLOAT,
68 QB_DATATYPE_INTEGER,
69 QB_DATATYPE_STRING,
70 QB_DATATYPE_STRING_FULLTEXT,
71 QB_DATATYPE_UNKNOWN,
72 SQLTYPES_FLOAT_OR_OTHER_NUMERIC,
73 SQLTYPES_INTEGER_OR_BIT,
74 SchemaId,
75 SqlArgsTupleType,
76 TableId,
77 is_sql_column_type_textual,
78 make_grammar,
79 translate_sql_qmark_to_percent,
80)
81from crate_anon.crateweb.core.constants import (
82 RESEARCH_DB_CONNECTION_NAME,
83 SettingsKeys,
84)
85from crate_anon.crateweb.config.constants import ResearchDbInfoKeys
87log = BraceStyleAdapter(logging.getLogger(__name__))
90# =============================================================================
91# Constants
92# =============================================================================
94SUPPORTED_DIALECTS = [
95 SqlaDialectName.MSSQL,
96 SqlaDialectName.MYSQL,
97 # SqlaDialectName.POSTGRES, # no grammar yet
98]
101class PatientFieldPythonTypes:
102 """
103 Represents Python types for each type of patient ID field.
104 """
106 PID = int
107 MPID = int
108 RID = str
109 MRID = str
110 TRID = int
113# =============================================================================
114# Information about the research database
115# =============================================================================
118class ColumnInfo:
119 """
120 Represents information about a database column, reflected from the
121 database.
123 See also ``querybuilder.js``.
124 """
126 def __init__(self, **kwargs) -> None:
127 # Different databases may vary the case, although the column headings
128 # are ANSI standard.
129 cid = CaseInsensitiveDict(kwargs)
130 try:
131 self.table_catalog = cid["table_catalog"] # type: str
132 self.table_schema = cid["table_schema"] # type: str
133 self.table_name = cid["table_name"] # type: str
134 self.column_name = cid["column_name"] # type: str
135 self.is_nullable = bool(cid["is_nullable"])
136 self.column_type = cid["column_type"] # type: str
137 self.column_comment = cid["column_comment"] # type: str
138 self.indexed = bool(cid["indexed"])
139 self.indexed_fulltext = bool(cid["indexed_fulltext"])
140 except KeyError:
141 log.critical(
142 "Information missing from column info dictionary: {}", kwargs
143 )
144 raise
146 @property
147 def basetype(self) -> str:
148 """
149 Returns the SQL base type, such as ``VARCHAR``.
150 """
151 return self.column_type.split("(")[0].upper()
153 @property
154 def querybuilder_type(self) -> str:
155 """
156 Returns a string that is defined in ``querybuilder.js`` and that
157 defines our field type, like ``"int"`` or ``"date"``. See source.
158 """
159 basetype = self.basetype
160 if basetype in SQLTYPES_FLOAT_OR_OTHER_NUMERIC:
161 return QB_DATATYPE_FLOAT
162 if basetype in SQLTYPES_WITH_DATE:
163 return QB_DATATYPE_DATE
164 if basetype in SQLTYPES_TEXT:
165 if self.indexed_fulltext:
166 return QB_DATATYPE_STRING_FULLTEXT
167 else:
168 return QB_DATATYPE_STRING
169 if basetype in SQLTYPES_INTEGER_OR_BIT:
170 return QB_DATATYPE_INTEGER
171 return QB_DATATYPE_UNKNOWN
173 @property
174 def column_id(self) -> ColumnId:
175 """
176 Returns a :class:`crate_anon.common.sql.ColumnId` describing this
177 column.
178 """
179 return ColumnId(
180 db=self.table_catalog,
181 schema=self.table_schema,
182 table=self.table_name,
183 column=self.column_name,
184 )
186 @property
187 def table_id(self) -> TableId:
188 """
189 Returns a :class:`crate_anon.common.sql.TableId` describing this
190 column's table.
191 """
192 return TableId(
193 db=self.table_catalog,
194 schema=self.table_schema,
195 table=self.table_name,
196 )
198 def __repr__(self) -> str:
199 return auto_repr(self, sort_attrs=False)
201 def __str__(self) -> str:
202 return str(self.column_id)
205class SingleResearchDatabase:
206 """
207 Represents, and adds information to, a single entry from the
208 ``RESEARCH_DB_INFO`` list: that is, a research database. (It's a list
209 because it's ordered.)
210 """
212 def __init__(
213 self,
214 index: int,
215 grammar: SqlGrammar,
216 rdb_info: "ResearchDatabaseInfo",
217 connection: BaseDatabaseWrapper,
218 ) -> None:
219 """
220 Instantiates, reading database information as follows:
222 - the relevant dictionary is ``settings.RESEARCH_DB_INFO[index]``
223 - the keys to that dictionary are in
224 :class:`crate_anon.crateweb.config.constants.ResearchDbInfoKeys`
226 Args:
227 index:
228 Python zero-based index to ``settings.RESEARCH_DB_INFO``
229 grammar:
230 a :class:`cardinal_pythonlib.sql.sql_grammar.SqlGrammar`,
231 for the SQL dialect we're using
232 rdb_info:
233 a :class:`ResearchDatabaseInfo`, describing the structure of
234 the research database
235 connection:
236 a :class:`django.db.backends.base.base.BaseDatabaseWrapper`,
237 i.e. a Django database connection. This includes
238 connection.vendor, the Django database vendor name; see e.g.
239 https://docs.djangoproject.com/en/2.1/ref/models/options/
240 """
241 assert 0 <= index <= len(settings.RESEARCH_DB_INFO)
242 infodict = settings.RESEARCH_DB_INFO[index]
244 # Don't store self.connection; the Django cache will pickle and it is
245 # not pickleable.
247 self.index = index
248 self.is_first_db = index == 0
249 self.grammar = grammar
250 self.rdb_info = rdb_info
252 try:
253 self.name = infodict[ResearchDbInfoKeys.NAME] # type: str
254 self.description = infodict[
255 ResearchDbInfoKeys.DESCRIPTION
256 ] # type: str
257 self.database = infodict[ResearchDbInfoKeys.DATABASE] # type: str
258 self.schema_name = infodict[ResearchDbInfoKeys.SCHEMA] # type: str
259 self.trid_field = infodict[
260 ResearchDbInfoKeys.TRID_FIELD
261 ] # type: str
262 self.rid_field = infodict[
263 ResearchDbInfoKeys.RID_FIELD
264 ] # type: str
265 self.rid_family = infodict[
266 ResearchDbInfoKeys.RID_FAMILY
267 ] # type: int
268 self.mrid_table = infodict[
269 ResearchDbInfoKeys.MRID_TABLE
270 ] # type: str
271 self.mrid_field = infodict[
272 ResearchDbInfoKeys.MRID_FIELD
273 ] # type: str
274 except KeyError as e:
275 raise KeyError(
276 f"Key {e} is missing from settings.RESEARCH_DB_INFO "
277 f"for this dict: {infodict!r}"
278 )
280 assert isinstance(self.name, str) and self.name # no blanks
281 assert re.match(r"^\w+$", self.name), (
282 f"Database name {self.name!r} should contain only "
283 f"alphanumeric/underscore characters"
284 )
285 assert (
286 isinstance(self.description, str) and self.description
287 ) # no blanks
289 assert isinstance(self.database, str) # may be blank
290 assert (
291 isinstance(self.schema_name, str) and self.schema_name
292 ) # no blanks
294 assert isinstance(self.trid_field, str) # may be blank
295 assert isinstance(self.rid_field, str) and self.rid_field # no blanks
296 assert isinstance(self.rid_family, int) # may be blank
297 assert self.rid_family > 0 # positive integers only
298 assert isinstance(self.mrid_table, str) # may be blank
299 assert isinstance(self.mrid_field, str) # may be blank
301 self.pid_pseudo_field = infodict.get(
302 ResearchDbInfoKeys.PID_PSEUDO_FIELD, ""
303 ) # type: str
304 self.mpid_pseudo_field = infodict.get(
305 ResearchDbInfoKeys.MPID_PSEUDO_FIELD, ""
306 ) # type: str
307 assert isinstance(
308 self.pid_pseudo_field, str
309 ) # may be blank unless it's a lookup DB
310 assert isinstance(
311 self.mpid_pseudo_field, str
312 ) # may be blank unless it's a lookup DB
314 self.pid_description = infodict.get(
315 ResearchDbInfoKeys.PID_DESCRIPTION,
316 "Patient ID (PID) for database " + self.description,
317 ) # type: str
318 self.mpid_description = infodict.get(
319 ResearchDbInfoKeys.MPID_DESCRIPTION, "Master patient ID (MPID)"
320 ) # type: str
321 self.rid_description = infodict.get(
322 ResearchDbInfoKeys.RID_DESCRIPTION,
323 "Research ID (RID) for database " + self.description,
324 ) # type: str
325 self.mrid_description = infodict.get(
326 ResearchDbInfoKeys.MRID_DESCRIPTION, "Master research ID (MRID)"
327 ) # type: str
328 self.trid_description = infodict.get(
329 ResearchDbInfoKeys.TRID_DESCRIPTION,
330 "Transient research ID (TRID) for database " + self.description,
331 ) # type: str
332 assert isinstance(self.pid_description, str)
333 assert isinstance(self.mpid_description, str)
334 assert isinstance(self.rid_description, str)
335 assert isinstance(self.mrid_description, str)
336 assert isinstance(self.trid_description, str)
338 self.secret_lookup_db = infodict.get(
339 ResearchDbInfoKeys.SECRET_LOOKUP_DB, ""
340 )
341 assert isinstance(self.secret_lookup_db, str)
342 if self.secret_lookup_db:
343 assert self.secret_lookup_db in settings.DATABASES, (
344 f"Research database named {self.name!r} in "
345 f"settings.RESEARCH_DB_INFO has an invalid "
346 f"secret_lookup_db: {self.secret_lookup_db!r}"
347 )
348 assert re.match(r"^\w+$", self.pid_pseudo_field), (
349 f"The research database named {self.name!r} should have a "
350 f"pid_pseudo_field containing only alphanumeric/underscore "
351 f"characters (it's {self.pid_pseudo_field!r})"
352 )
353 assert re.match(r"^\w+$", self.mpid_pseudo_field), (
354 f"The research database named {self.name!r} should have a "
355 f"mpid_pseudo_field containing only alphanumeric/underscore "
356 f"characters (it's {self.mpid_pseudo_field!r})"
357 )
359 self.date_fields_by_table = infodict.get(
360 ResearchDbInfoKeys.DATE_FIELDS_BY_TABLE, {}
361 ) # type: Dict[str, str]
362 assert isinstance(self.date_fields_by_table, dict)
363 for k, v in self.date_fields_by_table.items():
364 assert isinstance(k, str) and k, (
365 f"Bad key {k!r} for {ResearchDbInfoKeys.DATE_FIELDS_BY_TABLE} "
366 f"for database named {self.name!r}"
367 )
368 assert isinstance(v, str) and v, (
369 f"Bad value {v!r} for "
370 f"{ResearchDbInfoKeys.DATE_FIELDS_BY_TABLE} "
371 f"for database named {self.name!r}"
372 )
374 self.default_date_fields = infodict.get(
375 ResearchDbInfoKeys.DEFAULT_DATE_FIELDS, []
376 ) # type: List[str]
377 assert isinstance(self.default_date_fields, list)
378 for v in self.default_date_fields:
379 assert isinstance(v, str) and v, (
380 f"Bad item {v!r} for {ResearchDbInfoKeys.DEFAULT_DATE_FIELDS} "
381 f"for database named {self.name!r}"
382 )
384 # Field for when the record was last updated in db
385 self.update_date_field = infodict.get(
386 ResearchDbInfoKeys.UPDATE_DATE_FIELD, ""
387 )
388 assert isinstance(self.update_date_field, str)
390 self.schema_id = SchemaId(self.database, self.schema_name)
391 assert self.schema_id
393 # Now discover the schema
394 self._schema_infodictlist = self.get_schema_infodictlist(connection)
395 self._colinfolist = None # type: Optional[List[ColumnInfo]]
397 @property
398 def schema_infodictlist(self) -> List[Dict[str, Any]]:
399 """
400 Discovers the schema. Returns the results of
401 :meth:`get_schema_infodictlist` for our connection and vendor.
402 Implements caching.
403 """
404 return self._schema_infodictlist
406 @property
407 def colinfolist(self) -> List[ColumnInfo]:
408 """
409 Returns a list of :class:`ColumnInfo` objects for our research
410 database.
411 """
412 if self._colinfolist is None:
413 self._colinfolist = [
414 ColumnInfo(**d) for d in self.schema_infodictlist
415 ]
416 return self._colinfolist
418 @property
419 def schema_identifier(self) -> str:
420 """
421 Returns the SQL schema identifier.
422 """
423 return self.schema_id.identifier(self.grammar)
425 @property
426 def eligible_for_query_builder(self) -> bool:
427 """
428 Is this database eligible to participate in the CRATE assisted query
429 builder?
431 This is ``True`` if it's the first database, or if it can link directly
432 to the first database (shares a common RID), or if it and the first
433 database share a common MRID.
434 """
435 if self.is_first_db:
436 # First one: always eligible
437 return True
438 first_db = self.rdb_info.first_dbinfo
439 return (
440 first_db.talks_to_world and self.talks_to_world
441 ) or self.can_communicate_directly(first_db)
443 @property
444 def talks_to_world(self) -> bool:
445 """
446 Does this database "talk to the world", i.e. have an MRID?
447 """
448 return self.has_mrid
450 @property
451 def has_mrid(self) -> bool:
452 """
453 Does this database have a defined master research ID (MRID), i.e. a
454 table with one row per patient and one column that is the MRID?
455 """
456 return bool(self.mrid_table and self.mrid_field)
458 def can_communicate_directly(
459 self, other: "SingleResearchDatabase"
460 ) -> bool:
461 """
462 Can this database "talk" (link, join) to another?
464 Args:
465 other: the other :class:`SingleResearchDatabase`
467 Returns:
468 ``True`` if they are the same database or share a common RID type
470 """
471 if self.schema_id == other.schema_id:
472 return True
473 return self.rid_family == other.rid_family
475 def get_default_date_field(self, table_id: TableId) -> Optional[ColumnId]:
476 """
477 Gets the default date column for the specified table, or ``None`` if
478 none exists.
480 Args:
481 table_id: a :class:`crate_anon.common.sql.TableId`
483 Returns:
484 a :class:`crate_anon.common.sql.ColumnId`, or ``None``
485 """
486 if table_id.table in self.date_fields_by_table:
487 # We've been told about a specific date column for this table.
488 column_id = ColumnId(
489 db=table_id.db,
490 schema=table_id.schema,
491 table=table_id.table,
492 column=self.date_fields_by_table[table_id.table],
493 )
494 # Now, does it actually exist?
495 if self.column_present(column_id):
496 # Yes.
497 return column_id
498 # No.
499 for datecolname in self.default_date_fields:
500 column_id = ColumnId(
501 db=table_id.db,
502 schema=table_id.schema,
503 table=table_id.table,
504 column=datecolname,
505 )
506 if self.column_present(column_id):
507 return column_id
508 return None
510 def column_present(self, column_id: ColumnId) -> bool:
511 """
512 Is the named column present in this database?
514 Args:
515 column_id: a :class:`crate_anon.common.sql.ColumnId`
516 """
517 for ci in self.colinfolist:
518 if ci.column_id == column_id:
519 return True
520 return False
522 # -------------------------------------------------------------------------
523 # Fetching schema info from the database: internals
524 # -------------------------------------------------------------------------
526 @classmethod
527 def _schema_query_microsoft(
528 cls, db_name: str, schema_names: List[str]
529 ) -> SqlArgsTupleType:
530 """
531 Returns a query to fetch the database structure from an SQL Server
532 database.
534 The columns returned are as expected by
535 :func:`get_schema_infodictlist`.
537 Args:
538 db_name: a database name
539 schema_names: a list of schema names within the database
541 Returns:
542 tuple: ``sql, args``
544 Notes:
546 - SQL Server ``INFORMATION_SCHEMA.COLUMNS``: see
547 https://msdn.microsoft.com/en-us/library/ms188348.aspx
548 - Re fulltext indexes:
550 - https://stackoverflow.com/questions/16280918/how-to-find-full-text-indexing-on-database-in-sql-server-2008
551 - ``sys.fulltext_indexes``: https://msdn.microsoft.com/en-us/library/ms186903.aspx
552 - ``sys.fulltext_catalogs``: https://msdn.microsoft.com/en-us/library/ms188779.aspx
553 - ``sys.fulltext_index_columns``: https://msdn.microsoft.com/en-us/library/ms188335.aspx
555 """ # noqa: E501
556 if not schema_names:
557 raise ValueError(
558 "No schema_names specified (for SQL Server " "database)"
559 )
560 schema_placeholder = ",".join(["?"] * len(schema_names))
561 sql = translate_sql_qmark_to_percent(
562 f"""
563SELECT
564 ? AS table_catalog,
565 d.table_schema,
566 d.table_name,
567 d.column_name,
568 d.is_nullable,
569 d.column_type,
570 d.column_comment,
571 CASE WHEN COUNT(d.index_id) > 0 THEN 1 ELSE 0 END AS indexed,
572 CASE
573 WHEN COUNT(d.fulltext_index_object_id) > 0 THEN 1
574 ELSE 0
575 END AS indexed_fulltext
576FROM (
577 SELECT
578 s.name AS table_schema,
579 ta.name AS table_name,
580 c.name AS column_name,
581 c.is_nullable,
582 UPPER(ty.name) + '(' + CONVERT(VARCHAR(100), c.max_length) + ')'
583 AS column_type,
584 CONVERT(VARCHAR(1000), x.value) AS column_comment,
585 -- x.value is of type SQL_VARIANT
586 i.index_id,
587 fi.object_id AS fulltext_index_object_id
588 FROM [{db_name}].sys.tables ta
589 INNER JOIN [{db_name}].sys.schemas s ON ta.schema_id = s.schema_id
590 INNER JOIN [{db_name}].sys.columns c ON c.object_id = ta.object_id
591 INNER JOIN [{db_name}].sys.types ty ON ty.system_type_id = c.system_type_id
592 LEFT JOIN [{db_name}].sys.extended_properties x ON (
593 x.major_id = c.object_id
594 AND x.minor_id = c.column_id
595 )
596 LEFT JOIN [{db_name}].sys.index_columns i ON (
597 i.object_id = c.object_id
598 AND i.column_id = c.column_id
599 )
600 LEFT JOIN [{db_name}].sys.fulltext_index_columns fi ON (
601 fi.object_id = c.object_id
602 AND fi.column_id = c.column_id
603 )
604 WHERE s.name IN ({schema_placeholder})
605 AND ty.user_type_id = ty.system_type_id
606 -- restricts to system data types; eliminates 'sysname' type
607) AS d
608GROUP BY
609 table_schema,
610 table_name,
611 column_name,
612 is_nullable,
613 column_type,
614 column_comment
615ORDER BY
616 table_schema,
617 table_name,
618 column_name
619 """
620 )
621 args = [db_name] + schema_names
622 return sql, args
624 @classmethod
625 def _schema_query_mysql(cls, db_and_schema_name: str) -> SqlArgsTupleType:
626 """
627 Returns a query to fetch the database structure from a MySQL database.
629 The columns returned are as expected by
630 :func:`get_schema_infodictlist`.
632 Args:
633 db_and_schema_name: the database (and schema) name
635 Returns:
636 tuple: ``sql, args``
638 Notes:
640 - MySQL's ``INFORMATION_SCHEMA.COLUMNS``: see
641 https://dev.mysql.com/doc/refman/5.7/en/tables-table.html
643 """
644 # ---------------------------------------------------------------------
645 # Method A. Stupidly slow, e.g. 47s for the query.
646 # ---------------------------------------------------------------------
647 # It's the EXISTS stuff that's slow.
648 #
649 # sql = translate_sql_qmark_to_percent("""
650 # SELECT
651 # c.table_schema,
652 # c.table_name,
653 # c.column_name,
654 # c.is_nullable,
655 # c.column_type, /* MySQL: e.g. varchar(32) */
656 # c.column_comment, /* MySQL */
657 # EXISTS (
658 # SELECT *
659 # FROM information_schema.statistics s
660 # WHERE s.table_schema = c.table_schema
661 # AND s.table_name = c.table_name
662 # AND s.column_name = c.column_name
663 # ) AS indexed,
664 # EXISTS (
665 # SELECT *
666 # FROM information_schema.statistics s
667 # WHERE s.table_schema = c.table_schema
668 # AND s.table_name = c.table_name
669 # AND s.column_name = c.column_name
670 # AND s.index_type LIKE 'FULLTEXT%'
671 # ) AS indexed_fulltext
672 # FROM
673 # information_schema.columns c
674 # WHERE
675 # c.table_schema IN ({schema_placeholder})
676 # ORDER BY
677 # c.table_schema,
678 # c.table_name,
679 # c.column_name
680 # """.format(
681 # schema_placeholder=",".join(["?"] * len(schemas)),
682 # ))
683 # args = schemas
685 # ---------------------------------------------------------------------
686 # Method B. Much faster, e.g. 0.35s for the same thing.
687 # ---------------------------------------------------------------------
688 # http://www.codeproject.com/Articles/33052/Visual-Representation-of-SQL-Joins # noqa: E501
689 # (Note that EXISTS() above returns 0 or 1.)
690 # The LEFT JOIN below will produce NULL values for the index
691 # columns for non-indexed fields.
692 # However, you can have more than one index on a column, in which
693 # case the column appears in two rows.
694 sql = translate_sql_qmark_to_percent(
695 """
696SELECT
697 '' AS table_catalog,
698 d.table_schema,
699 d.table_name,
700 d.column_name,
701 d.is_nullable,
702 d.column_type,
703 d.column_comment,
704 d.indexed,
705 MAX(d.indexed_fulltext) AS indexed_fulltext
706FROM (
707 SELECT
708 -- c.table_catalog, -- will always be 'def'
709 c.table_schema,
710 c.table_name,
711 c.column_name,
712 c.is_nullable,
713 c.column_type, /* MySQL: e.g. varchar(32) */
714 c.column_comment, /* MySQL */
715 /* s.index_name, */
716 /* s.index_type, */
717 IF(s.index_type IS NOT NULL, 1, 0) AS indexed,
718 IF(s.index_type LIKE 'FULLTEXT%', 1, 0) AS indexed_fulltext
719 FROM
720 information_schema.columns c
721 LEFT JOIN information_schema.statistics s
722 ON (
723 c.table_schema = s.table_schema
724 AND c.table_name = s.table_name
725 AND c.column_name = s.column_name
726 )
727 WHERE
728 c.table_schema = ?
729) AS d /* "Every derived table must have its own alias" */
730GROUP BY
731 table_catalog,
732 table_schema,
733 table_name,
734 column_name,
735 is_nullable,
736 column_type,
737 column_comment,
738 indexed
739ORDER BY
740 table_catalog,
741 table_schema,
742 table_name,
743 column_name
744 """
745 )
746 args = [db_and_schema_name]
747 return sql, args
749 @classmethod
750 def _schema_query_postgres(
751 cls, schema_names: List[str]
752 ) -> SqlArgsTupleType:
753 """
754 Returns a query to fetch the database structure from an SQL Server
755 database.
757 The columns returned are as expected by
758 :func:`get_schema_infodictlist`.
760 Args:
761 schema_names: a list of schema names within the database
763 Returns:
764 tuple: ``sql, args``
766 Notes:
768 - A PostgreSQL connection is always to a single database; see
769 https://stackoverflow.com/questions/10335561/use-database-name-command-in-postgresql
770 - https://dba.stackexchange.com/questions/75015
771 - https://stackoverflow.com/questions/14713774
772 - Note that creating a GIN index looks like:
774 .. code-block:: sql
776 ALTER TABLE t ADD COLUMN tsv_mytext TSVECTOR;
777 UPDATE t SET tsv_mytext = to_tsvector(mytext);
778 CREATE INDEX idx_t_mytext_gin ON t USING GIN(tsv_mytext);
780 - PostgreSQL ``INFORMATION_SCHEMA.COLUMNS``: see
781 https://www.postgresql.org/docs/9.1/static/infoschema-columns.html
783 """
784 if not schema_names:
785 raise ValueError(
786 "No schema_names specified (for PostgreSQL " "database)"
787 )
788 schema_placeholder = ",".join(["?"] * len(schema_names))
789 sql = translate_sql_qmark_to_percent(
790 f"""
791SELECT
792 '' AS table_catalog,
793 d.table_schema,
794 d.table_name,
795 d.column_name,
796 d.is_nullable,
797 d.column_type,
798 d.column_comment,
799 CASE WHEN COUNT(d.indrelid) > 0 THEN 1 ELSE 0 END AS indexed,
800 MAX(d.indexed_fulltext) AS indexed_fulltext
801FROM (
802 SELECT
803 -- c.table_catalog, -- will always be the connection's database name
804 c.table_schema,
805 c.table_name,
806 c.column_name,
807 a.attnum as column_seq_num,
808 c.is_nullable,
809 pg_catalog.format_type(a.atttypid, a.atttypmod) as column_type,
810 pgd.description AS column_comment,
811 i.indrelid,
812 CASE
813 WHEN pg_get_indexdef(indexrelid) ~ 'USING (gin |gist )' THEN 1
814 ELSE 0
815 END AS indexed_fulltext
816 FROM pg_catalog.pg_statio_all_tables AS t
817 INNER JOIN information_schema.columns c ON (
818 c.table_schema = t.schemaname
819 AND c.table_name = t.relname
820 )
821 INNER JOIN pg_catalog.pg_attribute a ON ( -- one row per column
822 a.attrelid = t.relid
823 AND a.attname = c.column_name
824 )
825 LEFT JOIN pg_catalog.pg_index AS i ON (
826 i.indrelid = t.relid -- match on table
827 AND i.indkey[0] = a.attnum -- match on column sequence number
828 AND i.indnatts = 1 -- one column in the index
829 )
830 LEFT JOIN pg_catalog.pg_description pgd ON (
831 pgd.objoid = t.relid
832 AND pgd.objsubid = c.ordinal_position
833 )
834 WHERE t.schemaname IN ({schema_placeholder})
835) AS d
836GROUP BY
837 table_catalog,
838 table_schema,
839 table_name,
840 column_name,
841 is_nullable,
842 column_type,
843 column_comment
844ORDER BY
845 table_catalog,
846 table_schema,
847 table_name,
848 column_name
849 """
850 )
851 args = schema_names
852 return sql, args
854 @classmethod
855 def _schema_query_sqlite_as_infodictlist(
856 cls, connection: BaseDatabaseWrapper, debug: bool = False
857 ) -> List[Dict[str, Any]]:
858 """
859 Queries an SQLite databases and returns columns as expected by
860 :func:`get_schema_infodictlist`.
861 """
862 # 1. Catalogue tables.
863 # pragma table_info(sqlite_master);
864 empty_args = []
865 sql_get_tables = """
866 SELECT tbl_name AS tablename
867 FROM sqlite_master
868 WHERE type='table'
869 """
870 table_info_rows = cls._exec_sql_query(
871 connection, (sql_get_tables, empty_args), debug=debug
872 )
873 table_names = [row["tablename"] for row in table_info_rows]
875 # 2. Catalogue each tables
876 results = [] # type: List[Dict[str, Any]]
877 for table_name in table_names:
878 # A "PRAGMA table_info()" call doesn't work with arguments.
879 sql_inspect_table = f"PRAGMA table_info({table_name})"
880 column_info_rows = cls._exec_sql_query(
881 connection, (sql_inspect_table, empty_args), debug=debug
882 )
883 for ci in column_info_rows:
884 results.append(
885 dict(
886 table_catalog="",
887 table_schema="",
888 table_name=table_name,
889 column_name=ci["name"],
890 is_nullable=1 - ci["notnull"],
891 column_type=ci["type"],
892 column_comment="",
893 indexed=0,
894 indexed_fulltext=0,
895 )
896 )
897 # Ignored:
898 # - "cid" (column ID)
899 # - "dflt_value"
900 # - "pk"
901 return results
903 @classmethod
904 def _exec_sql_query(
905 cls,
906 connection: BaseDatabaseWrapper,
907 sql_args: SqlArgsTupleType,
908 debug: bool = False,
909 ) -> List[Dict[str, Any]]:
910 """
911 Used by get_schema_infodictlist() as a common function to translate an
912 sql/args pair into the desired results. But it does that because the
913 incoming SQL has the right column names; the function is more generic
914 and just runs a query.
916 Args:
917 connection:
918 a :class:`django.db.backends.base.base.BaseDatabaseWrapper`,
919 i.e. a Django database connection
920 sql_args:
921 tuple of SQL and arguments
922 debug:
923 be verbose to the log?
925 Returns:
926 A list of dictionaries, each mapping column names to values.
927 The dictionaries are suitable for use as ``**kwargs`` to
928 :class:`ColumnInfo`.
929 """
930 # We execute this one directly, rather than using the Query class,
931 # since this is a system rather than a per-user query.
932 sql, args = sql_args
933 cursor = connection.cursor()
934 if debug:
935 log.debug(f"- sql = {sql}\n- args = {args!r}")
936 cursor.execute(sql, args)
937 # Re passing multiple values to SQL via args:
938 # - Don't circumvent the parameter protection against SQL injection.
939 # - Too much hassle to use Django's ORM model here, though that would
940 # also be possible.
941 # - https://stackoverflow.com/questions/907806
942 # - Similarly via SQLAlchemy reflection/inspection.
943 results = dictfetchall(cursor) # list of OrderedDicts
944 if debug:
945 log.debug(f"results = {results!r}")
946 log.debug("... done")
947 return results
949 # -------------------------------------------------------------------------
950 # Fetching schema info from the database: main (still internal) interface
951 # -------------------------------------------------------------------------
953 def get_schema_infodictlist(
954 self, connection: BaseDatabaseWrapper, debug: bool = False
955 ) -> List[Dict[str, Any]]:
956 """
957 Fetch structure information for a specific database, by asking the
958 database.
960 Args:
961 connection:
962 a :class:`django.db.backends.base.base.BaseDatabaseWrapper`,
963 i.e. a Django database connection
964 debug:
965 be verbose to the log?
967 Returns:
968 A list of dictionaries, each mapping column names to values.
969 The dictionaries are suitable for use as ``**kwargs`` to
970 :class:`ColumnInfo`.
972 """
973 db_name = self.database
974 schema_name = self.schema_name
975 log.info(
976 f"Fetching/caching database structure (for database "
977 f"{db_name!r}, schema {schema_name!r})..."
978 )
979 # The db/schema names are guaranteed to be strings by __init__().
980 if connection.vendor == ConnectionVendors.MICROSOFT:
981 if not db_name:
982 raise ValueError(f"{db_name=!r}; required for MSSQL")
983 if not schema_name:
984 raise ValueError(f"{schema_name=!r}; required for MSSQL")
985 results = self._exec_sql_query(
986 connection,
987 sql_args=self._schema_query_microsoft(db_name, [schema_name]),
988 debug=debug,
989 )
990 elif connection.vendor == ConnectionVendors.POSTGRESQL:
991 if db_name:
992 raise ValueError(f"{db_name=!r}; must be '' for PostgreSQL")
993 if not schema_name:
994 raise ValueError(f"{schema_name=!r}; required for PostgreSQL")
995 results = self._exec_sql_query(
996 connection,
997 sql_args=self._schema_query_postgres([schema_name]),
998 debug=debug,
999 )
1000 elif connection.vendor == ConnectionVendors.MYSQL:
1001 if db_name:
1002 raise ValueError(f"{db_name=!r}; must be '' for MySQL")
1003 if not schema_name:
1004 raise ValueError(f"{schema_name=!r}; required for MySQL")
1005 results = self._exec_sql_query(
1006 connection,
1007 sql_args=self._schema_query_mysql(
1008 db_and_schema_name=schema_name
1009 ),
1010 debug=debug,
1011 )
1012 elif connection.vendor == ConnectionVendors.SQLITE:
1013 # db_name: don't care?
1014 # schema_name: don't care?
1015 # This one can't be done as a single query; the following function
1016 # builds up the information by querying a list of tables, then each
1017 # table.
1018 results = self._schema_query_sqlite_as_infodictlist(
1019 connection, debug=debug
1020 )
1021 else:
1022 raise ValueError(
1023 f"Don't know how to get metadata for "
1024 f"{connection.vendor=!r}"
1025 )
1027 if not results:
1028 log.warning(
1029 f"SingleResearchDatabase.get_schema_infodictlist(): no "
1030 f"results for database/schema {self.schema_identifier!r} "
1031 f"database - misconfigured?"
1032 )
1033 return results
1036@register_for_json(method=METHOD_NO_ARGS)
1037class ResearchDatabaseInfo:
1038 """
1039 Fetches schema information from the research databases. There can be
1040 several, and this class represents the whole collection.
1042 Notes:
1044 - Class primarily exists to be able to use ``@cached_property``.
1046 - ... replaced by ``@lru_cache``
1047 - ... replaced by ``@django_cache_function``
1048 """
1050 # We fetch the dialect at first request; this enables us to import the
1051 # class without Django configured.
1053 def __init__(self, running_without_config: bool = False) -> None:
1054 self.dbinfolist = [] # type: List[SingleResearchDatabase]
1056 if running_without_config:
1057 self.dialect = ""
1058 self.grammar = None # type: Optional[SqlGrammar]
1059 self.dbinfo_for_contact_lookup = (
1060 None
1061 ) # type: Optional[SingleResearchDatabase]
1063 else:
1064 self.dialect = settings.RESEARCH_DB_DIALECT # type: str
1065 assert (
1066 self.dialect in SUPPORTED_DIALECTS
1067 ), f"Unsupported dialect: {self.dialect!r}"
1069 self.grammar = make_grammar(self.dialect) # not expensive
1071 connection = self._connection()
1073 for index in range(len(settings.RESEARCH_DB_INFO)):
1074 self.dbinfolist.append(
1075 SingleResearchDatabase(
1076 index=index,
1077 grammar=self.grammar,
1078 rdb_info=self,
1079 connection=connection,
1080 )
1081 )
1082 assert (
1083 len(self.dbinfolist) > 0
1084 ), "No research databases configured in RESEARCH_DB_INFO"
1085 names = [x.name for x in self.dbinfolist]
1086 assert len(names) == len(
1087 set(names)
1088 ), f"Duplicate database names in {names!r}"
1090 try:
1091 self.dbinfo_for_contact_lookup = self.get_dbinfo_by_name(
1092 settings.RESEARCH_DB_FOR_CONTACT_LOOKUP
1093 )
1094 except ValueError:
1095 raise ValueError(
1096 f"In your settings, RESEARCH_DB_FOR_CONTACT_LOOKUP "
1097 f"specifies {settings.RESEARCH_DB_FOR_CONTACT_LOOKUP!r} "
1098 f"but there is no database with that name in "
1099 f"RESEARCH_DB_INFO"
1100 )
1101 assert self.dbinfo_for_contact_lookup.secret_lookup_db, (
1102 f"Research database {self.dbinfo_for_contact_lookup.name!r} "
1103 f"is set as your RESEARCH_DB_FOR_CONTACT_LOOKUP but has no "
1104 f"{ResearchDbInfoKeys.SECRET_LOOKUP_DB!r} attribute"
1105 )
1107 self.nlp_sourcedb_map = getattr(
1108 settings, SettingsKeys.NLP_SOURCEDB_MAP, {}
1109 ) # type: Dict[str, str]
1110 try:
1111 assert isinstance(self.nlp_sourcedb_map, dict)
1112 for k, v in self.nlp_sourcedb_map.items():
1113 assert isinstance(k, str)
1114 assert isinstance(v, str)
1115 except AssertionError:
1116 raise ValueError(
1117 "settings.NLP_SOURCEDB_MAP is not a Dict[str, str]"
1118 )
1120 # -------------------------------------------------------------------------
1121 # Classmethods, staticmethods
1122 # -------------------------------------------------------------------------
1124 @classmethod
1125 def _connection(cls) -> BaseDatabaseWrapper:
1126 """
1127 Returns the Django connection to the research database(s), from
1128 ``connections[RESEARCH_DB_CONNECTION_NAME]``, meaning
1129 ``connections['research']``.
1131 **This must be a read-only connection, enforced by the database.
1132 Researchers will be allowed to execute unrestricted SQL via this
1133 connection.**
1135 """
1136 return connections[RESEARCH_DB_CONNECTION_NAME]
1138 @classmethod
1139 def uses_database_level(cls) -> bool:
1140 """
1141 Does the database simultaneously offer a "database" level above its
1142 "schema" level?
1143 """
1144 return cls._offers_db_above_schema(cls._connection())
1146 @classmethod
1147 def format_db_schema(cls, db: str, schema: str) -> str:
1148 if cls.uses_database_level():
1149 return f"{db}.{schema}"
1150 else:
1151 return schema
1153 @staticmethod
1154 def _offers_db_above_schema(connection: BaseDatabaseWrapper) -> bool:
1155 """
1156 Does the database simultaneously offer a "database" level above its
1157 "schema" level?
1159 - True for Microsoft SQL Server
1160 - False for MySQL (in which "database" and "schema" are synonymous)
1161 - False for PostgreSQL (in which a connection can only talk to one
1162 database at once, though there can be many schemas within each
1163 database).
1165 Args:
1166 connection:
1167 a :class:`django.db.backends.base.base.BaseDatabaseWrapper`,
1168 i.e. a Django database connection
1169 """
1170 return connection.vendor in [ConnectionVendors.MICROSOFT]
1172 # -------------------------------------------------------------------------
1173 # Whole-database/schema information
1174 # -------------------------------------------------------------------------
1176 @property
1177 def single_research_db(self) -> bool:
1178 """
1179 Do we have only a single research database?
1180 """
1181 return len(self.dbinfolist) == 1
1183 @property
1184 def single_research_db_with_secret_map(self) -> bool:
1185 """
1186 Do we have only one database that has an associated secret lookup
1187 database to patient IDs?
1188 """
1189 return len(self.dbs_with_secret_map) == 1
1191 @property
1192 def dbs_with_secret_map(self) -> List[SingleResearchDatabase]:
1193 """
1194 Which of our databases has an associated secret lookup database to
1195 patient IDs?
1197 Returns:
1198 a list of :class:`SingleResearchDatabase` objects
1199 """
1200 return [db for db in self.dbinfolist if db.secret_lookup_db]
1202 def _get_dbinfo_by_index(self, index: int) -> SingleResearchDatabase:
1203 """
1204 Returns a :class:`SingleResearchDatabase` by its zero-based index, or
1205 raise an exception if the index is bad.
1207 Args:
1208 index: a zero-based index
1210 Returns:
1211 a :class:`SingleResearchDatabase`
1213 Raises:
1214 :exc:`IndexError` if the index was bad
1215 """
1216 return self.dbinfolist[index]
1218 def get_dbinfo_by_name(self, name: str) -> SingleResearchDatabase:
1219 """
1220 Returns the research database whose name (from
1221 ``settings.RESEARCH_DB_INFO``) is ``name``.
1223 Args:
1224 name: the name of a database, as per ``settings.RESEARCH_DB_INFO``
1226 Returns:
1227 a :class:`SingleResearchDatabase`
1229 Raises:
1230 :exc:`ValueError` if none is found
1232 """
1233 try:
1234 return next(x for x in self.dbinfolist if x.name == name)
1235 except StopIteration:
1236 raise ValueError(f"No research database named {name!r}")
1238 def get_dbinfo_by_schema(
1239 self, schema_id: SchemaId
1240 ) -> SingleResearchDatabase:
1241 """
1242 Returns the first database representing the specified schema.
1244 Args:
1245 schema_id: a :class:`crate_anon.common.sql.SchemaId`
1247 Returns:
1248 a :class:`SingleResearchDatabase`
1250 Raises:
1251 :exc:`ValueError` if none is found
1252 """
1253 if schema_id.is_blank():
1254 raise ValueError("get_db_info(): error: blank schema provided")
1255 try:
1256 return next(x for x in self.dbinfolist if x.schema_id == schema_id)
1257 except StopIteration:
1258 raise ValueError(
1259 f"No such database/schema: "
1260 f"{schema_id.identifier(self.grammar)!r}"
1261 )
1263 @property
1264 def first_dbinfo(self) -> SingleResearchDatabase:
1265 """
1266 Returns the first :class:`SingleResearchDatabase` that we know about.
1267 """
1268 return self._get_dbinfo_by_index(0)
1270 @property
1271 def first_dbinfo_with_secret_map(self) -> Optional[SingleResearchDatabase]:
1272 """
1273 Returns the first :class:`SingleResearchDatabase` that has a secret map
1274 (an associated secret lookup database to patient IDs), or ``None`` if
1275 there isn't one.
1276 """
1277 dbs = self.dbs_with_secret_map
1278 if len(dbs) == 0:
1279 return None
1280 return dbs[0]
1282 @property
1283 def researchdb_schemas(self) -> List[SchemaId]:
1284 """
1285 Returns all :class:`crate_anon.common.sql.SchemaId` values for our
1286 databases (one per database).
1287 """
1288 return [x.schema_id for x in self.dbinfolist]
1290 def get_default_database_name(self) -> str:
1291 """
1292 Returns the default "database name" for our dialect.
1294 - For Microsoft SQL Server, this is
1295 ``settings.DATABASES['research']['NAME']``
1296 - For MySQL, this is blank
1297 - For PostgreSQL, this is blank
1298 """
1299 dialect = self.dialect
1300 if dialect == SqlaDialectName.MSSQL:
1301 return settings.DATABASES[RESEARCH_DB_CONNECTION_NAME]["NAME"]
1302 elif dialect == SqlaDialectName.POSTGRES:
1303 return ""
1304 elif dialect == SqlaDialectName.MYSQL:
1305 return ""
1306 else:
1307 raise ValueError("Bad settings.RESEARCH_DB_DIALECT")
1309 def get_default_schema_name(self) -> str:
1310 """
1311 Returns the default "schema name" for our dialect.
1313 - For Microsoft SQL Server, this is ``'dbo'``
1314 - For PostgreSQL, this is ``'public'``
1315 - For MySQL, this is ``settings.DATABASES['research']['NAME']``
1317 """
1318 dialect = self.dialect
1319 if dialect == SqlaDialectName.MSSQL:
1320 return MSSQL_DEFAULT_SCHEMA
1321 elif dialect == SqlaDialectName.POSTGRES:
1322 return POSTGRES_DEFAULT_SCHEMA
1323 elif dialect == SqlaDialectName.MYSQL:
1324 return settings.DATABASES[RESEARCH_DB_CONNECTION_NAME]["NAME"]
1325 else:
1326 raise ValueError("Bad settings.RESEARCH_DB_DIALECT")
1328 # -------------------------------------------------------------------------
1329 # Database-wide fields and descriptions
1330 # -------------------------------------------------------------------------
1332 def get_rid_column(self, table: TableId) -> ColumnId:
1333 """
1334 Returns the RID column in the specified table (which may or may not
1335 exist).
1337 Args:
1338 table: a :class:`crate_anon.common.sql.TableId`
1340 Returns:
1341 a :class:`crate_anon.common.sql.ColumnId`, which may be blank
1343 """
1344 dbinfo = self.get_dbinfo_by_schema(table.schema_id)
1345 return table.column_id(dbinfo.rid_field)
1347 def get_trid_column(self, table: TableId) -> ColumnId:
1348 """
1349 Returns the TRID column in the specified table (which may or may not
1350 exist).
1352 Args:
1353 table: a :class:`crate_anon.common.sql.TableId`
1355 Returns:
1356 a :class:`crate_anon.common.sql.ColumnId`, which may be blank
1358 """
1359 dbinfo = self.get_dbinfo_by_schema(table.schema_id)
1360 return table.column_id(dbinfo.trid_field)
1362 def get_mrid_column_from_schema(self, schema: SchemaId) -> ColumnId:
1363 """
1364 Returns the MRID column in the MRID master table for a given
1365 schema/database.
1367 Args:
1368 schema: a :class:`crate_anon.common.sql.SchemaId`
1370 Returns:
1371 a :class:`crate_anon.common.sql.ColumnId`, which may be blank
1373 """
1374 dbinfo = self.get_dbinfo_by_schema(schema)
1375 return schema.column_id(
1376 table=dbinfo.mrid_table, column=dbinfo.mrid_field
1377 )
1379 def get_mrid_column_from_table(self, table: TableId) -> ColumnId:
1380 """
1381 Returns the MRID column in the specified table (which may or may not
1382 exist).
1384 Args:
1385 table: a :class:`crate_anon.common.sql.TableId`; this should be the
1386 MRID master table
1388 Returns:
1389 a :class:`crate_anon.common.sql.ColumnId`, which may be blank
1391 """
1392 return self.get_mrid_column_from_schema(table.schema_id)
1394 def get_linked_mrid_column(self, table: TableId) -> Optional[ColumnId]:
1395 """
1396 Returns either (a) the MRID column in the schema containing the table
1397 specified, or (b) one that can be linked to it automatically.
1399 Args:
1400 table: a :class:`crate_anon.common.sql.TableId`
1402 Returns:
1403 a :class:`crate_anon.common.sql.ColumnId`, or ``None``
1405 """
1406 mrid_in_same_db = self.get_mrid_column_from_table(table)
1407 if mrid_in_same_db:
1408 return mrid_in_same_db
1409 # OK. So our table isn't from a database with an MRID table, but it
1410 # might be linked to one.
1411 table_db = self.get_dbinfo_by_schema(table.schema_id)
1412 first_db = self.first_dbinfo
1413 if not first_db.talks_to_world:
1414 return None
1415 if table_db.talks_to_world or table_db.can_communicate_directly(
1416 first_db
1417 ):
1418 return self.get_mrid_column_from_schema(first_db.schema_id)
1420 def get_default_date_column(self, table: TableId) -> Optional[ColumnId]:
1421 """
1422 Returns the default date column in the specified table (which may or
1423 may not exist).
1425 Args:
1426 table: a :class:`crate_anon.common.sql.TableId`
1428 Returns:
1429 a :class:`crate_anon.common.sql.ColumnId`, which may be blank
1431 """
1432 dbinfo = self.get_dbinfo_by_schema(table.schema_id)
1433 return dbinfo.get_default_date_field(table)
1435 # -------------------------------------------------------------------------
1436 # Table/column information
1437 # -------------------------------------------------------------------------
1439 @django_cache_function(timeout=None)
1440 def get_schema_infodictlist(self) -> List[Dict[str, Any]]:
1441 """
1442 Get details of all columns in all research databases.
1444 Returns:
1445 list: across all databases, of all dictionaries provided by their
1446 :meth:`SingleResearchDatabase.get_schema_infodictlist` function
1447 """
1448 results = [] # type: List[Dict[str, Any]]
1449 for dbinfo in self.dbinfolist:
1450 results.extend(dbinfo.schema_infodictlist)
1451 return results
1453 @django_cache_function(timeout=None)
1454 def get_colinfolist(self) -> List[ColumnInfo]:
1455 """
1456 Get details of all columns in all research databases.
1458 Returns:
1459 list: across all databases, of :class:`ColumnInfo` objects
1460 """
1461 colinfolist = [] # type: List[ColumnInfo]
1462 for dbi in self.dbinfolist:
1463 colinfolist.extend(dbi.colinfolist)
1464 return colinfolist
1466 @django_cache_function(timeout=None)
1467 def get_colinfolist_by_tables(self) -> Dict[TableId, List[ColumnInfo]]:
1468 """
1469 Get details of all columns in all research databases, by table.
1471 Returns:
1472 OrderedDict: across all databases, ``{table_id: columns}`` where
1473 ``table_id`` is a :class:`crate_anon.common.sql.TableId` and
1474 ``columns`` is a list of :class:`ColumnInfo` objects
1475 """
1476 colinfolist = self.get_colinfolist()
1477 table_to_colinfolist = {} # type: Dict[TableId, List[ColumnInfo]]
1478 for c in colinfolist:
1479 table_id = c.table_id
1480 if table_id not in table_to_colinfolist:
1481 table_to_colinfolist[table_id] = [] # type: List[ColumnInfo]
1482 table_to_colinfolist[table_id].append(c)
1483 # noinspection PyTypeChecker
1484 return OrderedDict(sorted(table_to_colinfolist.items()))
1486 @django_cache_function(timeout=None)
1487 def get_colinfolist_by_schema(self) -> Dict[SchemaId, List[ColumnInfo]]:
1488 """
1489 Get details of all columns in all research databases, by schema.
1491 Returns:
1492 OrderedDict: across all databases, ``{schema_id: columns}`` where
1493 ``schema_id`` is a :class:`crate_anon.common.sql.SchemaId` and
1494 ``columns`` is a list of :class:`ColumnInfo` objects
1495 """
1496 colinfolist = self.get_colinfolist()
1497 schema_to_colinfolist = {} # type: Dict[SchemaId, List[ColumnInfo]]
1498 for c in colinfolist:
1499 table_id = c.table_id
1500 schema = table_id.schema_id
1501 if schema not in schema_to_colinfolist:
1502 schema_to_colinfolist[schema] = [] # type: List[ColumnInfo]
1503 schema_to_colinfolist[schema].append(c)
1504 # noinspection PyTypeChecker
1505 return OrderedDict(sorted(schema_to_colinfolist.items()))
1507 def tables_containing_field(self, fieldname: str) -> List[TableId]:
1508 """
1509 Returns all tables containing the column (field) with the specified
1510 name.
1512 Args:
1513 fieldname: field (column) name
1515 Returns:
1516 a list of :class:`crate_anon.common.sql.TableId` objects
1518 - We won't use a ``SELECT`` on ``INFORMATION_SCHEMA`` here, since we
1519 already have the information.
1521 """
1522 columns = self.get_colinfolist()
1523 results = [] # type: List[TableId]
1524 for column in columns:
1525 if column.column_name == fieldname:
1526 table_id = column.table_id
1527 if table_id not in results:
1528 results.append(table_id)
1529 return results
1531 def text_columns(
1532 self, table_id: TableId, min_length: int = 1
1533 ) -> List[ColumnInfo]:
1534 """
1535 Returns all text columns from the specified table.
1537 Args:
1538 table_id: a :class:`crate_anon.common.sql.TableId`
1539 min_length: the minimum SQL text length to include the column
1541 Returns:
1542 a list of :class:`crate_anon.common.sql.ColumnInfo` objects
1544 """
1545 results = [] # type: List[ColumnInfo]
1546 for column in self.get_colinfolist():
1547 if column.table_id != table_id:
1548 continue
1549 if not is_sql_column_type_textual(column.column_type, min_length):
1550 # log.debug("Skipping {!r}", column)
1551 continue
1552 results.append(column)
1553 # log.debug("text_columns for {} with min_length={}: [{}]", table_id,
1554 # min_length, ", ".join(str(x) for x in results))
1555 return results
1557 @django_cache_function(timeout=None)
1558 def all_columns(self, table_id: TableId) -> List[ColumnInfo]:
1559 """
1560 Returns all columns from the specified table.
1562 Args:
1563 table_id: a :class:`crate_anon.common.sql.TableId`
1565 Returns:
1566 a list of :class:`crate_anon.common.sql.ColumnInfo` objects
1568 """
1569 results = [] # type: List[ColumnInfo]
1570 for column in self.get_colinfolist():
1571 if column.table_id != table_id:
1572 continue
1573 results.append(column)
1574 return results
1576 def get_tsv(self) -> str:
1577 """
1578 Returns a tab-separated value (TSV) file detailing of all columns in
1579 all research databases.
1581 Returns:
1582 str: TSV
1584 """
1585 return dictlist_to_tsv(self.get_schema_infodictlist())
1587 def get_excel(self) -> bytes:
1588 """
1589 Returns an XLSX (Excel) file detailing of all columns in all research
1590 databases.
1592 Returns:
1593 bytes: binary XLSX file
1595 """
1596 wb = Workbook()
1597 wb.remove_sheet(wb.active) # remove the autocreated blank sheet
1598 schema_colinfolist_dict = self.get_colinfolist_by_schema()
1599 for schema, colinfolist in schema_colinfolist_dict.items():
1600 ws = wb.create_sheet(title=schema.identifier(self.grammar))
1601 ws.append(
1602 [
1603 "table_catalog",
1604 "table_schema",
1605 "table_name",
1606 "column_name",
1607 "is_nullable",
1608 "column_type",
1609 "column_comment",
1610 "indexed",
1611 "indexed_fulltext",
1612 "basetype",
1613 "full_identifier",
1614 ]
1615 )
1616 for c in colinfolist: # type: ColumnInfo
1617 ws.append(
1618 [
1619 c.table_catalog,
1620 c.table_schema,
1621 c.table_name,
1622 c.column_name,
1623 c.is_nullable,
1624 c.column_type,
1625 c.column_comment,
1626 c.indexed,
1627 c.indexed_fulltext,
1628 c.basetype,
1629 c.column_id.identifier(self.grammar),
1630 ]
1631 )
1632 return excel_to_bytes(wb)
1634 @django_cache_function(timeout=None)
1635 def get_tables(self) -> List[TableId]:
1636 """
1637 Returns all tables in the research database(s).
1639 Returns:
1640 a list of :class:`crate_anon.common.sql.TableId` objects
1641 """
1642 tables = set() # type: Set[TableId]
1643 for column in self.get_colinfolist():
1644 tables.add(column.table_id)
1645 return sorted(list(tables))
1647 @django_cache_function(timeout=None)
1648 def table_contains_rid(self, table_id: TableId) -> bool:
1649 """
1650 Does the specified table contain a research ID (RID)?
1652 Args:
1653 table_id: a :class:`crate_anon.common.sql.TableId`
1654 """
1655 target_rid_column = self.get_rid_column(table_id)
1656 for column in self.get_colinfolist():
1657 if column.column_id == target_rid_column:
1658 return True
1659 return False
1661 def table_contains(self, table_id: TableId, column_id: ColumnId) -> bool:
1662 """
1663 Does the specified table contain the specified column?
1665 Args:
1666 table_id: a :class:`crate_anon.common.sql.TableId`
1667 column_id: a :class:`crate_anon.common.sql.ColumnId`
1668 """
1669 for c in self.all_columns(table_id):
1670 if c.column_id == column_id:
1671 return True
1672 return False
1674 @django_cache_function(timeout=None)
1675 def get_mrid_linkable_patient_tables(self) -> List[TableId]:
1676 """
1677 Returns all tables in the research database(s) that are linkable on
1678 MRID (i.e. that contain a RID and live in a database with a RID-to-MRID
1679 lookup table).
1681 Returns:
1682 a list of :class:`crate_anon.common.sql.TableId` objects
1683 """
1684 eligible_tables = set() # type: Set[TableId]
1685 for table in self.get_tables():
1686 dbinfo = self.get_dbinfo_by_schema(table.schema_id)
1687 if not dbinfo.has_mrid:
1688 continue
1689 if self.table_contains_rid(table):
1690 eligible_tables.add(table)
1691 return sorted(list(eligible_tables))
1694@django_cache_function(timeout=None)
1695def get_research_db_info() -> ResearchDatabaseInfo:
1696 return ResearchDatabaseInfo(RUNNING_WITHOUT_CONFIG)