Coverage for crateweb/research/models.py: 31%
856 statements
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
1"""
2crate_anon/crateweb/research/models.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26"""
28from collections import OrderedDict
29import datetime
30import io
31import logging
32from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, Union
33import weakref
34import zipfile
35import json
37from cardinal_pythonlib.dbfunc import dictfetchall, get_fieldnames_from_cursor
38from cardinal_pythonlib.django.fields.jsonclassfield import JsonClassField
39from cardinal_pythonlib.excel import excel_to_bytes
40from cardinal_pythonlib.exceptions import add_info_to_exception
41from cardinal_pythonlib.hash import (
42 get_longest_supported_hasher_output_length,
43 hash64,
44)
45from cardinal_pythonlib.json_utils.serialize import (
46 json_encode,
47 METHOD_STRIP_UNDERSCORE,
48 register_for_json,
49)
50from cardinal_pythonlib.reprfunc import simple_repr
51from cardinal_pythonlib.sql.sql_grammar import format_sql
52from cardinal_pythonlib.tsv import make_tsv_row
53from cardinal_pythonlib.django.function_cache import django_cache_function
54from django.db import connections, DatabaseError, models
55from django.db.models import QuerySet
56from django.conf import settings
57from django.http.request import HttpRequest
58from django.db.backends.utils import CursorWrapper
59from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
60from openpyxl.workbook.workbook import Workbook
61from openpyxl.worksheet.worksheet import Worksheet
63from crate_anon.anonymise.constants import PatientInfoConstants
64from crate_anon.common.constants import JSON_SEPARATORS_COMPACT
65from crate_anon.common.sql import (
66 ColumnId,
67 columns_to_table_column_hierarchy,
68 escape_percent_for_python_dbapi,
69 make_grammar,
70 sql_string_literal,
71 SqlArgsTupleType,
72 TableId,
73 translate_sql_qmark_to_percent,
74 WhereCondition,
75)
76from crate_anon.crateweb.config.constants import ResearchDbInfoKeys
77from crate_anon.crateweb.core.constants import (
78 RESEARCH_DB_CONNECTION_NAME,
79 SettingsKeys,
80)
81from crate_anon.crateweb.research.html_functions import (
82 HtmlElementCounter,
83 N_CSS_HIGHLIGHT_CLASSES,
84 prettify_sql_html,
85)
86from crate_anon.crateweb.research.research_db_info import (
87 get_research_db_info,
88 SingleResearchDatabase,
89)
90from crate_anon.crateweb.research.sql_writer import (
91 add_to_select,
92 SelectElement,
93)
95log = logging.getLogger(__name__)
98# =============================================================================
99# Hacking django-pyodbc-azure, to stop it calling cursor.nextset() every time
100# you ask it to do cursor.fetchone()
101# =============================================================================
102# BUG in django-pyodbc-azure==1.10.4.0 (providing
103# sql_server/*), 2017-02-17: this causes
104# ProgrammingError "No results. Previous SQL was not a query."
105# The problem relates to sql_server/pyodbc/base.py
106# CursorWrapper.fetchone() calling self.cursor.nextset(); if
107# you comment this out, it works fine.
108# Related:
109# - https://github.com/pymssql/pymssql/issues/98
111DJANGO_PYODBC_AZURE_ENGINE = "sql_server.pyodbc"
114def replacement_sqlserver_pyodbc_cursorwrapper_fetchone(self) -> List[Any]:
115 """
116 A function to replace ``CursorWrapper.fetchone()`` in
117 ``sql_server/pyodbc/base.py`` from ``django-pyodbc-azure``.
118 This replacement function does not call ``cursor.nextset()``.
119 """
120 # log.debug("Using monkeypatched fetchone(); self: {}; self.cursor: "
121 # "{}".format(repr(self), repr(self.cursor)))
122 row = self.cursor.fetchone()
123 if row is not None:
124 row = self.format_row(row)
125 # BUT DO NOT CALL self.cursor.nextset()
126 return row
129def hack_django_pyodbc_azure_cursorwrapper() -> None:
130 """
131 Monkey-patch part of the ``sql_server.pyodbc`` library from
132 ``django-pyodbc-azure``. It replaces the ``fetchone()`` method with a
133 version that doesn't call ``cursor.nextset()`` automatically.
135 **It looks like this becomes unnecessary in django-pyodbc-azure==2.0.6.1
136 or similar, because the call to ``cursor.nextset()`` is now only performed
137 ``if not self.connection.supports_mars``.**
139 *Notes*
141 - I thought I wanted to modify an *instance*, not a *class*
142 (https://tryolabs.com/blog/2013/07/05/run-time-method-patching-python/).
144 - To modify a class, we do ``SomeClass.method = newmethod``.
146 - But to modify an instance, we use ``instance.method =
147 types.MethodType(newmethod, instance)``.
149 - However, it turned out the instance was actually part of a long chain
150 of cursor wrappers, including the Django debug toolbar. Classes included
151 ``debug_toolbar.panels.sql.tracking.NormalCursorWrapper``;
152 ``django.db.backends.utils.CursorDebugWrapper``.
153 And in any case, modifying the class is a sensible thing.
155 """
156 try:
157 # noinspection PyUnresolvedReferences
158 from sql_server.pyodbc.base import CursorWrapper
160 log.info(
161 "Monkey-patching sql_server.pyodbc.base.CursorWrapper."
162 "fetchone to disable automatic call to cursor.nextset()"
163 )
164 CursorWrapper.fetchone = (
165 replacement_sqlserver_pyodbc_cursorwrapper_fetchone
166 )
167 except ImportError:
168 return
171if getattr(
172 settings,
173 SettingsKeys.DISABLE_DJANGO_PYODBC_AZURE_CURSOR_FETCHONE_NEXTSET,
174 True,
175):
176 # https://stackoverflow.com/questions/5601590/how-to-define-a-default-value-for-a-custom-django-setting # noqa: E501
177 hack_django_pyodbc_azure_cursorwrapper()
180@django_cache_function(timeout=None)
181def database_last_updated(dbname: str) -> Optional[datetime.datetime]:
182 """
183 Returns a datetime object specifying when the database was last run.
185 If there are any tables with a column specifying last updated time
186 (based on the column name provided in the config file) but all dates
187 are null, the function will return the minimum date possible. If there
188 are no such tables, the function will return None.
189 """
190 research_database_info = get_research_db_info()
191 try:
192 dbinfo = research_database_info.get_dbinfo_by_name(dbname)
193 except ValueError:
194 raise ValueError(f"Database {dbname} is not specified in config file")
195 tables_with_timecol = [] # type: List[str]
196 for col in dbinfo.colinfolist:
197 if col.column_name == dbinfo.update_date_field:
198 tables_with_timecol.append(col.table_name)
199 if not tables_with_timecol:
200 return None
201 latest_time = mindate = datetime.datetime.min
202 for table in tables_with_timecol:
203 # Not quite sure about the different dialects ...
204 if dbinfo.rdb_info.dialect == "mysql":
205 sql = (
206 f"SELECT {dbinfo.update_date_field} "
207 f"FROM {dbinfo.schema_name}.{table}"
208 )
209 else:
210 # Dialect must be mssql because ResearchDatabaseInfo checks if
211 # supported dialect
212 sql = (
213 f"SELECT {dbinfo.update_date_field} "
214 f"FROM {dbinfo.database}.{dbinfo.schema_name}.{table}"
215 )
216 with get_executed_researchdb_cursor(sql) as cursor:
217 times = cursor.fetchall()
218 times = [t[0] if t[0] else mindate for t in times]
219 times.append(latest_time)
220 latest_time = max(times)
221 return latest_time
224@django_cache_function(timeout=None)
225def last_updated_all_dbs() -> datetime.datetime:
226 """
227 Returns a datetime object specifying the most recent databse update, or
228 None if there is no information on this.
229 """
230 last_updated_all = datetime.datetime.min
231 for db in settings.RESEARCH_DB_INFO:
232 db_last_updated = database_last_updated(db[ResearchDbInfoKeys.NAME])
233 if db_last_updated is None:
234 continue
235 last_updated_all = max(last_updated_all, db_last_updated)
236 if last_updated_all == datetime.datetime.min:
237 # This will only happen if none of the databases have update time
238 # fields
239 last_updated_all = None
240 return last_updated_all
243# =============================================================================
244# Cursors
245# =============================================================================
248def debug_query() -> None:
249 """
250 Executes a test query that just selects a constant, using the research
251 database (i.e. ``connections['research']``).
252 """
253 cursor = connections[RESEARCH_DB_CONNECTION_NAME].cursor()
254 cursor.execute("SELECT 'debug'")
257def get_executed_researchdb_cursor(
258 sql: str, args: List[Any] = None
259) -> CursorWrapper:
260 """
261 Executes a query on the research database. Returns a wrapped cursor that
262 can be used as a context manager that will close the cursor on completion.
264 Args:
265 sql: SQL text
266 args: arguments to SQL query
268 Returns:
269 a :class:`django.db.backends.utils.CursorWrapper`, which is a context
270 manager that behaves as the executed cursor and also closes it on
271 completion
273 Test code:
275 .. code-block:: python
277 import os
278 import django
279 os.environ['DJANGO_SETTINGS_MODULE'] = 'crate_anon.crateweb.config.settings'
280 django.setup()
281 from crate_anon.crateweb.research.models import *
282 c = get_executed_researchdb_cursor("SELECT 1")
284 """ # noqa: E501
285 args = args or []
286 cursor = connections[
287 RESEARCH_DB_CONNECTION_NAME
288 ].cursor() # type: CursorWrapper
289 try:
290 cursor.execute(sql, args or None)
291 except DatabaseError as exception:
292 add_info_to_exception(exception, {"sql": sql, "args": args})
293 raise
294 return cursor
297def get_executed_researchdb_cursor_qmark_placeholders(
298 sql: str, args: List[Any] = None
299) -> CursorWrapper:
300 """
301 As for :func:`get_executed_researchdb_cursor`, but assumes its SQL may
302 contain question-mark parameter placeholders (``?``) and translates these
303 to the ones we need internally.
304 """
305 sql = translate_sql_qmark_to_percent(sql)
306 return get_executed_researchdb_cursor(sql, args)
309# =============================================================================
310# Data going to Excel files
311# =============================================================================
313ILLEGAL_CHARACTERS_REPLACED_WITH = ""
316def gen_excel_row_elements(
317 worksheet: Worksheet, row: Iterable
318) -> Generator[Any, None, None]:
319 r"""
320 Given an Excel worksheet row, generate individual cell contents, cell by
321 cell.
323 Args:
324 worksheet: a :class:`openpyxl.worksheet.worksheet.Worksheet`, which we
325 need in order to find the worksheet's encoding
326 row: the row to iterate through
328 Yields:
329 the contents of each cell
331 Reasons for this function:
333 1. We need a tuple/list/generator, as ``openpyxl`` checks its types
334 manually.
336 - We want to have a Worksheet object from ``openpyxl``, and say something
337 like
339 .. code-block:: python
341 ws.append(row)
343 where "row" has come from a database query.
345 - However, ``openpyxl`` doesn't believe in duck-typing; see
346 ``Worksheet.append()`` in ``openpyxl/worksheet/worksheet.py``. So
347 sometimes the plain append works (e.g. from MySQL results), but
348 sometimes it fails, e.g. when the row is of type ``pyodbc.Row``.
350 - So we must coerce it to a tuple, list, or generator.
352 - A generator will be the most efficient.
354 2. If a string fails certain checks, openpyxl will raise an
355 :exc:`IllegalCharacterError` exception. We need to work around that.
356 We'll use the "forgiveness, not permission" maxim. Specifically, it
357 dislikes strings matching its ``ILLEGAL_CHARACTERS_RE``, which contains
358 unprintable low characters matching this:
360 .. code-block:: python
362 r'[\000-\010]|[\013-\014]|[\016-\037]'
364 Note the use of octal; ``\037`` is decimal 31.
366 ``openpyxl`` gets to its ``Cell.check_string()`` function for these
367 types:
369 .. code-block:: python
371 STRING_TYPES = (basestring, unicode, bytes)
373 In Python 3, this means ``(str, str, bytes)``. So we should check
374 ``str`` and ``bytes``. (For ``bytes``, we'll follow its method of
375 converting to ``str`` in the encoding of the worksheet's choice.)
376 """
377 # Docstring must be a raw string for Sphinx! See
378 # http://openalea.gforge.inria.fr/doc/openalea/doc/_build/html/source/sphinx/rest_syntax.html#text-syntax-bold-italic-verbatim-and-special-characters # noqa: E501
379 for element in row:
380 if isinstance(element, bytes):
381 # Convert to str using the worksheet's encoding.
382 element = element.decode(worksheet.encoding)
383 # ... or: str(element, encoding)
385 if isinstance(element, str):
386 yield ILLEGAL_CHARACTERS_RE.sub(
387 ILLEGAL_CHARACTERS_REPLACED_WITH, element
388 )
389 else:
390 yield element
393# =============================================================================
394# Query highlighting class
395# =============================================================================
397HIGHLIGHT_FWD_REF = "Highlight"
400class Highlight(models.Model):
401 """
402 Represents the highlighting of a query.
403 """
405 id = models.AutoField(primary_key=True) # automatic
406 user = models.ForeignKey(
407 settings.AUTH_USER_MODEL, on_delete=models.CASCADE
408 )
409 colour = models.PositiveSmallIntegerField(verbose_name="Colour number")
410 text = models.CharField(max_length=255, verbose_name="Text to highlight")
411 active = models.BooleanField(default=True)
413 def __str__(self) -> str:
414 return f"colour={self.colour}, text={self.text}"
416 def get_safe_colour(self) -> int:
417 """
418 Returns our ``colour`` attribute, coerced to the range ``[0,
419 N_CSS_HIGHLIGHT_CLASSES - 1]`` (inclusive).
420 """
421 if self.colour is None:
422 return 0
423 return max(0, min(self.colour, N_CSS_HIGHLIGHT_CLASSES - 1))
425 @staticmethod
426 def as_ordered_dict(
427 highlight_list: Iterable[HIGHLIGHT_FWD_REF],
428 ) -> Dict[int, List[HIGHLIGHT_FWD_REF]]:
429 """
430 Converts a iterable of :class:`Highlight` objects into a dictionary
431 that collects them by highlight number.
433 Args:
434 highlight_list: list of :class:`Highlight` objects
436 Returns:
437 an OrderedDict whose keys are highlight colour numbers (in
438 ascending order), and whose values are lists of all the
439 :class:`Highlight` objects using that highlight colour number
441 """
442 d = dict() # type: Dict[int, List[HIGHLIGHT_FWD_REF]]
443 for highlight in highlight_list:
444 n = highlight.get_safe_colour()
445 if n not in d:
446 d[n] = [] # type: List[HIGHLIGHT_FWD_REF]
447 d[n].append(highlight)
448 # noinspection PyTypeChecker
449 return OrderedDict(sorted(d.items()))
451 @staticmethod
452 def get_active_highlights(request: HttpRequest) -> QuerySet:
453 """
454 Return all active highlights for the current user.
456 Args:
457 request: the :class:`django.http.request.HttpRequest`
459 Returns:
460 a :class:`django.db.models.QuerySet` of the :class:`Highlight`
461 objects
463 """
464 return Highlight.objects.filter(user=request.user, active=True)
466 def activate(self) -> None:
467 """
468 Mark this highlight as active.
469 """
470 self.active = True
471 self.save()
473 def deactivate(self) -> None:
474 """
475 Mark this highlight as inactive.
476 """
477 self.active = False
478 self.save()
481# =============================================================================
482# Query classes
483# =============================================================================
485QUERY_FWD_REF = "Query"
488class QueryBase(models.Model):
489 """
490 Abstract base class for the two query classes.
491 """
493 class Meta:
494 abstract = True
495 app_label = "research"
497 id = models.AutoField(primary_key=True) # automatic
499 sql = models.TextField(verbose_name="SQL query")
500 sql_hash = models.BigIntegerField(
501 verbose_name="64-bit non-cryptographic hash of SQL query"
502 )
503 args = JsonClassField(verbose_name="SQL arguments (as JSON)", null=True)
504 # ... https://github.com/shrubberysoft/django-picklefield
505 raw = models.BooleanField(
506 default=False, verbose_name="SQL is raw, not parameter-substituted"
507 )
508 qmark = models.BooleanField(
509 default=True,
510 verbose_name="Parameter-substituted SQL uses ?, not %s, "
511 "as placeholders",
512 )
513 # active = models.BooleanField(default=True) # see save() below
514 created = models.DateTimeField(auto_now_add=True)
515 deleted = models.BooleanField(
516 default=False,
517 verbose_name="Deleted from the user's perspective. "
518 "Audited queries are never properly deleted.",
519 )
520 formatted_sql = models.TextField(
521 default=None,
522 null=True,
523 verbose_name="SQL with highlighting and formatting",
524 )
526 def __repr__(self) -> str:
527 return simple_repr(
528 self, ["id", "sql", "args", "raw", "qmark", "created", "deleted"]
529 )
531 # -------------------------------------------------------------------------
532 # SQL queries
533 # -------------------------------------------------------------------------
535 def get_original_sql(self) -> str:
536 """
537 Returns the stored raw SQL.
538 """
539 # noinspection PyTypeChecker
540 return self.sql
542 def set_formatted_sql(self, reformat: bool = False) -> None:
543 """
544 Sets 'formatted_sql' by highlighting the syntax and possibly
545 reformatting.
546 """
547 # noinspection PyTypeChecker
548 self.formatted_sql = prettify_sql_html(sql=self.sql, reformat=reformat)
550 def get_formatted_sql(self) -> str:
551 """
552 Getter for 'formatted_sql'.
553 """
554 return self.formatted_sql
557def _close_cursor(cursor: Optional[CursorWrapper]) -> None:
558 if cursor:
559 # log.debug("Closing cursor")
560 cursor.close()
563class Query(QueryBase):
564 """
565 Class to query the research database.
566 """
568 NO_NULL = "_no_null" # special output
570 user = models.ForeignKey(
571 settings.AUTH_USER_MODEL, on_delete=models.CASCADE
572 )
573 active = models.BooleanField(default=True) # see save() below
574 audited = models.BooleanField(default=False)
575 display = models.TextField(
576 default="[]", verbose_name="Subset of output columns to be displayed"
577 )
578 no_null = models.BooleanField(
579 default=False,
580 verbose_name="Omit Null columns for this query when displayed",
581 )
582 last_run = models.DateTimeField(null=True, default=None)
584 def __init__(self, *args, **kwargs) -> None:
585 """
586 Initialize our cache.
587 """
588 super().__init__(*args, **kwargs)
589 self._executed_cursor = None # type: Optional[CursorWrapper]
590 self._column_names = None # type: Optional[List[str]]
591 self._rowcount = None # type: Optional[int]
592 self._rows = None # type: Optional[List[List[Any]]]
593 self._display_list = None # type: Optional[List[str]]
594 self._display_indexes = None # type: Optional[List[int]]
595 self._n_times_executed = 0
596 self._finalizer = None # type: Optional[weakref.finalize]
598 def activate(self) -> None:
599 """
600 Activate this query (and deactivates any others).
601 """
602 self.active = True
603 self.save()
605 def __repr__(self) -> str:
606 return simple_repr(
607 self,
608 [
609 "id",
610 "user",
611 "sql",
612 "args",
613 "raw",
614 "qmark",
615 "active",
616 "created",
617 "deleted",
618 "audited",
619 ],
620 )
622 def save(self, *args, update_fields=None, **kwargs) -> None:
623 """
624 Custom save method. Ensures that only one :class:`Query` has ``active
625 == True`` for a given user. Also sets the hash.
626 """
627 # https://stackoverflow.com/questions/1455126/unique-booleanfield-value-in-django # noqa: E501
628 if self.active:
629 Query.objects.filter(user=self.user, active=True).update(
630 active=False
631 )
632 if update_fields is None or "sql" in update_fields:
633 self.set_formatted_sql()
634 self.sql_hash = hash64(self.sql)
635 if update_fields is not None:
636 update_fields = {"formatted_sql", "sql_hash"}.union(
637 update_fields
638 )
639 super().save(*args, update_fields=update_fields, **kwargs)
641 # -------------------------------------------------------------------------
642 # SQL queries
643 # -------------------------------------------------------------------------
645 def get_sql_args_for_django(self) -> Tuple[str, Optional[List[Any]]]:
646 """
647 Get sql/args in a format suitable for Django, with ``%s`` placeholders,
648 or as escaped raw SQL.
650 Returns:
651 tuple: ``sql, args``
653 - If :attr:`raw` is set, return our raw SQL with ``%`` escaped to
654 ``%%``;
655 - otherwise, if :attr:`qmark` is set, return our raw SQL with ``?``
656 argument placeholders translated to ``%s`` argument placeholders;
657 - otherwise, return the raw SQL.
659 """
660 if self.raw:
661 # noinspection PyTypeChecker
662 sql = escape_percent_for_python_dbapi(self.sql)
663 args = None
664 else:
665 if self.qmark:
666 # noinspection PyTypeChecker
667 sql = translate_sql_qmark_to_percent(self.sql)
668 else:
669 sql = self.sql
670 args = self.args
671 return sql, args
673 def get_executed_cursor(self) -> CursorWrapper:
674 """
675 Get cursor with a query executed (based on our attributes :attr:`sql`,
676 :attr:`args`, :attr:`raw`, :attr:`qmark`).
678 Returns:
679 a :class:`django.db.backends.utils.CursorWrapper`
681 Do NOT use this with ``with``, as in:
683 .. code-block:: python
685 with query.get_executed_cursor() as cursor:
686 # do stuff
688 You could do that (and in general it's what Django advises) but we are
689 trying to be fancy here and use the cursor more efficiently.
691 """
692 if self._executed_cursor is None:
693 sql, args = self.get_sql_args_for_django()
694 cursor = get_executed_researchdb_cursor(sql, args)
695 self._n_times_executed += 1
696 # log.debug("Query: n_times_executed: {}".format(
697 # self._n_times_executed))
698 # log.debug("\n" + "".join(traceback.format_stack()))
699 if self._n_times_executed > 1:
700 log.warning(
701 f"Inefficient: Query executed "
702 f"{self._n_times_executed} times"
703 )
704 try:
705 # noinspection PyTypeChecker
706 self._column_names = get_fieldnames_from_cursor(cursor)
707 except TypeError:
708 self._column_names = [] # type: List[str]
709 self._rowcount = cursor.rowcount
710 self._executed_cursor = cursor
711 self._finalizer = weakref.finalize(
712 self, _close_cursor, self._executed_cursor
713 )
714 return self._executed_cursor
716 def _invalidate_executed_cursor(self) -> None:
717 """
718 Mark the executed cursor as dead (e.g. iterated through).
719 """
720 if self._executed_cursor is not None:
721 self._finalizer()
722 self._finalizer = None # type: Optional[weakref.finalize]
723 self._executed_cursor = None # type: Optional[CursorWrapper]
725 def _cache_basics(self) -> None:
726 """
727 Cache rowcount and column names.
729 Raises:
730 :exc:`DatabaseError` if the query fails
731 """
732 if self._rowcount is None:
733 self.get_executed_cursor() # will cache
735 def _cache_all(self) -> None:
736 """
737 Fetch everything from the query and cache it.
739 Raises:
740 :exc:`DatabaseError` if the query fails
741 """
742 if self._rows is None:
743 cursor = self.get_executed_cursor()
744 self._rows = cursor.fetchall()
745 self._invalidate_executed_cursor()
747 def get_column_names(self) -> List[str]:
748 """
749 Returns column names from the query's cursor.
751 Raises:
752 :exc:`DatabaseError` if the query fails
753 """
754 if self._column_names is None:
755 self._cache_basics()
756 return self._column_names
758 def get_rowcount(self) -> int:
759 """
760 Returns the rowcount from the cursor.
762 Raises:
763 :exc:`DatabaseError` if the query fails
764 """
765 if self._rowcount is None:
766 self._cache_basics()
767 return self._rowcount
769 def get_rows(self) -> List[List[Any]]:
770 """
771 Returns all rows from the query, as a list.
773 Raises:
774 :exc:`DatabaseError` if the query fails
775 """
776 self._cache_all()
777 return self._rows
779 def gen_rows(self) -> Generator[List[Any], None, None]:
780 """
781 Generate rows from the query.
783 Raises:
784 :exc:`DatabaseError` if the query fails
785 """
786 if self._rows is None:
787 # No cache
788 cursor = self.get_executed_cursor()
789 row = cursor.fetchone()
790 while row is not None:
791 yield row
792 row = cursor.fetchone()
793 self._invalidate_executed_cursor()
794 else:
795 # Cache
796 for row in self._rows:
797 yield row
799 def dictfetchall(self) -> List[Dict[str, Any]]:
800 """
801 Executes the query and returns all results as a list of OrderedDicts
802 (one for each row, mapping column names to values).
804 Raises:
805 :exc:`DatabaseError` if the query fails
806 """
807 if self._rows is None:
808 # No cache
809 cursor = self.get_executed_cursor()
810 # noinspection PyTypeChecker
811 result = dictfetchall(cursor)
812 self._invalidate_executed_cursor()
813 return result
814 else:
815 # Cache
816 columns = self._column_names
817 return [OrderedDict(zip(columns, row)) for row in self._rows]
819 def update_last_run(self) -> None:
820 self.last_run = datetime.datetime.now()
821 self.save()
823 @property
824 def run_since_update(self) -> Optional[bool]:
825 # Currently doesn't check which databases are involved in the
826 # query - just checks all databases
827 last_updated_all = last_updated_all_dbs()
828 if last_updated_all is None: # no info from dbs
829 return None
830 elif self.last_run is None: # query never run
831 return False
832 else:
833 # Make last_updated_all timezone aware so they can be compared
834 last_updated_all.replace(datetime.timezone.utc)
835 return self.last_run > last_updated_all
837 # -------------------------------------------------------------------------
838 # Fetching
839 # -------------------------------------------------------------------------
841 @staticmethod
842 def get_active_query_or_none(
843 request: HttpRequest,
844 ) -> Optional[QUERY_FWD_REF]:
845 """
846 Returns the active query for this user, or ``None``.
848 Args:
849 request: the :class:`django.http.request.HttpRequest`
851 Returns:
852 a :class:`Query`, or ``None``.
854 """
855 if not request.user.is_authenticated:
856 return None
857 try:
858 return Query.objects.get(user=request.user, active=True)
859 except Query.DoesNotExist:
860 return None
862 @staticmethod
863 def get_active_query_id_or_none(request: HttpRequest) -> Optional[int]:
864 """
865 Returns the active query's integer ID for this user, or ``None``.
867 Args:
868 request: the :class:`django.http.request.HttpRequest`
870 Returns:
871 the active query's integer PK, or ``None``.
873 """
874 if not request.user.is_authenticated:
875 return None
876 try:
877 query = Query.objects.get(user=request.user, active=True)
878 return query.id
879 except Query.DoesNotExist:
880 return None
882 # -------------------------------------------------------------------------
883 # Activating, deleting, auditing
884 # -------------------------------------------------------------------------
886 # This isn't needed in the base class because it only applies to
887 # audited queries
888 def mark_deleted(self) -> None:
889 """
890 Mark the query as deleted.
892 This will stop it being shown. It will not delete it from the database.
894 We use this deletion method for queries that have been executed, so
895 need an audit trail.
896 """
897 if self.deleted:
898 # log.debug("pointless")
899 return
900 self.deleted = True
901 self.active = False
902 # log.debug("about to save")
903 self.save()
904 # log.debug("saved")
906 def mark_audited(self) -> None:
907 """
908 Mark the query as having been executed and audited. (This prevents it
909 from being wholly deleted.)
910 """
911 if self.audited:
912 return
913 self.audited = True
914 self.save()
916 def audit(
917 self,
918 count_only: bool = False,
919 n_records: int = 0,
920 failed: bool = False,
921 fail_msg: str = "",
922 ) -> None:
923 """
924 Audit the execution of this query:
926 - insert an audit entry referring to this query
927 - mark the query as having been audited (so it's not deleted)
929 Args:
930 count_only: did we know (in advance) that this was a
931 ``COUNT()``-only query?
932 n_records: how many records were returned?
933 failed: did the query fail?
934 fail_msg: if the query failed, the associated failure message
935 """
936 a = QueryAudit(
937 query=self,
938 count_only=count_only,
939 n_records=n_records,
940 failed=failed,
941 fail_msg=fail_msg,
942 )
943 a.save()
944 self.mark_audited()
946 def delete_if_permitted(self) -> None:
947 """
948 Delete the query.
950 - If a query has been executed and therefore audited, it isn't properly
951 deleted; it's just marked as deleted.
952 - If a query has never been executed, we can delete it entirely.
953 """
954 if self.deleted:
955 log.debug("already flagged as deleted")
956 return
957 if self.audited:
958 log.debug("marking as deleted")
959 self.mark_deleted()
960 else:
961 # actually delete
962 log.debug("actually deleting")
963 self.delete()
965 # -------------------------------------------------------------------------
966 # Filtering columns for display output
967 # -------------------------------------------------------------------------
969 def set_display_list(self, display_list: List[str]) -> None:
970 """
971 Sets the internal JSON field, stored in the database, from a list of
972 column headings to display.
974 Args:
975 display_list: list of columns to display
976 """
977 self.display = json.dumps(
978 display_list, separators=JSON_SEPARATORS_COMPACT
979 )
980 self._display_list = None # clear cache
982 def _get_display_list(self) -> List[str]:
983 """
984 Returns a list of columns to display, from our internal JSON
985 representation.
986 """
987 if not self.display:
988 return []
989 try:
990 result = json.loads(self.display)
991 except json.decoder.JSONDecodeError: # e.g. junk
992 log.warning("Query.display field: bad JSON, returning []")
993 return []
994 # Now check it's a list of str:
995 if not isinstance(result, list):
996 log.warning("Query.display field: not a list, returning []")
997 return []
998 if not all(isinstance(x, str) for x in result):
999 log.warning(
1000 "Query.display field: contains non-strings, " "returning []"
1001 )
1002 return []
1003 return result
1005 def get_display_list(self) -> List[str]:
1006 """
1007 Returns a list of columns to display, from our internal JSON
1008 representation. Uses :func:`_get_display_list` and caches it.
1009 """
1010 if self._display_list is None:
1011 self._display_list = self._get_display_list()
1012 # log.debug("Query.get_display_list() -> {!r}".format(
1013 # self._display_list))
1014 return self._display_list
1016 def _get_display_indexes(self) -> Optional[List[int]]:
1017 """
1018 Returns the indexes of the result columns that we wish to display.
1020 Raises:
1021 :exc:`DatabaseError` on query failure
1022 """
1023 display_fieldnames = self.get_display_list()
1024 # If the display attribute is empty assume the user wants all fields
1025 select_all = not display_fieldnames
1027 if self.no_null:
1028 self._cache_all() # writes to self._rows
1030 all_column_names = self.get_column_names()
1032 if select_all and not self.no_null:
1033 # No filtering. Provide the original indexes quickly.
1034 return list(range(len(all_column_names)))
1036 field_indexes = [] # type: List[int]
1037 # Do this to make sure included fields are actually in the results
1038 for i, name in enumerate(all_column_names):
1039 if select_all or name in display_fieldnames:
1040 if self.no_null:
1041 # Exclude fields where all values are null, if no_null
1042 # is switched on.
1043 for row in self._rows:
1044 if row[i] is not None:
1045 field_indexes.append(i)
1046 break
1047 else:
1048 field_indexes.append(i)
1049 return field_indexes
1051 def get_display_indexes(self) -> Optional[List[int]]:
1052 """
1053 Returns the indexes of the result columns that we wish to display.
1054 Uses :func:`_get_display_indexes` and caches it.
1056 Raises:
1057 :exc:`DatabaseError` on query failure
1058 """
1059 if self._display_indexes is None:
1060 self._display_indexes = self._get_display_indexes()
1061 return self._display_indexes
1063 def get_display_column_names(self) -> List[str]:
1064 """
1065 Returns the filtered column names.
1066 """
1067 column_names = self.get_column_names()
1068 display_indexes = self.get_display_indexes()
1069 return [column_names[i] for i in display_indexes]
1071 def gen_display_rows(self) -> Generator[List[Any], None, None]:
1072 """
1073 Generates all filtered rows.
1074 """
1075 field_indexes = self.get_display_indexes()
1076 if not field_indexes:
1077 # No columns specifically selected; return all columns
1078 for row in self.gen_rows():
1079 yield row
1080 else:
1081 for row in self.gen_rows():
1082 yield [row[i] for i in field_indexes]
1084 def get_display_rows(self) -> List[List[Any]]:
1085 """
1086 Returns a list of all filtered rows.
1087 """
1088 if self._rows is not None:
1089 # Pre-cached; there may be a shortcut
1090 field_indexes = self.get_display_indexes()
1091 if not field_indexes:
1092 # No columns specifically selected; return all columns
1093 return self._rows
1094 # Otherwise, use the generator:
1095 return list(self.gen_display_rows())
1097 def make_tsv(self) -> str:
1098 """
1099 Executes the query and returns a TSV result (as a multiline string).
1100 """
1101 fieldnames = self.get_display_column_names()
1102 tsv = make_tsv_row(fieldnames)
1103 for row in self.gen_display_rows():
1104 tsv += make_tsv_row(row)
1105 self.update_last_run()
1106 return tsv
1108 def make_excel(self) -> bytes:
1109 """
1110 Executes the query and returns an Excel workbook, in binary.
1111 """
1112 self._cache_all()
1113 wb = Workbook()
1114 wb.remove_sheet(wb.active) # remove the autocreated blank sheet
1115 sheetname = f"query_{self.id}"
1116 ws = wb.create_sheet(sheetname)
1117 now = datetime.datetime.now()
1119 fieldnames = self.get_display_column_names()
1120 ws.append(fieldnames)
1121 for row in self.gen_display_rows():
1122 ws.append(gen_excel_row_elements(ws, row))
1124 sql_ws = wb.create_sheet(title="SQL")
1125 sql_ws.append(["SQL", "Executed_at"])
1126 sql_ws.append([self.get_original_sql(), now])
1127 self.update_last_run()
1128 return excel_to_bytes(wb)
1131class SitewideQuery(QueryBase):
1132 """
1133 Class representing a site-wide query for research database.
1135 - Site-wide queries are not attached to any particular user.
1136 - They are templatized with placeholders.
1137 - Placeholders begin with ``[[`` and end with ``]]``.
1138 - The user is asked to fill in values for the placeholders.
1140 """
1142 description = models.TextField(
1143 verbose_name="query description", default=""
1144 )
1146 @property
1147 def sql_chunks(self) -> List[str]:
1148 """
1149 Returns a list of SQL chunks and placeholders made from the original
1150 SQL. Placeholders begin with ``[[`` and end with ``]]``.
1152 For example, if the sql is
1154 .. code-block:: none
1156 SELECT * FROM [[table]] WHERE brcid="[[brcid]]";
1158 then ``sql_chunks`` will be
1160 .. code-block:: python
1162 [
1163 'SELECT * FROM ',
1164 'table',
1165 ' WHERE brcid="',
1166 'brcid',
1167 '";'
1168 ]
1170 Note that the first element (and all elements with even [zero-based]
1171 list indexes) are SQL, not placeholders. All elements with odd indexes
1172 are placeholders.
1173 """
1174 sql_string = self.sql
1175 placeholder_start = "[["
1176 placeholder_end = "]]"
1177 startlen = len(placeholder_start)
1178 endlen = len(placeholder_end)
1179 chunks = [] # type: List[str]
1180 index1 = sql_string.find(placeholder_start)
1181 index2 = sql_string.find(placeholder_end)
1182 while index1 != -1 and index2 != -1: # placeholder present
1183 # get bit of sql up to next '[['
1184 chunk = sql_string[:index1]
1185 # get bit of sql between '[[' and ']]'
1186 placeholder = sql_string[index1 + startlen : index2]
1187 chunks.append(chunk)
1188 chunks.append(placeholder)
1189 # get bit of sql after '[[' - this forms new substring to check
1190 sql_string = sql_string[index2 + endlen :]
1191 index1 = sql_string.find(placeholder_start)
1192 index2 = sql_string.find(placeholder_end)
1193 # Deal with any remainder
1194 if sql_string:
1195 # noinspection PyTypeChecker
1196 chunks.append(sql_string)
1198 return chunks
1200 @property
1201 def prettified_chunks(self) -> List[str]:
1202 """
1203 Returns chunks (see sql_chunks) but with formatting.
1204 """
1205 prettified_chunks = [] # type: List[str]
1206 for i, chunk in enumerate(self.sql_chunks):
1207 if (i + 1) % 2 == 1:
1208 chunk = prettify_sql_html(chunk)
1209 prettified_chunks.append(chunk)
1210 return prettified_chunks
1212 def save(self, *args, update_fields=None, **kwargs) -> None:
1213 """
1214 Custom save method. Sets the hash.
1215 """
1216 if update_fields is None or "sql" in update_fields:
1217 self.set_formatted_sql()
1218 self.sql_hash = hash64(self.sql)
1219 if update_fields is not None:
1220 update_fields = {"formatted_sql", "sql_hash"}.union(
1221 update_fields
1222 )
1223 super().save(*args, update_fields=update_fields, **kwargs)
1226# =============================================================================
1227# Query auditing class
1228# =============================================================================
1231class QueryAudit(models.Model):
1232 """
1233 Audit log for a query.
1234 """
1236 id = models.AutoField(primary_key=True) # automatic
1237 query = models.ForeignKey("Query", on_delete=models.PROTECT)
1238 # ... contains information about which user
1239 when = models.DateTimeField(auto_now_add=True)
1240 count_only = models.BooleanField(default=False)
1241 n_records = models.IntegerField(default=0)
1242 # ... not PositiveIntegerField; SQL Server gives -1, for example
1243 failed = models.BooleanField(default=False)
1244 fail_msg = models.TextField()
1246 def __str__(self) -> str:
1247 return f"<QueryAudit id={self.id}>"
1250# =============================================================================
1251# Lookup class for secret RID-to-PID conversion
1252# =============================================================================
1254# class PidLookupRouter:
1255# # https://docs.djangoproject.com/en/1.8/topics/db/multi-db/
1256# # https://newcircle.com/s/post/1242/django_multiple_database_support
1257# # noinspection PyMethodMayBeStatic,PyUnusedLocal
1258# def db_for_read(self, model: Type[models.Model], **hints) -> Optional[str]: # noqa: E501
1259# """
1260# read model PidLookup -> look at database secret
1261# """
1262# # log.debug("PidLookupRouter: {}".format(model._meta.model_name))
1263# # if model._meta.model_name == PidLookup._meta.model_name:
1264# if model == PidLookup:
1265# return 'secret'
1266# return None
1267#
1268# # noinspection PyUnusedLocal
1269# @staticmethod
1270# def allow_migrate(db: str, app_label: str, model_name: str = None,
1271# **hints) -> bool:
1272# # 2017-02-12, to address bug:
1273# # - https://code.djangoproject.com/ticket/27054
1274# # See also:
1275# # - https://docs.djangoproject.com/en/1.10/topics/db/multi-db/#using-other-management-commands # noqa: E501
1276# return db == 'default'
1279class PidLookup(models.Model):
1280 """
1281 Lookup class for secret RID-to-PID conversion.
1283 - Used via one or other of the 'secret' database connections.
1284 - Intended for READ-ONLY access to that table.
1286 - Since we have fixed the tablenames for the anonymiser, we remove the
1287 ``settings.SECRET_MAP`` option. See
1288 :class:`crate_anon.anonymise.models.PatientInfo`. Moreover, we fix the
1289 maximum length, regardless of the specifics of the config used.
1291 - Use as e.g. ``Lookup(pid=XXX)``.
1293 """
1295 pid = models.PositiveIntegerField(
1296 primary_key=True, db_column=PatientInfoConstants.PID_FIELDNAME
1297 )
1298 mpid = models.PositiveIntegerField(
1299 db_column=PatientInfoConstants.MPID_FIELDNAME
1300 )
1301 rid = models.CharField(
1302 db_column=PatientInfoConstants.RID_FIELDNAME,
1303 max_length=get_longest_supported_hasher_output_length(),
1304 )
1305 mrid = models.CharField(
1306 db_column=PatientInfoConstants.MRID_FIELDNAME,
1307 max_length=get_longest_supported_hasher_output_length(),
1308 )
1309 trid = models.PositiveIntegerField(
1310 db_column=PatientInfoConstants.TRID_FIELDNAME
1311 )
1313 class Meta:
1314 managed = False
1315 db_table = PatientInfoConstants.SECRET_MAP_TABLENAME
1317 # https://stackoverflow.com/questions/12158463/how-can-i-make-a-model-read-only # noqa: E501
1318 def save(self, *args, **kwargs) -> None:
1319 return
1321 def delete(self, *args, **kwargs) -> None:
1322 return
1325def get_pid_lookup(
1326 dbinfo: SingleResearchDatabase,
1327 pid: Union[int, str] = None,
1328 mpid: Union[int, str] = None,
1329 trid: int = None,
1330 rid: str = None,
1331 mrid: str = None,
1332) -> Optional[PidLookup]:
1333 """
1334 Looks up a patient in the secret lookup database associated with a
1335 database, from one of several possible identifiers.
1337 Args:
1338 dbinfo: a
1339 :class:`crate_anon.crateweb.research.research_db_info.SingleResearchDatabase`
1340 pid: optional patient identifier (PID) value
1341 mpid: optional master patient identifier (MPID) value
1342 trid: optional transient research identifier (TRID) value
1343 rid: optional research identifier (RID) value
1344 mrid: optional master research identifier (MRID) value
1346 Returns:
1347 a :class:`crate_anon.crateweb.research.models.PidLookup` or ``None``
1349 Raises:
1350 :exc:`ValueError` if none of the IDs was specified
1352 """
1353 dbalias = dbinfo.secret_lookup_db
1354 assert dbalias
1355 q = PidLookup.objects.using(dbalias)
1356 if trid is not None:
1357 lookup = q.get(trid=trid)
1358 elif rid is not None:
1359 lookup = q.get(rid=rid)
1360 elif mrid is not None:
1361 lookup = q.get(mrid=mrid)
1362 elif pid is not None:
1363 lookup = q.get(pid=pid)
1364 elif mpid is not None:
1365 lookup = q.get(mpid=mpid)
1366 else:
1367 raise ValueError("no input")
1368 return lookup
1371def get_mpid(
1372 dbinfo: SingleResearchDatabase,
1373 trid: int = None,
1374 rid: str = None,
1375 mrid: str = None,
1376) -> int:
1377 """
1378 Returns the MPID for a patient, looked up from one of the research IDs.
1380 Args:
1381 dbinfo: a
1382 :class:`crate_anon.crateweb.research.research_db_info.SingleResearchDatabase`
1383 trid: optional transient research identifier (TRID) value
1384 rid: optional research identifier (RID) value
1385 mrid: optional master research identifier (MRID) value
1387 Returns:
1388 the integer MPID, or ``None``
1390 Raises:
1391 :exc:`ValueError` if none of the IDs was specified
1392 """
1393 lookup = get_pid_lookup(dbinfo=dbinfo, trid=trid, rid=rid, mrid=mrid)
1394 # noinspection PyTypeChecker
1395 return lookup.mpid
1398def get_pid(
1399 dbinfo: SingleResearchDatabase,
1400 trid: int = None,
1401 rid: str = None,
1402 mrid: str = None,
1403) -> int:
1404 """
1405 Returns the PID for a patient, looked up from one of the research IDs.
1407 Args:
1408 dbinfo: a
1409 :class:`crate_anon.crateweb.research.research_db_info.SingleResearchDatabase`
1410 trid: optional transient research identifier (TRID) value
1411 rid: optional research identifier (RID) value
1412 mrid: optional master research identifier (MRID) value
1414 Returns:
1415 the integer PID, or ``None``
1417 Raises:
1418 :exc:`ValueError` if none of the IDs was specified
1419 """
1420 lookup = get_pid_lookup(dbinfo=dbinfo, trid=trid, rid=rid, mrid=mrid)
1421 # noinspection PyTypeChecker
1422 return lookup.pid
1425# =============================================================================
1426# Patient Explorer multi-query classes
1427# =============================================================================
1430class TableQueryArgs:
1431 """
1432 Represents SQL for a specific table, with arguments for the SQL. Used by
1433 :class:`PatientMultiQuery`.
1434 """
1436 def __init__(self, table_id: TableId, sql: str, args: List[Any]) -> None:
1437 """
1438 Args:
1439 table_id: a :class:`crate_anon.common.sql.TableId` that this query
1440 is selecting from
1441 sql: SQL text
1442 args: a list of arguments to the SQL
1443 """
1444 self.table_id = table_id
1445 self.sql = sql
1446 self.args = args
1449@register_for_json(method=METHOD_STRIP_UNDERSCORE)
1450class PatientMultiQuery:
1451 """
1452 Represents a set of queries across many tables relating to one or several
1453 patients (but the same patients across all the tables).
1455 Used for the Patient Explorer.
1457 *Development notes:*
1459 - Patient ID query
1461 - Single database is easy; we can use RID or TRID, and therefore TRID for
1462 performance.
1464 Note that ``UNION`` gives only ``DISTINCT`` results by default (``UNION
1465 ALL`` gives everything); see
1466 https://stackoverflow.com/questions/49925/what-is-the-difference-between-union-and-union-all.
1468 .. code-block:: sql
1470 -- Clear, but extensibility of boolean logic less clear:
1471 SELECT trid
1472 FROM diagnosis_table
1473 WHERE diagnosis LIKE 'F20%'
1474 INTERSECT
1475 SELECT trid
1476 FROM progress_note_table
1477 WHERE note LIKE '%schizophreni%' OR note LIKE '%depression%'
1478 ORDER BY trid
1479 -- ... logic across tables requires careful arrangement of UNION vs. INTERSECT
1480 -- ... logic for multiple fields within one table can be done with AND/OR
1482 -- Slower (?), but simpler to manipulate logic?
1483 SELECT DISTINCT something.trid
1484 FROM diagnosis_table INNER JOIN progress_note_table
1485 ON diagnosis_table.trid = progress_note_table.trid
1486 WHERE
1487 diagnosis_table.diagnosis LIKE 'F20%'
1488 AND (progress_note_table.note LIKE '%schizophreni%'
1489 OR progress_note_table.notenote LIKE '%depression%')
1490 ORDER BY something.trid
1491 -- ... boolean logic can all be encapsulated in a single WHERE clause
1492 -- ... can also share existing join code
1493 -- ... ?reasonable speed since the TRID fields will be indexed
1494 -- ... preferable.
1496 - Which ID for the patient ID query?
1498 - the TRID (for speed, inc. sorting) of the first database
1499 - can use the TRID from the first "where clause" table
1500 (don't have to join to a master patient table)
1501 - join everything across databases as before
1503 - Results queries
1505 .. code-block:: none
1507 -- Something like:
1509 SELECT rid, date_of_note, note
1510 FROM progress_note_table
1511 WHERE trid IN ( ... patient_id_query ... )
1512 ORDER BY trid
1514 SELECT rid, date_of_diagnosis, diagnosis, diagnosis_description
1515 FROM diagnosis_table
1516 WHERE trid IN ( ... patient_id_query ... )
1517 ORDER BY trid
1519 This means we will repeat the patient_id_query, which may be inefficient.
1520 Options:
1522 - store the TRIDs in Python, then pass them as arguments
1524 - at which point the SQL string/packet length becomes relevant;
1525 - https://stackoverflow.com/questions/1869753/maximum-size-for-a-sql-server-query-in-clause-is-there-a-better-approach
1526 - https://stackoverflow.com/questions/16335011/what-is-maximum-query-size-for-mysql
1527 - https://stackoverflow.com/questions/96553/practical-limit-to-length-of-sql-query-specifically-mysql
1529 - let the database worry about it
1531 - probably best for now!
1533 - Display
1535 - One patient per page, with multiple results tables.
1537 - Boolean logic on patient selection
1539 - ... within
1541 """ # noqa: E501
1543 def __init__(
1544 self,
1545 output_columns: List[ColumnId] = None,
1546 patient_conditions: List[WhereCondition] = None,
1547 manual_patient_id_query: str = "",
1548 ):
1549 """
1550 Args:
1551 output_columns:
1552 database columns that will be in the output, as
1553 list of :class:`crate_anon.common.sql.ColumnId` objects
1554 patient_conditions:
1555 restrictions on the patient, as a list of
1556 :class:`crate_anon.common.sql.WhereCondition` objects; they
1557 will be joined with ``AND``
1558 manual_patient_id_query:
1559 raw SQL; if specified, overrides ``patient_conditions`` and is
1560 used as the patient-finding part of the query; see
1561 :func:`set_override_query`
1562 """
1563 self._output_columns = output_columns or [] # type: List[ColumnId]
1564 self._patient_conditions = (
1565 patient_conditions or []
1566 ) # type: List[WhereCondition]
1567 self._manual_patient_id_query = manual_patient_id_query or ""
1569 def __repr__(self) -> str:
1570 return (
1571 "{qualname}("
1572 "output_columns={output_columns}, "
1573 "patient_conditions={patient_conditions}, "
1574 "manual_patient_id_query={manual_patient_id_query}"
1575 ")".format(
1576 qualname=self.__class__.__qualname__,
1577 output_columns=repr(self._output_columns),
1578 patient_conditions=repr(self._patient_conditions),
1579 manual_patient_id_query=repr(self._manual_patient_id_query),
1580 )
1581 )
1583 def __eq__(self, other: "PatientMultiQuery") -> bool:
1584 return (
1585 self._output_columns == other._output_columns
1586 and self._patient_conditions == other._patient_conditions
1587 and self._manual_patient_id_query == other._manual_patient_id_query
1588 )
1590 def __hash__(self) -> int:
1591 """
1592 WARNING: Python's hash() function converts the result of __hash__()
1593 to the integer width of the host machine, so 64-bit results can get
1594 down-converted to 32 bits. Use hash64() directly if you want a 64-bit
1595 result.
1596 """
1597 return self.hash64
1599 @property
1600 def hash64(self) -> int:
1601 """
1602 Return an integer (non-cryptographic) hash of the query.
1603 """
1604 return hash64(json_encode(self))
1606 @property
1607 def output_columns(self) -> List[ColumnId]:
1608 """
1609 Returns the output columns, as a list of
1610 :class:`crate_anon.common.sql.ColumnId` objects.
1611 """
1612 return self._output_columns
1614 @property
1615 def has_output_columns(self) -> bool:
1616 """
1617 Does this multiquery have any output columns?
1618 """
1619 return bool(self._output_columns)
1621 @property
1622 def ok_to_run(self) -> bool:
1623 """
1624 Is this OK to run, i.e. does it have a patient ID query and some output
1625 columns?
1626 """
1627 return self.has_output_columns and self.has_patient_id_query
1629 @property
1630 def patient_conditions(self) -> List[WhereCondition]:
1631 """
1632 Returns all ``WHERE`` conditions restricting the patient, as a list of
1633 :class:`crate_anon.common.sql.WhereCondition` objects.
1634 """
1635 return self._patient_conditions
1637 @property
1638 def manual_patient_id_query(self) -> str:
1639 """
1640 Returns the manual override SQL for the patient ID query.
1641 """
1642 return self._manual_patient_id_query
1644 def add_output_column(self, column_id: ColumnId) -> None:
1645 """
1646 Adds a database column to the output.
1647 """
1648 if column_id not in self._output_columns:
1649 self._output_columns.append(column_id)
1650 self._output_columns.sort()
1652 def clear_output_columns(self) -> None:
1653 """
1654 Removes all output columns from the multiquery.
1655 """
1656 self._output_columns = [] # type: List[ColumnId]
1658 def add_patient_condition(self, where: WhereCondition) -> None:
1659 """
1660 Adds a patient ``WHERE`` condition.
1662 Args:
1663 where: a :class:`crate_anon.common.sql.WhereCondition`
1664 """
1665 if where not in self._patient_conditions:
1666 self._patient_conditions.append(where)
1667 self._patient_conditions.sort()
1669 def clear_patient_conditions(self) -> None:
1670 """
1671 Removes all ``WHERE`` conditions on the patient.
1672 """
1673 self._patient_conditions = [] # type: List[WhereCondition]
1675 def set_override_query(self, query: str) -> None:
1676 """
1677 Sets the manual override SQL for the patient ID query.
1679 Args:
1680 query: raw SQL
1682 This query should return a single column of MRID values that is fetched
1683 into Python and used to restrict other queries. Here's a fictional
1684 example to fetch the MRIDs for all patients who have the word
1685 "neutrophils" in their notes:
1687 .. code-block:: sql
1689 SELECT DISTINCT anonymous_output.patient.nhshash AS _mrid
1690 FROM anonymous_output.patient
1691 INNER JOIN anonymous_output.note ON anonymous_output.note.trid = anonymous_output.patient.trid
1692 WHERE MATCH (anonymous_output.note.note) AGAINST ('neutrophils')
1693 AND anonymous_output.patient.nhshash IS NOT NULL
1694 ORDER BY _mrid
1696 """ # noqa: E501
1697 self._manual_patient_id_query = query
1699 def _get_select_mrid_column(self) -> Optional[ColumnId]:
1700 """
1701 Returns the MRID column from the first table in the patient ``WHERE``
1702 conditions, or ``None``.
1704 Returns:
1705 a :class:`crate_anon.common.sql.ColumnId` or ``None``
1707 """
1708 if not self._patient_conditions:
1709 return None
1710 return self._research_database_info.get_linked_mrid_column(
1711 self._patient_conditions[0].table_id
1712 )
1714 @property
1715 def has_patient_id_query(self) -> bool:
1716 """
1717 Does this multiquery have a patient ID query? This can either be one
1718 that the user has specified manually, or one built from ``WHERE``
1719 conditions that appears to refer to an MRID.
1720 """
1721 if self._manual_patient_id_query:
1722 return True
1723 if self._patient_conditions:
1724 mrid_col = self._get_select_mrid_column()
1725 if mrid_col and mrid_col.is_valid:
1726 return True
1727 return False
1729 def patient_id_query(self, with_order_by: bool = True) -> str:
1730 """
1731 Returns an SQL ``SELECT`` statement based on the list of ``WHERE``
1732 conditions already stored, joined with ``AND`` by default. (If a manual
1733 patient ID query has been specified, return that instead.)
1735 Args:
1736 with_order_by: add an ``ORDER BY`` query on the MRID; such an
1737 ordering is important for consistency across runs (but is
1738 prohibited by SQL Server in subqueries -- "The ORDER BY clause
1739 is invalid in views, inline functions, derived tables,
1740 subqueries, ... unless TOP, OFFSET or FOR XML is specified.")
1742 Returns:
1743 str: SQL
1745 """
1747 if self._manual_patient_id_query:
1748 # User has specified one manually.
1749 return self._manual_patient_id_query
1751 if not self._patient_conditions:
1752 return ""
1754 select_mrid_column = self._get_select_mrid_column()
1755 if not select_mrid_column.is_valid:
1756 log.warning(
1757 f"PatientMultiQuery.patient_id_query(): invalid"
1758 f" select_mrid_column: {select_mrid_column!r}"
1759 )
1760 # One way this can happen: (1) a user saves a PMQ; (2) the
1761 # administrator removes one of the databases!
1762 return ""
1764 research_database_info = get_research_db_info()
1765 grammar = research_database_info.grammar
1767 mrid_alias = "_mrid"
1768 sql = add_to_select(
1769 "",
1770 grammar=grammar,
1771 select_elements=[
1772 SelectElement(column_id=select_mrid_column, alias=mrid_alias)
1773 ],
1774 distinct=True,
1775 where_conditions=(
1776 self._patient_conditions
1777 + [
1778 WhereCondition(
1779 column_id=select_mrid_column, op="IS NOT NULL"
1780 )
1781 ]
1782 ),
1783 where_type="AND",
1784 magic_join=True,
1785 formatted=False,
1786 )
1787 if with_order_by:
1788 sql += " ORDER BY " + mrid_alias
1789 sql = format_sql(sql)
1790 # log.debug(sql)
1791 return sql
1793 @property
1794 def all_full_queries(self) -> List[TableQueryArgs]:
1795 """
1796 Returns all final queries. This is a list of multiple SQL queries, each
1797 retrieving information from one table, and all retrieving information
1798 for the same patient(s).
1800 The patients we use are defined by our :meth:`patient_id_query`.
1802 Returns:
1803 list: a list of :class:`TableQueryArgs` objects (q.v.)
1805 """
1806 return self.all_queries(mrids=None)
1808 def all_queries_specific_patients(
1809 self, mrids: List[int]
1810 ) -> List[TableQueryArgs]:
1811 """
1812 Returns all final queries. This is a list of multiple SQL queries, each
1813 retrieving information from one table, and all retrieving information
1814 for the same patient(s).
1816 The patients we use are defined by the MRID list given.
1818 Args:
1819 mrids: list of MRIDs
1821 Returns:
1822 list: a list of :class:`TableQueryArgs` objects (q.v.)
1824 """
1825 return self.all_queries(mrids=mrids)
1827 def all_queries(self, mrids: List[Any] = None) -> List[TableQueryArgs]:
1828 """
1829 Returns all final queries. This is a list of multiple SQL queries, each
1830 retrieving information from one table, and all retrieving information
1831 for the same patient(s).
1833 The patients we use are defined either by the MRID list given, or if
1834 that is empty or blank, our :meth:`patient_id_query`.
1836 Args:
1837 mrids: list of MRIDs; if this is ``None`` or empty, use the
1838 patients fetched (live) by our :meth:`patient_id_query`.
1840 Returns:
1841 list: a list of :class:`TableQueryArgs` objects (q.v.)
1843 """
1844 queries = [] # type: List[TableQueryArgs]
1845 table_columns_map = columns_to_table_column_hierarchy(
1846 self._output_columns, sort=True
1847 )
1848 for table, columns in table_columns_map:
1849 table_sql_args = self.make_query(
1850 table_id=table, columns=columns, mrids=mrids
1851 )
1852 queries.append(table_sql_args)
1853 return queries
1855 def where_patient_clause(
1856 self, table_id: TableId, mrids: List[Any] = None
1857 ) -> SqlArgsTupleType:
1858 """
1859 Returns an SQL WHERE clauses similar to ``sometable.mrid IN (1, 2, 3)``
1860 or ``sometable.mridcol IN (SELECT mrid FROM masterpatienttable)``. The
1861 clause is used to restrict patients by MRID.
1863 Args:
1864 table_id: :class:`crate_anon.common.sql.TableId` for the table
1865 whose MRID column we will apply the ``WHERE`` clause to
1866 mrids: list of MRIDs; if this is ``None`` or empty, use the
1867 patients fetched (live) by our :meth:`patient_id_query`.
1869 Returns:
1870 tuple: ``sql, args``
1871 """
1872 mrid_column = self._research_database_info.get_mrid_column_from_table(
1873 table_id
1874 )
1875 if mrids:
1876 in_clause = ",".join(["?"] * len(mrids))
1877 # ... see notes for translate_sql_qmark_to_percent()
1878 args = mrids
1879 else:
1880 # If we haven't specified specific patients, use our patient-
1881 # finding query.
1882 in_clause = self.patient_id_query(with_order_by=False)
1883 # ... SQL Server moans if you use use ORDER BY in a subquery:
1884 # "The ORDER BY clause is invalid in views, inline functions,
1885 # derived tables, subqueries, ... unless TOP, OFFSET or FOR XML
1886 # is specified."
1887 args = [] # type: List[Any]
1888 research_database_info = get_research_db_info()
1889 grammar = research_database_info.grammar
1890 sql = f"{mrid_column.identifier(grammar)} IN ({in_clause})"
1891 return sql, args
1893 def make_query(
1894 self,
1895 table_id: TableId,
1896 columns: List[ColumnId],
1897 mrids: List[Any] = None,
1898 ) -> TableQueryArgs:
1899 """
1900 Returns an SQL query to retrieve information from a single table for
1901 certain patients. This query is similar to ``SELECT a, b, c FROM
1902 sometable WHERE sometable.mrid IN (1, 2, 3)`` or ``SELECT a, b, c FROM
1903 sometable WHERE sometable.mrid IN (SELECT mrid FROM
1904 masterpatienttable)``. This then forms one query from (potentially)
1905 many for our patient(s).
1907 Args:
1908 table_id: a :class:`crate_anon.common.sql.TableId` for the SELECT
1909 FROM table
1910 columns: columns, specified as a list of
1911 :class:`crate_anon.common.sql.ColumnId`, to select from the
1912 table (in addition to which, we will always select the MRID
1913 column from that table)
1914 mrids: list of MRIDs; if this is ``None`` or empty, use the
1915 patients fetched (live) by our :meth:`patient_id_query`.
1917 Returns:
1918 a :class:`TableQueryArgs` object (q.v.)
1920 """
1921 if not columns:
1922 raise ValueError("No columns specified")
1923 research_database_info = get_research_db_info()
1924 grammar = research_database_info.grammar
1925 mrid_column = research_database_info.get_mrid_column_from_table(
1926 table_id
1927 )
1928 all_columns = [mrid_column]
1929 for c in columns:
1930 if c not in all_columns:
1931 all_columns.append(c)
1932 where_clause, args = self.where_patient_clause(table_id, mrids)
1933 select_elements = [SelectElement(column_id=col) for col in all_columns]
1934 where_conditions = [WhereCondition(raw_sql=where_clause)]
1935 sql = add_to_select(
1936 "",
1937 grammar=grammar,
1938 select_elements=select_elements,
1939 where_conditions=where_conditions,
1940 magic_join=True,
1941 formatted=True,
1942 )
1943 return TableQueryArgs(table_id, sql, args)
1945 # -------------------------------------------------------------------------
1946 # Display
1947 # -------------------------------------------------------------------------
1949 @property
1950 def output_cols_html(self) -> str:
1951 """
1952 Returns all our output columns in HTML format.
1953 """
1954 research_database_info = get_research_db_info()
1955 grammar = research_database_info.grammar
1956 return prettify_sql_html(
1957 "\n".join(
1958 [
1959 column_id.identifier(grammar)
1960 for column_id in self.output_columns
1961 ]
1962 )
1963 )
1965 @property
1966 def pt_conditions_html(self) -> str:
1967 """
1968 Returns all our patient WHERE conditions in HTML format.
1969 """
1970 research_database_info = get_research_db_info()
1971 grammar = research_database_info.grammar
1972 return prettify_sql_html(
1973 "\nAND ".join([wc.sql(grammar) for wc in self.patient_conditions])
1974 )
1976 def summary_html(self, element_counter: HtmlElementCounter) -> str:
1977 """
1978 Returns an HTML representation of this multiquery.
1980 Args:
1981 element_counter: a
1982 :class:`crate_anon.crateweb.research.html_functions.HtmlElementCounter`,
1983 which will be modified
1985 Returns:
1986 str: HTML
1988 """
1990 def collapser(x: str) -> str:
1991 return element_counter.overflow_div(contents=x)
1993 outcols = self.output_cols_html
1994 manual_query = self.manual_patient_id_query
1995 if manual_query:
1996 manual_or_auto = " (MANUAL)"
1997 ptselect = prettify_sql_html(manual_query)
1998 else:
1999 manual_or_auto = ""
2000 ptselect = self.pt_conditions_html
2001 return f"""
2002 Output columns:<br>
2003 {collapser(outcols)}
2004 Patient selection{manual_or_auto}:<br>
2005 {collapser(ptselect)}
2006 """
2008 # -------------------------------------------------------------------------
2009 # Data finder: COUNT(*) for all patient tables
2010 # -------------------------------------------------------------------------
2012 def gen_data_finder_queries(
2013 self, mrids: List[Any] = None
2014 ) -> Generator[TableQueryArgs, None, None]:
2015 """
2016 Generates a set of queries that, when executed, return the following
2017 summary columns from each of our tables, filtered for patients by our
2018 :meth:`where_patient_clause`, and grouped by ``master_research_id``
2019 (MRID):
2021 .. code-block:: sql
2023 master_research_id,
2024 table_name,
2025 COUNT(*) AS n_records,
2026 MIN(date_column) AS min_date, -- NULL if no date column
2027 MAX(date_column) AS max_date -- NULL if no date column
2029 These queries can be used to see quickly which tables have interesting
2030 information in.
2032 Args:
2033 mrids: list of MRIDs; if this is ``None`` or empty, use the
2034 patients fetched (live) by our :meth:`patient_id_query`.
2036 Yields:
2037 :class:`TableQueryArgs` objects (q.v.)
2039 """
2040 research_database_info = get_research_db_info()
2041 grammar = research_database_info.grammar
2042 mrid_alias = "master_research_id"
2043 table_name_alias = "table_name"
2044 n_records_alias = "n_records"
2045 min_date_alias = "min_date"
2046 max_date_alias = "max_date"
2047 for (
2048 table_id
2049 ) in research_database_info.get_mrid_linkable_patient_tables():
2050 mrid_col = research_database_info.get_mrid_column_from_table(
2051 table=table_id
2052 )
2053 date_col = research_database_info.get_default_date_column(
2054 table=table_id
2055 )
2056 if date_col:
2057 min_date = f"MIN({date_col.identifier(grammar)})"
2058 max_date = f"MAX({date_col.identifier(grammar)})"
2059 else:
2060 min_date = "NULL"
2061 max_date = "NULL"
2062 # ... OK (at least in MySQL) to do:
2063 # SELECT col1, COUNT(*), NULL FROM table GROUP BY col1;
2064 where_clause, args = self.where_patient_clause(table_id, mrids)
2065 table_identifier = table_id.identifier(grammar)
2066 select_elements = [
2067 SelectElement(column_id=mrid_col, alias=mrid_alias),
2068 SelectElement(
2069 raw_select=sql_string_literal(table_identifier),
2070 alias=table_name_alias,
2071 ),
2072 SelectElement(
2073 raw_select="COUNT(*)",
2074 from_table_for_raw_select=table_id,
2075 alias=n_records_alias,
2076 ),
2077 SelectElement(
2078 raw_select=min_date,
2079 from_table_for_raw_select=table_id,
2080 alias=min_date_alias,
2081 ),
2082 SelectElement(
2083 raw_select=max_date,
2084 from_table_for_raw_select=table_id,
2085 alias=max_date_alias,
2086 ),
2087 ]
2088 where_conditions = [WhereCondition(raw_sql=where_clause)]
2089 sql = add_to_select(
2090 "",
2091 grammar=grammar,
2092 select_elements=select_elements,
2093 where_conditions=where_conditions,
2094 magic_join=True,
2095 formatted=False,
2096 )
2097 sql += "\nGROUP BY " + mrid_col.identifier(grammar)
2098 sql += "\nORDER BY " + mrid_alias
2099 sql = format_sql(sql)
2100 yield TableQueryArgs(table_identifier, sql, args)
2102 # -------------------------------------------------------------------------
2103 # Monster data: SELECT * for all patient tables
2104 # -------------------------------------------------------------------------
2106 def gen_monster_queries(
2107 self, mrids: List[int] = None
2108 ) -> Generator[TableQueryArgs, None, None]:
2109 """
2110 Generates a set of queries that, when executed, return ``SELECT *``
2111 from each of our tables, filtered for patients by our
2112 :meth:`where_patient_clause`. So it's like the basic Patient Explorer
2113 but with all columns in the output.
2115 These queries are used in the Patient Explorer "Monster Data" view.
2117 Args:
2118 mrids: list of MRIDs; if this is ``None`` or empty, use the
2119 patients fetched (live) by our :meth:`patient_id_query`.
2121 Yields:
2122 :class:`TableQueryArgs` objects (q.v.)
2124 """
2125 research_database_info = get_research_db_info()
2126 grammar = research_database_info.grammar
2127 for (
2128 table_id
2129 ) in research_database_info.get_mrid_linkable_patient_tables():
2130 mrid_col = research_database_info.get_mrid_column_from_table(
2131 table=table_id
2132 )
2133 where_clause, args = self.where_patient_clause(table_id, mrids)
2134 # We add the WHERE using our magic query machine, to get the joins
2135 # right:
2136 select_elements = [
2137 SelectElement(
2138 raw_select="*", from_table_for_raw_select=table_id
2139 ),
2140 ]
2141 where_conditions = [
2142 WhereCondition(
2143 raw_sql=where_clause,
2144 from_table_for_raw_sql=mrid_col.table_id,
2145 ),
2146 ]
2147 sql = add_to_select(
2148 "",
2149 grammar=grammar,
2150 select_elements=select_elements,
2151 where_conditions=where_conditions,
2152 magic_join=True,
2153 formatted=False,
2154 )
2155 sql += " ORDER BY " + mrid_col.identifier(grammar)
2156 sql = format_sql(sql)
2157 yield TableQueryArgs(table_id, sql, args)
2160# =============================================================================
2161# PatientExplorer
2162# =============================================================================
2164PATIENT_EXPLORER_FWD_REF = "PatientExplorer"
2167class PatientExplorer(models.Model):
2168 """
2169 Class to explore the research database on a per-patient basis.
2170 """
2172 class Meta:
2173 app_label = "research"
2175 id = models.AutoField(primary_key=True) # automatic
2176 user = models.ForeignKey(
2177 settings.AUTH_USER_MODEL, on_delete=models.CASCADE
2178 )
2179 patient_multiquery = JsonClassField(
2180 verbose_name="PatientMultiQuery as JSON", null=True
2181 ) # type: PatientMultiQuery
2182 pmq_hash = models.BigIntegerField(
2183 verbose_name="64-bit non-cryptographic hash of JSON of "
2184 "patient_multiquery"
2185 )
2186 active = models.BooleanField(default=True) # see save() below
2187 created = models.DateTimeField(auto_now_add=True)
2188 deleted = models.BooleanField(
2189 default=False,
2190 verbose_name="Deleted from the user's perspective. "
2191 "Audited queries are never properly deleted.",
2192 )
2193 audited = models.BooleanField(default=False)
2195 def __init__(self, *args, **kwargs) -> None:
2196 super().__init__(*args, **kwargs)
2197 if not self.patient_multiquery:
2198 self.patient_multiquery = PatientMultiQuery()
2200 def __str__(self) -> str:
2201 return f"<PatientExplorer id={self.id}>"
2203 def save(self, *args, update_fields=None, **kwargs) -> None:
2204 """
2205 Custom save method. Ensures that only one :class:`PatientExplorer` has
2206 ``active == True`` for a given user. Also sets the hash.
2207 """
2208 if self.active:
2209 PatientExplorer.objects.filter(user=self.user, active=True).update(
2210 active=False
2211 )
2213 if update_fields is None or "patient_multiquery" in update_fields:
2214 self.pmq_hash = self.patient_multiquery.hash64
2215 if update_fields is not None:
2216 update_fields = {"pmq_hash"}.union(update_fields)
2218 # Beware: Python's hash() function will downconvert to 32 bits on
2219 # 32-bit machines; use pmq.hash64() directly, not hash(pmq).
2220 super().save(*args, update_fields=update_fields, **kwargs)
2222 # -------------------------------------------------------------------------
2223 # Fetching
2224 # -------------------------------------------------------------------------
2226 @staticmethod
2227 def get_active_pe_or_none(
2228 request: HttpRequest,
2229 ) -> Optional[PATIENT_EXPLORER_FWD_REF]:
2230 """
2231 Args:
2232 request: the :class:`django.http.request.HttpRequest`
2234 Returns:
2235 The active :class:`PatientExplorer` for the user, or ``None``.
2237 """
2238 if not request.user.is_authenticated:
2239 return None
2240 try:
2241 return PatientExplorer.objects.get(user=request.user, active=True)
2242 except PatientExplorer.DoesNotExist:
2243 return None
2245 @staticmethod
2246 def get_active_pe_id_or_none(request: HttpRequest) -> Optional[int]:
2247 """
2248 Args:
2249 request: the :class:`django.http.request.HttpRequest`
2251 Returns:
2252 The integer PK of the active :class:`PatientExplorer` for the user,
2253 or ``None``.
2255 """
2256 if not request.user.is_authenticated:
2257 return None
2258 try:
2259 pe = PatientExplorer.objects.get(user=request.user, active=True)
2260 return pe.id
2261 except PatientExplorer.DoesNotExist:
2262 return None
2264 # -------------------------------------------------------------------------
2265 # Activating, deleting, auditing
2266 # -------------------------------------------------------------------------
2268 def activate(self) -> None:
2269 """
2270 Activates this :class:`PatientExplorer` (and deactivates any others).
2271 """
2272 self.active = True
2273 self.save()
2275 def mark_audited(self) -> None:
2276 """
2277 Mark the query as having been executed and audited. (This prevents it
2278 from being wholly deleted.)
2279 """
2280 if self.audited:
2281 return
2282 self.audited = True
2283 self.save()
2285 def mark_deleted(self) -> None:
2286 """
2287 Mark the query as deleted.
2289 This will stop it being shown. It will not delete it from the database.
2291 We use this deletion method for queries that have been executed, so
2292 need an audit trail.
2293 """
2294 if self.deleted:
2295 # log.debug("pointless")
2296 return
2297 self.deleted = True
2298 self.active = False
2299 # log.debug("about to save")
2300 self.save()
2301 # log.debug("saved")
2303 def delete_if_permitted(self) -> None:
2304 """
2305 Delete the query.
2307 - If a query has been executed and therefore audited, it isn't properly
2308 deleted; it's just marked as deleted.
2309 - If a query has never been executed, we can delete it entirely.
2310 """
2311 if self.deleted:
2312 log.debug("already flagged as deleted")
2313 return
2314 if self.audited:
2315 log.debug("marking as deleted")
2316 self.mark_deleted()
2317 else:
2318 # actually delete
2319 log.debug("actually deleting")
2320 self.delete()
2322 def audit(
2323 self,
2324 count_only: bool = False,
2325 n_records: int = 0,
2326 failed: bool = False,
2327 fail_msg: str = "",
2328 ) -> None:
2329 """
2330 Audit the execution of this query:
2332 - insert an audit entry referring to this query
2333 - mark the query as having been audited (so it's not deleted)
2335 Args:
2336 count_only: did we know (in advance) that this was a
2337 ``COUNT()``-only query?
2338 n_records: how many records were returned?
2339 failed: did the query fail?
2340 fail_msg: if the query failed, the associated failure message
2341 """
2342 a = PatientExplorerAudit(
2343 patient_explorer=self,
2344 count_only=count_only,
2345 n_records=n_records,
2346 failed=failed,
2347 fail_msg=fail_msg,
2348 )
2349 a.save()
2350 self.mark_audited()
2352 # -------------------------------------------------------------------------
2353 # Using the internal PatientMultiQuery
2354 # -------------------------------------------------------------------------
2356 def all_queries(self, mrids: List[Any] = None) -> List[TableQueryArgs]:
2357 """
2358 Returns all queries from our :attr:`patient_multiquery`. See
2359 :meth:`PatientMultiQuery.all_queries`
2361 Args:
2362 mrids: list of MRIDs; if this is ``None`` or empty, use the
2363 patients fetched (live) by our :attr:`patient_multiquery`'s
2364 :meth:`PatientMultiQuery.patient_id_query`.
2366 Returns:
2367 list: a list of :class:`TableQueryArgs` objects (q.v.)
2369 """
2370 return self.patient_multiquery.all_queries(mrids=mrids)
2372 @staticmethod
2373 def get_executed_cursor(sql: str, args: List[Any] = None) -> CursorWrapper:
2374 """
2375 Executes a query (via the research database) and returns its cursor.
2377 Args:
2378 sql: SQL text
2379 args: arguments to SQL query
2381 Returns:
2382 a :class:`django.db.backends.utils.CursorWrapper`, which is a
2383 context manager that behaves as the executed cursor and also closes
2384 it on completion
2385 """
2386 sql = translate_sql_qmark_to_percent(sql)
2387 cursor = get_executed_researchdb_cursor(sql, args)
2388 return cursor
2390 def get_patient_mrids(self) -> List[int]:
2391 """
2392 Returns all MRIDs from our :attr:`patient_multiquery`'s
2393 :meth:`PatientMultiQuery.patient_id_query`.
2394 """
2395 sql = self.patient_multiquery.patient_id_query(with_order_by=True)
2396 # log.critical(sql)
2397 with self.get_executed_cursor(sql) as cursor:
2398 return [row[0] for row in cursor.fetchall()]
2400 def get_zipped_tsv_binary(self) -> bytes:
2401 """
2402 Returns a ZIP file containing TSVs, one for each table in our
2403 :attr:`patient_multiquery`.
2404 """
2405 # Don't pass giant result sets around beyond what's necessary.
2406 # Use cursor.fetchone()
2407 grammar = make_grammar(settings.RESEARCH_DB_DIALECT)
2408 memfile = io.BytesIO()
2409 z = zipfile.ZipFile(memfile, "w")
2410 for tsa in self.patient_multiquery.all_queries():
2411 table_id = tsa.table_id
2412 sql = tsa.sql
2413 args = tsa.args
2414 with self.get_executed_cursor(sql, args) as cursor:
2415 try:
2416 fieldnames = get_fieldnames_from_cursor(cursor)
2417 except TypeError:
2418 fieldnames = [] # type: List[str]
2419 tsv = make_tsv_row(fieldnames)
2420 row = cursor.fetchone()
2421 while row is not None:
2422 tsv += make_tsv_row(row)
2423 row = cursor.fetchone()
2424 filename = table_id.identifier(grammar) + ".tsv"
2425 z.writestr(filename, tsv.encode("utf-8"))
2426 z.close()
2427 return memfile.getvalue()
2429 def get_xlsx_binary(self) -> bytes:
2430 """
2431 Returns an XLSX (Excel) file containing spreadsheets, one for each
2432 table in our :attr:`patient_multiquery`.
2434 Other notes:
2436 - cell size: see
2437 https://stackoverflow.com/questions/13197574/python-openpyxl-column-width-size-adjust;
2438 and the "auto_size" / "bestFit" options don't really do the job,
2439 according to the interweb.
2441 """
2442 wb = Workbook()
2443 wb.remove_sheet(wb.active) # remove the autocreated blank sheet
2444 sqlsheet_rows = [["Table", "SQL", "Args", "Executed_at"]]
2445 for tsa in self.patient_multiquery.all_queries():
2446 table_id = tsa.table_id
2447 sql = tsa.sql
2448 args = tsa.args
2449 sqlsheet_rows.append(
2450 [str(table_id), sql, repr(args), datetime.datetime.now()]
2451 )
2452 ws = wb.create_sheet(title=str(table_id))
2453 with self.get_executed_cursor(sql, args) as cursor:
2454 try:
2455 fieldnames = get_fieldnames_from_cursor(cursor)
2456 except (AttributeError, IndexError):
2457 fieldnames = [] # type: List[str]
2458 ws.append(fieldnames)
2459 row = cursor.fetchone()
2460 while row is not None:
2461 ws.append(gen_excel_row_elements(ws, row))
2462 row = cursor.fetchone()
2463 sql_ws = wb.create_sheet(title="SQL")
2464 for r in sqlsheet_rows:
2465 sql_ws.append(r)
2466 return excel_to_bytes(wb)
2468 # -------------------------------------------------------------------------
2469 # Using the internal PatientMultiQuery
2470 # -------------------------------------------------------------------------
2472 def get_patient_id_query(self, with_order_by: bool = True) -> str:
2473 """
2474 Returns SQL from our :attr:`patient_multiquery`'s
2475 :meth:`PatientMultiQuery.patient_id_query` (q.v.).
2477 Args:
2478 with_order_by: see :meth:`PatientMultiQuery.patient_id_query`
2479 """
2480 return self.patient_multiquery.patient_id_query(
2481 with_order_by=with_order_by
2482 )
2484 # -------------------------------------------------------------------------
2485 # Display
2486 # -------------------------------------------------------------------------
2488 @property
2489 def summary_html(self) -> str:
2490 """
2491 Return HTML summarizing this object.
2492 """
2493 # Nasty hack. We want collapsing things, so we want HTML element IDs.
2494 # We could build the HTML table in code for the Patient Explorer
2495 # chooser, but I was trying to do it in Django templates.
2496 # However, it's not easy to pass parameters (such as an
2497 # HtmlElementCounter) back to Python from Django templates.
2498 # So we can hack it a bit:
2499 element_counter = HtmlElementCounter(prefix=f"pe_{self.id}_")
2500 return self.patient_multiquery.summary_html(
2501 element_counter=element_counter
2502 )
2504 @property
2505 def has_patient_id_query(self) -> bool:
2506 """
2507 Does our our :attr:`patient_multiquery` have a patient ID query?
2509 See :meth:`PatientMultiQuery.has_patient_id_query`.
2510 """
2511 return self.patient_multiquery.has_patient_id_query
2513 @property
2514 def has_output_columns(self) -> bool:
2515 """
2516 Does our our :attr:`patient_multiquery` have output columns?
2518 See :meth:`PatientMultiQuery.has_output_columns`.
2519 """
2520 return self.patient_multiquery.has_output_columns
2522 # -------------------------------------------------------------------------
2523 # Data finder
2524 # -------------------------------------------------------------------------
2526 @property
2527 def data_finder_excel(self) -> bytes:
2528 """
2529 Returns an XSLX (Excel) file containing summary (count) information
2530 for each table.
2532 See :meth:`PatientMultiQuery.gen_data_finder_queries`.
2533 """
2534 fieldnames = [] # type: List[str]
2535 wb = Workbook()
2536 wb.remove_sheet(wb.active) # remove the autocreated blank sheet
2537 all_ws = wb.create_sheet("All_patients")
2538 sql_ws = wb.create_sheet("SQL")
2539 sql_ws.append(["Table", "SQL", "Args", "Executed_at"])
2541 for tsa in self.patient_multiquery.gen_data_finder_queries():
2542 table_identifier = tsa.table_id
2543 sql = tsa.sql
2544 args = tsa.args
2545 sql_ws.append(
2546 [
2547 table_identifier,
2548 format_sql(sql),
2549 repr(args),
2550 datetime.datetime.now(),
2551 ]
2552 )
2553 with self.get_executed_cursor(sql, args) as cursor:
2554 if not fieldnames:
2555 try:
2556 fieldnames = get_fieldnames_from_cursor(cursor)
2557 except TypeError:
2558 fieldnames = [] # type: List[str]
2559 all_ws.append(fieldnames)
2560 row = cursor.fetchone()
2561 while row is not None:
2562 mrid = str(row[0])
2563 if mrid in wb:
2564 ws = wb[mrid]
2565 else:
2566 ws = wb.create_sheet(mrid)
2567 ws.append(fieldnames)
2568 rowtuple = tuple(row)
2569 ws.append(rowtuple)
2570 all_ws.append(rowtuple)
2571 row = cursor.fetchone()
2572 return excel_to_bytes(wb)
2575# =============================================================================
2576# PatientExplorer auditing class
2577# =============================================================================
2580class PatientExplorerAudit(models.Model):
2581 """
2582 Audit log for a PatientExplorer.
2583 """
2585 id = models.AutoField(primary_key=True) # automatic
2586 patient_explorer = models.ForeignKey(
2587 "PatientExplorer", on_delete=models.PROTECT
2588 )
2589 # ... contains information on which user
2590 when = models.DateTimeField(auto_now_add=True)
2591 count_only = models.BooleanField(default=False)
2592 n_records = models.IntegerField(default=0)
2593 # ... not PositiveIntegerField; SQL Server gives -1, for example
2594 failed = models.BooleanField(default=False)
2595 fail_msg = models.TextField()
2597 def __str__(self) -> str:
2598 return f"<PatientExplorerAudit id={self.id}>"
2601# =============================================================================
2602# Archive and visualization zone auditing classes
2603# =============================================================================
2605ARCHIVE_PATIENT_ID_MAX_LENGTH = 255
2606ARCHIVE_ATTACHMENT_FILENAME_MAX_LENGTH = 255
2609class ArchiveTemplateAudit(models.Model):
2610 """
2611 Audit log for access to an archive template.
2612 """
2614 id = models.AutoField(primary_key=True) # automatic
2615 user = models.ForeignKey(
2616 settings.AUTH_USER_MODEL, on_delete=models.CASCADE
2617 )
2618 when = models.DateTimeField(auto_now_add=True)
2619 patient_id = models.CharField(max_length=ARCHIVE_PATIENT_ID_MAX_LENGTH)
2620 query_string = models.TextField()
2621 # ... no max length; see
2622 # https://stackoverflow.com/questions/812925/what-is-the-maximum-possible-length-of-a-query-string # noqa: E501
2624 def __str__(self) -> str:
2625 return f"<ArchiveTemplateAudit id={self.id}>"
2628class ArchiveAttachmentAudit(models.Model):
2629 """
2630 Audit log for access to an archive attachment.
2631 """
2633 id = models.AutoField(primary_key=True) # automatic
2634 user = models.ForeignKey(
2635 settings.AUTH_USER_MODEL, on_delete=models.CASCADE
2636 )
2637 when = models.DateTimeField(auto_now_add=True)
2638 patient_id = models.CharField(max_length=ARCHIVE_PATIENT_ID_MAX_LENGTH)
2639 filename = models.CharField(
2640 max_length=ARCHIVE_ATTACHMENT_FILENAME_MAX_LENGTH
2641 )
2643 def __str__(self) -> str:
2644 return f"<ArchiveAttachmentAudit id={self.id}>"