Coverage for crateweb/research/models.py: 31%

856 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2026-02-05 06:46 -0600

1""" 

2crate_anon/crateweb/research/models.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26""" 

27 

28from collections import OrderedDict 

29import datetime 

30import io 

31import logging 

32from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, Union 

33import weakref 

34import zipfile 

35import json 

36 

37from cardinal_pythonlib.dbfunc import dictfetchall, get_fieldnames_from_cursor 

38from cardinal_pythonlib.django.fields.jsonclassfield import JsonClassField 

39from cardinal_pythonlib.excel import excel_to_bytes 

40from cardinal_pythonlib.exceptions import add_info_to_exception 

41from cardinal_pythonlib.hash import ( 

42 get_longest_supported_hasher_output_length, 

43 hash64, 

44) 

45from cardinal_pythonlib.json_utils.serialize import ( 

46 json_encode, 

47 METHOD_STRIP_UNDERSCORE, 

48 register_for_json, 

49) 

50from cardinal_pythonlib.reprfunc import simple_repr 

51from cardinal_pythonlib.sql.sql_grammar import format_sql 

52from cardinal_pythonlib.tsv import make_tsv_row 

53from cardinal_pythonlib.django.function_cache import django_cache_function 

54from django.db import connections, DatabaseError, models 

55from django.db.models import QuerySet 

56from django.conf import settings 

57from django.http.request import HttpRequest 

58from django.db.backends.utils import CursorWrapper 

59from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE 

60from openpyxl.workbook.workbook import Workbook 

61from openpyxl.worksheet.worksheet import Worksheet 

62 

63from crate_anon.anonymise.constants import PatientInfoConstants 

64from crate_anon.common.constants import JSON_SEPARATORS_COMPACT 

65from crate_anon.common.sql import ( 

66 ColumnId, 

67 columns_to_table_column_hierarchy, 

68 escape_percent_for_python_dbapi, 

69 make_grammar, 

70 sql_string_literal, 

71 SqlArgsTupleType, 

72 TableId, 

73 translate_sql_qmark_to_percent, 

74 WhereCondition, 

75) 

76from crate_anon.crateweb.config.constants import ResearchDbInfoKeys 

77from crate_anon.crateweb.core.constants import ( 

78 RESEARCH_DB_CONNECTION_NAME, 

79 SettingsKeys, 

80) 

81from crate_anon.crateweb.research.html_functions import ( 

82 HtmlElementCounter, 

83 N_CSS_HIGHLIGHT_CLASSES, 

84 prettify_sql_html, 

85) 

86from crate_anon.crateweb.research.research_db_info import ( 

87 get_research_db_info, 

88 SingleResearchDatabase, 

89) 

90from crate_anon.crateweb.research.sql_writer import ( 

91 add_to_select, 

92 SelectElement, 

93) 

94 

95log = logging.getLogger(__name__) 

96 

97 

98# ============================================================================= 

99# Hacking django-pyodbc-azure, to stop it calling cursor.nextset() every time 

100# you ask it to do cursor.fetchone() 

101# ============================================================================= 

102# BUG in django-pyodbc-azure==1.10.4.0 (providing 

103# sql_server/*), 2017-02-17: this causes 

104# ProgrammingError "No results. Previous SQL was not a query." 

105# The problem relates to sql_server/pyodbc/base.py 

106# CursorWrapper.fetchone() calling self.cursor.nextset(); if 

107# you comment this out, it works fine. 

108# Related: 

109# - https://github.com/pymssql/pymssql/issues/98 

110 

111DJANGO_PYODBC_AZURE_ENGINE = "sql_server.pyodbc" 

112 

113 

114def replacement_sqlserver_pyodbc_cursorwrapper_fetchone(self) -> List[Any]: 

115 """ 

116 A function to replace ``CursorWrapper.fetchone()`` in 

117 ``sql_server/pyodbc/base.py`` from ``django-pyodbc-azure``. 

118 This replacement function does not call ``cursor.nextset()``. 

119 """ 

120 # log.debug("Using monkeypatched fetchone(); self: {}; self.cursor: " 

121 # "{}".format(repr(self), repr(self.cursor))) 

122 row = self.cursor.fetchone() 

123 if row is not None: 

124 row = self.format_row(row) 

125 # BUT DO NOT CALL self.cursor.nextset() 

126 return row 

127 

128 

129def hack_django_pyodbc_azure_cursorwrapper() -> None: 

130 """ 

131 Monkey-patch part of the ``sql_server.pyodbc`` library from 

132 ``django-pyodbc-azure``. It replaces the ``fetchone()`` method with a 

133 version that doesn't call ``cursor.nextset()`` automatically. 

134 

135 **It looks like this becomes unnecessary in django-pyodbc-azure==2.0.6.1 

136 or similar, because the call to ``cursor.nextset()`` is now only performed 

137 ``if not self.connection.supports_mars``.** 

138 

139 *Notes* 

140 

141 - I thought I wanted to modify an *instance*, not a *class* 

142 (https://tryolabs.com/blog/2013/07/05/run-time-method-patching-python/). 

143 

144 - To modify a class, we do ``SomeClass.method = newmethod``. 

145 

146 - But to modify an instance, we use ``instance.method = 

147 types.MethodType(newmethod, instance)``. 

148 

149 - However, it turned out the instance was actually part of a long chain 

150 of cursor wrappers, including the Django debug toolbar. Classes included 

151 ``debug_toolbar.panels.sql.tracking.NormalCursorWrapper``; 

152 ``django.db.backends.utils.CursorDebugWrapper``. 

153 And in any case, modifying the class is a sensible thing. 

154 

155 """ 

156 try: 

157 # noinspection PyUnresolvedReferences 

158 from sql_server.pyodbc.base import CursorWrapper 

159 

160 log.info( 

161 "Monkey-patching sql_server.pyodbc.base.CursorWrapper." 

162 "fetchone to disable automatic call to cursor.nextset()" 

163 ) 

164 CursorWrapper.fetchone = ( 

165 replacement_sqlserver_pyodbc_cursorwrapper_fetchone 

166 ) 

167 except ImportError: 

168 return 

169 

170 

171if getattr( 

172 settings, 

173 SettingsKeys.DISABLE_DJANGO_PYODBC_AZURE_CURSOR_FETCHONE_NEXTSET, 

174 True, 

175): 

176 # https://stackoverflow.com/questions/5601590/how-to-define-a-default-value-for-a-custom-django-setting # noqa: E501 

177 hack_django_pyodbc_azure_cursorwrapper() 

178 

179 

180@django_cache_function(timeout=None) 

181def database_last_updated(dbname: str) -> Optional[datetime.datetime]: 

182 """ 

183 Returns a datetime object specifying when the database was last run. 

184 

185 If there are any tables with a column specifying last updated time 

186 (based on the column name provided in the config file) but all dates 

187 are null, the function will return the minimum date possible. If there 

188 are no such tables, the function will return None. 

189 """ 

190 research_database_info = get_research_db_info() 

191 try: 

192 dbinfo = research_database_info.get_dbinfo_by_name(dbname) 

193 except ValueError: 

194 raise ValueError(f"Database {dbname} is not specified in config file") 

195 tables_with_timecol = [] # type: List[str] 

196 for col in dbinfo.colinfolist: 

197 if col.column_name == dbinfo.update_date_field: 

198 tables_with_timecol.append(col.table_name) 

199 if not tables_with_timecol: 

200 return None 

201 latest_time = mindate = datetime.datetime.min 

202 for table in tables_with_timecol: 

203 # Not quite sure about the different dialects ... 

204 if dbinfo.rdb_info.dialect == "mysql": 

205 sql = ( 

206 f"SELECT {dbinfo.update_date_field} " 

207 f"FROM {dbinfo.schema_name}.{table}" 

208 ) 

209 else: 

210 # Dialect must be mssql because ResearchDatabaseInfo checks if 

211 # supported dialect 

212 sql = ( 

213 f"SELECT {dbinfo.update_date_field} " 

214 f"FROM {dbinfo.database}.{dbinfo.schema_name}.{table}" 

215 ) 

216 with get_executed_researchdb_cursor(sql) as cursor: 

217 times = cursor.fetchall() 

218 times = [t[0] if t[0] else mindate for t in times] 

219 times.append(latest_time) 

220 latest_time = max(times) 

221 return latest_time 

222 

223 

224@django_cache_function(timeout=None) 

225def last_updated_all_dbs() -> datetime.datetime: 

226 """ 

227 Returns a datetime object specifying the most recent databse update, or 

228 None if there is no information on this. 

229 """ 

230 last_updated_all = datetime.datetime.min 

231 for db in settings.RESEARCH_DB_INFO: 

232 db_last_updated = database_last_updated(db[ResearchDbInfoKeys.NAME]) 

233 if db_last_updated is None: 

234 continue 

235 last_updated_all = max(last_updated_all, db_last_updated) 

236 if last_updated_all == datetime.datetime.min: 

237 # This will only happen if none of the databases have update time 

238 # fields 

239 last_updated_all = None 

240 return last_updated_all 

241 

242 

243# ============================================================================= 

244# Cursors 

245# ============================================================================= 

246 

247 

248def debug_query() -> None: 

249 """ 

250 Executes a test query that just selects a constant, using the research 

251 database (i.e. ``connections['research']``). 

252 """ 

253 cursor = connections[RESEARCH_DB_CONNECTION_NAME].cursor() 

254 cursor.execute("SELECT 'debug'") 

255 

256 

257def get_executed_researchdb_cursor( 

258 sql: str, args: List[Any] = None 

259) -> CursorWrapper: 

260 """ 

261 Executes a query on the research database. Returns a wrapped cursor that 

262 can be used as a context manager that will close the cursor on completion. 

263 

264 Args: 

265 sql: SQL text 

266 args: arguments to SQL query 

267 

268 Returns: 

269 a :class:`django.db.backends.utils.CursorWrapper`, which is a context 

270 manager that behaves as the executed cursor and also closes it on 

271 completion 

272 

273 Test code: 

274 

275 .. code-block:: python 

276 

277 import os 

278 import django 

279 os.environ['DJANGO_SETTINGS_MODULE'] = 'crate_anon.crateweb.config.settings' 

280 django.setup() 

281 from crate_anon.crateweb.research.models import * 

282 c = get_executed_researchdb_cursor("SELECT 1") 

283 

284 """ # noqa: E501 

285 args = args or [] 

286 cursor = connections[ 

287 RESEARCH_DB_CONNECTION_NAME 

288 ].cursor() # type: CursorWrapper 

289 try: 

290 cursor.execute(sql, args or None) 

291 except DatabaseError as exception: 

292 add_info_to_exception(exception, {"sql": sql, "args": args}) 

293 raise 

294 return cursor 

295 

296 

297def get_executed_researchdb_cursor_qmark_placeholders( 

298 sql: str, args: List[Any] = None 

299) -> CursorWrapper: 

300 """ 

301 As for :func:`get_executed_researchdb_cursor`, but assumes its SQL may 

302 contain question-mark parameter placeholders (``?``) and translates these 

303 to the ones we need internally. 

304 """ 

305 sql = translate_sql_qmark_to_percent(sql) 

306 return get_executed_researchdb_cursor(sql, args) 

307 

308 

309# ============================================================================= 

310# Data going to Excel files 

311# ============================================================================= 

312 

313ILLEGAL_CHARACTERS_REPLACED_WITH = "" 

314 

315 

316def gen_excel_row_elements( 

317 worksheet: Worksheet, row: Iterable 

318) -> Generator[Any, None, None]: 

319 r""" 

320 Given an Excel worksheet row, generate individual cell contents, cell by 

321 cell. 

322 

323 Args: 

324 worksheet: a :class:`openpyxl.worksheet.worksheet.Worksheet`, which we 

325 need in order to find the worksheet's encoding 

326 row: the row to iterate through 

327 

328 Yields: 

329 the contents of each cell 

330 

331 Reasons for this function: 

332 

333 1. We need a tuple/list/generator, as ``openpyxl`` checks its types 

334 manually. 

335 

336 - We want to have a Worksheet object from ``openpyxl``, and say something 

337 like 

338 

339 .. code-block:: python 

340 

341 ws.append(row) 

342 

343 where "row" has come from a database query. 

344 

345 - However, ``openpyxl`` doesn't believe in duck-typing; see 

346 ``Worksheet.append()`` in ``openpyxl/worksheet/worksheet.py``. So 

347 sometimes the plain append works (e.g. from MySQL results), but 

348 sometimes it fails, e.g. when the row is of type ``pyodbc.Row``. 

349 

350 - So we must coerce it to a tuple, list, or generator. 

351 

352 - A generator will be the most efficient. 

353 

354 2. If a string fails certain checks, openpyxl will raise an 

355 :exc:`IllegalCharacterError` exception. We need to work around that. 

356 We'll use the "forgiveness, not permission" maxim. Specifically, it 

357 dislikes strings matching its ``ILLEGAL_CHARACTERS_RE``, which contains 

358 unprintable low characters matching this: 

359 

360 .. code-block:: python 

361 

362 r'[\000-\010]|[\013-\014]|[\016-\037]' 

363 

364 Note the use of octal; ``\037`` is decimal 31. 

365 

366 ``openpyxl`` gets to its ``Cell.check_string()`` function for these 

367 types: 

368 

369 .. code-block:: python 

370 

371 STRING_TYPES = (basestring, unicode, bytes) 

372 

373 In Python 3, this means ``(str, str, bytes)``. So we should check 

374 ``str`` and ``bytes``. (For ``bytes``, we'll follow its method of 

375 converting to ``str`` in the encoding of the worksheet's choice.) 

376 """ 

377 # Docstring must be a raw string for Sphinx! See 

378 # http://openalea.gforge.inria.fr/doc/openalea/doc/_build/html/source/sphinx/rest_syntax.html#text-syntax-bold-italic-verbatim-and-special-characters # noqa: E501 

379 for element in row: 

380 if isinstance(element, bytes): 

381 # Convert to str using the worksheet's encoding. 

382 element = element.decode(worksheet.encoding) 

383 # ... or: str(element, encoding) 

384 

385 if isinstance(element, str): 

386 yield ILLEGAL_CHARACTERS_RE.sub( 

387 ILLEGAL_CHARACTERS_REPLACED_WITH, element 

388 ) 

389 else: 

390 yield element 

391 

392 

393# ============================================================================= 

394# Query highlighting class 

395# ============================================================================= 

396 

397HIGHLIGHT_FWD_REF = "Highlight" 

398 

399 

400class Highlight(models.Model): 

401 """ 

402 Represents the highlighting of a query. 

403 """ 

404 

405 id = models.AutoField(primary_key=True) # automatic 

406 user = models.ForeignKey( 

407 settings.AUTH_USER_MODEL, on_delete=models.CASCADE 

408 ) 

409 colour = models.PositiveSmallIntegerField(verbose_name="Colour number") 

410 text = models.CharField(max_length=255, verbose_name="Text to highlight") 

411 active = models.BooleanField(default=True) 

412 

413 def __str__(self) -> str: 

414 return f"colour={self.colour}, text={self.text}" 

415 

416 def get_safe_colour(self) -> int: 

417 """ 

418 Returns our ``colour`` attribute, coerced to the range ``[0, 

419 N_CSS_HIGHLIGHT_CLASSES - 1]`` (inclusive). 

420 """ 

421 if self.colour is None: 

422 return 0 

423 return max(0, min(self.colour, N_CSS_HIGHLIGHT_CLASSES - 1)) 

424 

425 @staticmethod 

426 def as_ordered_dict( 

427 highlight_list: Iterable[HIGHLIGHT_FWD_REF], 

428 ) -> Dict[int, List[HIGHLIGHT_FWD_REF]]: 

429 """ 

430 Converts a iterable of :class:`Highlight` objects into a dictionary 

431 that collects them by highlight number. 

432 

433 Args: 

434 highlight_list: list of :class:`Highlight` objects 

435 

436 Returns: 

437 an OrderedDict whose keys are highlight colour numbers (in 

438 ascending order), and whose values are lists of all the 

439 :class:`Highlight` objects using that highlight colour number 

440 

441 """ 

442 d = dict() # type: Dict[int, List[HIGHLIGHT_FWD_REF]] 

443 for highlight in highlight_list: 

444 n = highlight.get_safe_colour() 

445 if n not in d: 

446 d[n] = [] # type: List[HIGHLIGHT_FWD_REF] 

447 d[n].append(highlight) 

448 # noinspection PyTypeChecker 

449 return OrderedDict(sorted(d.items())) 

450 

451 @staticmethod 

452 def get_active_highlights(request: HttpRequest) -> QuerySet: 

453 """ 

454 Return all active highlights for the current user. 

455 

456 Args: 

457 request: the :class:`django.http.request.HttpRequest` 

458 

459 Returns: 

460 a :class:`django.db.models.QuerySet` of the :class:`Highlight` 

461 objects 

462 

463 """ 

464 return Highlight.objects.filter(user=request.user, active=True) 

465 

466 def activate(self) -> None: 

467 """ 

468 Mark this highlight as active. 

469 """ 

470 self.active = True 

471 self.save() 

472 

473 def deactivate(self) -> None: 

474 """ 

475 Mark this highlight as inactive. 

476 """ 

477 self.active = False 

478 self.save() 

479 

480 

481# ============================================================================= 

482# Query classes 

483# ============================================================================= 

484 

485QUERY_FWD_REF = "Query" 

486 

487 

488class QueryBase(models.Model): 

489 """ 

490 Abstract base class for the two query classes. 

491 """ 

492 

493 class Meta: 

494 abstract = True 

495 app_label = "research" 

496 

497 id = models.AutoField(primary_key=True) # automatic 

498 

499 sql = models.TextField(verbose_name="SQL query") 

500 sql_hash = models.BigIntegerField( 

501 verbose_name="64-bit non-cryptographic hash of SQL query" 

502 ) 

503 args = JsonClassField(verbose_name="SQL arguments (as JSON)", null=True) 

504 # ... https://github.com/shrubberysoft/django-picklefield 

505 raw = models.BooleanField( 

506 default=False, verbose_name="SQL is raw, not parameter-substituted" 

507 ) 

508 qmark = models.BooleanField( 

509 default=True, 

510 verbose_name="Parameter-substituted SQL uses ?, not %s, " 

511 "as placeholders", 

512 ) 

513 # active = models.BooleanField(default=True) # see save() below 

514 created = models.DateTimeField(auto_now_add=True) 

515 deleted = models.BooleanField( 

516 default=False, 

517 verbose_name="Deleted from the user's perspective. " 

518 "Audited queries are never properly deleted.", 

519 ) 

520 formatted_sql = models.TextField( 

521 default=None, 

522 null=True, 

523 verbose_name="SQL with highlighting and formatting", 

524 ) 

525 

526 def __repr__(self) -> str: 

527 return simple_repr( 

528 self, ["id", "sql", "args", "raw", "qmark", "created", "deleted"] 

529 ) 

530 

531 # ------------------------------------------------------------------------- 

532 # SQL queries 

533 # ------------------------------------------------------------------------- 

534 

535 def get_original_sql(self) -> str: 

536 """ 

537 Returns the stored raw SQL. 

538 """ 

539 # noinspection PyTypeChecker 

540 return self.sql 

541 

542 def set_formatted_sql(self, reformat: bool = False) -> None: 

543 """ 

544 Sets 'formatted_sql' by highlighting the syntax and possibly 

545 reformatting. 

546 """ 

547 # noinspection PyTypeChecker 

548 self.formatted_sql = prettify_sql_html(sql=self.sql, reformat=reformat) 

549 

550 def get_formatted_sql(self) -> str: 

551 """ 

552 Getter for 'formatted_sql'. 

553 """ 

554 return self.formatted_sql 

555 

556 

557def _close_cursor(cursor: Optional[CursorWrapper]) -> None: 

558 if cursor: 

559 # log.debug("Closing cursor") 

560 cursor.close() 

561 

562 

563class Query(QueryBase): 

564 """ 

565 Class to query the research database. 

566 """ 

567 

568 NO_NULL = "_no_null" # special output 

569 

570 user = models.ForeignKey( 

571 settings.AUTH_USER_MODEL, on_delete=models.CASCADE 

572 ) 

573 active = models.BooleanField(default=True) # see save() below 

574 audited = models.BooleanField(default=False) 

575 display = models.TextField( 

576 default="[]", verbose_name="Subset of output columns to be displayed" 

577 ) 

578 no_null = models.BooleanField( 

579 default=False, 

580 verbose_name="Omit Null columns for this query when displayed", 

581 ) 

582 last_run = models.DateTimeField(null=True, default=None) 

583 

584 def __init__(self, *args, **kwargs) -> None: 

585 """ 

586 Initialize our cache. 

587 """ 

588 super().__init__(*args, **kwargs) 

589 self._executed_cursor = None # type: Optional[CursorWrapper] 

590 self._column_names = None # type: Optional[List[str]] 

591 self._rowcount = None # type: Optional[int] 

592 self._rows = None # type: Optional[List[List[Any]]] 

593 self._display_list = None # type: Optional[List[str]] 

594 self._display_indexes = None # type: Optional[List[int]] 

595 self._n_times_executed = 0 

596 self._finalizer = None # type: Optional[weakref.finalize] 

597 

598 def activate(self) -> None: 

599 """ 

600 Activate this query (and deactivates any others). 

601 """ 

602 self.active = True 

603 self.save() 

604 

605 def __repr__(self) -> str: 

606 return simple_repr( 

607 self, 

608 [ 

609 "id", 

610 "user", 

611 "sql", 

612 "args", 

613 "raw", 

614 "qmark", 

615 "active", 

616 "created", 

617 "deleted", 

618 "audited", 

619 ], 

620 ) 

621 

622 def save(self, *args, update_fields=None, **kwargs) -> None: 

623 """ 

624 Custom save method. Ensures that only one :class:`Query` has ``active 

625 == True`` for a given user. Also sets the hash. 

626 """ 

627 # https://stackoverflow.com/questions/1455126/unique-booleanfield-value-in-django # noqa: E501 

628 if self.active: 

629 Query.objects.filter(user=self.user, active=True).update( 

630 active=False 

631 ) 

632 if update_fields is None or "sql" in update_fields: 

633 self.set_formatted_sql() 

634 self.sql_hash = hash64(self.sql) 

635 if update_fields is not None: 

636 update_fields = {"formatted_sql", "sql_hash"}.union( 

637 update_fields 

638 ) 

639 super().save(*args, update_fields=update_fields, **kwargs) 

640 

641 # ------------------------------------------------------------------------- 

642 # SQL queries 

643 # ------------------------------------------------------------------------- 

644 

645 def get_sql_args_for_django(self) -> Tuple[str, Optional[List[Any]]]: 

646 """ 

647 Get sql/args in a format suitable for Django, with ``%s`` placeholders, 

648 or as escaped raw SQL. 

649 

650 Returns: 

651 tuple: ``sql, args`` 

652 

653 - If :attr:`raw` is set, return our raw SQL with ``%`` escaped to 

654 ``%%``; 

655 - otherwise, if :attr:`qmark` is set, return our raw SQL with ``?`` 

656 argument placeholders translated to ``%s`` argument placeholders; 

657 - otherwise, return the raw SQL. 

658 

659 """ 

660 if self.raw: 

661 # noinspection PyTypeChecker 

662 sql = escape_percent_for_python_dbapi(self.sql) 

663 args = None 

664 else: 

665 if self.qmark: 

666 # noinspection PyTypeChecker 

667 sql = translate_sql_qmark_to_percent(self.sql) 

668 else: 

669 sql = self.sql 

670 args = self.args 

671 return sql, args 

672 

673 def get_executed_cursor(self) -> CursorWrapper: 

674 """ 

675 Get cursor with a query executed (based on our attributes :attr:`sql`, 

676 :attr:`args`, :attr:`raw`, :attr:`qmark`). 

677 

678 Returns: 

679 a :class:`django.db.backends.utils.CursorWrapper` 

680 

681 Do NOT use this with ``with``, as in: 

682 

683 .. code-block:: python 

684 

685 with query.get_executed_cursor() as cursor: 

686 # do stuff 

687 

688 You could do that (and in general it's what Django advises) but we are 

689 trying to be fancy here and use the cursor more efficiently. 

690 

691 """ 

692 if self._executed_cursor is None: 

693 sql, args = self.get_sql_args_for_django() 

694 cursor = get_executed_researchdb_cursor(sql, args) 

695 self._n_times_executed += 1 

696 # log.debug("Query: n_times_executed: {}".format( 

697 # self._n_times_executed)) 

698 # log.debug("\n" + "".join(traceback.format_stack())) 

699 if self._n_times_executed > 1: 

700 log.warning( 

701 f"Inefficient: Query executed " 

702 f"{self._n_times_executed} times" 

703 ) 

704 try: 

705 # noinspection PyTypeChecker 

706 self._column_names = get_fieldnames_from_cursor(cursor) 

707 except TypeError: 

708 self._column_names = [] # type: List[str] 

709 self._rowcount = cursor.rowcount 

710 self._executed_cursor = cursor 

711 self._finalizer = weakref.finalize( 

712 self, _close_cursor, self._executed_cursor 

713 ) 

714 return self._executed_cursor 

715 

716 def _invalidate_executed_cursor(self) -> None: 

717 """ 

718 Mark the executed cursor as dead (e.g. iterated through). 

719 """ 

720 if self._executed_cursor is not None: 

721 self._finalizer() 

722 self._finalizer = None # type: Optional[weakref.finalize] 

723 self._executed_cursor = None # type: Optional[CursorWrapper] 

724 

725 def _cache_basics(self) -> None: 

726 """ 

727 Cache rowcount and column names. 

728 

729 Raises: 

730 :exc:`DatabaseError` if the query fails 

731 """ 

732 if self._rowcount is None: 

733 self.get_executed_cursor() # will cache 

734 

735 def _cache_all(self) -> None: 

736 """ 

737 Fetch everything from the query and cache it. 

738 

739 Raises: 

740 :exc:`DatabaseError` if the query fails 

741 """ 

742 if self._rows is None: 

743 cursor = self.get_executed_cursor() 

744 self._rows = cursor.fetchall() 

745 self._invalidate_executed_cursor() 

746 

747 def get_column_names(self) -> List[str]: 

748 """ 

749 Returns column names from the query's cursor. 

750 

751 Raises: 

752 :exc:`DatabaseError` if the query fails 

753 """ 

754 if self._column_names is None: 

755 self._cache_basics() 

756 return self._column_names 

757 

758 def get_rowcount(self) -> int: 

759 """ 

760 Returns the rowcount from the cursor. 

761 

762 Raises: 

763 :exc:`DatabaseError` if the query fails 

764 """ 

765 if self._rowcount is None: 

766 self._cache_basics() 

767 return self._rowcount 

768 

769 def get_rows(self) -> List[List[Any]]: 

770 """ 

771 Returns all rows from the query, as a list. 

772 

773 Raises: 

774 :exc:`DatabaseError` if the query fails 

775 """ 

776 self._cache_all() 

777 return self._rows 

778 

779 def gen_rows(self) -> Generator[List[Any], None, None]: 

780 """ 

781 Generate rows from the query. 

782 

783 Raises: 

784 :exc:`DatabaseError` if the query fails 

785 """ 

786 if self._rows is None: 

787 # No cache 

788 cursor = self.get_executed_cursor() 

789 row = cursor.fetchone() 

790 while row is not None: 

791 yield row 

792 row = cursor.fetchone() 

793 self._invalidate_executed_cursor() 

794 else: 

795 # Cache 

796 for row in self._rows: 

797 yield row 

798 

799 def dictfetchall(self) -> List[Dict[str, Any]]: 

800 """ 

801 Executes the query and returns all results as a list of OrderedDicts 

802 (one for each row, mapping column names to values). 

803 

804 Raises: 

805 :exc:`DatabaseError` if the query fails 

806 """ 

807 if self._rows is None: 

808 # No cache 

809 cursor = self.get_executed_cursor() 

810 # noinspection PyTypeChecker 

811 result = dictfetchall(cursor) 

812 self._invalidate_executed_cursor() 

813 return result 

814 else: 

815 # Cache 

816 columns = self._column_names 

817 return [OrderedDict(zip(columns, row)) for row in self._rows] 

818 

819 def update_last_run(self) -> None: 

820 self.last_run = datetime.datetime.now() 

821 self.save() 

822 

823 @property 

824 def run_since_update(self) -> Optional[bool]: 

825 # Currently doesn't check which databases are involved in the 

826 # query - just checks all databases 

827 last_updated_all = last_updated_all_dbs() 

828 if last_updated_all is None: # no info from dbs 

829 return None 

830 elif self.last_run is None: # query never run 

831 return False 

832 else: 

833 # Make last_updated_all timezone aware so they can be compared 

834 last_updated_all.replace(datetime.timezone.utc) 

835 return self.last_run > last_updated_all 

836 

837 # ------------------------------------------------------------------------- 

838 # Fetching 

839 # ------------------------------------------------------------------------- 

840 

841 @staticmethod 

842 def get_active_query_or_none( 

843 request: HttpRequest, 

844 ) -> Optional[QUERY_FWD_REF]: 

845 """ 

846 Returns the active query for this user, or ``None``. 

847 

848 Args: 

849 request: the :class:`django.http.request.HttpRequest` 

850 

851 Returns: 

852 a :class:`Query`, or ``None``. 

853 

854 """ 

855 if not request.user.is_authenticated: 

856 return None 

857 try: 

858 return Query.objects.get(user=request.user, active=True) 

859 except Query.DoesNotExist: 

860 return None 

861 

862 @staticmethod 

863 def get_active_query_id_or_none(request: HttpRequest) -> Optional[int]: 

864 """ 

865 Returns the active query's integer ID for this user, or ``None``. 

866 

867 Args: 

868 request: the :class:`django.http.request.HttpRequest` 

869 

870 Returns: 

871 the active query's integer PK, or ``None``. 

872 

873 """ 

874 if not request.user.is_authenticated: 

875 return None 

876 try: 

877 query = Query.objects.get(user=request.user, active=True) 

878 return query.id 

879 except Query.DoesNotExist: 

880 return None 

881 

882 # ------------------------------------------------------------------------- 

883 # Activating, deleting, auditing 

884 # ------------------------------------------------------------------------- 

885 

886 # This isn't needed in the base class because it only applies to 

887 # audited queries 

888 def mark_deleted(self) -> None: 

889 """ 

890 Mark the query as deleted. 

891 

892 This will stop it being shown. It will not delete it from the database. 

893 

894 We use this deletion method for queries that have been executed, so 

895 need an audit trail. 

896 """ 

897 if self.deleted: 

898 # log.debug("pointless") 

899 return 

900 self.deleted = True 

901 self.active = False 

902 # log.debug("about to save") 

903 self.save() 

904 # log.debug("saved") 

905 

906 def mark_audited(self) -> None: 

907 """ 

908 Mark the query as having been executed and audited. (This prevents it 

909 from being wholly deleted.) 

910 """ 

911 if self.audited: 

912 return 

913 self.audited = True 

914 self.save() 

915 

916 def audit( 

917 self, 

918 count_only: bool = False, 

919 n_records: int = 0, 

920 failed: bool = False, 

921 fail_msg: str = "", 

922 ) -> None: 

923 """ 

924 Audit the execution of this query: 

925 

926 - insert an audit entry referring to this query 

927 - mark the query as having been audited (so it's not deleted) 

928 

929 Args: 

930 count_only: did we know (in advance) that this was a 

931 ``COUNT()``-only query? 

932 n_records: how many records were returned? 

933 failed: did the query fail? 

934 fail_msg: if the query failed, the associated failure message 

935 """ 

936 a = QueryAudit( 

937 query=self, 

938 count_only=count_only, 

939 n_records=n_records, 

940 failed=failed, 

941 fail_msg=fail_msg, 

942 ) 

943 a.save() 

944 self.mark_audited() 

945 

946 def delete_if_permitted(self) -> None: 

947 """ 

948 Delete the query. 

949 

950 - If a query has been executed and therefore audited, it isn't properly 

951 deleted; it's just marked as deleted. 

952 - If a query has never been executed, we can delete it entirely. 

953 """ 

954 if self.deleted: 

955 log.debug("already flagged as deleted") 

956 return 

957 if self.audited: 

958 log.debug("marking as deleted") 

959 self.mark_deleted() 

960 else: 

961 # actually delete 

962 log.debug("actually deleting") 

963 self.delete() 

964 

965 # ------------------------------------------------------------------------- 

966 # Filtering columns for display output 

967 # ------------------------------------------------------------------------- 

968 

969 def set_display_list(self, display_list: List[str]) -> None: 

970 """ 

971 Sets the internal JSON field, stored in the database, from a list of 

972 column headings to display. 

973 

974 Args: 

975 display_list: list of columns to display 

976 """ 

977 self.display = json.dumps( 

978 display_list, separators=JSON_SEPARATORS_COMPACT 

979 ) 

980 self._display_list = None # clear cache 

981 

982 def _get_display_list(self) -> List[str]: 

983 """ 

984 Returns a list of columns to display, from our internal JSON 

985 representation. 

986 """ 

987 if not self.display: 

988 return [] 

989 try: 

990 result = json.loads(self.display) 

991 except json.decoder.JSONDecodeError: # e.g. junk 

992 log.warning("Query.display field: bad JSON, returning []") 

993 return [] 

994 # Now check it's a list of str: 

995 if not isinstance(result, list): 

996 log.warning("Query.display field: not a list, returning []") 

997 return [] 

998 if not all(isinstance(x, str) for x in result): 

999 log.warning( 

1000 "Query.display field: contains non-strings, " "returning []" 

1001 ) 

1002 return [] 

1003 return result 

1004 

1005 def get_display_list(self) -> List[str]: 

1006 """ 

1007 Returns a list of columns to display, from our internal JSON 

1008 representation. Uses :func:`_get_display_list` and caches it. 

1009 """ 

1010 if self._display_list is None: 

1011 self._display_list = self._get_display_list() 

1012 # log.debug("Query.get_display_list() -> {!r}".format( 

1013 # self._display_list)) 

1014 return self._display_list 

1015 

1016 def _get_display_indexes(self) -> Optional[List[int]]: 

1017 """ 

1018 Returns the indexes of the result columns that we wish to display. 

1019 

1020 Raises: 

1021 :exc:`DatabaseError` on query failure 

1022 """ 

1023 display_fieldnames = self.get_display_list() 

1024 # If the display attribute is empty assume the user wants all fields 

1025 select_all = not display_fieldnames 

1026 

1027 if self.no_null: 

1028 self._cache_all() # writes to self._rows 

1029 

1030 all_column_names = self.get_column_names() 

1031 

1032 if select_all and not self.no_null: 

1033 # No filtering. Provide the original indexes quickly. 

1034 return list(range(len(all_column_names))) 

1035 

1036 field_indexes = [] # type: List[int] 

1037 # Do this to make sure included fields are actually in the results 

1038 for i, name in enumerate(all_column_names): 

1039 if select_all or name in display_fieldnames: 

1040 if self.no_null: 

1041 # Exclude fields where all values are null, if no_null 

1042 # is switched on. 

1043 for row in self._rows: 

1044 if row[i] is not None: 

1045 field_indexes.append(i) 

1046 break 

1047 else: 

1048 field_indexes.append(i) 

1049 return field_indexes 

1050 

1051 def get_display_indexes(self) -> Optional[List[int]]: 

1052 """ 

1053 Returns the indexes of the result columns that we wish to display. 

1054 Uses :func:`_get_display_indexes` and caches it. 

1055 

1056 Raises: 

1057 :exc:`DatabaseError` on query failure 

1058 """ 

1059 if self._display_indexes is None: 

1060 self._display_indexes = self._get_display_indexes() 

1061 return self._display_indexes 

1062 

1063 def get_display_column_names(self) -> List[str]: 

1064 """ 

1065 Returns the filtered column names. 

1066 """ 

1067 column_names = self.get_column_names() 

1068 display_indexes = self.get_display_indexes() 

1069 return [column_names[i] for i in display_indexes] 

1070 

1071 def gen_display_rows(self) -> Generator[List[Any], None, None]: 

1072 """ 

1073 Generates all filtered rows. 

1074 """ 

1075 field_indexes = self.get_display_indexes() 

1076 if not field_indexes: 

1077 # No columns specifically selected; return all columns 

1078 for row in self.gen_rows(): 

1079 yield row 

1080 else: 

1081 for row in self.gen_rows(): 

1082 yield [row[i] for i in field_indexes] 

1083 

1084 def get_display_rows(self) -> List[List[Any]]: 

1085 """ 

1086 Returns a list of all filtered rows. 

1087 """ 

1088 if self._rows is not None: 

1089 # Pre-cached; there may be a shortcut 

1090 field_indexes = self.get_display_indexes() 

1091 if not field_indexes: 

1092 # No columns specifically selected; return all columns 

1093 return self._rows 

1094 # Otherwise, use the generator: 

1095 return list(self.gen_display_rows()) 

1096 

1097 def make_tsv(self) -> str: 

1098 """ 

1099 Executes the query and returns a TSV result (as a multiline string). 

1100 """ 

1101 fieldnames = self.get_display_column_names() 

1102 tsv = make_tsv_row(fieldnames) 

1103 for row in self.gen_display_rows(): 

1104 tsv += make_tsv_row(row) 

1105 self.update_last_run() 

1106 return tsv 

1107 

1108 def make_excel(self) -> bytes: 

1109 """ 

1110 Executes the query and returns an Excel workbook, in binary. 

1111 """ 

1112 self._cache_all() 

1113 wb = Workbook() 

1114 wb.remove_sheet(wb.active) # remove the autocreated blank sheet 

1115 sheetname = f"query_{self.id}" 

1116 ws = wb.create_sheet(sheetname) 

1117 now = datetime.datetime.now() 

1118 

1119 fieldnames = self.get_display_column_names() 

1120 ws.append(fieldnames) 

1121 for row in self.gen_display_rows(): 

1122 ws.append(gen_excel_row_elements(ws, row)) 

1123 

1124 sql_ws = wb.create_sheet(title="SQL") 

1125 sql_ws.append(["SQL", "Executed_at"]) 

1126 sql_ws.append([self.get_original_sql(), now]) 

1127 self.update_last_run() 

1128 return excel_to_bytes(wb) 

1129 

1130 

1131class SitewideQuery(QueryBase): 

1132 """ 

1133 Class representing a site-wide query for research database. 

1134 

1135 - Site-wide queries are not attached to any particular user. 

1136 - They are templatized with placeholders. 

1137 - Placeholders begin with ``[[`` and end with ``]]``. 

1138 - The user is asked to fill in values for the placeholders. 

1139 

1140 """ 

1141 

1142 description = models.TextField( 

1143 verbose_name="query description", default="" 

1144 ) 

1145 

1146 @property 

1147 def sql_chunks(self) -> List[str]: 

1148 """ 

1149 Returns a list of SQL chunks and placeholders made from the original 

1150 SQL. Placeholders begin with ``[[`` and end with ``]]``. 

1151 

1152 For example, if the sql is 

1153 

1154 .. code-block:: none 

1155 

1156 SELECT * FROM [[table]] WHERE brcid="[[brcid]]"; 

1157 

1158 then ``sql_chunks`` will be 

1159 

1160 .. code-block:: python 

1161 

1162 [ 

1163 'SELECT * FROM ', 

1164 'table', 

1165 ' WHERE brcid="', 

1166 'brcid', 

1167 '";' 

1168 ] 

1169 

1170 Note that the first element (and all elements with even [zero-based] 

1171 list indexes) are SQL, not placeholders. All elements with odd indexes 

1172 are placeholders. 

1173 """ 

1174 sql_string = self.sql 

1175 placeholder_start = "[[" 

1176 placeholder_end = "]]" 

1177 startlen = len(placeholder_start) 

1178 endlen = len(placeholder_end) 

1179 chunks = [] # type: List[str] 

1180 index1 = sql_string.find(placeholder_start) 

1181 index2 = sql_string.find(placeholder_end) 

1182 while index1 != -1 and index2 != -1: # placeholder present 

1183 # get bit of sql up to next '[[' 

1184 chunk = sql_string[:index1] 

1185 # get bit of sql between '[[' and ']]' 

1186 placeholder = sql_string[index1 + startlen : index2] 

1187 chunks.append(chunk) 

1188 chunks.append(placeholder) 

1189 # get bit of sql after '[[' - this forms new substring to check 

1190 sql_string = sql_string[index2 + endlen :] 

1191 index1 = sql_string.find(placeholder_start) 

1192 index2 = sql_string.find(placeholder_end) 

1193 # Deal with any remainder 

1194 if sql_string: 

1195 # noinspection PyTypeChecker 

1196 chunks.append(sql_string) 

1197 

1198 return chunks 

1199 

1200 @property 

1201 def prettified_chunks(self) -> List[str]: 

1202 """ 

1203 Returns chunks (see sql_chunks) but with formatting. 

1204 """ 

1205 prettified_chunks = [] # type: List[str] 

1206 for i, chunk in enumerate(self.sql_chunks): 

1207 if (i + 1) % 2 == 1: 

1208 chunk = prettify_sql_html(chunk) 

1209 prettified_chunks.append(chunk) 

1210 return prettified_chunks 

1211 

1212 def save(self, *args, update_fields=None, **kwargs) -> None: 

1213 """ 

1214 Custom save method. Sets the hash. 

1215 """ 

1216 if update_fields is None or "sql" in update_fields: 

1217 self.set_formatted_sql() 

1218 self.sql_hash = hash64(self.sql) 

1219 if update_fields is not None: 

1220 update_fields = {"formatted_sql", "sql_hash"}.union( 

1221 update_fields 

1222 ) 

1223 super().save(*args, update_fields=update_fields, **kwargs) 

1224 

1225 

1226# ============================================================================= 

1227# Query auditing class 

1228# ============================================================================= 

1229 

1230 

1231class QueryAudit(models.Model): 

1232 """ 

1233 Audit log for a query. 

1234 """ 

1235 

1236 id = models.AutoField(primary_key=True) # automatic 

1237 query = models.ForeignKey("Query", on_delete=models.PROTECT) 

1238 # ... contains information about which user 

1239 when = models.DateTimeField(auto_now_add=True) 

1240 count_only = models.BooleanField(default=False) 

1241 n_records = models.IntegerField(default=0) 

1242 # ... not PositiveIntegerField; SQL Server gives -1, for example 

1243 failed = models.BooleanField(default=False) 

1244 fail_msg = models.TextField() 

1245 

1246 def __str__(self) -> str: 

1247 return f"<QueryAudit id={self.id}>" 

1248 

1249 

1250# ============================================================================= 

1251# Lookup class for secret RID-to-PID conversion 

1252# ============================================================================= 

1253 

1254# class PidLookupRouter: 

1255# # https://docs.djangoproject.com/en/1.8/topics/db/multi-db/ 

1256# # https://newcircle.com/s/post/1242/django_multiple_database_support 

1257# # noinspection PyMethodMayBeStatic,PyUnusedLocal 

1258# def db_for_read(self, model: Type[models.Model], **hints) -> Optional[str]: # noqa: E501 

1259# """ 

1260# read model PidLookup -> look at database secret 

1261# """ 

1262# # log.debug("PidLookupRouter: {}".format(model._meta.model_name)) 

1263# # if model._meta.model_name == PidLookup._meta.model_name: 

1264# if model == PidLookup: 

1265# return 'secret' 

1266# return None 

1267# 

1268# # noinspection PyUnusedLocal 

1269# @staticmethod 

1270# def allow_migrate(db: str, app_label: str, model_name: str = None, 

1271# **hints) -> bool: 

1272# # 2017-02-12, to address bug: 

1273# # - https://code.djangoproject.com/ticket/27054 

1274# # See also: 

1275# # - https://docs.djangoproject.com/en/1.10/topics/db/multi-db/#using-other-management-commands # noqa: E501 

1276# return db == 'default' 

1277 

1278 

1279class PidLookup(models.Model): 

1280 """ 

1281 Lookup class for secret RID-to-PID conversion. 

1282 

1283 - Used via one or other of the 'secret' database connections. 

1284 - Intended for READ-ONLY access to that table. 

1285 

1286 - Since we have fixed the tablenames for the anonymiser, we remove the 

1287 ``settings.SECRET_MAP`` option. See 

1288 :class:`crate_anon.anonymise.models.PatientInfo`. Moreover, we fix the 

1289 maximum length, regardless of the specifics of the config used. 

1290 

1291 - Use as e.g. ``Lookup(pid=XXX)``. 

1292 

1293 """ 

1294 

1295 pid = models.PositiveIntegerField( 

1296 primary_key=True, db_column=PatientInfoConstants.PID_FIELDNAME 

1297 ) 

1298 mpid = models.PositiveIntegerField( 

1299 db_column=PatientInfoConstants.MPID_FIELDNAME 

1300 ) 

1301 rid = models.CharField( 

1302 db_column=PatientInfoConstants.RID_FIELDNAME, 

1303 max_length=get_longest_supported_hasher_output_length(), 

1304 ) 

1305 mrid = models.CharField( 

1306 db_column=PatientInfoConstants.MRID_FIELDNAME, 

1307 max_length=get_longest_supported_hasher_output_length(), 

1308 ) 

1309 trid = models.PositiveIntegerField( 

1310 db_column=PatientInfoConstants.TRID_FIELDNAME 

1311 ) 

1312 

1313 class Meta: 

1314 managed = False 

1315 db_table = PatientInfoConstants.SECRET_MAP_TABLENAME 

1316 

1317 # https://stackoverflow.com/questions/12158463/how-can-i-make-a-model-read-only # noqa: E501 

1318 def save(self, *args, **kwargs) -> None: 

1319 return 

1320 

1321 def delete(self, *args, **kwargs) -> None: 

1322 return 

1323 

1324 

1325def get_pid_lookup( 

1326 dbinfo: SingleResearchDatabase, 

1327 pid: Union[int, str] = None, 

1328 mpid: Union[int, str] = None, 

1329 trid: int = None, 

1330 rid: str = None, 

1331 mrid: str = None, 

1332) -> Optional[PidLookup]: 

1333 """ 

1334 Looks up a patient in the secret lookup database associated with a 

1335 database, from one of several possible identifiers. 

1336 

1337 Args: 

1338 dbinfo: a 

1339 :class:`crate_anon.crateweb.research.research_db_info.SingleResearchDatabase` 

1340 pid: optional patient identifier (PID) value 

1341 mpid: optional master patient identifier (MPID) value 

1342 trid: optional transient research identifier (TRID) value 

1343 rid: optional research identifier (RID) value 

1344 mrid: optional master research identifier (MRID) value 

1345 

1346 Returns: 

1347 a :class:`crate_anon.crateweb.research.models.PidLookup` or ``None`` 

1348 

1349 Raises: 

1350 :exc:`ValueError` if none of the IDs was specified 

1351 

1352 """ 

1353 dbalias = dbinfo.secret_lookup_db 

1354 assert dbalias 

1355 q = PidLookup.objects.using(dbalias) 

1356 if trid is not None: 

1357 lookup = q.get(trid=trid) 

1358 elif rid is not None: 

1359 lookup = q.get(rid=rid) 

1360 elif mrid is not None: 

1361 lookup = q.get(mrid=mrid) 

1362 elif pid is not None: 

1363 lookup = q.get(pid=pid) 

1364 elif mpid is not None: 

1365 lookup = q.get(mpid=mpid) 

1366 else: 

1367 raise ValueError("no input") 

1368 return lookup 

1369 

1370 

1371def get_mpid( 

1372 dbinfo: SingleResearchDatabase, 

1373 trid: int = None, 

1374 rid: str = None, 

1375 mrid: str = None, 

1376) -> int: 

1377 """ 

1378 Returns the MPID for a patient, looked up from one of the research IDs. 

1379 

1380 Args: 

1381 dbinfo: a 

1382 :class:`crate_anon.crateweb.research.research_db_info.SingleResearchDatabase` 

1383 trid: optional transient research identifier (TRID) value 

1384 rid: optional research identifier (RID) value 

1385 mrid: optional master research identifier (MRID) value 

1386 

1387 Returns: 

1388 the integer MPID, or ``None`` 

1389 

1390 Raises: 

1391 :exc:`ValueError` if none of the IDs was specified 

1392 """ 

1393 lookup = get_pid_lookup(dbinfo=dbinfo, trid=trid, rid=rid, mrid=mrid) 

1394 # noinspection PyTypeChecker 

1395 return lookup.mpid 

1396 

1397 

1398def get_pid( 

1399 dbinfo: SingleResearchDatabase, 

1400 trid: int = None, 

1401 rid: str = None, 

1402 mrid: str = None, 

1403) -> int: 

1404 """ 

1405 Returns the PID for a patient, looked up from one of the research IDs. 

1406 

1407 Args: 

1408 dbinfo: a 

1409 :class:`crate_anon.crateweb.research.research_db_info.SingleResearchDatabase` 

1410 trid: optional transient research identifier (TRID) value 

1411 rid: optional research identifier (RID) value 

1412 mrid: optional master research identifier (MRID) value 

1413 

1414 Returns: 

1415 the integer PID, or ``None`` 

1416 

1417 Raises: 

1418 :exc:`ValueError` if none of the IDs was specified 

1419 """ 

1420 lookup = get_pid_lookup(dbinfo=dbinfo, trid=trid, rid=rid, mrid=mrid) 

1421 # noinspection PyTypeChecker 

1422 return lookup.pid 

1423 

1424 

1425# ============================================================================= 

1426# Patient Explorer multi-query classes 

1427# ============================================================================= 

1428 

1429 

1430class TableQueryArgs: 

1431 """ 

1432 Represents SQL for a specific table, with arguments for the SQL. Used by 

1433 :class:`PatientMultiQuery`. 

1434 """ 

1435 

1436 def __init__(self, table_id: TableId, sql: str, args: List[Any]) -> None: 

1437 """ 

1438 Args: 

1439 table_id: a :class:`crate_anon.common.sql.TableId` that this query 

1440 is selecting from 

1441 sql: SQL text 

1442 args: a list of arguments to the SQL 

1443 """ 

1444 self.table_id = table_id 

1445 self.sql = sql 

1446 self.args = args 

1447 

1448 

1449@register_for_json(method=METHOD_STRIP_UNDERSCORE) 

1450class PatientMultiQuery: 

1451 """ 

1452 Represents a set of queries across many tables relating to one or several 

1453 patients (but the same patients across all the tables). 

1454 

1455 Used for the Patient Explorer. 

1456 

1457 *Development notes:* 

1458 

1459 - Patient ID query 

1460 

1461 - Single database is easy; we can use RID or TRID, and therefore TRID for 

1462 performance. 

1463 

1464 Note that ``UNION`` gives only ``DISTINCT`` results by default (``UNION 

1465 ALL`` gives everything); see 

1466 https://stackoverflow.com/questions/49925/what-is-the-difference-between-union-and-union-all. 

1467 

1468 .. code-block:: sql 

1469 

1470 -- Clear, but extensibility of boolean logic less clear: 

1471 SELECT trid 

1472 FROM diagnosis_table 

1473 WHERE diagnosis LIKE 'F20%' 

1474 INTERSECT 

1475 SELECT trid 

1476 FROM progress_note_table 

1477 WHERE note LIKE '%schizophreni%' OR note LIKE '%depression%' 

1478 ORDER BY trid 

1479 -- ... logic across tables requires careful arrangement of UNION vs. INTERSECT 

1480 -- ... logic for multiple fields within one table can be done with AND/OR 

1481 

1482 -- Slower (?), but simpler to manipulate logic? 

1483 SELECT DISTINCT something.trid 

1484 FROM diagnosis_table INNER JOIN progress_note_table 

1485 ON diagnosis_table.trid = progress_note_table.trid 

1486 WHERE 

1487 diagnosis_table.diagnosis LIKE 'F20%' 

1488 AND (progress_note_table.note LIKE '%schizophreni%' 

1489 OR progress_note_table.notenote LIKE '%depression%') 

1490 ORDER BY something.trid 

1491 -- ... boolean logic can all be encapsulated in a single WHERE clause 

1492 -- ... can also share existing join code 

1493 -- ... ?reasonable speed since the TRID fields will be indexed 

1494 -- ... preferable. 

1495 

1496 - Which ID for the patient ID query? 

1497 

1498 - the TRID (for speed, inc. sorting) of the first database 

1499 - can use the TRID from the first "where clause" table 

1500 (don't have to join to a master patient table) 

1501 - join everything across databases as before 

1502 

1503 - Results queries 

1504 

1505 .. code-block:: none 

1506 

1507 -- Something like: 

1508 

1509 SELECT rid, date_of_note, note 

1510 FROM progress_note_table 

1511 WHERE trid IN ( ... patient_id_query ... ) 

1512 ORDER BY trid 

1513 

1514 SELECT rid, date_of_diagnosis, diagnosis, diagnosis_description 

1515 FROM diagnosis_table 

1516 WHERE trid IN ( ... patient_id_query ... ) 

1517 ORDER BY trid 

1518 

1519 This means we will repeat the patient_id_query, which may be inefficient. 

1520 Options: 

1521 

1522 - store the TRIDs in Python, then pass them as arguments 

1523 

1524 - at which point the SQL string/packet length becomes relevant; 

1525 - https://stackoverflow.com/questions/1869753/maximum-size-for-a-sql-server-query-in-clause-is-there-a-better-approach 

1526 - https://stackoverflow.com/questions/16335011/what-is-maximum-query-size-for-mysql 

1527 - https://stackoverflow.com/questions/96553/practical-limit-to-length-of-sql-query-specifically-mysql 

1528 

1529 - let the database worry about it 

1530 

1531 - probably best for now! 

1532 

1533 - Display 

1534 

1535 - One patient per page, with multiple results tables. 

1536 

1537 - Boolean logic on patient selection 

1538 

1539 - ... within 

1540 

1541 """ # noqa: E501 

1542 

1543 def __init__( 

1544 self, 

1545 output_columns: List[ColumnId] = None, 

1546 patient_conditions: List[WhereCondition] = None, 

1547 manual_patient_id_query: str = "", 

1548 ): 

1549 """ 

1550 Args: 

1551 output_columns: 

1552 database columns that will be in the output, as 

1553 list of :class:`crate_anon.common.sql.ColumnId` objects 

1554 patient_conditions: 

1555 restrictions on the patient, as a list of 

1556 :class:`crate_anon.common.sql.WhereCondition` objects; they 

1557 will be joined with ``AND`` 

1558 manual_patient_id_query: 

1559 raw SQL; if specified, overrides ``patient_conditions`` and is 

1560 used as the patient-finding part of the query; see 

1561 :func:`set_override_query` 

1562 """ 

1563 self._output_columns = output_columns or [] # type: List[ColumnId] 

1564 self._patient_conditions = ( 

1565 patient_conditions or [] 

1566 ) # type: List[WhereCondition] 

1567 self._manual_patient_id_query = manual_patient_id_query or "" 

1568 

1569 def __repr__(self) -> str: 

1570 return ( 

1571 "{qualname}(" 

1572 "output_columns={output_columns}, " 

1573 "patient_conditions={patient_conditions}, " 

1574 "manual_patient_id_query={manual_patient_id_query}" 

1575 ")".format( 

1576 qualname=self.__class__.__qualname__, 

1577 output_columns=repr(self._output_columns), 

1578 patient_conditions=repr(self._patient_conditions), 

1579 manual_patient_id_query=repr(self._manual_patient_id_query), 

1580 ) 

1581 ) 

1582 

1583 def __eq__(self, other: "PatientMultiQuery") -> bool: 

1584 return ( 

1585 self._output_columns == other._output_columns 

1586 and self._patient_conditions == other._patient_conditions 

1587 and self._manual_patient_id_query == other._manual_patient_id_query 

1588 ) 

1589 

1590 def __hash__(self) -> int: 

1591 """ 

1592 WARNING: Python's hash() function converts the result of __hash__() 

1593 to the integer width of the host machine, so 64-bit results can get 

1594 down-converted to 32 bits. Use hash64() directly if you want a 64-bit 

1595 result. 

1596 """ 

1597 return self.hash64 

1598 

1599 @property 

1600 def hash64(self) -> int: 

1601 """ 

1602 Return an integer (non-cryptographic) hash of the query. 

1603 """ 

1604 return hash64(json_encode(self)) 

1605 

1606 @property 

1607 def output_columns(self) -> List[ColumnId]: 

1608 """ 

1609 Returns the output columns, as a list of 

1610 :class:`crate_anon.common.sql.ColumnId` objects. 

1611 """ 

1612 return self._output_columns 

1613 

1614 @property 

1615 def has_output_columns(self) -> bool: 

1616 """ 

1617 Does this multiquery have any output columns? 

1618 """ 

1619 return bool(self._output_columns) 

1620 

1621 @property 

1622 def ok_to_run(self) -> bool: 

1623 """ 

1624 Is this OK to run, i.e. does it have a patient ID query and some output 

1625 columns? 

1626 """ 

1627 return self.has_output_columns and self.has_patient_id_query 

1628 

1629 @property 

1630 def patient_conditions(self) -> List[WhereCondition]: 

1631 """ 

1632 Returns all ``WHERE`` conditions restricting the patient, as a list of 

1633 :class:`crate_anon.common.sql.WhereCondition` objects. 

1634 """ 

1635 return self._patient_conditions 

1636 

1637 @property 

1638 def manual_patient_id_query(self) -> str: 

1639 """ 

1640 Returns the manual override SQL for the patient ID query. 

1641 """ 

1642 return self._manual_patient_id_query 

1643 

1644 def add_output_column(self, column_id: ColumnId) -> None: 

1645 """ 

1646 Adds a database column to the output. 

1647 """ 

1648 if column_id not in self._output_columns: 

1649 self._output_columns.append(column_id) 

1650 self._output_columns.sort() 

1651 

1652 def clear_output_columns(self) -> None: 

1653 """ 

1654 Removes all output columns from the multiquery. 

1655 """ 

1656 self._output_columns = [] # type: List[ColumnId] 

1657 

1658 def add_patient_condition(self, where: WhereCondition) -> None: 

1659 """ 

1660 Adds a patient ``WHERE`` condition. 

1661 

1662 Args: 

1663 where: a :class:`crate_anon.common.sql.WhereCondition` 

1664 """ 

1665 if where not in self._patient_conditions: 

1666 self._patient_conditions.append(where) 

1667 self._patient_conditions.sort() 

1668 

1669 def clear_patient_conditions(self) -> None: 

1670 """ 

1671 Removes all ``WHERE`` conditions on the patient. 

1672 """ 

1673 self._patient_conditions = [] # type: List[WhereCondition] 

1674 

1675 def set_override_query(self, query: str) -> None: 

1676 """ 

1677 Sets the manual override SQL for the patient ID query. 

1678 

1679 Args: 

1680 query: raw SQL 

1681 

1682 This query should return a single column of MRID values that is fetched 

1683 into Python and used to restrict other queries. Here's a fictional 

1684 example to fetch the MRIDs for all patients who have the word 

1685 "neutrophils" in their notes: 

1686 

1687 .. code-block:: sql 

1688 

1689 SELECT DISTINCT anonymous_output.patient.nhshash AS _mrid 

1690 FROM anonymous_output.patient 

1691 INNER JOIN anonymous_output.note ON anonymous_output.note.trid = anonymous_output.patient.trid 

1692 WHERE MATCH (anonymous_output.note.note) AGAINST ('neutrophils') 

1693 AND anonymous_output.patient.nhshash IS NOT NULL 

1694 ORDER BY _mrid 

1695 

1696 """ # noqa: E501 

1697 self._manual_patient_id_query = query 

1698 

1699 def _get_select_mrid_column(self) -> Optional[ColumnId]: 

1700 """ 

1701 Returns the MRID column from the first table in the patient ``WHERE`` 

1702 conditions, or ``None``. 

1703 

1704 Returns: 

1705 a :class:`crate_anon.common.sql.ColumnId` or ``None`` 

1706 

1707 """ 

1708 if not self._patient_conditions: 

1709 return None 

1710 return self._research_database_info.get_linked_mrid_column( 

1711 self._patient_conditions[0].table_id 

1712 ) 

1713 

1714 @property 

1715 def has_patient_id_query(self) -> bool: 

1716 """ 

1717 Does this multiquery have a patient ID query? This can either be one 

1718 that the user has specified manually, or one built from ``WHERE`` 

1719 conditions that appears to refer to an MRID. 

1720 """ 

1721 if self._manual_patient_id_query: 

1722 return True 

1723 if self._patient_conditions: 

1724 mrid_col = self._get_select_mrid_column() 

1725 if mrid_col and mrid_col.is_valid: 

1726 return True 

1727 return False 

1728 

1729 def patient_id_query(self, with_order_by: bool = True) -> str: 

1730 """ 

1731 Returns an SQL ``SELECT`` statement based on the list of ``WHERE`` 

1732 conditions already stored, joined with ``AND`` by default. (If a manual 

1733 patient ID query has been specified, return that instead.) 

1734 

1735 Args: 

1736 with_order_by: add an ``ORDER BY`` query on the MRID; such an 

1737 ordering is important for consistency across runs (but is 

1738 prohibited by SQL Server in subqueries -- "The ORDER BY clause 

1739 is invalid in views, inline functions, derived tables, 

1740 subqueries, ... unless TOP, OFFSET or FOR XML is specified.") 

1741 

1742 Returns: 

1743 str: SQL 

1744 

1745 """ 

1746 

1747 if self._manual_patient_id_query: 

1748 # User has specified one manually. 

1749 return self._manual_patient_id_query 

1750 

1751 if not self._patient_conditions: 

1752 return "" 

1753 

1754 select_mrid_column = self._get_select_mrid_column() 

1755 if not select_mrid_column.is_valid: 

1756 log.warning( 

1757 f"PatientMultiQuery.patient_id_query(): invalid" 

1758 f" select_mrid_column: {select_mrid_column!r}" 

1759 ) 

1760 # One way this can happen: (1) a user saves a PMQ; (2) the 

1761 # administrator removes one of the databases! 

1762 return "" 

1763 

1764 research_database_info = get_research_db_info() 

1765 grammar = research_database_info.grammar 

1766 

1767 mrid_alias = "_mrid" 

1768 sql = add_to_select( 

1769 "", 

1770 grammar=grammar, 

1771 select_elements=[ 

1772 SelectElement(column_id=select_mrid_column, alias=mrid_alias) 

1773 ], 

1774 distinct=True, 

1775 where_conditions=( 

1776 self._patient_conditions 

1777 + [ 

1778 WhereCondition( 

1779 column_id=select_mrid_column, op="IS NOT NULL" 

1780 ) 

1781 ] 

1782 ), 

1783 where_type="AND", 

1784 magic_join=True, 

1785 formatted=False, 

1786 ) 

1787 if with_order_by: 

1788 sql += " ORDER BY " + mrid_alias 

1789 sql = format_sql(sql) 

1790 # log.debug(sql) 

1791 return sql 

1792 

1793 @property 

1794 def all_full_queries(self) -> List[TableQueryArgs]: 

1795 """ 

1796 Returns all final queries. This is a list of multiple SQL queries, each 

1797 retrieving information from one table, and all retrieving information 

1798 for the same patient(s). 

1799 

1800 The patients we use are defined by our :meth:`patient_id_query`. 

1801 

1802 Returns: 

1803 list: a list of :class:`TableQueryArgs` objects (q.v.) 

1804 

1805 """ 

1806 return self.all_queries(mrids=None) 

1807 

1808 def all_queries_specific_patients( 

1809 self, mrids: List[int] 

1810 ) -> List[TableQueryArgs]: 

1811 """ 

1812 Returns all final queries. This is a list of multiple SQL queries, each 

1813 retrieving information from one table, and all retrieving information 

1814 for the same patient(s). 

1815 

1816 The patients we use are defined by the MRID list given. 

1817 

1818 Args: 

1819 mrids: list of MRIDs 

1820 

1821 Returns: 

1822 list: a list of :class:`TableQueryArgs` objects (q.v.) 

1823 

1824 """ 

1825 return self.all_queries(mrids=mrids) 

1826 

1827 def all_queries(self, mrids: List[Any] = None) -> List[TableQueryArgs]: 

1828 """ 

1829 Returns all final queries. This is a list of multiple SQL queries, each 

1830 retrieving information from one table, and all retrieving information 

1831 for the same patient(s). 

1832 

1833 The patients we use are defined either by the MRID list given, or if 

1834 that is empty or blank, our :meth:`patient_id_query`. 

1835 

1836 Args: 

1837 mrids: list of MRIDs; if this is ``None`` or empty, use the 

1838 patients fetched (live) by our :meth:`patient_id_query`. 

1839 

1840 Returns: 

1841 list: a list of :class:`TableQueryArgs` objects (q.v.) 

1842 

1843 """ 

1844 queries = [] # type: List[TableQueryArgs] 

1845 table_columns_map = columns_to_table_column_hierarchy( 

1846 self._output_columns, sort=True 

1847 ) 

1848 for table, columns in table_columns_map: 

1849 table_sql_args = self.make_query( 

1850 table_id=table, columns=columns, mrids=mrids 

1851 ) 

1852 queries.append(table_sql_args) 

1853 return queries 

1854 

1855 def where_patient_clause( 

1856 self, table_id: TableId, mrids: List[Any] = None 

1857 ) -> SqlArgsTupleType: 

1858 """ 

1859 Returns an SQL WHERE clauses similar to ``sometable.mrid IN (1, 2, 3)`` 

1860 or ``sometable.mridcol IN (SELECT mrid FROM masterpatienttable)``. The 

1861 clause is used to restrict patients by MRID. 

1862 

1863 Args: 

1864 table_id: :class:`crate_anon.common.sql.TableId` for the table 

1865 whose MRID column we will apply the ``WHERE`` clause to 

1866 mrids: list of MRIDs; if this is ``None`` or empty, use the 

1867 patients fetched (live) by our :meth:`patient_id_query`. 

1868 

1869 Returns: 

1870 tuple: ``sql, args`` 

1871 """ 

1872 mrid_column = self._research_database_info.get_mrid_column_from_table( 

1873 table_id 

1874 ) 

1875 if mrids: 

1876 in_clause = ",".join(["?"] * len(mrids)) 

1877 # ... see notes for translate_sql_qmark_to_percent() 

1878 args = mrids 

1879 else: 

1880 # If we haven't specified specific patients, use our patient- 

1881 # finding query. 

1882 in_clause = self.patient_id_query(with_order_by=False) 

1883 # ... SQL Server moans if you use use ORDER BY in a subquery: 

1884 # "The ORDER BY clause is invalid in views, inline functions, 

1885 # derived tables, subqueries, ... unless TOP, OFFSET or FOR XML 

1886 # is specified." 

1887 args = [] # type: List[Any] 

1888 research_database_info = get_research_db_info() 

1889 grammar = research_database_info.grammar 

1890 sql = f"{mrid_column.identifier(grammar)} IN ({in_clause})" 

1891 return sql, args 

1892 

1893 def make_query( 

1894 self, 

1895 table_id: TableId, 

1896 columns: List[ColumnId], 

1897 mrids: List[Any] = None, 

1898 ) -> TableQueryArgs: 

1899 """ 

1900 Returns an SQL query to retrieve information from a single table for 

1901 certain patients. This query is similar to ``SELECT a, b, c FROM 

1902 sometable WHERE sometable.mrid IN (1, 2, 3)`` or ``SELECT a, b, c FROM 

1903 sometable WHERE sometable.mrid IN (SELECT mrid FROM 

1904 masterpatienttable)``. This then forms one query from (potentially) 

1905 many for our patient(s). 

1906 

1907 Args: 

1908 table_id: a :class:`crate_anon.common.sql.TableId` for the SELECT 

1909 FROM table 

1910 columns: columns, specified as a list of 

1911 :class:`crate_anon.common.sql.ColumnId`, to select from the 

1912 table (in addition to which, we will always select the MRID 

1913 column from that table) 

1914 mrids: list of MRIDs; if this is ``None`` or empty, use the 

1915 patients fetched (live) by our :meth:`patient_id_query`. 

1916 

1917 Returns: 

1918 a :class:`TableQueryArgs` object (q.v.) 

1919 

1920 """ 

1921 if not columns: 

1922 raise ValueError("No columns specified") 

1923 research_database_info = get_research_db_info() 

1924 grammar = research_database_info.grammar 

1925 mrid_column = research_database_info.get_mrid_column_from_table( 

1926 table_id 

1927 ) 

1928 all_columns = [mrid_column] 

1929 for c in columns: 

1930 if c not in all_columns: 

1931 all_columns.append(c) 

1932 where_clause, args = self.where_patient_clause(table_id, mrids) 

1933 select_elements = [SelectElement(column_id=col) for col in all_columns] 

1934 where_conditions = [WhereCondition(raw_sql=where_clause)] 

1935 sql = add_to_select( 

1936 "", 

1937 grammar=grammar, 

1938 select_elements=select_elements, 

1939 where_conditions=where_conditions, 

1940 magic_join=True, 

1941 formatted=True, 

1942 ) 

1943 return TableQueryArgs(table_id, sql, args) 

1944 

1945 # ------------------------------------------------------------------------- 

1946 # Display 

1947 # ------------------------------------------------------------------------- 

1948 

1949 @property 

1950 def output_cols_html(self) -> str: 

1951 """ 

1952 Returns all our output columns in HTML format. 

1953 """ 

1954 research_database_info = get_research_db_info() 

1955 grammar = research_database_info.grammar 

1956 return prettify_sql_html( 

1957 "\n".join( 

1958 [ 

1959 column_id.identifier(grammar) 

1960 for column_id in self.output_columns 

1961 ] 

1962 ) 

1963 ) 

1964 

1965 @property 

1966 def pt_conditions_html(self) -> str: 

1967 """ 

1968 Returns all our patient WHERE conditions in HTML format. 

1969 """ 

1970 research_database_info = get_research_db_info() 

1971 grammar = research_database_info.grammar 

1972 return prettify_sql_html( 

1973 "\nAND ".join([wc.sql(grammar) for wc in self.patient_conditions]) 

1974 ) 

1975 

1976 def summary_html(self, element_counter: HtmlElementCounter) -> str: 

1977 """ 

1978 Returns an HTML representation of this multiquery. 

1979 

1980 Args: 

1981 element_counter: a 

1982 :class:`crate_anon.crateweb.research.html_functions.HtmlElementCounter`, 

1983 which will be modified 

1984 

1985 Returns: 

1986 str: HTML 

1987 

1988 """ 

1989 

1990 def collapser(x: str) -> str: 

1991 return element_counter.overflow_div(contents=x) 

1992 

1993 outcols = self.output_cols_html 

1994 manual_query = self.manual_patient_id_query 

1995 if manual_query: 

1996 manual_or_auto = " (MANUAL)" 

1997 ptselect = prettify_sql_html(manual_query) 

1998 else: 

1999 manual_or_auto = "" 

2000 ptselect = self.pt_conditions_html 

2001 return f""" 

2002 Output columns:<br> 

2003 {collapser(outcols)} 

2004 Patient selection{manual_or_auto}:<br> 

2005 {collapser(ptselect)} 

2006 """ 

2007 

2008 # ------------------------------------------------------------------------- 

2009 # Data finder: COUNT(*) for all patient tables 

2010 # ------------------------------------------------------------------------- 

2011 

2012 def gen_data_finder_queries( 

2013 self, mrids: List[Any] = None 

2014 ) -> Generator[TableQueryArgs, None, None]: 

2015 """ 

2016 Generates a set of queries that, when executed, return the following 

2017 summary columns from each of our tables, filtered for patients by our 

2018 :meth:`where_patient_clause`, and grouped by ``master_research_id`` 

2019 (MRID): 

2020 

2021 .. code-block:: sql 

2022 

2023 master_research_id, 

2024 table_name, 

2025 COUNT(*) AS n_records, 

2026 MIN(date_column) AS min_date, -- NULL if no date column 

2027 MAX(date_column) AS max_date -- NULL if no date column 

2028 

2029 These queries can be used to see quickly which tables have interesting 

2030 information in. 

2031 

2032 Args: 

2033 mrids: list of MRIDs; if this is ``None`` or empty, use the 

2034 patients fetched (live) by our :meth:`patient_id_query`. 

2035 

2036 Yields: 

2037 :class:`TableQueryArgs` objects (q.v.) 

2038 

2039 """ 

2040 research_database_info = get_research_db_info() 

2041 grammar = research_database_info.grammar 

2042 mrid_alias = "master_research_id" 

2043 table_name_alias = "table_name" 

2044 n_records_alias = "n_records" 

2045 min_date_alias = "min_date" 

2046 max_date_alias = "max_date" 

2047 for ( 

2048 table_id 

2049 ) in research_database_info.get_mrid_linkable_patient_tables(): 

2050 mrid_col = research_database_info.get_mrid_column_from_table( 

2051 table=table_id 

2052 ) 

2053 date_col = research_database_info.get_default_date_column( 

2054 table=table_id 

2055 ) 

2056 if date_col: 

2057 min_date = f"MIN({date_col.identifier(grammar)})" 

2058 max_date = f"MAX({date_col.identifier(grammar)})" 

2059 else: 

2060 min_date = "NULL" 

2061 max_date = "NULL" 

2062 # ... OK (at least in MySQL) to do: 

2063 # SELECT col1, COUNT(*), NULL FROM table GROUP BY col1; 

2064 where_clause, args = self.where_patient_clause(table_id, mrids) 

2065 table_identifier = table_id.identifier(grammar) 

2066 select_elements = [ 

2067 SelectElement(column_id=mrid_col, alias=mrid_alias), 

2068 SelectElement( 

2069 raw_select=sql_string_literal(table_identifier), 

2070 alias=table_name_alias, 

2071 ), 

2072 SelectElement( 

2073 raw_select="COUNT(*)", 

2074 from_table_for_raw_select=table_id, 

2075 alias=n_records_alias, 

2076 ), 

2077 SelectElement( 

2078 raw_select=min_date, 

2079 from_table_for_raw_select=table_id, 

2080 alias=min_date_alias, 

2081 ), 

2082 SelectElement( 

2083 raw_select=max_date, 

2084 from_table_for_raw_select=table_id, 

2085 alias=max_date_alias, 

2086 ), 

2087 ] 

2088 where_conditions = [WhereCondition(raw_sql=where_clause)] 

2089 sql = add_to_select( 

2090 "", 

2091 grammar=grammar, 

2092 select_elements=select_elements, 

2093 where_conditions=where_conditions, 

2094 magic_join=True, 

2095 formatted=False, 

2096 ) 

2097 sql += "\nGROUP BY " + mrid_col.identifier(grammar) 

2098 sql += "\nORDER BY " + mrid_alias 

2099 sql = format_sql(sql) 

2100 yield TableQueryArgs(table_identifier, sql, args) 

2101 

2102 # ------------------------------------------------------------------------- 

2103 # Monster data: SELECT * for all patient tables 

2104 # ------------------------------------------------------------------------- 

2105 

2106 def gen_monster_queries( 

2107 self, mrids: List[int] = None 

2108 ) -> Generator[TableQueryArgs, None, None]: 

2109 """ 

2110 Generates a set of queries that, when executed, return ``SELECT *`` 

2111 from each of our tables, filtered for patients by our 

2112 :meth:`where_patient_clause`. So it's like the basic Patient Explorer 

2113 but with all columns in the output. 

2114 

2115 These queries are used in the Patient Explorer "Monster Data" view. 

2116 

2117 Args: 

2118 mrids: list of MRIDs; if this is ``None`` or empty, use the 

2119 patients fetched (live) by our :meth:`patient_id_query`. 

2120 

2121 Yields: 

2122 :class:`TableQueryArgs` objects (q.v.) 

2123 

2124 """ 

2125 research_database_info = get_research_db_info() 

2126 grammar = research_database_info.grammar 

2127 for ( 

2128 table_id 

2129 ) in research_database_info.get_mrid_linkable_patient_tables(): 

2130 mrid_col = research_database_info.get_mrid_column_from_table( 

2131 table=table_id 

2132 ) 

2133 where_clause, args = self.where_patient_clause(table_id, mrids) 

2134 # We add the WHERE using our magic query machine, to get the joins 

2135 # right: 

2136 select_elements = [ 

2137 SelectElement( 

2138 raw_select="*", from_table_for_raw_select=table_id 

2139 ), 

2140 ] 

2141 where_conditions = [ 

2142 WhereCondition( 

2143 raw_sql=where_clause, 

2144 from_table_for_raw_sql=mrid_col.table_id, 

2145 ), 

2146 ] 

2147 sql = add_to_select( 

2148 "", 

2149 grammar=grammar, 

2150 select_elements=select_elements, 

2151 where_conditions=where_conditions, 

2152 magic_join=True, 

2153 formatted=False, 

2154 ) 

2155 sql += " ORDER BY " + mrid_col.identifier(grammar) 

2156 sql = format_sql(sql) 

2157 yield TableQueryArgs(table_id, sql, args) 

2158 

2159 

2160# ============================================================================= 

2161# PatientExplorer 

2162# ============================================================================= 

2163 

2164PATIENT_EXPLORER_FWD_REF = "PatientExplorer" 

2165 

2166 

2167class PatientExplorer(models.Model): 

2168 """ 

2169 Class to explore the research database on a per-patient basis. 

2170 """ 

2171 

2172 class Meta: 

2173 app_label = "research" 

2174 

2175 id = models.AutoField(primary_key=True) # automatic 

2176 user = models.ForeignKey( 

2177 settings.AUTH_USER_MODEL, on_delete=models.CASCADE 

2178 ) 

2179 patient_multiquery = JsonClassField( 

2180 verbose_name="PatientMultiQuery as JSON", null=True 

2181 ) # type: PatientMultiQuery 

2182 pmq_hash = models.BigIntegerField( 

2183 verbose_name="64-bit non-cryptographic hash of JSON of " 

2184 "patient_multiquery" 

2185 ) 

2186 active = models.BooleanField(default=True) # see save() below 

2187 created = models.DateTimeField(auto_now_add=True) 

2188 deleted = models.BooleanField( 

2189 default=False, 

2190 verbose_name="Deleted from the user's perspective. " 

2191 "Audited queries are never properly deleted.", 

2192 ) 

2193 audited = models.BooleanField(default=False) 

2194 

2195 def __init__(self, *args, **kwargs) -> None: 

2196 super().__init__(*args, **kwargs) 

2197 if not self.patient_multiquery: 

2198 self.patient_multiquery = PatientMultiQuery() 

2199 

2200 def __str__(self) -> str: 

2201 return f"<PatientExplorer id={self.id}>" 

2202 

2203 def save(self, *args, update_fields=None, **kwargs) -> None: 

2204 """ 

2205 Custom save method. Ensures that only one :class:`PatientExplorer` has 

2206 ``active == True`` for a given user. Also sets the hash. 

2207 """ 

2208 if self.active: 

2209 PatientExplorer.objects.filter(user=self.user, active=True).update( 

2210 active=False 

2211 ) 

2212 

2213 if update_fields is None or "patient_multiquery" in update_fields: 

2214 self.pmq_hash = self.patient_multiquery.hash64 

2215 if update_fields is not None: 

2216 update_fields = {"pmq_hash"}.union(update_fields) 

2217 

2218 # Beware: Python's hash() function will downconvert to 32 bits on 

2219 # 32-bit machines; use pmq.hash64() directly, not hash(pmq). 

2220 super().save(*args, update_fields=update_fields, **kwargs) 

2221 

2222 # ------------------------------------------------------------------------- 

2223 # Fetching 

2224 # ------------------------------------------------------------------------- 

2225 

2226 @staticmethod 

2227 def get_active_pe_or_none( 

2228 request: HttpRequest, 

2229 ) -> Optional[PATIENT_EXPLORER_FWD_REF]: 

2230 """ 

2231 Args: 

2232 request: the :class:`django.http.request.HttpRequest` 

2233 

2234 Returns: 

2235 The active :class:`PatientExplorer` for the user, or ``None``. 

2236 

2237 """ 

2238 if not request.user.is_authenticated: 

2239 return None 

2240 try: 

2241 return PatientExplorer.objects.get(user=request.user, active=True) 

2242 except PatientExplorer.DoesNotExist: 

2243 return None 

2244 

2245 @staticmethod 

2246 def get_active_pe_id_or_none(request: HttpRequest) -> Optional[int]: 

2247 """ 

2248 Args: 

2249 request: the :class:`django.http.request.HttpRequest` 

2250 

2251 Returns: 

2252 The integer PK of the active :class:`PatientExplorer` for the user, 

2253 or ``None``. 

2254 

2255 """ 

2256 if not request.user.is_authenticated: 

2257 return None 

2258 try: 

2259 pe = PatientExplorer.objects.get(user=request.user, active=True) 

2260 return pe.id 

2261 except PatientExplorer.DoesNotExist: 

2262 return None 

2263 

2264 # ------------------------------------------------------------------------- 

2265 # Activating, deleting, auditing 

2266 # ------------------------------------------------------------------------- 

2267 

2268 def activate(self) -> None: 

2269 """ 

2270 Activates this :class:`PatientExplorer` (and deactivates any others). 

2271 """ 

2272 self.active = True 

2273 self.save() 

2274 

2275 def mark_audited(self) -> None: 

2276 """ 

2277 Mark the query as having been executed and audited. (This prevents it 

2278 from being wholly deleted.) 

2279 """ 

2280 if self.audited: 

2281 return 

2282 self.audited = True 

2283 self.save() 

2284 

2285 def mark_deleted(self) -> None: 

2286 """ 

2287 Mark the query as deleted. 

2288 

2289 This will stop it being shown. It will not delete it from the database. 

2290 

2291 We use this deletion method for queries that have been executed, so 

2292 need an audit trail. 

2293 """ 

2294 if self.deleted: 

2295 # log.debug("pointless") 

2296 return 

2297 self.deleted = True 

2298 self.active = False 

2299 # log.debug("about to save") 

2300 self.save() 

2301 # log.debug("saved") 

2302 

2303 def delete_if_permitted(self) -> None: 

2304 """ 

2305 Delete the query. 

2306 

2307 - If a query has been executed and therefore audited, it isn't properly 

2308 deleted; it's just marked as deleted. 

2309 - If a query has never been executed, we can delete it entirely. 

2310 """ 

2311 if self.deleted: 

2312 log.debug("already flagged as deleted") 

2313 return 

2314 if self.audited: 

2315 log.debug("marking as deleted") 

2316 self.mark_deleted() 

2317 else: 

2318 # actually delete 

2319 log.debug("actually deleting") 

2320 self.delete() 

2321 

2322 def audit( 

2323 self, 

2324 count_only: bool = False, 

2325 n_records: int = 0, 

2326 failed: bool = False, 

2327 fail_msg: str = "", 

2328 ) -> None: 

2329 """ 

2330 Audit the execution of this query: 

2331 

2332 - insert an audit entry referring to this query 

2333 - mark the query as having been audited (so it's not deleted) 

2334 

2335 Args: 

2336 count_only: did we know (in advance) that this was a 

2337 ``COUNT()``-only query? 

2338 n_records: how many records were returned? 

2339 failed: did the query fail? 

2340 fail_msg: if the query failed, the associated failure message 

2341 """ 

2342 a = PatientExplorerAudit( 

2343 patient_explorer=self, 

2344 count_only=count_only, 

2345 n_records=n_records, 

2346 failed=failed, 

2347 fail_msg=fail_msg, 

2348 ) 

2349 a.save() 

2350 self.mark_audited() 

2351 

2352 # ------------------------------------------------------------------------- 

2353 # Using the internal PatientMultiQuery 

2354 # ------------------------------------------------------------------------- 

2355 

2356 def all_queries(self, mrids: List[Any] = None) -> List[TableQueryArgs]: 

2357 """ 

2358 Returns all queries from our :attr:`patient_multiquery`. See 

2359 :meth:`PatientMultiQuery.all_queries` 

2360 

2361 Args: 

2362 mrids: list of MRIDs; if this is ``None`` or empty, use the 

2363 patients fetched (live) by our :attr:`patient_multiquery`'s 

2364 :meth:`PatientMultiQuery.patient_id_query`. 

2365 

2366 Returns: 

2367 list: a list of :class:`TableQueryArgs` objects (q.v.) 

2368 

2369 """ 

2370 return self.patient_multiquery.all_queries(mrids=mrids) 

2371 

2372 @staticmethod 

2373 def get_executed_cursor(sql: str, args: List[Any] = None) -> CursorWrapper: 

2374 """ 

2375 Executes a query (via the research database) and returns its cursor. 

2376 

2377 Args: 

2378 sql: SQL text 

2379 args: arguments to SQL query 

2380 

2381 Returns: 

2382 a :class:`django.db.backends.utils.CursorWrapper`, which is a 

2383 context manager that behaves as the executed cursor and also closes 

2384 it on completion 

2385 """ 

2386 sql = translate_sql_qmark_to_percent(sql) 

2387 cursor = get_executed_researchdb_cursor(sql, args) 

2388 return cursor 

2389 

2390 def get_patient_mrids(self) -> List[int]: 

2391 """ 

2392 Returns all MRIDs from our :attr:`patient_multiquery`'s 

2393 :meth:`PatientMultiQuery.patient_id_query`. 

2394 """ 

2395 sql = self.patient_multiquery.patient_id_query(with_order_by=True) 

2396 # log.critical(sql) 

2397 with self.get_executed_cursor(sql) as cursor: 

2398 return [row[0] for row in cursor.fetchall()] 

2399 

2400 def get_zipped_tsv_binary(self) -> bytes: 

2401 """ 

2402 Returns a ZIP file containing TSVs, one for each table in our 

2403 :attr:`patient_multiquery`. 

2404 """ 

2405 # Don't pass giant result sets around beyond what's necessary. 

2406 # Use cursor.fetchone() 

2407 grammar = make_grammar(settings.RESEARCH_DB_DIALECT) 

2408 memfile = io.BytesIO() 

2409 z = zipfile.ZipFile(memfile, "w") 

2410 for tsa in self.patient_multiquery.all_queries(): 

2411 table_id = tsa.table_id 

2412 sql = tsa.sql 

2413 args = tsa.args 

2414 with self.get_executed_cursor(sql, args) as cursor: 

2415 try: 

2416 fieldnames = get_fieldnames_from_cursor(cursor) 

2417 except TypeError: 

2418 fieldnames = [] # type: List[str] 

2419 tsv = make_tsv_row(fieldnames) 

2420 row = cursor.fetchone() 

2421 while row is not None: 

2422 tsv += make_tsv_row(row) 

2423 row = cursor.fetchone() 

2424 filename = table_id.identifier(grammar) + ".tsv" 

2425 z.writestr(filename, tsv.encode("utf-8")) 

2426 z.close() 

2427 return memfile.getvalue() 

2428 

2429 def get_xlsx_binary(self) -> bytes: 

2430 """ 

2431 Returns an XLSX (Excel) file containing spreadsheets, one for each 

2432 table in our :attr:`patient_multiquery`. 

2433 

2434 Other notes: 

2435 

2436 - cell size: see 

2437 https://stackoverflow.com/questions/13197574/python-openpyxl-column-width-size-adjust; 

2438 and the "auto_size" / "bestFit" options don't really do the job, 

2439 according to the interweb. 

2440 

2441 """ 

2442 wb = Workbook() 

2443 wb.remove_sheet(wb.active) # remove the autocreated blank sheet 

2444 sqlsheet_rows = [["Table", "SQL", "Args", "Executed_at"]] 

2445 for tsa in self.patient_multiquery.all_queries(): 

2446 table_id = tsa.table_id 

2447 sql = tsa.sql 

2448 args = tsa.args 

2449 sqlsheet_rows.append( 

2450 [str(table_id), sql, repr(args), datetime.datetime.now()] 

2451 ) 

2452 ws = wb.create_sheet(title=str(table_id)) 

2453 with self.get_executed_cursor(sql, args) as cursor: 

2454 try: 

2455 fieldnames = get_fieldnames_from_cursor(cursor) 

2456 except (AttributeError, IndexError): 

2457 fieldnames = [] # type: List[str] 

2458 ws.append(fieldnames) 

2459 row = cursor.fetchone() 

2460 while row is not None: 

2461 ws.append(gen_excel_row_elements(ws, row)) 

2462 row = cursor.fetchone() 

2463 sql_ws = wb.create_sheet(title="SQL") 

2464 for r in sqlsheet_rows: 

2465 sql_ws.append(r) 

2466 return excel_to_bytes(wb) 

2467 

2468 # ------------------------------------------------------------------------- 

2469 # Using the internal PatientMultiQuery 

2470 # ------------------------------------------------------------------------- 

2471 

2472 def get_patient_id_query(self, with_order_by: bool = True) -> str: 

2473 """ 

2474 Returns SQL from our :attr:`patient_multiquery`'s 

2475 :meth:`PatientMultiQuery.patient_id_query` (q.v.). 

2476 

2477 Args: 

2478 with_order_by: see :meth:`PatientMultiQuery.patient_id_query` 

2479 """ 

2480 return self.patient_multiquery.patient_id_query( 

2481 with_order_by=with_order_by 

2482 ) 

2483 

2484 # ------------------------------------------------------------------------- 

2485 # Display 

2486 # ------------------------------------------------------------------------- 

2487 

2488 @property 

2489 def summary_html(self) -> str: 

2490 """ 

2491 Return HTML summarizing this object. 

2492 """ 

2493 # Nasty hack. We want collapsing things, so we want HTML element IDs. 

2494 # We could build the HTML table in code for the Patient Explorer 

2495 # chooser, but I was trying to do it in Django templates. 

2496 # However, it's not easy to pass parameters (such as an 

2497 # HtmlElementCounter) back to Python from Django templates. 

2498 # So we can hack it a bit: 

2499 element_counter = HtmlElementCounter(prefix=f"pe_{self.id}_") 

2500 return self.patient_multiquery.summary_html( 

2501 element_counter=element_counter 

2502 ) 

2503 

2504 @property 

2505 def has_patient_id_query(self) -> bool: 

2506 """ 

2507 Does our our :attr:`patient_multiquery` have a patient ID query? 

2508 

2509 See :meth:`PatientMultiQuery.has_patient_id_query`. 

2510 """ 

2511 return self.patient_multiquery.has_patient_id_query 

2512 

2513 @property 

2514 def has_output_columns(self) -> bool: 

2515 """ 

2516 Does our our :attr:`patient_multiquery` have output columns? 

2517 

2518 See :meth:`PatientMultiQuery.has_output_columns`. 

2519 """ 

2520 return self.patient_multiquery.has_output_columns 

2521 

2522 # ------------------------------------------------------------------------- 

2523 # Data finder 

2524 # ------------------------------------------------------------------------- 

2525 

2526 @property 

2527 def data_finder_excel(self) -> bytes: 

2528 """ 

2529 Returns an XSLX (Excel) file containing summary (count) information 

2530 for each table. 

2531 

2532 See :meth:`PatientMultiQuery.gen_data_finder_queries`. 

2533 """ 

2534 fieldnames = [] # type: List[str] 

2535 wb = Workbook() 

2536 wb.remove_sheet(wb.active) # remove the autocreated blank sheet 

2537 all_ws = wb.create_sheet("All_patients") 

2538 sql_ws = wb.create_sheet("SQL") 

2539 sql_ws.append(["Table", "SQL", "Args", "Executed_at"]) 

2540 

2541 for tsa in self.patient_multiquery.gen_data_finder_queries(): 

2542 table_identifier = tsa.table_id 

2543 sql = tsa.sql 

2544 args = tsa.args 

2545 sql_ws.append( 

2546 [ 

2547 table_identifier, 

2548 format_sql(sql), 

2549 repr(args), 

2550 datetime.datetime.now(), 

2551 ] 

2552 ) 

2553 with self.get_executed_cursor(sql, args) as cursor: 

2554 if not fieldnames: 

2555 try: 

2556 fieldnames = get_fieldnames_from_cursor(cursor) 

2557 except TypeError: 

2558 fieldnames = [] # type: List[str] 

2559 all_ws.append(fieldnames) 

2560 row = cursor.fetchone() 

2561 while row is not None: 

2562 mrid = str(row[0]) 

2563 if mrid in wb: 

2564 ws = wb[mrid] 

2565 else: 

2566 ws = wb.create_sheet(mrid) 

2567 ws.append(fieldnames) 

2568 rowtuple = tuple(row) 

2569 ws.append(rowtuple) 

2570 all_ws.append(rowtuple) 

2571 row = cursor.fetchone() 

2572 return excel_to_bytes(wb) 

2573 

2574 

2575# ============================================================================= 

2576# PatientExplorer auditing class 

2577# ============================================================================= 

2578 

2579 

2580class PatientExplorerAudit(models.Model): 

2581 """ 

2582 Audit log for a PatientExplorer. 

2583 """ 

2584 

2585 id = models.AutoField(primary_key=True) # automatic 

2586 patient_explorer = models.ForeignKey( 

2587 "PatientExplorer", on_delete=models.PROTECT 

2588 ) 

2589 # ... contains information on which user 

2590 when = models.DateTimeField(auto_now_add=True) 

2591 count_only = models.BooleanField(default=False) 

2592 n_records = models.IntegerField(default=0) 

2593 # ... not PositiveIntegerField; SQL Server gives -1, for example 

2594 failed = models.BooleanField(default=False) 

2595 fail_msg = models.TextField() 

2596 

2597 def __str__(self) -> str: 

2598 return f"<PatientExplorerAudit id={self.id}>" 

2599 

2600 

2601# ============================================================================= 

2602# Archive and visualization zone auditing classes 

2603# ============================================================================= 

2604 

2605ARCHIVE_PATIENT_ID_MAX_LENGTH = 255 

2606ARCHIVE_ATTACHMENT_FILENAME_MAX_LENGTH = 255 

2607 

2608 

2609class ArchiveTemplateAudit(models.Model): 

2610 """ 

2611 Audit log for access to an archive template. 

2612 """ 

2613 

2614 id = models.AutoField(primary_key=True) # automatic 

2615 user = models.ForeignKey( 

2616 settings.AUTH_USER_MODEL, on_delete=models.CASCADE 

2617 ) 

2618 when = models.DateTimeField(auto_now_add=True) 

2619 patient_id = models.CharField(max_length=ARCHIVE_PATIENT_ID_MAX_LENGTH) 

2620 query_string = models.TextField() 

2621 # ... no max length; see 

2622 # https://stackoverflow.com/questions/812925/what-is-the-maximum-possible-length-of-a-query-string # noqa: E501 

2623 

2624 def __str__(self) -> str: 

2625 return f"<ArchiveTemplateAudit id={self.id}>" 

2626 

2627 

2628class ArchiveAttachmentAudit(models.Model): 

2629 """ 

2630 Audit log for access to an archive attachment. 

2631 """ 

2632 

2633 id = models.AutoField(primary_key=True) # automatic 

2634 user = models.ForeignKey( 

2635 settings.AUTH_USER_MODEL, on_delete=models.CASCADE 

2636 ) 

2637 when = models.DateTimeField(auto_now_add=True) 

2638 patient_id = models.CharField(max_length=ARCHIVE_PATIENT_ID_MAX_LENGTH) 

2639 filename = models.CharField( 

2640 max_length=ARCHIVE_ATTACHMENT_FILENAME_MAX_LENGTH 

2641 ) 

2642 

2643 def __str__(self) -> str: 

2644 return f"<ArchiveAttachmentAudit id={self.id}>"