Coverage for crateweb/research/research_db_info.py: 27%

481 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2026-02-05 06:46 -0600

1""" 

2crate_anon/crateweb/research/research_db_info.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Represents information about the structure of our research database(s).** 

27 

28""" 

29 

30from collections import OrderedDict 

31 

32import logging 

33import re 

34from typing import Any, Dict, List, Optional, Set 

35 

36from cardinal_pythonlib.dbfunc import dictfetchall 

37from cardinal_pythonlib.django.django_constants import ConnectionVendors 

38from cardinal_pythonlib.django.function_cache import django_cache_function 

39from cardinal_pythonlib.excel import excel_to_bytes 

40from cardinal_pythonlib.json_utils.serialize import ( 

41 METHOD_NO_ARGS, 

42 register_for_json, 

43) 

44from cardinal_pythonlib.logs import BraceStyleAdapter 

45from cardinal_pythonlib.reprfunc import auto_repr 

46from cardinal_pythonlib.sql.sql_grammar import SqlGrammar 

47from cardinal_pythonlib.sql.validation import ( 

48 SQLTYPES_TEXT, 

49 SQLTYPES_WITH_DATE, 

50) 

51from cardinal_pythonlib.sqlalchemy.dialect import SqlaDialectName 

52from cardinal_pythonlib.sqlalchemy.schema import ( 

53 MSSQL_DEFAULT_SCHEMA, 

54 POSTGRES_DEFAULT_SCHEMA, 

55) 

56from cardinal_pythonlib.tsv import dictlist_to_tsv 

57from django.db import connections 

58from django.db.backends.base.base import BaseDatabaseWrapper 

59from django.conf import settings 

60from openpyxl import Workbook 

61from requests.structures import CaseInsensitiveDict 

62 

63from crate_anon.common.constants import RUNNING_WITHOUT_CONFIG 

64from crate_anon.common.sql import ( 

65 ColumnId, 

66 QB_DATATYPE_DATE, 

67 QB_DATATYPE_FLOAT, 

68 QB_DATATYPE_INTEGER, 

69 QB_DATATYPE_STRING, 

70 QB_DATATYPE_STRING_FULLTEXT, 

71 QB_DATATYPE_UNKNOWN, 

72 SQLTYPES_FLOAT_OR_OTHER_NUMERIC, 

73 SQLTYPES_INTEGER_OR_BIT, 

74 SchemaId, 

75 SqlArgsTupleType, 

76 TableId, 

77 is_sql_column_type_textual, 

78 make_grammar, 

79 translate_sql_qmark_to_percent, 

80) 

81from crate_anon.crateweb.core.constants import ( 

82 RESEARCH_DB_CONNECTION_NAME, 

83 SettingsKeys, 

84) 

85from crate_anon.crateweb.config.constants import ResearchDbInfoKeys 

86 

87log = BraceStyleAdapter(logging.getLogger(__name__)) 

88 

89 

90# ============================================================================= 

91# Constants 

92# ============================================================================= 

93 

94SUPPORTED_DIALECTS = [ 

95 SqlaDialectName.MSSQL, 

96 SqlaDialectName.MYSQL, 

97 # SqlaDialectName.POSTGRES, # no grammar yet 

98] 

99 

100 

101class PatientFieldPythonTypes: 

102 """ 

103 Represents Python types for each type of patient ID field. 

104 """ 

105 

106 PID = int 

107 MPID = int 

108 RID = str 

109 MRID = str 

110 TRID = int 

111 

112 

113# ============================================================================= 

114# Information about the research database 

115# ============================================================================= 

116 

117 

118class ColumnInfo: 

119 """ 

120 Represents information about a database column, reflected from the 

121 database. 

122 

123 See also ``querybuilder.js``. 

124 """ 

125 

126 def __init__(self, **kwargs) -> None: 

127 # Different databases may vary the case, although the column headings 

128 # are ANSI standard. 

129 cid = CaseInsensitiveDict(kwargs) 

130 try: 

131 self.table_catalog = cid["table_catalog"] # type: str 

132 self.table_schema = cid["table_schema"] # type: str 

133 self.table_name = cid["table_name"] # type: str 

134 self.column_name = cid["column_name"] # type: str 

135 self.is_nullable = bool(cid["is_nullable"]) 

136 self.column_type = cid["column_type"] # type: str 

137 self.column_comment = cid["column_comment"] # type: str 

138 self.indexed = bool(cid["indexed"]) 

139 self.indexed_fulltext = bool(cid["indexed_fulltext"]) 

140 except KeyError: 

141 log.critical( 

142 "Information missing from column info dictionary: {}", kwargs 

143 ) 

144 raise 

145 

146 @property 

147 def basetype(self) -> str: 

148 """ 

149 Returns the SQL base type, such as ``VARCHAR``. 

150 """ 

151 return self.column_type.split("(")[0].upper() 

152 

153 @property 

154 def querybuilder_type(self) -> str: 

155 """ 

156 Returns a string that is defined in ``querybuilder.js`` and that 

157 defines our field type, like ``"int"`` or ``"date"``. See source. 

158 """ 

159 basetype = self.basetype 

160 if basetype in SQLTYPES_FLOAT_OR_OTHER_NUMERIC: 

161 return QB_DATATYPE_FLOAT 

162 if basetype in SQLTYPES_WITH_DATE: 

163 return QB_DATATYPE_DATE 

164 if basetype in SQLTYPES_TEXT: 

165 if self.indexed_fulltext: 

166 return QB_DATATYPE_STRING_FULLTEXT 

167 else: 

168 return QB_DATATYPE_STRING 

169 if basetype in SQLTYPES_INTEGER_OR_BIT: 

170 return QB_DATATYPE_INTEGER 

171 return QB_DATATYPE_UNKNOWN 

172 

173 @property 

174 def column_id(self) -> ColumnId: 

175 """ 

176 Returns a :class:`crate_anon.common.sql.ColumnId` describing this 

177 column. 

178 """ 

179 return ColumnId( 

180 db=self.table_catalog, 

181 schema=self.table_schema, 

182 table=self.table_name, 

183 column=self.column_name, 

184 ) 

185 

186 @property 

187 def table_id(self) -> TableId: 

188 """ 

189 Returns a :class:`crate_anon.common.sql.TableId` describing this 

190 column's table. 

191 """ 

192 return TableId( 

193 db=self.table_catalog, 

194 schema=self.table_schema, 

195 table=self.table_name, 

196 ) 

197 

198 def __repr__(self) -> str: 

199 return auto_repr(self, sort_attrs=False) 

200 

201 def __str__(self) -> str: 

202 return str(self.column_id) 

203 

204 

205class SingleResearchDatabase: 

206 """ 

207 Represents, and adds information to, a single entry from the 

208 ``RESEARCH_DB_INFO`` list: that is, a research database. (It's a list 

209 because it's ordered.) 

210 """ 

211 

212 def __init__( 

213 self, 

214 index: int, 

215 grammar: SqlGrammar, 

216 rdb_info: "ResearchDatabaseInfo", 

217 connection: BaseDatabaseWrapper, 

218 ) -> None: 

219 """ 

220 Instantiates, reading database information as follows: 

221 

222 - the relevant dictionary is ``settings.RESEARCH_DB_INFO[index]`` 

223 - the keys to that dictionary are in 

224 :class:`crate_anon.crateweb.config.constants.ResearchDbInfoKeys` 

225 

226 Args: 

227 index: 

228 Python zero-based index to ``settings.RESEARCH_DB_INFO`` 

229 grammar: 

230 a :class:`cardinal_pythonlib.sql.sql_grammar.SqlGrammar`, 

231 for the SQL dialect we're using 

232 rdb_info: 

233 a :class:`ResearchDatabaseInfo`, describing the structure of 

234 the research database 

235 connection: 

236 a :class:`django.db.backends.base.base.BaseDatabaseWrapper`, 

237 i.e. a Django database connection. This includes 

238 connection.vendor, the Django database vendor name; see e.g. 

239 https://docs.djangoproject.com/en/2.1/ref/models/options/ 

240 """ 

241 assert 0 <= index <= len(settings.RESEARCH_DB_INFO) 

242 infodict = settings.RESEARCH_DB_INFO[index] 

243 

244 # Don't store self.connection; the Django cache will pickle and it is 

245 # not pickleable. 

246 

247 self.index = index 

248 self.is_first_db = index == 0 

249 self.grammar = grammar 

250 self.rdb_info = rdb_info 

251 

252 try: 

253 self.name = infodict[ResearchDbInfoKeys.NAME] # type: str 

254 self.description = infodict[ 

255 ResearchDbInfoKeys.DESCRIPTION 

256 ] # type: str 

257 self.database = infodict[ResearchDbInfoKeys.DATABASE] # type: str 

258 self.schema_name = infodict[ResearchDbInfoKeys.SCHEMA] # type: str 

259 self.trid_field = infodict[ 

260 ResearchDbInfoKeys.TRID_FIELD 

261 ] # type: str 

262 self.rid_field = infodict[ 

263 ResearchDbInfoKeys.RID_FIELD 

264 ] # type: str 

265 self.rid_family = infodict[ 

266 ResearchDbInfoKeys.RID_FAMILY 

267 ] # type: int 

268 self.mrid_table = infodict[ 

269 ResearchDbInfoKeys.MRID_TABLE 

270 ] # type: str 

271 self.mrid_field = infodict[ 

272 ResearchDbInfoKeys.MRID_FIELD 

273 ] # type: str 

274 except KeyError as e: 

275 raise KeyError( 

276 f"Key {e} is missing from settings.RESEARCH_DB_INFO " 

277 f"for this dict: {infodict!r}" 

278 ) 

279 

280 assert isinstance(self.name, str) and self.name # no blanks 

281 assert re.match(r"^\w+$", self.name), ( 

282 f"Database name {self.name!r} should contain only " 

283 f"alphanumeric/underscore characters" 

284 ) 

285 assert ( 

286 isinstance(self.description, str) and self.description 

287 ) # no blanks 

288 

289 assert isinstance(self.database, str) # may be blank 

290 assert ( 

291 isinstance(self.schema_name, str) and self.schema_name 

292 ) # no blanks 

293 

294 assert isinstance(self.trid_field, str) # may be blank 

295 assert isinstance(self.rid_field, str) and self.rid_field # no blanks 

296 assert isinstance(self.rid_family, int) # may be blank 

297 assert self.rid_family > 0 # positive integers only 

298 assert isinstance(self.mrid_table, str) # may be blank 

299 assert isinstance(self.mrid_field, str) # may be blank 

300 

301 self.pid_pseudo_field = infodict.get( 

302 ResearchDbInfoKeys.PID_PSEUDO_FIELD, "" 

303 ) # type: str 

304 self.mpid_pseudo_field = infodict.get( 

305 ResearchDbInfoKeys.MPID_PSEUDO_FIELD, "" 

306 ) # type: str 

307 assert isinstance( 

308 self.pid_pseudo_field, str 

309 ) # may be blank unless it's a lookup DB 

310 assert isinstance( 

311 self.mpid_pseudo_field, str 

312 ) # may be blank unless it's a lookup DB 

313 

314 self.pid_description = infodict.get( 

315 ResearchDbInfoKeys.PID_DESCRIPTION, 

316 "Patient ID (PID) for database " + self.description, 

317 ) # type: str 

318 self.mpid_description = infodict.get( 

319 ResearchDbInfoKeys.MPID_DESCRIPTION, "Master patient ID (MPID)" 

320 ) # type: str 

321 self.rid_description = infodict.get( 

322 ResearchDbInfoKeys.RID_DESCRIPTION, 

323 "Research ID (RID) for database " + self.description, 

324 ) # type: str 

325 self.mrid_description = infodict.get( 

326 ResearchDbInfoKeys.MRID_DESCRIPTION, "Master research ID (MRID)" 

327 ) # type: str 

328 self.trid_description = infodict.get( 

329 ResearchDbInfoKeys.TRID_DESCRIPTION, 

330 "Transient research ID (TRID) for database " + self.description, 

331 ) # type: str 

332 assert isinstance(self.pid_description, str) 

333 assert isinstance(self.mpid_description, str) 

334 assert isinstance(self.rid_description, str) 

335 assert isinstance(self.mrid_description, str) 

336 assert isinstance(self.trid_description, str) 

337 

338 self.secret_lookup_db = infodict.get( 

339 ResearchDbInfoKeys.SECRET_LOOKUP_DB, "" 

340 ) 

341 assert isinstance(self.secret_lookup_db, str) 

342 if self.secret_lookup_db: 

343 assert self.secret_lookup_db in settings.DATABASES, ( 

344 f"Research database named {self.name!r} in " 

345 f"settings.RESEARCH_DB_INFO has an invalid " 

346 f"secret_lookup_db: {self.secret_lookup_db!r}" 

347 ) 

348 assert re.match(r"^\w+$", self.pid_pseudo_field), ( 

349 f"The research database named {self.name!r} should have a " 

350 f"pid_pseudo_field containing only alphanumeric/underscore " 

351 f"characters (it's {self.pid_pseudo_field!r})" 

352 ) 

353 assert re.match(r"^\w+$", self.mpid_pseudo_field), ( 

354 f"The research database named {self.name!r} should have a " 

355 f"mpid_pseudo_field containing only alphanumeric/underscore " 

356 f"characters (it's {self.mpid_pseudo_field!r})" 

357 ) 

358 

359 self.date_fields_by_table = infodict.get( 

360 ResearchDbInfoKeys.DATE_FIELDS_BY_TABLE, {} 

361 ) # type: Dict[str, str] 

362 assert isinstance(self.date_fields_by_table, dict) 

363 for k, v in self.date_fields_by_table.items(): 

364 assert isinstance(k, str) and k, ( 

365 f"Bad key {k!r} for {ResearchDbInfoKeys.DATE_FIELDS_BY_TABLE} " 

366 f"for database named {self.name!r}" 

367 ) 

368 assert isinstance(v, str) and v, ( 

369 f"Bad value {v!r} for " 

370 f"{ResearchDbInfoKeys.DATE_FIELDS_BY_TABLE} " 

371 f"for database named {self.name!r}" 

372 ) 

373 

374 self.default_date_fields = infodict.get( 

375 ResearchDbInfoKeys.DEFAULT_DATE_FIELDS, [] 

376 ) # type: List[str] 

377 assert isinstance(self.default_date_fields, list) 

378 for v in self.default_date_fields: 

379 assert isinstance(v, str) and v, ( 

380 f"Bad item {v!r} for {ResearchDbInfoKeys.DEFAULT_DATE_FIELDS} " 

381 f"for database named {self.name!r}" 

382 ) 

383 

384 # Field for when the record was last updated in db 

385 self.update_date_field = infodict.get( 

386 ResearchDbInfoKeys.UPDATE_DATE_FIELD, "" 

387 ) 

388 assert isinstance(self.update_date_field, str) 

389 

390 self.schema_id = SchemaId(self.database, self.schema_name) 

391 assert self.schema_id 

392 

393 # Now discover the schema 

394 self._schema_infodictlist = self.get_schema_infodictlist(connection) 

395 self._colinfolist = None # type: Optional[List[ColumnInfo]] 

396 

397 @property 

398 def schema_infodictlist(self) -> List[Dict[str, Any]]: 

399 """ 

400 Discovers the schema. Returns the results of 

401 :meth:`get_schema_infodictlist` for our connection and vendor. 

402 Implements caching. 

403 """ 

404 return self._schema_infodictlist 

405 

406 @property 

407 def colinfolist(self) -> List[ColumnInfo]: 

408 """ 

409 Returns a list of :class:`ColumnInfo` objects for our research 

410 database. 

411 """ 

412 if self._colinfolist is None: 

413 self._colinfolist = [ 

414 ColumnInfo(**d) for d in self.schema_infodictlist 

415 ] 

416 return self._colinfolist 

417 

418 @property 

419 def schema_identifier(self) -> str: 

420 """ 

421 Returns the SQL schema identifier. 

422 """ 

423 return self.schema_id.identifier(self.grammar) 

424 

425 @property 

426 def eligible_for_query_builder(self) -> bool: 

427 """ 

428 Is this database eligible to participate in the CRATE assisted query 

429 builder? 

430 

431 This is ``True`` if it's the first database, or if it can link directly 

432 to the first database (shares a common RID), or if it and the first 

433 database share a common MRID. 

434 """ 

435 if self.is_first_db: 

436 # First one: always eligible 

437 return True 

438 first_db = self.rdb_info.first_dbinfo 

439 return ( 

440 first_db.talks_to_world and self.talks_to_world 

441 ) or self.can_communicate_directly(first_db) 

442 

443 @property 

444 def talks_to_world(self) -> bool: 

445 """ 

446 Does this database "talk to the world", i.e. have an MRID? 

447 """ 

448 return self.has_mrid 

449 

450 @property 

451 def has_mrid(self) -> bool: 

452 """ 

453 Does this database have a defined master research ID (MRID), i.e. a 

454 table with one row per patient and one column that is the MRID? 

455 """ 

456 return bool(self.mrid_table and self.mrid_field) 

457 

458 def can_communicate_directly( 

459 self, other: "SingleResearchDatabase" 

460 ) -> bool: 

461 """ 

462 Can this database "talk" (link, join) to another? 

463 

464 Args: 

465 other: the other :class:`SingleResearchDatabase` 

466 

467 Returns: 

468 ``True`` if they are the same database or share a common RID type 

469 

470 """ 

471 if self.schema_id == other.schema_id: 

472 return True 

473 return self.rid_family == other.rid_family 

474 

475 def get_default_date_field(self, table_id: TableId) -> Optional[ColumnId]: 

476 """ 

477 Gets the default date column for the specified table, or ``None`` if 

478 none exists. 

479 

480 Args: 

481 table_id: a :class:`crate_anon.common.sql.TableId` 

482 

483 Returns: 

484 a :class:`crate_anon.common.sql.ColumnId`, or ``None`` 

485 """ 

486 if table_id.table in self.date_fields_by_table: 

487 # We've been told about a specific date column for this table. 

488 column_id = ColumnId( 

489 db=table_id.db, 

490 schema=table_id.schema, 

491 table=table_id.table, 

492 column=self.date_fields_by_table[table_id.table], 

493 ) 

494 # Now, does it actually exist? 

495 if self.column_present(column_id): 

496 # Yes. 

497 return column_id 

498 # No. 

499 for datecolname in self.default_date_fields: 

500 column_id = ColumnId( 

501 db=table_id.db, 

502 schema=table_id.schema, 

503 table=table_id.table, 

504 column=datecolname, 

505 ) 

506 if self.column_present(column_id): 

507 return column_id 

508 return None 

509 

510 def column_present(self, column_id: ColumnId) -> bool: 

511 """ 

512 Is the named column present in this database? 

513 

514 Args: 

515 column_id: a :class:`crate_anon.common.sql.ColumnId` 

516 """ 

517 for ci in self.colinfolist: 

518 if ci.column_id == column_id: 

519 return True 

520 return False 

521 

522 # ------------------------------------------------------------------------- 

523 # Fetching schema info from the database: internals 

524 # ------------------------------------------------------------------------- 

525 

526 @classmethod 

527 def _schema_query_microsoft( 

528 cls, db_name: str, schema_names: List[str] 

529 ) -> SqlArgsTupleType: 

530 """ 

531 Returns a query to fetch the database structure from an SQL Server 

532 database. 

533 

534 The columns returned are as expected by 

535 :func:`get_schema_infodictlist`. 

536 

537 Args: 

538 db_name: a database name 

539 schema_names: a list of schema names within the database 

540 

541 Returns: 

542 tuple: ``sql, args`` 

543 

544 Notes: 

545 

546 - SQL Server ``INFORMATION_SCHEMA.COLUMNS``: see 

547 https://msdn.microsoft.com/en-us/library/ms188348.aspx 

548 - Re fulltext indexes: 

549 

550 - https://stackoverflow.com/questions/16280918/how-to-find-full-text-indexing-on-database-in-sql-server-2008 

551 - ``sys.fulltext_indexes``: https://msdn.microsoft.com/en-us/library/ms186903.aspx 

552 - ``sys.fulltext_catalogs``: https://msdn.microsoft.com/en-us/library/ms188779.aspx 

553 - ``sys.fulltext_index_columns``: https://msdn.microsoft.com/en-us/library/ms188335.aspx 

554 

555 """ # noqa: E501 

556 if not schema_names: 

557 raise ValueError( 

558 "No schema_names specified (for SQL Server " "database)" 

559 ) 

560 schema_placeholder = ",".join(["?"] * len(schema_names)) 

561 sql = translate_sql_qmark_to_percent( 

562 f""" 

563SELECT 

564 ? AS table_catalog, 

565 d.table_schema, 

566 d.table_name, 

567 d.column_name, 

568 d.is_nullable, 

569 d.column_type, 

570 d.column_comment, 

571 CASE WHEN COUNT(d.index_id) > 0 THEN 1 ELSE 0 END AS indexed, 

572 CASE 

573 WHEN COUNT(d.fulltext_index_object_id) > 0 THEN 1 

574 ELSE 0 

575 END AS indexed_fulltext 

576FROM ( 

577 SELECT 

578 s.name AS table_schema, 

579 ta.name AS table_name, 

580 c.name AS column_name, 

581 c.is_nullable, 

582 UPPER(ty.name) + '(' + CONVERT(VARCHAR(100), c.max_length) + ')' 

583 AS column_type, 

584 CONVERT(VARCHAR(1000), x.value) AS column_comment, 

585 -- x.value is of type SQL_VARIANT 

586 i.index_id, 

587 fi.object_id AS fulltext_index_object_id 

588 FROM [{db_name}].sys.tables ta 

589 INNER JOIN [{db_name}].sys.schemas s ON ta.schema_id = s.schema_id 

590 INNER JOIN [{db_name}].sys.columns c ON c.object_id = ta.object_id 

591 INNER JOIN [{db_name}].sys.types ty ON ty.system_type_id = c.system_type_id 

592 LEFT JOIN [{db_name}].sys.extended_properties x ON ( 

593 x.major_id = c.object_id 

594 AND x.minor_id = c.column_id 

595 ) 

596 LEFT JOIN [{db_name}].sys.index_columns i ON ( 

597 i.object_id = c.object_id 

598 AND i.column_id = c.column_id 

599 ) 

600 LEFT JOIN [{db_name}].sys.fulltext_index_columns fi ON ( 

601 fi.object_id = c.object_id 

602 AND fi.column_id = c.column_id 

603 ) 

604 WHERE s.name IN ({schema_placeholder}) 

605 AND ty.user_type_id = ty.system_type_id 

606 -- restricts to system data types; eliminates 'sysname' type 

607) AS d 

608GROUP BY 

609 table_schema, 

610 table_name, 

611 column_name, 

612 is_nullable, 

613 column_type, 

614 column_comment 

615ORDER BY 

616 table_schema, 

617 table_name, 

618 column_name 

619 """ 

620 ) 

621 args = [db_name] + schema_names 

622 return sql, args 

623 

624 @classmethod 

625 def _schema_query_mysql(cls, db_and_schema_name: str) -> SqlArgsTupleType: 

626 """ 

627 Returns a query to fetch the database structure from a MySQL database. 

628 

629 The columns returned are as expected by 

630 :func:`get_schema_infodictlist`. 

631 

632 Args: 

633 db_and_schema_name: the database (and schema) name 

634 

635 Returns: 

636 tuple: ``sql, args`` 

637 

638 Notes: 

639 

640 - MySQL's ``INFORMATION_SCHEMA.COLUMNS``: see 

641 https://dev.mysql.com/doc/refman/5.7/en/tables-table.html 

642 

643 """ 

644 # --------------------------------------------------------------------- 

645 # Method A. Stupidly slow, e.g. 47s for the query. 

646 # --------------------------------------------------------------------- 

647 # It's the EXISTS stuff that's slow. 

648 # 

649 # sql = translate_sql_qmark_to_percent(""" 

650 # SELECT 

651 # c.table_schema, 

652 # c.table_name, 

653 # c.column_name, 

654 # c.is_nullable, 

655 # c.column_type, /* MySQL: e.g. varchar(32) */ 

656 # c.column_comment, /* MySQL */ 

657 # EXISTS ( 

658 # SELECT * 

659 # FROM information_schema.statistics s 

660 # WHERE s.table_schema = c.table_schema 

661 # AND s.table_name = c.table_name 

662 # AND s.column_name = c.column_name 

663 # ) AS indexed, 

664 # EXISTS ( 

665 # SELECT * 

666 # FROM information_schema.statistics s 

667 # WHERE s.table_schema = c.table_schema 

668 # AND s.table_name = c.table_name 

669 # AND s.column_name = c.column_name 

670 # AND s.index_type LIKE 'FULLTEXT%' 

671 # ) AS indexed_fulltext 

672 # FROM 

673 # information_schema.columns c 

674 # WHERE 

675 # c.table_schema IN ({schema_placeholder}) 

676 # ORDER BY 

677 # c.table_schema, 

678 # c.table_name, 

679 # c.column_name 

680 # """.format( 

681 # schema_placeholder=",".join(["?"] * len(schemas)), 

682 # )) 

683 # args = schemas 

684 

685 # --------------------------------------------------------------------- 

686 # Method B. Much faster, e.g. 0.35s for the same thing. 

687 # --------------------------------------------------------------------- 

688 # http://www.codeproject.com/Articles/33052/Visual-Representation-of-SQL-Joins # noqa: E501 

689 # (Note that EXISTS() above returns 0 or 1.) 

690 # The LEFT JOIN below will produce NULL values for the index 

691 # columns for non-indexed fields. 

692 # However, you can have more than one index on a column, in which 

693 # case the column appears in two rows. 

694 sql = translate_sql_qmark_to_percent( 

695 """ 

696SELECT 

697 '' AS table_catalog, 

698 d.table_schema, 

699 d.table_name, 

700 d.column_name, 

701 d.is_nullable, 

702 d.column_type, 

703 d.column_comment, 

704 d.indexed, 

705 MAX(d.indexed_fulltext) AS indexed_fulltext 

706FROM ( 

707 SELECT 

708 -- c.table_catalog, -- will always be 'def' 

709 c.table_schema, 

710 c.table_name, 

711 c.column_name, 

712 c.is_nullable, 

713 c.column_type, /* MySQL: e.g. varchar(32) */ 

714 c.column_comment, /* MySQL */ 

715 /* s.index_name, */ 

716 /* s.index_type, */ 

717 IF(s.index_type IS NOT NULL, 1, 0) AS indexed, 

718 IF(s.index_type LIKE 'FULLTEXT%', 1, 0) AS indexed_fulltext 

719 FROM 

720 information_schema.columns c 

721 LEFT JOIN information_schema.statistics s 

722 ON ( 

723 c.table_schema = s.table_schema 

724 AND c.table_name = s.table_name 

725 AND c.column_name = s.column_name 

726 ) 

727 WHERE 

728 c.table_schema = ? 

729) AS d /* "Every derived table must have its own alias" */ 

730GROUP BY 

731 table_catalog, 

732 table_schema, 

733 table_name, 

734 column_name, 

735 is_nullable, 

736 column_type, 

737 column_comment, 

738 indexed 

739ORDER BY 

740 table_catalog, 

741 table_schema, 

742 table_name, 

743 column_name 

744 """ 

745 ) 

746 args = [db_and_schema_name] 

747 return sql, args 

748 

749 @classmethod 

750 def _schema_query_postgres( 

751 cls, schema_names: List[str] 

752 ) -> SqlArgsTupleType: 

753 """ 

754 Returns a query to fetch the database structure from an SQL Server 

755 database. 

756 

757 The columns returned are as expected by 

758 :func:`get_schema_infodictlist`. 

759 

760 Args: 

761 schema_names: a list of schema names within the database 

762 

763 Returns: 

764 tuple: ``sql, args`` 

765 

766 Notes: 

767 

768 - A PostgreSQL connection is always to a single database; see 

769 https://stackoverflow.com/questions/10335561/use-database-name-command-in-postgresql 

770 - https://dba.stackexchange.com/questions/75015 

771 - https://stackoverflow.com/questions/14713774 

772 - Note that creating a GIN index looks like: 

773 

774 .. code-block:: sql 

775 

776 ALTER TABLE t ADD COLUMN tsv_mytext TSVECTOR; 

777 UPDATE t SET tsv_mytext = to_tsvector(mytext); 

778 CREATE INDEX idx_t_mytext_gin ON t USING GIN(tsv_mytext); 

779 

780 - PostgreSQL ``INFORMATION_SCHEMA.COLUMNS``: see 

781 https://www.postgresql.org/docs/9.1/static/infoschema-columns.html 

782 

783 """ 

784 if not schema_names: 

785 raise ValueError( 

786 "No schema_names specified (for PostgreSQL " "database)" 

787 ) 

788 schema_placeholder = ",".join(["?"] * len(schema_names)) 

789 sql = translate_sql_qmark_to_percent( 

790 f""" 

791SELECT 

792 '' AS table_catalog, 

793 d.table_schema, 

794 d.table_name, 

795 d.column_name, 

796 d.is_nullable, 

797 d.column_type, 

798 d.column_comment, 

799 CASE WHEN COUNT(d.indrelid) > 0 THEN 1 ELSE 0 END AS indexed, 

800 MAX(d.indexed_fulltext) AS indexed_fulltext 

801FROM ( 

802 SELECT 

803 -- c.table_catalog, -- will always be the connection's database name 

804 c.table_schema, 

805 c.table_name, 

806 c.column_name, 

807 a.attnum as column_seq_num, 

808 c.is_nullable, 

809 pg_catalog.format_type(a.atttypid, a.atttypmod) as column_type, 

810 pgd.description AS column_comment, 

811 i.indrelid, 

812 CASE 

813 WHEN pg_get_indexdef(indexrelid) ~ 'USING (gin |gist )' THEN 1 

814 ELSE 0 

815 END AS indexed_fulltext 

816 FROM pg_catalog.pg_statio_all_tables AS t 

817 INNER JOIN information_schema.columns c ON ( 

818 c.table_schema = t.schemaname 

819 AND c.table_name = t.relname 

820 ) 

821 INNER JOIN pg_catalog.pg_attribute a ON ( -- one row per column 

822 a.attrelid = t.relid 

823 AND a.attname = c.column_name 

824 ) 

825 LEFT JOIN pg_catalog.pg_index AS i ON ( 

826 i.indrelid = t.relid -- match on table 

827 AND i.indkey[0] = a.attnum -- match on column sequence number 

828 AND i.indnatts = 1 -- one column in the index 

829 ) 

830 LEFT JOIN pg_catalog.pg_description pgd ON ( 

831 pgd.objoid = t.relid 

832 AND pgd.objsubid = c.ordinal_position 

833 ) 

834 WHERE t.schemaname IN ({schema_placeholder}) 

835) AS d 

836GROUP BY 

837 table_catalog, 

838 table_schema, 

839 table_name, 

840 column_name, 

841 is_nullable, 

842 column_type, 

843 column_comment 

844ORDER BY 

845 table_catalog, 

846 table_schema, 

847 table_name, 

848 column_name 

849 """ 

850 ) 

851 args = schema_names 

852 return sql, args 

853 

854 @classmethod 

855 def _schema_query_sqlite_as_infodictlist( 

856 cls, connection: BaseDatabaseWrapper, debug: bool = False 

857 ) -> List[Dict[str, Any]]: 

858 """ 

859 Queries an SQLite databases and returns columns as expected by 

860 :func:`get_schema_infodictlist`. 

861 """ 

862 # 1. Catalogue tables. 

863 # pragma table_info(sqlite_master); 

864 empty_args = [] 

865 sql_get_tables = """ 

866 SELECT tbl_name AS tablename 

867 FROM sqlite_master 

868 WHERE type='table' 

869 """ 

870 table_info_rows = cls._exec_sql_query( 

871 connection, (sql_get_tables, empty_args), debug=debug 

872 ) 

873 table_names = [row["tablename"] for row in table_info_rows] 

874 

875 # 2. Catalogue each tables 

876 results = [] # type: List[Dict[str, Any]] 

877 for table_name in table_names: 

878 # A "PRAGMA table_info()" call doesn't work with arguments. 

879 sql_inspect_table = f"PRAGMA table_info({table_name})" 

880 column_info_rows = cls._exec_sql_query( 

881 connection, (sql_inspect_table, empty_args), debug=debug 

882 ) 

883 for ci in column_info_rows: 

884 results.append( 

885 dict( 

886 table_catalog="", 

887 table_schema="", 

888 table_name=table_name, 

889 column_name=ci["name"], 

890 is_nullable=1 - ci["notnull"], 

891 column_type=ci["type"], 

892 column_comment="", 

893 indexed=0, 

894 indexed_fulltext=0, 

895 ) 

896 ) 

897 # Ignored: 

898 # - "cid" (column ID) 

899 # - "dflt_value" 

900 # - "pk" 

901 return results 

902 

903 @classmethod 

904 def _exec_sql_query( 

905 cls, 

906 connection: BaseDatabaseWrapper, 

907 sql_args: SqlArgsTupleType, 

908 debug: bool = False, 

909 ) -> List[Dict[str, Any]]: 

910 """ 

911 Used by get_schema_infodictlist() as a common function to translate an 

912 sql/args pair into the desired results. But it does that because the 

913 incoming SQL has the right column names; the function is more generic 

914 and just runs a query. 

915 

916 Args: 

917 connection: 

918 a :class:`django.db.backends.base.base.BaseDatabaseWrapper`, 

919 i.e. a Django database connection 

920 sql_args: 

921 tuple of SQL and arguments 

922 debug: 

923 be verbose to the log? 

924 

925 Returns: 

926 A list of dictionaries, each mapping column names to values. 

927 The dictionaries are suitable for use as ``**kwargs`` to 

928 :class:`ColumnInfo`. 

929 """ 

930 # We execute this one directly, rather than using the Query class, 

931 # since this is a system rather than a per-user query. 

932 sql, args = sql_args 

933 cursor = connection.cursor() 

934 if debug: 

935 log.debug(f"- sql = {sql}\n- args = {args!r}") 

936 cursor.execute(sql, args) 

937 # Re passing multiple values to SQL via args: 

938 # - Don't circumvent the parameter protection against SQL injection. 

939 # - Too much hassle to use Django's ORM model here, though that would 

940 # also be possible. 

941 # - https://stackoverflow.com/questions/907806 

942 # - Similarly via SQLAlchemy reflection/inspection. 

943 results = dictfetchall(cursor) # list of OrderedDicts 

944 if debug: 

945 log.debug(f"results = {results!r}") 

946 log.debug("... done") 

947 return results 

948 

949 # ------------------------------------------------------------------------- 

950 # Fetching schema info from the database: main (still internal) interface 

951 # ------------------------------------------------------------------------- 

952 

953 def get_schema_infodictlist( 

954 self, connection: BaseDatabaseWrapper, debug: bool = False 

955 ) -> List[Dict[str, Any]]: 

956 """ 

957 Fetch structure information for a specific database, by asking the 

958 database. 

959 

960 Args: 

961 connection: 

962 a :class:`django.db.backends.base.base.BaseDatabaseWrapper`, 

963 i.e. a Django database connection 

964 debug: 

965 be verbose to the log? 

966 

967 Returns: 

968 A list of dictionaries, each mapping column names to values. 

969 The dictionaries are suitable for use as ``**kwargs`` to 

970 :class:`ColumnInfo`. 

971 

972 """ 

973 db_name = self.database 

974 schema_name = self.schema_name 

975 log.info( 

976 f"Fetching/caching database structure (for database " 

977 f"{db_name!r}, schema {schema_name!r})..." 

978 ) 

979 # The db/schema names are guaranteed to be strings by __init__(). 

980 if connection.vendor == ConnectionVendors.MICROSOFT: 

981 if not db_name: 

982 raise ValueError(f"{db_name=!r}; required for MSSQL") 

983 if not schema_name: 

984 raise ValueError(f"{schema_name=!r}; required for MSSQL") 

985 results = self._exec_sql_query( 

986 connection, 

987 sql_args=self._schema_query_microsoft(db_name, [schema_name]), 

988 debug=debug, 

989 ) 

990 elif connection.vendor == ConnectionVendors.POSTGRESQL: 

991 if db_name: 

992 raise ValueError(f"{db_name=!r}; must be '' for PostgreSQL") 

993 if not schema_name: 

994 raise ValueError(f"{schema_name=!r}; required for PostgreSQL") 

995 results = self._exec_sql_query( 

996 connection, 

997 sql_args=self._schema_query_postgres([schema_name]), 

998 debug=debug, 

999 ) 

1000 elif connection.vendor == ConnectionVendors.MYSQL: 

1001 if db_name: 

1002 raise ValueError(f"{db_name=!r}; must be '' for MySQL") 

1003 if not schema_name: 

1004 raise ValueError(f"{schema_name=!r}; required for MySQL") 

1005 results = self._exec_sql_query( 

1006 connection, 

1007 sql_args=self._schema_query_mysql( 

1008 db_and_schema_name=schema_name 

1009 ), 

1010 debug=debug, 

1011 ) 

1012 elif connection.vendor == ConnectionVendors.SQLITE: 

1013 # db_name: don't care? 

1014 # schema_name: don't care? 

1015 # This one can't be done as a single query; the following function 

1016 # builds up the information by querying a list of tables, then each 

1017 # table. 

1018 results = self._schema_query_sqlite_as_infodictlist( 

1019 connection, debug=debug 

1020 ) 

1021 else: 

1022 raise ValueError( 

1023 f"Don't know how to get metadata for " 

1024 f"{connection.vendor=!r}" 

1025 ) 

1026 

1027 if not results: 

1028 log.warning( 

1029 f"SingleResearchDatabase.get_schema_infodictlist(): no " 

1030 f"results for database/schema {self.schema_identifier!r} " 

1031 f"database - misconfigured?" 

1032 ) 

1033 return results 

1034 

1035 

1036@register_for_json(method=METHOD_NO_ARGS) 

1037class ResearchDatabaseInfo: 

1038 """ 

1039 Fetches schema information from the research databases. There can be 

1040 several, and this class represents the whole collection. 

1041 

1042 Notes: 

1043 

1044 - Class primarily exists to be able to use ``@cached_property``. 

1045 

1046 - ... replaced by ``@lru_cache`` 

1047 - ... replaced by ``@django_cache_function`` 

1048 """ 

1049 

1050 # We fetch the dialect at first request; this enables us to import the 

1051 # class without Django configured. 

1052 

1053 def __init__(self, running_without_config: bool = False) -> None: 

1054 self.dbinfolist = [] # type: List[SingleResearchDatabase] 

1055 

1056 if running_without_config: 

1057 self.dialect = "" 

1058 self.grammar = None # type: Optional[SqlGrammar] 

1059 self.dbinfo_for_contact_lookup = ( 

1060 None 

1061 ) # type: Optional[SingleResearchDatabase] 

1062 

1063 else: 

1064 self.dialect = settings.RESEARCH_DB_DIALECT # type: str 

1065 assert ( 

1066 self.dialect in SUPPORTED_DIALECTS 

1067 ), f"Unsupported dialect: {self.dialect!r}" 

1068 

1069 self.grammar = make_grammar(self.dialect) # not expensive 

1070 

1071 connection = self._connection() 

1072 

1073 for index in range(len(settings.RESEARCH_DB_INFO)): 

1074 self.dbinfolist.append( 

1075 SingleResearchDatabase( 

1076 index=index, 

1077 grammar=self.grammar, 

1078 rdb_info=self, 

1079 connection=connection, 

1080 ) 

1081 ) 

1082 assert ( 

1083 len(self.dbinfolist) > 0 

1084 ), "No research databases configured in RESEARCH_DB_INFO" 

1085 names = [x.name for x in self.dbinfolist] 

1086 assert len(names) == len( 

1087 set(names) 

1088 ), f"Duplicate database names in {names!r}" 

1089 

1090 try: 

1091 self.dbinfo_for_contact_lookup = self.get_dbinfo_by_name( 

1092 settings.RESEARCH_DB_FOR_CONTACT_LOOKUP 

1093 ) 

1094 except ValueError: 

1095 raise ValueError( 

1096 f"In your settings, RESEARCH_DB_FOR_CONTACT_LOOKUP " 

1097 f"specifies {settings.RESEARCH_DB_FOR_CONTACT_LOOKUP!r} " 

1098 f"but there is no database with that name in " 

1099 f"RESEARCH_DB_INFO" 

1100 ) 

1101 assert self.dbinfo_for_contact_lookup.secret_lookup_db, ( 

1102 f"Research database {self.dbinfo_for_contact_lookup.name!r} " 

1103 f"is set as your RESEARCH_DB_FOR_CONTACT_LOOKUP but has no " 

1104 f"{ResearchDbInfoKeys.SECRET_LOOKUP_DB!r} attribute" 

1105 ) 

1106 

1107 self.nlp_sourcedb_map = getattr( 

1108 settings, SettingsKeys.NLP_SOURCEDB_MAP, {} 

1109 ) # type: Dict[str, str] 

1110 try: 

1111 assert isinstance(self.nlp_sourcedb_map, dict) 

1112 for k, v in self.nlp_sourcedb_map.items(): 

1113 assert isinstance(k, str) 

1114 assert isinstance(v, str) 

1115 except AssertionError: 

1116 raise ValueError( 

1117 "settings.NLP_SOURCEDB_MAP is not a Dict[str, str]" 

1118 ) 

1119 

1120 # ------------------------------------------------------------------------- 

1121 # Classmethods, staticmethods 

1122 # ------------------------------------------------------------------------- 

1123 

1124 @classmethod 

1125 def _connection(cls) -> BaseDatabaseWrapper: 

1126 """ 

1127 Returns the Django connection to the research database(s), from 

1128 ``connections[RESEARCH_DB_CONNECTION_NAME]``, meaning 

1129 ``connections['research']``. 

1130 

1131 **This must be a read-only connection, enforced by the database. 

1132 Researchers will be allowed to execute unrestricted SQL via this 

1133 connection.** 

1134 

1135 """ 

1136 return connections[RESEARCH_DB_CONNECTION_NAME] 

1137 

1138 @classmethod 

1139 def uses_database_level(cls) -> bool: 

1140 """ 

1141 Does the database simultaneously offer a "database" level above its 

1142 "schema" level? 

1143 """ 

1144 return cls._offers_db_above_schema(cls._connection()) 

1145 

1146 @classmethod 

1147 def format_db_schema(cls, db: str, schema: str) -> str: 

1148 if cls.uses_database_level(): 

1149 return f"{db}.{schema}" 

1150 else: 

1151 return schema 

1152 

1153 @staticmethod 

1154 def _offers_db_above_schema(connection: BaseDatabaseWrapper) -> bool: 

1155 """ 

1156 Does the database simultaneously offer a "database" level above its 

1157 "schema" level? 

1158 

1159 - True for Microsoft SQL Server 

1160 - False for MySQL (in which "database" and "schema" are synonymous) 

1161 - False for PostgreSQL (in which a connection can only talk to one 

1162 database at once, though there can be many schemas within each 

1163 database). 

1164 

1165 Args: 

1166 connection: 

1167 a :class:`django.db.backends.base.base.BaseDatabaseWrapper`, 

1168 i.e. a Django database connection 

1169 """ 

1170 return connection.vendor in [ConnectionVendors.MICROSOFT] 

1171 

1172 # ------------------------------------------------------------------------- 

1173 # Whole-database/schema information 

1174 # ------------------------------------------------------------------------- 

1175 

1176 @property 

1177 def single_research_db(self) -> bool: 

1178 """ 

1179 Do we have only a single research database? 

1180 """ 

1181 return len(self.dbinfolist) == 1 

1182 

1183 @property 

1184 def single_research_db_with_secret_map(self) -> bool: 

1185 """ 

1186 Do we have only one database that has an associated secret lookup 

1187 database to patient IDs? 

1188 """ 

1189 return len(self.dbs_with_secret_map) == 1 

1190 

1191 @property 

1192 def dbs_with_secret_map(self) -> List[SingleResearchDatabase]: 

1193 """ 

1194 Which of our databases has an associated secret lookup database to 

1195 patient IDs? 

1196 

1197 Returns: 

1198 a list of :class:`SingleResearchDatabase` objects 

1199 """ 

1200 return [db for db in self.dbinfolist if db.secret_lookup_db] 

1201 

1202 def _get_dbinfo_by_index(self, index: int) -> SingleResearchDatabase: 

1203 """ 

1204 Returns a :class:`SingleResearchDatabase` by its zero-based index, or 

1205 raise an exception if the index is bad. 

1206 

1207 Args: 

1208 index: a zero-based index 

1209 

1210 Returns: 

1211 a :class:`SingleResearchDatabase` 

1212 

1213 Raises: 

1214 :exc:`IndexError` if the index was bad 

1215 """ 

1216 return self.dbinfolist[index] 

1217 

1218 def get_dbinfo_by_name(self, name: str) -> SingleResearchDatabase: 

1219 """ 

1220 Returns the research database whose name (from 

1221 ``settings.RESEARCH_DB_INFO``) is ``name``. 

1222 

1223 Args: 

1224 name: the name of a database, as per ``settings.RESEARCH_DB_INFO`` 

1225 

1226 Returns: 

1227 a :class:`SingleResearchDatabase` 

1228 

1229 Raises: 

1230 :exc:`ValueError` if none is found 

1231 

1232 """ 

1233 try: 

1234 return next(x for x in self.dbinfolist if x.name == name) 

1235 except StopIteration: 

1236 raise ValueError(f"No research database named {name!r}") 

1237 

1238 def get_dbinfo_by_schema( 

1239 self, schema_id: SchemaId 

1240 ) -> SingleResearchDatabase: 

1241 """ 

1242 Returns the first database representing the specified schema. 

1243 

1244 Args: 

1245 schema_id: a :class:`crate_anon.common.sql.SchemaId` 

1246 

1247 Returns: 

1248 a :class:`SingleResearchDatabase` 

1249 

1250 Raises: 

1251 :exc:`ValueError` if none is found 

1252 """ 

1253 if schema_id.is_blank(): 

1254 raise ValueError("get_db_info(): error: blank schema provided") 

1255 try: 

1256 return next(x for x in self.dbinfolist if x.schema_id == schema_id) 

1257 except StopIteration: 

1258 raise ValueError( 

1259 f"No such database/schema: " 

1260 f"{schema_id.identifier(self.grammar)!r}" 

1261 ) 

1262 

1263 @property 

1264 def first_dbinfo(self) -> SingleResearchDatabase: 

1265 """ 

1266 Returns the first :class:`SingleResearchDatabase` that we know about. 

1267 """ 

1268 return self._get_dbinfo_by_index(0) 

1269 

1270 @property 

1271 def first_dbinfo_with_secret_map(self) -> Optional[SingleResearchDatabase]: 

1272 """ 

1273 Returns the first :class:`SingleResearchDatabase` that has a secret map 

1274 (an associated secret lookup database to patient IDs), or ``None`` if 

1275 there isn't one. 

1276 """ 

1277 dbs = self.dbs_with_secret_map 

1278 if len(dbs) == 0: 

1279 return None 

1280 return dbs[0] 

1281 

1282 @property 

1283 def researchdb_schemas(self) -> List[SchemaId]: 

1284 """ 

1285 Returns all :class:`crate_anon.common.sql.SchemaId` values for our 

1286 databases (one per database). 

1287 """ 

1288 return [x.schema_id for x in self.dbinfolist] 

1289 

1290 def get_default_database_name(self) -> str: 

1291 """ 

1292 Returns the default "database name" for our dialect. 

1293 

1294 - For Microsoft SQL Server, this is 

1295 ``settings.DATABASES['research']['NAME']`` 

1296 - For MySQL, this is blank 

1297 - For PostgreSQL, this is blank 

1298 """ 

1299 dialect = self.dialect 

1300 if dialect == SqlaDialectName.MSSQL: 

1301 return settings.DATABASES[RESEARCH_DB_CONNECTION_NAME]["NAME"] 

1302 elif dialect == SqlaDialectName.POSTGRES: 

1303 return "" 

1304 elif dialect == SqlaDialectName.MYSQL: 

1305 return "" 

1306 else: 

1307 raise ValueError("Bad settings.RESEARCH_DB_DIALECT") 

1308 

1309 def get_default_schema_name(self) -> str: 

1310 """ 

1311 Returns the default "schema name" for our dialect. 

1312 

1313 - For Microsoft SQL Server, this is ``'dbo'`` 

1314 - For PostgreSQL, this is ``'public'`` 

1315 - For MySQL, this is ``settings.DATABASES['research']['NAME']`` 

1316 

1317 """ 

1318 dialect = self.dialect 

1319 if dialect == SqlaDialectName.MSSQL: 

1320 return MSSQL_DEFAULT_SCHEMA 

1321 elif dialect == SqlaDialectName.POSTGRES: 

1322 return POSTGRES_DEFAULT_SCHEMA 

1323 elif dialect == SqlaDialectName.MYSQL: 

1324 return settings.DATABASES[RESEARCH_DB_CONNECTION_NAME]["NAME"] 

1325 else: 

1326 raise ValueError("Bad settings.RESEARCH_DB_DIALECT") 

1327 

1328 # ------------------------------------------------------------------------- 

1329 # Database-wide fields and descriptions 

1330 # ------------------------------------------------------------------------- 

1331 

1332 def get_rid_column(self, table: TableId) -> ColumnId: 

1333 """ 

1334 Returns the RID column in the specified table (which may or may not 

1335 exist). 

1336 

1337 Args: 

1338 table: a :class:`crate_anon.common.sql.TableId` 

1339 

1340 Returns: 

1341 a :class:`crate_anon.common.sql.ColumnId`, which may be blank 

1342 

1343 """ 

1344 dbinfo = self.get_dbinfo_by_schema(table.schema_id) 

1345 return table.column_id(dbinfo.rid_field) 

1346 

1347 def get_trid_column(self, table: TableId) -> ColumnId: 

1348 """ 

1349 Returns the TRID column in the specified table (which may or may not 

1350 exist). 

1351 

1352 Args: 

1353 table: a :class:`crate_anon.common.sql.TableId` 

1354 

1355 Returns: 

1356 a :class:`crate_anon.common.sql.ColumnId`, which may be blank 

1357 

1358 """ 

1359 dbinfo = self.get_dbinfo_by_schema(table.schema_id) 

1360 return table.column_id(dbinfo.trid_field) 

1361 

1362 def get_mrid_column_from_schema(self, schema: SchemaId) -> ColumnId: 

1363 """ 

1364 Returns the MRID column in the MRID master table for a given 

1365 schema/database. 

1366 

1367 Args: 

1368 schema: a :class:`crate_anon.common.sql.SchemaId` 

1369 

1370 Returns: 

1371 a :class:`crate_anon.common.sql.ColumnId`, which may be blank 

1372 

1373 """ 

1374 dbinfo = self.get_dbinfo_by_schema(schema) 

1375 return schema.column_id( 

1376 table=dbinfo.mrid_table, column=dbinfo.mrid_field 

1377 ) 

1378 

1379 def get_mrid_column_from_table(self, table: TableId) -> ColumnId: 

1380 """ 

1381 Returns the MRID column in the specified table (which may or may not 

1382 exist). 

1383 

1384 Args: 

1385 table: a :class:`crate_anon.common.sql.TableId`; this should be the 

1386 MRID master table 

1387 

1388 Returns: 

1389 a :class:`crate_anon.common.sql.ColumnId`, which may be blank 

1390 

1391 """ 

1392 return self.get_mrid_column_from_schema(table.schema_id) 

1393 

1394 def get_linked_mrid_column(self, table: TableId) -> Optional[ColumnId]: 

1395 """ 

1396 Returns either (a) the MRID column in the schema containing the table 

1397 specified, or (b) one that can be linked to it automatically. 

1398 

1399 Args: 

1400 table: a :class:`crate_anon.common.sql.TableId` 

1401 

1402 Returns: 

1403 a :class:`crate_anon.common.sql.ColumnId`, or ``None`` 

1404 

1405 """ 

1406 mrid_in_same_db = self.get_mrid_column_from_table(table) 

1407 if mrid_in_same_db: 

1408 return mrid_in_same_db 

1409 # OK. So our table isn't from a database with an MRID table, but it 

1410 # might be linked to one. 

1411 table_db = self.get_dbinfo_by_schema(table.schema_id) 

1412 first_db = self.first_dbinfo 

1413 if not first_db.talks_to_world: 

1414 return None 

1415 if table_db.talks_to_world or table_db.can_communicate_directly( 

1416 first_db 

1417 ): 

1418 return self.get_mrid_column_from_schema(first_db.schema_id) 

1419 

1420 def get_default_date_column(self, table: TableId) -> Optional[ColumnId]: 

1421 """ 

1422 Returns the default date column in the specified table (which may or 

1423 may not exist). 

1424 

1425 Args: 

1426 table: a :class:`crate_anon.common.sql.TableId` 

1427 

1428 Returns: 

1429 a :class:`crate_anon.common.sql.ColumnId`, which may be blank 

1430 

1431 """ 

1432 dbinfo = self.get_dbinfo_by_schema(table.schema_id) 

1433 return dbinfo.get_default_date_field(table) 

1434 

1435 # ------------------------------------------------------------------------- 

1436 # Table/column information 

1437 # ------------------------------------------------------------------------- 

1438 

1439 @django_cache_function(timeout=None) 

1440 def get_schema_infodictlist(self) -> List[Dict[str, Any]]: 

1441 """ 

1442 Get details of all columns in all research databases. 

1443 

1444 Returns: 

1445 list: across all databases, of all dictionaries provided by their 

1446 :meth:`SingleResearchDatabase.get_schema_infodictlist` function 

1447 """ 

1448 results = [] # type: List[Dict[str, Any]] 

1449 for dbinfo in self.dbinfolist: 

1450 results.extend(dbinfo.schema_infodictlist) 

1451 return results 

1452 

1453 @django_cache_function(timeout=None) 

1454 def get_colinfolist(self) -> List[ColumnInfo]: 

1455 """ 

1456 Get details of all columns in all research databases. 

1457 

1458 Returns: 

1459 list: across all databases, of :class:`ColumnInfo` objects 

1460 """ 

1461 colinfolist = [] # type: List[ColumnInfo] 

1462 for dbi in self.dbinfolist: 

1463 colinfolist.extend(dbi.colinfolist) 

1464 return colinfolist 

1465 

1466 @django_cache_function(timeout=None) 

1467 def get_colinfolist_by_tables(self) -> Dict[TableId, List[ColumnInfo]]: 

1468 """ 

1469 Get details of all columns in all research databases, by table. 

1470 

1471 Returns: 

1472 OrderedDict: across all databases, ``{table_id: columns}`` where 

1473 ``table_id`` is a :class:`crate_anon.common.sql.TableId` and 

1474 ``columns`` is a list of :class:`ColumnInfo` objects 

1475 """ 

1476 colinfolist = self.get_colinfolist() 

1477 table_to_colinfolist = {} # type: Dict[TableId, List[ColumnInfo]] 

1478 for c in colinfolist: 

1479 table_id = c.table_id 

1480 if table_id not in table_to_colinfolist: 

1481 table_to_colinfolist[table_id] = [] # type: List[ColumnInfo] 

1482 table_to_colinfolist[table_id].append(c) 

1483 # noinspection PyTypeChecker 

1484 return OrderedDict(sorted(table_to_colinfolist.items())) 

1485 

1486 @django_cache_function(timeout=None) 

1487 def get_colinfolist_by_schema(self) -> Dict[SchemaId, List[ColumnInfo]]: 

1488 """ 

1489 Get details of all columns in all research databases, by schema. 

1490 

1491 Returns: 

1492 OrderedDict: across all databases, ``{schema_id: columns}`` where 

1493 ``schema_id`` is a :class:`crate_anon.common.sql.SchemaId` and 

1494 ``columns`` is a list of :class:`ColumnInfo` objects 

1495 """ 

1496 colinfolist = self.get_colinfolist() 

1497 schema_to_colinfolist = {} # type: Dict[SchemaId, List[ColumnInfo]] 

1498 for c in colinfolist: 

1499 table_id = c.table_id 

1500 schema = table_id.schema_id 

1501 if schema not in schema_to_colinfolist: 

1502 schema_to_colinfolist[schema] = [] # type: List[ColumnInfo] 

1503 schema_to_colinfolist[schema].append(c) 

1504 # noinspection PyTypeChecker 

1505 return OrderedDict(sorted(schema_to_colinfolist.items())) 

1506 

1507 def tables_containing_field(self, fieldname: str) -> List[TableId]: 

1508 """ 

1509 Returns all tables containing the column (field) with the specified 

1510 name. 

1511 

1512 Args: 

1513 fieldname: field (column) name 

1514 

1515 Returns: 

1516 a list of :class:`crate_anon.common.sql.TableId` objects 

1517 

1518 - We won't use a ``SELECT`` on ``INFORMATION_SCHEMA`` here, since we 

1519 already have the information. 

1520 

1521 """ 

1522 columns = self.get_colinfolist() 

1523 results = [] # type: List[TableId] 

1524 for column in columns: 

1525 if column.column_name == fieldname: 

1526 table_id = column.table_id 

1527 if table_id not in results: 

1528 results.append(table_id) 

1529 return results 

1530 

1531 def text_columns( 

1532 self, table_id: TableId, min_length: int = 1 

1533 ) -> List[ColumnInfo]: 

1534 """ 

1535 Returns all text columns from the specified table. 

1536 

1537 Args: 

1538 table_id: a :class:`crate_anon.common.sql.TableId` 

1539 min_length: the minimum SQL text length to include the column 

1540 

1541 Returns: 

1542 a list of :class:`crate_anon.common.sql.ColumnInfo` objects 

1543 

1544 """ 

1545 results = [] # type: List[ColumnInfo] 

1546 for column in self.get_colinfolist(): 

1547 if column.table_id != table_id: 

1548 continue 

1549 if not is_sql_column_type_textual(column.column_type, min_length): 

1550 # log.debug("Skipping {!r}", column) 

1551 continue 

1552 results.append(column) 

1553 # log.debug("text_columns for {} with min_length={}: [{}]", table_id, 

1554 # min_length, ", ".join(str(x) for x in results)) 

1555 return results 

1556 

1557 @django_cache_function(timeout=None) 

1558 def all_columns(self, table_id: TableId) -> List[ColumnInfo]: 

1559 """ 

1560 Returns all columns from the specified table. 

1561 

1562 Args: 

1563 table_id: a :class:`crate_anon.common.sql.TableId` 

1564 

1565 Returns: 

1566 a list of :class:`crate_anon.common.sql.ColumnInfo` objects 

1567 

1568 """ 

1569 results = [] # type: List[ColumnInfo] 

1570 for column in self.get_colinfolist(): 

1571 if column.table_id != table_id: 

1572 continue 

1573 results.append(column) 

1574 return results 

1575 

1576 def get_tsv(self) -> str: 

1577 """ 

1578 Returns a tab-separated value (TSV) file detailing of all columns in 

1579 all research databases. 

1580 

1581 Returns: 

1582 str: TSV 

1583 

1584 """ 

1585 return dictlist_to_tsv(self.get_schema_infodictlist()) 

1586 

1587 def get_excel(self) -> bytes: 

1588 """ 

1589 Returns an XLSX (Excel) file detailing of all columns in all research 

1590 databases. 

1591 

1592 Returns: 

1593 bytes: binary XLSX file 

1594 

1595 """ 

1596 wb = Workbook() 

1597 wb.remove_sheet(wb.active) # remove the autocreated blank sheet 

1598 schema_colinfolist_dict = self.get_colinfolist_by_schema() 

1599 for schema, colinfolist in schema_colinfolist_dict.items(): 

1600 ws = wb.create_sheet(title=schema.identifier(self.grammar)) 

1601 ws.append( 

1602 [ 

1603 "table_catalog", 

1604 "table_schema", 

1605 "table_name", 

1606 "column_name", 

1607 "is_nullable", 

1608 "column_type", 

1609 "column_comment", 

1610 "indexed", 

1611 "indexed_fulltext", 

1612 "basetype", 

1613 "full_identifier", 

1614 ] 

1615 ) 

1616 for c in colinfolist: # type: ColumnInfo 

1617 ws.append( 

1618 [ 

1619 c.table_catalog, 

1620 c.table_schema, 

1621 c.table_name, 

1622 c.column_name, 

1623 c.is_nullable, 

1624 c.column_type, 

1625 c.column_comment, 

1626 c.indexed, 

1627 c.indexed_fulltext, 

1628 c.basetype, 

1629 c.column_id.identifier(self.grammar), 

1630 ] 

1631 ) 

1632 return excel_to_bytes(wb) 

1633 

1634 @django_cache_function(timeout=None) 

1635 def get_tables(self) -> List[TableId]: 

1636 """ 

1637 Returns all tables in the research database(s). 

1638 

1639 Returns: 

1640 a list of :class:`crate_anon.common.sql.TableId` objects 

1641 """ 

1642 tables = set() # type: Set[TableId] 

1643 for column in self.get_colinfolist(): 

1644 tables.add(column.table_id) 

1645 return sorted(list(tables)) 

1646 

1647 @django_cache_function(timeout=None) 

1648 def table_contains_rid(self, table_id: TableId) -> bool: 

1649 """ 

1650 Does the specified table contain a research ID (RID)? 

1651 

1652 Args: 

1653 table_id: a :class:`crate_anon.common.sql.TableId` 

1654 """ 

1655 target_rid_column = self.get_rid_column(table_id) 

1656 for column in self.get_colinfolist(): 

1657 if column.column_id == target_rid_column: 

1658 return True 

1659 return False 

1660 

1661 def table_contains(self, table_id: TableId, column_id: ColumnId) -> bool: 

1662 """ 

1663 Does the specified table contain the specified column? 

1664 

1665 Args: 

1666 table_id: a :class:`crate_anon.common.sql.TableId` 

1667 column_id: a :class:`crate_anon.common.sql.ColumnId` 

1668 """ 

1669 for c in self.all_columns(table_id): 

1670 if c.column_id == column_id: 

1671 return True 

1672 return False 

1673 

1674 @django_cache_function(timeout=None) 

1675 def get_mrid_linkable_patient_tables(self) -> List[TableId]: 

1676 """ 

1677 Returns all tables in the research database(s) that are linkable on 

1678 MRID (i.e. that contain a RID and live in a database with a RID-to-MRID 

1679 lookup table). 

1680 

1681 Returns: 

1682 a list of :class:`crate_anon.common.sql.TableId` objects 

1683 """ 

1684 eligible_tables = set() # type: Set[TableId] 

1685 for table in self.get_tables(): 

1686 dbinfo = self.get_dbinfo_by_schema(table.schema_id) 

1687 if not dbinfo.has_mrid: 

1688 continue 

1689 if self.table_contains_rid(table): 

1690 eligible_tables.add(table) 

1691 return sorted(list(eligible_tables)) 

1692 

1693 

1694@django_cache_function(timeout=None) 

1695def get_research_db_info() -> ResearchDatabaseInfo: 

1696 return ResearchDatabaseInfo(RUNNING_WITHOUT_CONFIG)