Coverage for crateweb/research/sql_writer.py: 14%

125 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2026-02-05 06:46 -0600

1""" 

2crate_anon/crateweb/research/sql_writer.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Automatically create/manipulate SQL statements based on our extra knowledge 

27of the fields that can be used to link across tables/databases.** 

28 

29""" 

30 

31import logging 

32from typing import List, Optional 

33 

34from cardinal_pythonlib.sql.sql_grammar import ( 

35 format_sql, 

36 SqlGrammar, 

37 text_from_parsed, 

38) 

39from pyparsing import ParseResults 

40 

41from crate_anon.common.sql import ( 

42 ColumnId, 

43 get_first_from_table, 

44 JoinInfo, 

45 parser_add_result_column, 

46 parser_add_from_tables, 

47 set_distinct_within_parsed, 

48 TableId, 

49 WhereCondition, 

50) 

51from crate_anon.crateweb.research.errors import DatabaseStructureNotUnderstood 

52from crate_anon.crateweb.research.research_db_info import ( 

53 get_research_db_info, 

54) 

55 

56log = logging.getLogger(__name__) 

57 

58 

59# ============================================================================= 

60# Automatic SQL generation functions 

61# ============================================================================= 

62 

63 

64def get_join_info( 

65 grammar: SqlGrammar, 

66 parsed: ParseResults, 

67 jointable: TableId, 

68 magic_join: bool = False, 

69 nonmagic_join_type: str = "INNER JOIN", 

70 nonmagic_join_condition: str = "", 

71) -> List[JoinInfo]: 

72 """ 

73 Works out how to join a new table into an existing SQL ``SELECT`` query. 

74 

75 Args: 

76 grammar: 

77 :class:`cardinal_pythonlib.sql.sql_grammar.SqlGrammar` 

78 representing the SQL dialect/grammar in use 

79 parsed: 

80 existing :class:`pyparsing.ParseResults` representing the 

81 ``SELECT`` statement so far 

82 jointable: 

83 :class:`crate_anon.common.sql.TableId` representing the table to be 

84 joined in 

85 magic_join: 

86 perform a "magic join", i.e. join the new table in based on our 

87 knowledge of the research database structure? 

88 nonmagic_join_type: 

89 if ``not magic_join``, this is an SQL string specifying the join 

90 type, e.g. ``"INNER JOIN"`` 

91 nonmagic_join_condition: 

92 if ``not magic_join``, this is an SQL string specifying the join 

93 condition, e.g. ``"ON x = y"`` 

94 

95 Returns: 

96 a list of :class:`crate_anon.common.sql.JoinInfo` objects, e.g. 

97 ``[JoinInfo("tablename", "INNER JOIN", "WHERE somecondition")]``. 

98 

99 Raises: 

100 ``DatabaseStructureNotUnderstood`` if the relevant schema information 

101 cannot be looked up. 

102 

103 Notes: 

104 

105 - ``INNER JOIN`` etc. is part of ANSI SQL 

106 

107 """ 

108 research_database_info = get_research_db_info() 

109 first_from_table = get_first_from_table(parsed) 

110 from_table_in_join_schema = get_first_from_table( 

111 parsed, match_db=jointable.db, match_schema=jointable.schema 

112 ) 

113 exact_match_table = get_first_from_table( 

114 parsed, 

115 match_db=jointable.db, 

116 match_schema=jointable.schema, 

117 match_table=jointable.table, 

118 ) 

119 

120 if not first_from_table: 

121 # No tables in query yet. 

122 # This should not happen; this function is to help with adding 

123 # new FROM tables to existing FROM clauses. 

124 log.warning("get_join_info: no tables in query") 

125 return [] 

126 

127 if exact_match_table: 

128 # This table is already in the query. No JOIN should be required. 

129 # log.debug("get_join_info: same table already in query") 

130 return [] 

131 

132 if not magic_join: 

133 # log.debug("get_join_info: non-magic join") 

134 return [ 

135 JoinInfo( 

136 join_type=nonmagic_join_type, 

137 table=jointable.identifier(grammar), 

138 join_condition=nonmagic_join_condition, 

139 ) 

140 ] 

141 

142 if from_table_in_join_schema: 

143 # Another table from the same database is present. Link on the 

144 # TRID field. 

145 # log.debug("get_join_info: joining to another table in same DB") 

146 return [ 

147 JoinInfo( 

148 join_type="INNER JOIN", 

149 table=jointable.identifier(grammar), 

150 join_condition="ON {new} = {existing}".format( 

151 new=research_database_info.get_trid_column( 

152 jointable 

153 ).identifier(grammar), 

154 existing=research_database_info.get_trid_column( 

155 from_table_in_join_schema 

156 ).identifier(grammar), 

157 ), 

158 ) 

159 ] 

160 

161 # OK. So now we're building a cross-database join. 

162 try: 

163 existing_family = research_database_info.get_dbinfo_by_schema( 

164 first_from_table.schema_id 

165 ).rid_family 

166 new_family = research_database_info.get_dbinfo_by_schema( 

167 jointable.schema_id 

168 ).rid_family 

169 except ValueError: 

170 # Some schema information is absent. This probably means that the user 

171 # has created a custom query and passed it to the query builder; 

172 # alternatively, that the database structure has changed. Either way, 

173 # the query builder won't cope. 

174 raise DatabaseStructureNotUnderstood( 

175 "Some schema information is absent. Likely, either a custom query " 

176 "has passed to the query builder, or the database structure has " 

177 "changed since the query was written." 

178 ) 

179 

180 # log.debug("existing_family={}, new_family={}".format( 

181 # existing_family, new_family)) 

182 if existing_family and existing_family == new_family: 

183 # log.debug("get_join_info: new DB, same RID family") 

184 return [ 

185 JoinInfo( 

186 join_type="INNER JOIN", 

187 table=jointable.identifier(grammar), 

188 join_condition="ON {new} = {existing}".format( 

189 new=research_database_info.get_rid_column( 

190 jointable 

191 ).identifier(grammar), 

192 existing=research_database_info.get_rid_column( 

193 first_from_table 

194 ).identifier(grammar), 

195 ), 

196 ) 

197 ] 

198 

199 # If we get here, we have to do a complicated join via the MRID. 

200 # log.debug("get_join_info: new DB, different RID family, using MRID") 

201 existing_mrid_column = research_database_info.get_mrid_column_from_table( 

202 first_from_table 

203 ) 

204 existing_mrid_table = existing_mrid_column.table_id 

205 if not existing_mrid_table: 

206 raise ValueError( 

207 f"No MRID table available (in the same database as table " 

208 f"{first_from_table}; cannot link)" 

209 ) 

210 new_mrid_column = research_database_info.get_mrid_column_from_table( 

211 jointable 

212 ) 

213 new_mrid_table = new_mrid_column.table_id 

214 existing_mrid_table_in_query = bool( 

215 get_first_from_table( 

216 parsed, 

217 match_db=existing_mrid_table.db, 

218 match_schema=existing_mrid_table.schema, 

219 match_table=existing_mrid_table.table, 

220 ) 

221 ) 

222 

223 joins = [] # type: List[JoinInfo] 

224 if not existing_mrid_table_in_query: 

225 joins.append( 

226 JoinInfo( 

227 join_type="INNER JOIN", 

228 table=existing_mrid_table.identifier(grammar), 

229 join_condition="ON {m1_trid1} = {t1_trid1}".format( 

230 m1_trid1=research_database_info.get_trid_column( 

231 existing_mrid_table 

232 ).identifier(grammar), 

233 t1_trid1=research_database_info.get_trid_column( 

234 first_from_table 

235 ).identifier(grammar), 

236 ), 

237 ) 

238 ) 

239 joins.append( 

240 JoinInfo( 

241 join_type="INNER JOIN", 

242 table=new_mrid_table.identifier(grammar), 

243 join_condition="ON {m2_mrid2} = {m1_mrid1}".format( 

244 m2_mrid2=new_mrid_column.identifier(grammar), 

245 m1_mrid1=existing_mrid_column.identifier(grammar), 

246 ), 

247 ) 

248 ) 

249 if jointable != new_mrid_table: 

250 joins.append( 

251 JoinInfo( 

252 join_type="INNER JOIN", 

253 table=jointable.identifier(grammar), 

254 join_condition="ON {t2_trid2} = {m2_trid2}".format( 

255 t2_trid2=research_database_info.get_trid_column( 

256 jointable 

257 ).identifier(grammar), 

258 m2_trid2=research_database_info.get_trid_column( 

259 new_mrid_table 

260 ).identifier(grammar), 

261 ), 

262 ) 

263 ) 

264 return joins 

265 

266 

267class SelectElement: 

268 """ 

269 Class to represent a result column in an SQL ``SELECT`` statement. 

270 """ 

271 

272 def __init__( 

273 self, 

274 column_id: ColumnId = None, 

275 raw_select: str = "", 

276 from_table_for_raw_select: TableId = None, 

277 alias: str = "", 

278 ): 

279 """ 

280 Args: 

281 column_id: 

282 a :class:`crate_anon.common.sql.ColumnId` object; using this 

283 will automatically add the column's table to the ``FROM`` 

284 clause 

285 raw_select: 

286 as an alternative to ``column_id``, raw SQL for the ``SELECT`` 

287 clause 

288 from_table_for_raw_select: 

289 if ``raw_select`` is used, a 

290 :class:`crate_anon.common.sql.TableId` that should be added to 

291 the ``FROM`` clause 

292 alias: 

293 alias to be used, i.e. for ``SELECT something AS alias`` 

294 """ 

295 self.column_id = column_id 

296 self.raw_select = raw_select 

297 self.from_table_for_raw_select = from_table_for_raw_select 

298 self.alias = alias 

299 

300 def __repr__(self) -> str: 

301 return ( 

302 "<{qualname}(" 

303 "column_id={column_id}, " 

304 "raw_select={raw_select}, " 

305 "from_table_for_raw_select={from_table_for_raw_select}, " 

306 "alias={alias}) " 

307 "at {addr}>".format( 

308 qualname=self.__class__.__qualname__, 

309 column_id=repr(self.column_id), 

310 raw_select=repr(self.raw_select), 

311 from_table_for_raw_select=repr(self.from_table_for_raw_select), 

312 alias=repr(self.alias), 

313 addr=hex(id(self)), 

314 ) 

315 ) 

316 

317 def sql_select_column(self, grammar: SqlGrammar) -> str: 

318 """ 

319 Return the raw SQL for this ``SELECT`` result column. 

320 

321 Args: 

322 grammar: 

323 :class:`cardinal_pythonlib.sql.sql_grammar.SqlGrammar` 

324 representing the SQL dialect/grammar in use 

325 

326 Returns: 

327 str: SQL like ``colname`` or ``expression`` or ``colname AS alias`` 

328 

329 """ 

330 result = self.raw_select or self.column_id.identifier(grammar) 

331 if self.alias: 

332 result += " AS " + self.alias 

333 return result 

334 

335 def from_table(self) -> Optional[TableId]: 

336 """ 

337 Returns details of the table to be added to the ``FROM`` clause of the 

338 ``SELECT`` statement. 

339 

340 Returns: 

341 a :class:`crate_anon.common.sql.TableId`, or ``None`` (if 

342 ``raw_select`` is used and ``from_table_for_raw_select`` was not 

343 specified) 

344 

345 """ 

346 if self.raw_select: 

347 return self.from_table_for_raw_select 

348 return self.column_id.table_id 

349 

350 def from_table_str(self, grammar: SqlGrammar) -> str: 

351 """ 

352 Returns a string form of :meth:`from_table`, i.e. an SQL identifier 

353 for the ``FROM`` clause. 

354 

355 Args: 

356 grammar: 

357 :class:`cardinal_pythonlib.sql.sql_grammar.SqlGrammar` 

358 representing the SQL dialect/grammar in use 

359 

360 Returns: 

361 str: SQL like ``from_table`` 

362 

363 """ 

364 table_id = self.from_table() 

365 if not table_id: 

366 return "" 

367 return table_id.identifier(grammar) 

368 

369 def sql_select_from(self, grammar: SqlGrammar) -> str: 

370 """ 

371 Returns a full ``SELECT... FROM...`` statement. 

372 

373 Args: 

374 grammar: 

375 :class:`cardinal_pythonlib.sql.sql_grammar.SqlGrammar` 

376 representing the SQL dialect/grammar in use 

377 

378 Returns: 

379 str: SQL like ``SELECT colname AS alias FROM from_table`` 

380 

381 """ 

382 sql = "SELECT " + self.sql_select_column(grammar=grammar) 

383 from_table = self.from_table() 

384 if from_table: 

385 sql += " FROM " + from_table.identifier(grammar) 

386 return sql 

387 

388 

389def reparse_select(p: ParseResults, grammar: SqlGrammar) -> ParseResults: 

390 """ 

391 Internal function for when we get desperate trying to hack around 

392 the results of ``pyparsing``'s efforts. 

393 

394 - takes a :class:`pyparsing.ParseResults` 

395 - converts it to an SQL string 

396 - parses the string as a ``SELECT`` statement 

397 - returns the resulting :class:`pyparsing.ParseResults` 

398 """ 

399 return grammar.get_select_statement().parseString( 

400 text_from_parsed(p, formatted=False), parseAll=True 

401 ) 

402 

403 

404def add_to_select( 

405 sql: str, 

406 grammar: SqlGrammar, 

407 select_elements: List[SelectElement] = None, 

408 where_conditions: List[WhereCondition] = None, 

409 # For SELECT: 

410 distinct: bool = None, 

411 # For WHERE: 

412 where_type: str = "AND", 

413 bracket_where: bool = False, 

414 # For either, for JOIN: 

415 magic_join: bool = True, 

416 join_type: str = "NATURAL JOIN", 

417 join_condition: str = "", 

418 # General: 

419 formatted: bool = True, 

420 debug: bool = False, 

421 debug_verbose: bool = False, 

422) -> str: 

423 """ 

424 This function encapsulates our query builder's common operations. 

425 

426 One premise is that SQL parsing is relatively slow, so we should do this 

427 only once. We parse; add bits to the parsed structure as required; then 

428 re-convert to text. 

429 

430 If you specify table/column, elements will be added to ``SELECT`` and 

431 ``FROM`` unless they already exist. 

432 

433 If you specify ``where_expression``, elements will be added to ``WHERE``. 

434 In this situation, you should also specify ``where_table``; if the 

435 ``where_table`` isn't yet in the ``FROM`` clause, this will be added as 

436 well. 

437 

438 Parsing is SLOW, so we should do as much as possible in a single call to 

439 this function. 

440 

441 Args: 

442 sql: 

443 existing SQL statement 

444 grammar: 

445 :class:`cardinal_pythonlib.sql.sql_grammar.SqlGrammar` 

446 representing the SQL dialect/grammar in use 

447 select_elements: 

448 optional list of :class:`SelectElement` objects representing 

449 things to add to the ``SELECT`` clause of the ``SELECT`` statement 

450 (i.e. results columns) 

451 where_conditions: 

452 optional list of :class:`crate_anon.common.sql.WhereCondition` 

453 representing conditions to add to the ``WHERE`` clause of the 

454 ``SELECT`` statement 

455 distinct: 

456 if ``True``, make the ``SELECT`` statement a ``SELECT DISTINCT``; 

457 if ``False``, remove any ``DISTINCT``; if ``None``, leave the 

458 ``DISTINCT`` status as it is. 

459 where_type: 

460 logical operator with which to join multiple parts of the ``WHERE`` 

461 expression, typically ``AND`` (but maybe ``OR``, etc.) 

462 bracket_where: 

463 put brackets ``()`` around each new part of the ``WHERE`` 

464 expression? 

465 magic_join: 

466 perform a "magic join", i.e. join the new table in based on our 

467 knowledge of the research database structure? 

468 join_type: 

469 if ``not magic_join``, this is an SQL string specifying the join 

470 type, e.g. ``"INNER JOIN"`` 

471 join_condition: 

472 if ``not magic_join``, this is an SQL string specifying the join 

473 condition, e.g. ``"ON x = y"`` 

474 formatted: 

475 reformat the SQL to look pretty? 

476 debug: 

477 show debugging information 

478 debug_verbose: 

479 show verbose debugging information 

480 

481 Returns: 

482 str: SQL statement 

483 

484 Raises: 

485 ``DatabaseStructureNotUnderstood`` if the relevant schema information 

486 cannot be looked up. 

487 

488 """ 

489 select_elements = select_elements or [] # type: List[SelectElement] 

490 where_conditions = where_conditions or [] # type: List[WhereCondition] 

491 if debug: 

492 log.info(f"START: {sql}") 

493 log.debug(f"select_elements: {select_elements}") 

494 log.debug(f"where_conditions: {where_conditions}") 

495 log.debug(f"where_type: {where_type}") 

496 log.debug(f"join_type: {join_type}") 

497 log.debug(f"join_condition: {join_condition}") 

498 

499 # ------------------------------------------------------------------------- 

500 # Get going. We have to handle a fresh SQL statement in a slightly 

501 # different way. 

502 # ------------------------------------------------------------------------- 

503 if not sql: 

504 if not select_elements: 

505 raise ValueError( 

506 "Fresh SQL statements must include a SELECT " "element" 

507 ) 

508 # --------------------------------------------------------------------- 

509 # Fresh SQL statement 

510 # --------------------------------------------------------------------- 

511 first_select = select_elements[0] 

512 select_elements = select_elements[1:] 

513 sql = first_select.sql_select_from(grammar) 

514 # log.debug("Starting SQL from scratch as: " + sql) 

515 

516 # ------------------------------------------------------------------------- 

517 # Parse what we have (which is now, at a minimum, SELECT ... FROM ...). 

518 # ------------------------------------------------------------------------- 

519 p = grammar.get_select_statement().parseString(sql, parseAll=True) 

520 if debug and debug_verbose: 

521 log.debug("start dump:\n" + p.dump()) 

522 

523 existing_tables = p.join_source.from_tables.asList() # type: List[str] 

524 new_tables = [] # type: List[TableId] 

525 

526 def add_new_table(_table_id: TableId) -> None: 

527 if ( 

528 _table_id 

529 and _table_id not in new_tables 

530 and _table_id.identifier(grammar) not in existing_tables 

531 ): 

532 new_tables.append(_table_id) 

533 

534 # ------------------------------------------------------------------------- 

535 # DISTINCT? 

536 # ------------------------------------------------------------------------- 

537 if distinct is True: 

538 set_distinct_within_parsed(p, action="set") 

539 elif distinct is False: 

540 set_distinct_within_parsed(p, action="clear") 

541 

542 # ------------------------------------------------------------------------- 

543 # Process all the (other?) SELECT clauses 

544 # ------------------------------------------------------------------------- 

545 for se in select_elements: 

546 p = parser_add_result_column( 

547 p, se.sql_select_column(grammar), grammar=grammar 

548 ) 

549 add_new_table(se.from_table()) 

550 

551 # ------------------------------------------------------------------------- 

552 # Process all the WHERE clauses 

553 # ------------------------------------------------------------------------- 

554 for wc in where_conditions: 

555 where_expression = wc.sql(grammar) 

556 if bracket_where: 

557 where_expression = "(" + where_expression + ")" 

558 

559 # The tricky bit: inserting it. 

560 # We use the [0] to overcome the effects of defining these things 

561 # as a pyparsing Group(), which encapsulates the results in a list. 

562 if p.where_clause: 

563 cond = grammar.get_expr().parseString( 

564 where_expression, parseAll=True 

565 )[0] 

566 extra = [where_type, cond] 

567 p.where_clause.where_expr.extend(extra) 

568 else: 

569 # No WHERE as yet 

570 # Doing this properly is a nightmare. 

571 # It's hard to add a *named* ParseResults element to another. 

572 # So it's very hard to alter p.where_clause.where_expr such that 

573 # we can continue adding more WHERE clauses if we want. 

574 # This is the inefficient, cop-out method: 

575 # (1) Add as plain text 

576 p.where_clause.append("WHERE " + where_expression) 

577 # (2) Reparse... 

578 p = reparse_select(p, grammar=grammar) 

579 

580 add_new_table(wc.table_id) 

581 

582 # ------------------------------------------------------------------------- 

583 # Process all the FROM clauses, autojoining as necessary 

584 # ------------------------------------------------------------------------- 

585 

586 for table_id in new_tables: 

587 p = parser_add_from_tables( 

588 p, 

589 get_join_info( 

590 grammar=grammar, 

591 parsed=p, 

592 jointable=table_id, 

593 magic_join=magic_join, 

594 nonmagic_join_type=join_type, 

595 nonmagic_join_condition=join_condition, 

596 ), # may raise DatabaseStructureNotUnderstood 

597 grammar=grammar, 

598 ) 

599 

600 if debug and debug_verbose: 

601 log.debug("end dump:\n" + p.dump()) 

602 result = text_from_parsed(p, formatted=False) 

603 if formatted: 

604 result = format_sql(result) 

605 if debug: 

606 log.info(f"END: {result}") 

607 return result