Coverage for crateweb/research/sql_writer.py: 14%
125 statements
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
1"""
2crate_anon/crateweb/research/sql_writer.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Automatically create/manipulate SQL statements based on our extra knowledge
27of the fields that can be used to link across tables/databases.**
29"""
31import logging
32from typing import List, Optional
34from cardinal_pythonlib.sql.sql_grammar import (
35 format_sql,
36 SqlGrammar,
37 text_from_parsed,
38)
39from pyparsing import ParseResults
41from crate_anon.common.sql import (
42 ColumnId,
43 get_first_from_table,
44 JoinInfo,
45 parser_add_result_column,
46 parser_add_from_tables,
47 set_distinct_within_parsed,
48 TableId,
49 WhereCondition,
50)
51from crate_anon.crateweb.research.errors import DatabaseStructureNotUnderstood
52from crate_anon.crateweb.research.research_db_info import (
53 get_research_db_info,
54)
56log = logging.getLogger(__name__)
59# =============================================================================
60# Automatic SQL generation functions
61# =============================================================================
64def get_join_info(
65 grammar: SqlGrammar,
66 parsed: ParseResults,
67 jointable: TableId,
68 magic_join: bool = False,
69 nonmagic_join_type: str = "INNER JOIN",
70 nonmagic_join_condition: str = "",
71) -> List[JoinInfo]:
72 """
73 Works out how to join a new table into an existing SQL ``SELECT`` query.
75 Args:
76 grammar:
77 :class:`cardinal_pythonlib.sql.sql_grammar.SqlGrammar`
78 representing the SQL dialect/grammar in use
79 parsed:
80 existing :class:`pyparsing.ParseResults` representing the
81 ``SELECT`` statement so far
82 jointable:
83 :class:`crate_anon.common.sql.TableId` representing the table to be
84 joined in
85 magic_join:
86 perform a "magic join", i.e. join the new table in based on our
87 knowledge of the research database structure?
88 nonmagic_join_type:
89 if ``not magic_join``, this is an SQL string specifying the join
90 type, e.g. ``"INNER JOIN"``
91 nonmagic_join_condition:
92 if ``not magic_join``, this is an SQL string specifying the join
93 condition, e.g. ``"ON x = y"``
95 Returns:
96 a list of :class:`crate_anon.common.sql.JoinInfo` objects, e.g.
97 ``[JoinInfo("tablename", "INNER JOIN", "WHERE somecondition")]``.
99 Raises:
100 ``DatabaseStructureNotUnderstood`` if the relevant schema information
101 cannot be looked up.
103 Notes:
105 - ``INNER JOIN`` etc. is part of ANSI SQL
107 """
108 research_database_info = get_research_db_info()
109 first_from_table = get_first_from_table(parsed)
110 from_table_in_join_schema = get_first_from_table(
111 parsed, match_db=jointable.db, match_schema=jointable.schema
112 )
113 exact_match_table = get_first_from_table(
114 parsed,
115 match_db=jointable.db,
116 match_schema=jointable.schema,
117 match_table=jointable.table,
118 )
120 if not first_from_table:
121 # No tables in query yet.
122 # This should not happen; this function is to help with adding
123 # new FROM tables to existing FROM clauses.
124 log.warning("get_join_info: no tables in query")
125 return []
127 if exact_match_table:
128 # This table is already in the query. No JOIN should be required.
129 # log.debug("get_join_info: same table already in query")
130 return []
132 if not magic_join:
133 # log.debug("get_join_info: non-magic join")
134 return [
135 JoinInfo(
136 join_type=nonmagic_join_type,
137 table=jointable.identifier(grammar),
138 join_condition=nonmagic_join_condition,
139 )
140 ]
142 if from_table_in_join_schema:
143 # Another table from the same database is present. Link on the
144 # TRID field.
145 # log.debug("get_join_info: joining to another table in same DB")
146 return [
147 JoinInfo(
148 join_type="INNER JOIN",
149 table=jointable.identifier(grammar),
150 join_condition="ON {new} = {existing}".format(
151 new=research_database_info.get_trid_column(
152 jointable
153 ).identifier(grammar),
154 existing=research_database_info.get_trid_column(
155 from_table_in_join_schema
156 ).identifier(grammar),
157 ),
158 )
159 ]
161 # OK. So now we're building a cross-database join.
162 try:
163 existing_family = research_database_info.get_dbinfo_by_schema(
164 first_from_table.schema_id
165 ).rid_family
166 new_family = research_database_info.get_dbinfo_by_schema(
167 jointable.schema_id
168 ).rid_family
169 except ValueError:
170 # Some schema information is absent. This probably means that the user
171 # has created a custom query and passed it to the query builder;
172 # alternatively, that the database structure has changed. Either way,
173 # the query builder won't cope.
174 raise DatabaseStructureNotUnderstood(
175 "Some schema information is absent. Likely, either a custom query "
176 "has passed to the query builder, or the database structure has "
177 "changed since the query was written."
178 )
180 # log.debug("existing_family={}, new_family={}".format(
181 # existing_family, new_family))
182 if existing_family and existing_family == new_family:
183 # log.debug("get_join_info: new DB, same RID family")
184 return [
185 JoinInfo(
186 join_type="INNER JOIN",
187 table=jointable.identifier(grammar),
188 join_condition="ON {new} = {existing}".format(
189 new=research_database_info.get_rid_column(
190 jointable
191 ).identifier(grammar),
192 existing=research_database_info.get_rid_column(
193 first_from_table
194 ).identifier(grammar),
195 ),
196 )
197 ]
199 # If we get here, we have to do a complicated join via the MRID.
200 # log.debug("get_join_info: new DB, different RID family, using MRID")
201 existing_mrid_column = research_database_info.get_mrid_column_from_table(
202 first_from_table
203 )
204 existing_mrid_table = existing_mrid_column.table_id
205 if not existing_mrid_table:
206 raise ValueError(
207 f"No MRID table available (in the same database as table "
208 f"{first_from_table}; cannot link)"
209 )
210 new_mrid_column = research_database_info.get_mrid_column_from_table(
211 jointable
212 )
213 new_mrid_table = new_mrid_column.table_id
214 existing_mrid_table_in_query = bool(
215 get_first_from_table(
216 parsed,
217 match_db=existing_mrid_table.db,
218 match_schema=existing_mrid_table.schema,
219 match_table=existing_mrid_table.table,
220 )
221 )
223 joins = [] # type: List[JoinInfo]
224 if not existing_mrid_table_in_query:
225 joins.append(
226 JoinInfo(
227 join_type="INNER JOIN",
228 table=existing_mrid_table.identifier(grammar),
229 join_condition="ON {m1_trid1} = {t1_trid1}".format(
230 m1_trid1=research_database_info.get_trid_column(
231 existing_mrid_table
232 ).identifier(grammar),
233 t1_trid1=research_database_info.get_trid_column(
234 first_from_table
235 ).identifier(grammar),
236 ),
237 )
238 )
239 joins.append(
240 JoinInfo(
241 join_type="INNER JOIN",
242 table=new_mrid_table.identifier(grammar),
243 join_condition="ON {m2_mrid2} = {m1_mrid1}".format(
244 m2_mrid2=new_mrid_column.identifier(grammar),
245 m1_mrid1=existing_mrid_column.identifier(grammar),
246 ),
247 )
248 )
249 if jointable != new_mrid_table:
250 joins.append(
251 JoinInfo(
252 join_type="INNER JOIN",
253 table=jointable.identifier(grammar),
254 join_condition="ON {t2_trid2} = {m2_trid2}".format(
255 t2_trid2=research_database_info.get_trid_column(
256 jointable
257 ).identifier(grammar),
258 m2_trid2=research_database_info.get_trid_column(
259 new_mrid_table
260 ).identifier(grammar),
261 ),
262 )
263 )
264 return joins
267class SelectElement:
268 """
269 Class to represent a result column in an SQL ``SELECT`` statement.
270 """
272 def __init__(
273 self,
274 column_id: ColumnId = None,
275 raw_select: str = "",
276 from_table_for_raw_select: TableId = None,
277 alias: str = "",
278 ):
279 """
280 Args:
281 column_id:
282 a :class:`crate_anon.common.sql.ColumnId` object; using this
283 will automatically add the column's table to the ``FROM``
284 clause
285 raw_select:
286 as an alternative to ``column_id``, raw SQL for the ``SELECT``
287 clause
288 from_table_for_raw_select:
289 if ``raw_select`` is used, a
290 :class:`crate_anon.common.sql.TableId` that should be added to
291 the ``FROM`` clause
292 alias:
293 alias to be used, i.e. for ``SELECT something AS alias``
294 """
295 self.column_id = column_id
296 self.raw_select = raw_select
297 self.from_table_for_raw_select = from_table_for_raw_select
298 self.alias = alias
300 def __repr__(self) -> str:
301 return (
302 "<{qualname}("
303 "column_id={column_id}, "
304 "raw_select={raw_select}, "
305 "from_table_for_raw_select={from_table_for_raw_select}, "
306 "alias={alias}) "
307 "at {addr}>".format(
308 qualname=self.__class__.__qualname__,
309 column_id=repr(self.column_id),
310 raw_select=repr(self.raw_select),
311 from_table_for_raw_select=repr(self.from_table_for_raw_select),
312 alias=repr(self.alias),
313 addr=hex(id(self)),
314 )
315 )
317 def sql_select_column(self, grammar: SqlGrammar) -> str:
318 """
319 Return the raw SQL for this ``SELECT`` result column.
321 Args:
322 grammar:
323 :class:`cardinal_pythonlib.sql.sql_grammar.SqlGrammar`
324 representing the SQL dialect/grammar in use
326 Returns:
327 str: SQL like ``colname`` or ``expression`` or ``colname AS alias``
329 """
330 result = self.raw_select or self.column_id.identifier(grammar)
331 if self.alias:
332 result += " AS " + self.alias
333 return result
335 def from_table(self) -> Optional[TableId]:
336 """
337 Returns details of the table to be added to the ``FROM`` clause of the
338 ``SELECT`` statement.
340 Returns:
341 a :class:`crate_anon.common.sql.TableId`, or ``None`` (if
342 ``raw_select`` is used and ``from_table_for_raw_select`` was not
343 specified)
345 """
346 if self.raw_select:
347 return self.from_table_for_raw_select
348 return self.column_id.table_id
350 def from_table_str(self, grammar: SqlGrammar) -> str:
351 """
352 Returns a string form of :meth:`from_table`, i.e. an SQL identifier
353 for the ``FROM`` clause.
355 Args:
356 grammar:
357 :class:`cardinal_pythonlib.sql.sql_grammar.SqlGrammar`
358 representing the SQL dialect/grammar in use
360 Returns:
361 str: SQL like ``from_table``
363 """
364 table_id = self.from_table()
365 if not table_id:
366 return ""
367 return table_id.identifier(grammar)
369 def sql_select_from(self, grammar: SqlGrammar) -> str:
370 """
371 Returns a full ``SELECT... FROM...`` statement.
373 Args:
374 grammar:
375 :class:`cardinal_pythonlib.sql.sql_grammar.SqlGrammar`
376 representing the SQL dialect/grammar in use
378 Returns:
379 str: SQL like ``SELECT colname AS alias FROM from_table``
381 """
382 sql = "SELECT " + self.sql_select_column(grammar=grammar)
383 from_table = self.from_table()
384 if from_table:
385 sql += " FROM " + from_table.identifier(grammar)
386 return sql
389def reparse_select(p: ParseResults, grammar: SqlGrammar) -> ParseResults:
390 """
391 Internal function for when we get desperate trying to hack around
392 the results of ``pyparsing``'s efforts.
394 - takes a :class:`pyparsing.ParseResults`
395 - converts it to an SQL string
396 - parses the string as a ``SELECT`` statement
397 - returns the resulting :class:`pyparsing.ParseResults`
398 """
399 return grammar.get_select_statement().parseString(
400 text_from_parsed(p, formatted=False), parseAll=True
401 )
404def add_to_select(
405 sql: str,
406 grammar: SqlGrammar,
407 select_elements: List[SelectElement] = None,
408 where_conditions: List[WhereCondition] = None,
409 # For SELECT:
410 distinct: bool = None,
411 # For WHERE:
412 where_type: str = "AND",
413 bracket_where: bool = False,
414 # For either, for JOIN:
415 magic_join: bool = True,
416 join_type: str = "NATURAL JOIN",
417 join_condition: str = "",
418 # General:
419 formatted: bool = True,
420 debug: bool = False,
421 debug_verbose: bool = False,
422) -> str:
423 """
424 This function encapsulates our query builder's common operations.
426 One premise is that SQL parsing is relatively slow, so we should do this
427 only once. We parse; add bits to the parsed structure as required; then
428 re-convert to text.
430 If you specify table/column, elements will be added to ``SELECT`` and
431 ``FROM`` unless they already exist.
433 If you specify ``where_expression``, elements will be added to ``WHERE``.
434 In this situation, you should also specify ``where_table``; if the
435 ``where_table`` isn't yet in the ``FROM`` clause, this will be added as
436 well.
438 Parsing is SLOW, so we should do as much as possible in a single call to
439 this function.
441 Args:
442 sql:
443 existing SQL statement
444 grammar:
445 :class:`cardinal_pythonlib.sql.sql_grammar.SqlGrammar`
446 representing the SQL dialect/grammar in use
447 select_elements:
448 optional list of :class:`SelectElement` objects representing
449 things to add to the ``SELECT`` clause of the ``SELECT`` statement
450 (i.e. results columns)
451 where_conditions:
452 optional list of :class:`crate_anon.common.sql.WhereCondition`
453 representing conditions to add to the ``WHERE`` clause of the
454 ``SELECT`` statement
455 distinct:
456 if ``True``, make the ``SELECT`` statement a ``SELECT DISTINCT``;
457 if ``False``, remove any ``DISTINCT``; if ``None``, leave the
458 ``DISTINCT`` status as it is.
459 where_type:
460 logical operator with which to join multiple parts of the ``WHERE``
461 expression, typically ``AND`` (but maybe ``OR``, etc.)
462 bracket_where:
463 put brackets ``()`` around each new part of the ``WHERE``
464 expression?
465 magic_join:
466 perform a "magic join", i.e. join the new table in based on our
467 knowledge of the research database structure?
468 join_type:
469 if ``not magic_join``, this is an SQL string specifying the join
470 type, e.g. ``"INNER JOIN"``
471 join_condition:
472 if ``not magic_join``, this is an SQL string specifying the join
473 condition, e.g. ``"ON x = y"``
474 formatted:
475 reformat the SQL to look pretty?
476 debug:
477 show debugging information
478 debug_verbose:
479 show verbose debugging information
481 Returns:
482 str: SQL statement
484 Raises:
485 ``DatabaseStructureNotUnderstood`` if the relevant schema information
486 cannot be looked up.
488 """
489 select_elements = select_elements or [] # type: List[SelectElement]
490 where_conditions = where_conditions or [] # type: List[WhereCondition]
491 if debug:
492 log.info(f"START: {sql}")
493 log.debug(f"select_elements: {select_elements}")
494 log.debug(f"where_conditions: {where_conditions}")
495 log.debug(f"where_type: {where_type}")
496 log.debug(f"join_type: {join_type}")
497 log.debug(f"join_condition: {join_condition}")
499 # -------------------------------------------------------------------------
500 # Get going. We have to handle a fresh SQL statement in a slightly
501 # different way.
502 # -------------------------------------------------------------------------
503 if not sql:
504 if not select_elements:
505 raise ValueError(
506 "Fresh SQL statements must include a SELECT " "element"
507 )
508 # ---------------------------------------------------------------------
509 # Fresh SQL statement
510 # ---------------------------------------------------------------------
511 first_select = select_elements[0]
512 select_elements = select_elements[1:]
513 sql = first_select.sql_select_from(grammar)
514 # log.debug("Starting SQL from scratch as: " + sql)
516 # -------------------------------------------------------------------------
517 # Parse what we have (which is now, at a minimum, SELECT ... FROM ...).
518 # -------------------------------------------------------------------------
519 p = grammar.get_select_statement().parseString(sql, parseAll=True)
520 if debug and debug_verbose:
521 log.debug("start dump:\n" + p.dump())
523 existing_tables = p.join_source.from_tables.asList() # type: List[str]
524 new_tables = [] # type: List[TableId]
526 def add_new_table(_table_id: TableId) -> None:
527 if (
528 _table_id
529 and _table_id not in new_tables
530 and _table_id.identifier(grammar) not in existing_tables
531 ):
532 new_tables.append(_table_id)
534 # -------------------------------------------------------------------------
535 # DISTINCT?
536 # -------------------------------------------------------------------------
537 if distinct is True:
538 set_distinct_within_parsed(p, action="set")
539 elif distinct is False:
540 set_distinct_within_parsed(p, action="clear")
542 # -------------------------------------------------------------------------
543 # Process all the (other?) SELECT clauses
544 # -------------------------------------------------------------------------
545 for se in select_elements:
546 p = parser_add_result_column(
547 p, se.sql_select_column(grammar), grammar=grammar
548 )
549 add_new_table(se.from_table())
551 # -------------------------------------------------------------------------
552 # Process all the WHERE clauses
553 # -------------------------------------------------------------------------
554 for wc in where_conditions:
555 where_expression = wc.sql(grammar)
556 if bracket_where:
557 where_expression = "(" + where_expression + ")"
559 # The tricky bit: inserting it.
560 # We use the [0] to overcome the effects of defining these things
561 # as a pyparsing Group(), which encapsulates the results in a list.
562 if p.where_clause:
563 cond = grammar.get_expr().parseString(
564 where_expression, parseAll=True
565 )[0]
566 extra = [where_type, cond]
567 p.where_clause.where_expr.extend(extra)
568 else:
569 # No WHERE as yet
570 # Doing this properly is a nightmare.
571 # It's hard to add a *named* ParseResults element to another.
572 # So it's very hard to alter p.where_clause.where_expr such that
573 # we can continue adding more WHERE clauses if we want.
574 # This is the inefficient, cop-out method:
575 # (1) Add as plain text
576 p.where_clause.append("WHERE " + where_expression)
577 # (2) Reparse...
578 p = reparse_select(p, grammar=grammar)
580 add_new_table(wc.table_id)
582 # -------------------------------------------------------------------------
583 # Process all the FROM clauses, autojoining as necessary
584 # -------------------------------------------------------------------------
586 for table_id in new_tables:
587 p = parser_add_from_tables(
588 p,
589 get_join_info(
590 grammar=grammar,
591 parsed=p,
592 jointable=table_id,
593 magic_join=magic_join,
594 nonmagic_join_type=join_type,
595 nonmagic_join_condition=join_condition,
596 ), # may raise DatabaseStructureNotUnderstood
597 grammar=grammar,
598 )
600 if debug and debug_verbose:
601 log.debug("end dump:\n" + p.dump())
602 result = text_from_parsed(p, formatted=False)
603 if formatted:
604 result = format_sql(result)
605 if debug:
606 log.info(f"END: {result}")
607 return result