sqlglot.dialects.bigquery
from __future__ import annotations

import logging
import re
import typing as t

from sqlglot import exp, generator, parser, tokens, transforms
from sqlglot.dialects.dialect import (
    Dialect,
    NormalizationStrategy,
    arg_max_or_min_no_count,
    binary_from_function,
    date_add_interval_sql,
    datestrtodate_sql,
    build_formatted_time,
    filter_array_using_unnest,
    if_sql,
    inline_array_unless_query,
    max_or_greatest,
    min_or_least,
    no_ilike_sql,
    build_date_delta_with_interval,
    regexp_replace_sql,
    rename_func,
    sha256_sql,
    timestrtotime_sql,
    ts_or_ds_add_cast,
    unit_to_var,
)
from sqlglot.helper import seq_get, split_num_words
from sqlglot.tokens import TokenType

if t.TYPE_CHECKING:
    from sqlglot._typing import E, Lit

logger = logging.getLogger("sqlglot")


def _derived_table_values_to_unnest(self: BigQuery.Generator, expression: exp.Values) -> str:
    if not expression.find_ancestor(exp.From, exp.Join):
        return self.values_sql(expression)

    structs = []
    alias = expression.args.get("alias")
    for tup in expression.find_all(exp.Tuple):
        field_aliases = (
            alias.columns
            if alias and alias.columns
            else (f"_c{i}" for i in range(len(tup.expressions)))
        )
        expressions = [
            exp.PropertyEQ(this=exp.to_identifier(name), expression=fld)
            for name, fld in zip(field_aliases, tup.expressions)
        ]
        structs.append(exp.Struct(expressions=expressions))

    # Due to `UNNEST_COLUMN_ONLY`, it is expected that the table alias be contained in the columns expression
    alias_name_only = exp.TableAlias(columns=[alias.this]) if alias else None
    return self.unnest_sql(
        exp.Unnest(expressions=[exp.array(*structs, copy=False)], alias=alias_name_only)
    )


def _returnsproperty_sql(self: BigQuery.Generator, expression: exp.ReturnsProperty) -> str:
    this = expression.this
    if isinstance(this, exp.Schema):
        this = f"{self.sql(this, 'this')} <{self.expressions(this)}>"
    else:
        this = self.sql(this)
    return f"RETURNS {this}"


def _create_sql(self: BigQuery.Generator, expression: exp.Create) -> str:
    returns = expression.find(exp.ReturnsProperty)
    if expression.kind == "FUNCTION" and returns and returns.args.get("is_table"):
        expression.set("kind", "TABLE FUNCTION")

    if isinstance(expression.expression, (exp.Subquery, exp.Literal)):
        expression.set("expression", expression.expression.this)

    return self.create_sql(expression)


# https://issuetracker.google.com/issues/162294746
# workaround for bigquery bug when grouping by an expression and then ordering
# WITH x AS (SELECT 1 y)
# SELECT y + 1 z
# FROM x
# GROUP BY y + 1
# ORDER BY z
def _alias_ordered_group(expression: exp.Expression) -> exp.Expression:
    if isinstance(expression, exp.Select):
        group = expression.args.get("group")
        order = expression.args.get("order")

        if group and order:
            aliases = {
                select.this: select.args["alias"]
                for select in expression.selects
                if isinstance(select, exp.Alias)
            }

            for grouped in group.expressions:
                if grouped.is_int:
                    continue
                alias = aliases.get(grouped)
                if alias:
                    grouped.replace(exp.column(alias))

    return expression


def _pushdown_cte_column_names(expression: exp.Expression) -> exp.Expression:
    """BigQuery doesn't allow column names when defining a CTE, so we try to push them down."""
    if isinstance(expression, exp.CTE) and expression.alias_column_names:
        cte_query = expression.this

        if cte_query.is_star:
            logger.warning(
                "Can't push down CTE column names for star queries. Run the query through"
                " the optimizer or use 'qualify' to expand the star projections first."
            )
            return expression

        column_names = expression.alias_column_names
        expression.args["alias"].set("columns", None)

        for name, select in zip(column_names, cte_query.selects):
            to_replace = select

            if isinstance(select, exp.Alias):
                select = select.this

            # Inner aliases are shadowed by the CTE column names
            to_replace.replace(exp.alias_(select, name))

    return expression


def _build_parse_timestamp(args: t.List) -> exp.StrToTime:
    this = build_formatted_time(exp.StrToTime, "bigquery")([seq_get(args, 1), seq_get(args, 0)])
    this.set("zone", seq_get(args, 2))
    return this


def _build_timestamp(args: t.List) -> exp.Timestamp:
    timestamp = exp.Timestamp.from_arg_list(args)
    timestamp.set("with_tz", True)
    return timestamp


def _build_date(args: t.List) -> exp.Date | exp.DateFromParts:
    expr_type = exp.DateFromParts if len(args) == 3 else exp.Date
    return expr_type.from_arg_list(args)


def _build_to_hex(args: t.List) -> exp.Hex | exp.MD5:
    # TO_HEX(MD5(..)) is common in BigQuery, so it's parsed into MD5 to simplify its transpilation
    arg = seq_get(args, 0)
    return exp.MD5(this=arg.this) if isinstance(arg, exp.MD5Digest) else exp.LowerHex(this=arg)


def _array_contains_sql(self: BigQuery.Generator, expression: exp.ArrayContains) -> str:
    return self.sql(
        exp.Exists(
            this=exp.select("1")
            .from_(exp.Unnest(expressions=[expression.left]).as_("_unnest", table=["_col"]))
            .where(exp.column("_col").eq(expression.right))
        )
    )


def _ts_or_ds_add_sql(self: BigQuery.Generator, expression: exp.TsOrDsAdd) -> str:
    return date_add_interval_sql("DATE", "ADD")(self, ts_or_ds_add_cast(expression))


def _ts_or_ds_diff_sql(self: BigQuery.Generator, expression: exp.TsOrDsDiff) -> str:
    expression.this.replace(exp.cast(expression.this, exp.DataType.Type.TIMESTAMP))
    expression.expression.replace(exp.cast(expression.expression, exp.DataType.Type.TIMESTAMP))
    unit = unit_to_var(expression)
    return self.func("DATE_DIFF", expression.this, expression.expression, unit)


def _unix_to_time_sql(self: BigQuery.Generator, expression: exp.UnixToTime) -> str:
    scale = expression.args.get("scale")
    timestamp = expression.this

    if scale in (None, exp.UnixToTime.SECONDS):
        return self.func("TIMESTAMP_SECONDS", timestamp)
    if scale == exp.UnixToTime.MILLIS:
        return self.func("TIMESTAMP_MILLIS", timestamp)
    if scale == exp.UnixToTime.MICROS:
        return self.func("TIMESTAMP_MICROS", timestamp)

    unix_seconds = exp.cast(
        exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT
    )
    return self.func("TIMESTAMP_SECONDS", unix_seconds)


def _build_time(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToTime(this=args[0])
    if len(args) == 2:
        return exp.Time.from_arg_list(args)
    return exp.TimeFromParts.from_arg_list(args)


def _build_datetime(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToTimestamp.from_arg_list(args)
    if len(args) == 2:
        return exp.Datetime.from_arg_list(args)
    return exp.TimestampFromParts.from_arg_list(args)


def _str_to_datetime_sql(
    self: BigQuery.Generator, expression: exp.StrToDate | exp.StrToTime
) -> str:
    this = self.sql(expression, "this")
    dtype = "DATE" if isinstance(expression, exp.StrToDate) else "TIMESTAMP"

    if expression.args.get("safe"):
        fmt = self.format_time(
            expression,
            self.dialect.INVERSE_FORMAT_MAPPING,
            self.dialect.INVERSE_FORMAT_TRIE,
        )
        return f"SAFE_CAST({this} AS {dtype} FORMAT {fmt})"

    fmt = self.format_time(expression)
    return self.func(f"PARSE_{dtype}", fmt, this, expression.args.get("zone"))


class BigQuery(Dialect):
    WEEK_OFFSET = -1
    UNNEST_COLUMN_ONLY = True
    SUPPORTS_USER_DEFINED_TYPES = False
    SUPPORTS_SEMI_ANTI_JOIN = False
    LOG_BASE_FIRST = False
    HEX_LOWERCASE = True
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    EXPAND_ALIAS_REFS_EARLY_ONLY_IN_GROUP_BY = True

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    # bigquery udfs are case sensitive
    NORMALIZE_FUNCTIONS = False

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time
    TIME_MAPPING = {
        "%D": "%m/%d/%y",
        "%E6S": "%S.%f",
    }

    FORMAT_MAPPING = {
        "DD": "%d",
        "MM": "%m",
        "MON": "%b",
        "MONTH": "%B",
        "YYYY": "%Y",
        "YY": "%y",
        "HH": "%I",
        "HH12": "%I",
        "HH24": "%H",
        "MI": "%M",
        "SS": "%S",
        "SSSSS": "%f",
        "TZH": "%z",
    }

    # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement
    # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table
    PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE"}

    def normalize_identifier(self, expression: E) -> E:
        if (
            isinstance(expression, exp.Identifier)
            and self.normalization_strategy is not NormalizationStrategy.CASE_SENSITIVE
        ):
            parent = expression.parent
            while isinstance(parent, exp.Dot):
                parent = parent.parent

            # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
            # by default. The following check uses a heuristic to detect tables based on whether
            # they are qualified. This should generally be correct, because tables in BigQuery
            # must be qualified with at least a dataset, unless @@dataset_id is set.
            case_sensitive = (
                isinstance(parent, exp.UserDefinedFunction)
                or (
                    isinstance(parent, exp.Table)
                    and parent.db
                    and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
                )
                or expression.meta.get("is_table")
            )
            if not case_sensitive:
                expression.set("this", expression.this.lower())

        return expression

    class Tokenizer(tokens.Tokenizer):
        QUOTES = ["'", '"', '"""', "'''"]
        COMMENTS = ["--", "#", ("/*", "*/")]
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]

        HEX_STRINGS = [("0x", ""), ("0X", "")]

        BYTE_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
        ]

        RAW_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
        ]

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            "ANY TYPE": TokenType.VARIANT,
            "BEGIN": TokenType.COMMAND,
            "BEGIN TRANSACTION": TokenType.BEGIN,
            "BYTEINT": TokenType.INT,
            "BYTES": TokenType.BINARY,
            "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
            "DATETIME": TokenType.TIMESTAMP,
            "DECLARE": TokenType.COMMAND,
            "ELSEIF": TokenType.COMMAND,
            "EXCEPTION": TokenType.COMMAND,
            "FLOAT64": TokenType.DOUBLE,
            "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "MODEL": TokenType.MODEL,
            "NOT DETERMINISTIC": TokenType.VOLATILE,
            "RECORD": TokenType.STRUCT,
            "TIMESTAMP": TokenType.TIMESTAMPTZ,
        }
        KEYWORDS.pop("DIV")
        KEYWORDS.pop("VALUES")
        KEYWORDS.pop("/*+")

    class Parser(parser.Parser):
        PREFIXED_PIVOT_COLUMNS = True
        LOG_DEFAULTS_TO_LN = True
        SUPPORTS_IMPLICIT_UNNEST = True

        FUNCTIONS = {
            **parser.Parser.FUNCTIONS,
            "DATE": _build_date,
            "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
            "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
            "DATE_TRUNC": lambda args: exp.DateTrunc(
                unit=exp.Literal.string(str(seq_get(args, 1))),
                this=seq_get(args, 0),
            ),
            "DATETIME": _build_datetime,
            "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
            "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
            "DIV": binary_from_function(exp.IntDiv),
            "FORMAT_DATE": lambda args: exp.TimeToStr(
                this=exp.TsOrDsToDate(this=seq_get(args, 1)), format=seq_get(args, 0)
            ),
            "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
            "JSON_EXTRACT_SCALAR": lambda args: exp.JSONExtractScalar(
                this=seq_get(args, 0), expression=seq_get(args, 1) or exp.Literal.string("$")
            ),
            "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
            "MD5": exp.MD5Digest.from_arg_list,
            "TO_HEX": _build_to_hex,
            "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")(
                [seq_get(args, 1), seq_get(args, 0)]
            ),
            "PARSE_TIMESTAMP": _build_parse_timestamp,
            "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
            "REGEXP_EXTRACT": lambda args: exp.RegexpExtract(
                this=seq_get(args, 0),
                expression=seq_get(args, 1),
                position=seq_get(args, 2),
                occurrence=seq_get(args, 3),
                group=exp.Literal.number(1) if re.compile(args[1].name).groups == 1 else None,
            ),
            "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
            "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
            "SPLIT": lambda args: exp.Split(
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
                this=seq_get(args, 0),
                expression=seq_get(args, 1) or exp.Literal.string(","),
            ),
            "TIME": _build_time,
            "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd),
            "TIME_SUB": build_date_delta_with_interval(exp.TimeSub),
            "TIMESTAMP": _build_timestamp,
            "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd),
            "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub),
            "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
            ),
            "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
            ),
            "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)),
            "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
        }

        FUNCTION_PARSERS = {
            **parser.Parser.FUNCTION_PARSERS,
            "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]),
        }
        FUNCTION_PARSERS.pop("TRIM")

        NO_PAREN_FUNCTIONS = {
            **parser.Parser.NO_PAREN_FUNCTIONS,
            TokenType.CURRENT_DATETIME: exp.CurrentDatetime,
        }

        NESTED_TYPE_TOKENS = {
            *parser.Parser.NESTED_TYPE_TOKENS,
            TokenType.TABLE,
        }

        PROPERTY_PARSERS = {
            **parser.Parser.PROPERTY_PARSERS,
            "NOT DETERMINISTIC": lambda self: self.expression(
                exp.StabilityProperty, this=exp.Literal.string("VOLATILE")
            ),
            "OPTIONS": lambda self: self._parse_with_property(),
        }

        CONSTRAINT_PARSERS = {
            **parser.Parser.CONSTRAINT_PARSERS,
            "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()),
        }

        RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy()
        RANGE_PARSERS.pop(TokenType.OVERLAPS)

        NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN}

        STATEMENT_PARSERS = {
            **parser.Parser.STATEMENT_PARSERS,
            TokenType.ELSE: lambda self: self._parse_as_command(self._prev),
            TokenType.END: lambda self: self._parse_as_command(self._prev),
            TokenType.FOR: lambda self: self._parse_for_in(),
        }

        BRACKET_OFFSETS = {
            "OFFSET": (0, False),
            "ORDINAL": (1, False),
            "SAFE_OFFSET": (0, True),
            "SAFE_ORDINAL": (1, True),
        }

        def _parse_for_in(self) -> exp.ForIn:
            this = self._parse_range()
            self._match_text_seq("DO")
            return self.expression(exp.ForIn, this=this, expression=self._parse_statement())

        def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]:
            this = super()._parse_table_part(schema=schema) or self._parse_number()

            # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names
            if isinstance(this, exp.Identifier):
                table_name = this.name
                while self._match(TokenType.DASH, advance=False) and self._next:
                    text = ""
                    while self._curr and self._curr.token_type != TokenType.DOT:
                        self._advance()
                        text += self._prev.text
                    table_name += text

                this = exp.Identifier(this=table_name, quoted=this.args.get("quoted"))
            elif isinstance(this, exp.Literal):
                table_name = this.name

                if self._is_connected() and self._parse_var(any_token=True):
                    table_name += self._prev.text

                this = exp.Identifier(this=table_name, quoted=True)

            return this

        def _parse_table_parts(
            self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
        ) -> exp.Table:
            table = super()._parse_table_parts(
                schema=schema, is_db_reference=is_db_reference, wildcard=True
            )

            # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
            if not table.catalog:
                if table.db:
                    parts = table.db.split(".")
                    if len(parts) == 2 and not table.args["db"].quoted:
                        table.set("catalog", exp.Identifier(this=parts[0]))
                        table.set("db", exp.Identifier(this=parts[1]))
                else:
                    parts = table.name.split(".")
                    if len(parts) == 2 and not table.this.quoted:
                        table.set("db", exp.Identifier(this=parts[0]))
                        table.set("this", exp.Identifier(this=parts[1]))

            if any("." in p.name for p in table.parts):
                catalog, db, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
                )

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                table = exp.Table(
                    this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
                )
                table.meta["quoted_table"] = True

            return table

        def _parse_column(self) -> t.Optional[exp.Expression]:
            column = super()._parse_column()
            if isinstance(column, exp.Column):
                parts = column.parts
                if any("." in p.name for p in parts):
                    catalog, db, table, this, *rest = (
                        exp.to_identifier(p, quoted=True)
                        for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                    )

                    if rest and this:
                        this = exp.Dot.build([this, *rest])  # type: ignore

                    column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                    column.meta["quoted_column"] = True

            return column

        @t.overload
        def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

        @t.overload
        def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...

        def _parse_json_object(self, agg=False):
            json_object = super()._parse_json_object()
            array_kv_pair = seq_get(json_object.expressions, 0)

            # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
            if (
                array_kv_pair
                and isinstance(array_kv_pair.this, exp.Array)
                and isinstance(array_kv_pair.expression, exp.Array)
            ):
                keys = array_kv_pair.this.expressions
                values = array_kv_pair.expression.expressions

                json_object.set(
                    "expressions",
                    [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
                )

            return json_object

        def _parse_bracket(
            self, this: t.Optional[exp.Expression] = None
        ) -> t.Optional[exp.Expression]:
            bracket = super()._parse_bracket(this)

            if this is bracket:
                return bracket

            if isinstance(bracket, exp.Bracket):
                for expression in bracket.expressions:
                    name = expression.name.upper()

                    if name not in self.BRACKET_OFFSETS:
                        break

                    offset, safe = self.BRACKET_OFFSETS[name]
                    bracket.set("offset", offset)
                    bracket.set("safe", safe)
                    expression.replace(expression.expressions[0])

            return bracket

    class Generator(generator.Generator):
        EXPLICIT_SET_OP = True
        INTERVAL_ALLOWS_PLURAL_FORM = False
        JOIN_HINTS = False
        QUERY_HINTS = False
        TABLE_HINTS = False
        LIMIT_FETCH = "LIMIT"
        RENAME_TABLE_WITH_DB = False
        NVL2_SUPPORTED = False
        UNNEST_WITH_ORDINALITY = False
        COLLATE_IS_FUNC = True
        LIMIT_ONLY_LITERALS = True
        SUPPORTS_TABLE_ALIAS_COLUMNS = False
        UNPIVOT_ALIASES_ARE_IDENTIFIERS = False
        JSON_KEY_VALUE_PAIR_SEP = ","
        NULL_ORDERING_SUPPORTED = False
        IGNORE_NULLS_IN_FUNC = True
        JSON_PATH_SINGLE_QUOTE_ESCAPE = True
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        NAMED_PLACEHOLDER_TOKEN = "@"
        HEX_FUNC = "TO_HEX"
        WITH_PROPERTIES_PREFIX = "OPTIONS"

        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
            exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
            exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
            exp.Array: inline_array_unless_query,
            exp.ArrayContains: _array_contains_sql,
            exp.ArrayFilter: filter_array_using_unnest,
            exp.ArraySize: rename_func("ARRAY_LENGTH"),
            exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
            exp.CollateProperty: lambda self, e: (
                f"DEFAULT COLLATE {self.sql(e, 'this')}"
                if e.args.get("default")
                else f"COLLATE {self.sql(e, 'this')}"
            ),
            exp.Commit: lambda *_: "COMMIT TRANSACTION",
            exp.CountIf: rename_func("COUNTIF"),
            exp.Create: _create_sql,
            exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
            exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
            exp.DateDiff: lambda self, e: self.func(
                "DATE_DIFF", e.this, e.expression, unit_to_var(e)
            ),
            exp.DateFromParts: rename_func("DATE"),
            exp.DateStrToDate: datestrtodate_sql,
            exp.DateSub: date_add_interval_sql("DATE", "SUB"),
            exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
            exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
            exp.DateTrunc: lambda self, e: self.func("DATE_TRUNC", e.this, e.text("unit")),
            exp.FromTimeZone: lambda self, e: self.func(
                "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
            ),
            exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
            exp.GroupConcat: rename_func("STRING_AGG"),
            exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
            exp.If: if_sql(false_value="NULL"),
            exp.ILike: no_ilike_sql,
            exp.IntDiv: rename_func("DIV"),
            exp.JSONFormat: rename_func("TO_JSON_STRING"),
            exp.Max: max_or_greatest,
            exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
            exp.MD5Digest: rename_func("MD5"),
            exp.Min: min_or_least,
            exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
            exp.RegexpExtract: lambda self, e: self.func(
                "REGEXP_EXTRACT",
                e.this,
                e.expression,
                e.args.get("position"),
                e.args.get("occurrence"),
            ),
            exp.RegexpReplace: regexp_replace_sql,
            exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
            exp.ReturnsProperty: _returnsproperty_sql,
            exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
            exp.Select: transforms.preprocess(
                [
                    transforms.explode_to_unnest(),
                    transforms.unqualify_unnest,
                    transforms.eliminate_distinct_on,
                    _alias_ordered_group,
                    transforms.eliminate_semi_and_anti_joins,
                ]
            ),
            exp.SHA: rename_func("SHA1"),
            exp.SHA2: sha256_sql,
            exp.StabilityProperty: lambda self, e: (
                "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
            ),
            exp.StrToDate: _str_to_datetime_sql,
            exp.StrToTime: _str_to_datetime_sql,
            exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
            exp.TimeFromParts: rename_func("TIME"),
            exp.TimestampFromParts: rename_func("DATETIME"),
            exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
            exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
            exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"),
            exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
            exp.TimeStrToTime: timestrtotime_sql,
            exp.Transaction: lambda *_: "BEGIN TRANSACTION",
            exp.Trim: lambda self, e: self.func("TRIM", e.this, e.expression),
            exp.TsOrDsAdd: _ts_or_ds_add_sql,
            exp.TsOrDsDiff: _ts_or_ds_diff_sql,
            exp.TsOrDsToTime: rename_func("TIME"),
            exp.TsOrDsToTimestamp: rename_func("DATETIME"),
            exp.Unhex: rename_func("FROM_HEX"),
            exp.UnixDate: rename_func("UNIX_DATE"),
            exp.UnixToTime: _unix_to_time_sql,
            exp.Values: _derived_table_values_to_unnest,
            exp.VariancePop: rename_func("VAR_POP"),
        }

        SUPPORTED_JSON_PATH_PARTS = {
            exp.JSONPathKey,
            exp.JSONPathRoot,
            exp.JSONPathSubscript,
        }

        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC",
            exp.DataType.Type.BIGINT: "INT64",
            exp.DataType.Type.BINARY: "BYTES",
            exp.DataType.Type.BOOLEAN: "BOOL",
            exp.DataType.Type.CHAR: "STRING",
            exp.DataType.Type.DECIMAL: "NUMERIC",
            exp.DataType.Type.DOUBLE: "FLOAT64",
            exp.DataType.Type.FLOAT: "FLOAT64",
            exp.DataType.Type.INT: "INT64",
            exp.DataType.Type.NCHAR: "STRING",
            exp.DataType.Type.NVARCHAR: "STRING",
            exp.DataType.Type.SMALLINT: "INT64",
            exp.DataType.Type.TEXT: "STRING",
            exp.DataType.Type.TIMESTAMP: "DATETIME",
            exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP",
            exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP",
            exp.DataType.Type.TINYINT: "INT64",
            exp.DataType.Type.VARBINARY: "BYTES",
            exp.DataType.Type.ROWVERSION: "BYTES",
            exp.DataType.Type.VARCHAR: "STRING",
            exp.DataType.Type.VARIANT: "ANY TYPE",
        }

        PROPERTIES_LOCATION = {
            **generator.Generator.PROPERTIES_LOCATION,
            exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
            exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
        }

        # WINDOW comes after QUALIFY
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause
        AFTER_HAVING_MODIFIER_TRANSFORMS = {
            "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"],
            "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"],
        }

        # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
        RESERVED_KEYWORDS = {
            "all",
            "and",
            "any",
            "array",
            "as",
            "asc",
            "assert_rows_modified",
            "at",
            "between",
            "by",
            "case",
            "cast",
            "collate",
            "contains",
            "create",
            "cross",
            "cube",
            "current",
            "default",
            "define",
            "desc",
            "distinct",
            "else",
            "end",
            "enum",
            "escape",
            "except",
            "exclude",
            "exists",
            "extract",
            "false",
            "fetch",
            "following",
            "for",
            "from",
            "full",
            "group",
            "grouping",
            "groups",
            "hash",
            "having",
            "if",
            "ignore",
            "in",
            "inner",
            "intersect",
            "interval",
            "into",
            "is",
            "join",
            "lateral",
            "left",
            "like",
            "limit",
            "lookup",
            "merge",
            "natural",
            "new",
            "no",
            "not",
            "null",
            "nulls",
            "of",
            "on",
            "or",
            "order",
            "outer",
            "over",
            "partition",
            "preceding",
            "proto",
            "qualify",
            "range",
            "recursive",
            "respect",
            "right",
            "rollup",
            "rows",
            "select",
            "set",
            "some",
            "struct",
            "tablesample",
            "then",
            "to",
            "treat",
            "true",
            "unbounded",
            "union",
            "unnest",
            "using",
            "when",
            "where",
            "window",
            "with",
            "within",
        }

        def mod_sql(self, expression: exp.Mod) -> str:
            this = expression.this
            expr = expression.expression
            return self.func(
                "MOD",
                this.unnest() if isinstance(this, exp.Paren) else this,
                expr.unnest() if isinstance(expr, exp.Paren) else expr,
            )

        def column_parts(self, expression: exp.Column) -> str:
            if expression.meta.get("quoted_column"):
                # If a column reference is of the form `dataset.table`.name, we need
                # to preserve the quoted table path, otherwise the reference breaks
                table_parts = ".".join(p.name for p in expression.parts[:-1])
                table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
                return f"{table_path}.{self.sql(expression, 'this')}"

            return super().column_parts(expression)

        def table_parts(self, expression: exp.Table) -> str:
            # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
            # we need to make sure the correct quoting is used in each case.
            #
            # For example, if there is a CTE x that clashes with a schema name, then the former will
            # return the table y in that schema, whereas the latter will return the CTE's y column:
            #
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
            if expression.meta.get("quoted_table"):
                table_parts = ".".join(p.name for p in expression.parts)
                return self.sql(exp.Identifier(this=table_parts, quoted=True))

            return super().table_parts(expression)

        def timetostr_sql(self, expression: exp.TimeToStr) -> str:
            this = expression.this if isinstance(expression.this, exp.TsOrDsToDate) else expression
            return self.func("FORMAT_DATE", self.format_time(expression), this.this)

        def eq_sql(self, expression: exp.EQ) -> str:
            # Operands of = cannot be NULL in BigQuery
            if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
                if not isinstance(expression.parent, exp.Update):
                    return "NULL"

            return self.binary(expression, "=")

        def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
            parent = expression.parent

            # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
            # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
            if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
                return self.func(
                    "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
                )

            return super().attimezone_sql(expression)

        def trycast_sql(self, expression: exp.TryCast) -> str:
            return self.cast_sql(expression, safe_prefix="SAFE_")

        def bracket_sql(self, expression: exp.Bracket) -> str:
            this = expression.this
            expressions = expression.expressions

            if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
                arg = expressions[0]
                if arg.type is None:
                    from sqlglot.optimizer.annotate_types import annotate_types

                    arg = annotate_types(arg)

                if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
                    # BQ doesn't support bracket syntax with string values for structs
                    return f"{self.sql(this)}.{arg.name}"

            expressions_sql = self.expressions(expression, flat=True)
            offset = expression.args.get("offset")

            if offset == 0:
                expressions_sql = f"OFFSET({expressions_sql})"
            elif offset == 1:
                expressions_sql = f"ORDINAL({expressions_sql})"
            elif offset is not None:
                self.unsupported(f"Unsupported array offset: {offset}")

            if expression.args.get("safe"):
                expressions_sql = f"SAFE_{expressions_sql}"

            return f"{self.sql(this)}[{expressions_sql}]"

        def in_unnest_op(self, expression: exp.Unnest) -> str:
            return self.sql(expression)

        def except_op(self, expression: exp.Except) -> str:
            if not expression.args.get("distinct"):
                self.unsupported("EXCEPT without DISTINCT is not supported in BigQuery")
            return f"EXCEPT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}"

        def intersect_op(self, expression: exp.Intersect) -> str:
            if not expression.args.get("distinct"):
                self.unsupported("INTERSECT without DISTINCT is not supported in BigQuery")
            return f"INTERSECT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}"

        def version_sql(self, expression: exp.Version) -> str:
            if expression.name == "TIMESTAMP":
                expression.set("this", "SYSTEM_TIME")
            return super().version_sql(expression)
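As a minimal sketch of how a few of the helpers above surface through the public API; the printed SQL is indicative and may vary across sqlglot versions:

    import sqlglot

    # TO_HEX(MD5(..)) is folded into a single MD5 node by _build_to_hex, which
    # simplifies transpilation to dialects whose MD5 already returns hex:
    print(sqlglot.transpile("SELECT TO_HEX(MD5('abc'))", read="bigquery", write="duckdb")[0])
    # e.g. SELECT MD5('abc')

    # A VALUES derived table is rewritten to UNNEST of STRUCTs for BigQuery by
    # _derived_table_values_to_unnest:
    print(
        sqlglot.transpile(
            "SELECT a, b FROM (VALUES (1, 2)) AS t(a, b)", read="postgres", write="bigquery"
        )[0]
    )
    # e.g. SELECT a, b FROM UNNEST([STRUCT(1 AS a, 2 AS b)]) AS t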
class BigQuery(sqlglot.dialects.dialect.Dialect):
WEEK_OFFSET = -1

First day of the week in DATE_TRUNC(week). Defaults to 0 (Monday); -1 is Sunday.
LOG_BASE_FIRST = False

Whether the base comes first in the LOG function. Possible values: True, False, None (two arguments are not supported by LOG).
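For instance, transpiling from a dialect where the base comes first should flip the argument order. A sketch; the exact output depends on the sqlglot version:

    import sqlglot

    # Hive's LOG(base, x) puts the base first; BigQuery's LOG(x, base) does not.
    print(sqlglot.transpile("SELECT LOG(10, x)", read="hive", write="bigquery")[0])
    # e.g. SELECT LOG(x, 10)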
FORCE_EARLY_ALIAS_REF_EXPANSION = True

Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).

For example:

    WITH data AS (SELECT 1 AS id, 2 AS my_id)
    SELECT id AS my_id FROM data
    WHERE my_id = 1
    GROUP BY my_id
    HAVING my_id = 1

In most dialects, "my_id" would refer to "data.my_id" across the whole query (this resolution happens in _qualify_columns()), except in:

- BigQuery, which forwards the alias to the GROUP BY and HAVING clauses, i.e. it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1"
- ClickHouse, which forwards the alias across the whole query, i.e. it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
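A small sketch of this behavior using the optimizer's qualify rule; the output is illustrative:

    import sqlglot
    from sqlglot.optimizer.qualify import qualify

    sql = (
        "WITH data AS (SELECT 1 AS id, 2 AS my_id) "
        "SELECT id AS my_id FROM data "
        "WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1"
    )

    # Under BigQuery's rules, the GROUP BY and HAVING references to my_id are
    # expanded to the aliased expression (id) before columns are qualified.
    print(qualify(sqlglot.parse_one(sql, read="bigquery"), dialect="bigquery").sql("bigquery"))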
Whether alias reference expansion before qualification should only happen for the GROUP BY clause.
Specifies the strategy according to which identifiers should be normalized.
Determines how function names are going to be normalized.
Possible values:
"upper" or True: Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
Associates this dialect's time formats with their equivalent Python strftime formats.
Helper which is used for parsing the special syntax CAST(x AS DATE FORMAT 'yyyy'). If empty, the corresponding trie will be constructed off of TIME_MAPPING.
Columns that are auto-generated by the engine corresponding to this dialect. For example, such columns may be excluded from SELECT * queries.
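For BigQuery these include the ingestion-time partitioning pseudocolumns. A sketch, assuming the attribute is named PSEUDOCOLUMNS as its description above suggests:

from sqlglot.dialects import BigQuery

# Expected to contain _PARTITIONTIME (and _PARTITIONDATE), which BigQuery
# generates for ingestion-time partitioned tables and hides from SELECT *.
print(BigQuery.PSEUDOCOLUMNS)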
def normalize_identifier(self, expression: E) -> E:
    if (
        isinstance(expression, exp.Identifier)
        and self.normalization_strategy is not NormalizationStrategy.CASE_SENSITIVE
    ):
        parent = expression.parent
        while isinstance(parent, exp.Dot):
            parent = parent.parent

        # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
        # by default. The following check uses a heuristic to detect tables based on whether
        # they are qualified. This should generally be correct, because tables in BigQuery
        # must be qualified with at least a dataset, unless @@dataset_id is set.
        case_sensitive = (
            isinstance(parent, exp.UserDefinedFunction)
            or (
                isinstance(parent, exp.Table)
                and parent.db
                and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
            )
            or expression.meta.get("is_table")
        )
        if not case_sensitive:
            expression.set("this", expression.this.lower())

    return expression
Transforms an identifier in a way that resembles how it'd be resolved by this dialect.

For example, an identifier like FoO would be resolved as foo in Postgres, because it lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive, and so any normalization would be prohibited in order to avoid "breaking" the identifier.

There are also dialects like Spark, which are case-insensitive even when quotes are present, and dialects like MySQL, whose resolution rules match those employed by the underlying operating system, for example they may always be case-sensitive in Linux.

Finally, the normalization behavior of some engines can even be controlled through flags, like in Redshift's case, where users can explicitly set enable_case_sensitive_identifier.

SQLGlot aims to understand and handle all of these different behaviors gracefully, so that it can analyze queries in the optimizer and successfully capture their semantics.
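A short sketch of the behavior described above, using the public Dialect API:

from sqlglot import exp
from sqlglot.dialects.dialect import Dialect

for dialect in ("postgres", "snowflake"):
    ident = exp.to_identifier("FoO")  # unquoted, so it is fair game for normalization
    print(dialect, Dialect.get_or_raise(dialect).normalize_identifier(ident).name)
# expected: postgres -> foo, snowflake -> FOO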
Mapping of an escaped sequence (\n) to its unescaped version (a literal newline).
Inherited Members
- sqlglot.dialects.dialect.Dialect
- Dialect
- INDEX_OFFSET
- ALIAS_POST_TABLESAMPLE
- TABLESAMPLE_SIZE_IS_PERCENT
- IDENTIFIERS_CAN_START_WITH_DIGIT
- DPIPE_IS_STRING_CONCAT
- STRICT_STRING_CONCAT
- COPY_PARAMS_ARE_CSV
- NULL_ORDERING
- TYPED_DIVISION
- SAFE_DIVISION
- CONCAT_COALESCE
- DATE_FORMAT
- DATEINT_FORMAT
- TIME_FORMAT
- PREFER_CTE_ALIAS_COLUMN
- SUPPORTS_ORDER_BY_ALL
- DATE_PART_MAPPING
- TYPE_TO_EXPRESSIONS
- ANNOTATORS
- get_or_raise
- format_time
- settings
- case_sensitive
- can_identify
- quote_identifier
- to_json_path
- parse
- parse_into
- generate
- transpile
- tokenize
- tokenizer
- jsonpath_tokenizer
- parser
- generator
class Tokenizer(tokens.Tokenizer):
    QUOTES = ["'", '"', '"""', "'''"]
    COMMENTS = ["--", "#", ("/*", "*/")]
    IDENTIFIERS = ["`"]
    STRING_ESCAPES = ["\\"]

    HEX_STRINGS = [("0x", ""), ("0X", "")]

    BYTE_STRINGS = [
        (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
    ]

    RAW_STRINGS = [
        (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
    ]

    KEYWORDS = {
        **tokens.Tokenizer.KEYWORDS,
        "ANY TYPE": TokenType.VARIANT,
        "BEGIN": TokenType.COMMAND,
        "BEGIN TRANSACTION": TokenType.BEGIN,
        "BYTEINT": TokenType.INT,
        "BYTES": TokenType.BINARY,
        "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
        "DATETIME": TokenType.TIMESTAMP,
        "DECLARE": TokenType.COMMAND,
        "ELSEIF": TokenType.COMMAND,
        "EXCEPTION": TokenType.COMMAND,
        "FLOAT64": TokenType.DOUBLE,
        "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
        "MODEL": TokenType.MODEL,
        "NOT DETERMINISTIC": TokenType.VOLATILE,
        "RECORD": TokenType.STRUCT,
        "TIMESTAMP": TokenType.TIMESTAMPTZ,
    }
    KEYWORDS.pop("DIV")
    KEYWORDS.pop("VALUES")
    KEYWORDS.pop("/*+")
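A sketch of this tokenizer on BigQuery-flavored input (raw strings, backquoted identifiers), via the Dialect API:

from sqlglot.dialects.dialect import Dialect

bigquery = Dialect.get_or_raise("bigquery")
for token in bigquery.tokenize("SELECT r'a+b' AS x FROM `proj.dataset.tbl`"):
    print(token.token_type, repr(token.text))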
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- SINGLE_TOKENS
- BIT_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- IDENTIFIER_ESCAPES
- VAR_SINGLE_TOKENS
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- dialect
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
class Parser(parser.Parser):
    PREFIXED_PIVOT_COLUMNS = True
    LOG_DEFAULTS_TO_LN = True
    SUPPORTS_IMPLICIT_UNNEST = True

    FUNCTIONS = {
        **parser.Parser.FUNCTIONS,
        "DATE": _build_date,
        "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
        "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
        "DATE_TRUNC": lambda args: exp.DateTrunc(
            unit=exp.Literal.string(str(seq_get(args, 1))),
            this=seq_get(args, 0),
        ),
        "DATETIME": _build_datetime,
        "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
        "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
        "DIV": binary_from_function(exp.IntDiv),
        "FORMAT_DATE": lambda args: exp.TimeToStr(
            this=exp.TsOrDsToDate(this=seq_get(args, 1)), format=seq_get(args, 0)
        ),
        "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
        "JSON_EXTRACT_SCALAR": lambda args: exp.JSONExtractScalar(
            this=seq_get(args, 0), expression=seq_get(args, 1) or exp.Literal.string("$")
        ),
        "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
        "MD5": exp.MD5Digest.from_arg_list,
        "TO_HEX": _build_to_hex,
        "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")(
            [seq_get(args, 1), seq_get(args, 0)]
        ),
        "PARSE_TIMESTAMP": _build_parse_timestamp,
        "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
        "REGEXP_EXTRACT": lambda args: exp.RegexpExtract(
            this=seq_get(args, 0),
            expression=seq_get(args, 1),
            position=seq_get(args, 2),
            occurrence=seq_get(args, 3),
            group=exp.Literal.number(1) if re.compile(args[1].name).groups == 1 else None,
        ),
        "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
        "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
        "SPLIT": lambda args: exp.Split(
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
            this=seq_get(args, 0),
            expression=seq_get(args, 1) or exp.Literal.string(","),
        ),
        "TIME": _build_time,
        "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd),
        "TIME_SUB": build_date_delta_with_interval(exp.TimeSub),
        "TIMESTAMP": _build_timestamp,
        "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd),
        "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub),
        "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
            this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
        ),
        "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
            this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
        ),
        "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)),
        "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
    }

    FUNCTION_PARSERS = {
        **parser.Parser.FUNCTION_PARSERS,
        "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]),
    }
    FUNCTION_PARSERS.pop("TRIM")

    NO_PAREN_FUNCTIONS = {
        **parser.Parser.NO_PAREN_FUNCTIONS,
        TokenType.CURRENT_DATETIME: exp.CurrentDatetime,
    }

    NESTED_TYPE_TOKENS = {
        *parser.Parser.NESTED_TYPE_TOKENS,
        TokenType.TABLE,
    }

    PROPERTY_PARSERS = {
        **parser.Parser.PROPERTY_PARSERS,
        "NOT DETERMINISTIC": lambda self: self.expression(
            exp.StabilityProperty, this=exp.Literal.string("VOLATILE")
        ),
        "OPTIONS": lambda self: self._parse_with_property(),
    }

    CONSTRAINT_PARSERS = {
        **parser.Parser.CONSTRAINT_PARSERS,
        "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()),
    }

    RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy()
    RANGE_PARSERS.pop(TokenType.OVERLAPS)

    NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN}

    STATEMENT_PARSERS = {
        **parser.Parser.STATEMENT_PARSERS,
        TokenType.ELSE: lambda self: self._parse_as_command(self._prev),
        TokenType.END: lambda self: self._parse_as_command(self._prev),
        TokenType.FOR: lambda self: self._parse_for_in(),
    }

    BRACKET_OFFSETS = {
        "OFFSET": (0, False),
        "ORDINAL": (1, False),
        "SAFE_OFFSET": (0, True),
        "SAFE_ORDINAL": (1, True),
    }

    def _parse_for_in(self) -> exp.ForIn:
        this = self._parse_range()
        self._match_text_seq("DO")
        return self.expression(exp.ForIn, this=this, expression=self._parse_statement())

    def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]:
        this = super()._parse_table_part(schema=schema) or self._parse_number()

        # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names
        if isinstance(this, exp.Identifier):
            table_name = this.name
            while self._match(TokenType.DASH, advance=False) and self._next:
                text = ""
                while self._curr and self._curr.token_type != TokenType.DOT:
                    self._advance()
                    text += self._prev.text
                table_name += text

            this = exp.Identifier(this=table_name, quoted=this.args.get("quoted"))
        elif isinstance(this, exp.Literal):
            table_name = this.name

            if self._is_connected() and self._parse_var(any_token=True):
                table_name += self._prev.text

            this = exp.Identifier(this=table_name, quoted=True)

        return this

    def _parse_table_parts(
        self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
    ) -> exp.Table:
        table = super()._parse_table_parts(
            schema=schema, is_db_reference=is_db_reference, wildcard=True
        )

        # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
        if not table.catalog:
            if table.db:
                parts = table.db.split(".")
                if len(parts) == 2 and not table.args["db"].quoted:
                    table.set("catalog", exp.Identifier(this=parts[0]))
                    table.set("db", exp.Identifier(this=parts[1]))
            else:
                parts = table.name.split(".")
                if len(parts) == 2 and not table.this.quoted:
                    table.set("db", exp.Identifier(this=parts[0]))
                    table.set("this", exp.Identifier(this=parts[1]))

        if any("." in p.name for p in table.parts):
            catalog, db, this, *rest = (
                exp.to_identifier(p, quoted=True)
                for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
            )

            if rest and this:
                this = exp.Dot.build([this, *rest])  # type: ignore

            table = exp.Table(
                this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
            )
            table.meta["quoted_table"] = True

        return table

    def _parse_column(self) -> t.Optional[exp.Expression]:
        column = super()._parse_column()
        if isinstance(column, exp.Column):
            parts = column.parts
            if any("." in p.name for p in parts):
                catalog, db, table, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                )

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                column.meta["quoted_column"] = True

        return column

    @t.overload
    def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

    @t.overload
    def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...

    def _parse_json_object(self, agg=False):
        json_object = super()._parse_json_object()
        array_kv_pair = seq_get(json_object.expressions, 0)

        # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
        if (
            array_kv_pair
            and isinstance(array_kv_pair.this, exp.Array)
            and isinstance(array_kv_pair.expression, exp.Array)
        ):
            keys = array_kv_pair.this.expressions
            values = array_kv_pair.expression.expressions

            json_object.set(
                "expressions",
                [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
            )

        return json_object

    def _parse_bracket(
        self, this: t.Optional[exp.Expression] = None
    ) -> t.Optional[exp.Expression]:
        bracket = super()._parse_bracket(this)

        if this is bracket:
            return bracket

        if isinstance(bracket, exp.Bracket):
            for expression in bracket.expressions:
                name = expression.name.upper()

                if name not in self.BRACKET_OFFSETS:
                    break

                offset, safe = self.BRACKET_OFFSETS[name]
                bracket.set("offset", offset)
                bracket.set("safe", safe)
                expression.replace(expression.expressions[0])

        return bracket
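A sketch of two BigQuery-specific behaviors implemented above, via sqlglot's public API:

import sqlglot
from sqlglot import exp

# _parse_table_part: dash-separated table names parse into a single reference.
print(sqlglot.parse_one("SELECT * FROM my-project.dataset.tbl", read="bigquery").sql("bigquery"))

# _parse_bracket + BRACKET_OFFSETS: SAFE_ORDINAL(1) is folded into the
# Bracket node's offset/safe args.
bracket = sqlglot.parse_one("SELECT arr[SAFE_ORDINAL(1)] FROM t", read="bigquery").find(exp.Bracket)
print(bracket.args.get("offset"), bracket.args.get("safe"))  # expected: 1 True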
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- RESERVED_TOKENS
- DB_CREATABLES
- CREATABLES
- INTERVAL_VARS
- ALIAS_TOKENS
- ARRAY_CONSTRUCTORS
- COMMENT_TABLE_ALIAS_TOKENS
- UPDATE_ALIAS_TOKENS
- TRIM_TYPES
- FUNC_TOKENS
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_KINDS
- JOIN_HINTS
- LAMBDAS
- COLUMN_OPERATORS
- EXPRESSION_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PLACEHOLDER_PARSERS
- ALTER_PARSERS
- ALTER_ALTER_PARSERS
- SCHEMA_UNNAMED_CONSTRAINTS
- NO_PAREN_FUNCTION_PARSERS
- INVALID_FUNC_NAME_TOKENS
- FUNCTIONS_WITH_ALIASED_ARGS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_PARSERS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- TYPE_CONVERTERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- STRICT_CAST
- IDENTIFY_PIVOT_STRINGS
- ALTER_TABLE_ADD_REQUIRED_FOR_EACH_COLUMN
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- MODIFIERS_ATTACHED_TO_SET_OP
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- INTERVAL_SPANS
- SUPPORTS_PARTITION_SELECTION
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- errors
- sql
class Generator(generator.Generator):
    EXPLICIT_SET_OP = True
    INTERVAL_ALLOWS_PLURAL_FORM = False
    JOIN_HINTS = False
    QUERY_HINTS = False
    TABLE_HINTS = False
    LIMIT_FETCH = "LIMIT"
    RENAME_TABLE_WITH_DB = False
    NVL2_SUPPORTED = False
    UNNEST_WITH_ORDINALITY = False
    COLLATE_IS_FUNC = True
    LIMIT_ONLY_LITERALS = True
    SUPPORTS_TABLE_ALIAS_COLUMNS = False
    UNPIVOT_ALIASES_ARE_IDENTIFIERS = False
    JSON_KEY_VALUE_PAIR_SEP = ","
    NULL_ORDERING_SUPPORTED = False
    IGNORE_NULLS_IN_FUNC = True
    JSON_PATH_SINGLE_QUOTE_ESCAPE = True
    CAN_IMPLEMENT_ARRAY_ANY = True
    SUPPORTS_TO_NUMBER = False
    NAMED_PLACEHOLDER_TOKEN = "@"
    HEX_FUNC = "TO_HEX"
    WITH_PROPERTIES_PREFIX = "OPTIONS"

    TRANSFORMS = {
        **generator.Generator.TRANSFORMS,
        exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
        exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
        exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
        exp.Array: inline_array_unless_query,
        exp.ArrayContains: _array_contains_sql,
        exp.ArrayFilter: filter_array_using_unnest,
        exp.ArraySize: rename_func("ARRAY_LENGTH"),
        exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
        exp.CollateProperty: lambda self, e: (
            f"DEFAULT COLLATE {self.sql(e, 'this')}"
            if e.args.get("default")
            else f"COLLATE {self.sql(e, 'this')}"
        ),
        exp.Commit: lambda *_: "COMMIT TRANSACTION",
        exp.CountIf: rename_func("COUNTIF"),
        exp.Create: _create_sql,
        exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
        exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
        exp.DateDiff: lambda self, e: self.func(
            "DATE_DIFF", e.this, e.expression, unit_to_var(e)
        ),
        exp.DateFromParts: rename_func("DATE"),
        exp.DateStrToDate: datestrtodate_sql,
        exp.DateSub: date_add_interval_sql("DATE", "SUB"),
        exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
        exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
        exp.DateTrunc: lambda self, e: self.func("DATE_TRUNC", e.this, e.text("unit")),
        exp.FromTimeZone: lambda self, e: self.func(
            "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
        ),
        exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
        exp.GroupConcat: rename_func("STRING_AGG"),
        exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
        exp.If: if_sql(false_value="NULL"),
        exp.ILike: no_ilike_sql,
        exp.IntDiv: rename_func("DIV"),
        exp.JSONFormat: rename_func("TO_JSON_STRING"),
        exp.Max: max_or_greatest,
        exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
        exp.MD5Digest: rename_func("MD5"),
        exp.Min: min_or_least,
        exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
        exp.RegexpExtract: lambda self, e: self.func(
            "REGEXP_EXTRACT",
            e.this,
            e.expression,
            e.args.get("position"),
            e.args.get("occurrence"),
        ),
        exp.RegexpReplace: regexp_replace_sql,
        exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
        exp.ReturnsProperty: _returnsproperty_sql,
        exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
        exp.Select: transforms.preprocess(
            [
                transforms.explode_to_unnest(),
                transforms.unqualify_unnest,
                transforms.eliminate_distinct_on,
                _alias_ordered_group,
                transforms.eliminate_semi_and_anti_joins,
            ]
        ),
        exp.SHA: rename_func("SHA1"),
        exp.SHA2: sha256_sql,
        exp.StabilityProperty: lambda self, e: (
            "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
        ),
        exp.StrToDate: _str_to_datetime_sql,
        exp.StrToTime: _str_to_datetime_sql,
        exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
        exp.TimeFromParts: rename_func("TIME"),
        exp.TimestampFromParts: rename_func("DATETIME"),
        exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
        exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
        exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"),
        exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
        exp.TimeStrToTime: timestrtotime_sql,
        exp.Transaction: lambda *_: "BEGIN TRANSACTION",
        exp.Trim: lambda self, e: self.func("TRIM", e.this, e.expression),
        exp.TsOrDsAdd: _ts_or_ds_add_sql,
        exp.TsOrDsDiff: _ts_or_ds_diff_sql,
        exp.TsOrDsToTime: rename_func("TIME"),
        exp.TsOrDsToTimestamp: rename_func("DATETIME"),
        exp.Unhex: rename_func("FROM_HEX"),
        exp.UnixDate: rename_func("UNIX_DATE"),
        exp.UnixToTime: _unix_to_time_sql,
        exp.Values: _derived_table_values_to_unnest,
        exp.VariancePop: rename_func("VAR_POP"),
    }

    SUPPORTED_JSON_PATH_PARTS = {
        exp.JSONPathKey,
        exp.JSONPathRoot,
        exp.JSONPathSubscript,
    }

    TYPE_MAPPING = {
        **generator.Generator.TYPE_MAPPING,
        exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC",
        exp.DataType.Type.BIGINT: "INT64",
        exp.DataType.Type.BINARY: "BYTES",
        exp.DataType.Type.BOOLEAN: "BOOL",
        exp.DataType.Type.CHAR: "STRING",
        exp.DataType.Type.DECIMAL: "NUMERIC",
        exp.DataType.Type.DOUBLE: "FLOAT64",
        exp.DataType.Type.FLOAT: "FLOAT64",
        exp.DataType.Type.INT: "INT64",
        exp.DataType.Type.NCHAR: "STRING",
        exp.DataType.Type.NVARCHAR: "STRING",
        exp.DataType.Type.SMALLINT: "INT64",
        exp.DataType.Type.TEXT: "STRING",
        exp.DataType.Type.TIMESTAMP: "DATETIME",
        exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP",
        exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP",
        exp.DataType.Type.TINYINT: "INT64",
        exp.DataType.Type.VARBINARY: "BYTES",
        exp.DataType.Type.ROWVERSION: "BYTES",
        exp.DataType.Type.VARCHAR: "STRING",
        exp.DataType.Type.VARIANT: "ANY TYPE",
    }

    PROPERTIES_LOCATION = {
        **generator.Generator.PROPERTIES_LOCATION,
        exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
        exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
    }

    # WINDOW comes after QUALIFY
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause
    AFTER_HAVING_MODIFIER_TRANSFORMS = {
        "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"],
        "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"],
    }

    # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
    RESERVED_KEYWORDS = {
        "all", "and", "any", "array", "as", "asc", "assert_rows_modified", "at",
        "between", "by", "case", "cast", "collate", "contains", "create", "cross",
        "cube", "current", "default", "define", "desc", "distinct", "else", "end",
        "enum", "escape", "except", "exclude", "exists", "extract", "false", "fetch",
        "following", "for", "from", "full", "group", "grouping", "groups", "hash",
        "having", "if", "ignore", "in", "inner", "intersect", "interval", "into",
        "is", "join", "lateral", "left", "like", "limit", "lookup", "merge",
        "natural", "new", "no", "not", "null", "nulls", "of", "on",
        "or", "order", "outer", "over", "partition", "preceding", "proto", "qualify",
        "range", "recursive", "respect", "right", "rollup", "rows", "select", "set",
        "some", "struct", "tablesample", "then", "to", "treat", "true", "unbounded",
        "union", "unnest", "using", "when", "where", "window", "with", "within",
    }

    def mod_sql(self, expression: exp.Mod) -> str:
        this = expression.this
        expr = expression.expression
        return self.func(
            "MOD",
            this.unnest() if isinstance(this, exp.Paren) else this,
            expr.unnest() if isinstance(expr, exp.Paren) else expr,
        )

    def column_parts(self, expression: exp.Column) -> str:
        if expression.meta.get("quoted_column"):
            # If a column reference is of the form `dataset.table`.name, we need
            # to preserve the quoted table path, otherwise the reference breaks
            table_parts = ".".join(p.name for p in expression.parts[:-1])
            table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
            return f"{table_path}.{self.sql(expression, 'this')}"

        return super().column_parts(expression)

    def table_parts(self, expression: exp.Table) -> str:
        # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
        # we need to make sure the correct quoting is used in each case.
        #
        # For example, if there is a CTE x that clashes with a schema name, then the former will
        # return the table y in that schema, whereas the latter will return the CTE's y column:
        #
        # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join
        # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
        if expression.meta.get("quoted_table"):
            table_parts = ".".join(p.name for p in expression.parts)
            return self.sql(exp.Identifier(this=table_parts, quoted=True))

        return super().table_parts(expression)

    def timetostr_sql(self, expression: exp.TimeToStr) -> str:
        this = expression.this if isinstance(expression.this, exp.TsOrDsToDate) else expression
        return self.func("FORMAT_DATE", self.format_time(expression), this.this)

    def eq_sql(self, expression: exp.EQ) -> str:
        # Operands of = cannot be NULL in BigQuery
        if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
            if not isinstance(expression.parent, exp.Update):
                return "NULL"

        return self.binary(expression, "=")

    def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
        parent = expression.parent

        # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
        # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
        if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
            return self.func(
                "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
            )

        return super().attimezone_sql(expression)

    def trycast_sql(self, expression: exp.TryCast) -> str:
        return self.cast_sql(expression, safe_prefix="SAFE_")

    def bracket_sql(self, expression: exp.Bracket) -> str:
        this = expression.this
        expressions = expression.expressions

        if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
            arg = expressions[0]
            if arg.type is None:
                from sqlglot.optimizer.annotate_types import annotate_types

                arg = annotate_types(arg)

            if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
                # BQ doesn't support bracket syntax with string values for structs
                return f"{self.sql(this)}.{arg.name}"

        expressions_sql = self.expressions(expression, flat=True)
        offset = expression.args.get("offset")

        if offset == 0:
            expressions_sql = f"OFFSET({expressions_sql})"
        elif offset == 1:
            expressions_sql = f"ORDINAL({expressions_sql})"
        elif offset is not None:
            self.unsupported(f"Unsupported array offset: {offset}")

        if expression.args.get("safe"):
            expressions_sql = f"SAFE_{expressions_sql}"

        return f"{self.sql(this)}[{expressions_sql}]"

    def in_unnest_op(self, expression: exp.Unnest) -> str:
        return self.sql(expression)

    def except_op(self, expression: exp.Except) -> str:
        if not expression.args.get("distinct"):
            self.unsupported("EXCEPT without DISTINCT is not supported in BigQuery")
        return f"EXCEPT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}"

    def intersect_op(self, expression: exp.Intersect) -> str:
        if not expression.args.get("distinct"):
            self.unsupported("INTERSECT without DISTINCT is not supported in BigQuery")
        return f"INTERSECT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}"

    def version_sql(self, expression: exp.Version) -> str:
        if expression.name == "TIMESTAMP":
            expression.set("this", "SYSTEM_TIME")
        return super().version_sql(expression)
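A sketch of these settings in action (expected outputs noted as comments):

import sqlglot

# TYPE_MAPPING + the Cast transform: parameterized string types collapse to STRING.
print(sqlglot.transpile("SELECT CAST(x AS VARCHAR(10))", write="bigquery")[0])
# expected: SELECT CAST(x AS STRING)

# EXPLICIT_SET_OP: set operations always carry an explicit quantifier.
print(sqlglot.transpile("SELECT 1 UNION SELECT 2", read="postgres", write="bigquery")[0])
# expected: SELECT 1 UNION DISTINCT SELECT 2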
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True or 'always': Always quote. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default: ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
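A sketch passing a couple of these options through sqlglot.transpile, which forwards them to the Generator:

import sqlglot

sql = "SELECT a, b FROM t WHERE a IN (SELECT x FROM u)"
print(sqlglot.transpile(sql, write="bigquery", pretty=True, indent=4)[0])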
Inherited Members
- sqlglot.generator.Generator
- Generator
- LOCKING_READS_SUPPORTED
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- GROUPINGS_SEP
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- SELECT_KINDS
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_KEYWORDS
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SEED_KEYWORD
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- LAST_DAY_SUPPORTS_DATE_PART
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_CREATE_TABLE_LIKE
- LIKE_PROPERTY_INSIDE_SCHEMA
- MULTI_ARG_DISTINCT
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_BRACKETED_KEY_SUPPORTED
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- COPY_HAS_INTO_KEYWORD
- STAR_EXCEPT
- QUOTE_JSON_PATH
- PAD_FILL_PATTERN_IS_REQUIRED
- PARSE_JSON_NAME
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- STRUCT_DELIMITER
- PARAMETER_TOKEN
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- pad_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- transformcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- except_sql
- fetch_sql
- filter_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- intersect_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_sql
- tablesample_sql
- pivot_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- set_operations
- union_sql
- union_op
- unnest_sql
- prewhere_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- aliases_sql
- atindex_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- cast_sql
- currentdate_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterdiststyle_sql
- altersortkey_sql
- renametable_sql
- renamecolumn_sql
- alterset_sql
- altertable_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- ilike_sql
- ilikeany_sql
- is_sql
- like_sql
- likeany_sql
- similarto_sql
- lt_sql
- lte_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- slice_sql
- sub_sql
- try_sql
- log_sql
- use_sql
- binary
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- forin_sql
- refresh_sql
- operator_sql
- toarray_sql
- tsordstotime_sql
- tsordstotimestamp_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- parsejson_sql
- rand_sql
- changes_sql
- pad_sql
- summarize_sql