sqlglot.tokens
from __future__ import annotations

import typing as t
from enum import auto

from sqlglot.errors import TokenError
from sqlglot.helper import AutoName
from sqlglot.trie import TrieResult, in_trie, new_trie


class TokenType(AutoName):
    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DCOLON = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE = auto()
    CARET = auto()
    TILDA = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION_PARAMETER = auto()
    DAMP = auto()
    XOR = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    DECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    DATETIME = auto()
    DATETIME64 = auto()
    DATE = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    UNIQUEIDENTIFIER = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALWAYS = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONSTRAINT = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GLOB = auto()
    GLOBAL = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    ILIKE_ANY = auto()
    IN = auto()
    INDEX = auto()
    INNER = auto()
    INSERT = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIKE_ANY = auto()
    LIMIT = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTNULL = auto()
    NULL = auto()
    OFFSET = auto()
    ON = auto()
    ORDER_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    PRAGMA = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    QUALIFY = auto()
    QUOTE = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REPLACE = auto()
    RETURNING = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    STRUCT = auto()
    TABLE_SAMPLE = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    UNCACHE = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VIEW = auto()
    VOLATILE = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()


class Token:
    __slots__ = ("token_type", "text", "line", "col", "start", "end", "comments")

    @classmethod
    def number(cls, number: int) -> Token:
        """Returns a NUMBER token with `number` as its text."""
        return cls(TokenType.NUMBER, str(number))

    @classmethod
    def string(cls, string: str) -> Token:
        """Returns a STRING token with `string` as its text."""
        return cls(TokenType.STRING, string)

    @classmethod
    def identifier(cls, identifier: str) -> Token:
        """Returns an IDENTIFIER token with `identifier` as its text."""
        return cls(TokenType.IDENTIFIER, identifier)

    @classmethod
    def var(cls, var: str) -> Token:
        """Returns a VAR token with `var` as its text."""
        return cls(TokenType.VAR, var)

    def __init__(
        self,
        token_type: TokenType,
        text: str,
        line: int = 1,
        col: int = 1,
        start: int = 0,
        end: int = 0,
        comments: t.List[str] = [],
    ) -> None:
        """Token initializer.

        Args:
            token_type: The TokenType Enum.
            text: The text of the token.
            line: The line that the token ends on.
            col: The column that the token ends on.
            start: The start index of the token.
            end: The ending index of the token.
            comments: The comments to attach to the token.
        """
        self.token_type = token_type
        self.text = text
        self.line = line
        self.col = col
        self.start = start
        self.end = end
        self.comments = comments

    def __repr__(self) -> str:
        attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
        return f"<Token {attributes}>"


class _Tokenizer(type):
    def __new__(cls, clsname, bases, attrs):
        klass = super().__new__(cls, clsname, bases, attrs)

        def _convert_quotes(arr: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
            return dict(
                (item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr
            )

        def _quotes_to_format(
            token_type: TokenType, arr: t.List[str | t.Tuple[str, str]]
        ) -> t.Dict[str, t.Tuple[str, TokenType]]:
            return {k: (v, token_type) for k, v in _convert_quotes(arr).items()}

        klass._QUOTES = _convert_quotes(klass.QUOTES)
        klass._IDENTIFIERS = _convert_quotes(klass.IDENTIFIERS)

        klass._FORMAT_STRINGS = {
            **{
                p + s: (e, TokenType.NATIONAL_STRING)
                for s, e in klass._QUOTES.items()
                for p in ("n", "N")
            },
            **_quotes_to_format(TokenType.BIT_STRING, klass.BIT_STRINGS),
            **_quotes_to_format(TokenType.BYTE_STRING, klass.BYTE_STRINGS),
            **_quotes_to_format(TokenType.HEX_STRING, klass.HEX_STRINGS),
            **_quotes_to_format(TokenType.RAW_STRING, klass.RAW_STRINGS),
        }

        klass._STRING_ESCAPES = set(klass.STRING_ESCAPES)
        klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES)
        klass._COMMENTS = {
            **dict(
                (comment, None) if isinstance(comment, str) else (comment[0], comment[1])
                for comment in klass.COMMENTS
            ),
            "{#": "#}",  # Ensure Jinja comments are tokenized correctly in all dialects
        }

        klass._KEYWORD_TRIE = new_trie(
            key.upper()
            for key in (
                *klass.KEYWORDS,
                *klass._COMMENTS,
                *klass._QUOTES,
                *klass._FORMAT_STRINGS,
            )
            if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
        )

        return klass


class Tokenizer(metaclass=_Tokenizer):
    SINGLE_TOKENS = {
        "(": TokenType.L_PAREN,
        ")": TokenType.R_PAREN,
        "[": TokenType.L_BRACKET,
        "]": TokenType.R_BRACKET,
        "{": TokenType.L_BRACE,
        "}": TokenType.R_BRACE,
        "&": TokenType.AMP,
        "^": TokenType.CARET,
        ":": TokenType.COLON,
        ",": TokenType.COMMA,
        ".": TokenType.DOT,
        "-": TokenType.DASH,
        "=": TokenType.EQ,
        ">": TokenType.GT,
        "<": TokenType.LT,
        "%": TokenType.MOD,
        "!": TokenType.NOT,
        "|": TokenType.PIPE,
        "+": TokenType.PLUS,
        ";": TokenType.SEMICOLON,
        "/": TokenType.SLASH,
        "\\": TokenType.BACKSLASH,
        "*": TokenType.STAR,
        "~": TokenType.TILDA,
        "?": TokenType.PLACEHOLDER,
        "@": TokenType.PARAMETER,
        # used for breaking a var like x'y' but nothing else
        # the token type doesn't matter
        "'": TokenType.QUOTE,
        "`": TokenType.IDENTIFIER,
        '"': TokenType.IDENTIFIER,
        "#": TokenType.HASH,
    }

    BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
    BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
    HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
    RAW_STRINGS: t.List[str | t.Tuple[str, str]] = []
    IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
    IDENTIFIER_ESCAPES = ['"']
    QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
    STRING_ESCAPES = ["'"]
    VAR_SINGLE_TOKENS: t.Set[str] = set()

    # Autofilled
    IDENTIFIERS_CAN_START_WITH_DIGIT: bool = False

    _COMMENTS: t.Dict[str, str] = {}
    _FORMAT_STRINGS: t.Dict[str, t.Tuple[str, TokenType]] = {}
    _IDENTIFIERS: t.Dict[str, str] = {}
    _IDENTIFIER_ESCAPES: t.Set[str] = set()
    _QUOTES: t.Dict[str, str] = {}
    _STRING_ESCAPES: t.Set[str] = set()
    _KEYWORD_TRIE: t.Dict = {}

    KEYWORDS: t.Dict[str, TokenType] = {
        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
        "/*+": TokenType.HINT,
        "==": TokenType.EQ,
        "::": TokenType.DCOLON,
        "||": TokenType.DPIPE,
        ">=": TokenType.GTE,
        "<=": TokenType.LTE,
        "<>": TokenType.NEQ,
        "!=": TokenType.NEQ,
        "<=>": TokenType.NULLSAFE_EQ,
        "->": TokenType.ARROW,
        "->>": TokenType.DARROW,
        "=>": TokenType.FARROW,
        "#>": TokenType.HASH_ARROW,
        "#>>": TokenType.DHASH_ARROW,
        "<->": TokenType.LR_ARROW,
        "&&": TokenType.DAMP,
        "??": TokenType.DQMARK,
        "ALL": TokenType.ALL,
        "ALWAYS": TokenType.ALWAYS,
        "AND": TokenType.AND,
        "ANTI": TokenType.ANTI,
        "ANY": TokenType.ANY,
        "ASC": TokenType.ASC,
        "AS": TokenType.ALIAS,
        "ASOF": TokenType.ASOF,
        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
        "BEGIN": TokenType.BEGIN,
        "BETWEEN": TokenType.BETWEEN,
        "CACHE": TokenType.CACHE,
        "UNCACHE": TokenType.UNCACHE,
        "CASE": TokenType.CASE,
        "CHARACTER SET": TokenType.CHARACTER_SET,
        "CLUSTER BY": TokenType.CLUSTER_BY,
        "COLLATE": TokenType.COLLATE,
        "COLUMN": TokenType.COLUMN,
        "COMMIT": TokenType.COMMIT,
        "CONSTRAINT": TokenType.CONSTRAINT,
        "CREATE": TokenType.CREATE,
        "CROSS": TokenType.CROSS,
        "CUBE": TokenType.CUBE,
        "CURRENT_DATE": TokenType.CURRENT_DATE,
        "CURRENT_TIME": TokenType.CURRENT_TIME,
        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
        "CURRENT_USER": TokenType.CURRENT_USER,
        "DATABASE": TokenType.DATABASE,
        "DEFAULT": TokenType.DEFAULT,
        "DELETE": TokenType.DELETE,
        "DESC": TokenType.DESC,
        "DESCRIBE": TokenType.DESCRIBE,
        "DISTINCT": TokenType.DISTINCT,
        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
        "DIV": TokenType.DIV,
        "DROP": TokenType.DROP,
        "ELSE": TokenType.ELSE,
        "END": TokenType.END,
        "ESCAPE": TokenType.ESCAPE,
        "EXCEPT": TokenType.EXCEPT,
        "EXECUTE": TokenType.EXECUTE,
        "EXISTS": TokenType.EXISTS,
        "FALSE": TokenType.FALSE,
        "FETCH": TokenType.FETCH,
        "FILTER": TokenType.FILTER,
        "FIRST": TokenType.FIRST,
        "FULL": TokenType.FULL,
        "FUNCTION": TokenType.FUNCTION,
        "FOR": TokenType.FOR,
        "FOREIGN KEY": TokenType.FOREIGN_KEY,
        "FORMAT": TokenType.FORMAT,
        "FROM": TokenType.FROM,
        "GEOGRAPHY": TokenType.GEOGRAPHY,
        "GEOMETRY": TokenType.GEOMETRY,
        "GLOB": TokenType.GLOB,
        "GROUP BY": TokenType.GROUP_BY,
        "GROUPING SETS": TokenType.GROUPING_SETS,
        "HAVING": TokenType.HAVING,
        "ILIKE": TokenType.ILIKE,
        "IN": TokenType.IN,
        "INDEX": TokenType.INDEX,
        "INET": TokenType.INET,
        "INNER": TokenType.INNER,
        "INSERT": TokenType.INSERT,
        "INTERVAL": TokenType.INTERVAL,
        "INTERSECT": TokenType.INTERSECT,
        "INTO": TokenType.INTO,
        "IS": TokenType.IS,
        "ISNULL": TokenType.ISNULL,
        "JOIN": TokenType.JOIN,
        "KEEP": TokenType.KEEP,
        "LATERAL": TokenType.LATERAL,
        "LEFT": TokenType.LEFT,
        "LIKE": TokenType.LIKE,
        "LIMIT": TokenType.LIMIT,
        "LOAD": TokenType.LOAD,
        "LOCK": TokenType.LOCK,
        "MERGE": TokenType.MERGE,
        "NATURAL": TokenType.NATURAL,
        "NEXT": TokenType.NEXT,
        "NOT": TokenType.NOT,
        "NOTNULL": TokenType.NOTNULL,
        "NULL": TokenType.NULL,
        "OBJECT": TokenType.OBJECT,
        "OFFSET": TokenType.OFFSET,
        "ON": TokenType.ON,
        "OR": TokenType.OR,
        "XOR": TokenType.XOR,
        "ORDER BY": TokenType.ORDER_BY,
        "ORDINALITY": TokenType.ORDINALITY,
        "OUTER": TokenType.OUTER,
        "OVER": TokenType.OVER,
        "OVERLAPS": TokenType.OVERLAPS,
        "OVERWRITE": TokenType.OVERWRITE,
        "PARTITION": TokenType.PARTITION,
        "PARTITION BY": TokenType.PARTITION_BY,
        "PARTITIONED BY": TokenType.PARTITION_BY,
        "PARTITIONED_BY": TokenType.PARTITION_BY,
        "PERCENT": TokenType.PERCENT,
        "PIVOT": TokenType.PIVOT,
        "PRAGMA": TokenType.PRAGMA,
        "PRIMARY KEY": TokenType.PRIMARY_KEY,
        "PROCEDURE": TokenType.PROCEDURE,
        "QUALIFY": TokenType.QUALIFY,
        "RANGE": TokenType.RANGE,
        "RECURSIVE": TokenType.RECURSIVE,
        "REGEXP": TokenType.RLIKE,
        "REPLACE": TokenType.REPLACE,
        "RETURNING": TokenType.RETURNING,
        "REFERENCES": TokenType.REFERENCES,
        "RIGHT": TokenType.RIGHT,
        "RLIKE": TokenType.RLIKE,
        "ROLLBACK": TokenType.ROLLBACK,
        "ROLLUP": TokenType.ROLLUP,
        "ROW": TokenType.ROW,
        "ROWS": TokenType.ROWS,
        "SCHEMA": TokenType.SCHEMA,
        "SELECT": TokenType.SELECT,
        "SEMI": TokenType.SEMI,
        "SET": TokenType.SET,
        "SETTINGS": TokenType.SETTINGS,
        "SHOW": TokenType.SHOW,
        "SIMILAR TO": TokenType.SIMILAR_TO,
        "SOME": TokenType.SOME,
        "SORT BY": TokenType.SORT_BY,
        "TABLE": TokenType.TABLE,
        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
        "TEMP": TokenType.TEMPORARY,
        "TEMPORARY": TokenType.TEMPORARY,
        "THEN": TokenType.THEN,
        "TRUE": TokenType.TRUE,
        "UNION": TokenType.UNION,
        "UNNEST": TokenType.UNNEST,
        "UNPIVOT": TokenType.UNPIVOT,
        "UPDATE": TokenType.UPDATE,
        "USE": TokenType.USE,
        "USING": TokenType.USING,
        "UUID": TokenType.UUID,
        "VALUES": TokenType.VALUES,
        "VIEW": TokenType.VIEW,
        "VOLATILE": TokenType.VOLATILE,
        "WHEN": TokenType.WHEN,
        "WHERE": TokenType.WHERE,
        "WINDOW": TokenType.WINDOW,
        "WITH": TokenType.WITH,
        "APPLY": TokenType.APPLY,
        "ARRAY": TokenType.ARRAY,
        "BIT": TokenType.BIT,
        "BOOL": TokenType.BOOLEAN,
        "BOOLEAN": TokenType.BOOLEAN,
        "BYTE": TokenType.TINYINT,
        "TINYINT": TokenType.TINYINT,
        "SHORT": TokenType.SMALLINT,
        "SMALLINT": TokenType.SMALLINT,
        "INT128": TokenType.INT128,
        "INT2": TokenType.SMALLINT,
        "INTEGER": TokenType.INT,
        "INT": TokenType.INT,
        "INT4": TokenType.INT,
        "LONG": TokenType.BIGINT,
        "BIGINT": TokenType.BIGINT,
        "INT8": TokenType.BIGINT,
        "DEC": TokenType.DECIMAL,
        "DECIMAL": TokenType.DECIMAL,
        "BIGDECIMAL": TokenType.BIGDECIMAL,
        "BIGNUMERIC": TokenType.BIGDECIMAL,
        "MAP": TokenType.MAP,
        "NULLABLE": TokenType.NULLABLE,
        "NUMBER": TokenType.DECIMAL,
        "NUMERIC": TokenType.DECIMAL,
        "FIXED": TokenType.DECIMAL,
        "REAL": TokenType.FLOAT,
        "FLOAT": TokenType.FLOAT,
        "FLOAT4": TokenType.FLOAT,
        "FLOAT8": TokenType.DOUBLE,
        "DOUBLE": TokenType.DOUBLE,
        "DOUBLE PRECISION": TokenType.DOUBLE,
        "JSON": TokenType.JSON,
        "CHAR": TokenType.CHAR,
        "CHARACTER": TokenType.CHAR,
        "NCHAR": TokenType.NCHAR,
        "VARCHAR": TokenType.VARCHAR,
        "VARCHAR2": TokenType.VARCHAR,
        "NVARCHAR": TokenType.NVARCHAR,
        "NVARCHAR2": TokenType.NVARCHAR,
        "STR": TokenType.TEXT,
        "STRING": TokenType.TEXT,
        "TEXT": TokenType.TEXT,
        "CLOB": TokenType.TEXT,
        "LONGVARCHAR": TokenType.TEXT,
        "BINARY": TokenType.BINARY,
        "BLOB": TokenType.VARBINARY,
        "BYTEA": TokenType.VARBINARY,
        "VARBINARY": TokenType.VARBINARY,
        "TIME": TokenType.TIME,
        "TIMETZ": TokenType.TIMETZ,
        "TIMESTAMP": TokenType.TIMESTAMP,
        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
        "DATE": TokenType.DATE,
        "DATETIME": TokenType.DATETIME,
        "INT4RANGE": TokenType.INT4RANGE,
        "INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
        "INT8RANGE": TokenType.INT8RANGE,
        "INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
        "NUMRANGE": TokenType.NUMRANGE,
        "NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
        "TSRANGE": TokenType.TSRANGE,
        "TSMULTIRANGE": TokenType.TSMULTIRANGE,
        "TSTZRANGE": TokenType.TSTZRANGE,
        "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
        "DATERANGE": TokenType.DATERANGE,
        "DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
        "UNIQUE": TokenType.UNIQUE,
        "STRUCT": TokenType.STRUCT,
        "VARIANT": TokenType.VARIANT,
        "ALTER": TokenType.ALTER,
        "ANALYZE": TokenType.COMMAND,
        "CALL": TokenType.COMMAND,
        "COMMENT": TokenType.COMMENT,
        "COPY": TokenType.COMMAND,
        "EXPLAIN": TokenType.COMMAND,
        "GRANT": TokenType.COMMAND,
        "OPTIMIZE": TokenType.COMMAND,
        "PREPARE": TokenType.COMMAND,
        "TRUNCATE": TokenType.COMMAND,
        "VACUUM": TokenType.COMMAND,
        "USER-DEFINED": TokenType.USERDEFINED,
    }

    WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = {
        " ": TokenType.SPACE,
        "\t": TokenType.SPACE,
        "\n": TokenType.BREAK,
        "\r": TokenType.BREAK,
        "\r\n": TokenType.BREAK,
    }

    COMMANDS = {
        TokenType.COMMAND,
        TokenType.EXECUTE,
        TokenType.FETCH,
        TokenType.SHOW,
    }

    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}

    # handle numeric literals like in hive (3L = BIGINT)
    NUMERIC_LITERALS: t.Dict[str, str] = {}
    ENCODE: t.Optional[str] = None

    COMMENTS = ["--", ("/*", "*/")]

    __slots__ = (
        "sql",
        "size",
        "tokens",
        "_start",
        "_current",
        "_line",
        "_col",
        "_comments",
        "_char",
        "_end",
        "_peek",
        "_prev_token_line",
    )

    def __init__(self) -> None:
        self.reset()

    def reset(self) -> None:
        self.sql = ""
        self.size = 0
        self.tokens: t.List[Token] = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 0
        self._comments: t.List[str] = []

        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1

    def tokenize(self, sql: str) -> t.List[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`."""
        self.reset()
        self.sql = sql
        self.size = len(sql)

        try:
            self._scan()
        except Exception as e:
            start = max(self._current - 50, 0)
            end = min(self._current + 50, self.size - 1)
            context = self.sql[start:end]
            raise TokenError(f"Error tokenizing '{context}'") from e

        return self.tokens

    def _scan(self, until: t.Optional[t.Callable] = None) -> None:
        while self.size and not self._end:
            self._start = self._current
            self._advance()

            if self._char is None:
                break

            if self._char not in self.WHITE_SPACE:
                if self._char.isdigit():
                    self._scan_number()
                elif self._char in self._IDENTIFIERS:
                    self._scan_identifier(self._IDENTIFIERS[self._char])
                else:
                    self._scan_keywords()

            if until and until():
                break

        if self.tokens and self._comments:
            self.tokens[-1].comments.extend(self._comments)

    def _chars(self, size: int) -> str:
        if size == 1:
            return self._char

        start = self._current - 1
        end = start + size

        return self.sql[start:end] if end <= self.size else ""

    def _advance(self, i: int = 1, alnum: bool = False) -> None:
        if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
            self._col = 1
            self._line += 1
        else:
            self._col += i

        self._current += i
        self._end = self._current >= self.size
        self._char = self.sql[self._current - 1]
        self._peek = "" if self._end else self.sql[self._current]

        if alnum and self._char.isalnum():
            # Here we use local variables instead of attributes for better performance
            _col = self._col
            _current = self._current
            _end = self._end
            _peek = self._peek

            while _peek.isalnum():
                _col += 1
                _current += 1
                _end = _current >= self.size
                _peek = "" if _end else self.sql[_current]

            self._col = _col
            self._current = _current
            self._end = _end
            self._peek = _peek
            self._char = self.sql[_current - 1]

    @property
    def _text(self) -> str:
        return self.sql[self._start : self._current]

    def peek(self, i: int = 0) -> str:
        i = self._current + i
        if i < self.size:
            return self.sql[i]
        return ""

    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
        self._prev_token_line = self._line

        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []

        self.tokens.append(
            Token(
                token_type,
                text=self._text if text is None else text,
                line=self._line,
                col=self._col,
                start=self._start,
                end=self._current - 1,
                comments=self._comments,
            )
        )
        self._comments = []

        # If we have either a semicolon or a begin token before the command's token, we'll parse
        # whatever follows the command's token as a string
        if (
            token_type in self.COMMANDS
            and self._peek != ";"
            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS)
        ):
            start = self._current
            tokens = len(self.tokens)
            self._scan(lambda: self._peek == ";")
            self.tokens = self.tokens[:tokens]
            text = self.sql[start : self._current].strip()
            if text:
                self._add(TokenType.STRING, text)

    def _scan_keywords(self) -> None:
        size = 0
        word = None
        chars = self._text
        char = chars
        prev_space = False
        skip = False
        trie = self._KEYWORD_TRIE
        single_token = char in self.SINGLE_TOKENS

        while chars:
            if skip:
                result = TrieResult.PREFIX
            else:
                result, trie = in_trie(trie, char.upper())

            if result == TrieResult.FAILED:
                break
            if result == TrieResult.EXISTS:
                word = chars

            size += 1
            end = self._current - 1 + size

            if end < self.size:
                char = self.sql[end]
                single_token = single_token or char in self.SINGLE_TOKENS
                is_space = char in self.WHITE_SPACE

                if not is_space or not prev_space:
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    skip = True
            else:
                char = ""
                chars = " "

        if not word:
            if self._char in self.SINGLE_TOKENS:
                self._add(self.SINGLE_TOKENS[self._char], text=self._char)
                return
            self._scan_var()
            return

        if self._scan_string(word):
            return
        if self._scan_comment(word):
            return

        self._advance(size - 1)
        word = word.upper()
        self._add(self.KEYWORDS[word], text=word)

    def _scan_comment(self, comment_start: str) -> bool:
        if comment_start not in self._COMMENTS:
            return False

        comment_start_line = self._line
        comment_start_size = len(comment_start)
        comment_end = self._COMMENTS[comment_start]

        if comment_end:
            # Skip the comment's start delimiter
            self._advance(comment_start_size)

            comment_end_size = len(comment_end)
            while not self._end and self._chars(comment_end_size) != comment_end:
                self._advance(alnum=True)

            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
            self._advance(comment_end_size - 1)
        else:
            while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK:
                self._advance(alnum=True)
            self._comments.append(self._text[comment_start_size:])

        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
        # Multiple consecutive comments are preserved by appending them to the current comments list.
        if comment_start_line == self._prev_token_line:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []
            self._prev_token_line = self._line

        return True

    def _scan_number(self) -> None:
        if self._char == "0":
            peek = self._peek.upper()
            if peek == "B":
                return self._scan_bits() if self.BIT_STRINGS else self._add(TokenType.NUMBER)
            elif peek == "X":
                return self._scan_hex() if self.HEX_STRINGS else self._add(TokenType.NUMBER)

        decimal = False
        scientific = 0

        while True:
            if self._peek.isdigit():
                self._advance()
            elif self._peek == "." and not decimal:
                after = self.peek(1)
                if after.isdigit() or not after.isalpha():
                    decimal = True
                    self._advance()
                else:
                    return self._add(TokenType.VAR)
            elif self._peek in ("-", "+") and scientific == 1:
                scientific += 1
                self._advance()
            elif self._peek.upper() == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek.isidentifier():
                number_text = self._text
                literal = ""

                while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:
                    literal += self._peek.upper()
                    self._advance()

                token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal, ""))

                if token_type:
                    self._add(TokenType.NUMBER, number_text)
                    self._add(TokenType.DCOLON, "::")
                    return self._add(token_type, literal)
                elif self.IDENTIFIERS_CAN_START_WITH_DIGIT:
                    return self._add(TokenType.VAR)

                self._add(TokenType.NUMBER, number_text)
                return self._advance(-len(literal))
            else:
                return self._add(TokenType.NUMBER)

    def _scan_bits(self) -> None:
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
            int(value, 2)
            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _scan_hex(self) -> None:
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
            int(value, 16)
            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _extract_value(self) -> str:
        while True:
            char = self._peek.strip()
            if char and char not in self.SINGLE_TOKENS:
                self._advance(alnum=True)
            else:
                break

        return self._text

    def _scan_string(self, start: str) -> bool:
        base = None
        token_type = TokenType.STRING

        if start in self._QUOTES:
            end = self._QUOTES[start]
        elif start in self._FORMAT_STRINGS:
            end, token_type = self._FORMAT_STRINGS[start]

            if token_type == TokenType.HEX_STRING:
                base = 16
            elif token_type == TokenType.BIT_STRING:
                base = 2
        else:
            return False

        self._advance(len(start))
        text = self._extract_string(end)

        if base:
            try:
                int(text, base)
            except:
                raise TokenError(
                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
                )
        else:
            text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text

        self._add(token_type, text)
        return True

    def _scan_identifier(self, identifier_end: str) -> None:
        self._advance()
        text = self._extract_string(identifier_end, self._IDENTIFIER_ESCAPES)
        self._add(TokenType.IDENTIFIER, text)

    def _scan_var(self) -> None:
        while True:
            char = self._peek.strip()
            if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
                self._advance(alnum=True)
            else:
                break

        self._add(
            TokenType.VAR
            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
            else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
        )

    def _extract_string(self, delimiter: str, escapes=None) -> str:
        text = ""
        delim_size = len(delimiter)
        escapes = self._STRING_ESCAPES if escapes is None else escapes

        while True:
            if self._char in escapes and (self._peek == delimiter or self._peek in escapes):
                if self._peek == delimiter:
                    text += self._peek
                else:
                    text += self._char + self._peek

                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")

                current = self._current - 1
                self._advance(alnum=True)
                text += self.sql[current : self._current - 1]

        return text
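A quick usage sketch (not part of the module source): tokenize a statement with the base Tokenizer and inspect the resulting Token stream. The behavior shown in the comments assumes the default, dialect-agnostic Tokenizer defined above.

    from sqlglot.tokens import Tokenizer, TokenType

    tokens = Tokenizer().tokenize("SELECT a + 1 AS b -- trailing note")

    # SELECT arrives as TokenType.SELECT, a and b as TokenType.VAR, 1 as
    # TokenType.NUMBER; the line comment never becomes a token of its own and
    # is instead attached by _scan_comment to the preceding token's comments.
    assert tokens[0].token_type == TokenType.SELECT
    assert tokens[-1].comments == [" trailing note"]

    # Multi-word keywords are matched through the keyword trie, so GROUP BY
    # is emitted as a single TokenType.GROUP_BY token.
    grouped = Tokenizer().tokenize("SELECT a FROM t GROUP BY a")
    assert any(tok.token_type == TokenType.GROUP_BY for tok in grouped)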
class
TokenType(AutoName):
An enumeration of the token types the tokenizer can emit; see the class body in the module source above.
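Because TokenType builds on sqlglot.helper.AutoName, each member's value is simply its own name, which keeps reprs and serialized output readable. A minimal illustration:

    from sqlglot.tokens import TokenType

    assert TokenType.SELECT.value == "SELECT"
    assert TokenType.SELECT is TokenType["SELECT"]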
class
Token:
Token(token_type: sqlglot.tokens.TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: List[str] = [])
Token initializer.
Arguments:
- token_type: The TokenType Enum.
- text: The text of the token.
- line: The line that the token ends on.
- col: The column that the token ends on.
- start: The start index of the token.
- end: The ending index of the token.
- comments: The comments to attach to the token.
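To make these fields concrete, here is a small sketch of the positions the tokenizer records: start and end are inclusive offsets into the original string, while line and col point at the token's last character.

    from sqlglot.tokens import Tokenizer

    sql = "SELECT 1"
    token = Tokenizer().tokenize(sql)[0]

    assert sql[token.start : token.end + 1] == "SELECT"  # start=0, end=5
    assert (token.line, token.col) == (1, 6)             # col of the final "T"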
@classmethod
number(cls, number: int) -> Token
Returns a NUMBER token with number as its text.
@classmethod
string(cls, string: str) -> Token
Returns a STRING token with string as its text.
@classmethod
identifier(cls, identifier: str) -> Token
Returns an IDENTIFIER token with identifier as its text.
@classmethod
var(cls, var: str) -> Token
Returns a VAR token with var as its text.
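These factories are conveniences for building or rewriting a token stream by hand rather than tokenizing text; a brief sketch:

    from sqlglot.tokens import Token, TokenType

    num = Token.number(42)  # equivalent to Token(TokenType.NUMBER, "42")
    ident = Token.identifier("my_table")

    assert num.token_type is TokenType.NUMBER and num.text == "42"
    assert ident.token_type is TokenType.IDENTIFIER and ident.text == "my_table"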
class
Tokenizer:
str] = {} 764 ENCODE: t.Optional[str] = None 765 766 COMMENTS = ["--", ("/*", "*/")] 767 768 __slots__ = ( 769 "sql", 770 "size", 771 "tokens", 772 "_start", 773 "_current", 774 "_line", 775 "_col", 776 "_comments", 777 "_char", 778 "_end", 779 "_peek", 780 "_prev_token_line", 781 ) 782 783 def __init__(self) -> None: 784 self.reset() 785 786 def reset(self) -> None: 787 self.sql = "" 788 self.size = 0 789 self.tokens: t.List[Token] = [] 790 self._start = 0 791 self._current = 0 792 self._line = 1 793 self._col = 0 794 self._comments: t.List[str] = [] 795 796 self._char = "" 797 self._end = False 798 self._peek = "" 799 self._prev_token_line = -1 800 801 def tokenize(self, sql: str) -> t.List[Token]: 802 """Returns a list of tokens corresponding to the SQL string `sql`.""" 803 self.reset() 804 self.sql = sql 805 self.size = len(sql) 806 807 try: 808 self._scan() 809 except Exception as e: 810 start = max(self._current - 50, 0) 811 end = min(self._current + 50, self.size - 1) 812 context = self.sql[start:end] 813 raise TokenError(f"Error tokenizing '{context}'") from e 814 815 return self.tokens 816 817 def _scan(self, until: t.Optional[t.Callable] = None) -> None: 818 while self.size and not self._end: 819 self._start = self._current 820 self._advance() 821 822 if self._char is None: 823 break 824 825 if self._char not in self.WHITE_SPACE: 826 if self._char.isdigit(): 827 self._scan_number() 828 elif self._char in self._IDENTIFIERS: 829 self._scan_identifier(self._IDENTIFIERS[self._char]) 830 else: 831 self._scan_keywords() 832 833 if until and until(): 834 break 835 836 if self.tokens and self._comments: 837 self.tokens[-1].comments.extend(self._comments) 838 839 def _chars(self, size: int) -> str: 840 if size == 1: 841 return self._char 842 843 start = self._current - 1 844 end = start + size 845 846 return self.sql[start:end] if end <= self.size else "" 847 848 def _advance(self, i: int = 1, alnum: bool = False) -> None: 849 if self.WHITE_SPACE.get(self._char) is TokenType.BREAK: 850 self._col = 1 851 self._line += 1 852 else: 853 self._col += i 854 855 self._current += i 856 self._end = self._current >= self.size 857 self._char = self.sql[self._current - 1] 858 self._peek = "" if self._end else self.sql[self._current] 859 860 if alnum and self._char.isalnum(): 861 # Here we use local variables instead of attributes for better performance 862 _col = self._col 863 _current = self._current 864 _end = self._end 865 _peek = self._peek 866 867 while _peek.isalnum(): 868 _col += 1 869 _current += 1 870 _end = _current >= self.size 871 _peek = "" if _end else self.sql[_current] 872 873 self._col = _col 874 self._current = _current 875 self._end = _end 876 self._peek = _peek 877 self._char = self.sql[_current - 1] 878 879 @property 880 def _text(self) -> str: 881 return self.sql[self._start : self._current] 882 883 def peek(self, i: int = 0) -> str: 884 i = self._current + i 885 if i < self.size: 886 return self.sql[i] 887 return "" 888 889 def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None: 890 self._prev_token_line = self._line 891 892 if self._comments and token_type == TokenType.SEMICOLON and self.tokens: 893 self.tokens[-1].comments.extend(self._comments) 894 self._comments = [] 895 896 self.tokens.append( 897 Token( 898 token_type, 899 text=self._text if text is None else text, 900 line=self._line, 901 col=self._col, 902 start=self._start, 903 end=self._current - 1, 904 comments=self._comments, 905 ) 906 ) 907 self._comments = [] 908 909 # If we have either a 
semicolon or a begin token before the command's token, we'll parse 910 # whatever follows the command's token as a string 911 if ( 912 token_type in self.COMMANDS 913 and self._peek != ";" 914 and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS) 915 ): 916 start = self._current 917 tokens = len(self.tokens) 918 self._scan(lambda: self._peek == ";") 919 self.tokens = self.tokens[:tokens] 920 text = self.sql[start : self._current].strip() 921 if text: 922 self._add(TokenType.STRING, text) 923 924 def _scan_keywords(self) -> None: 925 size = 0 926 word = None 927 chars = self._text 928 char = chars 929 prev_space = False 930 skip = False 931 trie = self._KEYWORD_TRIE 932 single_token = char in self.SINGLE_TOKENS 933 934 while chars: 935 if skip: 936 result = TrieResult.PREFIX 937 else: 938 result, trie = in_trie(trie, char.upper()) 939 940 if result == TrieResult.FAILED: 941 break 942 if result == TrieResult.EXISTS: 943 word = chars 944 945 size += 1 946 end = self._current - 1 + size 947 948 if end < self.size: 949 char = self.sql[end] 950 single_token = single_token or char in self.SINGLE_TOKENS 951 is_space = char in self.WHITE_SPACE 952 953 if not is_space or not prev_space: 954 if is_space: 955 char = " " 956 chars += char 957 prev_space = is_space 958 skip = False 959 else: 960 skip = True 961 else: 962 char = "" 963 chars = " " 964 965 if not word: 966 if self._char in self.SINGLE_TOKENS: 967 self._add(self.SINGLE_TOKENS[self._char], text=self._char) 968 return 969 self._scan_var() 970 return 971 972 if self._scan_string(word): 973 return 974 if self._scan_comment(word): 975 return 976 977 self._advance(size - 1) 978 word = word.upper() 979 self._add(self.KEYWORDS[word], text=word) 980 981 def _scan_comment(self, comment_start: str) -> bool: 982 if comment_start not in self._COMMENTS: 983 return False 984 985 comment_start_line = self._line 986 comment_start_size = len(comment_start) 987 comment_end = self._COMMENTS[comment_start] 988 989 if comment_end: 990 # Skip the comment's start delimiter 991 self._advance(comment_start_size) 992 993 comment_end_size = len(comment_end) 994 while not self._end and self._chars(comment_end_size) != comment_end: 995 self._advance(alnum=True) 996 997 self._comments.append(self._text[comment_start_size : -comment_end_size + 1]) 998 self._advance(comment_end_size - 1) 999 else: 1000 while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK: 1001 self._advance(alnum=True) 1002 self._comments.append(self._text[comment_start_size:]) 1003 1004 # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding. 1005 # Multiple consecutive comments are preserved by appending them to the current comments list. 1006 if comment_start_line == self._prev_token_line: 1007 self.tokens[-1].comments.extend(self._comments) 1008 self._comments = [] 1009 self._prev_token_line = self._line 1010 1011 return True 1012 1013 def _scan_number(self) -> None: 1014 if self._char == "0": 1015 peek = self._peek.upper() 1016 if peek == "B": 1017 return self._scan_bits() if self.BIT_STRINGS else self._add(TokenType.NUMBER) 1018 elif peek == "X": 1019 return self._scan_hex() if self.HEX_STRINGS else self._add(TokenType.NUMBER) 1020 1021 decimal = False 1022 scientific = 0 1023 1024 while True: 1025 if self._peek.isdigit(): 1026 self._advance() 1027 elif self._peek == "." 
and not decimal: 1028 after = self.peek(1) 1029 if after.isdigit() or not after.isalpha(): 1030 decimal = True 1031 self._advance() 1032 else: 1033 return self._add(TokenType.VAR) 1034 elif self._peek in ("-", "+") and scientific == 1: 1035 scientific += 1 1036 self._advance() 1037 elif self._peek.upper() == "E" and not scientific: 1038 scientific += 1 1039 self._advance() 1040 elif self._peek.isidentifier(): 1041 number_text = self._text 1042 literal = "" 1043 1044 while self._peek.strip() and self._peek not in self.SINGLE_TOKENS: 1045 literal += self._peek.upper() 1046 self._advance() 1047 1048 token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal, "")) 1049 1050 if token_type: 1051 self._add(TokenType.NUMBER, number_text) 1052 self._add(TokenType.DCOLON, "::") 1053 return self._add(token_type, literal) 1054 elif self.IDENTIFIERS_CAN_START_WITH_DIGIT: 1055 return self._add(TokenType.VAR) 1056 1057 self._add(TokenType.NUMBER, number_text) 1058 return self._advance(-len(literal)) 1059 else: 1060 return self._add(TokenType.NUMBER) 1061 1062 def _scan_bits(self) -> None: 1063 self._advance() 1064 value = self._extract_value() 1065 try: 1066 # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier 1067 int(value, 2) 1068 self._add(TokenType.BIT_STRING, value[2:]) # Drop the 0b 1069 except ValueError: 1070 self._add(TokenType.IDENTIFIER) 1071 1072 def _scan_hex(self) -> None: 1073 self._advance() 1074 value = self._extract_value() 1075 try: 1076 # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier 1077 int(value, 16) 1078 self._add(TokenType.HEX_STRING, value[2:]) # Drop the 0x 1079 except ValueError: 1080 self._add(TokenType.IDENTIFIER) 1081 1082 def _extract_value(self) -> str: 1083 while True: 1084 char = self._peek.strip() 1085 if char and char not in self.SINGLE_TOKENS: 1086 self._advance(alnum=True) 1087 else: 1088 break 1089 1090 return self._text 1091 1092 def _scan_string(self, start: str) -> bool: 1093 base = None 1094 token_type = TokenType.STRING 1095 1096 if start in self._QUOTES: 1097 end = self._QUOTES[start] 1098 elif start in self._FORMAT_STRINGS: 1099 end, token_type = self._FORMAT_STRINGS[start] 1100 1101 if token_type == TokenType.HEX_STRING: 1102 base = 16 1103 elif token_type == TokenType.BIT_STRING: 1104 base = 2 1105 else: 1106 return False 1107 1108 self._advance(len(start)) 1109 text = self._extract_string(end) 1110 1111 if base: 1112 try: 1113 int(text, base) 1114 except: 1115 raise TokenError( 1116 f"Numeric string contains invalid characters from {self._line}:{self._start}" 1117 ) 1118 else: 1119 text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text 1120 1121 self._add(token_type, text) 1122 return True 1123 1124 def _scan_identifier(self, identifier_end: str) -> None: 1125 self._advance() 1126 text = self._extract_string(identifier_end, self._IDENTIFIER_ESCAPES) 1127 self._add(TokenType.IDENTIFIER, text) 1128 1129 def _scan_var(self) -> None: 1130 while True: 1131 char = self._peek.strip() 1132 if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS): 1133 self._advance(alnum=True) 1134 else: 1135 break 1136 1137 self._add( 1138 TokenType.VAR 1139 if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER 1140 else self.KEYWORDS.get(self._text.upper(), TokenType.VAR) 1141 ) 1142 1143 def _extract_string(self, delimiter: str, escapes=None) -> str: 1144 text = "" 1145 delim_size = len(delimiter) 1146 escapes = self._STRING_ESCAPES if 
escapes is None else escapes 1147 1148 while True: 1149 if self._char in escapes and (self._peek == delimiter or self._peek in escapes): 1150 if self._peek == delimiter: 1151 text += self._peek 1152 else: 1153 text += self._char + self._peek 1154 1155 if self._current + 1 < self.size: 1156 self._advance(2) 1157 else: 1158 raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}") 1159 else: 1160 if self._chars(delim_size) == delimiter: 1161 if delim_size > 1: 1162 self._advance(delim_size - 1) 1163 break 1164 1165 if self._end: 1166 raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}") 1167 1168 current = self._current - 1 1169 self._advance(alnum=True) 1170 text += self.sql[current : self._current - 1] 1171 1172 return text
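A minimal usage sketch (illustrative, not part of the module source): tokenize a query with the default Tokenizer and inspect the Token fields that _add populates above (token_type, text, line, col).

from sqlglot.tokens import Tokenizer, TokenType

tokenizer = Tokenizer()
tokens = tokenizer.tokenize("SELECT a + 1 FROM t")

for token in tokens:
    # Each Token carries its type, raw text, and position metadata.
    print(token.token_type, repr(token.text), token.line, token.col)

# KEYWORDS drives classification: "SELECT" and "FROM" become
# TokenType.SELECT and TokenType.FROM, "a" and "t" fall through to
# TokenType.VAR, "+" is the single-character TokenType.PLUS, and "1"
# is scanned by _scan_number into TokenType.NUMBER.
assert tokens[0].token_type == TokenType.SELECT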
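Dialect tokenizers customize this class declaratively: the _Tokenizer metaclass rebuilds the autofilled tables (_QUOTES, _IDENTIFIERS, _KEYWORD_TRIE, and friends) when a subclass is created, so overriding the class-level attributes is enough. A sketch, assuming a hypothetical dialect that adds backtick identifiers and one extra keyword:

from sqlglot.tokens import Tokenizer, TokenType

class MyTokenizer(Tokenizer):
    # Hypothetical dialect: backtick-quoted identifiers alongside the
    # default double quotes, plus one extra keyword mapped to an
    # existing TokenType.
    IDENTIFIERS = ['"', "`"]
    KEYWORDS = {
        **Tokenizer.KEYWORDS,
        "MATERIALIZE": TokenType.COMMAND,
    }

tokens = MyTokenizer().tokenize("SELECT `col` FROM t")
# `col` is now handled by _scan_identifier instead of being broken up
# by the single-token table, so it comes back as one IDENTIFIER token.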
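The NUMERIC_LITERALS hook (the "3L = BIGINT" Hive convention noted in the source) works the same way: _scan_number splits a suffixed number into a NUMBER token, a synthetic "::", and a token of the mapped type whose text is the suffix. A sketch with a hypothetical Hive-like subclass:

from sqlglot.tokens import Tokenizer, TokenType

class HiveLikeTokenizer(Tokenizer):
    # Map a numeric suffix to a keyword that already exists in KEYWORDS,
    # which _scan_number resolves to a TokenType.
    NUMERIC_LITERALS = {"L": "BIGINT"}

tokens = HiveLikeTokenizer().tokenize("SELECT 3L")
# After the SELECT token, "3L" yields NUMBER ("3"), DCOLON ("::"),
# and BIGINT ("L").
types = [tok.token_type for tok in tokens[1:]]
assert types == [TokenType.NUMBER, TokenType.DCOLON, TokenType.BIGINT]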