sqlglot.tokens
1from __future__ import annotations 2 3import typing as t 4from enum import auto 5 6from sqlglot.helper import AutoName 7from sqlglot.trie import in_trie, new_trie 8 9 10class TokenType(AutoName): 11 L_PAREN = auto() 12 R_PAREN = auto() 13 L_BRACKET = auto() 14 R_BRACKET = auto() 15 L_BRACE = auto() 16 R_BRACE = auto() 17 COMMA = auto() 18 DOT = auto() 19 DASH = auto() 20 PLUS = auto() 21 COLON = auto() 22 DCOLON = auto() 23 SEMICOLON = auto() 24 STAR = auto() 25 BACKSLASH = auto() 26 SLASH = auto() 27 LT = auto() 28 LTE = auto() 29 GT = auto() 30 GTE = auto() 31 NOT = auto() 32 EQ = auto() 33 NEQ = auto() 34 NULLSAFE_EQ = auto() 35 AND = auto() 36 OR = auto() 37 AMP = auto() 38 DPIPE = auto() 39 PIPE = auto() 40 CARET = auto() 41 TILDA = auto() 42 ARROW = auto() 43 DARROW = auto() 44 FARROW = auto() 45 HASH = auto() 46 HASH_ARROW = auto() 47 DHASH_ARROW = auto() 48 LR_ARROW = auto() 49 LT_AT = auto() 50 AT_GT = auto() 51 DOLLAR = auto() 52 PARAMETER = auto() 53 SESSION_PARAMETER = auto() 54 NATIONAL = auto() 55 DAMP = auto() 56 57 BLOCK_START = auto() 58 BLOCK_END = auto() 59 60 SPACE = auto() 61 BREAK = auto() 62 63 STRING = auto() 64 NUMBER = auto() 65 IDENTIFIER = auto() 66 DATABASE = auto() 67 COLUMN = auto() 68 COLUMN_DEF = auto() 69 SCHEMA = auto() 70 TABLE = auto() 71 VAR = auto() 72 BIT_STRING = auto() 73 HEX_STRING = auto() 74 BYTE_STRING = auto() 75 76 # types 77 BIT = auto() 78 BOOLEAN = auto() 79 TINYINT = auto() 80 UTINYINT = auto() 81 SMALLINT = auto() 82 USMALLINT = auto() 83 INT = auto() 84 UINT = auto() 85 BIGINT = auto() 86 UBIGINT = auto() 87 INT128 = auto() 88 UINT128 = auto() 89 INT256 = auto() 90 UINT256 = auto() 91 FLOAT = auto() 92 DOUBLE = auto() 93 DECIMAL = auto() 94 BIGDECIMAL = auto() 95 CHAR = auto() 96 NCHAR = auto() 97 VARCHAR = auto() 98 NVARCHAR = auto() 99 TEXT = auto() 100 MEDIUMTEXT = auto() 101 LONGTEXT = auto() 102 MEDIUMBLOB = auto() 103 LONGBLOB = auto() 104 BINARY = auto() 105 VARBINARY = auto() 106 JSON = auto() 107 JSONB = auto() 108 TIME = auto() 109 TIMESTAMP = auto() 110 TIMESTAMPTZ = auto() 111 TIMESTAMPLTZ = auto() 112 DATETIME = auto() 113 DATETIME64 = auto() 114 DATE = auto() 115 UUID = auto() 116 GEOGRAPHY = auto() 117 NULLABLE = auto() 118 GEOMETRY = auto() 119 HLLSKETCH = auto() 120 HSTORE = auto() 121 SUPER = auto() 122 SERIAL = auto() 123 SMALLSERIAL = auto() 124 BIGSERIAL = auto() 125 XML = auto() 126 UNIQUEIDENTIFIER = auto() 127 MONEY = auto() 128 SMALLMONEY = auto() 129 ROWVERSION = auto() 130 IMAGE = auto() 131 VARIANT = auto() 132 OBJECT = auto() 133 INET = auto() 134 135 # keywords 136 ALIAS = auto() 137 ALTER = auto() 138 ALWAYS = auto() 139 ALL = auto() 140 ANTI = auto() 141 ANY = auto() 142 APPLY = auto() 143 ARRAY = auto() 144 ASC = auto() 145 ASOF = auto() 146 AUTO_INCREMENT = auto() 147 BEGIN = auto() 148 BETWEEN = auto() 149 CACHE = auto() 150 CASE = auto() 151 CHARACTER_SET = auto() 152 COLLATE = auto() 153 COMMAND = auto() 154 COMMENT = auto() 155 COMMIT = auto() 156 CONSTRAINT = auto() 157 CREATE = auto() 158 CROSS = auto() 159 CUBE = auto() 160 CURRENT_DATE = auto() 161 CURRENT_DATETIME = auto() 162 CURRENT_TIME = auto() 163 CURRENT_TIMESTAMP = auto() 164 CURRENT_USER = auto() 165 DEFAULT = auto() 166 DELETE = auto() 167 DESC = auto() 168 DESCRIBE = auto() 169 DISTINCT = auto() 170 DIV = auto() 171 DROP = auto() 172 ELSE = auto() 173 END = auto() 174 ESCAPE = auto() 175 EXCEPT = auto() 176 EXECUTE = auto() 177 EXISTS = auto() 178 FALSE = auto() 179 FETCH = auto() 180 FILTER = auto() 181 FINAL = auto() 182 FIRST = 
auto() 183 FOR = auto() 184 FOREIGN_KEY = auto() 185 FORMAT = auto() 186 FROM = auto() 187 FULL = auto() 188 FUNCTION = auto() 189 GLOB = auto() 190 GLOBAL = auto() 191 GROUP_BY = auto() 192 GROUPING_SETS = auto() 193 HAVING = auto() 194 HINT = auto() 195 IF = auto() 196 ILIKE = auto() 197 ILIKE_ANY = auto() 198 IN = auto() 199 INDEX = auto() 200 INNER = auto() 201 INSERT = auto() 202 INTERSECT = auto() 203 INTERVAL = auto() 204 INTO = auto() 205 INTRODUCER = auto() 206 IRLIKE = auto() 207 IS = auto() 208 ISNULL = auto() 209 JOIN = auto() 210 JOIN_MARKER = auto() 211 KEEP = auto() 212 LANGUAGE = auto() 213 LATERAL = auto() 214 LEFT = auto() 215 LIKE = auto() 216 LIKE_ANY = auto() 217 LIMIT = auto() 218 LOAD = auto() 219 LOCK = auto() 220 MAP = auto() 221 MATCH_RECOGNIZE = auto() 222 MERGE = auto() 223 MOD = auto() 224 NATURAL = auto() 225 NEXT = auto() 226 NEXT_VALUE_FOR = auto() 227 NOTNULL = auto() 228 NULL = auto() 229 OFFSET = auto() 230 ON = auto() 231 ORDER_BY = auto() 232 ORDERED = auto() 233 ORDINALITY = auto() 234 OUTER = auto() 235 OVER = auto() 236 OVERLAPS = auto() 237 OVERWRITE = auto() 238 PARTITION = auto() 239 PARTITION_BY = auto() 240 PERCENT = auto() 241 PIVOT = auto() 242 PLACEHOLDER = auto() 243 PRAGMA = auto() 244 PRIMARY_KEY = auto() 245 PROCEDURE = auto() 246 PROPERTIES = auto() 247 PSEUDO_TYPE = auto() 248 QUALIFY = auto() 249 QUOTE = auto() 250 RANGE = auto() 251 RECURSIVE = auto() 252 REPLACE = auto() 253 RETURNING = auto() 254 REFERENCES = auto() 255 RIGHT = auto() 256 RLIKE = auto() 257 ROLLBACK = auto() 258 ROLLUP = auto() 259 ROW = auto() 260 ROWS = auto() 261 SELECT = auto() 262 SEMI = auto() 263 SEPARATOR = auto() 264 SERDE_PROPERTIES = auto() 265 SET = auto() 266 SETTINGS = auto() 267 SHOW = auto() 268 SIMILAR_TO = auto() 269 SOME = auto() 270 STRUCT = auto() 271 TABLE_SAMPLE = auto() 272 TEMPORARY = auto() 273 TOP = auto() 274 THEN = auto() 275 TRUE = auto() 276 UNCACHE = auto() 277 UNION = auto() 278 UNNEST = auto() 279 UNPIVOT = auto() 280 UPDATE = auto() 281 USE = auto() 282 USING = auto() 283 VALUES = auto() 284 VIEW = auto() 285 VOLATILE = auto() 286 WHEN = auto() 287 WHERE = auto() 288 WINDOW = auto() 289 WITH = auto() 290 UNIQUE = auto() 291 292 293class Token: 294 __slots__ = ("token_type", "text", "line", "col", "start", "end", "comments") 295 296 @classmethod 297 def number(cls, number: int) -> Token: 298 """Returns a NUMBER token with `number` as its text.""" 299 return cls(TokenType.NUMBER, str(number)) 300 301 @classmethod 302 def string(cls, string: str) -> Token: 303 """Returns a STRING token with `string` as its text.""" 304 return cls(TokenType.STRING, string) 305 306 @classmethod 307 def identifier(cls, identifier: str) -> Token: 308 """Returns an IDENTIFIER token with `identifier` as its text.""" 309 return cls(TokenType.IDENTIFIER, identifier) 310 311 @classmethod 312 def var(cls, var: str) -> Token: 313 """Returns an VAR token with `var` as its text.""" 314 return cls(TokenType.VAR, var) 315 316 def __init__( 317 self, 318 token_type: TokenType, 319 text: str, 320 line: int = 1, 321 col: int = 1, 322 start: int = 0, 323 end: int = 0, 324 comments: t.List[str] = [], 325 ) -> None: 326 """Token initializer. 327 328 Args: 329 token_type: The TokenType Enum. 330 text: The text of the token. 331 line: The line that the token ends on. 332 col: The column that the token ends on. 333 start: The start index of the token. 334 end: The ending index of the token. 
335 """ 336 self.token_type = token_type 337 self.text = text 338 self.line = line 339 self.col = col 340 self.start = start 341 self.end = end 342 self.comments = comments 343 344 def __repr__(self) -> str: 345 attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__) 346 return f"<Token {attributes}>" 347 348 349class _Tokenizer(type): 350 def __new__(cls, clsname, bases, attrs): 351 klass = super().__new__(cls, clsname, bases, attrs) 352 353 klass._QUOTES = { 354 f"{prefix}{s}": e 355 for s, e in cls._delimeter_list_to_dict(klass.QUOTES).items() 356 for prefix in (("",) if s[0].isalpha() else ("", "n", "N")) 357 } 358 klass._BIT_STRINGS = cls._delimeter_list_to_dict(klass.BIT_STRINGS) 359 klass._HEX_STRINGS = cls._delimeter_list_to_dict(klass.HEX_STRINGS) 360 klass._BYTE_STRINGS = cls._delimeter_list_to_dict(klass.BYTE_STRINGS) 361 klass._IDENTIFIERS = cls._delimeter_list_to_dict(klass.IDENTIFIERS) 362 klass._STRING_ESCAPES = set(klass.STRING_ESCAPES) 363 klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES) 364 klass._COMMENTS = dict( 365 (comment, None) if isinstance(comment, str) else (comment[0], comment[1]) 366 for comment in klass.COMMENTS 367 ) 368 369 klass.KEYWORD_TRIE = new_trie( 370 key.upper() 371 for key in ( 372 *klass.KEYWORDS, 373 *klass._COMMENTS, 374 *klass._QUOTES, 375 *klass._BIT_STRINGS, 376 *klass._HEX_STRINGS, 377 *klass._BYTE_STRINGS, 378 ) 379 if " " in key or any(single in key for single in klass.SINGLE_TOKENS) 380 ) 381 382 return klass 383 384 @staticmethod 385 def _delimeter_list_to_dict(list: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]: 386 return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in list) 387 388 389class Tokenizer(metaclass=_Tokenizer): 390 SINGLE_TOKENS = { 391 "(": TokenType.L_PAREN, 392 ")": TokenType.R_PAREN, 393 "[": TokenType.L_BRACKET, 394 "]": TokenType.R_BRACKET, 395 "{": TokenType.L_BRACE, 396 "}": TokenType.R_BRACE, 397 "&": TokenType.AMP, 398 "^": TokenType.CARET, 399 ":": TokenType.COLON, 400 ",": TokenType.COMMA, 401 ".": TokenType.DOT, 402 "-": TokenType.DASH, 403 "=": TokenType.EQ, 404 ">": TokenType.GT, 405 "<": TokenType.LT, 406 "%": TokenType.MOD, 407 "!": TokenType.NOT, 408 "|": TokenType.PIPE, 409 "+": TokenType.PLUS, 410 ";": TokenType.SEMICOLON, 411 "/": TokenType.SLASH, 412 "\\": TokenType.BACKSLASH, 413 "*": TokenType.STAR, 414 "~": TokenType.TILDA, 415 "?": TokenType.PLACEHOLDER, 416 "@": TokenType.PARAMETER, 417 # used for breaking a var like x'y' but nothing else 418 # the token type doesn't matter 419 "'": TokenType.QUOTE, 420 "`": TokenType.IDENTIFIER, 421 '"': TokenType.IDENTIFIER, 422 "#": TokenType.HASH, 423 } 424 425 BIT_STRINGS: t.List[str | t.Tuple[str, str]] = [] 426 BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = [] 427 HEX_STRINGS: t.List[str | t.Tuple[str, str]] = [] 428 IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"'] 429 IDENTIFIER_ESCAPES = ['"'] 430 QUOTES: t.List[t.Tuple[str, str] | str] = ["'"] 431 STRING_ESCAPES = ["'"] 432 VAR_SINGLE_TOKENS: t.Set[str] = set() 433 434 _COMMENTS: t.Dict[str, str] = {} 435 _BIT_STRINGS: t.Dict[str, str] = {} 436 _BYTE_STRINGS: t.Dict[str, str] = {} 437 _HEX_STRINGS: t.Dict[str, str] = {} 438 _IDENTIFIERS: t.Dict[str, str] = {} 439 _IDENTIFIER_ESCAPES: t.Set[str] = set() 440 _QUOTES: t.Dict[str, str] = {} 441 _STRING_ESCAPES: t.Set[str] = set() 442 443 KEYWORDS: t.Dict[t.Optional[str], TokenType] = { 444 **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")}, 445 **{f"{prefix}%}}": 
TokenType.BLOCK_END for prefix in ("", "+", "-")}, 446 "{{+": TokenType.BLOCK_START, 447 "{{-": TokenType.BLOCK_START, 448 "+}}": TokenType.BLOCK_END, 449 "-}}": TokenType.BLOCK_END, 450 "/*+": TokenType.HINT, 451 "==": TokenType.EQ, 452 "::": TokenType.DCOLON, 453 "||": TokenType.DPIPE, 454 ">=": TokenType.GTE, 455 "<=": TokenType.LTE, 456 "<>": TokenType.NEQ, 457 "!=": TokenType.NEQ, 458 "<=>": TokenType.NULLSAFE_EQ, 459 "->": TokenType.ARROW, 460 "->>": TokenType.DARROW, 461 "=>": TokenType.FARROW, 462 "#>": TokenType.HASH_ARROW, 463 "#>>": TokenType.DHASH_ARROW, 464 "<->": TokenType.LR_ARROW, 465 "&&": TokenType.DAMP, 466 "ALL": TokenType.ALL, 467 "ALWAYS": TokenType.ALWAYS, 468 "AND": TokenType.AND, 469 "ANTI": TokenType.ANTI, 470 "ANY": TokenType.ANY, 471 "ASC": TokenType.ASC, 472 "AS": TokenType.ALIAS, 473 "AUTOINCREMENT": TokenType.AUTO_INCREMENT, 474 "AUTO_INCREMENT": TokenType.AUTO_INCREMENT, 475 "BEGIN": TokenType.BEGIN, 476 "BETWEEN": TokenType.BETWEEN, 477 "CACHE": TokenType.CACHE, 478 "UNCACHE": TokenType.UNCACHE, 479 "CASE": TokenType.CASE, 480 "CHARACTER SET": TokenType.CHARACTER_SET, 481 "COLLATE": TokenType.COLLATE, 482 "COLUMN": TokenType.COLUMN, 483 "COMMIT": TokenType.COMMIT, 484 "CONSTRAINT": TokenType.CONSTRAINT, 485 "CREATE": TokenType.CREATE, 486 "CROSS": TokenType.CROSS, 487 "CUBE": TokenType.CUBE, 488 "CURRENT_DATE": TokenType.CURRENT_DATE, 489 "CURRENT_TIME": TokenType.CURRENT_TIME, 490 "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP, 491 "CURRENT_USER": TokenType.CURRENT_USER, 492 "DATABASE": TokenType.DATABASE, 493 "DEFAULT": TokenType.DEFAULT, 494 "DELETE": TokenType.DELETE, 495 "DESC": TokenType.DESC, 496 "DESCRIBE": TokenType.DESCRIBE, 497 "DISTINCT": TokenType.DISTINCT, 498 "DIV": TokenType.DIV, 499 "DROP": TokenType.DROP, 500 "ELSE": TokenType.ELSE, 501 "END": TokenType.END, 502 "ESCAPE": TokenType.ESCAPE, 503 "EXCEPT": TokenType.EXCEPT, 504 "EXECUTE": TokenType.EXECUTE, 505 "EXISTS": TokenType.EXISTS, 506 "FALSE": TokenType.FALSE, 507 "FETCH": TokenType.FETCH, 508 "FILTER": TokenType.FILTER, 509 "FIRST": TokenType.FIRST, 510 "FULL": TokenType.FULL, 511 "FUNCTION": TokenType.FUNCTION, 512 "FOR": TokenType.FOR, 513 "FOREIGN KEY": TokenType.FOREIGN_KEY, 514 "FORMAT": TokenType.FORMAT, 515 "FROM": TokenType.FROM, 516 "GEOGRAPHY": TokenType.GEOGRAPHY, 517 "GEOMETRY": TokenType.GEOMETRY, 518 "GLOB": TokenType.GLOB, 519 "GROUP BY": TokenType.GROUP_BY, 520 "GROUPING SETS": TokenType.GROUPING_SETS, 521 "HAVING": TokenType.HAVING, 522 "IF": TokenType.IF, 523 "ILIKE": TokenType.ILIKE, 524 "IN": TokenType.IN, 525 "INDEX": TokenType.INDEX, 526 "INET": TokenType.INET, 527 "INNER": TokenType.INNER, 528 "INSERT": TokenType.INSERT, 529 "INTERVAL": TokenType.INTERVAL, 530 "INTERSECT": TokenType.INTERSECT, 531 "INTO": TokenType.INTO, 532 "IS": TokenType.IS, 533 "ISNULL": TokenType.ISNULL, 534 "JOIN": TokenType.JOIN, 535 "KEEP": TokenType.KEEP, 536 "LATERAL": TokenType.LATERAL, 537 "LEFT": TokenType.LEFT, 538 "LIKE": TokenType.LIKE, 539 "LIMIT": TokenType.LIMIT, 540 "LOAD": TokenType.LOAD, 541 "LOCK": TokenType.LOCK, 542 "MERGE": TokenType.MERGE, 543 "NATURAL": TokenType.NATURAL, 544 "NEXT": TokenType.NEXT, 545 "NEXT VALUE FOR": TokenType.NEXT_VALUE_FOR, 546 "NOT": TokenType.NOT, 547 "NOTNULL": TokenType.NOTNULL, 548 "NULL": TokenType.NULL, 549 "OBJECT": TokenType.OBJECT, 550 "OFFSET": TokenType.OFFSET, 551 "ON": TokenType.ON, 552 "OR": TokenType.OR, 553 "ORDER BY": TokenType.ORDER_BY, 554 "ORDINALITY": TokenType.ORDINALITY, 555 "OUTER": TokenType.OUTER, 556 "OVER": 
TokenType.OVER, 557 "OVERLAPS": TokenType.OVERLAPS, 558 "OVERWRITE": TokenType.OVERWRITE, 559 "PARTITION": TokenType.PARTITION, 560 "PARTITION BY": TokenType.PARTITION_BY, 561 "PARTITIONED BY": TokenType.PARTITION_BY, 562 "PARTITIONED_BY": TokenType.PARTITION_BY, 563 "PERCENT": TokenType.PERCENT, 564 "PIVOT": TokenType.PIVOT, 565 "PRAGMA": TokenType.PRAGMA, 566 "PRIMARY KEY": TokenType.PRIMARY_KEY, 567 "PROCEDURE": TokenType.PROCEDURE, 568 "QUALIFY": TokenType.QUALIFY, 569 "RANGE": TokenType.RANGE, 570 "RECURSIVE": TokenType.RECURSIVE, 571 "REGEXP": TokenType.RLIKE, 572 "REPLACE": TokenType.REPLACE, 573 "REFERENCES": TokenType.REFERENCES, 574 "RIGHT": TokenType.RIGHT, 575 "RLIKE": TokenType.RLIKE, 576 "ROLLBACK": TokenType.ROLLBACK, 577 "ROLLUP": TokenType.ROLLUP, 578 "ROW": TokenType.ROW, 579 "ROWS": TokenType.ROWS, 580 "SCHEMA": TokenType.SCHEMA, 581 "SELECT": TokenType.SELECT, 582 "SEMI": TokenType.SEMI, 583 "SET": TokenType.SET, 584 "SETTINGS": TokenType.SETTINGS, 585 "SHOW": TokenType.SHOW, 586 "SIMILAR TO": TokenType.SIMILAR_TO, 587 "SOME": TokenType.SOME, 588 "TABLE": TokenType.TABLE, 589 "TABLESAMPLE": TokenType.TABLE_SAMPLE, 590 "TEMP": TokenType.TEMPORARY, 591 "TEMPORARY": TokenType.TEMPORARY, 592 "THEN": TokenType.THEN, 593 "TRUE": TokenType.TRUE, 594 "UNION": TokenType.UNION, 595 "UNNEST": TokenType.UNNEST, 596 "UNPIVOT": TokenType.UNPIVOT, 597 "UPDATE": TokenType.UPDATE, 598 "USE": TokenType.USE, 599 "USING": TokenType.USING, 600 "UUID": TokenType.UUID, 601 "VALUES": TokenType.VALUES, 602 "VIEW": TokenType.VIEW, 603 "VOLATILE": TokenType.VOLATILE, 604 "WHEN": TokenType.WHEN, 605 "WHERE": TokenType.WHERE, 606 "WINDOW": TokenType.WINDOW, 607 "WITH": TokenType.WITH, 608 "APPLY": TokenType.APPLY, 609 "ARRAY": TokenType.ARRAY, 610 "BIT": TokenType.BIT, 611 "BOOL": TokenType.BOOLEAN, 612 "BOOLEAN": TokenType.BOOLEAN, 613 "BYTE": TokenType.TINYINT, 614 "TINYINT": TokenType.TINYINT, 615 "SHORT": TokenType.SMALLINT, 616 "SMALLINT": TokenType.SMALLINT, 617 "INT2": TokenType.SMALLINT, 618 "INTEGER": TokenType.INT, 619 "INT": TokenType.INT, 620 "INT4": TokenType.INT, 621 "LONG": TokenType.BIGINT, 622 "BIGINT": TokenType.BIGINT, 623 "INT8": TokenType.BIGINT, 624 "DEC": TokenType.DECIMAL, 625 "DECIMAL": TokenType.DECIMAL, 626 "BIGDECIMAL": TokenType.BIGDECIMAL, 627 "BIGNUMERIC": TokenType.BIGDECIMAL, 628 "MAP": TokenType.MAP, 629 "NULLABLE": TokenType.NULLABLE, 630 "NUMBER": TokenType.DECIMAL, 631 "NUMERIC": TokenType.DECIMAL, 632 "FIXED": TokenType.DECIMAL, 633 "REAL": TokenType.FLOAT, 634 "FLOAT": TokenType.FLOAT, 635 "FLOAT4": TokenType.FLOAT, 636 "FLOAT8": TokenType.DOUBLE, 637 "DOUBLE": TokenType.DOUBLE, 638 "DOUBLE PRECISION": TokenType.DOUBLE, 639 "JSON": TokenType.JSON, 640 "CHAR": TokenType.CHAR, 641 "CHARACTER": TokenType.CHAR, 642 "NCHAR": TokenType.NCHAR, 643 "VARCHAR": TokenType.VARCHAR, 644 "VARCHAR2": TokenType.VARCHAR, 645 "NVARCHAR": TokenType.NVARCHAR, 646 "NVARCHAR2": TokenType.NVARCHAR, 647 "STR": TokenType.TEXT, 648 "STRING": TokenType.TEXT, 649 "TEXT": TokenType.TEXT, 650 "CLOB": TokenType.TEXT, 651 "LONGVARCHAR": TokenType.TEXT, 652 "BINARY": TokenType.BINARY, 653 "BLOB": TokenType.VARBINARY, 654 "BYTEA": TokenType.VARBINARY, 655 "VARBINARY": TokenType.VARBINARY, 656 "TIME": TokenType.TIME, 657 "TIMESTAMP": TokenType.TIMESTAMP, 658 "TIMESTAMPTZ": TokenType.TIMESTAMPTZ, 659 "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ, 660 "DATE": TokenType.DATE, 661 "DATETIME": TokenType.DATETIME, 662 "UNIQUE": TokenType.UNIQUE, 663 "STRUCT": TokenType.STRUCT, 664 "VARIANT": 
TokenType.VARIANT, 665 "ALTER": TokenType.ALTER, 666 "ANALYZE": TokenType.COMMAND, 667 "CALL": TokenType.COMMAND, 668 "COMMENT": TokenType.COMMENT, 669 "COPY": TokenType.COMMAND, 670 "EXPLAIN": TokenType.COMMAND, 671 "GRANT": TokenType.COMMAND, 672 "OPTIMIZE": TokenType.COMMAND, 673 "PREPARE": TokenType.COMMAND, 674 "TRUNCATE": TokenType.COMMAND, 675 "VACUUM": TokenType.COMMAND, 676 } 677 678 WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = { 679 " ": TokenType.SPACE, 680 "\t": TokenType.SPACE, 681 "\n": TokenType.BREAK, 682 "\r": TokenType.BREAK, 683 "\r\n": TokenType.BREAK, 684 } 685 686 COMMANDS = { 687 TokenType.COMMAND, 688 TokenType.EXECUTE, 689 TokenType.FETCH, 690 TokenType.SHOW, 691 } 692 693 COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN} 694 695 # handle numeric literals like in hive (3L = BIGINT) 696 NUMERIC_LITERALS: t.Dict[str, str] = {} 697 ENCODE: t.Optional[str] = None 698 699 COMMENTS = ["--", ("/*", "*/"), ("{#", "#}")] 700 KEYWORD_TRIE: t.Dict = {} # autofilled 701 702 IDENTIFIER_CAN_START_WITH_DIGIT = False 703 704 __slots__ = ( 705 "sql", 706 "size", 707 "tokens", 708 "_start", 709 "_current", 710 "_line", 711 "_col", 712 "_comments", 713 "_char", 714 "_end", 715 "_peek", 716 "_prev_token_line", 717 ) 718 719 def __init__(self) -> None: 720 self.reset() 721 722 def reset(self) -> None: 723 self.sql = "" 724 self.size = 0 725 self.tokens: t.List[Token] = [] 726 self._start = 0 727 self._current = 0 728 self._line = 1 729 self._col = 0 730 self._comments: t.List[str] = [] 731 732 self._char = "" 733 self._end = False 734 self._peek = "" 735 self._prev_token_line = -1 736 737 def tokenize(self, sql: str) -> t.List[Token]: 738 """Returns a list of tokens corresponding to the SQL string `sql`.""" 739 self.reset() 740 self.sql = sql 741 self.size = len(sql) 742 743 try: 744 self._scan() 745 except Exception as e: 746 start = max(self._current - 50, 0) 747 end = min(self._current + 50, self.size - 1) 748 context = self.sql[start:end] 749 raise ValueError(f"Error tokenizing '{context}'") from e 750 751 return self.tokens 752 753 def _scan(self, until: t.Optional[t.Callable] = None) -> None: 754 while self.size and not self._end: 755 self._start = self._current 756 self._advance() 757 758 if self._char is None: 759 break 760 761 if self._char not in self.WHITE_SPACE: 762 if self._char.isdigit(): 763 self._scan_number() 764 elif self._char in self._IDENTIFIERS: 765 self._scan_identifier(self._IDENTIFIERS[self._char]) 766 else: 767 self._scan_keywords() 768 769 if until and until(): 770 break 771 772 if self.tokens and self._comments: 773 self.tokens[-1].comments.extend(self._comments) 774 775 def _chars(self, size: int) -> str: 776 if size == 1: 777 return self._char 778 779 start = self._current - 1 780 end = start + size 781 782 return self.sql[start:end] if end <= self.size else "" 783 784 def _advance(self, i: int = 1, alnum: bool = False) -> None: 785 if self.WHITE_SPACE.get(self._char) is TokenType.BREAK: 786 self._col = 1 787 self._line += 1 788 else: 789 self._col += i 790 791 self._current += i 792 self._end = self._current >= self.size 793 self._char = self.sql[self._current - 1] 794 self._peek = "" if self._end else self.sql[self._current] 795 796 if alnum and self._char.isalnum(): 797 # Here we use local variables instead of attributes for better performance 798 _col = self._col 799 _current = self._current 800 _end = self._end 801 _peek = self._peek 802 803 while _peek.isalnum(): 804 _col += 1 805 _current += 1 806 _end = _current >= self.size 807 
_peek = "" if _end else self.sql[_current] 808 809 self._col = _col 810 self._current = _current 811 self._end = _end 812 self._peek = _peek 813 self._char = self.sql[_current - 1] 814 815 @property 816 def _text(self) -> str: 817 return self.sql[self._start : self._current] 818 819 def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None: 820 self._prev_token_line = self._line 821 self.tokens.append( 822 Token( 823 token_type, 824 text=self._text if text is None else text, 825 line=self._line, 826 col=self._col, 827 start=self._start, 828 end=self._current - 1, 829 comments=self._comments, 830 ) 831 ) 832 self._comments = [] 833 834 # If we have either a semicolon or a begin token before the command's token, we'll parse 835 # whatever follows the command's token as a string 836 if ( 837 token_type in self.COMMANDS 838 and self._peek != ";" 839 and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS) 840 ): 841 start = self._current 842 tokens = len(self.tokens) 843 self._scan(lambda: self._peek == ";") 844 self.tokens = self.tokens[:tokens] 845 text = self.sql[start : self._current].strip() 846 if text: 847 self._add(TokenType.STRING, text) 848 849 def _scan_keywords(self) -> None: 850 size = 0 851 word = None 852 chars = self._text 853 char = chars 854 prev_space = False 855 skip = False 856 trie = self.KEYWORD_TRIE 857 single_token = char in self.SINGLE_TOKENS 858 859 while chars: 860 if skip: 861 result = 1 862 else: 863 result, trie = in_trie(trie, char.upper()) 864 865 if result == 0: 866 break 867 if result == 2: 868 word = chars 869 870 size += 1 871 end = self._current - 1 + size 872 873 if end < self.size: 874 char = self.sql[end] 875 single_token = single_token or char in self.SINGLE_TOKENS 876 is_space = char in self.WHITE_SPACE 877 878 if not is_space or not prev_space: 879 if is_space: 880 char = " " 881 chars += char 882 prev_space = is_space 883 skip = False 884 else: 885 skip = True 886 else: 887 char = "" 888 chars = " " 889 890 word = None if not single_token and chars[-1] not in self.WHITE_SPACE else word 891 892 if not word: 893 if self._char in self.SINGLE_TOKENS: 894 self._add(self.SINGLE_TOKENS[self._char], text=self._char) 895 return 896 self._scan_var() 897 return 898 899 if self._scan_string(word): 900 return 901 if self._scan_formatted_string(word): 902 return 903 if self._scan_comment(word): 904 return 905 906 self._advance(size - 1) 907 word = word.upper() 908 self._add(self.KEYWORDS[word], text=word) 909 910 def _scan_comment(self, comment_start: str) -> bool: 911 if comment_start not in self._COMMENTS: 912 return False 913 914 comment_start_line = self._line 915 comment_start_size = len(comment_start) 916 comment_end = self._COMMENTS[comment_start] 917 918 if comment_end: 919 # Skip the comment's start delimiter 920 self._advance(comment_start_size) 921 922 comment_end_size = len(comment_end) 923 while not self._end and self._chars(comment_end_size) != comment_end: 924 self._advance(alnum=True) 925 926 self._comments.append(self._text[comment_start_size : -comment_end_size + 1]) 927 self._advance(comment_end_size - 1) 928 else: 929 while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK: 930 self._advance(alnum=True) 931 self._comments.append(self._text[comment_start_size:]) 932 933 # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding. 934 # Multiple consecutive comments are preserved by appending them to the current comments list. 
935 if comment_start_line == self._prev_token_line: 936 self.tokens[-1].comments.extend(self._comments) 937 self._comments = [] 938 self._prev_token_line = self._line 939 940 return True 941 942 def _scan_number(self) -> None: 943 if self._char == "0": 944 peek = self._peek.upper() 945 if peek == "B": 946 return self._scan_bits() if self._BIT_STRINGS else self._add(TokenType.NUMBER) 947 elif peek == "X": 948 return self._scan_hex() if self._HEX_STRINGS else self._add(TokenType.NUMBER) 949 950 decimal = False 951 scientific = 0 952 953 while True: 954 if self._peek.isdigit(): 955 self._advance() 956 elif self._peek == "." and not decimal: 957 decimal = True 958 self._advance() 959 elif self._peek in ("-", "+") and scientific == 1: 960 scientific += 1 961 self._advance() 962 elif self._peek.upper() == "E" and not scientific: 963 scientific += 1 964 self._advance() 965 elif self._peek.isidentifier(): 966 number_text = self._text 967 literal = "" 968 969 while self._peek.strip() and self._peek not in self.SINGLE_TOKENS: 970 literal += self._peek.upper() 971 self._advance() 972 973 token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal)) 974 975 if token_type: 976 self._add(TokenType.NUMBER, number_text) 977 self._add(TokenType.DCOLON, "::") 978 return self._add(token_type, literal) 979 elif self.IDENTIFIER_CAN_START_WITH_DIGIT: 980 return self._add(TokenType.VAR) 981 982 self._add(TokenType.NUMBER, number_text) 983 return self._advance(-len(literal)) 984 else: 985 return self._add(TokenType.NUMBER) 986 987 def _scan_bits(self) -> None: 988 self._advance() 989 value = self._extract_value() 990 try: 991 # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier 992 int(value, 2) 993 self._add(TokenType.BIT_STRING, value[2:]) # Drop the 0b 994 except ValueError: 995 self._add(TokenType.IDENTIFIER) 996 997 def _scan_hex(self) -> None: 998 self._advance() 999 value = self._extract_value() 1000 try: 1001 # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier 1002 int(value, 16) 1003 self._add(TokenType.HEX_STRING, value[2:]) # Drop the 0x 1004 except ValueError: 1005 self._add(TokenType.IDENTIFIER) 1006 1007 def _extract_value(self) -> str: 1008 while True: 1009 char = self._peek.strip() 1010 if char and char not in self.SINGLE_TOKENS: 1011 self._advance(alnum=True) 1012 else: 1013 break 1014 1015 return self._text 1016 1017 def _scan_string(self, quote: str) -> bool: 1018 quote_end = self._QUOTES.get(quote) 1019 if quote_end is None: 1020 return False 1021 1022 self._advance(len(quote)) 1023 text = self._extract_string(quote_end) 1024 text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text 1025 self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text) 1026 return True 1027 1028 # X'1234', b'0110', E'\\\\\' etc. 
1029 def _scan_formatted_string(self, string_start: str) -> bool: 1030 if string_start in self._HEX_STRINGS: 1031 delimiters = self._HEX_STRINGS 1032 token_type = TokenType.HEX_STRING 1033 base = 16 1034 elif string_start in self._BIT_STRINGS: 1035 delimiters = self._BIT_STRINGS 1036 token_type = TokenType.BIT_STRING 1037 base = 2 1038 elif string_start in self._BYTE_STRINGS: 1039 delimiters = self._BYTE_STRINGS 1040 token_type = TokenType.BYTE_STRING 1041 base = None 1042 else: 1043 return False 1044 1045 self._advance(len(string_start)) 1046 string_end = delimiters[string_start] 1047 text = self._extract_string(string_end) 1048 1049 if base: 1050 try: 1051 int(text, base) 1052 except: 1053 raise RuntimeError( 1054 f"Numeric string contains invalid characters from {self._line}:{self._start}" 1055 ) 1056 1057 self._add(token_type, text) 1058 return True 1059 1060 def _scan_identifier(self, identifier_end: str) -> None: 1061 self._advance() 1062 text = self._extract_string(identifier_end, self._IDENTIFIER_ESCAPES) 1063 self._add(TokenType.IDENTIFIER, text) 1064 1065 def _scan_var(self) -> None: 1066 while True: 1067 char = self._peek.strip() 1068 if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS): 1069 self._advance(alnum=True) 1070 else: 1071 break 1072 1073 self._add( 1074 TokenType.VAR 1075 if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER 1076 else self.KEYWORDS.get(self._text.upper(), TokenType.VAR) 1077 ) 1078 1079 def _extract_string(self, delimiter: str, escapes=None) -> str: 1080 text = "" 1081 delim_size = len(delimiter) 1082 escapes = self._STRING_ESCAPES if escapes is None else escapes 1083 1084 while True: 1085 if self._char in escapes and (self._peek == delimiter or self._peek in escapes): 1086 if self._peek == delimiter: 1087 text += self._peek 1088 else: 1089 text += self._char + self._peek 1090 1091 if self._current + 1 < self.size: 1092 self._advance(2) 1093 else: 1094 raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._current}") 1095 else: 1096 if self._chars(delim_size) == delimiter: 1097 if delim_size > 1: 1098 self._advance(delim_size - 1) 1099 break 1100 1101 if self._end: 1102 raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}") 1103 1104 current = self._current - 1 1105 self._advance(alnum=True) 1106 text += self.sql[current : self._current - 1] 1107 1108 return text
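A couple of behaviors from the source above are worth seeing end to end: _scan_keywords walks KEYWORD_TRIE so that multi-word keywords such as "GROUP BY" become a single token, and _scan_comment attaches a trailing comment to the token that precedes it. A minimal sketch under the default Tokenizer settings; the printed value is an expectation, not guaranteed output:

from sqlglot.tokens import Tokenizer, TokenType

tokens = Tokenizer().tokenize("SELECT a FROM t GROUP BY a -- total per key")

# "GROUP BY" is matched through KEYWORD_TRIE as one GROUP_BY token,
# not as separate GROUP and BY tokens.
assert TokenType.GROUP_BY in {token.token_type for token in tokens}

# The trailing '--' comment is not emitted as a token; it is attached to the
# comments list of the preceding token.
print(tokens[-1].comments)  # expected: [' total per key']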
class
TokenType(sqlglot.helper.AutoName):
An enumeration of the token types produced by the tokenizer.
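Since TokenType is an AutoName enum, every member's value is its own name, so members round-trip cleanly through strings. A small illustrative sketch:

from sqlglot.tokens import TokenType

assert TokenType.SELECT.value == "SELECT"        # AutoName: value == member name
assert TokenType["GROUP_BY"] is TokenType.GROUP_BY

# Token types are typically compared by identity when filtering tokens.
ddl_starts = {TokenType.CREATE, TokenType.ALTER, TokenType.DROP}
assert TokenType.CREATE in ddl_starts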
class
Token:
Token( token_type: sqlglot.tokens.TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: List[str] = [])
Token initializer.
Arguments:
- token_type: The TokenType Enum.
- text: The text of the token.
- line: The line that the token ends on.
- col: The column that the token ends on.
- start: The start index of the token.
- end: The ending index of the token.
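A hedged sketch of building a Token by hand; in practice tokens are produced by Tokenizer.tokenize, so the position metadata below is purely illustrative:

from sqlglot.tokens import Token, TokenType

token = Token(
    TokenType.NUMBER,
    text="42",
    line=1,
    col=11,           # column the token ends on
    start=9,          # index of the token's first character in the SQL string
    end=10,           # index of the token's last character
    comments=["answer"],
)

print(token.token_type)  # TokenType.NUMBER
print(token)             # <Token token_type: TokenType.NUMBER, text: 42, line: 1, ...>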
@classmethod
def number(cls, number: int) -> Token:
    """Returns a NUMBER token with `number` as its text."""
    return cls(TokenType.NUMBER, str(number))
Returns a NUMBER token with `number` as its text.
@classmethod
def string(cls, string: str) -> Token:
    """Returns a STRING token with `string` as its text."""
    return cls(TokenType.STRING, string)
Returns a STRING token with `string` as its text.
@classmethod
def identifier(cls, identifier: str) -> Token:
    """Returns an IDENTIFIER token with `identifier` as its text."""
    return cls(TokenType.IDENTIFIER, identifier)
Returns an IDENTIFIER token with `identifier` as its text.
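The classmethods above are small conveniences for building tokens without supplying positions; a minimal sketch:

from sqlglot.tokens import Token, TokenType

num = Token.number(5)             # NUMBER token with text "5"
lit = Token.string("hello")       # STRING token with text "hello"
name = Token.identifier("users")  # IDENTIFIER token with text "users"
var = Token.var("x")              # VAR token with text "x"

assert num.token_type is TokenType.NUMBER and num.text == "5"
assert name.token_type is TokenType.IDENTIFIER and name.text == "users"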
class
Tokenizer:
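Tokenizer converts a SQL string into a flat list of Token objects via tokenize(); errors during scanning are re-raised as a ValueError that includes a snippet of the surrounding SQL. A minimal usage sketch; the token sequence in the comment is an expectation under the default settings, not guaranteed output:

from sqlglot.tokens import Tokenizer, TokenType

tokenizer = Tokenizer()
tokens = tokenizer.tokenize("SELECT 'a' AS x FROM t")

for token in tokens:
    print(token.token_type, repr(token.text), token.line, token.col)

# Expected types (illustrative):
#   SELECT, STRING ('a'), ALIAS ('AS'), VAR ('x'), FROM, VAR ('t')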
or char in self.SINGLE_TOKENS 877 is_space = char in self.WHITE_SPACE 878 879 if not is_space or not prev_space: 880 if is_space: 881 char = " " 882 chars += char 883 prev_space = is_space 884 skip = False 885 else: 886 skip = True 887 else: 888 char = "" 889 chars = " " 890 891 word = None if not single_token and chars[-1] not in self.WHITE_SPACE else word 892 893 if not word: 894 if self._char in self.SINGLE_TOKENS: 895 self._add(self.SINGLE_TOKENS[self._char], text=self._char) 896 return 897 self._scan_var() 898 return 899 900 if self._scan_string(word): 901 return 902 if self._scan_formatted_string(word): 903 return 904 if self._scan_comment(word): 905 return 906 907 self._advance(size - 1) 908 word = word.upper() 909 self._add(self.KEYWORDS[word], text=word) 910 911 def _scan_comment(self, comment_start: str) -> bool: 912 if comment_start not in self._COMMENTS: 913 return False 914 915 comment_start_line = self._line 916 comment_start_size = len(comment_start) 917 comment_end = self._COMMENTS[comment_start] 918 919 if comment_end: 920 # Skip the comment's start delimiter 921 self._advance(comment_start_size) 922 923 comment_end_size = len(comment_end) 924 while not self._end and self._chars(comment_end_size) != comment_end: 925 self._advance(alnum=True) 926 927 self._comments.append(self._text[comment_start_size : -comment_end_size + 1]) 928 self._advance(comment_end_size - 1) 929 else: 930 while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK: 931 self._advance(alnum=True) 932 self._comments.append(self._text[comment_start_size:]) 933 934 # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding. 935 # Multiple consecutive comments are preserved by appending them to the current comments list. 936 if comment_start_line == self._prev_token_line: 937 self.tokens[-1].comments.extend(self._comments) 938 self._comments = [] 939 self._prev_token_line = self._line 940 941 return True 942 943 def _scan_number(self) -> None: 944 if self._char == "0": 945 peek = self._peek.upper() 946 if peek == "B": 947 return self._scan_bits() if self._BIT_STRINGS else self._add(TokenType.NUMBER) 948 elif peek == "X": 949 return self._scan_hex() if self._HEX_STRINGS else self._add(TokenType.NUMBER) 950 951 decimal = False 952 scientific = 0 953 954 while True: 955 if self._peek.isdigit(): 956 self._advance() 957 elif self._peek == "." 
and not decimal: 958 decimal = True 959 self._advance() 960 elif self._peek in ("-", "+") and scientific == 1: 961 scientific += 1 962 self._advance() 963 elif self._peek.upper() == "E" and not scientific: 964 scientific += 1 965 self._advance() 966 elif self._peek.isidentifier(): 967 number_text = self._text 968 literal = "" 969 970 while self._peek.strip() and self._peek not in self.SINGLE_TOKENS: 971 literal += self._peek.upper() 972 self._advance() 973 974 token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal)) 975 976 if token_type: 977 self._add(TokenType.NUMBER, number_text) 978 self._add(TokenType.DCOLON, "::") 979 return self._add(token_type, literal) 980 elif self.IDENTIFIER_CAN_START_WITH_DIGIT: 981 return self._add(TokenType.VAR) 982 983 self._add(TokenType.NUMBER, number_text) 984 return self._advance(-len(literal)) 985 else: 986 return self._add(TokenType.NUMBER) 987 988 def _scan_bits(self) -> None: 989 self._advance() 990 value = self._extract_value() 991 try: 992 # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier 993 int(value, 2) 994 self._add(TokenType.BIT_STRING, value[2:]) # Drop the 0b 995 except ValueError: 996 self._add(TokenType.IDENTIFIER) 997 998 def _scan_hex(self) -> None: 999 self._advance() 1000 value = self._extract_value() 1001 try: 1002 # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier 1003 int(value, 16) 1004 self._add(TokenType.HEX_STRING, value[2:]) # Drop the 0x 1005 except ValueError: 1006 self._add(TokenType.IDENTIFIER) 1007 1008 def _extract_value(self) -> str: 1009 while True: 1010 char = self._peek.strip() 1011 if char and char not in self.SINGLE_TOKENS: 1012 self._advance(alnum=True) 1013 else: 1014 break 1015 1016 return self._text 1017 1018 def _scan_string(self, quote: str) -> bool: 1019 quote_end = self._QUOTES.get(quote) 1020 if quote_end is None: 1021 return False 1022 1023 self._advance(len(quote)) 1024 text = self._extract_string(quote_end) 1025 text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text 1026 self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text) 1027 return True 1028 1029 # X'1234', b'0110', E'\\\\\' etc. 
1030 def _scan_formatted_string(self, string_start: str) -> bool: 1031 if string_start in self._HEX_STRINGS: 1032 delimiters = self._HEX_STRINGS 1033 token_type = TokenType.HEX_STRING 1034 base = 16 1035 elif string_start in self._BIT_STRINGS: 1036 delimiters = self._BIT_STRINGS 1037 token_type = TokenType.BIT_STRING 1038 base = 2 1039 elif string_start in self._BYTE_STRINGS: 1040 delimiters = self._BYTE_STRINGS 1041 token_type = TokenType.BYTE_STRING 1042 base = None 1043 else: 1044 return False 1045 1046 self._advance(len(string_start)) 1047 string_end = delimiters[string_start] 1048 text = self._extract_string(string_end) 1049 1050 if base: 1051 try: 1052 int(text, base) 1053 except: 1054 raise RuntimeError( 1055 f"Numeric string contains invalid characters from {self._line}:{self._start}" 1056 ) 1057 1058 self._add(token_type, text) 1059 return True 1060 1061 def _scan_identifier(self, identifier_end: str) -> None: 1062 self._advance() 1063 text = self._extract_string(identifier_end, self._IDENTIFIER_ESCAPES) 1064 self._add(TokenType.IDENTIFIER, text) 1065 1066 def _scan_var(self) -> None: 1067 while True: 1068 char = self._peek.strip() 1069 if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS): 1070 self._advance(alnum=True) 1071 else: 1072 break 1073 1074 self._add( 1075 TokenType.VAR 1076 if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER 1077 else self.KEYWORDS.get(self._text.upper(), TokenType.VAR) 1078 ) 1079 1080 def _extract_string(self, delimiter: str, escapes=None) -> str: 1081 text = "" 1082 delim_size = len(delimiter) 1083 escapes = self._STRING_ESCAPES if escapes is None else escapes 1084 1085 while True: 1086 if self._char in escapes and (self._peek == delimiter or self._peek in escapes): 1087 if self._peek == delimiter: 1088 text += self._peek 1089 else: 1090 text += self._char + self._peek 1091 1092 if self._current + 1 < self.size: 1093 self._advance(2) 1094 else: 1095 raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._current}") 1096 else: 1097 if self._chars(delim_size) == delimiter: 1098 if delim_size > 1: 1099 self._advance(delim_size - 1) 1100 break 1101 1102 if self._end: 1103 raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}") 1104 1105 current = self._current - 1 1106 self._advance(alnum=True) 1107 text += self.sql[current : self._current - 1] 1108 1109 return text
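The class-level tables above (SINGLE_TOKENS, KEYWORDS, QUOTES, IDENTIFIERS, COMMENTS, NUMERIC_LITERALS and so on) are the knobs that concrete tokenizers override; the _Tokenizer metaclass used by this class fills in the internal lookups (_QUOTES, _IDENTIFIERS, _COMMENTS, ...) and the KEYWORD_TRIE (note the "autofilled" comment above) from those public attributes when a subclass is created. A minimal sketch of such a subclass follows; the class name and the specific overrides are illustrative assumptions, not taken from any shipped dialect:

from sqlglot.tokens import Tokenizer, TokenType


class MyDialectTokenizer(Tokenizer):
    # Hypothetical tokenizer: every override below is illustrative only.

    # Accept both single and double quotes as string delimiters.
    QUOTES = ["'", '"']

    # Quote identifiers with backticks instead of double quotes.
    IDENTIFIERS = ["`"]

    # Recognize '#' single-line comments in addition to the defaults.
    COMMENTS = ["--", "#", ("/*", "*/")]

    # Hive-style numeric suffixes: 3L tokenizes as NUMBER, DCOLON, BIGINT.
    NUMERIC_LITERALS = {"L": "BIGINT", "S": "SMALLINT", "Y": "TINYINT"}

    # Extra keywords layered on top of the base table.
    KEYWORDS = {
        **Tokenizer.KEYWORDS,
        "MEDIUMINT": TokenType.INT,
    }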
def reset(self) -> None:
def tokenize(self, sql: str) -> t.List[Token]:
    Returns a list of tokens corresponding to the SQL string `sql`.
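A minimal usage sketch, assuming only what the source above shows (the Token attributes token_type, text, line and col are the ones populated by _add; the SQL text is arbitrary):

from sqlglot.tokens import Tokenizer, TokenType

tokenizer = Tokenizer()
tokens = tokenizer.tokenize("SELECT a, b FROM t WHERE a > 1")

for token in tokens:
    # Each Token carries its type, raw text and position.
    print(token.token_type, repr(token.text), token.line, token.col)

# "SELECT" is in KEYWORDS, so the first token is a SELECT keyword token.
assert tokens[0].token_type == TokenType.SELECT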