sqlglot.tokens
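
This module implements sqlglot's SQL tokenizer: TokenType enumerates the token categories, Token records the text and position of each lexed token, and Tokenizer turns a SQL string into a list of Tokens. A minimal usage sketch (the output in the comments is approximate):

    from sqlglot.tokens import Tokenizer

    for token in Tokenizer().tokenize("SELECT * FROM tbl"):
        print(token.token_type, token.text)
    # TokenType.SELECT SELECT
    # TokenType.STAR *
    # TokenType.FROM FROM
    # TokenType.VAR tbl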

   1from __future__ import annotations
   2
   3import typing as t
   4from enum import auto
   5
   6from sqlglot.helper import AutoName
   7from sqlglot.trie import in_trie, new_trie
   8
   9
  10class TokenType(AutoName):
  11    L_PAREN = auto()
  12    R_PAREN = auto()
  13    L_BRACKET = auto()
  14    R_BRACKET = auto()
  15    L_BRACE = auto()
  16    R_BRACE = auto()
  17    COMMA = auto()
  18    DOT = auto()
  19    DASH = auto()
  20    PLUS = auto()
  21    COLON = auto()
  22    DCOLON = auto()
  23    SEMICOLON = auto()
  24    STAR = auto()
  25    BACKSLASH = auto()
  26    SLASH = auto()
  27    LT = auto()
  28    LTE = auto()
  29    GT = auto()
  30    GTE = auto()
  31    NOT = auto()
  32    EQ = auto()
  33    NEQ = auto()
  34    NULLSAFE_EQ = auto()
  35    AND = auto()
  36    OR = auto()
  37    AMP = auto()
  38    DPIPE = auto()
  39    PIPE = auto()
  40    CARET = auto()
  41    TILDA = auto()
  42    ARROW = auto()
  43    DARROW = auto()
  44    FARROW = auto()
  45    HASH = auto()
  46    HASH_ARROW = auto()
  47    DHASH_ARROW = auto()
  48    LR_ARROW = auto()
  49    LT_AT = auto()
  50    AT_GT = auto()
  51    DOLLAR = auto()
  52    PARAMETER = auto()
  53    SESSION_PARAMETER = auto()
  54    NATIONAL = auto()
  55    DAMP = auto()
  56
  57    BLOCK_START = auto()
  58    BLOCK_END = auto()
  59
  60    SPACE = auto()
  61    BREAK = auto()
  62
  63    STRING = auto()
  64    NUMBER = auto()
  65    IDENTIFIER = auto()
  66    DATABASE = auto()
  67    COLUMN = auto()
  68    COLUMN_DEF = auto()
  69    SCHEMA = auto()
  70    TABLE = auto()
  71    VAR = auto()
  72    BIT_STRING = auto()
  73    HEX_STRING = auto()
  74    BYTE_STRING = auto()
  75
  76    # types
  77    BIT = auto()
  78    BOOLEAN = auto()
  79    TINYINT = auto()
  80    UTINYINT = auto()
  81    SMALLINT = auto()
  82    USMALLINT = auto()
  83    INT = auto()
  84    UINT = auto()
  85    BIGINT = auto()
  86    UBIGINT = auto()
  87    INT128 = auto()
  88    UINT128 = auto()
  89    INT256 = auto()
  90    UINT256 = auto()
  91    FLOAT = auto()
  92    DOUBLE = auto()
  93    DECIMAL = auto()
  94    BIGDECIMAL = auto()
  95    CHAR = auto()
  96    NCHAR = auto()
  97    VARCHAR = auto()
  98    NVARCHAR = auto()
  99    TEXT = auto()
 100    MEDIUMTEXT = auto()
 101    LONGTEXT = auto()
 102    MEDIUMBLOB = auto()
 103    LONGBLOB = auto()
 104    BINARY = auto()
 105    VARBINARY = auto()
 106    JSON = auto()
 107    JSONB = auto()
 108    TIME = auto()
 109    TIMESTAMP = auto()
 110    TIMESTAMPTZ = auto()
 111    TIMESTAMPLTZ = auto()
 112    DATETIME = auto()
 113    DATETIME64 = auto()
 114    DATE = auto()
 115    UUID = auto()
 116    GEOGRAPHY = auto()
 117    NULLABLE = auto()
 118    GEOMETRY = auto()
 119    HLLSKETCH = auto()
 120    HSTORE = auto()
 121    SUPER = auto()
 122    SERIAL = auto()
 123    SMALLSERIAL = auto()
 124    BIGSERIAL = auto()
 125    XML = auto()
 126    UNIQUEIDENTIFIER = auto()
 127    MONEY = auto()
 128    SMALLMONEY = auto()
 129    ROWVERSION = auto()
 130    IMAGE = auto()
 131    VARIANT = auto()
 132    OBJECT = auto()
 133    INET = auto()
 134
 135    # keywords
 136    ALIAS = auto()
 137    ALTER = auto()
 138    ALWAYS = auto()
 139    ALL = auto()
 140    ANTI = auto()
 141    ANY = auto()
 142    APPLY = auto()
 143    ARRAY = auto()
 144    ASC = auto()
 145    ASOF = auto()
 146    AUTO_INCREMENT = auto()
 147    BEGIN = auto()
 148    BETWEEN = auto()
 149    CACHE = auto()
 150    CASE = auto()
 151    CHARACTER_SET = auto()
 152    COLLATE = auto()
 153    COMMAND = auto()
 154    COMMENT = auto()
 155    COMMIT = auto()
 156    CONSTRAINT = auto()
 157    CREATE = auto()
 158    CROSS = auto()
 159    CUBE = auto()
 160    CURRENT_DATE = auto()
 161    CURRENT_DATETIME = auto()
 162    CURRENT_TIME = auto()
 163    CURRENT_TIMESTAMP = auto()
 164    CURRENT_USER = auto()
 165    DEFAULT = auto()
 166    DELETE = auto()
 167    DESC = auto()
 168    DESCRIBE = auto()
 169    DISTINCT = auto()
 170    DIV = auto()
 171    DROP = auto()
 172    ELSE = auto()
 173    END = auto()
 174    ESCAPE = auto()
 175    EXCEPT = auto()
 176    EXECUTE = auto()
 177    EXISTS = auto()
 178    FALSE = auto()
 179    FETCH = auto()
 180    FILTER = auto()
 181    FINAL = auto()
 182    FIRST = auto()
 183    FOR = auto()
 184    FOREIGN_KEY = auto()
 185    FORMAT = auto()
 186    FROM = auto()
 187    FULL = auto()
 188    FUNCTION = auto()
 189    GLOB = auto()
 190    GLOBAL = auto()
 191    GROUP_BY = auto()
 192    GROUPING_SETS = auto()
 193    HAVING = auto()
 194    HINT = auto()
 195    IF = auto()
 196    ILIKE = auto()
 197    ILIKE_ANY = auto()
 198    IN = auto()
 199    INDEX = auto()
 200    INNER = auto()
 201    INSERT = auto()
 202    INTERSECT = auto()
 203    INTERVAL = auto()
 204    INTO = auto()
 205    INTRODUCER = auto()
 206    IRLIKE = auto()
 207    IS = auto()
 208    ISNULL = auto()
 209    JOIN = auto()
 210    JOIN_MARKER = auto()
 211    KEEP = auto()
 212    LANGUAGE = auto()
 213    LATERAL = auto()
 214    LEFT = auto()
 215    LIKE = auto()
 216    LIKE_ANY = auto()
 217    LIMIT = auto()
 218    LOAD = auto()
 219    LOCK = auto()
 220    MAP = auto()
 221    MATCH_RECOGNIZE = auto()
 222    MERGE = auto()
 223    MOD = auto()
 224    NATURAL = auto()
 225    NEXT = auto()
 226    NEXT_VALUE_FOR = auto()
 227    NOTNULL = auto()
 228    NULL = auto()
 229    OFFSET = auto()
 230    ON = auto()
 231    ORDER_BY = auto()
 232    ORDERED = auto()
 233    ORDINALITY = auto()
 234    OUTER = auto()
 235    OVER = auto()
 236    OVERLAPS = auto()
 237    OVERWRITE = auto()
 238    PARTITION = auto()
 239    PARTITION_BY = auto()
 240    PERCENT = auto()
 241    PIVOT = auto()
 242    PLACEHOLDER = auto()
 243    PRAGMA = auto()
 244    PRIMARY_KEY = auto()
 245    PROCEDURE = auto()
 246    PROPERTIES = auto()
 247    PSEUDO_TYPE = auto()
 248    QUALIFY = auto()
 249    QUOTE = auto()
 250    RANGE = auto()
 251    RECURSIVE = auto()
 252    REPLACE = auto()
 253    RETURNING = auto()
 254    REFERENCES = auto()
 255    RIGHT = auto()
 256    RLIKE = auto()
 257    ROLLBACK = auto()
 258    ROLLUP = auto()
 259    ROW = auto()
 260    ROWS = auto()
 261    SELECT = auto()
 262    SEMI = auto()
 263    SEPARATOR = auto()
 264    SERDE_PROPERTIES = auto()
 265    SET = auto()
 266    SETTINGS = auto()
 267    SHOW = auto()
 268    SIMILAR_TO = auto()
 269    SOME = auto()
 270    STRUCT = auto()
 271    TABLE_SAMPLE = auto()
 272    TEMPORARY = auto()
 273    TOP = auto()
 274    THEN = auto()
 275    TRUE = auto()
 276    UNCACHE = auto()
 277    UNION = auto()
 278    UNNEST = auto()
 279    UNPIVOT = auto()
 280    UPDATE = auto()
 281    USE = auto()
 282    USING = auto()
 283    VALUES = auto()
 284    VIEW = auto()
 285    VOLATILE = auto()
 286    WHEN = auto()
 287    WHERE = auto()
 288    WINDOW = auto()
 289    WITH = auto()
 290    UNIQUE = auto()
 291
 292
 293class Token:
 294    __slots__ = ("token_type", "text", "line", "col", "start", "end", "comments")
 295
 296    @classmethod
 297    def number(cls, number: int) -> Token:
 298        """Returns a NUMBER token with `number` as its text."""
 299        return cls(TokenType.NUMBER, str(number))
 300
 301    @classmethod
 302    def string(cls, string: str) -> Token:
 303        """Returns a STRING token with `string` as its text."""
 304        return cls(TokenType.STRING, string)
 305
 306    @classmethod
 307    def identifier(cls, identifier: str) -> Token:
 308        """Returns an IDENTIFIER token with `identifier` as its text."""
 309        return cls(TokenType.IDENTIFIER, identifier)
 310
 311    @classmethod
 312    def var(cls, var: str) -> Token:
 313        """Returns a VAR token with `var` as its text."""
 314        return cls(TokenType.VAR, var)
 315
 316    def __init__(
 317        self,
 318        token_type: TokenType,
 319        text: str,
 320        line: int = 1,
 321        col: int = 1,
 322        start: int = 0,
 323        end: int = 0,
 324        comments: t.List[str] = [],
 325    ) -> None:
 326        """Token initializer.
 327
 328        Args:
 329            token_type: The TokenType Enum.
 330            text: The text of the token.
 331            line: The line that the token ends on.
 332            col: The column that the token ends on.
 333            start: The start index of the token.
 334            end: The ending index of the token.
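                comments: The comments that belong to the token.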
 335        """
 336        self.token_type = token_type
 337        self.text = text
 338        self.line = line
 339        self.col = col
 340        self.start = start
 341        self.end = end
 342        self.comments = comments
 343
 344    def __repr__(self) -> str:
 345        attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
 346        return f"<Token {attributes}>"
 347
 348
 349class _Tokenizer(type):
 350    def __new__(cls, clsname, bases, attrs):
 351        klass = super().__new__(cls, clsname, bases, attrs)
 352
 353        klass._QUOTES = {
 354            f"{prefix}{s}": e
 355            for s, e in cls._delimeter_list_to_dict(klass.QUOTES).items()
 356            for prefix in (("",) if s[0].isalpha() else ("", "n", "N"))
 357        }
 358        klass._BIT_STRINGS = cls._delimeter_list_to_dict(klass.BIT_STRINGS)
 359        klass._HEX_STRINGS = cls._delimeter_list_to_dict(klass.HEX_STRINGS)
 360        klass._BYTE_STRINGS = cls._delimeter_list_to_dict(klass.BYTE_STRINGS)
 361        klass._IDENTIFIERS = cls._delimeter_list_to_dict(klass.IDENTIFIERS)
 362        klass._STRING_ESCAPES = set(klass.STRING_ESCAPES)
 363        klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES)
 364        klass._COMMENTS = dict(
 365            (comment, None) if isinstance(comment, str) else (comment[0], comment[1])
 366            for comment in klass.COMMENTS
 367        )
 368
 369        klass.KEYWORD_TRIE = new_trie(
 370            key.upper()
 371            for key in (
 372                *klass.KEYWORDS,
 373                *klass._COMMENTS,
 374                *klass._QUOTES,
 375                *klass._BIT_STRINGS,
 376                *klass._HEX_STRINGS,
 377                *klass._BYTE_STRINGS,
 378            )
 379            if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
 380        )
 381
 382        return klass
 383
 384    @staticmethod
 385    def _delimeter_list_to_dict(list: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
 386        return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in list)
 387
 388
 389class Tokenizer(metaclass=_Tokenizer):
 390    SINGLE_TOKENS = {
 391        "(": TokenType.L_PAREN,
 392        ")": TokenType.R_PAREN,
 393        "[": TokenType.L_BRACKET,
 394        "]": TokenType.R_BRACKET,
 395        "{": TokenType.L_BRACE,
 396        "}": TokenType.R_BRACE,
 397        "&": TokenType.AMP,
 398        "^": TokenType.CARET,
 399        ":": TokenType.COLON,
 400        ",": TokenType.COMMA,
 401        ".": TokenType.DOT,
 402        "-": TokenType.DASH,
 403        "=": TokenType.EQ,
 404        ">": TokenType.GT,
 405        "<": TokenType.LT,
 406        "%": TokenType.MOD,
 407        "!": TokenType.NOT,
 408        "|": TokenType.PIPE,
 409        "+": TokenType.PLUS,
 410        ";": TokenType.SEMICOLON,
 411        "/": TokenType.SLASH,
 412        "\\": TokenType.BACKSLASH,
 413        "*": TokenType.STAR,
 414        "~": TokenType.TILDA,
 415        "?": TokenType.PLACEHOLDER,
 416        "@": TokenType.PARAMETER,
 417        # Used only to break a var like x'y', nothing else;
 418        # the exact token type doesn't matter
 419        "'": TokenType.QUOTE,
 420        "`": TokenType.IDENTIFIER,
 421        '"': TokenType.IDENTIFIER,
 422        "#": TokenType.HASH,
 423    }
 424
 425    BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
 426    BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
 427    HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
 428    IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
 429    IDENTIFIER_ESCAPES = ['"']
 430    QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
 431    STRING_ESCAPES = ["'"]
 432    VAR_SINGLE_TOKENS: t.Set[str] = set()
 433
 434    _COMMENTS: t.Dict[str, str] = {}
 435    _BIT_STRINGS: t.Dict[str, str] = {}
 436    _BYTE_STRINGS: t.Dict[str, str] = {}
 437    _HEX_STRINGS: t.Dict[str, str] = {}
 438    _IDENTIFIERS: t.Dict[str, str] = {}
 439    _IDENTIFIER_ESCAPES: t.Set[str] = set()
 440    _QUOTES: t.Dict[str, str] = {}
 441    _STRING_ESCAPES: t.Set[str] = set()
 442
 443    KEYWORDS: t.Dict[t.Optional[str], TokenType] = {
 444        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
 445        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
 446        "{{+": TokenType.BLOCK_START,
 447        "{{-": TokenType.BLOCK_START,
 448        "+}}": TokenType.BLOCK_END,
 449        "-}}": TokenType.BLOCK_END,
 450        "/*+": TokenType.HINT,
 451        "==": TokenType.EQ,
 452        "::": TokenType.DCOLON,
 453        "||": TokenType.DPIPE,
 454        ">=": TokenType.GTE,
 455        "<=": TokenType.LTE,
 456        "<>": TokenType.NEQ,
 457        "!=": TokenType.NEQ,
 458        "<=>": TokenType.NULLSAFE_EQ,
 459        "->": TokenType.ARROW,
 460        "->>": TokenType.DARROW,
 461        "=>": TokenType.FARROW,
 462        "#>": TokenType.HASH_ARROW,
 463        "#>>": TokenType.DHASH_ARROW,
 464        "<->": TokenType.LR_ARROW,
 465        "&&": TokenType.DAMP,
 466        "ALL": TokenType.ALL,
 467        "ALWAYS": TokenType.ALWAYS,
 468        "AND": TokenType.AND,
 469        "ANTI": TokenType.ANTI,
 470        "ANY": TokenType.ANY,
 471        "ASC": TokenType.ASC,
 472        "AS": TokenType.ALIAS,
 473        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
 474        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
 475        "BEGIN": TokenType.BEGIN,
 476        "BETWEEN": TokenType.BETWEEN,
 477        "CACHE": TokenType.CACHE,
 478        "UNCACHE": TokenType.UNCACHE,
 479        "CASE": TokenType.CASE,
 480        "CHARACTER SET": TokenType.CHARACTER_SET,
 481        "COLLATE": TokenType.COLLATE,
 482        "COLUMN": TokenType.COLUMN,
 483        "COMMIT": TokenType.COMMIT,
 484        "CONSTRAINT": TokenType.CONSTRAINT,
 485        "CREATE": TokenType.CREATE,
 486        "CROSS": TokenType.CROSS,
 487        "CUBE": TokenType.CUBE,
 488        "CURRENT_DATE": TokenType.CURRENT_DATE,
 489        "CURRENT_TIME": TokenType.CURRENT_TIME,
 490        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
 491        "CURRENT_USER": TokenType.CURRENT_USER,
 492        "DATABASE": TokenType.DATABASE,
 493        "DEFAULT": TokenType.DEFAULT,
 494        "DELETE": TokenType.DELETE,
 495        "DESC": TokenType.DESC,
 496        "DESCRIBE": TokenType.DESCRIBE,
 497        "DISTINCT": TokenType.DISTINCT,
 498        "DIV": TokenType.DIV,
 499        "DROP": TokenType.DROP,
 500        "ELSE": TokenType.ELSE,
 501        "END": TokenType.END,
 502        "ESCAPE": TokenType.ESCAPE,
 503        "EXCEPT": TokenType.EXCEPT,
 504        "EXECUTE": TokenType.EXECUTE,
 505        "EXISTS": TokenType.EXISTS,
 506        "FALSE": TokenType.FALSE,
 507        "FETCH": TokenType.FETCH,
 508        "FILTER": TokenType.FILTER,
 509        "FIRST": TokenType.FIRST,
 510        "FULL": TokenType.FULL,
 511        "FUNCTION": TokenType.FUNCTION,
 512        "FOR": TokenType.FOR,
 513        "FOREIGN KEY": TokenType.FOREIGN_KEY,
 514        "FORMAT": TokenType.FORMAT,
 515        "FROM": TokenType.FROM,
 516        "GEOGRAPHY": TokenType.GEOGRAPHY,
 517        "GEOMETRY": TokenType.GEOMETRY,
 518        "GLOB": TokenType.GLOB,
 519        "GROUP BY": TokenType.GROUP_BY,
 520        "GROUPING SETS": TokenType.GROUPING_SETS,
 521        "HAVING": TokenType.HAVING,
 522        "IF": TokenType.IF,
 523        "ILIKE": TokenType.ILIKE,
 524        "IN": TokenType.IN,
 525        "INDEX": TokenType.INDEX,
 526        "INET": TokenType.INET,
 527        "INNER": TokenType.INNER,
 528        "INSERT": TokenType.INSERT,
 529        "INTERVAL": TokenType.INTERVAL,
 530        "INTERSECT": TokenType.INTERSECT,
 531        "INTO": TokenType.INTO,
 532        "IS": TokenType.IS,
 533        "ISNULL": TokenType.ISNULL,
 534        "JOIN": TokenType.JOIN,
 535        "KEEP": TokenType.KEEP,
 536        "LATERAL": TokenType.LATERAL,
 537        "LEFT": TokenType.LEFT,
 538        "LIKE": TokenType.LIKE,
 539        "LIMIT": TokenType.LIMIT,
 540        "LOAD": TokenType.LOAD,
 541        "LOCK": TokenType.LOCK,
 542        "MERGE": TokenType.MERGE,
 543        "NATURAL": TokenType.NATURAL,
 544        "NEXT": TokenType.NEXT,
 545        "NEXT VALUE FOR": TokenType.NEXT_VALUE_FOR,
 546        "NOT": TokenType.NOT,
 547        "NOTNULL": TokenType.NOTNULL,
 548        "NULL": TokenType.NULL,
 549        "OBJECT": TokenType.OBJECT,
 550        "OFFSET": TokenType.OFFSET,
 551        "ON": TokenType.ON,
 552        "OR": TokenType.OR,
 553        "ORDER BY": TokenType.ORDER_BY,
 554        "ORDINALITY": TokenType.ORDINALITY,
 555        "OUTER": TokenType.OUTER,
 556        "OVER": TokenType.OVER,
 557        "OVERLAPS": TokenType.OVERLAPS,
 558        "OVERWRITE": TokenType.OVERWRITE,
 559        "PARTITION": TokenType.PARTITION,
 560        "PARTITION BY": TokenType.PARTITION_BY,
 561        "PARTITIONED BY": TokenType.PARTITION_BY,
 562        "PARTITIONED_BY": TokenType.PARTITION_BY,
 563        "PERCENT": TokenType.PERCENT,
 564        "PIVOT": TokenType.PIVOT,
 565        "PRAGMA": TokenType.PRAGMA,
 566        "PRIMARY KEY": TokenType.PRIMARY_KEY,
 567        "PROCEDURE": TokenType.PROCEDURE,
 568        "QUALIFY": TokenType.QUALIFY,
 569        "RANGE": TokenType.RANGE,
 570        "RECURSIVE": TokenType.RECURSIVE,
 571        "REGEXP": TokenType.RLIKE,
 572        "REPLACE": TokenType.REPLACE,
 573        "REFERENCES": TokenType.REFERENCES,
 574        "RIGHT": TokenType.RIGHT,
 575        "RLIKE": TokenType.RLIKE,
 576        "ROLLBACK": TokenType.ROLLBACK,
 577        "ROLLUP": TokenType.ROLLUP,
 578        "ROW": TokenType.ROW,
 579        "ROWS": TokenType.ROWS,
 580        "SCHEMA": TokenType.SCHEMA,
 581        "SELECT": TokenType.SELECT,
 582        "SEMI": TokenType.SEMI,
 583        "SET": TokenType.SET,
 584        "SETTINGS": TokenType.SETTINGS,
 585        "SHOW": TokenType.SHOW,
 586        "SIMILAR TO": TokenType.SIMILAR_TO,
 587        "SOME": TokenType.SOME,
 588        "TABLE": TokenType.TABLE,
 589        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
 590        "TEMP": TokenType.TEMPORARY,
 591        "TEMPORARY": TokenType.TEMPORARY,
 592        "THEN": TokenType.THEN,
 593        "TRUE": TokenType.TRUE,
 594        "UNION": TokenType.UNION,
 595        "UNNEST": TokenType.UNNEST,
 596        "UNPIVOT": TokenType.UNPIVOT,
 597        "UPDATE": TokenType.UPDATE,
 598        "USE": TokenType.USE,
 599        "USING": TokenType.USING,
 600        "UUID": TokenType.UUID,
 601        "VALUES": TokenType.VALUES,
 602        "VIEW": TokenType.VIEW,
 603        "VOLATILE": TokenType.VOLATILE,
 604        "WHEN": TokenType.WHEN,
 605        "WHERE": TokenType.WHERE,
 606        "WINDOW": TokenType.WINDOW,
 607        "WITH": TokenType.WITH,
 608        "APPLY": TokenType.APPLY,
 609        "ARRAY": TokenType.ARRAY,
 610        "BIT": TokenType.BIT,
 611        "BOOL": TokenType.BOOLEAN,
 612        "BOOLEAN": TokenType.BOOLEAN,
 613        "BYTE": TokenType.TINYINT,
 614        "TINYINT": TokenType.TINYINT,
 615        "SHORT": TokenType.SMALLINT,
 616        "SMALLINT": TokenType.SMALLINT,
 617        "INT2": TokenType.SMALLINT,
 618        "INTEGER": TokenType.INT,
 619        "INT": TokenType.INT,
 620        "INT4": TokenType.INT,
 621        "LONG": TokenType.BIGINT,
 622        "BIGINT": TokenType.BIGINT,
 623        "INT8": TokenType.BIGINT,
 624        "DEC": TokenType.DECIMAL,
 625        "DECIMAL": TokenType.DECIMAL,
 626        "BIGDECIMAL": TokenType.BIGDECIMAL,
 627        "BIGNUMERIC": TokenType.BIGDECIMAL,
 628        "MAP": TokenType.MAP,
 629        "NULLABLE": TokenType.NULLABLE,
 630        "NUMBER": TokenType.DECIMAL,
 631        "NUMERIC": TokenType.DECIMAL,
 632        "FIXED": TokenType.DECIMAL,
 633        "REAL": TokenType.FLOAT,
 634        "FLOAT": TokenType.FLOAT,
 635        "FLOAT4": TokenType.FLOAT,
 636        "FLOAT8": TokenType.DOUBLE,
 637        "DOUBLE": TokenType.DOUBLE,
 638        "DOUBLE PRECISION": TokenType.DOUBLE,
 639        "JSON": TokenType.JSON,
 640        "CHAR": TokenType.CHAR,
 641        "CHARACTER": TokenType.CHAR,
 642        "NCHAR": TokenType.NCHAR,
 643        "VARCHAR": TokenType.VARCHAR,
 644        "VARCHAR2": TokenType.VARCHAR,
 645        "NVARCHAR": TokenType.NVARCHAR,
 646        "NVARCHAR2": TokenType.NVARCHAR,
 647        "STR": TokenType.TEXT,
 648        "STRING": TokenType.TEXT,
 649        "TEXT": TokenType.TEXT,
 650        "CLOB": TokenType.TEXT,
 651        "LONGVARCHAR": TokenType.TEXT,
 652        "BINARY": TokenType.BINARY,
 653        "BLOB": TokenType.VARBINARY,
 654        "BYTEA": TokenType.VARBINARY,
 655        "VARBINARY": TokenType.VARBINARY,
 656        "TIME": TokenType.TIME,
 657        "TIMESTAMP": TokenType.TIMESTAMP,
 658        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
 659        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
 660        "DATE": TokenType.DATE,
 661        "DATETIME": TokenType.DATETIME,
 662        "UNIQUE": TokenType.UNIQUE,
 663        "STRUCT": TokenType.STRUCT,
 664        "VARIANT": TokenType.VARIANT,
 665        "ALTER": TokenType.ALTER,
 666        "ANALYZE": TokenType.COMMAND,
 667        "CALL": TokenType.COMMAND,
 668        "COMMENT": TokenType.COMMENT,
 669        "COPY": TokenType.COMMAND,
 670        "EXPLAIN": TokenType.COMMAND,
 671        "GRANT": TokenType.COMMAND,
 672        "OPTIMIZE": TokenType.COMMAND,
 673        "PREPARE": TokenType.COMMAND,
 674        "TRUNCATE": TokenType.COMMAND,
 675        "VACUUM": TokenType.COMMAND,
 676    }
 677
 678    WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = {
 679        " ": TokenType.SPACE,
 680        "\t": TokenType.SPACE,
 681        "\n": TokenType.BREAK,
 682        "\r": TokenType.BREAK,
 683        "\r\n": TokenType.BREAK,
 684    }
 685
 686    COMMANDS = {
 687        TokenType.COMMAND,
 688        TokenType.EXECUTE,
 689        TokenType.FETCH,
 690        TokenType.SHOW,
 691    }
 692
 693    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}
 694
 695    # Handle numeric literal suffixes, as in Hive (e.g. 3L = BIGINT)
 696    NUMERIC_LITERALS: t.Dict[str, str] = {}
 697    ENCODE: t.Optional[str] = None
 698
 699    COMMENTS = ["--", ("/*", "*/"), ("{#", "#}")]
 700    KEYWORD_TRIE: t.Dict = {}  # autofilled
 701
 702    IDENTIFIER_CAN_START_WITH_DIGIT = False
 703
 704    __slots__ = (
 705        "sql",
 706        "size",
 707        "tokens",
 708        "_start",
 709        "_current",
 710        "_line",
 711        "_col",
 712        "_comments",
 713        "_char",
 714        "_end",
 715        "_peek",
 716        "_prev_token_line",
 717    )
 718
 719    def __init__(self) -> None:
 720        self.reset()
 721
 722    def reset(self) -> None:
 723        self.sql = ""
 724        self.size = 0
 725        self.tokens: t.List[Token] = []
 726        self._start = 0
 727        self._current = 0
 728        self._line = 1
 729        self._col = 0
 730        self._comments: t.List[str] = []
 731
 732        self._char = ""
 733        self._end = False
 734        self._peek = ""
 735        self._prev_token_line = -1
 736
 737    def tokenize(self, sql: str) -> t.List[Token]:
 738        """Returns a list of tokens corresponding to the SQL string `sql`."""
 739        self.reset()
 740        self.sql = sql
 741        self.size = len(sql)
 742
 743        try:
 744            self._scan()
 745        except Exception as e:
 746            start = max(self._current - 50, 0)
 747            end = min(self._current + 50, self.size - 1)
 748            context = self.sql[start:end]
 749            raise ValueError(f"Error tokenizing '{context}'") from e
 750
 751        return self.tokens
 752
 753    def _scan(self, until: t.Optional[t.Callable] = None) -> None:
 754        while self.size and not self._end:
 755            self._start = self._current
 756            self._advance()
 757
 758            if self._char is None:
 759                break
 760
 761            if self._char not in self.WHITE_SPACE:
 762                if self._char.isdigit():
 763                    self._scan_number()
 764                elif self._char in self._IDENTIFIERS:
 765                    self._scan_identifier(self._IDENTIFIERS[self._char])
 766                else:
 767                    self._scan_keywords()
 768
 769            if until and until():
 770                break
 771
 772        if self.tokens and self._comments:
 773            self.tokens[-1].comments.extend(self._comments)
 774
 775    def _chars(self, size: int) -> str:
 776        if size == 1:
 777            return self._char
 778
 779        start = self._current - 1
 780        end = start + size
 781
 782        return self.sql[start:end] if end <= self.size else ""
 783
 784    def _advance(self, i: int = 1, alnum: bool = False) -> None:
 785        if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
 786            self._col = 1
 787            self._line += 1
 788        else:
 789            self._col += i
 790
 791        self._current += i
 792        self._end = self._current >= self.size
 793        self._char = self.sql[self._current - 1]
 794        self._peek = "" if self._end else self.sql[self._current]
 795
 796        if alnum and self._char.isalnum():
 797            # Here we use local variables instead of attributes for better performance
 798            _col = self._col
 799            _current = self._current
 800            _end = self._end
 801            _peek = self._peek
 802
 803            while _peek.isalnum():
 804                _col += 1
 805                _current += 1
 806                _end = _current >= self.size
 807                _peek = "" if _end else self.sql[_current]
 808
 809            self._col = _col
 810            self._current = _current
 811            self._end = _end
 812            self._peek = _peek
 813            self._char = self.sql[_current - 1]
 814
 815    @property
 816    def _text(self) -> str:
 817        return self.sql[self._start : self._current]
 818
 819    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
 820        self._prev_token_line = self._line
 821        self.tokens.append(
 822            Token(
 823                token_type,
 824                text=self._text if text is None else text,
 825                line=self._line,
 826                col=self._col,
 827                start=self._start,
 828                end=self._current - 1,
 829                comments=self._comments,
 830            )
 831        )
 832        self._comments = []
 833
 834        # If the command is the first token, or is preceded by a semicolon or a BEGIN token,
 835        # everything that follows it (up to the next semicolon) is tokenized as a single string
 836        if (
 837            token_type in self.COMMANDS
 838            and self._peek != ";"
 839            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS)
 840        ):
 841            start = self._current
 842            tokens = len(self.tokens)
 843            self._scan(lambda: self._peek == ";")
 844            self.tokens = self.tokens[:tokens]
 845            text = self.sql[start : self._current].strip()
 846            if text:
 847                self._add(TokenType.STRING, text)
 848
 849    def _scan_keywords(self) -> None:
 850        size = 0
 851        word = None
 852        chars = self._text
 853        char = chars
 854        prev_space = False
 855        skip = False
 856        trie = self.KEYWORD_TRIE
 857        single_token = char in self.SINGLE_TOKENS
 858
 859        while chars:
 860            if skip:
 861                result = 1
 862            else:
 863                result, trie = in_trie(trie, char.upper())
 864
 865            if result == 0:
 866                break
 867            if result == 2:
 868                word = chars
 869
 870            size += 1
 871            end = self._current - 1 + size
 872
 873            if end < self.size:
 874                char = self.sql[end]
 875                single_token = single_token or char in self.SINGLE_TOKENS
 876                is_space = char in self.WHITE_SPACE
 877
 878                if not is_space or not prev_space:
 879                    if is_space:
 880                        char = " "
 881                    chars += char
 882                    prev_space = is_space
 883                    skip = False
 884                else:
 885                    skip = True
 886            else:
 887                char = ""
 888                chars = " "
 889
 890        word = None if not single_token and chars[-1] not in self.WHITE_SPACE else word
 891
 892        if not word:
 893            if self._char in self.SINGLE_TOKENS:
 894                self._add(self.SINGLE_TOKENS[self._char], text=self._char)
 895                return
 896            self._scan_var()
 897            return
 898
 899        if self._scan_string(word):
 900            return
 901        if self._scan_formatted_string(word):
 902            return
 903        if self._scan_comment(word):
 904            return
 905
 906        self._advance(size - 1)
 907        word = word.upper()
 908        self._add(self.KEYWORDS[word], text=word)
 909
 910    def _scan_comment(self, comment_start: str) -> bool:
 911        if comment_start not in self._COMMENTS:
 912            return False
 913
 914        comment_start_line = self._line
 915        comment_start_size = len(comment_start)
 916        comment_end = self._COMMENTS[comment_start]
 917
 918        if comment_end:
 919            # Skip the comment's start delimiter
 920            self._advance(comment_start_size)
 921
 922            comment_end_size = len(comment_end)
 923            while not self._end and self._chars(comment_end_size) != comment_end:
 924                self._advance(alnum=True)
 925
 926            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
 927            self._advance(comment_end_size - 1)
 928        else:
 929            while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK:
 930                self._advance(alnum=True)
 931            self._comments.append(self._text[comment_start_size:])
 932
 933        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
 934        # Multiple consecutive comments are preserved by appending them to the current comments list.
 935        if comment_start_line == self._prev_token_line:
 936            self.tokens[-1].comments.extend(self._comments)
 937            self._comments = []
 938            self._prev_token_line = self._line
 939
 940        return True
 941
 942    def _scan_number(self) -> None:
 943        if self._char == "0":
 944            peek = self._peek.upper()
 945            if peek == "B":
 946                return self._scan_bits() if self._BIT_STRINGS else self._add(TokenType.NUMBER)
 947            elif peek == "X":
 948                return self._scan_hex() if self._HEX_STRINGS else self._add(TokenType.NUMBER)
 949
 950        decimal = False
 951        scientific = 0
 952
 953        while True:
 954            if self._peek.isdigit():
 955                self._advance()
 956            elif self._peek == "." and not decimal:
 957                decimal = True
 958                self._advance()
 959            elif self._peek in ("-", "+") and scientific == 1:
 960                scientific += 1
 961                self._advance()
 962            elif self._peek.upper() == "E" and not scientific:
 963                scientific += 1
 964                self._advance()
 965            elif self._peek.isidentifier():
 966                number_text = self._text
 967                literal = ""
 968
 969                while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:
 970                    literal += self._peek.upper()
 971                    self._advance()
 972
 973                token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal))
 974
 975                if token_type:
 976                    self._add(TokenType.NUMBER, number_text)
 977                    self._add(TokenType.DCOLON, "::")
 978                    return self._add(token_type, literal)
 979                elif self.IDENTIFIER_CAN_START_WITH_DIGIT:
 980                    return self._add(TokenType.VAR)
 981
 982                self._add(TokenType.NUMBER, number_text)
 983                return self._advance(-len(literal))
 984            else:
 985                return self._add(TokenType.NUMBER)
 986
 987    def _scan_bits(self) -> None:
 988        self._advance()
 989        value = self._extract_value()
 990        try:
 991            # If `value` isn't a valid binary number, fall back to tokenizing it as an identifier
 992            int(value, 2)
 993            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
 994        except ValueError:
 995            self._add(TokenType.IDENTIFIER)
 996
 997    def _scan_hex(self) -> None:
 998        self._advance()
 999        value = self._extract_value()
1000        try:
1001            # If `value` isn't a valid hexadecimal number, fall back to tokenizing it as an identifier
1002            int(value, 16)
1003            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
1004        except ValueError:
1005            self._add(TokenType.IDENTIFIER)
1006
1007    def _extract_value(self) -> str:
1008        while True:
1009            char = self._peek.strip()
1010            if char and char not in self.SINGLE_TOKENS:
1011                self._advance(alnum=True)
1012            else:
1013                break
1014
1015        return self._text
1016
1017    def _scan_string(self, quote: str) -> bool:
1018        quote_end = self._QUOTES.get(quote)
1019        if quote_end is None:
1020            return False
1021
1022        self._advance(len(quote))
1023        text = self._extract_string(quote_end)
1024        text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
1025        self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text)
1026        return True
1027
1028    # X'1234', b'0110', E'\\\\\' etc.
1029    def _scan_formatted_string(self, string_start: str) -> bool:
1030        if string_start in self._HEX_STRINGS:
1031            delimiters = self._HEX_STRINGS
1032            token_type = TokenType.HEX_STRING
1033            base = 16
1034        elif string_start in self._BIT_STRINGS:
1035            delimiters = self._BIT_STRINGS
1036            token_type = TokenType.BIT_STRING
1037            base = 2
1038        elif string_start in self._BYTE_STRINGS:
1039            delimiters = self._BYTE_STRINGS
1040            token_type = TokenType.BYTE_STRING
1041            base = None
1042        else:
1043            return False
1044
1045        self._advance(len(string_start))
1046        string_end = delimiters[string_start]
1047        text = self._extract_string(string_end)
1048
1049        if base:
1050            try:
1051                int(text, base)
1052            except:
1053                raise RuntimeError(
1054                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
1055                )
1056
1057        self._add(token_type, text)
1058        return True
1059
1060    def _scan_identifier(self, identifier_end: str) -> None:
1061        self._advance()
1062        text = self._extract_string(identifier_end, self._IDENTIFIER_ESCAPES)
1063        self._add(TokenType.IDENTIFIER, text)
1064
1065    def _scan_var(self) -> None:
1066        while True:
1067            char = self._peek.strip()
1068            if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
1069                self._advance(alnum=True)
1070            else:
1071                break
1072
1073        self._add(
1074            TokenType.VAR
1075            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
1076            else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
1077        )
1078
1079    def _extract_string(self, delimiter: str, escapes=None) -> str:
1080        text = ""
1081        delim_size = len(delimiter)
1082        escapes = self._STRING_ESCAPES if escapes is None else escapes
1083
1084        while True:
1085            if self._char in escapes and (self._peek == delimiter or self._peek in escapes):
1086                if self._peek == delimiter:
1087                    text += self._peek
1088                else:
1089                    text += self._char + self._peek
1090
1091                if self._current + 1 < self.size:
1092                    self._advance(2)
1093                else:
1094                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._current}")
1095            else:
1096                if self._chars(delim_size) == delimiter:
1097                    if delim_size > 1:
1098                        self._advance(delim_size - 1)
1099                    break
1100
1101                if self._end:
1102                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}")
1103
1104                current = self._current - 1
1105                self._advance(alnum=True)
1106                text += self.sql[current : self._current - 1]
1107
1108        return text
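
As _scan_comment above shows, comments never become tokens of their own: a comment on the same line as the previous token is appended to that token's comments list, while a leading comment is buffered and attached to the next token. A small sketch (expected output is approximate):

    from sqlglot.tokens import Tokenizer

    tokens = Tokenizer().tokenize("SELECT 1 /* one */")
    print(tokens[-1].comments)  # [' one ']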
class TokenType(sqlglot.helper.AutoName):
 11class TokenType(AutoName):
 12    L_PAREN = auto()
 13    R_PAREN = auto()
 14    L_BRACKET = auto()
 15    R_BRACKET = auto()
 16    L_BRACE = auto()
 17    R_BRACE = auto()
 18    COMMA = auto()
 19    DOT = auto()
 20    DASH = auto()
 21    PLUS = auto()
 22    COLON = auto()
 23    DCOLON = auto()
 24    SEMICOLON = auto()
 25    STAR = auto()
 26    BACKSLASH = auto()
 27    SLASH = auto()
 28    LT = auto()
 29    LTE = auto()
 30    GT = auto()
 31    GTE = auto()
 32    NOT = auto()
 33    EQ = auto()
 34    NEQ = auto()
 35    NULLSAFE_EQ = auto()
 36    AND = auto()
 37    OR = auto()
 38    AMP = auto()
 39    DPIPE = auto()
 40    PIPE = auto()
 41    CARET = auto()
 42    TILDA = auto()
 43    ARROW = auto()
 44    DARROW = auto()
 45    FARROW = auto()
 46    HASH = auto()
 47    HASH_ARROW = auto()
 48    DHASH_ARROW = auto()
 49    LR_ARROW = auto()
 50    LT_AT = auto()
 51    AT_GT = auto()
 52    DOLLAR = auto()
 53    PARAMETER = auto()
 54    SESSION_PARAMETER = auto()
 55    NATIONAL = auto()
 56    DAMP = auto()
 57
 58    BLOCK_START = auto()
 59    BLOCK_END = auto()
 60
 61    SPACE = auto()
 62    BREAK = auto()
 63
 64    STRING = auto()
 65    NUMBER = auto()
 66    IDENTIFIER = auto()
 67    DATABASE = auto()
 68    COLUMN = auto()
 69    COLUMN_DEF = auto()
 70    SCHEMA = auto()
 71    TABLE = auto()
 72    VAR = auto()
 73    BIT_STRING = auto()
 74    HEX_STRING = auto()
 75    BYTE_STRING = auto()
 76
 77    # types
 78    BIT = auto()
 79    BOOLEAN = auto()
 80    TINYINT = auto()
 81    UTINYINT = auto()
 82    SMALLINT = auto()
 83    USMALLINT = auto()
 84    INT = auto()
 85    UINT = auto()
 86    BIGINT = auto()
 87    UBIGINT = auto()
 88    INT128 = auto()
 89    UINT128 = auto()
 90    INT256 = auto()
 91    UINT256 = auto()
 92    FLOAT = auto()
 93    DOUBLE = auto()
 94    DECIMAL = auto()
 95    BIGDECIMAL = auto()
 96    CHAR = auto()
 97    NCHAR = auto()
 98    VARCHAR = auto()
 99    NVARCHAR = auto()
100    TEXT = auto()
101    MEDIUMTEXT = auto()
102    LONGTEXT = auto()
103    MEDIUMBLOB = auto()
104    LONGBLOB = auto()
105    BINARY = auto()
106    VARBINARY = auto()
107    JSON = auto()
108    JSONB = auto()
109    TIME = auto()
110    TIMESTAMP = auto()
111    TIMESTAMPTZ = auto()
112    TIMESTAMPLTZ = auto()
113    DATETIME = auto()
114    DATETIME64 = auto()
115    DATE = auto()
116    UUID = auto()
117    GEOGRAPHY = auto()
118    NULLABLE = auto()
119    GEOMETRY = auto()
120    HLLSKETCH = auto()
121    HSTORE = auto()
122    SUPER = auto()
123    SERIAL = auto()
124    SMALLSERIAL = auto()
125    BIGSERIAL = auto()
126    XML = auto()
127    UNIQUEIDENTIFIER = auto()
128    MONEY = auto()
129    SMALLMONEY = auto()
130    ROWVERSION = auto()
131    IMAGE = auto()
132    VARIANT = auto()
133    OBJECT = auto()
134    INET = auto()
135
136    # keywords
137    ALIAS = auto()
138    ALTER = auto()
139    ALWAYS = auto()
140    ALL = auto()
141    ANTI = auto()
142    ANY = auto()
143    APPLY = auto()
144    ARRAY = auto()
145    ASC = auto()
146    ASOF = auto()
147    AUTO_INCREMENT = auto()
148    BEGIN = auto()
149    BETWEEN = auto()
150    CACHE = auto()
151    CASE = auto()
152    CHARACTER_SET = auto()
153    COLLATE = auto()
154    COMMAND = auto()
155    COMMENT = auto()
156    COMMIT = auto()
157    CONSTRAINT = auto()
158    CREATE = auto()
159    CROSS = auto()
160    CUBE = auto()
161    CURRENT_DATE = auto()
162    CURRENT_DATETIME = auto()
163    CURRENT_TIME = auto()
164    CURRENT_TIMESTAMP = auto()
165    CURRENT_USER = auto()
166    DEFAULT = auto()
167    DELETE = auto()
168    DESC = auto()
169    DESCRIBE = auto()
170    DISTINCT = auto()
171    DIV = auto()
172    DROP = auto()
173    ELSE = auto()
174    END = auto()
175    ESCAPE = auto()
176    EXCEPT = auto()
177    EXECUTE = auto()
178    EXISTS = auto()
179    FALSE = auto()
180    FETCH = auto()
181    FILTER = auto()
182    FINAL = auto()
183    FIRST = auto()
184    FOR = auto()
185    FOREIGN_KEY = auto()
186    FORMAT = auto()
187    FROM = auto()
188    FULL = auto()
189    FUNCTION = auto()
190    GLOB = auto()
191    GLOBAL = auto()
192    GROUP_BY = auto()
193    GROUPING_SETS = auto()
194    HAVING = auto()
195    HINT = auto()
196    IF = auto()
197    ILIKE = auto()
198    ILIKE_ANY = auto()
199    IN = auto()
200    INDEX = auto()
201    INNER = auto()
202    INSERT = auto()
203    INTERSECT = auto()
204    INTERVAL = auto()
205    INTO = auto()
206    INTRODUCER = auto()
207    IRLIKE = auto()
208    IS = auto()
209    ISNULL = auto()
210    JOIN = auto()
211    JOIN_MARKER = auto()
212    KEEP = auto()
213    LANGUAGE = auto()
214    LATERAL = auto()
215    LEFT = auto()
216    LIKE = auto()
217    LIKE_ANY = auto()
218    LIMIT = auto()
219    LOAD = auto()
220    LOCK = auto()
221    MAP = auto()
222    MATCH_RECOGNIZE = auto()
223    MERGE = auto()
224    MOD = auto()
225    NATURAL = auto()
226    NEXT = auto()
227    NEXT_VALUE_FOR = auto()
228    NOTNULL = auto()
229    NULL = auto()
230    OFFSET = auto()
231    ON = auto()
232    ORDER_BY = auto()
233    ORDERED = auto()
234    ORDINALITY = auto()
235    OUTER = auto()
236    OVER = auto()
237    OVERLAPS = auto()
238    OVERWRITE = auto()
239    PARTITION = auto()
240    PARTITION_BY = auto()
241    PERCENT = auto()
242    PIVOT = auto()
243    PLACEHOLDER = auto()
244    PRAGMA = auto()
245    PRIMARY_KEY = auto()
246    PROCEDURE = auto()
247    PROPERTIES = auto()
248    PSEUDO_TYPE = auto()
249    QUALIFY = auto()
250    QUOTE = auto()
251    RANGE = auto()
252    RECURSIVE = auto()
253    REPLACE = auto()
254    RETURNING = auto()
255    REFERENCES = auto()
256    RIGHT = auto()
257    RLIKE = auto()
258    ROLLBACK = auto()
259    ROLLUP = auto()
260    ROW = auto()
261    ROWS = auto()
262    SELECT = auto()
263    SEMI = auto()
264    SEPARATOR = auto()
265    SERDE_PROPERTIES = auto()
266    SET = auto()
267    SETTINGS = auto()
268    SHOW = auto()
269    SIMILAR_TO = auto()
270    SOME = auto()
271    STRUCT = auto()
272    TABLE_SAMPLE = auto()
273    TEMPORARY = auto()
274    TOP = auto()
275    THEN = auto()
276    TRUE = auto()
277    UNCACHE = auto()
278    UNION = auto()
279    UNNEST = auto()
280    UNPIVOT = auto()
281    UPDATE = auto()
282    USE = auto()
283    USING = auto()
284    VALUES = auto()
285    VIEW = auto()
286    VOLATILE = auto()
287    WHEN = auto()
288    WHERE = auto()
289    WINDOW = auto()
290    WITH = auto()
291    UNIQUE = auto()

An enumeration.

L_PAREN = <TokenType.L_PAREN: 'L_PAREN'>
R_PAREN = <TokenType.R_PAREN: 'R_PAREN'>
L_BRACKET = <TokenType.L_BRACKET: 'L_BRACKET'>
R_BRACKET = <TokenType.R_BRACKET: 'R_BRACKET'>
L_BRACE = <TokenType.L_BRACE: 'L_BRACE'>
R_BRACE = <TokenType.R_BRACE: 'R_BRACE'>
COMMA = <TokenType.COMMA: 'COMMA'>
DOT = <TokenType.DOT: 'DOT'>
DASH = <TokenType.DASH: 'DASH'>
PLUS = <TokenType.PLUS: 'PLUS'>
COLON = <TokenType.COLON: 'COLON'>
DCOLON = <TokenType.DCOLON: 'DCOLON'>
SEMICOLON = <TokenType.SEMICOLON: 'SEMICOLON'>
STAR = <TokenType.STAR: 'STAR'>
BACKSLASH = <TokenType.BACKSLASH: 'BACKSLASH'>
SLASH = <TokenType.SLASH: 'SLASH'>
LT = <TokenType.LT: 'LT'>
LTE = <TokenType.LTE: 'LTE'>
GT = <TokenType.GT: 'GT'>
GTE = <TokenType.GTE: 'GTE'>
NOT = <TokenType.NOT: 'NOT'>
EQ = <TokenType.EQ: 'EQ'>
NEQ = <TokenType.NEQ: 'NEQ'>
NULLSAFE_EQ = <TokenType.NULLSAFE_EQ: 'NULLSAFE_EQ'>
AND = <TokenType.AND: 'AND'>
OR = <TokenType.OR: 'OR'>
AMP = <TokenType.AMP: 'AMP'>
DPIPE = <TokenType.DPIPE: 'DPIPE'>
PIPE = <TokenType.PIPE: 'PIPE'>
CARET = <TokenType.CARET: 'CARET'>
TILDA = <TokenType.TILDA: 'TILDA'>
ARROW = <TokenType.ARROW: 'ARROW'>
DARROW = <TokenType.DARROW: 'DARROW'>
FARROW = <TokenType.FARROW: 'FARROW'>
HASH = <TokenType.HASH: 'HASH'>
HASH_ARROW = <TokenType.HASH_ARROW: 'HASH_ARROW'>
DHASH_ARROW = <TokenType.DHASH_ARROW: 'DHASH_ARROW'>
LR_ARROW = <TokenType.LR_ARROW: 'LR_ARROW'>
LT_AT = <TokenType.LT_AT: 'LT_AT'>
AT_GT = <TokenType.AT_GT: 'AT_GT'>
DOLLAR = <TokenType.DOLLAR: 'DOLLAR'>
PARAMETER = <TokenType.PARAMETER: 'PARAMETER'>
SESSION_PARAMETER = <TokenType.SESSION_PARAMETER: 'SESSION_PARAMETER'>
NATIONAL = <TokenType.NATIONAL: 'NATIONAL'>
DAMP = <TokenType.DAMP: 'DAMP'>
BLOCK_START = <TokenType.BLOCK_START: 'BLOCK_START'>
BLOCK_END = <TokenType.BLOCK_END: 'BLOCK_END'>
SPACE = <TokenType.SPACE: 'SPACE'>
BREAK = <TokenType.BREAK: 'BREAK'>
STRING = <TokenType.STRING: 'STRING'>
NUMBER = <TokenType.NUMBER: 'NUMBER'>
IDENTIFIER = <TokenType.IDENTIFIER: 'IDENTIFIER'>
DATABASE = <TokenType.DATABASE: 'DATABASE'>
COLUMN = <TokenType.COLUMN: 'COLUMN'>
COLUMN_DEF = <TokenType.COLUMN_DEF: 'COLUMN_DEF'>
SCHEMA = <TokenType.SCHEMA: 'SCHEMA'>
TABLE = <TokenType.TABLE: 'TABLE'>
VAR = <TokenType.VAR: 'VAR'>
BIT_STRING = <TokenType.BIT_STRING: 'BIT_STRING'>
HEX_STRING = <TokenType.HEX_STRING: 'HEX_STRING'>
BYTE_STRING = <TokenType.BYTE_STRING: 'BYTE_STRING'>
BIT = <TokenType.BIT: 'BIT'>
BOOLEAN = <TokenType.BOOLEAN: 'BOOLEAN'>
TINYINT = <TokenType.TINYINT: 'TINYINT'>
UTINYINT = <TokenType.UTINYINT: 'UTINYINT'>
SMALLINT = <TokenType.SMALLINT: 'SMALLINT'>
USMALLINT = <TokenType.USMALLINT: 'USMALLINT'>
INT = <TokenType.INT: 'INT'>
UINT = <TokenType.UINT: 'UINT'>
BIGINT = <TokenType.BIGINT: 'BIGINT'>
UBIGINT = <TokenType.UBIGINT: 'UBIGINT'>
INT128 = <TokenType.INT128: 'INT128'>
UINT128 = <TokenType.UINT128: 'UINT128'>
INT256 = <TokenType.INT256: 'INT256'>
UINT256 = <TokenType.UINT256: 'UINT256'>
FLOAT = <TokenType.FLOAT: 'FLOAT'>
DOUBLE = <TokenType.DOUBLE: 'DOUBLE'>
DECIMAL = <TokenType.DECIMAL: 'DECIMAL'>
BIGDECIMAL = <TokenType.BIGDECIMAL: 'BIGDECIMAL'>
CHAR = <TokenType.CHAR: 'CHAR'>
NCHAR = <TokenType.NCHAR: 'NCHAR'>
VARCHAR = <TokenType.VARCHAR: 'VARCHAR'>
NVARCHAR = <TokenType.NVARCHAR: 'NVARCHAR'>
TEXT = <TokenType.TEXT: 'TEXT'>
MEDIUMTEXT = <TokenType.MEDIUMTEXT: 'MEDIUMTEXT'>
LONGTEXT = <TokenType.LONGTEXT: 'LONGTEXT'>
MEDIUMBLOB = <TokenType.MEDIUMBLOB: 'MEDIUMBLOB'>
LONGBLOB = <TokenType.LONGBLOB: 'LONGBLOB'>
BINARY = <TokenType.BINARY: 'BINARY'>
VARBINARY = <TokenType.VARBINARY: 'VARBINARY'>
JSON = <TokenType.JSON: 'JSON'>
JSONB = <TokenType.JSONB: 'JSONB'>
TIME = <TokenType.TIME: 'TIME'>
TIMESTAMP = <TokenType.TIMESTAMP: 'TIMESTAMP'>
TIMESTAMPTZ = <TokenType.TIMESTAMPTZ: 'TIMESTAMPTZ'>
TIMESTAMPLTZ = <TokenType.TIMESTAMPLTZ: 'TIMESTAMPLTZ'>
DATETIME = <TokenType.DATETIME: 'DATETIME'>
DATETIME64 = <TokenType.DATETIME64: 'DATETIME64'>
DATE = <TokenType.DATE: 'DATE'>
UUID = <TokenType.UUID: 'UUID'>
GEOGRAPHY = <TokenType.GEOGRAPHY: 'GEOGRAPHY'>
NULLABLE = <TokenType.NULLABLE: 'NULLABLE'>
GEOMETRY = <TokenType.GEOMETRY: 'GEOMETRY'>
HLLSKETCH = <TokenType.HLLSKETCH: 'HLLSKETCH'>
HSTORE = <TokenType.HSTORE: 'HSTORE'>
SUPER = <TokenType.SUPER: 'SUPER'>
SERIAL = <TokenType.SERIAL: 'SERIAL'>
SMALLSERIAL = <TokenType.SMALLSERIAL: 'SMALLSERIAL'>
BIGSERIAL = <TokenType.BIGSERIAL: 'BIGSERIAL'>
XML = <TokenType.XML: 'XML'>
UNIQUEIDENTIFIER = <TokenType.UNIQUEIDENTIFIER: 'UNIQUEIDENTIFIER'>
MONEY = <TokenType.MONEY: 'MONEY'>
SMALLMONEY = <TokenType.SMALLMONEY: 'SMALLMONEY'>
ROWVERSION = <TokenType.ROWVERSION: 'ROWVERSION'>
IMAGE = <TokenType.IMAGE: 'IMAGE'>
VARIANT = <TokenType.VARIANT: 'VARIANT'>
OBJECT = <TokenType.OBJECT: 'OBJECT'>
INET = <TokenType.INET: 'INET'>
ALIAS = <TokenType.ALIAS: 'ALIAS'>
ALTER = <TokenType.ALTER: 'ALTER'>
ALWAYS = <TokenType.ALWAYS: 'ALWAYS'>
ALL = <TokenType.ALL: 'ALL'>
ANTI = <TokenType.ANTI: 'ANTI'>
ANY = <TokenType.ANY: 'ANY'>
APPLY = <TokenType.APPLY: 'APPLY'>
ARRAY = <TokenType.ARRAY: 'ARRAY'>
ASC = <TokenType.ASC: 'ASC'>
ASOF = <TokenType.ASOF: 'ASOF'>
AUTO_INCREMENT = <TokenType.AUTO_INCREMENT: 'AUTO_INCREMENT'>
BEGIN = <TokenType.BEGIN: 'BEGIN'>
BETWEEN = <TokenType.BETWEEN: 'BETWEEN'>
CACHE = <TokenType.CACHE: 'CACHE'>
CASE = <TokenType.CASE: 'CASE'>
CHARACTER_SET = <TokenType.CHARACTER_SET: 'CHARACTER_SET'>
COLLATE = <TokenType.COLLATE: 'COLLATE'>
COMMAND = <TokenType.COMMAND: 'COMMAND'>
COMMENT = <TokenType.COMMENT: 'COMMENT'>
COMMIT = <TokenType.COMMIT: 'COMMIT'>
CONSTRAINT = <TokenType.CONSTRAINT: 'CONSTRAINT'>
CREATE = <TokenType.CREATE: 'CREATE'>
CROSS = <TokenType.CROSS: 'CROSS'>
CUBE = <TokenType.CUBE: 'CUBE'>
CURRENT_DATE = <TokenType.CURRENT_DATE: 'CURRENT_DATE'>
CURRENT_DATETIME = <TokenType.CURRENT_DATETIME: 'CURRENT_DATETIME'>
CURRENT_TIME = <TokenType.CURRENT_TIME: 'CURRENT_TIME'>
CURRENT_TIMESTAMP = <TokenType.CURRENT_TIMESTAMP: 'CURRENT_TIMESTAMP'>
CURRENT_USER = <TokenType.CURRENT_USER: 'CURRENT_USER'>
DEFAULT = <TokenType.DEFAULT: 'DEFAULT'>
DELETE = <TokenType.DELETE: 'DELETE'>
DESC = <TokenType.DESC: 'DESC'>
DESCRIBE = <TokenType.DESCRIBE: 'DESCRIBE'>
DISTINCT = <TokenType.DISTINCT: 'DISTINCT'>
DIV = <TokenType.DIV: 'DIV'>
DROP = <TokenType.DROP: 'DROP'>
ELSE = <TokenType.ELSE: 'ELSE'>
END = <TokenType.END: 'END'>
ESCAPE = <TokenType.ESCAPE: 'ESCAPE'>
EXCEPT = <TokenType.EXCEPT: 'EXCEPT'>
EXECUTE = <TokenType.EXECUTE: 'EXECUTE'>
EXISTS = <TokenType.EXISTS: 'EXISTS'>
FALSE = <TokenType.FALSE: 'FALSE'>
FETCH = <TokenType.FETCH: 'FETCH'>
FILTER = <TokenType.FILTER: 'FILTER'>
FINAL = <TokenType.FINAL: 'FINAL'>
FIRST = <TokenType.FIRST: 'FIRST'>
FOR = <TokenType.FOR: 'FOR'>
FOREIGN_KEY = <TokenType.FOREIGN_KEY: 'FOREIGN_KEY'>
FORMAT = <TokenType.FORMAT: 'FORMAT'>
FROM = <TokenType.FROM: 'FROM'>
FULL = <TokenType.FULL: 'FULL'>
FUNCTION = <TokenType.FUNCTION: 'FUNCTION'>
GLOB = <TokenType.GLOB: 'GLOB'>
GLOBAL = <TokenType.GLOBAL: 'GLOBAL'>
GROUP_BY = <TokenType.GROUP_BY: 'GROUP_BY'>
GROUPING_SETS = <TokenType.GROUPING_SETS: 'GROUPING_SETS'>
HAVING = <TokenType.HAVING: 'HAVING'>
HINT = <TokenType.HINT: 'HINT'>
IF = <TokenType.IF: 'IF'>
ILIKE = <TokenType.ILIKE: 'ILIKE'>
ILIKE_ANY = <TokenType.ILIKE_ANY: 'ILIKE_ANY'>
IN = <TokenType.IN: 'IN'>
INDEX = <TokenType.INDEX: 'INDEX'>
INNER = <TokenType.INNER: 'INNER'>
INSERT = <TokenType.INSERT: 'INSERT'>
INTERSECT = <TokenType.INTERSECT: 'INTERSECT'>
INTERVAL = <TokenType.INTERVAL: 'INTERVAL'>
INTO = <TokenType.INTO: 'INTO'>
INTRODUCER = <TokenType.INTRODUCER: 'INTRODUCER'>
IRLIKE = <TokenType.IRLIKE: 'IRLIKE'>
IS = <TokenType.IS: 'IS'>
ISNULL = <TokenType.ISNULL: 'ISNULL'>
JOIN = <TokenType.JOIN: 'JOIN'>
JOIN_MARKER = <TokenType.JOIN_MARKER: 'JOIN_MARKER'>
KEEP = <TokenType.KEEP: 'KEEP'>
LANGUAGE = <TokenType.LANGUAGE: 'LANGUAGE'>
LATERAL = <TokenType.LATERAL: 'LATERAL'>
LEFT = <TokenType.LEFT: 'LEFT'>
LIKE = <TokenType.LIKE: 'LIKE'>
LIKE_ANY = <TokenType.LIKE_ANY: 'LIKE_ANY'>
LIMIT = <TokenType.LIMIT: 'LIMIT'>
LOAD = <TokenType.LOAD: 'LOAD'>
LOCK = <TokenType.LOCK: 'LOCK'>
MAP = <TokenType.MAP: 'MAP'>
MATCH_RECOGNIZE = <TokenType.MATCH_RECOGNIZE: 'MATCH_RECOGNIZE'>
MERGE = <TokenType.MERGE: 'MERGE'>
MOD = <TokenType.MOD: 'MOD'>
NATURAL = <TokenType.NATURAL: 'NATURAL'>
NEXT = <TokenType.NEXT: 'NEXT'>
NEXT_VALUE_FOR = <TokenType.NEXT_VALUE_FOR: 'NEXT_VALUE_FOR'>
NOTNULL = <TokenType.NOTNULL: 'NOTNULL'>
NULL = <TokenType.NULL: 'NULL'>
OFFSET = <TokenType.OFFSET: 'OFFSET'>
ON = <TokenType.ON: 'ON'>
ORDER_BY = <TokenType.ORDER_BY: 'ORDER_BY'>
ORDERED = <TokenType.ORDERED: 'ORDERED'>
ORDINALITY = <TokenType.ORDINALITY: 'ORDINALITY'>
OUTER = <TokenType.OUTER: 'OUTER'>
OVER = <TokenType.OVER: 'OVER'>
OVERLAPS = <TokenType.OVERLAPS: 'OVERLAPS'>
OVERWRITE = <TokenType.OVERWRITE: 'OVERWRITE'>
PARTITION = <TokenType.PARTITION: 'PARTITION'>
PARTITION_BY = <TokenType.PARTITION_BY: 'PARTITION_BY'>
PERCENT = <TokenType.PERCENT: 'PERCENT'>
PIVOT = <TokenType.PIVOT: 'PIVOT'>
PLACEHOLDER = <TokenType.PLACEHOLDER: 'PLACEHOLDER'>
PRAGMA = <TokenType.PRAGMA: 'PRAGMA'>
PRIMARY_KEY = <TokenType.PRIMARY_KEY: 'PRIMARY_KEY'>
PROCEDURE = <TokenType.PROCEDURE: 'PROCEDURE'>
PROPERTIES = <TokenType.PROPERTIES: 'PROPERTIES'>
PSEUDO_TYPE = <TokenType.PSEUDO_TYPE: 'PSEUDO_TYPE'>
QUALIFY = <TokenType.QUALIFY: 'QUALIFY'>
QUOTE = <TokenType.QUOTE: 'QUOTE'>
RANGE = <TokenType.RANGE: 'RANGE'>
RECURSIVE = <TokenType.RECURSIVE: 'RECURSIVE'>
REPLACE = <TokenType.REPLACE: 'REPLACE'>
RETURNING = <TokenType.RETURNING: 'RETURNING'>
REFERENCES = <TokenType.REFERENCES: 'REFERENCES'>
RIGHT = <TokenType.RIGHT: 'RIGHT'>
RLIKE = <TokenType.RLIKE: 'RLIKE'>
ROLLBACK = <TokenType.ROLLBACK: 'ROLLBACK'>
ROLLUP = <TokenType.ROLLUP: 'ROLLUP'>
ROW = <TokenType.ROW: 'ROW'>
ROWS = <TokenType.ROWS: 'ROWS'>
SELECT = <TokenType.SELECT: 'SELECT'>
SEMI = <TokenType.SEMI: 'SEMI'>
SEPARATOR = <TokenType.SEPARATOR: 'SEPARATOR'>
SERDE_PROPERTIES = <TokenType.SERDE_PROPERTIES: 'SERDE_PROPERTIES'>
SET = <TokenType.SET: 'SET'>
SETTINGS = <TokenType.SETTINGS: 'SETTINGS'>
SHOW = <TokenType.SHOW: 'SHOW'>
SIMILAR_TO = <TokenType.SIMILAR_TO: 'SIMILAR_TO'>
SOME = <TokenType.SOME: 'SOME'>
STRUCT = <TokenType.STRUCT: 'STRUCT'>
TABLE_SAMPLE = <TokenType.TABLE_SAMPLE: 'TABLE_SAMPLE'>
TEMPORARY = <TokenType.TEMPORARY: 'TEMPORARY'>
TOP = <TokenType.TOP: 'TOP'>
THEN = <TokenType.THEN: 'THEN'>
TRUE = <TokenType.TRUE: 'TRUE'>
UNCACHE = <TokenType.UNCACHE: 'UNCACHE'>
UNION = <TokenType.UNION: 'UNION'>
UNNEST = <TokenType.UNNEST: 'UNNEST'>
UNPIVOT = <TokenType.UNPIVOT: 'UNPIVOT'>
UPDATE = <TokenType.UPDATE: 'UPDATE'>
USE = <TokenType.USE: 'USE'>
USING = <TokenType.USING: 'USING'>
VALUES = <TokenType.VALUES: 'VALUES'>
VIEW = <TokenType.VIEW: 'VIEW'>
VOLATILE = <TokenType.VOLATILE: 'VOLATILE'>
WHEN = <TokenType.WHEN: 'WHEN'>
WHERE = <TokenType.WHERE: 'WHERE'>
WINDOW = <TokenType.WINDOW: 'WINDOW'>
WITH = <TokenType.WITH: 'WITH'>
UNIQUE = <TokenType.UNIQUE: 'UNIQUE'>
Inherited Members
enum.Enum
name
value
class Token:
294class Token:
295    __slots__ = ("token_type", "text", "line", "col", "start", "end", "comments")
296
297    @classmethod
298    def number(cls, number: int) -> Token:
299        """Returns a NUMBER token with `number` as its text."""
300        return cls(TokenType.NUMBER, str(number))
301
302    @classmethod
303    def string(cls, string: str) -> Token:
304        """Returns a STRING token with `string` as its text."""
305        return cls(TokenType.STRING, string)
306
307    @classmethod
308    def identifier(cls, identifier: str) -> Token:
309        """Returns an IDENTIFIER token with `identifier` as its text."""
310        return cls(TokenType.IDENTIFIER, identifier)
311
312    @classmethod
313    def var(cls, var: str) -> Token:
314        """Returns an VAR token with `var` as its text."""
315        return cls(TokenType.VAR, var)
316
317    def __init__(
318        self,
319        token_type: TokenType,
320        text: str,
321        line: int = 1,
322        col: int = 1,
323        start: int = 0,
324        end: int = 0,
325        comments: t.List[str] = [],
326    ) -> None:
327        """Token initializer.
328
329        Args:
330            token_type: The TokenType Enum.
331            text: The text of the token.
332            line: The line that the token ends on.
333            col: The column that the token ends on.
334            start: The start index of the token.
335            end: The ending index of the token.
336        """
337        self.token_type = token_type
338        self.text = text
339        self.line = line
340        self.col = col
341        self.start = start
342        self.end = end
343        self.comments = comments
344
345    def __repr__(self) -> str:
346        attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
347        return f"<Token {attributes}>"
Token(token_type: sqlglot.tokens.TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: List[str] = [])
317    def __init__(
318        self,
319        token_type: TokenType,
320        text: str,
321        line: int = 1,
322        col: int = 1,
323        start: int = 0,
324        end: int = 0,
325        comments: t.List[str] = [],
326    ) -> None:
327        """Token initializer.
328
329        Args:
330            token_type: The TokenType Enum.
331            text: The text of the token.
332            line: The line that the token ends on.
333            col: The column that the token ends on.
334            start: The start index of the token.
335            end: The ending index of the token.
336        """
337        self.token_type = token_type
338        self.text = text
339        self.line = line
340        self.col = col
341        self.start = start
342        self.end = end
343        self.comments = comments

Token initializer.

Arguments:
  • token_type: The TokenType Enum.
  • text: The text of the token.
  • line: The line that the token ends on.
  • col: The column that the token ends on.
  • start: The start index of the token.
  • end: The ending index of the token.
  • comments: A list of comments attached to the token.
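
For illustration, a Token constructed by hand; the printed form follows __repr__ above, and start/end are character indexes into the tokenized string:

from sqlglot.tokens import Token, TokenType

# A NUMBER token covering characters 0 through 1 of some source text.
tok = Token(TokenType.NUMBER, "42", line=1, col=2, start=0, end=1)
print(tok)
# <Token token_type: TokenType.NUMBER, text: 42, line: 1, col: 2, start: 0, end: 1, comments: []>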
@classmethod
def number(cls, number: int) -> sqlglot.tokens.Token:
297    @classmethod
298    def number(cls, number: int) -> Token:
299        """Returns a NUMBER token with `number` as its text."""
300        return cls(TokenType.NUMBER, str(number))

Returns a NUMBER token with number as its text.

@classmethod
def string(cls, string: str) -> sqlglot.tokens.Token:
302    @classmethod
303    def string(cls, string: str) -> Token:
304        """Returns a STRING token with `string` as its text."""
305        return cls(TokenType.STRING, string)

Returns a STRING token with string as its text.

@classmethod
def identifier(cls, identifier: str) -> sqlglot.tokens.Token:
307    @classmethod
308    def identifier(cls, identifier: str) -> Token:
309        """Returns an IDENTIFIER token with `identifier` as its text."""
310        return cls(TokenType.IDENTIFIER, identifier)

Returns an IDENTIFIER token with identifier as its text.

@classmethod
def var(cls, var: str) -> sqlglot.tokens.Token:
312    @classmethod
313    def var(cls, var: str) -> Token:
314        """Returns a VAR token with `var` as its text."""
315        return cls(TokenType.VAR, var)

Returns a VAR token with var as its text.
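
Taken together, the four factory classmethods are thin wrappers around the initializer that fill in only token_type and text, leaving the positional fields at their defaults:

from sqlglot.tokens import Token, TokenType

assert Token.number(5).token_type is TokenType.NUMBER
assert Token.string("hello").text == "hello"
assert Token.identifier("my_col").token_type is TokenType.IDENTIFIER
assert Token.var("x").token_type is TokenType.VAR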

class Tokenizer:
 390class Tokenizer(metaclass=_Tokenizer):
 391    SINGLE_TOKENS = {
 392        "(": TokenType.L_PAREN,
 393        ")": TokenType.R_PAREN,
 394        "[": TokenType.L_BRACKET,
 395        "]": TokenType.R_BRACKET,
 396        "{": TokenType.L_BRACE,
 397        "}": TokenType.R_BRACE,
 398        "&": TokenType.AMP,
 399        "^": TokenType.CARET,
 400        ":": TokenType.COLON,
 401        ",": TokenType.COMMA,
 402        ".": TokenType.DOT,
 403        "-": TokenType.DASH,
 404        "=": TokenType.EQ,
 405        ">": TokenType.GT,
 406        "<": TokenType.LT,
 407        "%": TokenType.MOD,
 408        "!": TokenType.NOT,
 409        "|": TokenType.PIPE,
 410        "+": TokenType.PLUS,
 411        ";": TokenType.SEMICOLON,
 412        "/": TokenType.SLASH,
 413        "\\": TokenType.BACKSLASH,
 414        "*": TokenType.STAR,
 415        "~": TokenType.TILDA,
 416        "?": TokenType.PLACEHOLDER,
 417        "@": TokenType.PARAMETER,
 418        # used for breaking a var like x'y' but nothing else
 419        # the token type doesn't matter
 420        "'": TokenType.QUOTE,
 421        "`": TokenType.IDENTIFIER,
 422        '"': TokenType.IDENTIFIER,
 423        "#": TokenType.HASH,
 424    }
 425
 426    BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
 427    BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
 428    HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
 429    IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
 430    IDENTIFIER_ESCAPES = ['"']
 431    QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
 432    STRING_ESCAPES = ["'"]
 433    VAR_SINGLE_TOKENS: t.Set[str] = set()
 434
 435    _COMMENTS: t.Dict[str, str] = {}
 436    _BIT_STRINGS: t.Dict[str, str] = {}
 437    _BYTE_STRINGS: t.Dict[str, str] = {}
 438    _HEX_STRINGS: t.Dict[str, str] = {}
 439    _IDENTIFIERS: t.Dict[str, str] = {}
 440    _IDENTIFIER_ESCAPES: t.Set[str] = set()
 441    _QUOTES: t.Dict[str, str] = {}
 442    _STRING_ESCAPES: t.Set[str] = set()
 443
 444    KEYWORDS: t.Dict[t.Optional[str], TokenType] = {
 445        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
 446        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
 447        "{{+": TokenType.BLOCK_START,
 448        "{{-": TokenType.BLOCK_START,
 449        "+}}": TokenType.BLOCK_END,
 450        "-}}": TokenType.BLOCK_END,
 451        "/*+": TokenType.HINT,
 452        "==": TokenType.EQ,
 453        "::": TokenType.DCOLON,
 454        "||": TokenType.DPIPE,
 455        ">=": TokenType.GTE,
 456        "<=": TokenType.LTE,
 457        "<>": TokenType.NEQ,
 458        "!=": TokenType.NEQ,
 459        "<=>": TokenType.NULLSAFE_EQ,
 460        "->": TokenType.ARROW,
 461        "->>": TokenType.DARROW,
 462        "=>": TokenType.FARROW,
 463        "#>": TokenType.HASH_ARROW,
 464        "#>>": TokenType.DHASH_ARROW,
 465        "<->": TokenType.LR_ARROW,
 466        "&&": TokenType.DAMP,
 467        "ALL": TokenType.ALL,
 468        "ALWAYS": TokenType.ALWAYS,
 469        "AND": TokenType.AND,
 470        "ANTI": TokenType.ANTI,
 471        "ANY": TokenType.ANY,
 472        "ASC": TokenType.ASC,
 473        "AS": TokenType.ALIAS,
 474        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
 475        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
 476        "BEGIN": TokenType.BEGIN,
 477        "BETWEEN": TokenType.BETWEEN,
 478        "CACHE": TokenType.CACHE,
 479        "UNCACHE": TokenType.UNCACHE,
 480        "CASE": TokenType.CASE,
 481        "CHARACTER SET": TokenType.CHARACTER_SET,
 482        "COLLATE": TokenType.COLLATE,
 483        "COLUMN": TokenType.COLUMN,
 484        "COMMIT": TokenType.COMMIT,
 485        "CONSTRAINT": TokenType.CONSTRAINT,
 486        "CREATE": TokenType.CREATE,
 487        "CROSS": TokenType.CROSS,
 488        "CUBE": TokenType.CUBE,
 489        "CURRENT_DATE": TokenType.CURRENT_DATE,
 490        "CURRENT_TIME": TokenType.CURRENT_TIME,
 491        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
 492        "CURRENT_USER": TokenType.CURRENT_USER,
 493        "DATABASE": TokenType.DATABASE,
 494        "DEFAULT": TokenType.DEFAULT,
 495        "DELETE": TokenType.DELETE,
 496        "DESC": TokenType.DESC,
 497        "DESCRIBE": TokenType.DESCRIBE,
 498        "DISTINCT": TokenType.DISTINCT,
 499        "DIV": TokenType.DIV,
 500        "DROP": TokenType.DROP,
 501        "ELSE": TokenType.ELSE,
 502        "END": TokenType.END,
 503        "ESCAPE": TokenType.ESCAPE,
 504        "EXCEPT": TokenType.EXCEPT,
 505        "EXECUTE": TokenType.EXECUTE,
 506        "EXISTS": TokenType.EXISTS,
 507        "FALSE": TokenType.FALSE,
 508        "FETCH": TokenType.FETCH,
 509        "FILTER": TokenType.FILTER,
 510        "FIRST": TokenType.FIRST,
 511        "FULL": TokenType.FULL,
 512        "FUNCTION": TokenType.FUNCTION,
 513        "FOR": TokenType.FOR,
 514        "FOREIGN KEY": TokenType.FOREIGN_KEY,
 515        "FORMAT": TokenType.FORMAT,
 516        "FROM": TokenType.FROM,
 517        "GEOGRAPHY": TokenType.GEOGRAPHY,
 518        "GEOMETRY": TokenType.GEOMETRY,
 519        "GLOB": TokenType.GLOB,
 520        "GROUP BY": TokenType.GROUP_BY,
 521        "GROUPING SETS": TokenType.GROUPING_SETS,
 522        "HAVING": TokenType.HAVING,
 523        "IF": TokenType.IF,
 524        "ILIKE": TokenType.ILIKE,
 525        "IN": TokenType.IN,
 526        "INDEX": TokenType.INDEX,
 527        "INET": TokenType.INET,
 528        "INNER": TokenType.INNER,
 529        "INSERT": TokenType.INSERT,
 530        "INTERVAL": TokenType.INTERVAL,
 531        "INTERSECT": TokenType.INTERSECT,
 532        "INTO": TokenType.INTO,
 533        "IS": TokenType.IS,
 534        "ISNULL": TokenType.ISNULL,
 535        "JOIN": TokenType.JOIN,
 536        "KEEP": TokenType.KEEP,
 537        "LATERAL": TokenType.LATERAL,
 538        "LEFT": TokenType.LEFT,
 539        "LIKE": TokenType.LIKE,
 540        "LIMIT": TokenType.LIMIT,
 541        "LOAD": TokenType.LOAD,
 542        "LOCK": TokenType.LOCK,
 543        "MERGE": TokenType.MERGE,
 544        "NATURAL": TokenType.NATURAL,
 545        "NEXT": TokenType.NEXT,
 546        "NEXT VALUE FOR": TokenType.NEXT_VALUE_FOR,
 547        "NOT": TokenType.NOT,
 548        "NOTNULL": TokenType.NOTNULL,
 549        "NULL": TokenType.NULL,
 550        "OBJECT": TokenType.OBJECT,
 551        "OFFSET": TokenType.OFFSET,
 552        "ON": TokenType.ON,
 553        "OR": TokenType.OR,
 554        "ORDER BY": TokenType.ORDER_BY,
 555        "ORDINALITY": TokenType.ORDINALITY,
 556        "OUTER": TokenType.OUTER,
 557        "OVER": TokenType.OVER,
 558        "OVERLAPS": TokenType.OVERLAPS,
 559        "OVERWRITE": TokenType.OVERWRITE,
 560        "PARTITION": TokenType.PARTITION,
 561        "PARTITION BY": TokenType.PARTITION_BY,
 562        "PARTITIONED BY": TokenType.PARTITION_BY,
 563        "PARTITIONED_BY": TokenType.PARTITION_BY,
 564        "PERCENT": TokenType.PERCENT,
 565        "PIVOT": TokenType.PIVOT,
 566        "PRAGMA": TokenType.PRAGMA,
 567        "PRIMARY KEY": TokenType.PRIMARY_KEY,
 568        "PROCEDURE": TokenType.PROCEDURE,
 569        "QUALIFY": TokenType.QUALIFY,
 570        "RANGE": TokenType.RANGE,
 571        "RECURSIVE": TokenType.RECURSIVE,
 572        "REGEXP": TokenType.RLIKE,
 573        "REPLACE": TokenType.REPLACE,
 574        "REFERENCES": TokenType.REFERENCES,
 575        "RIGHT": TokenType.RIGHT,
 576        "RLIKE": TokenType.RLIKE,
 577        "ROLLBACK": TokenType.ROLLBACK,
 578        "ROLLUP": TokenType.ROLLUP,
 579        "ROW": TokenType.ROW,
 580        "ROWS": TokenType.ROWS,
 581        "SCHEMA": TokenType.SCHEMA,
 582        "SELECT": TokenType.SELECT,
 583        "SEMI": TokenType.SEMI,
 584        "SET": TokenType.SET,
 585        "SETTINGS": TokenType.SETTINGS,
 586        "SHOW": TokenType.SHOW,
 587        "SIMILAR TO": TokenType.SIMILAR_TO,
 588        "SOME": TokenType.SOME,
 589        "TABLE": TokenType.TABLE,
 590        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
 591        "TEMP": TokenType.TEMPORARY,
 592        "TEMPORARY": TokenType.TEMPORARY,
 593        "THEN": TokenType.THEN,
 594        "TRUE": TokenType.TRUE,
 595        "UNION": TokenType.UNION,
 596        "UNNEST": TokenType.UNNEST,
 597        "UNPIVOT": TokenType.UNPIVOT,
 598        "UPDATE": TokenType.UPDATE,
 599        "USE": TokenType.USE,
 600        "USING": TokenType.USING,
 601        "UUID": TokenType.UUID,
 602        "VALUES": TokenType.VALUES,
 603        "VIEW": TokenType.VIEW,
 604        "VOLATILE": TokenType.VOLATILE,
 605        "WHEN": TokenType.WHEN,
 606        "WHERE": TokenType.WHERE,
 607        "WINDOW": TokenType.WINDOW,
 608        "WITH": TokenType.WITH,
 609        "APPLY": TokenType.APPLY,
 610        "ARRAY": TokenType.ARRAY,
 611        "BIT": TokenType.BIT,
 612        "BOOL": TokenType.BOOLEAN,
 613        "BOOLEAN": TokenType.BOOLEAN,
 614        "BYTE": TokenType.TINYINT,
 615        "TINYINT": TokenType.TINYINT,
 616        "SHORT": TokenType.SMALLINT,
 617        "SMALLINT": TokenType.SMALLINT,
 618        "INT2": TokenType.SMALLINT,
 619        "INTEGER": TokenType.INT,
 620        "INT": TokenType.INT,
 621        "INT4": TokenType.INT,
 622        "LONG": TokenType.BIGINT,
 623        "BIGINT": TokenType.BIGINT,
 624        "INT8": TokenType.BIGINT,
 625        "DEC": TokenType.DECIMAL,
 626        "DECIMAL": TokenType.DECIMAL,
 627        "BIGDECIMAL": TokenType.BIGDECIMAL,
 628        "BIGNUMERIC": TokenType.BIGDECIMAL,
 629        "MAP": TokenType.MAP,
 630        "NULLABLE": TokenType.NULLABLE,
 631        "NUMBER": TokenType.DECIMAL,
 632        "NUMERIC": TokenType.DECIMAL,
 633        "FIXED": TokenType.DECIMAL,
 634        "REAL": TokenType.FLOAT,
 635        "FLOAT": TokenType.FLOAT,
 636        "FLOAT4": TokenType.FLOAT,
 637        "FLOAT8": TokenType.DOUBLE,
 638        "DOUBLE": TokenType.DOUBLE,
 639        "DOUBLE PRECISION": TokenType.DOUBLE,
 640        "JSON": TokenType.JSON,
 641        "CHAR": TokenType.CHAR,
 642        "CHARACTER": TokenType.CHAR,
 643        "NCHAR": TokenType.NCHAR,
 644        "VARCHAR": TokenType.VARCHAR,
 645        "VARCHAR2": TokenType.VARCHAR,
 646        "NVARCHAR": TokenType.NVARCHAR,
 647        "NVARCHAR2": TokenType.NVARCHAR,
 648        "STR": TokenType.TEXT,
 649        "STRING": TokenType.TEXT,
 650        "TEXT": TokenType.TEXT,
 651        "CLOB": TokenType.TEXT,
 652        "LONGVARCHAR": TokenType.TEXT,
 653        "BINARY": TokenType.BINARY,
 654        "BLOB": TokenType.VARBINARY,
 655        "BYTEA": TokenType.VARBINARY,
 656        "VARBINARY": TokenType.VARBINARY,
 657        "TIME": TokenType.TIME,
 658        "TIMESTAMP": TokenType.TIMESTAMP,
 659        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
 660        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
 661        "DATE": TokenType.DATE,
 662        "DATETIME": TokenType.DATETIME,
 663        "UNIQUE": TokenType.UNIQUE,
 664        "STRUCT": TokenType.STRUCT,
 665        "VARIANT": TokenType.VARIANT,
 666        "ALTER": TokenType.ALTER,
 667        "ANALYZE": TokenType.COMMAND,
 668        "CALL": TokenType.COMMAND,
 669        "COMMENT": TokenType.COMMENT,
 670        "COPY": TokenType.COMMAND,
 671        "EXPLAIN": TokenType.COMMAND,
 672        "GRANT": TokenType.COMMAND,
 673        "OPTIMIZE": TokenType.COMMAND,
 674        "PREPARE": TokenType.COMMAND,
 675        "TRUNCATE": TokenType.COMMAND,
 676        "VACUUM": TokenType.COMMAND,
 677    }
 678
 679    WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = {
 680        " ": TokenType.SPACE,
 681        "\t": TokenType.SPACE,
 682        "\n": TokenType.BREAK,
 683        "\r": TokenType.BREAK,
 684        "\r\n": TokenType.BREAK,
 685    }
 686
 687    COMMANDS = {
 688        TokenType.COMMAND,
 689        TokenType.EXECUTE,
 690        TokenType.FETCH,
 691        TokenType.SHOW,
 692    }
 693
 694    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}
 695
 696    # handle numeric literals like in hive (3L = BIGINT)
 697    NUMERIC_LITERALS: t.Dict[str, str] = {}
 698    ENCODE: t.Optional[str] = None
 699
 700    COMMENTS = ["--", ("/*", "*/"), ("{#", "#}")]
 701    KEYWORD_TRIE: t.Dict = {}  # autofilled
 702
 703    IDENTIFIER_CAN_START_WITH_DIGIT = False
 704
 705    __slots__ = (
 706        "sql",
 707        "size",
 708        "tokens",
 709        "_start",
 710        "_current",
 711        "_line",
 712        "_col",
 713        "_comments",
 714        "_char",
 715        "_end",
 716        "_peek",
 717        "_prev_token_line",
 718    )
 719
 720    def __init__(self) -> None:
 721        self.reset()
 722
 723    def reset(self) -> None:
 724        self.sql = ""
 725        self.size = 0
 726        self.tokens: t.List[Token] = []
 727        self._start = 0
 728        self._current = 0
 729        self._line = 1
 730        self._col = 0
 731        self._comments: t.List[str] = []
 732
 733        self._char = ""
 734        self._end = False
 735        self._peek = ""
 736        self._prev_token_line = -1
 737
 738    def tokenize(self, sql: str) -> t.List[Token]:
 739        """Returns a list of tokens corresponding to the SQL string `sql`."""
 740        self.reset()
 741        self.sql = sql
 742        self.size = len(sql)
 743
 744        try:
 745            self._scan()
 746        except Exception as e:
 747            start = max(self._current - 50, 0)
 748            end = min(self._current + 50, self.size - 1)
 749            context = self.sql[start:end]
 750            raise ValueError(f"Error tokenizing '{context}'") from e
 751
 752        return self.tokens
 753
 754    def _scan(self, until: t.Optional[t.Callable] = None) -> None:
 755        while self.size and not self._end:
 756            self._start = self._current
 757            self._advance()
 758
 759            if self._char is None:
 760                break
 761
 762            if self._char not in self.WHITE_SPACE:
 763                if self._char.isdigit():
 764                    self._scan_number()
 765                elif self._char in self._IDENTIFIERS:
 766                    self._scan_identifier(self._IDENTIFIERS[self._char])
 767                else:
 768                    self._scan_keywords()
 769
 770            if until and until():
 771                break
 772
 773        if self.tokens and self._comments:
 774            self.tokens[-1].comments.extend(self._comments)
 775
 776    def _chars(self, size: int) -> str:
 777        if size == 1:
 778            return self._char
 779
 780        start = self._current - 1
 781        end = start + size
 782
 783        return self.sql[start:end] if end <= self.size else ""
 784
 785    def _advance(self, i: int = 1, alnum: bool = False) -> None:
 786        if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
 787            self._col = 1
 788            self._line += 1
 789        else:
 790            self._col += i
 791
 792        self._current += i
 793        self._end = self._current >= self.size
 794        self._char = self.sql[self._current - 1]
 795        self._peek = "" if self._end else self.sql[self._current]
 796
 797        if alnum and self._char.isalnum():
 798            # Here we use local variables instead of attributes for better performance
 799            _col = self._col
 800            _current = self._current
 801            _end = self._end
 802            _peek = self._peek
 803
 804            while _peek.isalnum():
 805                _col += 1
 806                _current += 1
 807                _end = _current >= self.size
 808                _peek = "" if _end else self.sql[_current]
 809
 810            self._col = _col
 811            self._current = _current
 812            self._end = _end
 813            self._peek = _peek
 814            self._char = self.sql[_current - 1]
 815
 816    @property
 817    def _text(self) -> str:
 818        return self.sql[self._start : self._current]
 819
 820    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
 821        self._prev_token_line = self._line
 822        self.tokens.append(
 823            Token(
 824                token_type,
 825                text=self._text if text is None else text,
 826                line=self._line,
 827                col=self._col,
 828                start=self._start,
 829                end=self._current - 1,
 830                comments=self._comments,
 831            )
 832        )
 833        self._comments = []
 834
 835        # If we have either a semicolon or a begin token before the command's token, we'll parse
 836        # whatever follows the command's token as a string
 837        if (
 838            token_type in self.COMMANDS
 839            and self._peek != ";"
 840            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS)
 841        ):
 842            start = self._current
 843            tokens = len(self.tokens)
 844            self._scan(lambda: self._peek == ";")
 845            self.tokens = self.tokens[:tokens]
 846            text = self.sql[start : self._current].strip()
 847            if text:
 848                self._add(TokenType.STRING, text)
 849
 850    def _scan_keywords(self) -> None:
 851        size = 0
 852        word = None
 853        chars = self._text
 854        char = chars
 855        prev_space = False
 856        skip = False
 857        trie = self.KEYWORD_TRIE
 858        single_token = char in self.SINGLE_TOKENS
 859
 860        while chars:
 861            if skip:
 862                result = 1
 863            else:
 864                result, trie = in_trie(trie, char.upper())
 865
 866            if result == 0:
 867                break
 868            if result == 2:
 869                word = chars
 870
 871            size += 1
 872            end = self._current - 1 + size
 873
 874            if end < self.size:
 875                char = self.sql[end]
 876                single_token = single_token or char in self.SINGLE_TOKENS
 877                is_space = char in self.WHITE_SPACE
 878
 879                if not is_space or not prev_space:
 880                    if is_space:
 881                        char = " "
 882                    chars += char
 883                    prev_space = is_space
 884                    skip = False
 885                else:
 886                    skip = True
 887            else:
 888                char = ""
 889                chars = " "
 890
 891        word = None if not single_token and chars[-1] not in self.WHITE_SPACE else word
 892
 893        if not word:
 894            if self._char in self.SINGLE_TOKENS:
 895                self._add(self.SINGLE_TOKENS[self._char], text=self._char)
 896                return
 897            self._scan_var()
 898            return
 899
 900        if self._scan_string(word):
 901            return
 902        if self._scan_formatted_string(word):
 903            return
 904        if self._scan_comment(word):
 905            return
 906
 907        self._advance(size - 1)
 908        word = word.upper()
 909        self._add(self.KEYWORDS[word], text=word)
 910
 911    def _scan_comment(self, comment_start: str) -> bool:
 912        if comment_start not in self._COMMENTS:
 913            return False
 914
 915        comment_start_line = self._line
 916        comment_start_size = len(comment_start)
 917        comment_end = self._COMMENTS[comment_start]
 918
 919        if comment_end:
 920            # Skip the comment's start delimiter
 921            self._advance(comment_start_size)
 922
 923            comment_end_size = len(comment_end)
 924            while not self._end and self._chars(comment_end_size) != comment_end:
 925                self._advance(alnum=True)
 926
 927            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
 928            self._advance(comment_end_size - 1)
 929        else:
 930            while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK:
 931                self._advance(alnum=True)
 932            self._comments.append(self._text[comment_start_size:])
 933
 934        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
 935        # Multiple consecutive comments are preserved by appending them to the current comments list.
 936        if comment_start_line == self._prev_token_line:
 937            self.tokens[-1].comments.extend(self._comments)
 938            self._comments = []
 939            self._prev_token_line = self._line
 940
 941        return True
 942
 943    def _scan_number(self) -> None:
 944        if self._char == "0":
 945            peek = self._peek.upper()
 946            if peek == "B":
 947                return self._scan_bits() if self._BIT_STRINGS else self._add(TokenType.NUMBER)
 948            elif peek == "X":
 949                return self._scan_hex() if self._HEX_STRINGS else self._add(TokenType.NUMBER)
 950
 951        decimal = False
 952        scientific = 0
 953
 954        while True:
 955            if self._peek.isdigit():
 956                self._advance()
 957            elif self._peek == "." and not decimal:
 958                decimal = True
 959                self._advance()
 960            elif self._peek in ("-", "+") and scientific == 1:
 961                scientific += 1
 962                self._advance()
 963            elif self._peek.upper() == "E" and not scientific:
 964                scientific += 1
 965                self._advance()
 966            elif self._peek.isidentifier():
 967                number_text = self._text
 968                literal = ""
 969
 970                while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:
 971                    literal += self._peek.upper()
 972                    self._advance()
 973
 974                token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal))
 975
 976                if token_type:
 977                    self._add(TokenType.NUMBER, number_text)
 978                    self._add(TokenType.DCOLON, "::")
 979                    return self._add(token_type, literal)
 980                elif self.IDENTIFIER_CAN_START_WITH_DIGIT:
 981                    return self._add(TokenType.VAR)
 982
 983                self._add(TokenType.NUMBER, number_text)
 984                return self._advance(-len(literal))
 985            else:
 986                return self._add(TokenType.NUMBER)
 987
 988    def _scan_bits(self) -> None:
 989        self._advance()
 990        value = self._extract_value()
 991        try:
 992            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
 993            int(value, 2)
 994            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
 995        except ValueError:
 996            self._add(TokenType.IDENTIFIER)
 997
 998    def _scan_hex(self) -> None:
 999        self._advance()
1000        value = self._extract_value()
1001        try:
1002            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
1003            int(value, 16)
1004            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
1005        except ValueError:
1006            self._add(TokenType.IDENTIFIER)
1007
1008    def _extract_value(self) -> str:
1009        while True:
1010            char = self._peek.strip()
1011            if char and char not in self.SINGLE_TOKENS:
1012                self._advance(alnum=True)
1013            else:
1014                break
1015
1016        return self._text
1017
1018    def _scan_string(self, quote: str) -> bool:
1019        quote_end = self._QUOTES.get(quote)
1020        if quote_end is None:
1021            return False
1022
1023        self._advance(len(quote))
1024        text = self._extract_string(quote_end)
1025        text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
1026        self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text)
1027        return True
1028
1029    # X'1234', b'0110', E'\\\\\' etc.
1030    def _scan_formatted_string(self, string_start: str) -> bool:
1031        if string_start in self._HEX_STRINGS:
1032            delimiters = self._HEX_STRINGS
1033            token_type = TokenType.HEX_STRING
1034            base = 16
1035        elif string_start in self._BIT_STRINGS:
1036            delimiters = self._BIT_STRINGS
1037            token_type = TokenType.BIT_STRING
1038            base = 2
1039        elif string_start in self._BYTE_STRINGS:
1040            delimiters = self._BYTE_STRINGS
1041            token_type = TokenType.BYTE_STRING
1042            base = None
1043        else:
1044            return False
1045
1046        self._advance(len(string_start))
1047        string_end = delimiters[string_start]
1048        text = self._extract_string(string_end)
1049
1050        if base:
1051            try:
1052                int(text, base)
1053            except:
1054                raise RuntimeError(
1055                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
1056                )
1057
1058        self._add(token_type, text)
1059        return True
1060
1061    def _scan_identifier(self, identifier_end: str) -> None:
1062        self._advance()
1063        text = self._extract_string(identifier_end, self._IDENTIFIER_ESCAPES)
1064        self._add(TokenType.IDENTIFIER, text)
1065
1066    def _scan_var(self) -> None:
1067        while True:
1068            char = self._peek.strip()
1069            if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
1070                self._advance(alnum=True)
1071            else:
1072                break
1073
1074        self._add(
1075            TokenType.VAR
1076            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
1077            else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
1078        )
1079
1080    def _extract_string(self, delimiter: str, escapes=None) -> str:
1081        text = ""
1082        delim_size = len(delimiter)
1083        escapes = self._STRING_ESCAPES if escapes is None else escapes
1084
1085        while True:
1086            if self._char in escapes and (self._peek == delimiter or self._peek in escapes):
1087                if self._peek == delimiter:
1088                    text += self._peek
1089                else:
1090                    text += self._char + self._peek
1091
1092                if self._current + 1 < self.size:
1093                    self._advance(2)
1094                else:
1095                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._current}")
1096            else:
1097                if self._chars(delim_size) == delimiter:
1098                    if delim_size > 1:
1099                        self._advance(delim_size - 1)
1100                    break
1101
1102                if self._end:
1103                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}")
1104
1105                current = self._current - 1
1106                self._advance(alnum=True)
1107                text += self.sql[current : self._current - 1]
1108
1109        return text
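
Tokenizer is configured through class-level tables, and KEYWORD_TRIE is autofilled (per the source comment above), so a dialect customizes tokenization by subclassing and overriding those tables. A minimal sketch; MATVIEW is a hypothetical keyword invented for illustration:

from sqlglot.tokens import Tokenizer, TokenType

class MyTokenizer(Tokenizer):
    # Single-word keywords are resolved through this dict in _scan_var,
    # so entries added here extend what the base tokenizer recognizes.
    KEYWORDS = {
        **Tokenizer.KEYWORDS,
        "MATVIEW": TokenType.COMMAND,  # hypothetical, for illustration only
    }

# COMMAND is in COMMANDS, so the remainder of the statement
# is captured as a single STRING token (see _add above).
tokens = MyTokenizer().tokenize("MATVIEW my_view")
# -> [COMMAND 'MATVIEW', STRING 'my_view']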
def reset(self) -> None:
723    def reset(self) -> None:
724        self.sql = ""
725        self.size = 0
726        self.tokens: t.List[Token] = []
727        self._start = 0
728        self._current = 0
729        self._line = 1
730        self._col = 0
731        self._comments: t.List[str] = []
732
733        self._char = ""
734        self._end = False
735        self._peek = ""
736        self._prev_token_line = -1
def tokenize(self, sql: str) -> List[sqlglot.tokens.Token]:
738    def tokenize(self, sql: str) -> t.List[Token]:
739        """Returns a list of tokens corresponding to the SQL string `sql`."""
740        self.reset()
741        self.sql = sql
742        self.size = len(sql)
743
744        try:
745            self._scan()
746        except Exception as e:
747            start = max(self._current - 50, 0)
748            end = min(self._current + 50, self.size - 1)
749            context = self.sql[start:end]
750            raise ValueError(f"Error tokenizing '{context}'") from e
751
752        return self.tokens

Returns a list of tokens corresponding to the SQL string sql.
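
End to end, a short sketch of tokenize on a simple expression; the expected token types and texts follow the SINGLE_TOKENS and KEYWORDS tables above:

from sqlglot.tokens import Tokenizer

tokens = Tokenizer().tokenize("SELECT a + 1 AS b")
print([(tok.token_type.name, tok.text) for tok in tokens])
# [('SELECT', 'SELECT'), ('VAR', 'a'), ('PLUS', '+'),
#  ('NUMBER', '1'), ('ALIAS', 'AS'), ('VAR', 'b')]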