# -*- coding: utf-8 -*-
"""
    pygments.lexer
    ~~~~~~~~~~~~~~

    Base lexer classes.

    :copyright: Copyright 2006-2021 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
import sys
import time

from pygments.filter import apply_filters, Filter
from pygments.filters import get_filter_by_name
from pygments.token import Error, Text, Other, _TokenType
from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
    make_analysator, Future, guess_decode
from pygments.regexopt import regex_opt

__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
           'default', 'words']


_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
                 (b'\xff\xfe\0\0', 'utf-32'),
                 (b'\0\0\xfe\xff', 'utf-32be'),
                 (b'\xff\xfe', 'utf-16'),
                 (b'\xfe\xff', 'utf-16be')]

_default_analyse = staticmethod(lambda x: 0.0)


class LexerMeta(type):
    """
    This metaclass automagically converts ``analyse_text`` methods into
    static methods which always return float values.
    """

    def __new__(mcs, name, bases, d):
        if 'analyse_text' in d:
            d['analyse_text'] = make_analysator(d['analyse_text'])
        return type.__new__(mcs, name, bases, d)


class Lexer(metaclass=LexerMeta):
    """
    Lexer for a specific language.

    Basic options recognized:
    ``stripnl``
        Strip leading and trailing newlines from the input (default: True).
    ``stripall``
        Strip all leading and trailing whitespace from the input
        (default: False).
    ``ensurenl``
        Make sure that the input ends with a newline (default: True). This
        is required for some lexers that consume input linewise.

        .. versionadded:: 1.3

    ``tabsize``
        If given and greater than 0, expand tabs in the input (default: 0).
    ``encoding``
        If given, must be an encoding name. This encoding will be used to
        convert the input string to Unicode, if it is not already a Unicode
        string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
        Latin1 detection). Can also be ``'chardet'`` to use the chardet
        library, if it is installed.
    ``inencoding``
        Overrides the ``encoding`` if given.
    """

    #: Name of the lexer
    name = None

    #: Shortcuts for the lexer
    aliases = []

    #: File name globs
    filenames = []

    #: Secondary file name globs
    alias_filenames = []

    #: MIME types
    mimetypes = []

    #: Priority, should multiple lexers match and no content is provided
    priority = 0

    def __init__(self, **options):
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
        self.ensurenl = get_bool_opt(options, 'ensurenl', True)
        self.tabsize = get_int_opt(options, 'tabsize', 0)
        self.encoding = options.get('encoding', 'guess')
        self.encoding = options.get('inencoding') or self.encoding
        self.filters = []
        for filter_ in get_list_opt(options, 'filters', ()):
            self.add_filter(filter_)

    def __repr__(self):
        if self.options:
            return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
                                                     self.options)
        else:
            return '<pygments.lexers.%s>' % self.__class__.__name__

    def add_filter(self, filter_, **options):
        """
        Add a new stream filter to this lexer.
        """
        if not isinstance(filter_, Filter):
            filter_ = get_filter_by_name(filter_, **options)
        self.filters.append(filter_)

    def analyse_text(text):
        """
        Has to return a float between ``0`` and ``1`` that indicates
        if a lexer wants to highlight this text. Used by ``guess_lexer``.
        If this method returns ``0`` it won't highlight it in any case, if
        it returns ``1`` highlighting with this lexer is guaranteed.

        The `LexerMeta` metaclass automatically wraps this function so
        that it works like a static method (no ``self`` or ``cls``
        parameter) and the return value is automatically converted to
        `float`. If the return value is an object that is boolean `False`
        it's the same as if the return value was ``0.0``.
        """

    def get_tokens(self, text, unfiltered=False):
        """
        Return an iterable of (tokentype, value) pairs generated from
        `text`. If `unfiltered` is set to `True`, the filtering mechanism
        is bypassed even if filters are defined.

        Also preprocess the text, i.e. expand tabs and strip it if
        wanted, and apply registered filters.
        """
        if not isinstance(text, str):
            if self.encoding == 'guess':
                text, _ = guess_decode(text)
            elif self.encoding == 'chardet':
                try:
                    import chardet
                except ImportError as e:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/') from e
                # check for BOM first
                decoded = None
                for bom, encoding in _encoding_map:
                    if text.startswith(bom):
                        decoded = text[len(bom):].decode(encoding, 'replace')
                        break
                # no BOM found, so use chardet
                if decoded is None:
                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
                    decoded = text.decode(enc.get('encoding') or 'utf-8',
                                          'replace')
                text = decoded
            else:
                text = text.decode(self.encoding)
                if text.startswith('\ufeff'):
                    text = text[len('\ufeff'):]
        else:
            if text.startswith('\ufeff'):
                text = text[len('\ufeff'):]

        # text now *is* a unicode string
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')
        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        if self.ensurenl and not text.endswith('\n'):
            text += '\n'

        def streamer():
            for _, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream

    def get_tokens_unprocessed(self, text):
        """
        Return an iterable of (index, tokentype, value) pairs where "index"
        is the starting position of the token within the input text.

        In subclasses, implement this method as a generator to
        maximize effectiveness.
        """
        raise NotImplementedError


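# Illustrative sketch (not part of the original module): how the preprocessing
# options documented on ``Lexer`` are typically passed and how ``get_tokens``
# is consumed. ``PythonLexer`` is used only as a stand-in for any concrete
# ``Lexer`` subclass.
#
#     from pygments.lexers import PythonLexer
#
#     lx = PythonLexer(stripall=True, tabsize=4, encoding='guess')
#     for tokentype, value in lx.get_tokens(b'if x:\n\treturn 1\n'):
#         print(tokentype, repr(value))
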

class DelegatingLexer(Lexer):
    """
    This lexer takes two lexers as arguments: a root lexer and
    a language lexer. First everything is scanned using the language
    lexer, afterwards all ``Other`` tokens are lexed using the root
    lexer.

    The lexers from the ``template`` lexer package use this base lexer.
    """

    def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
        self.root_lexer = _root_lexer(**options)
        self.language_lexer = _language_lexer(**options)
        self.needle = _needle
        Lexer.__init__(self, **options)

    def get_tokens_unprocessed(self, text):
        buffered = ''
        insertions = []
        lng_buffer = []
        for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
            if t is self.needle:
                if lng_buffer:
                    insertions.append((len(buffered), lng_buffer))
                    lng_buffer = []
                buffered += v
            else:
                lng_buffer.append((i, t, v))
        if lng_buffer:
            insertions.append((len(buffered), lng_buffer))
        return do_insertions(insertions,
                             self.root_lexer.get_tokens_unprocessed(buffered))


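# Illustrative sketch (not part of the original module): a DelegatingLexer
# wiring a host language over an embedded one, in the style of the template
# lexers mentioned above. ``HtmlLexer`` and ``PhpLexer`` are ordinary Pygments
# lexers used here only as examples; the class name is made up.
#
#     from pygments.lexers import HtmlLexer, PhpLexer
#
#     class HtmlPhpDelegatingLexer(DelegatingLexer):
#         def __init__(self, **options):
#             super().__init__(HtmlLexer, PhpLexer, **options)
#
#     # PHP code is lexed first; everything the PHP lexer reports as ``Other``
#     # (the surrounding plain HTML) is then re-lexed by HtmlLexer.
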

# ------------------------------------------------------------------------------
# RegexLexer and ExtendedRegexLexer
#


class include(str):  # pylint: disable=invalid-name
    """
    Indicates that a state should include rules from another state.
    """
    pass


class _inherit:
    """
    Indicates that a state should inherit from its superclass.
    """
    def __repr__(self):
        return 'inherit'

inherit = _inherit()  # pylint: disable=invalid-name


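# Illustrative sketch (not part of the original module): how ``include`` and
# ``inherit`` appear inside a ``tokens`` definition of a RegexLexer subclass
# (state names and rules are made up).
#
#     tokens = {
#         'root': [
#             include('whitespace'),   # splice in the rules of 'whitespace'
#             inherit,                 # keep the superclass rules for 'root'
#         ],
#         'whitespace': [
#             (r'\s+', Text),
#         ],
#     }
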

class combined(tuple):  # pylint: disable=invalid-name
    """
    Indicates a state combined from multiple states.
    """

    def __new__(cls, *args):
        return tuple.__new__(cls, args)

    def __init__(self, *args):
        # tuple.__init__ doesn't do anything
        pass


class _PseudoMatch:
    """
    A pseudo match object constructed from a string.
    """

    def __init__(self, start, text):
        self._text = text
        self._start = start

    def start(self, arg=None):
        return self._start

    def end(self, arg=None):
        return self._start + len(self._text)

    def group(self, arg=None):
        if arg:
            raise IndexError('No such group')
        return self._text

    def groups(self):
        return (self._text,)

    def groupdict(self):
        return {}


def bygroups(*args):
    """
    Callback that yields multiple actions for each group in the match.
    """
    def callback(lexer, match, ctx=None):
        for i, action in enumerate(args):
            if action is None:
                continue
            elif type(action) is _TokenType:
                data = match.group(i + 1)
                if data:
                    yield match.start(i + 1), action, data
            else:
                data = match.group(i + 1)
                if data is not None:
                    if ctx:
                        ctx.pos = match.start(i + 1)
                    for item in action(lexer,
                                       _PseudoMatch(match.start(i + 1), data), ctx):
                        if item:
                            yield item
        if ctx:
            ctx.pos = match.end()
    return callback


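# Illustrative sketch (not part of the original module): a single rule using
# ``bygroups`` inside a ``tokens`` table, splitting ``name = "value"`` into
# separately typed tokens. The token types are illustrative choices.
#
#     from pygments.token import Name, Operator, String, Whitespace
#
#     (r'(\w+)(\s*)(=)(\s*)("[^"]*")',
#      bygroups(Name.Attribute, Whitespace, Operator, Whitespace, String))
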

class _This:
    """
    Special singleton used for indicating the caller class.
    Used by ``using``.
    """

this = _This()


def using(_other, **kwargs):
    """
    Callback that processes the match with a different lexer.

    The keyword arguments are forwarded to the lexer, except `state` which
    is handled separately.

    `state` specifies the state that the new lexer will start in, and can
    be an enumerable such as ('root', 'inline', 'string') or a simple
    string which is assumed to be on top of the root state.

    Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
    """
    gt_kwargs = {}
    if 'state' in kwargs:
        s = kwargs.pop('state')
        if isinstance(s, (list, tuple)):
            gt_kwargs['stack'] = s
        else:
            gt_kwargs['stack'] = ('root', s)

    if _other is this:
        def callback(lexer, match, ctx=None):
            # if keyword arguments are given the callback
            # function has to create a new lexer instance
            if kwargs:
                # XXX: cache that somehow
                kwargs.update(lexer.options)
                lx = lexer.__class__(**kwargs)
            else:
                lx = lexer
            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    else:
        def callback(lexer, match, ctx=None):
            # XXX: cache that somehow
            kwargs.update(lexer.options)
            lx = _other(**kwargs)

            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    return callback


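# Illustrative sketch (not part of the original module): delegating part of a
# match to another lexer with ``using``. ``JavascriptLexer`` is an ordinary
# Pygments lexer, used here only as an example; the rule re-lexes the body of
# a <script> element.
#
#     from pygments.lexers import JavascriptLexer
#     from pygments.token import Name
#
#     (r'(<script>)(.*?)(</script>)',
#      bygroups(Name.Tag, using(JavascriptLexer), Name.Tag))
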

class default:
    """
    Indicates a state or state action (e.g. #pop) to apply.
    For example, default('#pop') is equivalent to ('', Token, '#pop').
    Note that state tuples may be used as well.

    .. versionadded:: 2.0
    """
    def __init__(self, state):
        self.state = state


class words(Future):
    """
    Indicates a list of literal words that is transformed into an optimized
    regex that matches any of the words.

    .. versionadded:: 2.0
    """
    def __init__(self, words, prefix='', suffix=''):
        self.words = words
        self.prefix = prefix
        self.suffix = suffix

    def get(self):
        return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)


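# Illustrative sketch (not part of the original module): a rule built with
# ``words``, matching any of a fixed keyword set as whole words. The keyword
# list and token type are illustrative.
#
#     from pygments.token import Keyword
#
#     (words(('if', 'elif', 'else', 'while'), prefix=r'\b', suffix=r'\b'),
#      Keyword)
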

class RegexLexerMeta(LexerMeta):
    """
    Metaclass for RegexLexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_regex(cls, regex, rflags, state):
        """Preprocess the regular expression component of a token definition."""
        if isinstance(regex, Future):
            regex = regex.get()
        return re.compile(regex, rflags).match

    def _process_token(cls, token):
        """Preprocess the token component of a token definition."""
        assert type(token) is _TokenType or callable(token), \
            'token type must be simple type or callable, not %r' % (token,)
        return token

    def _process_new_state(cls, new_state, unprocessed, processed):
        """Preprocess the state transition action of a token definition."""
        if isinstance(new_state, str):
            # an existing state
            if new_state == '#pop':
                return -1
            elif new_state in unprocessed:
                return (new_state,)
            elif new_state == '#push':
                return new_state
            elif new_state[:5] == '#pop:':
                return -int(new_state[5:])
            else:
                assert False, 'unknown new state %r' % new_state
        elif isinstance(new_state, combined):
            # combine a new state from existing ones
            tmp_state = '_tmp_%d' % cls._tmpname
            cls._tmpname += 1
            itokens = []
            for istate in new_state:
                assert istate != new_state, 'circular state ref %r' % istate
                itokens.extend(cls._process_state(unprocessed,
                                                  processed, istate))
            processed[tmp_state] = itokens
            return (tmp_state,)
        elif isinstance(new_state, tuple):
            # push more than one state
            for istate in new_state:
                assert (istate in unprocessed or
                        istate in ('#pop', '#push')), \
                    'unknown new state ' + istate
            return new_state
        else:
            assert False, 'unknown new state def %r' % new_state

    def _process_state(cls, unprocessed, processed, state):
        """Preprocess a single state definition."""
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokens = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokens.extend(cls._process_state(unprocessed, processed,
                                                 str(tdef)))
                continue
            if isinstance(tdef, _inherit):
                # should be processed already, but may not in the case of:
                # 1. the state has no counterpart in any parent
                # 2. the state includes more than one 'inherit'
                continue
            if isinstance(tdef, default):
                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
                tokens.append((re.compile('').match, None, new_state))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = cls._process_regex(tdef[0], rflags, state)
            except Exception as err:
                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                 (tdef[0], state, cls, err)) from err

            token = cls._process_token(tdef[1])

            if len(tdef) == 2:
                new_state = None
            else:
                new_state = cls._process_new_state(tdef[2],
                                                   unprocessed, processed)

            tokens.append((rex, token, new_state))
        return tokens

    def process_tokendef(cls, name, tokendefs=None):
        """Preprocess a dictionary of token definitions."""
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in list(tokendefs):
            cls._process_state(tokendefs, processed, state)
        return processed

    def get_tokendefs(cls):
        """
        Merge tokens from superclasses in MRO order, returning a single tokendef
        dictionary.

        Any state that is not defined by a subclass will be inherited
        automatically. States that *are* defined by subclasses will, by
        default, override that state in the superclass. If a subclass wishes to
        inherit definitions from a superclass, it can use the special value
        "inherit", which will cause the superclass' state definition to be
        included at that point in the state.
        """
        tokens = {}
        inheritable = {}
        for c in cls.__mro__:
            toks = c.__dict__.get('tokens', {})

            for state, items in toks.items():
                curitems = tokens.get(state)
                if curitems is None:
                    # N.b. because this is assigned by reference, sufficiently
                    # deep hierarchies are processed incrementally (e.g. for
                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
                    # will not see any inherits in B).
                    tokens[state] = items
                    try:
                        inherit_ndx = items.index(inherit)
                    except ValueError:
                        continue
                    inheritable[state] = inherit_ndx
                    continue

                inherit_ndx = inheritable.pop(state, None)
                if inherit_ndx is None:
                    continue

                # Replace the "inherit" value with the items
                curitems[inherit_ndx:inherit_ndx+1] = items
                try:
                    # N.b. this is the index in items (that is, the superclass
                    # copy), so offset required when storing below.
                    new_inh_ndx = items.index(inherit)
                except ValueError:
                    pass
                else:
                    inheritable[state] = inherit_ndx + new_inh_ndx

        return tokens

    def __call__(cls, *args, **kwds):
        """Instantiate cls after preprocessing its token definitions."""
        if '_tokens' not in cls.__dict__:
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.get_tokendefs())

        return type.__call__(cls, *args, **kwds)


class RegexLexer(Lexer, metaclass=RegexLexerMeta):
    """
    Base for simple stateful regular expression-based lexers.
    Simplifies the lexing process so that you need only
    provide a list of states and regular expressions.
    """

    #: Flags for compiling the regular expressions.
    #: Defaults to MULTILINE.
    flags = re.MULTILINE

    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
    #:
    #: The initial state is 'root'.
    #: ``new_state`` can be omitted to signify no state transition.
    #: If it is a string, the state is pushed on the stack and changed.
    #: If it is a tuple of strings, all states are pushed on the stack and
    #: the current state will be the topmost.
    #: It can also be ``combined('state1', 'state2', ...)``
    #: to signify a new, anonymous state combined from the rules of two
    #: or more existing ones.
    #: Furthermore, it can be '#pop' to signify going back one step in
    #: the state stack, or '#push' to push the current state on the stack
    #: again.
    #:
    #: The tuple can also be replaced with ``include('state')``, in which
    #: case the rules from the state named by the string are included in the
    #: current one.
    tokens = {}

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the initial stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield pos, action, m.group()
                        else:
                            yield from action(self, m)
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(statestack) > 1:
                                        statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop, but keep at least one state on the stack
                            # (random code leading to unexpected pops should
                            # not allow exceptions)
                            if abs(new_state) >= len(statestack):
                                del statestack[1:]
                            else:
                                del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                # We are here only if all state tokens have been considered
                # and there was not a match on any of them.
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Text, '\n'
                        pos += 1
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    break


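# Illustrative sketch (not part of the original module): a minimal RegexLexer
# subclass showing the ``tokens`` layout documented above, including a state
# push and a '#pop'. The language, state names and token types are made up.
#
#     from pygments.token import Comment, Keyword, Name, Text
#
#     class IniLikeLexer(RegexLexer):
#         name = 'IniLike'
#         aliases = ['inilike']
#         tokens = {
#             'root': [
#                 (r'[;#].*$', Comment.Single),
#                 (r'\[', Keyword, 'section'),      # push 'section'
#                 (r'\w+', Name.Attribute),
#                 (r'[=\s]+', Text),
#                 (r'.', Text),
#             ],
#             'section': [
#                 (r'[^\]]+', Name.Namespace),
#                 (r'\]', Keyword, '#pop'),         # back to 'root'
#             ],
#         }
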

class LexerContext:
    """
    A helper object that holds lexer position data.
    """

    def __init__(self, text, pos, stack=None, end=None):
        self.text = text
        self.pos = pos
        self.end = end or len(text)  # end=0 not supported ;-)
        self.stack = stack or ['root']

    def __repr__(self):
        return 'LexerContext(%r, %r, %r)' % (
            self.text, self.pos, self.stack)


class ExtendedRegexLexer(RegexLexer):
    """
    A RegexLexer that uses a context object to store its state.
    """

    def get_tokens_unprocessed(self, text=None, context=None):
        """
        Split ``text`` into (tokentype, text) pairs.
        If ``context`` is given, use this lexer context instead.
        """
        tokendefs = self._tokens
        if not context:
            ctx = LexerContext(text, 0)
            statetokens = tokendefs['root']
        else:
            ctx = context
            statetokens = tokendefs[ctx.stack[-1]]
            text = ctx.text
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, ctx.pos, ctx.end)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield ctx.pos, action, m.group()
                            ctx.pos = m.end()
                        else:
                            yield from action(self, m, ctx)
                            if not new_state:
                                # altered the state stack?
                                statetokens = tokendefs[ctx.stack[-1]]
                                # CAUTION: callback must set ctx.pos!
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(ctx.stack) > 1:
                                        ctx.stack.pop()
                                elif state == '#push':
                                    ctx.stack.append(ctx.stack[-1])
                                else:
                                    ctx.stack.append(state)
                        elif isinstance(new_state, int):
                            # see RegexLexer for why this check is made
                            if abs(new_state) >= len(ctx.stack):
                                del ctx.stack[1:]
                            else:
                                del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[ctx.stack[-1]]
                    break
            else:
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to "root"
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, '\n'
                        ctx.pos += 1
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break


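# Illustrative sketch (not part of the original module): the shape of a
# callback rule used with an ExtendedRegexLexer. The callback receives the
# LexerContext and must advance ``ctx.pos`` itself (see the CAUTION comment
# above); the token type is an illustrative choice.
#
#     from pygments.token import String
#
#     def heredoc_callback(lexer, match, ctx):
#         yield match.start(), String.Heredoc, match.group()
#         ctx.pos = match.end()
#
#     # used in a tokens table as:  (r'<<\w+.*?\n', heredoc_callback)
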

def do_insertions(insertions, tokens):
    """
    Helper for lexers which must combine the results of several
    sublexers.

    ``insertions`` is a list of ``(index, itokens)`` pairs.
    Each ``itokens`` iterable should be inserted at position
    ``index`` into the token stream given by the ``tokens``
    argument.

    The result is a combined token stream.

    TODO: clean up the code here.
    """
    insertions = iter(insertions)
    try:
        index, itokens = next(insertions)
    except StopIteration:
        # no insertions
        yield from tokens
        return

    realpos = None
    insleft = True

    # iterate over the token stream where we want to insert
    # the tokens from the insertion list.
    for i, t, v in tokens:
        # first iteration. store the position of first item
        if realpos is None:
            realpos = i
        oldi = 0
        while insleft and i + len(v) >= index:
            tmpval = v[oldi:index - i]
            if tmpval:
                yield realpos, t, tmpval
                realpos += len(tmpval)
            for it_index, it_token, it_value in itokens:
                yield realpos, it_token, it_value
                realpos += len(it_value)
            oldi = index - i
            try:
                index, itokens = next(insertions)
            except StopIteration:
                insleft = False
                break  # not strictly necessary
        if oldi < len(v):
            yield realpos, t, v[oldi:]
            realpos += len(v) - oldi

    # leftover tokens
    while insleft:
        # no normal tokens, set realpos to zero
        realpos = realpos or 0
        for p, t, v in itokens:
            yield realpos, t, v
            realpos += len(v)
        try:
            index, itokens = next(insertions)
        except StopIteration:
            insleft = False
            break  # not strictly necessary


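# Illustrative sketch (not part of the original module): merging prompt tokens
# into the output of a sub-lexer, the way console/session lexers typically use
# do_insertions(). ``PythonLexer`` and the prompt text are examples only.
#
#     from pygments.lexers import PythonLexer
#     from pygments.token import Generic
#
#     code = '1 + 1\n'
#     insertions = [(0, [(0, Generic.Prompt, '>>> ')])]
#     for index, tokentype, value in do_insertions(
#             insertions, PythonLexer().get_tokens_unprocessed(code)):
#         print(index, tokentype, repr(value))
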

class ProfilingRegexLexerMeta(RegexLexerMeta):
    """Metaclass for ProfilingRegexLexer, collects regex timing info."""

    def _process_regex(cls, regex, rflags, state):
        if isinstance(regex, words):
            rex = regex_opt(regex.words, prefix=regex.prefix,
                            suffix=regex.suffix)
        else:
            rex = regex
        compiled = re.compile(rex, rflags)

        def match_func(text, pos, endpos=sys.maxsize):
            info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
            t0 = time.time()
            res = compiled.match(text, pos, endpos)
            t1 = time.time()
            info[0] += 1
            info[1] += t1 - t0
            return res
        return match_func


class ProfilingRegexLexer(RegexLexer, metaclass=ProfilingRegexLexerMeta):
    """Drop-in replacement for RegexLexer that does profiling of its regexes."""

    _prof_data = []
    _prof_sort_index = 4  # defaults to time per call

    def get_tokens_unprocessed(self, text, stack=('root',)):
        # this needs to be a stack, since using(this) will produce nested calls
        self.__class__._prof_data.append({})
        yield from RegexLexer.get_tokens_unprocessed(self, text, stack)
        rawdata = self.__class__._prof_data.pop()
        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
                        n, 1000 * t, 1000 * t / n)
                       for ((s, r), (n, t)) in rawdata.items()),
                      key=lambda x: x[self._prof_sort_index],
                      reverse=True)
        sum_total = sum(x[3] for x in data)

        print()
        print('Profiling result for %s lexing %d chars in %.3f ms' %
              (self.__class__.__name__, len(text), sum_total))
        print('=' * 110)
        print('%-20s %-64s ncalls tottime percall' % ('state', 'regex'))
        print('-' * 110)
        for d in data:
            print('%-20s %-65s %5d %8.4f %8.4f' % d)
        print('=' * 110)
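

# Illustrative sketch (not part of the original module): profiling the regexes
# of an existing RegexLexer by mixing ProfilingRegexLexer into it. Whether this
# mixin order suits a given lexer is an assumption; ``PythonLexer`` is only an
# example.
#
#     from pygments.lexers import PythonLexer
#
#     class ProfilingPythonLexer(ProfilingRegexLexer, PythonLexer):
#         pass
#
#     # exhausting the generator prints the timing table
#     list(ProfilingPythonLexer().get_tokens('x = 1\n'))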