Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3 pygments.lexers.data 

4 ~~~~~~~~~~~~~~~~~~~~ 

5 

6 Lexers for data file formats. 

7 

8 :copyright: Copyright 2006-2021 by the Pygments team, see AUTHORS. 

9 :license: BSD, see LICENSE for details. 

10""" 

11 

12import re 

13 

14from pygments.lexer import Lexer, RegexLexer, ExtendedRegexLexer, LexerContext, \ 

15 include, bygroups, inherit 

16from pygments.token import Text, Comment, Keyword, Name, String, Number, \ 

17 Punctuation, Literal, Error 

18 

19__all__ = ['YamlLexer', 'JsonLexer', 'JsonBareObjectLexer', 'JsonLdLexer'] 

20 

21 

class YamlLexerContext(LexerContext):
    """Lexer context that additionally tracks YAML indentation state."""

    def __init__(self, *args, **kwds):
        super().__init__(*args, **kwds)
        # Enclosing indentation levels, innermost last; popped as blocks close.
        self.indent_stack = []
        # Current level (-1 = not yet established) and the candidate level
        # for the next block.
        self.indent, self.next_indent = -1, 0
        # Explicit indentation of the block scalar being lexed, if any.
        self.block_scalar_indent = None

31 

32 

class YamlLexer(ExtendedRegexLexer):
    """
    Lexer for `YAML <http://yaml.org/>`_, a human-friendly data serialization
    language.

    .. versionadded:: 0.11
    """

    name = 'YAML'
    aliases = ['yaml']
    filenames = ['*.yaml', '*.yml']
    mimetypes = ['text/x-yaml']

    # The functions below are callback *factories*: each returns an
    # ExtendedRegexLexer callback that yields tokens and mutates the
    # shared YamlLexerContext (indent bookkeeping, scanner position).

    def something(token_class):
        """Do not produce empty tokens."""
        def callback(lexer, match, context):
            text = match.group()
            if not text:
                return
            yield match.start(), token_class, text
            context.pos = match.end()
        return callback

    def reset_indent(token_class):
        """Reset the indentation levels."""
        def callback(lexer, match, context):
            text = match.group()
            context.indent_stack = []
            context.indent = -1
            context.next_indent = 0
            context.block_scalar_indent = None
            yield match.start(), token_class, text
            context.pos = match.end()
        return callback

    def save_indent(token_class, start=False):
        """Save a possible indentation level."""
        def callback(lexer, match, context):
            text = match.group()
            extra = ''
            if start:
                # Leading spaces at line start: unwind deeper levels first;
                # anything beyond the restored level is over-indentation.
                context.next_indent = len(text)
                if context.next_indent < context.indent:
                    while context.next_indent < context.indent:
                        context.indent = context.indent_stack.pop()
                    if context.next_indent > context.indent:
                        extra = text[context.indent:]
                        text = text[:context.indent]
            else:
                context.next_indent += len(text)
            if text:
                yield match.start(), token_class, text
            if extra:
                # Surplus indentation is flagged as an error sub-token.
                yield match.start()+len(text), token_class.Error, extra
            context.pos = match.end()
        return callback

    def set_indent(token_class, implicit=False):
        """Set the previously saved indentation level."""
        def callback(lexer, match, context):
            text = match.group()
            if context.indent < context.next_indent:
                context.indent_stack.append(context.indent)
                context.indent = context.next_indent
            if not implicit:
                # Explicit indicators (e.g. '-', '?') widen the next level
                # by their own width.
                context.next_indent += len(text)
            yield match.start(), token_class, text
            context.pos = match.end()
        return callback

    def set_block_scalar_indent(token_class):
        """Set an explicit indentation level for a block scalar."""
        def callback(lexer, match, context):
            text = match.group()
            context.block_scalar_indent = None
            if not text:
                return
            increment = match.group(1)
            if increment:
                # The indentation indicator is relative to the current level.
                current_indent = max(context.indent, 0)
                increment = int(increment)
                context.block_scalar_indent = current_indent + increment
            if text:
                yield match.start(), token_class, text
                context.pos = match.end()
        return callback

    def parse_block_scalar_empty_line(indent_token_class, content_token_class):
        """Process an empty line in a block scalar."""
        def callback(lexer, match, context):
            text = match.group()
            if (context.block_scalar_indent is None or
                    len(text) <= context.block_scalar_indent):
                if text:
                    yield match.start(), indent_token_class, text
            else:
                # Spaces past the scalar indent are scalar content.
                indentation = text[:context.block_scalar_indent]
                content = text[context.block_scalar_indent:]
                yield match.start(), indent_token_class, indentation
                yield (match.start()+context.block_scalar_indent,
                       content_token_class, content)
            context.pos = match.end()
        return callback

    def parse_block_scalar_indent(token_class):
        """Process indentation spaces in a block scalar."""
        def callback(lexer, match, context):
            text = match.group()
            if context.block_scalar_indent is None:
                if len(text) <= max(context.indent, 0):
                    # Dedent ends the scalar: leave both the content state
                    # and the header state.
                    context.stack.pop()
                    context.stack.pop()
                    return
                # First content line fixes the scalar's indentation.
                context.block_scalar_indent = len(text)
            else:
                if len(text) < context.block_scalar_indent:
                    context.stack.pop()
                    context.stack.pop()
                    return
            if text:
                yield match.start(), token_class, text
                context.pos = match.end()
        return callback

    def parse_plain_scalar_indent(token_class):
        """Process indentation spaces in a plain scalar."""
        def callback(lexer, match, context):
            text = match.group()
            if len(text) <= context.indent:
                # Dedent ends the plain scalar; pop two states.
                context.stack.pop()
                context.stack.pop()
                return
            if text:
                yield match.start(), token_class, text
                context.pos = match.end()
        return callback

    tokens = {
        # the root rules
        'root': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Text),
            # line breaks
            (r'\n+', Text),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # the '%YAML' directive
            (r'^%YAML(?=[ ]|$)', reset_indent(Name.Tag), 'yaml-directive'),
            # the %TAG directive
            (r'^%TAG(?=[ ]|$)', reset_indent(Name.Tag), 'tag-directive'),
            # document start and document end indicators
            (r'^(?:---|\.\.\.)(?=[ ]|$)', reset_indent(Name.Namespace),
             'block-line'),
            # indentation spaces
            (r'[ ]*(?!\s|$)', save_indent(Text, start=True),
             ('block-line', 'indentation')),
        ],

        # trailing whitespaces after directives or a block scalar indicator
        'ignored-line': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Text),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # line break
            (r'\n', Text, '#pop:2'),
        ],

        # the %YAML directive
        'yaml-directive': [
            # the version number
            (r'([ ]+)([0-9]+\.[0-9]+)',
             bygroups(Text, Number), 'ignored-line'),
        ],

        # the %TAG directive
        'tag-directive': [
            # a tag handle and the corresponding prefix
            (r'([ ]+)(!|![\w-]*!)'
             r'([ ]+)(!|!?[\w;/?:@&=+$,.!~*\'()\[\]%-]+)',
             bygroups(Text, Keyword.Type, Text, Keyword.Type),
             'ignored-line'),
        ],

        # block scalar indicators and indentation spaces
        'indentation': [
            # trailing whitespaces are ignored
            (r'[ ]*$', something(Text), '#pop:2'),
            # whitespaces preceding block collection indicators
            (r'[ ]+(?=[?:-](?:[ ]|$))', save_indent(Text)),
            # block collection indicators
            (r'[?:-](?=[ ]|$)', set_indent(Punctuation.Indicator)),
            # the beginning a block line
            (r'[ ]*', save_indent(Text), '#pop'),
        ],

        # an indented line in the block context
        'block-line': [
            # the line end
            (r'[ ]*(?=#|$)', something(Text), '#pop'),
            # whitespaces separating tokens
            (r'[ ]+', Text),
            # key with colon
            (r'''([^#,:?\[\]{}"'\n]+)(:)(?=[ ]|$)''',
             bygroups(Name.Tag, set_indent(Punctuation, implicit=True))),
            # tags, anchors and aliases,
            include('descriptors'),
            # block collections and scalars
            include('block-nodes'),
            # flow collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar
            (r'(?=[^\s?:,\[\]{}#&*!|>\'"%@`-]|[?:-]\S)',
             something(Name.Variable),
             'plain-scalar-in-block-context'),
        ],

        # tags, anchors, aliases
        'descriptors': [
            # a full-form tag
            (r'!<[\w#;/?:@&=+$,.!~*\'()\[\]%-]+>', Keyword.Type),
            # a tag in the form '!', '!suffix' or '!handle!suffix'
            (r'!(?:[\w-]+!)?'
             r'[\w#;/?:@&=+$,.!~*\'()\[\]%-]*', Keyword.Type),
            # an anchor
            (r'&[\w-]+', Name.Label),
            # an alias
            (r'\*[\w-]+', Name.Variable),
        ],

        # block collections and scalars
        'block-nodes': [
            # implicit key
            (r':(?=[ ]|$)', set_indent(Punctuation.Indicator, implicit=True)),
            # literal and folded scalars
            (r'[|>]', Punctuation.Indicator,
             ('block-scalar-content', 'block-scalar-header')),
        ],

        # flow collections and quoted scalars
        'flow-nodes': [
            # a flow sequence
            (r'\[', Punctuation.Indicator, 'flow-sequence'),
            # a flow mapping
            (r'\{', Punctuation.Indicator, 'flow-mapping'),
            # a single-quoted scalar
            (r'\'', String, 'single-quoted-scalar'),
            # a double-quoted scalar
            (r'\"', String, 'double-quoted-scalar'),
        ],

        # the content of a flow collection
        'flow-collection': [
            # whitespaces
            (r'[ ]+', Text),
            # line breaks
            (r'\n+', Text),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # simple indicators
            (r'[?:,]', Punctuation.Indicator),
            # tags, anchors and aliases
            include('descriptors'),
            # nested collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar
            (r'(?=[^\s?:,\[\]{}#&*!|>\'"%@`])',
             something(Name.Variable),
             'plain-scalar-in-flow-context'),
        ],

        # a flow sequence indicated by '[' and ']'
        'flow-sequence': [
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\]', Punctuation.Indicator, '#pop'),
        ],

        # a flow mapping indicated by '{' and '}'
        'flow-mapping': [
            # key with colon
            (r'''([^,:?\[\]{}"'\n]+)(:)(?=[ ]|$)''',
             bygroups(Name.Tag, Punctuation)),
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\}', Punctuation.Indicator, '#pop'),
        ],

        # block scalar lines
        'block-scalar-content': [
            # line break
            (r'\n', Text),
            # empty line
            (r'^[ ]+$',
             parse_block_scalar_empty_line(Text, Name.Constant)),
            # indentation spaces (we may leave the state here)
            (r'^[ ]*', parse_block_scalar_indent(Text)),
            # line content
            (r'[\S\t ]+', Name.Constant),
        ],

        # the content of a literal or folded scalar
        'block-scalar-header': [
            # indentation indicator followed by chomping flag
            (r'([1-9])?[+-]?(?=[ ]|$)',
             set_block_scalar_indent(Punctuation.Indicator),
             'ignored-line'),
            # chomping flag followed by indentation indicator
            (r'[+-]?([1-9])?(?=[ ]|$)',
             set_block_scalar_indent(Punctuation.Indicator),
             'ignored-line'),
        ],

        # ignored and regular whitespaces in quoted scalars
        'quoted-scalar-whitespaces': [
            # leading and trailing whitespaces are ignored
            (r'^[ ]+', Text),
            (r'[ ]+$', Text),
            # line breaks are ignored
            (r'\n+', Text),
            # other whitespaces are a part of the value
            (r'[ ]+', Name.Variable),
        ],

        # single-quoted scalars
        'single-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of the quote character
            (r'\'\'', String.Escape),
            # regular non-whitespace characters
            (r'[^\s\']+', String),
            # the closing quote
            (r'\'', String, '#pop'),
        ],

        # double-quoted scalars
        'double-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of special characters
            (r'\\[0abt\tn\nvfre "\\N_LP]', String),
            # escape codes
            (r'\\(?:x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})',
             String.Escape),
            # regular non-whitespace characters
            (r'[^\s"\\]+', String),
            # the closing quote
            (r'"', String, '#pop'),
        ],

        # the beginning of a new line while scanning a plain scalar
        'plain-scalar-in-block-context-new-line': [
            # empty lines
            (r'^[ ]+$', Text),
            # line breaks
            (r'\n+', Text),
            # document start and document end indicators
            (r'^(?=---|\.\.\.)', something(Name.Namespace), '#pop:3'),
            # indentation spaces (we may leave the block line state here)
            (r'^[ ]*', parse_plain_scalar_indent(Text), '#pop'),
        ],

        # a plain scalar in the block context
        'plain-scalar-in-block-context': [
            # the scalar ends with the ':' indicator
            (r'[ ]*(?=:[ ]|:$)', something(Text), '#pop'),
            # the scalar ends with whitespaces followed by a comment
            (r'[ ]+(?=#)', Text, '#pop'),
            # trailing whitespaces are ignored
            (r'[ ]+$', Text),
            # line breaks are ignored
            (r'\n+', Text, 'plain-scalar-in-block-context-new-line'),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Plain),
            # regular non-whitespace characters
            (r'(?::(?!\s)|[^\s:])+', Literal.Scalar.Plain),
        ],

        # a plain scalar is the flow context
        'plain-scalar-in-flow-context': [
            # the scalar ends with an indicator character
            (r'[ ]*(?=[,:?\[\]{}])', something(Text), '#pop'),
            # the scalar ends with a comment
            (r'[ ]+(?=#)', Text, '#pop'),
            # leading and trailing whitespaces are ignored
            (r'^[ ]+', Text),
            (r'[ ]+$', Text),
            # line breaks are ignored
            (r'\n+', Text),
            # other whitespaces are a part of the value
            (r'[ ]+', Name.Variable),
            # regular non-whitespace characters
            (r'[^\s,:?\[\]{}]+', Name.Variable),
        ],

    }

    def get_tokens_unprocessed(self, text=None, context=None):
        """Lex *text*, creating a fresh YamlLexerContext when none is given."""
        if context is None:
            context = YamlLexerContext(text, 0)
        return super().get_tokens_unprocessed(text, context)

437 

438 

class JsonLexer(Lexer):
    """
    For JSON data structures.

    .. versionadded:: 1.5
    """

    name = 'JSON'
    aliases = ['json', 'json-object']
    filenames = ['*.json', 'Pipfile.lock']
    mimetypes = ['application/json', 'application/json-object']

    # No validation of integers, floats, or constants is done.
    # As long as the characters are members of the following
    # sets, the token will be considered valid. For example,
    #
    #     "--1--" is parsed as an integer
    #     "1...eee" is parsed as a float
    #     "trustful" is parsed as a constant
    #
    integers = set('-0123456789')
    floats = set('.eE+')
    constants = set('truefalsenull')  # true|false|null
    hexadecimals = set('0123456789abcdefABCDEF')
    punctuations = set('{}[],')
    whitespaces = {'\u0020', '\u000a', '\u000d', '\u0009'}

    def get_tokens_unprocessed(self, text):
        """Parse JSON data.

        A hand-rolled character state machine: exactly one ``in_*`` flag
        is active at a time, and each ``elif`` branch either consumes the
        character (``continue``) or flushes the finished token and falls
        through so the character is re-evaluated as a token start.
        """

        in_string = False
        in_escape = False
        in_unicode_escape = 0  # countdown of hex digits left in \uXXXX
        in_whitespace = False
        in_constant = False
        in_number = False
        in_float = False
        in_punctuation = False

        start = 0  # index where the token currently being scanned began

        # The queue is used to store data that may need to be tokenized
        # differently based on what follows. In particular, JSON object
        # keys are tokenized differently than string values, but cannot
        # be distinguished until punctuation is encountered outside the
        # string.
        #
        # A ":" character after the string indicates that the string is
        # an object key; any other character indicates the string is a
        # regular string value.
        #
        # The queue holds tuples that contain the following data:
        #
        #     (start_index, token_type, text)
        #
        # By default the token type of text in double quotes is
        # String.Double. The token type will be replaced if a colon
        # is encountered after the string closes.
        #
        queue = []

        for stop, character in enumerate(text):
            if in_string:
                if in_unicode_escape:
                    if character in self.hexadecimals:
                        in_unicode_escape -= 1
                        if not in_unicode_escape:
                            in_escape = False
                    else:
                        # Malformed \u escape: abandon it, keep scanning.
                        in_unicode_escape = 0
                        in_escape = False

                elif in_escape:
                    if character == 'u':
                        in_unicode_escape = 4
                    else:
                        in_escape = False

                elif character == '\\':
                    in_escape = True

                elif character == '"':
                    # Queue the string; it may be re-typed as a key later.
                    queue.append((start, String.Double, text[start:stop + 1]))
                    in_string = False
                    in_escape = False
                    in_unicode_escape = 0

                continue

            elif in_whitespace:
                if character in self.whitespaces:
                    continue

                if queue:
                    queue.append((start, Text, text[start:stop]))
                else:
                    yield start, Text, text[start:stop]
                in_whitespace = False
                # Fall through so the new character can be evaluated.

            elif in_constant:
                if character in self.constants:
                    continue

                yield start, Keyword.Constant, text[start:stop]
                in_constant = False
                # Fall through so the new character can be evaluated.

            elif in_number:
                if character in self.integers:
                    continue
                elif character in self.floats:
                    in_float = True
                    continue

                if in_float:
                    yield start, Number.Float, text[start:stop]
                else:
                    yield start, Number.Integer, text[start:stop]
                in_number = False
                in_float = False
                # Fall through so the new character can be evaluated.

            elif in_punctuation:
                if character in self.punctuations:
                    continue

                yield start, Punctuation, text[start:stop]
                in_punctuation = False
                # Fall through so the new character can be evaluated.

            # The previous token (if any) has been flushed; this character
            # begins a new one.
            start = stop

            if character == '"':
                in_string = True

            elif character in self.whitespaces:
                in_whitespace = True

            elif character in {'f', 'n', 't'}:  # The first letters of true|false|null
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                in_constant = True

            elif character in self.integers:
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                in_number = True

            elif character == ':':
                # Yield from the queue. Replace string token types.
                for _start, _token, _text in queue:
                    if _token is Text:
                        yield _start, _token, _text
                    elif _token is String.Double:
                        yield _start, Name.Tag, _text
                    else:
                        yield _start, Error, _text
                queue.clear()

                in_punctuation = True

            elif character in self.punctuations:
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                in_punctuation = True

            else:
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                yield start, Error, character

        # Yield any remaining text.
        # in_float is checked before in_number because both are set for
        # a float; an unterminated string is an error.
        yield from queue
        if in_string:
            yield start, Error, text[start:]
        elif in_float:
            yield start, Number.Float, text[start:]
        elif in_number:
            yield start, Number.Integer, text[start:]
        elif in_constant:
            yield start, Keyword.Constant, text[start:]
        elif in_whitespace:
            yield start, Text, text[start:]
        elif in_punctuation:
            yield start, Punctuation, text[start:]

634 

class JsonBareObjectLexer(JsonLexer):
    """
    For JSON data structures (with missing object curly braces).

    .. versionadded:: 2.2

    .. deprecated:: 2.8.0

       Behaves the same as `JsonLexer` now.
    """

    name = 'JSONBareObject'
    # NOTE(review): the registration lists are left empty, presumably so this
    # deprecated lexer is never selected by alias/filename/mimetype matching
    # and is only reachable by direct instantiation -- confirm against the
    # lexer-lookup machinery.
    aliases = []
    filenames = []
    mimetypes = []

650 

651 

class JsonLdLexer(JsonLexer):
    """
    For `JSON-LD <https://json-ld.org/>`_ linked data.

    .. versionadded:: 2.0
    """

    name = 'JSON-LD'
    aliases = ['jsonld', 'json-ld']
    filenames = ['*.jsonld']
    mimetypes = ['application/ld+json']

    # JSON-LD keywords, pre-wrapped in double quotes so they compare
    # directly against the raw quoted string text produced by JsonLexer.
    json_ld_keywords = {
        '"@%s"' % keyword
        for keyword in (
            'base',
            'container',
            'context',
            'direction',
            'graph',
            'id',
            'import',
            'included',
            'index',
            'json',
            'language',
            'list',
            'nest',
            'none',
            'prefix',
            'propagate',
            'protected',
            'reverse',
            'set',
            'type',
            'value',
            'version',
            'vocab',
        )
    }

    def get_tokens_unprocessed(self, text):
        """Tokenize as JSON, re-typing JSON-LD keyword keys as decorators.

        Delegates to `JsonLexer` and rewrites any object key (Name.Tag)
        that is a JSON-LD keyword (e.g. ``"@id"``) to Name.Decorator.
        """
        # Use zero-argument super(), consistent with the rest of this module.
        for start, token, value in super().get_tokens_unprocessed(text):
            if token is Name.Tag and value in self.json_ld_keywords:
                yield start, Name.Decorator, value
            else:
                yield start, token, value