Coverage for C:\leo.repo\leo-editor\leo\plugins\importers\javascript.py: 79%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#@+leo-ver=5-thin
2#@+node:ekr.20140723122936.18144: * @file ../plugins/importers/javascript.py
3"""The @auto importer for JavaScript."""
4import re
5import textwrap
6import unittest
7from typing import List
8from leo.core import leoGlobals as g
9from leo.plugins.importers import linescanner
10Importer = linescanner.Importer
11Target = linescanner.Target
12#@+others
13#@+node:ekr.20140723122936.18049: ** class JS_Importer
class JS_Importer(Importer):
    """The @auto importer for the javascript language."""

    def __init__(self, importCommands, force_at_others=False, **kwargs):
        """
        The ctor for the JS_ImportController class.

        force_at_others and **kwargs are accepted for compatibility with the
        importer API; they are not used here.
        """
        # Init the base class.
        super().__init__(
            importCommands,
            gen_refs=False, # Fix #639.
            language='javascript',
            state_class=JS_ScanState,
        )

    #@+others
    #@+node:ekr.20180123051226.1: *3* js_i.post_pass & helpers
    def post_pass(self, parent):
        """
        Optional Stage 2 of the javascript pipeline.

        All substages **must** use the API for setting body text. Changing
        p.b directly will cause asserts to fail later in i.finish().
        """
        self.clean_all_headlines(parent)
        self.remove_singleton_at_others(parent)
        self.clean_all_nodes(parent)
        self.move_trailing_comments(parent)
    #@+node:ekr.20180123051401.1: *4* js_i.remove_singleton_at_others
    at_others = re.compile(r'^\s*@others\b')

    def remove_singleton_at_others(self, parent):
        """Replace @others by the body of a singleton child node."""
        found = False
        for p in parent.subtree():
            if p.numberOfChildren() == 1:
                child = p.firstChild()
                lines = self.get_lines(p)
                matches = [i for i, s in enumerate(lines) if self.at_others.match(s)]
                if len(matches) == 1:
                    found = True
                    i = matches[0]
                    # Splice the child's lines in place of the @others line.
                    child_lines = self.get_lines(child)
                    lines = lines[:i] + child_lines + lines[i + 1 :]
                    self.set_lines(p, lines)
                    # Delete child later. Is this enough???
                    self.set_lines(child, [])
        return found
    #@+node:ekr.20180123060307.1: *4* js_i.remove_organizer_nodes
    def remove_organizer_nodes(self, parent):
        """Removed all organizer nodes created by i.delete_all_empty_nodes."""
        # Careful: Restart this loop whenever we find an organizer.
        found = True
        while found:
            found = False
            for p in parent.subtree():
                lines = self.get_lines(p)
                if p.h.lower() == 'organizer' and not lines:
                    p.promote()
                    p.doDelete()
                    found = True # Restart the loop.
    #@+node:ekr.20200202071105.1: *4* js_i.clean_all_nodes
    def clean_all_nodes(self, parent):
        """Remove common leading whitespace from all nodes."""
        for p in parent.subtree():
            lines = self.get_lines(p)
            s = textwrap.dedent(''.join(lines))
            self.set_lines(p, g.splitLines(s))
    #@+node:ekr.20200202091613.1: *4* js_i.move_trailing_comments & helper (new)
    def move_trailing_comments(self, parent):
        """Move all trailing comments to the start of the next node."""
        for p in parent.subtree():
            next = p.next()
            if next:
                lines = self.get_lines(p)
                head_lines, tail_lines = self.get_trailing_comments(lines)
                if tail_lines:
                    self.set_lines(p, head_lines)
                    next_lines = self.get_lines(next)
                    self.set_lines(next, tail_lines + next_lines)
    #@+node:ekr.20200202092332.1: *5* js_i.get_trailing_comments
    def get_trailing_comments(self, lines):
        """
        Return the trailing comments of p.
        Return (head_lines, tail_lines).
        """
        s = ''.join(lines)
        head: List[str] = []
        tail: List[str] = []
        # Fix: was `if not s.strip:` -- the bound method is always truthy,
        # so the guard for all-whitespace input never fired.
        if not s.strip():
            return head, tail
        in_block_comment = False
        head = lines
        for i, line in enumerate(lines):
            s = line.strip()
            if in_block_comment:
                tail.append(line)
                if s.startswith('*/'):
                    in_block_comment = False
            elif s.startswith('/*'):
                # Start of a block comment: everything from here is tail
                # until the comment closes.
                in_block_comment = True
                head = lines[:i]
                tail = [line]
            elif s.startswith('//'):
                head = lines[:i]
                tail = [line]
            elif s: # Clear any previous comments.
                head = lines
                tail = []
        return head, tail
    #@+node:ekr.20161105140842.5: *3* js_i.scan_line (rewritten)
    def scan_line(self, s, prev_state):
        """
        Update the scan state at the *end* of the line.
        Return JS_ScanState({'context':context, 'curlies':curlies, 'parens':parens})

        This code uses JsLex to scan the tokens, which scans strings and regexs properly.

        This code also handles *partial* tokens: tokens continued from the
        previous line or continued to the next line.
        """
        context = prev_state.context
        curlies, parens = prev_state.curlies, prev_state.parens
        # Scan tokens, updating context and counts.
        prev_val = None
        for kind, val in JsLexer().lex(s):
            # g.trace(f"context: {context:2} kind: {kind:10} val: {val!r}")
            if context:
                # Inside a string or block comment: look only for the closer.
                if context in ('"', "'") and kind in ('other', 'punct') and val == context:
                    context = ''
                elif (
                    context == '/*'
                    and kind in ('other', 'punct')
                    and prev_val == '*'
                    and val == '/'
                ):
                    context = ''
            elif kind in ('other', 'punct') and val in ('"', "'"):
                # An unterminated string: JsLex yields the lone quote
                # as a plain token.
                context = val
            elif kind in ('other', 'punct') and val == '*' and prev_val == '/':
                # '/' followed by '*': an unterminated block comment.
                context = '/*'
            elif kind in ('other', 'punct'):
                if val == '*' and prev_val == '/':
                    # NOTE(review): this appears unreachable -- the elif
                    # above already handles this case. Confirm before removing.
                    context = '/*'
                elif val == '{':
                    curlies += 1
                elif val == '}':
                    curlies -= 1
                elif val == '(':
                    parens += 1
                elif val == ')':
                    parens -= 1
            prev_val = val
        d = {'context': context, 'curlies': curlies, 'parens': parens}
        state = JS_ScanState(d)
        return state
    #@+node:ekr.20171224145755.1: *3* js_i.starts_block
    # Patterns that recognize the start of a javascript block:
    # arrow functions, classes, and the various function forms.
    func_patterns = [
        re.compile(r'.*?\)\s*=>\s*\{'),
        re.compile(r'\s*class\b'),
        re.compile(r'\s*function\b'),
        re.compile(r'.*?[(=,]\s*function\b'),
    ]

    def starts_block(self, i, lines, new_state, prev_state):
        """True if the new state starts a block."""
        if new_state.level() <= prev_state.level():
            return False
        # Remove strings and regexs from the line before applying the patterns.
        cleaned_line = []
        for kind, val in JsLexer().lex(lines[i]):
            if kind not in ('string', 'regex'):
                cleaned_line.append(val)
        # Search for any of the patterns.
        line = ''.join(cleaned_line)
        for pattern in self.func_patterns:
            if pattern.match(line) is not None:
                return True
        return False
    #@+node:ekr.20200131193217.1: *3* js_i.ends_block
    def ends_block(self, line, new_state, prev_state, stack):
        """True if line ends the block."""
        # Comparing new_state against prev_state does not work for python.
        top = stack[-1]
        return new_state.level() < top.state.level()
    #@+node:ekr.20161101183354.1: *3* js_i.clean_headline
    clean_regex_list1 = [
        re.compile(r'\s*\(?(function\b\s*[\w]*)\s*\('),
            # (function name (
        re.compile(r'\s*(\w+\s*\:\s*\(*\s*function\s*\()'),
            # name: (function (
        re.compile(r'\s*(?:const|let|var)\s*(\w+\s*(?:=\s*.*)=>)'),
            # const|let|var name = .* =>
    ]
    clean_regex_list2 = [
        re.compile(r'(.*\=)(\s*function)'),
            # .* = function
    ]
    clean_regex_list3 = [
        re.compile(r'(.*\=\s*new\s*\w+)\s*\(.*(=>)'),
            # .* = new name .* =>
        re.compile(r'(.*)\=\s*\(.*(=>)'),
            # .* = ( .* =>
        re.compile(r'(.*)\((\s*function)'),
            # .* ( function
        re.compile(r'(.*)\(.*(=>)'),
            # .* ( .* =>
        re.compile(r'(.*)(\(.*\,\s*function)'),
            # .* \( .*, function
    ]
    clean_regex_list4 = [
        re.compile(r'(.*)\(\s*(=>)'),
            # .* ( =>
    ]

    def clean_headline(self, s, p=None, trace=False):
        """Return a cleaned up headline s."""
        # pylint: disable=arguments-differ
        s = s.strip()
        # Don't clean a headline twice.
        if s.endswith('>>') and s.startswith('<<'):
            return s
        for ch in '{(=':
            if s.endswith(ch):
                s = s[:-1].strip()
        # First regex cleanup. Use \1.
        for pattern in self.clean_regex_list1:
            m = pattern.match(s)
            if m:
                s = m.group(1)
                break
        # Second regex cleanup. Use \1 + \2
        for pattern in self.clean_regex_list2:
            m = pattern.match(s)
            if m:
                s = m.group(1) + m.group(2)
                break
        # Third regex cleanup. Use \1 + ' ' + \2
        for pattern in self.clean_regex_list3:
            m = pattern.match(s)
            if m:
                s = m.group(1) + ' ' + m.group(2)
                break
        # Fourth cleanup. Use \1 + ' ' + \2 again
        for pattern in self.clean_regex_list4:
            m = pattern.match(s)
            if m:
                s = m.group(1) + ' ' + m.group(2)
                break
        # Final whitespace cleanups.
        # Fix: collapse double spaces (was a no-op single-space replace).
        s = s.replace('  ', ' ')
        s = s.replace(' (', '(')
        return g.truncate(s, 100)
    #@-others
265#@+node:ekr.20161105092745.1: ** class JS_ScanState
class JS_ScanState:
    """A class representing the state of the javascript line-oriented scan."""

    def __init__(self, d=None):
        """JS_ScanState ctor"""
        if not d:
            # Default state: no context, balanced curlies and parens.
            self.context = ''
            self.curlies = self.parens = 0
            return
        # d is *different* from the dict created by i.scan_line.
        self.context = d.get('context')
        self.curlies = d.get('curlies')
        self.parens = d.get('parens')

    def __repr__(self):
        """JS_ScanState.__repr__"""
        return 'JS_ScanState context: %r curlies: %s parens: %s' % (
            self.context, self.curlies, self.parens)

    __str__ = __repr__

    #@+others
    #@+node:ekr.20161119115505.1: *3* js_state.level
    def level(self):
        """JS_ScanState.level."""
        return self.curlies, self.parens
    #@+node:ekr.20161119051049.1: *3* js_state.update
    def update(self, data):
        """
        Update the state using the 6-tuple returned by i.scan_line.
        Return i = data[1]
        """
        # The squares delta and backslash-newline flag are not tracked here.
        context, i, delta_c, delta_p, _delta_s, _bs_nl = data
        self.context = context
        self.curlies += delta_c
        self.parens += delta_p
        return i
    #@-others
308#@+node:ekr.20200131110322.2: ** JsLexer...
309# JsLex: a lexer for Javascript
310# Written by Ned Batchelder. Used by permission.
311#
312# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
313# For details: https://bitbucket.org/ned/jslex/src/default/NOTICE.txt
314#@+node:ekr.20200131110322.4: *3* class Tok
class Tok:
    """A specification for a token class."""

    num = 0  # Class-wide counter: each Tok gets the next sequential id.

    def __init__(self, name, regex, next=None):
        # Take the next id and bump the class counter in one step.
        self.id, Tok.num = Tok.num, Tok.num + 1
        self.name = name
        self.regex = regex
        # The lexer state to switch to after this token matches, if any.
        self.next = next
326#@+node:ekr.20200131110322.7: *3* class Lexer
class Lexer:
    """A generic multi-state regex-based lexer."""

    #@+others
    #@+node:ekr.20200131110322.8: *4* Lexer.__init__
    def __init__(self, states, first):
        """
        Build one combined regex per lexer state.

        states: dict mapping each state name to a list of Tok rules.
        first:  the name of the initial state.
        """
        self.regexes = {}  # State name -> compiled alternation of that state's rules.
        self.toks = {}  # Regex group id ("t<n>") -> Tok.
        for state, rules in states.items():
            parts = []
            for tok in rules:
                # Wrap each rule in a named group so the winning alternative
                # can be recovered via match.lastgroup in lex().
                groupid = "t%d" % tok.id
                self.toks[groupid] = tok
                parts.append("(?P<%s>%s)" % (groupid, tok.regex))
            self.regexes[state] = re.compile("|".join(parts), re.MULTILINE | re.VERBOSE) # |re.UNICODE)
        self.state = first

    #@+node:ekr.20200131110322.9: *4* Lexer.lex
    def lex(self, text):
        """Lexically analyze `text`.

        Yields pairs (`name`, `tokentext`).
        """
        end = len(text)
        state = self.state
        regexes = self.regexes
        toks = self.toks
        start = 0
        while start < end:
            for match in regexes[state].finditer(text, start):
                # g.trace(state, start, text, match)
                # g.printObj(regexes[state])
                name = match.lastgroup  # Group id of the rule that matched.
                tok = toks[name]
                toktext = match.group(name)
                # NOTE(review): assumes matches are contiguous from `start`
                # (some rule always matches the next character) -- confirm.
                start += len(toktext)
                yield(tok.name, toktext)
                if tok.next:
                    # This rule switches lexer state: abandon the current
                    # finditer and rescan with the new state's regex.
                    state = tok.next
                    break
        # Remember the final state for the next call to lex().
        self.state = state
    #@-others
369#@+node:ekr.20200131110322.6: *3* function: literals
def literals(choices, prefix="", suffix=""):
    """
    Create a regex from a space-separated list of literal `choices`.

    If provided, `prefix` and `suffix` will be attached to each choice
    individually.
    """
    # Escape each literal so regex metacharacters match themselves.
    parts = [prefix + re.escape(choice) + suffix for choice in choices.split()]
    return "|".join(parts)
380#@+node:ekr.20200131110322.10: *3* class JsLexer(Lexer)
class JsLexer(Lexer):
    """A Javascript lexer

    >>> lexer = JsLexer()
    >>> list(lexer.lex("a = 1"))
    [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]

    This doesn't properly handle non-Ascii characters in the Javascript source.

    """

    #@+<< constants >>
    #@+node:ekr.20200131190707.1: *4* << constants >> (JsLexer)

    # Because these tokens are matched as alternatives in a regex, longer possibilities
    # must appear in the list before shorter ones, for example, '>>' before '>'.
    #
    # Note that we don't have to detect malformed Javascript, only properly lex
    # correct Javascript, so much of this is simplified.

    # Details of Javascript lexical structure are taken from
    # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf

    # A useful explanation of automatic semicolon insertion is at
    # http://inimino.org/~inimino/blog/javascript_semicolons

    # See https://stackoverflow.com/questions/6314614/match-any-unicode-letter

    # Rules shared by both states, tried *before* the state-specific rules.
    both_before = [
        Tok("comment", r"/\*(.|\n)*?\*/"),
        Tok("linecomment", r"//.*?$"),
        Tok("ws", r"\s+"),
        Tok("keyword", literals("""
            async await
            break case catch class const continue debugger
            default delete do else enum export extends
            finally for function if import in instanceof new
            return super switch this throw try typeof var
            void while with
            """, suffix=r"\b"), next='reg'),
        Tok("reserved", literals("null true false", suffix=r"\b"), next='div'),
        #
        # EKR: This would work if patterns were compiled with the re.UNICODE flag.
        # However, \w is not the same as valid JS characters.
        # In any case, the JS importer doesn't need to handle id's carefully.
        #
        # Tok("id", r"""([\w$])([\w\d]*)""", next='div'),
        #
        # NOTE(review): '[0-9a-fA-Z]' below looks like a typo for '[0-9a-fA-F]',
        # and the space inside '[a-zA-Z_$ ]' is significant in a character
        # class even under re.VERBOSE (harmless only because the 'ws' rule
        # matches first) -- confirm before changing.
        Tok("id", r"""
            ([a-zA-Z_$ ]|\\u[0-9a-fA-Z]{4}) # first char
            ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})* # rest chars
            """, next='div'),
        Tok("hnum", r"0[xX][0-9a-fA-F]+", next='div'),
        Tok("onum", r"0[0-7]+"),
        Tok("dnum", r"""
            ( (0|[1-9][0-9]*) # DecimalIntegerLiteral
            \. # dot
            [0-9]* # DecimalDigits-opt
            ([eE][-+]?[0-9]+)? # ExponentPart-opt
            |
            \. # dot
            [0-9]+ # DecimalDigits
            ([eE][-+]?[0-9]+)? # ExponentPart-opt
            |
            (0|[1-9][0-9]*) # DecimalIntegerLiteral
            ([eE][-+]?[0-9]+)? # ExponentPart-opt
            )
            """, next='div'),
        Tok("punct", literals("""
            >>>= === !== >>> <<= >>= <= >= == != << >> &&
            || += -= *= %= &= |= ^=
            """), next="reg"),
        Tok("punct", literals("++ -- ) ]"), next='div'),
        Tok("punct", literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
        Tok("string", r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
        Tok("string", r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
    ]

    # The catch-all rule, tried *after* the state-specific rules.
    both_after = [
        Tok("other", r"."),
    ]

    # The two lexer states differ only in how a slash is interpreted.
    states = {
        'div': # slash will mean division
            both_before + [
                Tok("punct", literals("/= /"), next='reg'),
            ] + both_after,
        'reg': # slash will mean regex
            both_before + [
                Tok("regex",
                    r"""
                    / # opening slash
                    # First character is..
                    ( [^*\\/[] # anything but * \ / or [
                    | \\. # or an escape sequence
                    | \[ # or a class, which has
                    ( [^\]\\] # anything but \ or ]
                    | \\. # or an escape sequence
                    )* # many times
                    \]
                    )
                    # Following characters are same, except for excluding a star
                    ( [^\\/[] # anything but \ / or [
                    | \\. # or an escape sequence
                    | \[ # or a class, which has
                    ( [^\]\\] # anything but \ or ]
                    | \\. # or an escape sequence
                    )* # many times
                    \]
                    )* # many times
                    / # closing slash
                    [a-zA-Z0-9]* # trailing flags
                    """, next='div'),
            ] + both_after,
    }
    #@-<< constants >>

    #@+others
    #@+node:ekr.20200131110322.11: *4* JsLexer.__init__
    def __init__(self):
        """Start in the 'reg' state: a leading slash begins a regex literal."""
        super().__init__(self.states, 'reg')
    #@-others
504#@+node:ekr.20200131070055.1: ** class TestJSImporter (importers/javascript.py)
class TestJSImporter(unittest.TestCase):
    """Unit tests for JS_Importer and JsLexer."""
    #@+others
    #@+node:ekr.20200202093420.1: *3* test_get_trailing_comments
    def test_get_trailing_comments(self):

        # Each entry: (source, expected number of trailing comment lines).
        table = (
            # Test 1
            ("""\
            head
            // tail""", 1),

            # Test 2
            ("""\
            head
            /* comment 1
             * comment 2
             */""", 3),

            # Test 3
            ("""\
            head
            /* comment 1
             * comment 2
             */
            tail""", 0), # no tail

            # Test 4
            ("""\
            head
            // comment
            tail""", 0), # no tail

        ) # End table.
        for s, expected_length in table:
            x = JS_Importer(None)
            s = textwrap.dedent(s)
            lines = g.splitLines(s)
            head, tail = x.get_trailing_comments(lines)
            # The expected tail is the last expected_length lines.
            expected_lines = lines[-expected_length :] if expected_length else []
            assert tail == expected_lines, (repr(tail), repr(expected_lines))
    #@+node:ekr.20200202104932.1: *3* test_JsLex
    def test_JsLex(self):

        # Each entry: (expected token kind, tuple of inputs lexing to it).
        table = (
            ('id', ('f_', '$', 'A1', 'abc')),
            ('other', ('ÁÁ',)), # Unicode strings are not handled by JsLex.
            ('keyword', ('async', 'await', 'if')),
            ('punct', ('(', ')', '{', '}', ',', ':', ';')),
            # ('num', ('9', '2')), # This test doesn't matter at present.
        )
        for kind, data in table:
            for contents in data:
                for name, tok in JsLexer().lex(contents):
                    assert name == kind, f"expected {kind!s} got {name!s} {tok!r} {contents}"
                    # print(f"{kind!s:10} {tok!r:10}")
    #@+node:ekr.20200203051839.1: *3* test_starts_block
    def test_starts_block(self):

        # Each entry: (expected result as 0/1, line of javascript).
        table = (
            (1, 'xx) => {}'),
            (1, 'class c1'),
            (1, 'function f1'),
            (1, 'xx(function f2'),
            (1, 'xx = function f3'),
            (1, 'xx, function f4'),
            (0, 'a = "function"'),
            (0, 'a = /function/'),
        )
        for expected, line in table:
            x = JS_Importer(None)
            lines = [line]
            new_state = JS_ScanState()
            new_state.curlies += 1  # starts_block requires a level increase.
            prev_state = JS_ScanState()
            results = x.starts_block(0, lines, new_state, prev_state)
            # if expected != results: x.scan_line(line, prev_state
            assert expected == results, f"expected: {expected} got: {int(results)} {line!r}\n"
    #@+node:ekr.20200203060718.1: *3* test_scan_line
    def test_scan_line(self):

        # Each entry: ((curlies, parens, context), prev_context, line).
        table = (
            # result    prev_context    s
            ((0, 0, '"'), "", r'"string'),
            ((0, 0, '/*'), "", r'/* line 1'),
            ((0, 0, '/*'), "/*", r'line 2'), # New.
            ((0, 0, ''), "/*", r'line 3 */'), # New.
            ((0, 0, ''), "", r'a + b // /*'),
            ((0, 1, ''), "", r'(function'),
            ((1, 1, ''), "", r'(function(a) {'),
            ((0, 0, ''), "", r'var x = /abc/'),
            ((0, 0, ''), "", r'var x = /a"c/'),
            ((0, 0, ''), "", r'var x = /a\//'),
            ((0, 0, ''), "", r'var x = /a\//'),
            ((0, 1, ''), "", r'var x = (0,'),
        )
        for result, prev_context, s in table:
            importer = JS_Importer(None)
            prev_state = JS_ScanState()
            prev_state.context = prev_context
            new_state = importer.scan_line(s, prev_state)
            curlies, parens, context = result
            ok = (
                new_state.curlies == curlies and
                new_state.parens == parens and
                new_state.context == context)
            assert ok, (
                f"\n"
                f" expected: curlies: {curlies}, parens: {parens}, context: {context!r}\n"
                f"new_state: {new_state}\n"
                f"        s: {s!r}")
    #@-others
617#@-others
# Leo's importer registry entry for this module.
# 'func' presumably yields the import callable returned by
# linescanner.Importer.do_import -- confirm against the base class.
importer_dict = {
    'func': JS_Importer.do_import(),
    'extensions': ['.js',],
}
if __name__ == '__main__':
    # Run the TestJSImporter unit tests when executed as a script.
    unittest.main()
624#@@language python
625#@@tabwidth -4
626#@-leo