Coverage for C:\leo.repo\leo-editor\leo\plugins\importers\xml.py: 91%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

141 statements  

1#@+leo-ver=5-thin 

2#@+node:ekr.20140723122936.18137: * @file ../plugins/importers/xml.py 

3"""The @auto importer for the xml language.""" 

4import re 

5from leo.core import leoGlobals as g 

6from leo.plugins.importers import linescanner 

7Importer = linescanner.Importer 

8Target = linescanner.Target 

9#@+others 

10#@+node:ekr.20161121204146.3: ** class Xml_Importer 

class Xml_Importer(Importer):
    """
    The @auto importer for the xml language.

    The scan is line-oriented (see scan_line). It tracks:

    - the current *context*: '' or one of '"' (inside a double-quoted
      string) and '<!--' (inside a comment);
    - tag_level: the nesting depth of the tags listed in self.start_tags.

    A line starts a new block when it raises tag_level (see starts_block).
    """

    #@+others
    #@+node:ekr.20161122124109.1: *3* xml_i.__init__
    def __init__(self, importCommands, tags_setting='import_xml_tags', **kwargs):
        """
        Xml_Importer.__init__

        importCommands: passed unchanged to the base Importer class.
        tags_setting: the name of the data setting that lists the tags
                      that open a new block (read by add_tags).
        kwargs: accepted for call compatibility; not used by this method.
        """
        # Init the base class.
        super().__init__(
            importCommands,
            language='xml',
            state_class=Xml_ScanState,
            strict=False,
        )
        self.tags_setting = tags_setting
        self.start_tags = self.add_tags()
        # A closing tag decrements state.tag_level only if the top is an opening tag.
        self.stack = []  # Stack of tags.
        # Tags that have no closing tag: end_tag pops them at the next ">".
        # Entries must be spelled exactly as scan_tag records them: lower-case
        # and *without* the leading "<". So the xml declaration is '?xml'.
        # The historical '<?xml' spelling can never match a scanned tag; it is
        # retained only so that the list remains a superset of the old one.
        self.void_tags = [
            '?xml',
            '<?xml',
            '!doctype',
        ]
        # True: a structure error has been detected.
        # Note: no method of this class sets this flag.
        self.tag_warning_given = False
    #@+node:ekr.20161121204918.1: *3* xml_i.add_tags
    def add_tags(self):
        """
        Return the list of lower-cased tags found in the data setting whose
        name is self.tags_setting.

        Return [] if the setting is missing or empty.
        """
        c, setting = self.c, self.tags_setting
        tags = c.config.getData(setting) or []
        return [z.lower() for z in tags]
    #@+node:ekr.20170416082422.1: *3* xml_i.clean_headline
    def clean_headline(self, s, p=None):
        """
        xml and html: Return a cleaned up headline s.

        If s starts (after optional whitespace) with a complete "<...>" tag,
        return just that tag. Otherwise return s, stripped.
        """
        m = re.match(r'\s*(<[^>]+>)', s)
        if m:
            return m.group(1)
        return s.strip()
    #@+node:ekr.20161123003732.1: *3* xml_i.error
    def error(self, s):
        """Issue an error, but do *not* cause a unit test to fail."""
        g.es_print('\nin %s' % self.root.h)
        g.es_print(s)
        # Tell i.check to strip lws.
        self.ws_error = True
    #@+node:ekr.20161122073505.1: *3* xml_i.scan_line & helpers
    def scan_line(self, s, prev_state):
        """
        Update the xml scan state by scanning line s.

        prev_state: the Xml_ScanState in effect before s.
        Return a new Xml_ScanState. As a side effect, the helpers update
        self.stack, which therefore carries over from line to line.
        """
        context, tag_level = prev_state.context, prev_state.tag_level
        i = 0
        while i < len(s):
            progress = i
            if context:
                context, i = self.scan_in_context(context, i, s)
            else:
                context, i, tag_level = self.scan_out_context(i, s, tag_level)
            # Every helper must consume at least one character.
            assert progress < i, (repr(s[i]), '***', repr(s))
        d = {'context': context, 'tag_level': tag_level}
        return Xml_ScanState(d)
    #@+node:ekr.20161122073937.1: *4* xml_i.scan_in_context
    def scan_in_context(self, context, i, s):
        """
        Scan s from i, within the given context.
        Return (context, i)

        The returned context is '' if the context's terminator was found
        at i; otherwise the context is unchanged and i advances by one.
        """
        assert context in ('"', '<!--'), repr(context)
        # Only double-quoted strings are valid strings in xml/html.
        if context == '"' and self.match(s, i, '"'):
            return '', i + 1
        if context == '<!--' and self.match(s, i, '-->'):
            return '', i + 3
        return context, i + 1
    #@+node:ekr.20161122073938.1: *4* xml_i.scan_out_context & helpers
    def scan_out_context(self, i, s, tag_level):
        """
        Scan s from i, outside any context.
        Return (context, i, tag_level)

        The order of the tests matters: '<!--' must be tested before '<',
        and '/>' before '>'.
        """
        context = ''
        if self.match(s, i, '"'):
            context = '"'  # Only double-quoted strings are xml/html strings.
            i += 1
        elif self.match(s, i, '<!--'):
            context = '<!--'
            i += 4
        elif self.match(s, i, '<'):
            # xml/html tags do *not* start contexts.
            i, tag_level = self.scan_tag(s, i, tag_level)
        elif self.match(s, i, '/>'):
            i += 2
            tag_level = self.end_tag(s, tag='/>', tag_level=tag_level)
        elif self.match(s, i, '>'):
            tag_level = self.end_tag(s, tag='>', tag_level=tag_level)
            i += 1
        else:
            i += 1
        return context, i, tag_level
    #@+node:ekr.20161122084808.1: *5* xml_i.end_tag
    def end_tag(self, s, tag, tag_level):
        """
        Handle the ">" or "/>" that ends an element.

        Ignore ">" except for void tags.

        s: the line being scanned; used only in the warning message.
        tag: '>' or '/>'.
        Return the (possibly decremented) tag_level.
        """
        if not self.stack:
            # Nothing is open. A bare ">" is harmless; "/>" is suspicious.
            if tag == '/>':
                g.es_print("Warning: ignoring dubious /> in...")
                g.es_print(repr(s))
            return tag_level
        if tag == '/>':
            # A self-closing element: it was pushed by scan_tag, so pop it.
            top = self.stack.pop()
            if top in self.start_tags:
                tag_level -= 1
        elif self.stack[-1] in self.void_tags:
            # The ">" ends a void tag, which will never have a closing tag.
            self.stack.pop()
        return tag_level
    #@+node:ekr.20161122080143.1: *5* xml_i.scan_tag & helper
    ch_pattern = re.compile(r'([\!\?]?[\w\_\.\:\-]+)', re.UNICODE)

    def scan_tag(self, s, i, tag_level):
        """
        Scan an xml tag starting with "<" or "</".

        Adjust the stack as appropriate:
        - "<" adds the tag to the stack.
        - "</" removes the top of the stack if it matches.

        Tags are recorded in lower case, without the leading "<" or "</".
        Return (i, tag_level), with i just past the tag name.
        """
        assert s[i] == '<', repr(s[i])
        end_tag = self.match(s, i, '</')
        # Scan the tag.
        i += (2 if end_tag else 1)
        m = self.ch_pattern.match(s, i)
        if not m:
            # All other '<' characters should have had xml/html escapes applied to them.
            self.error('missing tag in position %s of %r' % (i, s))
            g.es_print(repr(s))
            return i, tag_level
        tag = m.group(0).lower()
        i += len(m.group(0))
        if end_tag:
            self.pop_to_tag(tag, s)
            if tag in self.start_tags:
                tag_level -= 1
        else:
            self.stack.append(tag)
            if tag in self.start_tags:
                tag_level += 1
        return i, tag_level
    #@+node:ekr.20170416043508.1: *6* xml_i.pop_to_tag
    def pop_to_tag(self, tag, s):
        """
        Attempt to pop tag from the top of the stack.

        - Empty stack: report an error and return.
        - The top matches: pop it.
        - The top doesn't match, but tag is somewhere on the stack:
          recover silently by popping up to and including the nearest
          matching entry.
        - Otherwise: leave the stack unchanged.

        s: the line being scanned; used only in the error message.
        """
        if not self.stack:
            self.error('Empty tag stack: %s' % tag)
            g.es_print(repr(s))
            return
        if self.stack[-1] == tag:
            self.stack.pop()
            return
        # Attempt a recovery. No warning is issued.
        if tag in self.stack:
            # The membership test guarantees that this loop terminates.
            while self.stack.pop() != tag:
                pass
    #@+node:ekr.20161121210839.1: *3* xml_i.starts_block
    def starts_block(self, i, lines, new_state, prev_state):
        """True if the line raises the tag level, thereby starting an xml block."""
        return new_state.tag_level > prev_state.tag_level
    #@+node:ekr.20161121212858.1: *3* xml_i.is_ws_line
    # Warning: base Importer class defines ws_pattern.
    xml_ws_pattern = re.compile(r'\s*(<!--([^-]|-[^-])*-->\s*)*$')

    def is_ws_line(self, s):
        """True if s is nothing but whitespace or single-line comments."""
        return bool(self.xml_ws_pattern.match(s))
    #@+node:ekr.20161123005742.1: *3* xml_i.undent
    def undent(self, p):
        """
        Regularize lws before @others, but preserve lws for all other lines.
        This is needed to handle embedded brython code properly.

        Return the list of adjusted lines of p.
        """
        result, w = [], self.tab_width
        # A negative tab width means: indent with abs(w) spaces.
        indent = ' ' * abs(w) if w < 0 else '\t'
        for s in self.get_lines(p):
            if s.isspace():
                # Whitespace-only lines become bare newlines.
                result.append('\n')
                continue
            ls = s.lstrip()
            if not ls.startswith('@others'):
                # Fix #479: Preserve brython indentation when importing .html files.
                result.append(s)
            elif p == self.root:
                result.append(ls)
            else:
                result.append(indent + ls)
        return result
    #@-others

216#@+node:ekr.20161121204146.7: ** class Xml_ScanState

class Xml_ScanState:
    """
    A class representing the state of the xml line-oriented scan.

    Attributes:
    context:   '' when outside any context; otherwise the string that
               opened the current context.
    tag_level: the current tag nesting level, as reported by level().
    """

    def __init__(self, d=None):
        """
        Xml_ScanState.__init__

        d: an optional dict with 'context' and 'tag_level' keys.
           A missing key, or a missing or empty d, yields the initial
           value for that attribute: '' for context, 0 for tag_level.
        """
        d = d or {}
        # Use explicit defaults so that a partial dict never leaves an
        # attribute set to None: callers compare tag_level numerically.
        self.context = d.get('context', '')
        self.tag_level = d.get('tag_level', 0)

    def __repr__(self):
        """Xml_ScanState.__repr__"""
        return "Xml_ScanState context: %r tag_level: %s" % (
            self.context, self.tag_level)

    __str__ = __repr__

    #@+others
    #@+node:ekr.20161121204146.8: *3* xml_state.level
    def level(self):
        """Xml_ScanState.level: return the current tag nesting level."""
        return self.tag_level
    #@-others

242#@-others 

# The module-level description of this importer.
# 'func' holds the object returned by calling Xml_Importer.do_import()
# once, when this module is imported.
# 'extensions' lists the file extensions associated with this importer.
# NOTE(review): presumably Leo's importer machinery reads this dict; confirm.
importer_dict = {
    'func': Xml_Importer.do_import(),
    'extensions': ['.xml'],
}

247#@@language python 

248#@@tabwidth -4 

249 

250#@-leo