Coverage for phml\core\parser\hypertextMarkupParser.py: 90%

93 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-11-30 09:38 -0600

1"""Pythonic Hypertext Markup Language (phml) parser.""" 

2 

3from html.parser import HTMLParser 

4from typing import Optional 

5 

6from phml.nodes import Comment, DocType, Element, Point, Position, Properties, Root, Text 

7 

# Tag names the parser treats as self-closing: when handle_starttag meets one
# of these it marks the new element ``startend`` and does not descend into it,
# so no matching closing tag is expected.
self_closing_tags = [
    "area", "base", "br", "col", "embed", "hr",
    "img", "input", "link", "meta", "param", "source",
    "track", "wbr", "command", "keygen", "menuitem",
]

27 

28 

def build_point(pos: tuple[int, int], offset: Optional[int] = None) -> Point:
    """Create a ``Point`` from a position tuple.

    Args:
        pos: Sequence whose first two items are the line and the column.
            Any further items are ignored.
        offset: Optional offset forwarded unchanged to ``Point``.

    Returns:
        The ``Point`` built from ``pos[0]``, ``pos[1]`` and ``offset``.
    """
    line, column = pos[0], pos[1]
    return Point(line, column, offset)

32 

33 

def build_position(
    start: tuple[int, int, Optional[int]],
    end: tuple[int, int, Optional[int]],
    indent: Optional[int] = None,
) -> Position:
    """Create a ``Position`` spanning two position tuples.

    Args:
        start: Tuple describing where the span begins (line, column, ...).
        end: Tuple describing where the span finishes (line, column, ...).
        indent: Optional indent value forwarded unchanged to ``Position``.

    Returns:
        A ``Position`` whose two points are built with ``build_point``; only
        the line and column of each tuple are used.
    """
    start_point = build_point(start)
    end_point = build_point(end)
    return Position(start_point, end_point, indent)

41 

42 

def calc_end_of_tag(tag_text: str, cur_pos: tuple[int, int]) -> tuple[int, int]:
    """Locate the position at which a start tag's text ends.

    Args:
        tag_text: Raw text of the opening tag, possibly spanning several lines.
        cur_pos: ``(line, column)`` at which the tag text begins.

    Returns:
        ``(line, column)`` of the end of the tag text.
    """
    start_line, start_col = cur_pos[0], cur_pos[1]
    newline_count = tag_text.count("\n")

    # Tag on a single line: it ends on the same line, shifted by its length.
    if newline_count == 0:
        return start_line, start_col + len(tag_text)

    # Tag wraps: the column restarts, so only the text after the last
    # newline contributes to it.
    tail = tag_text.rsplit("\n", 1)[1]
    return start_line + newline_count, len(tail)

52 

53 

def strip_and_count(data: str, cur_pos: tuple[int, int]) -> tuple[str, int, int]:
    """Strip blank edge lines from ``data`` and locate where the kept text ends.

    Leading and trailing lines that contain only whitespace are dropped; the
    lines in between are kept verbatim (inner whitespace is not touched).

    Args:
        data: Possibly multiline text.
        cur_pos: ``(line, column)`` at which ``data`` begins.

    Returns:
        A ``(text, end_line, end_col)`` tuple. ``text`` is the stripped data,
        or ``""`` when ``data`` holds nothing but whitespace. ``end_line`` and
        ``end_col`` give the position at which the kept text ends. For
        whitespace-only data the end is reported on the starting line at
        ``cur_pos[1] + len(data)``.
    """
    raw_lines = data.split("\n")

    # Index of the first line carrying non-whitespace content, if any.
    first = next((idx for idx, line in enumerate(raw_lines) if line.strip()), None)
    if first is None:
        return "", cur_pos[0], len(data) + cur_pos[1]

    # Scan backwards down to index 0 inclusive, so trailing blank lines are
    # removed even when the only content sits on the very first line.
    last = next(idx for idx in range(len(raw_lines) - 1, -1, -1) if raw_lines[idx].strip())
    kept = raw_lines[first : last + 1]

    # ``last`` is counted from the start of the raw data, so blank lines that
    # were stripped from the front still advance the reported end line.
    end_line = cur_pos[0] + last
    end_col = len(kept[-1])
    if last == 0:
        # The text ends on the line it started on: columns continue from the
        # starting column instead of restarting at zero.
        end_col += cur_pos[1]

    return "\n".join(kept), end_line, end_col

96 

97 

class HypertextMarkupParser(HTMLParser):
    """Custom html parser inherited from the python built-in html.parser.

    While markup is fed to the parser the ``handle_*`` callbacks append
    ``DocType``, ``Element``, ``Text`` and ``Comment`` nodes to the children of
    the current parent, starting from a fresh ``Root``. Opening a tag that is
    not self-closing makes the new element the current parent; its matching
    closing tag moves back up to the parent.
    """

    # Quoted so the annotation is not evaluated when the class is created.
    cur: "Root | Element"
    """The current parent element in the recursion."""

    cur_tags: list
    """Stack of all open tags. Used for balancing tags."""

    def __init__(self, *, convert_charrefs=True):
        super().__init__(convert_charrefs=convert_charrefs)

        self.cur = Root()
        self.cur_tags = []

    @staticmethod
    def _attrs_to_properties(attrs) -> dict:
        """Convert html.parser ``(name, value)`` pairs into element properties.

        An attribute without a value becomes ``True``, the literal value
        ``"no"`` becomes ``False`` and every other value is kept as given.
        """
        properties = {}
        for name, value in attrs:
            if value is None:
                properties[name] = True
            elif value == "no":
                properties[name] = False
            else:
                properties[name] = value
        return properties

    def _starttag_position(self):
        """Build the position covering the start tag currently being handled."""
        start = self.getpos()
        return build_position(start, calc_end_of_tag(self.get_starttag_text(), start))

    def handle_decl(self, decl: str) -> None:
        """Append a ``DocType`` node for a ``<!doctype ...>`` declaration.

        Declarations other than doctype are ignored.

        Raises:
            Exception: If the doctype appears anywhere but directly in the root.
        """
        # Split on any whitespace run so repeated spaces, tabs or newlines
        # between "doctype" and its language do not yield an empty token.
        tokens = decl.split()
        if not tokens or tokens[0].lower() != "doctype":
            return

        if self.cur.type != "root":
            raise Exception("<!doctype> must be in the root!")

        pos = self.getpos()
        self.cur.children.append(
            DocType(
                lang=tokens[1] if len(tokens) > 1 else None,
                parent=self.cur,
                position=build_position(pos, pos),
            )
        )

    def handle_pi(self, data: str) -> None:
        """Report a processing instruction; no node is created for it."""
        print("Encountered a processing instruction tag:", data)

    def handle_starttag(self, tag, attrs):
        """Append a new element and, unless it is self-closing, descend into it."""
        element = Element(
            tag=tag,
            properties=self._attrs_to_properties(attrs),
            parent=self.cur,
        )
        self.cur.children.append(element)

        if tag in self_closing_tags:
            # Never gets a closing tag, so its full position is known now.
            element.startend = True
            element.position = self._starttag_position()
        else:
            self.cur = element
            self.cur_tags.append(element)
            # The end point is a placeholder; handle_endtag fills it in.
            element.position = build_position(self.getpos(), (0, 0))

    def handle_startendtag(self, tag, attrs):
        """Append an explicitly self-closed element (``<tag ... />``)."""
        self.cur.children.append(
            Element(
                tag=tag,
                properties=self._attrs_to_properties(attrs),
                parent=self.cur,
                startend=True,
                position=self._starttag_position(),
            )
        )

    def handle_endtag(self, tag):
        """Close the innermost open element and move back up to its parent.

        Raises:
            Exception: If no element is open, or if ``tag`` does not match the
                innermost open element.
        """
        line, col = self.getpos()

        # A closing tag with nothing open used to fail with a bare IndexError
        # from the empty stack; report it like any other unbalanced tag.
        if not self.cur_tags:
            raise Exception(f"Unexpected closing tag </{tag}> at [{line}:{col}]")

        if tag != self.cur_tags[-1].tag:
            raise Exception(
                f"Mismatched tags <{self.cur.tag}> and </{tag}> at [{line}:{col}]"
            )

        # An element closed without any children is flagged as startend.
        if len(self.cur.children) == 0:
            self.cur.startend = True

        self.cur.position.end = build_point(self.getpos())
        self.cur = self.cur.parent
        self.cur_tags.pop(-1)

    def handle_data(self, data):
        """Append a ``Text`` node unless the data is blank after stripping."""
        data, eline, ecol = strip_and_count(data, self.getpos())

        if data:
            self.cur.children.append(
                Text(
                    data,
                    self.cur,
                    position=build_position(self.getpos(), (eline, ecol)),
                )
            )

    def handle_comment(self, data: str) -> None:
        """Append a ``Comment`` node holding the stripped comment text."""
        data, eline, ecol = strip_and_count(data, self.getpos())

        # NOTE(review): the constants presumably account for the comment
        # delimiters -- 7 for "<!--" plus "-->" when the comment ends on the
        # line it started on, 3 for the closing "-->" alone otherwise. Confirm.
        if eline == self.getpos()[0]:
            ecol += 7
        else:
            ecol += 3

        self.cur.children.append(
            Comment(
                value=data,
                parent=self.cur,
                position=build_position(
                    self.getpos(),
                    (eline, ecol),
                ),
            )
        )

228 )