Coverage for phml\core\parser\hypertextMarkupParser.py: 90%
93 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-11-30 09:38 -0600
« prev ^ index » next coverage.py v6.5.0, created at 2022-11-30 09:38 -0600
1"""Pythonic Hypertext Markup Language (phml) parser."""
3from html.parser import HTMLParser
4from typing import Optional
6from phml.nodes import Comment, DocType, Element, Point, Position, Properties, Root, Text
# Tag names the parser treats as self-closing: a start tag with one of these
# names is flagged ``startend`` right away and is never pushed onto the stack
# of open tags, so no matching end tag is expected for it.
self_closing_tags = [
    "area", "base", "br", "col", "embed", "hr", "img",
    "input", "link", "meta", "param", "source", "track", "wbr",
    "command", "keygen", "menuitem",
]
def build_point(pos: tuple[int, int], offset: Optional[int] = None) -> Point:
    """Create a ``Point`` from a position tuple.

    The first two items of ``pos`` are passed to ``Point`` as its first two
    positional arguments and ``offset`` as the third. Callers in this module
    pass the pair returned by ``HTMLParser.getpos()``. Any items of ``pos``
    beyond the second are ignored.
    """
    first, second = pos[0], pos[1]
    return Point(first, second, offset)
def build_position(
    start: tuple[int, int, Optional[int]],
    end: tuple[int, int, Optional[int]],
    indent: Optional[int] = None,
) -> Position:
    """Create a ``Position`` from a start tuple and an end tuple.

    Each tuple is converted with ``build_point`` and the two resulting points
    are handed to ``Position`` together with ``indent``.

    NOTE(review): the annotations allow a third tuple item, but only the
    first two items of ``start`` and ``end`` are used; a third item is never
    forwarded to ``build_point`` as an offset.
    """
    start_point = build_point(start)
    end_point = build_point(end)
    return Position(start_point, end_point, indent)
def calc_end_of_tag(tag_text: str, cur_pos: tuple[int, int]) -> tuple[int, int]:
    """Work out where a start tag ends.

    ``tag_text`` is the raw text of the open tag and ``cur_pos`` the
    ``(line, column)`` it begins at. The returned line is the starting line
    plus the number of newlines inside the tag. The returned column is the
    length of the tag's last line, shifted by the starting column only when
    the tag sits entirely on one line.
    """
    extra_lines = tag_text.count("\n")
    tail = tag_text.rsplit("\n", 1)[-1]

    if extra_lines:
        # The tag wrapped: its last line starts at column zero.
        end_col = len(tail)
    else:
        end_col = len(tail) + cur_pos[1]

    return cur_pos[0] + extra_lines, end_col
def strip_and_count(data: str, cur_pos: tuple[int, int]) -> tuple[str, int, int]:
    """Strip leading and trailing blank lines and locate where the text ends.

    Args:
        data: A possibly multiline string.
        cur_pos: ``(line, column)`` at which ``data`` starts.

    Returns:
        A ``(text, end_line, end_col)`` tuple.

        * Single-line ``data`` is returned unchanged, or as ``""`` when it is
          only whitespace. The end position is ``cur_pos`` advanced by
          ``len(data)`` columns.
        * Multiline ``data`` has every leading and trailing whitespace-only
          line removed; interior blank lines are kept. The end position is
          the end of the last kept line: ``end_line`` counts all raw lines
          before it, including stripped leading ones, and ``end_col`` is that
          line's length, offset by the starting column only when it is the
          very first raw line.
        * Multiline ``data`` that is entirely blank yields ``""`` with the
          line unchanged and the column advanced by ``len(data)``.
    """
    raw_lines = data.split("\n")

    # Single-line block: nothing to trim, only blank detection.
    if len(raw_lines) == 1:
        text = data if data.strip() != "" else ""
        return text, cur_pos[0], len(data) + cur_pos[1]

    # Indexes of the lines that carry actual content.
    content_idx = [idx for idx, line in enumerate(raw_lines) if line.strip() != ""]
    if not content_idx:
        return "", cur_pos[0], len(data) + cur_pos[1]

    first, last = content_idx[0], content_idx[-1]

    end_col = len(raw_lines[last])
    if last == 0:
        # The text ends on the line it started on, so the starting column
        # still applies.
        end_col += cur_pos[1]

    return "\n".join(raw_lines[first : last + 1]), cur_pos[0] + last, end_col
class HypertextMarkupParser(HTMLParser):
    """Custom html parser inherited from the python
    built-in html.parser.

    While feeding markup, the handlers build a tree of phml nodes under a
    ``Root`` node. ``cur`` always points at the node new children are
    appended to, and ``cur_tags`` holds the elements whose end tag has not
    been seen yet.
    """

    cur: Root | Element
    """The current parent element in the recursion."""

    cur_tags: list
    """Stack of all open tags. Used for balancing tags."""

    def __init__(self, *, convert_charrefs=True):
        super().__init__(convert_charrefs=convert_charrefs)

        self.cur = Root()
        self.cur_tags = []

    @staticmethod
    def _build_properties(attrs):
        """Convert the parser's ``(name, value)`` attribute pairs to properties.

        An attribute without a value becomes ``True``, the literal value
        ``"no"`` becomes ``False`` and every other value is kept as is.
        """
        properties: Properties = {}
        for name, value in attrs:
            if value is None:
                properties[name] = True
            elif value == "no":
                properties[name] = False
            else:
                properties[name] = value
        return properties

    def handle_decl(self, decl: str) -> None:
        """Add a ``DocType`` node for a ``<!doctype ...>`` declaration.

        Declarations whose first space-separated token is not ``doctype``
        (case-insensitive) are ignored. The second token, when present, is
        stored as the doctype's ``lang``.

        Raises:
            Exception: If the doctype is found anywhere but directly in the
                root node.
        """
        tokens = decl.split(" ")
        if tokens[0].lower() != "doctype":
            return

        if self.cur.type != "root":
            raise Exception("<!doctype> must be in the root!")

        self.cur.children.append(
            DocType(
                lang=tokens[1] if len(tokens) > 1 else None,
                parent=self.cur,
                position=build_position(self.getpos(), self.getpos()),
            )
        )

    def handle_pi(self, data: str) -> None:
        """Report a processing instruction; no node is created for it."""
        print("Encountered a processing instruction tag:", data)

    def handle_starttag(self, tag, attrs):
        """Append a new ``Element`` for an opening tag.

        Tags listed in ``self_closing_tags`` are completed immediately.
        Any other tag becomes the new current parent and is pushed onto the
        stack of open tags until its end tag arrives.
        """
        element = Element(
            tag=tag, properties=self._build_properties(attrs), parent=self.cur
        )
        self.cur.children.append(element)

        if tag in self_closing_tags:
            element.startend = True
            element.position = build_position(
                self.getpos(), calc_end_of_tag(self.get_starttag_text(), self.getpos())
            )
        else:
            self.cur = element
            self.cur_tags.append(element)
            # The end point is a placeholder; handle_endtag overwrites it.
            element.position = build_position(self.getpos(), (0, 0))

    def handle_startendtag(self, tag, attrs):
        """Append a completed ``Element`` for an explicit ``<tag />``."""
        self.cur.children.append(
            Element(
                tag=tag,
                properties=self._build_properties(attrs),
                parent=self.cur,
                startend=True,
                position=build_position(
                    self.getpos(), calc_end_of_tag(self.get_starttag_text(), self.getpos())
                ),
            )
        )

    def handle_endtag(self, tag):
        """Close the innermost open element.

        An element closed without any children is flagged ``startend``.

        Raises:
            Exception: If there is no open element to close, or if ``tag``
                does not match the innermost open element.
        """
        if not self.cur_tags:
            # Without this guard a stray closing tag surfaces as a bare
            # IndexError from the empty stack.
            raise Exception(
                f"Unbalanced tags: </{tag}> has no matching open tag "
                f"at [{self.getpos()[0]}:{self.getpos()[1]}]"
            )

        if tag != self.cur_tags[-1].tag:
            raise Exception(
                f"Mismatched tags <{self.cur.tag}> and </{tag}> at [{self.getpos()[0]}:{self.getpos()[1]}]"
            )

        if len(self.cur.children) == 0:
            self.cur.startend = True

        self.cur.position.end = build_point(self.getpos())
        self.cur = self.cur.parent
        self.cur_tags.pop(-1)

    def handle_data(self, data):
        """Append a ``Text`` node for character data that is not blank."""
        data, eline, ecol = strip_and_count(data, self.getpos())

        if data not in [[], "", None]:
            self.cur.children.append(
                Text(
                    data,
                    self.cur,
                    position=build_position(self.getpos(), (eline, ecol)),
                )
            )

    def handle_comment(self, data: str) -> None:
        """Append a ``Comment`` node holding the stripped comment text."""
        data, eline, ecol = strip_and_count(data, self.getpos())

        # NOTE(review): presumably 7 covers both "<!--" and "-->" when the
        # comment ends on its starting line, and 3 covers only "-->" - confirm.
        if eline == self.getpos()[0]:
            ecol += 7
        else:
            ecol += 3

        self.cur.children.append(
            Comment(
                value=data,
                parent=self.cur,
                position=build_position(
                    self.getpos(),
                    (eline, ecol),
                ),
            )
        )