Coverage for phml\utilities\locate\select.py: 100%

214 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-04-12 14:26 -0500

1"""utilities.select 

2 

3A collection of utilities around querying for specific 

4types of data. 

5""" 

6 

7import re 

8from typing import Callable 

9 

10from phml.nodes import Element, Node, Parent 

11from phml.utilities.travel.travel import walk 

12 

13__all__ = ["query", "query_all", "matches", "parse_specifiers"] 

14 

15 

def query(tree: Parent, specifier: str) -> Element | None:
    """Find the first element matching a CSS-like specifier.

    Modelled after JavaScript's `querySelector`. A specifier is a sequence of
    element descriptions (tag, `#id`, `.class`, `[attribute]` tests) that may
    be separated by whitespace or by one of the combinators listed below.
    Any number of classes is accepted per element, but only one id.

    Rules:
    * `*` = any element
    * `>` = direct child of the current element
    * `+` = the node directly after the current element, if it is an element
    * `~` = elements after the current element within the same parent
    * `.` = class
    * `#` = id
    * `[attribute]` = elements with attribute
    * `[attribute=value]` = elements with attribute=value
    * `[attribute~=value]` = elements with attribute containing value
    * `[attribute|=value]` = elements with attribute=value or attribute starting with value-
    * `[attribute^=value]` = elements with an attribute starting with value
    * `[attribute$=value]` = elements with an attribute ending with value
    * `[attribute*=value]` = elements with an attribute containing value

    Examples:
    * `.some-example` matches the first element with the class `some-example`
    * `#some-example` matches the first element with the id `some-example`
    * `li` matches the first `li` element
    * `li.red` matches the first `li` with the class `red`
    * `li#red` matches the first `li` with the id `red`
    * `input[type="checkbox"]` matches the first `input` with the attribute `type="checkbox"`
    * `div.form-control input[type="checkbox"]` matches the first `input` with the
    attribute `type="checkbox"` nested inside a `div` with the class `form-control`.

    Args:
        tree (Parent): The node whose subtree is searched (the node itself is
            a candidate too).
        specifier (str): The specifier to evaluate.

    Return:
        Element | None: The first element matching the specifier or None if no element was
        found.
    """

    def all_nodes(current: Parent, rules: list, include_self: bool = True):
        """Try every element yielded by `walk(current)`; stop at the first hit."""
        for candidate in walk(current):
            if not isinstance(candidate, Element):
                continue
            if not (include_self or candidate != current):
                continue
            found = branch(candidate, rules)
            if found is not None:
                return found
        return None

    def all_children(current: Parent, rules: list):
        """Try every direct element child of the current node; stop at the first hit."""
        attempts = (
            branch(child, rules) for child in current if isinstance(child, Element)
        )
        return next((hit for hit in attempts if hit is not None), None)

    def first_sibling(node: Parent, rules: list):
        """Try only the node that directly follows `node` in its parent."""
        parent = node.parent
        if parent is None:
            return None

        following = parent.index(node) + 1
        if following < len(parent) and isinstance(parent[following], Element):
            return branch(parent[following], rules)
        return None

    def all_siblings(current: Parent, rules: list):
        """Try every element that follows `current` in its parent; stop at the first hit."""
        parent = current.parent
        if parent is None:
            return None

        for position in range(parent.index(current) + 1, len(parent)):
            sibling = parent[position]
            if isinstance(sibling, Element):
                found = branch(sibling, rules)
                if found is not None:
                    return found
        return None

    def process_dict(rules: list, node: Element):
        """Check `node` against the leading element rule, then continue with the rest."""
        if not is_equal(rules[0], node):
            return None

        # The element rule was the last one: this node is the answer.
        if len(rules) == 1:
            return node

        upcoming = rules[1]
        if isinstance(upcoming, dict):
            # Two element rules in a row: search below this node, never the node itself.
            return all_nodes(node, rules[1:], False)
        if upcoming == "*":
            # The wildcard is consumed here, so skip past it.
            return all_nodes(node, rules[2:], False)

        return branch(node, rules[1:])

    def branch(node: Node, rules: list):
        """Based on the current rule, recursively check the nodes.
        If there are no rules left then the current node is the match.
        """
        if not isinstance(node, Parent):
            return None

        if len(rules) == 0:
            return node

        head = rules[0]
        if isinstance(head, dict) and isinstance(node, Element):
            return process_dict(rules, node)

        combinators = (
            ("*", all_nodes),
            (">", all_children),
            ("+", first_sibling),
            ("~", all_siblings),
        )
        for symbol, handler in combinators:
            if head == symbol:
                return handler(node, rules[1:])
        return None

    return all_nodes(tree, parse_specifiers(specifier))

142 

143 

def query_all(tree: Parent, specifier: str) -> list[Element]:
    """Find every element matching a CSS-like specifier.

    Modelled after JavaScript's `querySelectorAll`. A specifier is a sequence
    of element descriptions (tag, `#id`, `.class`, `[attribute]` tests) that
    may be separated by whitespace or by one of the combinators listed below.
    Any number of classes is accepted per element, but only one id.

    Rules:
    * `*` = any element
    * `>` = direct child of the current element
    * `+` = the node directly after the current element, if it is an element
    * `~` = elements after the current element within the same parent
    * `.` = class
    * `#` = id
    * `[attribute]` = elements with attribute
    * `[attribute=value]` = elements with attribute=value
    * `[attribute~=value]` = elements with attribute containing value
    * `[attribute|=value]` = elements with attribute=value or attribute starting with value-
    * `[attribute^=value]` = elements with an attribute starting with value
    * `[attribute$=value]` = elements with an attribute ending with value
    * `[attribute*=value]` = elements with an attribute containing value

    Examples:
    * `.some-example` matches every element with the class `some-example`
    * `#some-example` matches every element with the id `some-example`
    * `li` matches every `li` element
    * `li.red` matches every `li` with the class `red`
    * `li#red` matches every `li` with the id `red`
    * `input[type="checkbox"]` matches every `input` with the attribute `type="checkbox"`
    * `div.form-control input[type="checkbox"]` matches every `input` with the
    attribute `type="checkbox"` nested inside a `div` with the class `form-control`.

    Args:
        tree (Parent): The node whose subtree is searched (the node itself is
            a candidate too).
        specifier (str): The specifier to evaluate.

    Return:
        list[Element]: All elements matching the specifier, each listed once in
        the order it was first found, or an empty list if no elements were found.
    """

    def all_nodes(current: Parent, rules: list, include_self: bool = True) -> list:
        """Collect matches from every element yielded by `walk(current)`."""
        results = []
        for node in walk(current):
            if isinstance(node, Element) and (include_self or node != current):
                results.extend(branch(node, rules))
        return results

    def all_children(current: Parent, rules: list) -> list:
        """Collect matches from the direct element children of the current node."""
        results = []
        for node in current:
            if isinstance(node, Element):
                results.extend(branch(node, rules))
        return results

    def first_sibling(node: Parent, rules: list) -> list:
        """Collect matches from the node that directly follows `node`."""
        if node.parent is None:
            return []

        idx = node.parent.index(node)
        if idx + 1 < len(node.parent) and node.parent[idx + 1].type == "element":
            return branch(node.parent[idx + 1], rules)
        return []

    def all_siblings(current: Parent, rules: list) -> list:
        """Collect matches from every element that follows `current` in its parent."""
        if current.parent is None:
            return []

        results = []
        for position in range(
            current.parent.index(current) + 1, len(current.parent)
        ):
            sibling = current.parent[position]
            if sibling.type == "element":
                results.extend(branch(sibling, rules))
        return results

    def process_dict(rules: list, node: Element) -> list:
        """Check `node` against the leading element rule, then continue with the rest."""
        if not is_equal(rules[0], node):
            return []

        if len(rules) == 1:
            return [node]

        if isinstance(rules[1], dict):
            # Descendant search must not re-test the node itself, otherwise
            # `div div` would match a div that is not nested in another div.
            return all_nodes(node, rules[1:], False)
        if rules[1] == "*":
            # The wildcard is consumed here, so skip past it.
            return all_nodes(node, rules[2:], False)

        return branch(node, rules[1:])

    def branch(node: Node, rules: list) -> list:
        """Based on the current rule, recursively check the nodes.
        If there are no rules left then the current node is a match.
        Always returns a list so callers can extend with it directly.
        """
        if not isinstance(node, Parent):
            return []

        if len(rules) == 0:
            return [node]

        if isinstance(rules[0], dict) and isinstance(node, Element):
            return process_dict(rules, node)

        if rules[0] == "*":
            return all_nodes(node, rules[1:])

        if rules[0] == ">":
            return all_children(node, rules[1:])

        if rules[0] == "+":
            return first_sibling(node, rules[1:])

        if rules[0] == "~":
            return all_siblings(node, rules[1:])

        return []

    found = all_nodes(tree, parse_specifiers(specifier))

    # The same element can be reached through several matching ancestors
    # (e.g. `div p` with nested divs). Report each element once, comparing by
    # identity so distinct elements that merely look alike are all kept.
    seen = set()
    unique = []
    for element in found:
        if id(element) not in seen:
            seen.add(id(element))
            unique.append(element)
    return unique

265 

266 

def matches(node: Element, specifier: str) -> bool:
    """Check whether a single element satisfies a specifier.

    Works like JavaScript's `matches`, restricted to simple specifiers:
    everything in the specifier must describe one element. Any number of
    classes is accepted, but only one id.

    Rules:
    * `.` = class
    * `#` = id
    * `[attribute]` = elements with attribute
    * `[attribute=value]` = elements with attribute=value
    * `[attribute~=value]` = elements with attribute containing value
    * `[attribute|=value]` = elements with attribute=value or attribute starting with value-
    * `[attribute^=value]` = elements with an attribute starting with value
    * `[attribute$=value]` = elements with an attribute ending with value
    * `[attribute*=value]` = elements with an attribute containing value

    Examples:
    * `.some-example` matches the element with the class `some-example`
    * `#some-example` matches the element with the id `some-example`
    * `li` matches an `li` element
    * `li.red` matches an `li` with the class `red`
    * `li#red` matches an `li` with the id `red`
    * `input[type="checkbox"]` matches the `input` element with the attribute `type="checkbox"`

    Args:
        node (Element): The element to test.
        specifier (str): The simple specifier to test the element against.

    Returns:
        bool: Whether the element satisfies the specifier.

    Raises:
        Exception: If the specifier describes more than one element or uses a
            combinator, or if it yields no element description at all (for
            example an empty string).
    """

    rules = parse_specifiers(specifier)

    if len(rules) > 1:
        raise Exception(f"Complex specifier detected and is not allowed.\n{specifier}")
    # An empty rule list (empty/blank specifier) used to surface as a bare
    # IndexError on `rules[0]`; report it with the descriptive message instead.
    if len(rules) == 0 or not isinstance(rules[0], dict):
        raise Exception(
            "Specifier must only include tag name, classes, id, and or attribute specfiers.\n"
            "Example: `li.red#sample[class^='form-'][title~='sample']`",
        )

    return is_equal(rules[0], node)

306 

307 

def is_equal(rule: dict, node: Node) -> bool:
    """Check whether a node satisfies every part of an element rule.

    A rule is a dictionary with the keys `tag`, `id`, `classList` and
    `attributes`; each part that is set must hold for the node:
    * `tag`: must equal the node's tag unless the rule's tag is `*`
    * `id`: if not None, the node must have an `id` equal to it
    * `classList`: every listed class must appear in the node's space
      separated `class` attribute
    * `attributes`: every entry must name an attribute present on the node
      and pass its comparison. The comparisons are:
        1. Exists: `[checked]`
        2. Equals: `[checked='no']`
        3. Contains: `[class~=sample]` or `[class*=sample]`
        4. Equal to or starts with value-: `[class|=sample]`
        5. Starts with: `[class^=sample]`
        6. Ends with: `[class$="sample"]`

    Args:
        rule (dict): The rule to apply to the node.
        node (Element): The node to validate.

    Returns:
        bool: Whether the node passes all the parts of the rule.
    """
    # Tag: the wildcard accepts anything, otherwise the tags must be the same.
    expected_tag = rule["tag"]
    if not (expected_tag == "*" or expected_tag == node.tag):
        return False

    # Id: only checked when the rule asks for one.
    expected_id = rule["id"]
    if expected_id is not None:
        if "id" not in node or expected_id != node["id"]:
            return False

    # Classes: all requested classes must be among the node's classes.
    required_classes = rule["classList"]
    if len(required_classes) > 0:
        if "class" not in node:
            return False
        present_classes = str(node["class"]).split(" ")
        if any(klass not in present_classes for klass in required_classes):
            return False

    # Attributes: an empty list is trivially satisfied.
    return all(
        attr["name"] in node.attributes and __validate_attr(attr, node)
        for attr in rule["attributes"]
    )

360 

361 

def compare_equal(attr: str, c_value: str) -> bool:
    """Return whether the attribute value is exactly the compared value."""
    is_same = attr == c_value
    return is_same

364 

365 

def compare_equal_or_start_with_value_dash(attr: str, c_value: str) -> bool:
    """Return whether the attribute value equals the compared value or starts
    with the compared value followed by a dash.
    """
    if attr == c_value:
        return True
    return attr.startswith(f"{c_value}-")

368 

369 

def compare_startswith(attr: str, c_value: str) -> bool:
    """Return whether the attribute value begins with the compared value."""
    has_prefix = attr.startswith(c_value)
    return has_prefix

372 

373 

def compare_endswith(attr: str, c_value: str) -> bool:
    """Return whether the attribute value ends with the compared value."""
    has_suffix = attr.endswith(c_value)
    return has_suffix

376 

377 

def compare_contains(attr: str, c_value: str) -> bool:
    """Return whether the compared value occurs anywhere in the attribute value."""
    is_present = c_value in attr
    return is_present

380 

381 

def compare_exists(attr: str, _) -> bool:
    """Return whether an attribute that is present counts as existing.

    This validator only runs for attributes already found on the element, so
    any value counts as "exists". The single exception is the text `"false"`,
    which is what a boolean `False` attribute value is lowered to before it
    gets here; that case keeps being rejected as it was before.

    Previously only the text `"true"` was accepted, which made `[title]` or
    `[class]` fail for every element whose attribute held ordinary text.

    Args:
        attr (str): The attribute value (or one space separated part of it).
        _: Unused; present so all validators share one call signature.

    Returns:
        bool: False only when the value is the text `"false"`.
    """
    return attr != "false"

384 

385 

def __validate_attr(attr: dict, node: Element):
    """Run the comparison described by one attribute rule against a node.

    Args:
        attr (dict): Attribute rule with the keys `name`, `compare` and `value`.
        node (Element): The node whose attribute is tested. The attribute named
            by the rule is read with `node[...]`.

    Returns:
        The result of `is_valid_attr` for the matching comparison, or None
        when the rule's `compare` value is not one that is handled here.
    """
    name = attr["name"]
    attribute = node[name]
    if isinstance(attribute, bool):
        # Boolean values are compared through their lowercase text form.
        attribute = str(node[name]).lower()

    operator = attr["compare"]

    # Comparison symbol -> validator, checked in this order.
    validators = (
        ("=", compare_equal),
        ("|=", compare_equal_or_start_with_value_dash),
        ("^=", compare_startswith),
        ("$=", compare_endswith),
        ("*=", compare_contains),
        ("~=", compare_contains),
    )
    for symbol, validator in validators:
        if operator == symbol:
            return is_valid_attr(
                attr=attribute,
                sub=attr["value"],
                name=name,
                validator=validator,
            )

    # No comparison and no value: plain existence test such as `[checked]`.
    if operator == "" and attr["value"] == "":
        return is_valid_attr(
            attr=attribute,
            sub=attr["value"],
            name=name,
            validator=compare_exists,
        )

    return None

438 

439 

def is_valid_attr(attr: str, sub: str, name: str, validator: Callable) -> bool:
    """Validate an attribute value against a string using a validator callable.

    For the `class` attribute the value is split on single spaces and each
    part is tested on its own; for every other attribute the whole value is
    tested once.

    Args:
        attr (str): The attribute value found on the element.
        sub (str): The value from the specifier to compare against.
        name (str): The attribute name; decides whether the value is split.
        validator (Callable): Called as `validator(item, sub)` for each
            candidate; a truthy result means the candidate is valid.

    Returns:
        True if at least one candidate passes the validator.
    """
    list_attributes = ["class"]

    candidates = attr.split(" ") if name in list_attributes else [attr]

    # any() stops at the first valid candidate instead of building a list
    # of all valid candidates just to check that it is non-empty.
    return any(validator(item, sub) for item in candidates)

456 

457 

458def __parse_el_with_attribute( 

459 tag: str | None, context: str | None, attributes: str | None 

460) -> dict: 

461 el_from_class_from_id = re.compile(r"(#|\.)([\w\-]+)") 

462 

463 attr_compare_val = re.compile( 

464 r"\[\s*([\w\-:@]+)\s*([\~\|\^\$\*]?=)?\s*(\"[^\"\[\]=]*\"|\'[^\'\[\]=]*\'|[^\s\[\]=\"']+)?\s*\]" 

465 ) 

466 re.compile(r"\[\s*([\w\-:@]+)\]") 

467 

468 element = { 

469 "tag": tag or "*", 

470 "classList": [], 

471 "id": None, 

472 "attributes": [], 

473 } 

474 

475 if attributes is not None: 

476 for attr in attr_compare_val.findall(attributes): 

477 name, compare, value = attr 

478 if value is not None: 

479 value = value.lstrip("'\"").rstrip("'\"") 

480 element["attributes"].append( 

481 { 

482 "name": name, 

483 "compare": compare, 

484 "value": value, 

485 }, 

486 ) 

487 

488 if context is not None: 

489 for part in el_from_class_from_id.finditer(context): 

490 if part.group(1) == ".": 

491 if part.group(2) not in element["classList"]: 

492 element["classList"].append(part.group(2)) 

493 elif part.group(1) == "#": 

494 if element["id"] is None: 

495 element["id"] = part.group(2) 

496 else: 

497 raise Exception( 

498 f"There may only be one id per element specifier. '{(tag or '') + (context or '')}{attributes or ''}'", 

499 ) 

500 return element 

501 

502 

def __parse_attr_only_element(token: str) -> dict:
    """Build an element rule from a token made only of `[...]` attribute tests.

    The rule matches any tag (`*`) and carries one attribute entry per
    bracketed test found in the token.

    Args:
        token (str): One or more bracketed attribute tests, e.g.
            `[type=checkbox][checked]`. An empty string or None yields a rule
            without attribute entries.

    Returns:
        dict: A rule with the keys `tag`, `classList`, `id` and `attributes`.
        Each attribute entry has `name`, `compare` and `value`; `compare` and
        `value` are empty strings when the test did not provide them.
    """
    # Unquoted values must not contain brackets; otherwise the value of one
    # test swallows the following tests (`[a][b]` became a=`][b`).
    attr_compare_val = re.compile(
        r"\[([a-zA-Z0-9_:\-]+)([~|^$*]?=)?(\"[^\"]+\"|'[^']+'|[^'\"\[\]]+)?\]"
    )

    element = {
        "tag": "*",
        "classList": [],
        "id": None,
        "attributes": [],
    }

    if token not in ["", None]:
        for attr in attr_compare_val.finditer(token):
            # Missing parts default to "" rather than None so that a bare
            # `[name]` is recognised as an existence test downstream.
            name, compare, value = attr.groups(default="")
            element["attributes"].append(
                {
                    "name": name,
                    "compare": compare,
                    # Drop the quotes surrounding a quoted value.
                    "value": value.strip("'\""),
                },
            )

    return element

531 

532 

def parse_specifiers(specifier: str) -> list:
    """Split a specifier into an ordered list of rules.

    Each entry of the result is either one of the strings `*`, `>`, `+`, `~`
    or a dictionary describing one element (tag, id, classes, attributes).

    Rules:
    * `*` = any element
    * `>` = direct child of the current element
    * `+` = first sibling
    * `~` = elements after the current element
    * `.` = class
    * `#` = id
    * `[attribute]` = elements with attribute
    * `[attribute=value]` = elements with attribute=value
    * `[attribute~=value]` = elements with attribute containing value
    * `[attribute|=value]` = elements with attribute=value or attribute starting with value-
    * `[attribute^=value]` = elements with an attribute starting with value
    * `[attribute$=value]` = elements with an attribute ending with value
    * `[attribute*=value]` = elements with an attribute containing value

    Args:
        specifier (str): The specifier to split.

    Returns:
        list: The rules in the order they appear in the specifier.
    """
    splitter = re.compile(
        r"([~>\*+])|((?:\[[^\[\]]+\])+)|([^.#\[\]\s]+)?((?:(?:\.|#)[^.#\[\]\s]+)+)?((?:\[[^\[\]]+\])+)?"
    )
    combinators = ("*", ">", "+", "~")

    rules = []
    for found in splitter.finditer(specifier):
        symbol, bare_attributes, tag, context, attributes = found.groups()

        if symbol in combinators:
            rules.append(symbol)
            continue

        # The pattern can also match nothing at all (e.g. on whitespace);
        # such matches have every group unset and add no rule.
        if any(piece is not None for piece in (tag, context, attributes)):
            rules.append(__parse_el_with_attribute(tag, context, attributes))
        elif bare_attributes is not None:
            rules.append(__parse_attr_only_element(bare_attributes))
    return rules