phml.utilities.transform.sanitize
phml.utilities.transform.sanatize
Logic for sanatizing a phml ast.
1"""phml.utilities.transform.sanatize 2 3Logic for sanatizing a phml ast. 4""" 5from .clean import ( 6 sanatize, 7 recurse_strip, 8 recurse_check_tag, 9 recurse_check_ancestor, 10 recurse_check_required, 11 recurse_check_attributes, 12) 13from .schema import Schema 14 15__all__ = [ 16 "sanatize", 17 "Schema", 18 "recurse_check_attributes", 19 "recurse_check_required", 20 "recurse_strip", 21 "recurse_check_tag", 22 "recurse_check_ancestor" 23]
135def sanatize(tree: Parent, schema: Schema = Schema()): 136 """Sanatize elements and attributes in the phml tree. Should be used when using 137 data from an unkown source. It should be used with an AST that has already been 138 compiled to html to no unkown values are unchecked. 139 140 By default the sanatization schema uses the github schema and follows the hast 141 sanatize utility. 142 143 * [github schema](https://github.com/syntax-tree/hast-util-sanitize/blob/main/lib/schema.js) 144 * [hast sanatize](https://github.com/syntax-tree/hast-util-sanitize) 145 146 Note: 147 This utility will edit the tree in place. 148 149 Args: 150 tree (Parent): The root of the tree that will be sanatized. 151 schema (Schema, optional): User defined schema. Defaults to github schema. 152 """ 153 154 from phml.utilities import ( # pylint: disable=import-outside-toplevel 155 remove_nodes, 156 ) 157 158 for strip in schema.strip: 159 remove_nodes(tree, ["element", {"tag": strip}]) 160 161 162 recurse_check_tag(tree, schema) 163 recurse_strip(tree, schema) 164 recurse_check_ancestor(tree, schema) 165 recurse_check_attributes(tree, schema) 166 recurse_check_required(tree, schema)
Sanatize elements and attributes in the phml tree. Should be used when using data from an unkown source. It should be used with an AST that has already been compiled to html to no unkown values are unchecked.
By default the sanatization schema uses the github schema and follows the hast sanatize utility.
Note
This utility will edit the tree in place.
Args
- tree (Parent): The root of the tree that will be sanatized.
- schema (Schema, optional): User defined schema. Defaults to github schema.
31@dataclass 32class Schema: 33 """Dataclass of information on how to sanatize a phml tree. 34 35 `strip (list[str])`: The elements to strip from the tree. 36 `protocols (dict[str, list])`: Collection of element name and allowed protocal value list 37 `tag_names (list[str])`: List of allowed tag names. 38 `attributes (dict[str, list[str | list[str]]])`: Collection of element name and allowed property 39 names. 40 `required (dict[str, dict[str, str | bool]])`: Collection of element names and their required 41 properties and required property values. 42 """ 43 44 strip: list[str] = field(default_factory=lambda: ["script"]) 45 ancestors: dict[str, list] = field( 46 default_factory=lambda: { 47 "tbody": ["table"], 48 "tfoot": ["table"], 49 "thead": ["table"], 50 "td": ["table"], 51 "th": ["table"], 52 "tr": ["table"], 53 }, 54 ) 55 protocols: dict[str, list] = field( 56 default_factory=lambda: { 57 "href": ["http", "https", "mailto", "xmpp", "irc", "ircs"], 58 "cite": ["http", "https"], 59 "src": ["http", "https"], 60 "longDesc": ["http", "https"], 61 }, 62 ) 63 tag_names: list[str] = field( 64 default_factory=lambda: [ 65 "h1", 66 "h2", 67 "h3", 68 "h4", 69 "h5", 70 "h6", 71 "br", 72 "b", 73 "i", 74 "strong", 75 "em", 76 "a", 77 "pre", 78 "code", 79 "img", 80 "tt", 81 "div", 82 "ins", 83 "del", 84 "sup", 85 "sub", 86 "p", 87 "ol", 88 "ul", 89 "table", 90 "thead", 91 "tbody", 92 "tfoot", 93 "blockquote", 94 "dl", 95 "dt", 96 "dd", 97 "kbd", 98 "q", 99 "samp", 100 "var", 101 "hr", 102 "ruby", 103 "rt", 104 "rp", 105 "li", 106 "tr", 107 "td", 108 "th", 109 "s", 110 "strike", 111 "summary", 112 "details", 113 "caption", 114 "figure", 115 "figcaption", 116 "abbr", 117 "bdo", 118 "cite", 119 "dfn", 120 "mark", 121 "small", 122 "span", 123 "time", 124 "wbr", 125 "input", 126 ], 127 ) 128 attributes: dict[str, list[str | tuple[str|bool, ...]]] = field( 129 default_factory=lambda: { 130 "a": ["href"], 131 "article": ["class"], 132 "img": ["src", "longDesc", "loading"], 133 "input": [("type", "checkbox"), ("disabled", True)], 134 "li": [("class", "task-list-item")], 135 "div": ["itemScope", "itemType"], 136 "blockquote": ["cite"], 137 "del": ["cite"], 138 "ins": ["cite"], 139 "q": ["cite"], 140 "*": [ 141 "abbr", 142 "accept", 143 "acceptCharset", 144 "accessKey", 145 "action", 146 "align", 147 "alt", 148 "ariaDescribedBy", 149 "ariaHidden", 150 "ariaLabel", 151 "ariaLabelledBy", 152 "axis", 153 "border", 154 "cellPadding", 155 "cellSpacing", 156 "char", 157 "charOff", 158 "charSet", 159 "checked", 160 "clear", 161 "cols", 162 "colSpan", 163 "color", 164 "compact", 165 "coords", 166 "dateTime", 167 "dir", 168 "disabled", 169 "encType", 170 "htmlFor", 171 "frame", 172 "headers", 173 "height", 174 "hrefLang", 175 "hSpace", 176 "isMap", 177 "id", 178 "label", 179 "lang", 180 "maxLength", 181 "media", 182 "method", 183 "multiple", 184 "name", 185 "noHref", 186 "noShade", 187 "noWrap", 188 "open", 189 "prompt", 190 "readOnly", 191 "rel", 192 "rev", 193 "rows", 194 "rowSpan", 195 "rules", 196 "scope", 197 "selected", 198 "shape", 199 "size", 200 "span", 201 "start", 202 "summary", 203 "tabIndex", 204 "target", 205 "title", 206 "type", 207 "useMap", 208 "vAlign", 209 "value", 210 "vSpace", 211 "width", 212 "itemProp", 213 ], 214 }, 215 ) 216 required: dict[str, dict[str, str | bool]] = field( 217 default_factory=lambda: { 218 "input": { 219 "type": "checkbox", 220 "disabled": True, 221 }, 222 }, 223 ) 224 225 def extend( 226 self, 227 strip: list[str] | None = None, 228 ancestors: dict[str, list[str]] | None = None, 229 protocols: dict[str, list[str]] | None = None, 230 tag_names: list[str] | None = None, 231 attributes: dict[str, list[str | tuple[str|bool, ...]]] | None = None, 232 required: dict[str, dict[str, str | bool]] | None = None, 233 ) -> Schema: 234 """Extend the default schemas values. 235 236 Args: 237 `strip (list[str])`: The elements to strip from the tree. 238 `ancestors (dict[str, list[str]])`: Key is a element tag and the value is a list of valid 239 parent elements. 240 `protocols (dict[str, list[str]])`: Collection of element names to list of valid protocols (prefixes). 241 `tag_names (list[str])`: List of allowed tag names. 242 `attributes (dict[str, list[str | list[str]]])`: Collection of element name and allowed property 243 names. 244 `required (dict[str, dict[str, str | bool]])`: Collection of element names and their required 245 properties and required property values. 246 """ 247 248 return Schema( 249 strip=list(set([*self.strip, *(strip or [])])), 250 ancestors=_extend_dict_list_({**self.ancestors}, ancestors or {}), 251 protocols=_extend_dict_list_({**self.protocols}, protocols or {}), 252 attributes=_extend_dict_list_({**self.attributes}, attributes or {}), 253 tag_names=list(set([*self.tag_names, *(tag_names or [])])), 254 required=_extend_dict_dict_({**self.required}, required or {}), 255 )
Dataclass of information on how to sanatize a phml tree.
strip (list[str])
: The elements to strip from the tree.
protocols (dict[str, list])
: Collection of element name and allowed protocal value list
tag_names (list[str])
: List of allowed tag names.
attributes (dict[str, list[str | list[str]]])
: Collection of element name and allowed property
names.
required (dict[str, dict[str, str | bool]])
: Collection of element names and their required
properties and required property values.
225 def extend( 226 self, 227 strip: list[str] | None = None, 228 ancestors: dict[str, list[str]] | None = None, 229 protocols: dict[str, list[str]] | None = None, 230 tag_names: list[str] | None = None, 231 attributes: dict[str, list[str | tuple[str|bool, ...]]] | None = None, 232 required: dict[str, dict[str, str | bool]] | None = None, 233 ) -> Schema: 234 """Extend the default schemas values. 235 236 Args: 237 `strip (list[str])`: The elements to strip from the tree. 238 `ancestors (dict[str, list[str]])`: Key is a element tag and the value is a list of valid 239 parent elements. 240 `protocols (dict[str, list[str]])`: Collection of element names to list of valid protocols (prefixes). 241 `tag_names (list[str])`: List of allowed tag names. 242 `attributes (dict[str, list[str | list[str]]])`: Collection of element name and allowed property 243 names. 244 `required (dict[str, dict[str, str | bool]])`: Collection of element names and their required 245 properties and required property values. 246 """ 247 248 return Schema( 249 strip=list(set([*self.strip, *(strip or [])])), 250 ancestors=_extend_dict_list_({**self.ancestors}, ancestors or {}), 251 protocols=_extend_dict_list_({**self.protocols}, protocols or {}), 252 attributes=_extend_dict_list_({**self.attributes}, attributes or {}), 253 tag_names=list(set([*self.tag_names, *(tag_names or [])])), 254 required=_extend_dict_dict_({**self.required}, required or {}), 255 )
Extend the default schemas values.
Args
strip (list[str])
: The elements to strip from the tree.ancestors (dict[str, list[str]])
: Key is a element tag and the value is a list of valid parent elements.protocols (dict[str, list[str]])
: Collection of element names to list of valid protocols (prefixes).tag_names (list[str])
: List of allowed tag names.attributes (dict[str, list[str | list[str]]])
: Collection of element name and allowed property names.required (dict[str, dict[str, str | bool]])
: Collection of element names and their required properties and required property values.
66def recurse_check_attributes(node: Node, schema: Schema): 67 if isinstance(node, Element): 68 if node.tag in schema.attributes: 69 pop_attrs = build_remove_attr_list( 70 node.attributes, 71 { 72 str(attr[0]): attr[1:] 73 for attr in ( 74 schema.attributes[node.tag] 75 + schema.attributes.get("*", []) 76 ) 77 if isinstance(attr, tuple) 78 }, 79 [ 80 attr if isinstance(attr, str) else attr[0] 81 for attr in ( 82 schema.attributes[node.tag] 83 + schema.attributes.get("*", []) 84 ) 85 ], 86 schema, 87 ) 88 else: 89 pop_attrs = build_remove_attr_list( 90 node.attributes, 91 { 92 str(attr[0]): attr[1:] 93 for attr in schema.attributes.get("*", []) 94 if isinstance(attr, tuple) 95 }, 96 [ 97 attr if isinstance(attr, str) else attr[0] 98 for attr in schema.attributes.get("*", []) 99 ], 100 schema, 101 ) 102 103 for attribute in pop_attrs: 104 node.pop(attribute, None) 105 106 if isinstance(node, Parent): 107 for child in node: 108 recurse_check_attributes(child, schema)
110def recurse_check_required(node: Parent, schema: Schema): 111 for child in node: 112 if isinstance(child, Element) and child.tag in schema.required: 113 for attr, value in schema.required[child.tag].items(): 114 if attr not in child.attributes: 115 child[attr] = value 116 elif isinstance(value, bool): 117 child[attr] = str(value).lower() 118 elif isinstance(value, str) and child[attr] != value: 119 child[attr] = value 120 elif isinstance(child, Element): 121 recurse_check_required(child, schema)
18def recurse_check_ancestor(node: Parent, schema: Schema): 19 for child in list(node): 20 if ( 21 isinstance(child, Element) 22 and child.tag in schema.ancestors 23 and ( 24 not isinstance(child.parent, Element) 25 or child.parent.tag not in schema.ancestors[child.tag] 26 ) 27 ): 28 node.remove(child) 29 elif isinstance(child, Element): 30 recurse_check_ancestor(child, schema)