phml.utilities.transform.sanitize

phml.utilities.transform.sanitize

Logic for sanitizing a phml AST.

 1"""phml.utilities.transform.sanatize
 2
 3Logic for sanatizing a phml ast.
 4"""
 5from .clean import (
 6    sanatize,
 7    recurse_strip,
 8    recurse_check_tag,
 9    recurse_check_ancestor,
10    recurse_check_required,
11    recurse_check_attributes,
12)
13from .schema import Schema
14
15__all__ = [
16    "sanatize",
17    "Schema",
18    "recurse_check_attributes",
19    "recurse_check_required",
20    "recurse_strip",
21    "recurse_check_tag",
22    "recurse_check_ancestor"
23]
def sanatize( tree: phml.nodes.Parent, schema: phml.utilities.transform.sanitize.Schema = Schema(strip=['script'], ancestors={'tbody': ['table'], 'tfoot': ['table'], 'thead': ['table'], 'td': ['table'], 'th': ['table'], 'tr': ['table']}, protocols={'href': ['http', 'https', 'mailto', 'xmpp', 'irc', 'ircs'], 'cite': ['http', 'https'], 'src': ['http', 'https'], 'longDesc': ['http', 'https']}, tag_names=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'b', 'i', 'strong', 'em', 'a', 'pre', 'code', 'img', 'tt', 'div', 'ins', 'del', 'sup', 'sub', 'p', 'ol', 'ul', 'table', 'thead', 'tbody', 'tfoot', 'blockquote', 'dl', 'dt', 'dd', 'kbd', 'q', 'samp', 'var', 'hr', 'ruby', 'rt', 'rp', 'li', 'tr', 'td', 'th', 's', 'strike', 'summary', 'details', 'caption', 'figure', 'figcaption', 'abbr', 'bdo', 'cite', 'dfn', 'mark', 'small', 'span', 'time', 'wbr', 'input'], attributes={'a': ['href'], 'article': ['class'], 'img': ['src', 'longDesc', 'loading'], 'input': [('type', 'checkbox'), ('disabled', True)], 'li': [('class', 'task-list-item')], 'div': ['itemScope', 'itemType'], 'blockquote': ['cite'], 'del': ['cite'], 'ins': ['cite'], 'q': ['cite'], '*': ['abbr', 'accept', 'acceptCharset', 'accessKey', 'action', 'align', 'alt', 'ariaDescribedBy', 'ariaHidden', 'ariaLabel', 'ariaLabelledBy', 'axis', 'border', 'cellPadding', 'cellSpacing', 'char', 'charOff', 'charSet', 'checked', 'clear', 'cols', 'colSpan', 'color', 'compact', 'coords', 'dateTime', 'dir', 'disabled', 'encType', 'htmlFor', 'frame', 'headers', 'height', 'hrefLang', 'hSpace', 'isMap', 'id', 'label', 'lang', 'maxLength', 'media', 'method', 'multiple', 'name', 'noHref', 'noShade', 'noWrap', 'open', 'prompt', 'readOnly', 'rel', 'rev', 'rows', 'rowSpan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'start', 'summary', 'tabIndex', 'target', 'title', 'type', 'useMap', 'vAlign', 'value', 'vSpace', 'width', 'itemProp']}, required={'input': {'type': 'checkbox', 'disabled': True}})):
def sanatize(tree: Parent, schema: Schema = Schema()):
    """Sanitize the elements and attributes of a phml tree.

    Meant for trees built from data of unknown origin. Run it on an AST that
    has already been compiled to html so that no unknown value is left
    unchecked.

    The default schema is the github schema, and the passes follow the hast
    sanitize utility.

    * [github schema](https://github.com/syntax-tree/hast-util-sanitize/blob/main/lib/schema.js)
    * [hast sanatize](https://github.com/syntax-tree/hast-util-sanitize)

    Note:
        The tree is edited in place; nothing is returned.

    Args:
        tree (Parent): The root of the tree to sanitize.
        schema (Schema, optional): User defined schema. Defaults to `Schema()`.
    """

    # Function-scope import (presumably to avoid a circular import -- TODO confirm).
    from phml.utilities import remove_nodes  # pylint: disable=import-outside-toplevel

    # First drop every element whose tag is listed in `schema.strip`.
    for tag_name in schema.strip:
        remove_nodes(tree, ["element", {"tag": tag_name}])

    # The remaining passes run in this fixed order, each mutating `tree`.
    passes = (
        recurse_check_tag,
        recurse_strip,
        recurse_check_ancestor,
        recurse_check_attributes,
        recurse_check_required,
    )
    for run_pass in passes:
        run_pass(tree, schema)

Sanitize elements and attributes in the phml tree. Should be used when working with data from an unknown source. It should be used with an AST that has already been compiled to html, so that no unknown values are left unchecked.

By default the sanitization schema is the github schema, and the process follows the hast sanitize utility.

Note

This utility will edit the tree in place.

Args
  • tree (Parent): The root of the tree that will be sanatized.
  • schema (Schema, optional): User defined schema. Defaults to github schema.
@dataclass
class Schema:
 31@dataclass
 32class Schema:
 33    """Dataclass of information on how to sanatize a phml tree.
 34
 35    `strip (list[str])`: The elements to strip from the tree.
 36    `protocols (dict[str, list])`: Collection of element name and allowed protocal value list
 37    `tag_names (list[str])`: List of allowed tag names.
 38    `attributes (dict[str, list[str | list[str]]])`: Collection of element name and allowed property
 39    names.
 40    `required (dict[str, dict[str, str | bool]])`: Collection of element names and their required
 41    properties and required property values.
 42    """
 43
 44    strip: list[str] = field(default_factory=lambda: ["script"])
 45    ancestors: dict[str, list] = field(
 46        default_factory=lambda: {
 47            "tbody": ["table"],
 48            "tfoot": ["table"],
 49            "thead": ["table"],
 50            "td": ["table"],
 51            "th": ["table"],
 52            "tr": ["table"],
 53        },
 54    )
 55    protocols: dict[str, list] = field(
 56        default_factory=lambda: {
 57            "href": ["http", "https", "mailto", "xmpp", "irc", "ircs"],
 58            "cite": ["http", "https"],
 59            "src": ["http", "https"],
 60            "longDesc": ["http", "https"],
 61        },
 62    )
 63    tag_names: list[str] = field(
 64        default_factory=lambda: [
 65            "h1",
 66            "h2",
 67            "h3",
 68            "h4",
 69            "h5",
 70            "h6",
 71            "br",
 72            "b",
 73            "i",
 74            "strong",
 75            "em",
 76            "a",
 77            "pre",
 78            "code",
 79            "img",
 80            "tt",
 81            "div",
 82            "ins",
 83            "del",
 84            "sup",
 85            "sub",
 86            "p",
 87            "ol",
 88            "ul",
 89            "table",
 90            "thead",
 91            "tbody",
 92            "tfoot",
 93            "blockquote",
 94            "dl",
 95            "dt",
 96            "dd",
 97            "kbd",
 98            "q",
 99            "samp",
100            "var",
101            "hr",
102            "ruby",
103            "rt",
104            "rp",
105            "li",
106            "tr",
107            "td",
108            "th",
109            "s",
110            "strike",
111            "summary",
112            "details",
113            "caption",
114            "figure",
115            "figcaption",
116            "abbr",
117            "bdo",
118            "cite",
119            "dfn",
120            "mark",
121            "small",
122            "span",
123            "time",
124            "wbr",
125            "input",
126        ],
127    )
128    attributes: dict[str, list[str | tuple[str|bool, ...]]] = field(
129        default_factory=lambda: {
130            "a": ["href"],
131            "article": ["class"],
132            "img": ["src", "longDesc", "loading"],
133            "input": [("type", "checkbox"), ("disabled", True)],
134            "li": [("class", "task-list-item")],
135            "div": ["itemScope", "itemType"],
136            "blockquote": ["cite"],
137            "del": ["cite"],
138            "ins": ["cite"],
139            "q": ["cite"],
140            "*": [
141                "abbr",
142                "accept",
143                "acceptCharset",
144                "accessKey",
145                "action",
146                "align",
147                "alt",
148                "ariaDescribedBy",
149                "ariaHidden",
150                "ariaLabel",
151                "ariaLabelledBy",
152                "axis",
153                "border",
154                "cellPadding",
155                "cellSpacing",
156                "char",
157                "charOff",
158                "charSet",
159                "checked",
160                "clear",
161                "cols",
162                "colSpan",
163                "color",
164                "compact",
165                "coords",
166                "dateTime",
167                "dir",
168                "disabled",
169                "encType",
170                "htmlFor",
171                "frame",
172                "headers",
173                "height",
174                "hrefLang",
175                "hSpace",
176                "isMap",
177                "id",
178                "label",
179                "lang",
180                "maxLength",
181                "media",
182                "method",
183                "multiple",
184                "name",
185                "noHref",
186                "noShade",
187                "noWrap",
188                "open",
189                "prompt",
190                "readOnly",
191                "rel",
192                "rev",
193                "rows",
194                "rowSpan",
195                "rules",
196                "scope",
197                "selected",
198                "shape",
199                "size",
200                "span",
201                "start",
202                "summary",
203                "tabIndex",
204                "target",
205                "title",
206                "type",
207                "useMap",
208                "vAlign",
209                "value",
210                "vSpace",
211                "width",
212                "itemProp",
213            ],
214        },
215    )
216    required: dict[str, dict[str, str | bool]] = field(
217        default_factory=lambda: {
218            "input": {
219                "type": "checkbox",
220                "disabled": True,
221            },
222        },
223    )
224
225    def extend(
226        self,
227        strip: list[str] | None = None,
228        ancestors: dict[str, list[str]] | None = None,
229        protocols: dict[str, list[str]] | None = None,
230        tag_names: list[str] | None = None,
231        attributes: dict[str, list[str | tuple[str|bool, ...]]] | None = None,
232        required: dict[str, dict[str, str | bool]] | None = None,
233    ) -> Schema:
234        """Extend the default schemas values.
235
236        Args:
237            `strip (list[str])`: The elements to strip from the tree.
238            `ancestors (dict[str, list[str]])`: Key is a element tag and the value is a list of valid
239                parent elements.
240            `protocols (dict[str, list[str]])`: Collection of element names to list of valid protocols (prefixes).
241            `tag_names (list[str])`: List of allowed tag names.
242            `attributes (dict[str, list[str | list[str]]])`: Collection of element name and allowed property
243                names.
244            `required (dict[str, dict[str, str | bool]])`: Collection of element names and their required
245                properties and required property values.
246        """
247
248        return Schema(
249            strip=list(set([*self.strip, *(strip or [])])),
250            ancestors=_extend_dict_list_({**self.ancestors}, ancestors or {}),
251            protocols=_extend_dict_list_({**self.protocols}, protocols or {}),
252            attributes=_extend_dict_list_({**self.attributes}, attributes or {}),
253            tag_names=list(set([*self.tag_names, *(tag_names or [])])),
254            required=_extend_dict_dict_({**self.required}, required or {}),
255        )

Dataclass of information on how to sanitize a phml tree.

strip (list[str]): The elements to strip from the tree. protocols (dict[str, list]): Collection of element name and allowed protocol value list. tag_names (list[str]): List of allowed tag names. attributes (dict[str, list[str | list[str]]]): Collection of element name and allowed property names. required (dict[str, dict[str, str | bool]]): Collection of element names and their required properties and required property values.

Schema( strip: list[str] = <factory>, ancestors: dict[str, list] = <factory>, protocols: dict[str, list] = <factory>, tag_names: list[str] = <factory>, attributes: dict[str, list[str | tuple[str | bool, ...]]] = <factory>, required: dict[str, dict[str, str | bool]] = <factory>)
def extend( self, strip: list[str] | None = None, ancestors: dict[str, list[str]] | None = None, protocols: dict[str, list[str]] | None = None, tag_names: list[str] | None = None, attributes: dict[str, list[str | tuple[str | bool, ...]]] | None = None, required: dict[str, dict[str, str | bool]] | None = None) -> phml.utilities.transform.sanitize.Schema:
225    def extend(
226        self,
227        strip: list[str] | None = None,
228        ancestors: dict[str, list[str]] | None = None,
229        protocols: dict[str, list[str]] | None = None,
230        tag_names: list[str] | None = None,
231        attributes: dict[str, list[str | tuple[str|bool, ...]]] | None = None,
232        required: dict[str, dict[str, str | bool]] | None = None,
233    ) -> Schema:
234        """Extend the default schemas values.
235
236        Args:
237            `strip (list[str])`: The elements to strip from the tree.
238            `ancestors (dict[str, list[str]])`: Key is a element tag and the value is a list of valid
239                parent elements.
240            `protocols (dict[str, list[str]])`: Collection of element names to list of valid protocols (prefixes).
241            `tag_names (list[str])`: List of allowed tag names.
242            `attributes (dict[str, list[str | list[str]]])`: Collection of element name and allowed property
243                names.
244            `required (dict[str, dict[str, str | bool]])`: Collection of element names and their required
245                properties and required property values.
246        """
247
248        return Schema(
249            strip=list(set([*self.strip, *(strip or [])])),
250            ancestors=_extend_dict_list_({**self.ancestors}, ancestors or {}),
251            protocols=_extend_dict_list_({**self.protocols}, protocols or {}),
252            attributes=_extend_dict_list_({**self.attributes}, attributes or {}),
253            tag_names=list(set([*self.tag_names, *(tag_names or [])])),
254            required=_extend_dict_dict_({**self.required}, required or {}),
255        )

Extend the default schema's values.

Args
  • strip (list[str]): The elements to strip from the tree.
  • ancestors (dict[str, list[str]]): Key is an element tag and the value is a list of valid parent elements.
  • protocols (dict[str, list[str]]): Collection of element names to list of valid protocols (prefixes).
  • tag_names (list[str]): List of allowed tag names.
  • attributes (dict[str, list[str | list[str]]]): Collection of element name and allowed property names.
  • required (dict[str, dict[str, str | bool]]): Collection of element names and their required properties and required property values.
def recurse_check_attributes( node: phml.nodes.Node, schema: phml.utilities.transform.sanitize.Schema):
 66def recurse_check_attributes(node: Node, schema: Schema):
 67    if isinstance(node, Element):
 68        if node.tag in schema.attributes:
 69            pop_attrs = build_remove_attr_list(
 70                node.attributes,
 71                {
 72                    str(attr[0]): attr[1:]
 73                    for attr in (
 74                        schema.attributes[node.tag]
 75                        + schema.attributes.get("*", [])
 76                    )
 77                    if isinstance(attr, tuple)
 78                },
 79                [
 80                    attr if isinstance(attr, str) else attr[0]
 81                    for attr in (
 82                        schema.attributes[node.tag]
 83                        + schema.attributes.get("*", [])
 84                    )
 85                ],
 86                schema,
 87            )
 88        else:
 89            pop_attrs = build_remove_attr_list(
 90                node.attributes,
 91                {
 92                    str(attr[0]): attr[1:]
 93                    for attr in schema.attributes.get("*", [])
 94                    if isinstance(attr, tuple)
 95                },
 96                [
 97                    attr if isinstance(attr, str) else attr[0]
 98                    for attr in schema.attributes.get("*", [])
 99                ],
100                schema,
101            )
102
103        for attribute in pop_attrs:
104            node.pop(attribute, None)
105
106    if isinstance(node, Parent):
107        for child in node:
108            recurse_check_attributes(child, schema)
def recurse_check_required( node: phml.nodes.Parent, schema: phml.utilities.transform.sanitize.Schema):
def recurse_check_required(node: Parent, schema: Schema):
    """Enforce the schema's required attributes on every element below `node`.

    For each element child whose tag is in `schema.required`:

    * a missing attribute is set to the required value as given;
    * a present attribute with a boolean requirement is overwritten with the
      lowercase string form of that boolean (`"true"` / `"false"`);
    * a present attribute with a string requirement is overwritten when it
      differs from the required value.

    Every element child is then walked recursively, including the ones that
    have required attributes themselves, so nested elements are not skipped.

    Args:
        node (Parent): The parent whose children are checked; edited in place.
        schema (Schema): The schema holding the `required` rules.
    """
    for child in node:
        if not isinstance(child, Element):
            continue

        for attr, value in schema.required.get(child.tag, {}).items():
            if attr not in child.attributes:
                child[attr] = value
            elif isinstance(value, bool):
                child[attr] = str(value).lower()
            elif isinstance(value, str) and child[attr] != value:
                child[attr] = value

        # Always descend: an element with required attributes may still
        # contain other elements that have requirements of their own.
        recurse_check_required(child, schema)
def recurse_strip(node, schema: phml.utilities.transform.sanitize.Schema):
def recurse_strip(node, schema: Schema):
    """Remove every element below `node` that matches `schema.strip`.

    A matching element is removed from its parent together with its subtree;
    any other child that is a `Parent` is walked recursively.

    Args:
        node: The parent whose children are checked; edited in place.
        schema (Schema): The schema holding the `strip` list.
    """
    from phml.utilities import is_element

    # Iterate over a snapshot so removals do not disturb the loop.
    for child in tuple(node):
        should_strip = isinstance(child, Element) and is_element(child, schema.strip)
        if should_strip:
            node.remove(child)
            continue
        if isinstance(child, Parent):
            recurse_strip(child, schema)
def recurse_check_tag( node: phml.nodes.Parent, schema: phml.utilities.transform.sanitize.Schema):
 9def recurse_check_tag(node: Parent, schema: Schema):
10    from phml.utilities import is_element
11
12    for child in list(node):
13        if isinstance(child, Element) and not is_element(child, schema.tag_names):
14            node.remove(child)
15        elif isinstance(child, Parent):
16            recurse_check_tag(child, schema)
def recurse_check_ancestor( node: phml.nodes.Parent, schema: phml.utilities.transform.sanitize.Schema):
def _has_allowed_ancestor(element, allowed) -> bool:
    """Return True when any element above `element` has a tag listed in `allowed`."""
    ancestor = getattr(element, "parent", None)
    while ancestor is not None:
        if isinstance(ancestor, Element) and ancestor.tag in allowed:
            return True
        ancestor = getattr(ancestor, "parent", None)
    return False


def recurse_check_ancestor(node: Parent, schema: Schema):
    """Remove elements that are not nested inside one of their required ancestors.

    For each element child whose tag is a key of `schema.ancestors`, the
    chain of `parent` links is followed upwards. The element is kept when any
    element on that chain has a tag in the schema's list for it, and removed
    (with its subtree) otherwise. Checking the whole chain rather than only
    the direct parent keeps, for example, a `tr` inside `tbody` inside
    `table` when the schema asks for a `table` ancestor.

    Elements that are kept are walked recursively.

    Args:
        node (Parent): The parent whose children are checked; edited in place.
        schema (Schema): The schema holding the `ancestors` rules.
    """
    # Iterate over a snapshot so removals do not disturb the loop.
    for child in list(node):
        if not isinstance(child, Element):
            continue

        if child.tag in schema.ancestors and not _has_allowed_ancestor(
            child, schema.ancestors[child.tag]
        ):
            node.remove(child)
        else:
            recurse_check_ancestor(child, schema)