phml.utils.transform.sanitize

phml.utils.transform.sanatize

Logic for sanitizing a phml AST.

1"""phml.utils.transform.sanatize
2
3Logic for sanatizing a phml ast.
4"""
5from .clean import sanatize
6from .schema import Schema
7
8__all__ = ["sanatize", "Schema"]
def sanatize( tree: phml.nodes.AST.AST | phml.nodes.root.Root | phml.nodes.element.Element, schema: Optional[phml.utils.transform.sanitize.Schema] = Schema(strip=['script'], ancestors={'tbody': ['table'], 'tfoot': ['table'], 'thead': ['table'], 'td': ['table'], 'th': ['table'], 'tr': ['table']}, protocols={'href': ['http', 'https', 'mailto', 'xmpp', 'irc', 'ircs'], 'cite': ['http', 'https'], 'src': ['http', 'https'], 'longDesc': ['http', 'https']}, tag_names=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'b', 'i', 'strong', 'em', 'a', 'pre', 'code', 'img', 'tt', 'div', 'ins', 'del', 'sup', 'sub', 'p', 'ol', 'ul', 'table', 'thead', 'tbody', 'tfoot', 'blockquote', 'dl', 'dt', 'dd', 'kbd', 'q', 'samp', 'var', 'hr', 'ruby', 'rt', 'rp', 'li', 'tr', 'td', 'th', 's', 'strike', 'summary', 'details', 'caption', 'figure', 'figcaption', 'abbr', 'bdo', 'cite', 'dfn', 'mark', 'small', 'span', 'time', 'wbr', 'input'], attributes={'a': ['href'], 'img': ['src', 'longDesc'], 'input': [['type', 'checkbox'], ['disabled', True]], 'li': [['class', 'task-list-item']], 'div': ['itemScope', 'itemType'], 'blockquote': ['cite'], 'del': ['cite'], 'ins': ['cite'], 'q': ['cite'], '*': ['abbr', 'accept', 'acceptCharset', 'accessKey', 'action', 'align', 'alt', 'ariaDescribedBy', 'ariaHidden', 'ariaLabel', 'ariaLabelledBy', 'axis', 'border', 'cellPadding', 'cellSpacing', 'char', 'charOff', 'charSet', 'checked', 'clear', 'cols', 'colSpan', 'color', 'compact', 'coords', 'dateTime', 'dir', 'disabled', 'encType', 'htmlFor', 'frame', 'headers', 'height', 'hrefLang', 'hSpace', 'isMap', 'id', 'label', 'lang', 'maxLength', 'media', 'method', 'multiple', 'name', 'noHref', 'noShade', 'noWrap', 'open', 'prompt', 'readOnly', 'rel', 'rev', 'rows', 'rowSpan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'start', 'summary', 'tabIndex', 'target', 'title', 'type', 'useMap', 'vAlign', 'value', 'vSpace', 'width', 'itemProp']}, required={'input': {'type': 'checkbox', 'disabled': True}})):
 11def sanatize(tree: AST | Root | Element, schema: Optional[Schema] = Schema()):
 12    """Sanatize elements and attributes in the phml tree. Should be used when using
 13    data from an unkown source. It should be used with an AST that has already been
 14    compiled to html to no unkown values are unchecked.
 15
 16    By default the sanatization schema uses the github schema and follows the hast
 17    sanatize utility.
 18
 19    * [github schema](https://github.com/syntax-tree/hast-util-sanitize/blob/main/lib/schema.js)
 20    * [hast sanatize](https://github.com/syntax-tree/hast-util-sanitize)
 21
 22    Note:
 23        This utility will edit the tree in place.
 24
 25    Args:
 26        tree (AST | Root | Element): The root of the tree that will be sanatized.
 27        schema (Optional[Schema], optional): User defined schema. Defaults to github schema.
 28    """
 29
 30    from phml.utils import is_element, remove_nodes, test  # pylint: disable=import-outside-toplevel
 31
 32    if isinstance(tree, AST):
 33        src = tree.tree
 34    else:
 35        src = tree
 36
 37    for strip in schema.strip:
 38        remove_nodes(src, ["element", {"tag": strip}])
 39
 40    def recurse_check_tag(node: Root | Element):
 41        pop_els = []
 42        for idx, child in enumerate(node.children):
 43            if test(child, "element") and not is_element(child, schema.tag_names):
 44                pop_els.append(child)
 45            elif test(node.children[idx], "element"):
 46                recurse_check_tag(node.children[idx])
 47
 48        for element in pop_els:
 49            node.children.remove(element)
 50
 51    def recurse_check_ancestor(node: Root | Element):
 52        pop_els = []
 53        for idx, child in enumerate(node.children):
 54            if (
 55                test(child, "element")
 56                and child.tag in schema.ancestors.keys()
 57                and child.parent.tag not in schema.ancestors[child.tag]
 58            ):
 59                pop_els.append(child)
 60            elif test(node.children[idx], "element"):
 61                recurse_check_ancestor(node.children[idx])
 62
 63        for element in pop_els:
 64            node.children.remove(element)
 65
 66    def build_valid_attributes(attributes: list) -> list[str]:
 67        """Extract attributes from schema."""
 68        valid_attrs = []
 69        for attribute in attributes:
 70            valid_attrs = (
 71                [*valid_attrs, attribute]
 72                if isinstance(attribute, str)
 73                else [*valid_attrs, attribute[0]]
 74            )
 75        return valid_attrs
 76
 77    def build_remove_attr_list(properties: dict, attributes: dict, valid_attrs: list):
 78        """Build the list of attributes to remove from a dict of attributes."""
 79        result = []
 80        for attribute in properties:
 81            if attribute not in valid_attrs:
 82                result.append(attribute)
 83            else:
 84                for attr in attributes:
 85                    if bool(
 86                        (isinstance(attr, str) and attr != attribute)
 87                        or (attr[0] == attribute and properties[attribute] not in attr[1:])
 88                        or (
 89                            attribute in schema.protocols
 90                            and not check_protocols(
 91                                properties[attribute], schema.protocols[attribute]
 92                            )
 93                        )
 94                    ):
 95                        result.append(attribute)
 96
 97        return result
 98
 99    def recurse_check_attributes(node: Root | Element):
100        for idx, child in enumerate(node.children):
101            if test(child, "element") and child.tag in schema.attributes.keys():
102                valid_attrs = build_valid_attributes(schema.attributes[child.tag])
103
104                pop_attrs = build_remove_attr_list(
105                    node.children[idx].properties, schema.attributes[child.tag], valid_attrs
106                )
107
108                for attribute in pop_attrs:
109                    node.children[idx].properties.pop(attribute, None)
110
111            elif test(node.children[idx], "element"):
112                recurse_check_attributes(node.children[idx])
113
114    def recurse_check_required(node: Root | Element):
115        for idx, child in enumerate(node.children):
116            if test(child, "element") and child.tag in schema.required.keys():
117                for attr, value in schema.required[child.tag].items():
118                    if attr not in child.properties:
119                        node.children[idx].properties[attr] = value
120
121            elif test(node.children[idx], "element"):
122                recurse_check_required(node.children[idx])
123
124    def check_protocols(value: str, protocols: list[str]):
125        for protocol in protocols:
126            if match(f"{protocol}:.*", value) is not None:
127                return True
128        return False
129
130    recurse_check_tag(src)
131    recurse_check_ancestor(src)
132    recurse_check_attributes(src)
133    recurse_check_required(src)

Sanitize elements and attributes in the phml tree. Should be used when using data from an unknown source. It should be used with an AST that has already been compiled to html so that no unknown values are left unchecked.

By default the sanitization schema uses the GitHub schema and follows the hast sanitize utility.

Note

This utility will edit the tree in place.

Args
  • tree (AST | Root | Element): The root of the tree that will be sanitized.
  • schema (Optional[Schema], optional): User defined schema. Defaults to the GitHub schema.
@dataclass
class Schema:
  6@dataclass
  7class Schema:
  8    """Dataclass of information on how to sanatize a phml tree.
  9
 10    `strip (list[str])`: The elements to strip from the tree.
 11    `protocols (dict[str, list])`: Collection of element name and allowed protocal value list
 12    `tag_names (list[str])`: List of allowed tag names.
 13    `attributes (dict[str, list[str | list[str]]])`: Collection of element name and allowed property
 14    names.
 15    `required (dict[str, str | list[str]])`: Collection of element names and their required
 16    properties and required property values.
 17    """
 18
 19    strip: list[str] = field(default_factory=lambda: ['script'])
 20    ancestors: dict[str, list] = field(
 21        default_factory=lambda: {
 22            "tbody": ['table'],
 23            "tfoot": ['table'],
 24            "thead": ['table'],
 25            "td": ['table'],
 26            "th": ['table'],
 27            "tr": ['table'],
 28        }
 29    )
 30    protocols: dict[str, list] = field(
 31        default_factory=lambda: {
 32            "href": ['http', 'https', 'mailto', 'xmpp', 'irc', 'ircs'],
 33            "cite": ['http', 'https'],
 34            "src": ['http', 'https'],
 35            "longDesc": ['http', 'https'],
 36        }
 37    )
 38    tag_names: list[str] = field(
 39        default_factory=lambda: [
 40            'h1',
 41            'h2',
 42            'h3',
 43            'h4',
 44            'h5',
 45            'h6',
 46            'br',
 47            'b',
 48            'i',
 49            'strong',
 50            'em',
 51            'a',
 52            'pre',
 53            'code',
 54            'img',
 55            'tt',
 56            'div',
 57            'ins',
 58            'del',
 59            'sup',
 60            'sub',
 61            'p',
 62            'ol',
 63            'ul',
 64            'table',
 65            'thead',
 66            'tbody',
 67            'tfoot',
 68            'blockquote',
 69            'dl',
 70            'dt',
 71            'dd',
 72            'kbd',
 73            'q',
 74            'samp',
 75            'var',
 76            'hr',
 77            'ruby',
 78            'rt',
 79            'rp',
 80            'li',
 81            'tr',
 82            'td',
 83            'th',
 84            's',
 85            'strike',
 86            'summary',
 87            'details',
 88            'caption',
 89            'figure',
 90            'figcaption',
 91            'abbr',
 92            'bdo',
 93            'cite',
 94            'dfn',
 95            'mark',
 96            'small',
 97            'span',
 98            'time',
 99            'wbr',
100            'input',
101        ]
102    )
103    attributes: dict[str, list[str | list[str]]] = field(
104        default_factory=lambda: {
105            "a": ['href'],
106            "img": ['src', 'longDesc'],
107            "input": [['type', 'checkbox'], ['disabled', True]],
108            "li": [['class', 'task-list-item']],
109            "div": ['itemScope', 'itemType'],
110            "blockquote": ['cite'],
111            "del": ['cite'],
112            "ins": ['cite'],
113            "q": ['cite'],
114            '*': [
115                'abbr',
116                'accept',
117                'acceptCharset',
118                'accessKey',
119                'action',
120                'align',
121                'alt',
122                'ariaDescribedBy',
123                'ariaHidden',
124                'ariaLabel',
125                'ariaLabelledBy',
126                'axis',
127                'border',
128                'cellPadding',
129                'cellSpacing',
130                'char',
131                'charOff',
132                'charSet',
133                'checked',
134                'clear',
135                'cols',
136                'colSpan',
137                'color',
138                'compact',
139                'coords',
140                'dateTime',
141                'dir',
142                'disabled',
143                'encType',
144                'htmlFor',
145                'frame',
146                'headers',
147                'height',
148                'hrefLang',
149                'hSpace',
150                'isMap',
151                'id',
152                'label',
153                'lang',
154                'maxLength',
155                'media',
156                'method',
157                'multiple',
158                'name',
159                'noHref',
160                'noShade',
161                'noWrap',
162                'open',
163                'prompt',
164                'readOnly',
165                'rel',
166                'rev',
167                'rows',
168                'rowSpan',
169                'rules',
170                'scope',
171                'selected',
172                'shape',
173                'size',
174                'span',
175                'start',
176                'summary',
177                'tabIndex',
178                'target',
179                'title',
180                'type',
181                'useMap',
182                'vAlign',
183                'value',
184                'vSpace',
185                'width',
186                'itemProp',
187            ],
188        }
189    )
190    required: dict[str, str | list[str]] = field(
191        default_factory=lambda: {
192            "input": {
193                "type": 'checkbox',
194                "disabled": True,
195            }
196        }
197    )

Dataclass of information on how to sanitize a phml tree.

strip (list[str]): The elements to strip from the tree. protocols (dict[str, list]): Collection of attribute names and their allowed protocol value lists. tag_names (list[str]): List of allowed tag names. attributes (dict[str, list[str | list[str]]]): Collection of element names and allowed property names. required (dict[str, str | list[str]]): Collection of element names and their required properties and required property values.

Schema( strip: list[str] = <factory>, ancestors: dict[str, list] = <factory>, protocols: dict[str, list] = <factory>, tag_names: list[str] = <factory>, attributes: dict[str, list[str | list[str]]] = <factory>, required: dict[str, str | list[str]] = <factory>)