Coverage for tests/parsers/test_struct.py: 100%
231 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-04 15:55 +0100
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-04 15:55 +0100
1"""Tests for the `certus.parsers.struct` module."""
3import itertools
4import json
5import re
6import string
7import typing
8from unittest import mock
10import hypothesis as hyp
11import hypothesis.strategies as st
12import pytest
14from certus.parsers import struct
16from . import common
# Type variable for whatever a caller-supplied data strategy produces.
D = typing.TypeVar("D")

# JSON leaf values. NaN is excluded from the floats.
ST_PRIMITIVES = st.one_of(
    st.none(),
    st.booleans(),
    st.integers(),
    st.floats(allow_nan=False),
    common.ST_STRINGS,
)

# Non-empty, flat containers of leaf values.
ST_PRIMITIVE_LISTS = st.lists(ST_PRIMITIVES, min_size=1)
# Keys are built from lowercase ASCII letters and underscores only.
ST_KEYS = st.text(string.ascii_lowercase + "_")
ST_PRIMITIVE_DICTS = st.dictionaries(ST_KEYS, ST_PRIMITIVES, min_size=1)

# Arbitrarily nested JSON: leaves are primitives, and every container
# (list or dictionary) holds at least one child.
ST_JSON_DATA = st.recursive(
    ST_PRIMITIVES,
    lambda children: st.one_of(
        st.lists(children, min_size=1),
        st.dictionaries(ST_KEYS, children, min_size=1),
    ),
    max_leaves=50,
)
@st.composite
def st_tokenise_string(draw: st.DrawFn, string: str, start: int = 0) -> list[struct.nodes.Token]:
    """
    Split a string into consecutive tokens of random length.

    Each token takes at least one character from the front of what is
    left of `string`, carries a `logprob` drawn from
    `common.ST_LOGPROBS`, and records its character position counted
    from `start`. An empty string gives an empty list.
    """
    pieces = []
    consumed = 0
    while consumed < len(string):
        # Draw the width first and the log-probability second.
        width = draw(st.integers(1, len(string) - consumed))
        logprob = draw(common.ST_LOGPROBS)
        pieces.append(
            struct.nodes.Token(
                value=string[consumed : consumed + width],
                logprob=logprob,
                start=start + consumed,
            )
        )
        consumed += width

    return pieces
@st.composite
def st_span_lists(
    draw: st.DrawFn, tokens: list[struct.nodes.Token], num: int
) -> list[tuple[int, int]]:
    """
    Create a list of adjacent span indices for a test.

    Draws `num` distinct boundaries between 0 and `len(tokens)`
    inclusive, sorts them, and pairs up neighbours. The result holds
    `num - 1` spans, each starting where the previous one ended.
    """
    boundary = st.integers(0, len(tokens))
    drawn = draw(st.lists(boundary, min_size=num, max_size=num, unique=True))
    boundaries = sorted(drawn)

    return list(zip(boundaries, boundaries[1:]))
@st.composite
def st_data_span_params(
    draw: st.DrawFn, data_strategy: st.SearchStrategy[D]
) -> tuple[D, list[struct.nodes.Token], list[tuple[int, int]]]:
    """
    Create some data, its token list, and some spans for a test.

    The tokens come from tokenising the JSON dump of the data. For a
    dictionary or list we produce one span for the container plus one
    per entry, and reject examples with too few tokens to do so. For
    anything else we produce a single span.
    """
    data = draw(data_strategy)
    tokens = draw(st_tokenise_string(json.dumps(data)))

    # `None` marks data that is not a container.
    size = len(data) if isinstance(data, (dict, list)) else None
    if size is not None:
        hyp.assume(len(tokens) > size)

    # N boundaries give N - 1 spans.
    num_boundaries = 2 if size is None else size + 2
    spans = draw(st_span_lists(tokens, num_boundaries))

    return data, tokens, spans
def _check_parsed_primitive_class(element, tokens, start, end):
    """
    Check a parsed primitive is the right node type for its span.

    A span covering several tokens should be a composite of those
    tokens; otherwise the element should be the first token itself.
    """
    covered = tokens[start:end]
    expected = covered[0] if len(covered) <= 1 else struct.nodes.Composite(children=covered)

    assert element == expected
87def _check_find_token_span(data, tokens, spans, find_mock, kw_mock):
88 """Check that the token span finder mock is called correctly."""
89 calls = find_mock.call_args_list
91 assert len(calls) == len(data) + 1
92 assert calls.pop(0) == mock.call(data, tokens, kw_mock, 0)
94 data_values = data.values() if isinstance(data, dict) else data
95 start = spans[0][0]
96 for call, value, span in zip(calls, data_values, spans[1:]):
97 assert call == mock.call(value, tokens, kw_mock, start)
98 start = span[1]
@hyp.given(ST_JSON_DATA, common.st_token_lists())
def test_parse_json_main(data, tokens):
    """
    Check the core JSON parser runs as it should.

    The private parser is mocked to return a node and an end marker.
    The public parser should pass its arguments through unchanged and
    return only the node.
    """
    dumps_kw = mock.Mock()
    node = mock.Mock()
    patcher = mock.patch.object(struct, "_parse_json", return_value=(node, mock.Mock()))

    with patcher as inner_parser:
        result = struct.parse_json(data, tokens, dumps_kw)

    assert result is node
    inner_parser.assert_called_once_with(data, tokens, dumps_kw)
@hyp.given(ST_JSON_DATA, common.st_token_lists())
def test_parse_json_main_dumps_kw_none_becomes_empty_dict(data, tokens):
    """Check the private parser receives `{}` when `dumps_kw=None`."""
    patcher = mock.patch.object(struct, "_parse_json", return_value=(mock.Mock(), mock.Mock()))

    with patcher as inner_parser:
        struct.parse_json(data, tokens, dumps_kw=None)

    inner_parser.assert_called_once_with(data, tokens, {})
@hyp.given(st_data_span_params(ST_PRIMITIVE_DICTS))
def test_parse_json_primitive_dict(params):
    """
    Check the parser runs with a dictionary of primitives.

    The token span finder is mocked to hand back our spans in order:
    the first for the dictionary as a whole, the rest for its entries.
    The result should be an object with the same keys as the data, each
    value's node type should follow from the length of its span, and
    the finder should have been called correctly.
    """
    data, tokens, spans = params
    whole_span, *entry_spans = spans
    dumps_kw = mock.Mock()

    with mock.patch.object(struct, "_find_token_span", side_effect=spans) as span_finder:
        parsed, end = struct._parse_json(data, tokens, dumps_kw)

    assert end == whole_span[1]

    assert isinstance(parsed, struct.nodes.Object)
    assert list(parsed.keys()) == list(data.keys())
    for element, (first, last) in zip(parsed.values(), entry_spans):
        _check_parsed_primitive_class(element, tokens, first, last)

    _check_find_token_span(data, tokens, spans, span_finder, dumps_kw)
@hyp.given(st_data_span_params(ST_PRIMITIVE_LISTS))
def test_parse_json_primitive_list(params):
    """
    Check the parser runs with a list of primitives.

    The token span finder is mocked to hand back our spans in order:
    the first for the list as a whole, the rest for its elements. The
    result should be an array as long as the data, each element's node
    type should follow from the length of its span, and the finder
    should have been called correctly.
    """
    data, tokens, spans = params
    whole_span, *element_spans = spans
    dumps_kw = mock.Mock()

    with mock.patch.object(struct, "_find_token_span", side_effect=spans) as span_finder:
        parsed, end = struct._parse_json(data, tokens, dumps_kw)

    assert end == whole_span[1]

    assert isinstance(parsed, struct.nodes.Array)
    assert len(parsed) == len(data)
    for element, (first, last) in zip(parsed, element_spans):
        _check_parsed_primitive_class(element, tokens, first, last)

    _check_find_token_span(data, tokens, spans, span_finder, dumps_kw)
@hyp.given(st_data_span_params(ST_PRIMITIVES))
def test_parse_json_primitive(params):
    """
    Check the parser runs with a primitive.

    The token span finder is mocked to return the one span we provide.
    The returned end index should be the end of that span, the node
    type should follow from the span's length, and the finder should
    have been called exactly once from offset zero.
    """
    data, tokens, spans = params
    dumps_kw = mock.Mock()

    assert len(spans) == 1
    span = spans[0]
    first, last = span

    with mock.patch.object(struct, "_find_token_span", return_value=span) as span_finder:
        parsed, end = struct._parse_json(data, tokens, dumps_kw)

    assert end == last

    assert isinstance(parsed, (struct.nodes.Composite, struct.nodes.Token))
    _check_parsed_primitive_class(parsed, tokens, first, last)

    span_finder.assert_called_once_with(data, tokens, dumps_kw, 0)
def test_parse_json_raises_for_invalid_json():
    """
    Check the parser raises an error for anything other than JSON.

    An instance of a throwaway class should be rejected with a
    `ValueError` naming it, before the span finder is ever consulted.
    """

    class NotJSON:
        pass

    tokens = mock.Mock()
    dumps_kw = mock.Mock()

    with mock.patch.object(struct, "_find_token_span") as span_finder:
        with pytest.raises(ValueError, match=r"Invalid JSON data:.*NotJSON"):
            struct._parse_json(NotJSON(), tokens, dumps_kw)  # pyright: ignore[reportArgumentType]

    span_finder.assert_not_called()
@hyp.given(ST_JSON_DATA, common.st_token_lists(), ST_PRIMITIVE_DICTS, st.data())
def test_find_token_span_match(data, tokens, dumps_kw, extra):
    """
    Check the span-finder runs if there is a match.

    The regex builder, the regex search, and the start and end finders
    are all mocked. We check the finder returns the mocked start and
    end, and that it searches the text of the tokens from the offset
    onwards with the built pattern.
    """
    index = st.integers(0, len(tokens))
    offset, start, end = extra.draw(st.tuples(index, index, index).map(sorted))
    searched_text = "".join(token.value for token in tokens[offset:])

    with (
        mock.patch.object(struct, "_make_regex_from_json") as regex_maker,
        mock.patch.object(struct, "_find_span_start", return_value=start) as start_finder,
        mock.patch.object(struct, "_find_span_end", return_value=end) as end_finder,
        mock.patch.object(struct.re, "search") as searcher,
    ):
        found = struct._find_token_span(data, tokens, dumps_kw, offset)

    assert found == (start, end)

    pattern = regex_maker.return_value
    regex_maker.assert_called_once_with(data, dumps_kw)
    searcher.assert_called_once_with(pattern, searched_text, re.DOTALL)
    start_finder.assert_called_once_with(tokens, searcher.return_value, offset)
    end_finder.assert_called_once_with(tokens, pattern, start)
@hyp.given(ST_JSON_DATA, common.st_token_lists(), ST_PRIMITIVE_DICTS)
def test_find_token_span_no_match(data, tokens, dumps_kw):
    """
    Check the span-finder raises an error if there is no match.

    The regex search is mocked to return `None`. The pattern should
    still be built and searched for once over the full token text, but
    neither the start finder nor the end finder should be called.
    """
    full_text = "".join(token.value for token in tokens)

    with (
        mock.patch.object(struct, "_make_regex_from_json") as regex_maker,
        mock.patch.object(struct, "_find_span_start") as start_finder,
        mock.patch.object(struct, "_find_span_end") as end_finder,
        mock.patch.object(struct.re, "search", return_value=None) as searcher,
        pytest.raises(RuntimeError),
    ):
        struct._find_token_span(data, tokens, dumps_kw, 0)

    regex_maker.assert_called_once_with(data, dumps_kw)
    searcher.assert_called_once_with(regex_maker.return_value, full_text, re.DOTALL)
    start_finder.assert_not_called()
    end_finder.assert_not_called()
@hyp.given(ST_PRIMITIVE_DICTS.filter(len))
def test_make_regex_from_json_dict(data):
    """
    Check the regex builder works for a dictionary.

    The pattern should be a compilable string that is wrapped in
    escaped braces with optional whitespace inside them, and that fully
    matches the JSON dump of the data.
    """
    dumped = json.dumps(data)
    pattern = struct._make_regex_from_json(data, {})

    assert isinstance(pattern, str)
    assert re.compile(pattern)
    assert pattern.startswith("\\{\\s*")
    assert pattern.endswith("\\s*\\}")
    assert re.fullmatch(pattern, dumped) is not None
@hyp.given(ST_PRIMITIVE_LISTS.filter(len))
def test_make_regex_from_json_list(data):
    """
    Check the regex builder works for a list.

    The pattern should be a compilable string that is wrapped in
    escaped square brackets with optional whitespace inside them, and
    that fully matches the JSON dump of the data.
    """
    dumped = json.dumps(data)
    pattern = struct._make_regex_from_json(data, {})

    assert isinstance(pattern, str)
    assert re.compile(pattern)
    assert pattern.startswith("\\[\\s*")
    assert pattern.endswith("\\s*\\]")
    assert re.fullmatch(pattern, dumped) is not None
@hyp.given(ST_PRIMITIVES)
def test_make_regex_from_json_primitive(data):
    """
    Check the regex builder works for a primitive.

    The pattern should be a compilable string that fully matches the
    JSON dump of the data.
    """
    dumped = json.dumps(data)
    pattern = struct._make_regex_from_json(data, {})

    assert isinstance(pattern, str)
    assert re.compile(pattern)
    assert re.fullmatch(pattern, dumped) is not None
@hyp.given(ST_JSON_DATA)
def test_make_regex_from_json_recursive(data):
    """
    Check the regex builder works for nested JSON data.

    As well as matching the JSON dump of the data, every escaped
    opening bracket in the pattern should be followed by an optional
    whitespace block, and every escaped closing bracket should be
    preceded by one.
    """
    pattern = struct._make_regex_from_json(data, {})

    def spans_of(regex):
        """Collect the spans of every hit for a regex in the pattern."""
        return {match.span() for match in re.finditer(regex, pattern)}

    assert isinstance(pattern, str)
    assert re.compile(pattern)
    assert re.fullmatch(pattern, json.dumps(data)) is not None

    all_openers = spans_of(r"\\[\{\[]")
    spaced_openers = spans_of(r"\\[\{\[](?=\\s\*)")
    assert spaced_openers == all_openers

    all_closers = spans_of(r"\\[\}\]]")
    spaced_closers = spans_of(r"(?<=\\s\*)\\[\}\]]")
    assert spaced_closers == all_closers
@st.composite
def st_dicts_with_spaces(draw: st.DrawFn) -> dict[str, struct.JSONPrimitiveType]:
    """
    Create a dictionary with multi-space blocks in its elements.

    Every key, and every value that is a string, gets a run of two or
    more spaces appended. Other values are left as they are.
    """
    padding = st.text(" ", min_size=2)
    base = draw(ST_PRIMITIVE_DICTS)

    padded = {}
    for key, value in base.items():
        # Pad the key first and then the value, one entry at a time.
        padded_key = key + draw(padding)
        padded[padded_key] = value + draw(padding) if isinstance(value, str) else value

    return padded
@hyp.given(st_dicts_with_spaces())
def test_make_regex_from_json_multispaces_only_in_strings(data):
    """
    Check that any multi-space blocks are inside string literals.

    The data is a JSON object whose keys and string values end in
    contiguous spaces. Each run of two or more escaped spaces in the
    pattern should sit inside exactly one quoted string literal.
    """
    pattern = struct._make_regex_from_json(data, {})

    space_runs = {hit.span() for hit in re.finditer(r"(\\ ){2,}", pattern)}
    literals = {hit.span() for hit in re.finditer(r'("(?:[^"\\]|\\.)*")', pattern)}

    for run_start, run_end in space_runs:
        enclosing = [
            (lit_start, lit_end)
            for lit_start, lit_end in literals
            if lit_start <= run_start and run_end <= lit_end
        ]
        assert len(enclosing) == 1
@hyp.given(
    ST_JSON_DATA, st.just({}) | st.fixed_dictionaries({"indent": st.sampled_from([0, 1, 2, 4])})
)
def test_make_regex_from_json_handles_indent(data, dumps_kw):
    """
    Check that an indent keyword is always passed to the dumper.

    The dumper should receive the indent from `dumps_kw` if there is
    one, and an indent of 1 otherwise.
    """
    expected_indent = dumps_kw["indent"] if "indent" in dumps_kw else 1

    # The real dumper is kept as the side effect so the builder still
    # gets a genuine dump back.
    spy = mock.patch.object(struct.json, "dumps", side_effect=json.dumps)
    with spy as dumper:
        struct._make_regex_from_json(data, dumps_kw)

    dumper.assert_called_once_with(data, indent=expected_indent)
@st.composite
def st_span_start_params(
    draw: st.DrawFn,
) -> tuple[typing.Sequence[struct.nodes.Token], int, int, int]:
    """
    Create tokens, a start, and offsets for a span-start test.

    We return the tokens, the index of the token where the span should
    start, a token offset no later than that index, and a character
    position inside the starting token counted from the offset token.
    """
    tokens = draw(common.st_token_lists(min_size=2))
    start = draw(st.integers(0, len(tokens) - 1))
    offset = draw(st.integers(0, start))

    # Characters between the offset and the starting token.
    chars_before = sum(len(token.value) for token in tokens[offset:start])
    chars_into_token = draw(st.integers(0, len(tokens[start].value) - 1))

    return tokens, start, offset, chars_before + chars_into_token
@hyp.given(st_span_start_params())
def test_find_span_start_success(params):
    """
    Check the start-finder can exit successfully.

    The parameters hold a list of tokens, the expected starting index,
    a token offset no later than that index, and a character start that
    falls inside the starting token when counted from the offset. The
    regex match is mocked to report that character start.
    """
    tokens, expected_idx, offset, char_start = params
    search = mock.Mock(**{"start.return_value": char_start})

    found_idx = struct._find_span_start(tokens, search, offset)

    assert found_idx == expected_idx
    search.start.assert_called_once_with()
@hyp.given(common.st_token_lists(), st.data())
def test_find_span_start_failure(tokens, extra):
    """
    Check the start-finder raises an error if it does not exit.

    The regex match is mocked to report a character start that is one
    more than the total length of all the token values.
    """
    offset = extra.draw(st.integers(0, len(tokens) - 1))

    total_chars = sum(len(token.value) for token in tokens)
    search = mock.Mock(**{"start.return_value": total_chars + 1})

    with pytest.raises(RuntimeError, match="Unable to find start index"):
        struct._find_span_start(tokens, search, offset)

    search.start.assert_called_once_with()
@st.composite
def st_span_end_params(
    draw: st.DrawFn,
) -> tuple[typing.Sequence[struct.nodes.Token], str, int, int]:
    """
    Create tokens, a pattern, and indices for a span-end test.

    The span runs from `start` up to `end`, with `end` strictly greater
    than `start`. The pattern is the escaped text of the tokens in the
    span.
    """
    tokens = draw(common.st_token_lists(min_size=2))

    num_tokens = len(tokens)
    start = draw(st.integers(0, num_tokens - 2))
    end = draw(st.integers(start + 1, num_tokens))

    span_text = "".join(token.value for token in tokens[start:end])

    return tokens, re.escape(span_text), start, end
@hyp.given(st_span_end_params())
def test_find_span_end_success(params):
    """
    Check the end-finder can exit successfully.

    The parameters hold a list of tokens, a start index, the expected
    end index, and a regular expression matching the joined text of the
    tokens between the two indices.
    """
    tokens, pattern, start, expected_end = params

    found_end = struct._find_span_end(tokens, pattern, start)

    assert found_end == expected_end
@hyp.given(common.st_token_lists(), st.data())
def test_find_span_end_failure(tokens, extra):
    """
    Check the end-finder raises an error if it does not exit.

    The regex searcher is mocked to always fail. It should have been
    tried once for each token from the start index onwards, each time
    with the text accumulated from the start index up to that token.
    """
    pattern = mock.Mock()
    start = extra.draw(st.integers(0, len(tokens)))

    with (
        mock.patch.object(struct.re, "search", return_value=None) as searcher,
        pytest.raises(RuntimeError),
    ):
        struct._find_span_end(tokens, pattern, start)

    # Build the growing prefixes by hand, one per token.
    expected_calls = []
    prefix = ""
    for token in tokens[start:]:
        prefix += token.value
        expected_calls.append(mock.call(pattern, prefix, re.DOTALL))

    assert searcher.call_args_list == expected_calls