Coverage for tests/parsers/test_struct.py: 100%

231 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-04 15:55 +0100

1"""Tests for the `certus.parsers.struct` module.""" 

2 

3import itertools 

4import json 

5import re 

6import string 

7import typing 

8from unittest import mock 

9 

10import hypothesis as hyp 

11import hypothesis.strategies as st 

12import pytest 

13 

14from certus.parsers import struct 

15 

16from . import common 

17 

# Type variable for whatever a caller-supplied data strategy produces.
D = typing.TypeVar("D")

# JSON scalars: null, booleans, integers, non-NaN floats, plus whatever
# ``common.ST_STRINGS`` yields.
ST_PRIMITIVES = st.one_of(
    st.none(),
    st.booleans(),
    st.integers(),
    st.floats(allow_nan=False),
    common.ST_STRINGS,
)
# Non-empty flat lists of scalars.
ST_PRIMITIVE_LISTS = st.lists(ST_PRIMITIVES, min_size=1)
# Keys built from lowercase ASCII letters and underscores.
ST_KEYS = st.text(alphabet=string.ascii_lowercase + "_")
# Non-empty flat dictionaries of scalars.
ST_PRIMITIVE_DICTS = st.dictionaries(ST_KEYS, ST_PRIMITIVES, min_size=1)
# Arbitrarily nested data: scalars, or non-empty lists/dictionaries of the same.
ST_JSON_DATA = st.recursive(
    ST_PRIMITIVES,
    lambda kids: st.one_of(
        st.lists(kids, min_size=1),
        st.dictionaries(ST_KEYS, kids, min_size=1),
    ),
    max_leaves=50,
)

31 

32 

@st.composite
def st_tokenise_string(draw: st.DrawFn, string: str, start: int = 0) -> list[struct.nodes.Token]:
    """
    Split a string into consecutive tokens of random, non-zero width.

    Each token holds the chunk of text it covers, a drawn log-probability,
    and the position (counted from `start`) at which its chunk begins.
    """
    pieces = []
    cursor = start
    remaining = string
    while len(remaining) > 0:
        width = draw(st.integers(1, len(remaining)))
        chunk, remaining = remaining[:width], remaining[width:]
        logprob = draw(common.ST_LOGPROBS)
        pieces.append(struct.nodes.Token(value=chunk, logprob=logprob, start=cursor))
        cursor += width

    return pieces

47 

48 

@st.composite
def st_span_lists(
    draw: st.DrawFn, tokens: list[struct.nodes.Token], num: int
) -> list[tuple[int, int]]:
    """
    Draw `num` distinct token boundaries and pair them into spans.

    Boundaries lie between 0 and `len(tokens)` inclusive. Once sorted,
    each neighbouring pair forms one `(start, end)` span, so the result
    holds `num - 1` spans.
    """
    boundaries = st.lists(
        st.integers(0, len(tokens)), min_size=num, max_size=num, unique=True
    )
    ordered = draw(boundaries.map(sorted))

    return [*itertools.pairwise(ordered)]

58 

59 

@st.composite
def st_data_span_params(
    draw: st.DrawFn, data_strategy: st.SearchStrategy[D]
) -> tuple[D, list[struct.nodes.Token], list[tuple[int, int]]]:
    """
    Create some data, a tokenisation of its JSON dump, and spans for a test.

    Dictionaries and lists get one span more than they have items;
    anything else gets a single span.
    """
    data = draw(data_strategy)
    tokens = draw(st_tokenise_string(json.dumps(data)))

    if isinstance(data, (dict, list)):
        # Discard examples with too few tokens to supply distinct boundaries.
        hyp.assume(len(tokens) > len(data))
        num_boundaries = len(data) + 2
    else:
        num_boundaries = 2

    return data, tokens, draw(st_span_lists(tokens, num_boundaries))

75 

76 

77def _check_parsed_primitive_class(element, tokens, start, end): 

78 """Check a parsed primitive is the right node type for its span.""" 

79 span = tokens[start:end] 

80 if len(span) > 1: 

81 assert element == struct.nodes.Composite(children=span) 

82 return 

83 

84 assert element == span[0] 

85 

86 

87def _check_find_token_span(data, tokens, spans, find_mock, kw_mock): 

88 """Check that the token span finder mock is called correctly.""" 

89 calls = find_mock.call_args_list 

90 

91 assert len(calls) == len(data) + 1 

92 assert calls.pop(0) == mock.call(data, tokens, kw_mock, 0) 

93 

94 data_values = data.values() if isinstance(data, dict) else data 

95 start = spans[0][0] 

96 for call, value, span in zip(calls, data_values, spans[1:]): 

97 assert call == mock.call(value, tokens, kw_mock, start) 

98 start = span[1] 

99 

100 

@hyp.given(ST_JSON_DATA, common.st_token_lists())
def test_parse_json_main(data, tokens):
    """Check the main parser delegates to the core parser and returns its node."""
    node = mock.Mock()
    dumps_kw = mock.Mock()
    core_result = (node, mock.Mock())

    with mock.patch.object(struct, "_parse_json", return_value=core_result) as parse_json:
        parsed = struct.parse_json(data, tokens, dumps_kw)

    parse_json.assert_called_once_with(data, tokens, dumps_kw)
    assert parsed is node

110 

111 

@hyp.given(ST_JSON_DATA, common.st_token_lists())
def test_parse_json_main_dumps_kw_none_becomes_empty_dict(data, tokens):
    """Check `dumps_kw=None` reaches the core parser as an empty dictionary."""
    core_result = (mock.Mock(), mock.Mock())
    patcher = mock.patch.object(struct, "_parse_json", return_value=core_result)

    with patcher as parse_json:
        _ = struct.parse_json(data, tokens, dumps_kw=None)

    parse_json.assert_called_once_with(data, tokens, {})

121 

122 

@hyp.given(st_data_span_params(ST_PRIMITIVE_DICTS))
def test_parse_json_primitive_dict(params):
    """
    Check the parser runs with a dictionary of primitives.

    The token span finder is mocked to hand back our pre-drawn spans in
    order: the first for the whole dictionary, then one per entry. We
    check the result is an object with the same keys, that each field
    has the node type implied by the length of its span, and that the
    span finder is called correctly.
    """
    data, tokens, spans = params
    object_span, *field_spans = spans
    dumps_kw = mock.Mock()

    with mock.patch.object(struct, "_find_token_span", side_effect=spans) as find_token_span:
        parsed, end = struct._parse_json(data, tokens, dumps_kw)

    assert end == object_span[1]

    assert isinstance(parsed, struct.nodes.Object)
    assert [*parsed.keys()] == [*data.keys()]
    for (first, last), element in zip(field_spans, parsed.values()):
        _check_parsed_primitive_class(element, tokens, first, last)

    _check_find_token_span(data, tokens, spans, find_token_span, dumps_kw)

147 

148 

@hyp.given(st_data_span_params(ST_PRIMITIVE_LISTS))
def test_parse_json_primitive_list(params):
    """
    Check the parser runs with a list of primitives.

    The token span finder is mocked to hand back our pre-drawn spans in
    order: the first for the whole list, then one per element. We check
    the result is an array of the same length, that each element has
    the node type implied by the length of its span, and that the span
    finder is called correctly.
    """
    data, tokens, spans = params
    array_span, *element_spans = spans
    dumps_kw = mock.Mock()

    with mock.patch.object(struct, "_find_token_span", side_effect=spans) as find_token_span:
        parsed, end = struct._parse_json(data, tokens, dumps_kw)

    assert end == array_span[1]

    assert isinstance(parsed, struct.nodes.Array)
    assert len(parsed) == len(data)
    for (first, last), element in zip(element_spans, parsed):
        _check_parsed_primitive_class(element, tokens, first, last)

    _check_find_token_span(data, tokens, spans, find_token_span, dumps_kw)

173 

174 

@hyp.given(st_data_span_params(ST_PRIMITIVES))
def test_parse_json_primitive(params):
    """
    Check the parser runs with a primitive.

    The token span finder is mocked to return the single span we drew.
    We check the reported end matches that span, that the result has
    the node type implied by the span's length, and that the finder is
    called exactly once from offset zero.
    """
    data, tokens, spans = params
    dumps_kw = mock.Mock()

    assert len(spans) == 1
    (span,) = spans
    first, last = span

    finder = mock.patch.object(struct, "_find_token_span", return_value=span)
    with finder as find_token_span:
        parsed, end = struct._parse_json(data, tokens, dumps_kw)

    assert end == last

    assert isinstance(parsed, (struct.nodes.Composite, struct.nodes.Token))
    _check_parsed_primitive_class(parsed, tokens, first, last)

    find_token_span.assert_called_once_with(data, tokens, dumps_kw, 0)

199 

200 

def test_parse_json_raises_for_invalid_json():
    """Check the parser rejects non-JSON data without searching for a span."""

    class NotJSON:
        pass

    dumps_kw = mock.Mock()
    tokens = mock.Mock()
    expected_error = pytest.raises(ValueError, match=r"Invalid JSON data:.*NotJSON")

    with mock.patch.object(struct, "_find_token_span") as find_token_span, expected_error:
        _ = struct._parse_json(NotJSON(), tokens, dumps_kw)  # pyright: ignore[reportArgumentType]

    find_token_span.assert_not_called()

215 

216 

@hyp.given(ST_JSON_DATA, common.st_token_lists(), ST_PRIMITIVE_DICTS, st.data())
def test_find_token_span_match(data, tokens, dumps_kw, extra):
    """Check the span-finder chains its helpers correctly if there is a match."""
    index_strategy = st.integers(0, len(tokens))
    ordered_indices = st.tuples(index_strategy, index_strategy, index_strategy).map(sorted)
    offset, start, end = extra.draw(ordered_indices)

    with (
        mock.patch.object(struct, "_make_regex_from_json") as make_regex_from_json,
        mock.patch.object(struct, "_find_span_start", return_value=start) as find_span_start,
        mock.patch.object(struct, "_find_span_end", return_value=end) as find_span_end,
        mock.patch.object(struct.re, "search") as search,
    ):
        span = struct._find_token_span(data, tokens, dumps_kw, offset)

    assert span == (start, end)

    pattern = make_regex_from_json.return_value
    # Only the text from the offset token onwards should be searched.
    searched_text = "".join(token.value for token in tokens[offset:])

    make_regex_from_json.assert_called_once_with(data, dumps_kw)
    search.assert_called_once_with(pattern, searched_text, re.DOTALL)
    find_span_start.assert_called_once_with(tokens, search.return_value, offset)
    find_span_end.assert_called_once_with(tokens, pattern, start)

241 

242 

@hyp.given(ST_JSON_DATA, common.st_token_lists(), ST_PRIMITIVE_DICTS)
def test_find_token_span_no_match(data, tokens, dumps_kw):
    """Check the span-finder raises, and skips both helpers, if there is no match."""
    with (
        mock.patch.object(struct, "_make_regex_from_json") as make_regex_from_json,
        mock.patch.object(struct, "_find_span_start") as find_span_start,
        mock.patch.object(struct, "_find_span_end") as find_span_end,
        mock.patch.object(struct.re, "search", return_value=None) as search,
        pytest.raises(RuntimeError),
    ):
        _ = struct._find_token_span(data, tokens, dumps_kw, 0)

    pattern = make_regex_from_json.return_value
    full_text = "".join(token.value for token in tokens)

    make_regex_from_json.assert_called_once_with(data, dumps_kw)
    search.assert_called_once_with(pattern, full_text, re.DOTALL)
    for helper in (find_span_start, find_span_end):
        helper.assert_not_called()

261 

262 

@hyp.given(ST_PRIMITIVE_DICTS.filter(len))
def test_make_regex_from_json_dict(data):
    """Check the regex built for a dictionary is braced and matches its dump."""
    pattern = struct._make_regex_from_json(data, {})
    dumped = json.dumps(data)

    assert isinstance(pattern, str)
    assert re.compile(pattern)

    opening, closing = "\\{\\s*", "\\s*\\}"
    assert pattern.startswith(opening)
    assert pattern.endswith(closing)

    assert re.fullmatch(pattern, dumped) is not None

273 

274 

@hyp.given(ST_PRIMITIVE_LISTS.filter(len))
def test_make_regex_from_json_list(data):
    """Check the regex built for a list is bracketed and matches its dump."""
    pattern = struct._make_regex_from_json(data, {})
    dumped = json.dumps(data)

    assert isinstance(pattern, str)
    assert re.compile(pattern)

    opening, closing = "\\[\\s*", "\\s*\\]"
    assert pattern.startswith(opening)
    assert pattern.endswith(closing)

    assert re.fullmatch(pattern, dumped) is not None

285 

286 

@hyp.given(ST_PRIMITIVES)
def test_make_regex_from_json_primitive(data):
    """Check the regex built for a primitive compiles and matches its dump."""
    pattern = struct._make_regex_from_json(data, {})
    dumped = json.dumps(data)

    assert isinstance(pattern, str)
    assert re.compile(pattern)
    assert re.fullmatch(pattern, dumped) is not None

295 

296 

@hyp.given(ST_JSON_DATA)
def test_make_regex_from_json_recursive(data):
    """
    Check the regex builder works for nested JSON data.

    Besides matching the dump, every escaped opening bracket in the
    pattern must be followed by optional whitespace, and every escaped
    closing bracket must be preceded by it.
    """
    pattern = struct._make_regex_from_json(data, {})

    assert isinstance(pattern, str)
    assert re.compile(pattern)
    assert re.fullmatch(pattern, json.dumps(data)) is not None

    def spans_of(regex):
        """Collect the span of every hit for `regex` within the pattern."""
        return {hit.span() for hit in re.finditer(regex, pattern)}

    assert spans_of(r"\\[\{\[](?=\\s\*)") == spans_of(r"\\[\{\[]")
    assert spans_of(r"(?<=\\s\*)\\[\}\]]") == spans_of(r"\\[\}\]]")

313 

314 

@st.composite
def st_dicts_with_spaces(draw: st.DrawFn) -> dict[str, struct.JSONPrimitiveType]:
    """
    Create a primitive dictionary padded with multi-space blocks.

    Every key, and every value that is a string, gets a run of at least
    two spaces appended.
    """
    padding = st.text(" ", min_size=2)

    padded = {}
    for key, value in draw(ST_PRIMITIVE_DICTS).items():
        padded_key = key + draw(padding)
        padded[padded_key] = (value + draw(padding)) if isinstance(value, str) else value

    return padded

329 

330 

@hyp.given(st_dicts_with_spaces())
def test_make_regex_from_json_multispaces_only_in_strings(data):
    """
    Check that any multi-space blocks are inside string literals.

    The data are JSON objects whose keys and string values carry runs
    of contiguous spaces. Each run of escaped spaces in the pattern
    must fall within exactly one quoted string literal.
    """
    pattern = struct._make_regex_from_json(data, {})

    multi_space_spans = {hit.span() for hit in re.finditer(r"(\\ ){2,}", pattern)}
    string_literal_spans = {hit.span() for hit in re.finditer(r'("(?:[^"\\]|\\.)*")', pattern)}
    for run_start, run_end in multi_space_spans:
        enclosing = [
            literal
            for literal in string_literal_spans
            if literal[0] <= run_start and run_end <= literal[1]
        ]
        assert len(enclosing) == 1

349 

350 

@hyp.given(
    ST_JSON_DATA, st.just({}) | st.fixed_dictionaries({"indent": st.sampled_from([0, 1, 2, 4])})
)
def test_make_regex_from_json_handles_indent(data, dumps_kw):
    """Check the dumper always gets an indent: the given one, or 1 if absent."""
    with mock.patch.object(struct.json, "dumps", side_effect=json.dumps) as dumps:
        _ = struct._make_regex_from_json(data, dumps_kw)

    expected_indent = dumps_kw["indent"] if "indent" in dumps_kw else 1
    dumps.assert_called_once_with(data, indent=expected_indent)

360 

361 

@st.composite
def st_span_start_params(
    draw: st.DrawFn,
) -> tuple[typing.Sequence[struct.nodes.Token], int, int, int]:
    """
    Create tokens, a start, an offset, and a character start for a test.

    The character start is counted from the offset token and lands
    somewhere inside the token at the start index.
    """
    tokens = draw(common.st_token_lists(min_size=2))
    start = draw(st.integers(0, len(tokens) - 1))
    offset = draw(st.integers(0, start))

    # Characters taken up by the tokens between the offset and the start.
    preceding_chars = sum(len(token.value) for token in tokens[offset:start])
    within_token = draw(st.integers(0, len(tokens[start].value) - 1))

    return tokens, start, offset, preceding_chars + within_token

376 

377 

@hyp.given(st_span_start_params())
def test_find_span_start_success(params):
    """
    Check the start-finder can exit successfully.

    The scenario is built from four pieces:

    1. a list of tokens
    2. the starting index we expect back
    3. a token index offset no later than that starting index
    4. a character start, counted from the offset, that falls inside
       the token at the starting index
    """
    tokens, start, offset, char_start = params

    search = mock.Mock()
    search.start.return_value = char_start

    found = struct._find_span_start(tokens, search, offset)

    search.start.assert_called_once_with()
    assert found == start

400 

401 

@hyp.given(common.st_token_lists(), st.data())
def test_find_span_start_failure(tokens, extra):
    """
    Check the start-finder raises an error if it does not exit.

    The match is made to start one character beyond the combined length
    of all the tokens, so no token can contain it whatever the offset.
    """
    offset = extra.draw(st.integers(0, len(tokens) - 1))
    total_chars = sum(len(token.value) for token in tokens)

    search = mock.Mock()
    search.start.return_value = total_chars + 1

    with pytest.raises(RuntimeError, match="Unable to find start index"):
        _ = struct._find_span_start(tokens, search, offset)

    search.start.assert_called_once_with()

419 

420 

@st.composite
def st_span_end_params(
    draw: st.DrawFn,
) -> tuple[typing.Sequence[struct.nodes.Token], str, int, int]:
    """
    Create tokens, a pattern, a start, and an end for a span-end test.

    The pattern is the escaped text of the tokens from the start index
    up to, but not including, the end index.
    """
    tokens = draw(common.st_token_lists(min_size=2))

    start = draw(st.integers(0, len(tokens) - 2))
    end = draw(st.integers(start + 1, len(tokens)))

    covered_text = "".join(token.value for token in tokens[start:end])

    return tokens, re.escape(covered_text), start, end

434 

435 

@hyp.given(st_span_end_params())
def test_find_span_end_success(params):
    """
    Check the end-finder can exit successfully.

    The scenario is built from four pieces:

    1. a list of tokens
    2. a start index
    3. the end index we expect back
    4. a regular expression matching the concatenated text of the
       tokens between those two indices
    """
    tokens, pattern, start, end = params

    found = struct._find_span_end(tokens, pattern, start)

    assert found == end

454 

455 

@hyp.given(common.st_token_lists(), st.data())
def test_find_span_end_failure(tokens, extra):
    """
    Check the end-finder raises an error if it does not exit.

    The regex searcher is mocked to always fail. We then check it was
    tried once on each growing prefix of the text from the starting
    token onwards.
    """
    start = extra.draw(st.integers(0, len(tokens)))
    pattern = mock.Mock()

    with (
        mock.patch.object(struct.re, "search", return_value=None) as search,
        pytest.raises(RuntimeError),
    ):
        _ = struct._find_span_end(tokens, pattern, start)

    prefixes = itertools.accumulate(token.value for token in tokens[start:])
    expected_calls = [mock.call(pattern, prefix, re.DOTALL) for prefix in prefixes]
    assert search.call_args_list == expected_calls
476 ]