Coverage for kye/loader/json_lines.py: 23%
57 statements
« prev ^ index » next coverage.py v7.3.2, created at 2024-01-04 14:46 -0700
« prev ^ index » next coverage.py v7.3.2, created at 2024-01-04 14:46 -0700
1import json
2from pathlib import Path
3from kye.types import Type, EDGE
4from typing import Any
5from duckdb import DuckDBPyConnection, DuckDBPyRelation
6import re
# Output directory for the generated .jsonl files: a 'data' folder located
# three `parent` hops up from this source file.
DIR = Path(__file__).parent.parent.parent.joinpath('data')
# Create it at import time (including missing ancestors); an already existing
# directory is not an error.
DIR.mkdir(exist_ok=True, parents=True)
# Sanity check that the path really is a directory before anything writes to it.
assert DIR.is_dir()
def normalize_value(typ: Type, data: Any):
    """Normalize one JSON value against ``typ``.

    ``None`` is passed through unchanged.

    If ``typ.has_index`` is false, ``data`` must not be a dict; it is returned
    as a string, and floats have a trailing ``.0`` removed.

    If ``typ.has_index`` is true, ``data`` must be a dict. Every edge in
    ``typ.edges`` that is present in ``data`` is normalized with
    ``normalize_edge`` and kept only when the result is not ``None``. The
    collected dict is returned, or ``None`` when it ends up empty.

    Raises:
        AssertionError: if ``data`` has the wrong shape for ``typ``, or if a
            key listed in ``typ.index`` is absent from the normalized edges.
    """
    if data is None:
        return None

    # Scalar path: everything that is not an indexed type is stored as text.
    if not typ.has_index:
        assert type(data) is not dict
        text = str(data)
        if type(data) is float:
            # Whole-number floats are written without the trailing '.0'.
            text = re.sub(r'\.0$', '', text)
        return text

    # TODO: reshape id maps { [id]: { ... } } to [ { id, ... } ]
    # not sure if we want to do that auto-magically or have it explicitly
    # defined as part of the schema

    # TODO: better error handling, i.e trace location in data
    # so that we can report the location of the error
    assert type(data) is dict

    normalized = {}
    for name in typ.edges:
        if name in data:
            result = normalize_edge(typ, name, data.get(name))
            if result is not None:
                normalized[name] = result

    # The index check runs before the emptiness check, so an empty result is
    # only returned as None when no index key is reported missing.
    missing_indexes = [key for key in typ.index if key not in normalized]
    assert not missing_indexes, f'Missing indexes for {repr(typ)}: {",".join(missing_indexes)}'

    return normalized if normalized else None
def normalize_values(typ: Type, data: Any):
    """Normalize a value, or a list of values, against ``typ``.

    A non-list ``data`` is treated as a one-element list. Each item goes
    through ``normalize_value``; items that normalize to ``None`` are dropped.

    Returns:
        The list of normalized items, or ``None`` when ``data`` is ``None`` or
        no item survives normalization (never an empty list).
    """
    if data is None:
        return None

    items = data if type(data) is list else [data]
    kept = [
        value
        for value in (normalize_value(typ, item) for item in items)
        if value is not None
    ]
    # An empty result is reported as None rather than [].
    return kept or None
def normalize_edge(typ: Type, edge: EDGE, data: Any):
    """Normalize the value stored under ``edge`` of ``typ``.

    ``None`` is passed through. If ``typ.allows_multiple(edge)`` is true the
    value is handled by ``normalize_values``; otherwise it must not be a list
    and is handled by ``normalize_value``. In both cases the value is
    normalized against ``typ.get_edge(edge)``.

    Raises:
        AssertionError: if a list is supplied for an edge that does not allow
            multiple values.
    """
    if data is None:
        return None

    # Single-valued edge: a list here is a shape error.
    if not typ.allows_multiple(edge):
        assert type(data) is not list
        return normalize_value(typ.get_edge(edge), data)

    return normalize_values(typ.get_edge(edge), data)
def from_json(typ: Type, data: list[dict], con: DuckDBPyConnection) -> DuckDBPyRelation:
    """Normalize ``data``, write it as JSON lines, and load it through DuckDB.

    The rows are normalized with ``normalize_values`` and written, one JSON
    document per line, to ``DIR / '<typ.ref>.jsonl'`` (overwriting any existing
    file). The file is then read back with ``con.read_json``.

    Args:
        typ: Type the rows are normalized against; ``typ.ref`` names the file.
        data: Raw rows to normalize.
        con: DuckDB connection used to read the written file.

    Returns:
        The relation produced by ``con.read_json`` for the written file.

    Raises:
        AssertionError: propagated from normalization when a row does not
            match ``typ``. The output file is not touched in that case.
    """
    file_path = DIR / f'{typ.ref}.jsonl'

    # Normalize before opening the file so a failure cannot leave a truncated
    # file behind. normalize_values returns None (not []) when nothing
    # survives normalization; iterating that used to raise TypeError, so fall
    # back to an empty row list.
    # NOTE(review): an empty input now yields an empty file; how read_json
    # treats an empty file is up to DuckDB - confirm for the pinned version.
    rows = normalize_values(typ, data) or []

    with file_path.open('w', encoding='utf-8') as f:
        for row in rows:
            json.dump(row, f)
            f.write('\n')

    return con.read_json(str(file_path))