Coverage for kye/loader/json_lines.py: 23%

57 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2024-01-04 14:46 -0700

1import json 

2from pathlib import Path 

3from kye.types import Type, EDGE 

4from typing import Any 

5from duckdb import DuckDBPyConnection, DuckDBPyRelation 

6import re 

7 

# Output directory for the generated .jsonl files: the `data` directory
# three levels above this module file. It is created at import time when
# it does not exist yet.
DIR = Path(__file__).parent.parent.parent.joinpath('data')
DIR.mkdir(exist_ok=True, parents=True)
assert DIR.is_dir()

11 

def normalize_value(typ: Type, data: Any):
    """Normalize a single JSON value against ``typ``.

    For a type with an index, ``data`` must be a dict. Each edge of ``typ``
    that is present in ``data`` is normalized with ``normalize_edge`` and
    edges that normalize to ``None`` are dropped. For any other type the
    value is converted to a string; a float loses a trailing ``.0``
    (``1.0`` becomes ``'1'``).

    Returns:
        ``None`` if ``data`` is ``None`` or no edge produced a value;
        otherwise a dict of normalized edges (indexed types) or a string.

    Raises:
        AssertionError: if an indexed type receives a non-dict, if an index
            edge is missing after normalization, or if a non-indexed type
            receives a dict. Raised explicitly rather than through
            ``assert`` so the checks still run under ``python -O``.
    """
    if data is None:
        return None

    # TODO: reshape id maps { [id]: { ... } } to [ { id, ... } ]
    # not sure if we want to do that auto-magically or have it explicitly
    # defined as part of the schema
    if typ.has_index:
        # TODO: better error handling, i.e trace location in data
        # so that we can report the location of the error
        # Exact-type check: only plain dicts are accepted here.
        if type(data) is not dict:
            raise AssertionError(
                f'Expected a dict for {repr(typ)}, got {type(data).__name__}'
            )

        edges = {}
        for edge in typ.edges:
            if edge not in data:
                continue

            val = normalize_edge(typ, edge, data[edge])
            if val is not None:
                edges[edge] = val

        # Index edges must survive normalization; an edge whose value
        # normalized to None counts as missing.
        missing_indexes = [key for key in typ.index if key not in edges]
        if missing_indexes:
            raise AssertionError(
                f'Missing indexes for {repr(typ)}: {",".join(missing_indexes)}'
            )

        if not edges:
            return None

        return edges

    if type(data) is dict:
        raise AssertionError(
            f'Expected a non-dict value for {repr(typ)}, got a dict'
        )

    # Only exact floats get the trailing ".0" removed; every other value
    # goes through plain str().
    if type(data) is float:
        return str(data).removesuffix('.0')

    return str(data)

47 

def normalize_values(typ: Type, data: Any):
    """Normalize one value, or a list of values, against ``typ``.

    Anything that is not exactly a ``list`` is treated as a one-element
    list. Every item goes through ``normalize_value``; items that normalize
    to ``None`` are dropped.

    Returns:
        The list of normalized items, or ``None`` when ``data`` is ``None``
        or no item is left after normalization.
    """
    if data is None:
        return None

    items = data if type(data) is list else [data]
    normalized = [
        result
        for item in items
        if (result := normalize_value(typ, item)) is not None
    ]

    # An empty result is reported as None rather than [].
    return normalized or None

65 

def normalize_edge(typ: Type, edge: EDGE, data: Any):
    """Normalize the value stored under ``edge`` of ``typ``.

    When ``typ.allows_multiple(edge)`` is true the value is handled by
    ``normalize_values``; otherwise it must not be a list and is handled by
    ``normalize_value``. In both cases normalization is done against
    ``typ.get_edge(edge)``.

    Returns:
        ``None`` if ``data`` is ``None``; otherwise whatever the delegated
        normalizer returns.

    Raises:
        AssertionError: if the edge does not allow multiple values but
            ``data`` is a list. Raised explicitly rather than through
            ``assert`` so the check still runs under ``python -O``.
    """
    if data is None:
        return None

    if typ.allows_multiple(edge):
        return normalize_values(typ.get_edge(edge), data)

    # Exact-type check: only plain lists are rejected here.
    if type(data) is list:
        raise AssertionError(
            f'Edge {repr(edge)} of {repr(typ)} does not allow multiple values, got a list'
        )
    return normalize_value(typ.get_edge(edge), data)

75 

def from_json(typ: Type, data: list[dict], con: DuckDBPyConnection) -> DuckDBPyRelation:
    """Normalize ``data``, write it as JSON Lines and load it with DuckDB.

    The rows are written to ``DIR / f'{typ.ref}.jsonl'``, one JSON document
    per line, replacing any previous content of that file. The file is then
    handed to ``con.read_json``.

    Args:
        typ: type the rows are normalized against.
        data: raw rows to normalize.
        con: DuckDB connection used to read the written file.

    Returns:
        The relation returned by ``con.read_json`` for the written file.
    """
    file_path = DIR / f'{typ.ref}.jsonl'

    # Normalize before opening the file so that a validation error does not
    # leave an existing file truncated. normalize_values returns None when
    # no row is left after normalization, so fall back to an empty list
    # instead of iterating over None.
    # NOTE(review): how read_json treats an empty file is up to DuckDB - confirm.
    rows = normalize_values(typ, data) or []

    with file_path.open('w', encoding='utf-8') as f:
        for row in rows:
            json.dump(row, f)
            f.write('\n')

    return con.read_json(str(file_path))