Coverage for kye/engine/validate.py: 17%
64 statements
« prev ^ index » next coverage.py v7.3.2, created at 2024-01-16 13:24 -0700
1from kye.types import Type, EDGE
2from duckdb import DuckDBPyConnection, DuckDBPyRelation
def struct_pack(edges: list[str], r: DuckDBPyRelation):
    """Build a ``struct_pack(...)`` SQL expression over columns of ``r``.

    Each name in ``edges`` that is also a column of ``r`` contributes one
    ``"name":="name"`` entry; names that are not columns of ``r`` are
    skipped. The order of ``edges`` is preserved.

    Args:
        edges: Candidate column names, in the order they should appear.
        r: Relation whose ``columns`` decide which names are kept.

    Returns:
        The SQL text, e.g. ``struct_pack("a":="a","b":="b")``. When no name
        matches, the result is ``struct_pack()`` with no arguments.
    """
    available = r.columns

    def quote(name: str) -> str:
        # Double any embedded double quote so the name remains a single
        # quoted identifier instead of terminating early.
        return '"' + name.replace('"', '""') + '"'

    entries = [
        f'{quote(edge_name)}:={quote(edge_name)}'
        for edge_name in edges
        if edge_name in available
    ]
    return 'struct_pack(' + ','.join(entries) + ')'
def string_list(strings: list[str]):
    """Render ``strings`` as a comma-separated list of SQL string literals.

    Every value is wrapped in single quotes, and single quotes inside a
    value are doubled so the value cannot terminate its own literal.

    Args:
        strings: The values to render.

    Returns:
        Text such as ``'a','b'``, suitable for the inside of ``IN (...)``.
        An empty input yields a single empty literal (``''``).
    """
    escaped = (s.replace("'", "''") for s in strings)
    return "'" + "','".join(escaped) + "'"
def flag(err_msg: str, condition: str, r: DuckDBPyRelation, **kwargs):
    """Record rows of ``r`` matching ``condition`` as errors; return the rest.

    One value is selected for the error message and for each of ``tbl``,
    ``idx``, ``row``, ``col`` and ``val`` (in that order), and the selected
    rows are inserted into the ``errors`` table. For each of those five
    fields the value is, in priority order:

    1. the keyword argument of that name (``None`` becomes SQL ``NULL``,
       anything else becomes a quoted string literal),
    2. the column of that name in ``r``, if it exists,
    3. ``NULL``.

    Args:
        err_msg: Error code/message stored as the first selected value.
        condition: SQL boolean expression identifying offending rows. It is
            inserted into the query verbatim.
        r: Relation to check.
        **kwargs: Optional overrides for ``tbl``, ``idx``, ``row``, ``col``
            and ``val``; other keys are ignored.

    Returns:
        ``r`` filtered to the rows for which ``NOT(condition)`` holds.
    """
    def literal(value) -> str:
        # NULL for None; otherwise a quoted SQL string with embedded single
        # quotes doubled so the value cannot break out of the literal.
        if value is None:
            return 'NULL'
        return "'" + str(value).replace("'", "''") + "'"

    columns = r.columns
    fields = ["'" + err_msg.replace("'", "''") + "'"]
    for field in ('tbl', 'idx', 'row', 'col', 'val'):
        if field in kwargs:
            fields.append(literal(kwargs[field]))
        elif field in columns:
            fields.append(field)
        else:
            fields.append('NULL')

    err = r.filter(condition).select(','.join(fields))
    err.insert_into('errors')
    return r.filter(f'''NOT({condition})''')
def collect(groupby, relations: dict[str, DuckDBPyRelation]):
    """Outer-join several two-column relations on a shared key column.

    Each relation must have exactly two columns, the first being
    ``groupby``. Its second column is renamed to the relation's dict key,
    and the relations are then joined one after another on ``groupby``
    with ``how='outer'``.

    Args:
        groupby: Name of the key column shared by every relation.
        relations: Mapping of output column name to a ``(groupby, value)``
            relation.

    Returns:
        The joined relation, or ``None`` when ``relations`` is empty.

    Raises:
        AssertionError: If a relation does not have exactly two columns or
            its first column is not ``groupby``.
    """
    collected = None
    for alias, r in relations.items():
        assert len(r.columns) == 2
        assert r.columns[0] == groupby
        r = r.select(f'''{groupby}, {r.columns[1]} as {alias}''').set_alias(alias)
        # Compare with None explicitly. Truth-testing a relation goes through
        # its __len__ (row count), so an empty relation would be mistaken
        # for "nothing collected yet" and its column silently dropped.
        collected = r if collected is None else collected.join(r, groupby, how='outer')
    return collected
def row_index(r: DuckDBPyRelation, index: list[str], name: str = 'idx'):
    """Compute a hashed index value for every row that has all index columns.

    Rows where any column listed in ``index`` is NULL are filtered out. For
    the remaining rows, the index columns (taken in sorted name order) are
    packed into a struct and hashed.

    Args:
        r: Relation with a ``row`` column plus the index columns.
        index: Names of the columns that make up the index.
        name: Output column name for the hash.

    Returns:
        A relation with the columns ``row`` and ``name``.
    """
    all_present = ' AND '.join(f'{idx} IS NOT NULL' for idx in index)
    # Sort the names so the same set of columns always packs in the same order.
    packed = struct_pack(sorted(index), r)
    complete_rows = r.filter(all_present)
    return complete_rows.select(f'''row, hash({packed}) as {name}''')
def row_indexes(edges: DuckDBPyRelation, typ: Type):
    """Resolve each row's partial index hashes to full index hashes.

    A "global" hash is computed from ``typ.index`` and a "partial" hash
    from every entry of ``typ.indexes``. Partial hashes are then mapped to
    the distinct global hashes seen alongside them, so a row that only has
    a partial index can still be given a full ``idx``.

    Args:
        edges: Relation with a ``row`` column and one column per index edge.
        typ: Type providing ``index`` and ``indexes``.

    Returns:
        A relation joining ``row`` and ``partial`` to the mapped ``idx``.

    Raises:
        AttributeError: If ``typ.indexes`` is empty, because there is then
            no partial index relation to join against.
    """
    global_index = row_index(edges, typ.index).set_alias('idx')
    partial_index = None
    for idx in typ.indexes:
        r = row_index(edges, idx, name='partial')
        # Compare with None explicitly. Truth-testing a relation goes through
        # its __len__ (row count), so an empty first relation would be
        # mistaken for "no partial index yet".
        partial_index = r if partial_index is None else partial_index.union(r)
    r = partial_index.join(global_index, 'row', how='left')
    # Create a map of partial ids to full ids
    partial_map = r.aggregate('partial, unnest(list_distinct(list(idx))) as idx').set_alias('partial_map')
    # Redefine index using the partial_map
    r = r.select('row, partial').join(partial_map, 'partial', how='left')
    return r
def compute_index(typ: Type, table: DuckDBPyRelation):
    """Derive one ``idx`` per row of ``table``, flagging rows that cannot get one.

    Steps, in order:

    1. Keep only the edges whose ``col`` is in ``typ.index`` and flag
       ``CONFLICTING_INDEX`` where one (row, col) has several distinct values.
    2. Pivot the surviving edges into one column per index edge.
    3. Compute row indexes and left-join them onto every ``row`` of ``table``.
    4. Flag ``MISSING_INDEX`` (no partial index), ``INCOMPLETE_INDEX`` (no
       full index) and ``CONFLICTING_INDEX`` (several distinct full indexes).

    Args:
        typ: Type providing ``index`` and ``ref``.
        table: Relation of edges for this type.

    Returns:
        A relation with the columns ``row`` and ``idx`` for unflagged rows.
    """
    index_edges = table.filter(f'''col in ({string_list(typ.index)})''')
    per_cell = index_edges.aggregate('''row, col, first(val) as val, count(distinct(val)) as cnt''')
    index_edges = flag('CONFLICTING_INDEX', 'cnt > 1', per_cell, tbl=typ.ref, val=None)

    # One (row, val) relation per index column, ready to be joined side by side.
    by_column = {}
    for col in typ.index:
        by_column[col] = index_edges.filter(f"col = '{col}'").select('row, val')
    wide = collect('row', by_column)

    indexes = row_indexes(wide, typ)
    rows = table.aggregate('row').join(indexes, 'row', how='left')
    rows = flag('MISSING_INDEX', 'partial IS NULL', rows, tbl=typ.ref)
    rows = flag('INCOMPLETE_INDEX', 'idx IS NULL', rows, tbl=typ.ref)

    per_row = rows.aggregate('row, first(idx) as idx, count(distinct(idx)) as cnt')
    rows = flag('CONFLICTING_INDEX', 'cnt > 1', per_row, tbl=typ.ref, idx=None)
    return rows.select('row, idx')
def check_edge(typ: Type, edge: EDGE, table: DuckDBPyRelation):
    """Validate the cardinality of one edge and flag violations.

    When the type does not allow multiple values for ``edge``, every
    (tbl, idx, col) with more than one distinct value is flagged as
    ``CONFLICTING_EDGE``. When the type does not allow NULL for ``edge``,
    every (tbl, idx) of ``table`` without a remaining value for the edge is
    flagged as ``MISSING_EDGE``.

    Args:
        typ: Type providing ``allows_multiple`` and ``allows_null``.
        edge: Name of the edge to check.
        table: Relation of edges to check.

    Returns:
        None. Results are reported only through ``flag``.
    """
    column = table.filter(f"col = '{edge}'")

    if not typ.allows_multiple(edge):
        distinct_values = column.aggregate(
            'tbl, idx, col, unnest(list(distinct(val))) as val, count(distinct(val)) as cnt'
        )
        column = flag('CONFLICTING_EDGE', 'cnt > 1', distinct_values, val=None)

    if typ.allows_null(edge):
        return

    every_row = table.aggregate('tbl,idx')
    present = column.select('tbl,idx').set_alias('col')
    # The anti join keeps the (tbl, idx) pairs that have no value for this edge.
    absent = every_row.join(present, 'tbl,idx', how='anti')
    flag('MISSING_EDGE', 'true', absent)
def check_table(typ: Type, db: DuckDBPyConnection):
    """Index the edges of one type, then validate each of its edges.

    The computed index is written back to the ``idx`` column of the
    ``edges`` table for rows of this type. Every edge of the type is then
    checked against the rows that received an index.

    Args:
        typ: Type providing ``ref`` and ``edges``.
        db: Connection holding the ``edges`` table.

    Returns:
        None.
    """
    scoped = db.table('edges').filter(f'''tbl = '{typ.ref}' ''')

    # NOTE(review): the UPDATE below refers to `index` by name. Presumably
    # DuckDB resolves it to this local relation through a replacement scan,
    # so the variable must keep this exact name. Confirm before renaming.
    index = compute_index(typ, scoped)
    update_sql = f'''
        UPDATE edges
        SET idx=index.idx
        FROM index
        WHERE edges.row = index.row
        AND edges.tbl = '{typ.ref}';
    '''
    db.sql(update_sql)

    indexed = scoped.filter('idx IS NOT NULL')
    for edge in typ.edges:
        check_edge(typ, edge, indexed)