Coverage for kye/engine/validate.py: 17%

64 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2024-01-16 13:24 -0700

1from kye.types import Type, EDGE 

2from duckdb import DuckDBPyConnection, DuckDBPyRelation 

3 

def struct_pack(edges: list[str], r: DuckDBPyRelation):
    """Build a ``struct_pack(...)`` SQL expression from edge names.

    Only names in ``edges`` that also appear in ``r.columns`` are used, in
    the order they occur in ``edges``. Each one is rendered as
    ``"name":="name"``. When nothing matches the result is ``struct_pack()``.
    """
    present = [name for name in edges if name in r.columns]
    entries = [f'''"{name}":="{name}"''' for name in present]
    return 'struct_pack(' + ','.join(entries) + ')'

10 

def string_list(strings: list[str]):
    """Render ``strings`` as a comma-separated list of SQL string literals.

    Every value is wrapped in single quotes. Embedded single quotes are
    doubled (the standard SQL escape) so a value containing ``'`` can neither
    terminate the literal early nor change the surrounding query.

    An empty input still yields ``''`` (one empty literal), matching the
    previous behaviour.
    """
    escaped = (s.replace("'", "''") for s in strings)
    return "'" + "','".join(escaped) + "'"

14 

def flag(err_msg: str, condition: str, r: DuckDBPyRelation, **kwargs):
    """Record rows of ``r`` matching ``condition`` as errors and drop them.

    The matching rows are projected to six expressions -- the quoted
    ``err_msg`` followed by ``tbl``, ``idx``, ``row``, ``col`` and ``val`` --
    and inserted into the ``errors`` table. For each of the five fields:

    * a keyword argument of that name wins: ``None`` becomes SQL ``NULL``,
      any other value is emitted as a single-quoted literal;
    * otherwise the column of ``r`` with that name is used, if it exists;
    * otherwise ``NULL`` is emitted.

    Returns ``r`` filtered by ``NOT(condition)``.
    """
    def render(field):
        # Explicit overrides take precedence over columns of the relation.
        if field in kwargs:
            override = kwargs[field]
            return 'NULL' if override is None else "'" + override + "'"
        return field if field in r.columns else 'NULL'

    projection = ["'" + err_msg + "'"]
    projection.extend(render(name) for name in ('tbl', 'idx', 'row', 'col', 'val'))

    r.filter(condition).select(','.join(projection)).insert_into('errors')
    return r.filter(f'''NOT({condition})''')

31 

def collect(groupby, relations: dict[str, DuckDBPyRelation]):
    """Outer-join several two-column relations on a shared key column.

    Each relation in ``relations`` must have exactly two columns, the first
    of which is ``groupby``. Its second column is renamed to the dictionary
    key (the alias), and the relation itself is given that alias before being
    joined to the accumulated result with ``how='outer'`` on ``groupby``.

    Returns the joined relation, or ``None`` when ``relations`` is empty.
    """
    collected = None
    for alias, r in relations.items():
        assert len(r.columns) == 2
        assert r.columns[0] == groupby
        r = r.select(f'''{groupby}, {r.columns[1]} as {alias}''').set_alias(alias)
        # Test against None explicitly. Truth-testing a relation goes through
        # its __len__ (row count), which executes the query and would treat an
        # empty accumulated relation as "nothing collected yet".
        collected = r if collected is None else collected.join(r, groupby, how='outer')
    return collected

40 

def row_index(r: DuckDBPyRelation, index: list[str], name: str = 'idx'):
    """Compute a hashed index value per row from the given index columns.

    Rows where any column in ``index`` is NULL are filtered out. The
    remaining rows are projected to ``row`` plus a hash of a struct built
    from the sorted index columns, exposed under the column name ``name``.
    """
    all_present = ' AND '.join(f'{idx} IS NOT NULL' for idx in index)
    # Sort the column names so the packed struct does not depend on the
    # order in which the index was declared.
    packed = struct_pack(sorted(index), r)
    return r.filter(all_present).select(f'''row, hash({packed}) as {name}''')

45 

def row_indexes(edges: DuckDBPyRelation, typ: Type):
    """Relate every partial index value of a row to its full index value.

    ``typ.index`` is hashed into the full index (column ``idx``). Each entry
    of ``typ.indexes`` is hashed into a partial index (column ``partial``),
    and the partial relations are unioned together. Partial values are then
    mapped to the distinct full index values seen alongside them, and that
    mapping is joined back so each ``(row, partial)`` pair carries the
    ``idx`` value(s) associated with its partial value.

    NOTE(review): if ``typ.indexes`` yields nothing, ``partial_index`` stays
    ``None`` and the join below fails -- confirm callers guarantee at least
    one entry.
    """
    global_index = row_index(edges, typ.index).set_alias('idx')
    partial_index = None
    for idx in typ.indexes:
        r = row_index(edges, idx, name='partial')
        # Test against None explicitly. Truth-testing a relation goes through
        # its __len__ (row count), which executes the query and would discard
        # an accumulated union that happens to be empty.
        partial_index = r if partial_index is None else partial_index.union(r)
    r = partial_index.join(global_index, 'row', how='left')
    # Create a map of partial ids to full ids
    partial_map = r.aggregate('partial, unnest(list_distinct(list(idx))) as idx').set_alias('partial_map')
    # Redefine index using the partial_map
    r = r.select('row, partial').join(partial_map, 'partial', how='left')
    return r

58 

def compute_index(typ: Type, table: DuckDBPyRelation):
    """Derive one index value per row of ``table`` and flag rows that fail.

    Steps, in order:

    1. Keep only the edges whose ``col`` is one of ``typ.index``; flag
       ``CONFLICTING_INDEX`` where a (row, col) pair has more than one
       distinct value.
    2. Build one two-column relation per index column and combine them on
       ``row`` via ``collect``.
    3. Compute partial and full index values with ``row_indexes`` and
       left-join them onto the distinct rows of ``table``.
    4. Flag ``MISSING_INDEX`` (no partial value), ``INCOMPLETE_INDEX`` (no
       full value), and ``CONFLICTING_INDEX`` (more than one distinct full
       value for a row).

    Returns the surviving rows projected to ``row, idx``.
    """
    index_cols = string_list(typ.index)
    index_edges = table.filter(f'''col in ({index_cols})''')

    # Count distinct values per (row, col) to detect conflicting index values.
    per_cell = index_edges.aggregate('''row, col, first(val) as val, count(distinct(val)) as cnt''')
    index_edges = flag('CONFLICTING_INDEX', 'cnt > 1', per_cell, tbl=typ.ref, val=None)

    # One (row, val) relation per index column, keyed by the column name.
    by_column = {}
    for col in typ.index:
        by_column[col] = index_edges.filter(f"col = '{col}'").select('row, val')
    pivoted = collect('row', by_column)

    indexes = row_indexes(pivoted, typ)
    r = table.aggregate('row').join(indexes, 'row', how='left')
    r = flag('MISSING_INDEX', 'partial IS NULL', r, tbl=typ.ref)
    r = flag('INCOMPLETE_INDEX', 'idx IS NULL', r, tbl=typ.ref)

    # A row that resolves to several distinct full index values is ambiguous.
    per_row = r.aggregate('row, first(idx) as idx, count(distinct(idx)) as cnt')
    r = flag('CONFLICTING_INDEX', 'cnt > 1', per_row, tbl=typ.ref, idx=None)
    return r.select('row, idx')

75 

76 

def check_edge(typ: Type, edge: EDGE, table: DuckDBPyRelation):
    """Validate one edge of ``typ`` against the rows in ``table``.

    * If ``typ`` does not allow multiple values for ``edge``, any
      (tbl, idx, col) group with more than one distinct value is flagged as
      ``CONFLICTING_EDGE``.
    * If ``typ`` does not allow the edge to be null, every (tbl, idx) in
      ``table`` with no remaining row for this edge is flagged as
      ``MISSING_EDGE``.

    Returns nothing; the only effect is the error rows written by ``flag``.
    """
    column = table.filter(f"col = '{edge}'")

    if not typ.allows_multiple(edge):
        distinct_vals = column.aggregate(
            'tbl, idx, col, unnest(list(distinct(val))) as val, count(distinct(val)) as cnt'
        )
        column = flag('CONFLICTING_EDGE', 'cnt > 1', distinct_vals, val=None)

    if typ.allows_null(edge):
        return

    # Anti-join: keep the (tbl, idx) pairs that have no row for this edge.
    # NOTE(review): the anti-joined relation only has tbl and idx, so the
    # MISSING_EDGE rows carry NULL for col -- confirm whether the edge name
    # should be recorded.
    present = column.select('tbl,idx').set_alias('col')
    missing = table.aggregate('tbl,idx').join(present, 'tbl,idx', how='anti')
    flag('MISSING_EDGE', 'true', missing)

85 

def check_table(typ: Type, db: DuckDBPyConnection):
    """Index and validate all edges stored for ``typ`` in the ``edges`` table.

    Computes an index value per row with ``compute_index``, writes it back to
    ``edges.idx`` for this type's rows, then runs ``check_edge`` for every
    edge of ``typ`` over the rows that received an index.
    """
    rows = db.table('edges').filter(f'''tbl = '{typ.ref}' ''')

    # The UPDATE below names ``index`` in its FROM clause. Presumably DuckDB
    # resolves that to this local Python variable (replacement scan), so the
    # variable must keep exactly this name -- verify before renaming.
    index = compute_index(typ, rows)
    db.sql(f'''
        UPDATE edges
        SET idx=index.idx
        FROM index
        WHERE edges.row = index.row
        AND edges.tbl = '{typ.ref}';
    ''')

    indexed = rows.filter('idx IS NOT NULL')
    for edge in typ.edges:
        check_edge(typ, edge, indexed)