dashontology
DashOntology — Auto-inferred data ontology from lineage graphs.
"""DashOntology — Auto-inferred data ontology from lineage graphs."""

# Data model classes.
from dashontology.models import (
    ObjectType,
    Link,
    Metric,
    Property,
    OntologyGraph,
)

# Table-name normalisation helpers.
from dashontology.naming import (
    normalize_name,
    singularize,
    to_camel_case,
)

# Cardinality heuristics.
from dashontology.cardinality import (
    infer_cardinality,
    infer_cardinality_from_ratio,
)

# Inference entry point and notebook UI launcher.
from dashontology.inference import infer_ontology
from dashontology.ui import launch

__version__ = "0.1.1"

# Public API, listed in the same groups as the imports above.
__all__ = [
    "ObjectType",
    "Link",
    "Metric",
    "Property",
    "OntologyGraph",
    "normalize_name",
    "singularize",
    "to_camel_case",
    "infer_cardinality",
    "infer_cardinality_from_ratio",
    "infer_ontology",
    "launch",
]
@dataclass
class ObjectType:
    """An ontology object type backed by a source table.

    Attributes:
        name: Object type name, e.g. "Customer".
        source_table: Source table name, e.g. "cat.silver.customers".
        properties: The Property records attached to this object type.
        role: One of entity | fact | junction | aggregation.
        confidence: Score between 0.0 and 1.0.
        description: Free-text description; empty by default.
    """

    name: str
    source_table: str
    properties: list[Property]
    role: str
    confidence: float
    description: str = ""
@dataclass
class Link:
    """A directed relationship between two object types.

    Attributes:
        name: Link name, e.g. "Customer_Order".
        from_type: Name of the object type on the *from* side, e.g. "Customer".
        to_type: Name of the object type on the *to* side, e.g. "Order".
        cardinality: One of "1:1" | "1:N" | "N:M".
        from_column: FK column on the *from* side.
        to_column: PK column on the *to* side.
        via_table: Junction table for N:M links; None by default.
        confidence: Confidence score; defaults to 1.0.
        description: Free-text description; empty by default.
    """

    name: str
    from_type: str
    to_type: str
    cardinality: str
    from_column: str
    to_column: str
    via_table: Optional[str] = None
    confidence: float = 1.0
    description: str = ""
@dataclass
class Property:
    """A single column-level property of an object type.

    Attributes:
        name: Property name, e.g. "customer_id".
        column: Source column name (may differ from ``name`` after a rename).
        data_type: Column type such as "bigint" or "string".
        is_nullable: Whether the column may be null; True by default.
        is_primary_key: Primary-key flag; False by default.
        is_foreign_key: Foreign-key flag; False by default.
        is_pii: PII flag; False by default.
    """

    name: str
    column: str
    data_type: str
    is_nullable: bool = True
    is_primary_key: bool = False
    is_foreign_key: bool = False
    is_pii: bool = False
class OntologyGraph:
    """Container for an inferred or manually-defined ontology.

    Holds three plain lists (``object_types``, ``links``, ``metrics``) and
    provides name-based lookups, a count summary, and dict / JSON export.
    """

    def __init__(
        self,
        object_types: Optional[list[ObjectType]] = None,
        links: Optional[list[Link]] = None,
        metrics: Optional[list[Metric]] = None,
    ):
        """Store the given lists; a missing or empty argument becomes a new empty list."""
        self.object_types: list[ObjectType] = object_types or []
        self.links: list[Link] = links or []
        self.metrics: list[Metric] = metrics or []

    # ── Lookup ───────────────────────────────────────────────────────────────

    def get_object(self, name: str) -> Optional[ObjectType]:
        """Return the first object type whose ``name`` equals *name*, or None."""
        return next((o for o in self.object_types if o.name == name), None)

    def links_from(self, type_name: str) -> list[Link]:
        """Return every link whose ``from_type`` equals *type_name*."""
        return [link for link in self.links if link.from_type == type_name]

    def links_to(self, type_name: str) -> list[Link]:
        """Return every link whose ``to_type`` equals *type_name*."""
        return [link for link in self.links if link.to_type == type_name]

    # ── Summary ──────────────────────────────────────────────────────────────

    def summary(self, high_confidence_threshold: float = 0.8) -> dict:
        """Return element counts plus counts of high-confidence items.

        Parameters
        ----------
        high_confidence_threshold
            An object type or link counts as "high confidence" when its
            ``confidence`` is greater than or equal to this value.

        Returns
        -------
        dict with keys "object_types", "links", "metrics",
        "high_confidence_objects" and "high_confidence_links".
        """
        return {
            "object_types": len(self.object_types),
            "links": len(self.links),
            "metrics": len(self.metrics),
            "high_confidence_objects": sum(
                1 for o in self.object_types
                if o.confidence >= high_confidence_threshold
            ),
            "high_confidence_links": sum(
                1 for link in self.links
                if link.confidence >= high_confidence_threshold
            ),
        }

    # ── Export ───────────────────────────────────────────────────────────────

    @staticmethod
    def _property_to_dict(p) -> dict:
        """Serialize one property record."""
        return {
            "name": p.name,
            "column": p.column,
            "data_type": p.data_type,
            "is_nullable": p.is_nullable,
            "is_primary_key": p.is_primary_key,
            "is_foreign_key": p.is_foreign_key,
            "is_pii": p.is_pii,
        }

    @staticmethod
    def _object_type_to_dict(o) -> dict:
        """Serialize one object type, including its properties."""
        return {
            "name": o.name,
            "source_table": o.source_table,
            "role": o.role,
            "confidence": round(o.confidence, 3),
            "description": o.description,
            "properties": [
                OntologyGraph._property_to_dict(p) for p in o.properties
            ],
        }

    @staticmethod
    def _link_to_dict(link) -> dict:
        """Serialize one link; the type names are exported as "from" / "to"."""
        return {
            "name": link.name,
            "from": link.from_type,
            "to": link.to_type,
            "cardinality": link.cardinality,
            "from_column": link.from_column,
            "to_column": link.to_column,
            "via_table": link.via_table,
            "confidence": round(link.confidence, 3),
            "description": link.description,
        }

    @staticmethod
    def _metric_to_dict(m) -> dict:
        """Serialize one metric."""
        return {
            "name": m.name,
            "source_table": m.source_table,
            "grain": m.grain,
            "description": m.description,
        }

    def to_dict(self) -> dict:
        """Return the ontology as nested dicts and lists of plain values.

        Confidence values are rounded to three decimals in the output.
        """
        return {
            "object_types": [
                self._object_type_to_dict(o) for o in self.object_types
            ],
            "links": [self._link_to_dict(link) for link in self.links],
            "metrics": [self._metric_to_dict(m) for m in self.metrics],
        }

    def to_json(self, indent: int = 2) -> str:
        """Return ``to_dict()`` encoded as a JSON string."""
        return json.dumps(self.to_dict(), indent=indent)

    def __repr__(self) -> str:
        # Only the three lengths are shown, so read them directly instead of
        # computing the full summary (which inspects every item's confidence).
        return (
            f"OntologyGraph({len(self.object_types)} objects, "
            f"{len(self.links)} links, "
            f"{len(self.metrics)} metrics)"
        )
Container for an inferred or manually-defined ontology.
def summary(self, high_confidence_threshold: float = 0.8) -> dict:
    """Return element counts plus counts of high-confidence items.

    Parameters
    ----------
    high_confidence_threshold
        An object type or link counts as "high confidence" when its
        ``confidence`` is greater than or equal to this value.

    Returns
    -------
    dict with keys "object_types", "links", "metrics",
    "high_confidence_objects" and "high_confidence_links".
    """

    def _count_confident(items) -> int:
        # One shared predicate keeps the object and link counts in step.
        return sum(
            1 for item in items if item.confidence >= high_confidence_threshold
        )

    return {
        "object_types": len(self.object_types),
        "links": len(self.links),
        "metrics": len(self.metrics),
        "high_confidence_objects": _count_confident(self.object_types),
        "high_confidence_links": _count_confident(self.links),
    }
def to_dict(self) -> dict:
    """Return the ontology as nested dicts and lists of plain values.

    The result has three keys, "object_types", "links" and "metrics", each
    a list with one dict per item. Confidence values are rounded to three
    decimals in the output.
    """

    def _property(p) -> dict:
        # One property record.
        return {
            "name": p.name,
            "column": p.column,
            "data_type": p.data_type,
            "is_nullable": p.is_nullable,
            "is_primary_key": p.is_primary_key,
            "is_foreign_key": p.is_foreign_key,
            "is_pii": p.is_pii,
        }

    def _object_type(o) -> dict:
        # One object type together with its serialized properties.
        return {
            "name": o.name,
            "source_table": o.source_table,
            "role": o.role,
            "confidence": round(o.confidence, 3),
            "description": o.description,
            "properties": [_property(p) for p in o.properties],
        }

    def _link(link) -> dict:
        # One link; the type names are exported under "from" / "to".
        return {
            "name": link.name,
            "from": link.from_type,
            "to": link.to_type,
            "cardinality": link.cardinality,
            "from_column": link.from_column,
            "to_column": link.to_column,
            "via_table": link.via_table,
            "confidence": round(link.confidence, 3),
            "description": link.description,
        }

    def _metric(m) -> dict:
        # One metric.
        return {
            "name": m.name,
            "source_table": m.source_table,
            "grain": m.grain,
            "description": m.description,
        }

    return {
        "object_types": [_object_type(o) for o in self.object_types],
        "links": [_link(link) for link in self.links],
        "metrics": [_metric(m) for m in self.metrics],
    }
def normalize_name(
    full_table_name: str,
    glossary: dict[str, str] = None,
) -> str:
    """
    Derive a CamelCase object type name from a table name.

    full_table_name — may be qualified with catalog/schema
                      (e.g. "cat.schema.dim_customer"); only the segment
                      after the last "." is used for automatic naming
    glossary        — optional {table_name: object_type} override map,
                      looked up by the full name, then the bare name, then
                      the lower-cased bare name

    Examples:
        "cat.silver.dim_customer" → "Customer"
        "fact_order_items"        → "OrderItem"
        "stg_raw_events"          → "RawEvent"
    """
    table = full_table_name.rsplit(".", 1)[-1]

    # An explicit glossary entry wins over any automatic naming.
    if glossary:
        for key in (full_table_name, table, table.lower()):
            if key in glossary:
                return glossary[key]

    words = [w for w in strip_prefix(table).split("_") if w]
    if not words:
        # Nothing left after prefix stripping: fall back to the bare name.
        return to_camel_case(table)

    # Only the final word is singularized (customers → customer → Customer).
    *leading, last = words
    return "".join(w.capitalize() for w in [*leading, singularize(last)])
Convert a table name into a CamelCase object type name.
full_table_name — may include catalog/schema (e.g. "cat.schema.dim_customer").
glossary — optional {table_name: object_type} override map; a matching entry takes priority over automatic naming.
Examples:
"cat.silver.dim_customer" → "Customer" "fact_order_items" → "OrderItem" "stg_raw_events" → "RawEvent"
def singularize(word: str) -> str:
    """
    Naive English singularizer. Handles the most common patterns
    found in data lake table naming conventions.

    Rules are tried in order and the first match wins:

    1. irregular plurals listed in ``_IRREGULAR_PLURALS``
    2. words of three letters or fewer are returned unchanged
    3. ``-ies``  → ``-y``           (categories → category)
    4. ``-sses`` → ``-ss``          (addresses → address)
    5. ``-xes`` / ``-zes`` / ``-ches`` / ``-shes`` → drop ``es``
    6. ``-ses``  → drop ``es`` when the singular itself ends in ``s``
       (statuses → status, aliases → alias), otherwise drop only ``s``
       (purchases → purchase, responses → response)
    7. any other trailing ``s`` that is not ``ss`` is dropped

    The input's casing is kept; only the ending is changed.
    """
    lower = word.lower()

    if lower in _IRREGULAR_PLURALS:
        # Preserve a leading capital from the input.
        singular = _IRREGULAR_PLURALS[lower]
        return singular.capitalize() if word[0].isupper() else singular

    # Already-singular guard: short words are left alone.
    if len(lower) <= 3:
        return word

    # ies → y (categories → category). The replacement letter follows the
    # input's case so that "CATEGORIES" does not become "CATEGORy".
    if lower.endswith("ies") and len(lower) > 4:
        return word[:-3] + ("Y" if word.isupper() else "y")

    # sses → ss (addresses → address, not addres)
    if lower.endswith("sses"):
        return word[:-2]

    # xes / zes / ches / shes → remove es
    if lower.endswith(("xes", "zes", "ches", "shes")):
        return word[:-2]

    # ses — most such plurals come from a singular ending in "se"
    # (purchase, response, database, warehouse), where only the "s" goes.
    # "es" is removed only when the stem looks like an s-final singular:
    # "-us" not preceded by o/a (status, bonus; but house, cause), "-ias"
    # (alias, bias), or a few common s-final nouns.
    if lower.endswith("ses") and len(lower) > 5:
        stem = lower[:-2]
        singular_ends_in_s = stem.endswith(("ias", "lens", "atlas", "canvas")) or (
            stem.endswith("us") and not stem.endswith(("ous", "aus"))
        )
        return word[:-2] if singular_ends_in_s else word[:-1]

    # Ends in 's' but not 'ss'
    if lower.endswith("s") and not lower.endswith("ss"):
        return word[:-1]

    return word
Naive English singularizer. Handles the most common patterns found in data lake table naming conventions.
def to_camel_case(snake: str) -> str:
    """Convert snake_case to CamelCase: customer_order_item → CustomerOrderItem.

    Empty segments (from leading, trailing or doubled underscores) are skipped.
    """
    segments = filter(None, snake.split("_"))
    return "".join(map(str.capitalize, segments))
customer_order_item → CustomerOrderItem
def infer_cardinality(
    from_unique: int,
    from_total: int,
    to_unique: int,
    to_total: int,
    one_to_one_threshold: float = 0.95,
) -> tuple[str, float]:
    """
    Classify a relationship's cardinality from column uniqueness counts.

    Parameters
    ----------
    from_unique : distinct values in the FK column of the *from* table
    from_total  : total non-null rows in the FK column
    to_unique   : distinct values in the PK column of the *to* table
    to_total    : total non-null rows in the PK column
    one_to_one_threshold : a side counts as unique when its
                  unique / total rate is at least this value

    Returns
    -------
    (cardinality: str, confidence: float)
        cardinality ∈ {"1:1", "1:N", "N:M"}

    Decision order
    --------------
    1. either total is zero or negative → ("1:N", 0.40), nothing to measure
    2. PK side not unique               → ("N:M", 0.55), whatever the FK side is
    3. PK and FK side both unique       → ("1:1", 0.85)
    4. PK unique, FK repeated           → "1:N", confidence from 0.65 up to a
       cap of 0.95, growing with the share of repeated FK values,
       rounded to 3 decimals
    """
    if from_total <= 0 or to_total <= 0:
        # No rows to measure: fall back to the most common cardinality.
        return "1:N", 0.40

    fk_rate = from_unique / from_total
    pk_rate = to_unique / to_total

    if pk_rate >= one_to_one_threshold:
        if fk_rate >= one_to_one_threshold:
            return "1:1", 0.85
        # 0 means every FK value is distinct, 1 means they are all the same.
        repetition = 1.0 - fk_rate
        return "1:N", round(min(0.95, 0.65 + repetition * 0.3), 3)

    # Duplicates on the PK side: likely N:M or a bad join.
    return "N:M", 0.55
Infer cardinality from column uniqueness stats.
Parameters
from_unique : distinct values in the FK column of the from table
from_total : total non-null rows in the FK column
to_unique : distinct values in the PK column of the to table
to_total : total non-null rows in the PK column
one_to_one_threshold : minimum unique / total rate for a side to count as unique (default 0.95)
Returns
(cardinality: str, confidence: float) cardinality ∈ {"1:1", "1:N", "N:M"}
Heuristics
- to_unique ≈ to_total → PK side is truly unique (good PK)
- from_unique ≈ from_total → FK side is also unique → 1:1
- from_unique < from_total → many FK rows per PK value → 1:N
- to_unique < to_total → PK side has duplicates → N:M or data quality issue (returned regardless of how unique the FK side is)
def infer_cardinality_from_ratio(avg_fk_per_pk: float) -> tuple[str, float]:
    """
    Simpler heuristic when only the average FK-per-PK ratio is known.

    avg_fk_per_pk — average number of FK rows per unique PK value

    Returns (cardinality, confidence), with the confidence rounded to
    3 decimals.

    A ratio that is zero, negative or NaN carries no usable signal and
    yields the low-confidence default ("1:N", 0.40).

    Examples:
        1.0   → 1:1
        3.5   → 1:N
        12.0  → 1:N (strong)
    """
    # NaN fails every comparison, so `not (x > 0)` rejects NaN along with
    # zero and negative ratios. Without this guard NaN would reach the last
    # branch and come back with the highest confidence.
    if not avg_fk_per_pk > 0:
        return "1:N", 0.40

    if avg_fk_per_pk <= 1.05:
        return "1:1", 0.80
    if avg_fk_per_pk <= 1.5:
        return "1:N", 0.60  # borderline

    # Confidence grows with the ratio; ratios above 20 add nothing further.
    confidence = min(0.95, 0.70 + min(avg_fk_per_pk, 20) / 100)
    return "1:N", round(confidence, 3)
Simpler heuristic when only the average FK-per-PK ratio is known.
avg_fk_per_pk — average number of FK rows per unique PK value
Examples:
1.0 → 1:1 3.5 → 1:N 12.0 → 1:N (strong)
def infer_ontology(
    lineage_graph: dict,
    schemas: dict[str, list[dict]] = None,
    glossary: dict[str, str] = None,
    min_confidence: float = 0.50,
    include_staging: bool = False,
) -> OntologyGraph:
    """
    Build an OntologyGraph out of a lineage graph dict.

    Parameters
    ----------
    lineage_graph
        Dict from LineageGraph.to_dict() (or equivalent).
        Keys read: "tables", "table_edges", "column_edges"; each one
        defaults to empty when absent.
    schemas
        Optional {table_full_name: [{name, type, nullable}]} override.
        When a table has an entry here, it is used instead of that table's
        "columns" list in lineage_graph["tables"].
    glossary
        Optional {table_name: ObjectType_name} for custom name mappings.
    min_confidence
        Tables and links scoring below this threshold are dropped.
    include_staging
        Whether tables classified as "staging" become ObjectTypes.

    Returns
    -------
    OntologyGraph with ObjectType, Link, and Metric lists.
    """
    from dashontology.models import ObjectType, Metric, OntologyGraph

    tables_raw = lineage_graph.get("tables", {})
    table_edges = lineage_graph.get("table_edges", [])
    column_edges = lineage_graph.get("column_edges", [])

    def _columns_for(table_name, table_info):
        # A schema override wins over the columns recorded in the lineage graph.
        lineage_columns = table_info.get("columns", [])
        if schemas:
            return schemas.get(table_name, lineage_columns)
        return lineage_columns

    # Count incoming and outgoing table edges; the classifier uses both.
    fan_in: dict[str, int] = {}
    fan_out: dict[str, int] = {}
    for edge in table_edges:
        source, target = edge["source"], edge["target"]
        fan_out[source] = fan_out.get(source, 0) + 1
        fan_in[target] = fan_in.get(target, 0) + 1

    # Classify every table up front; roles are also needed for link inference.
    from dashontology._classifier_bridge import classify_table_role

    table_roles: dict[str, str] = {}
    table_confidences: dict[str, float] = {}
    for table_name, table_info in tables_raw.items():
        role, confidence = classify_table_role(
            full_name=table_name,
            columns=_columns_for(table_name, table_info),
            n_upstream=fan_in.get(table_name, 0),
            n_downstream=fan_out.get(table_name, 0),
        )
        table_roles[table_name] = role
        table_confidences[table_name] = confidence

    # Every table gets a normalized name, including those that will not
    # become ObjectTypes, because link inference looks names up for all of them.
    name_map: dict[str, str] = {}
    for table_name in tables_raw:
        name_map[table_name] = normalize_name(table_name, glossary)

    # ── ObjectTypes and Metrics ──────────────────────────────────────────────
    object_types: list[ObjectType] = []
    metrics: list[Metric] = []

    for table_name, table_info in tables_raw.items():
        role = table_roles.get(table_name, "unknown")
        confidence = table_confidences.get(table_name, 0.4)
        if confidence < min_confidence:
            continue

        columns = _columns_for(table_name, table_info)
        type_name = name_map[table_name]

        if role in _METRIC_ROLES:
            metrics.append(
                Metric(
                    name=type_name,
                    source_table=table_name,
                    grain=_guess_grain(columns),
                )
            )
            continue

        becomes_object = role in _ENTITY_ROLES or (
            include_staging and role == "staging"
        )
        if not becomes_object:
            continue

        fk_cols = {c["name"] for c in columns if _is_fk(c.get("name", ""))}
        object_types.append(
            ObjectType(
                name=type_name,
                source_table=table_name,
                properties=_build_properties(columns, fk_cols),
                role=role,
                confidence=confidence,
            )
        )

    # ── Links ────────────────────────────────────────────────────────────────
    # Column-level lineage is tried first; naming-based inference is the
    # fallback whenever that yields nothing.
    candidate_links: list[Link] = (
        _infer_links_from_column_edges(
            column_edges, name_map, table_roles, glossary or {}
        )
        if column_edges
        else []
    )
    if not candidate_links:
        candidate_links = _infer_links_from_naming(tables_raw, name_map)

    kept_links = [
        link for link in candidate_links if link.confidence >= min_confidence
    ]

    return OntologyGraph(
        object_types=object_types, links=kept_links, metrics=metrics
    )
Infer an OntologyGraph from a lineage graph dict.
Parameters
lineage_graph: Dict from LineageGraph.to_dict() (or equivalent). Keys: "tables", "table_edges", "column_edges".
schemas: Optional {table_full_name: [{name, type, nullable}]} override. If provided, these columns take priority over lineage_graph["tables"].
glossary: Optional {table_name: ObjectType_name} for custom name mappings.
min_confidence: Drop inferences below this threshold.
include_staging: Whether to include staging/temp tables as ObjectTypes.
Returns
OntologyGraph with ObjectType, Link, and Metric lists.
def launch():
    """Build and display the DashOntology notebook UI.

    The UI takes a pasted lineage-graph JSON plus an optional glossary JSON,
    runs ``infer_ontology`` when the button is clicked, prints a short
    summary, renders the ontology as HTML, and fills a read-only JSON
    export box.

    Raises
    ------
    RuntimeError
        If ipywidgets or IPython cannot be imported.
    """
    try:
        import ipywidgets as w
        from IPython.display import display
    except ImportError as exc:
        # Chain the original error so the missing module stays visible.
        raise RuntimeError("ipywidgets required. Run: %pip install ipywidgets") from exc

    import dashui

    # ── From lineage graph dict (paste JSON) ──────────────────────────────────
    json_input = w.Textarea(
        description="Lineage JSON:",
        placeholder='Paste the output of dashgov.LineageGraph.to_json() here ...',
        layout=w.Layout(width="100%", height="150px"),
    )
    min_conf_slider = w.FloatSlider(
        description="Min confidence:", value=0.6, min=0.0, max=1.0, step=0.05,
        readout_format=".0%",
    )
    include_staging_cb = w.Checkbox(value=False, description="Include staging tables")
    glossary_input = w.Textarea(
        description="Glossary (JSON):",
        placeholder='{"raw_cust": "Customer", "tbl_ord": "Order"}',
        layout=w.Layout(width="100%", height="60px"),
    )
    infer_btn = dashui.action_button("Infer Ontology", style="success", emoji="🧠")
    infer_output = dashui.output_panel()
    ontology_viz = w.HTML(value="")
    json_export = w.Textarea(
        description="Export (JSON):",
        layout=w.Layout(width="100%", height="120px"),
        disabled=True,
    )

    # Holds the most recent result; nothing in this function reads it back.
    _last_ontology: list = [None]

    def _load_json(text: str, label: str):
        # Parse one text area, naming the field in the error so the user can
        # tell which of the two JSON inputs is malformed.
        import json as _json
        try:
            return _json.loads(text)
        except _json.JSONDecodeError as err:
            raise ValueError(f"{label} is not valid JSON: {err}") from err

    def on_infer(b):
        with infer_output:
            infer_output.clear_output()
            ontology_viz.value = ""
            json_export.value = ""
            raw = json_input.value.strip()
            if not raw:
                print("⚠️ Paste a lineage JSON above")
                return
            try:
                from dashontology.inference import infer_ontology

                lineage = _load_json(raw, "Lineage JSON")
                if not isinstance(lineage, dict):
                    # Inference reads keys from a dict; reject anything else
                    # here with a readable message.
                    raise ValueError(
                        "Lineage JSON must be a JSON object, "
                        f"got {type(lineage).__name__}"
                    )

                glossary = {}
                glossary_text = glossary_input.value.strip()
                if glossary_text:
                    glossary = _load_json(glossary_text, "Glossary")
                    # Falsy values (null, {}, []) still mean "no glossary".
                    if glossary and not isinstance(glossary, dict):
                        raise ValueError(
                            "Glossary must be a JSON object, "
                            f"got {type(glossary).__name__}"
                        )

                ontology = infer_ontology(
                    lineage_graph=lineage,
                    glossary=glossary or None,
                    min_confidence=min_conf_slider.value,
                    include_staging=include_staging_cb.value,
                )
                _last_ontology[0] = ontology
                s = ontology.summary()
                print(f"Object types : {s['object_types']} ({s['high_confidence_objects']} high-confidence)")
                print(f"Links : {s['links']} ({s['high_confidence_links']} high-confidence)")
                print(f"Metrics : {s['metrics']}")
                ontology_viz.value = _ontology_html(ontology.to_dict())
                json_export.value = ontology.to_json()
            except Exception as e:
                # UI boundary: show any failure in the output panel instead of
                # letting it escape the click callback.
                print(f"❌ {e}")

    infer_btn.on_click(on_infer)

    ui = dashui.card([
        dashui.header("DashOntology — Auto-Inferred Data Ontology", library="dashontology", emoji="🧬"),

        dashui.section("Step 1: Paste lineage graph"),
        dashui.html(
            "<div style='font-size:12px;color:#666;margin-bottom:4px'>"
            "Run <code>dashgov.build_lineage_graph(...).to_json()</code> in another cell "
            "and paste the result here. Or use the UC fetch in DashGov.</div>"
        ),
        json_input,

        dashui.section("Step 2: Configure inference"),
        w.HBox([min_conf_slider, include_staging_cb]),
        glossary_input,
        infer_btn,
        infer_output,

        dashui.section("Inferred ontology"),
        ontology_viz,

        dashui.section("JSON export"),
        json_export,
    ])
    display(ui)