dashontology

DashOntology — Auto-inferred data ontology from lineage graphs.

 1"""DashOntology — Auto-inferred data ontology from lineage graphs."""
 2from dashontology.models import ObjectType, Link, Metric, Property, OntologyGraph
 3from dashontology.naming import normalize_name, singularize, to_camel_case
 4from dashontology.cardinality import infer_cardinality, infer_cardinality_from_ratio
 5from dashontology.inference import infer_ontology
 6from dashontology.ui import launch
 7
# Package version string.
__version__ = "0.1.3"
# Public API of the package: exactly the names imported at the top of this module.
__all__ = [
    "ObjectType", "Link", "Metric", "Property", "OntologyGraph",
    "normalize_name", "singularize", "to_camel_case",
    "infer_cardinality", "infer_cardinality_from_ratio",
    "infer_ontology",
    "launch",
]
@dataclass
class ObjectType:
@dataclass
class ObjectType:
    """Record describing one object type of an ontology."""

    name: str                     # object type name, e.g. "Customer"
    source_table: str             # table the type refers to, e.g. "cat.silver.customers"
    properties: list[Property]    # Property records attached to this type
    role: str                     # entity | fact | junction | aggregation
    confidence: float             # 0.0 – 1.0
    description: str = ""         # optional free text; empty by default
ObjectType( name: str, source_table: str, properties: list[Property], role: str, confidence: float, description: str = '')
name: str
source_table: str
properties: list[Property]
role: str
confidence: float
description: str = ''
@dataclass
class Metric:
@dataclass
class Metric:
    """Record describing one metric of an ontology."""

    name: str            # metric name, e.g. "monthly_revenue"
    source_table: str    # table the metric refers to
    grain: str           # "month", "day", "customer", etc.
    description: str = ""   # optional free text; empty by default
Metric(name: str, source_table: str, grain: str, description: str = '')
name: str
source_table: str
grain: str
description: str = ''
@dataclass
class Property:
@dataclass
class Property:
    """Record describing one column-level property of an object type."""

    name: str            # property name, e.g. "customer_id"
    column: str          # source column name (may differ after rename)
    data_type: str       # "bigint", "string", etc.
    is_nullable: bool = True        # nullable unless stated otherwise
    is_primary_key: bool = False    # primary-key flag, off by default
    is_foreign_key: bool = False    # foreign-key flag, off by default
    is_pii: bool = False            # PII flag, off by default
Property( name: str, column: str, data_type: str, is_nullable: bool = True, is_primary_key: bool = False, is_foreign_key: bool = False, is_pii: bool = False)
name: str
column: str
data_type: str
is_nullable: bool = True
is_primary_key: bool = False
is_foreign_key: bool = False
is_pii: bool = False
class OntologyGraph:
 55class OntologyGraph:
 56    """Container for an inferred or manually-defined ontology."""
 57
 58    def __init__(
 59        self,
 60        object_types: list[ObjectType] = None,
 61        links: list[Link] = None,
 62        metrics: list[Metric] = None,
 63    ):
 64        self.object_types: list[ObjectType] = object_types or []
 65        self.links: list[Link] = links or []
 66        self.metrics: list[Metric] = metrics or []
 67
 68    # ── Lookup ───────────────────────────────────────────────────────────────
 69
 70    def get_object(self, name: str) -> Optional[ObjectType]:
 71        return next((o for o in self.object_types if o.name == name), None)
 72
 73    def links_from(self, type_name: str) -> list[Link]:
 74        return [link for link in self.links if link.from_type == type_name]
 75
 76    def links_to(self, type_name: str) -> list[Link]:
 77        return [link for link in self.links if link.to_type == type_name]
 78
 79    # ── Summary ──────────────────────────────────────────────────────────────
 80
 81    def summary(self) -> dict:
 82        return {
 83            "object_types": len(self.object_types),
 84            "links": len(self.links),
 85            "metrics": len(self.metrics),
 86            "high_confidence_objects": sum(1 for o in self.object_types if o.confidence >= 0.8),
 87            "high_confidence_links": sum(1 for link in self.links if link.confidence >= 0.8),
 88        }
 89
 90    # ── Export ───────────────────────────────────────────────────────────────
 91
 92    def to_dict(self) -> dict:
 93        return {
 94            "object_types": [
 95                {
 96                    "name": o.name,
 97                    "source_table": o.source_table,
 98                    "role": o.role,
 99                    "confidence": round(o.confidence, 3),
100                    "description": o.description,
101                    "properties": [
102                        {
103                            "name": p.name,
104                            "column": p.column,
105                            "data_type": p.data_type,
106                            "is_nullable": p.is_nullable,
107                            "is_primary_key": p.is_primary_key,
108                            "is_foreign_key": p.is_foreign_key,
109                            "is_pii": p.is_pii,
110                        }
111                        for p in o.properties
112                    ],
113                }
114                for o in self.object_types
115            ],
116            "links": [
117                {
118                    "name": link.name,
119                    "from": link.from_type,
120                    "to": link.to_type,
121                    "cardinality": link.cardinality,
122                    "from_column": link.from_column,
123                    "to_column": link.to_column,
124                    "via_table": link.via_table,
125                    "confidence": round(link.confidence, 3),
126                    "description": link.description,
127                }
128                for link in self.links
129            ],
130            "metrics": [
131                {
132                    "name": m.name,
133                    "source_table": m.source_table,
134                    "grain": m.grain,
135                    "description": m.description,
136                }
137                for m in self.metrics
138            ],
139        }
140
141    def to_json(self, indent: int = 2) -> str:
142        return json.dumps(self.to_dict(), indent=indent)
143
144    def __repr__(self) -> str:
145        s = self.summary()
146        return (
147            f"OntologyGraph({s['object_types']} objects, "
148            f"{s['links']} links, "
149            f"{s['metrics']} metrics)"
150        )

Container for an inferred or manually-defined ontology.

OntologyGraph( object_types: list[ObjectType] = None, links: list[Link] = None, metrics: list[Metric] = None)
58    def __init__(
59        self,
60        object_types: list[ObjectType] = None,
61        links: list[Link] = None,
62        metrics: list[Metric] = None,
63    ):
64        self.object_types: list[ObjectType] = object_types or []
65        self.links: list[Link] = links or []
66        self.metrics: list[Metric] = metrics or []
object_types: list[ObjectType]
metrics: list[Metric]
def get_object(self, name: str) -> Optional[ObjectType]:
70    def get_object(self, name: str) -> Optional[ObjectType]:
71        return next((o for o in self.object_types if o.name == name), None)
def summary(self) -> dict:
81    def summary(self) -> dict:
82        return {
83            "object_types": len(self.object_types),
84            "links": len(self.links),
85            "metrics": len(self.metrics),
86            "high_confidence_objects": sum(1 for o in self.object_types if o.confidence >= 0.8),
87            "high_confidence_links": sum(1 for link in self.links if link.confidence >= 0.8),
88        }
def to_dict(self) -> dict:
 92    def to_dict(self) -> dict:
 93        return {
 94            "object_types": [
 95                {
 96                    "name": o.name,
 97                    "source_table": o.source_table,
 98                    "role": o.role,
 99                    "confidence": round(o.confidence, 3),
100                    "description": o.description,
101                    "properties": [
102                        {
103                            "name": p.name,
104                            "column": p.column,
105                            "data_type": p.data_type,
106                            "is_nullable": p.is_nullable,
107                            "is_primary_key": p.is_primary_key,
108                            "is_foreign_key": p.is_foreign_key,
109                            "is_pii": p.is_pii,
110                        }
111                        for p in o.properties
112                    ],
113                }
114                for o in self.object_types
115            ],
116            "links": [
117                {
118                    "name": link.name,
119                    "from": link.from_type,
120                    "to": link.to_type,
121                    "cardinality": link.cardinality,
122                    "from_column": link.from_column,
123                    "to_column": link.to_column,
124                    "via_table": link.via_table,
125                    "confidence": round(link.confidence, 3),
126                    "description": link.description,
127                }
128                for link in self.links
129            ],
130            "metrics": [
131                {
132                    "name": m.name,
133                    "source_table": m.source_table,
134                    "grain": m.grain,
135                    "description": m.description,
136                }
137                for m in self.metrics
138            ],
139        }
def to_json(self, indent: int = 2) -> str:
141    def to_json(self, indent: int = 2) -> str:
142        return json.dumps(self.to_dict(), indent=indent)
def normalize_name(full_table_name: str, glossary: dict[str, str] = None) -> str:
 85def normalize_name(
 86    full_table_name: str,
 87    glossary: dict[str, str] = None,
 88) -> str:
 89    """
 90    Convert a table name into a CamelCase object type name.
 91
 92    full_table_name — may include catalog/schema (e.g. "cat.schema.dim_customer")
 93    glossary        — optional {table_name: object_type} override map
 94
 95    Examples:
 96        "cat.silver.dim_customer"  → "Customer"
 97        "fact_order_items"         → "OrderItem"
 98        "stg_raw_events"           → "RawEvent"
 99    """
100    bare = full_table_name.split(".")[-1]
101
102    # Glossary takes priority
103    if glossary:
104        if full_table_name in glossary:
105            return glossary[full_table_name]
106        if bare in glossary:
107            return glossary[bare]
108        if bare.lower() in glossary:
109            return glossary[bare.lower()]
110
111    stripped = strip_prefix(bare)
112    # CamelCase the snake_case segments, then singularize the last word
113    parts = [p for p in stripped.split("_") if p]
114    if not parts:
115        return to_camel_case(bare)
116
117    # Singularize the last part (e.g. customers → customer → Customer)
118    parts[-1] = singularize(parts[-1])
119    return "".join(p.capitalize() for p in parts)

Convert a table name into a CamelCase object type name.

full_table_name — may include catalog/schema (e.g. "cat.schema.dim_customer").
glossary — optional {table_name: object_type} override map.

Examples:

"cat.silver.dim_customer" → "Customer"
"fact_order_items" → "OrderItem"
"stg_raw_events" → "RawEvent"

def singularize(word: str) -> str:
def singularize(word: str) -> str:
    """
    Naive English singularizer for table-name words.

    Irregular plurals listed in ``_IRREGULAR_PLURALS`` are resolved first;
    everything else goes through a short chain of suffix rules. Matching is
    done on the lower-cased word, slicing on the original.
    """
    key = word.lower()

    if key in _IRREGULAR_PLURALS:
        base = _IRREGULAR_PLURALS[key]
        # Keep a leading capital if the input had one.
        if word[0].isupper():
            return base.capitalize()
        return base

    size = len(key)

    # Words of three letters or fewer are returned untouched.
    if size <= 3:
        return word

    # categories → category
    if size > 4 and key.endswith("ies"):
        return word[:-3] + "y"

    # addresses → address, boxes → box, batches → batch, dishes → dish
    if key.endswith(("sses", "xes", "zes", "ches", "shes")):
        return word[:-2]

    # Remaining "-ses" words longer than five letters also drop "es".
    if size > 5 and key.endswith("ses"):
        return word[:-2]

    # Plain trailing "s", but never a double "ss".
    if key.endswith("s") and not key.endswith("ss"):
        return word[:-1]

    return word

Naive English singularizer. Handles the most common patterns found in data lake table naming conventions.

def to_camel_case(snake: str) -> str:
def to_camel_case(snake: str) -> str:
    """customer_order_item → CustomerOrderItem (empty segments are skipped)."""
    pieces = filter(None, snake.split("_"))
    return "".join(map(str.capitalize, pieces))

customer_order_item → CustomerOrderItem

def infer_cardinality( from_unique: int, from_total: int, to_unique: int, to_total: int, one_to_one_threshold: float = 0.95) -> tuple[str, float]:
def infer_cardinality(
    from_unique: int,
    from_total: int,
    to_unique: int,
    to_total: int,
    one_to_one_threshold: float = 0.95,
) -> tuple[str, float]:
    """
    Guess a relationship's cardinality from column uniqueness statistics.

    Parameters
    ----------
    from_unique : distinct values in the FK column of the *from* table
    from_total  : total non-null rows in the FK column
    to_unique   : distinct values in the PK column of the *to* table
    to_total    : total non-null rows in the PK column
    one_to_one_threshold : minimum unique/total rate for a column to count
        as unique

    Returns
    -------
    (cardinality, confidence) with cardinality ∈ {"1:1", "1:N", "N:M"}

    Decision order
    --------------
    1. Either total is <= 0            → ("1:N", 0.40), nothing to measure
    2. PK rate below the threshold     → ("N:M", 0.55)
    3. FK rate at/above the threshold  → ("1:1", 0.85)
    4. Otherwise                       → "1:N", confidence 0.65–0.95 growing
       with how duplicated the FK column is
    """
    if from_total <= 0 or to_total <= 0:
        # No rows to measure: fall back to the most common relationship.
        return "1:N", 0.40

    fk_rate = from_unique / from_total
    pk_rate = to_unique / to_total

    if not (pk_rate >= one_to_one_threshold):
        # Duplicates on the PK side: many-to-many or a bad join key.
        return "N:M", 0.55
    elif fk_rate >= one_to_one_threshold:
        # Unique on both sides.
        return "1:1", 0.85

    # Unique PK, repeated FK. duplication is 0 when every FK value is
    # distinct and approaches 1 when they are all the same.
    duplication = 1.0 - fk_rate
    score = 0.65 + duplication * 0.3
    if score > 0.95:
        score = 0.95
    return "1:N", round(score, 3)

Infer cardinality from column uniqueness stats.

Parameters

from_unique : distinct values in the FK column of the from table
from_total : total non-null rows in the FK column
to_unique : distinct values in the PK column of the to table
to_total : total non-null rows in the PK column

Returns

(cardinality: str, confidence: float), where cardinality ∈ {"1:1", "1:N", "N:M"}

Heuristics

  • to_unique ≈ to_total → PK side is truly unique (good PK)
  • from_unique ≈ from_total → FK side is also unique → 1:1
  • from_unique < from_total → many FK rows per PK value → 1:N
  • from_unique ≈ from_total AND to_unique < to_total → N:M or data quality issue
def infer_cardinality_from_ratio(avg_fk_per_pk: float) -> tuple[str, float]:
def infer_cardinality_from_ratio(avg_fk_per_pk: float) -> tuple[str, float]:
    """
    Simpler heuristic when only the average FK-per-PK ratio is known.

    avg_fk_per_pk — average number of FK rows per unique PK value

    Returns
    -------
    (cardinality, confidence)

    A ratio that is zero, negative or NaN carries no information, so it
    yields the low-confidence default ("1:N", 0.40) instead of being
    reported as a confident "1:1".

    Examples:
        1.0  → 1:1
        3.5  → 1:N
        12.0 → 1:N (strong)
    """
    # `not (x > 0)` is True for x <= 0 and for NaN alike.
    if not (avg_fk_per_pk > 0):
        return "1:N", 0.40
    if avg_fk_per_pk <= 1.05:
        return "1:1", 0.80
    if avg_fk_per_pk <= 1.5:
        return "1:N", 0.60   # borderline
    # The ratio is capped at 20, so confidence runs from just above 0.715
    # up to 0.90 and needs no further upper clamp.
    return "1:N", 0.70 + min(avg_fk_per_pk, 20) / 100

Simpler heuristic when only the average FK-per-PK ratio is known.

avg_fk_per_pk — average number of FK rows per unique PK value

Examples:

1.0 → 1:1
3.5 → 1:N
12.0 → 1:N (strong)

def infer_ontology( lineage_graph: dict, schemas: dict[str, list[dict]] = None, glossary: dict[str, str] = None, min_confidence: float = 0.5, include_staging: bool = False) -> OntologyGraph:
def infer_ontology(
    lineage_graph: dict,
    schemas: dict[str, list[dict]] = None,
    glossary: dict[str, str] = None,
    min_confidence: float = 0.50,
    include_staging: bool = False,
) -> OntologyGraph:
    """
    Build an OntologyGraph out of a lineage graph dictionary.

    Parameters
    ----------
    lineage_graph
        Dict from LineageGraph.to_dict() (or equivalent).
        Keys read here: "tables", "table_edges", "column_edges"; each one
        defaults to empty when missing.
    schemas
        Optional {table_full_name: [{name, type, nullable}]} override.
        When a table is present in it, those columns are used instead of
        the table's own "columns" entry.
    glossary
        Optional {table_name: ObjectType_name} for custom name mappings.
    min_confidence
        Tables and links scoring below this value are left out.
    include_staging
        When True, tables classified as "staging" also become ObjectTypes.

    Returns
    -------
    OntologyGraph with ObjectType, Link, and Metric lists.
    """
    from dashontology.models import ObjectType, Metric, OntologyGraph

    tables = lineage_graph.get("tables", {})
    edges = lineage_graph.get("table_edges", [])
    col_edges = lineage_graph.get("column_edges", [])

    # Edge counts per table: fan_in = edges arriving, fan_out = edges leaving.
    fan_in: dict[str, int] = {}
    fan_out: dict[str, int] = {}
    for edge in edges:
        src, dst = edge["source"], edge["target"]
        fan_out[src] = fan_out.get(src, 0) + 1
        fan_in[dst] = fan_in.get(dst, 0) + 1

    def columns_of(table_name, info):
        # The schemas override wins when given; the table's own column
        # list is the fallback in every case.
        own_columns = info.get("columns", [])
        if schemas:
            return schemas.get(table_name, own_columns)
        return own_columns

    # Role classification is delegated to the classifier bridge.
    from dashontology._classifier_bridge import classify_table_role
    roles: dict[str, str] = {}
    confidences: dict[str, float] = {}
    for table_name, info in tables.items():
        roles[table_name], confidences[table_name] = classify_table_role(
            full_name=table_name,
            columns=columns_of(table_name, info),
            n_upstream=fan_in.get(table_name, 0),
            n_downstream=fan_out.get(table_name, 0),
        )

    # ── Normalized names for ALL tables ─────────────────────────────────────
    # Link inference needs a name for every table, including the ones that
    # are filtered out below and never become ObjectTypes.
    name_map: dict[str, str] = {}
    for table_name in tables:
        name_map[table_name] = normalize_name(table_name, glossary)

    # ── ObjectTypes and Metrics ─────────────────────────────────────────────
    object_types = []
    metrics = []

    for table_name, info in tables.items():
        role = roles.get(table_name, "unknown")
        score = confidences.get(table_name, 0.4)
        if score < min_confidence:
            continue

        cols = columns_of(table_name, info)
        label = name_map[table_name]

        if role in _METRIC_ROLES:
            metrics.append(
                Metric(name=label, source_table=table_name, grain=_guess_grain(cols))
            )
            continue

        if role in _ENTITY_ROLES or (include_staging and role == "staging"):
            fk_names = {c["name"] for c in cols if _is_fk(c.get("name", ""))}
            object_types.append(
                ObjectType(
                    name=label,
                    source_table=table_name,
                    properties=_build_properties(cols, fk_names),
                    role=role,
                    confidence=score,
                )
            )

    # ── Links ───────────────────────────────────────────────────────────────
    # Column-level lineage is tried first; naming heuristics are the fallback
    # whenever that produced nothing.
    candidates = (
        _infer_links_from_column_edges(col_edges, name_map, roles, glossary or {})
        if col_edges
        else []
    )
    if not candidates:
        candidates = _infer_links_from_naming(tables, name_map)

    kept = [cand for cand in candidates if cand.confidence >= min_confidence]

    return OntologyGraph(object_types=object_types, links=kept, metrics=metrics)

Infer an OntologyGraph from a lineage graph dict.

Parameters

lineage_graph — Dict from LineageGraph.to_dict() (or equivalent). Keys: "tables", "table_edges", "column_edges".
schemas — Optional {table_full_name: [{name, type, nullable}]} override. If provided, these columns take priority over lineage_graph["tables"].
glossary — Optional {table_name: ObjectType_name} for custom name mappings.
min_confidence — Drop inferences below this threshold.
include_staging — Whether to include staging/temp tables as ObjectTypes.

Returns

OntologyGraph with ObjectType, Link, and Metric lists.

def launch():
 81def launch():
 82    try:
 83        import ipywidgets as w
 84        from IPython.display import display
 85    except ImportError:
 86        raise RuntimeError("ipywidgets required. Run: %pip install ipywidgets")
 87
 88    import dashui
 89
 90    # ── From lineage graph dict (paste JSON) ──────────────────────────────────
 91    json_input = w.Textarea(
 92        description="Lineage JSON:",
 93        placeholder='Paste the output of dashgov.LineageGraph.to_json() here ...',
 94        layout=w.Layout(width="100%", height="150px"),
 95    )
 96    min_conf_slider = w.FloatSlider(
 97        description="Min confidence:", value=0.6, min=0.0, max=1.0, step=0.05,
 98        readout_format=".0%",
 99    )
100    include_staging_cb = w.Checkbox(value=False, description="Include staging tables")
101    glossary_input = w.Textarea(
102        description="Glossary (JSON):",
103        placeholder='{"raw_cust": "Customer", "tbl_ord": "Order"}',
104        layout=w.Layout(width="100%", height="60px"),
105    )
106    infer_btn = dashui.action_button("Infer Ontology", style="success", emoji="🧠")
107    infer_output = dashui.output_panel()
108    ontology_viz = w.HTML(value="")
109    json_export = w.Textarea(
110        description="Export (JSON):",
111        layout=w.Layout(width="100%", height="120px"),
112        disabled=True,
113    )
114
115    _last_ontology: list = [None]
116
117    def on_infer(b):
118        with infer_output:
119            infer_output.clear_output()
120            ontology_viz.value = ""
121            json_export.value = ""
122            raw = json_input.value.strip()
123            if not raw:
124                print("⚠️  Paste a lineage JSON above")
125                return
126            try:
127                import json as _json
128                from dashontology.inference import infer_ontology
129                lineage = _json.loads(raw)
130                glossary = {}
131                if glossary_input.value.strip():
132                    glossary = _json.loads(glossary_input.value.strip())
133                ontology = infer_ontology(
134                    lineage_graph=lineage,
135                    glossary=glossary or None,
136                    min_confidence=min_conf_slider.value,
137                    include_staging=include_staging_cb.value,
138                )
139                _last_ontology[0] = ontology
140                s = ontology.summary()
141                print(f"Object types : {s['object_types']} ({s['high_confidence_objects']} high-confidence)")
142                print(f"Links        : {s['links']} ({s['high_confidence_links']} high-confidence)")
143                print(f"Metrics      : {s['metrics']}")
144                ontology_viz.value = _ontology_html(ontology.to_dict())
145                json_export.value = ontology.to_json()
146            except Exception as e:
147                print(f"❌ {e}")
148
149    infer_btn.on_click(on_infer)
150
151    ui = dashui.card([
152        dashui.header("DashOntology — Auto-Inferred Data Ontology", library="dashontology", emoji="🧬"),
153
154        dashui.section("Step 1: Paste lineage graph"),
155        dashui.html(
156            "<div style='font-size:12px;color:#666;margin-bottom:4px'>"
157            "Run <code>dashgov.build_lineage_graph(...).to_json()</code> in another cell "
158            "and paste the result here. Or use the UC fetch in DashGov.</div>"
159        ),
160        json_input,
161
162        dashui.section("Step 2: Configure inference"),
163        w.HBox([min_conf_slider, include_staging_cb]),
164        glossary_input,
165        infer_btn,
166        infer_output,
167
168        dashui.section("Inferred ontology"),
169        ontology_viz,
170
171        dashui.section("JSON export"),
172        json_export,
173    ])
174    display(ui)