dashsynthetic

DashSynthetic — Synthetic data generation for Databricks. Launch the UI with dashsynthetic.launch() inside a Databricks notebook.

 1"""
 2DashSynthetic — Synthetic data generation for Databricks.
 3Launch the UI with dashsynthetic.launch() inside a Databricks notebook.
 4"""
 5from dashsynthetic.generator import MultiTableGenerator, SyntheticGenerator
 6from dashsynthetic.relationships import RelationshipGraph
 7from dashsynthetic.ui import launch
 8
# Version string of the dashsynthetic package.
__version__ = "0.1.5"
# Public API of the package: the four names imported above.
__all__ = ["SyntheticGenerator", "MultiTableGenerator", "RelationshipGraph", "launch"]
class SyntheticGenerator:
 8class SyntheticGenerator:
 9    """
10    Generate synthetic data from a source Databricks table or DataFrame.
11
12    Usage::
13        gen = SyntheticGenerator(table="catalog.schema.customers")
14        gen.set_volume(50000)
15        gen.preserve_correlations(True)
16        syn_df = gen.run()
17    """
18
19    def __init__(self, df=None, table: str = None, query: str = None):
20        self._source_df = self._resolve(df, table, query)
21        self._volume: int = 0
22        self._preserve_corr: bool = True
23        self._preserve_nulls: bool = True
24        self._preserve_distributions: bool = True
25        self._output_table: Optional[str] = None
26
27    def _resolve(self, df, table, query):
28        if df is not None:
29            return df
30        try:
31            from pyspark.sql import SparkSession
32            spark = SparkSession.getActiveSession()
33            if table:
34                return spark.table(table)
35            if query:
36                return spark.sql(query)
37        except Exception as e:
38            raise ValueError(f"Could not load source: {e}")
39        raise ValueError("Provide df, table, or query")
40
41    def set_volume(self, n_rows: int):
42        self._volume = n_rows
43        return self
44
45    def preserve_correlations(self, enabled: bool = True):
46        self._preserve_corr = enabled
47        return self
48
49    def preserve_null_patterns(self, enabled: bool = True):
50        self._preserve_nulls = enabled
51        return self
52
53    def preserve_distributions(self, enabled: bool = True):
54        self._preserve_distributions = enabled
55        return self
56
57    def output_to(self, table: str):
58        self._output_table = table
59        return self
60
61    def profile(self) -> dict:
62        """Profile the source dataframe — distributions, null rates, correlations."""
63        from dashsynthetic.profiler import profile_df
64        return profile_df(self._source_df)
65
66    def run(self):
67        """Generate and return a synthetic DataFrame."""
68        from dashsynthetic.engine import generate
69        syn_df = generate(
70            source_df=self._source_df,
71            n_rows=self._volume or self._source_df.count(),
72            preserve_corr=self._preserve_corr,
73            preserve_nulls=self._preserve_nulls,
74            preserve_distributions=self._preserve_distributions,
75        )
76        if self._output_table:
77            syn_df.write.format("delta").mode("overwrite") \
78                .option("overwriteSchema", "true") \
79                .saveAsTable(self._output_table)
80            print(f"Synthetic data written to {self._output_table}")
81        return syn_df

Generate synthetic data from a source Databricks table or DataFrame.

Usage::

    gen = SyntheticGenerator(table="catalog.schema.customers")
    gen.set_volume(50000)
    gen.preserve_correlations(True)
    syn_df = gen.run()

SyntheticGenerator(df=None, table: str = None, query: str = None)
19    def __init__(self, df=None, table: str = None, query: str = None):
20        self._source_df = self._resolve(df, table, query)
21        self._volume: int = 0
22        self._preserve_corr: bool = True
23        self._preserve_nulls: bool = True
24        self._preserve_distributions: bool = True
25        self._output_table: Optional[str] = None
def set_volume(self, n_rows: int):
41    def set_volume(self, n_rows: int):
42        self._volume = n_rows
43        return self
def preserve_correlations(self, enabled: bool = True):
45    def preserve_correlations(self, enabled: bool = True):
46        self._preserve_corr = enabled
47        return self
def preserve_null_patterns(self, enabled: bool = True):
49    def preserve_null_patterns(self, enabled: bool = True):
50        self._preserve_nulls = enabled
51        return self
def preserve_distributions(self, enabled: bool = True):
53    def preserve_distributions(self, enabled: bool = True):
54        self._preserve_distributions = enabled
55        return self
def output_to(self, table: str):
57    def output_to(self, table: str):
58        self._output_table = table
59        return self
def profile(self) -> dict:
61    def profile(self) -> dict:
62        """Profile the source dataframe — distributions, null rates, correlations."""
63        from dashsynthetic.profiler import profile_df
64        return profile_df(self._source_df)

Profile the source dataframe — distributions, null rates, correlations.

def run(self):
66    def run(self):
67        """Generate and return a synthetic DataFrame."""
68        from dashsynthetic.engine import generate
69        syn_df = generate(
70            source_df=self._source_df,
71            n_rows=self._volume or self._source_df.count(),
72            preserve_corr=self._preserve_corr,
73            preserve_nulls=self._preserve_nulls,
74            preserve_distributions=self._preserve_distributions,
75        )
76        if self._output_table:
77            syn_df.write.format("delta").mode("overwrite") \
78                .option("overwriteSchema", "true") \
79                .saveAsTable(self._output_table)
80            print(f"Synthetic data written to {self._output_table}")
81        return syn_df

Generate and return a synthetic DataFrame.

class MultiTableGenerator:
 84class MultiTableGenerator:
 85    """
 86    Generate synthetic data for several related tables at once, preserving
 87    primary/foreign key referential integrity and master-data columns.
 88
 89    Usage::
 90        graph = RelationshipGraph()
 91        graph.add_table("Customer", table="catalog.schema.dim_customer", primary_key="customer_id")
 92        graph.add_table("Account", table="catalog.schema.fact_account", primary_key="account_id")
 93        graph.add_foreign_key("Account", "customer_id", "Customer", "customer_id")
 94
 95        gen = MultiTableGenerator(graph)
 96        gen.configure_table("Customer", n_rows=5000)
 97        gen.configure_table("Account", n_rows=20000, output_table="catalog.schema.syn_account")
 98        results = gen.run()   # {"Customer": df, "Account": df}
 99    """
100
101    def __init__(self, graph: RelationshipGraph):
102        self._graph = graph
103        self._specs: dict = {}
104
105    def configure_table(self, name: str, n_rows: int = 0, preserve_corr: bool = True,
106                        preserve_nulls: bool = True, preserve_distributions: bool = True,
107                        output_table: Optional[str] = None):
108        from dashsynthetic.multi_engine import TableGenSpec
109        self._specs[name] = TableGenSpec(
110            n_rows=n_rows, preserve_corr=preserve_corr, preserve_nulls=preserve_nulls,
111            preserve_distributions=preserve_distributions, output_table=output_table,
112        )
113        return self
114
115    def validate(self) -> list:
116        return self._graph.validate()
117
118    def generation_order(self) -> list:
119        return self._graph.generation_order()
120
121    def run(self) -> dict:
122        """Generate and return {table_name: synthetic DataFrame} in dependency order."""
123        from dashsynthetic.multi_engine import generate_multi
124        return generate_multi(self._graph, self._specs)

Generate synthetic data for several related tables at once, preserving primary/foreign key referential integrity and master-data columns.

Usage::

    graph = RelationshipGraph()
    graph.add_table("Customer", table="catalog.schema.dim_customer", primary_key="customer_id")
    graph.add_table("Account", table="catalog.schema.fact_account", primary_key="account_id")
    graph.add_foreign_key("Account", "customer_id", "Customer", "customer_id")

    gen = MultiTableGenerator(graph)
    gen.configure_table("Customer", n_rows=5000)
    gen.configure_table("Account", n_rows=20000, output_table="catalog.schema.syn_account")
    results = gen.run()   # {"Customer": df, "Account": df}
MultiTableGenerator(graph: RelationshipGraph)
101    def __init__(self, graph: RelationshipGraph):
102        self._graph = graph
103        self._specs: dict = {}
def configure_table( self, name: str, n_rows: int = 0, preserve_corr: bool = True, preserve_nulls: bool = True, preserve_distributions: bool = True, output_table: Optional[str] = None):
105    def configure_table(self, name: str, n_rows: int = 0, preserve_corr: bool = True,
106                        preserve_nulls: bool = True, preserve_distributions: bool = True,
107                        output_table: Optional[str] = None):
108        from dashsynthetic.multi_engine import TableGenSpec
109        self._specs[name] = TableGenSpec(
110            n_rows=n_rows, preserve_corr=preserve_corr, preserve_nulls=preserve_nulls,
111            preserve_distributions=preserve_distributions, output_table=output_table,
112        )
113        return self
def validate(self) -> list:
115    def validate(self) -> list:
116        return self._graph.validate()
def generation_order(self) -> list:
118    def generation_order(self) -> list:
119        return self._graph.generation_order()
def run(self) -> dict:
121    def run(self) -> dict:
122        """Generate and return {table_name: synthetic DataFrame} in dependency order."""
123        from dashsynthetic.multi_engine import generate_multi
124        return generate_multi(self._graph, self._specs)

Generate and return {table_name: synthetic DataFrame} in dependency order.

class RelationshipGraph:
 26class RelationshipGraph:
 27    """
 28    Defines which tables exist, their primary/master-data columns, and the
 29    foreign keys linking them — so synthetic generation can run tables in
 30    dependency order and keep FK values referentially valid.
 31
 32    Usage::
 33        graph = RelationshipGraph()
 34        graph.add_table("Customer", table="catalog.schema.dim_customer", primary_key="customer_id")
 35        graph.add_table("Account", table="catalog.schema.fact_account", primary_key="account_id",
 36                         master_data_columns=["currency_code"])
 37        graph.add_foreign_key("Account", "customer_id", "Customer", "customer_id")
 38        graph.generation_order()   # -> ["Customer", "Account"]
 39    """
 40
 41    def __init__(self):
 42        self._tables: dict[str, TableNode] = {}
 43        self._foreign_keys: list[ForeignKey] = []
 44
 45    def add_table(self, name: str, table: str, primary_key: str | None = None,
 46                  master_data_columns: list[str] | None = None):
 47        self._tables[name] = TableNode(name, table, primary_key, master_data_columns or [])
 48        return self
 49
 50    def add_foreign_key(self, from_table: str, from_column: str,
 51                        to_table: str, to_column: str):
 52        self._foreign_keys.append(ForeignKey(from_table, from_column, to_table, to_column))
 53        return self
 54
 55    @property
 56    def tables(self) -> dict[str, TableNode]:
 57        return self._tables
 58
 59    @property
 60    def foreign_keys(self) -> list[ForeignKey]:
 61        return self._foreign_keys
 62
 63    def foreign_keys_for(self, table_name: str) -> list[ForeignKey]:
 64        """Foreign keys defined on `table_name` (i.e. it depends on their `to_table`)."""
 65        return [fk for fk in self._foreign_keys if fk.from_table == table_name]
 66
 67    def validate(self) -> list[str]:
 68        """Check referential integrity of tables/foreign keys; cycle detection."""
 69        issues = []
 70        for fk in self._foreign_keys:
 71            if fk.from_table not in self._tables:
 72                issues.append(f"Unknown table '{fk.from_table}' in foreign key")
 73            if fk.to_table not in self._tables:
 74                issues.append(f"Unknown table '{fk.to_table}' in foreign key")
 75        try:
 76            self.generation_order()
 77        except ValueError as e:
 78            issues.append(str(e))
 79        return issues
 80
 81    def generation_order(self) -> list[str]:
 82        """
 83        Topologically sort tables so every table is generated after the
 84        tables its foreign keys point to. Raises ValueError on a dependency cycle.
 85        """
 86        deps: dict[str, set[str]] = {name: set() for name in self._tables}
 87        for fk in self._foreign_keys:
 88            if fk.from_table in deps and fk.to_table in deps:
 89                deps[fk.from_table].add(fk.to_table)
 90
 91        ordered: list[str] = []
 92        visited: set[str] = set()
 93        visiting: set[str] = set()
 94
 95        def visit(name: str):
 96            if name in visited:
 97                return
 98            if name in visiting:
 99                raise ValueError(f"Dependency cycle detected involving table '{name}'")
100            visiting.add(name)
101            for dep in sorted(deps[name]):
102                visit(dep)
103            visiting.discard(name)
104            visited.add(name)
105            ordered.append(name)
106
107        for name in sorted(self._tables):
108            visit(name)
109        return ordered
110
111    def to_dict(self) -> dict:
112        return {
113            "tables": {
114                name: {
115                    "table": n.table,
116                    "primary_key": n.primary_key,
117                    "master_data_columns": n.master_data_columns,
118                }
119                for name, n in self._tables.items()
120            },
121            "foreign_keys": [
122                {
123                    "from_table": fk.from_table, "from_column": fk.from_column,
124                    "to_table": fk.to_table, "to_column": fk.to_column,
125                }
126                for fk in self._foreign_keys
127            ],
128        }
129
130    def to_json(self, indent: int = 2) -> str:
131        import json
132        return json.dumps(self.to_dict(), indent=indent)
133
134    def summary(self):
135        print(f"Tables:       {len(self._tables)}")
136        print(f"Foreign keys: {len(self._foreign_keys)}")
137        issues = self.validate()
138        if issues:
139            print(f"{len(issues)} validation issue(s):")
140            for i in issues:
141                print(f"   - {i}")
142        else:
143            print(f"Validation passed — generation order: {' → '.join(self.generation_order())}")

Defines which tables exist, their primary/master-data columns, and the foreign keys linking them — so synthetic generation can run tables in dependency order and keep FK values referentially valid.

Usage::

    graph = RelationshipGraph()
    graph.add_table("Customer", table="catalog.schema.dim_customer", primary_key="customer_id")
    graph.add_table("Account", table="catalog.schema.fact_account", primary_key="account_id",
                    master_data_columns=["currency_code"])
    graph.add_foreign_key("Account", "customer_id", "Customer", "customer_id")
    graph.generation_order()   # -> ["Customer", "Account"]

def add_table( self, name: str, table: str, primary_key: str | None = None, master_data_columns: list[str] | None = None):
45    def add_table(self, name: str, table: str, primary_key: str | None = None,
46                  master_data_columns: list[str] | None = None):
47        self._tables[name] = TableNode(name, table, primary_key, master_data_columns or [])
48        return self
def add_foreign_key( self, from_table: str, from_column: str, to_table: str, to_column: str):
50    def add_foreign_key(self, from_table: str, from_column: str,
51                        to_table: str, to_column: str):
52        self._foreign_keys.append(ForeignKey(from_table, from_column, to_table, to_column))
53        return self
tables: dict[str, dashsynthetic.relationships.TableNode]
55    @property
56    def tables(self) -> dict[str, TableNode]:
57        return self._tables
foreign_keys: list[dashsynthetic.relationships.ForeignKey]
59    @property
60    def foreign_keys(self) -> list[ForeignKey]:
61        return self._foreign_keys
def foreign_keys_for(self, table_name: str) -> list[dashsynthetic.relationships.ForeignKey]:
63    def foreign_keys_for(self, table_name: str) -> list[ForeignKey]:
64        """Foreign keys defined on `table_name` (i.e. it depends on their `to_table`)."""
65        return [fk for fk in self._foreign_keys if fk.from_table == table_name]

Foreign keys defined on table_name (i.e. it depends on their to_table).

def validate(self) -> list[str]:
67    def validate(self) -> list[str]:
68        """Check referential integrity of tables/foreign keys; cycle detection."""
69        issues = []
70        for fk in self._foreign_keys:
71            if fk.from_table not in self._tables:
72                issues.append(f"Unknown table '{fk.from_table}' in foreign key")
73            if fk.to_table not in self._tables:
74                issues.append(f"Unknown table '{fk.to_table}' in foreign key")
75        try:
76            self.generation_order()
77        except ValueError as e:
78            issues.append(str(e))
79        return issues

Check referential integrity of tables/foreign keys; cycle detection.

def generation_order(self) -> list[str]:
 81    def generation_order(self) -> list[str]:
 82        """
 83        Topologically sort tables so every table is generated after the
 84        tables its foreign keys point to. Raises ValueError on a dependency cycle.
 85        """
 86        deps: dict[str, set[str]] = {name: set() for name in self._tables}
 87        for fk in self._foreign_keys:
 88            if fk.from_table in deps and fk.to_table in deps:
 89                deps[fk.from_table].add(fk.to_table)
 90
 91        ordered: list[str] = []
 92        visited: set[str] = set()
 93        visiting: set[str] = set()
 94
 95        def visit(name: str):
 96            if name in visited:
 97                return
 98            if name in visiting:
 99                raise ValueError(f"Dependency cycle detected involving table '{name}'")
100            visiting.add(name)
101            for dep in sorted(deps[name]):
102                visit(dep)
103            visiting.discard(name)
104            visited.add(name)
105            ordered.append(name)
106
107        for name in sorted(self._tables):
108            visit(name)
109        return ordered

Topologically sort tables so every table is generated after the tables its foreign keys point to. Raises ValueError on a dependency cycle.

def to_dict(self) -> dict:
111    def to_dict(self) -> dict:
112        return {
113            "tables": {
114                name: {
115                    "table": n.table,
116                    "primary_key": n.primary_key,
117                    "master_data_columns": n.master_data_columns,
118                }
119                for name, n in self._tables.items()
120            },
121            "foreign_keys": [
122                {
123                    "from_table": fk.from_table, "from_column": fk.from_column,
124                    "to_table": fk.to_table, "to_column": fk.to_column,
125                }
126                for fk in self._foreign_keys
127            ],
128        }
def to_json(self, indent: int = 2) -> str:
130    def to_json(self, indent: int = 2) -> str:
131        import json
132        return json.dumps(self.to_dict(), indent=indent)
def summary(self):
134    def summary(self):
135        print(f"Tables:       {len(self._tables)}")
136        print(f"Foreign keys: {len(self._foreign_keys)}")
137        issues = self.validate()
138        if issues:
139            print(f"{len(issues)} validation issue(s):")
140            for i in issues:
141                print(f"   - {i}")
142        else:
143            print(f"Validation passed — generation order: {' → '.join(self.generation_order())}")
def launch():
 6def launch():
 7    try:
 8        import ipywidgets as w
 9        from IPython.display import display
10    except ImportError:
11        raise RuntimeError("ipywidgets required. Run: %pip install ipywidgets")
12
13    import dashui
14
15    tab = w.Tab(children=[_build_single_table_tab(w), _build_relationships_tab(w)])
16    tab.set_title(0, "Single Table")
17    tab.set_title(1, "Multi-Table Relationships")
18
19    ui = dashui.card([
20        dashui.header("DashSynthetic — Synthetic Data Generation",
21                      library="dashsynthetic"),
22        tab,
23    ])
24    display(ui)