dashsynthetic
DashSynthetic — Synthetic data generation for Databricks. Launch the UI with dashsynthetic.launch() inside a Databricks notebook.
"""
DashSynthetic — Synthetic data generation for Databricks.
Launch the UI with dashsynthetic.launch() inside a Databricks notebook.
"""
# Re-export the public entry points so they can be imported from the package root.
from dashsynthetic.generator import MultiTableGenerator, SyntheticGenerator
from dashsynthetic.relationships import RelationshipGraph
from dashsynthetic.ui import launch

# Package version string.
__version__ = "0.1.4"
# Names exported by `from dashsynthetic import *`.
__all__ = ["SyntheticGenerator", "MultiTableGenerator", "RelationshipGraph", "launch"]
class SyntheticGenerator:
    """
    Generate synthetic data from a source Databricks table or DataFrame.

    The source is ``df`` when given; otherwise it is loaded through the active
    Spark session from ``table`` or, failing that, ``query``. Every
    configuration method returns ``self`` so calls can be chained.

    Usage::
        gen = SyntheticGenerator(table="catalog.schema.customers")
        gen.set_volume(50000)
        gen.preserve_correlations(True)
        syn_df = gen.run()
    """

    def __init__(self, df=None, table: Optional[str] = None, query: Optional[str] = None):
        """
        Resolve the source and reset all generation options to their defaults.

        Raises:
            ValueError: if no source is given or the source cannot be loaded.
        """
        self._source_df = self._resolve(df, table, query)
        # 0 means "no explicit volume": run() then falls back to the source row count.
        self._volume: int = 0
        self._preserve_corr: bool = True
        self._preserve_nulls: bool = True
        self._preserve_distributions: bool = True
        self._output_table: Optional[str] = None

    def _resolve(self, df, table, query):
        """
        Return the source DataFrame, preferring ``df``, then ``table``, then ``query``.

        Raises:
            ValueError: "Provide df, table, or query" when nothing usable was
                passed; "Could not load source: ..." when Spark is unavailable
                or the table/query cannot be read.
        """
        if df is not None:
            return df
        # Fail fast on missing arguments so a missing pyspark install or Spark
        # session cannot mask the real problem.
        if not table and not query:
            raise ValueError("Provide df, table, or query")
        try:
            from pyspark.sql import SparkSession
            spark = SparkSession.getActiveSession()
        except Exception as exc:
            raise ValueError(f"Could not load source: {exc}") from exc
        # getActiveSession() returns None when no session is running.
        if spark is None:
            raise ValueError("Could not load source: no active Spark session")
        try:
            return spark.table(table) if table else spark.sql(query)
        except Exception as exc:
            # Spark raises many exception types here; callers only expect ValueError.
            raise ValueError(f"Could not load source: {exc}") from exc

    def set_volume(self, n_rows: int):
        """
        Set the number of synthetic rows to generate (0 = same as the source).

        Raises:
            ValueError: if ``n_rows`` is negative.
        """
        if n_rows is not None and n_rows < 0:
            raise ValueError(f"n_rows must be >= 0, got {n_rows}")
        self._volume = n_rows
        return self

    def preserve_correlations(self, enabled: bool = True):
        """Set the ``preserve_corr`` flag passed to the engine by run()."""
        self._preserve_corr = enabled
        return self

    def preserve_null_patterns(self, enabled: bool = True):
        """Set the ``preserve_nulls`` flag passed to the engine by run()."""
        self._preserve_nulls = enabled
        return self

    def preserve_distributions(self, enabled: bool = True):
        """Set the ``preserve_distributions`` flag passed to the engine by run()."""
        self._preserve_distributions = enabled
        return self

    def output_to(self, table: str):
        """Name the table that run() should write the synthetic data to."""
        self._output_table = table
        return self

    def profile(self) -> dict:
        """Profile the source dataframe — distributions, null rates, correlations."""
        from dashsynthetic.profiler import profile_df
        return profile_df(self._source_df)

    def run(self):
        """
        Generate and return a synthetic DataFrame.

        When an output table was set with output_to(), the result is also
        written there as a Delta table, overwriting existing data and schema.
        """
        from dashsynthetic.engine import generate
        syn_df = generate(
            source_df=self._source_df,
            # Count the source only when no explicit volume was requested.
            n_rows=self._volume or self._source_df.count(),
            preserve_corr=self._preserve_corr,
            preserve_nulls=self._preserve_nulls,
            preserve_distributions=self._preserve_distributions,
        )
        if self._output_table:
            (
                syn_df.write.format("delta").mode("overwrite")
                .option("overwriteSchema", "true")
                .saveAsTable(self._output_table)
            )
            print(f"Synthetic data written to {self._output_table}")
        return syn_df
Generate synthetic data from a source Databricks table or DataFrame.
Usage:: gen = SyntheticGenerator(table="catalog.schema.customers") gen.set_volume(50000) gen.preserve_correlations(True) syn_df = gen.run()
19 def __init__(self, df=None, table: str = None, query: str = None): 20 self._source_df = self._resolve(df, table, query) 21 self._volume: int = 0 22 self._preserve_corr: bool = True 23 self._preserve_nulls: bool = True 24 self._preserve_distributions: bool = True 25 self._output_table: Optional[str] = None
def profile(self) -> dict:
    """Profile the source dataframe — distributions, null rates, correlations."""
    # Imported inside the method, as in the rest of this class.
    from dashsynthetic.profiler import profile_df

    source = self._source_df
    report = profile_df(source)
    return report
Profile the source dataframe — distributions, null rates, correlations.
def run(self):
    """Generate and return a synthetic DataFrame.

    If an output table is configured, the result is also saved there as a
    Delta table in overwrite mode (schema included).
    """
    from dashsynthetic.engine import generate

    source = self._source_df
    # An unset (falsy) volume means "as many rows as the source has".
    row_target = self._volume if self._volume else source.count()
    synthetic = generate(
        source_df=source,
        n_rows=row_target,
        preserve_corr=self._preserve_corr,
        preserve_nulls=self._preserve_nulls,
        preserve_distributions=self._preserve_distributions,
    )

    destination = self._output_table
    if not destination:
        return synthetic

    writer = synthetic.write.format("delta").mode("overwrite")
    writer = writer.option("overwriteSchema", "true")
    writer.saveAsTable(destination)
    print(f"Synthetic data written to {destination}")
    return synthetic
Generate and return a synthetic DataFrame.
class MultiTableGenerator:
    """
    Generate synthetic data for several related tables at once, preserving
    primary/foreign key referential integrity and master-data columns.

    Usage::
        graph = RelationshipGraph()
        graph.add_table("Customer", table="catalog.schema.dim_customer", primary_key="customer_id")
        graph.add_table("Account", table="catalog.schema.fact_account", primary_key="account_id")
        graph.add_foreign_key("Account", "customer_id", "Customer", "customer_id")

        gen = MultiTableGenerator(graph)
        gen.configure_table("Customer", n_rows=5000)
        gen.configure_table("Account", n_rows=20000, output_table="catalog.schema.syn_account")
        results = gen.run()  # {"Customer": df, "Account": df}
    """

    def __init__(self, graph: RelationshipGraph):
        """Keep the relationship graph and start with no per-table specs."""
        # Per-table generation specs, keyed by the name given to configure_table().
        self._specs: dict = {}
        self._graph = graph

    def configure_table(self, name: str, n_rows: int = 0, preserve_corr: bool = True,
                        preserve_nulls: bool = True, preserve_distributions: bool = True,
                        output_table: Optional[str] = None):
        """Store (or replace) the generation spec for ``name``; returns ``self``."""
        from dashsynthetic.multi_engine import TableGenSpec

        spec = TableGenSpec(
            n_rows=n_rows,
            preserve_corr=preserve_corr,
            preserve_nulls=preserve_nulls,
            preserve_distributions=preserve_distributions,
            output_table=output_table,
        )
        self._specs[name] = spec
        return self

    def validate(self) -> list:
        """Return whatever the underlying graph's validate() reports."""
        graph = self._graph
        return graph.validate()

    def generation_order(self) -> list:
        """Return the table order computed by the underlying graph."""
        graph = self._graph
        return graph.generation_order()

    def run(self) -> dict:
        """Generate and return {table_name: synthetic DataFrame} in dependency order."""
        from dashsynthetic.multi_engine import generate_multi

        graph, specs = self._graph, self._specs
        return generate_multi(graph, specs)
Generate synthetic data for several related tables at once, preserving primary/foreign key referential integrity and master-data columns.
Usage:: graph = RelationshipGraph() graph.add_table("Customer", table="catalog.schema.dim_customer", primary_key="customer_id") graph.add_table("Account", table="catalog.schema.fact_account", primary_key="account_id") graph.add_foreign_key("Account", "customer_id", "Customer", "customer_id")
gen = MultiTableGenerator(graph)
gen.configure_table("Customer", n_rows=5000)
gen.configure_table("Account", n_rows=20000, output_table="catalog.schema.syn_account")
results = gen.run() # {"Customer": df, "Account": df}
def configure_table(self, name: str, n_rows: int = 0, preserve_corr: bool = True,
                    preserve_nulls: bool = True, preserve_distributions: bool = True,
                    output_table: Optional[str] = None):
    """Store (or replace) the generation spec for table ``name``; returns ``self`` for chaining."""
    from dashsynthetic.multi_engine import TableGenSpec

    # Collect the options first, then build the spec in one call.
    options = {
        "n_rows": n_rows,
        "preserve_corr": preserve_corr,
        "preserve_nulls": preserve_nulls,
        "preserve_distributions": preserve_distributions,
        "output_table": output_table,
    }
    self._specs[name] = TableGenSpec(**options)
    return self
def run(self) -> dict:
    """Generate and return {table_name: synthetic DataFrame} in dependency order."""
    # The multi-table engine does the work; it receives the graph and the stored specs.
    from dashsynthetic.multi_engine import generate_multi

    graph, specs = self._graph, self._specs
    return generate_multi(graph, specs)
Generate and return {table_name: synthetic DataFrame} in dependency order.
class RelationshipGraph:
    """
    Defines which tables exist, their primary/master-data columns, and the
    foreign keys linking them — so synthetic generation can run tables in
    dependency order and keep FK values referentially valid.

    Usage::
        graph = RelationshipGraph()
        graph.add_table("Customer", table="catalog.schema.dim_customer", primary_key="customer_id")
        graph.add_table("Account", table="catalog.schema.fact_account", primary_key="account_id",
                        master_data_columns=["currency_code"])
        graph.add_foreign_key("Account", "customer_id", "Customer", "customer_id")
        graph.generation_order()  # -> ["Customer", "Account"]
    """

    def __init__(self):
        """Start with no tables and no foreign keys."""
        self._foreign_keys: list[ForeignKey] = []
        self._tables: dict[str, TableNode] = {}

    def add_table(self, name: str, table: str, primary_key: str | None = None,
                  master_data_columns: list[str] | None = None):
        """Register (or replace) the table ``name``; returns ``self`` for chaining."""
        columns = master_data_columns or []
        node = TableNode(name, table, primary_key, columns)
        self._tables[name] = node
        return self

    def add_foreign_key(self, from_table: str, from_column: str,
                        to_table: str, to_column: str):
        """Record a foreign key from_table.from_column -> to_table.to_column; returns ``self``."""
        link = ForeignKey(from_table, from_column, to_table, to_column)
        self._foreign_keys.append(link)
        return self

    @property
    def tables(self) -> dict[str, TableNode]:
        """The internal name -> table node mapping (not a copy)."""
        return self._tables

    @property
    def foreign_keys(self) -> list[ForeignKey]:
        """The internal list of foreign keys (not a copy)."""
        return self._foreign_keys

    def foreign_keys_for(self, table_name: str) -> list[ForeignKey]:
        """Foreign keys whose ``from_table`` is ``table_name``, in insertion order."""
        matches = []
        for link in self._foreign_keys:
            if link.from_table == table_name:
                matches.append(link)
        return matches

    def validate(self) -> list[str]:
        """Return issue messages: unknown tables in foreign keys, then any dependency cycle."""
        problems = []
        for link in self._foreign_keys:
            # Source end first, then target end — same order for every key.
            for endpoint in (link.from_table, link.to_table):
                if endpoint not in self._tables:
                    problems.append(f"Unknown table '{endpoint}' in foreign key")
        try:
            self.generation_order()
        except ValueError as exc:
            problems.append(str(exc))
        return problems

    def generation_order(self) -> list[str]:
        """
        Topologically sort tables so every table is generated after the
        tables its foreign keys point to. Raises ValueError on a dependency cycle.
        """
        prerequisites: dict[str, set[str]] = {table: set() for table in self._tables}
        for link in self._foreign_keys:
            child, parent = link.from_table, link.to_table
            # Keys touching an unregistered table are ignored here; validate() reports them.
            if child not in prerequisites or parent not in prerequisites:
                continue
            prerequisites[child].add(parent)

        result: list[str] = []
        done: set[str] = set()
        in_progress: set[str] = set()

        def walk(table: str):
            if table in done:
                return
            if table in in_progress:
                raise ValueError(f"Dependency cycle detected involving table '{table}'")
            in_progress.add(table)
            # Sorted so the output is deterministic.
            for parent in sorted(prerequisites[table]):
                walk(parent)
            in_progress.discard(table)
            done.add(table)
            result.append(table)

        for table in sorted(self._tables):
            walk(table)
        return result

    def to_dict(self) -> dict:
        """Return the graph as plain dicts/lists with "tables" and "foreign_keys" keys."""
        tables = {}
        for name, node in self._tables.items():
            tables[name] = {
                "table": node.table,
                "primary_key": node.primary_key,
                "master_data_columns": node.master_data_columns,
            }
        links = []
        for link in self._foreign_keys:
            links.append({
                "from_table": link.from_table, "from_column": link.from_column,
                "to_table": link.to_table, "to_column": link.to_column,
            })
        return {"tables": tables, "foreign_keys": links}

    def to_json(self, indent: int = 2) -> str:
        """Serialize to_dict() as a JSON string."""
        import json
        payload = self.to_dict()
        return json.dumps(payload, indent=indent)

    def summary(self):
        """Print table/foreign-key counts and either the issues or the generation order."""
        print(f"Tables: {len(self._tables)}")
        print(f"Foreign keys: {len(self._foreign_keys)}")
        problems = self.validate()
        if not problems:
            print(f"✅ Validation passed — generation order: {' → '.join(self.generation_order())}")
            return
        print(f"⚠️ {len(problems)} validation issue(s):")
        for problem in problems:
            print(f" - {problem}")
Defines which tables exist, their primary/master-data columns, and the foreign keys linking them — so synthetic generation can run tables in dependency order and keep FK values referentially valid.
Usage:: graph = RelationshipGraph() graph.add_table("Customer", table="catalog.schema.dim_customer", primary_key="customer_id") graph.add_table("Account", table="catalog.schema.fact_account", primary_key="account_id", master_data_columns=["currency_code"]) graph.add_foreign_key("Account", "customer_id", "Customer", "customer_id") graph.generation_order() # -> ["Customer", "Account"]
def foreign_keys_for(self, table_name: str) -> list[ForeignKey]:
    """Return the foreign keys whose ``from_table`` equals ``table_name``, in insertion order."""
    matches = []
    for link in self._foreign_keys:
        if link.from_table == table_name:
            matches.append(link)
    return matches
Foreign keys defined on table_name, i.e. the keys whose to_table this table depends on.
def validate(self) -> list[str]:
    """Return issue messages: unknown tables referenced by foreign keys, then any dependency cycle."""
    problems = []
    for link in self._foreign_keys:
        # Source end first, then target end — same order for every key.
        for endpoint in (link.from_table, link.to_table):
            if endpoint not in self._tables:
                problems.append(f"Unknown table '{endpoint}' in foreign key")
    # generation_order() signals a cycle with ValueError; report it as one more issue.
    try:
        self.generation_order()
    except ValueError as exc:
        problems.append(str(exc))
    return problems
Check that every foreign key references known tables and that the table dependencies contain no cycle; returns a list of issue messages (empty when valid).
def generation_order(self) -> list[str]:
    """
    Topologically sort tables so every table is generated after the
    tables its foreign keys point to. Raises ValueError on a dependency cycle.
    """
    # Map each registered table to the set of tables it must wait for.
    prerequisites: dict[str, set[str]] = {table: set() for table in self._tables}
    for link in self._foreign_keys:
        child, parent = link.from_table, link.to_table
        # Foreign keys touching an unregistered table do not affect the ordering.
        if child not in prerequisites or parent not in prerequisites:
            continue
        prerequisites[child].add(parent)

    result: list[str] = []
    done: set[str] = set()
    in_progress: set[str] = set()

    def walk(table: str):
        if table in done:
            return
        # Reaching a table that is still being expanded means we looped back.
        if table in in_progress:
            raise ValueError(f"Dependency cycle detected involving table '{table}'")
        in_progress.add(table)
        # Sorted so the output is deterministic.
        for parent in sorted(prerequisites[table]):
            walk(parent)
        in_progress.discard(table)
        done.add(table)
        result.append(table)

    for table in sorted(self._tables):
        walk(table)
    return result
Topologically sort tables so every table is generated after the tables its foreign keys point to. Raises ValueError on a dependency cycle.
def to_dict(self) -> dict:
    """Return the graph as plain dicts/lists under the keys "tables" and "foreign_keys"."""
    # One entry per registered table, keyed by its graph name.
    tables = {}
    for name, node in self._tables.items():
        tables[name] = {
            "table": node.table,
            "primary_key": node.primary_key,
            "master_data_columns": node.master_data_columns,
        }

    # Foreign keys keep their insertion order.
    links = []
    for link in self._foreign_keys:
        links.append({
            "from_table": link.from_table, "from_column": link.from_column,
            "to_table": link.to_table, "to_column": link.to_column,
        })

    return {"tables": tables, "foreign_keys": links}
def summary(self):
    """Print table and foreign-key counts, then either the validation issues or the generation order."""
    print(f"Tables: {len(self._tables)}")
    print(f"Foreign keys: {len(self._foreign_keys)}")

    problems = self.validate()
    if not problems:
        # A clean validation means generation_order() will not raise here.
        print(f"✅ Validation passed — generation order: {' → '.join(self.generation_order())}")
        return

    print(f"⚠️ {len(problems)} validation issue(s):")
    for problem in problems:
        print(f" - {problem}")
def launch():
    """
    Build and display the DashSynthetic notebook UI.

    The UI is a two-tab widget ("Single Table" and "Multi-Table
    Relationships") wrapped in a dashui card with a header.

    Raises:
        RuntimeError: if ipywidgets or IPython cannot be imported; the
            original ImportError is attached as ``__cause__``.
    """
    try:
        import ipywidgets as w
        from IPython.display import display
    except ImportError as exc:
        # Chain the cause so the traceback shows which import actually failed.
        raise RuntimeError("ipywidgets required. Run: %pip install ipywidgets") from exc

    import dashui

    tab = w.Tab(children=[_build_single_table_tab(w), _build_relationships_tab(w)])
    tab.set_title(0, "Single Table")
    tab.set_title(1, "Multi-Table Relationships")

    ui = dashui.card([
        dashui.header("DashSynthetic — Synthetic Data Generation",
                      library="dashsynthetic", emoji="🧬"),
        tab,
    ])
    display(ui)