yaml_shredder.table_generator
Generate tabular structures from nested YAML/JSON data.
1"""Generate tabular structures from nested YAML/JSON data.""" 2 3from pathlib import Path 4from typing import Any 5 6import pandas as pd 7 8 9class TableGenerator: 10 """Generate relational tables from nested data structures.""" 11 12 def __init__(self): 13 """Initialize the table generator.""" 14 self.tables = {} 15 self.relationships = [] 16 17 def generate_tables(self, data: dict[str, Any], root_table_name: str = "ROOT") -> dict[str, pd.DataFrame]: 18 """ 19 Generate tables from nested data. 20 21 Args: 22 data: Source data dictionary 23 root_table_name: Name for the root table 24 25 Returns: 26 Dictionary of table_name -> DataFrame 27 """ 28 # Extract root-level scalar fields into root table 29 root_data = {k: v for k, v in data.items() if not isinstance(v, (list, dict))} 30 if root_data: 31 self.tables[root_table_name] = pd.DataFrame([root_data]) 32 33 # Process nested structures 34 self._process_structure(data, root_table_name, {}) 35 36 return self.tables 37 38 def _process_structure(self, obj: Any, parent_table: str, parent_keys: dict[str, Any], path: str = "") -> None: 39 """ 40 Recursively process structure to extract tables. 41 42 Args: 43 obj: Object to process 44 parent_table: Name of parent table 45 parent_keys: Keys from parent for foreign key relationships 46 path: Current path in structure 47 """ 48 if isinstance(obj, dict): 49 for key, value in obj.items(): 50 current_path = f"{path}.{key}" if path else key 51 52 if isinstance(value, list) and value and isinstance(value[0], dict): 53 # Array of objects -> create table 54 table_name = self._path_to_table_name(current_path) 55 self._create_table_from_array(value, table_name, parent_table, parent_keys) 56 elif isinstance(value, dict): 57 # Nested object -> continue traversal 58 self._process_structure(value, parent_table, parent_keys, current_path) 59 60 def _create_table_from_array( 61 self, array: list[dict[str, Any]], table_name: str, parent_table: str, parent_keys: dict[str, Any] 62 ) -> None: 63 """ 64 Create a table from an array of objects. 65 66 Args: 67 array: Array of objects 68 table_name: Name for the table 69 parent_table: Parent table name 70 parent_keys: Parent keys for relationships 71 """ 72 # Flatten objects and add parent foreign keys 73 rows = [] 74 for i, item in enumerate(array): 75 row = self._flatten_dict(item) 76 77 # Add parent foreign keys 78 for parent_key, parent_value in parent_keys.items(): 79 row[f"parent_{parent_key}"] = parent_value 80 81 # Add row index for ordering 82 row["_row_index"] = i 83 84 rows.append(row) 85 86 # Create DataFrame 87 df = pd.DataFrame(rows) 88 89 # Store table 90 self.tables[table_name] = df 91 92 # Record relationship 93 if parent_keys: 94 self.relationships.append( 95 {"parent_table": parent_table, "child_table": table_name, "foreign_keys": list(parent_keys.keys())} 96 ) 97 98 # Process nested arrays within this array 99 for i, item in enumerate(array): # noqa: B007 100 item_keys = {**parent_keys} 101 # Add identifying keys from this level 102 for key in ["id", "name", "code"]: 103 if key in item: 104 item_keys[key] = item[key] 105 break 106 107 for key, value in item.items(): 108 if isinstance(value, list) and value and isinstance(value[0], dict): 109 nested_table_name = f"{table_name}_{key}" 110 self._create_table_from_array(value, nested_table_name, table_name, item_keys) 111 112 def _flatten_dict(self, d: dict[str, Any], parent_key: str = "", sep: str = "_") -> dict[str, Any]: 113 """ 114 Flatten nested dictionary, keeping only scalar values. 115 116 Args: 117 d: Dictionary to flatten 118 parent_key: Parent key for nested items 119 sep: Separator for nested keys 120 121 Returns: 122 Flattened dictionary 123 """ 124 items = [] 125 126 for k, v in d.items(): 127 new_key = f"{parent_key}{sep}{k}" if parent_key else k 128 129 if isinstance(v, dict): 130 # Recursively flatten nested dict 131 items.extend(self._flatten_dict(v, new_key, sep=sep).items()) 132 elif isinstance(v, list): 133 # For lists, convert to JSON string or skip 134 if v and not isinstance(v[0], dict): 135 # Simple list - join as string 136 items.append((new_key, ", ".join(map(str, v)))) 137 # Skip lists of objects (they become separate tables) 138 else: 139 # Scalar value 140 items.append((new_key, v)) 141 142 return dict(items) 143 144 def _path_to_table_name(self, path: str) -> str: 145 """ 146 Convert path to table name. 147 148 Args: 149 path: Path like "actions" or "warehouse.settings" 150 151 Returns: 152 Table name 153 """ 154 parts = path.replace(".", "_").split("_") 155 return "_".join(parts).upper() 156 157 def save_tables(self, output_dir: str | Path, format: str = "csv") -> None: 158 """ 159 Save all tables to files. 160 161 Args: 162 output_dir: Directory to save tables 163 format: Output format ('csv', 'parquet', 'excel') 164 """ 165 output_dir = Path(output_dir) 166 output_dir.mkdir(parents=True, exist_ok=True) 167 168 for table_name, df in self.tables.items(): 169 if format == "csv": 170 filepath = output_dir / f"{table_name}.csv" 171 df.to_csv(filepath, index=False) 172 elif format == "parquet": 173 filepath = output_dir / f"{table_name}.parquet" 174 df.to_parquet(filepath, index=False) 175 elif format == "excel": 176 filepath = output_dir / f"{table_name}.xlsx" 177 df.to_excel(filepath, index=False) 178 179 print(f"Saved {table_name}: {len(df)} rows, {len(df.columns)} columns -> {filepath}") 180 181 def print_summary(self) -> None: 182 """Print summary of generated tables.""" 183 print(f"\n{'=' * 60}") 184 print("GENERATED TABLES SUMMARY") 185 print(f"{'=' * 60}\n") 186 187 print(f"Total tables: {len(self.tables)}\n") 188 189 for table_name, df in self.tables.items(): 190 print(f"Table: {table_name}") 191 print(f" Rows: {len(df)}") 192 print(f" Columns: {len(df.columns)}") 193 print(f" Column names: {', '.join(df.columns[:5])}") 194 if len(df.columns) > 5: 195 print(f" ... and {len(df.columns) - 5} more") 196 print() 197 198 if self.relationships: 199 print(f"{'-' * 60}") 200 print("RELATIONSHIPS:") 201 print(f"{'-' * 60}\n") 202 for rel in self.relationships: 203 print(f"{rel['parent_table']} -> {rel['child_table']}") 204 print(f" Foreign keys: {', '.join(rel['foreign_keys'])}\n")
class
TableGenerator:
10class TableGenerator: 11 """Generate relational tables from nested data structures.""" 12 13 def __init__(self): 14 """Initialize the table generator.""" 15 self.tables = {} 16 self.relationships = [] 17 18 def generate_tables(self, data: dict[str, Any], root_table_name: str = "ROOT") -> dict[str, pd.DataFrame]: 19 """ 20 Generate tables from nested data. 21 22 Args: 23 data: Source data dictionary 24 root_table_name: Name for the root table 25 26 Returns: 27 Dictionary of table_name -> DataFrame 28 """ 29 # Extract root-level scalar fields into root table 30 root_data = {k: v for k, v in data.items() if not isinstance(v, (list, dict))} 31 if root_data: 32 self.tables[root_table_name] = pd.DataFrame([root_data]) 33 34 # Process nested structures 35 self._process_structure(data, root_table_name, {}) 36 37 return self.tables 38 39 def _process_structure(self, obj: Any, parent_table: str, parent_keys: dict[str, Any], path: str = "") -> None: 40 """ 41 Recursively process structure to extract tables. 42 43 Args: 44 obj: Object to process 45 parent_table: Name of parent table 46 parent_keys: Keys from parent for foreign key relationships 47 path: Current path in structure 48 """ 49 if isinstance(obj, dict): 50 for key, value in obj.items(): 51 current_path = f"{path}.{key}" if path else key 52 53 if isinstance(value, list) and value and isinstance(value[0], dict): 54 # Array of objects -> create table 55 table_name = self._path_to_table_name(current_path) 56 self._create_table_from_array(value, table_name, parent_table, parent_keys) 57 elif isinstance(value, dict): 58 # Nested object -> continue traversal 59 self._process_structure(value, parent_table, parent_keys, current_path) 60 61 def _create_table_from_array( 62 self, array: list[dict[str, Any]], table_name: str, parent_table: str, parent_keys: dict[str, Any] 63 ) -> None: 64 """ 65 Create a table from an array of objects. 66 67 Args: 68 array: Array of objects 69 table_name: Name for the table 70 parent_table: Parent table name 71 parent_keys: Parent keys for relationships 72 """ 73 # Flatten objects and add parent foreign keys 74 rows = [] 75 for i, item in enumerate(array): 76 row = self._flatten_dict(item) 77 78 # Add parent foreign keys 79 for parent_key, parent_value in parent_keys.items(): 80 row[f"parent_{parent_key}"] = parent_value 81 82 # Add row index for ordering 83 row["_row_index"] = i 84 85 rows.append(row) 86 87 # Create DataFrame 88 df = pd.DataFrame(rows) 89 90 # Store table 91 self.tables[table_name] = df 92 93 # Record relationship 94 if parent_keys: 95 self.relationships.append( 96 {"parent_table": parent_table, "child_table": table_name, "foreign_keys": list(parent_keys.keys())} 97 ) 98 99 # Process nested arrays within this array 100 for i, item in enumerate(array): # noqa: B007 101 item_keys = {**parent_keys} 102 # Add identifying keys from this level 103 for key in ["id", "name", "code"]: 104 if key in item: 105 item_keys[key] = item[key] 106 break 107 108 for key, value in item.items(): 109 if isinstance(value, list) and value and isinstance(value[0], dict): 110 nested_table_name = f"{table_name}_{key}" 111 self._create_table_from_array(value, nested_table_name, table_name, item_keys) 112 113 def _flatten_dict(self, d: dict[str, Any], parent_key: str = "", sep: str = "_") -> dict[str, Any]: 114 """ 115 Flatten nested dictionary, keeping only scalar values. 116 117 Args: 118 d: Dictionary to flatten 119 parent_key: Parent key for nested items 120 sep: Separator for nested keys 121 122 Returns: 123 Flattened dictionary 124 """ 125 items = [] 126 127 for k, v in d.items(): 128 new_key = f"{parent_key}{sep}{k}" if parent_key else k 129 130 if isinstance(v, dict): 131 # Recursively flatten nested dict 132 items.extend(self._flatten_dict(v, new_key, sep=sep).items()) 133 elif isinstance(v, list): 134 # For lists, convert to JSON string or skip 135 if v and not isinstance(v[0], dict): 136 # Simple list - join as string 137 items.append((new_key, ", ".join(map(str, v)))) 138 # Skip lists of objects (they become separate tables) 139 else: 140 # Scalar value 141 items.append((new_key, v)) 142 143 return dict(items) 144 145 def _path_to_table_name(self, path: str) -> str: 146 """ 147 Convert path to table name. 148 149 Args: 150 path: Path like "actions" or "warehouse.settings" 151 152 Returns: 153 Table name 154 """ 155 parts = path.replace(".", "_").split("_") 156 return "_".join(parts).upper() 157 158 def save_tables(self, output_dir: str | Path, format: str = "csv") -> None: 159 """ 160 Save all tables to files. 161 162 Args: 163 output_dir: Directory to save tables 164 format: Output format ('csv', 'parquet', 'excel') 165 """ 166 output_dir = Path(output_dir) 167 output_dir.mkdir(parents=True, exist_ok=True) 168 169 for table_name, df in self.tables.items(): 170 if format == "csv": 171 filepath = output_dir / f"{table_name}.csv" 172 df.to_csv(filepath, index=False) 173 elif format == "parquet": 174 filepath = output_dir / f"{table_name}.parquet" 175 df.to_parquet(filepath, index=False) 176 elif format == "excel": 177 filepath = output_dir / f"{table_name}.xlsx" 178 df.to_excel(filepath, index=False) 179 180 print(f"Saved {table_name}: {len(df)} rows, {len(df.columns)} columns -> {filepath}") 181 182 def print_summary(self) -> None: 183 """Print summary of generated tables.""" 184 print(f"\n{'=' * 60}") 185 print("GENERATED TABLES SUMMARY") 186 print(f"{'=' * 60}\n") 187 188 print(f"Total tables: {len(self.tables)}\n") 189 190 for table_name, df in self.tables.items(): 191 print(f"Table: {table_name}") 192 print(f" Rows: {len(df)}") 193 print(f" Columns: {len(df.columns)}") 194 print(f" Column names: {', '.join(df.columns[:5])}") 195 if len(df.columns) > 5: 196 print(f" ... and {len(df.columns) - 5} more") 197 print() 198 199 if self.relationships: 200 print(f"{'-' * 60}") 201 print("RELATIONSHIPS:") 202 print(f"{'-' * 60}\n") 203 for rel in self.relationships: 204 print(f"{rel['parent_table']} -> {rel['child_table']}") 205 print(f" Foreign keys: {', '.join(rel['foreign_keys'])}\n")
Generate relational tables from nested data structures.
TableGenerator()
13 def __init__(self): 14 """Initialize the table generator.""" 15 self.tables = {} 16 self.relationships = []
Initialize the table generator.
def
generate_tables( self, data: dict[str, typing.Any], root_table_name: str = 'ROOT') -> dict[str, pandas.core.frame.DataFrame]:
18 def generate_tables(self, data: dict[str, Any], root_table_name: str = "ROOT") -> dict[str, pd.DataFrame]: 19 """ 20 Generate tables from nested data. 21 22 Args: 23 data: Source data dictionary 24 root_table_name: Name for the root table 25 26 Returns: 27 Dictionary of table_name -> DataFrame 28 """ 29 # Extract root-level scalar fields into root table 30 root_data = {k: v for k, v in data.items() if not isinstance(v, (list, dict))} 31 if root_data: 32 self.tables[root_table_name] = pd.DataFrame([root_data]) 33 34 # Process nested structures 35 self._process_structure(data, root_table_name, {}) 36 37 return self.tables
Generate tables from nested data.
Arguments:
- data: Source data dictionary
- root_table_name: Name for the root table
Returns:
Dictionary of table_name -> DataFrame
def
save_tables(self, output_dir: str | pathlib._local.Path, format: str = 'csv') -> None:
158 def save_tables(self, output_dir: str | Path, format: str = "csv") -> None: 159 """ 160 Save all tables to files. 161 162 Args: 163 output_dir: Directory to save tables 164 format: Output format ('csv', 'parquet', 'excel') 165 """ 166 output_dir = Path(output_dir) 167 output_dir.mkdir(parents=True, exist_ok=True) 168 169 for table_name, df in self.tables.items(): 170 if format == "csv": 171 filepath = output_dir / f"{table_name}.csv" 172 df.to_csv(filepath, index=False) 173 elif format == "parquet": 174 filepath = output_dir / f"{table_name}.parquet" 175 df.to_parquet(filepath, index=False) 176 elif format == "excel": 177 filepath = output_dir / f"{table_name}.xlsx" 178 df.to_excel(filepath, index=False) 179 180 print(f"Saved {table_name}: {len(df)} rows, {len(df.columns)} columns -> {filepath}")
Save all tables to files.
Arguments:
- output_dir: Directory to save tables
- format: Output format ('csv', 'parquet', 'excel')
def
print_summary(self) -> None:
182 def print_summary(self) -> None: 183 """Print summary of generated tables.""" 184 print(f"\n{'=' * 60}") 185 print("GENERATED TABLES SUMMARY") 186 print(f"{'=' * 60}\n") 187 188 print(f"Total tables: {len(self.tables)}\n") 189 190 for table_name, df in self.tables.items(): 191 print(f"Table: {table_name}") 192 print(f" Rows: {len(df)}") 193 print(f" Columns: {len(df.columns)}") 194 print(f" Column names: {', '.join(df.columns[:5])}") 195 if len(df.columns) > 5: 196 print(f" ... and {len(df.columns) - 5} more") 197 print() 198 199 if self.relationships: 200 print(f"{'-' * 60}") 201 print("RELATIONSHIPS:") 202 print(f"{'-' * 60}\n") 203 for rel in self.relationships: 204 print(f"{rel['parent_table']} -> {rel['child_table']}") 205 print(f" Foreign keys: {', '.join(rel['foreign_keys'])}\n")
Print summary of generated tables.