yaml_shredder.table_generator

Generate tabular structures from nested YAML/JSON data.

  1"""Generate tabular structures from nested YAML/JSON data."""
  2
  3from pathlib import Path
  4from typing import Any
  5
  6import pandas as pd
  7
  8
  9class TableGenerator:
 10    """Generate relational tables from nested data structures."""
 11
 12    def __init__(self):
 13        """Initialize the table generator."""
 14        self.tables = {}
 15        self.relationships = []
 16
 17    def generate_tables(self, data: dict[str, Any], root_table_name: str = "ROOT") -> dict[str, pd.DataFrame]:
 18        """
 19        Generate tables from nested data.
 20
 21        Args:
 22            data: Source data dictionary
 23            root_table_name: Name for the root table
 24
 25        Returns:
 26            Dictionary of table_name -> DataFrame
 27        """
 28        # Extract root-level scalar fields into root table
 29        root_data = {k: v for k, v in data.items() if not isinstance(v, (list, dict))}
 30        if root_data:
 31            self.tables[root_table_name] = pd.DataFrame([root_data])
 32
 33        # Process nested structures
 34        self._process_structure(data, root_table_name, {})
 35
 36        return self.tables
 37
 38    def _process_structure(self, obj: Any, parent_table: str, parent_keys: dict[str, Any], path: str = "") -> None:
 39        """
 40        Recursively process structure to extract tables.
 41
 42        Args:
 43            obj: Object to process
 44            parent_table: Name of parent table
 45            parent_keys: Keys from parent for foreign key relationships
 46            path: Current path in structure
 47        """
 48        if isinstance(obj, dict):
 49            for key, value in obj.items():
 50                current_path = f"{path}.{key}" if path else key
 51
 52                if isinstance(value, list) and value and isinstance(value[0], dict):
 53                    # Array of objects -> create table
 54                    table_name = self._path_to_table_name(current_path)
 55                    self._create_table_from_array(value, table_name, parent_table, parent_keys)
 56                elif isinstance(value, dict):
 57                    # Nested object -> continue traversal
 58                    self._process_structure(value, parent_table, parent_keys, current_path)
 59
 60    def _create_table_from_array(
 61        self, array: list[dict[str, Any]], table_name: str, parent_table: str, parent_keys: dict[str, Any]
 62    ) -> None:
 63        """
 64        Create a table from an array of objects.
 65
 66        Args:
 67            array: Array of objects
 68            table_name: Name for the table
 69            parent_table: Parent table name
 70            parent_keys: Parent keys for relationships
 71        """
 72        # Flatten objects and add parent foreign keys
 73        rows = []
 74        for i, item in enumerate(array):
 75            row = self._flatten_dict(item)
 76
 77            # Add parent foreign keys
 78            for parent_key, parent_value in parent_keys.items():
 79                row[f"parent_{parent_key}"] = parent_value
 80
 81            # Add row index for ordering
 82            row["_row_index"] = i
 83
 84            rows.append(row)
 85
 86        # Create DataFrame
 87        df = pd.DataFrame(rows)
 88
 89        # Store table
 90        self.tables[table_name] = df
 91
 92        # Record relationship
 93        if parent_keys:
 94            self.relationships.append(
 95                {"parent_table": parent_table, "child_table": table_name, "foreign_keys": list(parent_keys.keys())}
 96            )
 97
 98        # Process nested arrays within this array
 99        for i, item in enumerate(array):  # noqa: B007
100            item_keys = {**parent_keys}
101            # Add identifying keys from this level
102            for key in ["id", "name", "code"]:
103                if key in item:
104                    item_keys[key] = item[key]
105                    break
106
107            for key, value in item.items():
108                if isinstance(value, list) and value and isinstance(value[0], dict):
109                    nested_table_name = f"{table_name}_{key}"
110                    self._create_table_from_array(value, nested_table_name, table_name, item_keys)
111
112    def _flatten_dict(self, d: dict[str, Any], parent_key: str = "", sep: str = "_") -> dict[str, Any]:
113        """
114        Flatten nested dictionary, keeping only scalar values.
115
116        Args:
117            d: Dictionary to flatten
118            parent_key: Parent key for nested items
119            sep: Separator for nested keys
120
121        Returns:
122            Flattened dictionary
123        """
124        items = []
125
126        for k, v in d.items():
127            new_key = f"{parent_key}{sep}{k}" if parent_key else k
128
129            if isinstance(v, dict):
130                # Recursively flatten nested dict
131                items.extend(self._flatten_dict(v, new_key, sep=sep).items())
132            elif isinstance(v, list):
133                # For lists, convert to JSON string or skip
134                if v and not isinstance(v[0], dict):
135                    # Simple list - join as string
136                    items.append((new_key, ", ".join(map(str, v))))
137                # Skip lists of objects (they become separate tables)
138            else:
139                # Scalar value
140                items.append((new_key, v))
141
142        return dict(items)
143
144    def _path_to_table_name(self, path: str) -> str:
145        """
146        Convert path to table name.
147
148        Args:
149            path: Path like "actions" or "warehouse.settings"
150
151        Returns:
152            Table name
153        """
154        parts = path.replace(".", "_").split("_")
155        return "_".join(parts).upper()
156
157    def save_tables(self, output_dir: str | Path, format: str = "csv") -> None:
158        """
159        Save all tables to files.
160
161        Args:
162            output_dir: Directory to save tables
163            format: Output format ('csv', 'parquet', 'excel')
164        """
165        output_dir = Path(output_dir)
166        output_dir.mkdir(parents=True, exist_ok=True)
167
168        for table_name, df in self.tables.items():
169            if format == "csv":
170                filepath = output_dir / f"{table_name}.csv"
171                df.to_csv(filepath, index=False)
172            elif format == "parquet":
173                filepath = output_dir / f"{table_name}.parquet"
174                df.to_parquet(filepath, index=False)
175            elif format == "excel":
176                filepath = output_dir / f"{table_name}.xlsx"
177                df.to_excel(filepath, index=False)
178
179            print(f"Saved {table_name}: {len(df)} rows, {len(df.columns)} columns -> {filepath}")
180
181    def print_summary(self) -> None:
182        """Print summary of generated tables."""
183        print(f"\n{'=' * 60}")
184        print("GENERATED TABLES SUMMARY")
185        print(f"{'=' * 60}\n")
186
187        print(f"Total tables: {len(self.tables)}\n")
188
189        for table_name, df in self.tables.items():
190            print(f"Table: {table_name}")
191            print(f"  Rows: {len(df)}")
192            print(f"  Columns: {len(df.columns)}")
193            print(f"  Column names: {', '.join(df.columns[:5])}")
194            if len(df.columns) > 5:
195                print(f"    ... and {len(df.columns) - 5} more")
196            print()
197
198        if self.relationships:
199            print(f"{'-' * 60}")
200            print("RELATIONSHIPS:")
201            print(f"{'-' * 60}\n")
202            for rel in self.relationships:
203                print(f"{rel['parent_table']} -> {rel['child_table']}")
204                print(f"  Foreign keys: {', '.join(rel['foreign_keys'])}\n")
class TableGenerator:
 10class TableGenerator:
 11    """Generate relational tables from nested data structures."""
 12
 13    def __init__(self):
 14        """Initialize the table generator."""
 15        self.tables = {}
 16        self.relationships = []
 17
 18    def generate_tables(self, data: dict[str, Any], root_table_name: str = "ROOT") -> dict[str, pd.DataFrame]:
 19        """
 20        Generate tables from nested data.
 21
 22        Args:
 23            data: Source data dictionary
 24            root_table_name: Name for the root table
 25
 26        Returns:
 27            Dictionary of table_name -> DataFrame
 28        """
 29        # Extract root-level scalar fields into root table
 30        root_data = {k: v for k, v in data.items() if not isinstance(v, (list, dict))}
 31        if root_data:
 32            self.tables[root_table_name] = pd.DataFrame([root_data])
 33
 34        # Process nested structures
 35        self._process_structure(data, root_table_name, {})
 36
 37        return self.tables
 38
 39    def _process_structure(self, obj: Any, parent_table: str, parent_keys: dict[str, Any], path: str = "") -> None:
 40        """
 41        Recursively process structure to extract tables.
 42
 43        Args:
 44            obj: Object to process
 45            parent_table: Name of parent table
 46            parent_keys: Keys from parent for foreign key relationships
 47            path: Current path in structure
 48        """
 49        if isinstance(obj, dict):
 50            for key, value in obj.items():
 51                current_path = f"{path}.{key}" if path else key
 52
 53                if isinstance(value, list) and value and isinstance(value[0], dict):
 54                    # Array of objects -> create table
 55                    table_name = self._path_to_table_name(current_path)
 56                    self._create_table_from_array(value, table_name, parent_table, parent_keys)
 57                elif isinstance(value, dict):
 58                    # Nested object -> continue traversal
 59                    self._process_structure(value, parent_table, parent_keys, current_path)
 60
 61    def _create_table_from_array(
 62        self, array: list[dict[str, Any]], table_name: str, parent_table: str, parent_keys: dict[str, Any]
 63    ) -> None:
 64        """
 65        Create a table from an array of objects.
 66
 67        Args:
 68            array: Array of objects
 69            table_name: Name for the table
 70            parent_table: Parent table name
 71            parent_keys: Parent keys for relationships
 72        """
 73        # Flatten objects and add parent foreign keys
 74        rows = []
 75        for i, item in enumerate(array):
 76            row = self._flatten_dict(item)
 77
 78            # Add parent foreign keys
 79            for parent_key, parent_value in parent_keys.items():
 80                row[f"parent_{parent_key}"] = parent_value
 81
 82            # Add row index for ordering
 83            row["_row_index"] = i
 84
 85            rows.append(row)
 86
 87        # Create DataFrame
 88        df = pd.DataFrame(rows)
 89
 90        # Store table
 91        self.tables[table_name] = df
 92
 93        # Record relationship
 94        if parent_keys:
 95            self.relationships.append(
 96                {"parent_table": parent_table, "child_table": table_name, "foreign_keys": list(parent_keys.keys())}
 97            )
 98
 99        # Process nested arrays within this array
100        for i, item in enumerate(array):  # noqa: B007
101            item_keys = {**parent_keys}
102            # Add identifying keys from this level
103            for key in ["id", "name", "code"]:
104                if key in item:
105                    item_keys[key] = item[key]
106                    break
107
108            for key, value in item.items():
109                if isinstance(value, list) and value and isinstance(value[0], dict):
110                    nested_table_name = f"{table_name}_{key}"
111                    self._create_table_from_array(value, nested_table_name, table_name, item_keys)
112
113    def _flatten_dict(self, d: dict[str, Any], parent_key: str = "", sep: str = "_") -> dict[str, Any]:
114        """
115        Flatten nested dictionary, keeping only scalar values.
116
117        Args:
118            d: Dictionary to flatten
119            parent_key: Parent key for nested items
120            sep: Separator for nested keys
121
122        Returns:
123            Flattened dictionary
124        """
125        items = []
126
127        for k, v in d.items():
128            new_key = f"{parent_key}{sep}{k}" if parent_key else k
129
130            if isinstance(v, dict):
131                # Recursively flatten nested dict
132                items.extend(self._flatten_dict(v, new_key, sep=sep).items())
133            elif isinstance(v, list):
134                # For lists, convert to JSON string or skip
135                if v and not isinstance(v[0], dict):
136                    # Simple list - join as string
137                    items.append((new_key, ", ".join(map(str, v))))
138                # Skip lists of objects (they become separate tables)
139            else:
140                # Scalar value
141                items.append((new_key, v))
142
143        return dict(items)
144
145    def _path_to_table_name(self, path: str) -> str:
146        """
147        Convert path to table name.
148
149        Args:
150            path: Path like "actions" or "warehouse.settings"
151
152        Returns:
153            Table name
154        """
155        parts = path.replace(".", "_").split("_")
156        return "_".join(parts).upper()
157
158    def save_tables(self, output_dir: str | Path, format: str = "csv") -> None:
159        """
160        Save all tables to files.
161
162        Args:
163            output_dir: Directory to save tables
164            format: Output format ('csv', 'parquet', 'excel')
165        """
166        output_dir = Path(output_dir)
167        output_dir.mkdir(parents=True, exist_ok=True)
168
169        for table_name, df in self.tables.items():
170            if format == "csv":
171                filepath = output_dir / f"{table_name}.csv"
172                df.to_csv(filepath, index=False)
173            elif format == "parquet":
174                filepath = output_dir / f"{table_name}.parquet"
175                df.to_parquet(filepath, index=False)
176            elif format == "excel":
177                filepath = output_dir / f"{table_name}.xlsx"
178                df.to_excel(filepath, index=False)
179
180            print(f"Saved {table_name}: {len(df)} rows, {len(df.columns)} columns -> {filepath}")
181
182    def print_summary(self) -> None:
183        """Print summary of generated tables."""
184        print(f"\n{'=' * 60}")
185        print("GENERATED TABLES SUMMARY")
186        print(f"{'=' * 60}\n")
187
188        print(f"Total tables: {len(self.tables)}\n")
189
190        for table_name, df in self.tables.items():
191            print(f"Table: {table_name}")
192            print(f"  Rows: {len(df)}")
193            print(f"  Columns: {len(df.columns)}")
194            print(f"  Column names: {', '.join(df.columns[:5])}")
195            if len(df.columns) > 5:
196                print(f"    ... and {len(df.columns) - 5} more")
197            print()
198
199        if self.relationships:
200            print(f"{'-' * 60}")
201            print("RELATIONSHIPS:")
202            print(f"{'-' * 60}\n")
203            for rel in self.relationships:
204                print(f"{rel['parent_table']} -> {rel['child_table']}")
205                print(f"  Foreign keys: {', '.join(rel['foreign_keys'])}\n")

Generate relational tables from nested data structures.

TableGenerator()
13    def __init__(self):
14        """Initialize the table generator."""
15        self.tables = {}
16        self.relationships = []

Initialize the table generator.

tables
relationships
def generate_tables( self, data: dict[str, typing.Any], root_table_name: str = 'ROOT') -> dict[str, pandas.core.frame.DataFrame]:
18    def generate_tables(self, data: dict[str, Any], root_table_name: str = "ROOT") -> dict[str, pd.DataFrame]:
19        """
20        Generate tables from nested data.
21
22        Args:
23            data: Source data dictionary
24            root_table_name: Name for the root table
25
26        Returns:
27            Dictionary of table_name -> DataFrame
28        """
29        # Extract root-level scalar fields into root table
30        root_data = {k: v for k, v in data.items() if not isinstance(v, (list, dict))}
31        if root_data:
32            self.tables[root_table_name] = pd.DataFrame([root_data])
33
34        # Process nested structures
35        self._process_structure(data, root_table_name, {})
36
37        return self.tables

Generate tables from nested data.

Arguments:
  • data: Source data dictionary
  • root_table_name: Name for the root table
Returns:

Dictionary of table_name -> DataFrame

def save_tables(self, output_dir: str | pathlib._local.Path, format: str = 'csv') -> None:
158    def save_tables(self, output_dir: str | Path, format: str = "csv") -> None:
159        """
160        Save all tables to files.
161
162        Args:
163            output_dir: Directory to save tables
164            format: Output format ('csv', 'parquet', 'excel')
165        """
166        output_dir = Path(output_dir)
167        output_dir.mkdir(parents=True, exist_ok=True)
168
169        for table_name, df in self.tables.items():
170            if format == "csv":
171                filepath = output_dir / f"{table_name}.csv"
172                df.to_csv(filepath, index=False)
173            elif format == "parquet":
174                filepath = output_dir / f"{table_name}.parquet"
175                df.to_parquet(filepath, index=False)
176            elif format == "excel":
177                filepath = output_dir / f"{table_name}.xlsx"
178                df.to_excel(filepath, index=False)
179
180            print(f"Saved {table_name}: {len(df)} rows, {len(df.columns)} columns -> {filepath}")

Save all tables to files.

Arguments:
  • output_dir: Directory to save tables
  • format: Output format ('csv', 'parquet', 'excel')
def print_summary(self) -> None:
182    def print_summary(self) -> None:
183        """Print summary of generated tables."""
184        print(f"\n{'=' * 60}")
185        print("GENERATED TABLES SUMMARY")
186        print(f"{'=' * 60}\n")
187
188        print(f"Total tables: {len(self.tables)}\n")
189
190        for table_name, df in self.tables.items():
191            print(f"Table: {table_name}")
192            print(f"  Rows: {len(df)}")
193            print(f"  Columns: {len(df.columns)}")
194            print(f"  Column names: {', '.join(df.columns[:5])}")
195            if len(df.columns) > 5:
196                print(f"    ... and {len(df.columns) - 5} more")
197            print()
198
199        if self.relationships:
200            print(f"{'-' * 60}")
201            print("RELATIONSHIPS:")
202            print(f"{'-' * 60}\n")
203            for rel in self.relationships:
204                print(f"{rel['parent_table']} -> {rel['child_table']}")
205                print(f"  Foreign keys: {', '.join(rel['foreign_keys'])}\n")

Print summary of generated tables.