yaml_shredder.schema_generator

Automatic JSON Schema generation from YAML/JSON files.

  1"""Automatic JSON Schema generation from YAML/JSON files."""
  2
  3import json
  4from datetime import date, datetime
  5from pathlib import Path
  6from typing import Any
  7
  8import yaml
  9from genson import SchemaBuilder
 10
 11
 12class SchemaGenerator:
 13    """Generate JSON Schema from multiple YAML/JSON examples."""
 14
 15    def __init__(self):
 16        """Initialize the schema generator."""
 17        self.builder = SchemaBuilder()
 18        self.files_processed = []
 19
 20    def _normalize_data(self, obj: Any) -> Any:
 21        """
 22        Normalize data by converting datetime objects to strings.
 23
 24        Args:
 25            obj: Data to normalize
 26
 27        Returns:
 28            Normalized data
 29        """
 30        if isinstance(obj, (datetime, date)):
 31            return obj.isoformat()
 32        elif isinstance(obj, dict):
 33            return {k: self._normalize_data(v) for k, v in obj.items()}
 34        elif isinstance(obj, list):
 35            return [self._normalize_data(item) for item in obj]
 36        else:
 37            return obj
 38
 39    def add_yaml_file(self, file_path: str | Path) -> None:
 40        """
 41        Add a YAML file to the schema builder.
 42
 43        Args:
 44            file_path: Path to YAML file
 45        """
 46        file_path = Path(file_path)
 47        with open(file_path) as f:
 48            data = yaml.safe_load(f)
 49
 50        normalized_data = self._normalize_data(data)
 51        self.builder.add_object(normalized_data)
 52        self.files_processed.append(str(file_path))
 53
 54    def add_json_file(self, file_path: str | Path) -> None:
 55        """
 56        Add a JSON file to the schema builder.
 57
 58        Args:
 59            file_path: Path to JSON file
 60        """
 61        file_path = Path(file_path)
 62        with open(file_path) as f:
 63            data = json.load(f)
 64
 65        normalized_data = self._normalize_data(data)
 66        self.builder.add_object(normalized_data)
 67        self.files_processed.append(str(file_path))
 68
 69    def add_object(self, obj: dict[str, Any]) -> None:
 70        """
 71        Add a Python object to the schema builder.
 72
 73        Args:
 74            obj: Dictionary object to add
 75        """
 76        normalized_data = self._normalize_data(obj)
 77        self.builder.add_object(normalized_data)
 78
 79    def generate_schema(self) -> dict[str, Any]:
 80        """
 81        Generate the JSON schema from all added examples.
 82
 83        Returns:
 84            JSON schema as dictionary
 85        """
 86        return self.builder.to_schema()
 87
 88    def save_schema(self, output_path: str | Path) -> None:
 89        """
 90        Save the generated schema to a file.
 91
 92        Args:
 93            output_path: Path where to save the schema
 94        """
 95        schema = self.generate_schema()
 96        output_path = Path(output_path)
 97
 98        with open(output_path, "w") as f:
 99            json.dump(schema, f, indent=2)
100
101    def get_stats(self) -> dict[str, Any]:
102        """
103        Get statistics about the schema generation process.
104
105        Returns:
106            Dictionary with statistics
107        """
108        schema = self.generate_schema()
109        return {
110            "files_processed": len(self.files_processed),
111            "file_list": self.files_processed,
112            "schema_properties": len(schema.get("properties", {})),
113            "required_fields": len(schema.get("required", [])),
114        }
115
116
def generate_schema_from_directory(
    directory: str | Path, pattern: str = "*.yaml", output_file: str | Path | None = None
) -> dict[str, Any]:
    """
    Generate schema from all matching files in a directory.

    The directory is searched recursively. Each matched file is loaded
    according to its own extension (``.yaml``/``.yml`` as YAML, ``.json``
    as JSON, case-insensitive), so mixed patterns such as ``*`` work;
    files with any other extension are skipped. A summary is printed to
    stdout when generation finishes.

    Args:
        directory: Directory to scan
        pattern: File pattern to match (default: *.yaml)
        output_file: Optional path to save schema

    Returns:
        Generated JSON schema

    Raises:
        ValueError: If no file under ``directory`` matches ``pattern``
    """
    directory = Path(directory)
    generator = SchemaGenerator()

    # Find all matching files. rglob can also yield directories whose name
    # matches the pattern; those cannot be opened, so keep regular files only.
    # Sorting keeps the processing order deterministic.
    files = sorted(path for path in directory.rglob(pattern) if path.is_file())

    if not files:
        raise ValueError(f"No files matching '{pattern}' found in {directory}")

    # Process each file, choosing the loader from the file's own suffix
    # rather than from the pattern text.
    for file_path in files:
        suffix = file_path.suffix.lower()
        if suffix in (".yaml", ".yml"):
            generator.add_yaml_file(file_path)
        elif suffix == ".json":
            generator.add_json_file(file_path)

    # Generate and optionally save schema
    schema = generator.generate_schema()

    if output_file:
        generator.save_schema(output_file)

    # Print statistics
    stats = generator.get_stats()
    print("Schema generation complete:")
    print(f"  Files processed: {stats['files_processed']}")
    print(f"  Properties found: {stats['schema_properties']}")
    print(f"  Required fields: {stats['required_fields']}")

    return schema
class SchemaGenerator:
 13class SchemaGenerator:
 14    """Generate JSON Schema from multiple YAML/JSON examples."""
 15
 16    def __init__(self):
 17        """Initialize the schema generator."""
 18        self.builder = SchemaBuilder()
 19        self.files_processed = []
 20
 21    def _normalize_data(self, obj: Any) -> Any:
 22        """
 23        Normalize data by converting datetime objects to strings.
 24
 25        Args:
 26            obj: Data to normalize
 27
 28        Returns:
 29            Normalized data
 30        """
 31        if isinstance(obj, (datetime, date)):
 32            return obj.isoformat()
 33        elif isinstance(obj, dict):
 34            return {k: self._normalize_data(v) for k, v in obj.items()}
 35        elif isinstance(obj, list):
 36            return [self._normalize_data(item) for item in obj]
 37        else:
 38            return obj
 39
 40    def add_yaml_file(self, file_path: str | Path) -> None:
 41        """
 42        Add a YAML file to the schema builder.
 43
 44        Args:
 45            file_path: Path to YAML file
 46        """
 47        file_path = Path(file_path)
 48        with open(file_path) as f:
 49            data = yaml.safe_load(f)
 50
 51        normalized_data = self._normalize_data(data)
 52        self.builder.add_object(normalized_data)
 53        self.files_processed.append(str(file_path))
 54
 55    def add_json_file(self, file_path: str | Path) -> None:
 56        """
 57        Add a JSON file to the schema builder.
 58
 59        Args:
 60            file_path: Path to JSON file
 61        """
 62        file_path = Path(file_path)
 63        with open(file_path) as f:
 64            data = json.load(f)
 65
 66        normalized_data = self._normalize_data(data)
 67        self.builder.add_object(normalized_data)
 68        self.files_processed.append(str(file_path))
 69
 70    def add_object(self, obj: dict[str, Any]) -> None:
 71        """
 72        Add a Python object to the schema builder.
 73
 74        Args:
 75            obj: Dictionary object to add
 76        """
 77        normalized_data = self._normalize_data(obj)
 78        self.builder.add_object(normalized_data)
 79
 80    def generate_schema(self) -> dict[str, Any]:
 81        """
 82        Generate the JSON schema from all added examples.
 83
 84        Returns:
 85            JSON schema as dictionary
 86        """
 87        return self.builder.to_schema()
 88
 89    def save_schema(self, output_path: str | Path) -> None:
 90        """
 91        Save the generated schema to a file.
 92
 93        Args:
 94            output_path: Path where to save the schema
 95        """
 96        schema = self.generate_schema()
 97        output_path = Path(output_path)
 98
 99        with open(output_path, "w") as f:
100            json.dump(schema, f, indent=2)
101
102    def get_stats(self) -> dict[str, Any]:
103        """
104        Get statistics about the schema generation process.
105
106        Returns:
107            Dictionary with statistics
108        """
109        schema = self.generate_schema()
110        return {
111            "files_processed": len(self.files_processed),
112            "file_list": self.files_processed,
113            "schema_properties": len(schema.get("properties", {})),
114            "required_fields": len(schema.get("required", [])),
115        }

Generate JSON Schema from multiple YAML/JSON examples.

SchemaGenerator()
16    def __init__(self):
17        """Initialize the schema generator."""
18        self.builder = SchemaBuilder()
19        self.files_processed = []

Initialize the schema generator.

builder
files_processed
def add_yaml_file(self, file_path: str | pathlib._local.Path) -> None:
40    def add_yaml_file(self, file_path: str | Path) -> None:
41        """
42        Add a YAML file to the schema builder.
43
44        Args:
45            file_path: Path to YAML file
46        """
47        file_path = Path(file_path)
48        with open(file_path) as f:
49            data = yaml.safe_load(f)
50
51        normalized_data = self._normalize_data(data)
52        self.builder.add_object(normalized_data)
53        self.files_processed.append(str(file_path))

Add a YAML file to the schema builder.

Arguments:
  • file_path: Path to YAML file
def add_json_file(self, file_path: str | pathlib._local.Path) -> None:
55    def add_json_file(self, file_path: str | Path) -> None:
56        """
57        Add a JSON file to the schema builder.
58
59        Args:
60            file_path: Path to JSON file
61        """
62        file_path = Path(file_path)
63        with open(file_path) as f:
64            data = json.load(f)
65
66        normalized_data = self._normalize_data(data)
67        self.builder.add_object(normalized_data)
68        self.files_processed.append(str(file_path))

Add a JSON file to the schema builder.

Arguments:
  • file_path: Path to JSON file
def add_object(self, obj: dict[str, typing.Any]) -> None:
70    def add_object(self, obj: dict[str, Any]) -> None:
71        """
72        Add a Python object to the schema builder.
73
74        Args:
75            obj: Dictionary object to add
76        """
77        normalized_data = self._normalize_data(obj)
78        self.builder.add_object(normalized_data)

Add a Python object to the schema builder.

Arguments:
  • obj: Dictionary object to add
def generate_schema(self) -> dict[str, typing.Any]:
80    def generate_schema(self) -> dict[str, Any]:
81        """
82        Generate the JSON schema from all added examples.
83
84        Returns:
85            JSON schema as dictionary
86        """
87        return self.builder.to_schema()

Generate the JSON schema from all added examples.

Returns:

JSON schema as dictionary

def save_schema(self, output_path: str | pathlib._local.Path) -> None:
 89    def save_schema(self, output_path: str | Path) -> None:
 90        """
 91        Save the generated schema to a file.
 92
 93        Args:
 94            output_path: Path where to save the schema
 95        """
 96        schema = self.generate_schema()
 97        output_path = Path(output_path)
 98
 99        with open(output_path, "w") as f:
100            json.dump(schema, f, indent=2)

Save the generated schema to a file.

Arguments:
  • output_path: Path where to save the schema
def get_stats(self) -> dict[str, typing.Any]:
102    def get_stats(self) -> dict[str, Any]:
103        """
104        Get statistics about the schema generation process.
105
106        Returns:
107            Dictionary with statistics
108        """
109        schema = self.generate_schema()
110        return {
111            "files_processed": len(self.files_processed),
112            "file_list": self.files_processed,
113            "schema_properties": len(schema.get("properties", {})),
114            "required_fields": len(schema.get("required", [])),
115        }

Get statistics about the schema generation process.

Returns:

Dictionary with statistics

def generate_schema_from_directory( directory: str | pathlib._local.Path, pattern: str = '*.yaml', output_file: str | pathlib._local.Path | None = None) -> dict[str, typing.Any]:
118def generate_schema_from_directory(
119    directory: str | Path, pattern: str = "*.yaml", output_file: str | Path | None = None
120) -> dict[str, Any]:
121    """
122    Generate schema from all matching files in a directory.
123
124    Args:
125        directory: Directory to scan
126        pattern: File pattern to match (default: *.yaml)
127        output_file: Optional path to save schema
128
129    Returns:
130        Generated JSON schema
131    """
132    directory = Path(directory)
133    generator = SchemaGenerator()
134
135    # Find all matching files
136    files = sorted(directory.rglob(pattern))
137
138    if not files:
139        raise ValueError(f"No files matching '{pattern}' found in {directory}")
140
141    # Process each file
142    for file_path in files:
143        if pattern.endswith(".yaml") or pattern.endswith(".yml"):
144            generator.add_yaml_file(file_path)
145        elif pattern.endswith(".json"):
146            generator.add_json_file(file_path)
147
148    # Generate and optionally save schema
149    schema = generator.generate_schema()
150
151    if output_file:
152        generator.save_schema(output_file)
153
154    # Print statistics
155    stats = generator.get_stats()
156    print("Schema generation complete:")
157    print(f"  Files processed: {stats['files_processed']}")
158    print(f"  Properties found: {stats['schema_properties']}")
159    print(f"  Required fields: {stats['required_fields']}")
160
161    return schema

Generate schema from all matching files in a directory.

Arguments:
  • directory: Directory to scan
  • pattern: File pattern to match (default: *.yaml)
  • output_file: Optional path to save schema
Returns:

Generated JSON schema