Coverage for src/dataknobs_data/schema.py: 35%
91 statements
« prev ^ index » next coverage.py v7.11.3, created at 2025-11-13 11:23 -0700
« prev ^ index » next coverage.py v7.11.3, created at 2025-11-13 11:23 -0700
1"""Database schema definitions for field structures."""
3from __future__ import annotations
5from dataclasses import dataclass, field
6from typing import Any
8from .fields import FieldType
@dataclass
class FieldSchema:
    """Schema definition for a field without actual data.

    Defines the structure and constraints for a field in a database schema.
    Used for validation, type checking, and backend schema generation.

    Attributes:
        name: Field name
        type: Field data type
        metadata: Additional field metadata
        required: Whether the field is required
        default: Default value if field is missing

    Example:
        ```python
        from dataknobs_data.schema import FieldSchema
        from dataknobs_data.fields import FieldType

        # Simple field schema
        name_schema = FieldSchema(name="name", type=FieldType.STRING, required=True)

        # Vector field schema with metadata
        embedding_schema = FieldSchema(
            name="embedding",
            type=FieldType.VECTOR,
            metadata={"dimensions": 384, "source_field": "content"},
            required=False
        )

        # Check if vector field
        is_vector = embedding_schema.is_vector_field()  # True
        dims = embedding_schema.get_dimensions()  # 384
        ```
    """

    name: str
    type: FieldType
    metadata: dict[str, Any] = field(default_factory=dict)
    required: bool = False
    default: Any = None

    def is_vector_field(self) -> bool:
        """Check if this is a vector field.

        Returns:
            True if the field type is VECTOR or SPARSE_VECTOR

        Example:
            ```python
            vector_schema = FieldSchema(name="embedding", type=FieldType.VECTOR)
            print(vector_schema.is_vector_field())  # True

            text_schema = FieldSchema(name="content", type=FieldType.TEXT)
            print(text_schema.is_vector_field())  # False
            ```
        """
        return self.type in (FieldType.VECTOR, FieldType.SPARSE_VECTOR)

    def get_dimensions(self) -> int | None:
        """Get vector dimensions if this is a vector field.

        Returns:
            The ``"dimensions"`` metadata entry for a vector field, or None
            when the entry is absent or the field is not a vector field.
        """
        if self.is_vector_field():
            return self.metadata.get("dimensions")
        return None

    def get_source_field(self) -> str | None:
        """Get source field if this is a derived vector field.

        Returns:
            The ``"source_field"`` metadata entry for a vector field, or None
            when the entry is absent or the field is not a vector field.
        """
        if self.is_vector_field():
            return self.metadata.get("source_field")
        return None

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation.

        Returns:
            A dict with the keys ``name``, ``type`` (the field type's
            ``value``), ``metadata``, ``required`` and ``default``. The
            ``metadata`` entry is a shallow copy, so editing the returned
            dict does not modify this schema.
        """
        return {
            "name": self.name,
            "type": self.type.value,
            # Copy: handing out the live dict would let callers mutate the schema
            "metadata": dict(self.metadata),
            "required": self.required,
            "default": self.default,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> FieldSchema:
        """Create from dictionary representation.

        Args:
            data: Mapping with required keys ``name`` and ``type`` and
                optional keys ``metadata``, ``required`` and ``default``.

        Returns:
            A new FieldSchema. Its metadata is a shallow copy of
            ``data["metadata"]``; a missing or None value becomes ``{}``.

        Raises:
            KeyError: If ``name`` or ``type`` is missing from ``data``.
        """
        return cls(
            name=data["name"],
            type=FieldType(data["type"]),
            # Copy (and normalise None) so the schema never aliases caller data
            metadata=dict(data.get("metadata") or {}),
            required=data.get("required", False),
            default=data.get("default"),
        )
@dataclass
class DatabaseSchema:
    """Schema definition for a database.

    Defines the structure of a database by specifying field schemas. Used for
    validation, type checking, and ensuring consistent data structure across records.

    Attributes:
        fields: Dictionary mapping field names to FieldSchema objects
        metadata: Optional schema-level metadata

    Example:
        ```python
        from dataknobs_data.schema import DatabaseSchema, FieldSchema
        from dataknobs_data.fields import FieldType

        # Create schema using .create() method
        schema = DatabaseSchema.create(
            name=FieldType.STRING,
            age=FieldType.INTEGER,
            email=FieldType.STRING
        )

        # With vector field and metadata
        schema = DatabaseSchema.create(
            content=FieldType.TEXT,
            embedding=(FieldType.VECTOR, {
                "dimensions": 384,
                "source_field": "content"
            })
        )

        # Add fields after creation
        schema.add_field(FieldSchema(
            name="created_at",
            type=FieldType.DATETIME,
            required=True
        ))

        # Look up field schemas
        content_schema = schema.fields["content"]
        all_field_names = list(schema.fields)
        ```
    """

    fields: dict[str, FieldSchema] = field(default_factory=dict)
    metadata: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def create(cls, **field_definitions: Any) -> DatabaseSchema:
        """Create a schema from keyword arguments.

        Args:
            **field_definitions: Field definitions where each key is a field name and
                each value is either a FieldType or a tuple of (FieldType, options_dict).
                Recognised option keys are ``metadata``, ``dimensions``,
                ``source_field``, ``required`` and ``default``. ``dimensions`` and
                ``source_field`` are stored in the field metadata and override
                same-named entries of ``metadata``.

        Returns:
            A new DatabaseSchema instance

        Raises:
            ValueError: If a definition is neither a FieldType nor a 2-tuple.

        Example:
            ```python
            # Simple field types
            schema = DatabaseSchema.create(
                name=FieldType.STRING,
                age=FieldType.INTEGER
            )

            # With field options
            schema = DatabaseSchema.create(
                content=FieldType.TEXT,
                embedding=(FieldType.VECTOR, {"dimensions": 384, "source_field": "content"}),
                score=(FieldType.FLOAT, {"required": True, "default": 0.0})
            )
            ```
        """
        schema = cls()
        for name, definition in field_definitions.items():
            if isinstance(definition, FieldType):
                # Simple field type
                schema.add_field(FieldSchema(name=name, type=definition))
                continue

            if not isinstance(definition, tuple) or len(definition) != 2:
                raise ValueError(f"Invalid field definition for {name}: {definition}")

            # Field type with metadata/options
            field_type, options = definition
            # Work on a copy: the caller's options["metadata"] must not be mutated
            field_metadata = dict(options.get("metadata") or {})
            for key in ("dimensions", "source_field"):
                if key in options:
                    field_metadata[key] = options[key]

            schema.add_field(FieldSchema(
                name=name,
                type=field_type,
                metadata=field_metadata,
                required=options.get("required", False),
                default=options.get("default"),
            ))
        return schema

    def add_field(self, field_schema: FieldSchema) -> DatabaseSchema:
        """Add a field to the schema, replacing any field with the same name.

        Returns self for chaining.
        """
        self.fields[field_schema.name] = field_schema
        return self

    def add_text_field(self, name: str, required: bool = False) -> DatabaseSchema:
        """Add a text field to the schema.

        Returns self for chaining.
        """
        return self.add_field(FieldSchema(name=name, type=FieldType.TEXT, required=required))

    def add_vector_field(
        self,
        name: str,
        dimensions: int,
        source_field: str | None = None,
        required: bool = False
    ) -> DatabaseSchema:
        """Add a vector field to the schema.

        ``dimensions`` and ``source_field`` are stored in the field metadata
        (``source_field`` is stored even when it is None).

        Returns self for chaining.
        """
        return self.add_field(FieldSchema(
            name=name,
            type=FieldType.VECTOR,
            metadata={"dimensions": dimensions, "source_field": source_field},
            required=required
        ))

    def remove_field(self, name: str) -> bool:
        """Remove a field from the schema.

        Returns:
            True if the field existed and was removed, False otherwise.
        """
        if name in self.fields:
            del self.fields[name]
            return True
        return False

    def get_vector_fields(self) -> dict[str, FieldSchema]:
        """Get all vector fields in the schema, keyed by field name."""
        return {
            name: field_schema
            for name, field_schema in self.fields.items()
            if field_schema.is_vector_field()
        }

    def get_source_fields(self) -> dict[str, list[str]]:
        """Get mapping of source fields to their dependent vector fields.

        Vector fields without a (truthy) source field are omitted.
        """
        source_map: dict[str, list[str]] = {}
        for name, field_obj in self.fields.items():
            if not field_obj.is_vector_field():
                continue
            source = field_obj.get_source_field()
            if source:
                source_map.setdefault(source, []).append(name)
        return source_map

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation.

        Returns:
            A dict with ``fields`` (field name -> that field's ``to_dict()``
            result) and ``metadata`` (a shallow copy, so editing the returned
            dict does not modify this schema).
        """
        return {
            "fields": {name: f.to_dict() for name, f in self.fields.items()},
            "metadata": dict(self.metadata),
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> DatabaseSchema:
        """Create from dictionary representation.

        Supports multiple formats:
        1. Full format with FieldSchema dicts
        2. Simple format with just field types
        3. Mixed format

        In the full format, top-level ``dimensions`` / ``source_field`` keys are
        folded into the field metadata; entries of an explicit ``metadata`` dict
        take precedence over them.

        Examples:
            # Simple format
            {"fields": {"content": "text", "score": "float"}}

            # Full format
            {"fields": {"content": {"type": "text", "required": true}}}

            # Vector fields
            {"fields": {"embedding": {"type": "vector", "dimensions": 384}}}

        Raises:
            ValueError: If a field definition is neither a string nor a dict.
            KeyError: If a field definition dict has no ``type`` key.
        """
        # Copy so later edits to schema.metadata never leak back into ``data``
        schema = cls(metadata=dict(data.get("metadata") or {}))

        for name, field_data in (data.get("fields") or {}).items():
            if isinstance(field_data, str):
                # Simple string type
                schema.fields[name] = FieldSchema(
                    name=name,
                    type=FieldType(field_data)
                )
            elif isinstance(field_data, dict):
                if "type" in field_data:
                    # Full field schema dict
                    field_type = FieldType(field_data["type"])
                    metadata: dict[str, Any] = {}

                    # Handle vector-specific fields
                    for key in ("dimensions", "source_field"):
                        if key in field_data:
                            metadata[key] = field_data[key]

                    # Merge with explicit metadata (tolerating an explicit None)
                    metadata.update(field_data.get("metadata") or {})

                    schema.fields[name] = FieldSchema(
                        name=name,
                        type=field_type,
                        metadata=metadata,
                        required=field_data.get("required", False),
                        default=field_data.get("default"),
                    )
                else:
                    # Delegate to FieldSchema.from_dict, which requires "name"
                    # and "type"; without "type" this surfaces as a KeyError.
                    schema.fields[name] = FieldSchema.from_dict(field_data)
            else:
                # Fail loudly instead of silently dropping the field
                raise ValueError(f"Invalid field definition for {name}: {field_data}")

        return schema