Coverage for src / dataknobs_data / fields.py: 26%
163 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-26 15:45 -0700
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-26 15:45 -0700
1"""Field type definitions and metadata for structured data records.
3This module defines field types, validation, and metadata structures used by
4Record objects to represent typed data fields with constraints and transformations.
5"""
7from __future__ import annotations
9import copy
10from dataclasses import dataclass, field
11from datetime import datetime
12from enum import Enum
13from typing import TYPE_CHECKING, Any
15if TYPE_CHECKING:
16 import numpy as np
17 from collections.abc import Callable
18else:
19 from typing import Callable
class FieldType(Enum):
    """Closed set of data types a record field can be tagged with.

    Each member's value is the lowercase string used when a field is
    serialized, so ``FieldType("integer")`` recovers ``FieldType.INTEGER``.

    Members (in declaration order):
        STRING: Short text. Auto-detection picks this for strings of at
            most 1000 characters.
        INTEGER: Whole numbers.
        FLOAT: Decimal numbers.
        BOOLEAN: True/False values.
        DATETIME: Date and time values.
        JSON: Structured JSON data (dicts, lists).
        BINARY: Binary data (bytes).
        TEXT: Long text. Auto-detection picks this for strings longer
            than 1000 characters.
        VECTOR: Dense vector embeddings for similarity search.
        SPARSE_VECTOR: Sparse vector representations.

    Example:
        ```python
        from dataknobs_data import Field, FieldType

        # Explicitly typed fields
        name_field = Field(name="name", value="Alice", type=FieldType.STRING)
        age_field = Field(name="age", value=30, type=FieldType.INTEGER)
        tags_field = Field(name="tags", value=["python", "data"], type=FieldType.JSON)

        # Type inferred from the value
        auto_field = Field(name="score", value=95.5)  # FieldType.FLOAT
        ```
    """

    # Scalar and text types
    STRING = "string"
    INTEGER = "integer"
    FLOAT = "float"
    BOOLEAN = "boolean"
    DATETIME = "datetime"

    # Structured and raw payloads
    JSON = "json"
    BINARY = "binary"
    TEXT = "text"

    # Embedding types
    VECTOR = "vector"
    SPARSE_VECTOR = "sparse_vector"
@dataclass
class Field:
    """Represents a single field in a record.

    A Field bundles a named value with its type and an optional metadata
    dictionary. When no type is given, it is inferred from the value in
    ``__post_init__``.

    Attributes:
        name: The field name
        value: The field value (can be any Python type)
        type: The field type (auto-detected if None)
        metadata: Optional metadata dictionary

    Example:
        ```python
        from dataknobs_data import Field, FieldType

        # Auto-detected type
        name = Field(name="name", value="Alice")
        print(name.type)  # FieldType.STRING

        # Explicit type
        score = Field(name="score", value=95.5, type=FieldType.FLOAT)

        # With metadata
        vector = Field(
            name="embedding",
            value=[0.1, 0.2, 0.3],
            type=FieldType.VECTOR,
            metadata={"dimensions": 3, "model": "text-embedding-3-small"}
        )

        # Validation
        is_valid = name.validate()  # True

        # Type conversion
        str_score = score.convert_to(FieldType.STRING)
        print(str_score.value)  # "95.5"
        ```
    """

    name: str
    value: Any
    type: FieldType | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Auto-detect type if not provided."""
        if self.type is None:
            self.type = self._detect_type(self.value)

    def _detect_type(self, value: Any) -> FieldType:
        """Detect the field type from the value.

        ``None`` maps to STRING, strings longer than 1000 characters map to
        TEXT, and any value of an unrecognized Python type falls back to JSON.

        Args:
            value: The value to analyze

        Returns:
            The detected FieldType

        Example:
            ```python
            field = Field(name="data", value=[1, 2, 3])
            detected_type = field._detect_type([1, 2, 3])
            print(detected_type)  # FieldType.JSON
            ```
        """
        if value is None:
            return FieldType.STRING
        # bool must be tested before int: bool is a subclass of int.
        if isinstance(value, bool):
            return FieldType.BOOLEAN
        if isinstance(value, int):
            return FieldType.INTEGER
        if isinstance(value, float):
            return FieldType.FLOAT
        if isinstance(value, datetime):
            return FieldType.DATETIME
        if isinstance(value, (dict, list)):
            return FieldType.JSON
        if isinstance(value, bytes):
            return FieldType.BINARY
        if isinstance(value, str):
            return FieldType.TEXT if len(value) > 1000 else FieldType.STRING
        return FieldType.JSON

    def copy(self) -> Field:
        """Create a deep copy of the field.

        The value and metadata are deep-copied. The result is always a plain
        ``Field``, even when called on a subclass instance.
        """
        return Field(
            name=self.name,
            value=copy.deepcopy(self.value),
            type=self.type,
            metadata=copy.deepcopy(self.metadata)
        )

    def validate(self) -> bool:
        """Validate that the value matches the field type.

        A ``None`` value is always considered valid. FLOAT accepts both int
        and float values (but not bool). Types without a validator here
        (VECTOR, SPARSE_VECTOR) and a ``None`` type always pass.

        Returns:
            True if the value is valid for the field type, False otherwise

        Example:
            ```python
            # Valid field
            age = Field(name="age", value=30, type=FieldType.INTEGER)
            print(age.validate())  # True

            # Invalid field (wrong type for value)
            bad_field = Field(name="count", value="not a number", type=FieldType.INTEGER)
            print(bad_field.validate())  # False
            ```
        """
        if self.value is None:
            return True

        type_validators = {
            FieldType.STRING: lambda v: isinstance(v, str),
            # bool is an int subclass, so it is excluded explicitly.
            FieldType.INTEGER: lambda v: isinstance(v, int) and not isinstance(v, bool),
            FieldType.FLOAT: lambda v: isinstance(v, (int, float)) and not isinstance(v, bool),
            FieldType.BOOLEAN: lambda v: isinstance(v, bool),
            FieldType.DATETIME: lambda v: isinstance(v, datetime),
            FieldType.JSON: lambda v: isinstance(v, (dict, list)),
            FieldType.BINARY: lambda v: isinstance(v, bytes),
            FieldType.TEXT: lambda v: isinstance(v, str),
        }

        if self.type is None:
            return True
        validator = type_validators.get(self.type)
        if validator:
            return validator(self.value)
        return True

    def convert_to(self, target_type: FieldType) -> Field:
        """Convert the field to a different type.

        Args:
            target_type: The target FieldType to convert to

        Returns:
            This same Field instance when it already has ``target_type``;
            otherwise a new Field with the converted value, the target type
            and a shallow copy of the metadata.

        Raises:
            ValueError: If no converter exists for the type pair, or if the
                converter fails on the current value

        Example:
            ```python
            # Integer to string
            age = Field(name="age", value=30, type=FieldType.INTEGER)
            age_str = age.convert_to(FieldType.STRING)
            print(age_str.value)  # "30"

            # String to integer
            count = Field(name="count", value="42", type=FieldType.STRING)
            count_int = count.convert_to(FieldType.INTEGER)
            print(count_int.value)  # 42
            ```
        """
        if self.type == target_type:
            return self

        if self.type is None:
            raise ValueError(f"Cannot convert {self.name} from None to {target_type}")

        converters: dict[tuple[FieldType, FieldType], Callable[[Any], Any]] = {
            (FieldType.INTEGER, FieldType.STRING): str,
            (FieldType.INTEGER, FieldType.FLOAT): float,
            (FieldType.FLOAT, FieldType.STRING): str,
            (FieldType.FLOAT, FieldType.INTEGER): int,
            (FieldType.BOOLEAN, FieldType.STRING): lambda v: "true" if v else "false",
            (FieldType.BOOLEAN, FieldType.INTEGER): int,
            (FieldType.STRING, FieldType.INTEGER): int,
            (FieldType.STRING, FieldType.FLOAT): float,
            (FieldType.STRING, FieldType.BOOLEAN): lambda v: v.lower() in ("true", "1", "yes"),
            (FieldType.STRING, FieldType.TEXT): lambda v: v,
            (FieldType.TEXT, FieldType.STRING): lambda v: v,
        }

        converter = converters.get((self.type, target_type))
        if converter is None:
            raise ValueError(f"No converter available from {self.type} to {target_type}")

        try:
            new_value = converter(self.value)
        except (ValueError, TypeError, OverflowError, AttributeError) as e:
            # OverflowError: int() of an infinite float.
            # AttributeError: .lower() on a non-string value (e.g. None).
            # Both are conversion failures, so they surface as the documented
            # ValueError instead of leaking through unwrapped.
            raise ValueError(
                f"Cannot convert {self.name} from {self.type} to {target_type}: {e}"
            ) from e

        return Field(
            name=self.name, value=new_value, type=target_type, metadata=self.metadata.copy()
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert the field to a dictionary representation.

        The value and metadata are placed in the result as-is (not copied).
        The type is emitted as its string value, or None when unset.
        """
        return {
            "name": self.name,
            "value": self.value,
            "type": self.type.value if self.type else None,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Field:
        """Create a field from a dictionary representation.

        Args:
            data: Mapping with required ``"name"`` and ``"value"`` keys and
                optional ``"type"`` (a FieldType string value) and
                ``"metadata"`` keys

        Returns:
            A new field. Payloads typed as vector or sparse vector are
            delegated to ``VectorField.from_dict``.

        Raises:
            KeyError: If ``"name"`` or ``"value"`` is missing
            ValueError: If ``"type"`` is not a valid FieldType value
        """
        type_name = data.get("type")
        field_type = FieldType(type_name) if type_name else None

        # Handle vector fields specially
        if field_type in (FieldType.VECTOR, FieldType.SPARSE_VECTOR):
            return VectorField.from_dict(data)

        return cls(
            name=data["name"],
            value=data["value"],
            type=field_type,
            # A missing or explicit-None metadata entry becomes an empty dict,
            # and a supplied dict is copied so the new field does not share
            # the caller's mapping.
            metadata=dict(data.get("metadata") or {}),
        )
class VectorField(Field):
    """Represents a vector field with embeddings and metadata.

    The value is always stored as a float32 numpy array. The dimension count,
    source field and embedding model details are recorded both as instance
    attributes and inside ``metadata``.

    Examples:
        ```python
        # Simple usage - name optional when used in Record
        record = Record({
            "embedding": VectorField(value=[0.1, 0.2, 0.3])
        })

        # With explicit configuration
        import numpy as np
        embedding_array = np.array([0.1, 0.2, 0.3])
        field = VectorField(
            value=embedding_array,
            name="doc_embedding",
            model_name="all-MiniLM-L6-v2",
            source_field="content"
        )

        # From text using embedding function
        def my_embedding_fn(text):
            # In practice, use a real model like sentence-transformers
            return np.array([0.1, 0.2, 0.3])

        field = VectorField.from_text(
            "This is the text to embed",
            embedding_fn=my_embedding_fn
        )
        ```
    """

    def __init__(
        self,
        value: np.ndarray | list[float],
        name: str | None = None,  # Made optional
        dimensions: int | None = None,  # Auto-detected from value
        source_field: str | None = None,
        model_name: str | None = None,
        model_version: str | None = None,
        metadata: dict[str, Any] | None = None,
    ):
        """Initialize a vector field.

        Args:
            value: Vector data as numpy array or list of floats
            name: Field name (optional, defaults to "embedding")
            dimensions: Expected dimensions (auto-detected if not provided)
            source_field: Name of the text field this vector was generated from
            model_name: Name of the embedding model used
            model_version: Version of the embedding model
            metadata: Additional metadata. The mapping is copied, never
                modified in place. Its "dimensions", "source_field" and
                "model" keys are overwritten in the copy.

        Raises:
            ImportError: If numpy is not installed
            TypeError: If ``value`` is neither a list nor a numpy array
            ValueError: If ``value`` is a 0-d array, or ``dimensions`` does
                not match the size of the last axis of ``value``
        """
        # Import numpy lazily to avoid hard dependency
        try:
            import numpy as np
        except ImportError as e:
            raise ImportError(
                "numpy is required for vector fields. Install with: pip install numpy"
            ) from e

        # Set default name if not provided
        if name is None:
            name = "embedding"

        # Normalize to a float32 numpy array for consistency
        if isinstance(value, list):
            value = np.array(value, dtype=np.float32)
        elif isinstance(value, np.ndarray):
            if value.dtype != np.float32:
                value = value.astype(np.float32)
        else:
            raise TypeError(
                f"Vector value must be numpy array or list, got {type(value)}"
            )

        # A 0-d array has an empty shape, so there is no last axis to measure.
        # Report that clearly instead of failing with a bare IndexError.
        if value.ndim == 0:
            raise ValueError(
                f"Vector value for field '{name}' must have at least one "
                f"dimension, got a 0-d array"
            )

        # The size of the last axis is the vector dimension (for 1-D input
        # this is simply the length).
        actual_dims = value.shape[-1]
        if dimensions is None:
            dimensions = actual_dims
        elif dimensions != actual_dims:
            raise ValueError(
                f"Vector dimension mismatch for field '{name}': "
                f"expected {dimensions}, got {actual_dims}"
            )

        # Build the stored metadata on a copy so the caller's dict is never
        # mutated by the update below.
        vector_metadata = dict(metadata) if metadata else {}
        vector_metadata.update({
            "dimensions": dimensions,
            "source_field": source_field,
            "model": {
                "name": model_name,
                "version": model_version,
            } if model_name else None,
        })

        super().__init__(
            name=name,
            value=value,
            type=FieldType.VECTOR,
            metadata=vector_metadata,
        )

        self.dimensions = dimensions
        self.source_field = source_field
        self.model_name = model_name
        self.model_version = model_version

    @classmethod
    def from_text(
        cls,
        text: str,
        embedding_fn: Callable[[str], Any],
        name: str | None = None,
        dimensions: int | None = None,
        model_name: str | None = None,
        model_version: str | None = None,
        **kwargs: Any
    ) -> VectorField:
        """Create a VectorField from text using an embedding function.

        Args:
            text: Text to embed
            embedding_fn: Function that takes text and returns embedding vector
            name: Field name (optional, defaults to "embedding")
            dimensions: Expected dimensions (auto-detected if not provided)
            model_name: Name of the embedding model
            model_version: Version of the embedding model
            **kwargs: Additional arguments passed to VectorField constructor.
                ``source_field`` defaults to "text" unless given here.

        Returns:
            VectorField instance with the generated embedding

        Example:
            ```python
            field = VectorField.from_text(
                "Machine learning is fascinating",
                embedding_fn=model.encode,
                model_name="all-MiniLM-L6-v2"
            )
            ```
        """
        # Default the provenance marker, but let callers override it. Passing
        # it as an explicit keyword next to **kwargs would raise a
        # duplicate-argument TypeError whenever the caller supplied one.
        kwargs.setdefault("source_field", "text")
        embedding = embedding_fn(text)
        return cls(
            value=embedding,
            name=name,
            dimensions=dimensions,
            model_name=model_name,
            model_version=model_version,
            **kwargs
        )

    @staticmethod
    def _as_vector(other: VectorField | np.ndarray | list[float]) -> Any:
        """Return the raw vector behind ``other`` for use in numpy math.

        A VectorField yields its stored value, a list becomes a float32
        array, and anything else is returned unchanged.
        """
        import numpy as np

        if isinstance(other, VectorField):
            return other.value
        if isinstance(other, list):
            return np.array(other, dtype=np.float32)
        return other

    def validate(self) -> bool:
        """Validate the vector field.

        Returns:
            True when the value is None, or is a 1-D or 2-D numpy array whose
            last-axis size matches the "dimensions" metadata entry (when that
            entry is set). False otherwise, including when numpy cannot be
            imported.
        """
        if self.value is None:
            return True

        try:
            import numpy as np
        except ImportError:
            return False

        if not isinstance(self.value, np.ndarray):
            return False

        if self.value.ndim not in (1, 2):
            return False

        # Check dimensions match metadata
        actual_dims = self.value.shape[-1]
        expected_dims = self.metadata.get("dimensions")
        if expected_dims and actual_dims != expected_dims:
            return False

        return True

    def to_list(self) -> list[float]:
        """Convert vector to a list of floats."""
        import numpy as np

        if isinstance(self.value, np.ndarray):
            return self.value.tolist()
        return list(self.value)

    def cosine_similarity(self, other: VectorField | np.ndarray | list[float]) -> float:
        """Compute cosine similarity with another vector.

        Args:
            other: A VectorField, numpy array or list of floats

        Returns:
            The cosine similarity, or 0.0 when either vector has zero norm
        """
        import numpy as np

        other_vec = self._as_vector(other)

        dot_product = np.dot(self.value, other_vec)
        norm_a = np.linalg.norm(self.value)
        norm_b = np.linalg.norm(other_vec)

        # Avoid dividing by zero for all-zero vectors.
        if norm_a == 0 or norm_b == 0:
            return 0.0

        return float(dot_product / (norm_a * norm_b))

    def euclidean_distance(self, other: VectorField | np.ndarray | list[float]) -> float:
        """Compute Euclidean distance to another vector.

        Args:
            other: A VectorField, numpy array or list of floats

        Returns:
            The norm of the difference between the two vectors
        """
        import numpy as np

        other_vec = self._as_vector(other)
        return float(np.linalg.norm(self.value - other_vec))

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation.

        The vector is emitted as a plain list, and ``dimensions`` is included
        as a top-level key in addition to the metadata entry.
        """
        return {
            "name": self.name,
            "value": self.to_list(),
            "type": self.type.value,
            "metadata": self.metadata,
            "dimensions": self.dimensions,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> VectorField:
        """Create from dictionary representation.

        Args:
            data: Mapping with required ``"name"`` and ``"value"`` keys and
                optional ``"dimensions"`` and ``"metadata"`` keys

        Returns:
            A new VectorField. The input mapping and its metadata are left
            unmodified.

        Note:
            The constructor always assigns ``FieldType.VECTOR``, so a payload
            whose ``"type"`` is ``"sparse_vector"`` is rebuilt as VECTOR.
        """
        # Treat a missing or explicit-None metadata / model entry as empty.
        metadata = data.get("metadata") or {}
        model_info = metadata.get("model") or {}

        return cls(
            name=data["name"],
            value=data["value"],
            dimensions=data.get("dimensions") or metadata.get("dimensions"),
            source_field=metadata.get("source_field"),
            model_name=model_info.get("name"),
            model_version=model_info.get("version"),
            metadata=metadata,
        )