Coverage for src/dataknobs_data/fields.py: 31%
163 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-31 15:06 -0600
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-31 15:06 -0600
1from __future__ import annotations
3import copy
4from dataclasses import dataclass, field
5from datetime import datetime
6from enum import Enum
7from typing import TYPE_CHECKING, Any
9if TYPE_CHECKING:
10 import numpy as np
11 from collections.abc import Callable
12else:
13 from typing import Callable
class FieldType(Enum):
    """Closed set of value kinds a record field can be tagged with.

    Each member's value is the lowercase string used when a field type is
    written to, or read back from, a dictionary representation.
    """

    # Scalar kinds
    STRING = "string"
    INTEGER = "integer"
    FLOAT = "float"
    BOOLEAN = "boolean"
    DATETIME = "datetime"

    # Structured / opaque payloads
    JSON = "json"
    BINARY = "binary"
    TEXT = "text"

    # Embedding kinds
    VECTOR = "vector"
    SPARSE_VECTOR = "sparse_vector"
@dataclass
class Field:
    """Represents a single field in a record.

    Attributes:
        name: Field name.
        value: The stored value; may be ``None``.
        type: The field's ``FieldType``. When omitted it is inferred from
            ``value`` in ``__post_init__``.
        metadata: Free-form metadata attached to the field.
    """

    name: str
    value: Any
    type: FieldType | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Auto-detect type if not provided."""
        if self.type is None:
            self.type = self._detect_type(self.value)

    def _detect_type(self, value: Any) -> FieldType:
        """Detect the field type from the value.

        ``None`` maps to STRING, strings longer than 1000 characters map to
        TEXT, and any value of an unrecognised Python type maps to JSON.
        """
        if value is None:
            return FieldType.STRING
        # bool must be tested before int: bool is a subclass of int.
        elif isinstance(value, bool):
            return FieldType.BOOLEAN
        elif isinstance(value, int):
            return FieldType.INTEGER
        elif isinstance(value, float):
            return FieldType.FLOAT
        elif isinstance(value, datetime):
            return FieldType.DATETIME
        elif isinstance(value, (dict, list)):
            return FieldType.JSON
        elif isinstance(value, bytes):
            return FieldType.BINARY
        elif isinstance(value, str):
            if len(value) > 1000:
                return FieldType.TEXT
            return FieldType.STRING
        else:
            return FieldType.JSON

    def copy(self) -> Field:
        """Create a deep copy of the field.

        The value and metadata are deep-copied, so mutating the copy never
        affects this field.
        """
        return Field(
            name=self.name,
            value=copy.deepcopy(self.value),
            type=self.type,
            metadata=copy.deepcopy(self.metadata),
        )

    def validate(self) -> bool:
        """Validate that the value matches the field type.

        Returns:
            ``True`` when the value is ``None``, when the field has no type,
            when the type has no validator in the table below (e.g. the
            vector types), or when the value passes its type's check;
            ``False`` otherwise.
        """
        if self.value is None:
            return True

        type_validators = {
            FieldType.STRING: lambda v: isinstance(v, str),
            # bool is an int subclass, so it is excluded explicitly.
            FieldType.INTEGER: lambda v: isinstance(v, int) and not isinstance(v, bool),
            # Integers are accepted as valid FLOAT values.
            FieldType.FLOAT: lambda v: isinstance(v, (int, float)) and not isinstance(v, bool),
            FieldType.BOOLEAN: lambda v: isinstance(v, bool),
            FieldType.DATETIME: lambda v: isinstance(v, datetime),
            FieldType.JSON: lambda v: isinstance(v, (dict, list)),
            FieldType.BINARY: lambda v: isinstance(v, bytes),
            FieldType.TEXT: lambda v: isinstance(v, str),
        }

        if self.type is None:
            return True
        validator = type_validators.get(self.type)
        if validator:
            return validator(self.value)
        return True

    def convert_to(self, target_type: FieldType) -> Field:
        """Convert the field to a different type.

        Args:
            target_type: The type to convert the value to.

        Returns:
            This same instance when it already has ``target_type``;
            otherwise a new ``Field`` holding the converted value and a
            shallow copy of the metadata.

        Raises:
            ValueError: If the field has no type, no converter exists for
                the (source, target) pair, or the converter fails on the
                current value.
        """
        if self.type == target_type:
            return self

        converters: dict[tuple[FieldType, FieldType], Callable[[Any], Any]] = {
            (FieldType.INTEGER, FieldType.STRING): str,
            (FieldType.INTEGER, FieldType.FLOAT): float,
            (FieldType.FLOAT, FieldType.STRING): str,
            (FieldType.FLOAT, FieldType.INTEGER): int,
            (FieldType.BOOLEAN, FieldType.STRING): lambda v: "true" if v else "false",
            (FieldType.BOOLEAN, FieldType.INTEGER): int,
            (FieldType.STRING, FieldType.INTEGER): int,
            (FieldType.STRING, FieldType.FLOAT): float,
            (FieldType.STRING, FieldType.BOOLEAN): lambda v: v.lower() in ("true", "1", "yes"),
            (FieldType.STRING, FieldType.TEXT): lambda v: v,
            (FieldType.TEXT, FieldType.STRING): lambda v: v,
        }

        if self.type is None:
            raise ValueError(f"Cannot convert {self.name} from None to {target_type}")

        converter = converters.get((self.type, target_type))
        if converter is None:
            raise ValueError(f"No converter available from {self.type} to {target_type}")

        try:
            new_value = converter(self.value)
        except (ValueError, TypeError, AttributeError, OverflowError) as e:
            # AttributeError: a non-string value (e.g. None) reaching
            # ``v.lower()``. OverflowError: ``int(float("inf"))``. Both are
            # conversion failures and are reported as ValueError like the rest.
            raise ValueError(
                f"Cannot convert {self.name} from {self.type} to {target_type}: {e}"
            ) from e

        return Field(
            name=self.name, value=new_value, type=target_type, metadata=self.metadata.copy()
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert the field to a dictionary representation.

        The returned ``value`` and ``metadata`` entries reference the
        field's own objects; they are not copied.
        """
        return {
            "name": self.name,
            "value": self.value,
            "type": self.type.value if self.type else None,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Field:
        """Create a field from a dictionary representation.

        Args:
            data: Mapping with ``name`` and ``value`` keys and optional
                ``type`` and ``metadata`` keys.

        Returns:
            A ``VectorField`` when ``type`` names a vector type, otherwise
            an instance of ``cls``.

        Raises:
            KeyError: If ``name`` or ``value`` is missing.
            ValueError: If ``type`` is not a valid ``FieldType`` value.
        """
        field_type = None
        if data.get("type"):
            field_type = FieldType(data["type"])

        # Handle vector fields specially
        if field_type in (FieldType.VECTOR, FieldType.SPARSE_VECTOR):
            return VectorField.from_dict(data)

        return cls(
            name=data["name"],
            value=data["value"],
            type=field_type,
            # Copy so the new field never aliases the caller's dict, and
            # treat an explicit ``None`` the same as a missing key.
            metadata=dict(data.get("metadata") or {}),
        )
class VectorField(Field):
    """Represents a vector field with embeddings and metadata.

    The vector is always stored as a float32 numpy array. Its dimensions,
    source field and model details are kept both as attributes and inside
    ``metadata``.

    Examples:
        # Simple usage - name optional when used in Record
        record = Record({
            "embedding": VectorField(value=[0.1, 0.2, 0.3])
        })

        # With explicit configuration
        field = VectorField(
            value=embedding_array,
            name="doc_embedding",
            model_name="all-MiniLM-L6-v2",
            source_field="content"
        )

        # From text using embedding function
        field = VectorField.from_text(
            "This is the text to embed",
            embedding_fn=model.encode
        )
    """

    def __init__(
        self,
        value: np.ndarray | list[float],
        name: str | None = None,  # Made optional
        dimensions: int | None = None,  # Auto-detected from value
        source_field: str | None = None,
        model_name: str | None = None,
        model_version: str | None = None,
        metadata: dict[str, Any] | None = None,
    ):
        """Initialize a vector field.

        Args:
            value: Vector data as numpy array or list of floats
            name: Field name (optional, defaults to "embedding")
            dimensions: Expected dimensions (auto-detected if not provided)
            source_field: Name of the text field this vector was generated from
            model_name: Name of the embedding model used
            model_version: Version of the embedding model
            metadata: Additional metadata; copied, never modified in place

        Raises:
            ImportError: If numpy is not installed.
            TypeError: If ``value`` is neither a list nor a numpy array.
            ValueError: If ``value`` is a 0-dimensional array, or its size
                along the last axis differs from ``dimensions``.
        """
        # Import numpy lazily to avoid hard dependency
        try:
            import numpy as np
        except ImportError as e:
            raise ImportError(
                "numpy is required for vector fields. Install with: pip install numpy"
            ) from e

        # Set default name if not provided
        if name is None:
            name = "embedding"

        # Convert to numpy array if needed
        if isinstance(value, list):
            value = np.array(value, dtype=np.float32)
        elif isinstance(value, np.ndarray):
            # Ensure float32 dtype for consistency
            if value.dtype != np.float32:
                value = value.astype(np.float32)
        else:
            raise TypeError(
                f"Vector value must be numpy array or list, got {type(value)}"
            )

        # A 0-d array has an empty shape, so there is no last axis to measure.
        if value.ndim == 0:
            raise ValueError(
                f"Vector value for field '{name}' must have at least one dimension"
            )

        # Auto-detect dimensions if not provided
        actual_dims = len(value) if value.ndim == 1 else value.shape[-1]
        if dimensions is None:
            dimensions = actual_dims
        elif dimensions != actual_dims:
            raise ValueError(
                f"Vector dimension mismatch for field '{name}': "
                f"expected {dimensions}, got {actual_dims}"
            )

        # Store vector metadata on a copy so the caller's dict is untouched.
        vector_metadata = dict(metadata) if metadata else {}
        vector_metadata.update({
            "dimensions": dimensions,
            "source_field": source_field,
            "model": {
                "name": model_name,
                "version": model_version,
            } if model_name else None,
        })

        super().__init__(
            name=name,
            value=value,
            type=FieldType.VECTOR,
            metadata=vector_metadata,
        )

        self.dimensions = dimensions
        self.source_field = source_field
        self.model_name = model_name
        self.model_version = model_version

    @classmethod
    def from_text(
        cls,
        text: str,
        embedding_fn: Callable[[str], Any],
        name: str | None = None,
        dimensions: int | None = None,
        model_name: str | None = None,
        model_version: str | None = None,
        **kwargs
    ) -> VectorField:
        """Create a VectorField from text using an embedding function.

        Args:
            text: Text to embed
            embedding_fn: Function that takes text and returns embedding vector
            name: Field name (optional, defaults to "embedding")
            dimensions: Expected dimensions (auto-detected if not provided)
            model_name: Name of the embedding model
            model_version: Version of the embedding model
            **kwargs: Additional arguments passed to VectorField constructor.
                ``source_field`` defaults to "text" unless given here.

        Returns:
            VectorField instance with the generated embedding

        Example:
            field = VectorField.from_text(
                "Machine learning is fascinating",
                embedding_fn=model.encode,
                model_name="all-MiniLM-L6-v2"
            )
        """
        embedding = embedding_fn(text)
        # Default marker showing the vector came from text; setdefault lets a
        # caller-supplied source_field win instead of raising a duplicate
        # keyword TypeError.
        kwargs.setdefault("source_field", "text")
        return cls(
            value=embedding,
            name=name,
            dimensions=dimensions,
            model_name=model_name,
            model_version=model_version,
            **kwargs
        )

    def copy(self) -> VectorField:
        """Create a deep copy that is still a VectorField.

        The inherited ``Field.copy`` builds a plain ``Field``, which drops
        the vector-specific attributes and methods.
        """
        # Inside a method, ``copy`` resolves to the module-level ``copy`` module.
        return copy.deepcopy(self)

    def validate(self) -> bool:
        """Validate the vector field.

        Returns:
            ``True`` when the value is ``None``, or is a 1-D or 2-D numpy
            array whose last-axis size matches ``metadata["dimensions"]``
            (when that entry is set); ``False`` otherwise, including when
            numpy cannot be imported.
        """
        if self.value is None:
            return True

        try:
            import numpy as np
        except ImportError:
            return False

        if not isinstance(self.value, np.ndarray):
            return False

        if self.value.ndim not in (1, 2):
            return False

        # Check dimensions match metadata
        actual_dims = len(self.value) if self.value.ndim == 1 else self.value.shape[-1]
        expected_dims = self.metadata.get("dimensions")
        if expected_dims and actual_dims != expected_dims:
            return False

        return True

    def to_list(self) -> list[float]:
        """Convert vector to a list of floats."""
        import numpy as np

        if isinstance(self.value, np.ndarray):
            return self.value.tolist()
        return list(self.value)

    @staticmethod
    def _as_array(other: VectorField | np.ndarray | list[float]) -> Any:
        """Return the array to compare against for ``other``.

        A VectorField yields its stored value, a list becomes a float32
        array, and anything else is passed through unchanged.
        """
        import numpy as np

        if isinstance(other, VectorField):
            return other.value
        if isinstance(other, list):
            return np.array(other, dtype=np.float32)
        return other

    def cosine_similarity(self, other: VectorField | np.ndarray | list[float]) -> float:
        """Compute cosine similarity with another vector.

        Returns:
            The cosine similarity, or ``0.0`` when either vector has zero norm.
        """
        import numpy as np

        other_vec = self._as_array(other)

        # Compute cosine similarity
        dot_product = np.dot(self.value, other_vec)
        norm_a = np.linalg.norm(self.value)
        norm_b = np.linalg.norm(other_vec)

        # Avoid dividing by zero for all-zero vectors.
        if norm_a == 0 or norm_b == 0:
            return 0.0

        return float(dot_product / (norm_a * norm_b))

    def euclidean_distance(self, other: VectorField | np.ndarray | list[float]) -> float:
        """Compute Euclidean distance to another vector."""
        import numpy as np

        other_vec = self._as_array(other)
        return float(np.linalg.norm(self.value - other_vec))

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation.

        The vector is emitted as a plain list; ``metadata`` references the
        field's own dict.
        """
        return {
            "name": self.name,
            "value": self.to_list(),
            "type": self.type.value,
            "metadata": self.metadata,
            "dimensions": self.dimensions,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> VectorField:
        """Create from dictionary representation.

        Args:
            data: Mapping with a ``value`` key and optional ``name``,
                ``dimensions`` and ``metadata`` keys. Source field and model
                details are read from ``metadata``.

        Returns:
            A new VectorField; ``data`` and its metadata are left unmodified.
        """
        # ``or {}`` also covers keys that are present but set to None.
        metadata = data.get("metadata") or {}
        model_info = metadata.get("model") or {}

        return cls(
            name=data.get("name"),
            value=data["value"],
            dimensions=data.get("dimensions") or metadata.get("dimensions"),
            source_field=metadata.get("source_field"),
            model_name=model_info.get("name"),
            model_version=model_info.get("version"),
            metadata=metadata,
        )