Coverage for src/dataknobs_data/fields.py: 35%

163 statements  

« prev     ^ index     » next       coverage.py v7.11.3, created at 2025-11-13 11:34 -0700

1"""Field type definitions and metadata for structured data records. 

2 

3This module defines field types, validation, and metadata structures used by 

4Record objects to represent typed data fields with constraints and transformations. 

5""" 

6 

7from __future__ import annotations 

8 

9import copy 

10from dataclasses import dataclass, field 

11from datetime import datetime 

12from enum import Enum 

13from typing import TYPE_CHECKING, Any 

14 

15if TYPE_CHECKING: 

16 import numpy as np 

17 from collections.abc import Callable 

18else: 

19 from typing import Callable 

20 

21 

class FieldType(Enum):
    """Closed set of value kinds a record field can be tagged with.

    Each member's value is the lowercase string used when a field is
    serialized (see ``Field.to_dict`` / ``Field.from_dict``).

    Members:
        STRING: Short text. Auto-detection picks this for ``str`` values of
            at most 1000 characters (and for ``None``).
        TEXT: Long text. Auto-detection picks this for ``str`` values longer
            than 1000 characters.
        INTEGER: Whole numbers.
        FLOAT: Decimal numbers.
        BOOLEAN: True/False values.
        DATETIME: ``datetime`` values.
        JSON: Structured data (dicts, lists).
        BINARY: Raw ``bytes``.
        VECTOR: Dense vector embeddings.
        SPARSE_VECTOR: Sparse vector representations.

    Example:
        ```python
        from dataknobs_data import Field, FieldType

        # Explicitly typed fields
        name_field = Field(name="name", value="Alice", type=FieldType.STRING)
        age_field = Field(name="age", value=30, type=FieldType.INTEGER)
        tags_field = Field(name="tags", value=["python", "data"], type=FieldType.JSON)

        # Type inferred from the value
        auto_field = Field(name="score", value=95.5)  # FieldType.FLOAT

        # Round-trip through the serialized string form
        assert FieldType("float") is FieldType.FLOAT
        ```
    """

    # Declaration order is preserved on purpose: it is the iteration order
    # of the enum.
    STRING = "string"
    INTEGER = "integer"
    FLOAT = "float"
    BOOLEAN = "boolean"
    DATETIME = "datetime"
    JSON = "json"
    BINARY = "binary"
    TEXT = "text"
    VECTOR = "vector"
    SPARSE_VECTOR = "sparse_vector"

64 

65 

@dataclass
class Field:
    """Represents a single named, typed value in a record.

    A Field bundles a name and a value with a ``FieldType`` and a free-form
    metadata dictionary. When no type is supplied, it is inferred from the
    value in ``__post_init__``.

    Attributes:
        name: The field name.
        value: The field value (any Python object).
        type: The field type; inferred from ``value`` when left as ``None``.
        metadata: Optional metadata dictionary (a fresh dict per instance).

    Example:
        ```python
        from dataknobs_data import Field, FieldType

        # Auto-detected type
        name = Field(name="name", value="Alice")
        print(name.type)  # FieldType.STRING

        # Explicit type
        score = Field(name="score", value=95.5, type=FieldType.FLOAT)

        # With metadata
        vector = Field(
            name="embedding",
            value=[0.1, 0.2, 0.3],
            type=FieldType.VECTOR,
            metadata={"dimensions": 3, "model": "text-embedding-3-small"}
        )

        # Validation
        is_valid = name.validate()  # True

        # Type conversion
        str_score = score.convert_to(FieldType.STRING)
        print(str_score.value)  # "95.5"
        ```
    """

    name: str
    value: Any
    type: FieldType | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Infer ``type`` from ``value`` when no type was provided."""
        if self.type is None:
            self.type = self._detect_type(self.value)

    def _detect_type(self, value: Any) -> FieldType:
        """Detect the field type from the value.

        Args:
            value: The value to analyze.

        Returns:
            The detected FieldType. ``None`` maps to STRING, strings longer
            than 1000 characters map to TEXT, and any value of an
            unrecognized Python type falls back to JSON.

        Example:
            ```python
            field = Field(name="data", value=[1, 2, 3])
            detected_type = field._detect_type([1, 2, 3])
            print(detected_type)  # FieldType.JSON
            ```
        """
        if value is None:
            return FieldType.STRING
        # bool must be tested before int: bool is a subclass of int.
        if isinstance(value, bool):
            return FieldType.BOOLEAN
        if isinstance(value, int):
            return FieldType.INTEGER
        if isinstance(value, float):
            return FieldType.FLOAT
        if isinstance(value, datetime):
            return FieldType.DATETIME
        if isinstance(value, (dict, list)):
            return FieldType.JSON
        if isinstance(value, bytes):
            return FieldType.BINARY
        if isinstance(value, str):
            return FieldType.TEXT if len(value) > 1000 else FieldType.STRING
        # Anything else is treated as generic structured data.
        return FieldType.JSON

    def copy(self) -> Field:
        """Create a deep copy of the field (value and metadata are deep-copied)."""
        return Field(
            name=self.name,
            value=copy.deepcopy(self.value),
            type=self.type,
            metadata=copy.deepcopy(self.metadata),
        )

    def validate(self) -> bool:
        """Validate that the value matches the field type.

        A ``None`` value is always considered valid. Types without a
        validator in the table below (VECTOR, SPARSE_VECTOR) and a ``None``
        type are also accepted unconditionally.

        Returns:
            True if the value is valid for the field type, False otherwise.

        Example:
            ```python
            # Valid field
            age = Field(name="age", value=30, type=FieldType.INTEGER)
            print(age.validate())  # True

            # Invalid field (wrong type for value)
            bad_field = Field(name="count", value="not a number", type=FieldType.INTEGER)
            print(bad_field.validate())  # False
            ```
        """
        if self.value is None or self.type is None:
            return True

        type_validators = {
            FieldType.STRING: lambda v: isinstance(v, str),
            # bool is an int subclass, so it is excluded explicitly.
            FieldType.INTEGER: lambda v: isinstance(v, int) and not isinstance(v, bool),
            # Integers are accepted where a float is expected.
            FieldType.FLOAT: lambda v: isinstance(v, (int, float)) and not isinstance(v, bool),
            FieldType.BOOLEAN: lambda v: isinstance(v, bool),
            FieldType.DATETIME: lambda v: isinstance(v, datetime),
            FieldType.JSON: lambda v: isinstance(v, (dict, list)),
            FieldType.BINARY: lambda v: isinstance(v, bytes),
            FieldType.TEXT: lambda v: isinstance(v, str),
        }

        validator = type_validators.get(self.type)
        if validator:
            return validator(self.value)
        return True

    def convert_to(self, target_type: FieldType) -> Field:
        """Convert the field to a different type.

        Args:
            target_type: The target FieldType to convert to.

        Returns:
            ``self`` unchanged when the field already has ``target_type``;
            otherwise a new Field with the converted value, the target type,
            and a shallow copy of the metadata.

        Raises:
            ValueError: If no converter exists for the type pair, or if the
                converter fails on the current value.

        Example:
            ```python
            # Integer to string
            age = Field(name="age", value=30, type=FieldType.INTEGER)
            age_str = age.convert_to(FieldType.STRING)
            print(age_str.value)  # "30"

            # String to integer
            count = Field(name="count", value="42", type=FieldType.STRING)
            count_int = count.convert_to(FieldType.INTEGER)
            print(count_int.value)  # 42
            ```
        """
        if self.type == target_type:
            return self

        if self.type is None:
            raise ValueError(f"Cannot convert {self.name} from None to {target_type}")

        converters: dict[tuple[FieldType, FieldType], Callable[[Any], Any]] = {
            (FieldType.INTEGER, FieldType.STRING): str,
            (FieldType.INTEGER, FieldType.FLOAT): float,
            (FieldType.FLOAT, FieldType.STRING): str,
            (FieldType.FLOAT, FieldType.INTEGER): int,
            (FieldType.BOOLEAN, FieldType.STRING): lambda v: "true" if v else "false",
            (FieldType.BOOLEAN, FieldType.INTEGER): int,
            (FieldType.STRING, FieldType.INTEGER): int,
            (FieldType.STRING, FieldType.FLOAT): float,
            (FieldType.STRING, FieldType.BOOLEAN): lambda v: v.lower() in ("true", "1", "yes"),
            (FieldType.STRING, FieldType.TEXT): lambda v: v,
            (FieldType.TEXT, FieldType.STRING): lambda v: v,
        }

        converter = converters.get((self.type, target_type))
        if converter is None:
            raise ValueError(f"No converter available from {self.type} to {target_type}")

        try:
            new_value = converter(self.value)
        except (ValueError, TypeError, AttributeError, OverflowError) as e:
            # AttributeError: e.g. a None value reaching ``v.lower()``.
            # OverflowError: e.g. ``int(float("inf"))``.
            # Both are re-raised as ValueError so callers only need to
            # handle the documented exception type.
            raise ValueError(
                f"Cannot convert {self.name} from {self.type} to {target_type}: {e}"
            ) from e

        return Field(
            name=self.name,
            value=new_value,
            type=target_type,
            metadata=self.metadata.copy(),
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert the field to a dictionary representation.

        The returned ``metadata`` entry is the field's own dict, not a copy.
        """
        return {
            "name": self.name,
            "value": self.value,
            "type": self.type.value if self.type else None,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Field:
        """Create a field from a dictionary representation.

        Args:
            data: Mapping with required ``"name"`` and ``"value"`` keys and
                optional ``"type"`` (a ``FieldType`` value string) and
                ``"metadata"`` keys.

        Returns:
            A ``VectorField`` when the type is VECTOR or SPARSE_VECTOR,
            otherwise an instance of ``cls``.

        Raises:
            KeyError: If ``"name"`` or ``"value"`` is missing.
            ValueError: If ``"type"`` is not a valid ``FieldType`` value.
        """
        field_type = None
        if data.get("type"):
            field_type = FieldType(data["type"])

        # Handle vector fields specially.
        # NOTE(review): VectorField.__init__ always sets FieldType.VECTOR, so
        # a SPARSE_VECTOR payload comes back typed as VECTOR - confirm intended.
        if field_type in (FieldType.VECTOR, FieldType.SPARSE_VECTOR):
            return VectorField.from_dict(data)

        return cls(
            name=data["name"],
            value=data["value"],
            type=field_type,
            # An explicit ``"metadata": None`` is treated like a missing key
            # so the field always carries a dict.
            metadata=data.get("metadata") or {},
        )

287 

288 

class VectorField(Field):
    """Represents a vector field with embeddings and metadata.

    The value is always stored as a ``float32`` numpy array and the field
    type is always ``FieldType.VECTOR``. Dimensions, source field and model
    information are mirrored into ``metadata`` and also exposed as
    attributes.

    Examples:
        # Simple usage - name optional when used in Record
        record = Record({
            "embedding": VectorField(value=[0.1, 0.2, 0.3])
        })

        # With explicit configuration
        import numpy as np
        embedding_array = np.array([0.1, 0.2, 0.3])
        field = VectorField(
            value=embedding_array,
            name="doc_embedding",
            model_name="all-MiniLM-L6-v2",
            source_field="content"
        )

        # From text using embedding function
        def my_embedding_fn(text):
            # In practice, use a real model like sentence-transformers
            return np.array([0.1, 0.2, 0.3])

        field = VectorField.from_text(
            "This is the text to embed",
            embedding_fn=my_embedding_fn
        )
    """

    def __init__(
        self,
        value: np.ndarray | list[float],
        name: str | None = None,  # Made optional
        dimensions: int | None = None,  # Auto-detected from value
        source_field: str | None = None,
        model_name: str | None = None,
        model_version: str | None = None,
        metadata: dict[str, Any] | None = None,
    ):
        """Initialize a vector field.

        Args:
            value: Vector data as numpy array or list of floats
            name: Field name (optional, defaults to "embedding")
            dimensions: Expected dimensions (auto-detected if not provided)
            source_field: Name of the text field this vector was generated from
            model_name: Name of the embedding model used
            model_version: Version of the embedding model
            metadata: Additional metadata. The dict is copied; the caller's
                object is never modified.

        Raises:
            ImportError: If numpy is not installed.
            TypeError: If ``value`` is neither a list nor a numpy array.
            ValueError: If ``value`` is a 0-dimensional array, or if
                ``dimensions`` does not match the size of the last axis.
        """
        # Import numpy lazily to avoid hard dependency
        try:
            import numpy as np
        except ImportError as e:
            raise ImportError(
                "numpy is required for vector fields. Install with: pip install numpy"
            ) from e

        # Set default name if not provided
        if name is None:
            name = "embedding"

        # Convert to numpy array if needed
        if isinstance(value, list):
            value = np.array(value, dtype=np.float32)
        elif isinstance(value, np.ndarray):
            # Ensure float32 dtype for consistency
            if value.dtype != np.float32:
                value = value.astype(np.float32)
        else:
            raise TypeError(
                f"Vector value must be numpy array or list, got {type(value)}"
            )

        # A 0-d array has an empty shape, so ``shape[-1]`` below would fail
        # with an opaque IndexError; reject it explicitly instead.
        if value.ndim == 0:
            raise ValueError(
                f"Vector value for field '{name}' must have at least one dimension"
            )

        # Auto-detect dimensions if not provided
        actual_dims = len(value) if value.ndim == 1 else value.shape[-1]
        if dimensions is None:
            dimensions = actual_dims
        elif dimensions != actual_dims:
            raise ValueError(
                f"Vector dimension mismatch for field '{name}': "
                f"expected {dimensions}, got {actual_dims}"
            )

        # Store vector metadata. Work on a copy so the dict passed in by the
        # caller (or the payload handed to ``from_dict``) is left untouched.
        vector_metadata = dict(metadata) if metadata else {}
        vector_metadata.update({
            "dimensions": dimensions,
            "source_field": source_field,
            "model": {
                "name": model_name,
                "version": model_version,
            } if model_name else None,
        })

        super().__init__(
            name=name,
            value=value,
            type=FieldType.VECTOR,
            metadata=vector_metadata,
        )

        self.dimensions = dimensions
        self.source_field = source_field
        self.model_name = model_name
        self.model_version = model_version

    def __eq__(self, other: object) -> bool:
        """Compare two vector fields element-wise.

        The dataclass-generated ``Field.__eq__`` compares values with ``==``
        and then takes the truth value of the result, which raises for
        multi-element numpy arrays. This override compares the arrays with
        ``numpy.array_equal``. As with the inherited implementation,
        instances of different classes are never equal and instances remain
        unhashable.
        """
        if other.__class__ is not self.__class__:
            return NotImplemented

        import numpy as np

        return (
            self.name == other.name
            and self.type == other.type
            and self.metadata == other.metadata
            and bool(np.array_equal(self.value, other.value))
        )

    @classmethod
    def from_text(
        cls,
        text: str,
        embedding_fn: Callable[[str], Any],
        name: str | None = None,
        dimensions: int | None = None,
        model_name: str | None = None,
        model_version: str | None = None,
        **kwargs: Any
    ) -> VectorField:
        """Create a VectorField from text using an embedding function.

        Args:
            text: Text to embed
            embedding_fn: Function that takes text and returns embedding vector
            name: Field name (optional, defaults to "embedding")
            dimensions: Expected dimensions (auto-detected if not provided)
            model_name: Name of the embedding model
            model_version: Version of the embedding model
            **kwargs: Additional arguments passed to VectorField constructor.
                ``source_field`` defaults to ``"text"`` unless given here.

        Returns:
            VectorField instance with the generated embedding

        Example:
            field = VectorField.from_text(
                "Machine learning is fascinating",
                embedding_fn=model.encode,
                model_name="all-MiniLM-L6-v2"
            )
        """
        embedding = embedding_fn(text)
        # Indicate the vector came from text, but let an explicit
        # ``source_field`` in kwargs win instead of raising a duplicate
        # keyword TypeError.
        kwargs.setdefault("source_field", "text")
        return cls(
            value=embedding,
            name=name,
            dimensions=dimensions,
            model_name=model_name,
            model_version=model_version,
            **kwargs
        )

    def validate(self) -> bool:
        """Validate the vector field.

        Returns:
            True when the value is ``None``, or is a 1-D or 2-D numpy array
            whose last-axis size matches ``metadata["dimensions"]`` (when
            that entry is set). False otherwise, including when numpy cannot
            be imported.
        """
        if self.value is None:
            return True

        try:
            import numpy as np
        except ImportError:
            return False

        if not isinstance(self.value, np.ndarray):
            return False

        if self.value.ndim not in (1, 2):
            return False

        # Check dimensions match metadata
        actual_dims = len(self.value) if self.value.ndim == 1 else self.value.shape[-1]
        expected_dims = self.metadata.get("dimensions")
        if expected_dims and actual_dims != expected_dims:
            return False

        return True

    def to_list(self) -> list[float]:
        """Convert vector to a list of floats."""
        import numpy as np

        if isinstance(self.value, np.ndarray):
            return self.value.tolist()
        return list(self.value)

    @staticmethod
    def _coerce_operand(other: VectorField | np.ndarray | list[float]) -> Any:
        """Return the raw vector behind ``other`` for use in numpy math.

        A VectorField yields its stored value, a list becomes a float32
        array, and anything else is passed through unchanged.
        """
        import numpy as np

        if isinstance(other, VectorField):
            return other.value
        if isinstance(other, list):
            return np.array(other, dtype=np.float32)
        return other

    def cosine_similarity(self, other: VectorField | np.ndarray | list[float]) -> float:
        """Compute cosine similarity with another vector.

        Returns:
            The cosine similarity as a float, or 0.0 when either vector has
            zero norm.
        """
        import numpy as np

        other_vec = self._coerce_operand(other)

        # Compute cosine similarity
        dot_product = np.dot(self.value, other_vec)
        norm_a = np.linalg.norm(self.value)
        norm_b = np.linalg.norm(other_vec)

        # Avoid dividing by zero for all-zero vectors.
        if norm_a == 0 or norm_b == 0:
            return 0.0

        return float(dot_product / (norm_a * norm_b))

    def euclidean_distance(self, other: VectorField | np.ndarray | list[float]) -> float:
        """Compute Euclidean distance to another vector."""
        import numpy as np

        other_vec = self._coerce_operand(other)
        return float(np.linalg.norm(self.value - other_vec))

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation.

        The vector is emitted as a plain list, and ``dimensions`` is included
        as a top-level key in addition to the copy held in ``metadata``.
        """
        return {
            "name": self.name,
            "value": self.to_list(),
            "type": self.type.value,
            "metadata": self.metadata,
            "dimensions": self.dimensions,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> VectorField:
        """Create from dictionary representation.

        Args:
            data: Mapping with required ``"name"`` and ``"value"`` keys and
                optional ``"dimensions"`` and ``"metadata"`` keys. Model and
                source-field information is read from ``metadata``.

        Returns:
            A new VectorField. ``data`` and its metadata dict are not
            modified.
        """
        # Both entries may be present but null (``"model"`` is stored as None
        # when no model name was given), so fall back to empty dicts.
        metadata = data.get("metadata") or {}
        model_info = metadata.get("model") or {}

        return cls(
            name=data["name"],
            value=data["value"],
            dimensions=data.get("dimensions") or metadata.get("dimensions"),
            source_field=metadata.get("source_field"),
            model_name=model_info.get("name"),
            model_version=model_info.get("version"),
            metadata=metadata,
        )

530 )