Coverage for src/dataknobs_data/fields.py: 31%

163 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-31 15:06 -0600

1from __future__ import annotations 

2 

3import copy 

4from dataclasses import dataclass, field 

5from datetime import datetime 

6from enum import Enum 

7from typing import TYPE_CHECKING, Any 

8 

9if TYPE_CHECKING: 

10 import numpy as np 

11 from collections.abc import Callable 

12else: 

13 from typing import Callable 

14 

15 

class FieldType(Enum):
    """Closed set of type tags that a field's value can carry.

    Every member's value is the lowercase string spelling of its name,
    which is the form used when a field is serialized to a dictionary.
    """

    # Scalar values
    STRING = "string"
    INTEGER = "integer"
    FLOAT = "float"
    BOOLEAN = "boolean"
    DATETIME = "datetime"

    # Structured and raw payloads
    JSON = "json"
    BINARY = "binary"

    # Long-form string content
    TEXT = "text"

    # Embedding vectors
    VECTOR = "vector"
    SPARSE_VECTOR = "sparse_vector"

29 

30 

@dataclass
class Field:
    """Represents a single named, optionally typed value in a record.

    If ``type`` is not supplied it is inferred from ``value`` when the
    instance is created.
    """

    # Field name.
    name: str
    # The stored value; ``None`` is accepted for every field type.
    value: Any
    # Declared type; filled in by ``__post_init__`` when left as ``None``.
    type: FieldType | None = None
    # Free-form metadata attached to the field.
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Auto-detect the field type when none was provided."""
        if self.type is None:
            self.type = self._detect_type(self.value)

    def _detect_type(self, value: Any) -> FieldType:
        """Detect the field type from the value.

        Args:
            value: The value to inspect.

        Returns:
            The matching ``FieldType``. ``None`` maps to ``STRING``, strings
            longer than 1000 characters map to ``TEXT``, and any value that
            matches no other rule maps to ``JSON``.
        """
        if value is None:
            return FieldType.STRING
        # bool must be tested before int because bool is a subclass of int.
        elif isinstance(value, bool):
            return FieldType.BOOLEAN
        elif isinstance(value, int):
            return FieldType.INTEGER
        elif isinstance(value, float):
            return FieldType.FLOAT
        elif isinstance(value, datetime):
            return FieldType.DATETIME
        elif isinstance(value, (dict, list)):
            return FieldType.JSON
        elif isinstance(value, bytes):
            return FieldType.BINARY
        elif isinstance(value, str):
            if len(value) > 1000:
                return FieldType.TEXT
            return FieldType.STRING
        else:
            return FieldType.JSON

    def copy(self) -> Field:
        """Create a deep copy of the field.

        Returns:
            A new ``Field`` with the same name and type and with deep copies
            of the value and metadata.
        """
        return Field(
            name=self.name,
            value=copy.deepcopy(self.value),
            type=self.type,
            metadata=copy.deepcopy(self.metadata),
        )

    def validate(self) -> bool:
        """Validate that the value matches the field type.

        Returns:
            ``True`` when the value is ``None``, when the field has no type,
            when the type has no validator registered here, or when the
            value passes the validator for its type; ``False`` otherwise.
        """
        if self.value is None:
            return True

        type_validators = {
            FieldType.STRING: lambda v: isinstance(v, str),
            # bool is a subclass of int, so it is excluded explicitly.
            FieldType.INTEGER: lambda v: isinstance(v, int) and not isinstance(v, bool),
            FieldType.FLOAT: lambda v: isinstance(v, (int, float)) and not isinstance(v, bool),
            FieldType.BOOLEAN: lambda v: isinstance(v, bool),
            FieldType.DATETIME: lambda v: isinstance(v, datetime),
            FieldType.JSON: lambda v: isinstance(v, (dict, list)),
            FieldType.BINARY: lambda v: isinstance(v, bytes),
            FieldType.TEXT: lambda v: isinstance(v, str),
        }

        if self.type is None:
            return True
        validator = type_validators.get(self.type)
        if validator:
            return validator(self.value)
        return True

    def convert_to(self, target_type: FieldType) -> Field:
        """Convert the field to a different type.

        Args:
            target_type: The type to convert the value to.

        Returns:
            ``self`` when the field already has ``target_type``; otherwise a
            new ``Field`` holding the converted value and a shallow copy of
            the metadata.

        Raises:
            ValueError: If the field has no type, if no converter exists for
                the (current type, target type) pair, or if the converter
                fails on the stored value.
        """
        if self.type == target_type:
            return self

        if self.type is None:
            raise ValueError(f"Cannot convert {self.name} from None to {target_type}")

        converters: dict[tuple[FieldType, FieldType], Callable[[Any], Any]] = {
            (FieldType.INTEGER, FieldType.STRING): str,
            (FieldType.INTEGER, FieldType.FLOAT): float,
            (FieldType.FLOAT, FieldType.STRING): str,
            (FieldType.FLOAT, FieldType.INTEGER): int,
            (FieldType.BOOLEAN, FieldType.STRING): lambda v: "true" if v else "false",
            (FieldType.BOOLEAN, FieldType.INTEGER): int,
            (FieldType.STRING, FieldType.INTEGER): int,
            (FieldType.STRING, FieldType.FLOAT): float,
            (FieldType.STRING, FieldType.BOOLEAN): lambda v: v.lower() in ("true", "1", "yes"),
            (FieldType.STRING, FieldType.TEXT): lambda v: v,
            (FieldType.TEXT, FieldType.STRING): lambda v: v,
        }

        converter = converters.get((self.type, target_type))
        if converter is None:
            raise ValueError(f"No converter available from {self.type} to {target_type}")

        try:
            new_value = converter(self.value)
        # OverflowError: int() of an infinite float.
        # AttributeError: .lower() on a value that is not a string (e.g. None).
        # Both are reported as ValueError, like every other conversion failure.
        except (ValueError, TypeError, OverflowError, AttributeError) as e:
            raise ValueError(
                f"Cannot convert {self.name} from {self.type} to {target_type}: {e}"
            ) from e

        return Field(
            name=self.name, value=new_value, type=target_type, metadata=self.metadata.copy()
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert the field to a dictionary representation.

        Returns:
            A dict with the keys ``name``, ``value``, ``type`` (the enum's
            string value, or ``None``) and ``metadata``. The value and
            metadata objects are returned as-is, not copied.
        """
        return {
            "name": self.name,
            "value": self.value,
            "type": self.type.value if self.type else None,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Field:
        """Create a field from a dictionary representation.

        Args:
            data: Mapping with ``name`` and ``value`` keys and optional
                ``type`` and ``metadata`` keys.

        Returns:
            A ``VectorField`` when the type is ``vector`` or
            ``sparse_vector``; otherwise an instance of ``cls``.

        Raises:
            KeyError: If ``name`` or ``value`` is missing.
            ValueError: If ``type`` is not a valid ``FieldType`` value.
        """
        field_type = None
        if data.get("type"):
            field_type = FieldType(data["type"])

        # Handle vector fields specially
        if field_type in (FieldType.VECTOR, FieldType.SPARSE_VECTOR):
            return VectorField.from_dict(data)

        return cls(
            name=data["name"],
            value=data["value"],
            type=field_type,
            # An explicit None is treated like a missing key so that the
            # field always carries a dict.
            metadata=data.get("metadata") or {},
        )

163 

164 

class VectorField(Field):
    """Represents a vector field with embeddings and metadata.

    The vector is stored as a float32 numpy array. Its dimensionality and
    optional provenance (source field, model name and version) are recorded
    both as attributes and in the field metadata.

    Examples:
        # Simple usage - name optional when used in Record
        record = Record({
            "embedding": VectorField(value=[0.1, 0.2, 0.3])
        })

        # With explicit configuration
        field = VectorField(
            value=embedding_array,
            name="doc_embedding",
            model_name="all-MiniLM-L6-v2",
            source_field="content"
        )

        # From text using embedding function
        field = VectorField.from_text(
            "This is the text to embed",
            embedding_fn=model.encode
        )
    """

    def __init__(
        self,
        value: np.ndarray | list[float],
        name: str | None = None,  # Made optional
        dimensions: int | None = None,  # Auto-detected from value
        source_field: str | None = None,
        model_name: str | None = None,
        model_version: str | None = None,
        metadata: dict[str, Any] | None = None,
    ):
        """Initialize a vector field.

        Args:
            value: Vector data as numpy array or list of floats
            name: Field name (optional, defaults to "embedding")
            dimensions: Expected dimensions (auto-detected if not provided)
            source_field: Name of the text field this vector was generated from
            model_name: Name of the embedding model used
            model_version: Version of the embedding model
            metadata: Additional metadata; copied, never modified in place

        Raises:
            ImportError: If numpy is not installed.
            TypeError: If ``value`` is neither a list nor a numpy array.
            ValueError: If ``value`` is a 0-dimensional array, or if
                ``dimensions`` does not match the size of the last axis.
        """
        # Import numpy lazily to avoid hard dependency
        try:
            import numpy as np
        except ImportError as e:
            raise ImportError(
                "numpy is required for vector fields. Install with: pip install numpy"
            ) from e

        # Set default name if not provided
        if name is None:
            name = "embedding"

        # Convert to numpy array if needed
        if isinstance(value, list):
            value = np.array(value, dtype=np.float32)
        elif isinstance(value, np.ndarray):
            # Ensure float32 dtype for consistency
            if value.dtype != np.float32:
                value = value.astype(np.float32)
        else:
            raise TypeError(
                f"Vector value must be numpy array or list, got {type(value)}"
            )

        # A 0-d array has no axis to measure; fail with a clear message
        # instead of an IndexError from shape[-1].
        if value.ndim == 0:
            raise ValueError(
                f"Vector value for field '{name}' must have at least one dimension"
            )

        # Auto-detect dimensions if not provided
        actual_dims = len(value) if value.ndim == 1 else value.shape[-1]
        if dimensions is None:
            dimensions = actual_dims
        elif dimensions != actual_dims:
            raise ValueError(
                f"Vector dimension mismatch for field '{name}': "
                f"expected {dimensions}, got {actual_dims}"
            )

        # Store vector metadata in a copy so the caller's dict is untouched
        vector_metadata = dict(metadata) if metadata else {}
        vector_metadata.update({
            "dimensions": dimensions,
            "source_field": source_field,
            "model": {
                "name": model_name,
                "version": model_version,
            } if model_name else None,
        })

        super().__init__(
            name=name,
            value=value,
            type=FieldType.VECTOR,
            metadata=vector_metadata,
        )

        self.dimensions = dimensions
        self.source_field = source_field
        self.model_name = model_name
        self.model_version = model_version

    @classmethod
    def from_text(
        cls,
        text: str,
        embedding_fn: Callable[[str], Any],
        name: str | None = None,
        dimensions: int | None = None,
        model_name: str | None = None,
        model_version: str | None = None,
        **kwargs
    ) -> VectorField:
        """Create a VectorField from text using an embedding function.

        Args:
            text: Text to embed
            embedding_fn: Function that takes text and returns embedding vector
            name: Field name (optional, defaults to "embedding")
            dimensions: Expected dimensions (auto-detected if not provided)
            model_name: Name of the embedding model
            model_version: Version of the embedding model
            **kwargs: Additional arguments passed to VectorField constructor.
                ``source_field`` defaults to "text" when not given here.

        Returns:
            VectorField instance with the generated embedding

        Example:
            field = VectorField.from_text(
                "Machine learning is fascinating",
                embedding_fn=model.encode,
                model_name="all-MiniLM-L6-v2"
            )
        """
        embedding = embedding_fn(text)
        # Default rather than hard-code, so a caller-supplied source_field
        # does not collide with a duplicate keyword argument.
        kwargs.setdefault("source_field", "text")
        return cls(
            value=embedding,
            name=name,
            dimensions=dimensions,
            model_name=model_name,
            model_version=model_version,
            **kwargs
        )

    @staticmethod
    def _as_vector(other: VectorField | np.ndarray | list[float]) -> Any:
        """Return the array-like operand to use for a vector computation."""
        import numpy as np

        if isinstance(other, VectorField):
            return other.value
        if isinstance(other, list):
            return np.array(other, dtype=np.float32)
        return other

    def validate(self) -> bool:
        """Validate the vector field.

        Returns:
            ``True`` when the value is ``None``, or when it is a 1-D or 2-D
            numpy array whose size matches the ``dimensions`` metadata entry
            (if that entry is set). ``False`` otherwise, including when
            numpy cannot be imported.
        """
        if self.value is None:
            return True

        try:
            import numpy as np
        except ImportError:
            return False

        if not isinstance(self.value, np.ndarray):
            return False

        if self.value.ndim not in (1, 2):
            return False

        # Check dimensions match metadata
        actual_dims = len(self.value) if self.value.ndim == 1 else self.value.shape[-1]
        expected_dims = self.metadata.get("dimensions")
        if expected_dims and actual_dims != expected_dims:
            return False

        return True

    def to_list(self) -> list[float]:
        """Convert vector to a list of floats."""
        import numpy as np

        if isinstance(self.value, np.ndarray):
            return self.value.tolist()
        return list(self.value)

    def cosine_similarity(self, other: VectorField | np.ndarray | list[float]) -> float:
        """Compute cosine similarity with another vector.

        Args:
            other: Another ``VectorField``, a numpy array, or a list of floats.

        Returns:
            The cosine similarity, or ``0.0`` when either vector has zero norm.
        """
        import numpy as np

        other_vec = self._as_vector(other)

        # Compute cosine similarity
        dot_product = np.dot(self.value, other_vec)
        norm_a = np.linalg.norm(self.value)
        norm_b = np.linalg.norm(other_vec)

        if norm_a == 0 or norm_b == 0:
            return 0.0

        return float(dot_product / (norm_a * norm_b))

    def euclidean_distance(self, other: VectorField | np.ndarray | list[float]) -> float:
        """Compute Euclidean distance to another vector.

        Args:
            other: Another ``VectorField``, a numpy array, or a list of floats.

        Returns:
            The norm of the difference between the two vectors.
        """
        import numpy as np

        other_vec = self._as_vector(other)
        return float(np.linalg.norm(self.value - other_vec))

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation.

        Returns:
            A dict with the keys ``name``, ``value`` (the vector as a plain
            list), ``type``, ``metadata`` and ``dimensions``.
        """
        return {
            "name": self.name,
            "value": self.to_list(),
            "type": self.type.value,
            "metadata": self.metadata,
            "dimensions": self.dimensions,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> VectorField:
        """Create from dictionary representation.

        Args:
            data: Mapping with ``name`` and ``value`` keys and optional
                ``dimensions`` and ``metadata`` keys. Not modified.

        Returns:
            A new instance built from ``data``; dimensions, source field and
            model information fall back to the metadata entries.
        """
        # An explicit None is treated like a missing metadata key.
        metadata = data.get("metadata") or {}
        # "model" is stored as None when no model name was given.
        model_info = metadata.get("model") or {}

        return cls(
            name=data["name"],
            value=data["value"],
            dimensions=data.get("dimensions") or metadata.get("dimensions"),
            source_field=metadata.get("source_field"),
            model_name=model_info.get("name"),
            model_version=model_info.get("version"),
            metadata=metadata,
        )
398 model_version=model_info.get("version") if model_info else None, 

399 metadata=metadata, 

400 )