Coverage for src / dataknobs_data / schema.py: 35%

91 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-26 15:45 -0700

1"""Database schema definitions for field structures.""" 

2 

3from __future__ import annotations 

4 

5from dataclasses import dataclass, field 

6from typing import Any 

7 

8from .fields import FieldType 

9 

10 

@dataclass
class FieldSchema:
    """Schema definition for a field without actual data.

    Defines the structure and constraints for a field in a database schema.
    Used for validation, type checking, and backend schema generation.

    Attributes:
        name: Field name
        type: Field data type
        metadata: Additional field metadata
        required: Whether the field is required
        default: Default value if field is missing

    Example:
        ```python
        from dataknobs_data.schema import FieldSchema
        from dataknobs_data.fields import FieldType

        # Simple field schema
        name_schema = FieldSchema(name="name", type=FieldType.STRING, required=True)

        # Vector field schema with metadata
        embedding_schema = FieldSchema(
            name="embedding",
            type=FieldType.VECTOR,
            metadata={"dimensions": 384, "source_field": "content"},
            required=False
        )

        # Check if vector field
        is_vector = embedding_schema.is_vector_field()  # True
        dims = embedding_schema.get_dimensions()  # 384
        ```
    """

    name: str
    type: FieldType
    metadata: dict[str, Any] = field(default_factory=dict)
    required: bool = False
    default: Any = None

    def is_vector_field(self) -> bool:
        """Check if this is a vector field.

        Returns:
            True if the field type is VECTOR or SPARSE_VECTOR

        Example:
            ```python
            vector_schema = FieldSchema(name="embedding", type=FieldType.VECTOR)
            print(vector_schema.is_vector_field())  # True

            text_schema = FieldSchema(name="content", type=FieldType.TEXT)
            print(text_schema.is_vector_field())  # False
            ```
        """
        return self.type in (FieldType.VECTOR, FieldType.SPARSE_VECTOR)

    def get_dimensions(self) -> int | None:
        """Get vector dimensions if this is a vector field.

        Returns:
            The ``"dimensions"`` metadata entry for vector fields (``None`` when
            the entry is absent), and ``None`` for every non-vector field.
        """
        if self.is_vector_field():
            return self.metadata.get("dimensions")
        return None

    def get_source_field(self) -> str | None:
        """Get source field if this is a derived vector field.

        Returns:
            The ``"source_field"`` metadata entry for vector fields (``None``
            when the entry is absent), and ``None`` for every non-vector field.
        """
        if self.is_vector_field():
            return self.metadata.get("source_field")
        return None

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation.

        Returns:
            A dict with the keys ``name``, ``type`` (the ``value`` of the field
            type), ``metadata``, ``required`` and ``default``.
        """
        return {
            "name": self.name,
            "type": self.type.value,
            # Shallow copy: editing the serialized form must not silently
            # change this schema's own metadata.
            "metadata": dict(self.metadata),
            "required": self.required,
            "default": self.default,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> FieldSchema:
        """Create from dictionary representation.

        Args:
            data: Mapping with mandatory ``name`` and ``type`` keys and optional
                ``metadata``, ``required`` and ``default`` keys.

        Returns:
            A new FieldSchema built from ``data``.

        Raises:
            KeyError: If ``name`` or ``type`` is missing from ``data``.
        """
        # NOTE(review): FieldType(...) presumably raises ValueError for an
        # unknown type string (standard Enum behaviour) - confirm in fields.py.
        return cls(
            name=data["name"],
            type=FieldType(data["type"]),
            # Copy so the schema never shares state with the caller's dict, and
            # treat an explicit ``None`` (e.g. JSON null) like a missing key so
            # later ``self.metadata.get(...)`` calls cannot fail.
            metadata=dict(data.get("metadata") or {}),
            required=data.get("required", False),
            default=data.get("default"),
        )

102 

103 

@dataclass
class DatabaseSchema:
    """Schema definition for a database.

    Defines the structure of a database by specifying field schemas. Used for
    validation, type checking, and ensuring consistent data structure across records.

    Attributes:
        fields: Dictionary mapping field names to FieldSchema objects
        metadata: Optional schema-level metadata

    Example:
        ```python
        from dataknobs_data.schema import DatabaseSchema, FieldSchema
        from dataknobs_data.fields import FieldType

        # Create schema using .create() method
        schema = DatabaseSchema.create(
            name=FieldType.STRING,
            age=FieldType.INTEGER,
            email=FieldType.STRING
        )

        # With vector field and metadata
        schema = DatabaseSchema.create(
            content=FieldType.TEXT,
            embedding=(FieldType.VECTOR, {
                "dimensions": 384,
                "source_field": "content"
            })
        )

        # Add fields after creation
        schema.add_field(FieldSchema(
            name="created_at",
            type=FieldType.DATETIME,
            required=True
        ))

        # Look up fields
        content_schema = schema.fields["content"]
        vector_fields = schema.get_vector_fields()
        ```
    """

    fields: dict[str, FieldSchema] = field(default_factory=dict)
    metadata: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def create(cls, **field_definitions: Any) -> DatabaseSchema:
        """Create a schema from keyword arguments.

        Args:
            **field_definitions: Field definitions where each key is a field name and
                each value is either a FieldType or a tuple of (FieldType, options_dict)

        Returns:
            A new DatabaseSchema instance

        Raises:
            ValueError: If a definition is neither a FieldType nor a two-item tuple.

        Example:
            ```python
            # Simple field types
            schema = DatabaseSchema.create(
                name=FieldType.STRING,
                age=FieldType.INTEGER
            )

            # With field options
            schema = DatabaseSchema.create(
                content=FieldType.TEXT,
                embedding=(FieldType.VECTOR, {"dimensions": 384, "source_field": "content"}),
                score=(FieldType.FLOAT, {"required": True, "default": 0.0})
            )
            ```
        """
        schema = cls()
        for name, definition in field_definitions.items():
            if isinstance(definition, FieldType):
                # Simple field type
                schema.add_field(FieldSchema(name=name, type=definition))
                continue

            # Anything else must be exactly (FieldType, options_dict); a tuple
            # of another length gets the same clear error as a non-tuple.
            if not isinstance(definition, tuple) or len(definition) != 2:
                raise ValueError(f"Invalid field definition for {name}: {definition}")

            field_type, options = definition
            # Copy before adding keys so the caller's options["metadata"] dict
            # is never modified as a side effect of building the schema.
            field_metadata = dict(options.get("metadata") or {})
            # Top-level shortcuts take precedence over same-named metadata keys.
            for key in ("dimensions", "source_field"):
                if key in options:
                    field_metadata[key] = options[key]

            schema.add_field(FieldSchema(
                name=name,
                type=field_type,
                metadata=field_metadata,
                required=options.get("required", False),
                default=options.get("default"),
            ))
        return schema

    def add_field(self, field_schema: FieldSchema) -> DatabaseSchema:
        """Add a field to the schema.

        An existing field with the same name is replaced.

        Returns self for chaining.
        """
        self.fields[field_schema.name] = field_schema
        return self

    def add_text_field(self, name: str, required: bool = False) -> DatabaseSchema:
        """Add a text field to the schema.

        Returns self for chaining.
        """
        return self.add_field(FieldSchema(name=name, type=FieldType.TEXT, required=required))

    def add_vector_field(
        self,
        name: str,
        dimensions: int,
        source_field: str | None = None,
        required: bool = False
    ) -> DatabaseSchema:
        """Add a vector field to the schema.

        ``dimensions`` and ``source_field`` are stored in the field metadata;
        ``source_field`` is stored even when it is ``None``.

        Returns self for chaining.
        """
        return self.add_field(FieldSchema(
            name=name,
            type=FieldType.VECTOR,
            metadata={"dimensions": dimensions, "source_field": source_field},
            required=required,
        ))

    def remove_field(self, name: str) -> bool:
        """Remove a field from the schema.

        Returns:
            True if the field existed and was removed, False otherwise.
        """
        if name in self.fields:
            del self.fields[name]
            return True
        return False

    def get_vector_fields(self) -> dict[str, FieldSchema]:
        """Get all vector fields in the schema.

        Returns:
            A new dict with the entries of ``fields`` whose schema reports
            ``is_vector_field()``.
        """
        # Loop variable deliberately not called ``field``: that name is the
        # dataclasses helper imported at module level.
        return {
            name: field_schema
            for name, field_schema in self.fields.items()
            if field_schema.is_vector_field()
        }

    def get_source_fields(self) -> dict[str, list[str]]:
        """Get mapping of source fields to their dependent vector fields.

        Vector fields without a (truthy) source field are left out.
        """
        source_map: dict[str, list[str]] = {}
        for name, field_schema in self.fields.items():
            if not field_schema.is_vector_field():
                continue
            source = field_schema.get_source_field()
            if source:
                source_map.setdefault(source, []).append(name)
        return source_map

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation.

        Returns:
            A dict with ``fields`` (field name to serialized field) and
            ``metadata``.
        """
        return {
            "fields": {name: f.to_dict() for name, f in self.fields.items()},
            # Shallow copy so edits to the output do not leak into the schema.
            "metadata": dict(self.metadata),
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> DatabaseSchema:
        """Create from dictionary representation.

        Supports multiple formats:
        1. Full format with FieldSchema dicts
        2. Simple format with just field types
        3. Mixed format

        Examples:
            # Simple format
            {"fields": {"content": "text", "score": "float"}}

            # Full format
            {"fields": {"content": {"type": "text", "required": true}}}

            # Vector fields
            {"fields": {"embedding": {"type": "vector", "dimensions": 384}}}

        Raises:
            KeyError: If a field given as a dict has no ``"type"`` key.
        """
        # ``or {}`` treats an explicit None (JSON null) like a missing key; the
        # copy keeps the schema independent of the caller's dict.
        schema = cls(metadata=dict(data.get("metadata") or {}))

        for name, field_data in (data.get("fields") or {}).items():
            if isinstance(field_data, str):
                # Simple string type
                schema.fields[name] = FieldSchema(
                    name=name,
                    type=FieldType(field_data),
                )
            elif isinstance(field_data, dict):
                if "type" not in field_data:
                    # Without a type no FieldSchema can be built; fail with a
                    # message that names the offending field.
                    raise KeyError(
                        f"Field definition for {name!r} is missing required key 'type'"
                    )

                # Vector-specific shortcuts first ...
                metadata = {
                    key: field_data[key]
                    for key in ("dimensions", "source_field")
                    if key in field_data
                }
                # ... then explicit metadata, which wins on conflicts.
                metadata.update(field_data.get("metadata") or {})

                schema.fields[name] = FieldSchema(
                    name=name,
                    type=FieldType(field_data["type"]),
                    metadata=metadata,
                    required=field_data.get("required", False),
                    default=field_data.get("default"),
                )
            # NOTE(review): values that are neither str nor dict are skipped
            # silently, unlike create(), which raises ValueError - confirm
            # whether that leniency is intended.

        return schema