Coverage for src/dataknobs_data/factory.py: 19%

72 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-29 14:14 -0600

1"""Backend factory for dynamic database creation.""" 

2 

3import logging 

4from typing import Any 

5 

6from dataknobs_config import FactoryBase 

7 

8from dataknobs_data.database import SyncDatabase 

9 

10# Import the VectorStoreFactory from vector.stores.factory 

11from dataknobs_data.vector.stores.factory import VectorStoreFactory 

12 

13 

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

15 

16 

class DatabaseFactory(FactoryBase):
    """Factory for creating database backends dynamically.

    This factory allows creating different database implementations
    based on configuration, supporting all available backends.

    Configuration Options:
        backend (str): Backend type (memory, file, postgres, elasticsearch,
            sqlite, s3). The aliases mem, postgresql, pg and es are also
            accepted. Matching is case-insensitive; defaults to memory.
        **kwargs: Backend-specific configuration options

    Example Configuration:
        databases:
          - name: main
            factory: database
            backend: postgres
            host: localhost
            database: myapp

          - name: cache
            factory: database
            backend: memory

          - name: archive
            factory: database
            backend: s3
            bucket: my-archive-bucket
            prefix: archives/
    """

    @staticmethod
    def _canonical_backend(backend_type: str) -> str:
        """Map a backend name or alias to its canonical lowercase name.

        Unrecognized names are returned lowercased and otherwise unchanged,
        so callers can decide how to report them.
        """
        aliases = {
            "mem": "memory",
            "postgresql": "postgres",
            "pg": "postgres",
            "es": "elasticsearch",
        }
        name = backend_type.lower()
        return aliases.get(name, name)

    def create(self, **config) -> SyncDatabase:
        """Create a database instance based on configuration.

        Backend modules are imported lazily, inside the matching branch, so
        optional dependencies are only needed for the backend actually used.

        Args:
            **config: Configuration including 'backend' field and
                backend-specific options. 'backend' is removed before the
                remaining options are passed to the backend's ``from_config``.

        Returns:
            Instance of appropriate database backend

        Raises:
            ValueError: If backend type is not a string, is not recognized,
                or its optional dependency cannot be imported
        """
        requested = config.pop("backend", "memory")
        if not isinstance(requested, str):
            # e.g. ``backend: null`` in a config file; fail with the documented
            # exception type instead of an AttributeError from ``.lower()``.
            raise ValueError(
                f"Backend type must be a string, got {type(requested).__name__}"
            )
        backend_type = requested.lower()

        logger.info("Creating database with backend: %s", backend_type)

        if config.get("vector_enabled", False):
            # All backends now have vector support (some native, some via Python)
            logger.debug("Vector support enabled for backend: %s", backend_type)

        canonical = self._canonical_backend(backend_type)

        if canonical == "memory":
            from dataknobs_data.backends.memory import SyncMemoryDatabase
            return SyncMemoryDatabase.from_config(config)

        if canonical == "file":
            from dataknobs_data.backends.file import SyncFileDatabase
            return SyncFileDatabase.from_config(config)

        if canonical == "postgres":
            # from_config stays inside the try so that an ImportError raised
            # while constructing the backend is reported with the same hint.
            try:
                from dataknobs_data.backends.postgres import SyncPostgresDatabase
                return SyncPostgresDatabase.from_config(config)
            except ImportError as e:
                raise ValueError(
                    "PostgreSQL backend requires psycopg2. "
                    "Install with: pip install dataknobs-data[postgres]"
                ) from e

        if canonical == "elasticsearch":
            try:
                from dataknobs_data.backends.elasticsearch import SyncElasticsearchDatabase
                return SyncElasticsearchDatabase.from_config(config)
            except ImportError as e:
                raise ValueError(
                    "Elasticsearch backend requires elasticsearch package. "
                    "Install with: pip install dataknobs-data[elasticsearch]"
                ) from e

        if canonical == "sqlite":
            from dataknobs_data.backends.sqlite import SyncSQLiteDatabase
            return SyncSQLiteDatabase.from_config(config)

        if canonical == "s3":
            try:
                from dataknobs_data.backends.s3 import SyncS3Database
                return SyncS3Database.from_config(config)
            except ImportError as e:
                raise ValueError(
                    "S3 backend requires boto3. "
                    "Install with: pip install dataknobs-data[s3]"
                ) from e

        raise ValueError(
            f"Unknown backend type: {backend_type}. "
            f"Available backends: memory, file, postgres, elasticsearch, sqlite, s3"
        )

    def get_backend_info(self, backend_type: str) -> dict[str, Any]:
        """Get information about a specific backend.

        The lookup is case-insensitive and accepts the same aliases as
        ``create`` (mem, postgresql, pg, es).

        Args:
            backend_type: Name (or alias) of the backend

        Returns:
            Dictionary with backend information. For an unrecognized name the
            dictionary contains only 'description' and 'error' keys.
        """
        info: dict[str, dict[str, Any]] = {
            "memory": {
                "description": "In-memory storage for testing and caching",
                "persistent": False,
                "requires_install": False,
                "config_options": {
                    "initial_data": "Optional initial data dictionary"
                }
            },
            "file": {
                "description": "File-based storage (JSON, CSV, Parquet)",
                "persistent": True,
                "requires_install": False,
                "config_options": {
                    "path": "Path to the file (required)",
                    "format": "File format: json, csv, parquet (default: json)",
                    "compression": "Optional compression: gzip, bz2, xz"
                }
            },
            "postgres": {
                "description": "PostgreSQL database backend with native vector support (pgvector)",
                "persistent": True,
                "requires_install": "pip install dataknobs-data[postgres]",
                "vector_support": True,
                "config_options": {
                    "host": "Database host (required)",
                    "port": "Database port (default: 5432)",
                    "database": "Database name (required)",
                    "user": "Username (required)",
                    "password": "Password (required)",
                    "table": "Table name (default: records)",
                    "vector_enabled": "Enable vector support (default: False)",
                    "vector_metric": "Distance metric for vectors: cosine, euclidean, dot_product (default: cosine)"
                }
            },
            "elasticsearch": {
                "description": "Elasticsearch search engine backend with native KNN vector support",
                "persistent": True,
                "requires_install": "pip install dataknobs-data[elasticsearch]",
                "vector_support": True,
                "config_options": {
                    "hosts": "List of host URLs (required)",
                    "index": "Index name (required)",
                    "doc_type": "Document type (default: _doc)",
                    "username": "Optional username",
                    "password": "Optional password",
                    "vector_enabled": "Enable vector support (default: False)",
                    "vector_metric": "Distance metric for vectors: cosine, euclidean, dot_product (default: cosine)"
                }
            },
            "sqlite": {
                "description": "SQLite database backend with Python-based vector support",
                "persistent": True,
                "requires_install": False,
                "vector_support": True,
                "config_options": {
                    "path": "Path to database file (required)",
                    "table": "Table name (default: records)",
                    "vector_enabled": "Enable vector support (default: False)",
                    "vector_metric": "Distance metric for vectors: cosine, euclidean, dot_product (default: cosine)"
                }
            },
            "s3": {
                "description": "AWS S3 object storage backend",
                "persistent": True,
                "requires_install": "pip install dataknobs-data[s3]",
                "config_options": {
                    "bucket": "S3 bucket name (required)",
                    "prefix": "Object key prefix (default: records/)",
                    "region": "AWS region (default: us-east-1)",
                    "endpoint_url": "Custom endpoint for S3-compatible services",
                    "access_key_id": "AWS access key (or use IAM role)",
                    "secret_access_key": "AWS secret key (or use IAM role)"
                }
            }
        }

        # Resolve aliases so this lookup agrees with what create() accepts;
        # the error text still echoes the name exactly as the caller gave it.
        return info.get(self._canonical_backend(backend_type), {
            "description": "Unknown backend",
            "error": f"Backend '{backend_type}' not recognized"
        })

208 

209 

class AsyncDatabaseFactory(FactoryBase):
    """Factory that builds asynchronous database backends from configuration.

    Recognized backend names are memory (mem), file, postgres (postgresql,
    pg), elasticsearch (es), s3 and sqlite; any other name is rejected.
    """

    def create(self, **config) -> Any:
        """Build an async database instance for the configured backend.

        The backend module is imported only once its name has been matched,
        and the remaining options are handed to that class's ``from_config``.

        Args:
            **config: Configuration options. The 'backend' entry (default
                "memory", case-insensitive) selects the implementation and is
                removed before the rest is passed on.

        Returns:
            Instance of the selected async database backend

        Raises:
            ValueError: If the backend name is not one of the async backends
        """
        name = config.pop("backend", "memory").lower()

        # Vector support needs no special handling here; it is only logged.
        if config.get("vector_enabled", False):
            logger.debug(f"Vector support enabled for async backend: {name}")

        # Pick the implementation class; each import stays inside its branch
        # so only the requested backend's module is loaded.
        if name == "memory" or name == "mem":
            from dataknobs_data.backends.memory import AsyncMemoryDatabase as database_cls
        elif name == "file":
            from dataknobs_data.backends.file import AsyncFileDatabase as database_cls
        elif name == "postgres" or name == "postgresql" or name == "pg":
            from dataknobs_data.backends.postgres import AsyncPostgresDatabase as database_cls
        elif name == "elasticsearch" or name == "es":
            from dataknobs_data.backends.elasticsearch_async import (
                AsyncElasticsearchDatabase as database_cls,
            )
        elif name == "s3":
            from dataknobs_data.backends.s3_async import AsyncS3Database as database_cls
        elif name == "sqlite":
            from dataknobs_data.backends.sqlite_async import AsyncSQLiteDatabase as database_cls
        else:
            raise ValueError(
                f"Backend '{name}' does not support async operations yet. "
                f"Available async backends: memory, file, postgres, elasticsearch, s3, sqlite"
            )

        return database_cls.from_config(config)

266 

267 

268# TODO: Add AsyncVectorStoreFactory when async vector stores are implemented 

269# The async vector store implementations (AsyncFaissVectorStore, AsyncChromaVectorStore,  

270# AsyncMemoryVectorStore) and base class (AsyncVectorStore) need to be created first. 

271 

272 

# Create singleton instances for registration.
# These are instantiated once, at import time. The registration step itself is
# not visible in this module — presumably done by whoever imports these names.
database_factory = DatabaseFactory()
async_database_factory = AsyncDatabaseFactory()
vector_store_factory = VectorStoreFactory()

277# TODO: add an 'async_vector_store_factory = AsyncVectorStoreFactory()' when async vector stores are implemented