Coverage for src/dataknobs_data/vector/stores/common.py: 16%

87 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-31 15:06 -0600

1"""Common base implementation for vector stores.""" 

2 

3from __future__ import annotations 

4 

5from typing import TYPE_CHECKING, Any, cast 

6 

7from dataknobs_config import ConfigurableBase 

8 

9from ..types import DistanceMetric 

10 

11if TYPE_CHECKING: 

12 import numpy as np 

13 

14 

class VectorStoreBase(ConfigurableBase):
    """Base implementation with common functionality for all vector stores.

    This class provides:
    - Configuration parsing following the database pattern
    - Common parameter extraction
    - Shared utility methods

    Construction stores the configuration dictionary on ``self.config``,
    extracts the shared parameters (see ``_parse_common_config``), gives
    subclasses a chance to read their own parameters
    (``_parse_backend_config``) and finally sets ``self._initialized`` to
    ``False``.
    """

    # Upper bound enforced by ``_validate_dimensions``. Kept as a class
    # attribute so a backend with a different limit can override it.
    MAX_DIMENSIONS: int = 65536

    def __init__(self, config: dict[str, Any] | None = None):
        """Initialize vector store with configuration.

        Args:
            config: Configuration dictionary with backend-specific parameters.
                ``None`` (or any falsy value) is treated as an empty dict.
        """
        # ConfigurableBase doesn't have __init__, so don't call super().__init__()
        self.config = config or {}
        self._parse_common_config()
        self._parse_backend_config()
        # Set last, after both parsing hooks have run.
        self._initialized = False

    def _parse_common_config(self) -> None:
        """Parse common configuration parameters.

        Extracts parameters that are common to all vector stores and stores
        them as attributes of the same name:

        - dimensions: Vector dimensions (default ``0``)
        - metric: Distance metric (default ``"cosine"``); a string is
          converted with ``DistanceMetric(...)``, any other value is stored
          unchanged
        - persist_path: Path for persistent storage (default ``None``)
        - batch_size: Batch size for operations (default ``100``)
        - index_params: Index-specific parameters (default ``{}``)
        - search_params: Search-specific parameters (default ``{}``)
        - metadata: Additional metadata (default ``{}``)
        """
        cfg = self.config

        # Dimensions are not validated here; see _validate_dimensions().
        self.dimensions = cfg.get("dimensions", 0)

        metric = cfg.get("metric", "cosine")
        self.metric = DistanceMetric(metric) if isinstance(metric, str) else metric

        self.persist_path = cfg.get("persist_path")
        self.batch_size = cfg.get("batch_size", 100)

        self.index_params = cfg.get("index_params", {})
        self.search_params = cfg.get("search_params", {})

        self.metadata = cfg.get("metadata", {})

    def _parse_backend_config(self) -> None:
        """Parse backend-specific configuration.

        Override this method in subclasses to handle backend-specific
        parameters. It is called from ``__init__`` after the common
        parameters have been extracted. The base implementation does nothing.
        """

    def _validate_dimensions(self) -> None:
        """Validate vector dimensions.

        Raises:
            ValueError: If ``self.dimensions`` is not positive or exceeds
                ``MAX_DIMENSIONS``.
        """
        if self.dimensions <= 0:
            raise ValueError(f"Dimensions must be positive, got {self.dimensions}")
        if self.dimensions > self.MAX_DIMENSIONS:
            raise ValueError(
                f"Dimensions {self.dimensions} exceeds maximum ({self.MAX_DIMENSIONS})"
            )

    def _normalize_vector(self, vector: np.ndarray) -> np.ndarray:
        """Normalize a vector for cosine similarity.

        Args:
            vector: Vector to normalize

        Returns:
            The vector divided by its norm, or the input itself (same
            object) when the norm is zero.
        """
        import numpy as np

        norm = np.linalg.norm(vector)
        if norm == 0:
            # A zero vector has no direction; return it as-is rather than
            # dividing by zero.
            return vector
        return vector / norm

    @staticmethod
    def _cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Return the cosine similarity of two vectors (0.0 if either has zero norm)."""
        import numpy as np

        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return float(np.dot(vec1, vec2) / (norm1 * norm2))

    def _calculate_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Calculate similarity between two vectors based on configured metric.

        - Cosine: cosine similarity, ``0.0`` if either vector has zero norm
        - Euclidean / L2: ``1 / (1 + euclidean_distance)``
        - Dot product / inner product: the raw dot product
        - L1: ``1 / (1 + manhattan_distance)``
        - Any other metric falls back to cosine similarity

        Args:
            vec1: First vector
            vec2: Second vector

        Returns:
            Similarity score as a Python ``float``
        """
        import numpy as np

        if self.metric == DistanceMetric.COSINE:
            return self._cosine_similarity(vec1, vec2)

        if self.metric in (DistanceMetric.EUCLIDEAN, DistanceMetric.L2):
            distance = float(np.linalg.norm(vec1 - vec2))
            return 1.0 / (1.0 + distance)

        if self.metric in (DistanceMetric.DOT_PRODUCT, DistanceMetric.INNER_PRODUCT):
            return float(np.dot(vec1, vec2))

        if self.metric == DistanceMetric.L1:
            # float() keeps the return type a plain Python float instead of
            # leaking a NumPy scalar, consistent with the other branches.
            distance = float(np.sum(np.abs(vec1 - vec2)))
            return 1.0 / (1.0 + distance)

        # Unrecognized metric: default to cosine.
        return self._cosine_similarity(vec1, vec2)

    def _convert_distance_to_score(self, distance: float) -> float:
        """Convert a distance to a similarity score based on metric.

        Args:
            distance: Distance value

        Returns:
            Similarity score (higher is more similar)
        """
        if self.metric == DistanceMetric.COSINE:
            # Cosine distance is 1 - similarity
            return 1.0 - distance
        if self.metric in (
            DistanceMetric.EUCLIDEAN,
            DistanceMetric.L2,
            DistanceMetric.L1,
        ):
            # Euclidean and Manhattan distances share the same mapping.
            return 1.0 / (1.0 + distance)
        # For dot product and others, higher is better: pass through unchanged.
        return distance

    def _prepare_vector(self, vector: np.ndarray | list[float] | list[np.ndarray], normalize: bool = False) -> np.ndarray:
        """Prepare a vector for storage or search.

        The input is converted to a ``float32`` array, a 1-D input is
        reshaped to a single row, and rows are scaled to unit length when
        requested. Rows with zero norm are left unchanged.

        Args:
            vector: Input vector (numpy array, list of floats, or list of arrays)
            normalize: Whether to normalize for cosine similarity. Note that
                normalization is always applied when the configured metric
                is cosine, even if this flag is ``False``.

        Returns:
            Prepared numpy array
        """
        import numpy as np

        if isinstance(vector, list):
            if len(vector) > 0 and isinstance(vector[0], np.ndarray):
                # List of arrays - stack them into rows
                prepared = np.vstack(vector).astype(np.float32)
            else:
                # List of floats (or an empty list)
                prepared = np.array(vector, dtype=np.float32)
        else:
            prepared = np.asarray(vector, dtype=np.float32)

        if prepared.ndim == 1:
            prepared = prepared.reshape(1, -1)

        if normalize or self.metric == DistanceMetric.COSINE:
            norms = np.linalg.norm(prepared, axis=1, keepdims=True)
            norms[norms == 0] = 1  # Avoid division by zero
            prepared = prepared / norms

        return cast("np.ndarray", prepared)

    def _apply_metadata_filter(self, candidates: list[tuple[Any, dict]], filter: dict[str, Any]) -> list[tuple[Any, dict]]:
        """Apply metadata filter to candidates.

        A candidate is kept when, for every key in ``filter``,
        ``metadata.get(key)`` equals the filter value. A key that is missing
        from the metadata therefore compares as ``None``.

        Args:
            candidates: List of (id, metadata) tuples
            filter: Filter criteria as key-value pairs

        Returns:
            Filtered list of candidates; the input list itself when the
            filter is empty.
        """
        if not filter:
            return candidates

        return [
            (item_id, metadata)
            for item_id, metadata in candidates
            if all(metadata.get(key) == value for key, value in filter.items())
        ]

    def __repr__(self) -> str:
        """String representation."""
        return (
            f"{self.__class__.__name__}("
            f"dimensions={self.dimensions}, "
            f"metric={self.metric.value})"
        )

234 f"dimensions={self.dimensions}, " 

235 f"metric={self.metric.value})" 

236 )