Coverage for src/dataknobs_data/vector/stores/common.py: 16%
87 statements
« prev ^ index » next coverage.py v7.11.3, created at 2025-11-13 11:23 -0700
« prev ^ index » next coverage.py v7.11.3, created at 2025-11-13 11:23 -0700
1"""Common base implementation for vector stores."""
3from __future__ import annotations
5from typing import TYPE_CHECKING, Any, cast
7from dataknobs_config import ConfigurableBase
9from ..types import DistanceMetric
11if TYPE_CHECKING:
12 import numpy as np
class VectorStoreBase(ConfigurableBase):
    """Base implementation with common functionality for all vector stores.

    This class provides:
    - Configuration parsing following the database pattern
    - Common parameter extraction
    - Shared utility methods
    """

    def __init__(self, config: dict[str, Any] | None = None):
        """Initialize vector store with configuration.

        Args:
            config: Configuration dictionary with backend-specific parameters.
                ``None`` (or any other falsy value) is replaced by an empty
                dictionary.
        """
        # ConfigurableBase doesn't have __init__, so don't call super().__init__()
        self.config = config or {}
        self._parse_common_config()
        self._parse_backend_config()
        # Assigned after both parse hooks run, so a value set by a subclass
        # inside _parse_backend_config() is overwritten here.
        self._initialized = False

    def _parse_common_config(self) -> None:
        """Parse common configuration parameters.

        Extracts parameters that are common to all vector stores and stores
        them as instance attributes:
        - dimensions: Vector dimensions (default ``0``)
        - metric: Distance metric (default ``"cosine"``)
        - persist_path: Path for persistent storage (default ``None``)
        - batch_size: Batch size for operations (default ``100``)
        - index_params: Index-specific parameters (default ``{}``)
        - search_params: Search-specific parameters (default ``{}``)
        - metadata: Additional metadata (default ``{}``)
        """
        cfg = self.config

        # Dimensions (required for most stores). The default of 0 is rejected
        # by _validate_dimensions(), which is not called from this method.
        self.dimensions = cfg.get("dimensions", 0)

        # A string metric is converted through the DistanceMetric constructor;
        # any non-string value is stored unchanged.
        metric = cfg.get("metric", "cosine")
        self.metric = DistanceMetric(metric) if isinstance(metric, str) else metric

        # Paths and sizes
        self.persist_path = cfg.get("persist_path")
        self.batch_size = cfg.get("batch_size", 100)

        # Parameter dictionaries
        self.index_params = cfg.get("index_params", {})
        self.search_params = cfg.get("search_params", {})

        # Any additional metadata
        self.metadata = cfg.get("metadata", {})

    def _parse_backend_config(self) -> None:
        """Parse backend-specific configuration.

        Override this method in subclasses to handle backend-specific
        parameters. It is called from ``__init__`` after
        ``_parse_common_config``; the base implementation does nothing.
        """
        pass

    def _validate_dimensions(self) -> None:
        """Validate vector dimensions.

        Raises:
            ValueError: If ``self.dimensions`` is not positive or is greater
                than 65536.
        """
        if self.dimensions <= 0:
            raise ValueError(f"Dimensions must be positive, got {self.dimensions}")
        if self.dimensions > 65536:
            raise ValueError(f"Dimensions {self.dimensions} exceeds maximum (65536)")

    def _normalize_vector(self, vector: np.ndarray) -> np.ndarray:
        """Normalize a vector for cosine similarity.

        Args:
            vector: Vector to normalize

        Returns:
            ``vector`` divided by its norm, or ``vector`` itself (the same
            object) when its norm is zero.
        """
        import numpy as np

        norm = np.linalg.norm(vector)
        if norm == 0:
            # Nothing to scale; also avoids a division by zero.
            return vector
        return vector / norm

    def _calculate_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Calculate similarity between two vectors based on configured metric.

        Args:
            vec1: First vector
            vec2: Second vector

        Returns:
            Similarity score as a built-in ``float``:
            - Euclidean / L2: ``1 / (1 + ||vec1 - vec2||)``
            - dot product / inner product: ``vec1 . vec2``
            - L1: ``1 / (1 + sum(|vec1 - vec2|))``
            - cosine, and any other metric: cosine similarity, or ``0.0``
              when either vector has zero norm
        """
        import numpy as np

        metric = self.metric

        if metric in (DistanceMetric.EUCLIDEAN, DistanceMetric.L2):
            # Convert distance to similarity
            distance = float(np.linalg.norm(vec1 - vec2))
            return 1.0 / (1.0 + distance)

        if metric in (DistanceMetric.DOT_PRODUCT, DistanceMetric.INNER_PRODUCT):
            return float(np.dot(vec1, vec2))

        if metric == DistanceMetric.L1:
            # Manhattan distance to similarity. Convert to a built-in float
            # first so the arithmetic and the returned value are plain Python
            # floats, consistent with the other branches.
            distance = float(np.sum(np.abs(vec1 - vec2)))
            return 1.0 / (1.0 + distance)

        # Cosine similarity: used for DistanceMetric.COSINE and as the default
        # for any metric not handled above.
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return float(np.dot(vec1, vec2) / (norm1 * norm2))

    def _convert_distance_to_score(self, distance: float) -> float:
        """Convert a distance to a similarity score based on metric.

        Args:
            distance: Distance value

        Returns:
            Similarity score (higher is more similar):
            - cosine: ``1 - distance``
            - Euclidean / L2 / L1: ``1 / (1 + distance)``
            - anything else (e.g. dot product): ``distance`` unchanged
        """
        metric = self.metric

        if metric == DistanceMetric.COSINE:
            # Cosine distance is 1 - similarity
            return 1.0 - distance

        if metric in (DistanceMetric.EUCLIDEAN, DistanceMetric.L2) or metric == DistanceMetric.L1:
            # Convert distance to similarity
            return 1.0 / (1.0 + distance)

        # For dot product and others, higher is better
        return distance

    def _prepare_vector(
        self,
        vector: np.ndarray | list[float] | list[np.ndarray],
        normalize: bool = False,
    ) -> np.ndarray:
        """Prepare a vector for storage or search.

        Args:
            vector: Input vector (numpy array, list of floats, or list of arrays)
            normalize: Whether to normalize for cosine similarity. Rows are
                also normalized whenever the configured metric is cosine,
                regardless of this flag.

        Returns:
            Prepared ``float32`` numpy array. One-dimensional input is
            reshaped to a single row of shape ``(1, n)``.
        """
        import numpy as np

        # Convert to a float32 numpy array. A distinct local name is used so
        # the result is always an ndarray without a runtime assert.
        if isinstance(vector, list):
            if len(vector) > 0 and isinstance(vector[0], np.ndarray):
                # List of arrays - stack them
                prepared = np.vstack(vector).astype(np.float32)
            else:
                # List of floats
                prepared = np.array(vector, dtype=np.float32)
        else:
            prepared = np.asarray(vector, dtype=np.float32)

        # Ensure correct shape
        if prepared.ndim == 1:
            prepared = prepared.reshape(1, -1)

        # Normalize if needed (e.g., for cosine similarity)
        if normalize or self.metric == DistanceMetric.COSINE:
            norms = np.linalg.norm(prepared, axis=1, keepdims=True)
            norms[norms == 0] = 1  # Avoid division by zero
            prepared = prepared / norms

        return cast("np.ndarray", prepared)

    def _apply_metadata_filter(
        self,
        candidates: list[tuple[Any, dict]],
        filter: dict[str, Any],
    ) -> list[tuple[Any, dict]]:
        """Apply metadata filter to candidates.

        Args:
            candidates: List of (id, metadata) tuples
            filter: Filter criteria as key-value pairs. The name shadows the
                builtin but is kept because it is part of the signature.

        Returns:
            ``candidates`` itself when ``filter`` is empty; otherwise a new
            list with the candidates whose metadata equals the filter value
            for every filter key (a missing key is compared as ``None``).
        """
        if not filter:
            return candidates

        return [
            (item_id, metadata)
            for item_id, metadata in candidates
            if all(metadata.get(key) == value for key, value in filter.items())
        ]

    def __repr__(self) -> str:
        """String representation."""
        return (
            f"{self.__class__.__name__}("
            f"dimensions={self.dimensions}, "
            f"metric={self.metric.value})"
        )