Coverage for src \ truenex_memory \ core \ embedder.py: 98%
63 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-19 10:21 +0200
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-19 10:21 +0200
1"""Local embedding primitives for offline retrieval tests."""
3from __future__ import annotations
5from dataclasses import dataclass
6import hashlib
7import math
8import re
9from typing import Protocol
12TARGET_EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
13DEFAULT_EMBEDDING_DIMENSIONS = 384
@dataclass(frozen=True)
class EmbedderMetadata:
    """Immutable description of an embedding backend and its target model.

    Instances are frozen, so fields cannot be reassigned after construction.
    The first three fields are required; the remaining flags have defaults.
    """

    # Short identifier of the backend producing the vectors.
    backend: str
    # Name of the model the vectors are associated with.
    model_name: str
    # Length of every vector produced by the backend.
    dimensions: int
    # Whether produced vectors are normalized (defaults to True).
    normalized: bool = True
    # Whether the backend needs network access (defaults to False).
    requires_network: bool = False
    # Whether the backend downloads model weights (defaults to False).
    downloads_model: bool = False
class LocalEmbedder(Protocol):
    """Structural interface for local embedding backends usable in tests.

    Any object exposing ``metadata``, ``embed_query`` and ``embed_documents``
    with these signatures satisfies the protocol; no inheritance is required.
    """

    @property
    def metadata(self) -> EmbedderMetadata:
        """Describe the backend for diagnostics and vector-store setup."""
        ...

    def embed_query(self, text: str) -> list[float]:
        """Return the embedding vector for a retrieval query."""
        ...

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Return one embedding vector per document or chunk in *texts*."""
        ...
class HashingEmbedder:
    """Deterministic local embedder that never downloads model weights.

    The metadata names ``intfloat/multilingual-e5-base`` as the target model so
    persisted vectors can declare their intended production replacement, while
    tests keep a small dependency-free backend.

    Vectors are built with a signed hashing scheme: every token is hashed with
    BLAKE2b, part of the digest selects a vector slot and another byte selects
    the sign of the contribution. The accumulated vector is then passed through
    ``_normalize``.
    """

    def __init__(self, dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS) -> None:
        """Create an embedder producing vectors of length *dimensions*.

        Raises:
            ValueError: If *dimensions* is smaller than one.
        """
        if dimensions < 1:
            raise ValueError("dimensions must be greater than zero")
        self._metadata = EmbedderMetadata(
            backend="hashing",
            model_name=TARGET_EMBEDDING_MODEL,
            dimensions=dimensions,
        )

    @property
    def model_name(self) -> str:
        """Return a stable persisted model/backend identifier."""
        return f"{self.metadata.backend}-fallback:{self.metadata.model_name}"

    @property
    def dimensions(self) -> int:
        """Return embedding dimensionality."""
        return self.metadata.dimensions

    def embed(self, text: str) -> list[float]:
        """Embed text without query/passage prefixes for generic local retrieval.

        Raises:
            ValueError: If *text* is empty or whitespace-only.
        """
        _validate_text(text)
        return self._embed(text)

    @property
    def metadata(self) -> EmbedderMetadata:
        """Return the metadata record created at construction time."""
        return self._metadata

    def embed_query(self, text: str) -> list[float]:
        """Embed a retrieval query, prefixing it with ``"query: "``.

        Raises:
            ValueError: If *text* is empty or whitespace-only.
        """
        _validate_text(text)
        return self._embed(f"query: {text}")

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed each document, prefixing it with ``"passage: "``.

        Returns:
            One vector per input text, in input order.

        Raises:
            ValueError: If any text is empty or whitespace-only.
        """
        # Validate and embed in a single pass so that one-shot iterables
        # (e.g. generators) are not exhausted by a separate validation loop,
        # which would silently yield an empty result.
        vectors: list[list[float]] = []
        for text in texts:
            _validate_text(text)
            vectors.append(self._embed(f"passage: {text}"))
        return vectors

    def _embed(self, text: str) -> list[float]:
        """Hash the tokens of *text* into a signed, normalized vector."""
        size = self.metadata.dimensions
        vector = [0.0] * size
        for token in _tokens(text):
            digest = hashlib.blake2b(token.encode("utf-8"), digest_size=16).digest()
            # First 8 digest bytes (big-endian) choose the slot ...
            index = int.from_bytes(digest[:8], "big") % size
            # ... and the parity of the ninth byte chooses the sign.
            sign = 1.0 if digest[8] % 2 == 0 else -1.0
            vector[index] += sign
        return _normalize(vector)
100def _validate_text(text: str) -> None:
101 if not text.strip():
102 raise ValueError("text cannot be empty")
105def _tokens(text: str) -> list[str]:
106 return [token.lower() for token in re.findall(r"\w+", text, flags=re.UNICODE)]
109def _normalize(vector: list[float]) -> list[float]:
110 norm = math.sqrt(sum(value * value for value in vector))
111 if norm == 0:
112 return vector
113 return [value / norm for value in vector]