Coverage for src \ truenex_memory \ core \ embedder.py: 98%

63 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-19 10:21 +0200

1"""Local embedding primitives for offline retrieval tests.""" 

2 

3from __future__ import annotations 

4 

5from dataclasses import dataclass 

6import hashlib 

7import math 

8import re 

9from typing import Protocol 

10 

11 

# Name of the production model that HashingEmbedder reports in its metadata.
TARGET_EMBEDDING_MODEL = "intfloat/multilingual-e5-base"

# Default vector length used by HashingEmbedder when none is supplied.
# NOTE(review): the published multilingual-e5-base model is believed to emit
# 768-dimensional vectors — confirm that 384 is the intended default.
DEFAULT_EMBEDDING_DIMENSIONS = 384

14 

15 

@dataclass(frozen=True)
class EmbedderMetadata:
    """Immutable record describing an embedding backend.

    Captures which local backend is in use together with the production
    model it is intended to stand in for.
    """

    # Identifier of the local backend implementation (e.g. "hashing").
    backend: str
    # Name of the intended production model.
    model_name: str
    # Length of the vectors the backend is configured to produce.
    dimensions: int
    # Declared backend properties; nothing in this class enforces them.
    normalized: bool = True
    requires_network: bool = False
    downloads_model: bool = False

26 

27 

class LocalEmbedder(Protocol):
    """Structural interface for local, testable embedding backends."""

    @property
    def metadata(self) -> EmbedderMetadata:
        """Describe the backend for diagnostics and vector-store setup."""
        ...

    def embed_query(self, text: str) -> list[float]:
        """Embed a single retrieval query and return its vector."""
        ...

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed one or more documents or chunks and return their vectors."""
        ...

40 

41 

class HashingEmbedder:
    """Deterministic local embedder that never downloads model weights.

    Vectors are built by hashing each token into a signed bucket and then
    normalizing the result. The metadata names
    ``intfloat/multilingual-e5-base`` as the target model so persisted vectors
    can declare their intended production replacement, while tests keep a
    small dependency-free backend.
    """

    def __init__(self, dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS) -> None:
        """Create an embedder that emits vectors of length ``dimensions``.

        Raises:
            ValueError: If ``dimensions`` is smaller than one.
        """
        if dimensions < 1:
            raise ValueError("dimensions must be greater than zero")
        self._metadata = EmbedderMetadata(
            backend="hashing",
            model_name=TARGET_EMBEDDING_MODEL,
            dimensions=dimensions,
        )

    @property
    def model_name(self) -> str:
        """Return a stable persisted model/backend identifier.

        The value has the form ``"<backend>-fallback:<target model name>"``.
        """
        return f"{self.metadata.backend}-fallback:{self.metadata.model_name}"

    @property
    def dimensions(self) -> int:
        """Return embedding dimensionality."""
        return self.metadata.dimensions

    def embed(self, text: str) -> list[float]:
        """Embed text without query/passage prefixes for generic local retrieval.

        Raises:
            ValueError: If ``text`` is empty or whitespace-only.
        """
        _validate_text(text)
        return self._embed(text)

    @property
    def metadata(self) -> EmbedderMetadata:
        """Return the metadata record created at construction time."""
        return self._metadata

    def embed_query(self, text: str) -> list[float]:
        """Embed a retrieval query; the text is hashed with a ``query: `` prefix.

        Raises:
            ValueError: If ``text`` is empty or whitespace-only.
        """
        _validate_text(text)
        return self._embed(f"query: {text}")

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed documents; each text is hashed with a ``passage: `` prefix.

        Every text is validated before any embedding is computed, and the
        result contains one vector per input text, in input order.

        Raises:
            ValueError: If any text is empty or whitespace-only.
        """
        # Materialize once so a one-shot iterable (e.g. a generator) is not
        # exhausted by validation before the embedding pass runs.
        passages = list(texts)
        for passage in passages:
            _validate_text(passage)
        return [self._embed(f"passage: {passage}") for passage in passages]

    def _embed(self, text: str) -> list[float]:
        """Hash the tokens of ``text`` into signed buckets and normalize."""
        dimensions = self.metadata.dimensions  # loop-invariant lookup
        vector = [0.0] * dimensions
        for token in _tokens(text):
            digest = hashlib.blake2b(token.encode("utf-8"), digest_size=16).digest()
            # First 8 digest bytes choose the bucket; the parity of the ninth
            # byte chooses whether the token adds or subtracts one.
            index = int.from_bytes(digest[:8], "big") % dimensions
            sign = 1.0 if digest[8] % 2 == 0 else -1.0
            vector[index] += sign
        return _normalize(vector)

98 

99 

100def _validate_text(text: str) -> None: 

101 if not text.strip(): 

102 raise ValueError("text cannot be empty") 

103 

104 

105def _tokens(text: str) -> list[str]: 

106 return [token.lower() for token in re.findall(r"\w+", text, flags=re.UNICODE)] 

107 

108 

109def _normalize(vector: list[float]) -> list[float]: 

110 norm = math.sqrt(sum(value * value for value in vector)) 

111 if norm == 0: 

112 return vector 

113 return [value / norm for value in vector]