Source code for jeevesagent.memory.embedder

"""Embedders that turn text into vectors.

Two implementations land in this slice:

* :class:`HashEmbedder` — deterministic, zero-dep, SHA256-seeded
  Gaussian sample. Same text → same vector. Perfect for tests, dev,
  and for memory backends that only need *some* vector to enable
  recall without the cost of a real embedding API.
* :class:`OpenAIEmbedder` — wraps OpenAI's
  ``text-embedding-3-{small,large}`` via the official ``openai`` SDK.
  Lazy SDK import inside ``__init__`` so the module loads without
  ``openai`` installed; the import only fires when constructing
  without ``client=``.
"""

from __future__ import annotations

import hashlib
import math
import os
import random
from typing import Any

DEFAULT_HASH_DIMENSIONS = 384


class HashEmbedder:
    """Deterministic, dependency-free embedder backed by SHA256 seeding.

    For every input text, the SHA256 digest of its UTF-8 bytes seeds a
    dedicated ``random.Random``; ``dimensions`` Gaussian draws are taken
    and L2-normalised into a unit vector. Identical texts therefore map
    to identical vectors, while distinct texts land on well-spread
    directions — cosine distance tracks literal text equality, not
    semantic similarity.

    Intended for tests (fast, offline) and for in-memory backends that
    merely need *a* vector to enable recall, not real semantic search.
    """

    def __init__(self, dimensions: int = DEFAULT_HASH_DIMENSIONS) -> None:
        # Fail fast on a nonsensical vector width.
        if dimensions <= 0:
            raise ValueError(f"dimensions must be positive, got {dimensions}")
        self.name: str = f"hash-embedder-{dimensions}"
        self.dimensions: int = dimensions

    async def embed(self, text: str) -> list[float]:
        """Return the deterministic unit vector for ``text``."""
        seed = hashlib.sha256(text.encode("utf-8")).digest()
        sampler = random.Random(seed)
        raw = [sampler.gauss(0.0, 1.0) for _ in range(self.dimensions)]
        length = math.sqrt(sum(component * component for component in raw))
        # A zero norm is astronomically unlikely but would divide by zero.
        if length > 0.0:
            return [component / length for component in raw]
        return raw

    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Embed each text independently.

        Seeding is per-text, so batching is exactly equivalent to
        calling :meth:`embed` once per item.
        """
        vectors: list[list[float]] = []
        for item in texts:
            vectors.append(await self.embed(item))
        return vectors
[docs] class OpenAIEmbedder: """Embeddings via OpenAI's ``embeddings.create`` API. Dimensions are fixed by the model: * ``text-embedding-3-small`` -> 1536 * ``text-embedding-3-large`` -> 3072 * ``text-embedding-ada-002`` -> 1536 Pass ``dimensions=`` only for ``text-embedding-3-*`` models, which support the ``dimensions`` parameter for projection. """ _DEFAULT_DIMS: dict[str, int] = { "text-embedding-3-small": 1536, "text-embedding-3-large": 3072, "text-embedding-ada-002": 1536, } def __init__( self, model: str = "text-embedding-3-small", *, dimensions: int | None = None, client: Any | None = None, api_key: str | None = None, ) -> None: self.name: str = model self.dimensions: int = dimensions or self._DEFAULT_DIMS.get(model, 1536) self._explicit_dimensions = dimensions if client is not None: self._client = client else: try: from openai import AsyncOpenAI except ImportError as exc: # pragma: no cover raise ImportError( "OpenAI SDK not installed. " "Install with: pip install 'jeevesagent[openai]'" ) from exc self._client = AsyncOpenAI( api_key=api_key or os.environ.get("OPENAI_API_KEY"), )
[docs] async def embed(self, text: str) -> list[float]: kwargs: dict[str, Any] = {"model": self.name, "input": text} if self._explicit_dimensions is not None: kwargs["dimensions"] = self._explicit_dimensions result = await self._client.embeddings.create(**kwargs) embedding = result.data[0].embedding return list(embedding)
[docs] async def embed_batch(self, texts: list[str]) -> list[list[float]]: if not texts: return [] kwargs: dict[str, Any] = {"model": self.name, "input": texts} if self._explicit_dimensions is not None: kwargs["dimensions"] = self._explicit_dimensions result = await self._client.embeddings.create(**kwargs) # OpenAI returns data sorted by request order. return [list(item.embedding) for item in result.data]
# --------------------------------------------------------------------------- # Voyage AI # ---------------------------------------------------------------------------
[docs] class VoyageEmbedder: """Embeddings via Voyage AI's ``voyageai`` SDK. Models and dimensions: * ``voyage-3`` / ``voyage-3-large`` / ``voyage-code-3`` -> 1024 * ``voyage-3-lite`` -> 512 ``input_type`` controls how Voyage encodes the text: * ``"document"`` (default) — for corpus / fact-store entries * ``"query"`` — for retrieval queries Pass an explicit ``input_type=`` if your embedder is dedicated to one role; for the agent loop's mixed use (we embed both stored triples and recall queries through the same embedder), the ``"document"`` default is the safer choice. """ _DEFAULT_DIMS: dict[str, int] = { "voyage-3": 1024, "voyage-3-large": 1024, "voyage-code-3": 1024, "voyage-3-lite": 512, } def __init__( self, model: str = "voyage-3", *, client: Any | None = None, api_key: str | None = None, input_type: str = "document", ) -> None: self.name: str = model self.dimensions: int = self._DEFAULT_DIMS.get(model, 1024) self._input_type = input_type if client is not None: self._client = client else: try: import voyageai # type: ignore[import-not-found, import-untyped] except ImportError as exc: # pragma: no cover raise ImportError( "voyageai is not installed. " "Install with: pip install 'jeevesagent[voyage]'" ) from exc # voyageai ships py.typed in newer releases but doesn't # re-export AsyncClient from its package __init__. The # class exists at runtime (defined in voyageai.client_async # and bound on the package). Locally without the stubs the # whole module is Any and this never fires; CI with the # stubs sees attr-defined — hence the inline ignore. # Paired with disable_error_code = ["unused-ignore"] in # pyproject so the ignore is a no-op in stub-less envs. self._client = voyageai.AsyncClient( # type: ignore[attr-defined] api_key=api_key or os.environ.get("VOYAGE_API_KEY"), )
[docs] async def embed(self, text: str) -> list[float]: result = await self._client.embed( texts=[text], model=self.name, input_type=self._input_type, ) return list(result.embeddings[0])
[docs] async def embed_batch(self, texts: list[str]) -> list[list[float]]: if not texts: return [] result = await self._client.embed( texts=texts, model=self.name, input_type=self._input_type, ) return [list(e) for e in result.embeddings]
# --------------------------------------------------------------------------- # Cohere # ---------------------------------------------------------------------------
[docs] class CohereEmbedder: """Embeddings via Cohere's ``cohere`` SDK. Models and dimensions: * ``embed-english-v3.0`` / ``embed-multilingual-v3.0`` -> 1024 * ``embed-english-light-v3.0`` / ``embed-multilingual-light-v3.0`` -> 384 ``input_type`` is required by Cohere v3 models: * ``"search_document"`` (default) — corpus / fact-store entries * ``"search_query"`` — retrieval queries * ``"classification"`` / ``"clustering"`` for non-retrieval uses """ _DEFAULT_DIMS: dict[str, int] = { "embed-english-v3.0": 1024, "embed-multilingual-v3.0": 1024, "embed-english-light-v3.0": 384, "embed-multilingual-light-v3.0": 384, } def __init__( self, model: str = "embed-english-v3.0", *, client: Any | None = None, api_key: str | None = None, input_type: str = "search_document", ) -> None: self.name: str = model self.dimensions: int = self._DEFAULT_DIMS.get(model, 1024) self._input_type = input_type if client is not None: self._client = client else: try: import cohere # type: ignore[import-not-found, import-untyped] except ImportError as exc: # pragma: no cover raise ImportError( "cohere is not installed. " "Install with: pip install 'jeevesagent[cohere]'" ) from exc self._client = cohere.AsyncClient( api_key=api_key or os.environ.get("COHERE_API_KEY"), )
[docs] async def embed(self, text: str) -> list[float]: result = await self._client.embed( texts=[text], model=self.name, input_type=self._input_type, embedding_types=["float"], ) return list(result.embeddings.float[0])
[docs] async def embed_batch(self, texts: list[str]) -> list[list[float]]: if not texts: return [] result = await self._client.embed( texts=texts, model=self.name, input_type=self._input_type, embedding_types=["float"], ) return [list(e) for e in result.embeddings.float]