Skip to content

Commit eca2212

Browse files
lexgeniusclaude
and committed
fix: bypass LiteLLM for Ollama embeddings to resolve 400 Bad Request
LiteLLM's Ollama embedding handler sends a malformed request to Ollama's /api/embed endpoint, causing a 400 Bad Request error on Ollama 0.18.x.

- Add `_ollama_embed()` to `LiteLLMEmbeddingWrapper` that calls Ollama's `/api/embed` directly via httpx, stripping the "ollama/" prefix from the model name (the root cause of the malformed request)
- Route `embed_query` and `embed_documents` through this helper when provider == "ollama", bypassing LiteLLM entirely
- Wrap `search_similarity_threshold` in try/except so an embedding failure returns [] instead of crashing the agent

Fixes #1425

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 69e1774 commit eca2212

2 files changed

Lines changed: 26 additions & 7 deletions

File tree

models.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -593,13 +593,25 @@ def __init__(
593593
**kwargs: Any,
594594
):
595595
self.model_name = f"{provider}/{model}" if provider != "openai" else model
596+
self.provider = provider
596597
self.kwargs = kwargs
597598
self.a0_model_conf = model_config
598599

600+
def _ollama_embed(self, inputs: List[str]) -> List[List[float]]:
601+
import httpx
602+
api_base = self.kwargs.get("api_base", "http://host.docker.internal:11434").rstrip("/")
603+
model = self.model_name.split("/", 1)[-1] # strip "ollama/" prefix
604+
resp = httpx.post(f"{api_base}/api/embed", json={"model": model, "input": inputs}, timeout=60)
605+
resp.raise_for_status()
606+
return resp.json()["embeddings"]
607+
599608
def embed_documents(self, texts: List[str]) -> List[List[float]]:
600609
# Apply rate limiting if configured
601610
apply_rate_limiter_sync(self.a0_model_conf, " ".join(texts))
602611

612+
if self.provider == "ollama":
613+
return self._ollama_embed(texts)
614+
603615
resp = embedding(model=self.model_name, input=texts, **self.kwargs)
604616
return [
605617
item.get("embedding") if isinstance(item, dict) else item.embedding # type: ignore
def embed_query(self, text: str) -> List[float]:
    """Return the embedding vector for a single query string."""
    # Honor any configured rate limit before hitting the provider.
    apply_rate_limiter_sync(self.a0_model_conf, text)

    # LiteLLM's Ollama embedding handler sends a malformed request
    # (400 Bad Request on Ollama 0.18.x) — talk to Ollama directly.
    if self.provider == "ollama":
        return self._ollama_embed([text])[0]

    response = embedding(model=self.model_name, input=[text], **self.kwargs)
    first = response.data[0]  # type: ignore
    if isinstance(first, dict):
        return first.get("embedding")  # type: ignore
    return first.embedding  # type: ignore

plugins/_memory/helpers/memory.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -337,13 +337,17 @@ async def search_similarity_threshold(
337337
):
338338
comparator = Memory._get_comparator(filter) if filter else None
339339

340-
return await self.db.asearch(
341-
query,
342-
search_type="similarity_score_threshold",
343-
k=limit,
344-
score_threshold=threshold,
345-
filter=comparator,
346-
)
340+
try:
341+
return await self.db.asearch(
342+
query,
343+
search_type="similarity_score_threshold",
344+
k=limit,
345+
score_threshold=threshold,
346+
filter=comparator,
347+
)
348+
except Exception as e:
349+
PrintStyle(font_color="yellow").print(f"Memory search failed (embedding error): {e}")
350+
return []
347351

348352
async def delete_documents_by_query(
349353
self, query: str, threshold: float, filter: str = ""

0 commit comments

Comments
 (0)