Skip to content

Commit e352097

Browse files
Mijamind719 and codex committed
fix: fallback local batch embedding to sequential mode
Co-authored-by: GPT-5.4 <noreply@openai.com>
1 parent dd16683 commit e352097

2 files changed

Lines changed: 59 additions & 9 deletions

File tree

openviking/models/embedder/local_embedders.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -205,16 +205,16 @@ def _extract_embeddings(payload: Any) -> List[List[float]]:
205205
return vectors
206206
raise RuntimeError("Unexpected llama-cpp-python batch embedding response format")
207207

208+
def _embed_formatted_text(self, formatted: str) -> EmbedResult:
    """Embed one already-formatted string via the llama backend.

    Shared by the single-text path and the sequential batch fallback so
    both produce identical per-text results.
    """
    response = self._llama.create_embedding(formatted)
    vector = self._extract_embedding(response)
    return EmbedResult(dense_vector=vector)
211+
208212
def embed(self, text: str, is_query: bool = False) -> EmbedResult:
209213
formatted = self._format_text(text, is_query=is_query)
210214

211-
def _call() -> EmbedResult:
212-
payload = self._llama.create_embedding(formatted)
213-
return EmbedResult(dense_vector=self._extract_embedding(payload))
214-
215215
try:
216216
result = self._run_with_retry(
217-
_call,
217+
lambda: self._embed_formatted_text(formatted),
218218
logger=logger,
219219
operation_name="local embedding",
220220
)
@@ -236,20 +236,35 @@ def embed_batch(self, texts: List[str], is_query: bool = False) -> List[EmbedRes
236236

237237
formatted = [self._format_text(text, is_query=is_query) for text in texts]
238238

239-
def _call() -> List[EmbedResult]:
239+
def _call_batch() -> List[EmbedResult]:
240240
payload = self._llama.create_embedding(formatted)
241241
return [
242242
EmbedResult(dense_vector=vector) for vector in self._extract_embeddings(payload)
243243
]
244244

245245
try:
246246
results = self._run_with_retry(
247-
_call,
247+
_call_batch,
248248
logger=logger,
249249
operation_name="local batch embedding",
250250
)
251-
except Exception as exc:
252-
raise RuntimeError(f"Local batch embedding failed: {exc}") from exc
251+
except Exception as batch_exc:
252+
logger.warning(
253+
"Local batch embedding failed for model=%s (%s); falling back to sequential embedding",
254+
self.model_name,
255+
batch_exc,
256+
)
257+
try:
258+
results = [
259+
self._run_with_retry(
260+
lambda formatted_text=text: self._embed_formatted_text(formatted_text),
261+
logger=logger,
262+
operation_name="local sequential batch embedding",
263+
)
264+
for text in formatted
265+
]
266+
except Exception as exc:
267+
raise RuntimeError(f"Local batch embedding failed: {exc}") from exc
253268

254269
estimated_tokens = sum(self._estimate_tokens(text) for text in formatted)
255270
self.update_token_usage(

tests/unit/test_local_embedder.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,14 @@ def create_embedding(self, payload):
5252
return {"data": [{"embedding": [0.1] * 512}]}
5353

5454

55+
class _FakeLlamaFailBatch(_FakeLlama):
    """Fake llama backend that rejects multi-item batches.

    Single-string payloads succeed, so the embedder's sequential
    fallback path can be exercised deterministically.
    """

    def create_embedding(self, payload):
        # Record every payload so tests can inspect the call sequence.
        self.__class__.inputs.append(payload)
        is_multi_item_batch = isinstance(payload, list) and len(payload) > 1
        if is_multi_item_batch:
            raise RuntimeError("llama_decode returned -1")
        return {"data": [{"embedding": [0.2] * 512}]}
61+
62+
5563
@pytest.fixture(autouse=True)
5664
def _reset_fake_llama():
5765
_FakeLlama.init_kwargs = []
@@ -149,3 +157,30 @@ def test_local_embedder_embed_batch_preserves_count(monkeypatch, tmp_path):
149157
f"{DEFAULT_BGE_ZH_QUERY_INSTRUCTION}a",
150158
f"{DEFAULT_BGE_ZH_QUERY_INSTRUCTION}b",
151159
]
160+
161+
162+
def test_local_embedder_embed_batch_falls_back_to_sequential(monkeypatch, tmp_path):
    """A failing batch call must fall back to one embedding call per text."""
    model_path = tmp_path / "model.gguf"
    model_path.write_bytes(b"gguf")

    _FakeLlamaFailBatch.init_kwargs = []
    _FakeLlamaFailBatch.inputs = []

    monkeypatch.setattr(
        "openviking.models.embedder.local_embedders.importlib.import_module",
        lambda _name: SimpleNamespace(Llama=_FakeLlamaFailBatch),
    )

    embedder = LocalDenseEmbedder(model_path=str(model_path))
    results = embedder.embed_batch(["a", "b"], is_query=True)

    formatted = [
        f"{DEFAULT_BGE_ZH_QUERY_INSTRUCTION}a",
        f"{DEFAULT_BGE_ZH_QUERY_INSTRUCTION}b",
    ]

    assert len(results) == 2
    assert all(len(item.dense_vector) == 512 for item in results)
    # Call 0 is the rejected batch (a list of both formatted texts);
    # the remaining calls are the sequential retries, one string each.
    assert _FakeLlamaFailBatch.inputs[0] == formatted
    assert _FakeLlamaFailBatch.inputs[1:] == formatted

0 commit comments

Comments
 (0)