fix(core): self-heal corrupt FastEmbed model cache (#900)

phernandez · claude · web-flow · commit 5b034f081d0a · 2026-06-07T17:37:30.000-05:00
Signed-off-by: phernandez <paul@basicmachines.co>
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/src/basic_memory/mcp/tools/search.py b/src/basic_memory/mcp/tools/search.py
@@ -83,6 +83,47 @@ def _format_search_error_response(
                `search_notes("{project}", "{query}", search_type="{search_type}")`
             """).strip()
 
+    # Corrupt/missing FastEmbed model cache (interrupted download leaves a partial
+    # snapshot missing model_optimized.onnx; the ONNX runtime then raises NO_SUCHFILE).
+    # Basic Memory self-heals by re-downloading on the next load, but if the user still
+    # hits this, point them at the cache dir to clear manually and offer a text fallback.
+    error_lower = error_message.lower()
+    # "load model from" is the exact ONNX phrasing ("Load model from <path>.onnx failed").
+    # The looser "load model" matched unrelated errors, so we keep only the specific phrase
+    # alongside the onnxruntime / no_suchfile / model_optimized.onnx fingerprints.
+    if (
+        "onnxruntime" in error_lower
+        or "no_suchfile" in error_lower
+        or "model_optimized.onnx" in error_lower
+        or "load model from" in error_lower
+    ):
+        # Deferred import: keeps the repository layer out of the tool's import graph
+        # (matches the SearchClient deferral below) and is only needed on this error path.
+        from basic_memory.repository.embedding_provider_factory import _resolve_cache_dir
+
+        try:
+            cache_dir = _resolve_cache_dir(get_container().config)
+        except RuntimeError:
+            cache_dir = _resolve_cache_dir(ConfigManager().config)
+        return dedent(f"""
+            # Search Failed - Embedding Model Missing or Corrupt
+
+            The local FastEmbed model could not be loaded for query '{query}': {error_message}
+
+            This usually means an earlier model download was interrupted and left an
+            incomplete file in the model cache.
+
+            ## How to fix
+            1. Delete the FastEmbed model cache so it re-downloads on the next search:
+               `{cache_dir}`
+            2. Run your search again (the model downloads automatically on first use):
+               `search_notes("{project}", "{query}", search_type="{search_type}")`
+
+            ## Workaround right now
+            - Use full-text search, which needs no embedding model:
+              `search_notes("{project}", "{query}", search_type="text")`
+            """).strip()
+
     # FTS5 syntax errors
     if "syntax error" in error_message.lower() or "fts5" in error_message.lower():
         clean_query = (
diff --git a/src/basic_memory/repository/fastembed_provider.py b/src/basic_memory/repository/fastembed_provider.py
@@ -4,6 +4,8 @@
 
 import asyncio
 import math
+import shutil
+from pathlib import Path
 from typing import TYPE_CHECKING
 
 from loguru import logger
@@ -15,6 +17,24 @@
     from fastembed import TextEmbedding  # pragma: no cover
 
 
+# Substrings that identify the ONNX "model artifact file is missing" load failure (as
+# opposed to a config error, a download/network error, or a genuinely offline machine).
+# An interrupted FastEmbed download can leave the HuggingFace snapshot dir present but
+# missing ``model_optimized.onnx``; the ONNX runtime then raises ``NO_SUCHFILE`` and every
+# subsequent load repeats it until the cache is cleared. Matched case-insensitively.
+#
+# IMPORTANT: this text match is necessary but NOT sufficient to trigger a purge. The error
+# text alone cannot distinguish a corrupt cache from a normal cold load (model not yet
+# downloaded). Purging is gated on a positive filesystem confirmation that the snapshot dir
+# exists on disk but the model artifact file is missing — see ``_corrupt_model_subdirs``.
+_MISSING_ARTIFACT_ERROR_MARKERS = (
+    "no_suchfile",
+    "model_optimized.onnx",
+    "file doesn't exist",
+    "no such file",
+)
+
+
 class FastEmbedEmbeddingProvider(EmbeddingProvider):
     """Local ONNX embedding provider backed by FastEmbed."""
 
@@ -53,6 +73,156 @@ def __init__(
         self._model: TextEmbedding | None = None
         self._model_lock = asyncio.Lock()
 
+    def _resolved_model_name(self) -> str:
+        """Return the FastEmbed model name after applying our local aliases."""
+        return self._MODEL_ALIASES.get(self.model_name, self.model_name)
+
+    def _create_model(self) -> "TextEmbedding":
+        try:
+            from fastembed import TextEmbedding
+        except ImportError as exc:  # pragma: no cover - exercised via tests with monkeypatch
+            raise SemanticDependenciesMissingError(
+                "fastembed package is missing. "
+                "Install/update basic-memory to include semantic dependencies: "
+                "pip install -U basic-memory"
+            ) from exc
+        resolved_model_name = self._resolved_model_name()
+        if self.cache_dir is not None and self.threads is not None:
+            return TextEmbedding(
+                model_name=resolved_model_name,
+                cache_dir=self.cache_dir,
+                threads=self.threads,
+            )
+        if self.cache_dir is not None:
+            return TextEmbedding(model_name=resolved_model_name, cache_dir=self.cache_dir)
+        if self.threads is not None:
+            return TextEmbedding(model_name=resolved_model_name, threads=self.threads)
+        return TextEmbedding(model_name=resolved_model_name)
+
+    def _model_cache_candidates(self) -> list[tuple[Path, str]]:
+        """Resolve ``(snapshot_dir, model_file)`` pairs for this model under ``cache_dir``.
+
+        FastEmbed stores each model under ``<cache_dir>/models--<org>--<repo>`` where the
+        repo is the model's HuggingFace source (e.g. ``BAAI/bge-small-en-v1.5`` resolves to
+        ``models--qdrant--bge-small-en-v1.5-onnx-q``). We resolve the source and the expected
+        model artifact filename from FastEmbed's own model description so corruption detection
+        and deletion are scoped to exactly this model's tree — never the whole cache or
+        unrelated models.
+
+        Note: ``TextEmbedding._list_supported_models()`` is an intentional use of an
+        undocumented FastEmbed API. The broad ``except`` below is a known defensive fallback:
+        if the lookup ever changes shape we degrade to "no candidates" (so we never purge)
+        rather than crashing the load path.
+        """
+        if self.cache_dir is None:
+            return []
+
+        # FastEmbed matches model names case-insensitively (model_management.py:
+        # ``model_name.lower() == model.model.lower()``). Mirror that here so a config like
+        # model="baai/bge-small-en-v1.5" still resolves to the same HF source/cache subdir.
+        resolved_model_name = self._resolved_model_name().lower()
+        candidates: list[tuple[Path, str]] = []
+        seen: set[Path] = set()
+        cache_root = Path(self.cache_dir)
+        try:
+            from fastembed import TextEmbedding
+
+            for description in TextEmbedding._list_supported_models():
+                if description.model.lower() != resolved_model_name:
+                    continue
+                hf_source = description.sources.hf
+                model_file = description.model_file
+                if not hf_source or not model_file:
+                    continue
+                # HuggingFace hub names cache dirs ``models--<repo with '/' -> '--'>``.
+                snapshot_dir = cache_root / f"models--{hf_source.replace('/', '--')}"
+                if snapshot_dir not in seen:
+                    seen.add(snapshot_dir)
+                    candidates.append((snapshot_dir, model_file))
+        except Exception as exc:  # pragma: no cover - defensive: never block load on lookup
+            logger.warning(
+                "Could not resolve FastEmbed model source for cache cleanup: "
+                "model_name={model_name} error={error}",
+                model_name=resolved_model_name,
+                error=exc,
+            )
+
+        return candidates
+
+    def _corrupt_model_subdirs(self) -> list[Path]:
+        """Return cache subdirs that are POSITIVELY confirmed corrupt by filesystem state.
+
+        A model is corrupt when its HuggingFace cache dir exists on disk but either no
+        snapshot revision has materialized or at least one revision is missing the expected
+        model artifact file (e.g. ``model_optimized.onnx``) — the fingerprint of an
+        interrupted download. A normal cold load (no cache dir yet) is NOT corruption and
+        yields no entries here, so it can never trigger a purge.
+
+        Inspection is PER-REVISION on purpose: HuggingFace keeps multiple revisions under one
+        ``models--<repo>`` tree, so a corrupt current snapshot can coexist with an older
+        complete one. Checking ``rglob(model_file)`` across the whole tree would let the old
+        artifact mask the broken current revision and leave it self-perpetuating, so we
+        require every revision to carry the artifact.
+        """
+        corrupt: list[Path] = []
+        for model_dir, model_file in self._model_cache_candidates():
+            # Trigger: the model's cache dir does not exist at all.
+            # Why: this is a normal cold/first load — the model simply hasn't been
+            #      downloaded yet. Purging here would be wrong and pointless.
+            # Outcome: skip; not corrupt.
+            if not model_dir.exists():
+                continue
+            snapshots_root = model_dir / "snapshots"
+            revision_dirs = (
+                [d for d in snapshots_root.iterdir() if d.is_dir()]
+                if snapshots_root.is_dir()
+                else []
+            )
+            # Trigger: the cache dir exists but no snapshot revision has materialized.
+            # Why/Outcome: an interrupted download that never wrote a revision — corrupt.
+            if not revision_dirs:
+                corrupt.append(model_dir)
+                continue
+            # Trigger: any individual revision is missing the artifact (rglob covers the
+            # artifact at any depth within that revision, e.g. snapshots/<rev>/onnx/...).
+            # Why: a complete OLD revision must not mask a corrupt CURRENT one.
+            # Outcome: flag the model dir so the whole tree re-downloads cleanly.
+            if any(not any(rev.rglob(model_file)) for rev in revision_dirs):
+                corrupt.append(model_dir)
+        return corrupt
+
+    def _purge_model_subdirs(self, subdirs: list[Path]) -> bool:
+        """Delete confirmed-corrupt cache subtrees so the next load re-downloads them.
+
+        Returns True when at least one targeted subdir is actually gone afterwards. On
+        Windows a locked file can make ``shutil.rmtree(ignore_errors=True)`` silently no-op;
+        reporting success in that case would let the caller retry against the same broken
+        cache, so each subdir only counts as removed once it has actually disappeared.
+        """
+        removed_any = False
+        for subdir in subdirs:
+            logger.warning(
+                "Removing corrupt FastEmbed model cache to force re-download: {path}",
+                path=str(subdir),
+            )
+            shutil.rmtree(subdir, ignore_errors=True)
+            # Set removed only when the subdir is truly gone — a silent rmtree no-op
+            # (e.g. a locked file on Windows) must not be reported as a successful purge.
+            if not subdir.exists():
+                removed_any = True
+        return removed_any
+
+    @staticmethod
+    def _is_missing_artifact_error(exc: Exception) -> bool:
+        """Return True when the load failure text matches the ONNX missing-artifact signature.
+
+        This is only the text-level gate; it is necessary but NOT sufficient to purge. The
+        purge additionally requires filesystem-confirmed corruption (``_corrupt_model_subdirs``)
+        so a transient/offline/"from any source" load error never deletes a valid cache.
+        """
+        message = str(exc).lower()
+        return any(marker in message for marker in _MISSING_ARTIFACT_ERROR_MARKERS)
+
     async def _load_model(self) -> "TextEmbedding":
         if self._model is not None:
             return self._model
@@ -61,36 +231,42 @@ async def _load_model(self) -> "TextEmbedding":
             if self._model is not None:
                 return self._model
 
-            def _create_model() -> "TextEmbedding":
-                try:
-                    from fastembed import TextEmbedding
-                except (
-                    ImportError
-                ) as exc:  # pragma: no cover - exercised via tests with monkeypatch
-                    raise SemanticDependenciesMissingError(
-                        "fastembed package is missing. "
-                        "Install/update basic-memory to include semantic dependencies: "
-                        "pip install -U basic-memory"
-                    ) from exc
-                resolved_model_name = self._MODEL_ALIASES.get(self.model_name, self.model_name)
-                if self.cache_dir is not None and self.threads is not None:
-                    return TextEmbedding(
-                        model_name=resolved_model_name,
-                        cache_dir=self.cache_dir,
-                        threads=self.threads,
-                    )
-                if self.cache_dir is not None:
-                    return TextEmbedding(model_name=resolved_model_name, cache_dir=self.cache_dir)
-                if self.threads is not None:
-                    return TextEmbedding(model_name=resolved_model_name, threads=self.threads)
-                return TextEmbedding(model_name=resolved_model_name)
-
-            self._model = await asyncio.to_thread(_create_model)
+            try:
+                self._model = await asyncio.to_thread(self._create_model)
+            except Exception as exc:
+                # Trigger: model construction raised the ONNX missing-artifact error AND a
+                #          filesystem check positively confirms a corrupt cache subdir (the
+                #          snapshot dir exists but the model artifact file is missing — the
+                #          fingerprint of an interrupted download).
+                # Why: the raw ONNXRuntimeError is self-perpetuating — every retry hits the
+                #      same broken snapshot until the cache is cleared. We must NOT misread a
+                #      normal cold load (no snapshot dir, model simply not downloaded yet) or a
+                #      transient/offline "from any source" error as corruption, because purging
+                #      then breaks the happy path. Both the error-text gate and the positive
+                #      filesystem confirmation are required before we delete anything.
+                # Outcome: confirmed corruption → purge exactly this model's subdir and retry
+                #          once so a fresh download can land. Without a successful purge we
+                #          re-raise the ORIGINAL exception so the message stays actionable;
+                #          a failed retry propagates its own error and we never loop.
+                if not self._is_missing_artifact_error(exc):
+                    raise
+                corrupt_subdirs = self._corrupt_model_subdirs()
+                if not corrupt_subdirs:
+                    raise
+                if not self._purge_model_subdirs(corrupt_subdirs):
+                    raise
+                logger.info(
+                    "Retrying FastEmbed model load after clearing corrupt cache: "
+                    "model_name={model_name}",
+                    model_name=self._resolved_model_name(),
+                )
+                self._model = await asyncio.to_thread(self._create_model)
+
             logger.info(
                 "FastEmbed model loaded: model_name={model_name} batch_size={batch_size} "
                 "threads={threads} configured_parallel={configured_parallel} "
                 "effective_parallel={effective_parallel}",
-                model_name=self._MODEL_ALIASES.get(self.model_name, self.model_name),
+                model_name=self._resolved_model_name(),
                 batch_size=self.batch_size,
                 threads=self.threads,
                 configured_parallel=self.parallel,
diff --git a/tests/mcp/test_tool_search.py b/tests/mcp/test_tool_search.py
@@ -479,6 +479,42 @@ def test_format_search_error_semantic_dependencies_missing(self):
         assert "# Search Failed - Semantic Dependencies Missing" in result
         assert "pip install -U basic-memory" in result
 
+    def test_format_search_error_corrupt_embedding_model(self):
+        """Test formatting for a corrupt/missing FastEmbed model (ONNX NO_SUCHFILE)."""
+        from basic_memory.config import ConfigManager
+        from basic_memory.repository.embedding_provider_factory import _resolve_cache_dir
+
+        result = _format_search_error_response(
+            "test-project",
+            "[ONNXRuntimeError] : 3 : NO_SUCHFILE : Load model from "
+            "/home/u/.basic-memory/fastembed_cache/models--qdrant--bge-small-en-v1.5-onnx-q/"
+            "snapshots/abc/model_optimized.onnx failed. File doesn't exist",
+            "semantic query",
+            "hybrid",
+        )
+
+        expected_cache_dir = _resolve_cache_dir(ConfigManager().config)
+        assert "# Search Failed - Embedding Model Missing or Corrupt" in result
+        # Names the actual resolved cache dir so the user knows what to delete.
+        assert expected_cache_dir in result
+        # Offers full-text search as an immediate workaround.
+        assert 'search_type="text"' in result
+
+    def test_format_search_error_load_model_phrase_does_not_overmatch(self):
+        """A generic error mentioning 'load model' (no 'from') must not hit the embedding branch.
+
+        The marker was tightened from the broad 'load model' to the exact ONNX phrasing
+        'load model from' so unrelated failures fall through to the generic handler.
+        """
+        result = _format_search_error_response(
+            "test-project",
+            "Failed to load model configuration for this project",
+            "test query",
+        )
+
+        assert "# Search Failed - Embedding Model Missing or Corrupt" not in result
+        assert "# Search Failed" in result
+
     def test_format_search_error_generic(self):
         """Test formatting for generic errors."""
         result = _format_search_error_response("test-project", "unknown error", "test query")
diff --git a/tests/repository/test_fastembed_provider.py b/tests/repository/test_fastembed_provider.py