Skip to content

Commit 5b034f0

Browse files
phernandezclaude
andauthored
fix(core): self-heal corrupt FastEmbed model cache (#900)
Signed-off-by: phernandez <paul@basicmachines.co> Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 271c883 commit 5b034f0

4 files changed

Lines changed: 623 additions & 26 deletions

File tree

src/basic_memory/mcp/tools/search.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,47 @@ def _format_search_error_response(
8383
`search_notes("{project}", "{query}", search_type="{search_type}")`
8484
""").strip()
8585

86+
# Corrupt/missing FastEmbed model cache (interrupted download leaves a partial
87+
# snapshot missing model_optimized.onnx; the ONNX runtime then raises NO_SUCHFILE).
88+
# Basic Memory self-heals by re-downloading on the next load, but if the user still
89+
# hits this, point them at the cache dir to clear manually and offer a text fallback.
90+
error_lower = error_message.lower()
91+
# "load model from" is the exact ONNX phrasing ("Load model from <path>.onnx failed").
92+
# The looser "load model" matched unrelated errors, so we keep only the specific phrase
93+
# alongside the onnxruntime / no_suchfile / model_optimized.onnx fingerprints.
94+
if (
95+
"onnxruntime" in error_lower
96+
or "no_suchfile" in error_lower
97+
or "model_optimized.onnx" in error_lower
98+
or "load model from" in error_lower
99+
):
100+
# Deferred import: keeps the repository layer out of the tool's import graph
101+
# (matches the SearchClient deferral below) and is only needed on this error path.
102+
from basic_memory.repository.embedding_provider_factory import _resolve_cache_dir
103+
104+
try:
105+
cache_dir = _resolve_cache_dir(get_container().config)
106+
except RuntimeError:
107+
cache_dir = _resolve_cache_dir(ConfigManager().config)
108+
return dedent(f"""
109+
# Search Failed - Embedding Model Missing or Corrupt
110+
111+
The local FastEmbed model could not be loaded for query '{query}': {error_message}
112+
113+
This usually means an earlier model download was interrupted and left an
114+
incomplete file in the model cache.
115+
116+
## How to fix
117+
1. Delete the FastEmbed model cache so it re-downloads on the next search:
118+
`{cache_dir}`
119+
2. Run your search again (the model downloads automatically on first use):
120+
`search_notes("{project}", "{query}", search_type="{search_type}")`
121+
122+
## Workaround right now
123+
- Use full-text search, which needs no embedding model:
124+
`search_notes("{project}", "{query}", search_type="text")`
125+
""").strip()
126+
86127
# FTS5 syntax errors
87128
if "syntax error" in error_message.lower() or "fts5" in error_message.lower():
88129
clean_query = (

src/basic_memory/repository/fastembed_provider.py

Lines changed: 202 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
import asyncio
66
import math
7+
import shutil
8+
from pathlib import Path
79
from typing import TYPE_CHECKING
810

911
from loguru import logger
@@ -15,6 +17,24 @@
1517
from fastembed import TextEmbedding # pragma: no cover
1618

1719

20+
# Substrings that identify the ONNX "model artifact file is missing" load failure (as
21+
# opposed to a config error, a download/network error, or a genuinely offline machine).
22+
# An interrupted FastEmbed download can leave the HuggingFace snapshot dir present but
23+
# missing ``model_optimized.onnx``; the ONNX runtime then raises ``NO_SUCHFILE`` and every
24+
# subsequent load repeats it until the cache is cleared. Matched case-insensitively.
25+
#
26+
# IMPORTANT: this text match is necessary but NOT sufficient to trigger a purge. The error
27+
# text alone cannot distinguish a corrupt cache from a normal cold load (model not yet
28+
# downloaded). Purging is gated on a positive filesystem confirmation that the snapshot dir
29+
# exists on disk but the model artifact file is missing — see ``_corrupt_model_subdirs``.
30+
_MISSING_ARTIFACT_ERROR_MARKERS = (
31+
"no_suchfile",
32+
"model_optimized.onnx",
33+
"file doesn't exist",
34+
"no such file",
35+
)
36+
37+
1838
class FastEmbedEmbeddingProvider(EmbeddingProvider):
1939
"""Local ONNX embedding provider backed by FastEmbed."""
2040

@@ -53,6 +73,156 @@ def __init__(
5373
self._model: TextEmbedding | None = None
5474
self._model_lock = asyncio.Lock()
5575

76+
def _resolved_model_name(self) -> str:
77+
"""Return the FastEmbed model name after applying our local aliases."""
78+
return self._MODEL_ALIASES.get(self.model_name, self.model_name)
79+
80+
def _create_model(self) -> "TextEmbedding":
81+
try:
82+
from fastembed import TextEmbedding
83+
except ImportError as exc: # pragma: no cover - exercised via tests with monkeypatch
84+
raise SemanticDependenciesMissingError(
85+
"fastembed package is missing. "
86+
"Install/update basic-memory to include semantic dependencies: "
87+
"pip install -U basic-memory"
88+
) from exc
89+
resolved_model_name = self._resolved_model_name()
90+
if self.cache_dir is not None and self.threads is not None:
91+
return TextEmbedding(
92+
model_name=resolved_model_name,
93+
cache_dir=self.cache_dir,
94+
threads=self.threads,
95+
)
96+
if self.cache_dir is not None:
97+
return TextEmbedding(model_name=resolved_model_name, cache_dir=self.cache_dir)
98+
if self.threads is not None:
99+
return TextEmbedding(model_name=resolved_model_name, threads=self.threads)
100+
return TextEmbedding(model_name=resolved_model_name)
101+
102+
def _model_cache_candidates(self) -> list[tuple[Path, str]]:
103+
"""Resolve ``(snapshot_dir, model_file)`` pairs for this model under ``cache_dir``.
104+
105+
FastEmbed stores each model under ``<cache_dir>/models--<org>--<repo>`` where the
106+
repo is the model's HuggingFace source (e.g. ``BAAI/bge-small-en-v1.5`` resolves to
107+
``models--qdrant--bge-small-en-v1.5-onnx-q``). We resolve the source and the expected
108+
model artifact filename from FastEmbed's own model description so corruption detection
109+
and deletion are scoped to exactly this model's tree — never the whole cache or
110+
unrelated models.
111+
112+
Note: ``TextEmbedding._list_supported_models()`` is an intentional use of an
113+
undocumented FastEmbed API. The broad ``except`` below is a known defensive fallback:
114+
if the lookup ever changes shape we degrade to "no candidates" (so we never purge)
115+
rather than crashing the load path.
116+
"""
117+
if self.cache_dir is None:
118+
return []
119+
120+
# FastEmbed matches model names case-insensitively (model_management.py:
121+
# ``model_name.lower() == model.model.lower()``). Mirror that here so a config like
122+
# model="baai/bge-small-en-v1.5" still resolves to the same HF source/cache subdir.
123+
resolved_model_name = self._resolved_model_name().lower()
124+
candidates: list[tuple[Path, str]] = []
125+
seen: set[Path] = set()
126+
cache_root = Path(self.cache_dir)
127+
try:
128+
from fastembed import TextEmbedding
129+
130+
for description in TextEmbedding._list_supported_models():
131+
if description.model.lower() != resolved_model_name:
132+
continue
133+
hf_source = description.sources.hf
134+
model_file = description.model_file
135+
if not hf_source or not model_file:
136+
continue
137+
# HuggingFace hub names cache dirs ``models--<repo with '/' -> '--'>``.
138+
snapshot_dir = cache_root / f"models--{hf_source.replace('/', '--')}"
139+
if snapshot_dir not in seen:
140+
seen.add(snapshot_dir)
141+
candidates.append((snapshot_dir, model_file))
142+
except Exception as exc: # pragma: no cover - defensive: never block load on lookup
143+
logger.warning(
144+
"Could not resolve FastEmbed model source for cache cleanup: "
145+
"model_name={model_name} error={error}",
146+
model_name=resolved_model_name,
147+
error=exc,
148+
)
149+
150+
return candidates
151+
152+
def _corrupt_model_subdirs(self) -> list[Path]:
153+
"""Return cache subdirs that are POSITIVELY confirmed corrupt by filesystem state.
154+
155+
A model is corrupt when its HuggingFace cache dir exists on disk but at least one
156+
materialized snapshot revision is missing the expected model artifact file (e.g.
157+
``model_optimized.onnx``) — the exact fingerprint of an interrupted download. A normal
158+
cold load (no cache dir yet) is NOT corruption and yields no entries here, so it can
159+
never trigger a purge.
160+
161+
Inspection is PER-REVISION on purpose: HuggingFace keeps multiple revisions under one
162+
``models--<repo>`` tree, so a corrupt current snapshot can coexist with an older
163+
complete one. Checking ``rglob(model_file)`` across the whole tree would let the old
164+
artifact mask the broken current revision and leave it self-perpetuating, so we
165+
require every revision to carry the artifact.
166+
"""
167+
corrupt: list[Path] = []
168+
for model_dir, model_file in self._model_cache_candidates():
169+
# Trigger: the model's cache dir does not exist at all.
170+
# Why: this is a normal cold/first load — the model simply hasn't been
171+
# downloaded yet. Purging here would be wrong and pointless.
172+
# Outcome: skip; not corrupt.
173+
if not model_dir.exists():
174+
continue
175+
snapshots_root = model_dir / "snapshots"
176+
revision_dirs = (
177+
[d for d in snapshots_root.iterdir() if d.is_dir()]
178+
if snapshots_root.is_dir()
179+
else []
180+
)
181+
# Trigger: the cache dir exists but no snapshot revision has materialized.
182+
# Why/Outcome: an interrupted download that never wrote a revision — corrupt.
183+
if not revision_dirs:
184+
corrupt.append(model_dir)
185+
continue
186+
# Trigger: any individual revision is missing the artifact (rglob covers the
187+
# artifact at any depth within that revision, e.g. snapshots/<rev>/onnx/...).
188+
# Why: a complete OLD revision must not mask a corrupt CURRENT one.
189+
# Outcome: flag the model dir so the whole tree re-downloads cleanly.
190+
if any(not any(rev.rglob(model_file)) for rev in revision_dirs):
191+
corrupt.append(model_dir)
192+
return corrupt
193+
194+
def _purge_model_subdirs(self, subdirs: list[Path]) -> bool:
195+
"""Delete confirmed-corrupt cache subtrees so the next load re-downloads them.
196+
197+
Returns True when at least one targeted subdir is actually gone afterwards. On
198+
Windows a locked file can make ``shutil.rmtree(ignore_errors=True)`` silently no-op;
199+
reporting success in that case would let the caller retry against the same broken
200+
cache, so each subdir only counts as removed once it has actually disappeared.
201+
"""
202+
removed_any = False
203+
for subdir in subdirs:
204+
logger.warning(
205+
"Removing corrupt FastEmbed model cache to force re-download: {path}",
206+
path=str(subdir),
207+
)
208+
shutil.rmtree(subdir, ignore_errors=True)
209+
# Set removed only when the subdir is truly gone — a silent rmtree no-op
210+
# (e.g. a locked file on Windows) must not be reported as a successful purge.
211+
if not subdir.exists():
212+
removed_any = True
213+
return removed_any
214+
215+
@staticmethod
216+
def _is_missing_artifact_error(exc: Exception) -> bool:
217+
"""Return True when the load failure text matches the ONNX missing-artifact signature.
218+
219+
This is only the text-level gate; it is necessary but NOT sufficient to purge. The
220+
purge additionally requires filesystem-confirmed corruption (``_corrupt_model_subdirs``)
221+
so a transient/offline/"from any source" load error never deletes a valid cache.
222+
"""
223+
message = str(exc).lower()
224+
return any(marker in message for marker in _MISSING_ARTIFACT_ERROR_MARKERS)
225+
56226
async def _load_model(self) -> "TextEmbedding":
57227
if self._model is not None:
58228
return self._model
@@ -61,36 +231,42 @@ async def _load_model(self) -> "TextEmbedding":
61231
if self._model is not None:
62232
return self._model
63233

64-
def _create_model() -> "TextEmbedding":
65-
try:
66-
from fastembed import TextEmbedding
67-
except (
68-
ImportError
69-
) as exc: # pragma: no cover - exercised via tests with monkeypatch
70-
raise SemanticDependenciesMissingError(
71-
"fastembed package is missing. "
72-
"Install/update basic-memory to include semantic dependencies: "
73-
"pip install -U basic-memory"
74-
) from exc
75-
resolved_model_name = self._MODEL_ALIASES.get(self.model_name, self.model_name)
76-
if self.cache_dir is not None and self.threads is not None:
77-
return TextEmbedding(
78-
model_name=resolved_model_name,
79-
cache_dir=self.cache_dir,
80-
threads=self.threads,
81-
)
82-
if self.cache_dir is not None:
83-
return TextEmbedding(model_name=resolved_model_name, cache_dir=self.cache_dir)
84-
if self.threads is not None:
85-
return TextEmbedding(model_name=resolved_model_name, threads=self.threads)
86-
return TextEmbedding(model_name=resolved_model_name)
87-
88-
self._model = await asyncio.to_thread(_create_model)
234+
try:
235+
self._model = await asyncio.to_thread(self._create_model)
236+
except Exception as exc:
237+
# Trigger: model construction raised the ONNX missing-artifact error AND a
238+
# filesystem check positively confirms a corrupt cache subdir (the
239+
# snapshot dir exists but the model artifact file is missing — the
240+
# fingerprint of an interrupted download).
241+
# Why: the raw ONNXRuntimeError is self-perpetuating — every retry hits the
242+
# same broken snapshot until the cache is cleared. We must NOT misread a
243+
# normal cold load (no snapshot dir, model simply not downloaded yet) or a
244+
# transient/offline "from any source" error as corruption, because purging
245+
# then breaks the happy path. Both the error-text gate and the positive
246+
# filesystem confirmation are required before we delete anything.
247+
# Outcome: confirmed corruption → purge exactly this model's subdir and retry
248+
# once so a fresh download can land. Every other failure (including a
249+
# retry that still fails) re-raises the ORIGINAL exception so the
250+
# message stays actionable and we never loop.
251+
if not self._is_missing_artifact_error(exc):
252+
raise
253+
corrupt_subdirs = self._corrupt_model_subdirs()
254+
if not corrupt_subdirs:
255+
raise
256+
if not self._purge_model_subdirs(corrupt_subdirs):
257+
raise
258+
logger.info(
259+
"Retrying FastEmbed model load after clearing corrupt cache: "
260+
"model_name={model_name}",
261+
model_name=self._resolved_model_name(),
262+
)
263+
self._model = await asyncio.to_thread(self._create_model)
264+
89265
logger.info(
90266
"FastEmbed model loaded: model_name={model_name} batch_size={batch_size} "
91267
"threads={threads} configured_parallel={configured_parallel} "
92268
"effective_parallel={effective_parallel}",
93-
model_name=self._MODEL_ALIASES.get(self.model_name, self.model_name),
269+
model_name=self._resolved_model_name(),
94270
batch_size=self.batch_size,
95271
threads=self.threads,
96272
configured_parallel=self.parallel,

tests/mcp/test_tool_search.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,42 @@ def test_format_search_error_semantic_dependencies_missing(self):
479479
assert "# Search Failed - Semantic Dependencies Missing" in result
480480
assert "pip install -U basic-memory" in result
481481

482+
def test_format_search_error_corrupt_embedding_model(self):
483+
"""Test formatting for a corrupt/missing FastEmbed model (ONNX NO_SUCHFILE)."""
484+
from basic_memory.config import ConfigManager
485+
from basic_memory.repository.embedding_provider_factory import _resolve_cache_dir
486+
487+
result = _format_search_error_response(
488+
"test-project",
489+
"[ONNXRuntimeError] : 3 : NO_SUCHFILE : Load model from "
490+
"/home/u/.basic-memory/fastembed_cache/models--qdrant--bge-small-en-v1.5-onnx-q/"
491+
"snapshots/abc/model_optimized.onnx failed. File doesn't exist",
492+
"semantic query",
493+
"hybrid",
494+
)
495+
496+
expected_cache_dir = _resolve_cache_dir(ConfigManager().config)
497+
assert "# Search Failed - Embedding Model Missing or Corrupt" in result
498+
# Names the actual resolved cache dir so the user knows what to delete.
499+
assert expected_cache_dir in result
500+
# Offers full-text search as an immediate workaround.
501+
assert 'search_type="text"' in result
502+
503+
def test_format_search_error_load_model_phrase_does_not_overmatch(self):
504+
"""A generic error mentioning 'load model' (no 'from') must not hit the embedding branch.
505+
506+
The marker was tightened from the broad 'load model' to the exact ONNX phrasing
507+
'load model from' so unrelated failures fall through to the generic handler.
508+
"""
509+
result = _format_search_error_response(
510+
"test-project",
511+
"Failed to load model configuration for this project",
512+
"test query",
513+
)
514+
515+
assert "# Search Failed - Embedding Model Missing or Corrupt" not in result
516+
assert "# Search Failed" in result
517+
482518
def test_format_search_error_generic(self):
483519
"""Test formatting for generic errors."""
484520
result = _format_search_error_response("test-project", "unknown error", "test query")

0 commit comments

Comments
 (0)