44
55import asyncio
66import math
7+ import shutil
8+ from pathlib import Path
79from typing import TYPE_CHECKING
810
911from loguru import logger
1517 from fastembed import TextEmbedding # pragma: no cover
1618
1719
20+ # Substrings that identify the ONNX "model artifact file is missing" load failure (as
21+ # opposed to a config error, a download/network error, or a genuinely offline machine).
22+ # An interrupted FastEmbed download can leave the HuggingFace snapshot dir present but
23+ # missing ``model_optimized.onnx``; the ONNX runtime then raises ``NO_SUCHFILE`` and every
24+ # subsequent load repeats it until the cache is cleared. Matched case-insensitively.
25+ #
26+ # IMPORTANT: this text match is necessary but NOT sufficient to trigger a purge. The error
27+ # text alone cannot distinguish a corrupt cache from a normal cold load (model not yet
28+ # downloaded). Purging is gated on a positive filesystem confirmation that the snapshot dir
29+ # exists on disk but the model artifact file is missing — see ``_corrupt_model_subdirs``.
30+ _MISSING_ARTIFACT_ERROR_MARKERS = (
31+ "no_suchfile" ,
32+ "model_optimized.onnx" ,
33+ "file doesn't exist" ,
34+ "no such file" ,
35+ )
36+
37+
1838class FastEmbedEmbeddingProvider (EmbeddingProvider ):
1939 """Local ONNX embedding provider backed by FastEmbed."""
2040
@@ -53,6 +73,156 @@ def __init__(
5373 self ._model : TextEmbedding | None = None
5474 self ._model_lock = asyncio .Lock ()
5575
76+ def _resolved_model_name (self ) -> str :
77+ """Return the FastEmbed model name after applying our local aliases."""
78+ return self ._MODEL_ALIASES .get (self .model_name , self .model_name )
79+
80+ def _create_model (self ) -> "TextEmbedding" :
81+ try :
82+ from fastembed import TextEmbedding
83+ except ImportError as exc : # pragma: no cover - exercised via tests with monkeypatch
84+ raise SemanticDependenciesMissingError (
85+ "fastembed package is missing. "
86+ "Install/update basic-memory to include semantic dependencies: "
87+ "pip install -U basic-memory"
88+ ) from exc
89+ resolved_model_name = self ._resolved_model_name ()
90+ if self .cache_dir is not None and self .threads is not None :
91+ return TextEmbedding (
92+ model_name = resolved_model_name ,
93+ cache_dir = self .cache_dir ,
94+ threads = self .threads ,
95+ )
96+ if self .cache_dir is not None :
97+ return TextEmbedding (model_name = resolved_model_name , cache_dir = self .cache_dir )
98+ if self .threads is not None :
99+ return TextEmbedding (model_name = resolved_model_name , threads = self .threads )
100+ return TextEmbedding (model_name = resolved_model_name )
101+
102+ def _model_cache_candidates (self ) -> list [tuple [Path , str ]]:
103+ """Resolve ``(snapshot_dir, model_file)`` pairs for this model under ``cache_dir``.
104+
105+ FastEmbed stores each model under ``<cache_dir>/models--<org>--<repo>`` where the
106+ repo is the model's HuggingFace source (e.g. ``BAAI/bge-small-en-v1.5`` resolves to
107+ ``models--qdrant--bge-small-en-v1.5-onnx-q``). We resolve the source and the expected
108+ model artifact filename from FastEmbed's own model description so corruption detection
109+ and deletion are scoped to exactly this model's tree — never the whole cache or
110+ unrelated models.
111+
112+ Note: ``TextEmbedding._list_supported_models()`` is an intentional use of an
113+ undocumented FastEmbed API. The broad ``except`` below is a known defensive fallback:
114+ if the lookup ever changes shape we degrade to "no candidates" (so we never purge)
115+ rather than crashing the load path.
116+ """
117+ if self .cache_dir is None :
118+ return []
119+
120+ # FastEmbed matches model names case-insensitively (model_management.py:
121+ # ``model_name.lower() == model.model.lower()``). Mirror that here so a config like
122+ # model="baai/bge-small-en-v1.5" still resolves to the same HF source/cache subdir.
123+ resolved_model_name = self ._resolved_model_name ().lower ()
124+ candidates : list [tuple [Path , str ]] = []
125+ seen : set [Path ] = set ()
126+ cache_root = Path (self .cache_dir )
127+ try :
128+ from fastembed import TextEmbedding
129+
130+ for description in TextEmbedding ._list_supported_models ():
131+ if description .model .lower () != resolved_model_name :
132+ continue
133+ hf_source = description .sources .hf
134+ model_file = description .model_file
135+ if not hf_source or not model_file :
136+ continue
137+ # HuggingFace hub names cache dirs ``models--<repo with '/' -> '--'>``.
138+ snapshot_dir = cache_root / f"models--{ hf_source .replace ('/' , '--' )} "
139+ if snapshot_dir not in seen :
140+ seen .add (snapshot_dir )
141+ candidates .append ((snapshot_dir , model_file ))
142+ except Exception as exc : # pragma: no cover - defensive: never block load on lookup
143+ logger .warning (
144+ "Could not resolve FastEmbed model source for cache cleanup: "
145+ "model_name={model_name} error={error}" ,
146+ model_name = resolved_model_name ,
147+ error = exc ,
148+ )
149+
150+ return candidates
151+
152+ def _corrupt_model_subdirs (self ) -> list [Path ]:
153+ """Return cache subdirs that are POSITIVELY confirmed corrupt by filesystem state.
154+
155+ A model is corrupt when its HuggingFace cache dir exists on disk but at least one
156+ materialized snapshot revision is missing the expected model artifact file (e.g.
157+ ``model_optimized.onnx``) — the exact fingerprint of an interrupted download. A normal
158+ cold load (no cache dir yet) is NOT corruption and yields no entries here, so it can
159+ never trigger a purge.
160+
161+ Inspection is PER-REVISION on purpose: HuggingFace keeps multiple revisions under one
162+ ``models--<repo>`` tree, so a corrupt current snapshot can coexist with an older
163+ complete one. Checking ``rglob(model_file)`` across the whole tree would let the old
164+ artifact mask the broken current revision and leave it self-perpetuating, so we
165+ require every revision to carry the artifact.
166+ """
167+ corrupt : list [Path ] = []
168+ for model_dir , model_file in self ._model_cache_candidates ():
169+ # Trigger: the model's cache dir does not exist at all.
170+ # Why: this is a normal cold/first load — the model simply hasn't been
171+ # downloaded yet. Purging here would be wrong and pointless.
172+ # Outcome: skip; not corrupt.
173+ if not model_dir .exists ():
174+ continue
175+ snapshots_root = model_dir / "snapshots"
176+ revision_dirs = (
177+ [d for d in snapshots_root .iterdir () if d .is_dir ()]
178+ if snapshots_root .is_dir ()
179+ else []
180+ )
181+ # Trigger: the cache dir exists but no snapshot revision has materialized.
182+ # Why/Outcome: an interrupted download that never wrote a revision — corrupt.
183+ if not revision_dirs :
184+ corrupt .append (model_dir )
185+ continue
186+ # Trigger: any individual revision is missing the artifact (rglob covers the
187+ # artifact at any depth within that revision, e.g. snapshots/<rev>/onnx/...).
188+ # Why: a complete OLD revision must not mask a corrupt CURRENT one.
189+ # Outcome: flag the model dir so the whole tree re-downloads cleanly.
190+ if any (not any (rev .rglob (model_file )) for rev in revision_dirs ):
191+ corrupt .append (model_dir )
192+ return corrupt
193+
194+ def _purge_model_subdirs (self , subdirs : list [Path ]) -> bool :
195+ """Delete confirmed-corrupt cache subtrees so the next load re-downloads them.
196+
197+ Returns True when at least one targeted subdir is actually gone afterwards. On
198+ Windows a locked file can make ``shutil.rmtree(ignore_errors=True)`` silently no-op;
199+ reporting success in that case would let the caller retry against the same broken
200+ cache, so each subdir only counts as removed once it has actually disappeared.
201+ """
202+ removed_any = False
203+ for subdir in subdirs :
204+ logger .warning (
205+ "Removing corrupt FastEmbed model cache to force re-download: {path}" ,
206+ path = str (subdir ),
207+ )
208+ shutil .rmtree (subdir , ignore_errors = True )
209+ # Set removed only when the subdir is truly gone — a silent rmtree no-op
210+ # (e.g. a locked file on Windows) must not be reported as a successful purge.
211+ if not subdir .exists ():
212+ removed_any = True
213+ return removed_any
214+
215+ @staticmethod
216+ def _is_missing_artifact_error (exc : Exception ) -> bool :
217+ """Return True when the load failure text matches the ONNX missing-artifact signature.
218+
219+ This is only the text-level gate; it is necessary but NOT sufficient to purge. The
220+ purge additionally requires filesystem-confirmed corruption (``_corrupt_model_subdirs``)
221+ so a transient/offline/"from any source" load error never deletes a valid cache.
222+ """
223+ message = str (exc ).lower ()
224+ return any (marker in message for marker in _MISSING_ARTIFACT_ERROR_MARKERS )
225+
56226 async def _load_model (self ) -> "TextEmbedding" :
57227 if self ._model is not None :
58228 return self ._model
@@ -61,36 +231,42 @@ async def _load_model(self) -> "TextEmbedding":
61231 if self ._model is not None :
62232 return self ._model
63233
64- def _create_model () -> "TextEmbedding" :
65- try :
66- from fastembed import TextEmbedding
67- except (
68- ImportError
69- ) as exc : # pragma: no cover - exercised via tests with monkeypatch
70- raise SemanticDependenciesMissingError (
71- "fastembed package is missing. "
72- "Install/update basic-memory to include semantic dependencies: "
73- "pip install -U basic-memory"
74- ) from exc
75- resolved_model_name = self ._MODEL_ALIASES .get (self .model_name , self .model_name )
76- if self .cache_dir is not None and self .threads is not None :
77- return TextEmbedding (
78- model_name = resolved_model_name ,
79- cache_dir = self .cache_dir ,
80- threads = self .threads ,
81- )
82- if self .cache_dir is not None :
83- return TextEmbedding (model_name = resolved_model_name , cache_dir = self .cache_dir )
84- if self .threads is not None :
85- return TextEmbedding (model_name = resolved_model_name , threads = self .threads )
86- return TextEmbedding (model_name = resolved_model_name )
87-
88- self ._model = await asyncio .to_thread (_create_model )
234+ try :
235+ self ._model = await asyncio .to_thread (self ._create_model )
236+ except Exception as exc :
237+ # Trigger: model construction raised the ONNX missing-artifact error AND a
238+ # filesystem check positively confirms a corrupt cache subdir (the
239+ # snapshot dir exists but the model artifact file is missing — the
240+ # fingerprint of an interrupted download).
241+ # Why: the raw ONNXRuntimeError is self-perpetuating — every retry hits the
242+ # same broken snapshot until the cache is cleared. We must NOT misread a
243+ # normal cold load (no snapshot dir, model simply not downloaded yet) or a
244+ # transient/offline "from any source" error as corruption, because purging
245+ # then breaks the happy path. Both the error-text gate and the positive
246+ # filesystem confirmation are required before we delete anything.
247+ # Outcome: confirmed corruption → purge exactly this model's subdir and retry
248+ # once so a fresh download can land. Every other failure (including a
249+ # retry that still fails) re-raises the ORIGINAL exception so the
250+ # message stays actionable and we never loop.
251+ if not self ._is_missing_artifact_error (exc ):
252+ raise
253+ corrupt_subdirs = self ._corrupt_model_subdirs ()
254+ if not corrupt_subdirs :
255+ raise
256+ if not self ._purge_model_subdirs (corrupt_subdirs ):
257+ raise
258+ logger .info (
259+ "Retrying FastEmbed model load after clearing corrupt cache: "
260+ "model_name={model_name}" ,
261+ model_name = self ._resolved_model_name (),
262+ )
263+ self ._model = await asyncio .to_thread (self ._create_model )
264+
89265 logger .info (
90266 "FastEmbed model loaded: model_name={model_name} batch_size={batch_size} "
91267 "threads={threads} configured_parallel={configured_parallel} "
92268 "effective_parallel={effective_parallel}" ,
93- model_name = self ._MODEL_ALIASES . get ( self . model_name , self . model_name ),
269+ model_name = self ._resolved_model_name ( ),
94270 batch_size = self .batch_size ,
95271 threads = self .threads ,
96272 configured_parallel = self .parallel ,
0 commit comments