Skip to content

Commit 9b199c6

Browse files
phernandez and Claude committed
fix: add FastEmbed runtime tuning knobs and provider caching
Add configurable cache_dir, threads, and parallel settings for FastEmbed to
support cloud deployments where defaults fail. Cache embedding providers at
the process level to avoid re-creating heavy ONNX model instances.

- Add semantic_embedding_cache_dir, semantic_embedding_threads, and
  semantic_embedding_parallel config fields
- Thread-safe provider cache with double-checked locking in factory
- Forward runtime knobs through to TextEmbedding and embed() calls
- Fix if/elif chain in factory for correct error handling

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent fe4a7b1 commit 9b199c6

9 files changed

Lines changed: 232 additions & 26 deletions

File tree

src/basic_memory/cli/commands/project.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -227,9 +227,7 @@ async def _list_projects(ws: str | None = None):
227227

228228
console.print(table)
229229
if cloud_error is not None:
230-
console.print(
231-
f"[yellow]Cloud project discovery failed: {cloud_error}[/yellow]"
232-
)
230+
console.print(f"[yellow]Cloud project discovery failed: {cloud_error}[/yellow]")
233231
console.print(
234232
"[dim]Showing local projects only. "
235233
"Run 'bm cloud login' or 'bm cloud api-key save <key>' if this is a credentials issue.[/dim]"

src/basic_memory/config.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,20 @@ class BasicMemoryConfig(BaseSettings):
173173
description="Batch size for embedding generation.",
174174
gt=0,
175175
)
176+
semantic_embedding_cache_dir: str | None = Field(
177+
default=None,
178+
description="Optional cache directory for FastEmbed model artifacts.",
179+
)
180+
semantic_embedding_threads: int | None = Field(
181+
default=None,
182+
description="Optional FastEmbed runtime thread count override.",
183+
gt=0,
184+
)
185+
semantic_embedding_parallel: int | None = Field(
186+
default=None,
187+
description="Optional FastEmbed embed() parallelism override.",
188+
gt=0,
189+
)
176190
semantic_vector_k: int = Field(
177191
default=100,
178192
description="Vector candidate count for vector and hybrid retrieval.",
@@ -709,9 +723,7 @@ def load_config(self) -> BasicMemoryConfig:
709723
# Create backup before overwriting so users can revert if needed
710724
backup_path = self.config_file.with_suffix(".json.bak")
711725
shutil.copy2(self.config_file, backup_path)
712-
logger.info(
713-
f"Migrating config to current format (backup: {backup_path})"
714-
)
726+
logger.info(f"Migrating config to current format (backup: {backup_path})")
715727
save_basic_memory_config(self.config_file, _CONFIG_CACHE)
716728

717729
return _CONFIG_CACHE
Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,85 @@
11
"""Factory for creating configured semantic embedding providers."""
22

3+
from threading import Lock
4+
35
from basic_memory.config import BasicMemoryConfig
46
from basic_memory.repository.embedding_provider import EmbeddingProvider
57

8+
type ProviderCacheKey = tuple[str, str, int | None, int, str | None, int | None, int | None]
9+
10+
_EMBEDDING_PROVIDER_CACHE: dict[ProviderCacheKey, EmbeddingProvider] = {}
11+
_EMBEDDING_PROVIDER_CACHE_LOCK = Lock()
12+
13+
14+
def _provider_cache_key(app_config: BasicMemoryConfig) -> ProviderCacheKey:
    """Return the tuple of semantic-embedding settings that identifies a provider.

    Two configs producing the same key can safely share one provider instance,
    so every field that influences provider construction must appear here.
    """
    # Normalize the provider name exactly the way the factory does, so that
    # e.g. "FastEmbed " and "fastembed" land in the same cache slot.
    normalized_provider = app_config.semantic_embedding_provider.strip().lower()
    return (
        normalized_provider,
        app_config.semantic_embedding_model,
        app_config.semantic_embedding_dimensions,
        app_config.semantic_embedding_batch_size,
        app_config.semantic_embedding_cache_dir,
        app_config.semantic_embedding_threads,
        app_config.semantic_embedding_parallel,
    )
25+
26+
27+
def reset_embedding_provider_cache() -> None:
    """Drop all cached provider instances (primarily for test isolation)."""
    # Take the lock so a concurrent create_embedding_provider call never sees
    # the cache in a half-cleared state.
    with _EMBEDDING_PROVIDER_CACHE_LOCK:
        _EMBEDDING_PROVIDER_CACHE.clear()
31+
632

733
def create_embedding_provider(app_config: BasicMemoryConfig) -> EmbeddingProvider:
    """Return a (possibly cached) embedding provider for the semantic config.

    Providers are cached at process level keyed on every construction-relevant
    config field, so repeated calls with an identical config reuse one heavy
    instance. When semantic_embedding_dimensions is set in config, it
    overrides the provider's default dimensions (384 for FastEmbed, 1536 for
    OpenAI).

    Raises:
        ValueError: if semantic_embedding_provider names an unknown backend.
    """
    cache_key = _provider_cache_key(app_config)

    # Fast path: reuse an already-constructed provider for this exact config.
    with _EMBEDDING_PROVIDER_CACHE_LOCK:
        existing = _EMBEDDING_PROVIDER_CACHE.get(cache_key)
        if existing:
            return existing

    provider_name = app_config.semantic_embedding_provider.strip().lower()
    if provider_name not in ("fastembed", "openai"):
        raise ValueError(f"Unsupported semantic embedding provider: {provider_name}")

    extra_kwargs: dict = {}
    if app_config.semantic_embedding_dimensions is not None:
        extra_kwargs["dimensions"] = app_config.semantic_embedding_dimensions

    provider: EmbeddingProvider
    if provider_name == "fastembed":
        # Import lazily: fastembed (and its onnxruntime dep) is optional.
        from basic_memory.repository.fastembed_provider import FastEmbedEmbeddingProvider

        # Forward only the runtime knobs that were explicitly configured.
        for knob_name, knob_value in (
            ("cache_dir", app_config.semantic_embedding_cache_dir),
            ("threads", app_config.semantic_embedding_threads),
            ("parallel", app_config.semantic_embedding_parallel),
        ):
            if knob_value is not None:
                extra_kwargs[knob_name] = knob_value

        provider = FastEmbedEmbeddingProvider(
            model_name=app_config.semantic_embedding_model,
            batch_size=app_config.semantic_embedding_batch_size,
            **extra_kwargs,
        )
    else:
        # Import lazily: openai is optional.
        from basic_memory.repository.openai_provider import OpenAIEmbeddingProvider

        # Map an unset or FastEmbed-default model name onto the OpenAI default.
        model_name = app_config.semantic_embedding_model or "text-embedding-3-small"
        if model_name == "bge-small-en-v1.5":
            model_name = "text-embedding-3-small"
        provider = OpenAIEmbeddingProvider(
            model_name=model_name,
            batch_size=app_config.semantic_embedding_batch_size,
            **extra_kwargs,
        )

    # Publish under the lock. If another thread raced us and won, keep its
    # instance and discard ours so all callers share a single provider.
    with _EMBEDDING_PROVIDER_CACHE_LOCK:
        raced = _EMBEDDING_PROVIDER_CACHE.get(cache_key)
        if raced:
            return raced
        _EMBEDDING_PROVIDER_CACHE[cache_key] = provider
    return provider

src/basic_memory/repository/fastembed_provider.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,16 @@ def __init__(
2525
*,
2626
batch_size: int = 64,
2727
dimensions: int = 384,
28+
cache_dir: str | None = None,
29+
threads: int | None = None,
30+
parallel: int | None = None,
2831
) -> None:
2932
self.model_name = model_name
3033
self.dimensions = dimensions
3134
self.batch_size = batch_size
35+
self.cache_dir = cache_dir
36+
self.threads = threads
37+
self.parallel = parallel
3238
self._model: TextEmbedding | None = None
3339
self._model_lock = asyncio.Lock()
3440

@@ -52,6 +58,16 @@ def _create_model() -> "TextEmbedding":
5258
"pip install -U basic-memory"
5359
) from exc
5460
resolved_model_name = self._MODEL_ALIASES.get(self.model_name, self.model_name)
61+
if self.cache_dir is not None and self.threads is not None:
62+
return TextEmbedding(
63+
model_name=resolved_model_name,
64+
cache_dir=self.cache_dir,
65+
threads=self.threads,
66+
)
67+
if self.cache_dir is not None:
68+
return TextEmbedding(model_name=resolved_model_name, cache_dir=self.cache_dir)
69+
if self.threads is not None:
70+
return TextEmbedding(model_name=resolved_model_name, threads=self.threads)
5571
return TextEmbedding(model_name=resolved_model_name)
5672

5773
self._model = await asyncio.to_thread(_create_model)
@@ -64,7 +80,10 @@ async def embed_documents(self, texts: list[str]) -> list[list[float]]:
6480
model = await self._load_model()
6581

6682
def _embed_batch() -> list[list[float]]:
67-
vectors = list(model.embed(texts, batch_size=self.batch_size))
83+
embed_kwargs: dict[str, int] = {"batch_size": self.batch_size}
84+
if self.parallel is not None:
85+
embed_kwargs["parallel"] = self.parallel
86+
vectors = list(model.embed(texts, **embed_kwargs))
6887
normalized: list[list[float]] = []
6988
for vector in vectors:
7089
values = vector.tolist() if hasattr(vector, "tolist") else vector

tests/api/v2/test_schema_router.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -668,7 +668,8 @@ async def test_validate_reads_schema_from_file_not_database(
668668

669669
# Overwrite the file on disk with validation=strict
670670
file_path = Path(file_service.base_path) / schema_entity.file_path
671-
file_path.write_text(dedent("""\
671+
file_path.write_text(
672+
dedent("""\
672673
---
673674
title: Editable Schema
674675
permalink: schemas/editable-schema
@@ -685,7 +686,8 @@ async def test_validate_reads_schema_from_file_not_database(
685686
686687
## Observations
687688
- [note] Schema that will be edited on disk
688-
"""))
689+
""")
690+
)
689691

690692
# Create a note missing "role" — strict mode should produce errors, not warnings
691693
note_entity, _ = await entity_service.create_or_update_entity(
@@ -749,7 +751,8 @@ async def test_validate_falls_back_to_db_on_incomplete_frontmatter(
749751

750752
# Overwrite file with frontmatter missing the 'schema' key
751753
file_path = Path(file_service.base_path) / schema_entity.file_path
752-
file_path.write_text(dedent("""\
754+
file_path.write_text(
755+
dedent("""\
753756
---
754757
title: Incomplete Schema
755758
permalink: schemas/incomplete-schema
@@ -761,7 +764,8 @@ async def test_validate_falls_back_to_db_on_incomplete_frontmatter(
761764
762765
## Observations
763766
- [note] Mid-edit state
764-
"""))
767+
""")
768+
)
765769

766770
# Create a note to validate against this schema
767771
note_entity, _ = await entity_service.create_or_update_entity(

tests/cli/test_cloud_status.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,7 @@ def is_token_valid(self, t):
6363
monkeypatch.setattr(
6464
"basic_memory.cli.commands.cloud.core_commands.ConfigManager", FakeConfigManager
6565
)
66-
monkeypatch.setattr(
67-
"basic_memory.cli.commands.cloud.core_commands.CLIAuth", FakeAuth
68-
)
66+
monkeypatch.setattr("basic_memory.cli.commands.cloud.core_commands.CLIAuth", FakeAuth)
6967
monkeypatch.setattr(
7068
"basic_memory.cli.commands.cloud.core_commands.get_cloud_config",
7169
lambda: ("cid", "domain", "https://cloud.example.com"),

tests/repository/test_fastembed_provider.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,25 @@ def tolist(self):
1919

2020
class _StubTextEmbedding:
2121
init_count = 0
22+
last_init_kwargs: dict = {}
23+
last_embed_kwargs: dict = {}
2224

23-
def __init__(self, model_name: str):
25+
def __init__(self, model_name: str, cache_dir: str | None = None, threads: int | None = None):
2426
self.model_name = model_name
2527
self.embed_calls = 0
28+
_StubTextEmbedding.last_init_kwargs = {
29+
"model_name": model_name,
30+
"cache_dir": cache_dir,
31+
"threads": threads,
32+
}
2633
_StubTextEmbedding.init_count += 1
2734

28-
def embed(self, texts: list[str], batch_size: int = 64):
35+
def embed(self, texts: list[str], batch_size: int = 64, parallel: int | None = None):
2936
self.embed_calls += 1
37+
_StubTextEmbedding.last_embed_kwargs = {
38+
"batch_size": batch_size,
39+
"parallel": parallel,
40+
}
3041
for text in texts:
3142
if "wide" in text:
3243
yield _StubVector([1.0, 0.0, 0.0, 0.0, 0.5])
@@ -85,3 +96,30 @@ def _raising_import(name, globals=None, locals=None, fromlist=(), level=0):
8596
await provider.embed_query("test")
8697

8798
assert "pip install -U basic-memory" in str(error.value)
99+
100+
101+
@pytest.mark.asyncio
async def test_fastembed_provider_passes_runtime_knobs_to_fastembed(monkeypatch):
    """Provider should pass optional runtime tuning knobs through to FastEmbed."""
    # Reset stub capture state so assertions see only this test's calls.
    _StubTextEmbedding.last_init_kwargs = {}
    _StubTextEmbedding.last_embed_kwargs = {}

    # Install a fake `fastembed` module whose TextEmbedding records kwargs.
    fake_fastembed = type(sys)("fastembed")
    fake_fastembed.TextEmbedding = _StubTextEmbedding
    monkeypatch.setitem(sys.modules, "fastembed", fake_fastembed)

    provider = FastEmbedEmbeddingProvider(
        model_name="stub-model",
        dimensions=4,
        batch_size=8,
        cache_dir="/tmp/fastembed-cache",
        threads=3,
        parallel=2,
    )
    await provider.embed_documents(["runtime knobs"])

    expected_init = {
        "model_name": "stub-model",
        "cache_dir": "/tmp/fastembed-cache",
        "threads": 3,
    }
    expected_embed = {"batch_size": 8, "parallel": 2}
    assert _StubTextEmbedding.last_init_kwargs == expected_init
    assert _StubTextEmbedding.last_embed_kwargs == expected_embed

0 commit comments

Comments (0)