Skip to content

Commit 8c75457

Browse files
committed
perf(core): tune fastembed auto threads
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent a3c912d commit 8c75457

File tree

2 files changed

+32
-8
lines changed

2 files changed

+32
-8
lines changed

src/basic_memory/repository/embedding_provider_factory.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,16 @@ def _available_cpu_count() -> int | None:
2626
process_cpu_count = getattr(os, "process_cpu_count", None)
2727
if callable(process_cpu_count):
2828
cpu_count = process_cpu_count()
29-
if cpu_count is not None and cpu_count > 0:
29+
if isinstance(cpu_count, int) and cpu_count > 0:
3030
return cpu_count
3131

3232
cpu_count = os.cpu_count()
3333
return cpu_count if cpu_count is not None and cpu_count > 0 else None
3434

3535

36-
def _resolve_fastembed_runtime_knobs(app_config: BasicMemoryConfig) -> tuple[int | None, int | None]:
36+
def _resolve_fastembed_runtime_knobs(
37+
app_config: BasicMemoryConfig,
38+
) -> tuple[int | None, int | None]:
3739
"""Resolve FastEmbed threads/parallel from explicit config or CPU-aware defaults."""
3840
configured_threads = app_config.semantic_embedding_threads
3941
configured_parallel = app_config.semantic_embedding_parallel
@@ -45,15 +47,15 @@ def _resolve_fastembed_runtime_knobs(app_config: BasicMemoryConfig) -> tuple[int
4547
return None, None
4648

4749
# Trigger: local laptops and cloud workers expose different CPU budgets.
48-
# Why: FastEmbed throughput wants enough ONNX threads to use the machine,
49-
# but the multiprocessing-style ``parallel`` fan-out can add a lot of
50-
# overhead for this workload and make full rebuilds slower instead of faster.
51-
# Outcome: when config leaves the knobs unset, each process uses a bounded
52-
# thread count and keeps FastEmbed on the simpler single-process path.
50+
# Why: full rebuilds got faster when FastEmbed used most, but not all, of
51+
# the available CPUs. Leaving a little headroom avoids starving the rest of
52+
# the pipeline while still giving ONNX enough threads to stay busy.
53+
# Outcome: when config leaves the knobs unset, each process reserves a small
54+
# CPU cushion and keeps FastEmbed on the simpler single-process path.
5355
if available_cpus <= 2:
5456
return available_cpus, 1
5557

56-
threads = min(8, available_cpus)
58+
threads = min(8, max(2, available_cpus - 2))
5759
return threads, 1
5860

5961

tests/repository/test_openai_provider.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,28 @@ def test_embedding_provider_factory_auto_tunes_fastembed_runtime_knobs_from_cpu_
282282

283283
provider = create_embedding_provider(config)
284284

285+
assert isinstance(provider, FastEmbedEmbeddingProvider)
286+
assert provider.threads == 6
287+
assert provider.parallel == 1
288+
289+
290+
def test_embedding_provider_factory_auto_tuning_caps_large_cpu_budgets(monkeypatch):
291+
"""Large workers should still leave some headroom and stop at the thread cap."""
292+
monkeypatch.setattr(embedding_provider_factory_module.os, "process_cpu_count", lambda: 16)
293+
monkeypatch.setattr(embedding_provider_factory_module.os, "cpu_count", lambda: 16)
294+
295+
config = BasicMemoryConfig(
296+
env="test",
297+
projects={"test-project": "/tmp/basic-memory-test"},
298+
default_project="test-project",
299+
semantic_search_enabled=True,
300+
semantic_embedding_provider="fastembed",
301+
semantic_embedding_threads=None,
302+
semantic_embedding_parallel=None,
303+
)
304+
305+
provider = create_embedding_provider(config)
306+
285307
assert isinstance(provider, FastEmbedEmbeddingProvider)
286308
assert provider.threads == 8
287309
assert provider.parallel == 1

0 commit comments

Comments (0)