@@ -32,10 +32,11 @@ use tokenizers::Tokenizer;
/// Default batch size per ONNX forward pass (number of inputs embedded
/// per call). Overridable at runtime via `batch_size()` below.
const DEFAULT_BATCH_SIZE: usize = 8;
3434
35- /// Default number of parallel sessions: num_cpus / 2 (min 1).
36- /// Each session gets 2 intra threads. Enough sessions to handle concurrent callers.
35+ /// Default number of parallel sessions: num_cpus / batch_size (min 1).
36+ /// Each session uses all cores (intra=0). Sessions scale with machine size
37+ /// relative to batch size — more cores = more parallel callers supported.
3738fn default_num_sessions ( ) -> usize {
38- ( available_cpus ( ) / 2 ) . max ( 1 )
39+ ( available_cpus ( ) / batch_size ( ) ) . max ( 1 )
3940}
4041
4142fn batch_size ( ) -> usize {
@@ -60,13 +61,13 @@ fn num_sessions() -> usize {
6061 . unwrap_or_else ( default_num_sessions)
6162}
6263
/// Intra-op threads for each ONNX session, read from the
/// `EMBEDDINGS_INTRA_THREADS` environment variable.
/// Defaults to 0 — ORT's "use all cores" setting; ORT internally manages
/// thread contention between sessions.
fn intra_threads() -> usize {
    // Unset or unparseable values both fall back to the default of 0.
    match std::env::var("EMBEDDINGS_INTRA_THREADS") {
        Ok(raw) => raw.parse().unwrap_or(0),
        Err(_) => 0,
    }
}
7172
7273/// Model architecture type - determines pooling strategy
0 commit comments