Skip to content

Commit 18ad409

Browse files
committed
perf(embeddings): optimize CPU resource allocation
1 parent 040f62c commit 18ad409

1 file changed

Lines changed: 7 additions & 6 deletions

File tree

embeddings/src/model/local.rs

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -32,10 +32,11 @@ use tokenizers::Tokenizer;
32 32
/// Default batch size per ONNX forward pass.
33 33
const DEFAULT_BATCH_SIZE: usize = 8;
34 34

35-
/// Default number of parallel sessions: num_cpus / 2 (min 1).
36-
/// Each session gets 2 intra threads. Enough sessions to handle concurrent callers.
35+
/// Default number of parallel sessions: num_cpus / batch_size (min 1).
36+
/// Each session uses all cores (intra=0). Sessions scale with machine size
37+
/// relative to batch size — more cores = more parallel callers supported.
37 38
fn default_num_sessions() -> usize {
38-
(available_cpus() / 2).max(1)
39+
(available_cpus() / batch_size()).max(1)
39 40
}
40 41

41 42
fn batch_size() -> usize {
@@ -60,13 +61,13 @@ fn num_sessions() -> usize {
60 61
.unwrap_or_else(default_num_sessions)
61 62
}
62 63

63-
/// Intra-op threads per session. Default: num_cpus / num_sessions.
64-
/// Total threads = num_sessions × intra_threads = num_cpus (no oversubscription).
64+
/// Intra-op threads per session. Default: 0 (all cores).
65+
/// Each session gets full CPU access. ORT internally manages thread contention.
65 66
fn intra_threads() -> usize {
66 67
std::env::var("EMBEDDINGS_INTRA_THREADS")
67 68
.ok()
68 69
.and_then(|v| v.parse().ok())
69-
.unwrap_or_else(|| (available_cpus() / num_sessions()).max(1))
70+
.unwrap_or(0)
70 71
}
71 72

72 73
/// Model architecture type - determines pooling strategy

0 commit comments

Comments
 (0)