@@ -32,10 +32,11 @@ use tokenizers::Tokenizer;
/// Default batch size per ONNX forward pass (number of inputs embedded
/// per call). Overridable at runtime via `batch_size()` below.
const DEFAULT_BATCH_SIZE: usize = 8;
3434
35- /// Default number of parallel sessions: num_cpus / 2 (min 1).
36- /// Each session gets 2 intra threads. Enough sessions to handle concurrent callers.
35+ /// Default number of parallel sessions: num_cpus / batch_size (min 1).
36+ /// Each session uses all cores (intra=0). Sessions scale with machine size
37+ /// relative to batch size — more cores = more parallel callers supported.
3738fn default_num_sessions ( ) -> usize {
38- ( available_cpus ( ) / 2 ) . max ( 1 )
39+ ( available_cpus ( ) / batch_size ( ) ) . max ( 1 )
3940}
4041
4142fn batch_size ( ) -> usize {
@@ -60,13 +61,13 @@ fn num_sessions() -> usize {
6061 . unwrap_or_else ( default_num_sessions)
6162}
6263
/// Intra-op threads for each ONNX session, read from the
/// `EMBEDDINGS_INTRA_THREADS` environment variable.
/// Defaults to 0 — ORT's "use all cores" setting; ORT internally manages
/// thread contention between sessions.
fn intra_threads() -> usize {
    // Unset or unparseable values both fall back to the default of 0.
    match std::env::var("EMBEDDINGS_INTRA_THREADS") {
        Ok(raw) => raw.parse().unwrap_or(0),
        Err(_) => 0,
    }
}
7172
7273/// Model architecture type - determines pooling strategy
0 commit comments