From 380b685df7cd81917d3ac25d326fef0e4a1793cb Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 3 Jun 2026 10:39:11 -0700 Subject: [PATCH 1/2] Add Kimi K2.5 EP sweep points --- .github/configs/nvidia-master.yaml | 8 ++++++++ .../single_node/fixed_seq_len/kimik2.5_fp4_b200.sh | 9 ++++++++- .../single_node/fixed_seq_len/kimik2.5_fp4_b300.sh | 7 +++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3ee32f6c6..b10247c5c 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2661,12 +2661,16 @@ kimik2.5-fp4-b200-vllm: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html # does not have a B300-specific recipe, so this config reuses the existing @@ -2714,12 +2718,16 @@ kimik2.5-fp4-b300-vllm: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } dsr1-fp8-b200-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 diff --git a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh index 59b55c90c..a625d86ae 100644 --- a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh @@ -5,6 +5,7 @@ source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + EP_SIZE \ CONC \ ISL \ OSL \ @@ -39,9 +40,15 @@ start_gpu_monitor # benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh). export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 +EP_ARGS=() +if [ "$EP_SIZE" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi + set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --tensor-parallel-size=$TP \ +"${EP_ARGS[@]}" \ --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ --max-num-seqs $CONC \ @@ -82,4 +89,4 @@ fi # Stop GPU monitoring stop_gpu_monitor -set +x \ No newline at end of file +set +x diff --git a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh index db6d3fb0d..7e2a65eec 100755 --- a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh @@ -9,6 +9,7 @@ source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + EP_SIZE \ CONC \ ISL \ OSL \ @@ -47,9 +48,15 @@ fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +EP_ARGS=() +if [ "$EP_SIZE" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi + set -x vllm serve $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ --tensor-parallel-size $TP \ +"${EP_ARGS[@]}" \ --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ --max-num-seqs $CONC \ From 8c2f79338f7ee8c4cfd10b002f28cc190d1c37f1 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 3 Jun 2026 10:44:57 -0700 Subject: [PATCH 2/2] Add Kimi K2.5 EP sweep changelog --- perf-changelog.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1b9d2f0db..0c078c175 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3430,3 +3430,10 @@ - "Image: vllm/vllm-openai:v0.20.1" - "Same 1k/1k and 8k/1k search space as gb300, plus a new tp8-1p1d at low concurrencies for both ISLs" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1652 + +- config-keys: + - kimik2.5-fp4-b200-vllm + - kimik2.5-fp4-b300-vllm + description: + - "Add expert-parallel sweep points for Kimi K2.5 FP4 B200/B300 vLLM aggregate benchmarks" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1658