SemiAnalysisAI · jasonlizhengjian · Jun 3, 2026 · Jun 3, 2026
@@ -2661,12 +2661,16 @@ kimik2.5-fp4-b200-vllm:
       osl: 1024
       search-space:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 4 }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
+      - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
     - isl: 8192
       osl: 1024
       search-space:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 4 }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
+      - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
 
 # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
 # does not have a B300-specific recipe, so this config reuses the existing
@@ -2714,12 +2718,16 @@ kimik2.5-fp4-b300-vllm:
       osl: 1024
       search-space:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 4 }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
+      - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
     - isl: 8192
       osl: 1024
       search-space:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 4 }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
+      - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
 
 dsr1-fp8-b200-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130

diff --git a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh
@@ -5,6 +5,7 @@ source "$(dirname "$0")/../../benchmark_lib.sh"
 check_env_vars \
     MODEL \
     TP \
+    EP_SIZE \
     CONC \
     ISL \
     OSL \
@@ -39,9 +40,15 @@ start_gpu_monitor
 # benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh).
 export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
 
+EP_ARGS=()  # optional expert-parallel flags spliced into the vllm serve command below; empty when EP_SIZE <= 1
+if [ "$EP_SIZE" -gt 1 ]; then  # integer test, so EP_SIZE must hold a number
+    EP_ARGS=(--enable-expert-parallel)  # array carries only the on/off switch, not the EP_SIZE value
+fi
+
 set -x
 vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 --tensor-parallel-size=$TP \
+"${EP_ARGS[@]}" \
 --gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
 --max-num-seqs $CONC \
@@ -82,4 +89,4 @@ fi
 
 # Stop GPU monitoring
 stop_gpu_monitor
-set +x
+set +x
diff --git a/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh
@@ -9,6 +9,7 @@ source "$(dirname "$0")/../../benchmark_lib.sh"
 check_env_vars \
     MODEL \
     TP \
+    EP_SIZE \
     CONC \
     ISL \
     OSL \
@@ -47,9 +48,15 @@ fi
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
+EP_ARGS=()  # optional expert-parallel flags for the vllm serve invocation that follows; left empty when EP_SIZE <= 1
+if [ "$EP_SIZE" -gt 1 ]; then  # numeric comparison; a non-integer EP_SIZE makes the test fail with an error
+    EP_ARGS=(--enable-expert-parallel)  # holds just the enable switch; the EP_SIZE number is not placed in the array
+fi
+
 set -x
 vllm serve $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \
 --tensor-parallel-size $TP \
+"${EP_ARGS[@]}" \
 --gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
 --max-num-seqs $CONC \

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3430,3 +3430,10 @@
     - "Image: vllm/vllm-openai:v0.20.1"
     - "Same 1k/1k and 8k/1k search space as gb300, plus a new tp8-1p1d at low concurrencies for both ISLs"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1652
+
+- config-keys:
+    - kimik2.5-fp4-b200-vllm
+    - kimik2.5-fp4-b300-vllm
+  description:
+    - "Add expert-parallel sweep points for Kimi K2.5 FP4 B200/B300 vLLM aggregate benchmarks"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1658