fix(agentx): fix TP sizes and remove hardcoded max-model-len on MI355X agentic benchmarks

seungrokj · claude · seungrokj · commit c215222a9bd5 · 2026-05-28T09:37:08.000+09:00
- dsv4 and minimaxm2.5 agentic: comment out the MAX_MODEL_LEN default override and drop the --max-model-len flag so vLLM uses the server default
- amd-master.yaml: update dsv4 agentic TP from 4→8, minimaxm2.5 agentic TP from 4→1
- launch_mi355x-amds.sh: extend HF_HUB_CACHE_MOUNT override to vllm framework for DeepSeek-V4-Pro

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
@@ -652,8 +652,8 @@ dsv4-fp4-mi355x-vllm-agentic:
     agentic-coding:
     - duration: 1800
       search-space:
-      - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
-      - { tp: 4, offloading: lmcache,  conc-list: [16, 24, 32, 40] }
+      - { tp: 8, offloading: none, conc-list: [16, 24, 32, 40] }
+      - { tp: 8, offloading: lmcache,  conc-list: [16, 24, 32, 40] }
 
 minimaxm2.5-fp8-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.21.0
@@ -786,8 +786,8 @@ minimaxm2.5-fp4-mi355x-vllm-agentic:
     agentic-coding:
     - duration: 1800
       search-space:
-      - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
-      - { tp: 4, offloading: lmcache,  conc-list: [16, 24, 32, 40] }
+      - { tp: 1, offloading: none, conc-list: [16, 24, 32, 40] }
+      - { tp: 1, offloading: lmcache,  conc-list: [16, 24, 32, 40] }
 
 minimaxm2.5-fp8-mi300x-vllm:
   image: vllm/vllm-openai-rocm:v0.16.0
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_vllm.sh
@@ -22,13 +22,9 @@ MAX_DELAY=${MAX_DELAY:-60}
 ADVANCE_MIN=${ADVANCE_MIN:-0.0}
 ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 EP_SIZE=${EP_SIZE:-1}
-# Kimi-K2.5 advertises a 262144-token context window in vLLM 0.21.0.
-# Matrix defaults may export MAX_MODEL_LEN=0 to mean "server default"; for this
-# script we need the concrete value so AgentX filters prompt+max_tokens against
-# the same limit vLLM enforces.
-if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then
-    MAX_MODEL_LEN=262144
-fi
+#if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then
+#    MAX_MODEL_LEN=262144
+#fi
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -249,7 +245,6 @@ VLLM_CMD=(
     --tokenizer-mode deepseek_v4 \
     --reasoning-parser deepseek_v4 \
     --compilation-config '{"mode":3,"cudagraph_mode":"FULL_AND_PIECEWISE"}' \
-    --max-model-len "$MAX_MODEL_LEN"
     --max-num-seqs "$CONC"
     "${PREFIX_CACHE_ARGS[@]}"
     "${OFFLOAD_ARGS[@]}"
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh
@@ -22,13 +22,9 @@ MAX_DELAY=${MAX_DELAY:-60}
 ADVANCE_MIN=${ADVANCE_MIN:-0.0}
 ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 EP_SIZE=${EP_SIZE:-1}
-# Kimi-K2.5 advertises a 262144-token context window in vLLM 0.21.0.
-# Matrix defaults may export MAX_MODEL_LEN=0 to mean "server default"; for this
-# script we need the concrete value so AgentX filters prompt+max_tokens against
-# the same limit vLLM enforces.
-if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then
-    MAX_MODEL_LEN=262144
-fi
+#if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then
+#    MAX_MODEL_LEN=262144
+#fi
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -245,7 +241,6 @@ VLLM_CMD=(
     --block-size=32
     --trust-remote-code
     --attention-backend "ROCM_AITER_FA" \
-    --max-model-len "$MAX_MODEL_LEN"
     --max-num-seqs "$CONC"
     "${PREFIX_CACHE_ARGS[@]}"
     "${OFFLOAD_ARGS[@]}"
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
@@ -214,7 +214,7 @@ else
     fi
 
     # to prevent reading outdated saved model. use a fresh model from hf repo
-    if [[ "$FRAMEWORK" == "atom" ]] && [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Pro" ]]; then
+    if [[ ("$FRAMEWORK" == "vllm" || "$FRAMEWORK" == "atom") ]] && [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Pro" ]]; then
         export HF_HUB_CACHE_MOUNT="/it-share/hf-hub-cache/"
     fi