revert: drop MAX_MODEL_LEN cap from Kimi H100/H200 launchers

cquil11 · claude · cquil11 · commit d7411dd47436 · 2026-05-14T10:27:00.000-05:00
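Per the agentic benchmark design, context must not be capped. Reverts the
H100 cap (MAX_MODEL_LEN=16K paired with --gpu-memory-utilization 0.85, now
restored to 0.95) and the H200 cap (MAX_MODEL_LEN=131K); both launchers go
back to passing no --max-model-len flag at all (vLLM uses the model's
native context).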

Any OOM / KV-init failures will be diagnosed separately rather than
sidestepped via a cap.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
@@ -16,14 +16,6 @@ DURATION=${DURATION:-1800}
 MAX_DELAY=${MAX_DELAY:-60}
 ADVANCE_MIN=${ADVANCE_MIN:-0.0}
 ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-# H100 80 GB HBM is barely enough for Kimi K2.5 INT4 (~44 GB/GPU weights at
-# TP=8) plus KV reservation plus MoE intermediate buffers. R2 at MAX=32K hit
-# CUDA OOM in fused_marlin_moe even at conc=1. Drop to 16K so KV reservation
-# is half, and pair with --gpu-memory-utilization 0.85 (below) to leave room
-# for the ~900 MiB-per-call MoE workspace.
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=16384
-fi
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -61,9 +53,8 @@ export VLLM_USE_FLASHINFER_MOE_INT4=1
 vllm serve $MODEL \
 --host 0.0.0.0 \
 --port $PORT \
---gpu-memory-utilization 0.85 \
+--gpu-memory-utilization 0.95 \
 --tensor-parallel-size $TP \
---max-model-len $MAX_MODEL_LEN \
 --max-num-seqs $CONC \
 --reasoning-parser kimi_k2 \
 --tool-call-parser kimi_k2 \
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
@@ -16,13 +16,6 @@ DURATION=${DURATION:-1800}
 MAX_DELAY=${MAX_DELAY:-60}
 ADVANCE_MIN=${ADVANCE_MIN:-0.0}
 ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-# H200 141 GB HBM is too tight for Kimi K2.5 INT4's native 1M-token context;
-# without a cap, vLLM either fails to allocate KV blocks at engine init or
-# aiperf's Configure-Profiling phase times out waiting for the slow KV
-# initialization. Cap at 131K for plenty of context with KV headroom.
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -62,7 +55,6 @@ vllm serve $MODEL \
 --port $PORT \
 --gpu-memory-utilization 0.95 \
 --tensor-parallel-size $TP \
---max-model-len $MAX_MODEL_LEN \
 --max-num-seqs $CONC \
 --reasoning-parser kimi_k2 \
 --tool-call-parser kimi_k2 \