add dispatch token clamp (>=256) and run benchmark+eval at conc-64

Oseltamivir · Oseltamivir · commit 9983cc0f853b · 2026-06-03T13:52:50.000-07:00
Clamp MORI_MAX_DISPATCH_TOKENS_DECODE to a minimum of 256 when DP and EP
are both enabled, preventing SGLang's low-latency All2All kernel from being
selected. That kernel silently corrupts outputs at small buffer sizes.

Run A of the A/B test: benchmark + eval WITH the clamp at conc-64 on DEP8+MTP3.
diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
@@ -248,6 +248,15 @@ if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then
     MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
 fi
 
+# When DP and EP are both enabled, clamp decode dispatch tokens to >= 256 to avoid the
+# low-latency All2All kernel variant in MoRI, which silently corrupts outputs at small buffer sizes.
+if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; then
+    if [[ $MORI_MAX_DISPATCH_TOKENS_DECODE -lt 256 ]]; then
+        echo "[WARN] Clamping MORI_MAX_DISPATCH_TOKENS_DECODE from $MORI_MAX_DISPATCH_TOKENS_DECODE to 256 (All2All kernel threshold)"
+        MORI_MAX_DISPATCH_TOKENS_DECODE=256
+    fi
+fi
+
 # =============================================================================
 # Cluster Topology Configuration
 # =============================================================================
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3448,6 +3448,5 @@
 - config-keys:
     - dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp
   description:
-    - "Throwaway: conc-64-only gsm8k eval for DEP8+MTP3 to reproduce SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK < 256 corruption (dispatch=32 triggers broken All2All kernel, expect 0pct gsm8k). Not for merge."
+    - "Throwaway: conc-64 DEP8+MTP3 benchmark+eval WITH dispatch token clamp (MORI_MAX_DISPATCH_TOKENS_DECODE >= 256). A/B test for All2All kernel corruption fix."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1659
-  evals-only: true