feat(agentic): add qwen3.5-fp8-h100-sglang-agentic recipe

cquil11 · claude · cquil11 · commit 72cf856fb1ca · 2026-05-27T17:02:46.000-05:00
New agentic-coding recipe targeting H100 (runner: h100-dgxc) running
Qwen3.5-397B-A17B FP8 via SGLang v0.5.12-cu130. Mirrors the b300 SGLang
agentic shape with H100-appropriate kernel flags:

- attention-backend: flashinfer (sm_90; trtllm_mha is Blackwell-only).
- mem-fraction-static 0.75 (vs 0.80 on B300) and chunked-prefill-size
  8192 (vs 16384) to fit Qwen-397B FP8 weights + KV in H100's 80 GB
  HBM3 at TP=8.
- conc-list capped at 16 across both arms; agentic ISLs hit ~80k-200k
  on the 256k corpus and Qwen at conc=32 OOM'd in the fixed_seq_len
  sweep at lower ISL too.

Recipe wires WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
so the 256k-capped variant (470 traces, max in+out <= 256k) is used
instead of the unfiltered 052726 corpus (which has up to ~1M-token
requests the H100 max_model_len=131k server would reject).

Two sweep arms:
  - none:    --disable-radix-cache, conc-list [1, 2, 4, 8, 16]
  - hicache: --enable-hierarchical-cache + sized from TOTAL_CPU_DRAM_GB,
             conc-list [4, 8, 16] (capped where hicache stabilizes)

Yaml key is qwen3.5-fp8-h100-sglang-agentic; script filename is the
bare `qwen3.5_fp8_h100.sh` under benchmarks/single_node/agentic/ —
the h100 launchers don't support framework-tagged script names, and
this matches the precedent set by qwen3.5_fp8_b200.sh (which is the
sglang-agentic recipe under the same bare name).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
Signed-off-by: Cam Quilici <cjquilici@gmail.com>
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -9399,6 +9399,31 @@ qwen3.5-fp8-h100-sglang:
       search-space:
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
 
+# Agentic-coding sibling of qwen3.5-fp8-h100-sglang. The base entry is left
+# byte-identical to origin/main so its fixed-seq-len sweep is unaffected.
+# Differences from the base entry:
+#   - scenarios: fixed-seq-len replaced with agentic-coding.
+#   - runner: 'h100' -> 'h100-dgxc' (agentic runs need the dgxc-slurm cluster).
+# The image is the same as the base entry (lmsysorg/sglang:v0.5.12-cu130).
+# Both arms stop at conc 16: the range is kept conservative for H100's 80 GB
+# HBM3 under the long-ISL with-subagents corpus, and high-conc + hicache tends
+# to flake on first runs while conc 16 already covers the cliff. The bench
+# script sets WEKA_LOADER_OVERRIDE to the 256k-capped corpus variant.
+qwen3.5-fp8-h100-sglang-agentic:
+  image: lmsysorg/sglang:v0.5.12-cu130
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: h100-dgxc
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 8, offloading: none,    conc-list: [1, 2, 4, 8, 16] }
+      - { tp: 8, ep: 8, offloading: hicache, conc-list: [4, 8, 16] }
+
 qwen3.5-fp8-h100-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
@@ -0,0 +1,128 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for Qwen3.5 FP8 on H100 using SGLang.
+#
+# H100 has 80 GB HBM3 (vs B300's 288 GB HBM3e), so weights + KV fit tighter.
+# Mem-fraction-static lowered to 0.75 and chunked-prefill-size halved to
+# 8192 (mirrors fixed_seq_len/qwen3.5_fp8_h100.sh). Attention backend is
+# flashinfer (sm_90); the trtllm_mha path is Blackwell-only.
+#
+# Required env vars (validated below, before any download or install step):
+#   MODEL, TP, EP_SIZE, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, PORT
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV only with radix cache disabled.
+#   hicache - SGLang HiCache with local CPU hierarchical cache.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE PORT
+
+SCHEDULER_RECV_INTERVAL="${SCHEDULER_RECV_INTERVAL:-10}"
+# An unset, empty, or "0" MAX_MODEL_LEN falls back to 131072 tokens.
+case "${MAX_MODEL_LEN:-}" in
+    "" | 0) MAX_MODEL_LEN=131072 ;;
+esac
+
+[[ -z "${SLURM_JOB_ID:-}" ]] || echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+
+# Absolute MODEL paths are used as-is; anything else goes through `hf download`.
+[[ "$MODEL" == /* ]] || hf download "$MODEL"
+nvidia-smi
+
+# ---- Resolve traces and install deps ----------------------------------------
+# The server context length defaults to 131072 tokens on H100 (HBM-bound),
+# while the unfiltered with-subagents corpus contains requests of up to
+# ~1M proxy tokens that the server would reject. Select the 256k-capped
+# variant (470 traces, max in+out <= 256k) instead: some requests still
+# exceed 131k, but far fewer than with the unfiltered corpus.
+WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+export WEKA_LOADER_OVERRIDE
+resolve_trace_source
+install_agentic_deps
+
+# ---- Server config ----------------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+CACHE_ARGS=()
+case "$OFFLOADING" in
+    none)
+        CACHE_ARGS=(--disable-radix-cache)
+        ;;
+    hicache)
+        # HiCache extends RadixAttention, so do not pass --disable-radix-cache.
+        # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per
+        # TP rank (one KV, one Mamba), so the workflow's generic
+        # TOTAL_CPU_DRAM_GB is divided per rank and per pool here. Set
+        # HICACHE_TOTAL_CPU_DRAM_GB to override it (H100 hosts: ~1.5-2 TB usable).
+        TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-$TOTAL_CPU_DRAM_GB}"
+        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
+        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
+            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
+            exit 1
+        fi
+        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
+        CACHE_ARGS=(
+            --page-size 64
+            --enable-hierarchical-cache
+            --hicache-size "$HICACHE_SIZE_GB"
+            --hicache-io-backend kernel
+            --hicache-mem-layout page_first
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+        )
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
+echo "Starting SGLang server..."
+export PYTHONNOUSERSITE=1  # Python then leaves the user site-packages directory off sys.path
+# Turn xtrace off without tracing the "set +x" itself; the exact argv is logged via printf %q below.
+{ set +x; } 2>/dev/null
+SGLANG_CMD=(
+    python3 -m sglang.launch_server
+    --model-path="$MODEL"
+    --host=0.0.0.0
+    --port="$PORT"
+    --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8"  # fixed name, independent of MODEL (which may be a local path)
+    --trust-remote-code
+    --tensor-parallel-size="$TP"
+    --data-parallel-size=1
+    --expert-parallel-size="$EP_SIZE"
+    --quantization fp8
+    --kv-cache-dtype fp8_e4m3
+    --mamba-ssm-dtype bfloat16
+    --attention-backend flashinfer
+    --enable-flashinfer-allreduce-fusion
+    --cuda-graph-max-bs "$CONC"  # CUDA-graph batch size and the running-request cap both follow CONC
+    --max-running-requests "$CONC"
+    --max-prefill-tokens 8192
+    --chunked-prefill-size 8192
+    --mem-fraction-static 0.75
+    --stream-interval 50
+    --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL"
+    --tokenizer-worker-num 6
+    --tokenizer-path "$MODEL"
+    --context-length "$MAX_MODEL_LEN"
+    --enable-metrics
+    "${CACHE_ARGS[@]}"  # set by the OFFLOADING case above; never empty on the paths that reach here
+)
+printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt"  # record the shell-quoted command with the results
+printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt"
+"${SGLANG_CMD[@]}" > "$SERVER_LOG" 2>&1 &  # server runs in the background; stdout+stderr go to SERVER_LOG
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+# wait_for_server_ready is not defined in this file (presumably provided by benchmark_lib.sh).
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+
+run_agentic_replay_and_write_outputs "$RESULT_DIR"