Skip to content

Commit b3b3476

Browse files
seungrokjclaude
andcommitted
[AMD] qwen3.5-fp4-mi355x-sglang-agentic: add hicache variant config and script
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 3707963 commit b3b3476

2 files changed

Lines changed: 166 additions & 0 deletions

File tree

.github/configs/amd-master.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,22 @@ qwen3.5-fp4-mi355x-sglang:
433433
- { tp: 2, conc-start: 4, conc-end: 256 }
434434
- { tp: 4, conc-start: 4, conc-end: 16 }
435435

436+
# target: HiCache A/B variant of qwen3.5-fp4-mi355x-sglang for the agentic-coding
# scenario. The two search-space rows are identical except for `offloading`
# (none vs. hicache), so results can be compared point-for-point.
437+
qwen3.5-fp4-mi355x-sglang-hicache:
438+
image: lmsysorg/sglang:v0.5.12-rocm720-mi35x
439+
model: amd/Qwen3.5-397B-A17B-MXFP4
440+
model-prefix: qwen3.5
441+
runner: mi355x
442+
precision: fp4
443+
framework: sglang
444+
multinode: false
445+
scenarios:
446+
agentic-coding:
447+
- duration: 1800
448+
search-space:
449+
# Baseline: no offloading.
- { tp: 2, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] }
450+
# Same sweep with offloading set to hicache.
- { tp: 2, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] }
451+
436452
qwen3.5-fp4-mi355x-atom:
437453
image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
438454
model: amd/Qwen3.5-397B-A17B-MXFP4
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
#!/usr/bin/env bash
2+
set -euo pipefail
3+
set -x
4+
5+
# Agentic trace replay benchmark for Qwen3.5 FP4 (MXFP4) on MI355X using SGLang.
6+
#
7+
# Base server recipe is derived from the upstream MI300X reference
8+
# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe).
9+
# As launched by this script: aiter attention backend, --mem-fraction-static 0.8
# (the reference used 0.75), and no allreduce-fusion flag is passed.
10+
# The agentic harness (resolve_trace_source / build_replay_cmd /
11+
# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and
12+
# --disable-radix-cache is dropped because agentic replay needs prefix reuse.
13+
#
14+
# Required env vars (enforced by check_env_vars below):
15+
# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION
#   - TOTAL_CPU_DRAM_GB is reassigned inside the hicache branch further down.
#   - DP_ATTENTION is not referenced directly in this script; presumably it is
#     consumed by benchmark_lib.sh helpers -- TODO confirm.
# Optional env vars (defaults): PORT (8888), DURATION (1800),
#   SCHEDULER_RECV_INTERVAL (30), MODEL_PATH (see the download step).
16+
#
17+
# OFFLOADING values:
18+
# none - SGLang GPU KV with the default RadixAttention prefix cache.
19+
# hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.
20+
21+
# Shared helpers used by this script are loaded from here, resolved relative
# to the script's own location.
source "$(dirname "$0")/../../benchmark_lib.sh"
22+
23+
check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
24+
25+
PORT=${PORT:-8888}
26+
DURATION=${DURATION:-1800}
27+
# NOTE(review): EP_SIZE is already listed in check_env_vars above, so this
# default is only reachable if that helper tolerates unset/empty values -- confirm.
EP_SIZE=${EP_SIZE:-1}
28+
29+
SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30}
30+
31+
# Under Slurm, log the job id and node name for traceability.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
32+
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
33+
fi
34+
35+
# `hf download` creates the target dir if missing and is itself idempotent.
36+
# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
# download (no --local-dir) and use the model id itself as MODEL_PATH.
37+
# Either way, MODEL_PATH is set and non-empty once this block has run.
38+
if [[ -n "${MODEL_PATH:-}" ]]; then
39+
# Download only when the directory is missing or empty.
if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
40+
hf download "$MODEL" --local-dir "$MODEL_PATH"
41+
fi
42+
else
43+
hf download "$MODEL"
44+
export MODEL_PATH="$MODEL"
45+
fi
46+
47+
# Best-effort GPU state dump for the job log; `|| true` keeps `set -e` from
# aborting when either tool is missing or fails.
rocm-smi || true
48+
amd-smi || true
49+
50+
# ---- Select trace corpus ----------------------------------------------------
51+
# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
52+
# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
53+
# signal at high concurrency.
# Previous override, kept for reference:
54+
#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
55+
# Active override: the 060226 variant of the 256k-capped loader.
# NOTE(review): the "470 traces" figure above may describe the earlier 256k
# loader rather than this one -- confirm.
56+
export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
57+
58+
# ---- Resolve traces and install deps ----------------------------------------
# Both helpers are called with no arguments and are not defined in this script.
59+
resolve_trace_source
60+
install_agentic_deps
61+
62+
# ---- Cache / offload config -------------------------------------------------
# Translates OFFLOADING into server flags.
#   Reads:  RESULT_DIR, CONC, OFFLOADING, TP and the optional HICACHE_* overrides.
#   Writes: SERVER_LOG, CACHE_ARGS, WARMUP_ARGS, CUDA_GRAPH_MAX_BS
#           (plus TOTAL_CPU_DRAM_GB and the HICACHE_* values in hicache mode).
#   Exits 1 on an unsupported OFFLOADING value or an invalid numeric setting.

# Abort unless the variable named by $1 holds a positive decimal integer.
# Without this, `[ "$x" -lt 1 ]` on a non-numeric value returns status 2 inside
# `if`, which is treated as "false" and silently skips the guard.
_hicache_require_positive_int() {
    local name=$1
    local value=${!name-}
    if [[ ! "$value" =~ ^[1-9][0-9]*$ ]]; then
        echo "Error: $name must be a positive integer, got '$value'" >&2
        exit 1
    fi
}

SERVER_LOG="$RESULT_DIR/server.log"
mkdir -p "$RESULT_DIR"

CACHE_ARGS=()
WARMUP_ARGS=()
CUDA_GRAPH_MAX_BS="$CONC"
case "$OFFLOADING" in
    none)
        # Leave SGLang's default RadixAttention prefix cache on — agentic
        # replay needs it; --disable-radix-cache would zero the hit rate.
        ;;
    hicache)
        # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per
        # TP rank (one hierarchical KV, one hierarchical Mamba), so the
        # node-total DRAM budget divides by TP and the host-pool count.
        #
        # The budget used for HiCache sizing replaces the caller-supplied
        # TOTAL_CPU_DRAM_GB. It defaults to 3000 GB (the value this script has
        # always used) and can now be changed via HICACHE_TOTAL_CPU_DRAM_GB
        # instead of editing the script.
        TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-3000}"
        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}"
        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
        # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which
        # requires page_size=1. Keep the safer direct/layer_first copy path;
        # kernel/page_first faults on first prefill in this mode on ROCm.
        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"

        # Validate every operand of the arithmetic below so a bad value gives
        # a clear message instead of "division by 0" or a skipped guard.
        _hicache_require_positive_int TP
        _hicache_require_positive_int CONC
        _hicache_require_positive_int TOTAL_CPU_DRAM_GB
        _hicache_require_positive_int HICACHE_HOST_POOL_COUNT
        _hicache_require_positive_int HICACHE_MAX_SIZE_GB_PER_RANK_POOL

        # Per-rank, per-pool size: explicit override wins, otherwise an even
        # split of the budget (integer division, rounds down).
        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
        if [[ ! "$HICACHE_SIZE_GB" =~ ^(0|[1-9][0-9]*)$ ]]; then
            echo "Error: HICACHE_SIZE_GB must be a non-negative integer, got '$HICACHE_SIZE_GB'" >&2
            exit 1
        fi
        # Clamp to the per-rank-per-pool ceiling.
        if (( HICACHE_SIZE_GB > HICACHE_MAX_SIZE_GB_PER_RANK_POOL )); then
            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
        fi
        if (( HICACHE_SIZE_GB < 1 )); then
            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
            exit 1
        fi
        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
        CACHE_ARGS=(
            --page-size "$HICACHE_PAGE_SIZE"
            --enable-hierarchical-cache
            --hicache-size "$HICACHE_SIZE_GB"
            --hicache-io-backend "$HICACHE_IO_BACKEND"
            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
            --hicache-write-policy "$HICACHE_WRITE_POLICY"
        )
        # HiCache startup reaches API readiness but SGLang's internal warmup
        # request can time out on this path; let aiperf own benchmark traffic.
        WARMUP_ARGS=(--skip-server-warmup)
        # Don't force ROCm graph capture at every high concurrency point:
        # CUDA_GRAPH_MAX_BS becomes min(CONC, HICACHE_CUDA_GRAPH_MAX_BS).
        # The default ceiling is 256; set HICACHE_CUDA_GRAPH_MAX_BS lower
        # (e.g. 16) if capture at larger batch sizes is unstable.
        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}"
        _hicache_require_positive_int HICACHE_CUDA_GRAPH_MAX_BS
        if (( HICACHE_CUDA_GRAPH_MAX_BS < CUDA_GRAPH_MAX_BS )); then
            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
        fi
        ;;
    *)
        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
        exit 1
        ;;
esac
120+
121+
# ---- Launch server ----------------------------------------------------------
echo "Starting SGLang server..."
# Keep Python from importing packages out of the per-user site-packages dir.
export PYTHONNOUSERSITE=1

# Notes on the flags below:
#   --model-path         uses MODEL_PATH, which the download step resolved
#                        (local directory, or the model id for stand-alone runs).
#   --served-model-name  pins the API-visible name to MODEL so it stays the same
#                        whether weights load from a local dir or the hub id.
#   --cuda-graph-max-bs  uses CUDA_GRAPH_MAX_BS, which equals CONC unless the
#                        hicache branch capped it.
# The server runs in the background; stdout and stderr go to SERVER_LOG.
python3 -m sglang.launch_server \
    --attention-backend aiter \
    --model-path "$MODEL_PATH" \
    --served-model-name "$MODEL" \
    --host=0.0.0.0 \
    --port "$PORT" \
    --tensor-parallel-size "$TP" \
    --ep-size "$EP_SIZE" \
    --trust-remote-code \
    --model-loader-extra-config '{"enable_multithread_load": true}' \
    --watchdog-timeout 1200 \
    --tokenizer-worker-num 6 \
    --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" \
    --max-running-requests "$CONC" \
    --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" \
    --mem-fraction-static 0.8 \
    "${CACHE_ARGS[@]}" \
    "${WARMUP_ARGS[@]}" \
    --enable-metrics > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
echo "Server PID: $SERVER_PID"

# Hand port, log path and PID to the readiness helper before benchmarking.
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# ---- Run benchmark ----------------------------------------------------------
# Both helpers receive RESULT_DIR as their only argument.
build_replay_cmd "$RESULT_DIR"

run_agentic_replay_and_write_outputs "$RESULT_DIR"

0 commit comments

Comments
 (0)