Skip to content

Commit 34cd6b0

Browse files
seungrokjclaude
andcommitted
[AMD] glm5.1-fp4-mi355x-sglang-agentic: add hicache variant config and update scripts
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent dc25a0b commit 34cd6b0

3 files changed

Lines changed: 116 additions & 10 deletions

File tree

.github/configs/amd-master.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2510,6 +2510,23 @@ glm5.1-fp4-mi355x-sglang-agentic:
25102510
# sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
25112511
- { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
25122512

2513+
# target
2514+
glm5.1-fp4-mi355x-sglang-agentic-hicache:
2515+
image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
2516+
model: amd/GLM-5.1-MXFP4
2517+
model-prefix: glm5.1
2518+
runner: mi355x
2519+
precision: fp4
2520+
framework: sglang
2521+
multinode: false
2522+
scenarios:
2523+
agentic-coding:
2524+
- duration: 1800
2525+
search-space:
2526+
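# sglang manages KV eviction. NOTE(review): the rest of this comment ("mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively") was copied from the tp=4 entry above; this variant sweeps tp=2 up to conc=48 — confirm the intended cap.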
2527+
- { tp: 2, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] }
2528+
- { tp: 2, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48] }
2529+
25132530
kimik2.5-fp4-mi355x-vllm-agentic:
25142531
image: vllm/vllm-openai-rocm:v0.22.0
25152532
model: amd/Kimi-K2.5-MXFP4

benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh

Lines changed: 87 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,29 @@
22
set -euo pipefail
33
set -x
44

5-
# Agentic trace replay benchmark for GLM-5.1 FP4 on MI355X using SGLang.
5+
# Agentic trace replay benchmark for GLM-5.1 FP4 on MI355X using SGLang.
6+
#
7+
# Server recipe (see the launch_server invocation below): tilelang NSA
8+
# prefill/decode backends, fp8_e4m3 KV cache, mem-fraction-static 0.85, and
9+
# the glm47 tool-call / glm45 reasoning parsers.
10+
# The agentic harness (resolve_trace_source / build_replay_cmd /
11+
# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and
12+
# --disable-radix-cache is dropped because agentic replay needs prefix reuse.
613
#
714
# Required env vars:
8-
# MODEL, TP, CONC, RESULT_DIR
15+
# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION
# Optional env vars (defaults): PORT (8888), DURATION (1800)
16+
#
17+
# OFFLOADING values:
18+
# none - SGLang GPU KV with the default RadixAttention prefix cache.
19+
# hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.
920

1021
source "$(dirname "$0")/../../benchmark_lib.sh"
1122

12-
check_env_vars MODEL TP CONC RESULT_DIR DURATION
23+
check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
1324

14-
if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
15-
MAX_MODEL_LEN=131072
16-
fi
25+
PORT=${PORT:-8888}
26+
DURATION=${DURATION:-1800}
27+
EP_SIZE=${EP_SIZE:-1}
1728

1829
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
1930
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -30,8 +41,16 @@ else
3041
hf download "$MODEL"
3142
export MODEL_PATH="$MODEL"
3243
fi
44+
3345
rocm-smi || true
3446
amd-smi || true
47+
# ---- Select trace corpus ----------------------------------------------------
48+
# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
49+
# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
50+
# signal at high concurrency.
51+
#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
52+
# Active override: the 060226 variant of the 256k-capped corpus.
53+
export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
3554

3655
# ---- Resolve traces and install deps ----------------------------------------
3756
resolve_trace_source
@@ -48,26 +67,85 @@ mkdir -p "$RESULT_DIR"
4867

4968
pip install -U transformers
5069

70+
# ---- Select KV-cache offloading mode ----------------------------------------
# Inputs : OFFLOADING (none|hicache), CONC, TP, TOTAL_CPU_DRAM_GB, and the
#          optional HICACHE_* overrides read below.
# Outputs: CACHE_ARGS / WARMUP_ARGS (extra launch_server flags) and
#          CUDA_GRAPH_MAX_BS (graph-capture batch-size cap).
# Exits 1 on an unsupported OFFLOADING value or a HiCache size below 1 GB.
CACHE_ARGS=()
WARMUP_ARGS=()
CUDA_GRAPH_MAX_BS="$CONC"
case "$OFFLOADING" in
  none)
    # Leave SGLang's default RadixAttention prefix cache on — agentic
    # replay needs it; --disable-radix-cache would zero the hit rate.
    ;;
  hicache)
    # GLM-5.1 FP4 uses a standard transformer (no hybrid Mamba path),
    # so one HiCache host pool per TP rank is sufficient.
    # The node-total DRAM budget divides by TP and host-pool count.
    #
    # Honor the caller-supplied TOTAL_CPU_DRAM_GB (a required env var);
    # 3000 is only a fallback. It used to be assigned unconditionally,
    # which silently discarded the configured budget.
    TOTAL_CPU_DRAM_GB="${TOTAL_CPU_DRAM_GB:-3000}"
    HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}"
    HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-500}}"
    HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
    # GLM-5.1 uses standard paged attention (no no_buffer scheduler constraint),
    # so page_size can be left at the default. Keep the safer direct/layer_first
    # copy path on ROCm.
    HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
    HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
    HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
    # Per-rank, per-pool share of the node budget, unless set explicitly.
    HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
    # Clamp to the per-rank/pool ceiling.
    if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
      HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
    fi
    # Integer division can round down to 0 on small budgets; fail loudly.
    if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
      echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
      exit 1
    fi
    echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
    CACHE_ARGS=(
      --page-size "$HICACHE_PAGE_SIZE"
      --enable-hierarchical-cache
      --hicache-size "$HICACHE_SIZE_GB"
      --hicache-io-backend "$HICACHE_IO_BACKEND"
      --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
      --hicache-write-policy "$HICACHE_WRITE_POLICY"
    )
    # HiCache startup reaches API readiness but SGLang's internal warmup
    # request can time out on this path; let aiperf own benchmark traffic.
    WARMUP_ARGS=(--skip-server-warmup)
    # Don't force ROCm graph capture at every high concurrency point; conc=16
    # is the highest known-good capture size for this model/server path.
    HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}"
    if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
      CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
    fi
    ;;
  *)
    echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
    exit 1
    ;;
esac
124+
51125
# ---- Launch SGLang server ---------------------------------------------------
# Starts sglang.launch_server in the background, redirects its output to
# $SERVER_LOG, and records the PID in SERVER_PID.
echo "Starting SGLang server..."
export PYTHONNOUSERSITE=1

# NOTE(review): transformers is also upgraded earlier in this script; this
# second run happens after PYTHONNOUSERSITE=1 is exported — confirm whether
# both invocations are needed.
pip install -U transformers

# --cuda-graph-max-bs uses CUDA_GRAPH_MAX_BS (computed from CONC and capped in
# the hicache path) rather than raw CONC, so the cap actually takes effect.
# The ${ARR[@]+"${ARR[@]}"} form expands to nothing for an empty array without
# tripping `set -u` on Bash releases older than 4.4.
python3 -m sglang.launch_server \
  --model-path "$MODEL_PATH" \
  --served-model-name "$MODEL" \
  --host=0.0.0.0 \
  --port "$PORT" \
  --tensor-parallel-size "$TP" \
  --trust-remote-code \
  --cuda-graph-max-bs "${CUDA_GRAPH_MAX_BS:-$CONC}" \
  --max-running-requests "$CONC" \
  --mem-fraction-static 0.85 \
  --tool-call-parser glm47 \
  --reasoning-parser glm45 \
  --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \
  --nsa-prefill-backend tilelang \
  --nsa-decode-backend tilelang \
  --watchdog-timeout 1200 \
  --kv-cache-dtype fp8_e4m3 \
  --tokenizer-worker-num "$((TP * 2))" \
  ${CACHE_ARGS[@]+"${CACHE_ARGS[@]}"} \
  ${WARMUP_ARGS[@]+"${WARMUP_ARGS[@]}"} \
  --enable-metrics > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
echo "Server PID: $SERVER_PID"
@@ -77,4 +155,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
77155
# ---- Run benchmark ----------------------------------------------------------
78156
build_replay_cmd "$RESULT_DIR"
79157

80-
run_agentic_replay_and_write_outputs "$RESULT_DIR"
158+
run_agentic_replay_and_write_outputs "$RESULT_DIR"

benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,18 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
3232
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
3333
fi
3434

35-
if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
35+
# Resolve MODEL_PATH, the path/identifier the server is launched with.
#
# `hf download` creates the target dir if missing and is itself idempotent.
# - MODEL_PATH set: download into it only when the directory is missing or
#   empty.
# - MODEL_PATH unset (stand-alone runs): fall back to downloading into the
#   default Hugging Face cache and launch with MODEL itself. An absolute-path
#   MODEL is assumed to be a local checkout already and is not downloaded,
#   matching the `!= /*` guard this block replaced.
if [[ -n "${MODEL_PATH:-}" ]]; then
  if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
    hf download "$MODEL" --local-dir "$MODEL_PATH"
  fi
else
  if [[ "$MODEL" != /* ]]; then
    hf download "$MODEL"
  fi
  export MODEL_PATH="$MODEL"
fi
46+
3647
rocm-smi || true
3748
amd-smi || true
3849

0 commit comments

Comments
 (0)