Skip to content

Commit 3450ced

Browse files
cquil11, github-actions[bot], claude
authored
[AMD] Update AMD MI300X, MI325X, MI355X GPT-OSS vLLM images to v0.16.0 (#806)
* Update AMD MI300X, MI325X, MI355X GPT-OSS vLLM images to v0.16.0

  MI300X & MI325X: Bump image from v0.15.1 to v0.16.0 (scripts compatible)

  MI355X: Major rewrite from custom ROCm v0.10.1 to upstream v0.16.0:
  - Fix env var names (VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION)
  - Add missing VLLM_ROCM_USE_AITER=1 master toggle
  - Remove non-existent VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4
  - Remove deprecated --max-seq-len-to-capture and --async-scheduling
  - Simplify compilation config to cudagraph_mode FULL_AND_PIECEWISE
  - Add HIP_VISIBLE_DEVICES Ray compatibility and MEC firmware check

  Closes #803

  Co-authored-by: Cameron Quilici <cquil11@users.noreply.github.com>

* Update perf-changelog.yaml

* Update perf-changelog PR links to #806

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Update gptoss_fp4_mi355x.sh

* Consolidate perf-changelog entries for MI300X/MI325X/MI355X into one

  Co-authored-by: Cameron Quilici <cquil11@users.noreply.github.com>

* update HIP VISIBLE DEVICES

---------

Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com>
Co-authored-by: Cameron Quilici <cquil11@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c5bbe05 commit 3450ced

3 files changed

Lines changed: 32 additions & 15 deletions

File tree

.github/configs/amd-master.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ minimaxm2.5-fp8-mi355x-vllm:
230230
- { tp: 4, conc-start: 4, conc-end: 64 }
231231

232232
gptoss-fp4-mi300x-vllm:
233-
image: vllm/vllm-openai-rocm:v0.15.1
233+
image: vllm/vllm-openai-rocm:v0.16.0
234234
model: openai/gpt-oss-120b
235235
model-prefix: gptoss
236236
runner: mi300x
@@ -261,7 +261,7 @@ gptoss-fp4-mi300x-vllm:
261261
- { tp: 8, conc-start: 4, conc-end: 16 }
262262

263263
gptoss-fp4-mi325x-vllm:
264-
image: vllm/vllm-openai-rocm:v0.15.1
264+
image: vllm/vllm-openai-rocm:v0.16.0
265265
model: openai/gpt-oss-120b
266266
model-prefix: gptoss
267267
runner: mi325x
@@ -292,7 +292,7 @@ gptoss-fp4-mi325x-vllm:
292292
- { tp: 8, conc-start: 4, conc-end: 16 }
293293

294294
gptoss-fp4-mi355x-vllm:
295-
image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1
295+
image: vllm/vllm-openai-rocm:v0.16.0
296296
model: openai/gpt-oss-120b
297297
model-prefix: gptoss
298298
runner: mi355x

benchmarks/single_node/gptoss_fp4_mi355x.sh

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,24 @@ fi
1818

1919
hf download "$MODEL"
2020

21-
cat > config.yaml << EOF
22-
compilation-config: '{"compile_sizes":[1,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,256,512,1024,2048,8192] , "cudagraph_capture_sizes":[1,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,136,144,152,160,168,176,184,192,200,208,216,224,232,240,248,256,264,272,280,288,296,304,312,320,328,336,344,352,360,368,376,384,392,400,408,416,424,432,440,448,456,464,472,480,488,496,504,512,520,528,536,544,552,560,568,576,584,592,600,608,616,624,632,640,648,656,664,672,680,688,696,704,712,720,728,736,744,752,760,768,776,784,792,800,808,816,824,832,840,848,856,864,872,880,888,896,904,912,920,928,936,944,952,960,968,976,984,992,1000,1008,1016,1024,2048,4096,8192] , "cudagraph_mode": "FULL_AND_PIECEWISE"}'
23-
EOF
21+
# If the machine runs a MEC FW older than 177, RCCL
22+
# cannot reclaim some memory.
23+
# Disable that feature to avoid crashes.
24+
# This is related to the changes in the driver at:
25+
# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates
26+
version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'`
27+
if [[ "$version" == "" || $version -lt 177 ]]; then
28+
export HSA_NO_SCRATCH_RECLAIM=1
29+
fi
2430

25-
sleep 5
26-
cat config.yaml
31+
# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+
32+
if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
33+
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
34+
fi
2735

28-
export VLLM_USE_AITER_UNIFIED_ATTENTION=1
36+
export VLLM_ROCM_USE_AITER=1
37+
export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
2938
export VLLM_ROCM_USE_AITER_MHA=0
30-
export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1
3139

3240
SERVER_LOG=/workspace/server.log
3341
PORT=${PORT:-8888}
@@ -37,12 +45,10 @@ vllm serve $MODEL --port $PORT \
3745
--tensor-parallel-size=$TP \
3846
--gpu-memory-utilization 0.95 \
3947
--max-model-len $MAX_MODEL_LEN \
40-
--max-seq-len-to-capture $MAX_MODEL_LEN \
41-
--config config.yaml \
48+
--compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
4249
--block-size=64 \
4350
--no-enable-prefix-caching \
44-
--disable-log-requests \
45-
--async-scheduling > $SERVER_LOG 2>&1 &
51+
--disable-log-requests > $SERVER_LOG 2>&1 &
4652

4753
SERVER_PID=$!
4854

perf-changelog.yaml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -733,4 +733,15 @@
733733
- "Extend concurrency range to conc-end: 256 across all sequence lengths (1k1k, 1k8k, 8k1k)"
734734
- "Fix MTP 1k8k conc-start from 256 to 4 to enable full concurrency sweep"
735735
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/699
736-
736+
737+
- config-keys:
738+
- gptoss-fp4-mi300x-vllm
739+
- gptoss-fp4-mi325x-vllm
740+
- gptoss-fp4-mi355x-vllm
741+
description:
742+
- "Update AMD GPT-OSS vLLM images to v0.16.0 (MI300X/MI325X from v0.15.1, MI355X from custom v0.10.1)"
743+
- "MI355X: Fix env vars (VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION), add VLLM_ROCM_USE_AITER=1, remove deprecated flags"
744+
- "MI355X: Simplify compilation config to cudagraph_mode FULL_AND_PIECEWISE, add HIP_VISIBLE_DEVICES Ray fix"
745+
- "Gains: fused add+rmsnorm+pad for GPT-OSS (automatic via PassManager), AITER attention block size fix"
746+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/806
747+

0 commit comments

Comments
 (0)