Skip to content

Commit 8b3573b

Browse files
cquil11 and claude authored
[AMD/ROCM] ATOM support for new models: Kimi-K2.5 FP4 and MiniMax-M2.5 (#992)
* [AMD/ROCM] ATOM support for new models: Kimi-K2.5 FP4 and MiniMax-M2.5 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * update perf changelog --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 24eea11 commit 8b3573b

4 files changed

Lines changed: 210 additions & 0 deletions

File tree

.github/configs/amd-master.yaml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,26 @@ kimik2.5-fp4-mi355x-vllm:
313313
- { tp: 8, conc-start: 4, conc-end: 64 }
314314
- { tp: 4, conc-start: 4, conc-end: 64 }
315315

316+
kimik2.5-fp4-mi355x-atom:
317+
image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
318+
model: amd/Kimi-K2.5-MXFP4
319+
model-prefix: kimik2.5
320+
runner: mi355x
321+
precision: fp4
322+
framework: atom
323+
multinode: false
324+
seq-len-configs:
325+
- isl: 1024
326+
osl: 1024
327+
search-space:
328+
- { tp: 8, conc-start: 4, conc-end: 128 }
329+
- { tp: 4, conc-start: 4, conc-end: 128 }
330+
- isl: 8192
331+
osl: 1024
332+
search-space:
333+
- { tp: 8, conc-start: 4, conc-end: 128 }
334+
- { tp: 4, conc-start: 4, conc-end: 128 }
335+
316336
minimaxm2.5-fp8-mi355x-vllm:
317337
image: vllm/vllm-openai-rocm:v0.18.0
318338
model: MiniMaxAI/MiniMax-M2.5
@@ -335,6 +355,28 @@ minimaxm2.5-fp8-mi355x-vllm:
335355
- { tp: 4, conc-start: 4, conc-end: 64 }
336356
- { tp: 8, ep: 8, conc-start: 32, conc-end: 256 }
337357

358+
minimaxm2.5-fp8-mi355x-atom:
359+
image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
360+
model: MiniMaxAI/MiniMax-M2.5
361+
model-prefix: minimaxm2.5
362+
runner: mi355x
363+
precision: fp8
364+
framework: atom
365+
multinode: false
366+
seq-len-configs:
367+
- isl: 1024
368+
osl: 1024
369+
search-space:
370+
- { tp: 2, conc-start: 4, conc-end: 128 }
371+
- { tp: 4, conc-start: 4, conc-end: 128 }
372+
- { tp: 8, ep: 8, conc-start: 32, conc-end: 256 }
373+
- isl: 8192
374+
osl: 1024
375+
search-space:
376+
- { tp: 2, conc-start: 4, conc-end: 128 }
377+
- { tp: 4, conc-start: 4, conc-end: 128 }
378+
- { tp: 8, ep: 8, conc-start: 32, conc-end: 256 }
379+
338380
minimaxm2.5-fp8-mi300x-vllm:
339381
image: vllm/vllm-openai-rocm:v0.16.0
340382
model: MiniMaxAI/MiniMax-M2.5
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#!/usr/bin/env bash
#
# Single-node ATOM serving benchmark.
#
# Starts the ATOM OpenAI-compatible server for $MODEL in the background,
# waits for it to come up, runs the serving benchmark against it, and
# optionally runs an lm-eval pass afterwards.
#
# Environment variables handed to check_env_vars (presumably enforced as
# required by that helper -- confirm in benchmark_lib.sh):
#   MODEL, TP, CONC, ISL, OSL, RANDOM_RANGE_RATIO, RESULT_FILENAME,
#   EP_SIZE, DP_ATTENTION (DP_ATTENTION is only echoed by this script)
# Optional environment variables:
#   PORT      server port (default: 8888)
#   RUN_EVAL  set to "true" to run the evaluation after the benchmark

# Fail early with a clear message when the shared helper library is missing;
# otherwise every helper call below degrades into "command not found" and the
# server is still launched without any validation.
BENCHMARK_LIB="$(dirname "$0")/../benchmark_lib.sh"
if [[ ! -r "$BENCHMARK_LIB" ]]; then
  echo "error: cannot read helper library: $BENCHMARK_LIB" >&2
  exit 1
fi
# shellcheck source=/dev/null
source "$BENCHMARK_LIB"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  EP_SIZE \
  DP_ATTENTION

if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

export OMP_NUM_THREADS=1

# Optional server flags are collected in an array so each one reaches the
# server as its own argument without relying on unquoted word splitting.
SERVER_EXTRA_ARGS=()

# For ISL=1024/OSL=1024 no --max-model-len is passed; every other ISL/OSL
# combination gets an explicit 10240-token limit.
if [ "$ISL" != "1024" ] || [ "$OSL" != "1024" ]; then
  SERVER_EXTRA_ARGS+=(--max-model-len 10240)
fi

# Expert parallelism is only requested when EP_SIZE is greater than 1.
if [ "$EP_SIZE" -gt 1 ]; then
  SERVER_EXTRA_ARGS+=(--enable-expert-parallel)
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x

# Launch the server in the background; stdout and stderr both go to the log
# file that wait_for_server_ready is given below.
python3 -m atom.entrypoints.openai_server \
  --model "$MODEL" \
  --server-port "$PORT" \
  -tp "$TP" \
  --kv_cache_dtype fp8 \
  "${SERVER_EXTRA_ARGS[@]}" \
  --trust-remote-code \
  > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

export PYTHONDONTWRITEBYTECODE=1
# Ten prompts are issued per unit of concurrency.
run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/ \
  --trust-remote-code

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#!/usr/bin/env bash
#
# Single-node ATOM serving benchmark.
#
# Starts the ATOM OpenAI-compatible server for $MODEL in the background,
# waits for it to come up, runs the serving benchmark against it, and
# optionally runs an lm-eval pass afterwards.
#
# Environment variables handed to check_env_vars (presumably enforced as
# required by that helper -- confirm in benchmark_lib.sh):
#   MODEL, TP, CONC, ISL, OSL, RANDOM_RANGE_RATIO, RESULT_FILENAME,
#   EP_SIZE, DP_ATTENTION (DP_ATTENTION is only echoed by this script)
# Optional environment variables:
#   PORT      server port (default: 8888)
#   RUN_EVAL  set to "true" to run the evaluation after the benchmark

# Fail early with a clear message when the shared helper library is missing;
# otherwise every helper call below degrades into "command not found" and the
# server is still launched without any validation.
BENCHMARK_LIB="$(dirname "$0")/../benchmark_lib.sh"
if [[ ! -r "$BENCHMARK_LIB" ]]; then
  echo "error: cannot read helper library: $BENCHMARK_LIB" >&2
  exit 1
fi
# shellcheck source=/dev/null
source "$BENCHMARK_LIB"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  EP_SIZE \
  DP_ATTENTION

if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

export OMP_NUM_THREADS=1

# Optional server flags are collected in an array so each one reaches the
# server as its own argument without relying on unquoted word splitting.
SERVER_EXTRA_ARGS=()

# For ISL=1024/OSL=1024 no --max-model-len is passed; every other ISL/OSL
# combination gets an explicit 10240-token limit.
if [ "$ISL" != "1024" ] || [ "$OSL" != "1024" ]; then
  SERVER_EXTRA_ARGS+=(--max-model-len 10240)
fi

# Expert parallelism is only requested when EP_SIZE is greater than 1.
if [ "$EP_SIZE" -gt 1 ]; then
  SERVER_EXTRA_ARGS+=(--enable-expert-parallel)
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x

# Launch the server in the background; stdout and stderr both go to the log
# file that wait_for_server_ready is given below.
python3 -m atom.entrypoints.openai_server \
  --model "$MODEL" \
  --server-port "$PORT" \
  -tp "$TP" \
  --kv_cache_dtype fp8 \
  "${SERVER_EXTRA_ARGS[@]}" \
  --trust-remote-code \
  > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

export PYTHONDONTWRITEBYTECODE=1
# Ten prompts are issued per unit of concurrency.
run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/ \
  --trust-remote-code

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x

perf-changelog.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1227,3 +1227,11 @@
12271227
- "DeepSeek R1 MI355X FP8 ATOM-MTP config to support MTP 3 tokens"
12281228
- "Image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2"
12291229
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/984
1230+
1231+
- config-keys:
1232+
- kimik2.5-fp4-mi355x-atom
1233+
- minimaxm2.5-fp8-mi355x-atom
1234+
description:
1235+
- "New model support on ATOM framework"
1236+
- "Kimi-K2.5 FP4, and MiniMax-M2.5 FP8 configs added for MI355X ATOM"
1237+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/963

0 commit comments

Comments
 (0)