Skip to content

Commit 242ab88

Browse files
committed
Merge remote-tracking branch 'origin/main' into update-dsv4-trt-image-2dd03e6
2 parents 14a1bb3 + 1b23499 commit 242ab88

199 files changed

Lines changed: 7083 additions & 1559 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/configs/amd-master.yaml

Lines changed: 315 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2407,3 +2407,318 @@ glm5-fp8-mi325x-sglang-mtp:
24072407
osl: 1024
24082408
search-space:
24092409
- { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
2410+
2411+
# ============================================================================
2412+
# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries).
2413+
# Recipes that ALREADY existed on main were intentionally left at main's version
2414+
# to preserve main behavior; PR-branch modifications to those recipes are NOT
2415+
# brought in here.
2416+
# ============================================================================
2417+
2418+
qwen3.5-fp8-mi355x-sglang-agentic-hicache:
2419+
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
2420+
model: Qwen/Qwen3.5-397B-A17B-FP8
2421+
model-prefix: qwen3.5
2422+
runner: mi355x
2423+
precision: fp8
2424+
framework: sglang
2425+
multinode: false
2426+
scenarios:
2427+
agentic-coding:
2428+
- duration: 1800
2429+
search-space:
2430+
- { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
2431+
- { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
2432+
2433+
dsv4-fp4-mi355x-vllm-agentic:
2434+
image: vllm/vllm-openai-rocm:v0.21.0
2435+
model: deepseek-ai/DeepSeek-V4-Pro
2436+
model-prefix: dsv4
2437+
runner: mi355x
2438+
precision: fp4
2439+
framework: vllm
2440+
multinode: false
2441+
scenarios:
2442+
agentic-coding:
2443+
- duration: 1800
2444+
search-space:
2445+
- { tp: 8, offloading: none, conc-list: [1, 2, 4] }
2446+
- { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] }
2447+
- { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] }
2448+
2449+
dsr1-fp4-mi355x-sglang-disagg-mtp:
2450+
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
2451+
model: amd/DeepSeek-R1-0528-MXFP4-v2
2452+
model-prefix: dsr1
2453+
runner: mi355x-disagg
2454+
precision: fp4
2455+
framework: sglang-disagg
2456+
multinode: true
2457+
disagg: true
2458+
scenarios:
2459+
fixed-seq-len:
2460+
- isl: 1024
2461+
osl: 1024
2462+
search-space:
2463+
# MTP configurations
2464+
# 1P1D TP8
2465+
- spec-decoding: "mtp"
2466+
conc-list: [ 1, 2, 4, 8 ]
2467+
prefill:
2468+
num-worker: 1
2469+
tp: 8
2470+
ep: 1
2471+
dp-attn: false
2472+
additional-settings:
2473+
- "PREFILL_NODES=1"
2474+
decode:
2475+
num-worker: 1
2476+
tp: 8
2477+
ep: 1
2478+
dp-attn: false
2479+
additional-settings:
2480+
- "DECODE_NODES=1"
2481+
- "DECODE_MTP_SIZE=3"
2482+
2483+
# 1P2D TP8
2484+
- spec-decoding: "mtp"
2485+
conc-list: [ 2, 4, 8, 16, 32 ]
2486+
prefill:
2487+
num-worker: 1
2488+
tp: 8
2489+
ep: 1
2490+
dp-attn: false
2491+
additional-settings:
2492+
- "PREFILL_NODES=1"
2493+
decode:
2494+
num-worker: 2
2495+
tp: 8
2496+
ep: 1
2497+
dp-attn: false
2498+
additional-settings:
2499+
- "DECODE_NODES=2"
2500+
- "DECODE_MTP_SIZE=3"
2501+
2502+
# 1P2D TP8 (higher concurrency, DECODE_MTP_SIZE=2)
2503+
- spec-decoding: "mtp"
2504+
conc-list: [ 64, 128, 256 ]
2505+
prefill:
2506+
num-worker: 1
2507+
tp: 8
2508+
ep: 1
2509+
dp-attn: false
2510+
additional-settings:
2511+
- "PREFILL_NODES=1"
2512+
decode:
2513+
num-worker: 2
2514+
tp: 8
2515+
ep: 1
2516+
dp-attn: false
2517+
additional-settings:
2518+
- "DECODE_NODES=2"
2519+
- "DECODE_MTP_SIZE=2"
2520+
2521+
# 1P2D: TP4 prefill, TP8 decode
2522+
- spec-decoding: "mtp"
2523+
conc-list: [ 64, 128, 256 ]
2524+
prefill:
2525+
num-worker: 1
2526+
tp: 4
2527+
ep: 1
2528+
dp-attn: false
2529+
additional-settings:
2530+
- "PREFILL_NODES=1"
2531+
decode:
2532+
num-worker: 2
2533+
tp: 8
2534+
ep: 1
2535+
dp-attn: false
2536+
additional-settings:
2537+
- "DECODE_NODES=2"
2538+
- "DECODE_MTP_SIZE=2"
2539+
2540+
# 1*DEP4 + 1*DEP8
2541+
- spec-decoding: "mtp"
2542+
conc-list: [ 1024, 2048, 4096 ]
2543+
prefill:
2544+
num-worker: 1
2545+
tp: 4
2546+
ep: 4
2547+
dp-attn: true
2548+
additional-settings:
2549+
- "PREFILL_NODES=1"
2550+
decode:
2551+
num-worker: 1
2552+
tp: 8
2553+
ep: 8
2554+
dp-attn: true
2555+
additional-settings:
2556+
- "DECODE_NODES=1"
2557+
- "DECODE_MTP_SIZE=1"
2558+
2559+
- isl: 8192
2560+
osl: 1024
2561+
search-space:
2562+
# MTP configurations
2563+
# 1P1D pure TP8
2564+
- spec-decoding: "mtp"
2565+
conc-list: [ 1, 2, 4, 8 ]
2566+
prefill:
2567+
num-worker: 1
2568+
tp: 8
2569+
ep: 1
2570+
dp-attn: false
2571+
additional-settings:
2572+
- "PREFILL_NODES=1"
2573+
decode:
2574+
num-worker: 1
2575+
tp: 8
2576+
ep: 1
2577+
dp-attn: false
2578+
additional-settings:
2579+
- "DECODE_NODES=1"
2580+
- "DECODE_MTP_SIZE=3"
2581+
2582+
# 1P2D TP8
2583+
- spec-decoding: "mtp"
2584+
conc-list: [ 2, 4, 8, 16, 32 ]
2585+
prefill:
2586+
num-worker: 1
2587+
tp: 8
2588+
ep: 1
2589+
dp-attn: false
2590+
additional-settings:
2591+
- "PREFILL_NODES=1"
2592+
decode:
2593+
num-worker: 2
2594+
tp: 8
2595+
ep: 1
2596+
dp-attn: false
2597+
additional-settings:
2598+
- "DECODE_NODES=2"
2599+
- "DECODE_MTP_SIZE=3"
2600+
2601+
# 1P2D TP8 (higher concurrency, DECODE_MTP_SIZE=2)
2602+
- spec-decoding: "mtp"
2603+
conc-list: [ 64, 128, 256 ]
2604+
prefill:
2605+
num-worker: 1
2606+
tp: 8
2607+
ep: 1
2608+
dp-attn: false
2609+
additional-settings:
2610+
- "PREFILL_NODES=1"
2611+
decode:
2612+
num-worker: 2
2613+
tp: 8
2614+
ep: 1
2615+
dp-attn: false
2616+
additional-settings:
2617+
- "DECODE_NODES=2"
2618+
- "DECODE_MTP_SIZE=2"
2619+
2620+
# 1*DEP8 + 1*DEP8
2621+
- spec-decoding: "mtp"
2622+
conc-list: [ 128, 512 ]
2623+
prefill:
2624+
num-worker: 1
2625+
tp: 8
2626+
ep: 8
2627+
dp-attn: true
2628+
additional-settings:
2629+
- "PREFILL_NODES=1"
2630+
decode:
2631+
num-worker: 1
2632+
tp: 8
2633+
ep: 8
2634+
dp-attn: true
2635+
additional-settings:
2636+
- "DECODE_NODES=1"
2637+
- "DECODE_MTP_SIZE=1"
2638+
2639+
# 1*DEP8 + 1*DEP8 (conc 64/256; otherwise identical to the previous entry)
2640+
- spec-decoding: "mtp"
2641+
conc-list: [ 64, 256 ]
2642+
prefill:
2643+
num-worker: 1
2644+
tp: 8
2645+
ep: 8
2646+
dp-attn: true
2647+
additional-settings:
2648+
- "PREFILL_NODES=1"
2649+
decode:
2650+
num-worker: 1
2651+
tp: 8
2652+
ep: 8
2653+
dp-attn: true
2654+
additional-settings:
2655+
- "DECODE_NODES=1"
2656+
- "DECODE_MTP_SIZE=1"
2657+
2658+
# 2*DEP8 + 1*DEP8
2659+
- spec-decoding: "mtp"
2660+
conc-list: [ 1024, 2048, 4096 ]
2661+
prefill:
2662+
num-worker: 2
2663+
tp: 8
2664+
ep: 8
2665+
dp-attn: true
2666+
additional-settings:
2667+
- "PREFILL_NODES=2"
2668+
decode:
2669+
num-worker: 1
2670+
tp: 8
2671+
ep: 8
2672+
dp-attn: true
2673+
additional-settings:
2674+
- "DECODE_NODES=1"
2675+
- "DECODE_MTP_SIZE=1"
2676+
2677+
2678+
# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
2679+
# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the
2680+
# image tag, so bumping sglang is just an image tag bump here. Sweeps
2681+
# DP-attention on/off and EP=8.
2682+
2683+
# Agentic-coding sibling of dsv4-fp4-mi355x-sglang; divergences are listed below.
2684+
# the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so
2685+
# its fixed-seq-len sweep is unaffected.
2686+
# - scenarios: replaced fixed-seq-len with agentic-coding.
2687+
# Image is identical to the base entry (rocm/sgl-dev DSv4 build).
2688+
# CONC ranges mirror dsv4-fp4-b200-vllm-agentic for cross-hardware
2689+
# comparability. Offload sweep is none-only (SGLang has no equivalent of
2690+
# vLLM's SimpleCPUOffloadConnector path that we exercise on b200).
2691+
dsv4-fp4-mi355x-sglang-agentic:
2692+
image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4
2693+
model: deepseek-ai/DeepSeek-V4-Pro
2694+
model-prefix: dsv4
2695+
runner: mi355x
2696+
precision: fp4
2697+
framework: sglang
2698+
multinode: false
2699+
scenarios:
2700+
agentic-coding:
2701+
- duration: 1800
2702+
search-space:
2703+
- { tp: 8, offloading: none, conc-list: [16, 32, 64] }
2704+
- { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] }
2705+
2706+
# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
2707+
# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
2708+
# on 2026-05-05, so any nightly built after that includes the
2709+
# DeepseekV4ForCausalLM model class.
2710+
#
2711+
# IMPORTANT: pin to a digest-suffixed nightly tag rather than the
2712+
# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs
2713+
# files keyed on the image string and short-circuits re-import if the
2714+
# file already exists, so the floating tag silently keeps a stale build
2715+
# even after Docker Hub updates `:nightly`.
2716+
#
2717+
# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
2718+
# rest); InferenceX classifies this as fp4 — same as the sister sglang
2719+
# and atom DSv4 mi355x entries below. Image and serving flags follow the
2720+
# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp
2721+
# executor, triton_unfused MoE (required for the FP4 expert format),
2722+
# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
2723+
# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
2724+
# probe to validate the ROCm DP+EP path.

0 commit comments

Comments
 (0)