@@ -2407,3 +2407,318 @@ glm5-fp8-mi325x-sglang-mtp:
24072407 osl : 1024
24082408 search-space :
24092409 - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
2410+
2411+ # ============================================================================
2412+ # Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries).
2413+ # Recipes that ALREADY existed on main were intentionally left at main's version
2414+ # to preserve main behavior; PR-branch modifications to those recipes are NOT
2415+ # brought in here.
2416+ # ============================================================================
2417+
# Qwen3.5 FP8 on MI355X via SGLang, single-node, agentic-coding scenario only.
# The search-space compares two offloading modes at TP8/EP1: `none` over
# conc 1-32 and `hicache` over conc 16-64 (the two ranges overlap at 16 and 32).
2418+ qwen3.5-fp8-mi355x-sglang-agentic-hicache :
2419+ image : lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
2420+ model : Qwen/Qwen3.5-397B-A17B-FP8
2421+ model-prefix : qwen3.5
2422+ runner : mi355x
2423+ precision : fp8
2424+ framework : sglang
2425+ multinode : false
2426+ scenarios :
2427+ agentic-coding :
2428+ - duration : 1800
2429+ search-space :
2430+ - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
2431+ - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
2432+
# DeepSeek-V4-Pro FP4 on MI355X via vLLM, single-node, agentic-coding scenario
# only. All three search-space rows run with offloading set to `none`:
#   - TP8 at low concurrency (1-4),
#   - TP4 up to conc 16,
#   - TP4/EP4 with DP-attention for conc 16-48.
2433+ dsv4-fp4-mi355x-vllm-agentic :
2434+ image : vllm/vllm-openai-rocm:v0.21.0
2435+ model : deepseek-ai/DeepSeek-V4-Pro
2436+ model-prefix : dsv4
2437+ runner : mi355x
2438+ precision : fp4
2439+ framework : vllm
2440+ multinode : false
2441+ scenarios :
2442+ agentic-coding :
2443+ - duration : 1800
2444+ search-space :
2445+ - { tp: 8, offloading: none, conc-list: [1, 2, 4] }
2446+ - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] }
2447+ - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] }
2448+
2449+ dsr1-fp4-mi355x-sglang-disagg-mtp :
2450+ image : lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
2451+ model : amd/DeepSeek-R1-0528-MXFP4-v2
2452+ model-prefix : dsr1
2453+ runner : mi355x-disagg
2454+ precision : fp4
2455+ framework : sglang-disagg
2456+ multinode : true
2457+ disagg : true
2458+ scenarios :
2459+ fixed-seq-len :
2460+ - isl : 1024
2461+ osl : 1024
2462+ search-space :
2463+ # MTP configurations
2464+ # 1P1D TP8
2465+ - spec-decoding : " mtp"
2466+ conc-list : [ 1, 2, 4, 8 ]
2467+ prefill :
2468+ num-worker : 1
2469+ tp : 8
2470+ ep : 1
2471+ dp-attn : false
2472+ additional-settings :
2473+ - " PREFILL_NODES=1"
2474+ decode :
2475+ num-worker : 1
2476+ tp : 8
2477+ ep : 1
2478+ dp-attn : false
2479+ additional-settings :
2480+ - " DECODE_NODES=1"
2481+ - " DECODE_MTP_SIZE=3"
2482+
2483+ # 1P2D TP8
2484+ - spec-decoding : " mtp"
2485+ conc-list : [ 2, 4, 8, 16, 32 ]
2486+ prefill :
2487+ num-worker : 1
2488+ tp : 8
2489+ ep : 1
2490+ dp-attn : false
2491+ additional-settings :
2492+ - " PREFILL_NODES=1"
2493+ decode :
2494+ num-worker : 2
2495+ tp : 8
2496+ ep : 1
2497+ dp-attn : false
2498+ additional-settings :
2499+ - " DECODE_NODES=2"
2500+ - " DECODE_MTP_SIZE=3"
2501+
2502+ # 1P2D TP8 (higher concurrency, DECODE_MTP_SIZE=2)
2503+ - spec-decoding : " mtp"
2504+ conc-list : [ 64, 128, 256 ]
2505+ prefill :
2506+ num-worker : 1
2507+ tp : 8
2508+ ep : 1
2509+ dp-attn : false
2510+ additional-settings :
2511+ - " PREFILL_NODES=1"
2512+ decode :
2513+ num-worker : 2
2514+ tp : 8
2515+ ep : 1
2516+ dp-attn : false
2517+ additional-settings :
2518+ - " DECODE_NODES=2"
2519+ - " DECODE_MTP_SIZE=2"
2520+
2521+ # 1P2D, prefill TP4 / decode TP8
2522+ - spec-decoding : " mtp"
2523+ conc-list : [ 64, 128, 256 ]
2524+ prefill :
2525+ num-worker : 1
2526+ tp : 4
2527+ ep : 1
2528+ dp-attn : false
2529+ additional-settings :
2530+ - " PREFILL_NODES=1"
2531+ decode :
2532+ num-worker : 2
2533+ tp : 8
2534+ ep : 1
2535+ dp-attn : false
2536+ additional-settings :
2537+ - " DECODE_NODES=2"
2538+ - " DECODE_MTP_SIZE=2"
2539+
2540+ # 1*DEP4 + 1*DEP8
2541+ - spec-decoding : " mtp"
2542+ conc-list : [ 1024, 2048, 4096 ]
2543+ prefill :
2544+ num-worker : 1
2545+ tp : 4
2546+ ep : 4
2547+ dp-attn : true
2548+ additional-settings :
2549+ - " PREFILL_NODES=1"
2550+ decode :
2551+ num-worker : 1
2552+ tp : 8
2553+ ep : 8
2554+ dp-attn : true
2555+ additional-settings :
2556+ - " DECODE_NODES=1"
2557+ - " DECODE_MTP_SIZE=1"
2558+
2559+ - isl : 8192
2560+ osl : 1024
2561+ search-space :
2562+ # MTP configurations
2563+ # 1P1D pure TP8
2564+ - spec-decoding : " mtp"
2565+ conc-list : [ 1, 2, 4, 8 ]
2566+ prefill :
2567+ num-worker : 1
2568+ tp : 8
2569+ ep : 1
2570+ dp-attn : false
2571+ additional-settings :
2572+ - " PREFILL_NODES=1"
2573+ decode :
2574+ num-worker : 1
2575+ tp : 8
2576+ ep : 1
2577+ dp-attn : false
2578+ additional-settings :
2579+ - " DECODE_NODES=1"
2580+ - " DECODE_MTP_SIZE=3"
2581+
2582+ # 1P2D TP8
2583+ - spec-decoding : " mtp"
2584+ conc-list : [ 2, 4, 8, 16, 32 ]
2585+ prefill :
2586+ num-worker : 1
2587+ tp : 8
2588+ ep : 1
2589+ dp-attn : false
2590+ additional-settings :
2591+ - " PREFILL_NODES=1"
2592+ decode :
2593+ num-worker : 2
2594+ tp : 8
2595+ ep : 1
2596+ dp-attn : false
2597+ additional-settings :
2598+ - " DECODE_NODES=2"
2599+ - " DECODE_MTP_SIZE=3"
2600+
2601+ # 1P2D TP8 (higher concurrency, DECODE_MTP_SIZE=2)
2602+ - spec-decoding : " mtp"
2603+ conc-list : [ 64, 128, 256 ]
2604+ prefill :
2605+ num-worker : 1
2606+ tp : 8
2607+ ep : 1
2608+ dp-attn : false
2609+ additional-settings :
2610+ - " PREFILL_NODES=1"
2611+ decode :
2612+ num-worker : 2
2613+ tp : 8
2614+ ep : 1
2615+ dp-attn : false
2616+ additional-settings :
2617+ - " DECODE_NODES=2"
2618+ - " DECODE_MTP_SIZE=2"
2619+
2620+ # 1*DEP8 + 1*DEP8
2621+ - spec-decoding : " mtp"
2622+ conc-list : [ 128, 512 ]
2623+ prefill :
2624+ num-worker : 1
2625+ tp : 8
2626+ ep : 8
2627+ dp-attn : true
2628+ additional-settings :
2629+ - " PREFILL_NODES=1"
2630+ decode :
2631+ num-worker : 1
2632+ tp : 8
2633+ ep : 8
2634+ dp-attn : true
2635+ additional-settings :
2636+ - " DECODE_NODES=1"
2637+ - " DECODE_MTP_SIZE=1"
2638+
2639+ # 1*DEP8 + 1*DEP8 (same topology as the previous entry; only the conc-list differs)
2640+ - spec-decoding : " mtp"
2641+ conc-list : [ 64, 256 ]
2642+ prefill :
2643+ num-worker : 1
2644+ tp : 8
2645+ ep : 8
2646+ dp-attn : true
2647+ additional-settings :
2648+ - " PREFILL_NODES=1"
2649+ decode :
2650+ num-worker : 1
2651+ tp : 8
2652+ ep : 8
2653+ dp-attn : true
2654+ additional-settings :
2655+ - " DECODE_NODES=1"
2656+ - " DECODE_MTP_SIZE=1"
2657+
2658+ # 2*DEP8 + 1*DEP8
2659+ - spec-decoding : " mtp"
2660+ conc-list : [ 1024, 2048, 4096 ]
2661+ prefill :
2662+ num-worker : 2
2663+ tp : 8
2664+ ep : 8
2665+ dp-attn : true
2666+ additional-settings :
2667+ - " PREFILL_NODES=2"
2668+ decode :
2669+ num-worker : 1
2670+ tp : 8
2671+ ep : 8
2672+ dp-attn : true
2673+ additional-settings :
2674+ - " DECODE_NODES=1"
2675+ - " DECODE_MTP_SIZE=1"
2676+
2677+
2678+ # DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
2679+ # amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the
2680+ # image tag, so bumping sglang is just an image tag bump here. Sweeps
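2681+ # DP-attention on/off and EP=8.
# NOTE(review): the dsv4-fp4-mi355x-sglang-agentic search-space below sets no
# explicit `ep` key on either row — confirm whether EP=8 still applies to it.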
2682+
2683+ # Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below;
2684+ # the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so
2685+ # its fixed-seq-len sweep is unaffected.
2686+ # - scenarios: replaced fixed-seq-len with agentic-coding.
2687+ # Image is identical to the base entry (rocm/sgl-dev DSv4 build).
2688+ # CONC ranges mirror dsv4-fp4-b200-vllm-agentic for cross-hardware
2689+ # comparability. Offload sweep is none-only (SGLang has no equivalent of
2690+ # vLLM's SimpleCPUOffloadConnector path that we exercise on b200).
2691+ dsv4-fp4-mi355x-sglang-agentic :
2692+ image : rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4
2693+ model : deepseek-ai/DeepSeek-V4-Pro
2694+ model-prefix : dsv4
2695+ runner : mi355x
2696+ precision : fp4
2697+ framework : sglang
2698+ multinode : false
2699+ scenarios :
2700+ agentic-coding :
2701+ - duration : 1800
2702+ search-space :
# TP8 without DP-attention, offloading none: conc 16-64.
2703+ - { tp: 8, offloading: none, conc-list: [16, 32, 64] }
# TP8 with DP-attention, offloading none: conc 64-256 (overlaps the row above at 64).
2704+ - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] }
2705+
2706+ # DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
2707+ # nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
2708+ # on 2026-05-05, so any nightly built after that includes the
2709+ # DeepseekV4ForCausalLM model class.
2710+ #
2711+ # IMPORTANT: pin to a digest-suffixed nightly tag rather than the
2712+ # floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs
2713+ # files keyed on the image string and short-circuits re-import if the
2714+ # file already exists, so the floating tag silently keeps a stale build
2715+ # even after Docker Hub updates `:nightly`.
2716+ #
2717+ # DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
2718+ # rest); InferenceX classifies this as fp4 — same as the sister sglang
2719+ # and atom DSv4 mi355x entries below. Image and serving flags follow the
2720+ # validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp
2721+ # executor, triton_unfused MoE (required for the FP4 expert format),
2722+ # async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
2723+ # gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
2724+ # probe to validate the ROCm DP+EP path.
0 commit comments