Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
47 changes: 38 additions & 9 deletions .github/benchmark/sglang_benchmark_models.json
Original file line number | Diff line number | Diff line change
Expand Up @@ -7,6 +7,7 @@
"extra_args": "--trust-remote-code --tensor-parallel-size 8",
"bench_args": "",
"runner": "atom-mi355-8gpu-aac-runner",
"nightly_group": "A",
"env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1"
},
{
Expand All @@ -18,6 +19,7 @@
"extra_args": "--trust-remote-code --tensor-parallel-size 4",
"bench_args": "",
"runner": "atom-mi355-8gpu-aac-runner",
"nightly_group": "A",
"env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1"
},
{
Expand All @@ -28,6 +30,7 @@
"extra_args": "--trust-remote-code --tensor-parallel-size 8",
"bench_args": "",
"runner": "atom-mi355-8gpu-aac-runner",
"nightly_group": "A",
"env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1"
},
{
Expand All @@ -39,17 +42,43 @@
"extra_args": "--trust-remote-code --tensor-parallel-size 4",
"bench_args": "",
"runner": "atom-mi355-8gpu-aac-runner",
"nightly_group": "A",
"env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1"
},
{
"display": "DeepSeek-R1-0528-MXFP4 FP4 TP8 EP8",
"dashboard_model": "DeepSeek-R1-0528-MXFP4-tp8-ep8",
"source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"prefix": "deepseek-r1-fp4-tp8-ep8",
"extra_args": "--trust-remote-code --tensor-parallel-size 8 --expert-parallel-size 8",
"bench_args": "",
"runner": "atom-mi355-8gpu-aac-runner",
"env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1"
"display": "DeepSeek-R1-0528-MXFP4 FP4 TP8 EP8",
"dashboard_model": "DeepSeek-R1-0528-MXFP4-tp8-ep8",
"source_path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"path": "amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4",
"prefix": "deepseek-r1-fp4-tp8-ep8",
"extra_args": "--trust-remote-code --tensor-parallel-size 8 --expert-parallel-size 8",
"bench_args": "",
"runner": "atom-mi355-8gpu-aac-runner",
"nightly_group": "A",
"env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nSGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1"
},
{
"display": "Qwen3.5-397B-A17B-FP8 TP4",
"dashboard_model": "Qwen3.5-397B-A17B-FP8-tp4",
"source_path": "Qwen/Qwen3.5-397B-A17B-FP8",
"path": "Qwen/Qwen3.5-397B-A17B-FP8",
"prefix": "qwen3-5-397b-a17b-fp8-tp4",
"extra_args": "--tensor-parallel-size 4 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache",
"bench_args": "",
"runner": "atom-mi355-8gpu-aac-runner",
"nightly_group": "B",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0"
},
{
"display": "Qwen3.5-397B-A17B-FP8 TP8",
"dashboard_model": "Qwen3.5-397B-A17B-FP8",
"source_path": "Qwen/Qwen3.5-397B-A17B-FP8",
"path": "Qwen/Qwen3.5-397B-A17B-FP8",
"prefix": "qwen3-5-397b-a17b-fp8-tp8",
"extra_args": "--tensor-parallel-size 8 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache",
"bench_args": "",
"runner": "atom-mi355-8gpu-aac-runner",
"nightly_group": "B",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0"
}
]
48 changes: 48 additions & 0 deletions .github/benchmark/sglang_models_accuracy.json
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,6 +11,54 @@
"accuracy_baseline_model": "deepseek-ai/DeepSeek-R1-0528",
"_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k."
},
{
"model_name": "Qwen3.5-35B-A3B-FP8 TP2",
"model_path": "Qwen/Qwen3.5-35B-A3B-FP8",
"extraArgs": "--tensor-parallel-size 2 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0",
"runner": "linux-atom-mi35x-4",
"test_level": "nightly",
"accuracy_threshold": 0.76,
"accuracy_baseline": null,
"accuracy_baseline_model": "Qwen/Qwen3.5-35B-A3B-FP8",
"_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k."
},
{
"model_name": "Qwen3.5-35B-A3B TP2",
"model_path": "Qwen/Qwen3.5-35B-A3B",
"extraArgs": "--tensor-parallel-size 2 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0",
"runner": "linux-atom-mi35x-4",
"test_level": "nightly",
"accuracy_threshold": 0.83,
"accuracy_baseline": null,
"accuracy_baseline_model": "Qwen/Qwen3.5-35B-A3B",
"_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k."
},
{
"model_name": "Qwen3.5-397B-A17B-FP8 TP4",
"model_path": "Qwen/Qwen3.5-397B-A17B-FP8",
"extraArgs": "--tensor-parallel-size 4 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0",
"runner": "linux-atom-mi35x-4",
"test_level": "nightly",
"accuracy_threshold": 0.83,
"accuracy_baseline": null,
"accuracy_baseline_model": "Qwen/Qwen3.5-397B-A17B-FP8",
"_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k."
},
{
"model_name": "Qwen3.5-397B-A17B-FP8 TP8",
"model_path": "Qwen/Qwen3.5-397B-A17B-FP8",
"extraArgs": "--tensor-parallel-size 8 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache",
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.83,
"accuracy_baseline": null,
"accuracy_baseline_model": "Qwen/Qwen3.5-397B-A17B-FP8",
"_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for gsm8k."
},
{
"model_name": "DeepSeek-R1-FP8 TP8",
"model_path": "deepseek-ai/DeepSeek-R1-0528",
Expand Down
25 changes: 15 additions & 10 deletions .github/scripts/atom_sglang_test.sh
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,6 +12,7 @@ set -euo pipefail
# Optional environment variables:
# SGLANG_EXTRA_ARGS
# SGLANG_ENV_VARS
# SGLANG_DEFAULT_SERVER_ARGS
# SGLANG_PORT
# SGLANG_HOST
# MAX_WAIT_RETRIES
Expand Down Expand Up @@ -146,11 +147,6 @@ launch_server() {
local resolved_model_path
resolved_model_path=$(resolve_model_path "${MODEL_PATH}")

local -a extra_arg_array=()
if [[ -n "${MODEL_EXTRA_ARGS}" ]]; then
read -r -a extra_arg_array <<< "${MODEL_EXTRA_ARGS}"
fi

prepare_runtime_paths

export AITER_QUICK_REDUCE_QUANTIZATION="${AITER_QUICK_REDUCE_QUANTIZATION:-INT4}"
Expand All @@ -168,6 +164,19 @@ launch_server() {
done <<< "$(printf '%b' "${MODEL_ENV_VARS}")"
fi

local default_server_args
default_server_args=${SGLANG_DEFAULT_SERVER_ARGS---trust-remote-code --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.8 --page-size 1 --disable-radix-cache}

local -a default_arg_array=()
if [[ -n "${default_server_args}" ]]; then
read -r -a default_arg_array <<< "${default_server_args}"
fi

local -a extra_arg_array=()
if [[ -n "${MODEL_EXTRA_ARGS}" ]]; then
read -r -a extra_arg_array <<< "${MODEL_EXTRA_ARGS}"
fi

rm -rf /root/.cache

rm -f "${SGLANG_PID_FILE}" "${SGLANG_LOG_FILE}" || true
Expand All @@ -182,11 +191,7 @@ launch_server() {
--model-path "${resolved_model_path}" \
--host "${SGLANG_HOST}" \
--port "${SGLANG_PORT}" \
--trust-remote-code \
--kv-cache-dtype fp8_e4m3 \
--mem-fraction-static 0.8 \
--page-size 1 \
--disable-radix-cache \
"${default_arg_array[@]}" \
"${extra_arg_array[@]}" \
> "${SGLANG_LOG_FILE}" 2>&1 &

Expand Down
60 changes: 60 additions & 0 deletions .github/workflows/atom-sglang-accuracy-validation.yaml
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,6 +14,26 @@ on:
required: false
type: boolean
default: false
run_qwen35_35b_a3b_fp8_tp2:
description: "Qwen3.5-35B-A3B-FP8 TP2"
required: false
type: boolean
default: false
run_qwen35_35b_a3b_tp2:
description: "Qwen3.5-35B-A3B TP2"
required: false
type: boolean
default: false
run_qwen35_397b_a17b_fp8_tp4:
description: "Qwen3.5-397B-A17B-FP8 TP4"
required: false
type: boolean
default: false
run_qwen35_397b_a17b_fp8_tp8:
description: "Qwen3.5-397B-A17B-FP8 TP8"
required: false
type: boolean
default: false
run_dsr1_fp8_tp8:
description: "DeepSeek-R1-FP8 TP8"
required: false
Expand Down Expand Up @@ -70,6 +90,10 @@ jobs:
id: meta
env:
RUN_DSR1_FP8_TP4: ${{ inputs.run_dsr1_fp8_tp4 }}
RUN_QWEN35_35B_A3B_FP8_TP2: ${{ inputs.run_qwen35_35b_a3b_fp8_tp2 }}
RUN_QWEN35_35B_A3B_TP2: ${{ inputs.run_qwen35_35b_a3b_tp2 }}
RUN_QWEN35_397B_A17B_FP8_TP4: ${{ inputs.run_qwen35_397b_a17b_fp8_tp4 }}
RUN_QWEN35_397B_A17B_FP8_TP8: ${{ inputs.run_qwen35_397b_a17b_fp8_tp8 }}
RUN_DSR1_FP8_TP8: ${{ inputs.run_dsr1_fp8_tp8 }}
RUN_DSR1_FP4_TP4: ${{ inputs.run_dsr1_fp4_tp4 }}
RUN_DSR1_FP4_TP8: ${{ inputs.run_dsr1_fp4_tp8 }}
Expand All @@ -94,6 +118,42 @@ jobs:
"env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1",
"runner": "linux-atom-mi35x-4",
},
{
"toggle_env": "RUN_QWEN35_35B_A3B_FP8_TP2",
"model_name": "Qwen3.5-35B-A3B-FP8 TP2",
"model_path": "Qwen/Qwen3.5-35B-A3B-FP8",
"extra_args": "--tensor-parallel-size 2 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache",
"accuracy_test_threshold": 0.76,
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0",
"runner": "linux-atom-mi35x-4",
},
{
"toggle_env": "RUN_QWEN35_35B_A3B_TP2",
"model_name": "Qwen3.5-35B-A3B TP2",
"model_path": "Qwen/Qwen3.5-35B-A3B",
"extra_args": "--tensor-parallel-size 2 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache",
"accuracy_test_threshold": 0.83,
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0",
"runner": "linux-atom-mi35x-4",
},
{
"toggle_env": "RUN_QWEN35_397B_A17B_FP8_TP4",
"model_name": "Qwen3.5-397B-A17B-FP8 TP4",
"model_path": "Qwen/Qwen3.5-397B-A17B-FP8",
"extra_args": "--tensor-parallel-size 4 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache",
"accuracy_test_threshold": 0.83,
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0",
"runner": "linux-atom-mi35x-4",
},
{
"toggle_env": "RUN_QWEN35_397B_A17B_FP8_TP8",
"model_name": "Qwen3.5-397B-A17B-FP8 TP8",
"model_path": "Qwen/Qwen3.5-397B-A17B-FP8",
"extra_args": "--tensor-parallel-size 8 --mem-fraction-static 0.9 --reasoning-parser qwen3 --disable-radix-cache",
"accuracy_test_threshold": 0.83,
"env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0",
"runner": "linux-atom-mi35x-8",
},
{
"toggle_env": "RUN_DSR1_FP8_TP8",
"model_name": "DeepSeek-R1-FP8 TP8",
Expand Down
Loading