From 27510c41e1a59c1de10972da665c1d861a6b0195 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 18 Jun 2026 04:51:00 +0800 Subject: [PATCH 1/2] perf(mi300x): use load-time block FP8 MoE conversion Co-authored-by: OpenAI Codex Signed-off-by: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> --- .github/configs/amd-master.yaml | 9 +- .../fixed_seq_len/minimaxm3_fp8_mi300x.sh | 50 ++- .../minimaxm3_mi300x_mxfp8.patch | 406 ++++++++++++++++++ perf-changelog.yaml | 10 +- utils/process_changelog.py | 29 ++ utils/test_process_changelog.py | 62 +++ 6 files changed, 557 insertions(+), 9 deletions(-) create mode 100644 benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch create mode 100644 utils/test_process_changelog.py diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 606e3c2af..d0a20bef8 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2851,10 +2851,11 @@ minimaxm3-fp8-mi355x-vllm-mtp: - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } -# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and -# MI355X serving shape, but retain the default BF16 KV cache because this -# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100 -# search space: TP8 for latency and TP8+EP8 (TEP) at high concurrency. +# MiniMax-M3 MXFP8 MI300X recipe. Convert the checkpoint's MXFP8 MoE weights to +# 128x128 block FP8 at load time and use the regular Triton block-FP8 backend. +# Retain the default BF16 KV cache because this checkpoint lacks calibrated +# ROCm FP8 attention scales. Use TP8 for latency and TP8+EP8 at high +# concurrency. minimaxm3-fp8-mi300x-vllm: image: vllm/vllm-openai-rocm:minimax-m3 model: MiniMaxAI/MiniMax-M3-MXFP8 diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh index f2cdaf284..d2b01a291 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh @@ -1,10 +1,12 @@ #!/usr/bin/env bash # MiniMax-M3 MXFP8 MI300X (gfx942) single-node vLLM recipe. -# Reuses the dedicated ROCm image and the MI355X serving shape. Block size 128 -# is mandatory for MSA sparse attention. Keep the default BF16 KV cache on -# gfx942: the checkpoint has no calibrated q/prob scales for ROCm FP8 -# attention, and vLLM's fallback scale of 1.0 corrupts model accuracy. +# Reuses the dedicated ROCm image and converts MXFP8 MoE weights to 128x128 +# block FP8 at load time. Block size 128 is mandatory for MSA sparse attention. +# Keep the default BF16 KV cache on gfx942: the checkpoint has no calibrated +# q/prob scales for ROCm FP8 attention, and vLLM's fallback scale of 1.0 +# corrupts model accuracy. +# Target image vLLM revision: 4a560dd8db67c270f5e2afb614558271b76f2294. source "$(dirname "$0")/../../benchmark_lib.sh" @@ -24,6 +26,46 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi +if ! VLLM_PACKAGE_ROOT="$( + python3 - <<'PY' +from pathlib import Path + +import vllm + +print(Path(vllm.__file__).resolve().parent.parent) +PY +)"; then + echo "Failed to locate the installed vLLM package" >&2 + exit 1 +fi +if [[ -z "$VLLM_PACKAGE_ROOT" || ! -d "$VLLM_PACKAGE_ROOT/vllm" ]]; then + echo "Invalid installed vLLM package root: $VLLM_PACKAGE_ROOT" >&2 + exit 1 +fi + +MXFP8_PATCH="$(dirname "$0")/minimaxm3_mi300x_mxfp8.patch" +if [[ ! -f "$MXFP8_PATCH" ]]; then + echo "MI300X MXFP8 patch is missing: $MXFP8_PATCH" >&2 + exit 1 +fi + +PATCH_CHECK_ARGS=(--batch --silent -d "$VLLM_PACKAGE_ROOT" -p1 --dry-run) +if patch "${PATCH_CHECK_ARGS[@]}" --reverse --forward < "$MXFP8_PATCH"; then + echo "MI300X MXFP8 patch is already fully applied" +elif patch "${PATCH_CHECK_ARGS[@]}" --forward < "$MXFP8_PATCH"; then + if ! patch --batch --forward -d "$VLLM_PACKAGE_ROOT" -p1 < "$MXFP8_PATCH"; then + echo "Failed to apply the MI300X MXFP8 patch" >&2 + exit 1 + fi +else + echo "Installed vLLM is neither cleanly patchable nor fully patched" >&2 + exit 1 +fi +if ! patch "${PATCH_CHECK_ARGS[@]}" --reverse --forward < "$MXFP8_PATCH"; then + echo "MI300X MXFP8 patch verification failed" >&2 + exit 1 +fi + if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi if [ -n "$ROCR_VISIBLE_DEVICES" ]; then diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch b/benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch new file mode 100644 index 000000000..1e83f431d --- /dev/null +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch @@ -0,0 +1,406 @@ +diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +index c275cecc1591f16e91791e9b007cdb6fcaac40b4..f20c20c4d2a475ca00926c98608edc6b645dd4c1 100644 +--- a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json ++++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +@@ -1,28 +1,28 @@ + { + "1": { +- "BLOCK_SIZE_M": 16, +- "BLOCK_SIZE_N": 128, +- "BLOCK_SIZE_K": 256, +- "GROUP_SIZE_M": 1, +- "num_warps": 8, ++ "BLOCK_SIZE_M": 32, ++ "BLOCK_SIZE_N": 16, ++ "BLOCK_SIZE_K": 128, ++ "GROUP_SIZE_M": 16, ++ "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { +- "BLOCK_SIZE_M": 16, +- "BLOCK_SIZE_N": 128, +- "BLOCK_SIZE_K": 256, ++ "BLOCK_SIZE_M": 32, ++ "BLOCK_SIZE_N": 16, ++ "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, +- "num_warps": 8, ++ "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { +- "BLOCK_SIZE_M": 16, +- "BLOCK_SIZE_N": 128, +- "BLOCK_SIZE_K": 256, +- "GROUP_SIZE_M": 1, +- "num_warps": 8, ++ "BLOCK_SIZE_M": 32, ++ "BLOCK_SIZE_N": 16, ++ "BLOCK_SIZE_K": 128, ++ "GROUP_SIZE_M": 8, ++ "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, +@@ -36,11 +36,11 @@ + "waves_per_eu": 0 + }, + "16": { +- "BLOCK_SIZE_M": 16, +- "BLOCK_SIZE_N": 128, ++ "BLOCK_SIZE_M": 32, ++ "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, +- "num_warps": 8, ++ "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0 + }, +diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +new file mode 100644 +index 0000000000000000000000000000000000000000..b1bfee7fc386385fa8b8f06e561d528ea4762d87 +--- /dev/null ++++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +@@ -0,0 +1,74 @@ ++{ ++ "64": { ++ "BLOCK_SIZE_M": 32, ++ "BLOCK_SIZE_N": 128, ++ "BLOCK_SIZE_K": 128, ++ "GROUP_SIZE_M": 4, ++ "num_warps": 2, ++ "num_stages": 2, ++ "waves_per_eu": 2 ++ }, ++ "128": { ++ "BLOCK_SIZE_M": 32, ++ "BLOCK_SIZE_N": 128, ++ "BLOCK_SIZE_K": 256, ++ "GROUP_SIZE_M": 1, ++ "num_warps": 2, ++ "num_stages": 2, ++ "waves_per_eu": 1 ++ }, ++ "256": { ++ "BLOCK_SIZE_M": 64, ++ "BLOCK_SIZE_N": 128, ++ "BLOCK_SIZE_K": 256, ++ "GROUP_SIZE_M": 1, ++ "num_warps": 4, ++ "num_stages": 2, ++ "waves_per_eu": 2 ++ }, ++ "512": { ++ "BLOCK_SIZE_M": 64, ++ "BLOCK_SIZE_N": 128, ++ "BLOCK_SIZE_K": 128, ++ "GROUP_SIZE_M": 1, ++ "num_warps": 4, ++ "num_stages": 2, ++ "waves_per_eu": 2 ++ }, ++ "1024": { ++ "BLOCK_SIZE_M": 64, ++ "BLOCK_SIZE_N": 128, ++ "BLOCK_SIZE_K": 128, ++ "GROUP_SIZE_M": 1, ++ "num_warps": 4, ++ "num_stages": 2, ++ "waves_per_eu": 2 ++ }, ++ "2048": { ++ "BLOCK_SIZE_M": 64, ++ "BLOCK_SIZE_N": 128, ++ "BLOCK_SIZE_K": 128, ++ "GROUP_SIZE_M": 1, ++ "num_warps": 4, ++ "num_stages": 2, ++ "waves_per_eu": 2 ++ }, ++ "4096": { ++ "BLOCK_SIZE_M": 256, ++ "BLOCK_SIZE_N": 128, ++ "BLOCK_SIZE_K": 128, ++ "GROUP_SIZE_M": 1, ++ "num_warps": 4, ++ "num_stages": 2, ++ "waves_per_eu": 0 ++ }, ++ "8192": { ++ "BLOCK_SIZE_M": 128, ++ "BLOCK_SIZE_N": 256, ++ "BLOCK_SIZE_K": 128, ++ "GROUP_SIZE_M": 1, ++ "num_warps": 4, ++ "num_stages": 2, ++ "waves_per_eu": 0 ++ } ++} +diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py +index d0d7c76481b0a315e9c57810d40394822f62594c..e82429b8ecddc9b8e44f003a537de08b5cb7a045 100644 +--- a/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py ++++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py +@@ -7,8 +7,11 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig + from vllm.model_executor.layers.fused_moe.oracle.fp8 import ( + Fp8MoeBackend, + backend_to_kernel_cls, ++ select_fp8_moe_backend, + ) + from vllm.model_executor.layers.quantization.utils.quant_utils import ( ++ kFp8Dynamic128Sym, ++ kFp8Static128BlockSym, + kMxfp8Dynamic, + kMxfp8Static, + ) +@@ -101,13 +104,31 @@ def _select_rocm_mxfp8_backend() -> tuple[Fp8MoeBackend, type[mk.FusedMoEExperts + + def select_mxfp8_moe_backend( + config: FusedMoEConfig, ++ *, ++ block_fp8_on_fnuz: bool = False, + ) -> tuple[Fp8MoeBackend, type[mk.FusedMoEExperts]]: + """Select the MXFP8 MoE backend and the best expert class. + ++ ``block_fp8_on_fnuz`` must only be enabled by quantization methods that ++ convert their MXFP8 weights to 128x128 block FP8 before kernel setup. ++ + Returns: + A tuple of (fp8_backend, experts_cls). + """ + ++ if block_fp8_on_fnuz and current_platform.is_fp8_fnuz(): ++ logger.info_once( ++ "MXFP8 MoE weights will be converted to 128x128 block FP8 at load time." ++ ) ++ block_fp8_backend, experts_cls = select_fp8_moe_backend( ++ config=config, ++ weight_key=kFp8Static128BlockSym, ++ activation_key=kFp8Dynamic128Sym, ++ ) ++ assert block_fp8_backend is not None ++ assert experts_cls is not None ++ return block_fp8_backend, experts_cls ++ + runner_backend = config.moe_backend + if runner_backend != "auto": + backend = _BACKEND_NAME_MAP.get(runner_backend) +diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py +index 33c7c7532a0ba823e4e7a23538300a5977a4553e..9b9d73f7b5fc138cac3dc3349a24a473d2c1faf6 100644 +--- a/vllm/model_executor/layers/quantization/modelopt.py ++++ b/vllm/model_executor/layers/quantization/modelopt.py +@@ -69,6 +69,7 @@ from vllm.model_executor.layers.quantization.utils.mxfp8_utils import ( + MXFP8_BLOCK_SIZE, + MXFP8_SCALE_DTYPE, + MXFP8_VALUE_DTYPE, ++ convert_mxfp8_to_block_fp8, + ) + from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape, +@@ -92,6 +93,7 @@ from vllm.model_executor.parameter import ( + PerTensorScaleParameter, + ) + from vllm.model_executor.utils import replace_parameter, set_weight_attrs ++from vllm.platforms import current_platform + + if TYPE_CHECKING: + from vllm.model_executor.models.utils import WeightsMapper +@@ -1876,7 +1878,7 @@ class ModelOptMxFp8LinearMethod(LinearMethodBase): + + + class ModelOptMxFp8FusedMoE(FusedMoEMethodBase): +- """FlashInfer TRTLLM MXFP8 block-scale MoE for ModelOpt checkpoints.""" ++ """ModelOpt MXFP8 MoE quantization.""" + + def __init__( + self, +@@ -1884,11 +1886,17 @@ class ModelOptMxFp8FusedMoE(FusedMoEMethodBase): + moe_config: FusedMoEConfig, + ) -> None: + super().__init__(moe_config) +- self.weight_block_size = [1, MXFP8_BLOCK_SIZE] ++ self.requantize_mxfp8_to_block_fp8 = current_platform.is_fp8_fnuz() ++ self.weight_block_size = ( ++ [128, 128] if self.requantize_mxfp8_to_block_fp8 else [1, MXFP8_BLOCK_SIZE] ++ ) + self.quant_config = quant_config + assert self.quant_config.is_checkpoint_mxfp8_serialized + +- self.mxfp8_backend, self.experts_cls = select_mxfp8_moe_backend(config=self.moe) ++ self.mxfp8_backend, self.experts_cls = select_mxfp8_moe_backend( ++ config=self.moe, ++ block_fp8_on_fnuz=self.requantize_mxfp8_to_block_fp8, ++ ) + + def create_weights( + self, +@@ -2129,15 +2137,35 @@ class ModelOptMxFp8FusedMoE(FusedMoEMethodBase): + + self._check_weight_dtypes(layer) + ++ if self.requantize_mxfp8_to_block_fp8: ++ w13, w13_scale = convert_mxfp8_to_block_fp8( ++ layer.w13_weight, ++ layer.w13_weight_scale, ++ block_size=(128, 128), ++ ) ++ w2, w2_scale = convert_mxfp8_to_block_fp8( ++ layer.w2_weight, ++ layer.w2_weight_scale, ++ block_size=(128, 128), ++ ) ++ logger.info_once( ++ "Converted MXFP8 MoE weights to 128x128 block FP8 at load time." ++ ) ++ else: ++ w13 = layer.w13_weight ++ w2 = layer.w2_weight ++ w13_scale = layer.w13_weight_scale ++ w2_scale = layer.w2_weight_scale ++ + layer.weight_block_size = self.weight_block_size + + w13, w2, w13_scale, w2_scale = convert_to_fp8_moe_kernel_format( + fp8_backend=self.mxfp8_backend, + layer=layer, +- w13=layer.w13_weight, +- w2=layer.w2_weight, +- w13_scale=layer.w13_weight_scale, +- w2_scale=layer.w2_weight_scale, ++ w13=w13, ++ w2=w2, ++ w13_scale=w13_scale, ++ w2_scale=w2_scale, + w13_input_scale=None, + w2_input_scale=None, + ) +diff --git a/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +index e6063b463284f912f6cd923d9cf65ff9515d56f2..27d7be0a1740630269c305cfe7377eebdbd54285 100644 +--- a/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py ++++ b/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +@@ -227,6 +227,122 @@ def dequant_mxfp8_to_bf16(x: torch.Tensor, scales: torch.Tensor) -> torch.Tensor + return dequantized.to(torch.bfloat16) + + ++def _quantize_block_fp8_ocp( ++ weight: torch.Tensor, ++ block_size: tuple[int, int], ++) -> tuple[torch.Tensor, torch.Tensor]: ++ """Quantize a matrix to OCP E4M3 with per-block FP32 scales.""" ++ block_m, block_k = block_size ++ n, k = weight.shape ++ padded_n = ((n + block_m - 1) // block_m) * block_m ++ padded_k = ((k + block_k - 1) // block_k) * block_k ++ padded = torch.zeros( ++ (padded_n, padded_k), ++ dtype=torch.float32, ++ device=weight.device, ++ ) ++ padded[:n, :k] = weight.float() ++ ++ blocks = padded.view( ++ padded_n // block_m, ++ block_m, ++ padded_k // block_k, ++ block_k, ++ ) ++ amax = blocks.abs().amax(dim=(1, 3), keepdim=True).clamp(min=1e-4) ++ scales = amax / torch.finfo(torch.float8_e4m3fn).max ++ quantized = (blocks / scales).to(torch.float8_e4m3fn) ++ return ( ++ quantized.view(padded_n, padded_k)[:n, :k].contiguous(), ++ scales.view(padded_n // block_m, padded_k // block_k).contiguous(), ++ ) ++ ++ ++def convert_mxfp8_to_block_fp8( ++ weight: torch.Tensor, ++ scales: torch.Tensor, ++ block_size: tuple[int, int] = (128, 128), ++) -> tuple[torch.Tensor, torch.Tensor]: ++ """Convert MXFP8 matrices to block-scaled FP8. ++ ++ Each matrix is dequantized and requantized independently to bound the ++ temporary BF16 memory used while loading MoE weights. ++ """ ++ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( ++ normalize_e4m3fn_to_e4m3fnuz, ++ ) ++ from vllm.platforms import current_platform ++ ++ if weight.ndim < 2: ++ raise ValueError( ++ f"MXFP8 weight must have at least 2 dimensions, got {weight.ndim}." ++ ) ++ if weight.dtype != MXFP8_VALUE_DTYPE: ++ raise ValueError( ++ f"MXFP8 weight must use {MXFP8_VALUE_DTYPE}, got {weight.dtype}." ++ ) ++ if scales.dtype != MXFP8_SCALE_DTYPE: ++ raise ValueError( ++ f"MXFP8 scales must use {MXFP8_SCALE_DTYPE}, got {scales.dtype}." ++ ) ++ if weight.shape[-1] % MXFP8_BLOCK_SIZE != 0: ++ raise ValueError( ++ f"MXFP8 weight K dimension must be divisible by {MXFP8_BLOCK_SIZE}, " ++ f"got {weight.shape[-1]}." ++ ) ++ ++ expected_scale_shape = (*weight.shape[:-1], weight.shape[-1] // MXFP8_BLOCK_SIZE) ++ if scales.shape != expected_scale_shape: ++ raise ValueError( ++ f"Expected MXFP8 scale shape {expected_scale_shape}, " ++ f"got {tuple(scales.shape)}." ++ ) ++ ++ block_m, block_k = block_size ++ if block_m <= 0 or block_k <= 0: ++ raise ValueError(f"Block dimensions must be positive, got {block_size}.") ++ ++ n, k = weight.shape[-2:] ++ scale_shape = ( ++ *weight.shape[:-2], ++ (n + block_m - 1) // block_m, ++ (k + block_k - 1) // block_k, ++ ) ++ quantized = torch.empty_like(weight, dtype=current_platform.fp8_dtype()) ++ block_scales = torch.empty( ++ scale_shape, ++ dtype=torch.float32, ++ device=weight.device, ++ ) ++ ++ weight_view = weight.reshape(-1, n, k) ++ scales_view = scales.reshape(-1, n, k // MXFP8_BLOCK_SIZE) ++ quantized_view = quantized.reshape(-1, n, k) ++ block_scales_view = block_scales.reshape( ++ -1, ++ scale_shape[-2], ++ scale_shape[-1], ++ ) ++ for matrix_idx in range(weight_view.shape[0]): ++ dequantized = dequant_mxfp8_to_bf16( ++ weight_view[matrix_idx], ++ scales_view[matrix_idx], ++ ) ++ matrix, matrix_scales = _quantize_block_fp8_ocp( ++ dequantized, ++ block_size=(block_m, block_k), ++ ) ++ if current_platform.is_fp8_fnuz(): ++ matrix, matrix_scales, _ = normalize_e4m3fn_to_e4m3fnuz( ++ matrix, ++ matrix_scales, ++ ) ++ quantized_view[matrix_idx].copy_(matrix) ++ block_scales_view[matrix_idx].copy_(matrix_scales) ++ ++ return quantized.contiguous(), block_scales.contiguous() ++ ++ + def mxfp8_e4m3_quantize_fake( + x: torch.Tensor, + is_sf_swizzled_layout: bool = False, diff --git a/perf-changelog.yaml b/perf-changelog.yaml index dec0d280c..0fb914c37 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3926,4 +3926,12 @@ - "Container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-dev.1-cuda13" - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026 (gb300_nvfp4 STP recipes)" - "Runner script launch_gb300-nv.sh: added dynamo-trt-specific glm5-fp4 case with SERVED_MODEL_NAME and SRT_SLURM_MODEL_PREFIX=nvidia/GLM-5-NVFP4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1798 \ No newline at end of file + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1798 + +- config-keys: + - minimaxm3-fp8-mi300x-vllm + description: + - "Convert checkpoint MXFP8 MoE weights once at load time to 128x128 block FP8 on gfx942." + - "Normalize OCP E4M3 values to FNUZ and use the regular Triton block-FP8 backend." + - "Use measured low-token TP tiles and a tuned local-expert table without changing the TP8/TP8+EP8 matrix." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1753 diff --git a/utils/process_changelog.py b/utils/process_changelog.py index 1514f8d36..9de8c8c6a 100644 --- a/utils/process_changelog.py +++ b/utils/process_changelog.py @@ -3,6 +3,7 @@ import re import subprocess from collections import defaultdict +from pathlib import Path import yaml from constants import GENERATE_SWEEPS_PY_SCRIPT, MASTER_CONFIGS @@ -15,6 +16,34 @@ def get_added_lines(base_ref: str, head_ref: str, filepath: str) -> str: + repo_root = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], + capture_output=True, + text=True, + ) + git_filepath = filepath + if repo_root.returncode == 0: + resolved_path = Path(filepath).resolve() + resolved_root = Path(repo_root.stdout.strip()).resolve() + if resolved_path == resolved_root or resolved_root in resolved_path.parents: + git_filepath = resolved_path.relative_to(resolved_root).as_posix() + + base_file = subprocess.run( + ["git", "show", f"{base_ref}:{git_filepath}"], + capture_output=True, + ) + head_file = subprocess.run( + ["git", "show", f"{head_ref}:{git_filepath}"], + capture_output=True, + ) + + if ( + base_file.returncode == 0 + and head_file.returncode == 0 + and head_file.stdout.startswith(base_file.stdout) + ): + return head_file.stdout[len(base_file.stdout) :].decode() + result = subprocess.run( ["git", "diff", base_ref, head_ref, "--", filepath], capture_output=True, diff --git a/utils/test_process_changelog.py b/utils/test_process_changelog.py new file mode 100644 index 000000000..7af5d2d77 --- /dev/null +++ b/utils/test_process_changelog.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +import subprocess +from pathlib import Path + +import pytest + +import process_changelog + + +def _git(repo: Path, *args: str) -> str: + result = subprocess.run( + ["git", *args], + cwd=repo, + check=True, + capture_output=True, + text=True, + ) + return result.stdout.strip() + + +def _commit(repo: Path, content: bytes, message: str) -> str: + (repo / "perf-changelog.yaml").write_bytes(content) + _git(repo, "add", "perf-changelog.yaml") + _git(repo, "commit", "-m", message) + return _git(repo, "rev-parse", "HEAD") + + +@pytest.fixture +def changelog_repo(tmp_path: Path) -> Path: + _git(tmp_path, "init") + _git(tmp_path, "config", "user.name", "Test User") + _git(tmp_path, "config", "user.email", "test@example.com") + return tmp_path + + +def test_get_added_lines_accepts_append_after_missing_newline( + changelog_repo: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + base_content = b"- config-keys:\n - old\n pr-link: old" + appended = b"\n\n- config-keys:\n - new\n pr-link: new\n" + base_ref = _commit(changelog_repo, base_content, "base") + head_ref = _commit(changelog_repo, base_content + appended, "append") + monkeypatch.chdir(changelog_repo) + + assert ( + process_changelog.get_added_lines( + base_ref, head_ref, str(changelog_repo / "perf-changelog.yaml") + ).encode() + == appended + ) + + +def test_get_added_lines_rejects_non_whitespace_deletion( + changelog_repo: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + base_ref = _commit(changelog_repo, b"- old\n", "base") + head_ref = _commit(changelog_repo, b"- new\n", "replace") + monkeypatch.chdir(changelog_repo) + + with pytest.raises(ValueError, match="Deletions are not allowed"): + process_changelog.get_added_lines(base_ref, head_ref, "perf-changelog.yaml") From 75213943ac60b044284d0fa81ec720bf0c8c017d Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 18 Jun 2026 07:00:49 +0800 Subject: [PATCH 2/2] fix(mi300x): preserve M3 SwiGLU parameters in FP8 patch Signed-off-by: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> --- .../minimaxm3_mi300x_mxfp8.patch | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch b/benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch index 1e83f431d..7fe8fe6af 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch @@ -1,3 +1,24 @@ +diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py +index 0755699d1a4545649e8f5af5de77bbf2c6b24fab..905a9bea3c59ee3ef14a5acede345ffc2fd4a36d 100644 +--- a/vllm/model_executor/layers/fused_moe/config.py ++++ b/vllm/model_executor/layers/fused_moe/config.py +@@ -603,6 +603,8 @@ def fp8_w8a8_moe_quant_config( + a2_gscale: torch.Tensor | None = None, + g1_alphas: torch.Tensor | None = None, + g2_alphas: torch.Tensor | None = None, ++ gemm1_alpha: float | None = None, ++ gemm1_beta: float | None = None, + gemm1_clamp_limit: float | None = None, + ) -> FusedMoEQuantConfig: + """ +@@ -623,5 +625,7 @@ def fp8_w8a8_moe_quant_config( + per_act_token_quant=per_act_token_quant, + per_out_ch_quant=per_out_ch_quant, + block_shape=block_shape, ++ gemm1_alpha=gemm1_alpha, ++ gemm1_beta=gemm1_beta, + gemm1_clamp_limit=gemm1_clamp_limit, + ) diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json index c275cecc1591f16e91791e9b007cdb6fcaac40b4..f20c20c4d2a475ca00926c98608edc6b645dd4c1 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -188,6 +209,18 @@ index d0d7c76481b0a315e9c57810d40394822f62594c..e82429b8ecddc9b8e44f003a537de08b runner_backend = config.moe_backend if runner_backend != "auto": backend = _BACKEND_NAME_MAP.get(runner_backend) +diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py +index acbf2cb46ad42927fa344363059fe37a970d132b..1b5030b190960dd3758a25d156389be749f31530 100644 +--- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py ++++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py +@@ -568,5 +568,7 @@ def make_fp8_moe_quant_config( + block_shape=block_shape, + per_act_token_quant=per_act_token_quant, + per_out_ch_quant=per_out_ch_quant, ++ gemm1_alpha=gemm1_alpha, ++ gemm1_beta=gemm1_beta, + gemm1_clamp_limit=swiglu_limit, + ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 33c7c7532a0ba823e4e7a23538300a5977a4553e..9b9d73f7b5fc138cac3dc3349a24a473d2c1faf6 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py