Skip to content

Commit 7521394

Browse files
committed
fix(mi300x): preserve M3 SwiGLU parameters in FP8 patch
Signed-off-by: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
1 parent 27510c4 commit 7521394

1 file changed

Lines changed: 33 additions & 0 deletions

File tree

benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,24 @@
1+
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
2+
index 0755699d1a4545649e8f5af5de77bbf2c6b24fab..905a9bea3c59ee3ef14a5acede345ffc2fd4a36d 100644
3+
--- a/vllm/model_executor/layers/fused_moe/config.py
4+
+++ b/vllm/model_executor/layers/fused_moe/config.py
5+
@@ -603,6 +603,8 @@ def fp8_w8a8_moe_quant_config(
6+
a2_gscale: torch.Tensor | None = None,
7+
g1_alphas: torch.Tensor | None = None,
8+
g2_alphas: torch.Tensor | None = None,
9+
+ gemm1_alpha: float | None = None,
10+
+ gemm1_beta: float | None = None,
11+
gemm1_clamp_limit: float | None = None,
12+
) -> FusedMoEQuantConfig:
13+
"""
14+
@@ -623,5 +625,7 @@ def fp8_w8a8_moe_quant_config(
15+
per_act_token_quant=per_act_token_quant,
16+
per_out_ch_quant=per_out_ch_quant,
17+
block_shape=block_shape,
18+
+ gemm1_alpha=gemm1_alpha,
19+
+ gemm1_beta=gemm1_beta,
20+
gemm1_clamp_limit=gemm1_clamp_limit,
21+
)
1 22
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
2 23
index c275cecc1591f16e91791e9b007cdb6fcaac40b4..f20c20c4d2a475ca00926c98608edc6b645dd4c1 100644
3 24
--- a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -188,6 +209,18 @@ index d0d7c76481b0a315e9c57810d40394822f62594c..e82429b8ecddc9b8e44f003a537de08b
188 209
runner_backend = config.moe_backend
189 210
if runner_backend != "auto":
190 211
backend = _BACKEND_NAME_MAP.get(runner_backend)
212+
diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
213+
index acbf2cb46ad42927fa344363059fe37a970d132b..1b5030b190960dd3758a25d156389be749f31530 100644
214+
--- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py
215+
+++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
216+
@@ -568,5 +568,7 @@ def make_fp8_moe_quant_config(
217+
block_shape=block_shape,
218+
per_act_token_quant=per_act_token_quant,
219+
per_out_ch_quant=per_out_ch_quant,
220+
+ gemm1_alpha=gemm1_alpha,
221+
+ gemm1_beta=gemm1_beta,
222+
gemm1_clamp_limit=swiglu_limit,
223+
)
191 224
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
192 225
index 33c7c7532a0ba823e4e7a23538300a5977a4553e..9b9d73f7b5fc138cac3dc3349a24a473d2c1faf6 100644
193 226
--- a/vllm/model_executor/layers/quantization/modelopt.py

0 commit comments

Comments
 (0)