|
| 1 | +diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py |
| 2 | +index 0755699d1a4545649e8f5af5de77bbf2c6b24fab..905a9bea3c59ee3ef14a5acede345ffc2fd4a36d 100644 |
| 3 | +--- a/vllm/model_executor/layers/fused_moe/config.py |
| 4 | ++++ b/vllm/model_executor/layers/fused_moe/config.py |
| 5 | +@@ -603,6 +603,8 @@ def fp8_w8a8_moe_quant_config( |
| 6 | + a2_gscale: torch.Tensor | None = None, |
| 7 | + g1_alphas: torch.Tensor | None = None, |
| 8 | + g2_alphas: torch.Tensor | None = None, |
| 9 | ++ gemm1_alpha: float | None = None, |
| 10 | ++ gemm1_beta: float | None = None, |
| 11 | + gemm1_clamp_limit: float | None = None, |
| 12 | + ) -> FusedMoEQuantConfig: |
| 13 | + """ |
| 14 | +@@ -623,5 +625,7 @@ def fp8_w8a8_moe_quant_config( |
| 15 | + per_act_token_quant=per_act_token_quant, |
| 16 | + per_out_ch_quant=per_out_ch_quant, |
| 17 | + block_shape=block_shape, |
| 18 | ++ gemm1_alpha=gemm1_alpha, |
| 19 | ++ gemm1_beta=gemm1_beta, |
| 20 | + gemm1_clamp_limit=gemm1_clamp_limit, |
| 21 | + ) |
1 | 22 | diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json |
2 | 23 | index c275cecc1591f16e91791e9b007cdb6fcaac40b4..f20c20c4d2a475ca00926c98608edc6b645dd4c1 100644 |
3 | 24 | --- a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json |
@@ -188,6 +209,18 @@ index d0d7c76481b0a315e9c57810d40394822f62594c..e82429b8ecddc9b8e44f003a537de08b |
188 | 209 | runner_backend = config.moe_backend |
189 | 210 | if runner_backend != "auto": |
190 | 211 | backend = _BACKEND_NAME_MAP.get(runner_backend) |
| 212 | +diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py |
| 213 | +index acbf2cb46ad42927fa344363059fe37a970d132b..1b5030b190960dd3758a25d156389be749f31530 100644 |
| 214 | +--- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py |
| 215 | ++++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py |
| 216 | +@@ -568,5 +568,7 @@ def make_fp8_moe_quant_config( |
| 217 | + block_shape=block_shape, |
| 218 | + per_act_token_quant=per_act_token_quant, |
| 219 | + per_out_ch_quant=per_out_ch_quant, |
| 220 | ++ gemm1_alpha=gemm1_alpha, |
| 221 | ++ gemm1_beta=gemm1_beta, |
| 222 | + gemm1_clamp_limit=swiglu_limit, |
| 223 | + ) |
191 | 224 | diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py |
192 | 225 | index 33c7c7532a0ba823e4e7a23538300a5977a4553e..9b9d73f7b5fc138cac3dc3349a24a473d2c1faf6 100644 |
193 | 226 | --- a/vllm/model_executor/layers/quantization/modelopt.py |
|
0 commit comments