perf: exl3 decode kernel optimization experiments (#1655)

AlpinDale · web-flow · commit 6c59bc7167c5 · 2026-04-28T05:45:20.000+04:30
* perf: exl3 decode kernel optimization experiments

Signed-off-by: AlpinDale <alpindale@gmail.com>

* fix: remove unsafe EXL3 shape overrides

Signed-off-by: AlpinDale <alpindale@gmail.com>

---------

Signed-off-by: AlpinDale <alpindale@gmail.com>
diff --git a/aphrodite/config/model.py b/aphrodite/config/model.py
@@ -585,6 +585,7 @@ def __post_init__(
             if self.pooler_config.tok_pooling_type is None:
                 self.pooler_config.tok_pooling_type = default_tok_pooling_type
 
+        requested_dtype = self.dtype
         self.dtype: torch.dtype = _get_and_verify_dtype(
             self.model,
             self.hf_config,
@@ -667,6 +668,15 @@ def __post_init__(
         self.config_updated = False
         self._try_verify_and_update_model_config()
         self._verify_quantization()
+        if (
+            self.quantization == "exl3"
+            and isinstance(requested_dtype, str)
+            and requested_dtype.lower() == "auto"
+            and self.dtype != torch.float16
+            and "moe" in self.hf_config.model_type.lower()
+        ):
+            logger.info("Defaulting EXL3 activation dtype from %s to torch.float16.", self.dtype)
+            self.dtype = torch.float16
         self._verify_cuda_graph()
         self._verify_bnb_config()
 
diff --git a/aphrodite/model_executor/layers/quantization/exl3.py b/aphrodite/model_executor/layers/quantization/exl3.py
@@ -184,6 +184,8 @@ def _exl3_gate_up(
         -1,
         0,
     )
+    if x.shape[0] == 1:
+        return output.view(1, out_features * 2)
     return torch.cat([output[0], output[1]], dim=-1)
 
 
diff --git a/csrc/quantization/exl3/exllamav3_ext/quant/exl3_gemm_kernel.cuh b/csrc/quantization/exl3/exllamav3_ext/quant/exl3_gemm_kernel.cuh
@@ -223,4 +223,4 @@ __global__ __launch_bounds__(EXL3_GEMM_BASE_THREADS* TILESIZE_K /
       }
     }
   }
-}
+}
diff --git a/csrc/quantization/exl3/exllamav3_ext/quant/exl3_kernel_map.cu b/csrc/quantization/exl3/exllamav3_ext/quant/exl3_kernel_map.cu
@@ -159,19 +159,10 @@ fp_exl3_mgemm_kernel select_exl3_mgemm_kernel(
     int cc, int size_m, int size_k, int size_n, int K, bool c_fp32,
     int force_shape_idx, int* out_block_dim, int* out_shape_idx, int* num_sms,
     int cb, int bszm_in, int bszm_out) {
-  int shape_idx;
-  if (force_shape_idx > 0) {
-    shape_idx = force_shape_idx;
-  } else if (cc == CC_BLACKWELL && K == 4 && size_m == 1 && size_k == 1024 &&
-             size_n == 256 && bszm_out <= 32) {
-    shape_idx = 4;
-  } else if (cc == CC_BLACKWELL && K == 4 && size_m == 1 && size_k == 1024 &&
-             size_n == 3072 && bszm_out == 2) {
-    shape_idx = 2;
-  } else {
-    shape_idx = select_gemm_shape(cc, size_m, size_k, size_n, K, true, bszm_in,
-                                  bszm_out);
-  }
+  int shape_idx = force_shape_idx <= 0
+                      ? select_gemm_shape(cc, size_m, size_k, size_n, K, true,
+                                          bszm_in, bszm_out)
+                      : force_shape_idx;
   TORCH_CHECK(shape_idx > 0, "exl3_mgemm: no compatible kernel");
   if (out_shape_idx) *out_shape_idx = shape_idx;
   if (out_block_dim) *out_block_dim = exl3_gemm_blockdim[shape_idx];

Original file line number	Diff line number	Diff line change
`@@ -184,6 +184,8 @@ def _exl3_gate_up(`
`184`	`184`	`-1,`
`185`	`185`	`0,`
`186`	`186`	`)`
	`187`	`+ if x.shape[0] == 1:`
	`188`	`+ return output.view(1, out_features * 2)`
`187`	`189`	`return torch.cat([output[0], output[1]], dim=-1)`
`188`	`190`
`189`	`191`
Original file line number	Diff line number	Diff line change
`@@ -223,4 +223,4 @@ __global__ __launch_bounds__(EXL3_GEMM_BASE_THREADS* TILESIZE_K /`
`223`	`223`	`}`
`224`	`224`	`}`
`225`	`225`	`}`
`226`		`-}`
	`226`	`+}`