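Update the grouped MLP fusion path to pass runtime GeGLU parameters (alpha, linear offset, clamp limit) to cuDNN when cuDNN frontend >= 1.24.0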

hxbai · hxbai · commit 257253afb2e2 · 2026-05-20T23:14:22.000-07:00
Signed-off-by: Hongxiao Bai <hongxiaob@nvidia.com>
diff --git a/tests/pytorch/test_fusible_ops.py b/tests/pytorch/test_fusible_ops.py
@@ -3578,7 +3578,14 @@ def test_layernorm_mlp(
     @pytest.mark.parametrize("glu_interleave_size", (None, 32))
     @pytest.mark.parametrize("delay_wgrad_compute", (False, True))
     @pytest.mark.parametrize("hidden_size", (128, 256))
-    @pytest.mark.parametrize("activation", ("scaled_swiglu", "scaled_clamped_qgeglu"))
+    @pytest.mark.parametrize(
+        "activation",
+        (
+            "scaled_swiglu",
+            "scaled_clamped_qgeglu",
+            "scaled_clamped_qgeglu_custom",
+        ),
+    )
     def test_grouped_mlp(
         self,
         *,
@@ -3623,10 +3630,20 @@ def test_grouped_mlp(
             pytest.skip("single_grouped_bias requires bias=True")
         if with_quantization and dtype not in (torch.bfloat16, torch.float16):
             pytest.skip("Quantized group GEMM is only supported with BF16/FP16")
-        if quantization == "nvfp4" and activation == "scaled_clamped_qgeglu" and bias:
+        if quantization == "nvfp4" and activation.startswith("scaled_clamped_qgeglu") and bias:
             # TODO: ksivaman: Need to debug numerics for this case.
             pytest.skip("Bias/dbias not yet supported in NVFP4 fused grouped MLP with GeGLU")
 
+        # Activation parameters for clamped QGeGLU variants
+        if activation == "scaled_clamped_qgeglu_custom":
+            geglu_limit = 5.0
+            geglu_alpha = 1.5
+            geglu_offset = 0.5
+        else:
+            geglu_limit = 7.0
+            geglu_alpha = 1.702
+            geglu_offset = 1.0
+
         # Random data
         x_ref, x_test = make_reference_and_test_tensors(
             in_shape,
@@ -3717,11 +3734,10 @@ def test_grouped_mlp(
             if activation == "scaled_swiglu":
                 x = torch.nn.functional.silu(x1) * x2
             else:
-                lim = torch.tensor(7.0, device=x1.device, dtype=x1.dtype)
-                geglu_alpha = 1.702
+                lim = torch.tensor(geglu_limit, device=x1.device, dtype=x1.dtype)
                 x1c = torch.minimum(x1, lim)
                 x2c = torch.clamp(x2, -lim, lim)
-                x = (x2c + 1) * (x1c * torch.sigmoid(geglu_alpha * x1c))
+                x = (x2c + geglu_offset) * (x1c * torch.sigmoid(geglu_alpha * x1c))
             x = x * probs[group_idx].unsqueeze(-1)
             x = torch.nn.functional.linear(x, fc2_ws_ref[group_idx])
             if bias:
@@ -3732,11 +3748,15 @@ def test_grouped_mlp(
 
         # Construct operations
         recipe = make_recipe(quantization)
-        scaled_act = (
-            te_ops.ScaledSwiGLU(glu_interleave_size=glu_interleave_size)
-            if activation == "scaled_swiglu"
-            else te_ops.ScaledClampedQGeGLU(glu_interleave_size=glu_interleave_size)
-        )
+        if activation == "scaled_swiglu":
+            scaled_act = te_ops.ScaledSwiGLU(glu_interleave_size=glu_interleave_size)
+        else:
+            scaled_act = te_ops.ScaledClampedQGeGLU(
+                glu_interleave_size=glu_interleave_size,
+                limit=geglu_limit,
+                alpha=geglu_alpha,
+                glu_linear_offset=geglu_offset,
+            )
         with te.quantized_model_init(enabled=with_quantization, recipe=recipe):
             fc1 = te_ops.GroupedLinear(
                 group_size,
diff --git a/transformer_engine/pytorch/ops/_common.py b/transformer_engine/pytorch/ops/_common.py
@@ -21,16 +21,34 @@
 from ..utils import canonicalize_dtype
 
 
+@functools.lru_cache(maxsize=1)
+def _cudnn_frontend_version() -> Optional[PkgVersion]:
+    """Return the installed cuDNN-frontend version, or ``None``."""
+    try:
+        return PkgVersion(get_pkg_version("nvidia-cudnn-frontend"))
+    except PackageNotFoundError:
+        return None
+
+
 @functools.lru_cache(maxsize=1)
 def _cudnn_frontend_version_supported() -> bool:
     """Check cuDNN frontend is at least 1.23.0.
 
-    All grouped MLP fused-kernel features require cuDNN frontend 1.23.0.
+    All grouped MLP fused-kernel features require cuDNN frontend >= 1.23.0.
     """
-    try:
-        return PkgVersion(get_pkg_version("nvidia-cudnn-frontend")) >= PkgVersion("1.23.0")
-    except PackageNotFoundError:
-        return False
+    ver = _cudnn_frontend_version()
+    return ver is not None and ver >= PkgVersion("1.23.0")
+
+
+@functools.lru_cache(maxsize=1)
+def _cudnn_frontend_geglu_runtime_params() -> bool:
+    """Check cuDNN frontend is at least 1.24.0.
+
+    Runtime-configurable GeGLU parameters (linear_offset, geglu_alpha,
+    glu_clamp_max, glu_clamp_min) require cuDNN frontend >= 1.24.0.
+    """
+    ver = _cudnn_frontend_version()
+    return ver is not None and ver >= PkgVersion("1.24.0")
 
 
 def is_quantized_tensor(tensor: torch.Tensor | QuantizedTensorStorage) -> bool:
@@ -256,9 +274,14 @@ def fuse_grouped_mlp_ops(
             and isinstance(window[2], GroupedLinear)
         ):
             matches_pattern = False
-        elif isinstance(window[1], ScaledClampedQGeGLU) and (
-            abs(window[1]._clamped.alpha - 1.702) > 0.001
-            or abs(window[1]._clamped.glu_linear_offset - 1.0) > 0.001
+        elif (
+            isinstance(window[1], ScaledClampedQGeGLU)
+            and not _cudnn_frontend_geglu_runtime_params()
+            and (
+                abs(window[1]._clamped.alpha - 1.702) > 0.001
+                or abs(window[1]._clamped.glu_linear_offset - 1.0) > 0.001
+                or abs(window[1]._clamped.limit - 7.0) > 0.001
+            )
         ):
             matches_pattern = False
         elif window[0].num_groups != window[2].num_groups:
diff --git a/transformer_engine/pytorch/ops/fused/backward_grouped_mlp.py b/transformer_engine/pytorch/ops/fused/backward_grouped_mlp.py
@@ -22,6 +22,7 @@
 from ..fuser import register_backward_fusion
 from ..op import FusedOperation, FusibleOperation, OperationContext
 from .._common import (
+    _cudnn_frontend_geglu_runtime_params,
     _cudnn_frontend_version_supported,
     fuse_grouped_mlp_ops,
     get_accumulate_flag_in_param,
@@ -314,11 +315,16 @@ def __init__(
             self.grouped_gemm_dglu_kernel()  # Try triggering import error
             raise RuntimeError(f"{self.__class__.__name__} is not supported on this system.")
         validate_grouped_mlp_dims(fc1, swiglu, fc2)
-        # The cuDNN dgeglu implementation corresponds to ScaledClampedQGeGLU.
-        # The act_func string should be fixed on the cuDNN FE side.
-        self._cudnn_dact_func: str = (
-            "dgeglu" if isinstance(swiglu, ScaledClampedQGeGLU) else "dswiglu"
+        is_geglu = isinstance(swiglu, ScaledClampedQGeGLU)
+        self._cudnn_dact_func: str = "dgeglu" if is_geglu else "dswiglu"
+        self._pass_geglu_runtime_params: bool = (
+            is_geglu and _cudnn_frontend_geglu_runtime_params()
         )
+        if self._pass_geglu_runtime_params:
+            self._cudnn_linear_offset: float = swiglu._clamped.glu_linear_offset
+            self._cudnn_geglu_alpha: float = swiglu._clamped.alpha
+            self._cudnn_glu_clamp_max: float = swiglu._clamped.limit
+            self._cudnn_glu_clamp_min: float = -swiglu._clamped.limit
 
     def fuser_backward(
         self,
@@ -472,6 +478,13 @@ def fuser_backward(
             "act_func": self._cudnn_dact_func,
             "use_dynamic_sched": True,
         }
+        if self._pass_geglu_runtime_params:
+            fc2_dglu_kwargs.update(
+                linear_offset=self._cudnn_linear_offset,
+                geglu_alpha=self._cudnn_geglu_alpha,
+                glu_clamp_max=self._cudnn_glu_clamp_max,
+                glu_clamp_min=self._cudnn_glu_clamp_min,
+            )
 
         if fc2_op.single_grouped_weight:
             # Clone and swizzle scales for GEMM
diff --git a/transformer_engine/pytorch/ops/fused/forward_grouped_mlp.py b/transformer_engine/pytorch/ops/fused/forward_grouped_mlp.py
@@ -23,6 +23,7 @@
 from ..fuser import register_forward_fusion
 from ..op import FusedOperation, FusibleOperation, OperationContext
 from .._common import (
+    _cudnn_frontend_geglu_runtime_params,
     _cudnn_frontend_version_supported,
     fuse_grouped_mlp_ops,
     is_quantized_tensor,
@@ -97,9 +98,16 @@ def __init__(
             self.grouped_gemm_glu_kernel()  # Try triggering import error
             raise RuntimeError(f"{self.__class__.__name__} is not supported on this system.")
         validate_grouped_mlp_dims(fc1, swiglu, fc2)
-        # The cuDNN geglu implementation corresponds to ScaledClampedQGeGLU.
-        # The act_func string should be fixed on the cuDNN FE side.
-        self._cudnn_act_func: str = "geglu" if isinstance(swiglu, ScaledClampedQGeGLU) else "swiglu"
+        is_geglu = isinstance(swiglu, ScaledClampedQGeGLU)
+        self._cudnn_act_func: str = "geglu" if is_geglu else "swiglu"
+        self._pass_geglu_runtime_params: bool = (
+            is_geglu and _cudnn_frontend_geglu_runtime_params()
+        )
+        if self._pass_geglu_runtime_params:
+            self._cudnn_linear_offset: float = swiglu._clamped.glu_linear_offset
+            self._cudnn_geglu_alpha: float = swiglu._clamped.alpha
+            self._cudnn_glu_clamp_max: float = swiglu._clamped.limit
+            self._cudnn_glu_clamp_min: float = -swiglu._clamped.limit
 
     def fuser_forward(
         self,
@@ -305,6 +313,13 @@ def fuser_forward(
             "act_func": self._cudnn_act_func,
             "use_dynamic_sched": True,
         }
+        if self._pass_geglu_runtime_params:
+            fc1_glu_kwargs.update(
+                linear_offset=self._cudnn_linear_offset,
+                geglu_alpha=self._cudnn_geglu_alpha,
+                glu_clamp_max=self._cudnn_glu_clamp_max,
+                glu_clamp_min=self._cudnn_glu_clamp_min,
+            )
 
         if fc1_op.single_grouped_weight:
             # Clone and swizzle scales for GEMM.