ROCm · valarLip · May 12, 2026 · May 9, 2026 · May 9, 2026 · May 9, 2026
diff --git a/aiter/jit/optCompilerConfig.json b/aiter/jit/optCompilerConfig.json
@@ -534,7 +534,7 @@
         "srcs": [
             "f'{AITER_CSRC_DIR}/pybind/moe_topk_pybind.cu'",
             "f'{AITER_CSRC_DIR}/py_itfs_ck/topk_sigmoid_kernels.cu'",
-            "f'{AITER_CSRC_DIR}/kernels/topk_softplus_kernels.cu'",
+            "f'{AITER_CSRC_DIR}/kernels/topk_gating_kernels.cu'",
             "f'{CK_DIR}/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp'"
         ],
         "flags_extra_cc": [],

diff --git a/aiter/ops/topk.py b/aiter/ops/topk.py
@@ -12,6 +12,12 @@
 from ..utility import dtypes
 
 
+# DEPRECATED: low-level binding kept for backward compatibility only.
+# Will be removed once all callers have migrated to topk_gating() below.
+# New code should use topk_gating(), which:
+#   - accepts an Optional[Tensor] correction_bias (None => no bias)
+#   - validates score_func string
+#   - exposes the same C++ kernel under a more accurate name
 @compile_ops("module_moe_topk")
 def topk_softplus(
     topk_weights: torch.Tensor,
@@ -20,9 +26,54 @@ def topk_softplus(
     correction_bias: torch.Tensor,
     need_renorm: bool,
     routed_scaling_factor: float = 1.0,
+    score_func: str = "sqrtsoftplus",
 ) -> None: ...
 
 
+_VALID_SCORE_FUNCS = {"sqrtsoftplus", "sigmoid", "softmax"}
+
+
+def topk_gating(
+    topk_weights: torch.Tensor,
+    topk_indices: torch.Tensor,
+    gating_output: torch.Tensor,
+    correction_bias: Optional[torch.Tensor] = None,
+    need_renorm: bool = True,
+    routed_scaling_factor: float = 1.0,
+    score_func: str = "sqrtsoftplus",
+) -> None:
+    """Unified fused topk gating for MoE routing.
+
+    Args:
+        score_func: "sqrtsoftplus" (DeepSeek V4-Pro default), "sigmoid"
+            (Llama4) or "softmax" (DeepSeek V3 / classic MoE).
+        correction_bias: optional bias tensor, pass None for no bias.
+    Raises:
+        ValueError: if score_func is unknown (checked even under -O).
+    Note: softmax is already normalized, so renorm is forced off.
+    """
+    if score_func not in _VALID_SCORE_FUNCS:
+        raise ValueError(
+            f"Unknown score_func '{score_func}', expected one of {_VALID_SCORE_FUNCS}"
+        )
+    if correction_bias is None:
+        # Match gating dtype/device so dispatch picks DTYPE_B == DTYPE_I,
+        # avoiding extra kernel template instantiations.
+        correction_bias = torch.empty(
+            0, dtype=gating_output.dtype, device=gating_output.device
+        )
+    topk_softplus(
+        topk_weights,
+        topk_indices,
+        gating_output,
+        correction_bias,
+        # Caller's need_renorm is ignored for softmax (see docstring Note).
+        False if score_func == "softmax" else need_renorm,
+        routed_scaling_factor,
+        score_func,
+    )
+
+
 @compile_ops("module_moe_asm", fc_name="biased_grouped_topk")
 def biased_grouped_topk_hip(
     gating_output: torch.Tensor,

diff --git a/csrc/include/moe_op.h b/csrc/include/moe_op.h
@@ -55,7 +55,8 @@ void topk_softplus(torch::Tensor& topk_weights,
                     torch::Tensor& gating_output,
                     torch::Tensor& correction_bias,
                     bool need_renorm,
-                    float routed_scaling_factor = 1.0);
+                    float routed_scaling_factor = 1.0,
+                    const std::string& score_func = "sqrtsoftplus");
 
 void moe_align_block_size(torch::Tensor topk_ids,
                           int64_t num_experts,

diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp
@@ -1180,7 +1180,8 @@ namespace py = pybind11;
           py::arg("correction_bias"),                    \
           py::arg("need_renorm"),                        \
           py::arg("routed_scaling_factor") = 1.0,        \
-          "Apply topk sqrtsoftplus to the gating outputs.");
+          py::arg("score_func") = "sqrtsoftplus",        \
+          "Fused topk gating: score_func='sqrtsoftplus'|'sigmoid'|'softmax'.");
 
 #define MOE_SORTING_PYBIND                             \
     m.def("moe_sorting_fwd",                           \