diff --git a/benchmark/scripts/benchmark_fused_moe.py b/benchmark/scripts/benchmark_fused_moe.py
Lines changed: 12 additions & 5 deletions
diff --git a/src/liger_kernel/ops/backends/_ascend/ops/__init__.py b/src/liger_kernel/ops/backends/_ascend/ops/__init__.py
Lines changed: 4 additions & 0 deletions
@@ -24,7 +24,7 @@
 from utils import run_memory_benchmark
 from utils import run_speed_benchmark
 
-from liger_kernel.ops.fused_moe import LigerFusedMoEFunction
+from liger_kernel.ops import LigerFusedMoEFunction
 from liger_kernel.utils import get_total_gpu_memory
 from liger_kernel.utils import infer_device
 
@@ -106,11 +106,11 @@ def _setup_fused_moe(input: SingleBenchmarkRunInput):
     if input.kernel_provider == "liger":
 
         def fwd_fn():
-            return LigerFusedMoEFunction.apply(x, gup, dn, idx, wts)
+            return LigerFusedMoEFunction.apply(x, gup, dn, idx, wts).to(device)
     elif input.kernel_provider == "huggingface":
 
         def fwd_fn():
-            return _huggingface_moe_forward(x, gup, dn, idx, wts)
+            return _huggingface_moe_forward(x, gup, dn, idx, wts).to(device)
     else:
         raise ValueError(f"Unknown provider: {input.kernel_provider}")
 
@@ -157,7 +157,10 @@ def _warmup_liger(T, E, H, intermediate_dim, K, dtype, sweep_dim):
     warmup_out = warmup_fn()
     warmup_out.sum().backward()
     del warmup_out
-    torch.cuda.synchronize()
+    if device == "cuda" and torch.cuda.is_available():
+        torch.cuda.synchronize()
+    elif device == "npu" and hasattr(torch, "npu") and torch.npu.is_available():
+        torch.npu.synchronize()
 
 
 # ---------------------------------------------------------------------------
@@ -231,7 +234,11 @@ def _probe():
             print(f"  warmup E={e_val}...")
             _warmup_liger(probe_T, e_val, H, intermediate_dim, K, dtype, sweep_dim="E")
 
-    torch.cuda.synchronize()
+    if device == "cuda" and torch.cuda.is_available():
+        torch.cuda.synchronize()
+    elif device == "npu" and hasattr(torch, "npu") and torch.npu.is_available():
+        torch.npu.synchronize()
+
     print("Autotune warmup complete.\n")
 
     if args.sweep_dim == "num_tokens":
 
@@ -32,6 +32,8 @@
 from liger_kernel.ops.backends._ascend.ops.fused_linear_jsd import LigerFusedLinearJSDFunction
 from liger_kernel.ops.backends._ascend.ops.fused_linear_jsd import fused_linear_jsd_backward
 from liger_kernel.ops.backends._ascend.ops.fused_linear_jsd import fused_linear_jsd_forward
+from liger_kernel.ops.backends._ascend.ops.fused_moe import LigerFusedMoEFunction
+from liger_kernel.ops.backends._ascend.ops.fused_moe import compute_routing_metadata
 from liger_kernel.ops.backends._ascend.ops.fused_neighborhood_attention import LigerFusedNeighborhoodAttentionFunction
 from liger_kernel.ops.backends._ascend.ops.fused_neighborhood_attention import fused_neighborhood_attention_forward
 from liger_kernel.ops.backends._ascend.ops.geglu import LigerGELUMulFunction
@@ -146,4 +148,6 @@
     "LigerFusedLinearCrossEntropyFunction",
     "fused_linear_cross_entropy_forward",
     "fused_linear_cross_entropy_backward",
+    "LigerFusedMoEFunction",
+    "compute_routing_metadata",
 ]