Commit e4831e4
[NPU] Add NPU Fused MoE kernel (#1183)
## Motivation

This PR ports `fused_moe.py` and `fused_moe_kernels.py` to an NPU-affine implementation while preserving the original math. The computational definition is unchanged: the forward pass remains `W1 (gate/up) -> SwiGLU -> W2 -> token-weighted gather`, and the backward pass still follows `dA' = dO @ W2^T` to produce `d_pre_act / dS / dW2 / dX / dW1`. The main changes are execution-strategy optimizations for the NPU.

## Note: Use the Skill

For this fused_moe kernel migration, we followed the skill document from #1197.

## Testing Done

- Hardware Type: Ascend 910B2
- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [x] run `make test-convergence` to ensure convergence

🤖 Generated with: [cursor](https://cursor.com/).
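For readers who want the preserved math spelled out, here is a minimal dense-PyTorch sketch of the computation described above. The function name, argument names, and shapes are illustrative assumptions for this sketch, not the fused kernel's actual API:

```python
# Reference sketch of the preserved math:
# W1 (gate/up) -> SwiGLU -> W2 -> token-weighted gather.
# All names and shapes here are assumptions, not the kernel's real signature.
import torch
import torch.nn.functional as F

def moe_reference_forward(x, w1, w2, topk_ids, topk_weights):
    """x: (T, H); w1: (E, H, 2*I) packing gate/up; w2: (E, I, H);
    topk_ids, topk_weights: (T, K) from the router."""
    out = torch.zeros_like(x)
    for e in range(w1.shape[0]):                # loop over experts
        for k in range(topk_ids.shape[1]):      # loop over top-k slots
            rows = topk_ids[:, k] == e          # tokens routed to expert e
            if not rows.any():
                continue
            pre_act = x[rows] @ w1[e]           # W1 (gate/up)
            gate, up = pre_act.chunk(2, dim=-1)
            act = F.silu(gate) * up             # SwiGLU
            expert_out = act @ w2[e]            # W2
            out[rows] += topk_weights[rows, k].unsqueeze(-1) * expert_out
    return out                                  # token-weighted gather

# Autograd of this graph matches the backward the PR keeps: per expert,
# d(act) = dO_e @ w2[e].T (the commit's dA' = dO @ W2^T), from which
# d_pre_act, dS, dW2, dX, and dW1 follow by the chain rule.
```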
1 parent dcd404b commit e4831e4

5 files changed

Lines changed: 1166 additions & 5 deletions

benchmark/scripts/benchmark_fused_moe.py

Lines changed: 18 additions & 3 deletions

@@ -24,7 +24,7 @@
 from utils import run_memory_benchmark
 from utils import run_speed_benchmark

-from liger_kernel.ops.fused_moe import LigerFusedMoEFunction
+from liger_kernel.ops import LigerFusedMoEFunction
 from liger_kernel.utils import get_total_gpu_memory
 from liger_kernel.utils import infer_device

@@ -157,7 +157,14 @@ def _warmup_liger(T, E, H, intermediate_dim, K, dtype, sweep_dim):
     warmup_out = warmup_fn()
     warmup_out.sum().backward()
     del warmup_out
-    torch.cuda.synchronize()
+    if device == "cuda":
+        torch.cuda.synchronize()
+    elif device == "npu":
+        torch.npu.synchronize()
+    elif device == "xpu":
+        torch.xpu.synchronize()
+    else:
+        torch.cpu.synchronize()


 # ---------------------------------------------------------------------------
@@ -231,7 +238,15 @@ def _probe():
         print(f" warmup E={e_val}...")
         _warmup_liger(probe_T, e_val, H, intermediate_dim, K, dtype, sweep_dim="E")

-    torch.cuda.synchronize()
+    if device == "cuda":
+        torch.cuda.synchronize()
+    elif device == "npu":
+        torch.npu.synchronize()
+    elif device == "xpu":
+        torch.xpu.synchronize()
+    else:
+        torch.cpu.synchronize()
+
     print("Autotune warmup complete.\n")

     if args.sweep_dim == "num_tokens":
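As a side note on the pattern above: the same four-way branch appears in both hunks. A hypothetical consolidation (not part of this commit), assuming each backend module on `torch` exposes a `synchronize()` of the same shape — true for `cuda`/`xpu`/`cpu` in recent PyTorch and for `npu` once `torch_npu` is installed:

```python
import torch

def synchronize(device: str) -> None:
    """Block until all queued work on the given device type has finished.
    Hypothetical helper; falls back to torch.cpu when the backend module
    (e.g. torch.npu without torch_npu) is absent."""
    backend = getattr(torch, device, torch.cpu)
    backend.synchronize()
```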

src/liger_kernel/ops/backends/_ascend/ops/__init__.py

Lines changed: 4 additions & 0 deletions

@@ -32,6 +32,8 @@
 from liger_kernel.ops.backends._ascend.ops.fused_linear_jsd import LigerFusedLinearJSDFunction
 from liger_kernel.ops.backends._ascend.ops.fused_linear_jsd import fused_linear_jsd_backward
 from liger_kernel.ops.backends._ascend.ops.fused_linear_jsd import fused_linear_jsd_forward
+from liger_kernel.ops.backends._ascend.ops.fused_moe import LigerFusedMoEFunction
+from liger_kernel.ops.backends._ascend.ops.fused_moe import compute_routing_metadata
 from liger_kernel.ops.backends._ascend.ops.fused_neighborhood_attention import LigerFusedNeighborhoodAttentionFunction
 from liger_kernel.ops.backends._ascend.ops.fused_neighborhood_attention import fused_neighborhood_attention_forward
 from liger_kernel.ops.backends._ascend.ops.geglu import LigerGELUMulFunction
@@ -149,6 +151,8 @@
     "LigerFusedLinearCrossEntropyFunction",
     "fused_linear_cross_entropy_forward",
     "fused_linear_cross_entropy_backward",
+    "LigerFusedMoEFunction",
+    "compute_routing_metadata",
     "LigerMHCCoeffsFunction",
     "LigerMHCPreFunction",
     "LigerMHCPostResFunction",
