[NPU A3] Fix benchmark issues for fused_linear_jsd and dyt. (#1231)

sunyi0505 · web-flow · commit 547cf4c8f8ac · 2026-05-23T11:41:11.000Z
## Summary Fix benchmark issues for fused_linear_jsd and dyt. 1. dyt throws errors when the torch.compile baseline is run on NPU, so the benchmark now leaves the `torch_compile` provider out on NPU devices. 2. fused_linear_jsd hits a grid-size out-of-limit error on NPU when the grid exceeds 65536. The issue arises from using the number of rows (`n_rows`) as the grid size. The grid is now `min(num_cores, n_rows)`, with each program looping over the rows assigned to it. ## Testing Done dyt: <img width="1699" height="480" alt="image" src="https://github.com/user-attachments/assets/a0a44250-fc8d-45d5-9b5a-1c4529a1db2b" /> fused_linear_jsd: <img width="1676" height="499" alt="image" src="https://github.com/user-attachments/assets/c5a91b9f-5b74-4065-a6b7-74118820b43f" /> Atlas 800T-A3 x86 <!-- Complete the following tasks before sending your PR, and replace `[ ]` with `[x]` to indicate you have done them. --> - Hardware Type: <BLANK> - [x] run `make test` to ensure correctness - [x] run `make checkstyle` to ensure code style - [x] run `make test-convergence` to ensure convergence
diff --git a/benchmark/scripts/benchmark_dyt.py b/benchmark/scripts/benchmark_dyt.py
@@ -20,6 +20,13 @@
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
 
 
+def get_kernel_providers():  # Build the list of kernel providers the DyT benchmark should run.
+    providers = ["liger", "torch"]
+    if device != "npu":  # torch.compile baseline errors on NPU; `device` is defined outside this hunk
+        providers.append("torch_compile")
+    return providers
+
+
 def setup_dyt(input: SingleBenchmarkRunInput):
     """Create input tensor and DyT layer from benchmark config."""
     from test.transformers.test_dyt import LigerDyT
@@ -85,7 +92,7 @@ def setup_dyt(input: SingleBenchmarkRunInput):
                 overwrite=args.overwrite,
             )
 
-        common_configs["kernel_providers"] = ["liger", "torch", "torch_compile"]
+        common_configs["kernel_providers"] = get_kernel_providers()
         run_benchmarks(
             bench_test_fn=build_speed_bench_fn(setup_dyt),
             kernel_operation_modes=["forward", "backward", "full"],
diff --git a/src/liger_kernel/ops/backends/_ascend/ops/fused_linear_jsd.py b/src/liger_kernel/ops/backends/_ascend/ops/fused_linear_jsd.py
@@ -2,16 +2,38 @@
 
 import torch
 import triton
+import triton.language as tl
 
 from liger_kernel.ops.backends._ascend.ops.jsd import _jsd_kernel
 from liger_kernel.ops.utils import amp_custom_bwd
 from liger_kernel.ops.utils import amp_custom_fwd
-from liger_kernel.ops.utils import element_mul_kernel
 from liger_kernel.ops.utils import get_npu_core_count
 
 MAX_FUSED_SIZE = 4096
 
 
+@triton.jit
+def _element_mul_kernel(  # Multiply every element of a 2D buffer in place by a scalar loaded from memory.
+    X_ptr,  # base pointer of the buffer that is both read and overwritten
+    X_stride,  # offset between consecutive rows (callers pass `.stride(-2)`)
+    grad_output_ptr,  # pointer from which the scalar multiplier is loaded
+    n_rows: tl.constexpr,  # total number of rows to process
+    n_cols: tl.constexpr,  # number of valid columns in each row
+    BLOCK_SIZE: tl.constexpr,  # number of columns handled per inner-loop step
+):
+    pid = tl.program_id(0)
+    num_progs = tl.num_programs(0)  # launch grid size; callers cap it at min(num_cores, n_rows)
+    grad_output = tl.load(grad_output_ptr)
+
+    for row_idx in range(pid, n_rows, num_progs):  # this program handles rows pid, pid + num_progs, ...
+        row_ptr = X_ptr + row_idx * X_stride
+        for col_start in range(0, n_cols, BLOCK_SIZE):  # walk the row in BLOCK_SIZE-wide chunks
+            offsets = col_start + tl.arange(0, BLOCK_SIZE)
+            mask = offsets < n_cols  # mask off lanes past the end of the row in the last chunk
+            values = tl.load(row_ptr + offsets, mask=mask)
+            tl.store(row_ptr + offsets, values * grad_output, mask=mask)
+
+
 def fused_linear_jsd_forward(
     student_input,
     student_weight,
@@ -131,11 +153,13 @@ def fused_linear_jsd_backward(grad_output, grad_input, grad_weight):
         BT, H = grad_input.shape
         n_rows = BT
         BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(H))
+        num_cores = get_npu_core_count()
 
-        element_mul_kernel[(n_rows,)](
+        _element_mul_kernel[(min(num_cores, n_rows),)](
             grad_input,
             grad_input.stride(-2),
             grad_output,
+            n_rows,
             H,
             BLOCK_SIZE=BLOCK_SIZE,
         )
@@ -145,10 +169,11 @@ def fused_linear_jsd_backward(grad_output, grad_input, grad_weight):
             V, H = grad_weight.shape
             n_rows = V
 
-            element_mul_kernel[(n_rows,)](
+            _element_mul_kernel[(min(num_cores, n_rows),)](
                 grad_weight,
                 grad_weight.stride(-2),
                 grad_output,
+                n_rows,
                 H,
                 BLOCK_SIZE=BLOCK_SIZE,
             )