Skip to content

Commit ce3e9ca

Browse files
committed
lint fix - 2
1 parent c90a8e8 commit ce3e9ca

File tree

13 files changed

+45953
-40
lines changed

13 files changed

+45953
-40
lines changed

backends/cuda/tests/test_chunk_gated_delta_rule.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,11 @@ def _make_inputs_from_fla(
9898
q = torch.rand(B, seq_len, H, K, dtype=dtype, device=device)
9999
k = torch.rand(B, seq_len, H, K, dtype=dtype, device=device)
100100
v = torch.rand(B, seq_len, H, V, dtype=dtype, device=device)
101-
beta = torch.rand(B, seq_len, H, dtype=torch.float32, device=device).sigmoid().to(dtype)
101+
beta = (
102+
torch.rand(B, seq_len, H, dtype=torch.float32, device=device)
103+
.sigmoid()
104+
.to(dtype)
105+
)
102106
g = F.logsigmoid(torch.rand(B, seq_len, H, dtype=torch.float32, device=device))
103107
g = (g / gate_logit_normalizer).to(dtype)
104108
if mask_p > 0:
@@ -261,7 +265,9 @@ def test_recurrent_t1(self):
261265
model = ChunkGatedDeltaModel().eval()
262266
for seed, norm, mask_p, nonzero_h0, desc in FLA_TEST_CONFIGS:
263267
with self.subTest(desc=desc):
264-
inputs = _make_inputs_from_fla(seed, norm, mask_p, nonzero_h0, seq_len=1)
268+
inputs = _make_inputs_from_fla(
269+
seed, norm, mask_p, nonzero_h0, seq_len=1
270+
)
265271
q, k, v, g, beta, h0 = inputs
266272

267273
with torch.no_grad():
@@ -312,12 +318,8 @@ def test_dispatch_multiple_seq_lengths(self):
312318
self.assertEqual(s_ours.shape, torch.Size([B, H, K, V]))
313319
o_diff = (o_ours.float() - o_ref.float()).abs().max().item()
314320
s_diff = (s_ours.float() - s_ref.float()).abs().max().item()
315-
self.assertLess(
316-
o_diff, 0.02, f"T={seq_len}: output diff {o_diff}"
317-
)
318-
self.assertLess(
319-
s_diff, 0.02, f"T={seq_len}: state diff {s_diff}"
320-
)
321+
self.assertLess(o_diff, 0.02, f"T={seq_len}: output diff {o_diff}")
322+
self.assertLess(s_diff, 0.02, f"T={seq_len}: state diff {s_diff}")
321323

322324
def test_export_cuda(self):
323325
with tempfile.TemporaryDirectory() as tmpdir:

backends/cuda/triton/kernels/chunk_gated_delta_rule.py

Lines changed: 110 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,13 @@ def _unwrap(kernel):
6868
@triton.jit
6969
def _recurrent_gated_delta_rule_kernel(
7070
# Pointers — all inputs [B, 1, H, *] squeezed to [B, H, *]
71-
q_ptr, # [B, H, K]
72-
k_ptr, # [B, H, K]
73-
v_ptr, # [B, H, V]
74-
g_ptr, # [B, H]
75-
beta_ptr, # [B, H]
71+
q_ptr, # [B, H, K]
72+
k_ptr, # [B, H, K]
73+
v_ptr, # [B, H, V]
74+
g_ptr, # [B, H]
75+
beta_ptr, # [B, H]
7676
state_ptr, # [B, H, K, V] input state (read)
77-
o_ptr, # [B, H, V] output
77+
o_ptr, # [B, H, V] output
7878
new_state_ptr, # [B, H, K, V] output state (write)
7979
# Dims
8080
K: tl.constexpr,
@@ -137,7 +137,9 @@ def _recurrent_gated_delta_rule_kernel(
137137
o_tile = tl.sum(state_tile * q_vec[:, None], axis=0) * scale
138138

139139
# Store output tile
140-
tl.store(o_ptr + v_base + v_range, o_tile.to(o_ptr.dtype.element_ty), mask=v_mask)
140+
tl.store(
141+
o_ptr + v_base + v_range, o_tile.to(o_ptr.dtype.element_ty), mask=v_mask
142+
)
141143

142144
# Store new state tile
143145
tl.store(
@@ -212,34 +214,74 @@ def _launch_chunked(q, k, v, g, beta, initial_state, scale):
212214
# 1. chunk_local_cumsum
213215
g_cumsum = torch.empty(B, T, H, dtype=torch.float32, device=q.device)
214216
wrap_triton(_unwrap(chunk_local_cumsum_scalar_kernel))[(NT, B * H)](
215-
s=g, o=g_cumsum, scale=0, cu_seqlens=0, chunk_indices=0,
216-
T=T, B=B, H=H, BT=BT,
217-
HEAD_FIRST=False, REVERSE=False, HAS_SCALE=False, IS_VARLEN=False,
217+
s=g,
218+
o=g_cumsum,
219+
scale=0,
220+
cu_seqlens=0,
221+
chunk_indices=0,
222+
T=T,
223+
B=B,
224+
H=H,
225+
BT=BT,
226+
HEAD_FIRST=False,
227+
REVERSE=False,
228+
HAS_SCALE=False,
229+
IS_VARLEN=False,
218230
)
219231

220232
# 2. chunk_scaled_dot_kkt
221233
A = torch.empty(B, T, H, BT, device=q.device, dtype=torch.float32)
222234
wrap_triton(_unwrap(chunk_scaled_dot_kkt_fwd_kernel))[(NT, B * H)](
223-
k=k, g=g_cumsum, beta=beta, A=A,
224-
cu_seqlens=0, chunk_indices=0,
225-
T=T, H=H, K=K, BT=BT, USE_G=True, IS_VARLEN=False,
235+
k=k,
236+
g=g_cumsum,
237+
beta=beta,
238+
A=A,
239+
cu_seqlens=0,
240+
chunk_indices=0,
241+
T=T,
242+
H=H,
243+
K=K,
244+
BT=BT,
245+
USE_G=True,
246+
IS_VARLEN=False,
226247
)
227248

228249
# 3. solve_tril
229250
Ai = torch.zeros_like(A, dtype=k.dtype)
230251
wrap_triton(_unwrap(merge_16x16_to_64x64_inverse_kernel))[NT, B * H](
231-
A=A, Ai=Ai, cu_seqlens=0, chunk_indices=0,
232-
T=T, H=H, BT=BT, USE_TMA=IS_TMA_SUPPORTED, IS_VARLEN=False,
252+
A=A,
253+
Ai=Ai,
254+
cu_seqlens=0,
255+
chunk_indices=0,
256+
T=T,
257+
H=H,
258+
BT=BT,
259+
USE_TMA=IS_TMA_SUPPORTED,
260+
IS_VARLEN=False,
233261
)
234262

235263
# 4. recompute_w_u
236264
w = torch.empty_like(k)
237265
u = torch.empty_like(v)
238266
wrap_triton(_unwrap(recompute_w_u_fwd_kernel))[(NT, B * H)](
239-
k=k, v=v, beta=beta, w=w, u=u, A=Ai, g=g_cumsum,
240-
cu_seqlens=0, chunk_indices=0,
241-
T=T, H=H, K=K, V=V, BT=BT, BK=64, BV=64,
242-
USE_G=True, IS_VARLEN=False,
267+
k=k,
268+
v=v,
269+
beta=beta,
270+
w=w,
271+
u=u,
272+
A=Ai,
273+
g=g_cumsum,
274+
cu_seqlens=0,
275+
chunk_indices=0,
276+
T=T,
277+
H=H,
278+
K=K,
279+
V=V,
280+
BT=BT,
281+
BK=64,
282+
BV=64,
283+
USE_G=True,
284+
IS_VARLEN=False,
243285
)
244286

245287
# 5. chunk_gated_delta_rule_fwd_h
@@ -251,13 +293,30 @@ def grid_h(meta):
251293
return (triton.cdiv(V, meta["BV"]), B * H)
252294

253295
wrap_triton(_unwrap(chunk_gated_delta_rule_fwd_kernel_h_blockdim64))[grid_h](
254-
k=k, v=u, w=w, v_new=v_new, g=g_cumsum, gk=0,
255-
h=h, h0=initial_state, ht=final_state,
256-
cu_seqlens=0, chunk_offsets=0,
257-
T=T, H=H, K=K, V=V, BT=BT,
258-
USE_EXP2=False, TRANSPOSE_STATE=False, USE_G=True, USE_GK=False,
259-
USE_INITIAL_STATE=True, STORE_FINAL_STATE=True,
260-
SAVE_NEW_VALUE=True, IS_VARLEN=False,
296+
k=k,
297+
v=u,
298+
w=w,
299+
v_new=v_new,
300+
g=g_cumsum,
301+
gk=0,
302+
h=h,
303+
h0=initial_state,
304+
ht=final_state,
305+
cu_seqlens=0,
306+
chunk_offsets=0,
307+
T=T,
308+
H=H,
309+
K=K,
310+
V=V,
311+
BT=BT,
312+
USE_EXP2=False,
313+
TRANSPOSE_STATE=False,
314+
USE_G=True,
315+
USE_GK=False,
316+
USE_INITIAL_STATE=True,
317+
STORE_FINAL_STATE=True,
318+
SAVE_NEW_VALUE=True,
319+
IS_VARLEN=False,
261320
)
262321

263322
# 6. chunk_fwd_o
@@ -267,10 +326,25 @@ def grid_o(meta):
267326
return (triton.cdiv(V, meta["BV"]), NT, B * H)
268327

269328
wrap_triton(_unwrap(chunk_fwd_kernel_o))[grid_o](
270-
q=q, k=k, v=v_new, h=h, g=g_cumsum, g_gamma=0, o=o,
271-
cu_seqlens=0, chunk_indices=0, scale=scale,
272-
T=T, H=H, K=K, V=V, BT=BT,
273-
TRANSPOSE_STATE=False, USE_G=True, USE_G_GAMMA=False, IS_VARLEN=False,
329+
q=q,
330+
k=k,
331+
v=v_new,
332+
h=h,
333+
g=g_cumsum,
334+
g_gamma=0,
335+
o=o,
336+
cu_seqlens=0,
337+
chunk_indices=0,
338+
scale=scale,
339+
T=T,
340+
H=H,
341+
K=K,
342+
V=V,
343+
BT=BT,
344+
TRANSPOSE_STATE=False,
345+
USE_G=True,
346+
USE_G_GAMMA=False,
347+
IS_VARLEN=False,
274348
)
275349

276350
return o, final_state
@@ -299,8 +373,12 @@ def _validate_inputs(q, k, v, g, beta, initial_state):
299373
if not (q.dtype == k.dtype == v.dtype):
300374
raise ValueError("q, k, v must have the same dtype")
301375
if not (
302-
q.device == k.device == v.device
303-
== g.device == beta.device == initial_state.device
376+
q.device
377+
== k.device
378+
== v.device
379+
== g.device
380+
== beta.device
381+
== initial_state.device
304382
):
305383
raise ValueError("All tensors must be on the same device")
306384
if K > 256:
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
"""Benchmark recurrent vs chunked FLA in full model decode with torch.compile.
2+
3+
Usage:
4+
# Recurrent (current code):
5+
python bench_fla.py --prequantized ~/models/Qwen3.5-35B-A3B-HQQ-INT4-local --mode recurrent
6+
# Chunked (original FLA triton kernels):
7+
python bench_fla.py --prequantized ~/models/Qwen3.5-35B-A3B-HQQ-INT4-local --mode chunked
8+
"""
9+
import argparse
10+
import time
11+
import torch
12+
13+
14+
def patch_chunked():
    """Monkey-patch GatedDeltaNet.forward to use the chunked FLA triton kernels.

    Must be called BEFORE the model is constructed so every GatedDeltaNet
    instance picks up the replacement forward. The patch is module-global and
    is not undone (the original forward is not kept — this is a benchmark
    script, restoring it is never needed).
    """
    import executorch.examples.models.qwen3_5_moe.model as mod

    def chunked_forward(self, x, input_pos):
        """GatedDeltaNet.forward using chunked FLA triton kernels.

        Args:
            x: input hidden states; assumed (B, T, hidden) — unpacked below.
            input_pos: absolute token positions; position 0 resets the caches.

        Returns:
            Output projection of the gated-delta attention, shape (B, T, -1).
        """
        import torch.nn.functional as F

        B, T, _ = x.size()

        # Zero the persistent conv/recurrent caches when a new sequence starts
        # (input_pos[0] == 0); otherwise keep them (multiplier stays 1.0).
        reset = (input_pos[0] == 0).to(self.conv_state.dtype)
        keep = 1.0 - reset
        self.conv_state[:B].mul_(keep)
        self.recurrent_state[:B].mul_(keep)

        # Single fused in-projection, then split into qkv / gate z / beta / a.
        proj = self.in_proj(x)
        cd = self.conv_dim
        vd = self.value_dim
        nh = self.num_v_heads
        mixed_qkv = proj[..., :cd]
        z = proj[..., cd : cd + vd].reshape(B, T, self.num_v_heads, self.head_v_dim)
        b = proj[..., cd + vd : cd + vd + nh]
        a = proj[..., cd + vd + nh :]

        # Causal depthwise conv over [cached state | new tokens]; the cache is
        # updated in-place with the last conv_kernel_size columns.
        qkv_t = mixed_qkv.transpose(1, 2)
        conv_input = torch.cat([self.conv_state[:B], qkv_t], dim=-1)
        with torch.no_grad():
            self.conv_state[:B].copy_(conv_input[:, :, -self.conv_kernel_size :])
        w = self.conv1d.weight.squeeze(1).float()
        T_conv = conv_input.shape[-1] - self.conv_kernel_size + 1
        # Manual float32 sliding-window accumulation instead of F.conv1d —
        # presumably for export/compile friendliness; verify against the
        # recurrent implementation in the model module.
        acc = torch.zeros(
            B, conv_input.shape[1], T_conv,
            dtype=torch.float32, device=conv_input.device,
        )
        for k in range(self.conv_kernel_size):
            acc = acc + conv_input[:, :, k : k + T_conv].float() * w[:, k : k + 1]
        qkv_conv = F.silu(acc[:, :, -T:]).to(conv_input.dtype).transpose(1, 2)

        # Split the convolved stream back into per-head q, k, v.
        kd = self.key_dim
        q = qkv_conv[..., :kd].reshape(B, T, self.num_k_heads, self.head_k_dim)
        k = qkv_conv[..., kd : 2 * kd].reshape(B, T, self.num_k_heads, self.head_k_dim)
        v = qkv_conv[..., 2 * kd :].reshape(B, T, self.num_v_heads, self.head_v_dim)

        # L2-normalize q/k along the head dimension.
        q = F.normalize(q, p=2, dim=-1)
        k = F.normalize(k, p=2, dim=-1)

        # Expand k/q heads to match the number of value heads (GQA-style).
        if self.head_repeat > 1:
            q = q.repeat_interleave(self.head_repeat, dim=2)
            k = k.repeat_interleave(self.head_repeat, dim=2)

        # Gating terms: beta in (0, 1); g = -exp(A_log) * softplus(a + dt_bias).
        beta = b.sigmoid()
        g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)

        # Use chunked FLA triton kernels
        output, state = torch.ops.triton.chunk_gated_delta_rule(
            q, k, v, g, beta, self.recurrent_state[:B]
        )
        with torch.no_grad():
            self.recurrent_state[:B].copy_(state)

        # Gated RMS norm over flattened heads, then out-projection.
        output = output.reshape(-1, self.head_v_dim)
        z = z.reshape(-1, self.head_v_dim)
        output = self.norm(output, z)
        output = output.reshape(B, T, -1)

        return self.out_proj(output)

    mod.GatedDeltaNet.forward = chunked_forward
    print("Patched: using chunked FLA triton kernels")
85+
86+
87+
def main():
    """Load the (pre-quantized) model and time single-token decode steps.

    Prints tokens/sec and ms/step for the selected FLA mode so the two
    implementations (recurrent vs chunked) can be compared head-to-head.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--prequantized", required=True)
    ap.add_argument("--mode", choices=["recurrent", "chunked"], required=True)
    ap.add_argument("--steps", type=int, default=50)
    ap.add_argument("--warmup", type=int, default=30)
    ap.add_argument("--no-compile", action="store_true")
    args = ap.parse_args()

    # Patch BEFORE any model import if chunked
    if args.mode == "chunked":
        patch_chunked()

    import executorch.backends.cuda.triton.kernels  # register triton ops
    from executorch.examples.models.qwen3_5_moe.export import load_prequantized_model
    from executorch.examples.models.qwen3_5_moe.inference import _move_to_cuda

    print("Loading model...")
    model, config = load_prequantized_model(args.prequantized, max_seq_len=4096)
    _move_to_cuda(model, config)
    model.eval()

    if not args.no_compile:
        print("Compiling with torch.compile...")
        model = torch.compile(model, mode="default")

    def run_step(position):
        # One decode step: a single dummy token at the given absolute position.
        token = torch.tensor([[1]], dtype=torch.long, device="cuda")
        pos = torch.tensor([position], dtype=torch.long, device="cuda")
        model(token, pos)

    # Warm up (fills caches, triggers compilation) before timing anything.
    print(f"Warming up ({args.warmup} steps)...")
    with torch.no_grad():
        for step in range(args.warmup):
            run_step(step)
    torch.cuda.synchronize()

    # Timed region: positions continue where the warmup left off.
    print(f"Benchmarking ({args.steps} decode steps)...")
    torch.cuda.synchronize()
    start = time.perf_counter()
    with torch.no_grad():
        for step in range(args.steps):
            run_step(args.warmup + step)
    torch.cuda.synchronize()
    wall = time.perf_counter() - start

    tok_s = args.steps / wall
    ms_per_step = wall / args.steps * 1000
    print(f"\nResult [{args.mode}]: {tok_s:.1f} tok/s ({ms_per_step:.2f} ms/step, {args.steps} steps)")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)