Infini-AI-Lab
diff --git a/‎cuda_mla/PROGRESS.md‎
Lines changed: 203 additions & 0 deletions b/‎cuda_mla/PROGRESS.md‎
Lines changed: 203 additions & 0 deletions
diff --git a/‎cuda_mla/REPORT.md‎
Lines changed: 197 additions & 113 deletions b/‎cuda_mla/REPORT.md‎
Lines changed: 197 additions & 113 deletions
diff --git a/‎cuda_mla/spec/bs_sweep.py‎
Lines changed: 61 additions & 0 deletions b/‎cuda_mla/spec/bs_sweep.py‎
Lines changed: 61 additions & 0 deletions
diff --git a/‎cuda_mla/spec/final_h2h.py‎
Lines changed: 46 additions & 0 deletions b/‎cuda_mla/spec/final_h2h.py‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎cuda_mla/spec/full_sweep.py‎
Lines changed: 55 additions & 0 deletions b/‎cuda_mla/spec/full_sweep.py‎
Lines changed: 55 additions & 0 deletions
diff --git a/‎cuda_mla/spec/k_h16.cu‎
Lines changed: 19 additions & 0 deletions b/‎cuda_mla/spec/k_h16.cu‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎cuda_mla/spec/k_h20_bs128_blk64.cu‎
Lines changed: 11 additions & 7 deletions b/‎cuda_mla/spec/k_h20_bs128_blk64.cu‎
Lines changed: 11 additions & 7 deletions
diff --git a/‎cuda_mla/spec/mla_decoder.cu‎
Lines changed: 103 additions & 0 deletions b/‎cuda_mla/spec/mla_decoder.cu‎
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,61 @@
+"""Validate MLADecoder is general across bs and block_size (vs Triton).
+Correctness + throughput on uniform and ragged, bs in {8,32,64,128,256}, blk in {32,64}."""
+import os, statistics, random, torch
+from torch.utils.cpp_extension import load
+from vortex_torch.engine.sgl.attention_backend.triton_mla_kernel import (
+    decode_blocktable_mla_opt, decode_blocktable_mla_split)
+HERE = "cuda_mla/spec"; KV_DIM, KV_LORA, H = 576, 512, 20; sm = 1.0 / (KV_DIM ** 0.5)
+bd = HERE + "/build_decoder"; os.makedirs(bd, exist_ok=True)
+mod = load(name="vortex_mla_decoder", sources=[HERE + "/mla_decoder.cu"],
+           extra_cuda_cflags=["-O3", "-arch=sm_100a", "--use_fast_math", "-lineinfo"],
+           extra_include_paths=[HERE], build_directory=bd, verbose=False)
+
+def mk(bs, blk, sls):
+    maxtok = int(max(sls)); nb = (maxtok + blk - 1) // blk; npg = bs * nb
+    latent = torch.randn(npg * blk, KV_DIM, device='cuda', dtype=torch.bfloat16)
+    bt = torch.randperm(npg, device='cuda', dtype=torch.int32).view(bs, nb).contiguous()
+    sl = torch.tensor(sls, device='cuda', dtype=torch.int32)
+    q = torch.randn(bs, H, KV_DIM, device='cuda', dtype=torch.bfloat16)
+    return q, latent, bt, sl, nb
+
+def ref(q, latent, bt, sl, blk):
+    bs = q.size(0); out = torch.empty(bs, H, KV_LORA, device='cuda', dtype=torch.float32)
+    qf, lf = q.float(), latent.float()
+    for b in range(bs):
+        s = int(sl[b]); nb = (s + blk - 1) // blk
+        rows = [torch.arange(int(bt[b, j]) * blk, int(bt[b, j]) * blk + blk, device='cuda') for j in range(nb)]
+        slots = torch.cat(rows)[:s]; k = lf[slots]
+        out[b] = torch.softmax((qf[b] @ k.t()) * sm, -1) @ k[:, :KV_LORA]
+    return out
+
+def bench(call, q, latent, bt, sl, reps=8):
+    o = torch.empty(q.size(0), H, KV_LORA, device='cuda', dtype=torch.bfloat16); vals = []
+    for _ in range(reps):
+        for _ in range(15): call(q, latent, bt, sl, o)
+        torch.cuda.synchronize()
+        s = torch.cuda.Event(enable_timing=True); e = torch.cuda.Event(enable_timing=True); s.record()
+        for _ in range(40): call(q, latent, bt, sl, o)
+        e.record(); torch.cuda.synchronize()
+        vals.append(int(sl.sum()) * KV_DIM * 2 / ((s.elapsed_time(e) / 40) * 1e-3) / 1e9)
+    return statistics.median(vals)
+
+random.seed(0)
+for blk in (32, 64):
+    print(f"\n========== block_size={blk}  (GB/s; mine/triton_best) ==========")
+    print(f"{'bs':>5} {'pattern':>10} | {'splits':>6} {'mine':>6} {'tri_sp':>6} {'tri_ks':>6} | ratio  err")
+    for bs in (8, 32, 64, 128, 256):
+        for pat in ("uniform", "ragged"):
+            if pat == "uniform": sls = [2048] * bs
+            else: sls = [random.choice([256, 512, 1024, 2048, 4096]) for _ in range(bs)]
+            nb = (max(sls) + blk - 1) // blk
+            q, latent, bt, sl, _ = mk(bs, blk, sls)
+            dec = mod.MLADecoder(bs, H, blk, nb)
+            o = torch.empty(bs, H, KV_LORA, device='cuda', dtype=torch.bfloat16)
+            dec.plan(sl); dec.run(q, latent, bt, o, sm); torch.cuda.synchronize()
+            err = (o.float() - ref(q, latent, bt, sl, blk)).abs().max().item()
+            me = bench(lambda q,l,bt,sl,o: (dec.plan(sl), dec.run(q,l,bt,o,sm)), q, latent, bt, sl)
+            tsp = bench(lambda q,l,bt,sl,o: decode_blocktable_mla_opt(q,l,bt,sl,sm,blk,KV_LORA,o), q, latent, bt, sl)
+            tks = bench(lambda q,l,bt,sl,o: decode_blocktable_mla_split(q,l,bt,sl,sm,blk,KV_LORA,o), q, latent, bt, sl)
+            tb = max(tsp, tks)
+            tag = "OK" if err < 3e-2 else "FAIL<<"
+            print(f"{bs:>5} {pat:>10} | {dec.target_ctas:>6} {me:>6.0f} {tsp:>6.0f} {tks:>6.0f} | {me/tb:.2f}x  {err:.1e} {tag}")
@@ -0,0 +1,46 @@
+"""FINAL apples-to-apples on one empty GPU: the delivered kernel (k_h20_bs128_blk64
+run() = bf16-O + MINB3 + sp3 + vectorized Q-load) vs Triton best."""
+import os, statistics, torch
+from torch.utils.cpp_extension import load
+from vortex_torch.engine.sgl.attention_backend.triton_mla_kernel import (
+    decode_blocktable_mla_opt, decode_blocktable_mla_split)
+HERE = "cuda_mla/spec"; KV_DIM, KV_LORA, H = 576, 512, 20; sm = 1.0 / (KV_DIM ** 0.5)
+bd = HERE + "/build_k_h20_bs128_blk64"; os.makedirs(bd, exist_ok=True)
+mod = load(name="vortex_k_h20_bs128_blk64", sources=[HERE + "/k_h20_bs128_blk64.cu"],
+           extra_cuda_cflags=["-O3", "-arch=sm_100a", "--use_fast_math", "-lineinfo"],
+           extra_include_paths=[HERE], build_directory=bd, verbose=False)
+
+def mk(bs, blk, tok, ragged=False):
+    nb = (tok + blk - 1) // blk; npg = bs * nb
+    latent = torch.randn(npg * blk, KV_DIM, device='cuda', dtype=torch.bfloat16)
+    bt = torch.randperm(npg, device='cuda', dtype=torch.int32).view(bs, nb).contiguous()
+    sl = (torch.randint(tok // 2, tok + 1, (bs,), device='cuda', dtype=torch.int32) if ragged
+          else torch.full((bs,), tok, device='cuda', dtype=torch.int32))
+    q = torch.randn(bs, H, KV_DIM, device='cuda', dtype=torch.bfloat16)
+    return q, latent, bt, sl
+
+def bench(call, bs, blk, tok, reps=12, ragged=False):
+    vals = []
+    for _ in range(reps):
+        q, latent, bt, sl = mk(bs, blk, tok, ragged)
+        o = torch.empty(bs, H, KV_LORA, device='cuda', dtype=torch.bfloat16)
+        f = lambda: call(q, latent, bt, sl, o)
+        for _ in range(20): f()
+        torch.cuda.synchronize()
+        s = torch.cuda.Event(enable_timing=True); e = torch.cuda.Event(enable_timing=True); s.record()
+        for _ in range(50): f()
+        e.record(); torch.cuda.synchronize()
+        vals.append(bs * tok * KV_DIM * 2 / ((s.elapsed_time(e) / 50) * 1e-3) / 1e9)
+    return statistics.median(vals)
+
+mine = lambda q,l,bt,sl,o: mod.run(q,l,bt,sl,o,sm)
+trsp = lambda q,l,bt,sl,o: decode_blocktable_mla_opt(q,l,bt,sl,sm,64,KV_LORA,o)
+trks = lambda q,l,bt,sl,o: decode_blocktable_mla_split(q,l,bt,sl,sm,64,KV_LORA,o)
+
+print("=== FINAL h20 bs=128 blk=64 (one empty GPU, GB/s) ===")
+print(f"{'sel':>8} {'mine':>7} {'tri_sp':>7} {'tri_ks':>7}  mine/best")
+for tok in (1024, 2048, 3072, 4096):
+    m = bench(mine, 128, 64, tok); a = bench(trsp, 128, 64, tok); k = bench(trks, 128, 64, tok)
+    print(f"{tok:>8} {m:>7.0f} {a:>7.0f} {k:>7.0f}  {m/max(a,k):.3f}")
+m = bench(mine, 128, 64, 2048, ragged=True); a = bench(trsp, 128, 64, 2048, ragged=True)
+print(f"{'ragged':>8} {m:>7.0f} {a:>7.0f} {'':>7}  {m/a:.3f}")
@@ -0,0 +1,55 @@
+"""Full bs x blk table: MLADecoder (plan+run, bs-general work-queue) vs Triton best.
+bs in {1..8} U {8*i, 1<=i<=16}; blk in {16,32,64}; sel=2048 uniform. Writes full_sweep.json + prints a grid."""
+import os, statistics, json, torch
+from torch.utils.cpp_extension import load
+from vortex_torch.engine.sgl.attention_backend.triton_mla_kernel import (
+    decode_blocktable_mla_opt, decode_blocktable_mla_split)
+HERE = "cuda_mla/spec"; KV_DIM, KV_LORA, H = 576, 512, 20; sm = 1.0 / (KV_DIM ** 0.5)
+bd = HERE + "/build_decoder"; os.makedirs(bd, exist_ok=True)
+mod = load(name="vortex_mla_decoder", sources=[HERE + "/mla_decoder.cu"],
+           extra_cuda_cflags=["-O3", "-arch=sm_100a", "--use_fast_math", "-lineinfo"],
+           extra_include_paths=[HERE], build_directory=bd, verbose=False)
+SEL = 2048
+
+def mk(bs, blk, tok):
+    nb = (tok + blk - 1) // blk; npg = bs * nb
+    latent = torch.randn(npg * blk, KV_DIM, device='cuda', dtype=torch.bfloat16)
+    bt = torch.randperm(npg, device='cuda', dtype=torch.int32).view(bs, nb).contiguous()
+    sl = torch.full((bs,), tok, device='cuda', dtype=torch.int32)
+    q = torch.randn(bs, H, KV_DIM, device='cuda', dtype=torch.bfloat16)
+    return q, latent, bt, sl, nb
+
+def bench(call, q, latent, bt, sl, reps=6):
+    o = torch.empty(q.size(0), H, KV_LORA, device='cuda', dtype=torch.bfloat16); vals = []
+    for _ in range(reps):
+        for _ in range(12): call(q, latent, bt, sl, o)
+        torch.cuda.synchronize()
+        s = torch.cuda.Event(enable_timing=True); e = torch.cuda.Event(enable_timing=True); s.record()
+        for _ in range(40): call(q, latent, bt, sl, o)
+        e.record(); torch.cuda.synchronize()
+        vals.append(int(sl.sum()) * KV_DIM * 2 / ((s.elapsed_time(e) / 40) * 1e-3) / 1e9)
+    return statistics.median(vals)
+
+bss = sorted(set(list(range(1, 9)) + [8 * i for i in range(1, 17)]))
+rows = []
+for blk in (16, 32, 64):
+    for bs in bss:
+        nb = (SEL + blk - 1) // blk
+        q, latent, bt, sl, _ = mk(bs, blk, SEL)
+        dec = mod.MLADecoder(bs, H, blk, nb)
+        me = bench(lambda q,l,bt,sl,o: (dec.plan(sl), dec.run(q,l,bt,o,sm)), q, latent, bt, sl)
+        tsp = bench(lambda q,l,bt,sl,o: decode_blocktable_mla_opt(q,l,bt,sl,sm,blk,KV_LORA,o), q, latent, bt, sl)
+        tks = bench(lambda q,l,bt,sl,o: decode_blocktable_mla_split(q,l,bt,sl,sm,blk,KV_LORA,o), q, latent, bt, sl)
+        tb = max(tsp, tks); best = 'sp' if tsp >= tks else 'ks'
+        rows.append(dict(blk=blk, bs=bs, mine=me, tri_sp=tsp, tri_ks=tks, ratio=me/tb, tbest=best))
+        print(f"blk={blk:2d} bs={bs:3d}: mine={me:5.0f}  tri_sp={tsp:5.0f}  tri_ks={tks:5.0f}  ratio={me/tb:.2f}x")
+
+json.dump(rows, open(HERE + "/full_sweep.json", "w"), indent=0)
+# pretty grid per blk
+print("\n\n##### TABLE (GB/s mine | ratio vs Triton-best), sel=2048 uniform #####")
+for blk in (16, 32, 64):
+    print(f"\n### block_size = {blk} ###")
+    print(f"{'bs':>4} | {'mine':>6} {'tri_sp':>6} {'tri_ks':>6} | {'ratio':>6}")
+    for r in rows:
+        if r['blk'] == blk:
+            print(f"{r['bs']:>4} | {r['mine']:>6.0f} {r['tri_sp']:>6.0f} {r['tri_ks']:>6.0f} | {r['ratio']:>5.2f}x")
@@ -0,0 +1,19 @@
+// H=16 standalone flagship (MTILES=1, no head padding). vs H=20: M=16 halves the Q
+// smem (~18.7KB) and Oreg (32 regs) => ~55KB smem => 4 CTAs/SM (vs 3) => 44% DRAM.
+// Optimum: NWARPS=4, STAGES=2, MINB=5, splits=4 (~3000-3023 GB/s @ bs128, 1.1-1.9x Triton).
+// (MINB=5: smem caps occupancy at 4 CTAs, but the lower reg target schedules ~1% better
+// than MINB=4; the tiny M=16 Oreg keeps it spill-free.) BLK template must match block_size.
+#include "mla_ldm.cuh"
+// Tuning variants: V(name, BLK, MB) defines name(q, l, bt, sl, o, s, sp), which
+// forwards to ldm::launch<BLK,16,4,2,1,MB> with the caller-supplied sp.
+#define V(nm, BLK, MB)                                                          \
+  void nm(torch::Tensor q, torch::Tensor l, torch::Tensor bt, torch::Tensor sl, \
+          torch::Tensor o, double s, int sp) {                                  \
+    ldm::launch<BLK, 16, 4, 2, 1, MB>(q, l, bt, sl, o, s, sp);                  \
+  }
+V(b64_b4, 64, 4)
+V(b64_b5, 64, 5)
+V(b32_b4, 32, 4)
+V(b32_b5, 32, 5)
+V(b16_b4, 16, 4)
+V(b16_b5, 16, 5)
+// defaults: the measured optimum per block size (MINB=5, splits=4).
+void run64(torch::Tensor q, torch::Tensor l, torch::Tensor bt, torch::Tensor sl,
+           torch::Tensor o, double s) {
+  ldm::launch<64, 16, 4, 2, 1, 5>(q, l, bt, sl, o, s, 4);
+}
+void run32(torch::Tensor q, torch::Tensor l, torch::Tensor bt, torch::Tensor sl,
+           torch::Tensor o, double s) {
+  ldm::launch<32, 16, 4, 2, 1, 5>(q, l, bt, sl, o, s, 4);
+}
+void run16(torch::Tensor q, torch::Tensor l, torch::Tensor bt, torch::Tensor sl,
+           torch::Tensor o, double s) {
+  ldm::launch<16, 16, 4, 2, 1, 5>(q, l, bt, sl, o, s, 4);
+}
+// Python bindings: the three per-block-size defaults, then the tuning variants.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("run64", &run64);
+  m.def("run32", &run32);
+  m.def("run16", &run16);
+  m.def("b64_b4", &b64_b4);
+  m.def("b64_b5", &b64_b5);
+  m.def("b32_b4", &b32_b4);
+  m.def("b32_b5", &b32_b5);
+  m.def("b16_b4", &b16_b4);
+  m.def("b16_b5", &b16_b5);
+}
@@ -1,12 +1,16 @@
-// H=20, bs=128, block_size=64. ldmatrix + reg-O + reg-softmax + cp.async + split-KV.
+// H=20, bs=128, block_size=64. ldmatrix + bf16-packed reg-O + reg-softmax +
+// cp.async + split-KV. WINNING CONFIG (ncu-tuned): NWARPS=4, STAGES=2, MINB=3
+// (3 CTAs/SM => 18.75% occ, the occupancy wall the bandwidth-bound decode hit),
+// splits=3 (populates the 3-block capacity). 2059 GB/s @ sel=2048 vs Triton 1971.
 #include "mla_ldm.cuh"
-#define V(nm, NW, ST) \
+#define V(nm, NW, ST, MB) \
   void nm(torch::Tensor q, torch::Tensor l, torch::Tensor bt, torch::Tensor sl, \
-          torch::Tensor o, double s, int sp) { ldm::launch<64,16,NW,ST>(q,l,bt,sl,o,s,sp); }
-V(r_w4_s1,4,1) V(r_w4_s2,4,2) V(r_w8_s1,8,1) V(r_w8_s2,8,2)
+          torch::Tensor o, double s, int sp) { ldm::launch<64,16,NW,ST,1,MB>(q,l,bt,sl,o,s,sp); }
+V(r_w4_s2_b2,4,2,2) V(r_w4_s2_b3,4,2,3) V(r_w8_s2_b2,8,2,2) V(r_w8_s2_b3,8,2,3)
+// default: the measured optimum for h20/bs128/blk64 (MINB=3, splits=3).
 void run(torch::Tensor q, torch::Tensor l, torch::Tensor bt, torch::Tensor sl,
-         torch::Tensor o, double s) { ldm::launch<64,16,4,2>(q,l,bt,sl,o,s,2); }
+         torch::Tensor o, double s) { ldm::launch<64,16,4,2,1,3>(q,l,bt,sl,o,s,3); }
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("run",&run); m.def("r_w4_s1",&r_w4_s1); m.def("r_w4_s2",&r_w4_s2);
-  m.def("r_w8_s1",&r_w8_s1); m.def("r_w8_s2",&r_w8_s2);
+  m.def("run",&run); m.def("r_w4_s2_b2",&r_w4_s2_b2); m.def("r_w4_s2_b3",&r_w4_s2_b3);
+  m.def("r_w8_s2_b2",&r_w8_s2_b2); m.def("r_w8_s2_b3",&r_w8_s2_b3);
 }
@@ -0,0 +1,103 @@
+// MLADecoder: flashinfer-style init/plan/run for ragged-batch MLA decode,
+// general across batch size and block size.
+//
+//   __init__(bs, H, block_size, max_blocks, ...)  -- ALLOCATE once + fix geometry.
+//        A bs-aware policy sets the schedule knobs: target #active CTAs (one
+//        MINB=3 wave ~ 3*SM, so LOW bs auto-gets many splits/request and HIGH bs
+//        gets few), a chunk_min floor (avoid tiny-chunk overhead), a per-request
+//        split cap, and the MINB to launch run() with.
+//   plan(seqlens)  -- POPULATE the load-balanced work queue from live seqlens.
+//   run(q, latent, block_table, o, sm_scale)  -- EXECUTE; dispatches the decode
+//        kernel by (block_size, MINB). No seqlens => one plan() feeds all layers.
+//
+// Both plan() and run() are fixed-grid launches on the current stream with the
+// pre-allocated buffers => both CUDA-graph-capturable.
+#include "mla_ldm.cuh"
+#include <torch/extension.h>
+#include <algorithm>
+
+// Holds the fixed geometry, schedule knobs and pre-allocated buffers for one
+// (bs, H, block_size) decode configuration; see plan() and run() below.
+struct MLADecoder {
+  // Geometry as passed to the constructor. max_blocks is stored but not read here.
+  int bs = 0, H = 0, block_size = 0, max_blocks = 0;
+  // target: CTAs that fill one wave (ctas * sm_count). target_ctas: length of the
+  // work queue and of the mid_* buffers. max_split_cap / chunk_min: limits handed
+  // to the scheduler. minb: selects the MINB template argument in run().
+  int target = 0, target_ctas = 0, max_split_cap = 0, chunk_min = 0, minb = 3;
+  // M is H rounded up to a multiple of 16 (MTILES tiles of 16 rows).
+  int MTILES = 0, M = 0, sm_count = 0;
+  // Work-queue arrays filled by plan() and read by run(); mid_* are fp32 scratch
+  // buffers with one slot per queue entry (presumably split-KV partials -- the
+  // kernels that use them live in mla_ldm.cuh).
+  torch::Tensor work_batch, work_kv_start, work_kv_end, work_offset, mid_o, mid_m, mid_l;
+
+  // ---- init: bs-aware schedule policy + allocation. Negative knob args => auto. ----
+  // Throws (TORCH_CHECK) if bs is not positive or the CUDA device query fails.
+  MLADecoder(int bs_, int H_, int block_size_, int max_blocks_,
+             int max_split_cap_ = -1, int chunk_min_ = -1, int minb_ = -1) {
+    // bs is a divisor in the auto split-cap formula below; reject bad values
+    // before they become an integer divide-by-zero.
+    TORCH_CHECK(bs_ > 0, "MLADecoder: bs must be positive, got ", bs_);
+    bs = bs_; H = H_; block_size = block_size_; max_blocks = max_blocks_;
+    MTILES = (H + 15) / 16; M = MTILES * 16;
+    // Check the device queries: on failure prop would be left uninitialised and
+    // every derived knob (sm_count, ctas, target) would be garbage.
+    int dev = 0;
+    cudaError_t cerr = cudaGetDevice(&dev);
+    TORCH_CHECK(cerr == cudaSuccess, "MLADecoder: cudaGetDevice failed: ",
+                cudaGetErrorString(cerr));
+    cudaDeviceProp prop;
+    cerr = cudaGetDeviceProperties(&prop, dev);
+    TORCH_CHECK(cerr == cudaSuccess, "MLADecoder: cudaGetDeviceProperties failed: ",
+                cudaGetErrorString(cerr));
+    sm_count = prop.multiProcessorCount;
+
+    // Achievable CTAs/SM is set by smem (the run_wq footprint, STAGES=2/NT=16), which
+    // scales with M = MTILES*16: H<=16 (M=16) => ~55KB => 4 CTAs/SM; H<=32 (M=32) =>
+    // ~74KB => 3. We fill exactly one such wave: target active CTAs = ctas*SM, and
+    // MINB=ctas (launch_bounds forces that occupancy; the small-M Oreg keeps it spill-
+    // free). So H=16 auto-uses 4 CTAs/splits~4, H=20 uses 3 CTAs/splits~3.
+    int smem_cta = 2 * 16 * ldm::HDP * 2 + M * ldm::HDP * 2 + M * 16 * 2 + 3 * M * 4;
+    int smem_sm = (int)prop.sharedMemPerMultiprocessor;
+    int ctas = std::max(1, std::min(6, smem_sm / smem_cta));
+    minb = (minb_ > 0) ? minb_ : ctas;                        // CTAs/SM occupancy target
+    target = ctas * sm_count;                                 // active CTAs to fill one wave
+    chunk_min = (chunk_min_ > 0) ? chunk_min_ : 128;          // don't split below 128 tokens
+    // per-request cap: enough for low bs to fill a wave, bounded so one request can't
+    // starve others on skew. ~ceil(target/bs) headroom, clamped to [ctas, target].
+    int auto_cap = std::min(target, std::max(ctas, 2 * ((target + bs - 1) / bs)));
+    max_split_cap = (max_split_cap_ > 0) ? max_split_cap_ : auto_cap;
+    // queue length: safe upper bound on sum(nsplits) (rounding + the >=1 clamp).
+    target_ctas = std::max(target, bs) + bs;
+
+    auto i32 = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
+    auto f32 = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
+    work_batch = torch::empty({target_ctas}, i32);
+    work_kv_start = torch::empty({target_ctas}, i32);
+    work_kv_end = torch::empty({target_ctas}, i32);
+    work_offset = torch::empty({bs + 1}, i32);
+    mid_o = torch::empty({target_ctas, M, ldm::CKV}, f32);
+    mid_m = torch::empty({target_ctas, M}, f32);
+    mid_l = torch::empty({target_ctas, M}, f32);
+  }
+
+  // ---- plan: populate the work queue from current seqlens. ----
+  // seqlens must have exactly bs entries (the batch size fixed at construction).
+  void plan(torch::Tensor seqlens) {
+    TORCH_CHECK(seqlens.size(0) == bs, "plan: seqlens batch ", seqlens.size(0), " != init bs ", bs);
+    ldm::run_schedule_wq(seqlens, work_batch, work_kv_start, work_kv_end, work_offset,
+                         target, max_split_cap, chunk_min);
+  }
+
+  // ---- run: dispatch the decode kernel by (block_size, MINB). ----
+  // Reads the work queue written by the last plan(); throws for block sizes other
+  // than 16/32/64.
+  void run(torch::Tensor q, torch::Tensor latent, torch::Tensor block_table,
+           torch::Tensor o, double sm_scale) {
+#define RUN(BLK, MB) ldm::run_wq<BLK, 16, 4, 2, 1, MB>(q, latent, block_table, o, work_batch, \
+        work_kv_start, work_kv_end, work_offset, mid_o, mid_m, mid_l, sm_scale)
+// MINB is the occupancy target chosen in init from M (3 for H<=32, 4 for H<=16).
+// Only MINB 2..5 are instantiated: minb <= 2 uses 2 and anything above 4 uses 5.
+#define DISPATCH_MB(BLK) do { \
+    if (minb <= 2) { RUN(BLK, 2); } else if (minb == 3) { RUN(BLK, 3); } \
+    else if (minb == 4) { RUN(BLK, 4); } else { RUN(BLK, 5); } } while (0)
+    if (block_size == 64) { DISPATCH_MB(64); }
+    else if (block_size == 32) { DISPATCH_MB(32); }
+    else if (block_size == 16) { DISPATCH_MB(16); }
+    else TORCH_CHECK(false, "MLADecoder: unsupported block_size ", block_size, " (need 16/32/64)");
+#undef DISPATCH_MB
+#undef RUN
+  }
+};
+
+// Python bindings: exposes MLADecoder with its constructor, plan(), run() and
+// read-only views of the schedule knobs chosen at construction.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  namespace py = pybind11;
+  py::class_<MLADecoder> cls(m, "MLADecoder");
+
+  // The three trailing knobs default to -1, which the constructor treats as "auto".
+  cls.def(py::init<int, int, int, int, int, int, int>(),
+          py::arg("bs"), py::arg("H"), py::arg("block_size"), py::arg("max_blocks"),
+          py::arg("max_split_cap") = -1, py::arg("chunk_min") = -1,
+          py::arg("minb") = -1);
+
+  cls.def("plan", &MLADecoder::plan, py::arg("seqlens"));
+  cls.def("run", &MLADecoder::run,
+          py::arg("q"), py::arg("latent"), py::arg("block_table"),
+          py::arg("o"), py::arg("sm_scale"));
+
+  cls.def_readonly("target", &MLADecoder::target);
+  cls.def_readonly("target_ctas", &MLADecoder::target_ctas);
+  cls.def_readonly("max_split_cap", &MLADecoder::max_split_cap);
+  cls.def_readonly("chunk_min", &MLADecoder::chunk_min);
+  cls.def_readonly("minb", &MLADecoder::minb);
+}