Infini-AI-Lab
diff --git a/‎bench_mla_opt/diag_mask.py‎
Lines changed: 167 additions & 0 deletions b/‎bench_mla_opt/diag_mask.py‎
Lines changed: 167 additions & 0 deletions
diff --git a/‎bench_triton_mla.py‎
Lines changed: 135 additions & 0 deletions b/‎bench_triton_mla.py‎
Lines changed: 135 additions & 0 deletions
diff --git a/‎cuda_mla/.gitignore‎
Lines changed: 7 additions & 0 deletions b/‎cuda_mla/.gitignore‎
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,167 @@
+"""Diagnostic: isolate the cost of (load-mask, where-mask, loop-peel) in the
+single-pass MLA kernel, in the benchmark regime (seqlen % BLOCK_N == 0, so the
+fully-unmasked path is numerically correct). Run under CUDA_VISIBLE_DEVICES.
+"""
+import statistics
+import torch
+import triton
+import triton.language as tl
+from bench_triton_mla import make_inputs, KV_DIM, KV_LORA
+
+
+@triton.jit
+def _diag_kernel(
+    Q, K_Buffer, V_Buffer, sm_scale, Seqlens, Block_Table, O,
+    stride_qbs, stride_qh, stride_buf_kbs, stride_buf_vbs,
+    stride_obs, stride_oh, stride_bt_b,
+    q_head_num: tl.constexpr, BLOCK_SIZE: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr, BLOCK_DPE: tl.constexpr, BLOCK_DV: tl.constexpr,
+    BLOCK_N: tl.constexpr, BLOCK_H: tl.constexpr,
+    USE_LOAD_MASK: tl.constexpr, USE_WHERE: tl.constexpr, PEEL: tl.constexpr,
+    WITH_TAIL: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head_id = tl.program_id(1)
+    cur_head = cur_head_id * BLOCK_H + tl.arange(0, BLOCK_H)
+    mask_h = cur_head < q_head_num
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_dv = tl.arange(0, BLOCK_DV)
+    offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
+    seqlen = tl.load(Seqlens + cur_batch)
+
+    offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[None, :]
+    q = tl.load(Q + offs_q, mask=mask_h[:, None], other=0.0)
+    off_qpe = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_dpe[None, :]
+    qpe = tl.load(Q + off_qpe, mask=mask_h[:, None], other=0.0)
+
+    e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf")
+    e_sum = tl.zeros([BLOCK_H], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_H, BLOCK_DV], dtype=tl.float32)
+    bt_base = Block_Table + cur_batch * stride_bt_b
+
+    if PEEL:
+        loop_end = (seqlen // BLOCK_N) * BLOCK_N
+    else:
+        loop_end = seqlen
+
+    for start_n in range(0, loop_end, BLOCK_N):
+        offs_n = start_n + tl.arange(0, BLOCK_N)
+        valid = offs_n < seqlen
+        if USE_LOAD_MASK:
+            page = tl.load(bt_base + (offs_n // BLOCK_SIZE), mask=valid, other=0)
+        else:
+            page = tl.load(bt_base + (offs_n // BLOCK_SIZE))
+        kv_loc = page * BLOCK_SIZE + (offs_n % BLOCK_SIZE)
+        if USE_LOAD_MASK:
+            k = tl.load(K_Buffer + kv_loc[None, :] * stride_buf_kbs + offs_d[:, None],
+                        mask=valid[None, :], other=0.0)
+            kpe = tl.load(K_Buffer + kv_loc[None, :] * stride_buf_kbs + offs_dpe[:, None],
+                          mask=valid[None, :], other=0.0)
+            v = tl.load(V_Buffer + kv_loc[:, None] * stride_buf_vbs + offs_dv[None, :],
+                        mask=valid[:, None], other=0.0)
+        else:
+            k = tl.load(K_Buffer + kv_loc[None, :] * stride_buf_kbs + offs_d[:, None])
+            kpe = tl.load(K_Buffer + kv_loc[None, :] * stride_buf_kbs + offs_dpe[:, None])
+            v = tl.load(V_Buffer + kv_loc[:, None] * stride_buf_vbs + offs_dv[None, :])
+        qk = tl.dot(q, k.to(q.dtype))
+        qk += tl.dot(qpe, kpe.to(qpe.dtype))
+        qk *= sm_scale
+        if USE_WHERE:
+            qk = tl.where(valid[None, :], qk, float("-inf"))
+        n_e_max = tl.maximum(tl.max(qk, 1), e_max)
+        re_scale = tl.exp(e_max - n_e_max)
+        p = tl.exp(qk - n_e_max[:, None])
+        acc *= re_scale[:, None]
+        acc += tl.dot(p.to(v.dtype), v)
+        e_sum = e_sum * re_scale + tl.sum(p, 1)
+        e_max = n_e_max
+
+    if WITH_TAIL:
+        # separate duplicated masked tail block (mirrors _fwd_blocktable_mla_kernel_opt)
+        if loop_end < seqlen:
+            offs_n = loop_end + tl.arange(0, BLOCK_N)
+            valid = offs_n < seqlen
+            page = tl.load(bt_base + (offs_n // BLOCK_SIZE), mask=valid, other=0)
+            kv_loc = page * BLOCK_SIZE + (offs_n % BLOCK_SIZE)
+            k = tl.load(K_Buffer + kv_loc[None, :] * stride_buf_kbs + offs_d[:, None],
+                        mask=valid[None, :], other=0.0)
+            qk = tl.dot(q, k.to(q.dtype))
+            kpe = tl.load(K_Buffer + kv_loc[None, :] * stride_buf_kbs + offs_dpe[:, None],
+                          mask=valid[None, :], other=0.0)
+            qk += tl.dot(qpe, kpe.to(qpe.dtype))
+            qk *= sm_scale
+            qk = tl.where(valid[None, :], qk, float("-inf"))
+            v = tl.load(V_Buffer + kv_loc[:, None] * stride_buf_vbs + offs_dv[None, :],
+                        mask=valid[:, None], other=0.0)
+            n_e_max = tl.maximum(tl.max(qk, 1), e_max)
+            re_scale = tl.exp(e_max - n_e_max)
+            p = tl.exp(qk - n_e_max[:, None])
+            acc *= re_scale[:, None]
+            acc += tl.dot(p.to(v.dtype), v)
+            e_sum = e_sum * re_scale + tl.sum(p, 1)
+            e_max = n_e_max
+
+    offs_o = cur_batch * stride_obs + cur_head[:, None] * stride_oh + offs_dv[None, :]
+    tl.store(O + offs_o, acc / e_sum[:, None], mask=mask_h[:, None])
+
+
+def run(q, latent, bt, sl, sm, blk, o, BLOCK_H, lm, wh, peel, tail=False, num_warps=8):
+    bs, H, _ = q.shape
+    grid = (bs, triton.cdiv(H, BLOCK_H))
+    _diag_kernel[grid](
+        q, latent, latent, sm, sl, bt, o,
+        q.stride(0), q.stride(1), latent.stride(0), latent.stride(0),
+        o.stride(0), o.stride(1), bt.stride(0),
+        q_head_num=H, BLOCK_SIZE=blk, BLOCK_DMODEL=512, BLOCK_DPE=64,
+        BLOCK_DV=512, BLOCK_N=blk, BLOCK_H=BLOCK_H,
+        USE_LOAD_MASK=lm, USE_WHERE=wh, PEEL=peel, WITH_TAIL=tail, num_warps=num_warps,
+    )
+    return o
+
+
+def bench(fn, q, latent, bt, sl, o, blk, iters=50, warmup=20):
+    sm = 1.0 / (KV_DIM ** 0.5)
+    for _ in range(warmup):
+        fn(q, latent, bt, sl, sm, blk, o)
+    torch.cuda.synchronize()
+    s = torch.cuda.Event(enable_timing=True); e = torch.cuda.Event(enable_timing=True)
+    s.record()
+    for _ in range(iters):
+        fn(q, latent, bt, sl, sm, blk, o)
+    e.record(); torch.cuda.synchronize()
+    return s.elapsed_time(e) / iters
+
+
+def gbps(q, latent, bt, sl, o, blk, BLOCK_H, lm, wh, peel, tail=False):
+    bs, H, _ = q.shape
+    tok = int(sl[0])
+    best = []
+    for _ in range(3):
+        ms = bench(lambda *a: run(*a, BLOCK_H, lm, wh, peel, tail), q, latent, bt, sl, o, blk)
+        best.append(bs * tok * KV_DIM * 2 / (ms * 1e-3) / 1e9)
+    return statistics.median(best)
+
+
+if __name__ == "__main__":
+    dev, dt = "cuda", torch.bfloat16
+    bs = 128
+    configs = [
+        # name, load_mask, where, peel, with_tail
+        ("baseline  single-loop lm+wh ", True,  True,  False, False),
+        ("unmasked  single-loop       ", False, False, False, False),
+        ("peel, NO tail block         ", False, False, True,  False),
+        ("peel + SEPARATE tail block  ", False, False, True,  True),
+    ]
+    for H, BLOCK_H in [(16, 16), (20, 16), (20, 32)]:
+        print(f"\n=== H={H} BLOCK_H={BLOCK_H} bs={bs} (median of 3 GB/s) ===")
+        hdr = f"{'config':<30}" + "".join(f"{f'b{b}/t{t}':>11}" for b in (16, 64) for t in (1024, 4096))
+        print(hdr)
+        for name, lm, wh, peel, tail in configs:
+            row = f"{name:<30}"
+            for blk in (16, 64):
+                for tok in (1024, 4096):
+                    q, latent, bt, sl, o = make_inputs(bs, H, blk, tok, dev, dt)
+                    g = gbps(q, latent, bt, sl, o, blk, BLOCK_H, lm, wh, peel, tail)
+                    row += f"{g:>11.0f}"
+            print(row)
@@ -0,0 +1,135 @@
+"""Microbenchmark for the vortex block-table MLA decode kernel(s).
+
+Pure kernel efficiency — NO sglang / RULER. Builds synthetic decode inputs
+(q, fused latent pool, sparse block_table, seqlens), times each kernel variant
+with CUDA events, and reports achieved HBM bandwidth.
+
+Bandwidth model (decode is KV-read bound): the kernel must read, per request,
+its `selected_tokens` fused-latent rows (kv_lora_rank + qk_rope = 576 bf16).
+Pages are distinct + scattered across requests (no L2 reuse), so
+
+    KV_bytes = bs * selected_tokens * 576 * dtype_size
+    achieved_BW = KV_bytes / kernel_time
+
+is the meaningful "useful" HBM bandwidth (a perfect kernel reads each latent row
+once per request). B200 HBM3e peak ~= 8 TB/s.
+
+    export HF_HOME=/raid/catalyst/models/
+    CUDA_VISIBLE_DEVICES=<gpu> python marks/mla/bench_triton_mla.py
+"""
+import argparse
+import json
+import torch
+
+from vortex_torch.engine.sgl.attention_backend.triton_mla_kernel import (
+    KERNELS,  # {name: fn(q, latent, block_table, seqlens, sm_scale, block_size, kv_lora_rank, o)}
+)
+
+KV_DIM = 576  # width of one fused latent row: kv_lora_rank + qk_rope (see module docstring)
+KV_LORA = 512  # kv_lora_rank: passed to the kernels as `kv_lora_rank`, also the output width
+PEAK_BW_GBPS = 8000.0  # B200 HBM3e ~8 TB/s
+
+
+def make_inputs(bs, num_heads, block_size, selected_tokens, device, dtype):
+    assert selected_tokens % block_size == 0
+    n_blocks = selected_tokens // block_size
+    num_pages = bs * n_blocks                      # distinct page per (req, slot)
+    latent = torch.randn(num_pages * block_size, KV_DIM, device=device, dtype=dtype)
+    pages = torch.randperm(num_pages, device=device, dtype=torch.int32)
+    block_table = pages.view(bs, n_blocks).contiguous()
+    seqlens = torch.full((bs,), selected_tokens, device=device, dtype=torch.int32)
+    q = torch.randn(bs, num_heads, KV_DIM, device=device, dtype=dtype)
+    o = torch.empty(bs, num_heads, KV_LORA, device=device, dtype=dtype)
+    return q, latent, block_table, seqlens, o
+
+
+def torch_reference(q, latent, block_table, seqlens, sm_scale, block_size):
+    """Dense block-sparse MLA attention reference (fp32) for correctness."""
+    bs, H, _ = q.shape
+    out = torch.empty(bs, H, KV_LORA, device=q.device, dtype=torch.float32)
+    qf = q.float()
+    lf = latent.float()
+    for b in range(bs):
+        sl = int(seqlens[b])
+        nb = (sl + block_size - 1) // block_size
+        rows = []
+        for j in range(nb):
+            page = int(block_table[b, j])
+            rows.append(torch.arange(page * block_size, page * block_size + block_size,
+                                     device=q.device))
+        slots = torch.cat(rows)[:sl]
+        k = lf[slots]                       # [sl, 576]
+        scores = (qf[b] @ k.t()) * sm_scale  # [H, sl]
+        p = torch.softmax(scores, dim=-1)
+        out[b] = p @ k[:, :KV_LORA]          # [H, 512]
+    return out
+
+
+def bench_one(fn, q, latent, block_table, seqlens, o, block_size, iters=50, warmup=20):
+    sm_scale = 1.0 / (KV_DIM ** 0.5)
+    for _ in range(warmup):
+        fn(q, latent, block_table, seqlens, sm_scale, block_size, KV_LORA, o)
+    torch.cuda.synchronize()
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    start.record()
+    for _ in range(iters):
+        fn(q, latent, block_table, seqlens, sm_scale, block_size, KV_LORA, o)
+    end.record()
+    torch.cuda.synchronize()
+    return start.elapsed_time(end) / iters
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--bs", type=int, default=128)
+    ap.add_argument("--heads", type=int, default=16)
+    ap.add_argument("--dtype", default="bf16")
+    ap.add_argument("--kernels", default="all")
+    ap.add_argument("--out", default="marks/mla/bench_results.jsonl")
+    args = ap.parse_args()
+
+    device = "cuda"
+    dtype = {"bf16": torch.bfloat16, "fp16": torch.float16}[args.dtype]
+    names = list(KERNELS) if args.kernels == "all" else args.kernels.split(",")
+
+    blocks = [16, 32, 64]
+    token_budgets = [512, 1024, 2048, 4096]
+    sm_scale = 1.0 / (KV_DIM ** 0.5)
+
+    print(f"B200 MLA decode kernel bench | bs={args.bs} heads={args.heads} "
+          f"dtype={args.dtype} | peak~{PEAK_BW_GBPS/1000:.0f}TB/s")
+    print(f"{'kernel':<14}{'blk':>4}{'sel_tok':>8}{'ms':>9}{'GB/s':>9}{'%peak':>7}{'maxerr':>10}")
+
+    results = []
+    # correctness on a small case once per kernel
+    for name in names:
+        fn = KERNELS[name]
+        # --- correctness (small) ---
+        q, latent, bt, sl, o = make_inputs(4, args.heads, 32, 256, device, dtype)
+        ref = torch_reference(q, latent, bt, sl, sm_scale, 32)
+        out = fn(q, latent, bt, sl, sm_scale, 32, KV_LORA, o)
+        maxerr = (out.float() - ref).abs().max().item()
+        # --- sweep ---
+        for blk in blocks:
+            for tok in token_budgets:
+                q, latent, bt, sl, o = make_inputs(args.bs, args.heads, blk, tok, device, dtype)
+                ms = bench_one(fn, q, latent, bt, sl, o, blk)
+                kv_bytes = args.bs * tok * KV_DIM * (2 if dtype != torch.float32 else 4)
+                gbps = kv_bytes / (ms * 1e-3) / 1e9
+                pct = gbps / PEAK_BW_GBPS * 100
+                print(f"{name:<14}{blk:>4}{tok:>8}{ms:>9.3f}{gbps:>9.0f}{pct:>6.1f}%"
+                      f"{maxerr:>10.2e}")
+                results.append({"kernel": name, "bs": args.bs, "heads": args.heads,
+                                "block": blk, "sel_tok": tok, "ms": round(ms, 4),
+                                "gbps": round(gbps, 1), "pct_peak": round(pct, 1),
+                                "maxerr": maxerr})
+    with open(args.out, "w") as f:
+        for r in results:
+            f.write(json.dumps(r) + "\n")
+    print(f"[bench] wrote {len(results)} rows to {args.out}")
+
+
+if __name__ == "__main__":
+    main()
+
@@ -0,0 +1,7 @@
+# JIT build artifacts (torch cpp_extension / ninja output)
+build/
+build_*/
+spec/build_*/
+**/__pycache__/
+*.so
+*.o