
Commit 8abec4b

liyjiang and cursoragent committed
[MLA ASM kernel] add v4 MLA decode for mi350 (recompile mi300)
The v4 new-model kernel uses an 18-slot kernarg ABI that's incompatible with v3's 14-slot layout: - slot 8 raw `gqa_ratio` (not `s_MQA = gqa_ratio * max_seqlen_q`) - slot 10 `total_kv = kv_seq_lens * num_seqs` - 4 new tail slots: ptr_STP, out_16_nosplit, ptr_QROPE, ptr_KVROPE - scalar = 1/sqrt(kV4DimNope + kV4DimRope) = 1/sqrt(512), independent of head_size - kernel hardcodes wave64 (gfx950) block dim = wv_tg * 64 = 256 Integration follows the canonical aiter JIT module pattern (peer of asm_mla.cu / module_mla_asm, not the compile_template_op jinja path): - csrc/py_itfs_cu/asm_mla_v4.cu: C-ABI dispatcher using aiter_tensor_t*, wrapped in AITER_CTYPES_DEFINE_ENTRYPOINT_VOID so AITER_CHECK failures surface as clean Python RuntimeError via the aiter_get_last_error TLS bridge (no std::abort()). - hsa/gfx950/mla_v4/mla_v4_asm.csv: CSV registry consumed by hsa/codegen.py -m mla_v4 -> asm_mla_v4_configs.hpp / cfg_mla_v4_asm. Schema (qType, kvType, gqa, sub_Q, page_size, num_kv_splits, prefill, causal, knl_name, co_name) is independent of mla_asm.csv so v3 / v4 dispatchers don't collide. - hsa/gfx950/mla_v4/mla_a8w8_qh64_qseqlen4_gqaratio16_nm_recmp.co: compiled shader. Built from poc_kl/mi350/mla_asm/shaders/mla_a8w8_qh64 _1tg_16mx4_16nx1_nm_recompile.s with the literal `mla_kernel_func` symbol sed-replaced to `_ZN5aiter42mla_a8w8_qh64_qseqlen4_gqaratio16 _nm_recmpE` then reassembled with `clang -x assembler --offload-arch=gfx950`. - aiter/jit/optCompilerConfig.json: module_mla_v4_asm entry with blob_gen_cmd = codegen.py -m mla_v4. - aiter/ops/attention.py: mla_decode_v4_asm Python signature (@compile_ops ffi_type="ctypes"). - aiter/mla.py: thin wrapper `mla_decode_fwd_v4_nm` routes through aiter.mla_decode_v4_asm and allocates the canonical 5D [num_seqs, num_kv_splits, num_kv_heads, gqa*max_seqlen_q, v_head_dim] split-logits / attn_lse internally. 
- aiter/utility/dtypes.py: _torch_to_aiter_dtype now accepts both torch.float8_e4m3fn (OCP) and torch.float8_e4m3fnuz (NUZ); both map to AITER_DTYPE_fp8, so callers on gfx942 / gfx950 can use either variant transparently.

Tests (op_tests/test_mla_v4_nm.py):
- test_v4_nm_smoke_default_shape: dispatcher -> kernel launch -> SENTINEL coverage check.
- test_v4_nm_no_half_zero_pattern: wave64-launch regression guard (the historic 256-NaN + 256-zero output pattern on gfx950).
- test_v4_nm_determinism: two back-to-back launches are bit-identical.
- test_v4_nm_out_16_nosplit_arg_accepted: the out_16_nosplit=1 path doesn't crash the dispatcher.
- test_v4_nm_unknown_variant_raises: sub_Q=128 (not in cfg_mla_v4_asm) triggers a clean RuntimeError via the C-ABI error bridge; no silently-loaded wrong .co.
- test_v4_nm_kernarg_scalar_slots: env-gated kernarg hexdump captured via capfd; scalar slots 7-12 + 15 byte-exact match poc_kl's MlaV4HipKernelArgs.

Numerical correctness (op_tests/test_mla_v4_nm_golden.py):
- Byte-exact compare against poc_kl gpu_SPLIT_DATA.hex. Inputs are loaded via raw hipMemcpy H2D from device-side `.bin` dumps produced by an env-gated patch on the poc_kl side (POC_KL_V4_DUMP_DEVBINS=1 in mla_execute_v4_hip.inl). This bypasses the brittle path of reconstructing inputs from hex dumps and sidesteps the v4 nm kernel's degenerate code path on uniform e8m0 scale bytes (which would produce 100% NaN for synthetic all-1.0 inputs).
- CI-safe: libamdhip64.so is lazy-loaded inside the test body, and the test SKIPs when POC_KL_DUMP_DIR / devbins / gpu_SPLIT_DATA.hex are missing or when the GPU is not gfx950.

Variant currently shipped: gqa_ratio=16, sub_Q=64, page_size=1, q_dtype=fp8, kv_dtype=fp8

Validated on gfx950 (MI355X):
- 6 baseline pytest cases pass.
- Byte-exact golden test passes (0/65536 bytes differ vs poc_kl).

Co-authored-by: Cursor <cursoragent@cursor.com>
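The scalar and launch-geometry claims above can be sanity-checked in isolation. A minimal sketch, assuming only what the message states: the kernel-side symbols sum to kV4DimNope + kV4DimRope = 512, and wv_tg = 4 is inferred from the stated block dim of 256 (both constant names here are illustrative):

```python
import math

# Assumed constants: the commit message only states the sum = 512
DIM_NOPE_PLUS_ROPE = 512
WAVE_SIZE = 64   # wave64 on gfx950
WV_TG = 4        # waves per workgroup, inferred from block dim = 256

# The hardcoded softmax scalar is independent of head_size
softmax_scale = 1.0 / math.sqrt(DIM_NOPE_PLUS_ROPE)

# Block dimension for the kernel launch
block_dim = WV_TG * WAVE_SIZE

print(softmax_scale, block_dim)
```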
1 parent 45c428e commit 8abec4b

11 files changed

Lines changed: 1135 additions & 1 deletion

aiter/jit/optCompilerConfig.json

Lines changed: 11 additions & 0 deletions
@@ -138,6 +138,17 @@
         "verbose": "False",
         "blob_gen_cmd": "f'{AITER_META_DIR}/hsa/codegen.py -m mla --output_dir {{}}'"
     },
+    "module_mla_v4_asm": {
+        "srcs": [
+            "f'{AITER_CSRC_DIR}/py_itfs_cu/asm_mla_v4.cu'"
+        ],
+        "flags_extra_cc": [],
+        "flags_extra_hip": [],
+        "extra_ldflags": "None",
+        "extra_include": [],
+        "verbose": "False",
+        "blob_gen_cmd": "f'{AITER_META_DIR}/hsa/codegen.py -m mla_v4 --output_dir {{}}'"
+    },
     "module_cache": {
         "srcs": [
             "f'{AITER_CSRC_DIR}/pybind/cache_pybind.cu'",
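For orientation, a sketch of how such an entry expands: the `blob_gen_cmd` value is an f-string evaluated by the JIT loader, whose escaped `{{}}` survives as a `{}` placeholder later filled with the blob output directory. The two-stage substitution below is illustrative only; the real loader logic lives in aiter's JIT machinery, and `/opt/aiter` / `/tmp/blobs` are made-up paths:

```python
import json

cfg_text = """
{
  "module_mla_v4_asm": {
    "srcs": ["f'{AITER_CSRC_DIR}/py_itfs_cu/asm_mla_v4.cu'"],
    "blob_gen_cmd": "f'{AITER_META_DIR}/hsa/codegen.py -m mla_v4 --output_dir {{}}'"
  }
}
"""
entry = json.loads(cfg_text)["module_mla_v4_asm"]

# Stage 1: mimic f-string evaluation -- substitute the meta dir and
# collapse the escaped {{}} into a {} placeholder.
raw = entry["blob_gen_cmd"][2:-1]  # strip the f'...' wrapper
stage1 = raw.replace("{AITER_META_DIR}", "/opt/aiter").replace("{{}}", "{}")

# Stage 2: fill the placeholder with the blob output directory.
cmd = stage1.format("/tmp/blobs")
print(cmd)
```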

aiter/mla.py

Lines changed: 71 additions & 0 deletions
@@ -973,3 +973,74 @@ def mla_prefill_reduce(
         )  # [tile_q, v_head_dim]
 
         output[qo_start:qo_end, head_idx, :] = final_output[:q_len, :]
+
+
+# ---------------------------------------------------------------------------
+# DSv4 MLA — additive entry point. Does NOT touch any existing
+# gqa_ratio=16, sub_Q=64, page_size=1, q_dtype=fp8, kv_dtype=fp8
+# ---------------------------------------------------------------------------
+def mla_decode_fwd_v4_nm(
+    q,                  # [total_query_len, num_heads, head_size] FP8 packed Q+e8m0
+    qrope,              # [total_query_len, num_heads, kv_rotary] BF16
+    kv_buffer,          # [num_page, page_size, num_kv_heads, dim_qk_packed]
+    kvrope,             # [num_page, page_size, num_kv_heads, kv_rotary]
+    output,             # [total_query_len, num_heads, v_head_dim] BF16 (used for out_16_nosplit=1)
+    qo_indptr,          # [num_seqs+1]
+    kv_indptr,          # [num_seqs+1]
+    kv_page_indices,    # [num_page_used]
+    kv_last_page_lens,  # [num_seqs]
+    split_indptr,       # [num_seqs+1]
+    max_seqlen_q,
+    sm_scale=None,      # ignored on v4 nm; kernel hardcodes 1/sqrt(512)
+    out_16_nosplit=0,
+    num_kv_splits=1,
+    sub_Q=64,
+    logits=None,
+    attn_lse=None,
+):
+    """v4 nm-recompile MLA decode forward (mi350 / gfx950 wave64).
+
+    Routes through the canonical aiter JIT C-ABI module
+    `module_mla_v4_asm` (csrc/py_itfs_cu/asm_mla_v4.cu). Returns
+    (logits, attn_lse) — caller is responsible for any post-reduce /
+    final-O work.
+
+    logits/attn_lse may be pre-allocated (e.g. for SENTINEL pre-fill in
+    correctness tests); if not, we allocate the canonical 5D layout
+    `[num_seqs, num_kv_splits, num_kv_heads, gqa*max_seqlen_q, v_head_dim]`
+    (and `[..., 1]` for attn_lse) inferred from the input shapes.
+    """
+    num_seqs = qo_indptr.shape[0] - 1
+    num_heads = q.size(1)
+    v_head_dim = output.size(2)
+    num_kv_heads = kv_buffer.size(2)
+    gqa_ratio = num_heads // num_kv_heads
+    q_seq_lens_internal = gqa_ratio * max_seqlen_q
+
+    if logits is None:
+        logits = torch.empty(
+            (num_seqs, num_kv_splits, num_kv_heads, q_seq_lens_internal, v_head_dim),
+            dtype=dtypes.fp32, device=q.device,
+        )
+    if attn_lse is None:
+        attn_lse = torch.empty(
+            (num_seqs, num_kv_splits, num_kv_heads, q_seq_lens_internal, 1),
+            dtype=dtypes.fp32, device=q.device,
+        )
+
+    # softmax_scale is ignored by the v4 nm kernel (hardcodes 1/sqrt(512));
+    # we still pass *something* through to satisfy the C ABI.
+    sm_scale_arg = 0.0 if sm_scale is None else float(sm_scale)
+
+    aiter.mla_decode_v4_asm(
+        q, qrope, kv_buffer, kvrope,
+        qo_indptr, kv_indptr, kv_page_indices, kv_last_page_lens,
+        split_indptr,
+        max_seqlen_q,
+        sm_scale_arg,
+        int(out_16_nosplit),
+        int(sub_Q),
+        int(num_kv_splits),
+        logits, attn_lse, output,
+    )
+    return logits, attn_lse
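The 5D allocation the wrapper performs is plain shape arithmetic, so it can be sketched standalone. The sizes below are illustrative examples only (not the only legal values); they happen to hit the shipped gqa_ratio=16 variant:

```python
# Illustrative sizes only; no GPU or torch needed for the shape math.
num_seqs = 2
num_heads = 64
num_kv_heads = 4
max_seqlen_q = 4
num_kv_splits = 1
v_head_dim = 512

gqa_ratio = num_heads // num_kv_heads            # 16 in the shipped variant
q_seq_lens_internal = gqa_ratio * max_seqlen_q   # the gqa*max_seqlen_q axis

logits_shape = (num_seqs, num_kv_splits, num_kv_heads,
                q_seq_lens_internal, v_head_dim)
attn_lse_shape = logits_shape[:-1] + (1,)
print(logits_shape, attn_lse_shape)
```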

aiter/ops/attention.py

Lines changed: 42 additions & 0 deletions
@@ -688,6 +688,48 @@ def mla_decode_stage1_asm_fwd(
 ) -> None: ...
 
 
+MD_NAME_V4 = "module_mla_v4_asm"
+
+
+@compile_ops(MD_NAME_V4, ffi_type="ctypes")
+def mla_decode_v4_asm(
+    # [total_query_len, num_heads, head_size] FP8 packed Q + e8m0 scale region
+    Q: torch.Tensor,
+    # [total_query_len, num_heads, kv_rotary] BF16
+    qrope: torch.Tensor,
+    # [num_page, page_size, num_kv_heads, head_size] FP8
+    KV: torch.Tensor,
+    # [num_page, page_size, num_kv_heads, kv_rotary] BF16
+    kvrope: torch.Tensor,
+    # [num_seqs+1]
+    qo_indptr: torch.Tensor,
+    # [num_seqs+1]
+    kv_indptr: torch.Tensor,
+    # [num_page_used]
+    kv_page_indices: torch.Tensor,
+    # [num_seqs]
+    kv_last_page_lens: torch.Tensor,
+    # [num_seqs+1]
+    split_indptr: torch.Tensor,
+    max_seqlen_q: int,
+    # ignored on v4 nm; kernel hardcodes 1/sqrt(kV4DimNope+kV4DimRope)=1/sqrt(512)
+    softmax_scale: float,
+    # 0 = fp32 split-out path; 1 = bf16 nosplit reduce path
+    out_16_nosplit: int,
+    # poc_kl `sub_Q` (= per-WG Q tile); only 64 currently shipped
+    sub_Q: int,
+    # poc_kl `passes`
+    num_kv_splits: int,
+    # outputs
+    # [num_seqs, num_kv_splits, num_kv_heads, gqa*max_seqlen_q, v_head_dim] FP32
+    splitData: torch.Tensor,
+    # [num_seqs, num_kv_splits, num_kv_heads, gqa*max_seqlen_q, 1] FP32
+    splitLse: torch.Tensor,
+    # [total_query_len, num_heads, v_head_dim] BF16 (used when out_16_nosplit==1)
+    output: torch.Tensor,
+) -> None: ...
+
+
 @compile_ops(MD_NAME, ffi_type="ctypes")
 def mla_prefill_asm_fwd(
     # [num_seqs, num_heads, head_size]
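Behind this signature, the dispatcher selects a .co by variant lookup against cfg_mla_v4_asm; an unknown variant raises rather than loading a wrong shader. A hypothetical Python mirror of that behavior (the key tuple, table contents, and error text are assumptions; the real table is generated from mla_v4_asm.csv by codegen.py):

```python
# Hypothetical config table: one shipped variant, keyed per the CSV schema.
CFG_MLA_V4_ASM = {
    # (qType, kvType, gqa, sub_Q, page_size, num_kv_splits) -> co_name
    ("fp8", "fp8", 16, 64, 1, 1): "mla_a8w8_qh64_qseqlen4_gqaratio16_nm_recmp.co",
}

def lookup_v4_variant(q_type, kv_type, gqa, sub_q, page_size, num_kv_splits):
    """Return the .co name, or raise a clean RuntimeError (no wrong .co loaded)."""
    key = (q_type, kv_type, gqa, sub_q, page_size, num_kv_splits)
    if key not in CFG_MLA_V4_ASM:
        raise RuntimeError(f"no mla_v4 asm variant for {key}")
    return CFG_MLA_V4_ASM[key]
```

For example, sub_Q=128 raises here, mirroring what test_v4_nm_unknown_variant_raises expects at the Python level.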

aiter/utility/dtypes.py

Lines changed: 9 additions & 0 deletions
@@ -39,6 +39,15 @@ def get_dtype_fp8():
 globals().update({f"AITER_DTYPE_{name}": idx for name, idx in aiter_dtypes.items()})
 _torch_to_aiter_dtype = {globals()[name]: idx for name, idx in aiter_dtypes.items()}
 
+# Both e4m3fn (OCP) and e4m3fnuz (ROCm NUZ) are valid FP8 variants at the
+# byte level for kernels that just read raw FP8 bytes. Map both torch dtypes
+# to the same AITER_DTYPE_fp8 enum so the strict dtype check in
+# torch_to_aiter() / torch_to_aiter_pybind() accepts whichever variant the
+# caller has — letting v3/v4 MLA tests use either dtype interchangeably.
+if "fp8" in aiter_dtypes:
+    for _alt_fp8 in (torch.float8_e4m3fn, torch.float8_e4m3fnuz):
+        _torch_to_aiter_dtype.setdefault(_alt_fp8, aiter_dtypes["fp8"])
+
 
 def torch_to_aiter_pybind(tensor: torch.Tensor):
     """Convert torch.Tensor to pybind aiter_tensor_t for passing to C++ ops.
