DatasunriseOU
diff --git a/‎bench/tilelang_ports/sparse_mla_blockscaled.json‎
Lines changed: 62 additions & 25 deletions b/‎bench/tilelang_ports/sparse_mla_blockscaled.json‎
Lines changed: 62 additions & 25 deletions
diff --git a/‎bench/tilelang_ports/sparse_mla_fp8.json‎
Lines changed: 88 additions & 25 deletions b/‎bench/tilelang_ports/sparse_mla_fp8.json‎
Lines changed: 88 additions & 25 deletions
diff --git a/‎cppmega_mlx/nn/_tilelang/__init__.py‎
Lines changed: 18 additions & 0 deletions b/‎cppmega_mlx/nn/_tilelang/__init__.py‎
Lines changed: 18 additions & 0 deletions
@@ -5,19 +5,19 @@
   "shape": {
     "q_shape": [
       1,
-      64,
       4,
+      2,
       64
     ],
     "kv_shape": [
       1,
-      64,
+      4,
       1,
       64
     ],
     "indices_shape": [
       1,
-      64,
+      4,
       1,
       16
     ],
@@ -31,52 +31,89 @@
     "codegen_blocker_reason": "sparse_mla_blockscaled direct-MSL kernel built via mx.fast.metal_kernel is available; block-scaled MXFP8 dequant happens inline inside MSL on uint8 e4m3 storage with E8M0 block-scales. Apple MSL 4.0 has no native float8 simdgroup matrix, so the matmuls run as plain register fma.",
     "block_size": 32
   },
+  "path_c_tilelang_e8m0_qk_status": {
+    "available": false,
+    "reason": "TileLang Path C E8M0 Sparse-MLA QK is not safe to dispatch: no simdgroup_multiply_accumulate; scale operands disappeared from emitted MSL; E8M0 exp2(byte - 127) decode markers missing; scale operands are not indexed by K/32; scalar fallback markers present; Sparse-MLA M=1/topk tile violates current Metal FP8 simdgroup tile constraints",
+    "target": "metal",
+    "m": 1,
+    "n": 16,
+    "k": 64,
+    "transpose_B": true,
+    "scale_block_size": 32,
+    "scale_layout": "logical_unswizzled_k_axis_blocks",
+    "features": {
+      "kernel_void": 1,
+      "simdgroup_multiply_accumulate": 0,
+      "simdgroup_load": 0,
+      "simdgroup_store": 0,
+      "fp8_e4m3_decode_helper": 3,
+      "A_scale_refs": 0,
+      "B_scale_refs": 0,
+      "signature_has_A_scale": false,
+      "signature_has_B_scale": false,
+      "e8m0_exp2": 0,
+      "e8m0_bias_subtract_127": 0,
+      "e8m0_sentinel_255": 0,
+      "e8m0_zero_sentinel": 1,
+      "k_block_shift_5": 0,
+      "k_block_div_32": 0,
+      "A_scale_collapsed_zero": 0,
+      "B_scale_collapsed_zero": 0,
+      "float_a_val": true,
+      "float_b_val": true,
+      "threadgroup_half": false,
+      "scale_format": "e8m0_block_k32",
+      "scale_block_size": 32,
+      "scale_axis": "contracted_k",
+      "scale_layout": "logical_unswizzled_k_axis_blocks"
+    }
+  },
   "parity": {
     "blockscaled_vs_bf16": {
-      "max_abs_err": 0.011328823864459991,
-      "max_rel_err": 0.11758861583070061
+      "max_abs_err": 0.004521891474723816,
+      "max_rel_err": 0.06304103596741027
     },
     "quantized_matmul_vs_bf16": {
       "max_abs_err": 0.0,
       "max_rel_err": 0.0
     },
     "msl_blockscaled_vs_bf16": {
-      "max_abs_err": 0.011317778378725052,
-      "max_rel_err": 0.1174739681502098
+      "max_abs_err": 0.0045216078869998455,
+      "max_rel_err": 0.06303708238647923
     },
     "msl_blockscaled_vs_bs_ref": {
-      "max_abs_err": 3.0465424060821533e-05,
-      "max_rel_err": 0.0003377691831805473
+      "max_abs_err": 2.81408429145813e-05,
+      "max_rel_err": 0.00039434889228588904
     }
   },
   "bench": {
     "bf16_reference": {
       "label": "bf16_reference",
-      "median_ms": 0.9395829401910305,
-      "min_ms": 0.665042083710432,
-      "max_ms": 0.9992080740630627,
-      "iters": 8
+      "median_ms": 0.608749920502305,
+      "min_ms": 0.5949169863015413,
+      "max_ms": 0.6114158313721418,
+      "iters": 3
     },
     "blockscaled_reference": {
       "label": "blockscaled_reference",
-      "median_ms": 0.645665917545557,
-      "min_ms": 0.3741669934242964,
-      "max_ms": 1.2330419849604368,
-      "iters": 8
+      "median_ms": 0.5933749489486217,
+      "min_ms": 0.5850829184055328,
+      "max_ms": 0.6149171385914087,
+      "iters": 3
     },
     "quantized_matmul_reference": {
       "label": "quantized_matmul_reference",
-      "median_ms": 0.3338339738547802,
-      "min_ms": 0.2703331410884857,
-      "max_ms": 0.5257499869912863,
-      "iters": 8
+      "median_ms": 0.5981249269098043,
+      "min_ms": 0.5954578518867493,
+      "max_ms": 0.7106249686330557,
+      "iters": 3
     },
     "path_b_msl_blockscaled_fwd": {
       "label": "path_b_msl_blockscaled_fwd",
-      "median_ms": 0.9964159689843655,
-      "min_ms": 0.5499999970197678,
-      "max_ms": 1.009250059723854,
-      "iters": 8
+      "median_ms": 0.18091709353029728,
+      "min_ms": 0.1712499652057886,
+      "max_ms": 0.20291702821850777,
+      "iters": 3
     }
   }
 }
@@ -5,19 +5,19 @@
   "shape": {
     "q_shape": [
       1,
-      64,
       4,
+      2,
       64
     ],
     "kv_shape": [
       1,
-      64,
+      4,
       1,
       64
     ],
     "indices_shape": [
       1,
-      64,
+      4,
       1,
       16
     ],
@@ -31,52 +31,115 @@
     "codegen_blocker_reason": "sparse_mla_fp8 direct-MSL kernel built via mx.fast.metal_kernel is available; FP8 e4m3 dequant happens inline inside MSL on uint8 storage. Apple MSL 4.0 has no native float8 simdgroup matrix, so the matmuls run as plain register fma ops (still ~2x faster than the BF16-fallback reference because the dequant fuses with the QK loop).",
     "fp8_dtype": "float8_e4m3"
   },
+  "path_c_tilelang_qk_status": {
+    "available": false,
+    "reason": "TileLang Path C FP8 Sparse-MLA QK is not safe to dispatch: no simdgroup_multiply_accumulate; scale operands disappeared from emitted MSL; scalar fallback markers present; Sparse-MLA M=1/topk tile violates current Metal FP8 simdgroup tile constraints",
+    "target": "metal",
+    "m": 1,
+    "n": 16,
+    "k": 64,
+    "transpose_B": true,
+    "features": {
+      "kernel_void": 1,
+      "simdgroup_multiply_accumulate": 0,
+      "simdgroup_load": 0,
+      "simdgroup_store": 0,
+      "fp8_e4m3_decode_helper": 3,
+      "A_scale_refs": 0,
+      "B_scale_refs": 0,
+      "signature_has_A_scale": false,
+      "signature_has_B_scale": false,
+      "float_a_val": true,
+      "float_b_val": true,
+      "threadgroup_half": false
+    }
+  },
+  "path_c_tilelang_qk_reduce_status": {
+    "available": true,
+    "reason": "TileLang Path C FP8 Sparse-MLA real QK reducer is dispatchable for M=1/topk with per-row B scales",
+    "target": "metal",
+    "n": 16,
+    "k": 64,
+    "outputs_per_block": 4,
+    "reduce_threads": 32,
+    "vec": 4,
+    "features": {
+      "kernel_void": 1,
+      "fp8_e4m3_decode_helper": 3,
+      "scalar_fp8_byte_decode": 3,
+      "scalar_fp8_byte_decode_calls": 2,
+      "tvm_thread_allreduce": 0,
+      "simd_sum": 0,
+      "simd_shuffle_down": 5,
+      "A_scale_refs": 1,
+      "B_scale_refs": 1,
+      "signature_has_A_scale": true,
+      "signature_has_B_scale": true,
+      "per_row_B_scale": true,
+      "reinterpret_cast": 0,
+      "device_const_uint": 0,
+      "uchar4": 0,
+      "threadgroup_half": false,
+      "qk_shape": "m1_n_topk_k"
+    }
+  },
   "parity": {
     "fp8_vs_bf16": {
-      "max_abs_err": 0.0030034519731998444,
-      "max_rel_err": 0.031174618342377305
+      "max_abs_err": 0.0034084729850292206,
+      "max_rel_err": 0.047518537152928295
     },
     "quantized_matmul_vs_bf16": {
       "max_abs_err": 0.0,
       "max_rel_err": 0.0
     },
     "msl_fp8_vs_bf16": {
-      "max_abs_err": 0.0030084550380706787,
-      "max_rel_err": 0.0312265481349234
+      "max_abs_err": 0.003423169255256653,
+      "max_rel_err": 0.04772342223368997
     },
     "msl_fp8_vs_fp8_ref": {
-      "max_abs_err": 3.0413269996643066e-05,
-      "max_rel_err": 0.0003079714788127215
+      "max_abs_err": 3.0234456062316895e-05,
+      "max_rel_err": 0.00042000606965991613
+    },
+    "path_c_qk_reduce_vs_oracle": {
+      "max_abs_err": 0.0,
+      "max_rel_err": 0.0
     }
   },
   "bench": {
     "bf16_reference": {
       "label": "bf16_reference",
-      "median_ms": 0.9395829401910305,
-      "min_ms": 0.665042083710432,
-      "max_ms": 0.9992080740630627,
-      "iters": 8
+      "median_ms": 0.608749920502305,
+      "min_ms": 0.5949169863015413,
+      "max_ms": 0.6114158313721418,
+      "iters": 3
     },
     "fp8_reference": {
       "label": "fp8_reference",
-      "median_ms": 0.4516250919550657,
-      "min_ms": 0.33437483943998814,
-      "max_ms": 0.5283341743052006,
-      "iters": 8
+      "median_ms": 0.6220829673111439,
+      "min_ms": 0.6120421458035707,
+      "max_ms": 0.6377911195158958,
+      "iters": 3
     },
     "quantized_matmul_reference": {
       "label": "quantized_matmul_reference",
-      "median_ms": 0.3338339738547802,
-      "min_ms": 0.2703331410884857,
-      "max_ms": 0.5257499869912863,
-      "iters": 8
+      "median_ms": 0.5981249269098043,
+      "min_ms": 0.5954578518867493,
+      "max_ms": 0.7106249686330557,
+      "iters": 3
     },
     "path_b_msl_fp8_fwd": {
       "label": "path_b_msl_fp8_fwd",
-      "median_ms": 0.9899579454213381,
-      "min_ms": 0.21674996241927147,
-      "max_ms": 2.00541689991951,
-      "iters": 8
+      "median_ms": 0.5564999300986528,
+      "min_ms": 0.22545899264514446,
+      "max_ms": 0.6050418596714735,
+      "iters": 3
+    },
+    "path_c_tilelang_fp8_qk_reduce": {
+      "label": "path_c_tilelang_fp8_qk_reduce",
+      "median_ms": 0.20920787937939167,
+      "min_ms": 0.1621670089662075,
+      "max_ms": 0.5389999132603407,
+      "iters": 3
     }
   }
 }
@@ -104,6 +104,16 @@
     sparse_mla_blockscaled_metal_status,
     sparse_mla_blockscaled_reference,
 )
+from cppmega_mlx.nn._tilelang.sparse_mla_blockscaled_path_c import (
+    E8M0_BLOCK_SIZE,
+    E8M0_LAYOUT,
+    E8M0_SCALE_FORMAT,
+    SparseMLABlockScaledPathCStatus,
+    blockscaled_sparse_mla_qk_msl_features,
+    blockscaled_sparse_mla_qk_path_c_status,
+    lower_blockscaled_sparse_mla_qk_msl,
+    make_blockscaled_sparse_mla_qk_kernel,
+)
 from cppmega_mlx.nn._tilelang.sparse_mla_fp8 import (
     SparseMLAFp8MetalStatus,
     sparse_mla_fp8_apply,
@@ -123,11 +133,15 @@
 __all__ = [
     "FP8MSLKernelStatus",
     "FP8VecmatPathCStatus",
+    "E8M0_BLOCK_SIZE",
+    "E8M0_LAYOUT",
+    "E8M0_SCALE_FORMAT",
     "M2RNNMetalStatus",
     "Mamba3MetalStatus",
     "MXFP8_BLOCK_SIZE",
     "PathBStatus",
     "SparseMLABlockScaledMetalStatus",
+    "SparseMLABlockScaledPathCStatus",
     "SparseMLAFp8MetalStatus",
     "SparseMLAMetalStatus",
     "SparseMLAPathCStatus",
@@ -139,6 +153,8 @@
     "build_mlx_body",
     "bwd_dadt_fused",
     "bwd_dtrap_ddt",
+    "blockscaled_sparse_mla_qk_msl_features",
+    "blockscaled_sparse_mla_qk_path_c_status",
     "compute_dacs_segsum",
     "fp8_msl_kernels",
     "fp8_msl_status",
@@ -150,6 +166,8 @@
     "fp8_vecmat_path_c_status",
     "half_to_fp8",
     "lower_fp8_vecmat_msl",
+    "lower_blockscaled_sparse_mla_qk_msl",
+    "make_blockscaled_sparse_mla_qk_kernel",
     "make_fp8_vecmat_reduce_kernel",
     "m2rnn",
     "m2rnn_apply",