addressed review comments

xintin · xintin · commit 2c03c782c6b8 · 2026-04-09T20:21:08.000Z
Signed-off-by: xintin <gaurav.verma@amd.com>
diff --git a/examples/python/7.1_schedule.py b/examples/python/7.1_schedule.py
@@ -32,6 +32,7 @@
 from wave_lang.kernel.wave.templates import (
     get_tagged_mxfp4_gemm,
     get_tagged_mxfp4_gemm_preshuffle_b,
+    get_tagged_mxfp4_gemm_preshuffle_b_wide_store,
     get_tagged_mxfp4_gemm_preshuffle_scales,
     get_tagged_mxfp4_gemm_preshuffle_scales_and_B,
 )
@@ -432,18 +433,16 @@ def test_dbuf_4wave_mxfp_dynamic_preshuffle_b_gemm_wide_stores(
 ):
     """Preshuffle-B MXFP4 GEMM with dynamic M, N, K and wide epilogue stores.
 
-    Uses wide_stores=True to swap MFMA operands (B as LHS, A as RHS),
+    Uses the wide_store variant to swap MFMA operands (B as LHS, A as RHS),
     aligning the accumulator's contiguous values with the output's stride-1
     dimension. The coalesce_wide_stores pass emits v_permlane16_swap_b32
     + buffer_store_dwordx4 (8 bf16 per store) instead of buffer_store_short.
     """
-    gemm, options = get_tagged_mxfp4_gemm_preshuffle_b(
+    gemm, options = get_tagged_mxfp4_gemm_preshuffle_b_wide_store(
         shape,
         block,
         wave_shape=(2, 2),
         reorder_workgroups=True,
-        output_dtype=tkl.bf16,
-        wide_stores=True,
     )
     dynamic_symbols = [tkl.sym.M, tkl.sym.N, tkl.sym.K]
     for sym in dynamic_symbols:
diff --git a/lit_tests/kernel/wave/wide_stores_mxfp4.py b/lit_tests/kernel/wave/wide_stores_mxfp4.py
@@ -3,7 +3,7 @@
 """
 Test wide store coalescing for preshuffle-B MXFP4 GEMM with bf16 output.
 
-When wide_stores=True, the kernel swaps MFMA operands (B as LHS, A as RHS)
+The wide_store variant kernel swaps MFMA operands (B as LHS, A as RHS)
 so the accumulator's 4-contiguous values align with the output's stride-1
 dimension. The coalesce_wide_stores pass tags eligible bf16 global
 writes, and the codegen emits v_permlane16_swap_b32 to exchange data
@@ -23,21 +23,21 @@
 from wave_lang.kernel.wave.compile import wave_compile
 from wave_lang.kernel.wave.constraints import ScaledMMAType
 from wave_lang.kernel.wave.schedules import get_mxfp4_asymmetric_schedule
-from wave_lang.kernel.wave.templates import get_tagged_mxfp4_gemm_preshuffle_b
+from wave_lang.kernel.wave.templates import (
+    get_tagged_mxfp4_gemm_preshuffle_b_wide_store,
+)
 from wave_lang.kernel.wave.utils.general_utils import run_test
 
 
 @run_test
 def test_wide_stores_preshuffle_b_mxfp4():
     shape = (1024, 3072, 8192)
     block = (256, 192, 256)
-    kernel, options = get_tagged_mxfp4_gemm_preshuffle_b(
+    kernel, options = get_tagged_mxfp4_gemm_preshuffle_b_wide_store(
         shape,
         block,
         wave_shape=(2, 2),
         reorder_workgroups=True,
-        output_dtype=tkl.bf16,
-        wide_stores=True,
         mfma_variant=ScaledMMAType.F32_16x16x128_F8F6F4,
     )
     dynamic_symbols = [tkl.sym.M, tkl.sym.N, tkl.sym.K]
diff --git a/tests/kernel/wave_gemm_mxfp_test.py b/tests/kernel/wave_gemm_mxfp_test.py
@@ -29,6 +29,7 @@
 from wave_lang.kernel.wave.templates import (
     get_tagged_mxfp4_gemm,
     get_tagged_mxfp4_gemm_preshuffle_b,
+    get_tagged_mxfp4_gemm_preshuffle_b_wide_store,
     get_tagged_mxfp4_gemm_preshuffle_scales,
     get_tagged_mxfp4_gemm_preshuffle_scales_and_B,
 )
@@ -1051,17 +1052,15 @@ def testScaledGemmMXFP4PreshuffleBWideStores(
 ):
     """End-to-end test for MXFP4 GEMM with wide epilogue stores (dwordx4).
 
-    Uses wide_stores=True to swap MFMA operands and emit buffer_store_dwordx4
-    via v_permlane16_swap_b32 for bf16 output.
+    Uses the wide_store variant to swap MFMA operands and emit
+    buffer_store_dwordx4 via v_permlane16_swap_b32 for bf16 output.
     """
-    gemm, options = get_tagged_mxfp4_gemm_preshuffle_b(
+    gemm, options = get_tagged_mxfp4_gemm_preshuffle_b_wide_store(
         shape,
         block_shape,
         wave_shape=wave_shape,
         mfma_variant=mfma_variant,
         reorder_workgroups=True,
-        output_dtype=tkl.bf16,
-        wide_stores=True,
     )
     dynamic_symbols = [tkl.sym.M, tkl.sym.N, tkl.sym.K]
     for sym in dynamic_symbols:
diff --git a/wave_lang/kernel/compiler/wave_codegen/read_write.py b/wave_lang/kernel/compiler/wave_codegen/read_write.py
@@ -1436,14 +1436,16 @@ def _write_permlane_pack_to_global(
     wide_i32 = vector_d.from_elements(v4i32_type, [d0, d1, d2, d3])
     wide_vec = vector_d.bitcast(v8bf16_type, wide_i32)
 
-    four = arith_d.constant(idx_type, 4)
+    elems_per_thread = arith_d.constant(idx_type, num_elems)
 
     adj_th = list(start_indices_th)
-    adj_th[-1] = arith_d.select(is_lower, adj_th[-1], arith_d.subi(adj_th[-1], four))
+    adj_th[-1] = arith_d.select(
+        is_lower, adj_th[-1], arith_d.subi(adj_th[-1], elems_per_thread)
+    )
 
     adj_full = list(start_indices)
     adj_full[-1] = arith_d.select(
-        is_lower, adj_full[-1], arith_d.subi(adj_full[-1], four)
+        is_lower, adj_full[-1], arith_d.subi(adj_full[-1], elems_per_thread)
     )
 
     _create_vec_read_write(
diff --git a/wave_lang/kernel/wave/templates/__init__.py b/wave_lang/kernel/wave/templates/__init__.py
@@ -9,6 +9,7 @@
 from .tagged_mxfp4_gemm import (
     get_tagged_mxfp4_gemm,
     get_tagged_mxfp4_gemm_preshuffle_b,
+    get_tagged_mxfp4_gemm_preshuffle_b_wide_store,
     get_tagged_mxfp4_gemm_preshuffle_scales,
     get_tagged_mxfp4_gemm_preshuffle_scales_and_B,
 )
@@ -18,6 +19,7 @@
     "get_tagged_bshd_attention_kernel",
     "get_tagged_mxfp4_gemm",
     "get_tagged_mxfp4_gemm_preshuffle_b",
+    "get_tagged_mxfp4_gemm_preshuffle_b_wide_store",
     "get_tagged_mxfp4_gemm_preshuffle_scales",
     "get_tagged_mxfp4_gemm_preshuffle_scales_and_B",
 ]
diff --git a/wave_lang/kernel/wave/templates/tagged_mxfp4_gemm.py b/wave_lang/kernel/wave/templates/tagged_mxfp4_gemm.py
@@ -10,8 +10,9 @@
 All ops are tagged for use with MXFP4 schedule functions (e.g. get_mxfp4_dbuf_schedule).
 
 Provides:
-  - get_tagged_mxfp4_gemm:                  vanilla (A, B via LDS)
-  - get_tagged_mxfp4_gemm_preshuffle_b:     B + B_scale preshuffled (direct global reads)
+  - get_tagged_mxfp4_gemm:                           vanilla (A, B via LDS)
+  - get_tagged_mxfp4_gemm_preshuffle_b:              B + B_scale preshuffled (direct global reads)
+  - get_tagged_mxfp4_gemm_preshuffle_b_wide_store:   same + wide epilogue stores via permlane swap
 
 Required tags: k_loop, read_a, read_a_scale, read_b, read_b_scale,
 bitcast_a, bitcast_a_scale, bitcast_b, bitcast_b_scale, scaled_mma.
@@ -377,37 +378,20 @@ def get_tagged_mxfp4_gemm_preshuffle_scales_and_B(
     )
 
 
-def get_tagged_mxfp4_gemm_preshuffle_b(
-    shape: tuple[int, int, int] = (1024, 1024, 8192),
-    block_shape: tuple[int, int, int] = (256, 256, 256),
-    wave_shape: tuple[int, int] = (2, 2),
-    mfma_variant: ScaledMMAType = ScaledMMAType.F32_16x16x128_F8F6F4,
-    a_address_space: tkl.AddressSpace = SHARED_ADDRESS_SPACE,
+def _get_tagged_mxfp4_gemm_preshuffle_b_impl(
+    shape: tuple[int, int, int],
+    block_shape: tuple[int, int, int],
+    wave_shape: tuple[int, int],
+    mfma_variant: ScaledMMAType,
+    a_address_space: tkl.AddressSpace,
+    *,
     a_scale_preshuffle: bool = True,
-    reorder_workgroups=True,
-    group_size_n=32,
+    reorder_workgroups: bool = True,
+    group_size_n: int = 32,
     output_dtype=tkl.f32,
     wide_stores: bool = False,
 ):
-    """Return a tagged MXFP4 scaled GEMM kernel with preshuffled B and B_scale.
-
-    B data is read directly from global memory using a preshuffle mapping
-    (aiter shuffle_weight permutation).  B scales are also read from global
-    memory using an e8m0 scale preshuffle mapping.  A and A_scale go through
-    shared memory (LDS) as usual.
-
-    All ops are tagged for use with MXFP4 schedule functions.
-
-    Args:
-        shape: (M, N, K) problem dimensions.
-        block_shape: (BLOCK_M, BLOCK_N, BLOCK_K) tile sizes.
-        wave_shape: (WAVE_M, WAVE_N) waves per workgroup.
-        mfma_variant: Scaled MMA instruction type.
-        a_address_space: Address space for A and A_scale (typically SHARED).
-
-    Returns:
-        (kernel_function, WaveCompileOptions)
-    """
+    """Shared implementation for preshuffle-B MXFP4 GEMM with optional wide stores."""
     M = tkl.sym.M
     N = tkl.sym.N
     K = tkl.sym.K
@@ -599,6 +583,94 @@ def repeat(
     return gemm, options
 
 
+def get_tagged_mxfp4_gemm_preshuffle_b(
+    shape: tuple[int, int, int] = (1024, 1024, 8192),
+    block_shape: tuple[int, int, int] = (256, 256, 256),
+    wave_shape: tuple[int, int] = (2, 2),
+    mfma_variant: ScaledMMAType = ScaledMMAType.F32_16x16x128_F8F6F4,
+    a_address_space: tkl.AddressSpace = SHARED_ADDRESS_SPACE,
+    a_scale_preshuffle: bool = True,
+    reorder_workgroups=True,
+    group_size_n=32,
+    output_dtype=tkl.f32,
+):
+    """Return a tagged MXFP4 scaled GEMM kernel with preshuffled B and B_scale.
+
+    B data is read directly from global memory using a preshuffle mapping
+    (aiter shuffle_weight permutation).  B scales are also read from global
+    memory using an e8m0 scale preshuffle mapping.  A and A_scale go through
+    shared memory (LDS) as usual.
+
+    All ops are tagged for use with MXFP4 schedule functions.
+
+    Args:
+        shape: (M, N, K) problem dimensions.
+        block_shape: (BLOCK_M, BLOCK_N, BLOCK_K) tile sizes.
+        wave_shape: (WAVE_M, WAVE_N) waves per workgroup.
+        mfma_variant: Scaled MMA instruction type.
+        a_address_space: Address space for A and A_scale (typically SHARED).
+
+    Returns:
+        (kernel_function, WaveCompileOptions)
+    """
+    return _get_tagged_mxfp4_gemm_preshuffle_b_impl(
+        shape,
+        block_shape,
+        wave_shape,
+        mfma_variant,
+        a_address_space,
+        a_scale_preshuffle=a_scale_preshuffle,
+        reorder_workgroups=reorder_workgroups,
+        group_size_n=group_size_n,
+        output_dtype=output_dtype,
+        wide_stores=False,
+    )
+
+
+def get_tagged_mxfp4_gemm_preshuffle_b_wide_store(
+    shape: tuple[int, int, int] = (1024, 1024, 8192),
+    block_shape: tuple[int, int, int] = (256, 256, 256),
+    wave_shape: tuple[int, int] = (2, 2),
+    mfma_variant: ScaledMMAType = ScaledMMAType.F32_16x16x128_F8F6F4,
+    a_address_space: tkl.AddressSpace = SHARED_ADDRESS_SPACE,
+    a_scale_preshuffle: bool = True,
+    reorder_workgroups=True,
+    group_size_n=32,
+    output_dtype=tkl.bf16,
+):
+    """Return a tagged MXFP4 scaled GEMM kernel with preshuffled B, B_scale, and wide stores.
+
+    Like :func:`get_tagged_mxfp4_gemm_preshuffle_b` but swaps MFMA operands
+    (B as LHS, A as RHS) so the accumulator's 4-contiguous values align with
+    the output memory's stride-1 dimension.  The ``coalesce_wide_stores`` pass
+    tags the stores so codegen emits ``v_permlane16_swap_b32`` +
+    ``buffer_store_dwordx4`` (8 bf16 per store) instead of scalar ``buffer_store_short``.
+
+    Args:
+        shape: (M, N, K) problem dimensions.
+        block_shape: (BLOCK_M, BLOCK_N, BLOCK_K) tile sizes.
+        wave_shape: (WAVE_M, WAVE_N) waves per workgroup.
+        mfma_variant: Scaled MMA instruction type.
+        a_address_space: Address space for A and A_scale (typically SHARED).
+        output_dtype: Output element type (default bf16).
+
+    Returns:
+        (kernel_function, WaveCompileOptions)
+    """
+    return _get_tagged_mxfp4_gemm_preshuffle_b_impl(
+        shape,
+        block_shape,
+        wave_shape,
+        mfma_variant,
+        a_address_space,
+        a_scale_preshuffle=a_scale_preshuffle,
+        reorder_workgroups=reorder_workgroups,
+        group_size_n=group_size_n,
+        output_dtype=output_dtype,
+        wide_stores=True,
+    )
+
+
 def _reorder_mxfp4_workgroups(m, n, block_m, block_n, group_size_n):
     """Remap workgroup indices to a new order based on group_size_n along N dimension.
 
diff --git a/wave_lang/kernel/wave/wide_store_coalescing.py b/wave_lang/kernel/wave/wide_store_coalescing.py
@@ -5,12 +5,13 @@
 """
 Graph pass that tags eligible epilogue bf16 stores for wide store coalescing.
 
-When a kernel uses swapped MFMA operands (wide_stores=True), the
-accumulator's 4-contiguous values align with the output's stride-1
-dimension. This pass identifies Write nodes that use the source/target
-dimension remapping pattern (indicating swapped operands) and tags them
-so the codegen emits v_permlane16_swap_b32 + buffer_store_dwordx4
-instead of scalar buffer_store_short.
+When a kernel uses swapped MFMA operands (e.g.
+``get_tagged_mxfp4_gemm_preshuffle_b_wide_store``), the accumulator's
+4-contiguous values align with the output's stride-1 dimension. This
+pass identifies Write nodes that use the source/target dimension
+remapping pattern (indicating swapped operands) and tags them so the
+codegen emits v_permlane16_swap_b32 + buffer_store_dwordx4 instead of
+scalar buffer_store_short.
 
 Only tags writes that satisfy ALL conditions:
   1. Target memory is global address space
@@ -30,9 +31,9 @@ def coalesce_wide_stores(trace: CapturedTrace):
     """Tag eligible bf16 global writes for permlane16_swap wide stores.
 
     Only tags Write nodes that use the source/target dimension remapping
-    pattern, which indicates the kernel was built with ``wide_stores=True``
-    (swapped MFMA operands). Writes without source/target are left
-    untouched, making this pass safe to run unconditionally.
+    pattern (swapped MFMA operands, as produced by the wide_store kernel
+    variant). Writes without source/target are left untouched, making
+    this pass safe to run unconditionally.
     """
     import wave_lang.kernel.lang as tkl