iree-org
diff --git a/‎examples/python/7.1_schedule.py‎
Lines changed: 16 additions & 11 deletions b/‎examples/python/7.1_schedule.py‎
Lines changed: 16 additions & 11 deletions
diff --git a/‎wave_lang/kernel/compiler/wave_codegen/read_write.py‎
Lines changed: 12 additions & 2 deletions b/‎wave_lang/kernel/compiler/wave_codegen/read_write.py‎
Lines changed: 12 additions & 2 deletions
diff --git a/‎wave_lang/kernel/wave/coalesce_epilogue_stores.py‎
Lines changed: 48 additions & 0 deletions b/‎wave_lang/kernel/wave/coalesce_epilogue_stores.py‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎wave_lang/kernel/wave/compile.py‎
Lines changed: 10 additions & 1 deletion b/‎wave_lang/kernel/wave/compile.py‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎wave_lang/kernel/wave/compile_options.py‎
Lines changed: 3 additions & 3 deletions b/‎wave_lang/kernel/wave/compile_options.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎wave_lang/kernel/wave/templates/tagged_mxfp4_gemm.py‎
Lines changed: 2 additions & 4 deletions b/‎wave_lang/kernel/wave/templates/tagged_mxfp4_gemm.py‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎waveasm/include/waveasm/Dialect/WaveASMOps.td‎
Lines changed: 15 additions & 0 deletions b/‎waveasm/include/waveasm/Dialect/WaveASMOps.td‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎waveasm/include/waveasm/Transforms/TranslateFromMLIR.h‎
Lines changed: 16 additions & 0 deletions b/‎waveasm/include/waveasm/Transforms/TranslateFromMLIR.h‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎waveasm/lib/Transforms/AssemblyEmitter.cpp‎
Lines changed: 65 additions & 0 deletions b/‎waveasm/lib/Transforms/AssemblyEmitter.cpp‎
Lines changed: 65 additions & 0 deletions
@@ -465,15 +465,21 @@ def test_dbuf_4wave_mxfp_dynamic_preshuffle_b_gemm_wide_stores(
 
 def test_dbuf_4wave_mxfp_dynamic_preshuffle_b_gemm_asm(
     is_debug=False,
-    shape=(1024, 1024, 8192),
-    block=(128, 256, 256),
+    shape=(1024, 3072, 8192),
+    block=(128, 128, 256),
     eliminate_epilogue=False,
 ):
-    """Preshuffle-B MXFP4 GEMM with dynamic M, N, K."""
-    gemm, options = get_tagged_mxfp4_gemm_preshuffle_b(
-        shape, block, wave_shape=(1, 4), reorder_workgroups=False
+    """Preshuffle-B MXFP4 GEMM with coalesced dwordx4 stores (WaveASM backend).
+
+    Same kernel as the LLVM coalesced-stores test but compiled through the
+    C++ WaveASM backend.  Emits v_permlane16_swap_b32 + buffer_store_dwordx4.
+    """
+    gemm, options = get_tagged_mxfp4_gemm_preshuffle_b_wide_store(
+        shape,
+        block,
+        wave_shape=(1, 4),
+        reorder_workgroups=True,
     )
-    # Make M, N, K dynamic so the compiler does not specialize on problem size.
     dynamic_symbols = [tkl.sym.M, tkl.sym.N, tkl.sym.K]
     for sym in dynamic_symbols:
         del options.subs[sym]
@@ -483,18 +489,17 @@ def test_dbuf_4wave_mxfp_dynamic_preshuffle_b_gemm_asm(
     options.use_wave_asm_backend = True
     options.wave_runtime = True
     options.eliminate_epilogue = eliminate_epilogue
-    options.dump_intermediates = "build/intermediates/"
+    options.coalesce_epilogue_stores = True
+    options._skip_vgpr_compaction = True
     schedule = get_mxfp4_asymmetric_schedule(
         eliminate_epilogue=eliminate_epilogue, is_bscale_shuffled=True
     )
     options.print_ir_after = "all" if is_debug else []
     options = set_default_run_config(options)
     gemm = wave_compile(options, gemm, schedule)
 
-    _run_mxfp_gemm_preshuffle(gemm, shape, all=True)
-    print(
-        "MXFP GEMM preshuffle-B 4-wave dynamic M, N, K (WaveASM backend) test passed!"
-    )
+    _run_mxfp_gemm_preshuffle(gemm, shape, all=True, output_dtype=torch.bfloat16)
+    print("MXFP GEMM preshuffle-B 4-wave dwordx4 (WaveASM backend) test passed!")
 
 
 if __name__ == "__main__":
 
@@ -1322,9 +1322,19 @@ def handle_write(emitter: WaveEmitter, node: fx.Node):
 
     use_llvm_store = flags != MemoryAccessFlags.NONE
 
+    is_shared = get_custom(memory).type.address_space == SHARED_ADDRESS_SPACE
+    is_bf16 = isinstance(element_type, BF16Type)
+
+    if (
+        is_bf16
+        and not is_shared
+        and emitter.options.use_buffer_ops
+        and emitter.options.backend == "asm"
+    ):
+        mask = None
+
     if getattr(node, "_permlane_pack_global", False):
-        is_shared = get_custom(memory).type.address_space == SHARED_ADDRESS_SPACE
-        if not is_shared and isinstance(element_type, BF16Type):
+        if not is_shared and is_bf16:
             _write_permlane_pack_to_global(
                 emitter,
                 insert_vector,
 
@@ -0,0 +1,48 @@
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+Graph pass that coalesces epilogue bf16 stores via permlane16_swap.
+
+Marks eligible Write nodes so the codegen combines each thread's 4 bf16
+values with its partner lane's (16 lanes apart) via v_permlane16_swap_b32,
+producing 8 consecutive bf16 written as a single buffer_store_dwordx4.
+No LDS staging or barriers required.
+
+Precondition (not verified by this pass): the output memory must have M as
+the innermost (contiguous) dimension (i.e. transpose_output=True producing
+[N, M] layout) so that 8 consecutive bf16 elements span 8 adjacent M rows.
+"""
+
+from .._support.tracing import CapturedTrace
+from ..lang.global_symbols import GLOBAL_ADDRESS_SPACE
+from ..ops.wave_ops import Write, get_custom
+from .region_canonicalization import RegionFormat, requires_region_format
+from .utils.symbol_utils import subs_idxc
+
+
+@requires_region_format(RegionFormat.SCHEDULE_SIGNATURE_PLACEHOLDERS)
+def coalesce_epilogue_stores(trace: CapturedTrace):
+    """Tag epilogue bf16 global writes for permlane16_swap packing.
+
+    Walks only the root graph (nested graphs are not visited) and sets
+    ``_permlane_pack_global = True`` on every bf16 global-memory Write node.
+    ``handle_write`` reads the flag and calls ``_write_permlane_pack_to_global``.
+    """
+    import wave_lang.kernel.lang as tkl
+
+    root_graph = trace.get_root_graph()
+
+    for node in root_graph.nodes:
+        if node.op != "call_function":
+            continue
+        custom = get_custom(node)
+        if not isinstance(custom, Write):
+            continue
+        mem_type = custom.memory_type
+        if (
+            subs_idxc(mem_type.address_space) == GLOBAL_ADDRESS_SPACE
+            and mem_type.dtype == tkl.bf16
+        ):
+            node._permlane_pack_global = True
@@ -534,6 +534,11 @@ def build_graph_passes(
 
     graph_passes.append(partial(coalesce_wide_stores, trace))
 
+    if options.coalesce_epilogue_stores:
+        from .coalesce_epilogue_stores import coalesce_epilogue_stores
+
+        graph_passes.append(partial(coalesce_epilogue_stores, trace))
+
     graph_passes += [
         partial(simplify_indices, trace, launchable.constraints),
         partial(
@@ -1370,7 +1375,11 @@ def _generate_asm_code(mb, options):
             "--waveasm-scc-spill-reload",
             "--waveasm-scc-verifier",
             "--waveasm-linear-scan=max-vgprs=256 max-agprs=256",
-            "--waveasm-vgpr-compaction",
+            *(
+                []
+                if getattr(options, "_skip_vgpr_compaction", False)
+                else ["--waveasm-vgpr-compaction"]
+            ),
             waitcnt_flag,
             f"--waveasm-hazard-mitigation=target={options.target}",
             "--emit-assembly",
 
@@ -105,6 +105,8 @@ class WaveCompileOptions:
     specialize: bool = False
     eliminate_epilogue: bool = False
 
+    coalesce_epilogue_stores: bool = False
+
     # Cluster barrier signal/wait delay in number of loop iterations
     # None - no barriers inside the loop
     # 0 - signal and wait on same iteration
@@ -118,11 +120,9 @@ class WaveCompileOptions:
     # keep read linearization without annotating every buffer.
     allow_noncontiguous_runtime_buffers: bool = False
 
-    # Dynamic strides are enabled whenever wave_runtime is active,
-    # supported by both LLVM and waveasm backends.
     @property
     def dynamic_strides(self) -> bool:
-        return self.wave_runtime
+        return self.wave_runtime and self.backend == "llvm"
 
     # === Print options ===
     mlir_print_ir_after_all: bool = False
 
@@ -404,10 +404,6 @@ def _get_tagged_mxfp4_gemm_preshuffle_b_impl(
     K_PACKED = tkl.sym.K_PACKED
     K_SCALE_SHUFFLED = tkl.sym.K_SCALE_SHUFFLED
 
-    if wide_stores:
-        m_symbol = tkl.sym.m_symbol
-        n_symbol = tkl.sym.n_symbol
-
     constraints: list[tkw.Constraint] = [tkw.WorkgroupConstraint(M, BLOCK_M, 0)]
     constraints += [tkw.WorkgroupConstraint(N, BLOCK_N, 1)]
     constraints += [tkw.TilingConstraint(K, BLOCK_K)]
@@ -426,6 +422,8 @@ def _get_tagged_mxfp4_gemm_preshuffle_b_impl(
     constraints += [tkw.Assumption(K > BLOCK_K * 6)]
 
     if wide_stores:
+        m_symbol = tkl.sym.m_symbol
+        n_symbol = tkl.sym.n_symbol
         constraints += [tkw.IteratorBindings({m_symbol: M, n_symbol: N})]
         constraints += [tkw.Assumption(Eq(M % BLOCK_M, 0))]
         constraints += [tkw.Assumption(Eq(N % BLOCK_N, 0))]
 
@@ -521,6 +521,21 @@ def WaveASM_V_READFIRSTLANE_B32 : WAVEASMOp<"v_readfirstlane_b32", [Pure]> {
   let assemblyFormat = "$src attr-dict `:` type($src) `->` type($dst)";
 }
 
+// Lane swap operations (VGPR <-> VGPR across lanes)
+def WaveASM_V_PERMLANE16_SWAP_B32 : WAVEASMOp<"v_permlane16_swap_b32", [Pure]> {
+  let summary = "Swap VGPR values between lanes 16 apart";
+  let description = [{
+    Exchanges a 32-bit value between paired lanes that are 16 positions apart.
+    Lane i swaps with lane i^16 within each 32-lane half-wave.
+    The hardware writes the swapped value to dst AND clobbers src.
+    The handler must ensure the original source value is preserved in a
+    separate register before invoking this instruction.
+  }];
+  let arguments = (ins WaveASM_AnyVGPR:$src);
+  let results = (outs WaveASM_AnyVGPR:$dst);
+  let assemblyFormat = "$src attr-dict `:` type($src) `->` type($dst)";
+}
+
 // Bit operations
 def WaveASM_V_NOT_B32 : VALUUnaryOp<"v_not_b32">;
 def WaveASM_V_NOT_B64 : VALUUnaryOp<"v_not_b64">;
 
@@ -52,8 +52,24 @@ class ValueMapper {
     return valueMap.contains(mlirValue);
   }
 
+  /// Map (structVal, index) -> elemVal for llvm.extractvalue; overwrites.
+  void setExtraMapping(mlir::Value structVal, int64_t index,
+                       mlir::Value elemVal) {
+    extraMap[{structVal, index}] = elemVal;
+  }
+
+  /// Get the element mapped for (structVal, index); std::nullopt if unset.
+  std::optional<mlir::Value> getExtraMapping(mlir::Value structVal,
+                                             int64_t index) const {
+    auto it = extraMap.find({structVal, index});
+    if (it != extraMap.end())
+      return it->second;
+    return std::nullopt;
+  }
+
 private:
   llvm::DenseMap<mlir::Value, mlir::Value> valueMap;
+  llvm::DenseMap<std::pair<mlir::Value, int64_t>, mlir::Value> extraMap;
 };
 
 //===----------------------------------------------------------------------===//
 
@@ -972,6 +972,71 @@ std::optional<std::string> KernelGenerator::generateOp(Operation *op) {
             return formatter.format("v_cvt_pk_bf16_f32", operands);
           })
 
+      // V_PERMLANE16_SWAP_B32: swap lanes 16 apart.
+      // The hardware clobbers BOTH dst and src. When the resolved register
+      // names differ, the instruction is emitted directly; when they are equal
+      // (dst==src), the swap is routed through two scratch VGPRs (see below).
+      .Case<V_PERMLANE16_SWAP_B32>(
+          [&](V_PERMLANE16_SWAP_B32 swapOp) -> std::optional<std::string> {
+            std::string dst = resolveValue(swapOp.getDst());
+            std::string src = resolveValue(swapOp.getSrc());
+            if (dst != src) {
+              llvm::SmallVector<std::string> operands = {dst, src};
+              return formatter.format("v_permlane16_swap_b32", operands);
+            }
+            // dst==src: uses kScratchVGPR and kScratchVGPR - 1 as temporaries.
+            std::string scratch0 = formatVGPRRange(kScratchVGPR, 1);
+            std::string scratch1 = formatVGPRRange(kScratchVGPR - 1, 1);
+            peakVGPRs = std::max(peakVGPRs, kScratchVGPR + 1);
+            invalidateScratchCache();
+            // 1. Save original src to scratch0; 2. copy src to scratch1.
+            // 3. Swap dst with scratch1 (scratch1 is clobbered).
+            // 4. Move scratch0 back into src. NOTE(review): src == dst here, so
+            //    this also overwrites the value step 3 left in dst -- confirm.
+            return "  v_mov_b32 " + scratch0 + ", " + src + "\n" +
+                   "  v_mov_b32 " + scratch1 + ", " + src + "\n" +
+                   "  v_permlane16_swap_b32 " + dst + ", " + scratch1 + "\n" +
+                   "  v_mov_b32 " + src + ", " + scratch0;
+          })
+
+      // V_ACCVGPR_READ_B32: unroll multi-register reads, one op per register
+      .Case<V_ACCVGPR_READ_B32>(
+          [&](V_ACCVGPR_READ_B32 readOp) -> std::optional<std::string> {
+            Value dst = readOp.getDst();
+            Value src = readOp.getSrc();
+            int64_t dstSize = getRegSize(dst.getType());
+            int64_t srcSize = getRegSize(src.getType());
+            int64_t size = std::max(dstSize, srcSize);
+            if (size <= 1) {
+              return emitDefaultFormat(readOp, "v_accvgpr_read_b32");
+            }
+            int64_t dstBase = -1, srcBase = -1;
+            if (auto pv = dyn_cast<PVRegType>(dst.getType()))
+              dstBase = pv.getIndex();
+            else if (isVirtualRegType(dst.getType()))
+              dstBase = mapping.getPhysReg(dst);
+            if (auto pa = dyn_cast<PARegType>(src.getType()))
+              srcBase = pa.getIndex();
+            else if (isVirtualRegType(src.getType()))
+              srcBase = mapping.getPhysReg(src);
+            if (dstBase < 0 || srcBase < 0) {
+              llvm::errs() << "V_ACCVGPR_READ_B32 fallback: dstBase=" << dstBase
+                           << " srcBase=" << srcBase << " dstSize=" << dstSize
+                           << " srcSize=" << srcSize
+                           << " dstType=" << dst.getType()
+                           << " srcType=" << src.getType() << "\n";
+              return emitDefaultFormat(readOp, "v_accvgpr_read_b32");
+            }
+            std::string lines;
+            for (int64_t i = 0; i < size; ++i) {
+              if (i > 0)
+                lines += "\n";
+              lines += "  v_accvgpr_read_b32 v" + std::to_string(dstBase + i) +
+                       ", a" + std::to_string(srcBase + i);
+            }
+            return lines;
+          })
+
       // Carry ops: on GFX9, carry-out is implicit VCC.
       // v_add_co_u32:  dst, vcc, src0, src1
       // v_addc_co_u32: dst, vcc, src0, src1, vcc  (carry-in).