Clang-format

justinrosner · justinrosner · commit 79dd5dbc8b6d · 2026-03-09T21:15:58.000Z
diff --git a/mlir/include/mlir/Dialect/Rock/IR/RockAttrDefs.td b/mlir/include/mlir/Dialect/Rock/IR/RockAttrDefs.td
@@ -464,10 +464,8 @@ def Rock_GemmLoadTileDirectToLDSDefault
     : I32EnumAttrCase<"DirectToLDSDefault", 3>;
 def Rock_GemmLoadTileDirectToLDSDoubleBuffer
     : I32EnumAttrCase<"DirectToLDSDoubleBuffer", 4>;
-def Rock_GemmLoadTileGlobalReadOnly
-    : I32EnumAttrCase<"GlobalReadOnly", 5>;
-def Rock_GemmLoadTileLDSWriteFromRegs
-    : I32EnumAttrCase<"LDSWriteFromRegs", 6>;
+def Rock_GemmLoadTileGlobalReadOnly : I32EnumAttrCase<"GlobalReadOnly", 5>;
+def Rock_GemmLoadTileLDSWriteFromRegs : I32EnumAttrCase<"LDSWriteFromRegs", 6>;
 
 def Rock_GemmLoadTileType
     : Rock_I32Enum<"GemmLoadTileType", "GEMM load tile types",
diff --git a/mlir/include/mlir/Dialect/Rock/Passes.td b/mlir/include/mlir/Dialect/Rock/Passes.td
@@ -107,7 +107,10 @@ def RockRegularizePass : Pass<"rock-regularize", "::mlir::func::FuncOp"> {
 
 def RockGridwiseGemmToBlockwisePass : Pass<"rock-gridwise-gemm-to-blockwise", "::mlir::func::FuncOp"> {
   let summary = "expand gridwise gemm into blockwise copy, blockwise gemm, and threadwise copy";
-  let dependentDialects = ["rock::RockDialect", "affine::AffineDialect", "gpu::GPUDialect", "vector::VectorDialect", "memref::MemRefDialect", "linalg::LinalgDialect", "scf::SCFDialect", "amdgpu::AMDGPUDialect"];
+  let dependentDialects = ["rock::RockDialect", "affine::AffineDialect",
+                           "gpu::GPUDialect", "vector::VectorDialect",
+                           "memref::MemRefDialect", "linalg::LinalgDialect",
+                           "scf::SCFDialect", "amdgpu::AMDGPUDialect"];
 }
 
 def RockLinalgAlignPass : Pass<"rock-linalg-align", "::mlir::func::FuncOp"> {
diff --git a/mlir/lib/Dialect/Rock/Transforms/BlockwiseGemmToThreadwise.cpp b/mlir/lib/Dialect/Rock/Transforms/BlockwiseGemmToThreadwise.cpp
@@ -1418,7 +1418,7 @@ struct BlockwiseReduceRewritePattern
         // Branchless reduction: each thread reads all rTidDim partial
         // values from LDS and reduces locally in registers. This avoids
         // creating conditional branches (scf.if) that split softmax into
-        // multiple basic blocks. 
+        // multiple basic blocks.
         // Trade-off: every thread does rTidCount LDS reads (instead of
         // log2(rTidCount) conditional reads in the tree reduction). For
         // typical attention configs where rTidCount is small (e.g., 4),
@@ -1429,8 +1429,8 @@ struct BlockwiseReduceRewritePattern
           int64_t rTidCount = threadViewShape[rTidDim];
 
           // Accumulator for the full reduction.
-          auto accRegType = MemRefType::get(
-              {1}, elemType, AffineMap{}, privateMemoryAddressSpace);
+          auto accRegType = MemRefType::get({1}, elemType, AffineMap{},
+                                            privateMemoryAddressSpace);
           Value accReg = GpuAllocOp::create(rewriter, loc, accRegType);
           FillOp::create(rewriter, loc, accReg, initVal);
 
@@ -1463,8 +1463,8 @@ struct BlockwiseReduceRewritePattern
                 InBoundsStoreOp::create(rewriter, loc, ldVal, accReg,
                                         zeroConstantOp);
               } else {
-                Value accVal = InBoundsLoadOp::create(
-                    rewriter, loc, elemType, accReg, zeroConstantOp);
+                Value accVal = InBoundsLoadOp::create(rewriter, loc, elemType,
+                                                      accReg, zeroConstantOp);
                 Value reduced = createReducingOp(op, ldVal, accVal, rewriter);
                 InBoundsStoreOp::create(rewriter, loc, reduced, accReg,
                                         zeroConstantOp);
@@ -1476,8 +1476,8 @@ struct BlockwiseReduceRewritePattern
           // All threads with the same nrtid compute the same value,
           // so concurrent writes to the same location are safe.
           {
-            Value reducedVal = InBoundsLoadOp::create(
-                rewriter, loc, elemType, accReg, zeroConstantOp);
+            Value reducedVal = InBoundsLoadOp::create(rewriter, loc, elemType,
+                                                      accReg, zeroConstantOp);
             SmallVector<Value, 3> writeInits{nrtid, zeroConstantOp,
                                              zeroConstantOp};
             SmallVector<int64_t> writeBounds{1, 1, 1};
@@ -1486,8 +1486,7 @@ struct BlockwiseReduceRewritePattern
             TransformingForOp writeLoop = TransformingForOp::create(
                 rewriter, loc, ArrayRef<ValueRange>{writeInits},
                 ArrayRef<Attribute>{threadToLDSViewTrs},
-                ArrayRef<int64_t>(writeBounds),
-                ArrayRef<int64_t>(writeStrides),
+                ArrayRef<int64_t>(writeBounds), ArrayRef<int64_t>(writeStrides),
                 /*forceUnroll=*/true, /*useIndexDiffs=*/true);
             {
               PatternRewriter::InsertionGuard guard(rewriter);
diff --git a/mlir/lib/Dialect/Rock/Transforms/BlockwiseLoadTileToThreadwise.cpp b/mlir/lib/Dialect/Rock/Transforms/BlockwiseLoadTileToThreadwise.cpp
@@ -244,10 +244,8 @@ class LoweringBlockwiseLoadTileOp final
     else
       b.setInsertionPoint(op);
 
-    bool globalReadOnly =
-        loadType == GemmLoadTileType::GlobalReadOnly;
-    bool ldsWriteFromRegs =
-        loadType == GemmLoadTileType::LDSWriteFromRegs;
+    bool globalReadOnly = loadType == GemmLoadTileType::GlobalReadOnly;
+    bool ldsWriteFromRegs = loadType == GemmLoadTileType::LDSWriteFromRegs;
 
     Value loadBuffer, storeBuffer;
     if (globalReadOnly || ldsWriteFromRegs) {
@@ -258,8 +256,8 @@ class LoweringBlockwiseLoadTileOp final
              "destRegisters must be set for split-phase load types");
       loadBuffer = destRegisters;
       if (ldsWriteFromRegs) {
-        storeBuffer = gpuAlloc(b, loc, copyPerThread, elementType,
-                               AddressSpace::Private);
+        storeBuffer =
+            gpuAlloc(b, loc, copyPerThread, elementType, AddressSpace::Private);
       }
     } else if (loadType == GemmLoadTileType::BypassLDS) {
       auto privateMemoryAddressSpace = b.getAttr<gpu::AddressSpaceAttr>(
@@ -320,13 +318,13 @@ class LoweringBlockwiseLoadTileOp final
         Value wrappedSource =
             transform(b, source, maybeBufferViews->gridSubTile);
 
-        ThreadwiseReadIntoOp::create(
-            b, loc, vectorOfBoolShapedLike(loadBuffer), wrappedSource,
-            loadBuffer,
-            /*dynamicValidities=*/ValueRange{},
-            /*extraViews=*/b.getArrayAttr({}),
-            /*extraIndices=*/indices, forceUnroll, true,
-            /*ldsTransposeConfig=*/nullptr);
+        ThreadwiseReadIntoOp::create(b, loc, vectorOfBoolShapedLike(loadBuffer),
+                                     wrappedSource, loadBuffer,
+                                     /*dynamicValidities=*/ValueRange{},
+                                     /*extraViews=*/b.getArrayAttr({}),
+                                     /*extraIndices=*/indices, forceUnroll,
+                                     true,
+                                     /*ldsTransposeConfig=*/nullptr);
 
         if (!globalReadOnly && rock::isGlobalPrefetchSupported(arch)) {
           // add one to k_loop to prefetch next iteration
diff --git a/mlir/lib/Dialect/Rock/Transforms/GridwiseGemmToBlockwise.cpp b/mlir/lib/Dialect/Rock/Transforms/GridwiseGemmToBlockwise.cpp
@@ -33,8 +33,8 @@
 #include "mlir/Dialect/Rock/utility/math.h"
 #include "mlir/Dialect/Rock/utility/transformMapUtils.h"
 
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
@@ -2823,23 +2823,21 @@ struct GridwiseAttentionAccelRewritePattern
                    << "V prefetch Phase 2 hoist decision: "
                    << (hoistVPhase2 ? "HOIST" : "DEFER")
                    << " (hoistedTotal=" << hoistedTotal << ", max=" << maxLDS
-                   << ", sumWS=" << sumWSBytes
-                   << ", gemm0=" << gemm0PeakBytes
+                   << ", sumWS=" << sumWSBytes << ", gemm0=" << gemm0PeakBytes
                    << ", gemm1=" << gemm1PeakBytes << ")\n");
       }
 
       if (prefetchFirstVTile) {
         // Set up grid coordinates for the first V tile.
         gridCoordsGemm1 = layout::makeGxNGridLayout(
-            rewriter, loc, bid, zero, gemm1NBlocks, gridSize, arch,
-            numChiplets, splitKVConst);
+            rewriter, loc, bid, zero, gemm1NBlocks, gridSize, arch, numChiplets,
+            splitKVConst);
         gridCoordsGemm1.m_block = zero; // First V tile (block index 0)
 
         // Allocate a flat register buffer shared between the GlobalReadOnly
         // and LDSWriteFromRegs phases. Size must match what the lowering
         // computes: copyPerThread = (kPerBlock * dPerBlock) / blockSize.
-        int64_t vCopyPerThread =
-            (gemm1KPerBlock * gemm1MPerBlock) / blockSize;
+        int64_t vCopyPerThread = (gemm1KPerBlock * gemm1MPerBlock) / blockSize;
         vPrefetchRegs = gpuAlloc(rewriter, loc, vCopyPerThread, elemTypeV,
                                  gpu::AddressSpace::Private);
 
@@ -2851,15 +2849,15 @@ struct GridwiseAttentionAccelRewritePattern
             rewriter, loc, gemm1KPerBlock * gemm1MPerBlock, elemTypeV);
         loadAndStoreGemmInputTile(
             rewriter, loc, inV,
-            /*kIter=*/mLoopIV, tid, gridCoordsGemm1, dummyLDS,
-            vPrefetchRegs, GemmLoadTileType::GlobalReadOnly, "m", blockSize,
-            elemTypeV, elemTypeVLoad, gemm1TuningParams, featuresAttr,
-            matrixParamsV, matrixParamsKxQ);
+            /*kIter=*/mLoopIV, tid, gridCoordsGemm1, dummyLDS, vPrefetchRegs,
+            GemmLoadTileType::GlobalReadOnly, "m", blockSize, elemTypeV,
+            elemTypeVLoad, gemm1TuningParams, featuresAttr, matrixParamsV,
+            matrixParamsKxQ);
 
         // Insert a scheduling barrier to prevent the LLVM backend scheduler
         // from sinking the V global loads past the softmax computation.
-        amdgpu::SchedBarrierOp::create(
-            rewriter, loc, amdgpu::sched_barrier_opt_enum::none);
+        amdgpu::SchedBarrierOp::create(rewriter, loc,
+                                       amdgpu::sched_barrier_opt_enum::none);
       }
 
       int64_t prePadG0M = gemm0M;
@@ -3034,9 +3032,9 @@ struct GridwiseAttentionAccelRewritePattern
           loadAndStoreGemmInputTile(
               rewriter, loc, inV,
               /*kIter=*/mLoopIV, tid, gridCoordsGemm1, ldsByteBufferV,
-              vPrefetchRegs, GemmLoadTileType::LDSWriteFromRegs, "m",
-              blockSize, elemTypeV, elemTypeVLoad, gemm1TuningParams,
-              featuresAttr, matrixParamsV, matrixParamsKxQ);
+              vPrefetchRegs, GemmLoadTileType::LDSWriteFromRegs, "m", blockSize,
+              elemTypeV, elemTypeVLoad, gemm1TuningParams, featuresAttr,
+              matrixParamsV, matrixParamsKxQ);
         }
 
         // Softmax sum reduction
@@ -3074,9 +3072,9 @@ struct GridwiseAttentionAccelRewritePattern
           loadAndStoreGemmInputTile(
               rewriter, loc, inV,
               /*kIter=*/mLoopIV, tid, gridCoordsGemm1, ldsByteBufferV,
-              vPrefetchRegs, GemmLoadTileType::LDSWriteFromRegs, "m",
-              blockSize, elemTypeV, elemTypeVLoad, gemm1TuningParams,
-              featuresAttr, matrixParamsV, matrixParamsKxQ);
+              vPrefetchRegs, GemmLoadTileType::LDSWriteFromRegs, "m", blockSize,
+              elemTypeV, elemTypeVLoad, gemm1TuningParams, featuresAttr,
+              matrixParamsV, matrixParamsKxQ);
           LDSBarrierOp::create(rewriter, loc);
         }
       }
@@ -3140,9 +3138,9 @@ struct GridwiseAttentionAccelRewritePattern
         }
 
         // Helper lambda: emit GEMM1 MMA + PostProcess for a single V tile.
-        auto emitGemm1Compute =
-            [&](Value g1MBlockIdx, GemmLoadTileType vLoadType,
-                Value vRegBuf) -> LogicalResult {
+        auto emitGemm1Compute = [&](Value g1MBlockIdx,
+                                    GemmLoadTileType vLoadType,
+                                    Value vRegBuf) -> LogicalResult {
           // Emit GEMM 1 MMA.
           auto computeStage = StageOp::create(rewriter, loc, "MMA");
           {
@@ -3155,8 +3153,8 @@ struct GridwiseAttentionAccelRewritePattern
               zeroAccBuffer(rewriter, loc, matrixC);
             } else {
               if (gemm1MBlocks > 1) {
-                matrixC = createSliceOfFirstDim(rewriter, loc, matrixC,
-                                                g1MBlockIdx);
+                matrixC =
+                    createSliceOfFirstDim(rewriter, loc, matrixC, g1MBlockIdx);
               }
             }
 
@@ -3206,20 +3204,19 @@ struct GridwiseAttentionAccelRewritePattern
             auto loadTypeKxD = doBypassLDSSecondGemm
                                    ? GemmLoadTileType::BypassLDS
                                    : GemmLoadTileType::Default;
-            blockwiseGemmAccel(
-                rewriter, loc, vLoadType, loadTypeKxD, vRegBuf,
-                preAccelRegBufferQxK, matrixC, matrixParamsV, matrixParamsKxQ,
-                ldsTileBufferV, gemm1LDSBufferB,
-                /*scaleA=*/nullptr, /*scaleB=*/nullptr,
-                /*bufferScaleA=*/nullptr, /*bufferScaleB=*/nullptr,
-                featuresAttr, op.getBlockSizeAttr(), gemm1TuningParams);
+            blockwiseGemmAccel(rewriter, loc, vLoadType, loadTypeKxD, vRegBuf,
+                               preAccelRegBufferQxK, matrixC, matrixParamsV,
+                               matrixParamsKxQ, ldsTileBufferV, gemm1LDSBufferB,
+                               /*scaleA=*/nullptr, /*scaleB=*/nullptr,
+                               /*bufferScaleA=*/nullptr,
+                               /*bufferScaleB=*/nullptr, featuresAttr,
+                               op.getBlockSizeAttr(), gemm1TuningParams);
 
             rock::YieldOp::create(rewriter, loc);
           }
 
           // Emit GEMM 1 PostProcess.
-          auto postProcessStage =
-              StageOp::create(rewriter, loc, "PostProcess");
+          auto postProcessStage = StageOp::create(rewriter, loc, "PostProcess");
           {
             PatternRewriter::InsertionGuard guard(rewriter);
             rewriter.setInsertionPointToStart(
@@ -3232,8 +3229,8 @@ struct GridwiseAttentionAccelRewritePattern
             if (!op.getEnableSoftmax() && gemm1MBlocks > 1) {
               gemm1OutBufferPerG1MBlock = createSliceOfFirstDim(
                   rewriter, loc, gemm1OutBuffer, g1MBlockIdx);
-              matrixC = createSliceOfFirstDim(rewriter, loc, matrixC,
-                                              g1MBlockIdx);
+              matrixC =
+                  createSliceOfFirstDim(rewriter, loc, matrixC, g1MBlockIdx);
             }
 
             accelEmitterPtrGemm1->computeOutputConversion(
@@ -3280,11 +3277,10 @@ struct GridwiseAttentionAccelRewritePattern
           // Default load path expects rank-1, so allocate a separate buf.
           Value peeledVRegBuf = preAccelRegBufferV;
           if (doubleBuffering) {
-            auto [peeledVForLoad, peeledVBuf] =
-                createRegInterrimBufferForAccel(
-                    rewriter, loc, accelParamsGemm1.argTypeA,
-                    accelParamsGemm1.kBasePerThread,
-                    /*repeats=*/1, directToLDS);
+            auto [peeledVForLoad, peeledVBuf] = createRegInterrimBufferForAccel(
+                rewriter, loc, accelParamsGemm1.argTypeA,
+                accelParamsGemm1.kBasePerThread,
+                /*repeats=*/1, directToLDS);
             peeledVRegBuf = peeledVBuf;
           }
           if (!doBypassLDSSecondGemm)
@@ -3298,25 +3294,23 @@ struct GridwiseAttentionAccelRewritePattern
           if (gemm1MBlocks > 1) {
             LDSBarrierOp::create(rewriter, loc);
 
-            Value startG1M =
-                rewriter.createOrFold<ConstantIndexOp>(loc, 1);
+            Value startG1M = rewriter.createOrFold<ConstantIndexOp>(loc, 1);
             Value endG1MLoop =
                 rewriter.createOrFold<ConstantIndexOp>(loc, gemm1MBlocks);
             Value oneVal =
                 rewriter.createOrFold<arith::ConstantIndexOp>(loc, 1);
-            scf::ForOp g1MLoopOp = scf::ForOp::create(
-                rewriter, loc, startG1M, endG1MLoop, oneVal);
+            scf::ForOp g1MLoopOp =
+                scf::ForOp::create(rewriter, loc, startG1M, endG1MLoop, oneVal);
             // Only pipeline when >1 iteration remains; pipelining a
             // single iteration causes barrier mismatches.
             if (gemm1MBlocks > 2) {
               bool g1DoubleBuffering =
                   loadType == GemmLoadTileType::DoubleBuffer ||
                   loadType == GemmLoadTileType::DirectToLDSDoubleBuffer;
               int64_t g1InitiationInterval = g1DoubleBuffering ? 1 : 2;
-              g1MLoopOp->setAttr(
-                  PipelineAttr::getMnemonic(),
-                  rock::PipelineAttr::get(rewriter.getContext(),
-                                          g1InitiationInterval));
+              g1MLoopOp->setAttr(PipelineAttr::getMnemonic(),
+                                 rock::PipelineAttr::get(rewriter.getContext(),
+                                                         g1InitiationInterval));
             }
             {
               OpBuilder::InsertionGuard guard(rewriter);
@@ -3916,11 +3910,10 @@ void RockGridwiseGemmToBlockwisePass::runOnOperation() {
   ConversionTarget target(*ctx);
   target.addIllegalOp<rock::GridwiseGemmOp, rock::GridwiseGemmAccelOp,
                       GridwiseAttentionAccelOp>();
-  target.addLegalDialect<arith::ArithDialect, rock::RockDialect,
-                         memref::MemRefDialect, affine::AffineDialect,
-                         vector::VectorDialect, linalg::LinalgDialect,
-                         scf::SCFDialect, math::MathDialect,
-                         amdgpu::AMDGPUDialect>();
+  target.addLegalDialect<
+      arith::ArithDialect, rock::RockDialect, memref::MemRefDialect,
+      affine::AffineDialect, vector::VectorDialect, linalg::LinalgDialect,
+      scf::SCFDialect, math::MathDialect, amdgpu::AMDGPUDialect>();
   target.addLegalOp<gpu::PrintfOp>();
 
   RewritePatternSet patterns(ctx);
diff --git a/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp b/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp
@@ -22,12 +22,12 @@
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
 #include "mlir/Dialect/Rock/IR/Rock.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/Rock/Passes.h"
 #include "mlir/Dialect/Rock/Transforms/RockMultibuffer.h"
 #include "mlir/Dialect/Rock/utility/loweringUtils.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
 #include "mlir/Pass/PassManager.h"
diff --git a/mlir/lib/Dialect/Rock/Transforms/ThreadwiseGemmLowering.cpp b/mlir/lib/Dialect/Rock/Transforms/ThreadwiseGemmLowering.cpp
@@ -823,7 +823,8 @@ LogicalResult ThreadwiseReadIntoRewritePattern::matchAndRewrite(
     // may have fewer dimensions (dstRank). The last dstRank elements of the
     // domain-1 coords correspond to the dest buffer dimensions.
     int64_t dstRank = dstBufferType.getRank();
-    Block::BlockArgListType allDestCoords = loadLoop.getLowerCoords(/*domain=*/1);
+    Block::BlockArgListType allDestCoords =
+        loadLoop.getLowerCoords(/*domain=*/1);
     size_t dropCount = allDestCoords.size() - dstRank;
     SmallVector<Value> destCoords(allDestCoords.begin() + dropCount,
                                   allDestCoords.end());