Skip to content

Commit b2eb91a

Browse files
[Codegen][AMDGPU] Add UseGlobalTransposeLoad promotion attr and operand promotion
Introduces IREEGPU_UseGlobalTransposeLoad, a promotion attribute for RDNA4 (gfx1200+) that drives matmul operands to be loaded via global_load_tr. The attribute implements both IREEGPU_PromotionAttr and IREECodegen_LoweringConfigAttrInterface, with tiling sizes [N=8, K=1] (vectorSize x 1) derived from the element type. A new transposePromoteOperand path in GPUPromoteMatmulOperands creates a linalg.generic copy with K-inner thread mapping: - input map: reads B[K, N] (K-outer, N-inner in memory) - output map: writes alloc[N, K] (N-outer, K-inner, for contiguous K writes) This K-inner tiling aligns with global_load_tr's 8x8 wave-level transpose semantics: 8 consecutive lanes each read 8 contiguous N-elements, and the hardware transposes so each lane holds a K-direction slice. The copy op is tagged with UseGlobalTransposeLoadAttr as its lowering config so the ROCDLLoadToTransposeLoad pass (PR 2) can recognise and lower it. amdgpu.fat_raw_buffer_cast is stripped before creating the copy because global_load_tr requires a flat global pointer, not a fat buffer descriptor. Part of: #24454 Co-authored-by: Claude Sonnet 4 (1M context) <noreply@anthropic.com> Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com>
1 parent 0135652 commit b2eb91a

8 files changed

Lines changed: 319 additions & 2 deletions

File tree

compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp

Lines changed: 122 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,12 @@
1515
#include "iree/compiler/Codegen/Utils/LinalgOpInfo.h"
1616
#include "iree/compiler/Codegen/Utils/Utils.h"
1717
#include "llvm/ADT/STLExtras.h"
18+
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
1819
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
1920
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
2021
#include "mlir/Dialect/Linalg/IR/Linalg.h"
2122
#include "mlir/Dialect/Tensor/IR/Tensor.h"
23+
#include "mlir/IR/AffineMap.h"
2224
#include "mlir/IR/Builders.h"
2325
#include "mlir/IR/BuiltinTypes.h"
2426
#include "mlir/Interfaces/FunctionInterfaces.h"
@@ -118,6 +120,119 @@ void promoteResult(OpBuilder &builder, Operation *op, Value valToMakeShared) {
118120
});
119121
}
120122

123+
/// Traces through tensor.extract_slice ops to find the
124+
/// iree_codegen.load_from_buffer feeding this value. Returns the slice chain
125+
/// (outermost first) and the load op, or failure if the pattern isn't matched.
126+
static FailureOr<std::pair<SmallVector<tensor::ExtractSliceOp>,
127+
IREE::Codegen::LoadFromBufferOp>>
128+
findLoadFromBuffer(Value v) {
129+
SmallVector<tensor::ExtractSliceOp> slices;
130+
while (auto sliceOp = v.getDefiningOp<tensor::ExtractSliceOp>()) {
131+
slices.push_back(sliceOp);
132+
v = sliceOp.getSource();
133+
}
134+
auto loadOp = v.getDefiningOp<IREE::Codegen::LoadFromBufferOp>();
135+
if (!loadOp) {
136+
return failure();
137+
}
138+
return std::make_pair(slices, loadOp);
139+
}
140+
141+
/// Promotes an operand for global_load_tr by copying to shared memory.
142+
/// Strips fat_raw_buffer so the copy reads from a flat global pointer
143+
/// (required by global_load_tr), then creates a linalg.generic that
144+
/// iterates (K-outer, N-inner) so vectorization produces vector<1x8> reads
145+
/// — 8 contiguous N-direction elements per lane. This is the access pattern
146+
/// that global_load_tr_b128 requires.
147+
/// The matmul's indexing map is updated to read from the transposed [N,K]
148+
/// shared memory layout.
149+
Value transposePromoteOperand(OpBuilder &builder, Operation *op,
150+
unsigned index) {
151+
OpOperand &operand = op->getOpOperand(index);
152+
Location loc = op->getLoc();
153+
154+
// Strip fat_raw_buffer if present so global_load_tr can use a flat pointer.
155+
Value sourceValue = operand.get();
156+
auto maybeLoad = findLoadFromBuffer(sourceValue);
157+
if (succeeded(maybeLoad)) {
158+
auto &[slices, loadOp] = *maybeLoad;
159+
if (auto castOp =
160+
loadOp.getBuffer().getDefiningOp<amdgpu::FatRawBufferCastOp>()) {
161+
OpBuilder::InsertionGuard g(builder);
162+
builder.setInsertionPointAfter(loadOp);
163+
auto flatMemrefType = cast<MemRefType>(castOp.getSource().getType());
164+
auto flatTensorType = RankedTensorType::get(
165+
flatMemrefType.getShape(), flatMemrefType.getElementType());
166+
Value flatLoad = IREE::Codegen::LoadFromBufferOp::create(
167+
builder, loadOp.getLoc(), flatTensorType, castOp.getSource());
168+
sourceValue = flatLoad;
169+
for (auto sliceOp : llvm::reverse(slices)) {
170+
builder.setInsertionPointAfter(sliceOp);
171+
sourceValue = tensor::ExtractSliceOp::create(
172+
builder, sliceOp.getLoc(), sliceOp.getResultType(), sourceValue,
173+
sliceOp.getMixedOffsets(), sliceOp.getMixedSizes(),
174+
sliceOp.getMixedStrides());
175+
}
176+
}
177+
}
178+
179+
auto tensorType = cast<RankedTensorType>(sourceValue.getType());
180+
MLIRContext *ctx = op->getContext();
181+
182+
// Create the transposed output buffer [N, K].
183+
SmallVector<OpFoldResult> mixedSizes =
184+
tensor::getMixedSizes(builder, loc, sourceValue);
185+
SmallVector<OpFoldResult> transposedSizes(mixedSizes.rbegin(),
186+
mixedSizes.rend());
187+
Value empty = tensor::EmptyOp::create(builder, loc, transposedSizes,
188+
tensorType.getElementType());
189+
190+
// linalg.generic with (d0=K outer, d1=N inner):
191+
// input map: (d0, d1) -> (d0, d1) reads src[k, n]
192+
// output map: (d0, d1) -> (d1, d0) writes dst[n, k]
193+
// With N as the inner (vectorized) dimension, each thread reads
194+
// vector<1x8> (8 contiguous N elements at fixed K) — the correct
195+
// access pattern for global_load_tr_b128.
196+
// Loop iteration order: (d0=N outer, d1=K inner).
197+
// With K as the inner (fast-varying per-lane) dimension,
198+
// UseGlobalTransposeLoadAttr's tiling [N=vectorSize, K=1] maps K to
199+
// linear_dim_0 (fast thread dim): 8 consecutive lanes get 8 consecutive
200+
// K rows, which is the correct wave-level setup for global_load_tr.
201+
// Each lane reads vector<1x8> (8 contiguous N from its K row). The tag
202+
// UseGlobalTransposeLoadAttr drives the thread tiling level sizes.
203+
// input map (d0=N, d1=K) -> (d1, d0) reads B[K, N]
204+
// output map (d0=N, d1=K) -> (d0, d1) writes alloc[N, K]
205+
AffineExpr d0 = builder.getAffineDimExpr(0); // N (outer)
206+
AffineExpr d1 = builder.getAffineDimExpr(1); // K (inner)
207+
AffineMap inputMap = AffineMap::get(2, 0, {d1, d0}, ctx);
208+
AffineMap outputMap = AffineMap::get(2, 0, {d0, d1}, ctx);
209+
SmallVector<utils::IteratorType> iterTypes(2, utils::IteratorType::parallel);
210+
211+
auto copyOp = linalg::GenericOp::create(
212+
builder, loc, empty.getType(), sourceValue, empty,
213+
ArrayRef<AffineMap>{inputMap, outputMap}, iterTypes,
214+
[](OpBuilder &b, Location l, ValueRange args) {
215+
linalg::YieldOp::create(b, l, args[0]);
216+
});
217+
// Use UseGlobalTransposeLoadAttr as the lowering config so the tiling pass
218+
// produces K-inner thread assignment via getStaticTilingLevelSizes.
219+
setLoweringConfig(copyOp, IREE::GPU::UseGlobalTransposeLoadAttr::get(ctx));
220+
221+
// Update the matmul's indexing map for this operand by reversing its
222+
// results to reflect the [N, K] shared memory layout.
223+
if (auto genericOp = dyn_cast<linalg::GenericOp>(op)) {
224+
SmallVector<AffineMap> maps(genericOp.getIndexingMapsArray());
225+
AffineMap oldMap = maps[index];
226+
SmallVector<AffineExpr> results(oldMap.getResults().rbegin(),
227+
oldMap.getResults().rend());
228+
maps[index] = AffineMap::get(oldMap.getNumDims(), oldMap.getNumSymbols(),
229+
results, ctx);
230+
genericOp.setIndexingMapsAttr(builder.getAffineMapArrayAttr(maps));
231+
}
232+
233+
return copyOp.getResult(0);
234+
}
235+
121236
void promoteOperand(OpBuilder &builder, Operation *op, unsigned index,
122237
IREE::GPU::PromotionAttr promotionAttr) {
123238
auto dpsOp = dyn_cast<DestinationStyleOpInterface>(op);
@@ -133,9 +248,14 @@ void promoteOperand(OpBuilder &builder, Operation *op, unsigned index,
133248
// TODO(qedawkins): Move result promotion to attribute interface.
134249
return promoteResult(builder, op, op->getResult(index));
135250
}
136-
OpOperand &operand = op->getOpOperand(index);
137251

138-
Value replacement = promotionAttr.promoteOperand(builder, operand);
252+
Value replacement;
253+
if (isa<IREE::GPU::UseGlobalTransposeLoadAttr>(promotionAttr)) {
254+
replacement = transposePromoteOperand(builder, op, index);
255+
} else {
256+
OpOperand &operand = op->getOpOperand(index);
257+
replacement = promotionAttr.promoteOperand(builder, operand);
258+
}
139259
op->setOperand(index, replacement);
140260
}
141261

compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ iree_lit_test_suite(
5959
"gpu_pad_operands.mlir",
6060
"gpu_pipeline.mlir",
6161
"gpu_promote_matmul_operands.mlir",
62+
"gpu_promote_matmul_operands_global_transpose.mlir",
6263
"gpu_promotion_analysis.mlir",
6364
"gpu_reorder_workgroups.mlir",
6465
"gpu_reorder_workgroups_static.mlir",

compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ iree_lit_test_suite(
5454
"gpu_pad_operands.mlir"
5555
"gpu_pipeline.mlir"
5656
"gpu_promote_matmul_operands.mlir"
57+
"gpu_promote_matmul_operands_global_transpose.mlir"
5758
"gpu_promotion_analysis.mlir"
5859
"gpu_reorder_workgroups.mlir"
5960
"gpu_reorder_workgroups_static.mlir"
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
// RUN: iree-opt %s --split-input-file \
2+
// RUN: --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-promote-matmul-operands),canonicalize)" \
3+
// RUN: | FileCheck %s
4+
5+
// Verify UseGlobalTransposeLoad promotion:
6+
// - Creates a linalg.generic copy with K-inner thread mapping
7+
// (input map reads B[K,N], output map writes alloc[N,K])
8+
// - Tags the copy with #iree_gpu.use_global_transpose_load lowering config
9+
// - The matmul RHS is updated to use the promoted (transposed) buffer
10+
11+
// -----
12+
13+
#lowering_config = #iree_gpu.lowering_config<{
14+
promote_operands = [1],
15+
promotion_types = [#iree_gpu.use_global_transpose_load]}>
16+
17+
// CHECK-LABEL: func.func @transpose_promote_rhs
18+
// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<32x64xbf16>
19+
// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<64x128xbf16>
20+
func.func @transpose_promote_rhs(%lhs: tensor<32x64xbf16>,
                                 %rhs: tensor<64x128xbf16>) -> tensor<32x128xbf16> {
  %zero = arith.constant 0.0 : bf16
  %init = tensor.empty() : tensor<32x128xbf16>
  %acc = linalg.fill ins(%zero : bf16) outs(%init : tensor<32x128xbf16>) -> tensor<32x128xbf16>
  // (M, K) x (K, N) matmul; the lowering config promotes operand 1 (the RHS).
  %result = linalg.generic {
      indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
                       affine_map<(d0, d1, d2) -> (d2, d1)>,
                       affine_map<(d0, d1, d2) -> (d0, d1)>],
      iterator_types = ["parallel", "parallel", "reduction"],
      lowering_config = #lowering_config}
      ins(%lhs, %rhs : tensor<32x64xbf16>, tensor<64x128xbf16>)
      outs(%acc : tensor<32x128xbf16>) {
    ^bb0(%a: bf16, %b: bf16, %sum: bf16):
      %prod = arith.mulf %a, %b : bf16
      %next = arith.addf %sum, %prod : bf16
      linalg.yield %next : bf16
  } -> tensor<32x128xbf16>
  return %result : tensor<32x128xbf16>
}
40+
41+
// The copy linalg.generic has:
42+
// input map: (d0, d1) -> (d1, d0) reads B[K, N] (K-outer, N-inner)
43+
// output map: (d0, d1) -> (d0, d1) writes alloc[N, K] (N-outer, K-inner)
44+
// This gives K-inner thread assignment so global_load_tr's 8x8 wave transpose
45+
// aligns with the thread→lane mapping.
46+
//
47+
// The copy has the transposed output shape [N=128, K=64] and K-inner config.
48+
// CHECK: tensor.empty() : tensor<128x64xbf16>
49+
// CHECK: linalg.generic
50+
// CHECK-SAME: ins({{.*}} : tensor<64x128xbf16>)
51+
// CHECK-SAME: outs({{.*}} : tensor<128x64xbf16>)
52+
// CHECK-SAME: lowering_config = #iree_gpu.use_global_transpose_load
53+
//
54+
// The matmul's RHS input is the promoted [N=128, K=64] buffer.
55+
// CHECK: linalg.generic
56+
// CHECK-SAME: ins({{.*}}, {{.*}} : tensor<32x64xbf16>, tensor<128x64xbf16>)
57+
58+
// -----
59+
60+
// LHS promotion with UseGlobalTransposeLoad (transposedLhs case).
61+
#lowering_config_lhs = #iree_gpu.lowering_config<{
62+
promote_operands = [0],
63+
promotion_types = [#iree_gpu.use_global_transpose_load]}>
64+
65+
// CHECK-LABEL: func.func @transpose_promote_lhs
66+
// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<64x32xbf16>
67+
func.func @transpose_promote_lhs(%lhs: tensor<64x32xbf16>,
                                 %rhs: tensor<64x128xbf16>) -> tensor<32x128xbf16> {
  %cst = arith.constant 0.0 : bf16
  %empty = tensor.empty() : tensor<32x128xbf16>
  %fill = linalg.fill ins(%cst : bf16) outs(%empty : tensor<32x128xbf16>) -> tensor<32x128xbf16>
  // transposedLhs: LHS is K-outer (K, M) instead of (M, K)
  %mm = linalg.generic {
      indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d0)>,
                       affine_map<(d0, d1, d2) -> (d2, d1)>,
                       affine_map<(d0, d1, d2) -> (d0, d1)>],
      iterator_types = ["parallel", "parallel", "reduction"],
      lowering_config = #lowering_config_lhs}
      ins(%lhs, %rhs : tensor<64x32xbf16>, tensor<64x128xbf16>)
      outs(%fill : tensor<32x128xbf16>) {
    ^bb0(%in0: bf16, %in1: bf16, %out0: bf16):
      %mul = arith.mulf %in0, %in1 : bf16
      %add = arith.addf %out0, %mul : bf16
      linalg.yield %add : bf16
  } -> tensor<32x128xbf16>
  return %mm : tensor<32x128xbf16>
}

// The copy reads the K-outer LHS [K=64, M=32] and writes the transposed
// [M=32, K=64] buffer, tagged with the transpose-load lowering config.
// CHECK: tensor.empty() : tensor<32x64xbf16>
// CHECK: linalg.generic
// CHECK-SAME: ins({{.*}} : tensor<64x32xbf16>)
// CHECK-SAME: outs({{.*}} : tensor<32x64xbf16>)
// CHECK-SAME: lowering_config = #iree_gpu.use_global_transpose_load
//
// The matmul's LHS input is the promoted [M=32, K=64] buffer; the RHS is
// untouched.
// CHECK: linalg.generic
// CHECK-SAME: ins({{.*}}, {{.*}} : tensor<32x64xbf16>, tensor<64x128xbf16>)

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/DerivedConfigUtils.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,4 +221,44 @@ SmallVector<int64_t> globalLoadDMATileSizes(Operation *op) {
221221
return tileSizes;
222222
}
223223

224+
/// Returns the thread-level tile sizes `[vectorSize, 1]` for a
/// global_transpose_load copy, where `vectorSize` is the number of elements
/// that fit in the 128 bits one lane moves per global_load_tr call (8 for
/// bf16/f16, 16 for i8).
///
/// In the copy built by the promotion, d0 is the N loop (stepped by
/// `vectorSize`) and d1 is the K loop (stepped by 1).
///
/// Returns an empty vector when no sizes can be derived:
///   - `op` has no enclosing function or that function has no workgroup size,
///   - `op` is not a linalg op with exactly two loops and at least one init,
///   - the element type is not an int/float type whose bit width evenly
///     divides 128.
SmallVector<int64_t> globalTransposeLoadTileSizes(Operation *op) {
  // Bits handled by one lane per global_load_tr call.
  constexpr int64_t kBitsPerLane = 128;

  auto funcOp = op->getParentOfType<FunctionOpInterface>();
  if (!funcOp || !getWorkgroupSize(funcOp)) {
    return {};
  }

  auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
  if (!linalgOp || linalgOp.getNumLoops() != 2 ||
      linalgOp.getNumDpsInits() == 0) {
    return {};
  }

  // Read the element type from the init operand rather than the first result:
  // for tensor ops the two agree, and an op without results no longer indexes
  // out of bounds.
  Type elemType =
      getElementTypeOrSelf(linalgOp.getDpsInitOperand(0)->get().getType());
  if (!elemType.isIntOrFloat()) {
    return {};
  }
  int64_t elemBits = elemType.getIntOrFloatBitWidth();
  if (elemBits <= 0 || kBitsPerLane % elemBits != 0) {
    return {};
  }

  int64_t vectorSize = kBitsPerLane / elemBits;
  return {vectorSize, 1}; // [N_step=vectorSize, K_step=1]
}
263+
224264
} // namespace mlir::iree_compiler::IREE::GPU

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/DerivedConfigUtils.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@ namespace mlir::iree_compiler::IREE::GPU {
1515
SmallVector<int64_t> deriveThreadTileSizes(Operation *op);
1616
SmallVector<int64_t> globalLoadDMATileSizes(Operation *op);
1717

18+
/// Returns thread-level tile sizes [N=vectorSize, K=1] for the
19+
/// global_transpose_load copy. With N as the outer step and K=1 as the inner,
20+
/// the K axis maps to linear_dim_0 (fast thread dim), giving K-inner wave
21+
/// assignment. 8 consecutive lanes then get 8 consecutive K rows, which is
22+
/// required for global_load_tr's wave-level 8x8 cross-lane transpose.
23+
SmallVector<int64_t> globalTransposeLoadTileSizes(Operation *op);
24+
1825
} // namespace mlir::iree_compiler::IREE::GPU
1926

2027
namespace mlir::iree_compiler {

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3190,6 +3190,33 @@ bool UseGlobalLoadDMAAttr::hasTilingLevel(unsigned level) const {
31903190
return level == llvm::to_underlying(GPU::TilingLevel::Thread);
31913191
}
31923192

3193+
//===----------------------------------------------------------------------===//
3194+
// UseGlobalTransposeLoadAttr - LoweringConfigAttrInterface
3195+
//===----------------------------------------------------------------------===//
3196+
3197+
/// Static tile sizes for `level`. Only the thread tiling level has sizes for
/// this attribute; they come from globalTransposeLoadTileSizes. Every other
/// level yields an empty vector.
SmallVector<int64_t>
UseGlobalTransposeLoadAttr::getStaticTilingLevelSizes(unsigned level,
                                                      Operation *op) const {
  const bool isThreadLevel =
      level == llvm::to_underlying(GPU::TilingLevel::Thread);
  if (!isThreadLevel) {
    return {};
  }
  return globalTransposeLoadTileSizes(op);
}
3205+
3206+
/// Tile sizes for `level` as index OpFoldResults. Only the thread tiling level
/// has sizes for this attribute; every other level yields an empty vector.
SmallVector<OpFoldResult>
UseGlobalTransposeLoadAttr::getTilingLevelSizes(OpBuilder &b, unsigned level,
                                                Operation *op) const {
  if (level != llvm::to_underlying(GPU::TilingLevel::Thread)) {
    return {};
  }
  // Wrap the static sizes as index attributes.
  SmallVector<int64_t> threadTileSizes = globalTransposeLoadTileSizes(op);
  return getAsIndexOpFoldResult(b.getContext(), threadTileSizes);
}
3215+
3216+
bool UseGlobalTransposeLoadAttr::hasTilingLevel(unsigned level) const {
3217+
return level == llvm::to_underlying(GPU::TilingLevel::Thread);
3218+
}
3219+
31933220
//===----------------------------------------------------------------------===//
31943221
// PromoteWithCacheSwizzleAttr
31953222
//===----------------------------------------------------------------------===//

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,36 @@ def IREEGPU_UseGlobalLoadDma :
9797
let parameters = (ins);
9898
}
9999

100+
// Parameterless attribute with mnemonic `use_global_transpose_load` and an
// empty assembly format. It declares two interfaces:
//   - IREECodegen_LoweringConfigAttrInterface, overriding
//     getStaticTilingLevelSizes, getTilingLevelSizes and hasTilingLevel;
//   - IREEGPU_PromotionAttr, with no methods overridden here.
def IREEGPU_UseGlobalTransposeLoad :
101+
AttrDef<IREEGPU_Dialect, "UseGlobalTransposeLoad", [
102+
DeclareAttrInterfaceMethods<IREECodegen_LoweringConfigAttrInterface, [
103+
"getStaticTilingLevelSizes",
104+
"getTilingLevelSizes",
105+
"hasTilingLevel",
106+
]>,
107+
DeclareAttrInterfaceMethods<IREEGPU_PromotionAttr>
108+
]> {
109+
let mnemonic = "use_global_transpose_load";
110+
let summary = [{
111+
Promote an operand using the global memory transpose load instruction.
112+
}];
113+
let description = [{
114+
Promotion attribute indicating that the operand should be loaded from global
115+
memory using the `amdgpu.global_transpose_load` instruction, which loads
116+
and transposes a matrix tile in a single subgroup operation.
117+
118+
This is applicable for gfx1200+ targets when the operand's memory layout
119+
requires a transpose relative to what the MMA intrinsic expects:
120+
- LHS when `transposedLhs` (K is not the innermost dimension)
121+
- RHS when `!transposedRhs` (N is not the innermost dimension)
122+
123+
Supported element types for gfx1200+: i8, f8E5M2FNUZ, f8E4M3FNUZ,
124+
f8E5M2, f8E4M3FN, f16, bf16, i16.
125+
}];
126+
let assemblyFormat = "";
127+
let parameters = (ins);
128+
}
129+
100130
def IREEGPU_PromoteWithCacheSwizzle :
101131
AttrDef<IREEGPU_Dialect, "PromoteWithCacheSwizzle", [
102132
DeclareAttrInterfaceMethods<IREEGPU_PromotionAttr, [

0 commit comments

Comments
 (0)