ROCm
diff --git a/mlir/include/mlir/Dialect/MIGraphX/IR/MIGraphX.td b/mlir/include/mlir/Dialect/MIGraphX/IR/MIGraphX.td
Lines changed: 1 addition & 1 deletion
diff --git a/mlir/lib/Conversion/TosaToRock/TosaToRock.cpp b/mlir/lib/Conversion/TosaToRock/TosaToRock.cpp
Lines changed: 13 additions & 3 deletions
diff --git a/mlir/lib/Dialect/Rock/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Rock/Transforms/BufferizableOpInterfaceImpl.cpp
Lines changed: 3 additions & 3 deletions
diff --git a/mlir/test/Conversion/TosaToRock/tosa-to-rock-paged-attention.mlir b/mlir/test/Conversion/TosaToRock/tosa-to-rock-paged-attention.mlir
Lines changed: 4 additions & 4 deletions
diff --git a/mlir/test/Dialect/Rock/effects.mlir b/mlir/test/Dialect/Rock/effects.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/mlir/test/Dialect/Rock/gridwise-attention-prefix-causal.mlir b/mlir/test/Dialect/Rock/gridwise-attention-prefix-causal.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/mlir/test/Dialect/Rock/gridwise-gemm-input-fusion-type-change.mlir b/mlir/test/Dialect/Rock/gridwise-gemm-input-fusion-type-change.mlir
Lines changed: 1 addition & 1 deletion
@@ -233,7 +233,7 @@ def MIGraphX_DerefOp : MIGraphX_Op<"deref">,
 
     Example:
     ```mlir
-    %result = migraphx.deref %addrs {target_type = 1 : i64}
+    %result = migraphx.deref %addrs
         : <1x64x8192xui64, 524288x8192x1> to <1x64x8192xf16, 524288x8192x1>
     ```
   }];
 
@@ -3428,7 +3428,19 @@ static FailureOr<Value> matchDerefInputPattern(Value derefInput) {
   Value lhsSource = getPreBroadcastSource(lhs);
   Value rhsSource = getPreBroadcastSource(rhs);
 
-  // Helper to trace back through view ops to find the original 3D tensor
+  auto lhsType = cast<ShapedType>(lhsSource.getType());
+  auto rhsType = cast<ShapedType>(rhsSource.getType());
+
+  // Check which one has last dimension = 1 (pointers)
+  // The pointers tensor should have shape [batch, blocks, 1]
+  if (lhsType.getRank() == 3 && lhsType.getShape()[2] == 1)
+    return lhsSource;
+  if (rhsType.getRank() == 3 && rhsType.getShape()[2] == 1)
+    return rhsSource;
+
+  // If the direct check didn't find the pointers, trace back through view ops
+  // to find the original 3D tensor. This handles cases where the pointer tensor
+  // goes through reshape/slice operations.
   auto traceBackThroughViewOps = [](Value v) -> Value {
     while (Operation *defOp = v.getDefiningOp()) {
       if (!viewOps.contains(defOp->getName().getStringRef()))
@@ -3446,8 +3458,6 @@ static FailureOr<Value> matchDerefInputPattern(Value derefInput) {
   auto lhsOriginalType = cast<ShapedType>(lhsOriginal.getType());
   auto rhsOriginalType = cast<ShapedType>(rhsOriginal.getType());
 
-  // Check which one has last dimension = 1 (pointers)
-  // The pointers tensor should have shape [batch, blocks, 1]
   if (lhsOriginalType.getRank() == 3 && lhsOriginalType.getShape()[2] == 1)
     return lhsOriginal;
   if (rhsOriginalType.getRank() == 3 && rhsOriginalType.getShape()[2] == 1)
 
@@ -333,9 +333,9 @@ struct DerefOpInterface
     auto derefOp = mlir::cast<DerefOp>(op);
 
     // Get buffer for pointers operand
-    FailureOr<Value> PointersBuffer =
+    FailureOr<Value> pointersBuffer =
         getBuffer(rewriter, derefOp.getPointers(), options, state);
-    if (failed(PointersBuffer))
+    if (failed(pointersBuffer))
       return failure();
 
     // Determine the result memref type from the tensor type
@@ -353,7 +353,7 @@ struct DerefOpInterface
 
     // Create new op with memref types
     replaceOpWithNewBufferizedOp<DerefOp>(rewriter, op, resultMemRefType,
-                                          *PointersBuffer);
+                                          *pointersBuffer);
     return success();
   }
 };
 
@@ -39,15 +39,15 @@ func.func @test_paged_attention(
   %7 = tosa.mul %4, %4, %2 : (tensor<1x64x8192xi64>, tensor<1x64x8192xi64>, tensor<1xi8>) -> tensor<1x64x8192xi64>
   %8 = tosa.add %6, %7 : (tensor<1x64x8192xi64>, tensor<1x64x8192xi64>) -> tensor<1x64x8192xi64>
 
-  // CHECK: %[[KEY_DEREF:.*]] = rock.deref %{{.*}} : tensor<1x64x1xi64> -> tensor<1x64x8192xf16>
+  // CHECK: %[[VAL_DEREF:.*]] = rock.deref %{{.*}} : tensor<1x64x1xi64> -> tensor<1x64x8192xf16>
   %9 = tosa.custom %8 {domain_name = "rocmlir", implementation_attrs = "", operator_name = "deref"} : (tensor<1x64x8192xi64>) -> tensor<1x64x8192xf16>
   %extracted_slice_3 = tensor.extract_slice %expanded_0[0, 0, 0] [1, 1, 64] [1, 1, 1] : tensor<1x16x64xi64> to tensor<1x1x64xi64>
   %collapsed_4 = tensor.collapse_shape %extracted_slice_3 [[0, 1], [2]] : tensor<1x1x64xi64> into tensor<1x64xi64>
   %expanded_5 = tensor.expand_shape %collapsed_4 [[0], [1, 2]] output_shape [1, 64, 1] : tensor<1x64xi64> into tensor<1x64x1xi64>
   %10 = tosa.mul %expanded_5, %4, %2 : (tensor<1x64x1xi64>, tensor<1x64x8192xi64>, tensor<1xi8>) -> tensor<1x64x8192xi64>
   %11 = tosa.add %10, %7 : (tensor<1x64x8192xi64>, tensor<1x64x8192xi64>) -> tensor<1x64x8192xi64>
 
-  // CHECK: %[[VAL_DEREF:.*]] = rock.deref %{{.*}} : tensor<1x64x1xi64> -> tensor<1x64x8192xf16>
+  // CHECK: %[[KEY_DEREF:.*]] = rock.deref %{{.*}} : tensor<1x64x1xi64> -> tensor<1x64x8192xf16>
   %12 = tosa.custom %11 {domain_name = "rocmlir", implementation_attrs = "", operator_name = "deref"} : (tensor<1x64x8192xi64>) -> tensor<1x64x8192xf16>
   %extracted_slice_6 = tensor.extract_slice %5[0, 0, 0, 0] [1, 14, 1500, 64] [1, 1, 1, 1] : tensor<1x18x1500x64xf16> to tensor<1x14x1500x64xf16>
   %collapsed_7 = tensor.collapse_shape %9 [[0], [1, 2]] : tensor<1x64x8192xf16> into tensor<1x524288xf16>
@@ -71,8 +71,8 @@ func.func @test_paged_attention(
   %22 = tosa.mul %20, %21, %2 : (tensor<1x1x1x1xi32>, tensor<1x14x1500x4096xi32>, tensor<1xi8>) -> tensor<1x14x1500x4096xi32>
 
   // CHECK: rock.attention
-  // CHECK: keyAddresses = (%[[VAL_DEREF]] : tensor<1x64x8192xf16>)
-  // CHECK: valueAddresses = (%[[KEY_DEREF]] : tensor<1x64x8192xf16>)
+  // CHECK: keyAddresses = (%[[KEY_DEREF]] : tensor<1x64x8192xf16>)
+  // CHECK: valueAddresses = (%[[VAL_DEREF]] : tensor<1x64x8192xf16>)
 
   %23 = "tosa.const"() <{values = dense<0> : tensor<1x14x1500x4096xi32>}> : () -> tensor<1x14x1500x4096xi32>
   %24 = tosa.greater %23, %22 : (tensor<1x14x1500x4096xi32>, tensor<1x14x1500x4096xi32>) -> tensor<1x14x1500x4096xi1>
 
@@ -464,7 +464,7 @@ func.func @rock_gridwise_attn(%arg0: memref<1x384x64xf32>,
     params0 = #rock.accel_gemm_params<kpackPerBlock = 32, mPerBlock = 32, nPerBlock = 32, kpack = 1, mPerWave = 32, nPerWave = 32, mnPerXdl = 32, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2, wavesPerEU = 0, gridGroupSize = 0, forceUnroll = true>,
     params1 = #rock.accel_gemm_params<kpackPerBlock = 32, mPerBlock = 32, nPerBlock = 32, kpack = 1, mPerWave = 32, nPerWave = 32, mnPerXdl = 32, splitKFactor = 1, scheduleVersion = 1, outputSwizzle = 2, wavesPerEU = 0, gridGroupSize = 0, forceUnroll = true>,
     firstGemmIndices = array<i64: 0>,
-    operand_segment_sizes = array<i32: 1, 1, 1, 0, 0, 0, 1, 0>,
+    operand_segment_sizes = array<i32: 1, 1, 1, 0, 0, 0, 0, 0, 1, 0>,
     splitKV = 1 : i32,
     storeMethod = #rock<StoreMethod set>
   } : memref<1x64x384xf32>, memref<1x64x384xf32>, memref<1x384x64xf32>, memref<1x384x64xf32>
 
@@ -110,7 +110,7 @@ module {
       }
       memref.copy %alloc_0, %arg6 : memref<1x14x4x16xf16> to memref<1x14x4x16xf16>
       rock.yield
-    } {blockSize = 64 : i32, causal, firstGemmIndices = array<i64: 0>, gridSize = 14 : i32, operandSegmentSizes = array<i32: 1, 1, 1, 0, 0, 1, 1, 0>, params0 = #accel_gemm_params, params1 = #accel_gemm_params, prePadG0M = 16 : index, prePadG0N = 4 : index, softmaxType = f32, splitKV = 1 : i32, storeMethod = #rock<StoreMethod set>} : memref<14x64x32xf16>, memref<14x64x32xf16>, memref<14x32x64xf16>, memref<14xi32>, memref<14x32x64xf16>
+    } {blockSize = 64 : i32, causal, firstGemmIndices = array<i64: 0>, gridSize = 14 : i32, operandSegmentSizes = array<i32: 1, 1, 1, 0, 0, 1, 0, 0, 1, 0>, params0 = #accel_gemm_params, params1 = #accel_gemm_params, prePadG0M = 16 : index, prePadG0N = 4 : index, softmaxType = f32, splitKV = 1 : i32, storeMethod = #rock<StoreMethod set>} : memref<14x64x32xf16>, memref<14x64x32xf16>, memref<14x32x64xf16>, memref<14xi32>, memref<14x32x64xf16>
     memref.copy %alloc, %arg4 : memref<3584xf16> to memref<3584xf16>
     return
   }
 
@@ -102,7 +102,7 @@ module {
       }
       memref.copy %alloc_1, %arg7 : memref<1x16x1500x1500xf32> to memref<1x16x1500x1500xf32>
       rock.yield
-    } {blockSize = 64 : i32, firstGemmIndices = array<i64: 0>, gridSize = 752 : i32, operandSegmentSizes = array<i32: 1, 1, 1, 1, 0, 0, 1, 0>, params0 = #accel_gemm_params, params1 = #accel_gemm_params, prePadG0M = 1500 : index, prePadG0N = 1500 : index, softmaxType = f32, splitKV = 1 : i32, storeMethod = #rock<StoreMethod set>} : memref<16x64x1504xf16>, memref<16x64x1504xf16>, memref<16x1504x64xf32>, memref<2250000xf32>, memref<16x1504x64xf32>
+    } {blockSize = 64 : i32, firstGemmIndices = array<i64: 0>, gridSize = 752 : i32, operandSegmentSizes = array<i32: 1, 1, 1, 1, 0, 0, 0, 0, 1, 0>, params0 = #accel_gemm_params, params1 = #accel_gemm_params, prePadG0M = 1500 : index, prePadG0N = 1500 : index, softmaxType = f32, splitKV = 1 : i32, storeMethod = #rock<StoreMethod set>} : memref<16x64x1504xf16>, memref<16x64x1504xf16>, memref<16x1504x64xf32>, memref<2250000xf32>, memref<16x1504x64xf32>
     memref.copy %alloc_0, %arg4 : memref<1536000xf32> to memref<1536000xf32>
     return
   }
Original file line number	Diff line number	Diff line change
`@@ -110,7 +110,7 @@ module {`
`110`	`110`	`}`
`111`	`111`	`memref.copy %alloc_0, %arg6 : memref<1x14x4x16xf16> to memref<1x14x4x16xf16>`
`112`	`112`	`rock.yield`
`113`		`- } {blockSize = 64 : i32, causal, firstGemmIndices = array<i64: 0>, gridSize = 14 : i32, operandSegmentSizes = array<i32: 1, 1, 1, 0, 0, 1, 1, 0>, params0 = #accel_gemm_params, params1 = #accel_gemm_params, prePadG0M = 16 : index, prePadG0N = 4 : index, softmaxType = f32, splitKV = 1 : i32, storeMethod = #rock<StoreMethod set>} : memref<14x64x32xf16>, memref<14x64x32xf16>, memref<14x32x64xf16>, memref<14xi32>, memref<14x32x64xf16>`
	`113`	`+ } {blockSize = 64 : i32, causal, firstGemmIndices = array<i64: 0>, gridSize = 14 : i32, operandSegmentSizes = array<i32: 1, 1, 1, 0, 0, 1, 0, 0, 1, 0>, params0 = #accel_gemm_params, params1 = #accel_gemm_params, prePadG0M = 16 : index, prePadG0N = 4 : index, softmaxType = f32, splitKV = 1 : i32, storeMethod = #rock<StoreMethod set>} : memref<14x64x32xf16>, memref<14x64x32xf16>, memref<14x32x64xf16>, memref<14xi32>, memref<14x32x64xf16>`
`114`	`114`	`memref.copy %alloc, %arg4 : memref<3584xf16> to memref<3584xf16>`
`115`	`115`	`return`
`116`	`116`	`}`
Original file line number	Diff line number	Diff line change
`@@ -102,7 +102,7 @@ module {`
`102`	`102`	`}`
`103`	`103`	`memref.copy %alloc_1, %arg7 : memref<1x16x1500x1500xf32> to memref<1x16x1500x1500xf32>`
`104`	`104`	`rock.yield`
`105`		`- } {blockSize = 64 : i32, firstGemmIndices = array<i64: 0>, gridSize = 752 : i32, operandSegmentSizes = array<i32: 1, 1, 1, 1, 0, 0, 1, 0>, params0 = #accel_gemm_params, params1 = #accel_gemm_params, prePadG0M = 1500 : index, prePadG0N = 1500 : index, softmaxType = f32, splitKV = 1 : i32, storeMethod = #rock<StoreMethod set>} : memref<16x64x1504xf16>, memref<16x64x1504xf16>, memref<16x1504x64xf32>, memref<2250000xf32>, memref<16x1504x64xf32>`
	`105`	`+ } {blockSize = 64 : i32, firstGemmIndices = array<i64: 0>, gridSize = 752 : i32, operandSegmentSizes = array<i32: 1, 1, 1, 1, 0, 0, 0, 0, 1, 0>, params0 = #accel_gemm_params, params1 = #accel_gemm_params, prePadG0M = 1500 : index, prePadG0N = 1500 : index, softmaxType = f32, splitKV = 1 : i32, storeMethod = #rock<StoreMethod set>} : memref<16x64x1504xf16>, memref<16x64x1504xf16>, memref<16x1504x64xf32>, memref<2250000xf32>, memref<16x1504x64xf32>`
`106`	`106`	`memref.copy %alloc_0, %arg4 : memref<1536000xf32> to memref<1536000xf32>`
`107`	`107`	`return`
`108`	`108`	`}`