iree-org
diff --git a/compiler/src/iree/compiler/Codegen/Common/VerifyPipelineConstraints.cpp b/compiler/src/iree/compiler/Codegen/Common/VerifyPipelineConstraints.cpp
Lines changed: 4 additions & 0 deletions
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/insert_smt_constraints.mlir b/compiler/src/iree/compiler/Codegen/Common/test/insert_smt_constraints.mlir
Lines changed: 2 additions & 0 deletions
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/verify_smt_constraints_e2e.mlir b/compiler/src/iree/compiler/Codegen/Common/test/verify_smt_constraints_e2e.mlir
Lines changed: 163 additions & 6 deletions
@@ -156,6 +156,10 @@ struct ConstraintEvaluator {
                     smt::IntDivOp, smt::IntModOp, smt::IntMulOp, smt::IntSubOp,
                     smt::IteOp, smt::NotOp, smt::OrOp>(
                   [&](auto op) { return eval(op); })
+              .Case<smt::DeclareFunOp>([&](smt::DeclareFunOp declOp) {
+                intValues[declOp.getResult()] = std::nullopt;
+                return success();
+              })
               .Default([](Operation *unhandled) {
                 return unhandled->emitError(
                     "unsupported op in constraint evaluator");
 
@@ -48,6 +48,8 @@ hal.executable @matmul_f32_ex {
 // CHECK:           linalg.fill
 // CHECK-NOT:       iree_codegen.smt.constraints
 // CHECK:           linalg.matmul
+// CHECK:           iree_codegen.smt.constraints target = <set = 0>, pipeline = #iree_gpu.pipeline<TileAndFuse>
+// CHECK-NEXT{LITERAL}: knobs = {mma_kind = #iree_codegen.smt.one_of_knob<"mma_idx", [#iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>]>, reduction = [0, 0, #iree_codegen.smt.int_knob<"red_2">], subgroup = [#iree_codegen.smt.int_knob<"sg_0">, #iree_codegen.smt.int_knob<"sg_1">, 0], subgroup_size = #iree_codegen.smt.int_knob<"sg_size">, workgroup = [#iree_codegen.smt.int_knob<"wg_0">, #iree_codegen.smt.int_knob<"wg_1">, 0], workgroup_size = [#iree_codegen.smt.int_knob<"wg_size_x">, #iree_codegen.smt.int_knob<"wg_size_y">, #iree_codegen.smt.int_knob<"wg_size_z">]}
 //
 // CHECK:           iree_codegen.smt.constraints target = <set = 0>, pipeline = #iree_gpu.pipeline<VectorDistribute>,
 // CHECK-NEXT{LITERAL}: knobs = {mma_kind = #iree_codegen.smt.one_of_knob<"mma_idx", [#iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>]>, reduction = [0, 0, #iree_codegen.smt.int_knob<"red_2">], subgroup_basis = [[#iree_codegen.smt.int_knob<"sg_m_cnt">, #iree_codegen.smt.int_knob<"sg_n_cnt">, 1], [0, 1, 2]], subgroup_size = #iree_codegen.smt.int_knob<"sg_size">, workgroup = [#iree_codegen.smt.int_knob<"wg_0">, #iree_codegen.smt.int_knob<"wg_1">, 0], workgroup_size = [#iree_codegen.smt.int_knob<"wg_size_x">, #iree_codegen.smt.int_knob<"wg_size_y">, #iree_codegen.smt.int_knob<"wg_size_z">]}
 
@@ -22,7 +22,7 @@
     pipeline = #iree_gpu.pipeline<VectorDistribute>
     workgroup_size = [64, 1, 1] subgroup_size = 64>
 
-func.func @matmul_e2e_generated_violation(
+func.func @matmul_e2e_generated_violation_vd(
     %lhs: tensor<128x64xf32>, %rhs: tensor<64x256xf32>)
     -> tensor<128x256xf32>
     attributes {hal.executable.target = #exec_target,
@@ -48,6 +48,47 @@ func.func @matmul_e2e_generated_violation(
 
 // -----
 
+#gpu_target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+  compute = fp32, storage = b32, subgroup = shuffle,
+  mma = [<MFMA_F32_16x16x4_F32>],
+  subgroup_size_choices = [64],
+  max_load_instruction_bits = 128,
+  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
+  max_workgroup_memory_bytes = 65536,
+  max_workgroup_counts = [2147483647, 2147483647, 2147483647]
+>>
+#exec_target = #hal.executable.target<"rocm", "rocm-hsaco-fb",
+    {iree_codegen.target_info = #gpu_target}>
+#translation = #iree_codegen.translation_info<
+    pipeline = #iree_gpu.pipeline<TileAndFuse>
+    workgroup_size = [64, 1, 1] subgroup_size = 64>
+
+// Negative matmul case for the TileAndFuse pipeline (selected through the
+// #translation alias of this split). The result is 128x256 and the
+// lowering_config requests a workgroup tile of 48 for dim 0; the diagnostics
+// annotated on the matmul below spell out the resulting 128 % 48 check.
+func.func @matmul_e2e_generated_violation_tf(
+    %lhs: tensor<128x64xf32>, %rhs: tensor<64x256xf32>)
+    -> tensor<128x256xf32>
+    attributes {hal.executable.target = #exec_target,
+                translation_info = #translation} {
+  %cst = arith.constant 0.0 : f32
+  %init = tensor.empty() : tensor<128x256xf32>
+  %fill = linalg.fill {root_op = #iree_codegen.root_op<set = 0>}
+      ins(%cst : f32) outs(%init : tensor<128x256xf32>)
+      -> tensor<128x256xf32>
+  // expected-error @below {{pipeline constraints violated}}
+  // expected-note @below {{dim_0 must be divisible by wg_0 (128 % 48 == 0)}}
+  %result = linalg.matmul {
+      lowering_config = #iree_gpu.lowering_config<{
+          workgroup = [48, 64, 0],
+          reduction = [0, 0, 16],
+          mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
+          subgroup = [1, 1, 0]}>,
+      root_op = #iree_codegen.root_op<set = 0>}
+      ins(%lhs, %rhs : tensor<128x64xf32>, tensor<64x256xf32>)
+      outs(%fill : tensor<128x256xf32>) -> tensor<128x256xf32>
+  return %result : tensor<128x256xf32>
+}
+
+// -----
+
 #gpu_target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   mma = [<MFMA_F32_16x16x4_F32>],
@@ -63,7 +104,7 @@ func.func @matmul_e2e_generated_violation(
     pipeline = #iree_gpu.pipeline<VectorDistribute>
     workgroup_size = [64, 1, 1] subgroup_size = 64>
 
-func.func @conv_e2e_generated_violation(
+func.func @conv_e2e_generated_violation_vd(
     %input: tensor<1x18x130x64xf32>, %filter: tensor<3x3x64x128xf32>)
     -> tensor<1x16x128x128xf32>
     attributes {hal.executable.target = #exec_target,
@@ -93,6 +134,50 @@ func.func @conv_e2e_generated_violation(
 
 // -----
 
+#gpu_target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+  compute = fp32, storage = b32, subgroup = shuffle,
+  mma = [<MFMA_F32_16x16x4_F32>],
+  subgroup_size_choices = [64],
+  max_load_instruction_bits = 128,
+  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
+  max_workgroup_memory_bytes = 65536,
+  max_workgroup_counts = [2147483647, 2147483647, 2147483647]
+>>
+#exec_target = #hal.executable.target<"rocm", "rocm-hsaco-fb",
+    {iree_codegen.target_info = #gpu_target}>
+#translation = #iree_codegen.translation_info<
+    pipeline = #iree_gpu.pipeline<TileAndFuse>
+    workgroup_size = [64, 1, 1] subgroup_size = 64>
+
+// Negative convolution case for the TileAndFuse pipeline (selected through
+// the #translation alias of this split). The result is 1x16x128x128 and the
+// lowering_config requests a workgroup tile of 48 at index 2; the diagnostics
+// annotated on the convolution below spell out the resulting 128 % 48 check.
+func.func @conv_e2e_generated_violation_tf(
+    %input: tensor<1x18x130x64xf32>, %filter: tensor<3x3x64x128xf32>)
+    -> tensor<1x16x128x128xf32>
+    attributes {hal.executable.target = #exec_target,
+                translation_info = #translation} {
+  %cst = arith.constant 0.0 : f32
+  %init = tensor.empty() : tensor<1x16x128x128xf32>
+  %fill = linalg.fill {root_op = #iree_codegen.root_op<set = 1>}
+      ins(%cst : f32) outs(%init : tensor<1x16x128x128xf32>)
+      -> tensor<1x16x128x128xf32>
+  // expected-error @below {{pipeline constraints violated}}
+  // expected-note @below {{dim_2 must be divisible by wg_2 (128 % 48 == 0)}}
+  %result = linalg.conv_2d_nhwc_hwcf {
+      dilations = dense<1> : tensor<2xi64>,
+      lowering_config = #iree_gpu.lowering_config<{
+          workgroup = [1, 1, 48, 64, 0, 0, 0],
+          reduction = [0, 0, 0, 0, 1, 1, 16],
+          mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
+          subgroup = [1, 1, 1, 1, 0, 0, 0]}>,
+      root_op = #iree_codegen.root_op<set = 1>,
+      strides = dense<1> : tensor<2xi64>}
+      ins(%input, %filter : tensor<1x18x130x64xf32>,
+                               tensor<3x3x64x128xf32>)
+      outs(%fill : tensor<1x16x128x128xf32>) -> tensor<1x16x128x128xf32>
+  return %result : tensor<1x16x128x128xf32>
+}
+
+// -----
+
 // Test: End-to-end constraint insertion and verification.
 // Use the same shapes as above but with divisible workgroup sizes.
 // It should pass verification and have constraints erased.
@@ -111,7 +196,7 @@ func.func @conv_e2e_generated_violation(
     pipeline = #iree_gpu.pipeline<VectorDistribute>
     workgroup_size = [64, 1, 1] subgroup_size = 64>
 
-func.func @matmul_e2e_constraints_erased(
+func.func @matmul_e2e_constraints_erased_vd(
     %lhs: tensor<128x64xf32>, %rhs: tensor<64x256xf32>)
     -> tensor<128x256xf32>
     attributes {hal.executable.target = #exec_target,
@@ -133,11 +218,11 @@ func.func @matmul_e2e_constraints_erased(
   return %result : tensor<128x256xf32>
 }
 
-// CHECK-LABEL: func.func @matmul_e2e_constraints_erased
+// CHECK-LABEL: func.func @matmul_e2e_constraints_erased_vd
 // CHECK:       linalg.matmul
 // CHECK-NOT:   iree_codegen.smt.constraints
 
-func.func @conv_e2e_constraints_erased(
+func.func @conv_e2e_constraints_erased_vd(
     %input: tensor<1x18x130x64xf32>, %filter: tensor<3x3x64x128xf32>)
     -> tensor<1x16x128x128xf32>
     attributes {hal.executable.target = #exec_target,
@@ -163,6 +248,78 @@ func.func @conv_e2e_constraints_erased(
   return %result : tensor<1x16x128x128xf32>
 }
 
-// CHECK-LABEL: func.func @conv_e2e_constraints_erased
+// CHECK-LABEL: func.func @conv_e2e_constraints_erased_vd
+// CHECK:       linalg.conv_2d_nhwc_hwcf
+// CHECK-NOT:   iree_codegen.smt.constraints
+
+// -----
+
+#gpu_target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+  compute = fp32, storage = b32, subgroup = shuffle,
+  mma = [<MFMA_F32_16x16x4_F32>],
+  subgroup_size_choices = [64],
+  max_load_instruction_bits = 128,
+  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
+  max_workgroup_memory_bytes = 65536,
+  max_workgroup_counts = [2147483647, 2147483647, 2147483647]
+>>
+#exec_target = #hal.executable.target<"rocm", "rocm-hsaco-fb",
+    {iree_codegen.target_info = #gpu_target}>
+#translation = #iree_codegen.translation_info<
+    pipeline = #iree_gpu.pipeline<TileAndFuse>
+    workgroup_size = [64, 1, 1] subgroup_size = 64>
+
+// Positive matmul case for the TileAndFuse pipeline (selected through the
+// #translation alias of this split). Workgroup tiles of 16 and 16 evenly
+// divide the 128x256 result, and no diagnostics are annotated here. The
+// FileCheck directives that follow this function require the matmul to be
+// present with no iree_codegen.smt.constraints text after it.
+func.func @matmul_e2e_constraints_erased_tf(
+    %lhs: tensor<128x64xf32>, %rhs: tensor<64x256xf32>)
+    -> tensor<128x256xf32>
+    attributes {hal.executable.target = #exec_target,
+                translation_info = #translation} {
+  %cst = arith.constant 0.0 : f32
+  %init = tensor.empty() : tensor<128x256xf32>
+  %fill = linalg.fill {root_op = #iree_codegen.root_op<set = 0>}
+      ins(%cst : f32) outs(%init : tensor<128x256xf32>)
+      -> tensor<128x256xf32>
+  %result = linalg.matmul {
+      lowering_config = #iree_gpu.lowering_config<{
+          workgroup = [16, 16, 0],
+          reduction = [0, 0, 16],
+          mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
+          subgroup = [1, 1, 0]}>,
+      root_op = #iree_codegen.root_op<set = 0>}
+      ins(%lhs, %rhs : tensor<128x64xf32>, tensor<64x256xf32>)
+      outs(%fill : tensor<128x256xf32>) -> tensor<128x256xf32>
+  return %result : tensor<128x256xf32>
+}
+
+// CHECK-LABEL: func.func @matmul_e2e_constraints_erased_tf
+// CHECK:       linalg.matmul
+// CHECK-NOT:   iree_codegen.smt.constraints
+
+// Positive convolution case for the TileAndFuse pipeline; it reuses the
+// #exec_target and #translation aliases of the matmul case in this split.
+// Workgroup tiles 1, 1, 16, 64 evenly divide the 1x16x128x128 result, and no
+// diagnostics are annotated here. The FileCheck directives that follow this
+// function require the convolution to be present with no
+// iree_codegen.smt.constraints text after it.
+// NOTE(review): the subgroup list used here, [0, 1, 0, 1, 1, 0, 0], places its
+// zeros differently from @conv_e2e_generated_violation_tf, which uses
+// [1, 1, 1, 1, 0, 0, 0] - confirm this difference is intentional.
+func.func @conv_e2e_constraints_erased_tf(
+    %input: tensor<1x18x130x64xf32>, %filter: tensor<3x3x64x128xf32>)
+    -> tensor<1x16x128x128xf32>
+    attributes {hal.executable.target = #exec_target,
+                translation_info = #translation} {
+  %cst = arith.constant 0.0 : f32
+  %init = tensor.empty() : tensor<1x16x128x128xf32>
+  %fill = linalg.fill {root_op = #iree_codegen.root_op<set = 1>}
+      ins(%cst : f32) outs(%init : tensor<1x16x128x128xf32>)
+      -> tensor<1x16x128x128xf32>
+  %result = linalg.conv_2d_nhwc_hwcf {
+      dilations = dense<1> : tensor<2xi64>,
+      lowering_config = #iree_gpu.lowering_config<{
+          workgroup = [1, 1, 16, 64, 0, 0, 0],
+          reduction = [0, 0, 0, 0, 1, 1, 16],
+          mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
+          subgroup = [0, 1, 0, 1, 1, 0, 0]}>,
+      root_op = #iree_codegen.root_op<set = 1>,
+      strides = dense<1> : tensor<2xi64>}
+      ins(%input, %filter : tensor<1x18x130x64xf32>,
+                               tensor<3x3x64x128xf32>)
+      outs(%fill : tensor<1x16x128x128xf32>) -> tensor<1x16x128x128xf32>
+  return %result : tensor<1x16x128x128xf32>
+}
+
+// CHECK-LABEL: func.func @conv_e2e_constraints_erased_tf
 // CHECK:       linalg.conv_2d_nhwc_hwcf
 // CHECK-NOT:   iree_codegen.smt.constraints