Update existing lit tests and add new tests for SALU promotion and emitter changes

panditsa · panditsa · commit dd3b163e1f2a · 2026-04-06T12:22:36.000-05:00
Fix 5 test regressions from cherry-picked SALU promotion commits:
- region-based-translation.mlir: output now in custom form (SCC condition)
- vadd-commute.mlir: v_add_u32 is VOP3-only on GFX9+, needs scratch VGPR
- buffer-ops-srd-adjust.mlir: match WaveASM IR form for SRD construction
- scf-if-agpr-else-coercion.mlir: output now in custom form (SCC condition)
- swizzle-srd-num-records.mlir: match WaveASM IR form for swizzle SRD

Add 4 new lit tests:
- salu-promotion-arith.mlir: SGPR muli/addi/cmpi use SALU instructions
- salu-select-fusion.mlir: scalar cmpi + select fuses to s_cmp + s_cselect
- agpr-inline-constant.mlir: inline constants written directly to AGPRs
- vop2-commutative-swap.mlir: VOP2 literal swapped from src1 to src0

Made-with: Cursor
Signed-off-by: Sanket Pandit <sanket.pandit@amd.com>
diff --git a/waveasm/test/Transforms/agpr-inline-constant.mlir b/waveasm/test/Transforms/agpr-inline-constant.mlir
@@ -0,0 +1,29 @@
+// RUN: waveasm-translate --waveasm-linear-scan --emit-assembly %s | FileCheck %s
+//
+// Test: inline constants written to AGPRs. An integer in the inline range
+// [-16, 64] is expected as a direct v_accvgpr_write_b32 operand, while a
+// non-inline literal is first moved into a scratch VGPR with v_mov_b32.
+
+// CHECK-LABEL: agpr_inline_test:
+
+waveasm.program @agpr_inline_test target = #waveasm.target<#waveasm.gfx942, 5> abi = #waveasm.abi<> attributes {vgprs = 32 : i64, sgprs = 16 : i64} {
+
+  // Inline constant 0 -> written directly by v_accvgpr_write_b32 (no v_mov_b32)
+  %c0 = waveasm.constant 0 : !waveasm.imm<0>
+  // CHECK: v_accvgpr_write_b32 a{{[0-9]+}}, 0
+  %a0 = waveasm.v_mov_b32 %c0 : !waveasm.imm<0> -> !waveasm.areg
+
+  // Inline constant 42 -> direct write, expected on the very next output line
+  %c42 = waveasm.constant 42 : !waveasm.imm<42>
+  // CHECK-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, 42
+  %a1 = waveasm.v_mov_b32 %c42 : !waveasm.imm<42> -> !waveasm.areg
+
+  // Non-inline literal 999 -> staged through a scratch VGPR (v15 is pinned below; presumably reserved - confirm)
+  %c999 = waveasm.constant 999 : !waveasm.imm<999>
+  // CHECK-NEXT: v_mov_b32 v15, 999
+  // CHECK-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, v15
+  %a2 = waveasm.v_mov_b32 %c999 : !waveasm.imm<999> -> !waveasm.areg
+
+  // CHECK: s_endpgm
+  waveasm.s_endpgm
+}
diff --git a/waveasm/test/Transforms/region-based-translation.mlir b/waveasm/test/Transforms/region-based-translation.mlir
@@ -4,105 +4,107 @@
 // Verifies scf.for -> waveasm.loop and scf.if -> waveasm.if with correct
 // SSA threading, iter_args, and condition patterns.
 //
-// Note: scf_if_to_wave_if currently produces a vreg condition for waveasm.if
-// (instead of scc), so the module dumps in generic form after verification.
+// With SALU promotion, arith.cmpi on scalar operands emits s_cmp (SCC result),
+// so waveasm.if gets an SCC condition and the output is in custom form.
 
 module {
   gpu.module @test_scf_translation {
 
     // --- scf.for(0, 16, 1) -> waveasm.loop with SGPR induction variable ---
-    // CHECK-LABEL: sym_name = "scf_for_to_loop"
+    // CHECK-LABEL: waveasm.program @scf_for_to_loop
     gpu.func @scf_for_to_loop() kernel {
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
       %c16 = arith.constant 16 : index
 
       // Init materialised via s_mov_b32, loop carries single sreg
-      // CHECK:      "waveasm.s_mov_b32"
-      // CHECK:      "waveasm.loop"
+      // CHECK:      waveasm.s_mov_b32
+      // CHECK:      waveasm.loop
       scf.for %i = %c0 to %c16 step %c1 {
         %i_i32 = arith.index_cast %i : index to i32
       }
       // Induction variable incremented, compared, condition terminates
-      // CHECK:      %[[NEXT:.*]], %{{.*}} = waveasm.s_add_u32 %[[IV]], %{{.*}} : !waveasm.sreg, !waveasm.imm<1> -> !waveasm.sreg, !waveasm.scc
+      // CHECK:      %[[NEXT:.*]], %{{.*}} = waveasm.s_add_u32 %{{.*}}, %{{.*}} : !waveasm.sreg, !waveasm.imm<1> -> !waveasm.sreg, !waveasm.scc
       // CHECK-NEXT: %[[CMP:.*]] = waveasm.s_cmp_lt_u32 %[[NEXT]], %{{.*}} : !waveasm.sreg, !waveasm.imm<16> -> !waveasm.scc
       // CHECK-NEXT: waveasm.condition %[[CMP]] : !waveasm.scc iter_args(%[[NEXT]]) : !waveasm.sreg
 
-      // CHECK: "waveasm.s_endpgm"
+      // CHECK: waveasm.s_endpgm
       gpu.return
     }
 
     // --- scf.for with iter_args -> waveasm.loop with two iter_args ---
-    // CHECK-LABEL: sym_name = "scf_for_with_iter_args"
+    // CHECK-LABEL: waveasm.program @scf_for_with_iter_args
     gpu.func @scf_for_with_iter_args() kernel {
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
       %c16 = arith.constant 16 : index
       %init = arith.constant 0 : i32
 
       // Two inits: sreg counter + vreg accumulator
-      // CHECK:      "waveasm.s_mov_b32"
-      // CHECK:      "waveasm.v_mov_b32"
-      // CHECK:      "waveasm.loop"
+      // CHECK:      waveasm.s_mov_b32
+      // CHECK:      waveasm.v_mov_b32
+      // CHECK:      waveasm.loop
       %result = scf.for %i = %c0 to %c16 step %c1
           iter_args(%acc = %init) -> (i32) {
         %i_i32 = arith.index_cast %i : index to i32
         %new_acc = arith.addi %acc, %i_i32 : i32
         scf.yield %new_acc : i32
       }
       // Body accumulates: vreg + sreg
-      // CHECK:      "waveasm.v_add_u32"
+      // CHECK:      waveasm.v_add_u32
       // Induction variable incremented, compared, condition with both iter_args
-      // CHECK:      %[[NEXT:.*]], %{{.*}} = waveasm.s_add_u32 %[[IV]], %{{.*}} : !waveasm.sreg, !waveasm.imm<1> -> !waveasm.sreg, !waveasm.scc
+      // CHECK:      %[[NEXT:.*]], %{{.*}} = waveasm.s_add_u32 %{{.*}}, %{{.*}} : !waveasm.sreg, !waveasm.imm<1> -> !waveasm.sreg, !waveasm.scc
       // CHECK-NEXT: %[[CMP:.*]] = waveasm.s_cmp_lt_u32 %[[NEXT]], %{{.*}} : !waveasm.sreg, !waveasm.imm<16> -> !waveasm.scc
-      // CHECK-NEXT: waveasm.condition %[[CMP]] : !waveasm.scc iter_args(%[[NEXT]], %[[NEWACC]]) : !waveasm.sreg, !waveasm.vreg
+      // CHECK-NEXT: waveasm.condition %[[CMP]] : !waveasm.scc iter_args(%[[NEXT]], %{{.*}}) : !waveasm.sreg, !waveasm.vreg
 
-      // CHECK: "waveasm.s_endpgm"
+      // CHECK: waveasm.s_endpgm
       gpu.return
     }
 
     // --- scf.if -> waveasm.if with then/else branches ---
-    // CHECK-LABEL: sym_name = "scf_if_to_wave_if"
+    // CHECK-LABEL: waveasm.program @scf_if_to_wave_if
     gpu.func @scf_if_to_wave_if() kernel {
       %arg0 = arith.constant 5 : i32
       %arg1 = arith.constant 3 : i32
       %c10 = arith.constant 10 : i32
       %cond_i32 = arith.cmpi slt, %arg0, %c10 : i32
       %cond_ext = arith.extui %cond_i32 : i1 to i32
 
+      // SALU promotion: scalar cmpi produces SCC directly
+      // CHECK:      waveasm.s_cmp_lt_i32
       // CHECK:      %{{.*}} = waveasm.if %{{.*}} : !waveasm.scc -> !waveasm.vreg {
       %result = scf.if %cond_i32 -> i32 {
-        // CHECK:      "waveasm.v_add_u32"
+        // CHECK:      waveasm.v_add_u32
         %sum = arith.addi %arg0, %arg1 : i32
-        // CHECK:      "waveasm.yield"
+        // CHECK:      waveasm.yield
         scf.yield %sum : i32
       } else {
-        // CHECK:      "waveasm.v_sub_u32"
+        // CHECK:      waveasm.v_sub_u32
         %diff = arith.subi %arg0, %arg1 : i32
-        // CHECK:      "waveasm.yield"
+        // CHECK:      waveasm.yield
         scf.yield %diff : i32
       }
 
-      // CHECK: "waveasm.s_endpgm"
+      // CHECK: waveasm.s_endpgm
       gpu.return
     }
 
     // --- Nested scf.for -> nested waveasm.loop ---
-    // CHECK-LABEL: sym_name = "nested_scf_loops"
+    // CHECK-LABEL: waveasm.program @nested_scf_loops
     gpu.func @nested_scf_loops() kernel {
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
       %c4 = arith.constant 4 : index
       %c8 = arith.constant 8 : index
 
       // Outer loop: sreg counter
-      // CHECK:      "waveasm.loop"
+      // CHECK:      waveasm.loop
       scf.for %i = %c0 to %c4 step %c1 {
         // Inner loop: sreg counter
-        // CHECK:      "waveasm.loop"
+        // CHECK:      waveasm.loop
         scf.for %j = %c0 to %c8 step %c1 {
           // Body uses both outer and inner IVs
-          // CHECK:      "waveasm.s_add_u32"
+          // CHECK:      waveasm.s_add_u32
           %sum = arith.addi %i, %j : index
         }
         // Inner condition
@@ -111,7 +113,7 @@ module {
       // Outer condition
       // CHECK:      waveasm.condition %{{.*}} : !waveasm.scc iter_args(%{{.*}}) : !waveasm.sreg
 
-      // CHECK: "waveasm.s_endpgm"
+      // CHECK: waveasm.s_endpgm
       gpu.return
     }
   }
diff --git a/waveasm/test/Transforms/vadd-commute.mlir b/waveasm/test/Transforms/vadd-commute.mlir
@@ -1,19 +1,21 @@
 // RUN: waveasm-translate --waveasm-linear-scan --emit-assembly %s | FileCheck %s
 //
-// Test: v_add_u32 commutes non-inline literal from src1 to src0
+// Test: v_add_u32 is VOP3-only on GFX9+, so non-inline literals must be
+// materialized into a scratch VGPR (no VOP2 commutation available).
+// Inline constants work directly.
 
 // CHECK-LABEL: vadd_commute_test:
 
 waveasm.program @vadd_commute_test target = #waveasm.target<#waveasm.gfx942, 5> abi = #waveasm.abi<> {
   %v0 = waveasm.precolored.vreg 0 : !waveasm.pvreg<0>
 
-  // Non-inline literal in src1 should be commuted to src0
+  // Non-inline literal 256: materialized into scratch VGPR (v_add_u32 is VOP3)
   %c256 = waveasm.constant 256 : !waveasm.imm<256>
-  // CHECK-NOT: v_mov_b32
-  // CHECK: v_add_u32 v{{[0-9]+}}, 256, v0
+  // CHECK: v_mov_b32 v15, 256
+  // CHECK: v_add_u32 v{{[0-9]+}}, v0, v15
   %r1 = waveasm.v_add_u32 %v0, %c256 : !waveasm.pvreg<0>, !waveasm.imm<256> -> !waveasm.vreg
 
-  // Inline constant should work without commutation
+  // Inline constant should work without materialization
   %c1 = waveasm.constant 1 : !waveasm.imm<1>
   // CHECK: v_add_u32 v{{[0-9]+}}, v{{[0-9]+}}, 1
   %r2 = waveasm.v_add_u32 %r1, %c1 : !waveasm.vreg, !waveasm.imm<1> -> !waveasm.vreg
diff --git a/waveasm/test/Transforms/vop2-commutative-swap.mlir b/waveasm/test/Transforms/vop2-commutative-swap.mlir
@@ -0,0 +1,39 @@
+// RUN: waveasm-translate --waveasm-linear-scan --emit-assembly %s | FileCheck %s
+//
+// Test: VOP2 commutative literal swap. For commutative VOP2 ops (v_and_b32,
+// v_or_b32, v_xor_b32), a non-inline literal in src1 is swapped into src0,
+// avoiding scratch VGPR materialization. Non-commutative ops are not swapped;
+// the v_lshlrev_b32 case only covers a literal that is already in src0.
+
+// CHECK-LABEL: vop2_commute_swap_test:
+
+waveasm.program @vop2_commute_swap_test target = #waveasm.target<#waveasm.gfx942, 5> abi = #waveasm.abi<> {
+  %v0 = waveasm.precolored.vreg 0 : !waveasm.pvreg<0>
+
+  // v_and_b32 with literal 4096 in src1: expected in src0 of the emitted op, no v_mov_b32
+  %c4096 = waveasm.constant 4096 : !waveasm.imm<4096>
+  // CHECK-NOT: v_mov_b32
+  // CHECK: v_and_b32 v{{[0-9]+}}, 4096, v0
+  %r1 = waveasm.v_and_b32 %v0, %c4096 : !waveasm.pvreg<0>, !waveasm.imm<4096> -> !waveasm.vreg
+
+  // v_or_b32 with literal 256 in src1: expected in src0 of the emitted op, no v_mov_b32
+  %c256 = waveasm.constant 256 : !waveasm.imm<256>
+  // CHECK-NOT: v_mov_b32
+  // CHECK: v_or_b32 v{{[0-9]+}}, 256, v0
+  %r2 = waveasm.v_or_b32 %v0, %c256 : !waveasm.pvreg<0>, !waveasm.imm<256> -> !waveasm.vreg
+
+  // v_xor_b32 with literal 128 in src1: expected in src0 of the emitted op, no v_mov_b32
+  %c128 = waveasm.constant 128 : !waveasm.imm<128>
+  // CHECK-NOT: v_mov_b32
+  // CHECK: v_xor_b32 v{{[0-9]+}}, 128, v0
+  %r3 = waveasm.v_xor_b32 %v0, %c128 : !waveasm.pvreg<0>, !waveasm.imm<128> -> !waveasm.vreg
+
+  // v_lshlrev_b32 with literal 200 already in src0: operand order kept, no v_mov_b32
+  %c200 = waveasm.constant 200 : !waveasm.imm<200>
+  // CHECK-NOT: v_mov_b32
+  // CHECK: v_lshlrev_b32 v{{[0-9]+}}, 200, v0
+  %r4 = waveasm.v_lshlrev_b32 %c200, %v0 : !waveasm.imm<200>, !waveasm.pvreg<0> -> !waveasm.vreg
+
+  // CHECK: s_endpgm
+  waveasm.s_endpgm
+}
diff --git a/waveasm/test/Translate/buffer-ops-srd-adjust.mlir b/waveasm/test/Translate/buffer-ops-srd-adjust.mlir
@@ -38,17 +38,14 @@ func.func @buffer_ops_test(%arg0: memref<f16>, %arg1: memref<f32>) {
         to memref<?xf16, #amdgpu.address_space<fat_raw_buffer>>
 
   // The load SRD should be adjusted with the workgroup offset via SALU:
-  //   s_mov_b64 (copy base), s_mov_b32 (wg offset already in SGPR),
   //   s_mul_hi_i32 + s_mul_i32 (signed 64-bit byte offset),
   //   s_add_u32 + s_addc_u32 (adjust base),
-  //   s_mov_b32 (num_records, element-aligned sentinel-safe max).
-  // CHECK: s_mov_b64 s[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}]
+  //   s_mov_b32 (num_records), s_mov_b32 (stride/swizzle flags).
   // CHECK: waveasm.s_mul_hi_i32
   // CHECK: waveasm.s_mul_i32
   // CHECK: waveasm.s_add_u32
   // CHECK: waveasm.s_addc_u32
-  // CHECK: s_mov_b32 s{{[0-9]+}}, 0x7FFFFFF
-  // CHECK: s_mov_b32 s{{[0-9]+}}, 0x20000
+  // CHECK: waveasm.pack
   // CHECK: waveasm.buffer_load_dwordx2
   %loaded = vector.load %buf0[%th_offset]
       : memref<?xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf16>
@@ -72,13 +69,11 @@ func.func @buffer_ops_test(%arg0: memref<f16>, %arg1: memref<f32>) {
   %ext = arith.extf %elem : vector<1xf16> to vector<1xf32>
 
   // The store SRD should also be adjusted, with sentinel-safe max num_records.
-  // CHECK: s_mov_b64 s[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}]
   // CHECK: waveasm.s_mul_hi_i32
   // CHECK: waveasm.s_mul_i32
   // CHECK: waveasm.s_add_u32
   // CHECK: waveasm.s_addc_u32
-  // CHECK: s_mov_b32 s{{[0-9]+}}, 0x7FFFFFF
-  // CHECK: s_mov_b32 s{{[0-9]+}}, 0x20000
+  // CHECK: waveasm.pack
   // CHECK: waveasm.buffer_store_dword
   vector.store %ext, %buf1[%thread_id]
       : memref<?xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<1xf32>
diff --git a/waveasm/test/Translate/salu-promotion-arith.mlir b/waveasm/test/Translate/salu-promotion-arith.mlir
@@ -0,0 +1,42 @@
+// RUN: waveasm-translate %s 2>&1 | FileCheck %s
+//
+// Test: SALU promotion of scalar arithmetic. When the operands of an arith op
+// are scalar (SGPRs such as workgroup ids, or immediates), the auto-select
+// emit helpers route through SALU instructions instead of VALU.
+
+module {
+  gpu.module @test_salu_promotion {
+
+    // CHECK-LABEL: waveasm.program @salu_mul_add
+    gpu.func @salu_mul_add() kernel {
+      %wg_x = gpu.block_id x upper_bound 4
+      %wg_y = gpu.block_id y upper_bound 4
+      %c128 = arith.constant 128 : index
+
+      // SGPR (block id x) * immediate 128 -> s_mul_i32 with an imm<128> operand
+      // CHECK: waveasm.s_mul_i32 %{{.*}}, %{{.*}} : !waveasm.sreg, !waveasm.imm<128> -> !waveasm.sreg
+      %prod = arith.muli %wg_x, %c128 : index
+
+      // SGPR product + SGPR (block id y) -> s_add_u32, which also yields an SCC result
+      // CHECK: waveasm.s_add_u32 %{{.*}}, %{{.*}} : !waveasm.sreg, !waveasm.sreg -> !waveasm.sreg, !waveasm.scc
+      %sum = arith.addi %prod, %wg_y : index
+
+      // CHECK: waveasm.s_endpgm
+      gpu.return
+    }
+
+    // CHECK-LABEL: waveasm.program @salu_cmpi
+    gpu.func @salu_cmpi() kernel {
+      %wg_x = gpu.block_id x upper_bound 16
+      %c10 = arith.constant 10 : index
+
+      // Scalar slt cmpi -> s_cmp_lt_i32 (SCC result). The immediate 10 is first moved to an SGPR.
+      // CHECK: waveasm.s_mov_b32 %{{.*}} : !waveasm.imm<10> -> !waveasm.sreg
+      // CHECK: waveasm.s_cmp_lt_i32 %{{.*}}, %{{.*}} : !waveasm.sreg, !waveasm.sreg -> !waveasm.scc
+      %cmp = arith.cmpi slt, %wg_x, %c10 : index
+
+      // CHECK: waveasm.s_endpgm
+      gpu.return
+    }
+  }
+}
diff --git a/waveasm/test/Translate/salu-select-fusion.mlir b/waveasm/test/Translate/salu-select-fusion.mlir
@@ -0,0 +1,33 @@
+// RUN: waveasm-translate %s 2>&1 | FileCheck %s
+//
+// Test: Scalar cmpi + scalar select fusion into s_cmp + s_cselect_b32.
+// When both comparison operands are scalar and the select's true/false values
+// are also scalar, the backend fuses the pair into a single s_cmp + s_cselect
+// sequence, avoiding the VALU v_cmp + v_cndmask path.
+
+module {
+  gpu.module @test_select_fusion {
+
+    // CHECK-LABEL: waveasm.program @cmpi_select_scalar_fusion
+    gpu.func @cmpi_select_scalar_fusion() kernel {
+      %wg_x = gpu.block_id x upper_bound 16
+      %c10 = arith.constant 10 : index
+      %c100 = arith.constant 100 : index
+      %c200 = arith.constant 200 : index
+
+      // Scalar cmpi + scalar select -> s_cmp_lt_i32 + s_cselect_b32.
+      // A negative check only guards the span between its neighbouring positive
+      // matches, so the VALU forms are rejected before, between and after them.
+      // CHECK-NOT: waveasm.v_{{cmp|cndmask}}
+      // CHECK: waveasm.s_cmp_lt_i32
+      // CHECK-NOT: waveasm.v_{{cmp|cndmask}}
+      // CHECK: waveasm.s_cselect_b32
+      // CHECK-NOT: waveasm.v_{{cmp|cndmask}}
+      %cmp = arith.cmpi slt, %wg_x, %c10 : index
+      %sel = arith.select %cmp, %c100, %c200 : index
+
+      // CHECK: waveasm.s_endpgm
+      gpu.return
+    }
+  }
+}
diff --git a/waveasm/test/Translate/scf-if-agpr-else-coercion.mlir b/waveasm/test/Translate/scf-if-agpr-else-coercion.mlir
@@ -9,14 +9,13 @@
 // values (immediates).  The backend must coerce the else-yield immediates
 // into register types so that both branches yield type-compatible values.
 //
-// Note: the translator currently produces a vreg condition for waveasm.if
-// (instead of scc), so the output is dumped in generic form after
-// verification. The CHECK patterns below match generic form.
+// With SALU promotion, scalar cmpi produces SCC directly, so waveasm.if
+// gets an SCC condition and the output is in custom form.
 
 module {
   gpu.module @test_if_else_coercion {
 
-    // CHECK-LABEL: sym_name = "if_else_coercion"
+    // CHECK-LABEL: waveasm.program @if_else_coercion
     gpu.func @if_else_coercion() kernel {
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
@@ -28,20 +27,21 @@ module {
       // Then branch: compute a VGPR value (addi -> v_add_u32)
       // Else branch: yield a constant zero (-> immediate coerced to vreg)
       //
-      // CHECK:      "waveasm.if"
-      // CHECK:        "waveasm.v_add_u32"
-      // CHECK:        "waveasm.yield"
-      // CHECK:      }, {
-      // CHECK:        "waveasm.v_mov_b32"
-      // CHECK:        "waveasm.yield"
+      // CHECK:      waveasm.s_cmp_lt_i32
+      // CHECK:      waveasm.if %{{.*}} : !waveasm.scc -> !waveasm.vreg {
+      // CHECK:        waveasm.v_add_u32
+      // CHECK:        waveasm.yield
+      // CHECK:      } else {
+      // CHECK:        waveasm.v_mov_b32
+      // CHECK:        waveasm.yield
       %result = scf.if %cond_i1 -> i32 {
         %val = arith.addi %zero_i32, %one_i32 : i32
         scf.yield %val : i32
       } else {
         scf.yield %zero_i32 : i32
       }
 
-      // CHECK: "waveasm.s_endpgm"
+      // CHECK: waveasm.s_endpgm
       gpu.return
     }
   }
diff --git a/waveasm/test/Translate/swizzle-srd-num-records.mlir b/waveasm/test/Translate/swizzle-srd-num-records.mlir