@@ -972,6 +972,71 @@ std::optional<std::string> KernelGenerator::generateOp(Operation *op) {
972972 return formatter.format (" v_cvt_pk_bf16_f32" , operands);
973973 })
974974
975+ // V_PERMLANE16_SWAP_B32: swap lanes 16 apart.
976+ // The hardware clobbers BOTH dst and src. When the allocator assigns
977+ // dst==src, we must save the original to a scratch register, swap
978+ // through another scratch, then restore the original.
979+ .Case <V_PERMLANE16_SWAP_B32>(
980+ [&](V_PERMLANE16_SWAP_B32 swapOp) -> std::optional<std::string> {
981+ std::string dst = resolveValue (swapOp.getDst ());
982+ std::string src = resolveValue (swapOp.getSrc ());
983+ if (dst != src) {
984+ llvm::SmallVector<std::string> operands = {dst, src};
985+ return formatter.format (" v_permlane16_swap_b32" , operands);
986+ }
987+ // dst==src: save original, swap through scratch, restore original
988+ std::string scratch0 = formatVGPRRange (kScratchVGPR , 1 );
989+ std::string scratch1 = formatVGPRRange (kScratchVGPR - 1 , 1 );
990+ peakVGPRs = std::max (peakVGPRs, kScratchVGPR + 1 );
991+ invalidateScratchCache ();
992+ // 1. Save original src to scratch0
993+ // 2. Copy src to scratch1 for the swap
994+ // 3. Swap: dst gets partner's scratch1, scratch1 clobbered
995+ // 4. Restore original from scratch0 back to src
996+ return " v_mov_b32 " + scratch0 + " , " + src + " \n " +
997+ " v_mov_b32 " + scratch1 + " , " + src + " \n " +
998+ " v_permlane16_swap_b32 " + dst + " , " + scratch1 + " \n " +
999+ " v_mov_b32 " + src + " , " + scratch0;
1000+ })
1001+
1002+ // V_ACCVGPR_READ_B32: unroll multi-register reads into scalar ops
1003+ .Case <V_ACCVGPR_READ_B32>(
1004+ [&](V_ACCVGPR_READ_B32 readOp) -> std::optional<std::string> {
1005+ Value dst = readOp.getDst ();
1006+ Value src = readOp.getSrc ();
1007+ int64_t dstSize = getRegSize (dst.getType ());
1008+ int64_t srcSize = getRegSize (src.getType ());
1009+ int64_t size = std::max (dstSize, srcSize);
1010+ if (size <= 1 ) {
1011+ return emitDefaultFormat (readOp, " v_accvgpr_read_b32" );
1012+ }
1013+ int64_t dstBase = -1 , srcBase = -1 ;
1014+ if (auto pv = dyn_cast<PVRegType>(dst.getType ()))
1015+ dstBase = pv.getIndex ();
1016+ else if (isVirtualRegType (dst.getType ()))
1017+ dstBase = mapping.getPhysReg (dst);
1018+ if (auto pa = dyn_cast<PARegType>(src.getType ()))
1019+ srcBase = pa.getIndex ();
1020+ else if (isVirtualRegType (src.getType ()))
1021+ srcBase = mapping.getPhysReg (src);
1022+ if (dstBase < 0 || srcBase < 0 ) {
1023+ llvm::errs () << " V_ACCVGPR_READ_B32 fallback: dstBase=" << dstBase
1024+ << " srcBase=" << srcBase << " dstSize=" << dstSize
1025+ << " srcSize=" << srcSize
1026+ << " dstType=" << dst.getType ()
1027+ << " srcType=" << src.getType () << " \n " ;
1028+ return emitDefaultFormat (readOp, " v_accvgpr_read_b32" );
1029+ }
1030+ std::string lines;
1031+ for (int64_t i = 0 ; i < size; ++i) {
1032+ if (i > 0 )
1033+ lines += " \n " ;
1034+ lines += " v_accvgpr_read_b32 v" + std::to_string (dstBase + i) +
1035+ " , a" + std::to_string (srcBase + i);
1036+ }
1037+ return lines;
1038+ })
1039+
9751040 // Carry ops: on GFX9, carry-out is implicit VCC.
9761041 // v_add_co_u32: dst, vcc, src0, src1
9771042 // v_addc_co_u32: dst, vcc, src0, src1, vcc (carry-in).
0 commit comments