Skip to content

Commit e90f463

Browse files
[SLP] Normalize copyable operand order via majority voting
When building operands for entries with copyable elements, the non-copyable lanes of commutative ops may have inconsistent operand order (e.g. some lanes have (load, add) while others have (add, load)). This prevents VLOperands::reorder() from grouping consecutive loads on one side, which degrades downstream vectorization.

Add majority-voting normalization during buildOperands: track the frequency of each (ValueID, ValueID) operand pair across the non-copyable lanes, counting a pair and its inverse in the same bucket, and swap the operands of any lane whose operand ValueIDs are the exact inverse of the most common pattern. This makes the operand order consistent, enabling better load grouping.

This is part 1 of #189181.

Reviewers: RKSimon, hiraditya

Pull Request: #191631
1 parent 7328b74 commit e90f463

5 files changed

Lines changed: 80 additions & 29 deletions

File tree

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12029,12 +12029,70 @@ class InstructionsCompatibilityAnalysis {
1202912029
if (S.areInstructionsWithCopyableElements()) {
1203012030
MainOp = S.getMainOp();
1203112031
MainOpcode = S.getOpcode();
12032+
const bool IsCommutative =
12033+
isCommutative(MainOp) && MainOp->getNumOperands() == 2;
1203212034
Operands.assign(MainOp->getNumOperands(),
1203312035
BoUpSLP::ValueList(VL.size(), nullptr));
12036+
// Build operands and simultaneously count (ID0, ID1) pair
12037+
// frequencies for commutative operand normalization. Pairs and
12038+
// their inverses are tracked under a canonical key so that
12039+
// (Load, Add) and (Add, Load) contribute to the same bucket.
12040+
struct PairInfo {
12041+
unsigned FwdCount = 0;
12042+
unsigned RevCount = 0;
12043+
};
12044+
SmallMapVector<std::pair<unsigned, unsigned>, PairInfo, 8> PairCounts;
12045+
unsigned MajID0 = 0, MajID1 = 0;
1203412046
for (auto [Idx, V] : enumerate(VL)) {
1203512047
SmallVector<Value *> OperandsForValue = getOperands(S, V);
1203612048
for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
1203712049
Operands[OperandIdx][Idx] = Operand;
12050+
if (!IsCommutative || S.isCopyableElement(V) || isa<PoisonValue>(V))
12051+
continue;
12052+
unsigned ID0 = OperandsForValue[0]->getValueID();
12053+
unsigned ID1 = OperandsForValue[1]->getValueID();
12054+
if (ID0 == ID1)
12055+
continue;
12056+
unsigned MinID = std::min(ID0, ID1);
12057+
unsigned MaxID = std::max(ID0, ID1);
12058+
auto [It, Inserted] =
12059+
PairCounts.try_emplace(std::make_pair(MinID, MaxID));
12060+
PairInfo &Info = It->second;
12061+
if (ID0 < ID1)
12062+
++Info.FwdCount;
12063+
else
12064+
++Info.RevCount;
12065+
}
12066+
// Find the most frequent (ID0, ID1) pair across non-copyable
12067+
// lanes. Select the orientation (original or inverse) that has
12068+
// more votes as the majority pattern.
12069+
unsigned BestCount = 0;
12070+
for (const auto &P : PairCounts) {
12071+
const PairInfo &Info = P.second;
12072+
unsigned Total = Info.FwdCount + Info.RevCount;
12073+
if (Total > BestCount) {
12074+
BestCount = Total;
12075+
if (Info.FwdCount >= Info.RevCount) {
12076+
MajID0 = P.first.first;
12077+
MajID1 = P.first.second;
12078+
} else {
12079+
MajID0 = P.first.second;
12080+
MajID1 = P.first.first;
12081+
}
12082+
}
12083+
}
12084+
// For commutative ops, swap lanes whose operand types are the
12085+
// exact inverse of the majority pattern, making the non-copyable
12086+
// lanes consistent.
12087+
if (BestCount > 0) {
12088+
for (auto [Idx, V] : enumerate(VL)) {
12089+
if (S.isCopyableElement(V) || isa<PoisonValue>(V))
12090+
continue;
12091+
unsigned ID0 = Operands[0][Idx]->getValueID();
12092+
unsigned ID1 = Operands[1][Idx]->getValueID();
12093+
if (ID0 == MajID1 && ID1 == MajID0)
12094+
std::swap(Operands[0][Idx], Operands[1][Idx]);
12095+
}
1203812096
}
1203912097
} else {
1204012098
buildOriginalOperands(S, VL, Operands);

llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ define void @test(ptr %0, ptr %1, ptr %2) {
1010
; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> <i32 0, i32 0, i32 undef, i32 0>, [[TMP8]]
1111
; CHECK-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP11]], [[TMP10]]
1212
; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
13-
; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> <i32 0, i32 0, i32 1, i32 0>, [[TMP13]]
13+
; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP13]], <i32 0, i32 0, i32 1, i32 0>
1414
; CHECK-NEXT: [[TMP17:%.*]] = sub <4 x i32> [[TMP16]], zeroinitializer
1515
; CHECK-NEXT: [[TMP14:%.*]] = sub <4 x i32> [[TMP17]], zeroinitializer
1616
; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>

llvm/test/Transforms/SLPVectorizer/X86/copyable_reorder.ll

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -136,24 +136,15 @@ entry:
136136
define void @test_add_udiv_reorder_add(ptr %arr1, ptr %arr2, i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
137137
; CHECK-LABEL: @test_add_udiv_reorder_add(
138138
; CHECK-NEXT: entry:
139-
; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr i32, ptr [[ARR1:%.*]], i32 2
140-
; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr i32, ptr [[ARR1]], i32 3
141-
; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[GEP1_2]], align 4
142-
; CHECK-NEXT: [[V3:%.*]] = load i32, ptr [[GEP1_3]], align 4
143-
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
144-
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[A1:%.*]], i32 1
145-
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 1146, i32 146>
146-
; CHECK-NEXT: [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42
147-
; CHECK-NEXT: [[Y3:%.*]] = add nsw i32 [[A3:%.*]], 0
148-
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARR1]], align 4
149-
; CHECK-NEXT: [[RES2:%.*]] = udiv i32 [[V2]], [[Y2]]
150-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
151-
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[RES2]], i32 2
152-
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[V3]], i32 3
153-
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, i32 [[Y3]], i32 3
154-
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
155-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
156-
; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP9]]
139+
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARR1:%.*]], align 4
140+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, i32 [[A2:%.*]], i32 2
141+
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], <i32 1, i32 1, i32 42, i32 1>
142+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, i32 [[A0:%.*]], i32 0
143+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[A1:%.*]], i32 1
144+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[V3:%.*]], i32 3
145+
; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> <i32 1146, i32 146, i32 0, i32 0>, [[TMP6]]
146+
; CHECK-NEXT: [[TMP7:%.*]] = udiv <4 x i32> [[TMP0]], [[TMP2]]
147+
; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP8]]
157148
; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[ARR2:%.*]], align 4
158149
; CHECK-NEXT: ret void
159150
;

llvm/test/Transforms/SLPVectorizer/X86/operand-reorder-with-copyables.ll

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,21 @@ define void @test(ptr %p0, ptr %p1, ptr %dst, i32 %a0, i32 %a1, i32 %a2, i32 %a3
1414
; CHECK-NEXT: [[Y2:%.*]] = load i32, ptr [[R1]], align 4
1515
; CHECK-NEXT: [[Y3:%.*]] = load i32, ptr [[R2]], align 4
1616
; CHECK-NEXT: [[C2:%.*]] = sub i32 [[X2]], [[Y2]]
17-
; CHECK-NEXT: [[C3:%.*]] = sub i32 [[X3]], [[Y3]]
1817
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P0]], align 4
1918
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P1]], align 4
20-
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
2119
; CHECK-NEXT: [[RES2:%.*]] = udiv i32 [[C2]], [[A2]]
22-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, i32 [[A0]], i32 0
23-
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[A1]], i32 1
24-
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[C3]], i32 3
25-
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[A3]], i32 3
26-
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[RES2]], i32 2
27-
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
20+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[RES2]], i32 2
21+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X3]], i32 3
22+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
2823
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
29-
; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP5]], [[TMP9]]
24+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, i32 [[Y3]], i32 3
25+
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
26+
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP13]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
27+
; CHECK-NEXT: [[TMP15:%.*]] = sub <4 x i32> [[TMP9]], [[TMP14]]
28+
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, i32 [[A0]], i32 0
29+
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[A1]], i32 1
30+
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[A3]], i32 3
31+
; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP15]], [[TMP12]]
3032
; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[DST]], align 4
3133
; CHECK-NEXT: ret void
3234
;

llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ define float @test() {
88
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <12 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float undef>, float 0.000000e+00, i32 0
99
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <12 x float> [[TMP1]], float 0.000000e+00, i32 8
1010
; CHECK-NEXT: [[TMP3:%.*]] = fmul <12 x float> [[TMP0]], [[TMP2]]
11-
; CHECK-NEXT: [[TMP4:%.*]] = fadd <12 x float> <float 0.000000e+00, float 0.000000e+00, float -0.000000e+00, float 0.000000e+00, float -0.000000e+00, float 0.000000e+00, float -0.000000e+00, float 0.000000e+00, float 0.000000e+00, float -0.000000e+00, float 0.000000e+00, float poison>, [[TMP3]]
11+
; CHECK-NEXT: [[TMP4:%.*]] = fadd <12 x float> [[TMP3]], <float 0.000000e+00, float 0.000000e+00, float -0.000000e+00, float 0.000000e+00, float -0.000000e+00, float 0.000000e+00, float -0.000000e+00, float 0.000000e+00, float 0.000000e+00, float -0.000000e+00, float 0.000000e+00, float poison>
1212
; CHECK-NEXT: [[TMP5:%.*]] = fsub <8 x float> zeroinitializer, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>
1313
; CHECK-NEXT: [[TMP6:%.*]] = fadd <12 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, [[TMP4]]
1414
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <12 x float> [[TMP6]], <12 x float> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>

0 commit comments

Comments
 (0)