
Commit 1159266

[SLP] Add support for fmaximum/fminimum reduction
This patch adds support for vectorized reduction of the llvm.maximum/llvm.minimum intrinsics, handled under the corresponding reduction kinds (FMaximum/FMinimum).

Differential Revision: https://reviews.llvm.org/D154463
1 parent a43aebc commit 1159266

2 files changed: 43 additions & 90 deletions
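The net effect: a scalar chain of llvm.maximum/llvm.minimum calls over consecutive loads can now be rewritten into a vector load plus a single llvm.vector.reduce.fmaximum/fminimum call. A minimal before/after sketch in LLVM IR (function and value names are illustrative, not taken from the patch; SLP additionally requires vectorizable operands, e.g. the consecutive loads used in the tests below):

declare float @llvm.maximum.f32(float, float)
declare float @llvm.vector.reduce.fmaximum.v4f32(<4 x float>)

; Before: a strict left-to-right scalar reduction chain.
define float @chain(float %a, float %b, float %c, float %d) {
  %m1 = call float @llvm.maximum.f32(float %b, float %a)
  %m2 = call float @llvm.maximum.f32(float %c, float %m1)
  %m3 = call float @llvm.maximum.f32(float %d, float %m2)
  ret float %m3
}

; After: the whole chain collapses into one reduction intrinsic.
define float @reduced(<4 x float> %v) {
  %r = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %v)
  ret float %r
}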


llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 25 additions & 0 deletions
@@ -12701,6 +12701,9 @@ class HorizontalReduction {
       return I->getFastMathFlags().noNaNs();
     }
 
+    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
+      return true;
+
     return I->isAssociative();
   }
 
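Why FMaximum/FMinimum need no fast-math check here: llvm.maximum/llvm.minimum have fully defined semantics (NaN propagates, and -0.0 orders below +0.0), so the reduction is associative as-is, whereas maxnum/minnum (FMax/FMin above) are only safe to reassociate under nonans. A small sketch of that guarantee (names illustrative, not from the patch):

declare float @llvm.maximum.f32(float, float)

; If any input is NaN the result is NaN, and maximum(-0.0, +0.0) is +0.0,
; so every association of this chain computes the same value; no nnan
; flag is required.
define float @assoc(float %a, float %b, float %c) {
  %m1 = call float @llvm.maximum.f32(float %a, float %b)
  %m2 = call float @llvm.maximum.f32(float %m1, float %c)
  ret float %m2
}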

@@ -12751,6 +12754,18 @@ class HorizontalReduction {
                              minnum(cast<ConstantFP>(LHS)->getValueAPF(),
                                     cast<ConstantFP>(RHS)->getValueAPF()));
       return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
+    case RecurKind::FMaximum:
+      if (IsConstant)
+        return ConstantFP::get(LHS->getType(),
+                               maximum(cast<ConstantFP>(LHS)->getValueAPF(),
+                                       cast<ConstantFP>(RHS)->getValueAPF()));
+      return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
+    case RecurKind::FMinimum:
+      if (IsConstant)
+        return ConstantFP::get(LHS->getType(),
+                               minimum(cast<ConstantFP>(LHS)->getValueAPF(),
+                                       cast<ConstantFP>(RHS)->getValueAPF()));
+      return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
     case RecurKind::SMax:
       if (IsConstant || UseSelect) {
         Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
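The new cases mirror the existing FMax/FMin handling: constant operands fold immediately through APFloat's maximum()/minimum(), everything else becomes a two-operand intrinsic call. A sketch of one scalar reduction step on the non-constant path (names illustrative):

declare double @llvm.minimum.f64(double, double)

; What Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS)
; produces for one step of an FMinimum reduction.
define double @one_step(double %lhs, double %rhs) {
  %rdx = call double @llvm.minimum.f64(double %lhs, double %rhs)
  ret double %rdx
}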
@@ -12833,6 +12848,10 @@ class HorizontalReduction {
     if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
       return RecurKind::FMin;
 
+    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
+      return RecurKind::FMaximum;
+    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
+      return RecurKind::FMinimum;
     // This matches either cmp+select or intrinsics. SLP is expected to handle
     // either form.
     // TODO: If we are canonicalizing to intrinsics, we can remove several
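With the matcher updated, a call to either intrinsic can now serve as a reduction root. Roughly what the new matches classify (illustrative function; the comments name the RecurKind each call maps to):

declare float @llvm.maximum.f32(float, float)
declare float @llvm.minimum.f32(float, float)

define float @roots(float %x, float %y) {
  %a = call float @llvm.maximum.f32(float %x, float %y) ; RecurKind::FMaximum
  %b = call float @llvm.minimum.f32(float %a, float %y) ; RecurKind::FMinimum
  ret float %b
}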
@@ -13800,6 +13819,8 @@ class HorizontalReduction {
     }
     case RecurKind::FMax:
     case RecurKind::FMin:
+    case RecurKind::FMaximum:
+    case RecurKind::FMinimum:
     case RecurKind::SMax:
     case RecurKind::SMin:
     case RecurKind::UMax:
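These case labels route the new kinds down the same path as the other min/max reductions; judging from the test changes below, that path ends in a single llvm.vector.reduce.* call. A sketch of the presumed emitted form (illustrative):

declare double @llvm.vector.reduce.fminimum.v4f64(<4 x double>)

; Presumed final form of an FMinimum reduction over a <4 x double>.
define double @emitted(<4 x double> %v) {
  %r = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %v)
  ret double %r
}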
@@ -14131,6 +14152,10 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
     return true;
   if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
     return true;
+  if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
+    return true;
   if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
     return true;
   if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
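matchRdxBop treats a two-operand intrinsic like a binary operator, extracting its operands into V0 and V1 so the horizontal-reduction analysis can keep walking the chain. The shape it now also accepts (names illustrative):

declare float @llvm.maximum.f32(float, float)

; For this root, matchRdxBop binds V0 = %v0 and V1 = %v1.
define float @rdx_root(float %v0, float %v1) {
  %root = call float @llvm.maximum.f32(float %v0, float %v1)
  ret float %root
}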

llvm/test/Transforms/SLPVectorizer/X86/fmaximum-fminimum.ll

Lines changed: 18 additions & 90 deletions
@@ -175,31 +175,15 @@ define double @reduction_v2f64(ptr %p) {
 define float @reduction_v4f32(ptr %p) {
 ; SSE-LABEL: define float @reduction_v4f32
 ; SSE-SAME: (ptr [[P:%.*]]) {
-; SSE-NEXT:    [[G1:%.*]] = getelementptr inbounds float, ptr [[P]], i64 1
-; SSE-NEXT:    [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2
-; SSE-NEXT:    [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3
-; SSE-NEXT:    [[T0:%.*]] = load float, ptr [[P]], align 4
-; SSE-NEXT:    [[T1:%.*]] = load float, ptr [[G1]], align 4
-; SSE-NEXT:    [[T2:%.*]] = load float, ptr [[G2]], align 4
-; SSE-NEXT:    [[T3:%.*]] = load float, ptr [[G3]], align 4
-; SSE-NEXT:    [[M1:%.*]] = tail call float @llvm.maximum.f32(float [[T1]], float [[T0]])
-; SSE-NEXT:    [[M2:%.*]] = tail call float @llvm.maximum.f32(float [[T2]], float [[M1]])
-; SSE-NEXT:    [[M3:%.*]] = tail call float @llvm.maximum.f32(float [[T3]], float [[M2]])
-; SSE-NEXT:    ret float [[M3]]
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P]], align 4
+; SSE-NEXT:    [[TMP2:%.*]] = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> [[TMP1]])
+; SSE-NEXT:    ret float [[TMP2]]
 ;
 ; AVX-LABEL: define float @reduction_v4f32
 ; AVX-SAME: (ptr [[P:%.*]]) #[[ATTR1]] {
-; AVX-NEXT:    [[G1:%.*]] = getelementptr inbounds float, ptr [[P]], i64 1
-; AVX-NEXT:    [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2
-; AVX-NEXT:    [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3
-; AVX-NEXT:    [[T0:%.*]] = load float, ptr [[P]], align 4
-; AVX-NEXT:    [[T1:%.*]] = load float, ptr [[G1]], align 4
-; AVX-NEXT:    [[T2:%.*]] = load float, ptr [[G2]], align 4
-; AVX-NEXT:    [[T3:%.*]] = load float, ptr [[G3]], align 4
-; AVX-NEXT:    [[M1:%.*]] = tail call float @llvm.maximum.f32(float [[T1]], float [[T0]])
-; AVX-NEXT:    [[M2:%.*]] = tail call float @llvm.maximum.f32(float [[T2]], float [[M1]])
-; AVX-NEXT:    [[M3:%.*]] = tail call float @llvm.maximum.f32(float [[T3]], float [[M2]])
-; AVX-NEXT:    ret float [[M3]]
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P]], align 4
+; AVX-NEXT:    [[TMP2:%.*]] = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> [[TMP1]])
+; AVX-NEXT:    ret float [[TMP2]]
 ;
   %g1 = getelementptr inbounds float, ptr %p, i64 1
   %g2 = getelementptr inbounds float, ptr %p, i64 2
@@ -217,31 +201,15 @@ define double @reduction_v4f64_fminimum(ptr %p) {
 define double @reduction_v4f64_fminimum(ptr %p) {
 ; SSE-LABEL: define double @reduction_v4f64_fminimum
 ; SSE-SAME: (ptr [[P:%.*]]) {
-; SSE-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 1
-; SSE-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
-; SSE-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
-; SSE-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
-; SSE-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
-; SSE-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
-; SSE-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
-; SSE-NEXT:    [[M1:%.*]] = tail call double @llvm.minimum.f64(double [[T1]], double [[T0]])
-; SSE-NEXT:    [[M2:%.*]] = tail call double @llvm.minimum.f64(double [[T2]], double [[M1]])
-; SSE-NEXT:    [[M3:%.*]] = tail call double @llvm.minimum.f64(double [[T3]], double [[M2]])
-; SSE-NEXT:    ret double [[M3]]
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P]], align 4
+; SSE-NEXT:    [[TMP2:%.*]] = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> [[TMP1]])
+; SSE-NEXT:    ret double [[TMP2]]
 ;
 ; AVX-LABEL: define double @reduction_v4f64_fminimum
 ; AVX-SAME: (ptr [[P:%.*]]) #[[ATTR1]] {
-; AVX-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 1
-; AVX-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
-; AVX-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
-; AVX-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
-; AVX-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
-; AVX-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
-; AVX-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
-; AVX-NEXT:    [[M1:%.*]] = tail call double @llvm.minimum.f64(double [[T1]], double [[T0]])
-; AVX-NEXT:    [[M2:%.*]] = tail call double @llvm.minimum.f64(double [[T2]], double [[M1]])
-; AVX-NEXT:    [[M3:%.*]] = tail call double @llvm.minimum.f64(double [[T3]], double [[M2]])
-; AVX-NEXT:    ret double [[M3]]
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P]], align 4
+; AVX-NEXT:    [[TMP2:%.*]] = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> [[TMP1]])
+; AVX-NEXT:    ret double [[TMP2]]
 ;
   %g1 = getelementptr inbounds double, ptr %p, i64 1
   %g2 = getelementptr inbounds double, ptr %p, i64 2
@@ -259,55 +227,15 @@ define double @reduction_v4f64_fminimum(ptr %p) {
 define float @reduction_v8f32_fminimum(ptr %p) {
 ; SSE-LABEL: define float @reduction_v8f32_fminimum
 ; SSE-SAME: (ptr [[P:%.*]]) {
-; SSE-NEXT:    [[G1:%.*]] = getelementptr inbounds float, ptr [[P]], i64 1
-; SSE-NEXT:    [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2
-; SSE-NEXT:    [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3
-; SSE-NEXT:    [[G4:%.*]] = getelementptr inbounds float, ptr [[P]], i64 4
-; SSE-NEXT:    [[G5:%.*]] = getelementptr inbounds float, ptr [[P]], i64 5
-; SSE-NEXT:    [[G6:%.*]] = getelementptr inbounds float, ptr [[P]], i64 6
-; SSE-NEXT:    [[G7:%.*]] = getelementptr inbounds float, ptr [[P]], i64 7
-; SSE-NEXT:    [[T0:%.*]] = load float, ptr [[P]], align 4
-; SSE-NEXT:    [[T1:%.*]] = load float, ptr [[G1]], align 4
-; SSE-NEXT:    [[T2:%.*]] = load float, ptr [[G2]], align 4
-; SSE-NEXT:    [[T3:%.*]] = load float, ptr [[G3]], align 4
-; SSE-NEXT:    [[T4:%.*]] = load float, ptr [[G4]], align 4
-; SSE-NEXT:    [[T5:%.*]] = load float, ptr [[G5]], align 4
-; SSE-NEXT:    [[T6:%.*]] = load float, ptr [[G6]], align 4
-; SSE-NEXT:    [[T7:%.*]] = load float, ptr [[G7]], align 4
-; SSE-NEXT:    [[M1:%.*]] = tail call float @llvm.minimum.f32(float [[T1]], float [[T0]])
-; SSE-NEXT:    [[M2:%.*]] = tail call float @llvm.minimum.f32(float [[T2]], float [[M1]])
-; SSE-NEXT:    [[M3:%.*]] = tail call float @llvm.minimum.f32(float [[T3]], float [[M2]])
-; SSE-NEXT:    [[M4:%.*]] = tail call float @llvm.minimum.f32(float [[T4]], float [[M3]])
-; SSE-NEXT:    [[M5:%.*]] = tail call float @llvm.minimum.f32(float [[M4]], float [[T6]])
-; SSE-NEXT:    [[M6:%.*]] = tail call float @llvm.minimum.f32(float [[M5]], float [[T5]])
-; SSE-NEXT:    [[M7:%.*]] = tail call float @llvm.minimum.f32(float [[M6]], float [[T7]])
-; SSE-NEXT:    ret float [[M7]]
+; SSE-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[P]], align 4
+; SSE-NEXT:    [[TMP2:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[TMP1]])
+; SSE-NEXT:    ret float [[TMP2]]
 ;
 ; AVX-LABEL: define float @reduction_v8f32_fminimum
 ; AVX-SAME: (ptr [[P:%.*]]) #[[ATTR1]] {
-; AVX-NEXT:    [[G1:%.*]] = getelementptr inbounds float, ptr [[P]], i64 1
-; AVX-NEXT:    [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2
-; AVX-NEXT:    [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3
-; AVX-NEXT:    [[G4:%.*]] = getelementptr inbounds float, ptr [[P]], i64 4
-; AVX-NEXT:    [[G5:%.*]] = getelementptr inbounds float, ptr [[P]], i64 5
-; AVX-NEXT:    [[G6:%.*]] = getelementptr inbounds float, ptr [[P]], i64 6
-; AVX-NEXT:    [[G7:%.*]] = getelementptr inbounds float, ptr [[P]], i64 7
-; AVX-NEXT:    [[T0:%.*]] = load float, ptr [[P]], align 4
-; AVX-NEXT:    [[T1:%.*]] = load float, ptr [[G1]], align 4
-; AVX-NEXT:    [[T2:%.*]] = load float, ptr [[G2]], align 4
-; AVX-NEXT:    [[T3:%.*]] = load float, ptr [[G3]], align 4
-; AVX-NEXT:    [[T4:%.*]] = load float, ptr [[G4]], align 4
-; AVX-NEXT:    [[T5:%.*]] = load float, ptr [[G5]], align 4
-; AVX-NEXT:    [[T6:%.*]] = load float, ptr [[G6]], align 4
-; AVX-NEXT:    [[T7:%.*]] = load float, ptr [[G7]], align 4
-; AVX-NEXT:    [[M1:%.*]] = tail call float @llvm.minimum.f32(float [[T1]], float [[T0]])
-; AVX-NEXT:    [[M2:%.*]] = tail call float @llvm.minimum.f32(float [[T2]], float [[M1]])
-; AVX-NEXT:    [[M3:%.*]] = tail call float @llvm.minimum.f32(float [[T3]], float [[M2]])
-; AVX-NEXT:    [[M4:%.*]] = tail call float @llvm.minimum.f32(float [[T4]], float [[M3]])
-; AVX-NEXT:    [[M5:%.*]] = tail call float @llvm.minimum.f32(float [[M4]], float [[T6]])
-; AVX-NEXT:    [[M6:%.*]] = tail call float @llvm.minimum.f32(float [[M5]], float [[T5]])
-; AVX-NEXT:    [[M7:%.*]] = tail call float @llvm.minimum.f32(float [[M6]], float [[T7]])
-; AVX-NEXT:    ret float [[M7]]
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[P]], align 4
+; AVX-NEXT:    [[TMP2:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[TMP1]])
+; AVX-NEXT:    ret float [[TMP2]]
 ;
   %g1 = getelementptr inbounds float, ptr %p, i64 1
   %g2 = getelementptr inbounds float, ptr %p, i64 2
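For reference, the full input of the first updated test function, reconstructed from the diff context. It is self-contained and can be fed to opt (the exact invocation below is a guess; the checked-in test carries its own RUN lines for the SSE and AVX configurations):

; Presumed invocation: opt -passes=slp-vectorizer -S < input.ll
declare float @llvm.maximum.f32(float, float)

define float @reduction_v4f32(ptr %p) {
  %g1 = getelementptr inbounds float, ptr %p, i64 1
  %g2 = getelementptr inbounds float, ptr %p, i64 2
  %g3 = getelementptr inbounds float, ptr %p, i64 3
  %t0 = load float, ptr %p, align 4
  %t1 = load float, ptr %g1, align 4
  %t2 = load float, ptr %g2, align 4
  %t3 = load float, ptr %g3, align 4
  %m1 = tail call float @llvm.maximum.f32(float %t1, float %t0)
  %m2 = tail call float @llvm.maximum.f32(float %t2, float %m1)
  %m3 = tail call float @llvm.maximum.f32(float %t3, float %m2)
  ret float %m3
}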
