Skip to content

Commit 78a4b8e

Browse files
authored
[AArch64] fuse constant addition after sbb (llvm#185117)
Resolves: llvm#171676 Related: llvm#184541 (x86_64 PR) The issue points out that `Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)` is optimized and that SBB can be optimized similarly: `Fold ADD(SBB(Y,0,W),C) -> SBB(Y,-C,W)`. With the changes from this branch, a new clang will compile the example code: ``` #include <stdint.h> uint64_t f(uint64_t a, uint64_t b) { uint64_t x; x += __builtin_add_overflow(a, b, &x); return x + 10; } uint64_t g(uint64_t a, uint64_t b) { uint64_t x; x -= __builtin_sub_overflow(a, b, &x); return x + 10; } ``` To this, so the subc case matches the add case. ``` f: mov w8, llvm#10 adds x9, x0, x1 adc x0, x9, x8 ret g: mov x8, #-10 subs x9, x0, x1 sbc x0, x9, x8 ret ```
1 parent 883aa69 commit 78a4b8e

2 files changed

Lines changed: 172 additions & 1 deletion

File tree

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23365,6 +23365,31 @@ static SDValue performAddTruncShiftCombine(SDNode *N, SelectionDAG &DAG) {
2336523365
return DAG.getNode(ISD::ADD, DL, VT, Trunc, Shift);
2336623366
}
2336723367

23368+
// Fold ADD(SBC(Y, 0, W), C) -> SBC(Y, -C, W)
23369+
// SBC(Y, 0, W) = Y - 0 - ~carry = Y + carry - 1
23370+
// Adding C: Y + carry - 1 + C = Y - (-C) - ~carry = SBC(Y, -C, W)
23371+
static SDValue performAddWithSBCCombine(SDNode *N, SelectionDAG &DAG) {
23372+
if (N->getOpcode() != ISD::ADD)
23373+
return SDValue();
23374+
EVT VT = N->getValueType(0);
23375+
if (VT != MVT::i32 && VT != MVT::i64)
23376+
return SDValue();
23377+
23378+
SDValue SBC = N->getOperand(0);
23379+
SDValue C = N->getOperand(1);
23380+
// ADD is commutative; operands may be on either side.
23381+
if (SBC.getOpcode() != AArch64ISD::SBC)
23382+
std::swap(SBC, C);
23383+
if (SBC.getOpcode() != AArch64ISD::SBC || !SBC.hasOneUse())
23384+
return SDValue();
23385+
if (!isNullConstant(SBC.getOperand(1)))
23386+
return SDValue();
23387+
// AArch64 SBC (non-flag-setting) has only one output; no flags guard needed.
23388+
SDLoc DL(N);
23389+
return DAG.getNode(AArch64ISD::SBC, DL, VT, SBC.getOperand(0),
23390+
DAG.getNegative(C, DL, VT), SBC.getOperand(2));
23391+
}
23392+
2336823393
static SDValue performAddSubCombine(SDNode *N,
2336923394
TargetLowering::DAGCombinerInfo &DCI) {
2337023395
// Try to change sum of two reductions.
@@ -23390,7 +23415,8 @@ static SDValue performAddSubCombine(SDNode *N,
2339023415
return Val;
2339123416
if (SDValue Val = performAddTruncShiftCombine(N, DCI.DAG))
2339223417
return Val;
23393-
23418+
if (SDValue Val = performAddWithSBCCombine(N, DCI.DAG))
23419+
return Val;
2339423420
if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
2339523421
return Val;
2339623422

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
3+
;
; Verify that ADD(SBC(Y, 0, flags), C) folds to SBC(Y, -C, flags).
; SBC(Y, 0) = Y - borrow; adding C gives Y - borrow + C = Y - (-C) - borrow = SBC(Y, -C).
7+
; Fold should fire, adding with constant: add(sbc(sub a,b), 10) -> subs + sbc of -10.
define i64 @g_i64(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: g_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov x8, #-10 // =0xfffffffffffffff6
; CHECK-NEXT:    subs x9, x0, x1
; CHECK-NEXT:    sbc x0, x9, x8
; CHECK-NEXT:    ret
  %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %ov, 0
  %bit = extractvalue { i64, i1 } %ov, 1
  %ext = sext i1 %bit to i64
  %r = add i64 %val, %ext
  %r2 = add i64 %r, 10
  ret i64 %r2
}
23+
24+
; Fold should fire, adding with constant — 32-bit variant of g_i64.
define i32 @g_i32(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: g_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #-10 // =0xfffffff6
; CHECK-NEXT:    subs w9, w0, w1
; CHECK-NEXT:    sbc w0, w9, w8
; CHECK-NEXT:    ret
  %ov = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %ov, 0
  %bit = extractvalue { i32, i1 } %ov, 1
  %ext = sext i1 %bit to i32
  %r = add i32 %val, %ext
  %r2 = add i32 %r, 10
  ret i32 %r2
}
40+
41+
; Fold should fire for non-constant addend too: the negation becomes a neg instruction.
define i64 @g_nonconstant(i64 %a, i64 %b, i64 %c) nounwind {
; CHECK-LABEL: g_nonconstant:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg x8, x2
; CHECK-NEXT:    subs x9, x0, x1
; CHECK-NEXT:    sbc x0, x9, x8
; CHECK-NEXT:    ret
  %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %ov, 0
  %bit = extractvalue { i64, i1 } %ov, 1
  %ext = sext i1 %bit to i64
  %r = add i64 %val, %ext
  %r2 = add i64 %r, %c
  ret i64 %r2
}
57+
58+
; Fold should fire for non-constant addend too — 32-bit variant of g_nonconstant.
define i32 @g_nonconstant_i32(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK-LABEL: g_nonconstant_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg w8, w2
; CHECK-NEXT:    subs w9, w0, w1
; CHECK-NEXT:    sbc w0, w9, w8
; CHECK-NEXT:    ret
  %ov = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %ov, 0
  %bit = extractvalue { i32, i1 } %ov, 1
  %ext = sext i1 %bit to i32
  %r = add i32 %val, %ext
  %r2 = add i32 %r, %c
  ret i32 %r2
}
74+
75+
; Fold should fire for non-constant addend in commuted form too (SBC as the
; second ADD operand; the combine swaps operands to find it).
define i64 @g_nonconstant_commuted(i64 %a, i64 %b, i64 %c) nounwind {
; CHECK-LABEL: g_nonconstant_commuted:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg x8, x2
; CHECK-NEXT:    subs x9, x0, x1
; CHECK-NEXT:    sbc x0, x9, x8
; CHECK-NEXT:    ret
  %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %ov, 0
  %bit = extractvalue { i64, i1 } %ov, 1
  %ext = sext i1 %bit to i64
  %r = add i64 %val, %ext
  %r2 = add i64 %c, %r
  ret i64 %r2
}
91+
92+
; Fold should fire for non-constant addend in commuted form too — 32-bit variant.
define i32 @g_nonconstant_commuted_i32(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK-LABEL: g_nonconstant_commuted_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg w8, w2
; CHECK-NEXT:    subs w9, w0, w1
; CHECK-NEXT:    sbc w0, w9, w8
; CHECK-NEXT:    ret
  %ov = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %ov, 0
  %bit = extractvalue { i32, i1 } %ov, 1
  %ext = sext i1 %bit to i32
  %r = add i32 %val, %ext
  %r2 = add i32 %c, %r
  ret i32 %r2
}
108+
109+
; Multiple uses of the SBC result should not generate the fold (the combine
; requires hasOneUse on the SBC, since the un-added value is still needed).
define i64 @g_multi_use(i64 %a, i64 %b, ptr %out) nounwind {
; CHECK-LABEL: g_multi_use:
; CHECK:       // %bb.0:
; CHECK-NEXT:    subs x8, x0, x1
; CHECK-NEXT:    sbc x8, x8, xzr
; CHECK-NEXT:    add x0, x8, #10
; CHECK-NEXT:    str x8, [x2]
; CHECK-NEXT:    ret
  %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %ov, 0
  %bit = extractvalue { i64, i1 } %ov, 1
  %ext = sext i1 %bit to i64
  %sbc = add i64 %val, %ext
  store i64 %sbc, ptr %out
  %r = add i64 %sbc, 10
  ret i64 %r
}
127+
128+
; Multiple uses of the SBC result should not generate the fold — 32-bit variant.
define i32 @g_multi_use_i32(i32 %a, i32 %b, ptr %out) nounwind {
; CHECK-LABEL: g_multi_use_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    subs w8, w0, w1
; CHECK-NEXT:    sbc w8, w8, wzr
; CHECK-NEXT:    add w0, w8, #10
; CHECK-NEXT:    str w8, [x2]
; CHECK-NEXT:    ret
  %ov = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %ov, 0
  %bit = extractvalue { i32, i1 } %ov, 1
  %ext = sext i1 %bit to i32
  %sbc = add i32 %val, %ext
  store i32 %sbc, ptr %out
  %r = add i32 %sbc, 10
  ret i32 %r
}

0 commit comments

Comments
 (0)