Skip to content

Commit 78a4b8e

Browse files
authored
[AArch64] fuse constant addition after sbb (llvm#185117)
Resolves: llvm#171676 Related: llvm#184541 (x86_64 PR) The issue points out that `Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)` is optimized and that SBB can be optimized similarly: `Fold ADD(SBB(Y,0,W),C) -> SBB(Y,-C,W)`. With the changes from this branch, a new clang will compile the example code: ``` #include <stdint.h> uint64_t f(uint64_t a, uint64_t b) { uint64_t x; x += __builtin_add_overflow(a, b, &x); return x + 10; } uint64_t g(uint64_t a, uint64_t b) { uint64_t x; x -= __builtin_sub_overflow(a, b, &x); return x + 10; } ``` To this, so the subc case matches the add case. ``` f: mov w8, llvm#10 adds x9, x0, x1 adc x0, x9, x8 ret g: mov x8, #-10 subs x9, x0, x1 sbc x0, x9, x8 ret ```
1 parent 883aa69 commit 78a4b8e

2 files changed

Lines changed: 172 additions & 1 deletion

File tree

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23365,6 +23365,31 @@ static SDValue performAddTruncShiftCombine(SDNode *N, SelectionDAG &DAG) {
2336523365
return DAG.getNode(ISD::ADD, DL, VT, Trunc, Shift);
2336623366
}
2336723367

23368+
// Fold ADD(SBC(Y, 0, W), C) -> SBC(Y, -C, W)
23369+
// SBC(Y, 0, W) = Y - 0 - ~carry = Y + carry - 1
23370+
// Adding C: Y + carry - 1 + C = Y - (-C) - ~carry = SBC(Y, -C, W)
23371+
static SDValue performAddWithSBCCombine(SDNode *N, SelectionDAG &DAG) {
23372+
if (N->getOpcode() != ISD::ADD)
23373+
return SDValue();
23374+
EVT VT = N->getValueType(0);
23375+
if (VT != MVT::i32 && VT != MVT::i64)
23376+
return SDValue();
23377+
23378+
SDValue SBC = N->getOperand(0);
23379+
SDValue C = N->getOperand(1);
23380+
// ADD is commutative; operands may be on either side.
23381+
if (SBC.getOpcode() != AArch64ISD::SBC)
23382+
std::swap(SBC, C);
23383+
if (SBC.getOpcode() != AArch64ISD::SBC || !SBC.hasOneUse())
23384+
return SDValue();
23385+
if (!isNullConstant(SBC.getOperand(1)))
23386+
return SDValue();
23387+
// AArch64 SBC (non-flag-setting) has only one output; no flags guard needed.
23388+
SDLoc DL(N);
23389+
return DAG.getNode(AArch64ISD::SBC, DL, VT, SBC.getOperand(0),
23390+
DAG.getNegative(C, DL, VT), SBC.getOperand(2));
23391+
}
23392+
2336823393
static SDValue performAddSubCombine(SDNode *N,
2336923394
TargetLowering::DAGCombinerInfo &DCI) {
2337023395
// Try to change sum of two reductions.
@@ -23390,7 +23415,8 @@ static SDValue performAddSubCombine(SDNode *N,
2339023415
return Val;
2339123416
if (SDValue Val = performAddTruncShiftCombine(N, DCI.DAG))
2339223417
return Val;
23393-
23418+
if (SDValue Val = performAddWithSBCCombine(N, DCI.DAG))
23419+
return Val;
2339423420
if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
2339523421
return Val;
2339623422

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
3+
;
; Verify that ADD(SBC(Y, 0, flags), C) folds to SBC(Y, -C, flags).
; SBC(Y, 0) = Y - borrow; adding C gives Y - borrow + C = Y - (-C) - borrow = SBC(Y, -C).
7+
; Fold should fire, adding with constant: add(sbc(sub a,b), 10) -> subs + sbc of -10.
define i64 @g_i64(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: g_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov x8, #-10 // =0xfffffffffffffff6
; CHECK-NEXT:    subs x9, x0, x1
; CHECK-NEXT:    sbc x0, x9, x8
; CHECK-NEXT:    ret
  %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %ov, 0
  %bit = extractvalue { i64, i1 } %ov, 1
  %ext = sext i1 %bit to i64
  %r = add i64 %val, %ext
  %r2 = add i64 %r, 10
  ret i64 %r2
}
23+
24+
; Fold should fire, adding with constant — 32-bit variant of g_i64.
define i32 @g_i32(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: g_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #-10 // =0xfffffff6
; CHECK-NEXT:    subs w9, w0, w1
; CHECK-NEXT:    sbc w0, w9, w8
; CHECK-NEXT:    ret
  %ov = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %ov, 0
  %bit = extractvalue { i32, i1 } %ov, 1
  %ext = sext i1 %bit to i32
  %r = add i32 %val, %ext
  %r2 = add i32 %r, 10
  ret i32 %r2
}
40+
41+
; Fold should fire for non-constant addend too: the negation becomes a neg instruction.
define i64 @g_nonconstant(i64 %a, i64 %b, i64 %c) nounwind {
; CHECK-LABEL: g_nonconstant:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg x8, x2
; CHECK-NEXT:    subs x9, x0, x1
; CHECK-NEXT:    sbc x0, x9, x8
; CHECK-NEXT:    ret
  %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %ov, 0
  %bit = extractvalue { i64, i1 } %ov, 1
  %ext = sext i1 %bit to i64
  %r = add i64 %val, %ext
  %r2 = add i64 %r, %c
  ret i64 %r2
}
57+
58+
; Fold should fire for non-constant addend too — 32-bit variant of g_nonconstant.
define i32 @g_nonconstant_i32(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK-LABEL: g_nonconstant_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg w8, w2
; CHECK-NEXT:    subs w9, w0, w1
; CHECK-NEXT:    sbc w0, w9, w8
; CHECK-NEXT:    ret
  %ov = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %ov, 0
  %bit = extractvalue { i32, i1 } %ov, 1
  %ext = sext i1 %bit to i32
  %r = add i32 %val, %ext
  %r2 = add i32 %r, %c
  ret i32 %r2
}
74+
75+
; Fold should fire for non-constant addend in commuted form too (SBC as the
; second ADD operand; the combine swaps operands to find it).
define i64 @g_nonconstant_commuted(i64 %a, i64 %b, i64 %c) nounwind {
; CHECK-LABEL: g_nonconstant_commuted:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg x8, x2
; CHECK-NEXT:    subs x9, x0, x1
; CHECK-NEXT:    sbc x0, x9, x8
; CHECK-NEXT:    ret
  %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %ov, 0
  %bit = extractvalue { i64, i1 } %ov, 1
  %ext = sext i1 %bit to i64
  %r = add i64 %val, %ext
  %r2 = add i64 %c, %r
  ret i64 %r2
}
91+
92+
; Fold should fire for non-constant addend in commuted form too — 32-bit variant.
define i32 @g_nonconstant_commuted_i32(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK-LABEL: g_nonconstant_commuted_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg w8, w2
; CHECK-NEXT:    subs w9, w0, w1
; CHECK-NEXT:    sbc w0, w9, w8
; CHECK-NEXT:    ret
  %ov = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %ov, 0
  %bit = extractvalue { i32, i1 } %ov, 1
  %ext = sext i1 %bit to i32
  %r = add i32 %val, %ext
  %r2 = add i32 %c, %r
  ret i32 %r2
}
108+
109+
; Multiple uses of the SBC result should not generate the fold (the combine
; requires hasOneUse on the SBC, since the un-added value is still needed).
define i64 @g_multi_use(i64 %a, i64 %b, ptr %out) nounwind {
; CHECK-LABEL: g_multi_use:
; CHECK:       // %bb.0:
; CHECK-NEXT:    subs x8, x0, x1
; CHECK-NEXT:    sbc x8, x8, xzr
; CHECK-NEXT:    add x0, x8, #10
; CHECK-NEXT:    str x8, [x2]
; CHECK-NEXT:    ret
  %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %ov, 0
  %bit = extractvalue { i64, i1 } %ov, 1
  %ext = sext i1 %bit to i64
  %sbc = add i64 %val, %ext
  store i64 %sbc, ptr %out
  %r = add i64 %sbc, 10
  ret i64 %r
}
127+
128+
; Multiple uses of the SBC result should not generate the fold — 32-bit variant.
define i32 @g_multi_use_i32(i32 %a, i32 %b, ptr %out) nounwind {
; CHECK-LABEL: g_multi_use_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    subs w8, w0, w1
; CHECK-NEXT:    sbc w8, w8, wzr
; CHECK-NEXT:    add w0, w8, #10
; CHECK-NEXT:    str w8, [x2]
; CHECK-NEXT:    ret
  %ov = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %ov, 0
  %bit = extractvalue { i32, i1 } %ov, 1
  %ext = sext i1 %bit to i32
  %sbc = add i32 %val, %ext
  store i32 %sbc, ptr %out
  %r = add i32 %sbc, 10
  ret i32 %r
}

0 commit comments

Comments
 (0)