Skip to content

Commit af666fb

Browse files
xiongzile, linuxlonelyeagle
authored and committed
[CIR] add pairwise-addition-and-widen support (llvm#191845)
Part of llvm#185382
1 parent 438847b commit af666fb

3 files changed

Lines changed: 401 additions & 315 deletions

File tree

clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,29 @@ static mlir::Value emitNeonCall(CIRGenModule &cgm, CIRGenBuilderTy &builder,
223223
isConstrainedFPIntrinsic, shift, rightshift);
224224
}
225225

226+
// Derives the operand vector type of a NEON pairwise widening addition (the
227+
// vpaddl builtins and the pairwise step of vpadal) from its result type: the
228+
// element width is halved and the lane count doubled, so total size is kept.
229+
// Lanes are unsigned iff usgn; resType must have 16/32/64-bit integer lanes.
230+
static cir::VectorType getNeonPairwiseWidenInputType(cir::VectorType resType,
231+
bool usgn) {
232+
mlir::Type elemTy = resType.getElementType();
233+
uint64_t resLanes = resType.getSize();
234+
auto intTy = mlir::dyn_cast<cir::IntType>(elemTy);
235+
assert(intTy && "vpaddl result type must be an integer vector");
236+
237+
unsigned resWidth = intTy.getWidth();
238+
assert((resWidth == 16 || resWidth == 32 || resWidth == 64) &&
239+
"unexpected vpaddl result element width");
240+
241+
unsigned argWidth = resWidth / 2;
242+
unsigned argLanes = resLanes * 2;
243+
cir::VectorType result = cir::VectorType::get(
244+
cir::IntType::get(resType.getContext(), argWidth, /* is_signed */ !usgn),
245+
argLanes);
246+
return result;
247+
}
248+
226249
static mlir::Value emitCommonNeonSISDBuiltinExpr(
227250
CIRGenFunction &cgf, const ARMVectorIntrinsicInfo &info,
228251
llvm::SmallVectorImpl<mlir::Value> &ops, const CallExpr *expr) {
@@ -439,7 +462,6 @@ static mlir::Value emitCommonNeonBuiltinExpr(
439462
CIRGenFunction &cgf, unsigned builtinID, unsigned llvmIntrinsic,
440463
unsigned altLLVMIntrinsic, const char *nameHint, unsigned modifier,
441464
const CallExpr *expr, llvm::SmallVectorImpl<mlir::Value> &ops) {
442-
443465
mlir::Location loc = cgf.getLoc(expr->getExprLoc());
444466
clang::ASTContext &ctx = cgf.getContext();
445467

@@ -663,8 +685,21 @@ static mlir::Value emitCommonNeonBuiltinExpr(
663685
case NEON::BI__builtin_neon_vmull_v:
664686
case NEON::BI__builtin_neon_vpadal_v:
665687
case NEON::BI__builtin_neon_vpadalq_v:
688+
cgf.cgm.errorNYI(expr->getSourceRange(),
689+
std::string("Reached code-path for ARM builtin call ") +
690+
ctx.BuiltinInfo.getName(builtinID) +
691+
"(ARM builtins are not supported ATM)");
692+
return mlir::Value{};
666693
case NEON::BI__builtin_neon_vpaddl_v:
667-
case NEON::BI__builtin_neon_vpaddlq_v:
694+
case NEON::BI__builtin_neon_vpaddlq_v: {
695+
llvm::StringRef llvmIntrName =
696+
getLLVMIntrNameNoPrefix(static_cast<llvm::Intrinsic::ID>(
697+
usgn ? llvmIntrinsic : altLLVMIntrinsic));
698+
return emitNeonCall(cgf.getCIRGenModule(), cgf.getBuilder(),
699+
/*argTypes=*/{getNeonPairwiseWidenInputType(vTy, usgn)},
700+
ops, llvmIntrName,
701+
/*funcResTy=*/vTy, loc);
702+
}
668703
case NEON::BI__builtin_neon_vqdmlal_v:
669704
case NEON::BI__builtin_neon_vqdmlsl_v:
670705
case NEON::BI__builtin_neon_vqdmulhq_lane_v:
@@ -2453,11 +2488,15 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
24532488
intrName = "aarch64.neon.fabd";
24542489
return emitNeonCall(cgm, builder, {ty, ty}, ops, intrName, ty, loc);
24552490
case NEON::BI__builtin_neon_vpadal_v:
2456-
case NEON::BI__builtin_neon_vpadalq_v:
2457-
cgm.errorNYI(expr->getSourceRange(),
2458-
std::string("unimplemented AArch64 builtin call: ") +
2459-
getContext().BuiltinInfo.getName(builtinID));
2460-
return mlir::Value{};
2491+
case NEON::BI__builtin_neon_vpadalq_v: {
2492+
intrName = usgn ? "aarch64.neon.uaddlp" : "aarch64.neon.saddlp";
2493+
llvm::SmallVector<mlir::Value> inputs{ops[1]};
2494+
mlir::Value pairwiseSum =
2495+
emitNeonCall(cgm, builder, {getNeonPairwiseWidenInputType(ty, usgn)},
2496+
inputs, intrName, ty, loc);
2497+
mlir::Value accumValue = builder.createBitcast(loc, ops[0], ty);
2498+
return cir::AddOp::create(builder, loc, ty, pairwiseSum, accumValue);
2499+
}
24612500
case NEON::BI__builtin_neon_vpmin_v:
24622501
case NEON::BI__builtin_neon_vpminq_v:
24632502
intrName = usgn ? "aarch64.neon.uminp" : "aarch64.neon.sminp";

clang/test/CodeGen/AArch64/neon-misc.c

Lines changed: 0 additions & 308 deletions
Original file line numberDiff line numberDiff line change
@@ -995,314 +995,6 @@ float32x4_t test_vrev64q_f32(float32x4_t a) {
995995
return vrev64q_f32(a);
996996
}
997997

998-
// CHECK-LABEL: define dso_local <4 x i16> @test_vpaddl_s8(
999-
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
1000-
// CHECK-NEXT: [[ENTRY:.*:]]
1001-
// CHECK-NEXT: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> [[A]])
1002-
// CHECK-NEXT: ret <4 x i16> [[VPADDL_I]]
1003-
//
1004-
int16x4_t test_vpaddl_s8(int8x8_t a) {
1005-
return vpaddl_s8(a);
1006-
}
1007-
1008-
// CHECK-LABEL: define dso_local <2 x i32> @test_vpaddl_s16(
1009-
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
1010-
// CHECK-NEXT: [[ENTRY:.*:]]
1011-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
1012-
// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1013-
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]])
1014-
// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]]
1015-
//
1016-
int32x2_t test_vpaddl_s16(int16x4_t a) {
1017-
return vpaddl_s16(a);
1018-
}
1019-
1020-
// CHECK-LABEL: define dso_local <1 x i64> @test_vpaddl_s32(
1021-
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
1022-
// CHECK-NEXT: [[ENTRY:.*:]]
1023-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
1024-
// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1025-
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]])
1026-
// CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]]
1027-
//
1028-
int64x1_t test_vpaddl_s32(int32x2_t a) {
1029-
return vpaddl_s32(a);
1030-
}
1031-
1032-
// CHECK-LABEL: define dso_local <4 x i16> @test_vpaddl_u8(
1033-
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
1034-
// CHECK-NEXT: [[ENTRY:.*:]]
1035-
// CHECK-NEXT: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> [[A]])
1036-
// CHECK-NEXT: ret <4 x i16> [[VPADDL_I]]
1037-
//
1038-
uint16x4_t test_vpaddl_u8(uint8x8_t a) {
1039-
return vpaddl_u8(a);
1040-
}
1041-
1042-
// CHECK-LABEL: define dso_local <2 x i32> @test_vpaddl_u16(
1043-
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
1044-
// CHECK-NEXT: [[ENTRY:.*:]]
1045-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
1046-
// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1047-
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]])
1048-
// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]]
1049-
//
1050-
uint32x2_t test_vpaddl_u16(uint16x4_t a) {
1051-
return vpaddl_u16(a);
1052-
}
1053-
1054-
// CHECK-LABEL: define dso_local <1 x i64> @test_vpaddl_u32(
1055-
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
1056-
// CHECK-NEXT: [[ENTRY:.*:]]
1057-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
1058-
// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1059-
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]])
1060-
// CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]]
1061-
//
1062-
uint64x1_t test_vpaddl_u32(uint32x2_t a) {
1063-
return vpaddl_u32(a);
1064-
}
1065-
1066-
// CHECK-LABEL: define dso_local <8 x i16> @test_vpaddlq_s8(
1067-
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
1068-
// CHECK-NEXT: [[ENTRY:.*:]]
1069-
// CHECK-NEXT: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> [[A]])
1070-
// CHECK-NEXT: ret <8 x i16> [[VPADDL_I]]
1071-
//
1072-
int16x8_t test_vpaddlq_s8(int8x16_t a) {
1073-
return vpaddlq_s8(a);
1074-
}
1075-
1076-
// CHECK-LABEL: define dso_local <4 x i32> @test_vpaddlq_s16(
1077-
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
1078-
// CHECK-NEXT: [[ENTRY:.*:]]
1079-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
1080-
// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1081-
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]])
1082-
// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]]
1083-
//
1084-
int32x4_t test_vpaddlq_s16(int16x8_t a) {
1085-
return vpaddlq_s16(a);
1086-
}
1087-
1088-
// CHECK-LABEL: define dso_local <2 x i64> @test_vpaddlq_s32(
1089-
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
1090-
// CHECK-NEXT: [[ENTRY:.*:]]
1091-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
1092-
// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1093-
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]])
1094-
// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]]
1095-
//
1096-
int64x2_t test_vpaddlq_s32(int32x4_t a) {
1097-
return vpaddlq_s32(a);
1098-
}
1099-
1100-
// CHECK-LABEL: define dso_local <8 x i16> @test_vpaddlq_u8(
1101-
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
1102-
// CHECK-NEXT: [[ENTRY:.*:]]
1103-
// CHECK-NEXT: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> [[A]])
1104-
// CHECK-NEXT: ret <8 x i16> [[VPADDL_I]]
1105-
//
1106-
uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
1107-
return vpaddlq_u8(a);
1108-
}
1109-
1110-
// CHECK-LABEL: define dso_local <4 x i32> @test_vpaddlq_u16(
1111-
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
1112-
// CHECK-NEXT: [[ENTRY:.*:]]
1113-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
1114-
// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1115-
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]])
1116-
// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]]
1117-
//
1118-
uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
1119-
return vpaddlq_u16(a);
1120-
}
1121-
1122-
// CHECK-LABEL: define dso_local <2 x i64> @test_vpaddlq_u32(
1123-
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
1124-
// CHECK-NEXT: [[ENTRY:.*:]]
1125-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
1126-
// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1127-
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]])
1128-
// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]]
1129-
//
1130-
uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
1131-
return vpaddlq_u32(a);
1132-
}
1133-
1134-
// CHECK-LABEL: define dso_local <4 x i16> @test_vpadal_s8(
1135-
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
1136-
// CHECK-NEXT: [[ENTRY:.*:]]
1137-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
1138-
// CHECK-NEXT: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> [[B]])
1139-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1140-
// CHECK-NEXT: [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
1141-
// CHECK-NEXT: ret <4 x i16> [[TMP2]]
1142-
//
1143-
int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
1144-
return vpadal_s8(a, b);
1145-
}
1146-
1147-
// CHECK-LABEL: define dso_local <2 x i32> @test_vpadal_s16(
1148-
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
1149-
// CHECK-NEXT: [[ENTRY:.*:]]
1150-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
1151-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
1152-
// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1153-
// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]])
1154-
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1155-
// CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
1156-
// CHECK-NEXT: ret <2 x i32> [[TMP3]]
1157-
//
1158-
int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
1159-
return vpadal_s16(a, b);
1160-
}
1161-
1162-
// CHECK-LABEL: define dso_local <1 x i64> @test_vpadal_s32(
1163-
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
1164-
// CHECK-NEXT: [[ENTRY:.*:]]
1165-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
1166-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
1167-
// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1168-
// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]])
1169-
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
1170-
// CHECK-NEXT: [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
1171-
// CHECK-NEXT: ret <1 x i64> [[TMP3]]
1172-
//
1173-
int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
1174-
return vpadal_s32(a, b);
1175-
}
1176-
1177-
// CHECK-LABEL: define dso_local <4 x i16> @test_vpadal_u8(
1178-
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
1179-
// CHECK-NEXT: [[ENTRY:.*:]]
1180-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
1181-
// CHECK-NEXT: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> [[B]])
1182-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1183-
// CHECK-NEXT: [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
1184-
// CHECK-NEXT: ret <4 x i16> [[TMP2]]
1185-
//
1186-
uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
1187-
return vpadal_u8(a, b);
1188-
}
1189-
1190-
// CHECK-LABEL: define dso_local <2 x i32> @test_vpadal_u16(
1191-
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
1192-
// CHECK-NEXT: [[ENTRY:.*:]]
1193-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
1194-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
1195-
// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1196-
// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]])
1197-
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1198-
// CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
1199-
// CHECK-NEXT: ret <2 x i32> [[TMP3]]
1200-
//
1201-
uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
1202-
return vpadal_u16(a, b);
1203-
}
1204-
1205-
// CHECK-LABEL: define dso_local <1 x i64> @test_vpadal_u32(
1206-
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
1207-
// CHECK-NEXT: [[ENTRY:.*:]]
1208-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
1209-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
1210-
// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1211-
// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]])
1212-
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
1213-
// CHECK-NEXT: [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
1214-
// CHECK-NEXT: ret <1 x i64> [[TMP3]]
1215-
//
1216-
uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
1217-
return vpadal_u32(a, b);
1218-
}
1219-
1220-
// CHECK-LABEL: define dso_local <8 x i16> @test_vpadalq_s8(
1221-
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
1222-
// CHECK-NEXT: [[ENTRY:.*:]]
1223-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
1224-
// CHECK-NEXT: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> [[B]])
1225-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1226-
// CHECK-NEXT: [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
1227-
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
1228-
//
1229-
int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
1230-
return vpadalq_s8(a, b);
1231-
}
1232-
1233-
// CHECK-LABEL: define dso_local <4 x i32> @test_vpadalq_s16(
1234-
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
1235-
// CHECK-NEXT: [[ENTRY:.*:]]
1236-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
1237-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
1238-
// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
1239-
// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]])
1240-
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1241-
// CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
1242-
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
1243-
//
1244-
int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
1245-
return vpadalq_s16(a, b);
1246-
}
1247-
1248-
// CHECK-LABEL: define dso_local <2 x i64> @test_vpadalq_s32(
1249-
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
1250-
// CHECK-NEXT: [[ENTRY:.*:]]
1251-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
1252-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
1253-
// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
1254-
// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]])
1255-
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
1256-
// CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
1257-
// CHECK-NEXT: ret <2 x i64> [[TMP3]]
1258-
//
1259-
int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
1260-
return vpadalq_s32(a, b);
1261-
}
1262-
1263-
// CHECK-LABEL: define dso_local <8 x i16> @test_vpadalq_u8(
1264-
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
1265-
// CHECK-NEXT: [[ENTRY:.*:]]
1266-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
1267-
// CHECK-NEXT: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> [[B]])
1268-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1269-
// CHECK-NEXT: [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
1270-
// CHECK-NEXT: ret <8 x i16> [[TMP2]]
1271-
//
1272-
uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
1273-
return vpadalq_u8(a, b);
1274-
}
1275-
1276-
// CHECK-LABEL: define dso_local <4 x i32> @test_vpadalq_u16(
1277-
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
1278-
// CHECK-NEXT: [[ENTRY:.*:]]
1279-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
1280-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
1281-
// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
1282-
// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]])
1283-
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1284-
// CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
1285-
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
1286-
//
1287-
uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
1288-
return vpadalq_u16(a, b);
1289-
}
1290-
1291-
// CHECK-LABEL: define dso_local <2 x i64> @test_vpadalq_u32(
1292-
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
1293-
// CHECK-NEXT: [[ENTRY:.*:]]
1294-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
1295-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
1296-
// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
1297-
// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]])
1298-
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
1299-
// CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
1300-
// CHECK-NEXT: ret <2 x i64> [[TMP3]]
1301-
//
1302-
uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
1303-
return vpadalq_u32(a, b);
1304-
}
1305-
1306998
// CHECK-LABEL: define dso_local <8 x i8> @test_vqabs_s8(
1307999
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
13081000
// CHECK-NEXT: [[ENTRY:.*:]]

0 commit comments

Comments
 (0)