diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index be906d0671e3a..7c37375d22c77 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -2742,17 +2742,38 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr, llvm::SmallVector fmaOps = {laneSource, multiplicand, addend}; return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps); } - case NEON::BI__builtin_neon_vfmaq_laneq_v: + case NEON::BI__builtin_neon_vfmaq_laneq_v: { + mlir::Value addend = builder.createBitcast(ops[0], ty); + mlir::Value multiplicand = builder.createBitcast(ops[1], ty); + mlir::Value laneSource = builder.createBitcast(ops[2], ty); + laneSource = emitNeonSplat(builder, loc, laneSource, ops[3], ty.getSize()); + + llvm::SmallVector fmaOps = {laneSource, multiplicand, addend}; + return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps); + } case NEON::BI__builtin_neon_vfmah_lane_f16: case NEON::BI__builtin_neon_vfmas_lane_f32: case NEON::BI__builtin_neon_vfmah_laneq_f16: case NEON::BI__builtin_neon_vfmas_laneq_f32: case NEON::BI__builtin_neon_vfmad_lane_f64: - case NEON::BI__builtin_neon_vfmad_laneq_f64: cgm.errorNYI(expr->getSourceRange(), std::string("unimplemented AArch64 builtin call: ") + getContext().BuiltinInfo.getName(builtinID)); return mlir::Value{}; + case NEON::BI__builtin_neon_vfmad_laneq_f64: { + mlir::Value addend = builder.createBitcast(ops[0], cgm.doubleTy); + mlir::Value multiplicand = builder.createBitcast(ops[1], cgm.doubleTy); + // The laneq source operand is float64x2_t, so the source vector has two + // double lanes. + cir::VectorType sourceTy = cir::VectorType::get(cgm.doubleTy, 2); + mlir::Value laneSource = builder.createBitcast(ops[2], sourceTy); + laneSource = builder.createExtractElement( + loc, laneSource, static_cast(getIntValueFromConstOp(ops[3]))); + + llvm::SmallVector fmaOps = {multiplicand, laneSource, addend}; + return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", cgm.doubleTy, + fmaOps); + } case NEON::BI__builtin_neon_vmull_v: { intrName = usgn ? "aarch64.neon.umull" : "aarch64.neon.smull"; if (type.isPoly()) diff --git a/clang/test/CodeGen/AArch64/neon-2velem.c b/clang/test/CodeGen/AArch64/neon-2velem.c index 89fdb979d8a98..c7eca2d8426c6 100644 --- a/clang/test/CodeGen/AArch64/neon-2velem.c +++ b/clang/test/CodeGen/AArch64/neon-2velem.c @@ -424,25 +424,6 @@ float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { return vfma_lane_f32(a, b, v, 1); } -// CHECK-LABEL: @test_vfmaq_laneq_f32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> -// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <4 x i32> -// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP7]], <4 x float> [[TMP6]]) -// CHECK-NEXT: ret <4 x float> [[TMP9]] -// -float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { - return vfmaq_laneq_f32(a, b, v, 3); -} - // CHECK-LABEL: @test_vfms_lane_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> @@ -523,25 +504,6 @@ float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { return vfmsq_laneq_f32(a, b, v, 3); } -// CHECK-LABEL: @test_vfmaq_laneq_f64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> -// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP8]], <2 x i32> -// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP7]], <2 x double> [[TMP6]]) -// CHECK-NEXT: ret <2 x double> [[TMP9]] -// -float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) { - return vfmaq_laneq_f64(a, b, v, 1); -} - // CHECK-LABEL: @test_vfmsq_lane_f64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64> @@ -2509,25 +2471,6 @@ float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { return vfma_lane_f32(a, b, v, 0); } -// CHECK-LABEL: @test_vfmaq_laneq_f32_0( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> -// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP7]], <4 x float> [[TMP6]]) -// CHECK-NEXT: ret <4 x float> [[TMP9]] -// -float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { - return vfmaq_laneq_f32(a, b, v, 0); -} - // CHECK-LABEL: @test_vfms_lane_f32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> @@ -2608,25 +2551,6 @@ float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) return vfmsq_laneq_f32(a, b, v, 0); } -// CHECK-LABEL: @test_vfmaq_laneq_f64_0( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> -// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP8]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP7]], <2 x double> [[TMP6]]) -// CHECK-NEXT: ret <2 x double> [[TMP9]] -// -float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) { - return vfmaq_laneq_f64(a, b, v, 0); -} - // CHECK-LABEL: @test_vfmsq_laneq_f64_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64> diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c index b464bccdbf9ec..fdb772a79e973 100644 --- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c +++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c @@ -170,17 +170,6 @@ float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) { return vfmad_lane_f64(a, b, c, 0); } -// CHECK-LABEL: define dso_local double @test_vfmad_laneq_f64( -// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[C]], i32 1 -// CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]]) -// CHECK-NEXT: ret double [[TMP0]] -// -float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) { - return vfmad_laneq_f64(a, b, c, 1); -} - // CHECK-LABEL: define dso_local float @test_vfmss_lane_f32( // CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c index d3d6b52358678..1460fb3b2bae1 100644 --- a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c +++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c @@ -13,8 +13,8 @@ // * clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c // The main difference is the use of RUN lines that enable ClangIR lowering. // This file currently covers the f16 wrappers that lower through -// BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v, and -// BI__builtin_neon_vfma_laneq_v. +// BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v, +// BI__builtin_neon_vfmaq_laneq_v, and BI__builtin_neon_vfma_laneq_v. // // ACLE section headings based on v2025Q2 of the ACLE specification: // * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate-2 @@ -67,6 +67,28 @@ float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, return vfmaq_lane_f16(a, b, c, 3); } +// ALL-LABEL: @test_vfmaq_laneq_f16( +float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, + float16x8_t c) { +// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.f16>) [#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.f16> +// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>) -> !cir.vector<8 x !cir.f16> + +// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x half> {{.*}} [[C:%.*]]) {{.*}} { +// LLVM: [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8> +// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8> +// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8> +// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half> +// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half> +// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half> +// LLVM-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C_CAST]], <8 x half> {{.*}}, <8 x i32> +// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[B_CAST]], <8 x half> [[A_CAST]]) +// LLVM: ret <8 x half> [[FMA]] + return vfmaq_laneq_f16(a, b, c, 7); +} + // ALL-LABEL: @test_vfma_laneq_f16( float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) { diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c b/clang/test/CodeGen/AArch64/neon/fused-multiply.c index a1e3c6eeea2f2..c0b1932126a23 100644 --- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c +++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c @@ -11,10 +11,12 @@ // // This file contains tests that were originally located in: // * clang/test/CodeGen/AArch64/neon-intrinsics.c +// * clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c // The main difference is the use of RUN lines that enable ClangIR lowering. // This file currently covers the f32/f64 wrappers that lower through -// BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v, and -// BI__builtin_neon_vfma_laneq_v. +// BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v, +// BI__builtin_neon_vfmaq_laneq_v, BI__builtin_neon_vfma_laneq_v, +// and BI__builtin_neon_vfmad_laneq_f64. // // ACLE section headings based on v2025Q2 of the ACLE specification: // * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate @@ -106,6 +108,50 @@ float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) { return vfmaq_lane_f64(a, b, v, 0); } +// ALL-LABEL: @test_vfmaq_laneq_f32( +float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, + float32x4_t v) { +// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.float> +// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>) -> !cir.vector<4 x !cir.float> + +// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <4 x float> {{.*}} [[V:%.*]]) {{.*}} { +// LLVM: [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// LLVM-NEXT: [[V_I:%.*]] = bitcast <4 x float> [[V]] to <4 x i32> +// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8> +// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8> +// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <4 x i32> [[V_I]] to <16 x i8> +// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <4 x float> +// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <4 x float> +// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <4 x float> +// LLVM-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V_CAST]], <4 x float> {{.*}}, <4 x i32> +// LLVM-NEXT: [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[B_CAST]], <4 x float> [[A_CAST]]) +// LLVM: ret <4 x float> [[FMA]] + return vfmaq_laneq_f32(a, b, v, 3); +} + +// ALL-LABEL: @test_vfmaq_laneq_f64( +float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, + float64x2_t v) { +// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.double> +// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>) -> !cir.vector<2 x !cir.double> + +// LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <2 x double> {{.*}} [[V:%.*]]) {{.*}} { +// LLVM: [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64> +// LLVM-NEXT: [[V_I:%.*]] = bitcast <2 x double> [[V]] to <2 x i64> +// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i64> [[A_I]] to <16 x i8> +// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i64> [[B_I]] to <16 x i8> +// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <2 x i64> [[V_I]] to <16 x i8> +// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <2 x double> +// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <2 x double> +// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <2 x double> +// LLVM-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V_CAST]], <2 x double> {{.*}}, <2 x i32> +// LLVM-NEXT: [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[B_CAST]], <2 x double> [[A_CAST]]) +// LLVM: ret <2 x double> [[FMA]] + return vfmaq_laneq_f64(a, b, v, 1); +} + // ALL-LABEL: @test_vfma_laneq_f32( float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { // CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i] : !cir.vector<2 x !cir.float> @@ -152,6 +198,62 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, return vfma_laneq_f64(a, b, v, 0); } +// ALL-LABEL: @test_vfmad_laneq_f64( +float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) { +// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : !cir.vector<2 x !cir.double> +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : (!cir.double, !cir.double, !cir.double) -> !cir.double + +// LLVM-SAME: double {{.*}} [[A:%.*]], double {{.*}} [[B:%.*]], <2 x double> {{.*}} [[C:%.*]]) {{.*}} { +// LLVM: [[LANE:%.*]] = extractelement <2 x double> [[C]], i{{32|64}} 1 +// LLVM: [[FMA:%.*]] = call double @llvm.fma.f64(double [[B]], double [[LANE]], double [[A]]) +// LLVM: ret double [[FMA]] + return vfmad_laneq_f64(a, b, c, 1); +} + +// ALL-LABEL: @test_vfmaq_laneq_f32_0( +float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, + float32x4_t v) { +// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<4 x !cir.float> +// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>) -> !cir.vector<4 x !cir.float> + +// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <4 x float> {{.*}} [[V:%.*]]) {{.*}} { +// LLVM: [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// LLVM-NEXT: [[V_I:%.*]] = bitcast <4 x float> [[V]] to <4 x i32> +// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8> +// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8> +// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <4 x i32> [[V_I]] to <16 x i8> +// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <4 x float> +// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <4 x float> +// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <4 x float> +// LLVM-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V_CAST]], <4 x float> {{.*}}, <4 x i32> zeroinitializer +// LLVM-NEXT: [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[B_CAST]], <4 x float> [[A_CAST]]) +// LLVM: ret <4 x float> [[FMA]] + return vfmaq_laneq_f32(a, b, v, 0); +} + +// ALL-LABEL: @test_vfmaq_laneq_f64_0( +float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, + float64x2_t v) { +// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x !cir.double> +// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>) -> !cir.vector<2 x !cir.double> + +// LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <2 x double> {{.*}} [[V:%.*]]) {{.*}} { +// LLVM: [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64> +// LLVM-NEXT: [[V_I:%.*]] = bitcast <2 x double> [[V]] to <2 x i64> +// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i64> [[A_I]] to <16 x i8> +// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i64> [[B_I]] to <16 x i8> +// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <2 x i64> [[V_I]] to <16 x i8> +// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <2 x double> +// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <2 x double> +// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <2 x double> +// LLVM-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V_CAST]], <2 x double> {{.*}}, <2 x i32> zeroinitializer +// LLVM-NEXT: [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[B_CAST]], <2 x double> [[A_CAST]]) +// LLVM: ret <2 x double> [[FMA]] + return vfmaq_laneq_f64(a, b, v, 0); +} + // ALL-LABEL: @test_vfma_laneq_f32_0( float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c index e8f1eead2a0d5..b25b5fe90bfc5 100644 --- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c +++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c @@ -1681,26 +1681,6 @@ float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) { return vfma_lane_f16(a, b, c, 3); } -// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_laneq_f16 -// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> -// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <8 x i32> -// CHECK-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP7]], <8 x half> [[TMP6]]) -// CHECK-NEXT: ret <8 x half> [[TMP9]] -// -float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { - return vfmaq_laneq_f16(a, b, c, 7); -} - // CHECK-LABEL: define {{[^@]+}}@test_vfma_n_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: