Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2742,17 +2742,38 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
llvm::SmallVector<mlir::Value> fmaOps = {laneSource, multiplicand, addend};
return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
}
case NEON::BI__builtin_neon_vfmaq_laneq_v:
case NEON::BI__builtin_neon_vfmaq_laneq_v: {
mlir::Value addend = builder.createBitcast(ops[0], ty);
mlir::Value multiplicand = builder.createBitcast(ops[1], ty);
mlir::Value laneSource = builder.createBitcast(ops[2], ty);
laneSource = emitNeonSplat(builder, loc, laneSource, ops[3], ty.getSize());

llvm::SmallVector<mlir::Value> fmaOps = {laneSource, multiplicand, addend};
return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
}
case NEON::BI__builtin_neon_vfmah_lane_f16:
case NEON::BI__builtin_neon_vfmas_lane_f32:
case NEON::BI__builtin_neon_vfmah_laneq_f16:
case NEON::BI__builtin_neon_vfmas_laneq_f32:
case NEON::BI__builtin_neon_vfmad_lane_f64:
case NEON::BI__builtin_neon_vfmad_laneq_f64:
cgm.errorNYI(expr->getSourceRange(),
std::string("unimplemented AArch64 builtin call: ") +
getContext().BuiltinInfo.getName(builtinID));
return mlir::Value{};
case NEON::BI__builtin_neon_vfmad_laneq_f64: {
mlir::Value addend = builder.createBitcast(ops[0], cgm.doubleTy);
mlir::Value multiplicand = builder.createBitcast(ops[1], cgm.doubleTy);
// The laneq source operand is float64x2_t, so the source vector has two
// double lanes.
cir::VectorType sourceTy = cir::VectorType::get(cgm.doubleTy, 2);
mlir::Value laneSource = builder.createBitcast(ops[2], sourceTy);
Comment on lines +2764 to +2769

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these bit-casts actually required?

laneSource = builder.createExtractElement(
loc, laneSource, static_cast<uint64_t>(getIntValueFromConstOp(ops[3])));

llvm::SmallVector<mlir::Value> fmaOps = {multiplicand, laneSource, addend};
return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", cgm.doubleTy,
fmaOps);
}
case NEON::BI__builtin_neon_vmull_v: {
intrName = usgn ? "aarch64.neon.umull" : "aarch64.neon.smull";
if (type.isPoly())
Expand Down
76 changes: 0 additions & 76 deletions clang/test/CodeGen/AArch64/neon-2velem.c
Original file line number Diff line number Diff line change
Expand Up @@ -424,25 +424,6 @@ float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
return vfma_lane_f32(a, b, v, 1);
}

// CHECK-LABEL: @test_vfmaq_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP7]], <4 x float> [[TMP6]])
// CHECK-NEXT: ret <4 x float> [[TMP9]]
//
float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
return vfmaq_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: @test_vfms_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
Expand Down Expand Up @@ -523,25 +504,6 @@ float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
return vfmsq_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: @test_vfmaq_laneq_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP8]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP7]], <2 x double> [[TMP6]])
// CHECK-NEXT: ret <2 x double> [[TMP9]]
//
float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
return vfmaq_laneq_f64(a, b, v, 1);
}

// CHECK-LABEL: @test_vfmsq_lane_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
Expand Down Expand Up @@ -2509,25 +2471,6 @@ float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
return vfma_lane_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmaq_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP7]], <4 x float> [[TMP6]])
// CHECK-NEXT: ret <4 x float> [[TMP9]]
//
float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
return vfmaq_laneq_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfms_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
Expand Down Expand Up @@ -2608,25 +2551,6 @@ float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v)
return vfmsq_laneq_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmaq_laneq_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP8]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP7]], <2 x double> [[TMP6]])
// CHECK-NEXT: ret <2 x double> [[TMP9]]
//
float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
return vfmaq_laneq_f64(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmsq_laneq_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
Expand Down
11 changes: 0 additions & 11 deletions clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
Original file line number Diff line number Diff line change
Expand Up @@ -170,17 +170,6 @@ float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
return vfmad_lane_f64(a, b, c, 0);
}

// CHECK-LABEL: define dso_local double @test_vfmad_laneq_f64(
// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]])
// CHECK-NEXT: ret double [[TMP0]]
//
float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
return vfmad_laneq_f64(a, b, c, 1);
}

// CHECK-LABEL: define dso_local float @test_vfmss_lane_f32(
// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
Expand Down
26 changes: 24 additions & 2 deletions clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
// * clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
// The main difference is the use of RUN lines that enable ClangIR lowering.
// This file currently covers the f16 wrappers that lower through
// BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v, and
// BI__builtin_neon_vfma_laneq_v.
// BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v,
// BI__builtin_neon_vfmaq_laneq_v, and BI__builtin_neon_vfma_laneq_v.
//
// ACLE section headings based on v2025Q2 of the ACLE specification:
// * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate-2
Expand Down Expand Up @@ -67,6 +67,28 @@ float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b,
return vfmaq_lane_f16(a, b, c, 3);
}

// ALL-LABEL: @test_vfmaq_laneq_f16(
float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b,
float16x8_t c) {
// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.f16>) [#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.f16>
// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>) -> !cir.vector<8 x !cir.f16>

// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x half> {{.*}} [[C:%.*]]) {{.*}} {
// LLVM: [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8>
// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8>
// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8>
// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half>
// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half>
// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half>
// LLVM-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C_CAST]], <8 x half> {{.*}}, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[B_CAST]], <8 x half> [[A_CAST]])
// LLVM: ret <8 x half> [[FMA]]
return vfmaq_laneq_f16(a, b, c, 7);
}

// ALL-LABEL: @test_vfma_laneq_f16(
float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b,
float16x8_t c) {
Expand Down
106 changes: 104 additions & 2 deletions clang/test/CodeGen/AArch64/neon/fused-multiply.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@
//
// This file contains tests that were originally located in:
// * clang/test/CodeGen/AArch64/neon-intrinsics.c
// * clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
// The main difference is the use of RUN lines that enable ClangIR lowering.
// This file currently covers the f32/f64 wrappers that lower through
// BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v, and
// BI__builtin_neon_vfma_laneq_v.
// BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v,
// BI__builtin_neon_vfmaq_laneq_v, BI__builtin_neon_vfma_laneq_v,
// and BI__builtin_neon_vfmad_laneq_f64.
//
// ACLE section headings based on v2025Q2 of the ACLE specification:
// * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate
Expand Down Expand Up @@ -106,6 +108,50 @@ float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
return vfmaq_lane_f64(a, b, v, 0);
}

// ALL-LABEL: @test_vfmaq_laneq_f32(
float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b,
float32x4_t v) {
// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.float>
// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>) -> !cir.vector<4 x !cir.float>

// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <4 x float> {{.*}} [[V:%.*]]) {{.*}} {
// LLVM: [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
// LLVM-NEXT: [[V_I:%.*]] = bitcast <4 x float> [[V]] to <4 x i32>
// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8>
// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8>
// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <4 x i32> [[V_I]] to <16 x i8>
// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <4 x float>
// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <4 x float>
// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <4 x float>
// LLVM-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V_CAST]], <4 x float> {{.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// LLVM-NEXT: [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[B_CAST]], <4 x float> [[A_CAST]])
// LLVM: ret <4 x float> [[FMA]]
return vfmaq_laneq_f32(a, b, v, 3);
}

// ALL-LABEL: @test_vfmaq_laneq_f64(
float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b,
float64x2_t v) {
// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.double>
// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>) -> !cir.vector<2 x !cir.double>

// LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <2 x double> {{.*}} [[V:%.*]]) {{.*}} {
// LLVM: [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
// LLVM-NEXT: [[V_I:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i64> [[A_I]] to <16 x i8>
// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i64> [[B_I]] to <16 x i8>
// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <2 x i64> [[V_I]] to <16 x i8>
// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <2 x double>
// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <2 x double>
// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <2 x double>
// LLVM-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V_CAST]], <2 x double> {{.*}}, <2 x i32> <i32 1, i32 1>
// LLVM-NEXT: [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[B_CAST]], <2 x double> [[A_CAST]])
// LLVM: ret <2 x double> [[FMA]]
return vfmaq_laneq_f64(a, b, v, 1);
}

// ALL-LABEL: @test_vfma_laneq_f32(
float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i] : !cir.vector<2 x !cir.float>
Expand Down Expand Up @@ -152,6 +198,62 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b,
return vfma_laneq_f64(a, b, v, 0);
}

// ALL-LABEL: @test_vfmad_laneq_f64(
float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : !cir.vector<2 x !cir.double>
// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : (!cir.double, !cir.double, !cir.double) -> !cir.double

// LLVM-SAME: double {{.*}} [[A:%.*]], double {{.*}} [[B:%.*]], <2 x double> {{.*}} [[C:%.*]]) {{.*}} {
// LLVM: [[LANE:%.*]] = extractelement <2 x double> [[C]], i{{32|64}} 1
// LLVM: [[FMA:%.*]] = call double @llvm.fma.f64(double [[B]], double [[LANE]], double [[A]])
// LLVM: ret double [[FMA]]
return vfmad_laneq_f64(a, b, c, 1);
}

// ALL-LABEL: @test_vfmaq_laneq_f32_0(
float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b,
float32x4_t v) {
// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<4 x !cir.float>
// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>) -> !cir.vector<4 x !cir.float>

// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <4 x float> {{.*}} [[V:%.*]]) {{.*}} {
// LLVM: [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
// LLVM-NEXT: [[V_I:%.*]] = bitcast <4 x float> [[V]] to <4 x i32>
// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8>
// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8>
// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <4 x i32> [[V_I]] to <16 x i8>
// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <4 x float>
// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <4 x float>
// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <4 x float>
// LLVM-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V_CAST]], <4 x float> {{.*}}, <4 x i32> zeroinitializer
// LLVM-NEXT: [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[B_CAST]], <4 x float> [[A_CAST]])
// LLVM: ret <4 x float> [[FMA]]
return vfmaq_laneq_f32(a, b, v, 0);
}

// ALL-LABEL: @test_vfmaq_laneq_f64_0(
float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b,
float64x2_t v) {
// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x !cir.double>
// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>) -> !cir.vector<2 x !cir.double>

// LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <2 x double> {{.*}} [[V:%.*]]) {{.*}} {
// LLVM: [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
// LLVM-NEXT: [[V_I:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i64> [[A_I]] to <16 x i8>
// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i64> [[B_I]] to <16 x i8>
// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <2 x i64> [[V_I]] to <16 x i8>
// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <2 x double>
// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <2 x double>
// LLVM-NEXT: [[V_CAST:%.*]] = bitcast <16 x i8> [[V_BYTES]] to <2 x double>
// LLVM-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V_CAST]], <2 x double> {{.*}}, <2 x i32> zeroinitializer
// LLVM-NEXT: [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[B_CAST]], <2 x double> [[A_CAST]])
// LLVM: ret <2 x double> [[FMA]]
return vfmaq_laneq_f64(a, b, v, 0);
}

// ALL-LABEL: @test_vfma_laneq_f32_0(
float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b,
float32x4_t v) {
Expand Down
Loading
Loading