Skip to content

Commit 00cb7fb

Browse files
authored
[AARCH64] Add intrinsic support for new s/udot intrinsics (#189424)
The intrinsics are based on the following [proposal](ARM-software/acle#428). These are:

```
svint16_t svdot_s16_s8(svint16_t zda, svint8_t zn, svint8_t zm);
svint16_t svdot_n_s16_s8(svint16_t zda, svint8_t zn, int8_t zm);
svint16_t svdot_lane_s16_s8(svint16_t zda, svint8_t zn, svint8_t zm, uint64_t imm_idx);
svuint16_t svdot_u16_u8(svuint16_t zda, svuint8_t zn, svuint8_t zm);
svuint16_t svdot_n_u16_u8(svuint16_t zda, svuint8_t zn, uint8_t zm);
svuint16_t svdot_lane_u16_u8(svuint16_t zda, svuint8_t zn, svuint8_t zm, uint64_t imm_idx);
```
1 parent 8be29ed commit 00cb7fb

10 files changed

Lines changed: 399 additions & 4 deletions

clang/include/clang/Basic/arm_sve.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2526,3 +2526,13 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2p2" in {
25262526
def FMUL_X2 : SInst<"svmul[_{d}_x2]", "222", "hfd", MergeNone, "aarch64_sve_fmul_x2", [IsStreaming], []>;
25272527
def FMUL_X4 : SInst<"svmul[_{d}_x4]", "444", "hfd", MergeNone, "aarch64_sve_fmul_x4", [IsStreaming], []>;
25282528
}
2529+
2530+
let SVETargetGuard = "sve2p3|sme2p3", SMETargetGuard = "sve2p3|sme2p3" in {
2531+
def SVDOT_X2_SH : SInst<"svdot[_{d}_{2}]", "ddhh", "s", MergeNone, "aarch64_sve_sdot_x2", [VerifyRuntimeMode], []>;
2532+
def SVDOT_X2_UH : SInst<"svdot[_{d}_{2}]", "ddhh", "Us", MergeNone, "aarch64_sve_udot_x2", [VerifyRuntimeMode], []>;
2533+
def SVDOT_N_X2_SH : SInst<"svdot[_n_{d}_{2}]", "ddhR", "s", MergeNone, "aarch64_sve_sdot_x2", [VerifyRuntimeMode], []>;
2534+
def SVDOT_N_X2_UH : SInst<"svdot[_n_{d}_{2}]", "ddhR", "Us", MergeNone, "aarch64_sve_udot_x2", [VerifyRuntimeMode], []>;
2535+
2536+
def SVDOT_LANE_X2_SH : SInst<"svdot_lane[_{d}_{2}]", "ddhhi", "s", MergeNone, "aarch64_sve_sdot_lane_x2", [VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>;
2537+
def SVDOT_LANE_X2_UH : SInst<"svdot_lane[_{d}_{2}]", "ddhhi", "Us", MergeNone, "aarch64_sve_udot_lane_x2", [VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>;
2538+
}
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2+
// REQUIRES: aarch64-registered-target
3+
// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
4+
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
5+
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2p3 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
6+
// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2p3 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
7+
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2p3 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
8+
9+
#include <arm_sve.h>
10+
11+
#if defined(__ARM_FEATURE_SME)
12+
#define ATTR __arm_streaming
13+
#else
14+
#define ATTR
15+
#endif
16+
17+
#ifdef SVE_OVERLOADED_FORMS
18+
// A simple used,unused... macro, long enough to represent any SVE builtin.
19+
#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1
20+
#else
21+
#define SVE_ACLE_FUNC(A1,A2) A1##A2
22+
#endif
23+
24+
// CHECK-LABEL: @test_svdot_s16_x2(
25+
// CHECK-NEXT: entry:
26+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sdot.x2.nxv8i16(<vscale x 8 x i16> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]])
27+
// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
28+
//
29+
// CPP-CHECK-LABEL: @_Z17test_svdot_s16_x2u11__SVInt16_tu10__SVInt8_tS0_(
30+
// CPP-CHECK-NEXT: entry:
31+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sdot.x2.nxv8i16(<vscale x 8 x i16> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]])
32+
// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
33+
//
34+
svint16_t test_svdot_s16_x2(svint16_t op1, svint8_t op2, svint8_t op3) ATTR
35+
{
36+
return SVE_ACLE_FUNC(svdot,_s16_s8)(op1, op2, op3);
37+
}
38+
39+
// CHECK-LABEL: @test_svdot_u16_x2(
40+
// CHECK-NEXT: entry:
41+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.udot.x2.nxv8i16(<vscale x 8 x i16> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]])
42+
// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
43+
//
44+
// CPP-CHECK-LABEL: @_Z17test_svdot_u16_x2u12__SVUint16_tu11__SVUint8_tS0_(
45+
// CPP-CHECK-NEXT: entry:
46+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.udot.x2.nxv8i16(<vscale x 8 x i16> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]])
47+
// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
48+
//
49+
svuint16_t test_svdot_u16_x2(svuint16_t op1, svuint8_t op2, svuint8_t op3) ATTR
50+
{
51+
return SVE_ACLE_FUNC(svdot,_u16_u8)(op1, op2, op3);
52+
}
53+
54+
// CHECK-LABEL: @test_svdot_n_s16_x2(
55+
// CHECK-NEXT: entry:
56+
// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[OP3:%.*]], i64 0
57+
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
58+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sdot.x2.nxv8i16(<vscale x 8 x i16> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
59+
// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
60+
//
61+
// CPP-CHECK-LABEL: @_Z19test_svdot_n_s16_x2u11__SVInt16_tu10__SVInt8_ta(
62+
// CPP-CHECK-NEXT: entry:
63+
// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[OP3:%.*]], i64 0
64+
// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
65+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sdot.x2.nxv8i16(<vscale x 8 x i16> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
66+
// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
67+
//
68+
svint16_t test_svdot_n_s16_x2(svint16_t op1, svint8_t op2, int8_t op3) ATTR
69+
{
70+
return SVE_ACLE_FUNC(svdot,_n_s16_s8)(op1, op2, op3);
71+
}
72+
73+
// CHECK-LABEL: @test_svdot_n_u16_x2(
74+
// CHECK-NEXT: entry:
75+
// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[OP3:%.*]], i64 0
76+
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
77+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.udot.x2.nxv8i16(<vscale x 8 x i16> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
78+
// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
79+
//
80+
// CPP-CHECK-LABEL: @_Z19test_svdot_n_u16_x2u12__SVUint16_tu11__SVUint8_th(
81+
// CPP-CHECK-NEXT: entry:
82+
// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[OP3:%.*]], i64 0
83+
// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
84+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.udot.x2.nxv8i16(<vscale x 8 x i16> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[DOTSPLAT]])
85+
// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
86+
//
87+
svuint16_t test_svdot_n_u16_x2(svuint16_t op1, svuint8_t op2, uint8_t op3) ATTR
88+
{
89+
return SVE_ACLE_FUNC(svdot,_n_u16_u8)(op1, op2, op3);
90+
}
91+
92+
// CHECK-LABEL: @test_svdot_lane_s16_x2(
93+
// CHECK-NEXT: entry:
94+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sdot.lane.x2.nxv8i16(<vscale x 8 x i16> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]], i32 7)
95+
// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
96+
//
97+
// CPP-CHECK-LABEL: @_Z22test_svdot_lane_s16_x2u11__SVInt16_tu10__SVInt8_tS0_(
98+
// CPP-CHECK-NEXT: entry:
99+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sdot.lane.x2.nxv8i16(<vscale x 8 x i16> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]], i32 7)
100+
// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
101+
//
102+
svint16_t test_svdot_lane_s16_x2(svint16_t op1, svint8_t op2, svint8_t op3) ATTR
103+
{
104+
return SVE_ACLE_FUNC(svdot_lane,_s16_s8)(op1, op2, op3, 7);
105+
}
106+
107+
// CHECK-LABEL: @test_svdot_lane_u16_x2(
108+
// CHECK-NEXT: entry:
109+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.udot.lane.x2.nxv8i16(<vscale x 8 x i16> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]], i32 7)
110+
// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
111+
//
112+
// CPP-CHECK-LABEL: @_Z22test_svdot_lane_u16_x2u12__SVUint16_tu11__SVUint8_tS0_(
113+
// CPP-CHECK-NEXT: entry:
114+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.udot.lane.x2.nxv8i16(<vscale x 8 x i16> [[OP1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]], i32 7)
115+
// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
116+
//
117+
svuint16_t test_svdot_lane_u16_x2(svuint16_t op1, svuint8_t op2, svuint8_t op3) ATTR
118+
{
119+
return SVE_ACLE_FUNC(svdot_lane,_u16_u8)(op1, op2, op3, 7);
120+
}

clang/test/Sema/AArch64/arm_sve_feature_dependent_sve_AND_LP_sve2p3_OR_sme2p3_RP___sme_AND_LP_sve2p3_OR_sme2p3_RP.c

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,18 @@ void test(void) {
8686
svaddsubp_u16(svuint16_t_val, svuint16_t_val);
8787
svaddsubp_u32(svuint32_t_val, svuint32_t_val);
8888
svaddsubp_u64(svuint64_t_val, svuint64_t_val);
89+
svdot(svint16_t_val, svint8_t_val, int8_t_val);
90+
svdot(svint16_t_val, svint8_t_val, svint8_t_val);
91+
svdot(svuint16_t_val, svuint8_t_val, svuint8_t_val);
92+
svdot(svuint16_t_val, svuint8_t_val, uint8_t_val);
93+
svdot_lane(svint16_t_val, svint8_t_val, svint8_t_val, 2);
94+
svdot_lane(svuint16_t_val, svuint8_t_val, svuint8_t_val, 2);
95+
svdot_lane_s16_s8(svint16_t_val, svint8_t_val, svint8_t_val, 2);
96+
svdot_lane_u16_u8(svuint16_t_val, svuint8_t_val, svuint8_t_val, 2);
97+
svdot_n_s16_s8(svint16_t_val, svint8_t_val, int8_t_val);
98+
svdot_n_u16_u8(svuint16_t_val, svuint8_t_val, uint8_t_val);
99+
svdot_s16_s8(svint16_t_val, svint8_t_val, svint8_t_val);
100+
svdot_u16_u8(svuint16_t_val, svuint8_t_val, svuint8_t_val);
89101
svqrshrn_n_s8_s16_x2(svint16x2_t_val, 2);
90102
svqrshrn_n_u8_u16_x2(svuint16x2_t_val, 2);
91103
svqrshrn_s8(svint16x2_t_val, 2);
@@ -231,6 +243,18 @@ void test_streaming(void) __arm_streaming{
231243
svaddsubp_u16(svuint16_t_val, svuint16_t_val);
232244
svaddsubp_u32(svuint32_t_val, svuint32_t_val);
233245
svaddsubp_u64(svuint64_t_val, svuint64_t_val);
246+
svdot(svint16_t_val, svint8_t_val, int8_t_val);
247+
svdot(svint16_t_val, svint8_t_val, svint8_t_val);
248+
svdot(svuint16_t_val, svuint8_t_val, svuint8_t_val);
249+
svdot(svuint16_t_val, svuint8_t_val, uint8_t_val);
250+
svdot_lane(svint16_t_val, svint8_t_val, svint8_t_val, 2);
251+
svdot_lane(svuint16_t_val, svuint8_t_val, svuint8_t_val, 2);
252+
svdot_lane_s16_s8(svint16_t_val, svint8_t_val, svint8_t_val, 2);
253+
svdot_lane_u16_u8(svuint16_t_val, svuint8_t_val, svuint8_t_val, 2);
254+
svdot_n_s16_s8(svint16_t_val, svint8_t_val, int8_t_val);
255+
svdot_n_u16_u8(svuint16_t_val, svuint8_t_val, uint8_t_val);
256+
svdot_s16_s8(svint16_t_val, svint8_t_val, svint8_t_val);
257+
svdot_u16_u8(svuint16_t_val, svuint8_t_val, svuint8_t_val);
234258
svqrshrn_n_s8_s16_x2(svint16x2_t_val, 2);
235259
svqrshrn_n_u8_u16_x2(svuint16x2_t_val, 2);
236260
svqrshrn_s8(svint16x2_t_val, 2);
@@ -376,6 +400,18 @@ void test_streaming_compatible(void) __arm_streaming_compatible{
376400
svaddsubp_u16(svuint16_t_val, svuint16_t_val);
377401
svaddsubp_u32(svuint32_t_val, svuint32_t_val);
378402
svaddsubp_u64(svuint64_t_val, svuint64_t_val);
403+
svdot(svint16_t_val, svint8_t_val, int8_t_val);
404+
svdot(svint16_t_val, svint8_t_val, svint8_t_val);
405+
svdot(svuint16_t_val, svuint8_t_val, svuint8_t_val);
406+
svdot(svuint16_t_val, svuint8_t_val, uint8_t_val);
407+
svdot_lane(svint16_t_val, svint8_t_val, svint8_t_val, 2);
408+
svdot_lane(svuint16_t_val, svuint8_t_val, svuint8_t_val, 2);
409+
svdot_lane_s16_s8(svint16_t_val, svint8_t_val, svint8_t_val, 2);
410+
svdot_lane_u16_u8(svuint16_t_val, svuint8_t_val, svuint8_t_val, 2);
411+
svdot_n_s16_s8(svint16_t_val, svint8_t_val, int8_t_val);
412+
svdot_n_u16_u8(svuint16_t_val, svuint8_t_val, uint8_t_val);
413+
svdot_s16_s8(svint16_t_val, svint8_t_val, svint8_t_val);
414+
svdot_u16_u8(svuint16_t_val, svuint8_t_val, svuint8_t_val);
379415
svqrshrn_n_s8_s16_x2(svint16x2_t_val, 2);
380416
svqrshrn_n_u8_u16_x2(svuint16x2_t_val, 2);
381417
svqrshrn_s8(svint16x2_t_val, 2);

0 commit comments

Comments
 (0)