diff --git a/cmake/gen/rvvfp16arith_microkernels.cmake b/cmake/gen/rvvfp16arith_microkernels.cmake
index f690345e472..2d0e970b351 100644
--- a/cmake/gen/rvvfp16arith_microkernels.cmake
+++ b/cmake/gen/rvvfp16arith_microkernels.cmake
@@ -53,6 +53,7 @@ SET(PROD_RVVFP16ARITH_MICROKERNEL_SRCS
   src/f16-vbinary/gen/f16-vsub-rvvfp16arith-u8v.c
   src/f16-vbinary/gen/f16-vsubc-rvvfp16arith-u8v.c
   src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u8v.c
+  src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u2v.c
   src/f16-velu/gen/f16-velu-rvvfp16arith-rr1-p3-u2v.c
   src/f16-vgelu/gen/f16-vgelu-rvvfp16arith-rational-6-4-div-u2v.c
   src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c
@@ -160,6 +161,8 @@ SET(NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS
   src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u1v.c
   src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u2v.c
   src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u4v.c
+  src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u1v.c
+  src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u4v.c
   src/f16-velu/gen/f16-velu-rvvfp16arith-rr1-p3-u1v.c
   src/f16-velu/gen/f16-velu-rvvfp16arith-rr1-p3-u4v.c
   src/f16-vgelu/gen/f16-vgelu-rvvfp16arith-rational-6-4-div-u1v.c
diff --git a/gen/rvvfp16arith_microkernels.bzl b/gen/rvvfp16arith_microkernels.bzl
index c350325661f..4ea76d79825 100644
--- a/gen/rvvfp16arith_microkernels.bzl
+++ b/gen/rvvfp16arith_microkernels.bzl
@@ -49,6 +49,7 @@ PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [
     "src/f16-vbinary/gen/f16-vsub-rvvfp16arith-u8v.c",
     "src/f16-vbinary/gen/f16-vsubc-rvvfp16arith-u8v.c",
     "src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u8v.c",
+    "src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u2v.c",
     "src/f16-velu/gen/f16-velu-rvvfp16arith-rr1-p3-u2v.c",
     "src/f16-vgelu/gen/f16-vgelu-rvvfp16arith-rational-6-4-div-u2v.c",
     "src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c",
@@ -157,6 +158,8 @@ NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [
     "src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u1v.c",
     "src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u2v.c",
     "src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u4v.c",
+    "src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u1v.c",
+    "src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u4v.c",
     "src/f16-velu/gen/f16-velu-rvvfp16arith-rr1-p3-u1v.c",
     "src/f16-velu/gen/f16-velu-rvvfp16arith-rr1-p3-u4v.c",
     "src/f16-vgelu/gen/f16-vgelu-rvvfp16arith-rational-6-4-div-u1v.c",
diff --git a/scripts/generate-f32-vcmul.sh b/scripts/generate-f32-vcmul.sh
index 087d0c4d250..94a67f77644 100755
--- a/scripts/generate-f32-vcmul.sh
+++ b/scripts/generate-f32-vcmul.sh
@@ -43,6 +43,10 @@ tools/xngen src/f32-vcmul/rvv.c.in -D LMUL=1 -o src/f32-vcmul/gen/f32-vcmul-rvv-
 tools/xngen src/f32-vcmul/rvv.c.in -D LMUL=2 -o src/f32-vcmul/gen/f32-vcmul-rvv-u2v.c &
 tools/xngen src/f32-vcmul/rvv.c.in -D LMUL=4 -o src/f32-vcmul/gen/f32-vcmul-rvv-u4v.c &
 
+tools/xngen src/f16-vcmul/rvv.c.in -D LMUL=1 -o src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u1v.c &
+tools/xngen src/f16-vcmul/rvv.c.in -D LMUL=2 -o src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u2v.c &
+tools/xngen src/f16-vcmul/rvv.c.in -D LMUL=4 -o src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u4v.c &
+
 #################################### Scalar ###################################
 tools/xngen src/f32-vcmul/scalar.c.in -D BATCH_TILE=1 -o src/f32-vcmul/gen/f32-vcmul-scalar-u1.c &
 tools/xngen src/f32-vcmul/scalar.c.in -D BATCH_TILE=2 -o src/f32-vcmul/gen/f32-vcmul-scalar-u2.c &
diff --git a/src/configs/cmul-config.c b/src/configs/cmul-config.c
index 3095823eec3..36a7b58abf3 100644
--- a/src/configs/cmul-config.c
+++ b/src/configs/cmul-config.c
@@ -15,14 +15,10 @@
 #include "src/xnnpack/microfnptr.h"
 #include "src/xnnpack/vbinary.h"
 
-#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
-  static struct xnn_cmul_config f16_cmul_config = {0};
-#endif
+static struct xnn_cmul_config f16_cmul_config = {0};
 static struct xnn_cmul_config f32_cmul_config = {0};
 
-#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
-  XNN_INIT_ONCE_GUARD(f16_cmul);
-#endif
+XNN_INIT_ONCE_GUARD(f16_cmul);
 XNN_INIT_ONCE_GUARD(f32_cmul);
 
 // Macros to log the microkernel names if and when they are registered.
@@ -30,11 +26,13 @@ XNN_INIT_ONCE_GUARD(f32_cmul);
   (xnn_vbinary_ukernel_fn) ukernel; \
   xnn_log_info("Using cmul microkernel '%s'.", #ukernel);
 
-#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
-  static void init_f16_cmul_config(void) {
-    f16_cmul_config.ukernel = XNN_INIT_CMUL_UKERNEL(xnn_f16_vcmul_ukernel__neonfp16arith_u16);
-  }
-#endif
+static void init_f16_cmul_config(void) {
+  #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+    f16_cmul_config.ukernel = XNN_INIT_CMUL_UKERNEL(xnn_f16_vcmul_ukernel__neonfp16arith_u16);
+  #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
+    f16_cmul_config.ukernel = XNN_INIT_CMUL_UKERNEL(xnn_f16_vcmul_ukernel__rvvfp16arith_u2v);
+  #endif
+}
 
 static void init_f32_cmul_config(void) {
   #if XNN_ARCH_ARM
@@ -81,16 +79,12 @@ static void init_f32_cmul_config(void) {
 }
 
 const struct xnn_cmul_config* xnn_init_f16_cmul_config() {
-  #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
-    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-    if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) {
-      return NULL;
-    }
-    XNN_INIT_ONCE(f16_cmul);
-    return &f16_cmul_config;
-  #else
+  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+  if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) {
     return NULL;
-  #endif
+  }
+  XNN_INIT_ONCE(f16_cmul);
+  return f16_cmul_config.ukernel ? &f16_cmul_config : NULL;
 }
 
 const struct xnn_cmul_config* xnn_init_f32_cmul_config() {
diff --git a/src/f16-vbinary/f16-vcmul.inc b/src/f16-vbinary/f16-vcmul.inc
index b3be8c625d8..530e0621110 100644
--- a/src/f16-vbinary/f16-vcmul.inc
+++ b/src/f16-vbinary/f16-vcmul.inc
@@ -11,3 +11,8 @@ XNN_UKERNEL(xnn_arch_arm_neon_fp16_arith, xnn_f16_vcmul_ukernel__neonfp16arith_u
 XNN_UKERNEL(xnn_arch_arm_neon_fp16_arith, xnn_f16_vcmul_ukernel__neonfp16arith_u32, 32, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL))
 #endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
 
+#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
+XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vcmul_ukernel__rvvfp16arith_u1v, 1, true, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL))
+XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vcmul_ukernel__rvvfp16arith_u2v, 2, true, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL))
+XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vcmul_ukernel__rvvfp16arith_u4v, 4, true, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL))
+#endif  // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
diff --git a/src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u1v.c b/src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u1v.c
new file mode 100644
index 00000000000..df294cdad44
--- /dev/null
+++ b/src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u1v.c
@@ -0,0 +1,52 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f16-vcmul/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/vbinary.h"
+
+void xnn_f16_vcmul_ukernel__rvvfp16arith_u1v(
+    size_t batch,
+    const xnn_float16* input_a,
+    const xnn_float16* input_b,
+    xnn_float16* output,
+    const struct xnn_f16_default_params* restrict params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(xnn_float16) == 0);
+  assert(input_a != NULL);
+  assert(input_b != NULL);
+  assert(output != NULL);
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT16;
+
+  const xnn_float16* ar = input_a;
+  const xnn_float16* ai = input_a + batch;
+  const xnn_float16* br = input_b;
+  const xnn_float16* bi = input_b + batch;
+  xnn_float16* or = output;
+  xnn_float16* oi = output + batch;
+
+  do {
+    size_t n = __riscv_vsetvl_e16m1(batch); batch -= n;
+    vfloat16m1_t var = __riscv_vle16_v_f16m1(ar, n); ar += n;
+    vfloat16m1_t vai = __riscv_vle16_v_f16m1(ai, n); ai += n;
+    vfloat16m1_t vbr = __riscv_vle16_v_f16m1(br, n); br += n;
+    vfloat16m1_t vbi = __riscv_vle16_v_f16m1(bi, n); bi += n;
+    vfloat16m1_t vaccr = __riscv_vfmul(var, vbr, n);
+    vfloat16m1_t vacci = __riscv_vfmul(var, vbi, n);
+    vaccr = __riscv_vfnmsac(vaccr, vai, vbi, n);
+    vacci = __riscv_vfmacc(vacci, vai, vbr, n);
+    __riscv_vse16(or, vaccr, n); or += n;
+    __riscv_vse16(oi, vacci, n); oi += n;
+  } while (batch > 0);
+}
diff --git a/src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u2v.c b/src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u2v.c
new file mode 100644
index 00000000000..94a0b0b3c7d
--- /dev/null
+++ b/src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u2v.c
@@ -0,0 +1,52 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f16-vcmul/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/vbinary.h"
+
+void xnn_f16_vcmul_ukernel__rvvfp16arith_u2v(
+    size_t batch,
+    const xnn_float16* input_a,
+    const xnn_float16* input_b,
+    xnn_float16* output,
+    const struct xnn_f16_default_params* restrict params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(xnn_float16) == 0);
+  assert(input_a != NULL);
+  assert(input_b != NULL);
+  assert(output != NULL);
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT16;
+
+  const xnn_float16* ar = input_a;
+  const xnn_float16* ai = input_a + batch;
+  const xnn_float16* br = input_b;
+  const xnn_float16* bi = input_b + batch;
+  xnn_float16* or = output;
+  xnn_float16* oi = output + batch;
+
+  do {
+    size_t n = __riscv_vsetvl_e16m2(batch); batch -= n;
+    vfloat16m2_t var = __riscv_vle16_v_f16m2(ar, n); ar += n;
+    vfloat16m2_t vai = __riscv_vle16_v_f16m2(ai, n); ai += n;
+    vfloat16m2_t vbr = __riscv_vle16_v_f16m2(br, n); br += n;
+    vfloat16m2_t vbi = __riscv_vle16_v_f16m2(bi, n); bi += n;
+    vfloat16m2_t vaccr = __riscv_vfmul(var, vbr, n);
+    vfloat16m2_t vacci = __riscv_vfmul(var, vbi, n);
+    vaccr = __riscv_vfnmsac(vaccr, vai, vbi, n);
+    vacci = __riscv_vfmacc(vacci, vai, vbr, n);
+    __riscv_vse16(or, vaccr, n); or += n;
+    __riscv_vse16(oi, vacci, n); oi += n;
+  } while (batch > 0);
+}
diff --git a/src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u4v.c b/src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u4v.c
new file mode 100644
index 00000000000..5d9fdc6fa13
--- /dev/null
+++ b/src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u4v.c
@@ -0,0 +1,52 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f16-vcmul/rvv.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/vbinary.h"
+
+void xnn_f16_vcmul_ukernel__rvvfp16arith_u4v(
+    size_t batch,
+    const xnn_float16* input_a,
+    const xnn_float16* input_b,
+    xnn_float16* output,
+    const struct xnn_f16_default_params* restrict params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(xnn_float16) == 0);
+  assert(input_a != NULL);
+  assert(input_b != NULL);
+  assert(output != NULL);
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT16;
+
+  const xnn_float16* ar = input_a;
+  const xnn_float16* ai = input_a + batch;
+  const xnn_float16* br = input_b;
+  const xnn_float16* bi = input_b + batch;
+  xnn_float16* or = output;
+  xnn_float16* oi = output + batch;
+
+  do {
+    size_t n = __riscv_vsetvl_e16m4(batch); batch -= n;
+    vfloat16m4_t var = __riscv_vle16_v_f16m4(ar, n); ar += n;
+    vfloat16m4_t vai = __riscv_vle16_v_f16m4(ai, n); ai += n;
+    vfloat16m4_t vbr = __riscv_vle16_v_f16m4(br, n); br += n;
+    vfloat16m4_t vbi = __riscv_vle16_v_f16m4(bi, n); bi += n;
+    vfloat16m4_t vaccr = __riscv_vfmul(var, vbr, n);
+    vfloat16m4_t vacci = __riscv_vfmul(var, vbi, n);
+    vaccr = __riscv_vfnmsac(vaccr, vai, vbi, n);
+    vacci = __riscv_vfmacc(vacci, vai, vbr, n);
+    __riscv_vse16(or, vaccr, n); or += n;
+    __riscv_vse16(oi, vacci, n); oi += n;
+  } while (batch > 0);
+}
diff --git a/src/f16-vcmul/rvv.c.in b/src/f16-vcmul/rvv.c.in
new file mode 100644
index 00000000000..0ab7367cdbc
--- /dev/null
+++ b/src/f16-vcmul/rvv.c.in
@@ -0,0 +1,48 @@
+// Copyright 2026 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert LMUL in [1, 2, 4]
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/vbinary.h"
+
+void xnn_f16_vcmul_ukernel__rvvfp16arith_u${LMUL}v(
+    size_t batch,
+    const xnn_float16* input_a,
+    const xnn_float16* input_b,
+    xnn_float16* output,
+    const struct xnn_f16_default_params* restrict params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(xnn_float16) == 0);
+  assert(input_a != NULL);
+  assert(input_b != NULL);
+  assert(output != NULL);
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT16;
+
+  const xnn_float16* ar = input_a;
+  const xnn_float16* ai = input_a + batch;
+  const xnn_float16* br = input_b;
+  const xnn_float16* bi = input_b + batch;
+  xnn_float16* or = output;
+  xnn_float16* oi = output + batch;
+
+  do {
+    size_t n = __riscv_vsetvl_e16m${LMUL}(batch); batch -= n;
+    vfloat16m${LMUL}_t var = __riscv_vle16_v_f16m${LMUL}(ar, n); ar += n;
+    vfloat16m${LMUL}_t vai = __riscv_vle16_v_f16m${LMUL}(ai, n); ai += n;
+    vfloat16m${LMUL}_t vbr = __riscv_vle16_v_f16m${LMUL}(br, n); br += n;
+    vfloat16m${LMUL}_t vbi = __riscv_vle16_v_f16m${LMUL}(bi, n); bi += n;
+    vfloat16m${LMUL}_t vaccr = __riscv_vfmul(var, vbr, n);
+    vfloat16m${LMUL}_t vacci = __riscv_vfmul(var, vbi, n);
+    vaccr = __riscv_vfnmsac(vaccr, vai, vbi, n);
+    vacci = __riscv_vfmacc(vacci, vai, vbr, n);
+    __riscv_vse16(or, vaccr, n); or += n;
+    __riscv_vse16(oi, vacci, n); oi += n;
+  } while (batch > 0);
+}