3 changes: 3 additions & 0 deletions cmake/gen/rvvfp16arith_microkernels.cmake
@@ -53,6 +53,7 @@ SET(PROD_RVVFP16ARITH_MICROKERNEL_SRCS
src/f16-vbinary/gen/f16-vsub-rvvfp16arith-u8v.c
src/f16-vbinary/gen/f16-vsubc-rvvfp16arith-u8v.c
src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u8v.c
src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u2v.c
src/f16-velu/gen/f16-velu-rvvfp16arith-rr1-p3-u2v.c
src/f16-vgelu/gen/f16-vgelu-rvvfp16arith-rational-6-4-div-u2v.c
src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c
@@ -160,6 +161,8 @@ SET(NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS
src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u1v.c
src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u2v.c
src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u4v.c
src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u1v.c
src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u4v.c
src/f16-velu/gen/f16-velu-rvvfp16arith-rr1-p3-u1v.c
src/f16-velu/gen/f16-velu-rvvfp16arith-rr1-p3-u4v.c
src/f16-vgelu/gen/f16-vgelu-rvvfp16arith-rational-6-4-div-u1v.c
3 changes: 3 additions & 0 deletions gen/rvvfp16arith_microkernels.bzl
@@ -49,6 +49,7 @@ PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [
"src/f16-vbinary/gen/f16-vsub-rvvfp16arith-u8v.c",
"src/f16-vbinary/gen/f16-vsubc-rvvfp16arith-u8v.c",
"src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u8v.c",
"src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u2v.c",
"src/f16-velu/gen/f16-velu-rvvfp16arith-rr1-p3-u2v.c",
"src/f16-vgelu/gen/f16-vgelu-rvvfp16arith-rational-6-4-div-u2v.c",
"src/f16-vhswish/gen/f16-vhswish-rvvfp16arith-u8v.c",
@@ -157,6 +158,8 @@ NON_PROD_RVVFP16ARITH_MICROKERNEL_SRCS = [
"src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u1v.c",
"src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u2v.c",
"src/f16-vclamp/gen/f16-vclamp-rvvfp16arith-u4v.c",
"src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u1v.c",
"src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u4v.c",
"src/f16-velu/gen/f16-velu-rvvfp16arith-rr1-p3-u1v.c",
"src/f16-velu/gen/f16-velu-rvvfp16arith-rr1-p3-u4v.c",
"src/f16-vgelu/gen/f16-vgelu-rvvfp16arith-rational-6-4-div-u1v.c",
4 changes: 4 additions & 0 deletions scripts/generate-f32-vcmul.sh
@@ -43,6 +43,10 @@
tools/xngen src/f32-vcmul/rvv.c.in -D LMUL=1 -o src/f32-vcmul/gen/f32-vcmul-rvv-u1v.c &
tools/xngen src/f32-vcmul/rvv.c.in -D LMUL=2 -o src/f32-vcmul/gen/f32-vcmul-rvv-u2v.c &
tools/xngen src/f32-vcmul/rvv.c.in -D LMUL=4 -o src/f32-vcmul/gen/f32-vcmul-rvv-u4v.c &

tools/xngen src/f16-vcmul/rvv.c.in -D LMUL=1 -o src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u1v.c &
tools/xngen src/f16-vcmul/rvv.c.in -D LMUL=2 -o src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u2v.c &
tools/xngen src/f16-vcmul/rvv.c.in -D LMUL=4 -o src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u4v.c &

#################################### Scalar ###################################
tools/xngen src/f32-vcmul/scalar.c.in -D BATCH_TILE=1 -o src/f32-vcmul/gen/f32-vcmul-scalar-u1.c &
tools/xngen src/f32-vcmul/scalar.c.in -D BATCH_TILE=2 -o src/f32-vcmul/gen/f32-vcmul-scalar-u2.c &
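(Aside on the three new f16-vcmul invocations above: LMUL is the RVV register-group multiplier, so the same template is stamped out at LMUL = 1, 2, and 4, which the generated filenames encode as u1v, u2v, and u4v.)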
34 changes: 14 additions & 20 deletions src/configs/cmul-config.c
@@ -15,26 +15,24 @@
#include "src/xnnpack/microfnptr.h"
#include "src/xnnpack/vbinary.h"

-#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
-static struct xnn_cmul_config f16_cmul_config = {0};
-#endif
+static struct xnn_cmul_config f16_cmul_config = {0};
static struct xnn_cmul_config f32_cmul_config = {0};

-#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
-XNN_INIT_ONCE_GUARD(f16_cmul);
-#endif
+XNN_INIT_ONCE_GUARD(f16_cmul);
XNN_INIT_ONCE_GUARD(f32_cmul);

// Macros to log the microkernel names if and when they are registered.
#define XNN_INIT_CMUL_UKERNEL(ukernel) \
  (xnn_vbinary_ukernel_fn) ukernel; \
  xnn_log_info("Using cmul microkernel '%s'.", #ukernel);

-#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
-static void init_f16_cmul_config(void) {
-  f16_cmul_config.ukernel = XNN_INIT_CMUL_UKERNEL(xnn_f16_vcmul_ukernel__neonfp16arith_u16);
-}
-#endif
+static void init_f16_cmul_config(void) {
+#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+  f16_cmul_config.ukernel = XNN_INIT_CMUL_UKERNEL(xnn_f16_vcmul_ukernel__neonfp16arith_u16);
+#elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
+  f16_cmul_config.ukernel = XNN_INIT_CMUL_UKERNEL(xnn_f16_vcmul_ukernel__rvvfp16arith_u2v);
+#endif
+}

static void init_f32_cmul_config(void) {
#if XNN_ARCH_ARM
@@ -81,16 +79,12 @@ static void init_f32_cmul_config(void) {
}

const struct xnn_cmul_config* xnn_init_f16_cmul_config() {
-#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
-  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-  if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) {
-    return NULL;
-  }
-  XNN_INIT_ONCE(f16_cmul);
-  return &f16_cmul_config;
-#else
-  return NULL;
-#endif
+  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+  if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) {
+    return NULL;
+  }
+  XNN_INIT_ONCE(f16_cmul);
+  return f16_cmul_config.ukernel ? &f16_cmul_config : NULL;
Comment from the PR author: @dsharletg I modified this (line 87) to protect against the unconfigured ukernel case. (Sorry, it seems your previous comments here have been dropped.)
}

const struct xnn_cmul_config* xnn_init_f32_cmul_config() {
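As a reading aid, a minimal hypothetical caller (not part of the diff; the header path src/xnnpack/config.h and the helper name has_f16_cmul are assumptions). With this change a single NULL check covers both failure modes, since XNN_INIT_ONCE runs init_f16_cmul_config at most once and the function now also returns NULL when that init registered no ukernel:

#include <stdbool.h>
#include <stddef.h>

#include "src/xnnpack/config.h"  // assumed declaration site of xnn_init_f16_cmul_config

// Returns true if an f16 cmul microkernel is usable on this target and build.
static bool has_f16_cmul(void) {
  const struct xnn_cmul_config* cmul = xnn_init_f16_cmul_config();
  // NULL means either incompatible hardware or no registered ukernel.
  return cmul != NULL;
}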
5 changes: 5 additions & 0 deletions src/f16-vbinary/f16-vcmul.inc
@@ -11,3 +11,8 @@
XNN_UKERNEL(xnn_arch_arm_neon_fp16_arith, xnn_f16_vcmul_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL))
XNN_UKERNEL(xnn_arch_arm_neon_fp16_arith, xnn_f16_vcmul_ukernel__neonfp16arith_u32, 32, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL))
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)

#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vcmul_ukernel__rvvfp16arith_u1v, 1, true, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL))
XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vcmul_ukernel__rvvfp16arith_u2v, 2, true, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL))
XNN_UKERNEL(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vcmul_ukernel__rvvfp16arith_u4v, 4, true, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL))
#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_FP16_VECTOR
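For readers skimming the new entries: by analogy with the NEON rows above, the XNN_UKERNEL columns are the required architecture flag, the ukernel symbol, the batch tile, whether that tile scales with the hardware vector length (true for RVV, with tiles of 1, 2, and 4 vector registers), the element type, the params struct, and the params-init function (NULL, since the default params need no initialization).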
52 changes: 52 additions & 0 deletions src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u1v.c
@@ -0,0 +1,52 @@
// clang-format off
// Auto-generated file. Do not edit!
// Template: src/f16-vcmul/rvv.c.in
// Generator: tools/xngen
//
// Copyright 2026 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <riscv_vector.h>

#include "src/xnnpack/vbinary.h"

void xnn_f16_vcmul_ukernel__rvvfp16arith_u1v(
    size_t batch,
    const xnn_float16* input_a,
    const xnn_float16* input_b,
    xnn_float16* output,
    const struct xnn_f16_default_params* restrict params)
{
  assert(batch != 0);
  assert(batch % sizeof(xnn_float16) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);

  batch >>= XNN_LOG2_SIZEOF_FLOAT16;

  const xnn_float16* ar = input_a;
  const xnn_float16* ai = input_a + batch;
  const xnn_float16* br = input_b;
  const xnn_float16* bi = input_b + batch;
  xnn_float16* or = output;
  xnn_float16* oi = output + batch;

  do {
    size_t n = __riscv_vsetvl_e16m1(batch); batch -= n;
    vfloat16m1_t var = __riscv_vle16_v_f16m1(ar, n); ar += n;
    vfloat16m1_t vai = __riscv_vle16_v_f16m1(ai, n); ai += n;
    vfloat16m1_t vbr = __riscv_vle16_v_f16m1(br, n); br += n;
    vfloat16m1_t vbi = __riscv_vle16_v_f16m1(bi, n); bi += n;
    vfloat16m1_t vaccr = __riscv_vfmul(var, vbr, n);
    vfloat16m1_t vacci = __riscv_vfmul(var, vbi, n);
    vaccr = __riscv_vfnmsac(vaccr, vai, vbi, n);
    vacci = __riscv_vfmacc(vacci, vai, vbr, n);
    __riscv_vse16(or, vaccr, n); or += n;
    __riscv_vse16(oi, vacci, n); oi += n;
  } while (batch > 0);
}
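For reference, the loop above is the standard split-complex product. With each input laid out as all real parts followed by all imaginary parts, every iteration computes (ar + i*ai)*(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br): vaccr is seeded with ar*br and vfnmsac subtracts ai*bi (vd = vd - vs1*vs2), while vacci is seeded with ar*bi and vfmacc adds ai*br (vd = vd + vs1*vs2). The u2v and u4v variants below differ only in the LMUL register grouping.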
52 changes: 52 additions & 0 deletions src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u2v.c
@@ -0,0 +1,52 @@
// clang-format off
// Auto-generated file. Do not edit!
// Template: src/f16-vcmul/rvv.c.in
// Generator: tools/xngen
//
// Copyright 2026 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <riscv_vector.h>

#include "src/xnnpack/vbinary.h"

void xnn_f16_vcmul_ukernel__rvvfp16arith_u2v(
    size_t batch,
    const xnn_float16* input_a,
    const xnn_float16* input_b,
    xnn_float16* output,
    const struct xnn_f16_default_params* restrict params)
{
  assert(batch != 0);
  assert(batch % sizeof(xnn_float16) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);

  batch >>= XNN_LOG2_SIZEOF_FLOAT16;

  const xnn_float16* ar = input_a;
  const xnn_float16* ai = input_a + batch;
  const xnn_float16* br = input_b;
  const xnn_float16* bi = input_b + batch;
  xnn_float16* or = output;
  xnn_float16* oi = output + batch;

  do {
    size_t n = __riscv_vsetvl_e16m2(batch); batch -= n;
    vfloat16m2_t var = __riscv_vle16_v_f16m2(ar, n); ar += n;
    vfloat16m2_t vai = __riscv_vle16_v_f16m2(ai, n); ai += n;
    vfloat16m2_t vbr = __riscv_vle16_v_f16m2(br, n); br += n;
    vfloat16m2_t vbi = __riscv_vle16_v_f16m2(bi, n); bi += n;
    vfloat16m2_t vaccr = __riscv_vfmul(var, vbr, n);
    vfloat16m2_t vacci = __riscv_vfmul(var, vbi, n);
    vaccr = __riscv_vfnmsac(vaccr, vai, vbi, n);
    vacci = __riscv_vfmacc(vacci, vai, vbr, n);
    __riscv_vse16(or, vaccr, n); or += n;
    __riscv_vse16(oi, vacci, n); oi += n;
  } while (batch > 0);
}
52 changes: 52 additions & 0 deletions src/f16-vcmul/gen/f16-vcmul-rvvfp16arith-u4v.c
@@ -0,0 +1,52 @@
// clang-format off
// Auto-generated file. Do not edit!
// Template: src/f16-vcmul/rvv.c.in
// Generator: tools/xngen
//
// Copyright 2026 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <riscv_vector.h>

#include "src/xnnpack/vbinary.h"

void xnn_f16_vcmul_ukernel__rvvfp16arith_u4v(
    size_t batch,
    const xnn_float16* input_a,
    const xnn_float16* input_b,
    xnn_float16* output,
    const struct xnn_f16_default_params* restrict params)
{
  assert(batch != 0);
  assert(batch % sizeof(xnn_float16) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);

  batch >>= XNN_LOG2_SIZEOF_FLOAT16;

  const xnn_float16* ar = input_a;
  const xnn_float16* ai = input_a + batch;
  const xnn_float16* br = input_b;
  const xnn_float16* bi = input_b + batch;
  xnn_float16* or = output;
  xnn_float16* oi = output + batch;

  do {
    size_t n = __riscv_vsetvl_e16m4(batch); batch -= n;
    vfloat16m4_t var = __riscv_vle16_v_f16m4(ar, n); ar += n;
    vfloat16m4_t vai = __riscv_vle16_v_f16m4(ai, n); ai += n;
    vfloat16m4_t vbr = __riscv_vle16_v_f16m4(br, n); br += n;
    vfloat16m4_t vbi = __riscv_vle16_v_f16m4(bi, n); bi += n;
    vfloat16m4_t vaccr = __riscv_vfmul(var, vbr, n);
    vfloat16m4_t vacci = __riscv_vfmul(var, vbi, n);
    vaccr = __riscv_vfnmsac(vaccr, vai, vbi, n);
    vacci = __riscv_vfmacc(vacci, vai, vbr, n);
    __riscv_vse16(or, vaccr, n); or += n;
    __riscv_vse16(oi, vacci, n); oi += n;
  } while (batch > 0);
}
48 changes: 48 additions & 0 deletions src/f16-vcmul/rvv.c.in
@@ -0,0 +1,48 @@
// Copyright 2026 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert LMUL in [1, 2, 4]
#include <assert.h>

#include <riscv_vector.h>

#include "src/xnnpack/vbinary.h"

void xnn_f16_vcmul_ukernel__rvvfp16arith_u${LMUL}v(
    size_t batch,
    const xnn_float16* input_a,
    const xnn_float16* input_b,
    xnn_float16* output,
    const struct xnn_f16_default_params* restrict params)
{
  assert(batch != 0);
  assert(batch % sizeof(xnn_float16) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);

  batch >>= XNN_LOG2_SIZEOF_FLOAT16;

  const xnn_float16* ar = input_a;
  const xnn_float16* ai = input_a + batch;
  const xnn_float16* br = input_b;
  const xnn_float16* bi = input_b + batch;
  xnn_float16* or = output;
  xnn_float16* oi = output + batch;

  do {
    size_t n = __riscv_vsetvl_e16m${LMUL}(batch); batch -= n;
    vfloat16m${LMUL}_t var = __riscv_vle16_v_f16m${LMUL}(ar, n); ar += n;
    vfloat16m${LMUL}_t vai = __riscv_vle16_v_f16m${LMUL}(ai, n); ai += n;
    vfloat16m${LMUL}_t vbr = __riscv_vle16_v_f16m${LMUL}(br, n); br += n;
    vfloat16m${LMUL}_t vbi = __riscv_vle16_v_f16m${LMUL}(bi, n); bi += n;
    vfloat16m${LMUL}_t vaccr = __riscv_vfmul(var, vbr, n);
    vfloat16m${LMUL}_t vacci = __riscv_vfmul(var, vbi, n);
    vaccr = __riscv_vfnmsac(vaccr, vai, vbi, n);
    vacci = __riscv_vfmacc(vacci, vai, vbr, n);
    __riscv_vse16(or, vaccr, n); or += n;
    __riscv_vse16(oi, vacci, n); oi += n;
  } while (batch > 0);
}
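To close, a minimal hypothetical calling sketch (not part of the change; the wrapper name cmul_example and the element count N are illustrative). Reading the asserts and pointer setup above, batch is the byte size of one component plane, and each buffer holds all real parts followed by all imaginary parts; passing NULL params matches the NULL params-init function registered in f16-vcmul.inc, though treating params as ignorable is an assumption.

#include "src/xnnpack/vbinary.h"

#define N 64  // complex elements per operand (illustrative)

// out = a * b elementwise, with a, b, out in split-complex f16 layout:
// indices [0, N) hold real parts, [N, 2N) hold imaginary parts.
static void cmul_example(const xnn_float16 a[2 * N],
                         const xnn_float16 b[2 * N],
                         xnn_float16 out[2 * N]) {
  // batch counts the bytes of ONE plane (the N real parts),
  // not the whole 2*N-element buffer.
  xnn_f16_vcmul_ukernel__rvvfp16arith_u2v(
      N * sizeof(xnn_float16), a, b, out, /*params=*/NULL);
}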