From 588f0e87ccfb22f8c328ee910d656903d82bfafb Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 3 Oct 2025 17:09:16 +0000 Subject: [PATCH 1/4] Add SBGEMV and SHGEMV routines to RISC-V. --- kernel/riscv64/KERNEL.RISCV64_ZVL256B | 2 + kernel/riscv64/sbgemv_n_vector.c | 95 +++++++++++++++++++ kernel/riscv64/sbgemv_t_vector.c | 127 ++++++++++++++++++++++++++ 3 files changed, 224 insertions(+) create mode 100644 kernel/riscv64/sbgemv_n_vector.c create mode 100644 kernel/riscv64/sbgemv_t_vector.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index c8be1c6370..e69171abe7 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -283,6 +283,8 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) ifndef SBGEMM_BETA SBGEMM_BETA = gemm_beta_rvv.c endif +SBGEMVNKERNEL = sbgemv_n_vector.c +SBGEMVTKERNEL = sbgemv_t_vector.c endif SAXPBYKERNEL = axpby_vector_v2.c diff --git a/kernel/riscv64/sbgemv_n_vector.c b/kernel/riscv64/sbgemv_n_vector.c new file mode 100644 index 0000000000..0c6064b675 --- /dev/null +++ b/kernel/riscv64/sbgemv_n_vector.c @@ -0,0 +1,95 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m8) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m8) + +#define VSETVL(n) RISCV_RVV(vsetvl_e16m4)(n) + +#if defined(HFLOAT16) +#define IFLOAT_V_T vfloat16m4_t +#define VLEV_IFLOAT RISCV_RVV(vle16_v_f16m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfwmacc_vf_f32m8) +#else +#define IFLOAT_V_T vbfloat16m4_t +#define VLEV_IFLOAT RISCV_RVV(vle16_v_bf16m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfwmaccbf16_vf_f32m8) +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + if (n < 0) return(0); + + IFLOAT *a_ptr, temp; + FLOAT *y_ptr; + BLASLONG i, j, vl; + IFLOAT_V_T va; + FLOAT_V_T vy; + + if (inc_y == 1) { + for (j = 0; j < n; j++) { + temp = (IFLOAT)(alpha * (FLOAT)(x[0])); + y_ptr = y; + a_ptr = a; + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + vy = VLEV_FLOAT(y_ptr, vl); + va = VLEV_IFLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, temp, va, vl); + VSEV_FLOAT(y_ptr, vy, vl); + y_ptr += vl; + a_ptr += vl; + } + x += inc_x; + a += lda; + } + } else { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (j = 0; j < n; j++) { + temp = (IFLOAT)(alpha * (FLOAT)(x[0])); + y_ptr = y; + a_ptr = a; + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + vy = VLSEV_FLOAT(y_ptr, stride_y, vl); + va = VLEV_IFLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, temp, va, vl); + VSSEV_FLOAT(y_ptr, stride_y, vy, vl); + y_ptr += vl * inc_y; + a_ptr += vl; + } + x += inc_x; + a += lda; + } + } + return(0); +} diff --git a/kernel/riscv64/sbgemv_t_vector.c b/kernel/riscv64/sbgemv_t_vector.c new file mode 100644 index 0000000000..624f43d6cc --- /dev/null +++ b/kernel/riscv64/sbgemv_t_vector.c @@ -0,0 +1,127 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) + +#define VSETVL(n) RISCV_RVV(vsetvl_e16m4)(n) + +#if defined(HFLOAT16) +#define IFLOAT_V_T vfloat16m4_t +#define VLEV_IFLOAT RISCV_RVV(vle16_v_f16m4) +#define VLSEV_IFLOAT RISCV_RVV(vlse16_v_f16m4) +#define VFMACCVV_FLOAT RISCV_RVV(vfwmacc_vv_f32m8) +#else +#define IFLOAT_V_T vbfloat16m4_t +#define VLEV_IFLOAT RISCV_RVV(vle16_v_bf16m4) +#define VLSEV_IFLOAT RISCV_RVV(vlse16_v_bf16m4) +#define VFMACCVV_FLOAT RISCV_RVV(vfwmaccbf16_vv_f32m8) +#endif + +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m8_f32m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m8_f32m1) +#endif +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i = 0, j = 0, k = 0; + BLASLONG ix = 0, iy = 0; + IFLOAT *a_ptr = a; + FLOAT temp; + + IFLOAT_V_T va, vx; + FLOAT_V_T vr, vz; + BLASLONG gvl = 0; + FLOAT_V_T_M1 v_res; + + if (inc_x == 1) { + for (i = 0; i < n; i++) { + v_res = VFMVVF_FLOAT_M1(0, 1); + gvl = VSETVL(m); + j = 0; + vz = VFMVVF_FLOAT(0, gvl); + for (k = 0; k < m/gvl; k++) { + va = VLEV_IFLOAT(&a_ptr[j], gvl); + vx = VLEV_IFLOAT(&x[j], gvl); + vr = VFMACCVV_FLOAT(vz, va, vx, gvl); // could vfmacc here and reduce outside loop + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail + j += gvl; + } + if (j < m) { + gvl = VSETVL(m-j); + va = VLEV_IFLOAT(&a_ptr[j], gvl); + vx = VLEV_IFLOAT(&x[j], gvl); + vr = VFMACCVV_FLOAT(vz, va, vx, gvl); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); + } + temp = (FLOAT)EXTRACT_FLOAT(v_res); + y[iy] += alpha * temp; + + iy += inc_y; + a_ptr += lda; + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for (i = 0; i < n; i++) { + v_res = VFMVVF_FLOAT_M1(0, 1); + gvl = VSETVL(m); + j = 0; + ix = 0; + vz = VFMVVF_FLOAT(0, gvl); + for (k = 0; k < m/gvl; k++) { + va = VLEV_IFLOAT(&a_ptr[j], gvl); + vx = VLSEV_IFLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vz, va, vx, gvl); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); + j += gvl; + ix += inc_x * gvl; + } + if (j < m) { + gvl = VSETVL(m-j); + va = VLEV_IFLOAT(&a_ptr[j], gvl); + vx = VLSEV_IFLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vz, va, vx, gvl); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); + } + temp = (FLOAT)EXTRACT_FLOAT(v_res); + y[iy] += alpha * temp; + + iy += inc_y; + a_ptr += lda; + } + } + + return (0); +} From 809e1cba8f1f3f89972581e8b82f2ec52e51eadb Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Mon, 6 Oct 2025 13:19:03 +0000 Subject: [PATCH 2/4] Better FP16 vectorized GEMV - 20% faster. --- kernel/riscv64/sbgemv_t_vector.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/riscv64/sbgemv_t_vector.c b/kernel/riscv64/sbgemv_t_vector.c index 624f43d6cc..09069caaa3 100644 --- a/kernel/riscv64/sbgemv_t_vector.c +++ b/kernel/riscv64/sbgemv_t_vector.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define IFLOAT_V_T vfloat16m4_t #define VLEV_IFLOAT RISCV_RVV(vle16_v_f16m4) #define VLSEV_IFLOAT RISCV_RVV(vlse16_v_f16m4) -#define VFMACCVV_FLOAT RISCV_RVV(vfwmacc_vv_f32m8) +#define VFMACCVV_FLOAT(a,b,c,d) RISCV_RVV(vfwmul_vv_f32m8)(b,c,d) #else #define IFLOAT_V_T vbfloat16m4_t #define VLEV_IFLOAT RISCV_RVV(vle16_v_bf16m4) @@ -62,7 +62,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, IFLOAT *a, BLASL FLOAT temp; IFLOAT_V_T va, vx; - FLOAT_V_T vr, vz; +#if !defined(HFLOAT16) + FLOAT_V_T vz; +#endif + FLOAT_V_T vr; BLASLONG gvl = 0; FLOAT_V_T_M1 v_res; @@ -71,7 +74,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, IFLOAT *a, BLASL v_res = VFMVVF_FLOAT_M1(0, 1); gvl = VSETVL(m); j = 0; +#if !defined(HFLOAT16) vz = VFMVVF_FLOAT(0, gvl); +#endif for (k = 0; k < m/gvl; k++) { va = VLEV_IFLOAT(&a_ptr[j], gvl); vx = VLEV_IFLOAT(&x[j], gvl); @@ -99,7 +104,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, IFLOAT *a, BLASL gvl = VSETVL(m); j = 0; ix = 0; +#if !defined(HFLOAT16) vz = VFMVVF_FLOAT(0, gvl); +#endif for (k = 0; k < m/gvl; k++) { va = VLEV_IFLOAT(&a_ptr[j], gvl); vx = VLSEV_IFLOAT(&x[ix], stride_x, gvl); From aecb7f9537600dc9990b4efb2e1dea40cc3f97be Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 7 Oct 2025 13:14:20 +0000 Subject: [PATCH 3/4] Change signature of SBGEMV. --- kernel/riscv64/sbgemv_n_vector.c | 37 +++++++++++++++++++++++++++++++- kernel/riscv64/sbgemv_t_vector.c | 6 +++--- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/kernel/riscv64/sbgemv_n_vector.c b/kernel/riscv64/sbgemv_n_vector.c index 0c6064b675..94b9488cfc 100644 --- a/kernel/riscv64/sbgemv_n_vector.c +++ b/kernel/riscv64/sbgemv_n_vector.c @@ -32,6 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m8) #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m8) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) #define VSETVL(n) RISCV_RVV(vsetvl_e16m4)(n) @@ -45,7 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMACCVF_FLOAT RISCV_RVV(vfwmaccbf16_vf_f32m8) #endif -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) { if (n < 0) return(0); @@ -55,7 +57,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, IFLOAT *a, BLASL IFLOAT_V_T va; FLOAT_V_T vy; + y_ptr = y; if (inc_y == 1) { + if (beta == 0.0) { + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + vy = VFMVVF_FLOAT(0.0, vl); + VSEV_FLOAT(y_ptr, vy, vl); + y_ptr += vl; + } + } else if (beta != 1.0) { + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + vy = VLEV_FLOAT(y_ptr, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSEV_FLOAT(y_ptr, vy, vl); + y_ptr += vl; + } + } for (j = 0; j < n; j++) { temp = (IFLOAT)(alpha * (FLOAT)(x[0])); y_ptr = y; @@ -74,6 +93,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, IFLOAT *a, BLASL } } else { BLASLONG stride_y = inc_y * sizeof(FLOAT); + if (beta == 0.0) { + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + vy = VFMVVF_FLOAT(0.0, vl); + VSSEV_FLOAT(y_ptr, stride_y, vy, vl); + y_ptr += vl * inc_y; + } + } else if (beta != 1.0) { + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + vy = VLSEV_FLOAT(y_ptr, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSSEV_FLOAT(y_ptr, stride_y, vy, vl); + y_ptr += vl * inc_y; + } + } for (j = 0; j < n; j++) { temp = (IFLOAT)(alpha * (FLOAT)(x[0])); y_ptr = y; diff --git a/kernel/riscv64/sbgemv_t_vector.c b/kernel/riscv64/sbgemv_t_vector.c index 09069caaa3..7fdccee1b6 100644 --- a/kernel/riscv64/sbgemv_t_vector.c +++ b/kernel/riscv64/sbgemv_t_vector.c @@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) { BLASLONG i = 0, j = 0, k = 0; BLASLONG ix = 0, iy = 0; @@ -92,7 +92,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, IFLOAT *a, BLASL v_res = VFREDSUM_FLOAT(vr, v_res, gvl); } temp = (FLOAT)EXTRACT_FLOAT(v_res); - y[iy] += alpha * temp; + y[iy] = y[iy] * beta + alpha * temp; iy += inc_y; a_ptr += lda; @@ -123,7 +123,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, IFLOAT *a, BLASL v_res = VFREDSUM_FLOAT(vr, v_res, gvl); } temp = (FLOAT)EXTRACT_FLOAT(v_res); - y[iy] += alpha * temp; + y[iy] = y[iy] * beta + alpha * temp; iy += inc_y; a_ptr += lda; From f552040c5d8217f0ac641ec71dff7fe96f9d1de1 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 7 Oct 2025 17:17:18 +0000 Subject: [PATCH 4/4] Fix stride issue. --- kernel/riscv64/sbgemv_t_vector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/riscv64/sbgemv_t_vector.c b/kernel/riscv64/sbgemv_t_vector.c index 7fdccee1b6..f537ca4ead 100644 --- a/kernel/riscv64/sbgemv_t_vector.c +++ b/kernel/riscv64/sbgemv_t_vector.c @@ -98,7 +98,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * a_ptr += lda; } } else { - BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_x = inc_x * sizeof(IFLOAT); for (i = 0; i < n; i++) { v_res = VFMVVF_FLOAT_M1(0, 1); gvl = VSETVL(m);