diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index 336ee22927a..f73a30212e5 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -130,7 +130,10 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/x32-transposec/gen/x32-transposec-4x4-rvv.c src/x32-transposec/gen/x32-transposec-8x8-rvv.c src/x32-transposec/gen/x32-transposec-16x8-rvv.c - src/x32-transposec/gen/x32-transposec-32x8-rvv.c) + src/x32-transposec/gen/x32-transposec-32x8-rvv.c + src/x32-transposec/gen/x32-transposec-8xv1-rvv.c + src/x32-transposec/gen/x32-transposec-8xv2-rvv.c + src/x32-transposec/gen/x32-transposec-8xv4-rvv.c) SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x2v-rvv-1x1.c diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index eeb6198e648..8857dc36bb8 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -127,6 +127,9 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/x32-transposec/gen/x32-transposec-8x8-rvv.c", "src/x32-transposec/gen/x32-transposec-16x8-rvv.c", "src/x32-transposec/gen/x32-transposec-32x8-rvv.c", + "src/x32-transposec/gen/x32-transposec-8xv1-rvv.c", + "src/x32-transposec/gen/x32-transposec-8xv2-rvv.c", + "src/x32-transposec/gen/x32-transposec-8xv4-rvv.c", ] NON_PROD_RVV_MICROKERNEL_SRCS = [ diff --git a/scripts/generate-xN-transpose.sh b/scripts/generate-xN-transpose.sh index 5585bc6ca29..9952540bcf5 100755 --- a/scripts/generate-xN-transpose.sh +++ b/scripts/generate-xN-transpose.sh @@ -102,6 +102,10 @@ tools/xngen src/x32-transposec/rvv.c.in -D SIZE=32 VLEN=512 -o src/x32-transpose tools/xngen src/x32-transposec/rvv.c.in -D SIZE=32 VLEN=256 -o src/x32-transposec/gen/x32-transposec-8x8-rvv.c & tools/xngen src/x32-transposec/rvv.c.in -D SIZE=32 VLEN=128 -o src/x32-transposec/gen/x32-transposec-4x4-rvv.c & +tools/xngen src/x32-transposec/rvv-u.c.in -D LMUL=1 -o src/x32-transposec/gen/x32-transposec-8xv1-rvv.c & +tools/xngen src/x32-transposec/rvv-u.c.in -D LMUL=2 -o src/x32-transposec/gen/x32-transposec-8xv2-rvv.c & +tools/xngen src/x32-transposec/rvv-u.c.in -D LMUL=4 -o src/x32-transposec/gen/x32-transposec-8xv4-rvv.c & + #################################### ARM NEON ############################### tools/xngen src/x32-transposec/neon-zip.c.in -D SIZE=8 VECTOR_SIZE=64 IN_PTRS=MULTI OUT_PTRS=DEC -o src/x8-transposec/gen/x8-transposec-8x8-multi-dec-zip-neon.c & tools/xngen src/x32-transposec/neon-zip.c.in -D SIZE=8 VECTOR_SIZE=64 IN_PTRS=MULTI OUT_PTRS=MOV -o src/x8-transposec/gen/x8-transposec-8x8-multi-mov-zip-neon.c & diff --git a/src/x32-transposec/gen/x32-transposec-8xv1-rvv.c b/src/x32-transposec/gen/x32-transposec-8xv1-rvv.c new file mode 100644 index 00000000000..9046b438201 --- /dev/null +++ b/src/x32-transposec/gen/x32-transposec-8xv1-rvv.c @@ -0,0 +1,134 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/x32-transposec/rvv-u.c.in +// Generator: tools/xngen +// +// Copyright 2023 SiFive, Inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+// +// Optimized by Autocomp (https://github.com/ucb-bar/autocomp) +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <riscv_vector.h> + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/transpose.h" + + +void xnn_x32_transposec_ukernel__8xv1_rvv( + const uint32_t* input, + uint32_t* output, + size_t input_stride, + size_t output_stride, + size_t block_width, + size_t block_height) +{ + assert(block_width == 1 || output_stride >= block_height * sizeof(uint32_t)); + assert(block_height == 1 || input_stride >= block_width * sizeof(uint32_t)); + + const size_t input_stride_u32 = input_stride / sizeof(uint32_t); + const size_t output_stride_u32 = output_stride / sizeof(uint32_t); + + for (size_t bh = 0; bh < block_height; ) { + const size_t vl = __riscv_vsetvl_e32m1(block_height - bh); + + const uint32_t* i_row = input + bh * input_stride_u32; + uint32_t* o_col = output + bh; + + size_t bw = 0; + for (; bw + 8 <= block_width; bw += 8) { + const uint32_t* i_ptr = i_row + bw; + uint32_t* o_ptr = o_col + bw * output_stride_u32; + + // Issue loads with the first half of each tuple's stores interleaved to + // cover segmented-load latency. + vuint32m1x8_t tuple0 = __riscv_vlsseg8e32_v_u32m1x8(i_ptr + 0, input_stride, vl); + + // Drain remaining stores. + __riscv_vse32_v_u32m1(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m1x8_u32m1(tuple0, 0), vl); + __riscv_vse32_v_u32m1(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m1x8_u32m1(tuple0, 1), vl); + __riscv_vse32_v_u32m1(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m1x8_u32m1(tuple0, 2), vl); + __riscv_vse32_v_u32m1(o_ptr + 3 * output_stride_u32, __riscv_vget_v_u32m1x8_u32m1(tuple0, 3), vl); + __riscv_vse32_v_u32m1(o_ptr + 4 * output_stride_u32, __riscv_vget_v_u32m1x8_u32m1(tuple0, 4), vl); + __riscv_vse32_v_u32m1(o_ptr + 5 * output_stride_u32, __riscv_vget_v_u32m1x8_u32m1(tuple0, 5), vl); + __riscv_vse32_v_u32m1(o_ptr + 6 * output_stride_u32, __riscv_vget_v_u32m1x8_u32m1(tuple0, 6), vl); + __riscv_vse32_v_u32m1(o_ptr + 7 * output_stride_u32, __riscv_vget_v_u32m1x8_u32m1(tuple0, 7), vl); + } + + // Column tail: 0 < (block_width - bw) < TILE_WIDTH. + // Keep LMUL=1 so the same vl rows are processed, splitting the tail + // into legal vlsseg calls where needed (EMUL * NFIELDS <= 8).
+ if (bw < block_width) { + const size_t bw_tail = block_width - bw; + const uint32_t* i_ptr = i_row + bw; + uint32_t* o_ptr = o_col + bw * output_stride_u32; + switch (bw_tail) { + case 7: { + vuint32m1x7_t tuple0 = __riscv_vlsseg7e32_v_u32m1x7(i_ptr + 0, input_stride, vl); + __riscv_vse32_v_u32m1(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m1x7_u32m1(tuple0, 0), vl); + __riscv_vse32_v_u32m1(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m1x7_u32m1(tuple0, 1), vl); + __riscv_vse32_v_u32m1(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m1x7_u32m1(tuple0, 2), vl); + __riscv_vse32_v_u32m1(o_ptr + 3 * output_stride_u32, __riscv_vget_v_u32m1x7_u32m1(tuple0, 3), vl); + __riscv_vse32_v_u32m1(o_ptr + 4 * output_stride_u32, __riscv_vget_v_u32m1x7_u32m1(tuple0, 4), vl); + __riscv_vse32_v_u32m1(o_ptr + 5 * output_stride_u32, __riscv_vget_v_u32m1x7_u32m1(tuple0, 5), vl); + __riscv_vse32_v_u32m1(o_ptr + 6 * output_stride_u32, __riscv_vget_v_u32m1x7_u32m1(tuple0, 6), vl); + break; + } + case 6: { + vuint32m1x6_t tuple0 = __riscv_vlsseg6e32_v_u32m1x6(i_ptr + 0, input_stride, vl); + __riscv_vse32_v_u32m1(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m1x6_u32m1(tuple0, 0), vl); + __riscv_vse32_v_u32m1(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m1x6_u32m1(tuple0, 1), vl); + __riscv_vse32_v_u32m1(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m1x6_u32m1(tuple0, 2), vl); + __riscv_vse32_v_u32m1(o_ptr + 3 * output_stride_u32, __riscv_vget_v_u32m1x6_u32m1(tuple0, 3), vl); + __riscv_vse32_v_u32m1(o_ptr + 4 * output_stride_u32, __riscv_vget_v_u32m1x6_u32m1(tuple0, 4), vl); + __riscv_vse32_v_u32m1(o_ptr + 5 * output_stride_u32, __riscv_vget_v_u32m1x6_u32m1(tuple0, 5), vl); + break; + } + case 5: { + vuint32m1x5_t tuple0 = __riscv_vlsseg5e32_v_u32m1x5(i_ptr + 0, input_stride, vl); + __riscv_vse32_v_u32m1(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m1x5_u32m1(tuple0, 0), vl); + __riscv_vse32_v_u32m1(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m1x5_u32m1(tuple0, 1), vl); + __riscv_vse32_v_u32m1(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m1x5_u32m1(tuple0, 2), vl); + __riscv_vse32_v_u32m1(o_ptr + 3 * output_stride_u32, __riscv_vget_v_u32m1x5_u32m1(tuple0, 3), vl); + __riscv_vse32_v_u32m1(o_ptr + 4 * output_stride_u32, __riscv_vget_v_u32m1x5_u32m1(tuple0, 4), vl); + break; + } + case 4: { + vuint32m1x4_t tuple0 = __riscv_vlsseg4e32_v_u32m1x4(i_ptr + 0, input_stride, vl); + __riscv_vse32_v_u32m1(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m1x4_u32m1(tuple0, 0), vl); + __riscv_vse32_v_u32m1(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m1x4_u32m1(tuple0, 1), vl); + __riscv_vse32_v_u32m1(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m1x4_u32m1(tuple0, 2), vl); + __riscv_vse32_v_u32m1(o_ptr + 3 * output_stride_u32, __riscv_vget_v_u32m1x4_u32m1(tuple0, 3), vl); + break; + } + case 3: { + vuint32m1x3_t tuple0 = __riscv_vlsseg3e32_v_u32m1x3(i_ptr + 0, input_stride, vl); + __riscv_vse32_v_u32m1(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m1x3_u32m1(tuple0, 0), vl); + __riscv_vse32_v_u32m1(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m1x3_u32m1(tuple0, 1), vl); + __riscv_vse32_v_u32m1(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m1x3_u32m1(tuple0, 2), vl); + break; + } + case 2: { + vuint32m1x2_t tuple0 = __riscv_vlsseg2e32_v_u32m1x2(i_ptr + 0, input_stride, vl); + __riscv_vse32_v_u32m1(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m1x2_u32m1(tuple0, 0), vl); + __riscv_vse32_v_u32m1(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m1x2_u32m1(tuple0, 1), vl); + 
break; + } + case 1: { + vuint32m1_t v = __riscv_vlse32_v_u32m1(i_ptr, input_stride, vl); + __riscv_vse32_v_u32m1(o_ptr, v, vl); + break; + } + default: + XNN_UNREACHABLE; + } + } + + bh += vl; + } +} diff --git a/src/x32-transposec/gen/x32-transposec-8xv2-rvv.c b/src/x32-transposec/gen/x32-transposec-8xv2-rvv.c new file mode 100644 index 00000000000..0d1c61a70e3 --- /dev/null +++ b/src/x32-transposec/gen/x32-transposec-8xv2-rvv.c @@ -0,0 +1,138 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/x32-transposec/rvv-u.c.in +// Generator: tools/xngen +// +// Copyright 2023 SiFive, Inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +// Optimized by Autocomp (https://github.com/ucb-bar/autocomp) +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <riscv_vector.h> + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/transpose.h" + + +void xnn_x32_transposec_ukernel__8xv2_rvv( + const uint32_t* input, + uint32_t* output, + size_t input_stride, + size_t output_stride, + size_t block_width, + size_t block_height) +{ + assert(block_width == 1 || output_stride >= block_height * sizeof(uint32_t)); + assert(block_height == 1 || input_stride >= block_width * sizeof(uint32_t)); + + const size_t input_stride_u32 = input_stride / sizeof(uint32_t); + const size_t output_stride_u32 = output_stride / sizeof(uint32_t); + + for (size_t bh = 0; bh < block_height; ) { + const size_t vl = __riscv_vsetvl_e32m2(block_height - bh); + + const uint32_t* i_row = input + bh * input_stride_u32; + uint32_t* o_col = output + bh; + + size_t bw = 0; + for (; bw + 8 <= block_width; bw += 8) { + const uint32_t* i_ptr = i_row + bw; + uint32_t* o_ptr = o_col + bw * output_stride_u32; + + // Issue loads with the first half of each tuple's stores interleaved to + // cover segmented-load latency. + vuint32m2x4_t tuple0 = __riscv_vlsseg4e32_v_u32m2x4(i_ptr + 0, input_stride, vl); + __riscv_vse32_v_u32m2(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 0), vl); + __riscv_vse32_v_u32m2(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 1), vl); + vuint32m2x4_t tuple1 = __riscv_vlsseg4e32_v_u32m2x4(i_ptr + 4, input_stride, vl); + + // Drain remaining stores. + __riscv_vse32_v_u32m2(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 2), vl); + __riscv_vse32_v_u32m2(o_ptr + 3 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 3), vl); + __riscv_vse32_v_u32m2(o_ptr + 4 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple1, 0), vl); + __riscv_vse32_v_u32m2(o_ptr + 5 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple1, 1), vl); + __riscv_vse32_v_u32m2(o_ptr + 6 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple1, 2), vl); + __riscv_vse32_v_u32m2(o_ptr + 7 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple1, 3), vl); + } + + // Column tail: 0 < (block_width - bw) < TILE_WIDTH. + // Keep LMUL=2 so the same vl rows are processed, splitting the tail + // into legal vlsseg calls where needed (EMUL * NFIELDS <= 8).
+ if (bw < block_width) { + const size_t bw_tail = block_width - bw; + const uint32_t* i_ptr = i_row + bw; + uint32_t* o_ptr = o_col + bw * output_stride_u32; + switch (bw_tail) { + case 7: { + vuint32m2x4_t tuple0 = __riscv_vlsseg4e32_v_u32m2x4(i_ptr + 0, input_stride, vl); + vuint32m2x3_t tuple1 = __riscv_vlsseg3e32_v_u32m2x3(i_ptr + 4, input_stride, vl); + __riscv_vse32_v_u32m2(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 0), vl); + __riscv_vse32_v_u32m2(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 1), vl); + __riscv_vse32_v_u32m2(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 2), vl); + __riscv_vse32_v_u32m2(o_ptr + 3 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 3), vl); + __riscv_vse32_v_u32m2(o_ptr + 4 * output_stride_u32, __riscv_vget_v_u32m2x3_u32m2(tuple1, 0), vl); + __riscv_vse32_v_u32m2(o_ptr + 5 * output_stride_u32, __riscv_vget_v_u32m2x3_u32m2(tuple1, 1), vl); + __riscv_vse32_v_u32m2(o_ptr + 6 * output_stride_u32, __riscv_vget_v_u32m2x3_u32m2(tuple1, 2), vl); + break; + } + case 6: { + vuint32m2x4_t tuple0 = __riscv_vlsseg4e32_v_u32m2x4(i_ptr + 0, input_stride, vl); + vuint32m2x2_t tuple1 = __riscv_vlsseg2e32_v_u32m2x2(i_ptr + 4, input_stride, vl); + __riscv_vse32_v_u32m2(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 0), vl); + __riscv_vse32_v_u32m2(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 1), vl); + __riscv_vse32_v_u32m2(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 2), vl); + __riscv_vse32_v_u32m2(o_ptr + 3 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 3), vl); + __riscv_vse32_v_u32m2(o_ptr + 4 * output_stride_u32, __riscv_vget_v_u32m2x2_u32m2(tuple1, 0), vl); + __riscv_vse32_v_u32m2(o_ptr + 5 * output_stride_u32, __riscv_vget_v_u32m2x2_u32m2(tuple1, 1), vl); + break; + } + case 5: { + vuint32m2x4_t tuple0 = __riscv_vlsseg4e32_v_u32m2x4(i_ptr + 0, input_stride, vl); + vuint32m2_t tuple1 = __riscv_vlse32_v_u32m2(i_ptr + 4, input_stride, vl); + __riscv_vse32_v_u32m2(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 0), vl); + __riscv_vse32_v_u32m2(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 1), vl); + __riscv_vse32_v_u32m2(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 2), vl); + __riscv_vse32_v_u32m2(o_ptr + 3 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 3), vl); + __riscv_vse32_v_u32m2(o_ptr + 4 * output_stride_u32, tuple1, vl); + break; + } + case 4: { + vuint32m2x4_t tuple0 = __riscv_vlsseg4e32_v_u32m2x4(i_ptr + 0, input_stride, vl); + __riscv_vse32_v_u32m2(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 0), vl); + __riscv_vse32_v_u32m2(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 1), vl); + __riscv_vse32_v_u32m2(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 2), vl); + __riscv_vse32_v_u32m2(o_ptr + 3 * output_stride_u32, __riscv_vget_v_u32m2x4_u32m2(tuple0, 3), vl); + break; + } + case 3: { + vuint32m2x3_t tuple0 = __riscv_vlsseg3e32_v_u32m2x3(i_ptr + 0, input_stride, vl); + __riscv_vse32_v_u32m2(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m2x3_u32m2(tuple0, 0), vl); + __riscv_vse32_v_u32m2(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m2x3_u32m2(tuple0, 1), vl); + __riscv_vse32_v_u32m2(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m2x3_u32m2(tuple0, 2), vl); + break; + } + case 2: { + vuint32m2x2_t tuple0 = __riscv_vlsseg2e32_v_u32m2x2(i_ptr + 0, input_stride, 
vl); + __riscv_vse32_v_u32m2(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m2x2_u32m2(tuple0, 0), vl); + __riscv_vse32_v_u32m2(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m2x2_u32m2(tuple0, 1), vl); + break; + } + case 1: { + vuint32m2_t v = __riscv_vlse32_v_u32m2(i_ptr, input_stride, vl); + __riscv_vse32_v_u32m2(o_ptr, v, vl); + break; + } + default: + XNN_UNREACHABLE; + } + } + + bh += vl; + } +} diff --git a/src/x32-transposec/gen/x32-transposec-8xv4-rvv.c b/src/x32-transposec/gen/x32-transposec-8xv4-rvv.c new file mode 100644 index 00000000000..a3cedbcd0f1 --- /dev/null +++ b/src/x32-transposec/gen/x32-transposec-8xv4-rvv.c @@ -0,0 +1,146 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/x32-transposec/rvv-u.c.in +// Generator: tools/xngen +// +// Copyright 2023 SiFive, Inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +// Optimized by Autocomp (https://github.com/ucb-bar/autocomp) +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <riscv_vector.h> + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/transpose.h" + + +void xnn_x32_transposec_ukernel__8xv4_rvv( + const uint32_t* input, + uint32_t* output, + size_t input_stride, + size_t output_stride, + size_t block_width, + size_t block_height) +{ + assert(block_width == 1 || output_stride >= block_height * sizeof(uint32_t)); + assert(block_height == 1 || input_stride >= block_width * sizeof(uint32_t)); + + const size_t input_stride_u32 = input_stride / sizeof(uint32_t); + const size_t output_stride_u32 = output_stride / sizeof(uint32_t); + + for (size_t bh = 0; bh < block_height; ) { + const size_t vl = __riscv_vsetvl_e32m4(block_height - bh); + + const uint32_t* i_row = input + bh * input_stride_u32; + uint32_t* o_col = output + bh; + + size_t bw = 0; + for (; bw + 8 <= block_width; bw += 8) { + const uint32_t* i_ptr = i_row + bw; + uint32_t* o_ptr = o_col + bw * output_stride_u32; + + // Issue loads with the first half of each tuple's stores interleaved to + // cover segmented-load latency. + vuint32m4x2_t tuple0 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 0, input_stride, vl); + __riscv_vse32_v_u32m4(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple0, 0), vl); + vuint32m4x2_t tuple1 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 2, input_stride, vl); + __riscv_vse32_v_u32m4(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple1, 0), vl); + vuint32m4x2_t tuple2 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 4, input_stride, vl); + __riscv_vse32_v_u32m4(o_ptr + 4 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple2, 0), vl); + vuint32m4x2_t tuple3 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 6, input_stride, vl); + + // Drain remaining stores. + __riscv_vse32_v_u32m4(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple0, 1), vl); + __riscv_vse32_v_u32m4(o_ptr + 3 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple1, 1), vl); + __riscv_vse32_v_u32m4(o_ptr + 5 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple2, 1), vl); + __riscv_vse32_v_u32m4(o_ptr + 6 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple3, 0), vl); + __riscv_vse32_v_u32m4(o_ptr + 7 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple3, 1), vl); + } + + // Column tail: 0 < (block_width - bw) < TILE_WIDTH. + // Keep LMUL=4 so the same vl rows are processed, splitting the tail + // into legal vlsseg calls where needed (EMUL * NFIELDS <= 8).
+ if (bw < block_width) { + const size_t bw_tail = block_width - bw; + const uint32_t* i_ptr = i_row + bw; + uint32_t* o_ptr = o_col + bw * output_stride_u32; + switch (bw_tail) { + case 7: { + vuint32m4x2_t tuple0 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 0, input_stride, vl); + vuint32m4x2_t tuple1 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 2, input_stride, vl); + vuint32m4x2_t tuple2 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 4, input_stride, vl); + vuint32m4_t tuple3 = __riscv_vlse32_v_u32m4(i_ptr + 6, input_stride, vl); + __riscv_vse32_v_u32m4(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple0, 0), vl); + __riscv_vse32_v_u32m4(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple0, 1), vl); + __riscv_vse32_v_u32m4(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple1, 0), vl); + __riscv_vse32_v_u32m4(o_ptr + 3 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple1, 1), vl); + __riscv_vse32_v_u32m4(o_ptr + 4 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple2, 0), vl); + __riscv_vse32_v_u32m4(o_ptr + 5 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple2, 1), vl); + __riscv_vse32_v_u32m4(o_ptr + 6 * output_stride_u32, tuple3, vl); + break; + } + case 6: { + vuint32m4x2_t tuple0 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 0, input_stride, vl); + vuint32m4x2_t tuple1 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 2, input_stride, vl); + vuint32m4x2_t tuple2 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 4, input_stride, vl); + __riscv_vse32_v_u32m4(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple0, 0), vl); + __riscv_vse32_v_u32m4(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple0, 1), vl); + __riscv_vse32_v_u32m4(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple1, 0), vl); + __riscv_vse32_v_u32m4(o_ptr + 3 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple1, 1), vl); + __riscv_vse32_v_u32m4(o_ptr + 4 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple2, 0), vl); + __riscv_vse32_v_u32m4(o_ptr + 5 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple2, 1), vl); + break; + } + case 5: { + vuint32m4x2_t tuple0 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 0, input_stride, vl); + vuint32m4x2_t tuple1 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 2, input_stride, vl); + vuint32m4_t tuple2 = __riscv_vlse32_v_u32m4(i_ptr + 4, input_stride, vl); + __riscv_vse32_v_u32m4(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple0, 0), vl); + __riscv_vse32_v_u32m4(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple0, 1), vl); + __riscv_vse32_v_u32m4(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple1, 0), vl); + __riscv_vse32_v_u32m4(o_ptr + 3 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple1, 1), vl); + __riscv_vse32_v_u32m4(o_ptr + 4 * output_stride_u32, tuple2, vl); + break; + } + case 4: { + vuint32m4x2_t tuple0 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 0, input_stride, vl); + vuint32m4x2_t tuple1 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 2, input_stride, vl); + __riscv_vse32_v_u32m4(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple0, 0), vl); + __riscv_vse32_v_u32m4(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple0, 1), vl); + __riscv_vse32_v_u32m4(o_ptr + 2 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple1, 0), vl); + __riscv_vse32_v_u32m4(o_ptr + 3 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple1, 1), vl); + break; + } + case 3: { + vuint32m4x2_t tuple0 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 0, input_stride, vl); + vuint32m4_t tuple1 = 
__riscv_vlse32_v_u32m4(i_ptr + 2, input_stride, vl); + __riscv_vse32_v_u32m4(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple0, 0), vl); + __riscv_vse32_v_u32m4(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple0, 1), vl); + __riscv_vse32_v_u32m4(o_ptr + 2 * output_stride_u32, tuple1, vl); + break; + } + case 2: { + vuint32m4x2_t tuple0 = __riscv_vlsseg2e32_v_u32m4x2(i_ptr + 0, input_stride, vl); + __riscv_vse32_v_u32m4(o_ptr + 0 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple0, 0), vl); + __riscv_vse32_v_u32m4(o_ptr + 1 * output_stride_u32, __riscv_vget_v_u32m4x2_u32m4(tuple0, 1), vl); + break; + } + case 1: { + vuint32m4_t v = __riscv_vlse32_v_u32m4(i_ptr, input_stride, vl); + __riscv_vse32_v_u32m4(o_ptr, v, vl); + break; + } + default: + XNN_UNREACHABLE; + } + } + + bh += vl; + } +} diff --git a/src/x32-transposec/rvv-u.c.in b/src/x32-transposec/rvv-u.c.in new file mode 100644 index 00000000000..99171dfd2d1 --- /dev/null +++ b/src/x32-transposec/rvv-u.c.in @@ -0,0 +1,104 @@ +// Copyright 2023 SiFive, Inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// +// Optimized by Autocomp (https://github.com/ucb-bar/autocomp) +$assert LMUL in [1, 2, 4] +$TILE_WIDTH = 8 +$N_LOADS = LMUL +$SEG_PER_LOAD = 8 // LMUL +$HALF_SEG = SEG_PER_LOAD // 2 +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +#include <riscv_vector.h> + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/transpose.h" + + +void xnn_x32_transposec_ukernel__${TILE_WIDTH}xv${LMUL}_rvv( + const uint32_t* input, + uint32_t* output, + size_t input_stride, + size_t output_stride, + size_t block_width, + size_t block_height) +{ + assert(block_width == 1 || output_stride >= block_height * sizeof(uint32_t)); + assert(block_height == 1 || input_stride >= block_width * sizeof(uint32_t)); + + const size_t input_stride_u32 = input_stride / sizeof(uint32_t); + const size_t output_stride_u32 = output_stride / sizeof(uint32_t); + + for (size_t bh = 0; bh < block_height; ) { + const size_t vl = __riscv_vsetvl_e32m${LMUL}(block_height - bh); + + const uint32_t* i_row = input + bh * input_stride_u32; + uint32_t* o_col = output + bh; + + size_t bw = 0; + for (; bw + ${TILE_WIDTH} <= block_width; bw += ${TILE_WIDTH}) { + const uint32_t* i_ptr = i_row + bw; + uint32_t* o_ptr = o_col + bw * output_stride_u32; + + // Issue loads with the first half of each tuple's stores interleaved to + // cover segmented-load latency. + $for L in range(N_LOADS): + vuint32m${LMUL}x${SEG_PER_LOAD}_t tuple${L} = __riscv_vlsseg${SEG_PER_LOAD}e32_v_u32m${LMUL}x${SEG_PER_LOAD}(i_ptr + ${L * SEG_PER_LOAD}, input_stride, vl); + $if L + 1 < N_LOADS: + $for S in range(HALF_SEG): + __riscv_vse32_v_u32m${LMUL}(o_ptr + ${L * SEG_PER_LOAD + S} * output_stride_u32, __riscv_vget_v_u32m${LMUL}x${SEG_PER_LOAD}_u32m${LMUL}(tuple${L}, ${S}), vl); + + // Drain remaining stores. + $for L in range(N_LOADS): + $if L + 1 < N_LOADS: + $for S in range(HALF_SEG, SEG_PER_LOAD): + __riscv_vse32_v_u32m${LMUL}(o_ptr + ${L * SEG_PER_LOAD + S} * output_stride_u32, __riscv_vget_v_u32m${LMUL}x${SEG_PER_LOAD}_u32m${LMUL}(tuple${L}, ${S}), vl); + $else: + $for S in range(SEG_PER_LOAD): + __riscv_vse32_v_u32m${LMUL}(o_ptr + ${L * SEG_PER_LOAD + S} * output_stride_u32, __riscv_vget_v_u32m${LMUL}x${SEG_PER_LOAD}_u32m${LMUL}(tuple${L}, ${S}), vl); + } + + // Column tail: 0 < (block_width - bw) < TILE_WIDTH.
+ // Keep LMUL=${LMUL} so the same vl rows are processed, splitting the tail + // into legal vlsseg calls where needed (EMUL * NFIELDS <= 8). + if (bw < block_width) { + const size_t bw_tail = block_width - bw; + const uint32_t* i_ptr = i_row + bw; + uint32_t* o_ptr = o_col + bw * output_stride_u32; + switch (bw_tail) { + $for M in reversed(range(2, TILE_WIDTH)): + case ${M}: { + $for K in range(M // SEG_PER_LOAD): + vuint32m${LMUL}x${SEG_PER_LOAD}_t tuple${K} = __riscv_vlsseg${SEG_PER_LOAD}e32_v_u32m${LMUL}x${SEG_PER_LOAD}(i_ptr + ${K * SEG_PER_LOAD}, input_stride, vl); + $if M % SEG_PER_LOAD > 1: + vuint32m${LMUL}x${M % SEG_PER_LOAD}_t tuple${M // SEG_PER_LOAD} = __riscv_vlsseg${M % SEG_PER_LOAD}e32_v_u32m${LMUL}x${M % SEG_PER_LOAD}(i_ptr + ${(M // SEG_PER_LOAD) * SEG_PER_LOAD}, input_stride, vl); + $if M % SEG_PER_LOAD == 1: + vuint32m${LMUL}_t tuple${M // SEG_PER_LOAD} = __riscv_vlse32_v_u32m${LMUL}(i_ptr + ${(M // SEG_PER_LOAD) * SEG_PER_LOAD}, input_stride, vl); + $for K in range(M // SEG_PER_LOAD): + $for S in range(SEG_PER_LOAD): + __riscv_vse32_v_u32m${LMUL}(o_ptr + ${K * SEG_PER_LOAD + S} * output_stride_u32, __riscv_vget_v_u32m${LMUL}x${SEG_PER_LOAD}_u32m${LMUL}(tuple${K}, ${S}), vl); + $if M % SEG_PER_LOAD > 1: + $for S in range(M % SEG_PER_LOAD): + __riscv_vse32_v_u32m${LMUL}(o_ptr + ${(M // SEG_PER_LOAD) * SEG_PER_LOAD + S} * output_stride_u32, __riscv_vget_v_u32m${LMUL}x${M % SEG_PER_LOAD}_u32m${LMUL}(tuple${M // SEG_PER_LOAD}, ${S}), vl); + $if M % SEG_PER_LOAD == 1: + __riscv_vse32_v_u32m${LMUL}(o_ptr + ${(M // SEG_PER_LOAD) * SEG_PER_LOAD} * output_stride_u32, tuple${M // SEG_PER_LOAD}, vl); + break; + } + case 1: { + vuint32m${LMUL}_t v = __riscv_vlse32_v_u32m${LMUL}(i_ptr, input_stride, vl); + __riscv_vse32_v_u32m${LMUL}(o_ptr, v, vl); + break; + } + default: + XNN_UNREACHABLE; + } + } + + bh += vl; + } +} diff --git a/src/x32-transposec/x32-transposec.inc b/src/x32-transposec/x32-transposec.inc index 1327dcb65b8..552580906ae 100644 --- a/src/x32-transposec/x32-transposec.inc +++ b/src/x32-transposec/x32-transposec.inc @@ -57,6 +57,9 @@ XNN_TRANSPOSE_UKERNEL(xnn_arch_riscv_vector, xnn_x32_transposec_ukernel__4x4_rvv XNN_TRANSPOSE_UKERNEL(xnn_arch_riscv_vector, xnn_x32_transposec_ukernel__8x8_rvv, 32, uint32_t, 8, 8) XNN_TRANSPOSE_UKERNEL(xnn_arch_riscv_vector, xnn_x32_transposec_ukernel__16x8_rvv, 32, uint32_t, 16, 8) XNN_TRANSPOSE_UKERNEL(xnn_arch_riscv_vector, xnn_x32_transposec_ukernel__32x8_rvv, 32, uint32_t, 32, 8) +XNN_TRANSPOSE_UKERNEL(xnn_arch_riscv_vector, xnn_x32_transposec_ukernel__8xv1_rvv, 32, uint32_t, 4, 8) +XNN_TRANSPOSE_UKERNEL(xnn_arch_riscv_vector, xnn_x32_transposec_ukernel__8xv2_rvv, 32, uint32_t, 8, 8) +XNN_TRANSPOSE_UKERNEL(xnn_arch_riscv_vector, xnn_x32_transposec_ukernel__8xv4_rvv, 32, uint32_t, 16, 8) #endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR #if XNN_ARCH_ARM || XNN_ARCH_ARM64
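Usage sketch (illustrative only, not part of the patch): the three new ukernels follow the existing x32 transposec contract, where input_stride and output_stride are byte distances between consecutive rows of the source and destination, block_width is the number of input columns, and block_height is the number of input rows. A minimal caller of the LMUL=1 kernel, assuming tightly packed 5x3 input and 3x5 output buffers, could look like the sketch below; the wrapper name transpose_5x3 is hypothetical.

#include <stdint.h>

#include "src/xnnpack/transpose.h"  // assumed to declare the transposec ukernel prototypes

// Hypothetical wrapper: transpose a packed 5x3 uint32_t matrix into a 3x5
// destination with xnn_x32_transposec_ukernel__8xv1_rvv.
static void transpose_5x3(const uint32_t src[5][3], uint32_t dst[3][5]) {
  xnn_x32_transposec_ukernel__8xv1_rvv(
      &src[0][0], &dst[0][0],
      /*input_stride=*/3 * sizeof(uint32_t),   // bytes between input rows
      /*output_stride=*/5 * sizeof(uint32_t),  // bytes between output rows
      /*block_width=*/3,                       // input columns
      /*block_height=*/5);                     // input rows
}

With these arguments the 8-column main loop never runs; the whole matrix is handled by the block_width tail path (case 3, one vlsseg3e32 per group of rows).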