diff --git a/kernel/loongarch64/amax_lasx.S b/kernel/loongarch64/amax_lasx.S index e964d4ddb0..8d2acf283c 100644 --- a/kernel/loongarch64/amax_lasx.S +++ b/kernel/loongarch64/amax_lasx.S @@ -56,17 +56,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif + xvxor.v VM0, VM0, VM0 bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT -#ifdef DOUBLE - xvldrepl.d VM0, X, 0 -#else - xvldrepl.w VM0, X, 0 -#endif - XVFSUB VM0, VM0, VM0 bne INCX, TEMP, .L20 srai.d I, N, 4 diff --git a/kernel/loongarch64/asum_lasx.S b/kernel/loongarch64/asum_lasx.S index 9a2c031f36..e5ab4df3d7 100644 --- a/kernel/loongarch64/asum_lasx.S +++ b/kernel/loongarch64/asum_lasx.S @@ -103,21 +103,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfadd.d res1, VX2, res1 xvfadd.d res1, VX3, res1 #else - xvfadd.s res2, res1, res2 xvpickve.w VX1, res1, 1 xvpickve.w VX2, res1, 2 xvpickve.w VX3, res1, 3 xvfadd.s res1, VX1, res1 xvfadd.s res1, VX2, res1 xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 + xvpickve.w VX0, res1, 4 + xvpickve.w VX1, res1, 5 + xvpickve.w VX2, res1, 6 + xvpickve.w VX3, res1, 7 xvfadd.s res1, VX0, res1 xvfadd.s res1, VX1, res1 xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 #endif .align 3 @@ -217,21 +216,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfadd.d res1, VX2, res1 xvfadd.d res1, VX3, res1 #else - xvfadd.s res2, res1, res2 xvpickve.w VX1, res1, 1 xvpickve.w VX2, res1, 2 xvpickve.w VX3, res1, 3 xvfadd.s res1, VX1, res1 xvfadd.s res1, VX2, res1 xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 + xvpickve.w VX0, res1, 4 + xvpickve.w VX1, res1, 5 + xvpickve.w VX2, res1, 6 + xvpickve.w VX3, res1, 7 xvfadd.s res1, VX0, res1 xvfadd.s res1, VX1, res1 xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 #endif .align 3 diff --git a/kernel/loongarch64/cdot_lasx.S b/kernel/loongarch64/cdot_lasx.S index 0583e56ead..32b8bf9825 100644 --- a/kernel/loongarch64/cdot_lasx.S +++ b/kernel/loongarch64/cdot_lasx.S @@ -288,7 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w x2, t2, 6 xvinsgr2vr.w x1, t3, 7 xvinsgr2vr.w x2, t4, 7 - addi.d Y, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE xvpickev.w x3, VX3, VX2 xvpickod.w x4, VX3, VX2 xvfmadd.s res1, x1, x3, res1 diff --git a/kernel/loongarch64/cnrm2_lasx.S b/kernel/loongarch64/cnrm2_lasx.S index 3a60069ac8..034f9de794 100644 --- a/kernel/loongarch64/cnrm2_lasx.S +++ b/kernel/loongarch64/cnrm2_lasx.S @@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VX4 $xr21 #define res1 $xr19 #define res2 $xr20 +#define RCP $f2 +#define VALPHA $xr3 PROLOGUE @@ -55,10 +57,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
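Note: in the asum_lasx.S hunks above, the single-precision reduction previously read lanes 4..7 out of res2 and added the VX2 lane twice while never adding VX3; the fix sources every lane from res1 and restores the VX3 term. For reference, a scalar model of the corrected eight-lane fold (illustrative C, not part of the patch):

/* Plain C stand-in for the fixed xvpickve.w / xvfadd.s chain that
 * folds all eight float lanes of the 256-bit accumulator (res1)
 * into one scalar. */
float reduce_lanes8(const float lane[8])
{
    float r = lane[0];
    for (int i = 1; i < 8; i++)   /* xvpickve.w VXk, res1, i; xvfadd.s */
        r += lane[i];
    return r;
}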
LDINT INCX, 0(INCX) #endif - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 bge $r0, N, .L999 beq $r0, INCX, .L999 + + addi.d $sp, $sp, -32 + st.d $ra, $sp, 0 + st.d N, $sp, 8 + st.d X, $sp, 16 + st.d INCX, $sp, 24 +#ifdef DYNAMIC_ARCH + bl camax_k_LA264 +#else + bl camax_k +#endif + ld.d $ra, $sp, 0 + ld.d N, $sp, 8 + ld.d X, $sp, 16 + ld.d INCX, $sp, 24 + addi.d $sp, $sp, 32 + + frecip.s RCP, $f0 + vreplvei.w $vr3, $vr2, 0 + xvpermi.d VALPHA, $xr3,0x00 + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + fcmp.ceq.s $fcc0, $f0, $f19 + bcnez $fcc0, .L999 + li.d TEMP, SIZE slli.d INCX, INCX, ZBASE_SHIFT srai.d I, N, 2 @@ -67,13 +92,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L10: - xvld VX0, X, 0 * SIZE - xvfcvtl.d.s VX1, VX0 - xvfcvth.d.s VX2, VX0 - xvfmadd.d res1, VX1, VX1, res1 - xvfmadd.d res2, VX2, VX2, res2 addi.d I, I, -1 - addi.d X, X, 8 * SIZE + + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfmul.s VX0, VX0, VALPHA + xvfmul.s VX1, VX1, VALPHA + xvfmadd.s res1, VX0, VX0, res1 + xvfmadd.s res2, VX1, VX1, res2 + + addi.d X, X, 16 * SIZE blt $r0, I, .L10 .align 3 b .L996 @@ -103,22 +131,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 add.d X, X, INCX - xvfcvtl.d.s VX1, VX0 - xvfcvth.d.s VX2, VX0 - xvfmadd.d res1, VX1, VX1, res1 - xvfmadd.d res2, VX2, VX2, res2 + xvfmul.s VX0, VX0, VALPHA + xvfmadd.s res2, VX0, VX0, res2 addi.d I, I, -1 blt $r0, I, .L21 b .L996 .L996: - xvfadd.d res1, res1, res2 - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 + xvfadd.s res1, res1, res2 + xvpermi.d VX1, res1, 0x4e + xvfadd.s res1, res1, VX1 + vreplvei.w $vr17, $vr19, 1 + vreplvei.w $vr18, $vr19, 2 + vreplvei.w $vr21, $vr19, 3 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvfadd.s res1, VX4, res1 .align 3 .L997: @@ -130,18 +158,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fld.s a1, X, 0 * SIZE fld.s a2, X, 1 * SIZE addi.d I, I, -1 - fcvt.d.s a1, a1 - fcvt.d.s a2, a2 - fmadd.d res, a1, a1, res - fmadd.d res, a2, a2, res + fmul.s a1, a1, RCP + fmul.s a2, a2, RCP + fmadd.s res, a1, a1, res + fmadd.s res, a2, a2, res add.d X, X, INCX blt $r0, I, .L998 .align 3 .L999: - fsqrt.d res, res + fsqrt.s res, res + fmul.s $f0, res, $f0 move $r4, $r17 - fcvt.s.d $f0, res jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/copy_lasx.S b/kernel/loongarch64/copy_lasx.S index 31f91cec1c..6a723fb1e8 100644 --- a/kernel/loongarch64/copy_lasx.S +++ b/kernel/loongarch64/copy_lasx.S @@ -260,9 +260,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d Y, Y, INCY ST a2, Y, 0 add.d Y, Y, INCY - ST a3, X, 0 + ST a3, Y, 0 add.d Y, Y, INCY - ST a4, X, 0 + ST a4, Y, 0 add.d Y, Y, INCY LD a1, X, 0 add.d X, X, INCX @@ -276,9 +276,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d Y, Y, INCY ST a2, Y, 0 add.d Y, Y, INCY - ST a3, X, 0 + ST a3, Y, 0 add.d Y, Y, INCY - ST a4, X, 0 + ST a4, Y, 0 add.d Y, Y, INCY addi.d I, I, -1 blt $r0, I, .L222 diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S index f535266630..e32071def7 100644 --- a/kernel/loongarch64/cscal_lasx.S +++ b/kernel/loongarch64/cscal_lasx.S @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
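Note: the cnrm2_lasx.S change above replaces the widen-to-double accumulation with a scale-by-reciprocal scheme: it calls the camax kernel (camax_k, or camax_k_LA264 under DYNAMIC_ARCH) to get a magnitude bound, returns early when that bound is zero, multiplies every element by the frecip.s of the bound, accumulates squares in single precision, and rescales after fsqrt.s. A loose scalar sketch, modelling the bound as the largest |component| (the measure the real camax kernel uses may differ):

/* Sketch only; avoids overflow/underflow of x*x by working on x/amax. */
#include <math.h>

float cnrm2_model(int n, const float *x /* 2*n floats, (re,im) pairs */)
{
    float amax = 0.0f;                    /* camax_k result in $f0 (modelled) */
    for (int i = 0; i < 2 * n; i++) {
        float a = fabsf(x[i]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0f) return 0.0f;        /* fcmp.ceq.s early exit */
    float rcp = 1.0f / amax, sum = 0.0f;  /* frecip.s RCP, $f0 */
    for (int i = 0; i < 2 * n; i++) {
        float t = x[i] * rcp;             /* xvfmul.s VX0, VX0, VALPHA */
        sum += t * t;                     /* xvfmadd.s res1, VX0, VX0, res1 */
    }
    return sqrtf(sum) * amax;             /* fsqrt.s res; fmul.s $f0, res, $f0 */
}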
#define ALPHAI $f1 #define X $r7 #define INCX $r8 +#define DUMMY2 $r9 #define I $r12 #define TEMP $r13 @@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, N, .L999 bge $r0, INCX, .L999 + ld.d DUMMY2, $sp, 0 li.d TEMP, 1 movgr2fr.d a1, $r0 FFINT a1, a1 @@ -86,24 +88,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif bne INCX, TEMP, .L22 +/////// INCX == 1 //////// .L11: - bge $r0, I, .L997 CMPEQ $fcc0, ALPHAR, a1 CMPEQ $fcc1, ALPHAI, a1 - bceqz $fcc0, .L13 - b .L14 - .align 3 + bge $r0, I, .L19 +/////// INCX == 1 && N >= 4 //////// + bnez DUMMY2, .L17 // if DUMMPY2 == 1, called from c/zscal. -.L13: - bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 - b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + bceqz $fcc0, .L17 -.L14: - bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 - b .L111 //alpha_r == 0.0 && alpha_i == 0.0 - .align 3 + bceqz $fcc1, .L17 -.L111: //alpha_r == 0.0 && alpha_i == 0.0 +.L15: //alpha_r == 0.0 && alpha_i == 0.0 xvst VXZ, X, 0 * SIZE #ifdef DOUBLE xvst VXZ, X, 4 * SIZE @@ -113,41 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d X, X, 16 * SIZE #endif addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L113: //alpha_r != 0.0 && alpha_i == 0.0 - xvld VX0, X, 0 * SIZE -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvpickev.d x1, VX1, VX0 - xvpickod.d x2, VX1, VX0 - xvfmul.d x3, VXAR, x1 - xvfmul.d x4, VXAR, x2 - xvilvl.d VX2, x4 ,x3 - xvilvh.d VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE -#else - xvld VX1, X, 8 * SIZE - xvpickev.w x1, VX1, VX0 - xvpickod.w x2, VX1, VX0 - xvfmul.s x3, VXAR, x1 - xvfmul.s x4, VXAR, x2 - xvilvl.w VX2, x4 ,x3 - xvilvh.w VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 8 * SIZE - addi.d X, X, 16 * SIZE -#endif - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 + blt $r0, I, .L15 + b .L19 .align 3 -.L114: //alpha_r != 0.0 && alpha_i != 0.0 +.L17: xvld VX0, X, 0 * SIZE #ifdef DOUBLE xvld VX1, X, 4 * SIZE @@ -177,29 +144,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d X, X, 16 * SIZE #endif addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 + blt $r0, I, .L17 + b .L19 .align 3 +/////// INCX == 1 && N < 8 /////// +.L19: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + beqz I, .L999 + bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. + + bceqz $fcc0, .L998 + + bceqz $fcc1, .L998 + + b .L995 // alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +/////// INCX != 1 //////// .L22: - bge $r0, I, .L997 - move XX, X CMPEQ $fcc0, ALPHAR, a1 CMPEQ $fcc1, ALPHAI, a1 - bceqz $fcc0, .L23 - b .L24 - .align 3 - -.L23: - bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 - b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + move XX, X + bge $r0, I, .L29 + bnez DUMMY2, .L25 // if DUMMPY2 == 1, called from c/zscal. + bceqz $fcc0, .L25 -.L24: - bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 - b .L221 //alpha_r == 0.0 && alpha_i == 0.0 - .align 3 + bceqz $fcc1, .L25 -.L221: //alpha_r == 0.0 && alpha_i == 0.0 +.L27: //alpha_r == 0.0 && alpha_i == 0.0 #ifdef DOUBLE xvstelm.d VXZ, X, 0, 0 xvstelm.d VXZ, X, 1 * SIZE, 0 @@ -239,122 +216,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif add.d X, X, INCX addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 - .align 3 - -.L223: //alpha_r != 0.0 && alpha_i == 0.0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.d x1, t1, 0 - xvinsgr2vr.d x2, t2, 0 - xvinsgr2vr.d x1, t3, 1 - xvinsgr2vr.d x2, t4, 1 - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - xvinsgr2vr.d x1, t1, 2 - xvinsgr2vr.d x2, t2, 2 - xvinsgr2vr.d x1, t3, 3 - xvinsgr2vr.d x2, t4, 3 - add.d X, X, INCX - - xvfmul.d x3, VXAR, x1 - xvfmul.d x4, VXAR, x2 - addi.d I, I, -1 - xvstelm.d x3, XX, 0 * SIZE, 0 - xvstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 1 - xvstelm.d x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 2 - xvstelm.d x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 3 - xvstelm.d x4, XX, 1 * SIZE, 3 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 0 - xvinsgr2vr.w x2, t2, 0 - xvinsgr2vr.w x1, t3, 1 - xvinsgr2vr.w x2, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 2 - xvinsgr2vr.w x2, t2, 2 - xvinsgr2vr.w x1, t3, 3 - xvinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 4 - xvinsgr2vr.w x2, t2, 4 - xvinsgr2vr.w x1, t3, 5 - xvinsgr2vr.w x2, t4, 5 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 6 - xvinsgr2vr.w x2, t2, 6 - xvinsgr2vr.w x1, t3, 7 - xvinsgr2vr.w x2, t4, 7 - add.d X, X, INCX - - xvfmul.s x3, VXAR, x1 - xvfmul.s x4, VXAR, x2 - addi.d I, I, -1 - xvstelm.w x3, XX, 0 * SIZE, 0 - xvstelm.w x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 1 - xvstelm.w x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 2 - xvstelm.w x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 3 - xvstelm.w x4, XX, 1 * SIZE, 3 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 4 - xvstelm.w x4, XX, 1 * SIZE, 4 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 5 - xvstelm.w x4, XX, 1 * SIZE, 5 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 6 - xvstelm.w x4, XX, 1 * SIZE, 6 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 7 - xvstelm.w x4, XX, 1 * SIZE, 7 -#endif - add.d XX, XX, INCX - blt $r0, I, .L223 - b .L997 + blt $r0, I, .L27 + b .L29 .align 3 -.L224: //alpha_r != 0.0 && alpha_i != 0.0 +.L25: #ifdef DOUBLE ld.d t1, X, 0 * SIZE ld.d t2, X, 1 * SIZE @@ -376,7 +242,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.d x1, t3, 3 xvinsgr2vr.d x2, t4, 3 add.d X, X, INCX - xvfmul.d VX0, VXAI, x2 xvfmsub.d x3, VXAR, x1, VX0 xvfmul.d VX1, VXAI, x1 @@ -434,7 +299,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w x1, t3, 7 xvinsgr2vr.w x2, t4, 7 add.d X, X, INCX - xvfmul.s VX0, VXAI, x2 xvfmsub.s x3, VXAR, x1, VX0 xvfmul.s VX1, VXAI, x1 @@ -465,19 +329,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
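Note: the surviving cscal_lasx.S multiply path above (.L17/.L25) evaluates (alpha_r + i*alpha_i)*(x_r + i*x_i) with an xvfmsub/xvfmadd pair over the deinterleaved real and imaginary lanes, and the new DUMMY2 stack argument forces that path even when alpha == 0 whenever the kernel is entered from c/zscal. A scalar stand-in for the two paths (sketch, illustrative names):

#include <stddef.h>

void cscal_model(size_t n, float ar, float ai,
                 float *x /* n (re,im) pairs */, int dummy2)
{
    if (!dummy2 && ar == 0.0f && ai == 0.0f) {  /* .L15/.L995: zero fill */
        for (size_t i = 0; i < 2 * n; i++) x[i] = 0.0f;
        return;
    }
    for (size_t i = 0; i < n; i++) {      /* .L17/.L25: full complex multiply */
        float xr = x[2 * i], xi = x[2 * i + 1];
        x[2 * i]     = ar * xr - ai * xi; /* xvfmsub: VXAR*x1 - VXAI*x2 */
        x[2 * i + 1] = ai * xr + ar * xi; /* xvfmadd: VXAI*x1 + VXAR*x2 */
    }
}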
xvstelm.w x4, XX, 1 * SIZE, 7 #endif add.d XX, XX, INCX - blt $r0, I, .L224 - b .L997 + blt $r0, I, .L25 + b .L29 .align 3 -.L997: +/////// INCX != 1 && N < 8 /////// +.L29: #ifdef DOUBLE - andi I, N, 3 + andi I, N, 3 #else - andi I, N, 7 + andi I, N, 7 #endif - bge $r0, I, .L999 - .align 3 + beqz I, .L999 + bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. + + bceqz $fcc0, .L998 + bceqz $fcc1, .L998 + +.L995: // alpha_r == 0.0 && alpha_i == 0.0 + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L995 + b .L999 .L998: LD a1, X, 0 * SIZE LD a2, X, 1 * SIZE @@ -490,11 +366,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ST s2, X, 1 * SIZE add.d X, X, INCX blt $r0, I, .L998 - .align 3 + b .L999 .L999: move $r4, $r12 jirl $r0, $r1, 0x0 .align 3 - EPILOGUE diff --git a/kernel/loongarch64/dot_lasx.S b/kernel/loongarch64/dot_lasx.S index 11c896cb9a..e72f848f89 100644 --- a/kernel/loongarch64/dot_lasx.S +++ b/kernel/loongarch64/dot_lasx.S @@ -53,8 +53,8 @@ PROLOGUE #endif /* init $f8 and $f9 to zero */ - SUB s1, s1, s1 - SUB s2, s2, s2 + xvxor.v $xr8, $xr8, $xr8 + xvxor.v $xr9, $xr9, $xr9 slli.d INCX, INCX, BASE_SHIFT li.d TEMP, SIZE slli.d INCY, INCY, BASE_SHIFT @@ -64,20 +64,6 @@ PROLOGUE /* !((inc_x == 1) && (inc_y == 1)) */ - /* init $xr8 and $xr9 to zero */ -#ifdef DOUBLE - xvldrepl.d $xr0, X, 0 -#else - xvldrepl.w $xr0, X, 0 -#endif -#ifdef DSDOT - xvfcvtl.d.s $xr0, $xr0 - xvfsub.d $xr8, $xr0, $xr0 - xvfsub.d $xr9, $xr0, $xr0 -#else - XVFSUB $xr8, $xr0, $xr0 - XVFSUB $xr9, $xr0, $xr0 -#endif #ifdef DOUBLE srai.d I, N, 4 @@ -99,31 +85,31 @@ PROLOGUE addi.w I, I, -1 addi.d X, X, 128 addi.d Y, Y, 128 -#ifdef DSDOT +#ifndef DOUBLE xvfcvtl.d.s $xr10, $xr0 xvfcvtl.d.s $xr11, $xr4 xvfcvth.d.s $xr12, $xr0 xvfcvth.d.s $xr13, $xr4 - xvfmadd.d $xr8, $xr10, $xr12, $xr8 - xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfmadd.d $xr8, $xr10, $xr11, $xr8 + xvfmadd.d $xr9, $xr12, $xr13, $xr9 xvfcvtl.d.s $xr10, $xr1 xvfcvtl.d.s $xr11, $xr5 xvfcvth.d.s $xr12, $xr1 xvfcvth.d.s $xr13, $xr5 - xvfmadd.d $xr8, $xr10, $xr12, $xr8 - xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfmadd.d $xr8, $xr10, $xr11, $xr8 + xvfmadd.d $xr9, $xr12, $xr13, $xr9 xvfcvtl.d.s $xr10, $xr2 xvfcvtl.d.s $xr11, $xr6 xvfcvth.d.s $xr12, $xr2 xvfcvth.d.s $xr13, $xr6 - xvfmadd.d $xr8, $xr10, $xr12, $xr8 - xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfmadd.d $xr8, $xr10, $xr11, $xr8 + xvfmadd.d $xr9, $xr12, $xr13, $xr9 xvfcvtl.d.s $xr10, $xr3 xvfcvtl.d.s $xr11, $xr7 xvfcvth.d.s $xr12, $xr3 xvfcvth.d.s $xr13, $xr7 - xvfmadd.d $xr8, $xr10, $xr12, $xr8 - xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfmadd.d $xr8, $xr10, $xr11, $xr8 + xvfmadd.d $xr9, $xr12, $xr13, $xr9 #else XVFMADD $xr8, $xr0, $xr4, $xr8 XVFMADD $xr9, $xr1, $xr5, $xr9 @@ -149,13 +135,13 @@ PROLOGUE addi.w I, I, -1 addi.d X, X, 32 addi.d Y, Y, 32 -#ifdef DSDOT +#ifndef DOUBLE xvfcvtl.d.s $xr10, $xr0 xvfcvtl.d.s $xr11, $xr4 xvfcvth.d.s $xr12, $xr0 xvfcvth.d.s $xr13, $xr4 - xvfmadd.d $xr8, $xr10, $xr12, $xr8 - xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfmadd.d $xr8, $xr10, $xr11, $xr8 + xvfmadd.d $xr9, $xr12, $xr13, $xr9 #else XVFMADD $xr8, $xr0, $xr4, $xr8 #endif @@ -163,27 +149,12 @@ PROLOGUE .align 3 .L14: /* store dot in s1 $f8 */ -#ifdef DSDOT xvfadd.d $xr8, $xr8, $xr9 - fsub.s s2, s2, s2 /* set s2 to 0.0 */ + fsub.d s2, s2, s2 /* set s2 to 0.0 */ xvpermi.q $xr0, $xr8, 0x1 vfadd.d $vr8, $vr8, $vr0 vpackod.d $vr0, $vr8, $vr8 vfadd.d $vr8, $vr8, $vr0 -#else - XVFADD $xr8, $xr8, $xr9 - SUB s2, s2, s2 /* set s2 to 0.0 */ - xvpermi.q 
$xr0, $xr8, 0x1 - VFADD $vr8, $vr8, $vr0 - vpackod.d $vr0, $vr8, $vr8 -#ifdef DOUBLE - VFADD $vr8, $vr8, $vr0 -#else - VFADD $vr8, $vr8, $vr0 - vpackod.w $vr0, $vr8, $vr8 - VFADD $vr8, $vr8, $vr0 -#endif /* defined DOUBLE */ -#endif /* defined DSDOT */ .align 3 .L15: #ifdef DOUBLE @@ -197,7 +168,7 @@ PROLOGUE /* FLOAT: 1~7 ; DOUBLE: 1~3 */ LD a1, X, 0 LD b1, Y, 0 -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -240,7 +211,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -252,7 +223,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -264,7 +235,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -276,7 +247,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -288,7 +259,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -300,7 +271,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -312,7 +283,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -325,7 +296,7 @@ PROLOGUE LD b1, Y, 0 * SIZE add.d Y, Y, INCY addi.d I, I, -1 -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -346,7 +317,7 @@ PROLOGUE LD b1, Y, 0 * SIZE add.d Y, Y, INCY addi.d I, I, -1 -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -357,12 +328,13 @@ PROLOGUE .align 3 .L999: -#ifdef DSDOT fadd.d $f0, s1, s2 + move $r4, $r17 +#if defined(DOUBLE) +#elif defined(DSDOT) #else - ADD $f0, s1, s2 + fcvt.s.d $f0, $f0 #endif - move $r4, $r17 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/iamax_lasx.S b/kernel/loongarch64/iamax_lasx.S index 090da3004e..a573fd4b77 100644 --- a/kernel/loongarch64/iamax_lasx.S +++ b/kernel/loongarch64/iamax_lasx.S @@ -56,25 +56,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VI3 $xr8 #define VI4 $xr19 #define VT0 $xr23 +#define VZE $xr3 +#define VT1 $xr4 +#define VT2 $xr5 +#define VC0 $xr6 PROLOGUE li.d i0, 0 bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 + xvldi VZE, 0 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 #ifdef DOUBLE + xvfsub.d VT1, VZE, VM0 addi.d i0, i0, 1 srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 + xvfmaxa.d VM0, VM0, VT1 + bge $r0, I, .L11 + slli.d i0, i0, 1 //2 xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 + slli.d i0, i0, 1 //4 xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 + addi.d i0, i0, -7 xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 1 @@ -82,19 +89,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
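Note: the dot_lasx.S hunks above correct the mixed-precision lane pairing, multiplying the low-half conversions of X and Y together (xr10*xr11) and the high halves together (xr12*xr13) instead of crossing X's own halves, and they widen every non-double input (#ifndef DOUBLE) so plain SDOT also accumulates in double before the final fcvt.s.d. A scalar model of that accumulation (sketch only):

double sdot_model(int n, const float *x, const float *y)
{
    double s1 = 0.0, s2 = 0.0;            /* the $xr8 / $xr9 accumulators */
    int i = 0;
    for (; i + 1 < n; i += 2) {           /* two independent partial sums */
        s1 += (double)x[i]     * (double)y[i];
        s2 += (double)x[i + 1] * (double)y[i + 1];
    }
    for (; i < n; i++)
        s1 += (double)x[i] * (double)y[i];
    return s1 + s2;                       /* .L999: fadd.d, then fcvt.s.d
                                             unless DSDOT */
}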
xvinsgr2vr.d VI1, i0, 2 addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 + xvinsgr2vr.d VI0, i0, 0 //initialize the index value for vectorization addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 + xvinsgr2vr.d VI0, i0, 1 addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 + xvinsgr2vr.d VI0, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 #else + xvfsub.s VT1, VZE, VM0 addi.w i0, i0, 1 srai.d I, N, 3 + xvfmaxa.s VM0, VM0, VT1 bge $r0, I, .L21 - slli.w i0, i0, 3 //8 + slli.w i0, i0, 2 //4 + xvreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 xvreplgr2vr.w VINC8, i0 addi.w i0, i0, -15 xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization @@ -135,73 +146,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef DOUBLE xvld VX0, X, 0 * SIZE xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE + xvld VX1, X, 2 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfsub.d VT1, VZE, VX0 + xvfsub.d VT2, VZE, VX1 + xvfmaxa.d VX0, VX0, VT1 + xvfmaxa.d VX1, VX1, VT2 + xvfcmp.clt.d VT0, VX0, VX1 //abx(x0) < abs(x1) + xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf) + xvbitsel.v x2, VI1, VI2, VT0 //i + + xvld VX0, X, 4 * SIZE + xvadd.d VI1, VI2, VINC4 + xvld VX1, X, 6 * SIZE xvadd.d VI2, VI1, VINC4 - xvfmaxa.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 + xvfsub.d VT1, VZE, VX0 + xvfsub.d VT2, VZE, VX1 + xvfmaxa.d VX0, VX0, VT1 + xvfmaxa.d VX1, VX1, VT2 + xvfcmp.clt.d VT0, VX0, VX1 + xvbitsel.v x3, VX0, VX1, VT0 //abs(maxf) + xvbitsel.v x4, VI1, VI2, VT0 //i + xvfcmp.clt.d VC0, x1, x3 + xvbitsel.v x1, x1, x3, VC0 //abs(maxf) + xvbitsel.v x2, x2, x4, VC0 //i + xvfcmp.clt.d VT0, VM0, x1 addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmaxa.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VI0, x2, VT0 #else xvld VX0, X, 0 * SIZE - addi.d I, I, -1 xvadd.w VI1, VI1, VINC8 - xvfmaxa.s VM1, VX0, VM0 - xvfcmp.ceq.s VT0, VM0, VM1 + xvld VX1, X, 4 * SIZE + xvadd.w VI2, VI1, VINC4 + xvfsub.s VT1, VZE, VX0 + xvfsub.s VT2, VZE, VX1 + xvfmaxa.s VX0, VX0, VT1 + xvfmaxa.s VX1, VX1, VT2 + xvfcmp.clt.s VT0, VX0, VX1 + xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf) + xvbitsel.v x2, VI1, VI2, VT0 //i + addi.d I, I, -1 + xvfcmp.clt.s VT0, VM0, x1 addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VI0, x2, VT0 + #endif blt $r0, I, .L10 .align 3 .L15: #ifdef DOUBLE - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 + vreplvei.d $vr21, $vr20, 0 + vreplvei.d $vr22, $vr20, 1 + vreplvei.d $vr9, $vr15, 0 + vreplvei.d $vr10, $vr15, 1 + fcmp.ceq.d $fcc0, $f9, $f10 + bceqz $fcc0, .L16 + xvfcmp.clt.d VT0, VI1, VI2 + xvbitsel.v VI0, VI2, VI1, VT0 + b .L17 #else - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 + vreplvei.w $vr21, $vr20, 0 + vreplvei.w $vr22, $vr20, 1 + vreplvei.w $vr8, $vr20, 2 + vreplvei.w $vr19, $vr20, 3 + vreplvei.w $vr9, $vr15, 0 + vreplvei.w $vr10, $vr15, 1 + vreplvei.w $vr11, $vr15, 2 + vreplvei.w $vr12, $vr15, 3 + b .L26 #endif - 
XVFMAXA VM1, x1, x2 - XVCMPEQ VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - XVFMAXA VM0, x3, x4 - XVCMPEQ VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - XVFMAXA VM0, VM0, VM1 - XVCMPEQ VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - CMPEQ $fcc0, $f15, $f9 - bceqz $fcc0, .L26 - XVCMPLT VT0, VI1, VI0 + .align 3 + +#ifdef DOUBLE +.L16: + xvfcmp.clt.d VT0, x1, x2 + xvbitsel.v VI0, VI1, VI2, VT0 + xvbitsel.v VM0, x1, x2, VT0 + .align 3 + +.L17: + movfr2gr.d i0, $f20 + .align 3 + +.L11: //INCX==1 and N<8 + andi I, N, 7 + bge $r0, I, .L14 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L13: + fld.d $f9, X, 0 + fsub.d $f10, $f3, $f9 + xvfmaxa.d x1, x1, x2 + xvfcmp.clt.d VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 + addi.d I, I, -1 + addi.d i1, i1, 1 + addi.d X, X, SIZE + movgr2fr.d $f21, i1 + blt $r0, I, .L13 + movfr2gr.d i0, $f20 + .align 3 + +.L14: + move $r4, $r17 + jirl $r0, $r1, 0x0 .align 3 .L20: // INCX!=1 move TEMP, X -#ifdef DOUBLE addi.d i0, i0, 1 ld.d t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -210,34 +272,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, I, .L21 ld.d t2, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 + slli.d i0, i0, 1 //2 + xvfsub.d VT1, VZE, VM0 xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 + slli.d i0, i0, 1 //4 xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 + addi.d i0, i0, -7 + xvfmaxa.d VM0, VM0, VT1 xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 + addi.d i0, i0, 3 xvinsgr2vr.d VI0, i0, 0 //1 addi.d i0, i0, 1 xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t2, 1 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t2, 1 + xvadd.d VI2, VI1, VINC4 + + xvfsub.d VT1, VZE, VX0 + xvfsub.d VT2, VZE, VX1 + xvfmaxa.d VX0, VX0, VT1 + xvfmaxa.d VX1, VX1, VT2 + xvfcmp.clt.d VT0, VX0, VX1 + xvbitsel.v x1, VX0, VX1, VT0 + xvbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t2, 1 + xvadd.d VI1, VI2, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t2, 1 + xvadd.d VI2, VI1, VINC4 + xvfsub.d VT1, VZE, VX0 + xvfsub.d VT2, VZE, VX1 + xvfmaxa.d VX0, VX0, VT1 + xvfmaxa.d VX1, VX1, VT2 + xvfcmp.clt.d VT0, VX0, VX1 + xvbitsel.v x3, VX0, VX1, VT0 + xvbitsel.v x4, VI1, VI2, VT0 + xvfcmp.clt.d VC0, x1, x3 + xvbitsel.v x1, x1, x3, VC0 + xvbitsel.v x2, x2, x4, VC0 + xvfcmp.clt.d VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VI0, x2, VT0 + + addi.d I, I, -1 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d $vr21, $vr20, 0 + vreplvei.d $vr22, $vr20, 1 + vreplvei.d $vr9, $vr15, 0 + vreplvei.d $vr10, $vr15, 1 + fcmp.ceq.d $fcc0, $f10, 
$f9 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI2 + xvbitsel.v VI0, VI2, VI1, VT0 + b .L27 + .align 3 + +.L26: + xvfcmp.clt.d VT0, x1, x2 + xvbitsel.v VI0, VI1, VI2, VT0 + xvbitsel.v VM0, x1, x2, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + #else +.L20: // INCX!=1 + move TEMP, X addi.w i0, i0, 1 ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -253,19 +384,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w VM0, t2, 1 xvinsgr2vr.w VM0, t3, 2 xvinsgr2vr.w VM0, t4, 3 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.w VM0, t1, 4 - xvinsgr2vr.w VM0, t2, 5 - xvinsgr2vr.w VM0, t3, 6 - xvinsgr2vr.w VM0, t4, 7 - slli.w i0, i0, 3 //8 + slli.w i0, i0, 2 //4 + xvreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 xvreplgr2vr.w VINC8, i0 addi.w i0, i0, -15 xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization @@ -275,15 +396,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w VI1, i0, 2 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 1 - xvinsgr2vr.w VI1, i0, 4 - addi.w i0, i0, 1 - xvinsgr2vr.w VI1, i0, 5 - addi.w i0, i0, 1 - xvinsgr2vr.w VI1, i0, 6 - addi.w i0, i0, 1 - xvinsgr2vr.w VI1, i0, 7 - addi.w i0, i0, 1 + addi.w i0, i0, 5 xvinsgr2vr.w VI0, i0, 0 //1 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 1 //2 @@ -291,54 +404,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w VI0, i0, 2 //3 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 3 //4 - addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 4 //5 - addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 5 //6 - addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 6 //7 - addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 7 //8 -#endif .align 3 .L24: -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfmaxa.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmaxa.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 -#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -351,6 +419,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w VX0, t2, 1 xvinsgr2vr.w VX0, t3, 2 xvinsgr2vr.w VX0, t4, 3 + xvadd.w VI1, VI1, VINC8 ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -359,158 +428,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add.d X, X, INCX ld.w t4, X, 0 * SIZE add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvadd.w VI1, VI1, VINC8 - xvfmaxa.s VM1, VX0, VM0 - xvfcmp.ceq.s VT0, VM1, VM0 + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + xvadd.w VI2, VI1, VINC4 + xvfsub.s VT1, VZE, VX0 + xvfsub.s VT2, VZE, VX1 + xvfmaxa.s VX0, VX0, VT1 + xvfmaxa.s VX1, VX1, VT2 + xvfcmp.clt.s VT0, VX0, VX1 + xvbitsel.v x1, VX0, VX1, VT0 + xvbitsel.v x2, VI1, VI2, VT0 //i + addi.d I, I, -1 - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 -#endif + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VI0, x2, VT0 blt $r0, I, .L24 .align 3 .L25: -#ifdef DOUBLE - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 -#else - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfmaxa.s VM1, x1, x2 - xvfcmp.ceq.s VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.s VM0, x3, x4 - xvfcmp.ceq.s VT0, x3, VM0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfmaxa.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 -#endif - CMPEQ $fcc0, $f15, $f9 - bceqz $fcc0, .L26 - XVCMPLT VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 + vreplvei.w $vr21, $vr20, 0 + vreplvei.w $vr22, $vr20, 1 + vreplvei.w $vr8, $vr20, 2 + vreplvei.w $vr19, $vr20, 3 + vreplvei.w $vr9, $vr15, 0 + vreplvei.w $vr10, $vr15, 1 + vreplvei.w $vr11, $vr15, 2 + vreplvei.w $vr12, $vr15, 3 .align 3 .L26: - fcmp.ceq.d $fcc0, $f15, $f10 - bceqz $fcc0, .L27 - XVCMPLT VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 + fcmp.ceq.s $fcc0, $f9, $f10 + bceqz $fcc0, .L31 + xvfcmp.clt.s VT0, VI1, VI2 + xvbitsel.v VI1, VI2, VI1, VT0 + b .L32 .align 3 - -.L27: - fcmp.ceq.d $fcc0, $f15, $f11 - bceqz $fcc0, .L28 - XVCMPLT VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 +.L31: + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VI1, VI1, VI2, VT0 + xvbitsel.v x1, x1, x2, VT0 .align 3 - -.L28: - fcmp.ceq.d $fcc0, $f15, $f12 - bceqz $fcc0, .L29 - XVCMPLT VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 +.L32: + fcmp.ceq.s $fcc0, $f11, $f12 + bceqz $fcc0, .L33 + xvfcmp.clt.s VT1, VI3, VI4 + xvbitsel.v VI3, VI4, VI3, VT1 + b .L34 .align 3 - -.L29: -#ifdef DOUBLE - movfr2gr.d i0, $f20 -#else - fmov.s $f16, $f20 -#endif +.L33: + xvfcmp.clt.s VT1, x3, x4 + xvbitsel.v x3, x3, x4, VT1 + xvbitsel.v VI3, VI3, VI4, VT1 .align 3 - -#ifdef DOUBLE - -#else -.L252: - xvxor.v VI0, VI0, VI0 - xvor.v VI0, VI0, VX0 - fmov.s $f13, $f15 - xvxor.v VM0, VM0, VM0 - xvor.v VM0, VM0, VX1 - xvpickve.w VI1, VI0, 4 - xvpickve.w VI2, VI0, 5 - xvpickve.w VI3, VI0, 6 - xvpickve.w VI4, VI0, 7 - xvpickve.w x1, VM0, 4 - xvpickve.w x2, VM0, 5 - xvpickve.w x3, VM0, 6 - xvpickve.w x4, VM0, 7 - xvfmaxa.s VM1, x1, x2 - xvfcmp.ceq.s VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.s VM0, x3, x4 - 
xvfcmp.ceq.s VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - fcmp.ceq.d $fcc0, $f15, $f9 - bceqz $fcc0, .L262 - xvfcmp.clt.s VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 +.L34: + fcmp.ceq.s $fcc0, $f9, $f11 + bceqz $fcc0, .L35 + xvfcmp.clt.s VT0, VI1, VI3 + xvbitsel.v VI0, VI3, VI1, VT0 + xvxor.v VM0, x1, VZE + b .L29 .align 3 - -.L262: - fcmp.ceq.d $fcc0, $f15, $f10 - bceqz $fcc0, .L272 - xvfcmp.clt.s VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 +.L35: + xvfcmp.clt.s VT0, x1, x3 + xvbitsel.v VM0, x1, x3, VT0 + xvbitsel.v VI0, VI1, VI3, VT0 .align 3 -.L272: - fcmp.ceq.d $fcc0, $f15, $f11 - bceqz $fcc0, .L282 - xvfcmp.clt.s VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L282: - fcmp.ceq.d $fcc0, $f15, $f12 - bceqz $fcc0, .L292 - xvfcmp.clt.s VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 +.L29: + movfr2gr.s i0, $f20 .align 3 -.L292: - xvfmaxa.s VM0, VX0, VM0 - xvfcmp.ceq.s VT0, VM0, VX0 - xvbitsel.v VI0, VI0, VI1, VT0 - movfr2gr.s i0, $f20 #endif - -.L21: //N<8 +.L21: // N<8 andi I, N, 7 bge $r0, I, .L999 srai.d i1, N, 3 @@ -521,17 +512,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L22: - LD $f9, X, 0 + LD $f9, X, 0 +#ifdef DOUBLE + fsub.d $f10, $f3, $f9 + xvfmaxa.d x1, x1, x2 + xvfcmp.clt.d VT0, VM0, x1 +#else + fsub.s $f10, $f3, $f9 + xvfmaxa.s x1, x1, x2 + xvfcmp.clt.s VT0, VM0, x1 +#endif + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 addi.d I, I, -1 - XVFMAXA VM1, x1, VM0 - XVCMPEQ VT0, VM0, VM1 - add.d X, X, INCX - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 addi.d i1, i1, 1 + add.d X, X, INCX movgr2fr.d $f21, i1 blt $r0, I, .L22 - MTG i0, $f20 + MTG i0, $f20 .align 3 .L999: diff --git a/kernel/loongarch64/icamax_lasx.S b/kernel/loongarch64/icamax_lasx.S index 7800cb9173..fb47aa4583 100644 --- a/kernel/loongarch64/icamax_lasx.S +++ b/kernel/loongarch64/icamax_lasx.S @@ -76,66 +76,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
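Note: the reworked iamax_lasx.S above forms |x| as fmaxa(x, 0 - x) (VZE holds zero via xvldi), replaces the fmaxa/ceq update with a strict clt compare, and resolves intra-vector ties toward the lower index. A scalar reference for the intended result (1-based, sketch only):

#include <stddef.h>

size_t iamax_model(size_t n, const double *x)
{
    if (n == 0) return 0;
    size_t best = 1;
    double m = x[0] >= 0.0 ? x[0] : -x[0];   /* xvfmaxa(VM0, VZE - VM0) */
    for (size_t i = 1; i < n; i++) {
        double a = x[i] >= 0.0 ? x[i] : -x[i];
        if (m < a) {                          /* xvfcmp.clt: strict compare */
            m = a;
            best = i + 1;                     /* ties keep the first maximum */
        }
    }
    return best;
}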
addi.d i0, i0, 1 srai.d I, N, 2 bge $r0, I, .L21 - slli.d i0, i0, 2 //4 + slli.d i0, i0, 1 //2 xvreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 + addi.d i0, i0, -3 xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 2 + addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, -1 + addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 2 - xvinsgr2vr.d VI1, i0, 3 addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 2 - xvinsgr2vr.d VI0, i0, 1 //3 + xvinsgr2vr.d VI1, i0, 3 addi.d i0, i0, -1 - xvinsgr2vr.d VI0, i0, 2 //2 - addi.d i0, i0, 2 - xvinsgr2vr.d VI0, i0, 3 //4 + xvinsgr2vr.d VI0, i0, 0 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 #else li.w I, -1 xvreplgr2vr.w VI4, I xvffint.s.w VI4, VI4 // -1 bne INCX, TEMP, .L20 addi.w i0, i0, 1 - srai.d I, N, 3 + srai.d I, N, 2 bge $r0, I, .L21 - slli.w i0, i0, 3 //8 - xvreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 + slli.w i0, i0, 2 //4 + xvreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 2 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, -3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 4 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 5 - addi.w i0, i0, 3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 6 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 0 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 0 //1 + xvinsgr2vr.w VI0, i0, 1 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 3 - xvinsgr2vr.w VI0, i0, 2 //5 + xvinsgr2vr.w VI0, i0, 2 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 3 //6 - addi.w i0, i0, -3 - xvinsgr2vr.w VI0, i0, 4 //3 + xvinsgr2vr.w VI0, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 5 //4 - addi.w i0, i0, 3 - xvinsgr2vr.w VI0, i0, 6 //7 + xvinsgr2vr.w VI0, i0, 5 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 7 //8 + xvinsgr2vr.w VI0, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 #endif .align 3 @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvld VX0, X, 0 * SIZE #ifdef DOUBLE xvadd.d VI1, VI1, VINC4 - xvld VX1, X, 4 * SIZE + xvld VX1, X, 2 * SIZE addi.d I, I, -1 xvpickev.d x1, VX1, VX0 xvpickod.d x2, VX1, VX0 @@ -153,22 +153,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvfcmp.clt.d VINC8, x2, VI3 xvbitsel.v x1, x1, x3, VT0 xvbitsel.v x2, x2, x4, VINC8 + xvfadd.d x1, x1, x2 + xvfmax.d x3, VM0, x1 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + xvld VX0, X, 4 * SIZE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 6 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 #else - xvadd.w VI1, VI1, VINC8 - xvld VX1, X, 8 * SIZE + xvadd.w VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE addi.d I, I, -1 xvpickev.w x1, VX1, VX0 xvpickod.w x2, VX1, VX0 xvfmul.s x3, VI4, x1 xvfmul.s x4, VI4, x2 - xvfcmp.clt.s VT0, x1, VI3 - xvfcmp.clt.s VINC4, x2, VI3 - xvbitsel.v x1, x1, x3, VT0 - xvbitsel.v x2, x2, x4, VINC4 #endif - XVFADD x1, x1, x2 - XVFMAX x3, VM0, x1 - XVCMPEQ VT0, x3, VM0 + XVCMPLT VT0, x1, VI3 + XVCMPLT VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + XVFADD x1, x1, x2 + XVFMAX x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 addi.d X, X, 8 * SIZE xvbitsel.v VM0, x3, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 @@ -177,51 +189,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L15: #ifdef DOUBLE - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmax.d VM1, x1, x2 - xvfcmp.ceq.d VT0, VM1, x1 + vreplvei.d $vr21, $vr20, 0 + vreplvei.d $vr22, $vr20, 1 + vreplvei.d $vr9, $vr15, 0 + vreplvei.d $vr10, $vr15, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI2 + xvbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w $vr21, $vr20, 0 + vreplvei.w $vr22, $vr20, 1 + vreplvei.w $vr8, $vr20, 2 + vreplvei.w $vr19, $vr20, 3 + vreplvei.w $vr9, $vr15, 0 + vreplvei.w $vr10, $vr15, 1 + vreplvei.w $vr11, $vr15, 2 + vreplvei.w $vr12, $vr15, 3 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, VM1, x1 xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmax.d VM0, x3, x4 - xvfcmp.ceq.d VT0, x3, VM0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmax.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 -#else - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfcmp.clt.s VT0, x1, x2 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x3, x4 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 -#endif fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - XVCMPLT VT0, VI1, VI0 + xvfcmp.clt.s VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 +#endif .align 3 .L20: // INCX!=1 @@ -229,62 +229,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
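Note: as the icamax_lasx.S hunks above make explicit, the magnitude being ranked is |re| + |im| (the xvfadd of the two absolute-value lanes), not the Euclidean modulus; the rewrite also narrows the single-precision loop from 8 to 4 complex elements per iteration. A scalar reference (sketch, illustrative names):

#include <stddef.h>

size_t icamax_model(size_t n, const float *x /* (re,im) pairs */)
{
    if (n == 0) return 0;
    size_t best = 1;
    float m = -1.0f;
    for (size_t i = 0; i < n; i++) {
        float re = x[2 * i], im = x[2 * i + 1];
        float a = (re < 0.0f ? -re : re)    /* abs via multiply by VI4 = -1 */
                + (im < 0.0f ? -im : im);   /* plus clt/bitsel selection */
        if (a > m) {                        /* xvfmax + ceq/bitsel update */
            m = a;
            best = i + 1;                   /* first occurrence wins on ties */
        }
    }
    return best;
}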
addi.d i0, i0, 1 srai.d I, N, 2 bge $r0, I, .L21 - slli.d i0, i0, 2 //4 + slli.d i0, i0, 1 //2 xvreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 + addi.d i0, i0, -3 xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 2 + addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, -1 + addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 2 - xvinsgr2vr.d VI1, i0, 3 addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 2 - xvinsgr2vr.d VI0, i0, 1 //3 + xvinsgr2vr.d VI1, i0, 3 addi.d i0, i0, -1 - xvinsgr2vr.d VI0, i0, 2 //2 - addi.d i0, i0, 2 - xvinsgr2vr.d VI0, i0, 3 //4 + xvinsgr2vr.d VI0, i0, 0 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 #else addi.w i0, i0, 1 - srai.d I, N, 3 + srai.d I, N, 2 bge $r0, I, .L21 - slli.w i0, i0, 3 //8 - xvreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 + slli.w i0, i0, 2 //4 + xvreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 2 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, -3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 4 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 5 - addi.w i0, i0, 3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 6 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 0 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 0 //1 + xvinsgr2vr.w VI0, i0, 1 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 3 - xvinsgr2vr.w VI0, i0, 2 //5 + xvinsgr2vr.w VI0, i0, 2 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 3 //6 - addi.w i0, i0, -3 - xvinsgr2vr.w VI0, i0, 4 //3 + xvinsgr2vr.w VI0, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 5 //4 - addi.w i0, i0, 3 - xvinsgr2vr.w VI0, i0, 6 //7 + xvinsgr2vr.w VI0, i0, 5 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 7 //8 + xvinsgr2vr.w VI0, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 #endif .align 3 @@ -301,16 +301,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.d x1, t3, 1 xvinsgr2vr.d x2, t4, 1 xvadd.d VI1, VI1, VINC4 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + xvfadd.d x1, x1, x2 + xvfmax.d x3, VM0, x1 ld.d t1, X, 0 * SIZE + xvfcmp.ceq.d VT0, x3, VM0 ld.d t2, X, 1 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 add.d X, X, INCX ld.d t3, X, 0 * SIZE ld.d t4, X, 1 * SIZE add.d X, X, INCX - xvinsgr2vr.d x1, t1, 2 - xvinsgr2vr.d x2, t2, 2 - xvinsgr2vr.d x1, t3, 3 - xvinsgr2vr.d x2, t4, 3 + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + xvadd.d VI1, VI1, VINC4 addi.d I, I, -1 xvfmul.d x3, VI4, x1 xvfmul.d x4, VI4, x2 @@ -332,6 +344,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w x2, t2, 0 xvinsgr2vr.w x1, t3, 1 xvinsgr2vr.w x2, t4, 1 + xvadd.w VI1, VI1, VINC4 ld.w t1, X, 0 * SIZE ld.w t2, X, 1 * SIZE add.d X, X, INCX @@ -342,31 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvinsgr2vr.w x2, t2, 2 xvinsgr2vr.w x1, t3, 3 xvinsgr2vr.w x2, t4, 3 - xvadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 4 - xvinsgr2vr.w x2, t2, 4 - xvinsgr2vr.w x1, t3, 5 - xvinsgr2vr.w x2, t4, 5 - xvadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 6 - xvinsgr2vr.w x2, t2, 6 - xvinsgr2vr.w x1, t3, 7 - xvinsgr2vr.w x2, t4, 7 addi.d I, I, -1 - xvpickev.w x1, VX1, VX0 - xvpickod.w x2, VX1, VX0 xvfmul.s x3, VI4, x1 xvfmul.s x4, VI4, x2 xvfcmp.clt.s VT0, x1, VI3 @@ -384,152 +373,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L25: #ifdef DOUBLE - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfcmp.ceq.d VT0, VM1, x1 + vreplvei.d $vr21, $vr20, 0 + vreplvei.d $vr22, $vr20, 1 + vreplvei.d $vr9, $vr15, 0 + vreplvei.d $vr10, $vr15, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI2 + xvbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w $vr21, $vr20, 0 + vreplvei.w $vr22, $vr20, 1 + vreplvei.w $vr8, $vr20, 2 + vreplvei.w $vr19, $vr20, 3 + vreplvei.w $vr9, $vr15, 0 + vreplvei.w $vr10, $vr15, 1 + vreplvei.w $vr11, $vr15, 2 + vreplvei.w $vr12, $vr15, 3 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, VM1, x1 xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.d VM0, x3, x4 - xvfcmp.ceq.d VT0, x3, VM0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 -#else - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfcmp.clt.s VT0, x1, x2 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x3, x4 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 -#endif fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - XVCMPLT VT0, VI1, VI0 + xvfcmp.clt.s VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 +#endif .align 3 +#ifdef DOUBLE .L26: - fcmp.ceq.d $fcc0, $f15, $f10 - bceqz $fcc0, .L27 - XVCMPLT VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 + xvfmaxa.d VM0, x1, x2 + xvfcmp.ceq.d VT0, x1, VM0 + xvbitsel.v VI0, VI2, VI1, VT0 .align 3 .L27: - fcmp.ceq.d $fcc0, $f15, $f11 - bceqz $fcc0, .L28 - XVCMPLT VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - fcmp.ceq.d $fcc0, $f15, $f12 - bceqz $fcc0, .L29 - XVCMPLT VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: -#ifdef DOUBLE movfr2gr.d i0, $f20 -#else - fmov.s $f16, $f20 -#endif .align 3 - -#ifdef DOUBLE #else -.L252: - xvxor.v VI0, VI0, VI0 - xvor.v VI0, VI0, VX0 - fmov.s $f13, $f15 - xvxor.v VM0, VM0, VM0 - xvor.v VM0, VM0, VX1 - xvpickve.w VI1, VI0, 4 - xvpickve.w VI2, VI0, 5 - xvpickve.w VI3, VI0, 6 - xvpickve.w VI4, VI0, 7 - xvpickve.w x1, VM0, 4 - xvpickve.w x2, VM0, 5 - xvpickve.w x3, VM0, 6 - xvpickve.w x4, VM0, 7 - 
xvfcmp.clt.s VT0, x1, x2 - xvbitsel.v x1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x3, x4 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM0, x1 - xvbitsel.v VM0, VM0, x1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - fcmp.ceq.d $fcc0, $f15, $f9 - bceqz $fcc0, .L262 - xvfcmp.clt.s VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L262: +.L26: fcmp.ceq.d $fcc0, $f15, $f10 - bceqz $fcc0, .L272 + bceqz $fcc0, .L27 xvfcmp.clt.s VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 -.L272: +.L27: fcmp.ceq.d $fcc0, $f15, $f11 - bceqz $fcc0, .L282 + bceqz $fcc0, .L28 xvfcmp.clt.s VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 -.L282: +.L28: fcmp.ceq.d $fcc0, $f15, $f12 - bceqz $fcc0, .L292 + bceqz $fcc0, .L29 xvfcmp.clt.s VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 -.L292: - fcmp.clt.s $fcc0, $f15, $f13 - fsel $f15, $f15, $f13, $fcc0 - fsel $f20, $f20, $f16, $fcc0 +.L29: movfr2gr.s i0, $f20 + .align 3 #endif -.L21: //N<8 -#ifdef DOUBLE +.L21: //N<4 andi I, N, 3 bge $r0, I, .L999 srai.d i1, N, 2 slli.d i1, i1, 2 -#else - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 -#endif addi.d i1, i1, 1 //current index movgr2fr.d $f21, i1 movgr2fr.d $f20, i0 @@ -550,10 +469,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d i1, i1, 1 movgr2fr.d $f21, i1 blt $r0, I, .L22 - MTG i0, $f20 + MTG i0, $f20 .align 3 - .L999: move $r4, $r17 jirl $r0, $r1, 0x0 diff --git a/kernel/loongarch64/rot_lasx.S b/kernel/loongarch64/rot_lasx.S index 71378e0b28..386e9136f3 100644 --- a/kernel/loongarch64/rot_lasx.S +++ b/kernel/loongarch64/rot_lasx.S @@ -64,6 +64,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT + move XX, X + move YY, Y #ifdef DOUBLE movfr2gr.d t1, C xvreplgr2vr.d VXC, t1 @@ -80,27 +82,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvreplgr2vr.w VXZ, t3 #endif srai.d I, N, 3 + bge $r0, I, .L997 bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 + bne INCY, TEMP, .L121 // INCX==1 and INCY!=1 + b .L111 // INCX==1 and INCY==1 .L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L110 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L112 // C!=0 S==0 - b .L111 // C!=0 S!=0 - .align 3 - -.L110: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L114 // C==0 S==0 - b .L113 // C==0 S!=0 - .align 3 + bne INCY, TEMP, .L221 // INCX!=1 and INCY!=1 + b .L211 // INCX!=1 and INCY==1 .L111: // C!=0 S!=0 xvld VX0, X, 0 * SIZE @@ -130,90 +118,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
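Note: rot_lasx.S above drops the separate C==0, S==0, and both-zero branches and routes everything through the general plane rotation, x' = c*x + s*y and y' = c*y - s*x (the XVMUL/XVFMADD/XVMSUB sequence in .L111/.L121). A scalar model (sketch only):

void rot_model(int n, double *x, double *y, double c, double s)
{
    for (int i = 0; i < n; i++) {
        double t = c * x[i] + s * y[i];   /* VT0 = VX0*VXC + VX2*VXS */
        y[i]     = c * y[i] - s * x[i];   /* VT1 = VX2*VXC - VX0*VXS */
        x[i]     = t;
    }
}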
b .L997 .align 3 -.L112: // C!=0 S==0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvld VX3, Y, 4 * SIZE -#endif - XVMUL VT0, VX0, VXC - XVMUL VT1, VX2, VXC - xvst VT0, X, 0 * SIZE - xvst VT1, Y, 0 * SIZE -#ifdef DOUBLE - XVMUL VT0, VX1, VXC - XVMUL VT1, VX3, VXC - xvst VT0, X, 4 * SIZE - xvst VT1, Y, 4 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // C==0 S!=0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvld VX3, Y, 4 * SIZE -#endif - XVMUL VT0, VX2, VXS - XVMUL VT1, VX0, VXS - XVFSUB VT1, VXZ, VT1 - xvst VT0, X, 0 * SIZE - xvst VT1, Y, 0 * SIZE -#ifdef DOUBLE - XVMUL VT0, VX3, VXS - XVMUL VT1, VX1, VXS - xvfsub.d VT1, VXZ, VT1 - xvst VT0, X, 4 * SIZE - xvst VT1, Y, 4 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // C==0 S==0 - xvst VXZ, X, 0 * SIZE - xvst VXZ, Y, 0 * SIZE -#ifdef DOUBLE - xvst VXZ, X, 4 * SIZE - xvst VXZ, Y, 4 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - move XX, X - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L120 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L122 // C!=0 S==0 - b .L121 // C!=0 S!=0 - .align 3 - -.L120: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L124 // C==0 S==0 - b .L123 // C==0 S!=0 - .align 3 - .L121: // C!=0 S!=0 - xvld VX0, X, 0 * SIZE #ifdef DOUBLE + xvld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -221,12 +128,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld.d t3, Y, 0 * SIZE add.d Y, Y, INCY ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY xvinsgr2vr.d VX2, t1, 0 xvinsgr2vr.d VX2, t2, 1 xvinsgr2vr.d VX2, t3, 2 xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY #else + xvld VX0, X, 0 * SIZE ld.w t1, Y, 0 * SIZE add.d Y, Y, INCY ld.w t2, Y, 0 * SIZE @@ -234,133 +142,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld.w t3, Y, 0 * SIZE add.d Y, Y, INCY ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY xvinsgr2vr.w VX2, t1, 0 xvinsgr2vr.w VX2, t2, 1 xvinsgr2vr.w VX2, t3, 2 xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE + ld.w t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE xvinsgr2vr.w VX2, t1, 4 xvinsgr2vr.w VX2, t2, 5 xvinsgr2vr.w VX2, t3, 6 xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY #endif XVMUL VT0, VX0, VXC XVFMADD VT0, VX2, VXS, VT0 XVMUL VT1, VX0, VXS XVMSUB VT1, VX2, VXC, VT1 - -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvst VT0, X, 0 * SIZE - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - XVMUL VT0, VX1, VXC - XVFMADD VT0, VX3, VXS, VT0 - XVMUL VT1, VX1, VXS - XVMSUB VT1, VX3, VXC, VT1 - xvst VT0, X, 4 * SIZE - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 -#else xvst VT0, X, 0 * SIZE - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - -#endif - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // C!=0 S==0 #ifdef DOUBLE - xvld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX2, t1, 0 - xvinsgr2vr.d VX2, t2, 1 - xvinsgr2vr.d VX2, t3, 2 - xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX0, VXC - xvfmul.d VT1, VX2, VXC - xvld VX1, X, 4 * SIZE - xvst VT0, X, 0 * SIZE - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX1, VXC - xvfmul.d VT1, VX3, VXC - addi.d I, I, -1 - xvst VT0, X, 4 * SIZE xvstelm.d VT1, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 1 @@ -368,102 +173,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvstelm.d VT1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 -#else - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmul.s VT1, VX2, VXC - xvst VT0, X, 0 * SIZE - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 -#endif - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 -.L123: // C==0 S!=0 -#ifdef DOUBLE - xvld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE + xvld VX0, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE + ld.d t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE xvinsgr2vr.d VX2, t1, 0 xvinsgr2vr.d VX2, t2, 1 xvinsgr2vr.d VX2, t3, 2 xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX2, VXS - xvfmul.d VT1, VX0, VXS - xvfsub.d VT1, VXZ, VT1 - xvst VT0, X, 0 * SIZE - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY - xvld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX3, VXS - xvfmul.d VT1, VX1, VXS - xvfsub.d VT1, VXZ, VT1 - addi.d I, I, -1 + + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 + xvst VT0, X, 4 * SIZE xvstelm.d VT1, YY, 0, 0 add.d YY, YY, INCY @@ -472,36 +202,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvstelm.d VT1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY #else - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX2, VXS - xvfmul.s VT1, VX0, VXS - xvfsub.s VT1, VXZ, VT1 - xvst VT0, X, 0 * SIZE xvstelm.w VT1, YY, 0, 0 add.d YY, YY, INCY xvstelm.w VT1, YY, 0, 1 @@ -517,135 +219,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
        xvstelm.w VT1, YY, 0, 6
        add.d YY, YY, INCY
        xvstelm.w VT1, YY, 0, 7
-#endif
        add.d YY, YY, INCY
+#endif
        addi.d X, X, 8 * SIZE
        addi.d I, I, -1
-       blt $r0, I, .L123
+       blt $r0, I, .L121
        b .L997
        .align 3
-.L124: // C==0 S==0
-       xvst VXZ, X, 0 * SIZE
+.L211: // C!=0 S!=0
#ifdef DOUBLE
-       xvst VXZ, X, 0 * SIZE
-       xvst VXZ, X, 4 * SIZE
-       xvstelm.d VXZ, YY, 0, 0
-       add.d YY, YY, INCY
-       xvstelm.d VXZ, YY, 0, 1
-       add.d YY, YY, INCY
-       xvstelm.d VXZ, YY, 0, 2
-       add.d YY, YY, INCY
-       xvstelm.d VXZ, YY, 0, 3
-       add.d YY, YY, INCY
-       xvstelm.d VXZ, YY, 0, 0
-       add.d YY, YY, INCY
-       xvstelm.d VXZ, YY, 0, 1
-       add.d YY, YY, INCY
-       xvstelm.d VXZ, YY, 0, 2
-       add.d YY, YY, INCY
-       xvstelm.d VXZ, YY, 0, 3
+       xvld VX2, Y, 0 * SIZE
+       ld.d t1, X, 0 * SIZE
+       add.d X, X, INCX
+       ld.d t2, X, 0 * SIZE
+       add.d X, X, INCX
+       ld.d t3, X, 0 * SIZE
+       add.d X, X, INCX
+       ld.d t4, X, 0 * SIZE
+       add.d X, X, INCX
+       xvinsgr2vr.d VX0, t1, 0
+       xvinsgr2vr.d VX0, t2, 1
+       xvinsgr2vr.d VX0, t3, 2
+       xvinsgr2vr.d VX0, t4, 3
#else
-       xvst VXZ, X, 0 * SIZE
-       xvstelm.w VXZ, YY, 0, 0
-       add.d YY, YY, INCY
-       xvstelm.w VXZ, YY, 0, 1
-       add.d YY, YY, INCY
-       xvstelm.w VXZ, YY, 0, 2
-       add.d YY, YY, INCY
-       xvstelm.w VXZ, YY, 0, 3
-       add.d YY, YY, INCY
-       xvstelm.w VXZ, YY, 0, 4
-       add.d YY, YY, INCY
-       xvstelm.w VXZ, YY, 0, 5
-       add.d YY, YY, INCY
-       xvstelm.w VXZ, YY, 0, 6
-       add.d YY, YY, INCY
-       xvstelm.w VXZ, YY, 0, 7
-#endif
-       add.d YY, YY, INCY
-       addi.d I, I, -1
-       addi.d X, X, 8 * SIZE
-       blt $r0, I, .L124
-       move Y, YY
-       b .L997
-       .align 3
-
-.L21:// INCX!=1 and INCY==1
-       bge $r0, I, .L997
-       move XX, X
-       CMPEQ $fcc0, C, a1
-       bcnez $fcc0, .L210
-       CMPEQ $fcc0, S, a1
-       bcnez $fcc0, .L212 // C!=0 S==0
-       b .L211 // C!=0 S!=0
-       .align 3
-
-.L210:
-       CMPEQ $fcc0, S, a1
-       bcnez $fcc0, .L214 // C==0 S==0
-       b .L213 // C==0 S!=0
-       .align 3
-
-.L211: // C!=0 S!=0
-#ifdef DOUBLE
        xvld VX2, Y, 0 * SIZE
-       ld.d t1, X, 0 * SIZE
+       ld.w t1, X, 0 * SIZE
        add.d X, X, INCX
-       ld.d t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t4, X, 0 * SIZE
-       xvinsgr2vr.d VX0, t1, 0
-       xvinsgr2vr.d VX0, t2, 1
-       xvinsgr2vr.d VX0, t3, 2
-       xvinsgr2vr.d VX0, t4, 3
-       add.d X, X, INCX
-       xvfmul.d VT0, VXC, VX0
-       xvfmadd.d VT0, VX2, VXS, VT0
-       xvfmul.d VT1, VXS, VX0
-       xvfmsub.d VT1, VX2, VXC, VT1
-       xvstelm.d VT0, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 3
-       add.d XX, XX, INCX
-       xvst VT1, Y, 0 * SIZE
-       xvld VX3, Y, 4 * SIZE
-       ld.d t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t4, X, 0 * SIZE
-       xvinsgr2vr.d VX1, t1, 0
-       xvinsgr2vr.d VX1, t2, 1
-       xvinsgr2vr.d VX1, t3, 2
-       xvinsgr2vr.d VX1, t4, 3
-       add.d X, X, INCX
-       xvfmul.d VT0, VX1, VXC
-       xvfmadd.d VT0, VX3, VXS, VT0
-       xvfmul.d VT1, VX1, VXS
-       xvfmsub.d VT1, VX3, VXC, VT1
-       xvstelm.d VT0, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 3
-       add.d XX, XX, INCX
-       xvst VT1, Y, 4 * SIZE
-#else
-       xvld VX2, Y, 0 * SIZE
-       ld.w t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t2, X, 0 * SIZE
+       ld.w t2, X, 0 * SIZE
        add.d X, X, INCX
        ld.w t3, X, 0 * SIZE
        add.d X, X, INCX
@@ -655,166 +256,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvinsgr2vr.w VX0, t2, 1
        xvinsgr2vr.w VX0, t3, 2
        xvinsgr2vr.w VX0, t4, 3
-       ld.w t1, X, 0 * SIZE
+       ld.w t1, X, 0 * SIZE
        add.d X, X, INCX
-       ld.w t2, X, 0 * SIZE
+       ld.w t2, X, 0 * SIZE
        add.d X, X, INCX
        ld.w t3, X, 0 * SIZE
        add.d X, X, INCX
        ld.w t4, X, 0 * SIZE
-       xvinsgr2vr.w VX0, t1, 4
-       xvinsgr2vr.w VX0, t2, 5
-       xvinsgr2vr.w VX0, t3, 6
-       xvinsgr2vr.w VX0, t4, 7
-       add.d X, X, INCX
-       xvfmul.s VT0, VXC, VX0
-       xvfmadd.s VT0, VX2, VXS, VT0
-       xvfmul.s VT1, VX0, VXS
-       xvfmsub.s VT1, VX2, VXC, VT1
-       xvstelm.w VT0, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 3
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 4
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 5
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 6
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 7
-       add.d XX, XX, INCX
-       xvst VT1, Y, 0 * SIZE
-#endif
-       addi.d Y, Y, 8 * SIZE
-       addi.d I, I, -1
-       blt $r0, I, .L211
-       b .L997
-       .align 3
-
-.L212: // C!=0 S==0
-#ifdef DOUBLE
-       xvld VX2, Y, 0 * SIZE
-       ld.d t1, X, 0 * SIZE
        add.d X, X, INCX
-       ld.d t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t4, X, 0 * SIZE
-       xvinsgr2vr.d VX0, t1, 0
-       xvinsgr2vr.d VX0, t2, 1
-       xvinsgr2vr.d VX0, t3, 2
-       xvinsgr2vr.d VX0, t4, 3
-       add.d X, X, INCX
-       xvfmul.d VT0, VXC, VX0
-       xvstelm.d VT0, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 3
-       add.d XX, XX, INCX
-       xvfmul.d VT1, VX2, VXC
-       xvst VT1, Y, 0 * SIZE
-       xvld VX3, Y, 4 * SIZE
-       ld.d t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t4, X, 0 * SIZE
-       xvinsgr2vr.d VX1, t1, 0
-       xvinsgr2vr.d VX1, t2, 1
-       xvinsgr2vr.d VX1, t3, 2
-       xvinsgr2vr.d VX1, t4, 3
-       add.d X, X, INCX
-       xvfmul.d VT0, VX1, VXC
-       xvstelm.d VT0, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 3
-       add.d XX, XX, INCX
-       xvfmul.d VT1, VX3, VXS
-       xvst VT1, Y, 4 * SIZE
-#else
-       xvld VX2, Y, 0 * SIZE
-       ld.w t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t4, X, 0 * SIZE
-       add.d X, X, INCX
-       xvinsgr2vr.w VX0, t1, 0
-       xvinsgr2vr.w VX0, t2, 1
-       xvinsgr2vr.w VX0, t3, 2
-       xvinsgr2vr.w VX0, t4, 3
-       ld.w t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t4, X, 0 * SIZE
        xvinsgr2vr.w VX0, t1, 4
        xvinsgr2vr.w VX0, t2, 5
        xvinsgr2vr.w VX0, t3, 6
        xvinsgr2vr.w VX0, t4, 7
-       add.d X, X, INCX
-       xvfmul.s VT0, VXC, VX0
-       xvfmul.s VT1, VX2, VXC
-       xvstelm.w VT0, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 3
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 4
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 5
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 6
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 7
-       add.d XX, XX, INCX
-       xvst VT1, Y, 0 * SIZE
#endif
-       addi.d Y, Y, 8 * SIZE
-       addi.d I, I, -1
-       blt $r0, I, .L212
-       b .L997
-       .align 3
-
-.L213: // C==0 S!=0
+       XVMUL VT0, VXC, VX0
+       XVFMADD VT0, VX2, VXS, VT0
+       XVMUL VT1, VXS, VX0
+       XVMSUB VT1, VX2, VXC, VT1
#ifdef DOUBLE
-       xvld VX2, Y, 0 * SIZE
-       ld.d t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t4, X, 0 * SIZE
-       xvinsgr2vr.d VX0, t1, 0
-       xvinsgr2vr.d VX0, t2, 1
-       xvinsgr2vr.d VX0, t3, 2
-       xvinsgr2vr.d VX0, t4, 3
-       add.d X, X, INCX
-       xvfmul.d VT0, VXS, VX2
-       xvfmul.d VT1, VXS, VX0
-       xvfsub.d VT1, VXZ, VT1
        xvstelm.d VT0, XX, 0, 0
        add.d XX, XX, INCX
        xvstelm.d VT0, XX, 0, 1
@@ -824,148 +283,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvstelm.d VT0, XX, 0, 3
        add.d XX, XX, INCX
        xvst VT1, Y, 0 * SIZE
-       xvld VX3, Y, 4 * SIZE
-       ld.d t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t4, X, 0 * SIZE
-       xvinsgr2vr.d VX1, t1, 0
-       xvinsgr2vr.d VX1, t2, 1
-       xvinsgr2vr.d VX1, t3, 2
-       xvinsgr2vr.d VX1, t4, 3
-       add.d X, X, INCX
-       xvfmul.d VT0, VX3, VXS
-       xvfmul.d VT1, VX1, VXS
-       xvfsub.d VT1, VXZ, VT1
-       xvstelm.d VT0, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 3
-       add.d XX, XX, INCX
-       xvst VT1, Y, 4 * SIZE
-#else
-       xvld VX2, Y, 0 * SIZE
-       ld.w t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t4, X, 0 * SIZE
-       add.d X, X, INCX
-       xvinsgr2vr.w VX0, t1, 0
-       xvinsgr2vr.w VX0, t2, 1
-       xvinsgr2vr.w VX0, t3, 2
-       xvinsgr2vr.w VX0, t4, 3
-       ld.w t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t4, X, 0 * SIZE
-       xvinsgr2vr.w VX0, t1, 4
-       xvinsgr2vr.w VX0, t2, 5
-       xvinsgr2vr.w VX0, t3, 6
-       xvinsgr2vr.w VX0, t4, 7
-       add.d X, X, INCX
-       xvfmul.s VT0, VXS, VX2
-       xvfmul.s VT1, VXS, VX0
-       xvfsub.s VT1, VXZ, VT1
-       xvstelm.w VT0, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 3
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 4
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 5
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 6
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 7
-       add.d XX, XX, INCX
-       xvst VT1, Y, 0 * SIZE
-#endif
-       addi.d Y, Y, 8 * SIZE
-       addi.d I, I, -1
-       blt $r0, I, .L213
-       b .L997
-       .align 3
-
-.L214: // C==0 S==0
-#ifdef DOUBLE
-       xvstelm.d VXZ, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.d VXZ, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.d VXZ, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.d VXZ, XX, 0, 3
-       add.d XX, XX, INCX
-       xvst VT1, Y, 0 * SIZE
-       xvstelm.d VXZ, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.d VXZ, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.d VXZ, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.d VXZ, XX, 0, 3
-       add.d XX, XX, INCX
-       xvst VT1, Y, 4 * SIZE
-#else
-       xvstelm.w VXZ, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.w VXZ, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.w VXZ, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.w VXZ, XX, 0, 3
-       add.d XX, XX, INCX
-       xvst VT1, Y, 0 * SIZE
-       xvstelm.w VXZ, XX, 0, 4
-       add.d XX, XX, INCX
-       xvstelm.w VXZ, XX, 0, 5
-       add.d XX, XX, INCX
-       xvstelm.w VXZ, XX, 0, 6
-       add.d XX, XX, INCX
-       xvstelm.w VXZ, XX, 0, 7
-       add.d XX, XX, INCX
-#endif
-       addi.d Y, Y, 8 * SIZE
-       addi.d I, I, -1
-       blt $r0, I, .L211
-       b .L997
-       .align 3
-
-.L22:
-       bge $r0, I, .L997
-       move YY, Y
-       move XX, X
-       CMPEQ $fcc0, C, a1
-       bcnez $fcc0, .L220
-       CMPEQ $fcc0, S, a1
-       bcnez $fcc0, .L222 // C!=0 S==0
-       b .L221 // C!=0 S!=0
-       .align 3
-.L220:
-       CMPEQ $fcc0, S, a1
-       bcnez $fcc0, .L224 // C==0 S==0
-       b .L223 // C==0 S!=0
-       .align 3
-
-.L221: // C!=0 S!=0
-#ifdef DOUBLE
+       xvld VX2, Y, 4 * SIZE
        ld.d t1, X, 0 * SIZE
        add.d X, X, INCX
        ld.d t2, X, 0 * SIZE
@@ -978,135 +297,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvinsgr2vr.d VX0, t2, 1
        xvinsgr2vr.d VX0, t3, 2
        xvinsgr2vr.d VX0, t4, 3
-       ld.d t1, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.d t2, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.d t3, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.d t4, Y, 0 * SIZE
-       xvinsgr2vr.d VX2, t1, 0
-       xvinsgr2vr.d VX2, t2, 1
-       xvinsgr2vr.d VX2, t3, 2
-       xvinsgr2vr.d VX2, t4, 3
-       add.d Y, Y, INCY
-       xvfmul.d VT0, VX0, VXC
-       xvfmadd.d VT0, VX2, VXS, VT0
-       xvfmul.d VT1, VX0, VXS
-       xvfmsub.d VT1, VX2, VXC, VT1
-       xvstelm.d VT0, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 3
-       add.d XX, XX, INCX
-       xvstelm.d VT1, YY, 0, 0
-       add.d YY, YY, INCY
-       xvstelm.d VT1, YY, 0, 1
-       add.d YY, YY, INCY
-       xvstelm.d VT1, YY, 0, 2
-       add.d YY, YY, INCY
-       xvstelm.d VT1, YY, 0, 3
-       add.d YY, YY, INCY
-       ld.d t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t4, X, 0 * SIZE
-       add.d X, X, INCX
-       xvinsgr2vr.d VX1, t1, 0
-       xvinsgr2vr.d VX1, t2, 1
-       xvinsgr2vr.d VX1, t3, 2
-       xvinsgr2vr.d VX1, t4, 3
-       ld.d t1, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.d t2, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.d t3, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.d t4, Y, 0 * SIZE
-       xvinsgr2vr.d VX3, t1, 0
-       xvinsgr2vr.d VX3, t2, 1
-       xvinsgr2vr.d VX3, t3, 2
-       xvinsgr2vr.d VX3, t4, 3
-       add.d Y, Y, INCY
-       xvfmul.d VT0, VX1, VXC
-       xvfmadd.d VT0, VX3, VXS, VT0
-       xvfmul.d VT1, VX1, VXS
-       xvfmsub.d VT1, VX3, VXC, VT1
+
+       XVMUL VT0, VXC, VX0
+       XVFMADD VT0, VX2, VXS, VT0
+       XVMUL VT1, VXS, VX0
+       XVMSUB VT1, VX2, VXC, VT1
        xvstelm.d VT0, XX, 0, 0
        add.d XX, XX, INCX
        xvstelm.d VT0, XX, 0, 1
        add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 3
-       add.d XX, XX, INCX
-       xvstelm.d VT1, YY, 0, 0
-       add.d YY, YY, INCY
-       xvstelm.d VT1, YY, 0, 1
-       add.d YY, YY, INCY
-       xvstelm.d VT1, YY, 0, 2
-       add.d YY, YY, INCY
-       xvstelm.d VT1, YY, 0, 3
-       add.d YY, YY, INCY
-#else
-       ld.w t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t4, X, 0 * SIZE
-       add.d X, X, INCX
-       xvinsgr2vr.w VX0, t1, 0
-       xvinsgr2vr.w VX0, t2, 1
-       xvinsgr2vr.w VX0, t3, 2
-       xvinsgr2vr.w VX0, t4, 3
-       ld.w t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t4, X, 0 * SIZE
-       add.d X, X, INCX
-       xvinsgr2vr.w VX0, t1, 4
-       xvinsgr2vr.w VX0, t2, 5
-       xvinsgr2vr.w VX0, t3, 6
-       xvinsgr2vr.w VX0, t4, 7
-       ld.w t1, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.w t2, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.w t3, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.w t4, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       xvinsgr2vr.w VX2, t1, 0
-       xvinsgr2vr.w VX2, t2, 1
-       xvinsgr2vr.w VX2, t3, 2
-       xvinsgr2vr.w VX2, t4, 3
-       ld.w t1, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.w t2, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.w t3, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.w t4, Y, 0 * SIZE
-       xvinsgr2vr.w VX2, t1, 4
-       xvinsgr2vr.w VX2, t2, 5
-       xvinsgr2vr.w VX2, t3, 6
-       xvinsgr2vr.w VX2, t4, 7
-       add.d Y, Y, INCY
-       xvfmul.s VT0, VX0, VXC
-       xvfmadd.s VT0, VX2, VXS, VT0
-       xvfmul.s VT1, VX0, VXS
-       xvfmsub.s VT1, VX2, VXC, VT1
+       xvstelm.d VT0, XX, 0, 2
+       add.d XX, XX, INCX
+       xvstelm.d VT0, XX, 0, 3
+       add.d XX, XX, INCX
+       xvst VT1, Y, 4 * SIZE
+#else
        xvstelm.w VT0, XX, 0, 0
        add.d XX, XX, INCX
        xvstelm.w VT0, XX, 0, 1
@@ -1123,232 +328,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        add.d XX, XX, INCX
        xvstelm.w VT0, XX, 0, 7
        add.d XX, XX, INCX
-       xvstelm.w VT1, YY, 0, 0
-       add.d YY, YY, INCY
-       xvstelm.w VT1, YY, 0, 1
-       add.d YY, YY, INCY
-       xvstelm.w VT1, YY, 0, 2
-       add.d YY, YY, INCY
-       xvstelm.w VT1, YY, 0, 3
-       add.d YY, YY, INCY
-       xvstelm.w VT1, YY, 0, 4
-       add.d YY, YY, INCY
-       xvstelm.w VT1, YY, 0, 5
-       add.d YY, YY, INCY
-       xvstelm.w VT1, YY, 0, 6
-       add.d YY, YY, INCY
-       xvstelm.w VT1, YY, 0, 7
-       add.d YY, YY, INCY
+       xvst VT1, Y, 0 * SIZE
#endif
+       addi.d Y, Y, 8 * SIZE
        addi.d I, I, -1
-       blt $r0, I, .L221
+       blt $r0, I, .L211
        b .L997
        .align 3
-.L222: // C!=0 S==0
+.L221: // C!=0 S!=0
#ifdef DOUBLE
-       ld.d t1, X, 0 * SIZE
+       ld.d t1, X, 0 * SIZE
        add.d X, X, INCX
-       ld.d t2, X, 0 * SIZE
+       ld.d t2, X, 0 * SIZE
        add.d X, X, INCX
-       ld.d t3, X, 0 * SIZE
+       ld.d t3, X, 0 * SIZE
        add.d X, X, INCX
-       ld.d t4, X, 0 * SIZE
+       ld.d t4, X, 0 * SIZE
        add.d X, X, INCX
        xvinsgr2vr.d VX0, t1, 0
        xvinsgr2vr.d VX0, t2, 1
        xvinsgr2vr.d VX0, t3, 2
        xvinsgr2vr.d VX0, t4, 3
-       ld.d t1, Y, 0 * SIZE
+       ld.d t1, Y, 0 * SIZE
        add.d Y, Y, INCY
-       ld.d t2, Y, 0 * SIZE
+       ld.d t2, Y, 0 * SIZE
+       add.d Y, Y, INCY
+       ld.d t3, Y, 0 * SIZE
        add.d Y, Y, INCY
-       ld.d t3, Y, 0 * SIZE
+       ld.d t4, Y, 0 * SIZE
        add.d Y, Y, INCY
-       ld.d t4, Y, 0 * SIZE
        xvinsgr2vr.d VX2, t1, 0
        xvinsgr2vr.d VX2, t2, 1
        xvinsgr2vr.d VX2, t3, 2
        xvinsgr2vr.d VX2, t4, 3
-       add.d Y, Y, INCY
-       xvfmul.d VT0, VX0, VXC
-       xvfmul.d VT1, VX2, VXC
-       xvstelm.d VT0, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 3
-       add.d XX, XX, INCX
-       xvstelm.d VT1, YY, 0, 0
-       add.d YY, YY, INCY
-       xvstelm.d VT1, YY, 0, 1
-       add.d YY, YY, INCY
-       xvstelm.d VT1, YY, 0, 2
-       add.d YY, YY, INCY
-       xvstelm.d VT1, YY, 0, 3
-       add.d YY, YY, INCY
-       ld.d t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t4, X, 0 * SIZE
-       add.d X, X, INCX
-       xvinsgr2vr.d VX1, t1, 0
-       xvinsgr2vr.d VX1, t2, 1
-       xvinsgr2vr.d VX1, t3, 2
-       xvinsgr2vr.d VX1, t4, 3
-       ld.d t1, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.d t2, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.d t3, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.d t4, Y, 0 * SIZE
-       xvinsgr2vr.d VX3, t1, 0
-       xvinsgr2vr.d VX3, t2, 1
-       xvinsgr2vr.d VX3, t3, 2
-       xvinsgr2vr.d VX3, t4, 3
-       add.d Y, Y, INCY
-       xvfmul.d VT0, VX1, VXC
-       xvfmul.d VT1, VX3, VXC
-       xvstelm.d VT0, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.d VT0, XX, 0, 3
-       add.d XX, XX, INCX
-       xvstelm.d VT1, YY, 0, 0
-       add.d YY, YY, INCY
-       xvstelm.d VT1, YY, 0, 1
-       add.d YY, YY, INCY
-       xvstelm.d VT1, YY, 0, 2
-       add.d YY, YY, INCY
-       xvstelm.d VT1, YY, 0, 3
-       add.d YY, YY, INCY
#else
-       ld.w t1, X, 0 * SIZE
+       ld.w t1, X, 0 * SIZE
        add.d X, X, INCX
-       ld.w t2, X, 0 * SIZE
+       ld.w t2, X, 0 * SIZE
        add.d X, X, INCX
-       ld.w t3, X, 0 * SIZE
+       ld.w t3, X, 0 * SIZE
        add.d X, X, INCX
-       ld.w t4, X, 0 * SIZE
+       ld.w t4, X, 0 * SIZE
        add.d X, X, INCX
        xvinsgr2vr.w VX0, t1, 0
        xvinsgr2vr.w VX0, t2, 1
        xvinsgr2vr.w VX0, t3, 2
        xvinsgr2vr.w VX0, t4, 3
-       ld.w t1, X, 0 * SIZE
+       ld.w t1, X, 0 * SIZE
        add.d X, X, INCX
-       ld.w t2, X, 0 * SIZE
+       ld.w t2, X, 0 * SIZE
        add.d X, X, INCX
-       ld.w t3, X, 0 * SIZE
+       ld.w t3, X, 0 * SIZE
        add.d X, X, INCX
-       ld.w t4, X, 0 * SIZE
+       ld.w t4, X, 0 * SIZE
        add.d X, X, INCX
        xvinsgr2vr.w VX0, t1, 4
        xvinsgr2vr.w VX0, t2, 5
        xvinsgr2vr.w VX0, t3, 6
        xvinsgr2vr.w VX0, t4, 7
-       ld.w t1, Y, 0 * SIZE
+
+       ld.w t1, Y, 0 * SIZE
        add.d Y, Y, INCY
-       ld.w t2, Y, 0 * SIZE
+       ld.w t2, Y, 0 * SIZE
        add.d Y, Y, INCY
-       ld.w t3, Y, 0 * SIZE
+       ld.w t3, Y, 0 * SIZE
        add.d Y, Y, INCY
-       ld.w t4, Y, 0 * SIZE
+       ld.w t4, Y, 0 * SIZE
        add.d Y, Y, INCY
        xvinsgr2vr.w VX2, t1, 0
        xvinsgr2vr.w VX2, t2, 1
        xvinsgr2vr.w VX2, t3, 2
        xvinsgr2vr.w VX2, t4, 3
-       ld.w t1, Y, 0 * SIZE
+       ld.w t1, Y, 0 * SIZE
+       add.d Y, Y, INCY
+       ld.w t2, Y, 0 * SIZE
        add.d Y, Y, INCY
-       ld.w t2, Y, 0 * SIZE
+       ld.w t3, Y, 0 * SIZE
        add.d Y, Y, INCY
-       ld.w t3, Y, 0 * SIZE
+       ld.w t4, Y, 0 * SIZE
        add.d Y, Y, INCY
-       ld.w t4, Y, 0 * SIZE
        xvinsgr2vr.w VX2, t1, 4
        xvinsgr2vr.w VX2, t2, 5
        xvinsgr2vr.w VX2, t3, 6
        xvinsgr2vr.w VX2, t4, 7
-       add.d Y, Y, INCY
-       xvfmul.s VT0, VX0, VXC
-       xvfmul.s VT1, VX2, VXC
-       xvstelm.w VT0, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 3
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 4
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 5
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 6
-       add.d XX, XX, INCX
-       xvstelm.w VT0, XX, 0, 7
-       add.d XX, XX, INCX
-       xvstelm.w VT1, YY, 0, 0
-       add.d YY, YY, INCY
-       xvstelm.w VT1, YY, 0, 1
-       add.d YY, YY, INCY
-       xvstelm.w VT1, YY, 0, 2
-       add.d YY, YY, INCY
-       xvstelm.w VT1, YY, 0, 3
-       add.d YY, YY, INCY
-       xvstelm.w VT1, YY, 0, 4
-       add.d YY, YY, INCY
-       xvstelm.w VT1, YY, 0, 5
-       add.d YY, YY, INCY
-       xvstelm.w VT1, YY, 0, 6
-       add.d YY, YY, INCY
-       xvstelm.w VT1, YY, 0, 7
-       add.d YY, YY, INCY
#endif
-       addi.d I, I, -1
-       blt $r0, I, .L222
-       b .L997
-       .align 3
-
-.L223: // C==0 S!=0
+       XVMUL VT0, VX0, VXC
+       XVFMADD VT0, VX2, VXS, VT0
+       XVMUL VT1, VX0, VXS
+       XVMSUB VT1, VX2, VXC, VT1
#ifdef DOUBLE
-       ld.d t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.d t4, X, 0 * SIZE
-       add.d X, X, INCX
-       xvinsgr2vr.d VX0, t1, 0
-       xvinsgr2vr.d VX0, t2, 1
-       xvinsgr2vr.d VX0, t3, 2
-       xvinsgr2vr.d VX0, t4, 3
-       ld.d t1, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.d t2, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.d t3, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.d t4, Y, 0 * SIZE
-       xvinsgr2vr.d VX2, t1, 0
-       xvinsgr2vr.d VX2, t2, 1
-       xvinsgr2vr.d VX2, t3, 2
-       xvinsgr2vr.d VX2, t4, 3
-       add.d Y, Y, INCY
-       xvfmul.d VT0, VX2, VXS
-       xvfmul.d VT1, VX0, VXS
-       xvfsub.d VT1, VXZ, VT1
        xvstelm.d VT0, XX, 0, 0
        add.d XX, XX, INCX
        xvstelm.d VT0, XX, 0, 1
@@ -1365,33 +434,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        add.d YY, YY, INCY
        xvstelm.d VT1, YY, 0, 3
        add.d YY, YY, INCY
-       ld.d t1, X, 0 * SIZE
+
+       ld.d t1, X, 0 * SIZE
        add.d X, X, INCX
-       ld.d t2, X, 0 * SIZE
+       ld.d t2, X, 0 * SIZE
        add.d X, X, INCX
-       ld.d t3, X, 0 * SIZE
+       ld.d t3, X, 0 * SIZE
        add.d X, X, INCX
-       ld.d t4, X, 0 * SIZE
+       ld.d t4, X, 0 * SIZE
        add.d X, X, INCX
-       xvinsgr2vr.d VX1, t1, 0
-       xvinsgr2vr.d VX1, t2, 1
-       xvinsgr2vr.d VX1, t3, 2
-       xvinsgr2vr.d VX1, t4, 3
-       ld.d t1, Y, 0 * SIZE
+       xvinsgr2vr.d VX0, t1, 0
+       xvinsgr2vr.d VX0, t2, 1
+       xvinsgr2vr.d VX0, t3, 2
+       xvinsgr2vr.d VX0, t4, 3
+       ld.d t1, Y, 0 * SIZE
        add.d Y, Y, INCY
-       ld.d t2, Y, 0 * SIZE
+       ld.d t2, Y, 0 * SIZE
        add.d Y, Y, INCY
-       ld.d t3, Y, 0 * SIZE
+       ld.d t3, Y, 0 * SIZE
        add.d Y, Y, INCY
-       ld.d t4, Y, 0 * SIZE
-       xvinsgr2vr.d VX3, t1, 0
-       xvinsgr2vr.d VX3, t2, 1
-       xvinsgr2vr.d VX3, t3, 2
-       xvinsgr2vr.d VX3, t4, 3
+       ld.d t4, Y, 0 * SIZE
        add.d Y, Y, INCY
-       xvfmul.d VT0, VX3, VXS
-       xvfmul.d VT1, VX0, VXS
-       xvfsub.d VT1, VXZ, VT1
+       xvinsgr2vr.d VX2, t1, 0
+       xvinsgr2vr.d VX2, t2, 1
+       xvinsgr2vr.d VX2, t3, 2
+       xvinsgr2vr.d VX2, t4, 3
+
+       XVMUL VT0, VX0, VXC
+       XVFMADD VT0, VX2, VXS, VT0
+       XVMUL VT1, VX0, VXS
+       XVMSUB VT1, VX2, VXC, VT1
        xvstelm.d VT0, XX, 0, 0
        add.d XX, XX, INCX
        xvstelm.d VT0, XX, 0, 1
@@ -1409,57 +481,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvstelm.d VT1, YY, 0, 3
        add.d YY, YY, INCY
#else
-       ld.w t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t4, X, 0 * SIZE
-       add.d X, X, INCX
-       xvinsgr2vr.w VX0, t1, 0
-       xvinsgr2vr.w VX0, t2, 1
-       xvinsgr2vr.w VX0, t3, 2
-       xvinsgr2vr.w VX0, t4, 3
-       ld.w t1, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t2, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t3, X, 0 * SIZE
-       add.d X, X, INCX
-       ld.w t4, X, 0 * SIZE
-       add.d X, X, INCX
-       xvinsgr2vr.w VX0, t1, 4
-       xvinsgr2vr.w VX0, t2, 5
-       xvinsgr2vr.w VX0, t3, 6
-       xvinsgr2vr.w VX0, t4, 7
-       ld.w t1, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.w t2, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.w t3, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.w t4, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       xvinsgr2vr.w VX2, t1, 0
-       xvinsgr2vr.w VX2, t2, 1
-       xvinsgr2vr.w VX2, t3, 2
-       xvinsgr2vr.w VX2, t4, 3
-       ld.w t1, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.w t2, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.w t3, Y, 0 * SIZE
-       add.d Y, Y, INCY
-       ld.w t4, Y, 0 * SIZE
-       xvinsgr2vr.w VX2, t1, 4
-       xvinsgr2vr.w VX2, t2, 5
-       xvinsgr2vr.w VX2, t3, 6
-       xvinsgr2vr.w VX2, t4, 7
-       add.d Y, Y, INCY
-       xvfmul.s VT0, VX2, VXS
-       xvfmul.s VT1, VX0, VXS
-       xvfsub.s VT1, VXZ, VT1
        xvstelm.w VT0, XX, 0, 0
        add.d XX, XX, INCX
        xvstelm.w VT0, XX, 0, 1
@@ -1476,6 +497,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        add.d XX, XX, INCX
        xvstelm.w VT0, XX, 0, 7
        add.d XX, XX, INCX
+
        xvstelm.w VT1, YY, 0, 0
        add.d YY, YY, INCY
        xvstelm.w VT1, YY, 0, 1
@@ -1494,83 +516,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        add.d YY, YY, INCY
#endif
        addi.d I, I, -1
-       blt $r0, I, .L223
-       b .L997
-       .align 3
-
-.L224: // C==0 S==0
-#ifdef DOUBLE
-       xvstelm.d VXZ, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.d VXZ, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.d VXZ, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.d VXZ, XX, 0, 3
-       add.d XX, XX, INCX
-       xvstelm.d VXZ, YY, 0, 0
-       add.d YY, YY, INCY
-       xvstelm.d VXZ, YY, 0, 1
-       add.d YY, YY, INCY
-       xvstelm.d VXZ, YY, 0, 2
-       add.d YY, YY, INCY
-       xvstelm.d VXZ, YY, 0, 3
-       add.d YY, YY, INCY
-       xvstelm.d VXZ, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.d VXZ, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.d VXZ, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.d VXZ, XX, 0, 3
-       add.d XX, XX, INCX
-       xvstelm.d VXZ, YY, 0, 0
-       add.d YY, YY, INCY
-       xvstelm.d VXZ, YY, 0, 1
-       add.d YY, YY, INCY
-       xvstelm.d VXZ, YY, 0, 2
-       add.d YY, YY, INCY
-       xvstelm.d VXZ, YY, 0, 3
-#else
-       xvstelm.w VXZ, XX, 0, 0
-       add.d XX, XX, INCX
-       xvstelm.w VXZ, XX, 0, 1
-       add.d XX, XX, INCX
-       xvstelm.w VXZ, XX, 0, 2
-       add.d XX, XX, INCX
-       xvstelm.w VXZ, XX, 0, 3
-       add.d XX, XX, INCX
-       xvstelm.w VXZ, YY, 0, 0
-       add.d YY, YY, INCY
-       xvstelm.w VXZ, YY, 0, 1
-       add.d YY, YY, INCY
-       xvstelm.w VXZ, YY, 0, 2
-       add.d YY, YY, INCY
-       xvstelm.w VXZ, YY, 0, 3
-       add.d YY, YY, INCY
-       xvstelm.w VXZ, XX, 0, 4
-       add.d XX, XX, INCX
-       xvstelm.w VXZ, XX, 0, 5
-       add.d XX, XX, INCX
-       xvstelm.w VXZ, XX, 0, 6
-       add.d XX, XX, INCX
-       xvstelm.w VXZ, XX, 0, 7
-       add.d XX, XX, INCX
-       xvstelm.w VXZ, YY, 0, 4
-       add.d YY, YY, INCY
-       xvstelm.w VXZ, YY, 0, 5
-       add.d YY, YY, INCY
-       xvstelm.w VXZ, YY, 0, 6
-       add.d YY, YY, INCY
-       xvstelm.w VXZ, YY, 0, 7
-#endif
-       add.d YY, YY, INCY
-       addi.d I, I, -1
-       blt $r0, I, .L224
-#ifdef DOUBLE
-       move X, XX
-       move Y, YY
-#endif
+       blt $r0, I, .L221
        b .L997
        .align 3

diff --git a/kernel/loongarch64/snrm2_lasx.S b/kernel/loongarch64/snrm2_lasx.S
index 3ae11e897a..e5c9c557ab 100644
--- a/kernel/loongarch64/snrm2_lasx.S
+++ b/kernel/loongarch64/snrm2_lasx.S
@@ -43,15 +43,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define t2 $r13
#define t3 $r14
#define t4 $r15
-
-/* Don't change following FR unless you know the effects. */
#define VX0 $xr15
#define VX1 $xr16
#define VX2 $xr17
#define VX3 $xr18
#define VX4 $xr21
+#define VX5 $xr22
+/* Don't change following FR unless you know the effects. */
#define res1 $xr19
#define res2 $xr20
+#define RCP $f2
+#define VALPHA $xr3
+
+// The snrm2 kernel cannot be optimized simply by widening each
+// float to double and then summing the squares of the data: the
+// LAPACK tests show that even the widened sum can still overflow.
+// Instead, find the maximum absolute value of the whole array
+// first and scale every element by it before squaring. This
+// avoids overflow and does not require widening the data type.
+// (A standalone C model of this scheme follows the patch below.)

PROLOGUE

@@ -59,29 +69,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        LDINT N, 0(N)
        LDINT INCX, 0(INCX)
#endif
+       bge $r0, N, .L999
+       beq $r0, INCX, .L999
+       addi.d $sp, $sp, -32
+       st.d $ra, $sp, 0
+       st.d N, $sp, 8
+       st.d X, $sp, 16
+       st.d INCX, $sp, 24
+#ifdef DYNAMIC_ARCH
+       bl samax_k_LA264
+#else
+       bl samax_k
+#endif
+       ld.d $ra, $sp, 0
+       ld.d N, $sp, 8
+       ld.d X, $sp, 16
+       ld.d INCX, $sp, 24
+       addi.d $sp, $sp, 32
+
+       frecip.s RCP, $f0
+       vreplvei.w $vr3, $vr2, 0
+       xvpermi.d VALPHA, $xr3, 0x00
        xvxor.v res1, res1, res1
        xvxor.v res2, res2, res2
-       bge $r0, N, .L999
-       beq $r0, INCX, .L999
+       fcmp.ceq.s $fcc0, $f0, $f19
+       bcnez $fcc0, .L999
        li.d TEMP, SIZE
        slli.d INCX, INCX, BASE_SHIFT
-       srai.d I, N, 3
+       srai.d I, N, 4
        bne INCX, TEMP, .L20
-       bge $r0, I, .L997
+       bge $r0, I, .L997
        .align 3

.L10:
-       xvld VX0, X, 0
-       xvfcvtl.d.s VX1, VX0
-       xvfcvth.d.s VX2, VX0
-       xvfmadd.d res1, VX1, VX1, res1
-       xvfmadd.d res2, VX2, VX2, res2
+       xvld VX0, X, 0
+       xvld VX5, X, 8 * SIZE
        addi.d I, I, -1
-       addi.d X, X, 8 * SIZE
+       addi.d X, X, 16 * SIZE
+
+       xvfmul.s VX0, VX0, VALPHA
+       xvfmul.s VX5, VX5, VALPHA
+
+       xvfmadd.s res1, VX0, VX0, res1
+       xvfmadd.s res2, VX5, VX5, res2
        blt $r0, I, .L10
-       .align 3
        b .L996
+       .align 3

.L20:
        bge $r0, I, .L997
@@ -107,47 +141,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld.w t3, X, 0
        add.d X, X, INCX
        ld.w t4, X, 0
+       add.d X, X, INCX
        xvinsgr2vr.w VX0, t1, 4
        xvinsgr2vr.w VX0, t2, 5
        xvinsgr2vr.w VX0, t3, 6
        xvinsgr2vr.w VX0, t4, 7
+       xvfmul.s VX0, VX0, VALPHA
+       xvfmadd.s res1, VX0, VX0, res1
+
+       ld.w t1, X, 0
+       add.d X, X, INCX
+       ld.w t2, X, 0
        add.d X, X, INCX
-       xvfcvtl.d.s VX1, VX0
-       xvfcvth.d.s VX2, VX0
-       xvfmadd.d res1, VX1, VX1, res1
-       xvfmadd.d res2, VX2, VX2, res2
+       ld.w t3, X, 0
+       add.d X, X, INCX
+       ld.w t4, X, 0
+       add.d X, X, INCX
+       xvinsgr2vr.w VX0, t1, 0
+       xvinsgr2vr.w VX0, t2, 1
+       xvinsgr2vr.w VX0, t3, 2
+       xvinsgr2vr.w VX0, t4, 3
+       ld.w t1, X, 0
+       add.d X, X, INCX
+       ld.w t2, X, 0
+       add.d X, X, INCX
+       ld.w t3, X, 0
+       add.d X, X, INCX
+       ld.w t4, X, 0
+       add.d X, X, INCX
+       xvinsgr2vr.w VX0, t1, 4
+       xvinsgr2vr.w VX0, t2, 5
+       xvinsgr2vr.w VX0, t3, 6
+       xvinsgr2vr.w VX0, t4, 7
+       xvfmul.s VX0, VX0, VALPHA
+       xvfmadd.s res2, VX0, VX0, res2
        addi.d I, I, -1
        blt $r0, I, .L21
-       b .L996
+       .align 3

.L996:
-       xvfadd.d res1, res1, res2
-       xvpickve.d VX1, res1, 1
-       xvpickve.d VX2, res1, 2
-       xvpickve.d VX3, res1, 3
-       fadd.d $f19, $f19, $f16
-       fadd.d $f19, $f19, $f17
-       fadd.d $f19, $f19, $f18
+       xvfadd.s res1, res1, res2
+       xvpermi.d VX1, res1, 0x4e
+       xvfadd.s res1, res1, VX1
+       vreplvei.w $vr16, $vr19, 1
+       vreplvei.w $vr17, $vr19, 2
+       vreplvei.w $vr18, $vr19, 3
+       xvfadd.s res1, VX1, res1
+       xvfadd.s res1, VX2, res1
+       xvfadd.s res1, VX3, res1
        .align 3

.L997:
-       andi I, N, 7
+       andi I, N, 15
        bge $r0, I, .L999
        .align 3

.L998:
        fld.s $f15, X, 0
-       add.d X, X, INCX
-       addi.d I, I, -1
-       fcvt.d.s $f15, $f15
-       fmadd.d $f19, $f15, $f15, $f19
+       addi.d I, I, -1
+       fmul.s $f15, $f15, RCP
+       fmadd.s $f19, $f15, $f15, $f19
+       add.d X, X, INCX
        blt $r0, I, .L998
        .align 3

.L999:
-       fsqrt.d $f19, $f19
+       fsqrt.s $f19, $f19
+       fmul.s $f0, $f19, $f0
        move $r4, $r17
-       fcvt.s.d $f0, $f19
        jirl $r0, $r1, 0x0
+       .align 3

        EPILOGUE

diff --git a/kernel/loongarch64/swap_lasx.S b/kernel/loongarch64/swap_lasx.S
index 4767fffe36..3e7a14c425 100644
--- a/kernel/loongarch64/swap_lasx.S
+++ b/kernel/loongarch64/swap_lasx.S
@@ -318,62 +318,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        move XX, X

.L222:
-       LD a1, X, 0
-       add.d X, X, INCX
-       LD a2, X, 0
-       add.d X, X, INCX
-       LD a3, X, 0
-       add.d X, X, INCX
-       LD a4, X, 0
-       add.d X, X, INCX
-       LD b1, Y, 0
-       ST a1, Y, 0
-       add.d Y, Y, INCY
-       LD b2, Y, 0
-       ST a2, Y, 0
-       add.d Y, Y, INCY
-       LD b3, Y, 0
-       ST a3, Y, 0
-       add.d Y, Y, INCY
-       LD b4, Y, 0
-       ST a4, Y, 0
-       add.d Y, Y, INCY
-       LD a1, X, 0
-       add.d X, X, INCX
-       ST b1, XX, 0
-       add.d XX, XX, INCX
-       LD b1, Y, 0
-       ST a1, Y, 0
-       add.d Y, Y, INCY
-       LD a2, X, 0
-       add.d X, X, INCX
-       ST b2, XX, 0
-       add.d XX, XX, INCX
-       LD b2, Y, 0
-       ST a2, Y, 0
-       add.d Y, Y, INCY
-       LD a3, X, 0
-       add.d X, X, INCX
-       ST b3, XX, 0
-       add.d XX, XX, INCX
-       LD b3, Y, 0
-       ST a3, Y, 0
-       LD a4, X, 0
-       add.d X, X, INCX
-       ST b4, XX, 0
-       add.d XX, XX, INCX
-       LD b4, Y, 0
-       ST a4, Y, 0
-       add.d Y, Y, INCY
-       ST b1, XX, 0
-       add.d XX, XX, INCX
-       ST b2, XX, 0
-       add.d XX, XX, INCX
-       ST b3, XX, 0
-       add.d XX, XX, INCX
-       ST b4, XX, 0
-       add.d XX, XX, INCX
-       addi.d I, I, -1
+.rept 8
+       LD $f12, X, 0
+       LD $f14, Y, 0
+       ST $f12, Y, 0
+       ST $f14, X, 0
+       add.d X, X, INCX
+       add.d Y, Y, INCY
+.endr
+       addi.d I, I, -1
        blt $r0, I, .L222
        .align 3
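For reference, the max-and-scale scheme described in the snrm2_lasx.S comment above corresponds to the following C sketch. The names scaled_snrm2 and samax_ref are illustrative only, not OpenBLAS entry points: the real kernels call the existing samax_k/camax_k kernels for the maximum and use the approximate frecip.s reciprocal, whereas this model divides exactly. Treat it as a model of the technique, not the implementation.

#include <math.h>
#include <stddef.h>

/* samax_ref models the samax_k kernel: max of |x[i]| over the vector. */
static float samax_ref(size_t n, const float *x, size_t incx)
{
    float amax = 0.0f;
    for (size_t i = 0; i < n; i++) {
        float a = fabsf(x[i * incx]);
        if (a > amax)
            amax = a;
    }
    return amax;
}

float scaled_snrm2(size_t n, const float *x, size_t incx)
{
    float amax = samax_ref(n, x, incx);   /* corresponds to the bl samax_k call */
    if (amax == 0.0f)                     /* mirrors the fcmp.ceq.s early exit */
        return 0.0f;
    float rcp = 1.0f / amax;              /* the kernel uses frecip.s RCP, $f0 */
    float sum = 0.0f;
    for (size_t i = 0; i < n; i++) {
        float v = x[i * incx] * rcp;      /* scale before squaring: |v| <= 1 */
        sum += v * v;                     /* so v*v cannot overflow */
    }
    return sqrtf(sum) * amax;             /* fsqrt.s, then fmul.s by the max */
}

Because every scaled element has magnitude at most 1, the accumulated sum is bounded by n, which is why the new kernel no longer needs the float-to-double widening the old code performed with xvfcvtl.d.s/xvfcvth.d.s.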