@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3333#define ALPHAI $f1
3434#define X $r7
3535#define INCX $r8
36+ #define DUMMY2 $r9
3637
3738#define I $r12
3839#define TEMP $r13
@@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
6566
6667 bge $r0, N, .L999
6768 bge $r0, INCX, .L999
69+ ld.d DUMMY2, $sp, 0
6870 li.d TEMP, 1
6971 movgr2fr.d a1, $r0
7072 FFINT a1, a1
@@ -84,24 +86,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8486 srai.d I, N, 2
8587 bne INCX, TEMP, .L22
8688
89+ /////// INCX == 1 ////////
8790.L11:
88- bge $r0, I, .L997
8991 CMPEQ $fcc0, ALPHAR, a1
9092 CMPEQ $fcc1, ALPHAI, a1
93+ bge $r0, I, .L19
94+
95+ /////// INCX == 1 && N >= 4 ////////
96+ bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal.
97+
9198 bceqz $fcc0, .L13
9299 b .L14
93100 .align 3
94101
95102.L13:
96- bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
97- b .L113 //alpha_r != 0.0 && alpha_i == 0.0
103+ bceqz $fcc1, .L17 //alpha_r != 0.0 && alpha_i != 0.0
104+ b .L16 //alpha_r != 0.0 && alpha_i == 0.0
98105
99106.L14:
100- bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0
101- b .L111 //alpha_r == 0.0 && alpha_i == 0.0
107+ bceqz $fcc1, .L18 //alpha_r == 0.0 && alpha_i != 0.0
108+ b .L15 //alpha_r == 0.0 && alpha_i == 0.0
102109 .align 3
103110
104- .L111 : //alpha_r == 0.0 && alpha_i == 0.0
111+ .L15 : //alpha_r == 0.0 && alpha_i == 0.0
105112 vst VXZ, X, 0 * SIZE
106113#ifdef DOUBLE
107114 vst VXZ, X, 2 * SIZE
@@ -112,11 +119,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
112119#endif
113120 addi.d X, X, 8 * SIZE
114121 addi.d I, I, -1
115- blt $r0, I, .L111
116- b .L997
122+ blt $r0, I, .L15
123+ b .L19
117124 .align 3
118125
119- .L113 : //alpha_r != 0.0 && alpha_i == 0.0
126+ .L16 : //alpha_r != 0.0 && alpha_i == 0.0
120127 vld VX0, X, 0 * SIZE
121128#ifdef DOUBLE
122129 vld VX1, X, 2 * SIZE
@@ -151,11 +158,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
151158#endif
152159 addi.d X, X, 8 * SIZE
153160 addi.d I, I, -1
154- blt $r0, I, .L113
155- b .L997
161+ blt $r0, I, .L16
162+ b .L19
156163 .align 3
157164
158- .L114 : //alpha_r != 0.0 && alpha_i != 0.0
165+ .L17 : //alpha_r != 0.0 && alpha_i != 0.0
159166 vld VX0, X, 0 * SIZE
160167#ifdef DOUBLE
161168 vld VX1, X, 2 * SIZE
@@ -196,29 +203,92 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
196203#endif
197204 addi.d X, X, 8 * SIZE
198205 addi.d I, I, -1
199- blt $r0, I, .L114
200- b .L997
206+ blt $r0, I, .L17
207+ b .L19
208+ .align 3
209+
210+ .L18: //alpha_r == 0.0 && alpha_i != 0.0
211+ vld VX0, X, 0 * SIZE
212+ #ifdef DOUBLE
213+ vld VX1, X, 2 * SIZE
214+ vpickev.d x1, VX1, VX0
215+ vpickod.d x2, VX1, VX0
216+ vfmul.d x3, VXAI, x2
217+ vfsub.d x3, VXZ, x3
218+ vfmul.d x4, VXAI, x1
219+ vilvl.d VX2, x4 ,x3
220+ vilvh.d VX3, x4, x3
221+ vst VX2, X, 0 * SIZE
222+ vst VX3, X, 2 * SIZE
223+ vld VX0, X, 4 * SIZE
224+ vld VX1, X, 6 * SIZE
225+ vpickev.d x1, VX1, VX0
226+ vpickod.d x2, VX1, VX0
227+ vfmul.d x3, VXAI, x2
228+ vfsub.d x3, VXZ, x3
229+ vfmul.d x4, VXAI, x1
230+ vilvl.d VX2, x4 ,x3
231+ vilvh.d VX3, x4, x3
232+ vst VX2, X, 4 * SIZE
233+ vst VX3, X, 6 * SIZE
234+ #else
235+ vld VX1, X, 4 * SIZE
236+ vpickev.w x1, VX1, VX0
237+ vpickod.w x2, VX1, VX0
238+ vfmul.s x3, VXAI, x2
239+ vfsub.s x3, VXZ, x3
240+ vfmul.s x4, VXAI, x1
241+ vilvl.w VX2, x4 ,x3
242+ vilvh.w VX3, x4, x3
243+ vst VX2, X, 0 * SIZE
244+ vst VX3, X, 4 * SIZE
245+ #endif
246+ addi.d X, X, 8 * SIZE
247+ addi.d I, I, -1
248+ blt $r0, I, .L18
249+ b .L19
250+ .align 3
251+
252+ /////// INCX == 1 && N % 4 remainder ///////
253+ .L19:
254+ andi I, N, 3
255+ beqz I, .L999
256+ bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
257+
258+ bceqz $fcc0, .L13_1
259+ b .L14_1
260+
261+ .L13_1:
262+ bceqz $fcc1, .L998 // alpha_r != 0.0 && alpha_i != 0.0
263+ b .L997 // alpha_r != 0.0 && alpha_i == 0.0
264+
265+ .L14_1:
266+ bceqz $fcc1, .L996 // alpha_r == 0.0 && alpha_i != 0.0
267+ b .L995 // alpha_r == 0.0 && alpha_i == 0.0
201268 .align 3
202269
270+
271+ /////// INCX != 1 ////////
203272.L22:
204- bge $r0, I, .L997
205- move XX, X
206273 CMPEQ $fcc0, ALPHAR, a1
207274 CMPEQ $fcc1, ALPHAI, a1
275+ move XX, X
276+ bge $r0, I, .L29
277+ bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal.
208278 bceqz $fcc0, .L23
209279 b .L24
210280 .align 3
211281
212282.L23:
213- bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
214- b .L223 //alpha_r != 0.0 && alpha_i == 0.0
283+ bceqz $fcc1, .L25 //alpha_r != 0.0 && alpha_i != 0.0
284+ b .L26 //alpha_r != 0.0 && alpha_i == 0.0
215285
216286.L24:
217- bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0
218- b .L221 //alpha_r == 0.0 && alpha_i == 0.0
287+ bceqz $fcc1, .L28 //alpha_r == 0.0 && alpha_i != 0.0
288+ b .L27 //alpha_r == 0.0 && alpha_i == 0.0
219289 .align 3
220290
221- .L221 : //alpha_r == 0.0 && alpha_i == 0.0
291+ .L27 : //alpha_r == 0.0 && alpha_i == 0.0
222292#ifdef DOUBLE
223293 vstelm.d VXZ, X, 0 , 0
224294 vstelm.d VXZ, X, 1 * SIZE, 0
@@ -246,11 +316,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
246316#endif
247317 add .d X, X, INCX
248318 addi.d I, I, -1
249- blt $r0, I, .L221
250- b .L997
319+ blt $r0, I, .L27
320+ b .L29
251321 .align 3
252322
253- .L223 : //alpha_r != 0.0 && alpha_i == 0.0
323+ .L26 : //alpha_r != 0.0 && alpha_i == 0.0
254324#ifdef DOUBLE
255325 ld.d t1, X, 0 * SIZE
256326 ld.d t2, X, 1 * SIZE
@@ -327,11 +397,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
327397 vstelm.w x4, XX, 1 * SIZE, 3
328398#endif
329399 add .d XX, XX, INCX
330- blt $r0, I, .L223
331- b .L997
400+ blt $r0, I, .L26
401+ b .L29
332402 .align 3
333403
334- .L224 : //alpha_r != 0.0 && alpha_i != 0.0
404+ .L25 : //alpha_r != 0.0 && alpha_i != 0.0
335405#ifdef DOUBLE
336406 ld.d t1, X, 0 * SIZE
337407 ld.d t2, X, 1 * SIZE
@@ -414,16 +484,143 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
414484 vstelm.w x4, XX, 1 * SIZE, 3
415485#endif
416486 add .d XX, XX, INCX
417- blt $r0, I, .L224
418- b .L997
487+ blt $r0, I, .L25
488+ b .L29
419489 .align 3
420490
421- .L997:
422- andi I, N, 3
423- bge $r0, I, .L999
491+ .L28: //alpha_r == 0.0 && alpha_i != 0.0
492+ #ifdef DOUBLE
493+ ld.d t1, X, 0 * SIZE
494+ ld.d t2, X, 1 * SIZE
495+ add .d X, X, INCX
496+ ld.d t3, X, 0 * SIZE
497+ ld.d t4, X, 1 * SIZE
498+ add .d X, X, INCX
499+ vinsgr2vr.d x1, t1, 0
500+ vinsgr2vr.d x2, t2, 0
501+ vinsgr2vr.d x1, t3, 1
502+ vinsgr2vr.d x2, t4, 1
503+ vfmul.d x3, VXAI, x2
504+ vfsub.d x3, VXZ, x3
505+ vfmul.d x4, VXAI, x1
506+ vstelm.d x3, XX, 0 * SIZE, 0
507+ vstelm.d x4, XX, 1 * SIZE, 0
508+ add .d XX, XX, INCX
509+ vstelm.d x3, XX, 0 * SIZE, 1
510+ vstelm.d x4, XX, 1 * SIZE, 1
511+ add .d XX, XX, INCX
512+
513+ ld.d t1, X, 0 * SIZE
514+ ld.d t2, X, 1 * SIZE
515+ add .d X, X, INCX
516+ ld.d t3, X, 0 * SIZE
517+ ld.d t4, X, 1 * SIZE
518+ vinsgr2vr.d x1, t1, 0
519+ vinsgr2vr.d x2, t2, 0
520+ vinsgr2vr.d x1, t3, 1
521+ vinsgr2vr.d x2, t4, 1
522+ add .d X, X, INCX
523+ vfmul.d x3, VXAI, x2
524+ vfsub.d x3, VXZ, x3
525+ vfmul.d x4, VXAI, x1
526+ addi.d I, I, -1
527+ vstelm.d x3, XX, 0 * SIZE, 0
528+ vstelm.d x4, XX, 1 * SIZE, 0
529+ add .d XX, XX, INCX
530+ vstelm.d x3, XX, 0 * SIZE, 1
531+ vstelm.d x4, XX, 1 * SIZE, 1
532+ #else
533+ ld.w t1, X, 0 * SIZE
534+ ld.w t2, X, 1 * SIZE
535+ add .d X, X, INCX
536+ ld.w t3, X, 0 * SIZE
537+ ld.w t4, X, 1 * SIZE
538+ add .d X, X, INCX
539+ vinsgr2vr.w x1, t1, 0
540+ vinsgr2vr.w x2, t2, 0
541+ vinsgr2vr.w x1, t3, 1
542+ vinsgr2vr.w x2, t4, 1
543+ ld.w t1, X, 0 * SIZE
544+ ld.w t2, X, 1 * SIZE
545+ add .d X, X, INCX
546+ ld.w t3, X, 0 * SIZE
547+ ld.w t4, X, 1 * SIZE
548+ vinsgr2vr.w x1, t1, 2
549+ vinsgr2vr.w x2, t2, 2
550+ vinsgr2vr.w x1, t3, 3
551+ vinsgr2vr.w x2, t4, 3
552+ add .d X, X, INCX
553+
554+ vfmul.s x3, VXAI, x2
555+ vfsub.s x3, VXZ, x3
556+ vfmul.s x4, VXAI, x1
557+ addi.d I, I, -1
558+ vstelm.w x3, XX, 0 * SIZE, 0
559+ vstelm.w x4, XX, 1 * SIZE, 0
560+ add .d XX, XX, INCX
561+ vstelm.w x3, XX, 0 * SIZE, 1
562+ vstelm.w x4, XX, 1 * SIZE, 1
563+ add .d XX, XX, INCX
564+ vstelm.w x3, XX, 0 * SIZE, 2
565+ vstelm.w x4, XX, 1 * SIZE, 2
566+ add .d XX, XX, INCX
567+ vstelm.w x3, XX, 0 * SIZE, 3
568+ vstelm.w x4, XX, 1 * SIZE, 3
569+ #endif
570+ add .d XX, XX, INCX
571+ blt $r0, I, .L28
572+ b .L29
573+ .align 3
574+
575+ /////// INCX != 1 && N % 4 remainder ///////
576+ .L29:
577+ andi I, N, 3
578+ beqz I, .L999
579+ bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
580+
581+ bceqz $fcc0, .L23_1
582+ b .L24_1
583+
584+ .L23_1:
585+ bceqz $fcc1, .L998 // alpha_r != 0.0 && alpha_i != 0.0
586+ b .L997 // alpha_r != 0.0 && alpha_i == 0.0
587+
588+ .L24_1:
589+ bceqz $fcc1, .L996 // alpha_r == 0.0 && alpha_i != 0.0
590+ b .L995 // alpha_r == 0.0 && alpha_i == 0.0
424591 .align 3
425592
426- .L998:
593+ .L995: // alpha_r == 0.0 && alpha_i == 0.0
594+ ST a1, X, 0 * SIZE
595+ ST a1, X, 1 * SIZE
596+ addi.d I, I, -1
597+ add .d X, X, INCX
598+ blt $r0, I, .L995
599+ b .L999
600+ .L996: // alpha_r == 0.0 && alpha_i != 0.0
601+ LD a1, X, 0 * SIZE
602+ LD a2, X, 1 * SIZE
603+ addi.d I, I, -1
604+ MUL s1, ALPHAI, a2
605+ MUL s2, ALPHAI, a1
606+ SUB s1, $f12, s1
607+ ST s1, X, 0 * SIZE
608+ ST s2, X, 1 * SIZE
609+ add .d X, X, INCX
610+ blt $r0, I, .L996
611+ b .L999
612+ .L997: // alpha_r != 0.0 && alpha_i == 0.0
613+ LD a1, X, 0 * SIZE
614+ LD a2, X, 1 * SIZE
615+ addi.d I, I, -1
616+ MUL s1, ALPHAR, a1
617+ MUL s2, ALPHAR, a2
618+ ST s1, X, 0 * SIZE
619+ ST s2, X, 1 * SIZE
620+ add .d X, X, INCX
621+ blt $r0, I, .L997
622+ b .L999
623+ .L998: // alpha_r != 0.0 && alpha_i != 0.0, one by one
427624 LD a1, X, 0 * SIZE
428625 LD a2, X, 1 * SIZE
429626 addi.d I, I, -1
@@ -435,7 +632,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
435632 ST s2, X, 1 * SIZE
436633 add .d X, X, INCX
437634 blt $r0, I, .L998
438- .align 3
635+ b .L999
439636
440637.L999:
441638 move $r4, $r12
0 commit comments