@@ -66,7 +66,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
6666 BLASLONG lda2 = lda * 2 ;
6767 vy0_new = VLSEV_FLOAT (& y [iy ], stride_y , gvl );
6868 vy1_new = VLSEV_FLOAT (& y [iy + 1 ], stride_y , gvl );
69- for (k = 0 , j = 0 ; k < m / gvl ; k ++ )
69+ for (k = 0 , j = 0 ; k < m / gvl ; k ++ )
7070 {
7171 a_ptr = a ;
7272 ix = 0 ;
@@ -121,30 +121,73 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
121121#endif
122122 a_ptr += lda2 ;
123123 ix += inc_x2 ;
124+
124125 }
125126
126- for (; i < n ; i += 4 )
127+ for (i = n % 4 ; i < n ; i += 4 )
127128 {
128129#if !defined(XCONJ )
129-
130- x_v0 = VLSEV_FLOAT (& x [ix ], inc_x2 * sizeof (FLOAT ), 4 );
131- x_v1 = VLSEV_FLOAT (& x [ix + 1 ], inc_x2 * sizeof (FLOAT ), 4 );
132- temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 4 );
133- temp_iv = VFMUL_VF_FLOAT (x_v0 , alpha_i , 4 );
134- temp_rv = VFNMSACVF_FLOAT (temp_rv , alpha_i , x_v1 , 4 );
135- temp_iv = VFMACCVF_FLOAT (temp_iv , alpha_r , x_v1 , 4 );
136- VSEV_FLOAT (& temp_rr [0 ], temp_rv , 4 );
137- VSEV_FLOAT (& temp_ii [0 ], temp_iv , 4 );
130+ // temp_rr[0] = alpha_r * x[ix] - alpha_i * x[ix + 1];
131+ // temp_rr[1] = alpha_r * x[ix + inc_x2] - alpha_i * x[ix + inc_x2 + 1];
132+ x_v0 = VLSEV_FLOAT (& x [ix ], inc_x2 * sizeof (FLOAT ), 2 );
133+ x_v1 = VLSEV_FLOAT (& x [ix + 1 ], inc_x2 * sizeof (FLOAT ), 2 );
134+ temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 2 );
135+ temp_rv = VFNMSACVF_FLOAT (temp_rv , alpha_i , x_v1 , 2 );
136+
137+ // temp_ii[0] = alpha_r * x[ix + 1] + alpha_i * x[ix];
138+ // temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] + alpha_i * x[ix + inc_x2];
139+ temp_iv = VFMUL_VF_FLOAT (x_v0 , alpha_i , 2 );
140+ temp_iv = VFMACCVF_FLOAT (temp_iv , alpha_r , x_v1 , 2 );
141+ VSEV_FLOAT (& temp_rr [0 ], temp_rv , 2 );
142+ VSEV_FLOAT (& temp_ii [0 ], temp_iv , 2 );
143+
144+ // temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] - alpha_i * x[ix + inc_x2 * 2 + 1];
145+ // temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] - alpha_i * x[ix + inc_x2 * 3 + 1];
146+ x_v0 = VLSEV_FLOAT (& x [ix + inc_x2 * 2 ], inc_x2 * sizeof (FLOAT ), 2 );
147+ x_v1 = VLSEV_FLOAT (& x [ix + inc_x2 * 2 + 1 ], inc_x2 * sizeof (FLOAT ), 2 );
148+ temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 2 );
149+ temp_rv = VFNMSACVF_FLOAT (temp_rv , alpha_i , x_v1 , 2 );
150+
151+ // temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] + alpha_i * x[ix + inc_x2 * 2];
152+ // temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] + alpha_i * x[ix + inc_x2 * 3];
153+ temp_iv = VFMUL_VF_FLOAT (x_v0 , alpha_i , 2 );
154+ temp_iv = VFMACCVF_FLOAT (temp_iv , alpha_r , x_v1 , 2 );
155+ VSEV_FLOAT (& temp_rr [2 ], temp_rv , 2 );
156+ VSEV_FLOAT (& temp_ii [2 ], temp_iv , 2 );
138157
139158#else
140- x_v0 = VLSEV_FLOAT (& x [ix ], inc_x2 * sizeof (FLOAT ), 4 );
141- x_v1 = VLSEV_FLOAT (& x [ix + 1 ], inc_x2 * sizeof (FLOAT ), 4 );
142- temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 4 );
143- temp_iv = VFMUL_VF_FLOAT (x_v0 , alpha_i , 4 );
144- temp_rv = VFMACCVF_FLOAT (temp_rv , alpha_i , x_v1 , 4 );
145- temp_iv = VFNMSACVF_FLOAT (temp_iv , alpha_r , x_v1 , 4 );
146- VSEV_FLOAT (& temp_rr [0 ], temp_rv , 4 );
147- VSEV_FLOAT (& temp_ii [0 ], temp_iv , 4 );
159+ // temp_rr[0] = alpha_r * x[ix] + alpha_i * x[ix + 1];
160+ // temp_rr[1] = alpha_r * x[ix + inc_x2] + alpha_i * x[ix + inc_x2 + 1];
161+ x_v0 = VLSEV_FLOAT (& x [ix ], inc_x2 * sizeof (FLOAT ), 2 );
162+ x_v1 = VLSEV_FLOAT (& x [ix + 1 ], inc_x2 * sizeof (FLOAT ), 2 );
163+ temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 2 );
164+ temp_rv = VFMACCVF_FLOAT (temp_rv , alpha_i , x_v1 , 2 );
165+
166+
167+ // temp_ii[0] = alpha_r * x[ix + 1] - alpha_i * x[ix];
168+ // temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] - alpha_i * x[ix + inc_x2];
169+ temp_iv = VFMUL_VF_FLOAT (x_v1 , alpha_r , 2 );
170+ temp_iv = VFNMSACVF_FLOAT (temp_iv , alpha_i , x_v0 , 2 );
171+ VSEV_FLOAT (& temp_rr [0 ], temp_rv , 2 );
172+ VSEV_FLOAT (& temp_ii [0 ], temp_iv , 2 );
173+
174+
175+ // temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] + alpha_i * x[ix + inc_x2 * 2 + 1];
176+ // temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] + alpha_i * x[ix + inc_x2 * 3 + 1];
177+ x_v0 = VLSEV_FLOAT (& x [ix + inc_x2 * 2 ], inc_x2 * sizeof (FLOAT ), 2 );
178+ x_v1 = VLSEV_FLOAT (& x [ix + inc_x2 * 2 + 1 ], inc_x2 * sizeof (FLOAT ), 2 );
179+ temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 2 );
180+ temp_rv = VFMACCVF_FLOAT (temp_rv , alpha_i , x_v1 , 2 );
181+
182+
183+ // temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] - alpha_i * x[ix + inc_x2 * 2];
184+ // temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] - alpha_i * x[ix + inc_x2 * 3];
185+ temp_iv = VFMUL_VF_FLOAT (x_v1 , alpha_r , 2 );
186+ temp_iv = VFNMSACVF_FLOAT (temp_iv , alpha_i , x_v0 , 2 );
187+ VSEV_FLOAT (& temp_rr [2 ], temp_rv , 2 );
188+ VSEV_FLOAT (& temp_ii [2 ], temp_iv , 2 );
189+
190+
148191
149192#endif
150193
@@ -257,7 +300,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
257300 VSSEV_FLOAT (& y [iy ], stride_y , vy0 , gvl );
258301 VSSEV_FLOAT (& y [iy + 1 ], stride_y , vy1 , gvl );
259302 j += gvl * 2 ;
260- iy += inc_yv ;
303+ iy += inc_yv ;
261304 }
262305 // tail
263306 if (j / 2 < m )
0 commit comments