Skip to content

Commit 6a23c36

Browse files
committed
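Unroll the outer (j) loop by two: each vector pass over y now accumulates two columns of A (a and a + lda), with an n & 1 tail for the leftover column. Up to 1.5X faster.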
1 parent 5b22140 commit 6a23c36

1 file changed

Lines changed: 46 additions & 8 deletions

File tree

kernel/riscv64/gemv_n_vector.c

Lines changed: 46 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -49,12 +49,33 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
4949
{
5050
if (n < 0) return(0);
5151

52-
FLOAT *a_ptr, *y_ptr, temp;
52+
FLOAT *a_ptr, *y_ptr, *a2_ptr, temp, temp2;
5353
BLASLONG i, j, vl;
54-
FLOAT_V_T va, vy;
54+
FLOAT_V_T va, vy, va2;
5555

5656
if (inc_y == 1) {
57-
for (j = 0; j < n; j++) {
57+
for (j = 0; j < (n >> 1); j++) {
58+
temp = alpha * x[0];
59+
temp2 = alpha * x[inc_x];
60+
y_ptr = y;
61+
a_ptr = a;
62+
a2_ptr = a + lda;
63+
for (i = m; i > 0; i -= vl) {
64+
vl = VSETVL(i);
65+
vy = VLEV_FLOAT(y_ptr, vl);
66+
va = VLEV_FLOAT(a_ptr, vl);
67+
va2 = VLEV_FLOAT(a2_ptr, vl);
68+
vy = VFMACCVF_FLOAT(vy, temp, va, vl);
69+
vy = VFMACCVF_FLOAT(vy, temp2, va2, vl);
70+
VSEV_FLOAT(y_ptr, vy, vl);
71+
y_ptr += vl;
72+
a_ptr += vl;
73+
a2_ptr += vl;
74+
}
75+
x += inc_x * 2;
76+
a += lda * 2;
77+
}
78+
if (n & 1) {
5879
temp = alpha * x[0];
5980
y_ptr = y;
6081
a_ptr = a;
@@ -67,12 +88,31 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
6788
y_ptr += vl;
6889
a_ptr += vl;
6990
}
70-
x += inc_x;
71-
a += lda;
7291
}
7392
} else {
7493
BLASLONG stride_y = inc_y * sizeof(FLOAT);
75-
for (j = 0; j < n; j++) {
94+
for (j = 0; j < (n >> 1); j++) {
95+
temp = alpha * x[0];
96+
temp2 = alpha * x[inc_x];
97+
y_ptr = y;
98+
a_ptr = a;
99+
a2_ptr = a + lda;
100+
for (i = m; i > 0; i -= vl) {
101+
vl = VSETVL(i);
102+
vy = VLSEV_FLOAT(y_ptr, stride_y, vl);
103+
va = VLEV_FLOAT(a_ptr, vl);
104+
va2 = VLEV_FLOAT(a2_ptr, vl);
105+
vy = VFMACCVF_FLOAT(vy, temp, va, vl);
106+
vy = VFMACCVF_FLOAT(vy, temp2, va2, vl);
107+
VSSEV_FLOAT(y_ptr, stride_y, vy, vl);
108+
y_ptr += vl * inc_y;
109+
a_ptr += vl;
110+
a2_ptr += vl;
111+
}
112+
x += inc_x * 2;
113+
a += lda * 2;
114+
}
115+
if (n & 1) {
76116
temp = alpha * x[0];
77117
y_ptr = y;
78118
a_ptr = a;
@@ -85,8 +125,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
85125
y_ptr += vl * inc_y;
86126
a_ptr += vl;
87127
}
88-
x += inc_x;
89-
a += lda;
90128
}
91129
}
92130
return(0);

0 commit comments

Comments
 (0)