Skip to content

Commit f470592

Browse files
committed
Optimize gemv_n_sve kernel
1 parent 5e71cef commit f470592

1 file changed

Lines changed: 8 additions & 7 deletions

File tree

kernel/arm64/gemv_n_sve.c

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
5858
ix = 0;
5959
a_ptr = a;
6060
if (inc_y == 1) {
61+
BLASLONG width = n / 3;
6162
BLASLONG width = n / 3;
6263
uint64_t sve_size = SV_COUNT();
6364
svbool_t pg_true = SV_TRUE();
@@ -68,8 +69,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
6869
FLOAT *a2_ptr = a + lda * width * 2;
6970

7071
for (j = 0; j < width; j++) {
71-
i = 0;
72-
while ((i + sve_size * 1 - 1) < m) {
72+
73+
for (i = 0; (i + sve_size - 1) < m; i += sve_size) {
7374
ix = j * inc_x;
7475

7576
SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]);
@@ -86,8 +87,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
8687
y_vec = svmla_lane(y_vec, a02_vec, x2_vec, 0);
8788

8889
svst1(pg_true, y + i, y_vec);
89-
90-
i += sve_size * 1;
9190
}
9291

9392
if (i < m) {
@@ -117,29 +116,31 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
117116
a_ptr = a2_ptr;
118117
for (j = width * 3; j < n; j++) {
119118
ix = j * inc_x;
120-
i = 0;
121-
while ((i + sve_size * 1 - 1) < m) {
119+
for (i = 0; (i + sve_size - 1) < m; i += sve_size) {
122120
SV_TYPE y_vec = svld1(pg_true, y + i);
123121
SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]);
124122
SV_TYPE a_vec = svld1(pg_true, a_ptr + i);
125123
y_vec = svmla_x(pg_true, y_vec, a_vec, x_vec);
126124
svst1(pg_true, y + i, y_vec);
127-
i += sve_size * 1;
128125
}
129126

130127
if (i < m) {
131128
SV_TYPE y_vec = svld1(pg, y + i);
132129
SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]);
133130
SV_TYPE a_vec = svld1(pg, a_ptr + i);
134131
y_vec = svmla_m(pg, y_vec, a_vec, x_vec);
132+
y_vec = svmla_m(pg, y_vec, a_vec, x_vec);
135133
svst1(pg, y + i, y_vec);
136134
}
137135

136+
138137
a_ptr += lda;
139138
ix += inc_x;
140139
}
141140

142141
return (0);
142+
143+
return (0);
143144
}
144145

145146
for (j = 0; j < n; j++) {

0 commit comments

Comments
 (0)