11/ *******************************************************************************
2- Copyright (c) 2015 , The OpenBLAS Project
2+ Copyright (c) 2015 , 2024 The OpenBLAS Project
33All rights reserved.
44Redistribution and use in source and binary forms , with or without
55modification , are permitted provided that the following conditions are
@@ -170,39 +170,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
170170
171171.macro KERNEL_F32_FINALIZE
172172#if !defined(DOUBLE)
173- fadd v1.4s , v1.4s , v2.4s
173+ // F8 only has 2 accumulators
174+ // so add into those pairs
174175 fadd v1.4s , v1.4s , v3.4s
175- fadd v1.4s , v1.4s , v4.4s
176- #else
177- fadd v1.2d , v1.2d , v2.2d
178- fadd v1.2d , v1.2d , v3.2d
179- fadd v1.2d , v1.2d , v4.2d
176+ fadd v2.4s , v2.4s , v4.4s
180177#endif
181178.endm
182179
183- .macro KERNEL_F4
180+ .macro KERNEL_F8
184181#if !defined(DOUBLE)
185- ld1 {v2 .4s} , [ A_PTR ], # 16
186- ld1 {v3 .4s} , [ X_PTR ], # 16
187- fmla v1.4s , v2 .4s , v3 .4s
188- #else
189- ld1 {v2.2d} , [ A_PTR ], # 16
190- ld1 {v3 .2d} , [ X_PTR ], # 16
191- fmla v1 .2d , v2 .2d , v3 .2d
192-
193- ld1 {v4 .2d} , [ A_PTR ], # 16
194- ld1 {v5 .2d} , [ X_PTR ], # 16
195- fmla v1 .2d , v4 .2d , v5 .2d
182+ ld1 {v13 .4s , v14.4s }, [ A_PTR ], # 32
183+ ld1 {v17 .4s , v18.4s }, [ X_PTR ], # 32
184+ fmla v1.4s , v13 .4s , v17 .4s
185+ fmla v2.4s , v14.4s , v18.4s
186+ #else
187+ ld1 {v13 .2d , v14.2d , v15.2d , v16.2d }, [ A_PTR ], # 64
188+ ld1 {v17 .2d , v18 .2d , v19 .2d , v20.2d} , [ X_PTR ], # 64
189+ fmla v1.2d , v13.2d , v17.2d
190+ fmla v2 .2d , v14.2d , v18.2d
191+ fmla v3 .2d , v15.2d , v19.2d
192+ fmla v4 .2d , v16 .2d , v20 .2d
196193#endif
197194.endm
198195
199- .macro KERNEL_F4_FINALIZE
196+ .macro KERNEL_F8_FINALIZE
200197#if !defined(DOUBLE)
201- ext v2.16b , v1.16b , v1.16b , # 8
198+ // Take the top two elements of v1 and
199+ // put them into the first two lanes of v3
200+ ext v3.16b , v1.16b , v1.16b , # 8
201+ fadd v1.2s , v1.2s , v3.2s
202+ ext v4.16b , v2.16b , v2.16b , # 8
203+ fadd v2.2s , v2.2s , v4.2s
204+ // Final pair
202205 fadd v1.2s , v1.2s , v2.2s
203206 faddp TEMP , v1.2s
204207#else
205208 faddp TEMP , v1.2d
209+ faddp TEMP1 , v2.2d
210+ faddp TEMP2 , v3.2d
211+ faddp TEMP3 , v4.2d
212+ fadd TEMP , TEMP , TEMP1
213+ fadd TEMP2 , TEMP2 , TEMP3
214+ fadd TEMP , TEMP , TEMP2
206215#endif
207216.endm
208217
@@ -258,7 +267,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
258267
259268 asr I , M , # 5
260269 cmp I , xzr
261- beq .Lgemv_t_kernel_F4
270+ beq .Lgemv_t_kernel_F8
262271
263272.Lgemv_t_kernel_F320:
264273
@@ -269,24 +278,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
269278
270279 KERNEL_F32_FINALIZE
271280
272- .Lgemv_t_kernel_F4 :
281+ .Lgemv_t_kernel_F8 :
273282 ands I , M , # 31
274- asr I , I , # 2
283+ asr I , I , # 3
275284 cmp I , xzr
276285 beq .Lgemv_t_kernel_F1
277286
278- .Lgemv_t_kernel_F40 :
287+ .Lgemv_t_kernel_F80 :
279288
280- KERNEL_F4
289+ KERNEL_F8
281290
282291 subs I , I , # 1
283- bne .Lgemv_t_kernel_F40
292+ bne .Lgemv_t_kernel_F80
284293
285294.Lgemv_t_kernel_F1:
286295
287- KERNEL_F4_FINALIZE
296+ KERNEL_F8_FINALIZE
288297
289- ands I , M , # 3
298+ ands I , M , # 7
290299 ble .Lgemv_t_kernel_F_END
291300
292301.Lgemv_t_kernel_F10:
0 commit comments