77 * Modifications Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
88 */
99#include "FLA_f2c.h" /* Table of constant values */
10+ #if FLA_ENABLE_AOCL_BLAS
11+ #include "blis.h"
/* Hand-written prototype for the BLIS kernel bli_dgemv_t_zen4_int; it is only
 * compiled in when FLA_ENABLE_AOCL_BLAS is non-zero. dlarf_ calls this kernel
 * directly, in place of dgemv_("Transpose", ...), when
 * FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX512) holds.
 * NOTE(review): presumably declared here because blis.h does not export this
 * kernel's prototype - confirm against the BLIS headers in use. */
12+ void bli_dgemv_t_zen4_int (conj_t conja , conj_t conjx , dim_t m , dim_t n , double * alpha , double * a ,
13+ inc_t inca , inc_t lda , double * x , inc_t incx , double * beta , double * y ,
14+ inc_t incy , cntx_t * cntx );
15+ #endif
1016
/* c_b4 (1.0) and c_b5 (0.0) are passed by address as the alpha and beta
 * arguments of bli_dgemv_t_zen4_int in dlarf_, and at the corresponding
 * scalar positions of the dgemv_("Transpose", ...) calls. */
1117static doublereal c_b4 = 1. ;
1218static doublereal c_b5 = 0. ;
@@ -143,11 +149,14 @@ void dlarf_(char *side, integer *m, integer *n, doublereal *v, integer *incv, do
143149#ifdef FLA_ENABLE_AMD_OPT
144150 extern void fla_dlarf_small_incv1_simd (integer lastv , integer lastc , double * c__ , integer ldc ,
145151 double * v , double tau , double * work );
152+ #if !FLA_ENABLE_AOCL_BLAS
146153 extern doublereal ddot_ (integer * , doublereal * , integer * , doublereal * , integer * );
147154 extern void daxpy_ (integer * , doublereal * , doublereal * , integer * , doublereal * , integer * );
155+ #endif
148156 void fla_dlarf_tuning_params (integer m , integer n , FLA_Bool * use_blocked_flag ,
149157 integer * nthreads );
150158#endif
159+ #if !FLA_ENABLE_AOCL_BLAS
151160 extern /* Subroutine */
152161 void
153162 dger_ (integer * , integer * , doublereal * , doublereal * , integer * , doublereal * , integer * ,
@@ -157,6 +166,7 @@ void dlarf_(char *side, integer *m, integer *n, doublereal *v, integer *incv, do
157166 void
158167 dgemv_ (char * , integer * , integer * , doublereal * , doublereal * , integer * , doublereal * ,
159168 integer * , doublereal * , doublereal * , integer * );
169+ #endif
160170 integer lastc , lastv ;
161171 extern integer iladlc_ (integer * , integer * , doublereal * , integer * ),
162172 iladlr_ (integer * , integer * , doublereal * , integer * );
@@ -248,6 +258,9 @@ void dlarf_(char *side, integer *m, integer *n, doublereal *v, integer *incv, do
248258 && (lastv >= FLA_DGEMV_DGER_SIMD_SMALL_THRESH_M
249259 && lastv <= FLA_DGEMV_DGER_SIMD_SMALL_THRESH );
250260
261+ /* Initialize global context data */
262+ aocl_fla_init ();
263+
251264 /* If the size of the matrix is small and incv =1, use the optimized path */
252265 if (min_lastc_lastv && * incv == c__1 && FLA_IS_MIN_ARCH_ID (FLA_ARCH_AVX2 ))
253266 {
@@ -283,9 +296,28 @@ void dlarf_(char *side, integer *m, integer *n, doublereal *v, integer *incv, do
283296 {
284297 /* Process in a single call */
285298 /* w(1:lastc,1) := C(1:lastv,1:lastc)**T * v(1:lastv,1) */
299+ #if FLA_ENABLE_AOCL_BLAS
300+ if (FLA_IS_MIN_ARCH_ID (FLA_ARCH_AVX512 ))
301+ {
302+ /* Use direct single threaded BLIS kernel */
303+ bli_dgemv_t_zen4_int (BLIS_CONJUGATE , BLIS_NO_CONJUGATE , lastv , lastc , & c_b4 ,
304+ & c__ [c_offset ], 1 , * ldc , & v [1 ], * incv , & c_b5 , & work [1 ],
305+ c__1 , NULL );
306+ }
307+ else
308+ {
309+ #ifdef FLA_OPENMP_MULTITHREADING
310+ #pragma omp teams num_teams(1) thread_limit(1)
311+ #endif
312+ {
313+ dgemv_ ("Transpose" , & lastv , & lastc , & c_b4 , & c__ [c_offset ], ldc , & v [1 ],
314+ incv , & c_b5 , & work [1 ], & c__1 );
315+ }
316+ }
317+ #else
286318 dgemv_ ("Transpose" , & lastv , & lastc , & c_b4 , & c__ [c_offset ], ldc , & v [1 ], incv ,
287319 & c_b5 , & work [1 ], & c__1 );
288-
320+ #endif
289321 /* C(1:lastv,1:lastc) := C(...) - v(1:lastv,1) * w(1:lastc,1)**T*/
290322 dger_ (& lastv , & lastc , & d__1 , & v [1 ], incv , & work [1 ], & c__1 , & c__ [c_offset ], ldc );
291323 }
@@ -315,8 +347,13 @@ void dlarf_(char *side, integer *m, integer *n, doublereal *v, integer *incv, do
315347#ifdef FLA_ENABLE_AMD_OPT
316348void fla_dlarf_tuning_params (integer m , integer n , FLA_Bool * use_blocked_flag , integer * nthreads )
317349{
318-
319350 extern int fla_thread_get_num_threads (void );
351+ integer num_elems = m * n ;
352+ if (num_elems < FLA_DLARF_THRESH_UNBLOCKED )
353+ {
354+ * use_blocked_flag = 0 ;
355+ return ;
356+ }
320357
321358 integer max_available_threads = fla_thread_get_num_threads ();
322359
@@ -339,35 +376,21 @@ void fla_dlarf_tuning_params(integer m, integer n, FLA_Bool *use_blocked_flag, i
339376 /* General case */
340377
341378 integer opt_n_threads = 1 ;
342- integer blocked_flag = 0 ;
343379
344- integer num_elems = m * n ;
345- if (num_elems < FLA_DLARF_THRESH_UNBLOCKED )
346- {
347- blocked_flag = 0 ;
348- }
349- else if (num_elems < FLA_DLARF_THRESH_THREAD_4 )
350- {
351- opt_n_threads = fla_min (4 , n / 2 );
352- blocked_flag = 1 ;
353- }
354- else if (num_elems < FLA_DLARF_THRESH_THREAD_8 )
380+ if (num_elems < FLA_DLARF_THRESH_THREAD_8 )
355381 {
356382 opt_n_threads = fla_min (8 , n / 2 );
357- blocked_flag = 1 ;
358383 }
359- else if (num_elems < FLA_DLARF_THRESH_THREAD_32 )
384+ else if (num_elems < FLA_DLARF_THRESH_THREAD_64 )
360385 {
361- opt_n_threads = fla_min (32 , n / 2 );
362- blocked_flag = 1 ;
386+ opt_n_threads = fla_min (64 , n / 2 );
363387 }
364388 else
365389 {
366390 opt_n_threads = fla_min (128 , n / 2 );
367- blocked_flag = 1 ;
368391 }
369392
370- * use_blocked_flag = blocked_flag ;
393+ * use_blocked_flag = 1 ;
371394 * nthreads = fla_min (opt_n_threads , max_available_threads );
372395}
373- #endif /* FLA_ENABLE_AMD_OPT */
396+ #endif /* FLA_ENABLE_AMD_OPT */
0 commit comments