Skip to content

Commit 31b1595

Browse files
author
Ahmad, Sameer
authored
AOCL LAPACK: Fix for dlarf regression
The following changes were made: - Modified the thresholds for multithreaded optimizations. - Use the single-threaded zen4 BLIS kernel for medium sizes. - On non-zen4 architectures, call dgemv within an OpenMP teams region to enforce single-threaded execution. Change-Id: Id7231bc49ba1898b2e76856b31fe2b782277d933 AMD-Internal: [CPUPL-6999]
1 parent 9eb9443 commit 31b1595

2 files changed

Lines changed: 47 additions & 25 deletions

File tree

src/base/flamec/include/FLA_macro_defs.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -298,10 +298,9 @@
298298
/* DLARF , threshold numbers to choose paths for performance */
299299
#define FLA_DLARF_ST_BLOCKED_THRESH_M (1000)
300300
#define FLA_DLARF_ST_BLOCKED_THRESH_N (100)
301-
#define FLA_DLARF_THRESH_UNBLOCKED (4096)
302-
#define FLA_DLARF_THRESH_THREAD_4 (81920)
303-
#define FLA_DLARF_THRESH_THREAD_8 (4096000)
304-
#define FLA_DLARF_THRESH_THREAD_32 (48720000)
301+
#define FLA_DLARF_THRESH_UNBLOCKED (213760)
302+
#define FLA_DLARF_THRESH_THREAD_8 (1048576)
303+
#define FLA_DLARF_THRESH_THREAD_64 (45198729)
305304

306305
/* DLARFT , threshold numbers to choose paths for performance */
307306
#define FLA_DLARFT_BLOCK_NB (16)

src/map/lapack2flamec/f2c/c/dlarf.c

Lines changed: 44 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
* Modifications Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
88
*/
99
#include "FLA_f2c.h" /* Table of constant values */
10+
#if FLA_ENABLE_AOCL_BLAS
11+
#include "blis.h"
12+
void bli_dgemv_t_zen4_int(conj_t conja, conj_t conjx, dim_t m, dim_t n, double *alpha, double *a,
13+
inc_t inca, inc_t lda, double *x, inc_t incx, double *beta, double *y,
14+
inc_t incy, cntx_t *cntx);
15+
#endif
1016

1117
static doublereal c_b4 = 1.;
1218
static doublereal c_b5 = 0.;
@@ -143,11 +149,14 @@ void dlarf_(char *side, integer *m, integer *n, doublereal *v, integer *incv, do
143149
#ifdef FLA_ENABLE_AMD_OPT
144150
extern void fla_dlarf_small_incv1_simd(integer lastv, integer lastc, double *c__, integer ldc,
145151
double *v, double tau, double *work);
152+
#if !FLA_ENABLE_AOCL_BLAS
146153
extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, integer *);
147154
extern void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *);
155+
#endif
148156
void fla_dlarf_tuning_params(integer m, integer n, FLA_Bool * use_blocked_flag,
149157
integer * nthreads);
150158
#endif
159+
#if !FLA_ENABLE_AOCL_BLAS
151160
extern /* Subroutine */
152161
void
153162
dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *,
@@ -157,6 +166,7 @@ void dlarf_(char *side, integer *m, integer *n, doublereal *v, integer *incv, do
157166
void
158167
dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *,
159168
integer *, doublereal *, doublereal *, integer *);
169+
#endif
160170
integer lastc, lastv;
161171
extern integer iladlc_(integer *, integer *, doublereal *, integer *),
162172
iladlr_(integer *, integer *, doublereal *, integer *);
@@ -248,6 +258,9 @@ void dlarf_(char *side, integer *m, integer *n, doublereal *v, integer *incv, do
248258
&& (lastv >= FLA_DGEMV_DGER_SIMD_SMALL_THRESH_M
249259
&& lastv <= FLA_DGEMV_DGER_SIMD_SMALL_THRESH);
250260

261+
/* Initialize global context data */
262+
aocl_fla_init();
263+
251264
/* If the size of the matrix is small and incv =1, use the optimized path */
252265
if(min_lastc_lastv && *incv == c__1 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
253266
{
@@ -283,9 +296,28 @@ void dlarf_(char *side, integer *m, integer *n, doublereal *v, integer *incv, do
283296
{
284297
/* Process in a single call */
285298
/* w(1:lastc,1) := C(1:lastv,1:lastc)**T * v(1:lastv,1) */
299+
#if FLA_ENABLE_AOCL_BLAS
300+
if(FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX512))
301+
{
302+
/* Use direct single threaded BLIS kernel */
303+
bli_dgemv_t_zen4_int(BLIS_CONJUGATE, BLIS_NO_CONJUGATE, lastv, lastc, &c_b4,
304+
&c__[c_offset], 1, *ldc, &v[1], *incv, &c_b5, &work[1],
305+
c__1, NULL);
306+
}
307+
else
308+
{
309+
#ifdef FLA_OPENMP_MULTITHREADING
310+
#pragma omp teams num_teams(1) thread_limit(1)
311+
#endif
312+
{
313+
dgemv_("Transpose", &lastv, &lastc, &c_b4, &c__[c_offset], ldc, &v[1],
314+
incv, &c_b5, &work[1], &c__1);
315+
}
316+
}
317+
#else
286318
dgemv_("Transpose", &lastv, &lastc, &c_b4, &c__[c_offset], ldc, &v[1], incv,
287319
&c_b5, &work[1], &c__1);
288-
320+
#endif
289321
/* C(1:lastv,1:lastc) := C(...) - v(1:lastv,1) * w(1:lastc,1)**T*/
290322
dger_(&lastv, &lastc, &d__1, &v[1], incv, &work[1], &c__1, &c__[c_offset], ldc);
291323
}
@@ -315,8 +347,13 @@ void dlarf_(char *side, integer *m, integer *n, doublereal *v, integer *incv, do
315347
#ifdef FLA_ENABLE_AMD_OPT
316348
void fla_dlarf_tuning_params(integer m, integer n, FLA_Bool *use_blocked_flag, integer *nthreads)
317349
{
318-
319350
extern int fla_thread_get_num_threads(void);
351+
integer num_elems = m * n;
352+
if(num_elems < FLA_DLARF_THRESH_UNBLOCKED)
353+
{
354+
*use_blocked_flag = 0;
355+
return;
356+
}
320357

321358
integer max_available_threads = fla_thread_get_num_threads();
322359

@@ -339,35 +376,21 @@ void fla_dlarf_tuning_params(integer m, integer n, FLA_Bool *use_blocked_flag, i
339376
/* General case */
340377

341378
integer opt_n_threads = 1;
342-
integer blocked_flag = 0;
343379

344-
integer num_elems = m * n;
345-
if(num_elems < FLA_DLARF_THRESH_UNBLOCKED)
346-
{
347-
blocked_flag = 0;
348-
}
349-
else if(num_elems < FLA_DLARF_THRESH_THREAD_4)
350-
{
351-
opt_n_threads = fla_min(4, n / 2);
352-
blocked_flag = 1;
353-
}
354-
else if(num_elems < FLA_DLARF_THRESH_THREAD_8)
380+
if(num_elems < FLA_DLARF_THRESH_THREAD_8)
355381
{
356382
opt_n_threads = fla_min(8, n / 2);
357-
blocked_flag = 1;
358383
}
359-
else if(num_elems < FLA_DLARF_THRESH_THREAD_32)
384+
else if(num_elems < FLA_DLARF_THRESH_THREAD_64)
360385
{
361-
opt_n_threads = fla_min(32, n / 2);
362-
blocked_flag = 1;
386+
opt_n_threads = fla_min(64, n / 2);
363387
}
364388
else
365389
{
366390
opt_n_threads = fla_min(128, n / 2);
367-
blocked_flag = 1;
368391
}
369392

370-
*use_blocked_flag = blocked_flag;
393+
*use_blocked_flag = 1;
371394
*nthreads = fla_min(opt_n_threads, max_available_threads);
372395
}
373-
#endif /* FLA_ENABLE_AMD_OPT */
396+
#endif /* FLA_ENABLE_AMD_OPT */

0 commit comments

Comments
 (0)