Skip to content

Commit c23efa0

Browse files
committed
Fix for regression in DGESVD small sizes
Changes in thresholds made to include some of the medium sizes in the small path. Direct call to dlasv2 for NN cases avoided, as it is causing a regression. Guard added for malloc in the DGELQF small path. AMD Internal: CPUPL-7430
1 parent 9feb685 commit c23efa0

File tree

5 files changed

+23
-49
lines changed

5 files changed

+23
-49
lines changed

src/base/flamec/include/FLA_macro_defs.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -332,9 +332,8 @@
332332

333333
// DGESVD, DGESDD thresholds for small size optimization
334334
#define FLA_SVD_SMALL_SIZE_THRESH0 (16)
335-
#define FLA_SVD_SMALL_SIZE_THRESH1 (24)
336-
#define FLA_SVD_SMALL_SIZE_THRESH2 (40)
337-
#define FLA_SVD_SMALL_SIZE_THRESH3 (45)
335+
#define FLA_SVD_SMALL_SIZE_THRESH1 (128)
336+
#define FLA_SVD_SMALL_SIZE_THRESH2 (386)
338337

339338
// TRTRI, threshold numbers to chose paths for performance
340339
#define FLA_TRTRI_SMALL_THRESH0 (60)

src/lapack/dec/svd/ext/flamec/lapack_dgesvd.c

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
399399
if(wntun)
400400
{
401401
/* Path 1 (M much larger than N, JOBU='N') */
402-
if((!wntvo) && (*m <= FLA_SVD_SMALL_SIZE_THRESH2)
402+
if((!wntvo) && (*m <= FLA_SVD_SMALL_SIZE_THRESH1)
403403
&& FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
404404
{
405405
i__2 = *n << 2;
@@ -575,7 +575,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
575575
{
576576
/* Path 6 (M much larger than N, JOBU='S', JOBVT='S' or */
577577
/* 'A') */
578-
if(*m <= FLA_SVD_SMALL_SIZE_THRESH2 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
578+
if(*m <= FLA_SVD_SMALL_SIZE_THRESH1 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
579579
{
580580
i__2 = *n * 3 + *m;
581581
maxwrk = fla_max(i__2, bdspac);
@@ -724,9 +724,8 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
724724
else
725725
{
726726
/* Path 10 (M at least N, but not much larger) */
727-
if(!(wntuo || wntvo) && (((wntun && wntvn) && (*m < FLA_SVD_SMALL_SIZE_THRESH3)))
728-
|| ((wntuas || wntvas) && (*m < FLA_SVD_SMALL_SIZE_THRESH1))
729-
&& FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
727+
if(((wntun || wntus) && (wntvn || wntvs) && (*m < FLA_SVD_SMALL_SIZE_THRESH1))
728+
&& FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
730729
{
731730
i__2 = *n * 3 + *m;
732731
maxwrk = fla_max(i__2, bdspac);
@@ -782,7 +781,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
782781
if(wntvn)
783782
{
784783
/* Path 1t(N much larger than M, JOBVT='N') */
785-
if((wntun && wntvn) && (*n <= FLA_SVD_SMALL_SIZE_THRESH2)
784+
if((wntun && wntvn) && (*n <= FLA_SVD_SMALL_SIZE_THRESH2) && (*m < FLA_SVD_SMALL_SIZE_THRESH0)
786785
&& FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
787786
{
788787
i__2 = *m << 2;
@@ -956,7 +955,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
956955
{
957956
/* Path 6t(N much larger than M, JOBU='S' or 'A', */
958957
/* JOBVT='S') */
959-
if(*n <= FLA_SVD_SMALL_SIZE_THRESH2 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
958+
if(*n <= FLA_SVD_SMALL_SIZE_THRESH1 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
960959
{
961960
i__2 = *m * 3 + *n;
962961
minwrk = fla_max(i__2, bdspac);
@@ -1793,7 +1792,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
17931792
/* Path 1 (M much larger than N, JOBU='N') */
17941793
/* No left singular vectors to be computed */
17951794
#if FLA_ENABLE_AMD_OPT
1796-
if((!wntvo) && (*m <= FLA_SVD_SMALL_SIZE_THRESH2)
1795+
if((!wntvo) && (*m <= FLA_SVD_SMALL_SIZE_THRESH1)
17971796
&& FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
17981797
{
17991798
fla_dgesvd_small6(0, wntvas, m, n, &a[a_offset], lda, NULL, ldu, &s[1], NULL,
@@ -2384,7 +2383,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
23842383
/* N right singular vectors to be computed in VT */
23852384
/* Computing MAX */
23862385
#if FLA_ENABLE_AMD_OPT
2387-
if(*m <= FLA_SVD_SMALL_SIZE_THRESH2 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
2386+
if(*m <= FLA_SVD_SMALL_SIZE_THRESH1 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
23882387
{
23892388
fla_dgesvd_small6(wntus, wntvas, m, n, &a[a_offset], lda, &a[a_offset], lda,
23902389
&s[1], &u[u_offset], ldu, &vt[vt_offset], ldvt, &work[1],
@@ -2923,7 +2922,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
29232922
/* Path 10 (M at least N, but not much larger) */
29242923
/* Reduce to bidiagonal form without QR decomposition */
29252924
#if FLA_ENABLE_AMD_OPT
2926-
if(((wntun || wntus) && (wntvn || wntvs) && (*m < FLA_SVD_SMALL_SIZE_THRESH3))
2925+
if(((wntun || wntus) && (wntvn || wntvs) && (*m < FLA_SVD_SMALL_SIZE_THRESH1))
29272926
&& FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
29282927
{
29292928
fla_dgesvd_xx_small10(wntuas, wntvas, m, n, n, &a[a_offset], lda, &s[1],
@@ -3048,7 +3047,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
30483047
itau = 1;
30493048
iwork = itau + *m;
30503049
#if FLA_ENABLE_AMD_OPT
3051-
if((wntun && wntvn) && (*n <= FLA_SVD_SMALL_SIZE_THRESH2)
3050+
if((wntun && wntvn) && (*n <= FLA_SVD_SMALL_SIZE_THRESH2) && (*m < FLA_SVD_SMALL_SIZE_THRESH0)
30523051
&& FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
30533052
{
30543053
/* Compute A=L*Q */
@@ -3636,7 +3635,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
36363635
/* M right singular vectors to be computed in VT and */
36373636
/* M left singular vectors to be computed in U */
36383637
#if FLA_ENABLE_AMD_OPT
3639-
if(*n <= FLA_SVD_SMALL_SIZE_THRESH2 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2)
3638+
if(*n <= FLA_SVD_SMALL_SIZE_THRESH1 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2)
36403639
&& *lwork >= maxwrk)
36413640
{
36423641
iu = 1;

src/lapack/x86/avx2/fla_dgesvd_nn_small10_avx2.c

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -171,23 +171,9 @@ void fla_dgesvd_xx_small10_avx2(aocl_int64_t wntu, aocl_int64_t wntv, aocl_int64
171171
}
172172
else
173173
{
174-
if(ncvt == 0 && nru == 0)
175-
{
176-
/* Compute Singular Values excluding computation of Singular Vectors */
177-
aocl_lapack_dlasq1(n, &s[1], &e[1], &work[itauq - 1], info);
178-
179-
/* Ensure singular values are positive */
180-
if(*info == 0)
181-
{
182-
FLA_ENSURE_POSITIVE_SINGULAR_VALUES(*n);
183-
}
184-
}
185-
else
186-
{
187-
/* Compute Singular Values and Vectors */
188-
lapack_dbdsqr_small("U", n, &ncvt, &nru, &s[1], &e[1], &vt[1 + *ldvt], ldvt,
189-
&u[1 + *ldu], ldu, info);
190-
}
174+
/* Compute Singular Values and Vectors */
175+
lapack_dbdsqr_small("U", n, &ncvt, &nru, &s[1], &e[1], &vt[1 + *ldvt], ldvt,
176+
&u[1 + *ldu], ldu, info);
191177
}
192178
return;
193179
}

src/lapack/x86/avx2/fla_dgesvd_small6_avx2.c

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -130,23 +130,9 @@ void fla_dgesvd_small6_avx2(aocl_int64_t wntus, aocl_int64_t wntvs, aocl_int64_t
130130
}
131131
else
132132
{
133-
if(ncvt == 0 && nru == 0)
134-
{
135-
/* Compute Singular Values excluding computation of Singular Vectors */
136-
aocl_lapack_dlasq1(n, &s[1], &e[1], &work[itau - 1], info);
137-
138-
/* Ensure singular values are positive */
139-
if(*info == 0)
140-
{
141-
FLA_ENSURE_POSITIVE_SINGULAR_VALUES(*n);
142-
}
143-
}
144-
else
145-
{
146-
/* Compute Singular Values and Vectors */
147-
lapack_dbdsqr_small("U", n, &ncvt, &nru, &s[1], &e[1], &vt[1 + *ldvt], ldvt,
148-
&u[1 + *ldu], ldu, info);
149-
}
133+
/* Compute Singular Values and Vectors */
134+
lapack_dbdsqr_small("U", n, &ncvt, &nru, &s[1], &e[1], &vt[1 + *ldvt], ldvt,
135+
&u[1 + *ldu], ldu, info);
150136
}
151137

152138
/* Compute U by updating U' by applying from the left the Q from QR */

src/lapack/x86/front/fla_lapack_x86_common.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,10 @@ int fla_dgelqf_small(aocl_int64_t *m, aocl_int64_t *n, doublereal *a, aocl_int64
153153

154154
/* Allocate transpose matrix */
155155
at = malloc(*n * *m * sizeof(doublereal));
156+
if(at == NULL)
157+
{
158+
return -1;
159+
}
156160

157161
/* Do transpose and store it in at */
158162
fla_dtranspose(m, n, a, lda, at, n);

0 commit comments

Comments
 (0)