Fix for regression in DGESVD small sizes

varajago · varajago · commit c23efa065aea · 2025-11-13T04:48:23.000Z
Thresholds changed so that some medium sizes are also routed through the small-size path.
Direct call to dlasq1 for the NN cases (no singular vectors requested) is avoided, as it was causing a regression.
Added a NULL check on the malloc result in the DGELQF small path.

AMD Internal: CPUPL-7430
diff --git a/src/base/flamec/include/FLA_macro_defs.h b/src/base/flamec/include/FLA_macro_defs.h
@@ -332,9 +332,8 @@
 
 // DGESVD, DGESDD thresholds for small size optimization
 #define FLA_SVD_SMALL_SIZE_THRESH0    (16)
-#define FLA_SVD_SMALL_SIZE_THRESH1    (24)
-#define FLA_SVD_SMALL_SIZE_THRESH2    (40)
-#define FLA_SVD_SMALL_SIZE_THRESH3    (45)
+#define FLA_SVD_SMALL_SIZE_THRESH1    (128)
+#define FLA_SVD_SMALL_SIZE_THRESH2    (386)
 
 // TRTRI, threshold numbers to chose paths for performance
 #define FLA_TRTRI_SMALL_THRESH0      (60)
diff --git a/src/lapack/dec/svd/ext/flamec/lapack_dgesvd.c b/src/lapack/dec/svd/ext/flamec/lapack_dgesvd.c
@@ -399,7 +399,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
                 if(wntun)
                 {
                     /* Path 1 (M much larger than N, JOBU='N') */
-                    if((!wntvo) && (*m <= FLA_SVD_SMALL_SIZE_THRESH2)
+                    if((!wntvo) && (*m <= FLA_SVD_SMALL_SIZE_THRESH1)
                        && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
                     {
                         i__2 = *n << 2;
@@ -575,7 +575,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
                 {
                     /* Path 6 (M much larger than N, JOBU='S', JOBVT='S' or */
                     /* 'A') */
-                    if(*m <= FLA_SVD_SMALL_SIZE_THRESH2 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
+                    if(*m <= FLA_SVD_SMALL_SIZE_THRESH1 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
                     {
                         i__2 = *n * 3 + *m;
                         maxwrk = fla_max(i__2, bdspac);
@@ -724,9 +724,8 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
             else
             {
                 /* Path 10 (M at least N, but not much larger) */
-                if(!(wntuo || wntvo) && (((wntun && wntvn) && (*m < FLA_SVD_SMALL_SIZE_THRESH3)))
-                   || ((wntuas || wntvas) && (*m < FLA_SVD_SMALL_SIZE_THRESH1))
-                          && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
+                if(((wntun || wntus) && (wntvn || wntvs) && (*m < FLA_SVD_SMALL_SIZE_THRESH1))
+                     && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
                 {
                     i__2 = *n * 3 + *m;
                     maxwrk = fla_max(i__2, bdspac);
@@ -782,7 +781,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
                 if(wntvn)
                 {
                     /* Path 1t(N much larger than M, JOBVT='N') */
-                    if((wntun && wntvn) && (*n <= FLA_SVD_SMALL_SIZE_THRESH2)
+                    if((wntun && wntvn) && (*n <= FLA_SVD_SMALL_SIZE_THRESH2) && (*m < FLA_SVD_SMALL_SIZE_THRESH0)
                        && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
                     {
                         i__2 = *m << 2;
@@ -956,7 +955,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
                 {
                     /* Path 6t(N much larger than M, JOBU='S' or 'A', */
                     /* JOBVT='S') */
-                    if(*n <= FLA_SVD_SMALL_SIZE_THRESH2 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
+                    if(*n <= FLA_SVD_SMALL_SIZE_THRESH1 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
                     {
                         i__2 = *m * 3 + *n;
                         minwrk = fla_max(i__2, bdspac);
@@ -1793,7 +1792,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
                 /* Path 1 (M much larger than N, JOBU='N') */
                 /* No left singular vectors to be computed */
 #if FLA_ENABLE_AMD_OPT
-                if((!wntvo) && (*m <= FLA_SVD_SMALL_SIZE_THRESH2)
+                if((!wntvo) && (*m <= FLA_SVD_SMALL_SIZE_THRESH1)
                    && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
                 {
                     fla_dgesvd_small6(0, wntvas, m, n, &a[a_offset], lda, NULL, ldu, &s[1], NULL,
@@ -2384,7 +2383,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
                     /* N right singular vectors to be computed in VT */
                     /* Computing MAX */
 #if FLA_ENABLE_AMD_OPT
-                    if(*m <= FLA_SVD_SMALL_SIZE_THRESH2 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
+                    if(*m <= FLA_SVD_SMALL_SIZE_THRESH1 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
                     {
                         fla_dgesvd_small6(wntus, wntvas, m, n, &a[a_offset], lda, &a[a_offset], lda,
                                           &s[1], &u[u_offset], ldu, &vt[vt_offset], ldvt, &work[1],
@@ -2923,7 +2922,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
             /* Path 10 (M at least N, but not much larger) */
             /* Reduce to bidiagonal form without QR decomposition */
 #if FLA_ENABLE_AMD_OPT
-            if(((wntun || wntus) && (wntvn || wntvs) && (*m < FLA_SVD_SMALL_SIZE_THRESH3))
+            if(((wntun || wntus) && (wntvn || wntvs) && (*m < FLA_SVD_SMALL_SIZE_THRESH1))
                && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
             {
                 fla_dgesvd_xx_small10(wntuas, wntvas, m, n, n, &a[a_offset], lda, &s[1],
@@ -3048,7 +3047,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
                 itau = 1;
                 iwork = itau + *m;
 #if FLA_ENABLE_AMD_OPT
-                if((wntun && wntvn) && (*n <= FLA_SVD_SMALL_SIZE_THRESH2)
+                if((wntun && wntvn) && (*n <= FLA_SVD_SMALL_SIZE_THRESH2) && (*m < FLA_SVD_SMALL_SIZE_THRESH0)
                    && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2))
                 {
                     /* Compute A=L*Q */
@@ -3636,7 +3635,7 @@ int lapack_dgesvd(char *jobu, char *jobvt, aocl_int64_t *m, aocl_int64_t *n, dou
                     /* M right singular vectors to be computed in VT and */
                     /* M left singular vectors to be computed in U */
 #if FLA_ENABLE_AMD_OPT
-                    if(*n <= FLA_SVD_SMALL_SIZE_THRESH2 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2)
+                    if(*n <= FLA_SVD_SMALL_SIZE_THRESH1 && FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX2)
                        && *lwork >= maxwrk)
                     {
                         iu = 1;
diff --git a/src/lapack/x86/avx2/fla_dgesvd_nn_small10_avx2.c b/src/lapack/x86/avx2/fla_dgesvd_nn_small10_avx2.c
@@ -171,23 +171,9 @@ void fla_dgesvd_xx_small10_avx2(aocl_int64_t wntu, aocl_int64_t wntv, aocl_int64
     }
     else
     {
-        if(ncvt == 0 && nru == 0)
-        {
-            /* Compute Singular Values excluding computation of Singular Vectors */
-            aocl_lapack_dlasq1(n, &s[1], &e[1], &work[itauq - 1], info);
-
-            /* Ensure singular values are positive */
-            if(*info == 0)
-            {
-                FLA_ENSURE_POSITIVE_SINGULAR_VALUES(*n);
-            }
-        }
-        else
-        {
-            /* Compute Singular Values and Vectors */
-            lapack_dbdsqr_small("U", n, &ncvt, &nru, &s[1], &e[1], &vt[1 + *ldvt], ldvt,
-                                &u[1 + *ldu], ldu, info);
-        }
+        /* Compute Singular Values and Vectors */
+        lapack_dbdsqr_small("U", n, &ncvt, &nru, &s[1], &e[1], &vt[1 + *ldvt], ldvt,
+                            &u[1 + *ldu], ldu, info);
     }
     return;
 }
diff --git a/src/lapack/x86/avx2/fla_dgesvd_small6_avx2.c b/src/lapack/x86/avx2/fla_dgesvd_small6_avx2.c
@@ -130,23 +130,9 @@ void fla_dgesvd_small6_avx2(aocl_int64_t wntus, aocl_int64_t wntvs, aocl_int64_t
     }
     else
     {
-        if(ncvt == 0 && nru == 0)
-        {
-            /* Compute Singular Values excluding computation of Singular Vectors */
-            aocl_lapack_dlasq1(n, &s[1], &e[1], &work[itau - 1], info);
-
-            /* Ensure singular values are positive */
-            if(*info == 0)
-            {
-                FLA_ENSURE_POSITIVE_SINGULAR_VALUES(*n);
-            }
-        }
-        else
-        {
-            /* Compute Singular Values and Vectors */
-            lapack_dbdsqr_small("U", n, &ncvt, &nru, &s[1], &e[1], &vt[1 + *ldvt], ldvt,
-                                &u[1 + *ldu], ldu, info);
-        }
+        /* Compute Singular Values and Vectors */
+        lapack_dbdsqr_small("U", n, &ncvt, &nru, &s[1], &e[1], &vt[1 + *ldvt], ldvt,
+                            &u[1 + *ldu], ldu, info);
     }
 
     /* Compute U by updating U' by applying from the left the Q from QR */
diff --git a/src/lapack/x86/front/fla_lapack_x86_common.c b/src/lapack/x86/front/fla_lapack_x86_common.c
@@ -153,6 +153,10 @@ int fla_dgelqf_small(aocl_int64_t *m, aocl_int64_t *n, doublereal *a, aocl_int64
 
         /* Allocate transpose matrix */
         at = malloc(*n * *m * sizeof(doublereal));
+        if(at == NULL)
+        {
+            return -1;
+        }
 
         /* Do transpose and store it in at */
         fla_dtranspose(m, n, a, lda, at, n);