Skip to content

Commit e8bddce

Browse files
Ahmad, Sameervarajago
authored andcommitted
AOCL LAPACK: DORGQR Regression Fix (#151)
- Fixed regression for medium/large size DORGQR regression by rolling back optimization for dlarft. Change-Id: Ieb8ea82dbe547833b4d235edb70cb739e00ebf37 AMD-Internal: CPUPL-6962
1 parent c23efa0 commit e8bddce

File tree

1 file changed

+0
-202
lines changed

1 file changed

+0
-202
lines changed

src/map/lapack2flamec/f2c/c/dlarft.c

Lines changed: 0 additions & 202 deletions
Original file line numberDiff line numberDiff line change
@@ -247,205 +247,6 @@ void aocl_lapack_dlarft(char *direct, char *storev, aocl_int64_t *n, aocl_int64_
247247
{
248248
prevlastv = *n;
249249
i__1 = *k;
250-
#ifdef FLA_ENABLE_AMD_OPT
251-
if(lsame_(storev, "C", 1, 1))
252-
{
253-
aocl_int64_t nb = FLA_DLARFT_BLOCK_NB;
254-
aocl_int64_t f_ncols = fla_min(nb, *k);
255-
doublereal n_tau;
256-
doublereal diag_elem;
257-
aocl_int64_t block_last_i;
258-
259-
/*
260-
* Let V be the matrix (n x k) of the elementary reflectors
261-
*
262-
*
263-
* V is partitioned as follows:
264-
*
265-
* V = || V11 V12 ||
266-
* || V21 V22 ||
267-
* || V31 V32 ||
268-
*
269-
* Where,
270-
* V11 is unit lower triangular
271-
* V21 is rectangular
272-
* V31 is rectangular
273-
* V12 is rectangular
274-
* V22 is rectangular
275-
* V32 is rectangular
276-
*
277-
* Let T be the matrix (k x k) of the elementary reflectors
278-
*
279-
* T is paritioned as follows:
280-
*
281-
* T = || T11 T12 ||
282-
* || T21 T22 ||
283-
*
284-
* Where,
285-
* T11 is upper triangular
286-
* T21 is zero
287-
* T12 is rectangular
288-
* T22 is upper triangular
289-
*
290-
*
291-
*/
292-
293-
/* For the first column of T, we have T11(1,1) = tau[1] */
294-
t[t_dim1 + 1] = tau[1];
295-
296-
/*
297-
* Generate T11 using unblocked algorithm
298-
*
299-
*/
300-
for(i__ = 2; i__ <= f_ncols; ++i__)
301-
{
302-
prevlastv = fla_max(prevlastv, i__);
303-
aocl_int64_t m_lower_triangular = prevlastv - i__ + 1;
304-
aocl_int64_t n_left = i__ - 1;
305-
n_tau = -tau[i__];
306-
diag_elem = v[i__ + i__ * v_dim1];
307-
/* Explicitly make the diagonal element of V11 equal to 1 */
308-
v[i__ + i__ * v_dim1] = 1.;
309-
/* T11(1:i-1,i) := - tau(i) * V11(i:j,1:i-1)**T * V11(i:j,i) */
310-
#if FLA_ENABLE_AOCL_BLAS
311-
if(FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX512))
312-
{
313-
bli_dgemv_t_zen4_int(BLIS_CONJUGATE, BLIS_NO_CONJUGATE, m_lower_triangular,
314-
n_left, &n_tau, &v[i__ + v_dim1], 1, *ldv,
315-
&v[i__ + i__ * v_dim1], c__1, &c_b0, &t[i__ * t_dim1 + 1],
316-
c__1, NULL);
317-
}
318-
else
319-
#endif
320-
{
321-
aocl_blas_dgemv("Transpose", &m_lower_triangular, &n_left, &n_tau,
322-
&v[i__ + v_dim1], ldv, &v[i__ + i__ * v_dim1], &c__1, &c_b0,
323-
&t[i__ * t_dim1 + 1], &c__1);
324-
}
325-
/* Restore the diagonal element of V11 */
326-
v[i__ + i__ * v_dim1] = diag_elem;
327-
/* T11(1:i-1,i) := T11(1:i-1,1:i-1) * T11(1:i-1,i) */
328-
aocl_blas_dtrmv("Upper", "No transpose", "Non-unit", &n_left, &t[t_offset], ldt,
329-
&t[i__ * t_dim1 + 1], &c__1);
330-
/* T11(i,i) := tau(i) */
331-
t[i__ + i__ * t_dim1] = tau[i__];
332-
}
333-
334-
/* Process the remaining blocks from column nb + 1 to k */
335-
for(i__ = nb + 1; i__ <= *k; i__ += nb)
336-
{
337-
/* Using gemm for partial update of T12 */
338-
block_last_i = fla_min(i__ + nb - 1, *k);
339-
aocl_int64_t n_v32 = block_last_i - i__ + 1;
340-
aocl_int64_t m_v31 = fla_max(*n, block_last_i) - block_last_i;
341-
aocl_int64_t n_v31 = i__ - 1;
342-
343-
/* T12 = V31**T * V32 */
344-
aocl_blas_dgemm("Transpose", "No transpose", &n_v31, &n_v32, &m_v31, &c_b6,
345-
&v[block_last_i + 1 + v_dim1], ldv,
346-
&v[block_last_i + 1 + i__ * v_dim1], ldv, &c_b0,
347-
&t[i__ * t_dim1 + 1], ldt);
348-
349-
for(j = i__; j <= block_last_i; ++j)
350-
{
351-
aocl_int64_t m_v22_j = block_last_i - j + 1;
352-
n_tau = -tau[j];
353-
diag_elem = v[j + j * v_dim1];
354-
/* Explicitly make the diagonal element of V22 equal to 1 */
355-
v[j + j * v_dim1] = 1.;
356-
/* Update T12
357-
T12(:, j) = -tau[j] * T12(:, j) -tau[j] * V21**T * V22(:, j)
358-
*/
359-
#if FLA_ENABLE_AOCL_BLAS
360-
if(FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX512))
361-
{
362-
double n_tau_d = n_tau;
363-
bli_dgemv_t_zen4_int(BLIS_CONJUGATE, BLIS_NO_CONJUGATE, m_v22_j, n_v31,
364-
&n_tau, &v[j + v_dim1], 1, *ldv, &v[j + j * v_dim1],
365-
c__1, &n_tau_d, &t[j * t_dim1 + 1], c__1, NULL);
366-
}
367-
else
368-
#endif
369-
{
370-
aocl_blas_dgemv("Transpose", &m_v22_j, &n_v31, &n_tau, &v[j + v_dim1], ldv,
371-
&v[j + j * v_dim1], &c__1, &n_tau, &t[j * t_dim1 + 1],
372-
&c__1);
373-
}
374-
375-
/* V22_32 = || V22 ||
376-
* || V32 ||
377-
*/
378-
379-
aocl_int64_t m_v22_32_j = fla_max(*n, j) - j + 1;
380-
aocl_int64_t n_v22_32_j = j - i__;
381-
382-
/* Update T22
383-
* T22(:, j) = -tau[j] * V22_32(:,1:j-1)**T * V22_32(:, j)
384-
*/
385-
#if FLA_ENABLE_AOCL_BLAS
386-
if(FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX512))
387-
{
388-
bli_dgemv_t_zen4_int(BLIS_CONJUGATE, BLIS_NO_CONJUGATE, m_v22_32_j,
389-
n_v22_32_j, &n_tau, &v[j + v_dim1 * i__], 1, *ldv,
390-
&v[j + j * v_dim1], c__1, &c_b0, &t[j * t_dim1 + i__],
391-
c__1, NULL);
392-
}
393-
else
394-
#endif
395-
{
396-
aocl_blas_dgemv("Transpose", &m_v22_32_j, &n_v22_32_j, &n_tau,
397-
&v[j + v_dim1 * i__], ldv, &v[j + j * v_dim1], &c__1, &c_b0,
398-
&t[j * t_dim1 + i__], &c__1);
399-
}
400-
401-
/* Restore the diagonal element of V22 */
402-
v[j + j * v_dim1] = diag_elem;
403-
/* T22(j, j) = tau[j] */
404-
t[j + j * t_dim1] = tau[j];
405-
}
406-
407-
aocl_int64_t m_t12 = i__ - 1;
408-
aocl_int64_t n_t12 = block_last_i - i__ + 1;
409-
410-
/* Update T12
411-
* T12 = T11 * T12
412-
*/
413-
aocl_blas_dtrmm("Left", "Upper", "No transpose", "Non-unit", &m_t12, &n_t12, &c_b6,
414-
&t[t_offset], ldt, &t[i__ * t_dim1 + 1], ldt);
415-
416-
for(j = i__ + 1; j <= block_last_i; ++j)
417-
{
418-
aocl_int64_t n_t12_j = j - i__;
419-
/* Update T12
420-
* T12(:, j) = T12(:,j) + T12 * T12(:, j)
421-
*/
422-
#if FLA_ENABLE_AOCL_BLAS
423-
if(FLA_IS_MIN_ARCH_ID(FLA_ARCH_AVX512))
424-
{
425-
bli_dgemv_n_zen4_int_40x2_st(BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_t12,
426-
n_t12_j, &c_b6, &t[1 + i__ * t_dim1], 1, *ldt,
427-
&t[i__ + j * t_dim1], c__1, &c_b6,
428-
&t[j * t_dim1 + 1], c__1, NULL);
429-
}
430-
else
431-
#endif
432-
{
433-
aocl_blas_dgemv("No transpose", &m_t12, &n_t12_j, &c_b6,
434-
&t[1 + i__ * t_dim1], ldt, &t[i__ + j * t_dim1], &c__1,
435-
&c_b6, &t[j * t_dim1 + 1], &c__1);
436-
}
437-
/*
438-
* Update T22
439-
* T22(:, j) = T22(1:j-1, j) * T22(:, j)
440-
*/
441-
aocl_blas_dtrmv("Upper", "No transpose", "Non-unit", &n_t12_j,
442-
&t[i__ + i__ * t_dim1], ldt, &t[i__ + j * t_dim1], &c__1);
443-
}
444-
}
445-
}
446-
else
447-
{
448-
#endif
449250
for(i__ = 1; i__ <= i__1; ++i__)
450251
{
451252
prevlastv = fla_max(i__, prevlastv);
@@ -551,9 +352,6 @@ void aocl_lapack_dlarft(char *direct, char *storev, aocl_int64_t *n, aocl_int64_
551352
}
552353
}
553354
}
554-
#ifdef FLA_ENABLE_AMD_OPT
555-
}
556-
#endif
557355
}
558356
else
559357
{

0 commit comments

Comments
 (0)