@@ -247,205 +247,6 @@ void aocl_lapack_dlarft(char *direct, char *storev, aocl_int64_t *n, aocl_int64_
247247 {
248248 prevlastv = * n ;
249249 i__1 = * k ;
250- #ifdef FLA_ENABLE_AMD_OPT
251- if (lsame_ (storev , "C" , 1 , 1 ))
252- {
253- aocl_int64_t nb = FLA_DLARFT_BLOCK_NB ;
254- aocl_int64_t f_ncols = fla_min (nb , * k );
255- doublereal n_tau ;
256- doublereal diag_elem ;
257- aocl_int64_t block_last_i ;
258-
259- /*
260- * Let V be the matrix (n x k) of the elementary reflectors
261- *
262- *
263- * V is partitioned as follows:
264- *
265- * V = || V11 V12 ||
266- * || V21 V22 ||
267- * || V31 V32 ||
268- *
269- * Where,
270- * V11 is unit lower triangular
271- * V21 is rectangular
272- * V31 is rectangular
273- * V12 is rectangular
274- * V22 is rectangular
275- * V32 is rectangular
276- *
277- * Let T be the matrix (k x k) of the elementary reflectors
278- *
279- * T is paritioned as follows:
280- *
281- * T = || T11 T12 ||
282- * || T21 T22 ||
283- *
284- * Where,
285- * T11 is upper triangular
286- * T21 is zero
287- * T12 is rectangular
288- * T22 is upper triangular
289- *
290- *
291- */
292-
293- /* For the first column of T, we have T11(1,1) = tau[1] */
294- t [t_dim1 + 1 ] = tau [1 ];
295-
296- /*
297- * Generate T11 using unblocked algorithm
298- *
299- */
300- for (i__ = 2 ; i__ <= f_ncols ; ++ i__ )
301- {
302- prevlastv = fla_max (prevlastv , i__ );
303- aocl_int64_t m_lower_triangular = prevlastv - i__ + 1 ;
304- aocl_int64_t n_left = i__ - 1 ;
305- n_tau = - tau [i__ ];
306- diag_elem = v [i__ + i__ * v_dim1 ];
307- /* Explicitly make the diagonal element of V11 equal to 1 */
308- v [i__ + i__ * v_dim1 ] = 1. ;
309- /* T11(1:i-1,i) := - tau(i) * V11(i:j,1:i-1)**T * V11(i:j,i) */
310- #if FLA_ENABLE_AOCL_BLAS
311- if (FLA_IS_MIN_ARCH_ID (FLA_ARCH_AVX512 ))
312- {
313- bli_dgemv_t_zen4_int (BLIS_CONJUGATE , BLIS_NO_CONJUGATE , m_lower_triangular ,
314- n_left , & n_tau , & v [i__ + v_dim1 ], 1 , * ldv ,
315- & v [i__ + i__ * v_dim1 ], c__1 , & c_b0 , & t [i__ * t_dim1 + 1 ],
316- c__1 , NULL );
317- }
318- else
319- #endif
320- {
321- aocl_blas_dgemv ("Transpose" , & m_lower_triangular , & n_left , & n_tau ,
322- & v [i__ + v_dim1 ], ldv , & v [i__ + i__ * v_dim1 ], & c__1 , & c_b0 ,
323- & t [i__ * t_dim1 + 1 ], & c__1 );
324- }
325- /* Restore the diagonal element of V11 */
326- v [i__ + i__ * v_dim1 ] = diag_elem ;
327- /* T11(1:i-1,i) := T11(1:i-1,1:i-1) * T11(1:i-1,i) */
328- aocl_blas_dtrmv ("Upper" , "No transpose" , "Non-unit" , & n_left , & t [t_offset ], ldt ,
329- & t [i__ * t_dim1 + 1 ], & c__1 );
330- /* T11(i,i) := tau(i) */
331- t [i__ + i__ * t_dim1 ] = tau [i__ ];
332- }
333-
334- /* Process the remaining blocks from column nb + 1 to k */
335- for (i__ = nb + 1 ; i__ <= * k ; i__ += nb )
336- {
337- /* Using gemm for partial update of T12 */
338- block_last_i = fla_min (i__ + nb - 1 , * k );
339- aocl_int64_t n_v32 = block_last_i - i__ + 1 ;
340- aocl_int64_t m_v31 = fla_max (* n , block_last_i ) - block_last_i ;
341- aocl_int64_t n_v31 = i__ - 1 ;
342-
343- /* T12 = V31**T * V32 */
344- aocl_blas_dgemm ("Transpose" , "No transpose" , & n_v31 , & n_v32 , & m_v31 , & c_b6 ,
345- & v [block_last_i + 1 + v_dim1 ], ldv ,
346- & v [block_last_i + 1 + i__ * v_dim1 ], ldv , & c_b0 ,
347- & t [i__ * t_dim1 + 1 ], ldt );
348-
349- for (j = i__ ; j <= block_last_i ; ++ j )
350- {
351- aocl_int64_t m_v22_j = block_last_i - j + 1 ;
352- n_tau = - tau [j ];
353- diag_elem = v [j + j * v_dim1 ];
354- /* Explicitly make the diagonal element of V22 equal to 1 */
355- v [j + j * v_dim1 ] = 1. ;
356- /* Update T12
357- T12(:, j) = -tau[j] * T12(:, j) -tau[j] * V21**T * V22(:, j)
358- */
359- #if FLA_ENABLE_AOCL_BLAS
360- if (FLA_IS_MIN_ARCH_ID (FLA_ARCH_AVX512 ))
361- {
362- double n_tau_d = n_tau ;
363- bli_dgemv_t_zen4_int (BLIS_CONJUGATE , BLIS_NO_CONJUGATE , m_v22_j , n_v31 ,
364- & n_tau , & v [j + v_dim1 ], 1 , * ldv , & v [j + j * v_dim1 ],
365- c__1 , & n_tau_d , & t [j * t_dim1 + 1 ], c__1 , NULL );
366- }
367- else
368- #endif
369- {
370- aocl_blas_dgemv ("Transpose" , & m_v22_j , & n_v31 , & n_tau , & v [j + v_dim1 ], ldv ,
371- & v [j + j * v_dim1 ], & c__1 , & n_tau , & t [j * t_dim1 + 1 ],
372- & c__1 );
373- }
374-
375- /* V22_32 = || V22 ||
376- * || V32 ||
377- */
378-
379- aocl_int64_t m_v22_32_j = fla_max (* n , j ) - j + 1 ;
380- aocl_int64_t n_v22_32_j = j - i__ ;
381-
382- /* Update T22
383- * T22(:, j) = -tau[j] * V22_32(:,1:j-1)**T * V22_32(:, j)
384- */
385- #if FLA_ENABLE_AOCL_BLAS
386- if (FLA_IS_MIN_ARCH_ID (FLA_ARCH_AVX512 ))
387- {
388- bli_dgemv_t_zen4_int (BLIS_CONJUGATE , BLIS_NO_CONJUGATE , m_v22_32_j ,
389- n_v22_32_j , & n_tau , & v [j + v_dim1 * i__ ], 1 , * ldv ,
390- & v [j + j * v_dim1 ], c__1 , & c_b0 , & t [j * t_dim1 + i__ ],
391- c__1 , NULL );
392- }
393- else
394- #endif
395- {
396- aocl_blas_dgemv ("Transpose" , & m_v22_32_j , & n_v22_32_j , & n_tau ,
397- & v [j + v_dim1 * i__ ], ldv , & v [j + j * v_dim1 ], & c__1 , & c_b0 ,
398- & t [j * t_dim1 + i__ ], & c__1 );
399- }
400-
401- /* Restore the diagonal element of V22 */
402- v [j + j * v_dim1 ] = diag_elem ;
403- /* T22(j, j) = tau[j] */
404- t [j + j * t_dim1 ] = tau [j ];
405- }
406-
407- aocl_int64_t m_t12 = i__ - 1 ;
408- aocl_int64_t n_t12 = block_last_i - i__ + 1 ;
409-
410- /* Update T12
411- * T12 = T11 * T12
412- */
413- aocl_blas_dtrmm ("Left" , "Upper" , "No transpose" , "Non-unit" , & m_t12 , & n_t12 , & c_b6 ,
414- & t [t_offset ], ldt , & t [i__ * t_dim1 + 1 ], ldt );
415-
416- for (j = i__ + 1 ; j <= block_last_i ; ++ j )
417- {
418- aocl_int64_t n_t12_j = j - i__ ;
419- /* Update T12
420- * T12(:, j) = T12(:,j) + T12 * T12(:, j)
421- */
422- #if FLA_ENABLE_AOCL_BLAS
423- if (FLA_IS_MIN_ARCH_ID (FLA_ARCH_AVX512 ))
424- {
425- bli_dgemv_n_zen4_int_40x2_st (BLIS_NO_TRANSPOSE , BLIS_NO_CONJUGATE , m_t12 ,
426- n_t12_j , & c_b6 , & t [1 + i__ * t_dim1 ], 1 , * ldt ,
427- & t [i__ + j * t_dim1 ], c__1 , & c_b6 ,
428- & t [j * t_dim1 + 1 ], c__1 , NULL );
429- }
430- else
431- #endif
432- {
433- aocl_blas_dgemv ("No transpose" , & m_t12 , & n_t12_j , & c_b6 ,
434- & t [1 + i__ * t_dim1 ], ldt , & t [i__ + j * t_dim1 ], & c__1 ,
435- & c_b6 , & t [j * t_dim1 + 1 ], & c__1 );
436- }
437- /*
438- * Update T22
439- * T22(:, j) = T22(1:j-1, j) * T22(:, j)
440- */
441- aocl_blas_dtrmv ("Upper" , "No transpose" , "Non-unit" , & n_t12_j ,
442- & t [i__ + i__ * t_dim1 ], ldt , & t [i__ + j * t_dim1 ], & c__1 );
443- }
444- }
445- }
446- else
447- {
448- #endif
449250 for (i__ = 1 ; i__ <= i__1 ; ++ i__ )
450251 {
451252 prevlastv = fla_max (i__ , prevlastv );
@@ -551,9 +352,6 @@ void aocl_lapack_dlarft(char *direct, char *storev, aocl_int64_t *n, aocl_int64_
551352 }
552353 }
553354 }
554- #ifdef FLA_ENABLE_AMD_OPT
555- }
556- #endif
557355 }
558356 else
559357 {
0 commit comments