@@ -5,7 +5,7 @@ using LinearAlgebra: BlasInt, BlasFloat, checksquare, chkstride1, require_one_ba
55using LinearAlgebra. LAPACK: chkargsok, chklapackerror, chktrans, chkside, chkdiag, chkuplo
66
77using CUDA
8- using CUDA: @allowscalar
8+ using CUDA: @allowscalar , i32
99using CUDA. CUSOLVER
1010
1111# QR methods are implemented with full access to allocated arrays, so we do not need to redo this:
@@ -306,6 +306,73 @@ function Xgesvdr!(A::StridedCuMatrix{T},
306306 return S, U, Vᴴ
307307end
308308
309+ # Wrapper for general eigensolver
# Xgeev!(A, D, V) -> (D, V)
#
# Wrapper around `cusolverDnXgeev` for the general (non-symmetric) eigenvalue problem.
#
# Arguments
#   A : square `n × n` input matrix; handed to cuSOLVER as the in/out matrix argument.
#   D : length-`n` complex vector that receives the eigenvalues.
#   V : either an empty matrix (only eigenvalues are requested, `jobvr = 'N'`) or an
#       `n × n` complex matrix that receives the right eigenvectors (`jobvr = 'V'`).
#
# Throws `DimensionMismatch` when `D` does not have length `n`, or when `V` is neither empty
# nor of size `n × n`.
#
# For real `A` the solver is called in real arithmetic on a reinterpretation of the memory
# of `D` and `V`; the result is afterwards rearranged in place into complex layout by
# `_reorder_realeigendecomposition!`.
for (celty, elty) in ((:ComplexF32, :Float32), (:ComplexF64, :Float64),
                      (:ComplexF32, :ComplexF32), (:ComplexF64, :ComplexF64))
    @eval begin
        function Xgeev!(A::StridedCuMatrix{$elty}, D::StridedCuVector{$celty},
                        V::StridedCuMatrix{$celty})
            require_one_based_indexing(A, V, D)
            chkstride1(A, V, D)
            n = checksquare(A)
            # TODO GPU appropriate version
            # chkfinite(A) # balancing routines don't support NaNs and Infs
            n == length(D) || throw(DimensionMismatch("length mismatch between A and D"))
            if length(V) == 0
                jobvr = 'N'
            elseif size(V) == (n, n)
                # The full shape is checked (not just the element count): the leading
                # dimension passed to the solver and the reordering step below both
                # assume one column of V per eigenvector.
                jobvr = 'V'
            else
                throw(DimensionMismatch("size of VR must match size of A"))
            end
            # Left eigenvectors are not available: 'N' is required by the API for now
            # (https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdnxgeev).
            # Once supported, VL should be validated the same way as V above.
            jobvl = 'N'
            VL = similar(A, n, 0)
            lda = max(1, stride(A, 2))
            ldvl = max(1, stride(VL, 2))
            params = CUSOLVER.CuSolverParameters()
            dh = CUSOLVER.dense_handle()

            if $elty <: Real
                # Reuse the memory of D and V for the real-valued solver output; it has to
                # be reordered afterwards to bring real and imaginary components into the
                # order required for the Complex type.
                D2 = reinterpret($elty, D)
                VR = reinterpret($elty, V)
            else
                D2 = D
                VR = V
            end
            ldvr = max(1, stride(VR, 2))

            # Query the required device and host workspace sizes.
            function bufferSize()
                out_cpu = Ref{Csize_t}(0)
                out_gpu = Ref{Csize_t}(0)
                CUSOLVER.cusolverDnXgeev_bufferSize(dh, params, jobvl, jobvr, n, $elty, A,
                                                    lda, $elty, D2, $elty, VL, ldvl, $elty,
                                                    VR, ldvr, $elty, out_gpu, out_cpu)
                return out_gpu[], out_cpu[]
            end
            CUDA.with_workspaces(dh.workspace_gpu, dh.workspace_cpu,
                                 bufferSize()...) do buffer_gpu, buffer_cpu
                CUSOLVER.cusolverDnXgeev(dh, params, jobvl, jobvr, n, $elty, A, lda, $elty,
                                         D2, $elty, VL, ldvl, $elty, VR, ldvr, $elty,
                                         buffer_gpu, sizeof(buffer_gpu), buffer_cpu,
                                         sizeof(buffer_cpu), dh.info)
            end
            flag = @allowscalar dh.info[1]
            # NOTE(review): presumably this only rejects invalid-argument codes; a positive
            # info value (solver failure) would then pass silently -- confirm.
            CUSOLVER.chkargsok(BlasInt(flag))
            if $elty <: Real
                work = CuVector{$elty}(undef, n)
                DR = view(D2, 1:n)
                DI = view(D2, (n + 1):(2n))
                _reorder_realeigendecomposition!(D, DR, DI, work, VR, jobvr)
            end
            return D, V
        end
    end
end
375+
309376# for (jname, bname, fname, elty, relty) in
310377# ((:sygvd!, :cusolverDnSsygvd_bufferSize, :cusolverDnSsygvd, :Float32, :Float32),
311378# (:sygvd!, :cusolverDnDsygvd_bufferSize, :cusolverDnDsygvd, :Float64, :Float64),
650717# end
651718# end
652719
720+ # device code is unreachable by coverage right now
721+ # COV_EXCL_START
722+ # TODO use a shmem array here
# Device kernel: one thread per entry of `real_ev_ixs`. For the selected column of `VR`, the
# first `n` entries are spread out in place so that entry `j` ends up at row `2j - 1` and
# row `2j` is set to zero, i.e. interleaved (value, 0) pairs.
function _reorder_kernel_real(real_ev_ixs, VR::CuDeviceArray{T}, n::Int) where {T}
    tid = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
    # threads beyond the number of requested columns have nothing to do
    tid <= length(real_ev_ixs) || return
    @inbounds begin
        col = real_ev_ixs[tid]
        # walk from the bottom up so that entries which still have to be read are never
        # overwritten by the expanded output
        row = n
        while row >= 1
            val = VR[row, col]
            VR[2 * row, col] = zero(T)
            VR[2 * row - 1, col] = val
            row -= 1
        end
    end
    return
end
734+
# Device kernel: one thread per entry of `complex_ev_ixs`. Each entry `i` selects the column
# pair `(i, i + 1)` of `VR`. With `a = VR[j, i]` and `b = VR[j, i + 1]` taken from the first
# `n` rows, column `i` is rewritten in place as interleaved `(a, b)` pairs and column
# `i + 1` as interleaved `(a, -b)` pairs.
function _reorder_kernel_complex(complex_ev_ixs, VR::CuDeviceArray{T}, n::Int) where {T}
    tid = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
    # threads beyond the number of requested column pairs have nothing to do
    tid <= length(complex_ev_ixs) || return
    @inbounds begin
        col = complex_ev_ixs[tid]
        partner = col + 1
        # walk from the bottom up so that entries which still have to be read are never
        # overwritten by the expanded output
        row = n
        while row >= 1
            a = VR[row, col]
            b = VR[row, partner]
            VR[2 * row, col] = b
            VR[2 * row - 1, col] = a
            VR[2 * row, partner] = -b
            VR[2 * row - 1, partner] = a
            row -= 1
        end
    end
    return
end
748+ # COV_EXCL_STOP
749+
# Rearrange the real-arithmetic output of the general eigensolver into complex layout.
#
#   W      : complex eigenvalue vector that is overwritten with `WR + im * WI`
#   WR, WI : real and imaginary parts of the eigenvalues
#   work   : scratch vector with room for a copy of `WI`
#   VR     : real matrix holding the eigenvector data, expanded in place by the kernels
#   jobvr  : the eigenvector columns are only touched when this equals 'V'
#
# Always returns `nothing`; the results are written into `W` and `VR`.
function _reorder_realeigendecomposition!(W, WR, WI, work, VR, jobvr)
    # First reorder the eigenvalues. `work` is recycled as a temporary buffer for the
    # imaginary parts so they survive while W is being overwritten (the caller visible in
    # this file passes WR and WI as views into the memory of W).
    copy!(work, WI)
    n = size(W, 1)
    @. W[1:n] = WR[1:n] + im * work[1:n]

    jobvr == 'V' || return nothing # eigenvectors were not requested

    # also reorganise vectors
    real_ev_ixs = findall(isreal, W)
    # complex eigenvalues come in pairs, choose only the first of each pair
    _cmplx_ev_ixs = findall(!isreal, W)
    complex_ev_ixs = view(_cmplx_ev_ixs, 1:2:length(_cmplx_ev_ixs))

    # The kernels process one index per thread, so the number of blocks has to be rounded
    # *up*; rounding down would leave the trailing indices without a thread.
    if !isempty(real_ev_ixs)
        real_threads = 128
        real_blocks = cld(length(real_ev_ixs), real_threads)
        @cuda threads=real_threads blocks=real_blocks _reorder_kernel_real(real_ev_ixs, VR, n)
    end
    if !isempty(complex_ev_ixs)
        complex_threads = 128
        complex_blocks = cld(length(complex_ev_ixs), complex_threads)
        @cuda threads=complex_threads blocks=complex_blocks _reorder_kernel_complex(complex_ev_ixs, VR, n)
    end
    return nothing
end
772+
653773end
0 commit comments