Initial attempt to wrap CUSOLVER.Xgeev (#47)

kshyatt · Katharine Hyatt · web-flow · commit 36ef2288067f · 2025-08-20T10:27:04.000Z
* Initial attempt to wrap CUSOLVER.Xgeev

* Remove unneeded using

* Fixup re comments

---------

Co-authored-by: Katharine Hyatt &lt;katharine.s.hyatt@gmail.com&gt;
diff --git a/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl b/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl
@@ -5,8 +5,8 @@ using MatrixAlgebraKit: @algdef, Algorithm, check_input
 using MatrixAlgebraKit: one!, zero!, uppertriangular!, lowertriangular!
 using MatrixAlgebraKit: diagview, sign_safe
 using MatrixAlgebraKit: LQViaTransposedQR
-using MatrixAlgebraKit: default_qr_algorithm, default_lq_algorithm, default_svd_algorithm
-import MatrixAlgebraKit: _gpu_geqrf!, _gpu_ungqr!, _gpu_unmqr!, _gpu_gesvd!, _gpu_Xgesvdp!, _gpu_Xgesvdr!, _gpu_gesvdj!
+using MatrixAlgebraKit: default_qr_algorithm, default_lq_algorithm, default_svd_algorithm, default_eig_algorithm
+import MatrixAlgebraKit: _gpu_geqrf!, _gpu_ungqr!, _gpu_unmqr!, _gpu_gesvd!, _gpu_Xgesvdp!, _gpu_Xgesvdr!, _gpu_gesvdj!, _gpu_geev!
 using CUDA
 using LinearAlgebra
 using LinearAlgebra: BlasFloat
@@ -23,8 +23,12 @@ end
 function MatrixAlgebraKit.default_svd_algorithm(::Type{T}; kwargs...) where {T<:StridedCuMatrix}
     return CUSOLVER_QRIteration(; kwargs...)
 end
+function MatrixAlgebraKit.default_eig_algorithm(::Type{T}; kwargs...) where {T<:StridedCuMatrix}
+    return CUSOLVER_Simple(; kwargs...)
+end
 
 
+_gpu_geev!(A::StridedCuMatrix, D::StridedCuVector, V::StridedCuMatrix) = YACUSOLVER.Xgeev!(A, D, V)
 _gpu_geqrf!(A::StridedCuMatrix) = YACUSOLVER.geqrf!(A)
 _gpu_ungqr!(A::StridedCuMatrix, τ::StridedCuVector) = YACUSOLVER.ungqr!(A, τ)
 _gpu_unmqr!(side::AbstractChar, trans::AbstractChar, A::StridedCuMatrix, τ::StridedCuVector, C::StridedCuVecOrMat) = YACUSOLVER.unmqr!(side, trans, A, τ, C)
diff --git a/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl b/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl
@@ -5,7 +5,7 @@ using LinearAlgebra: BlasInt, BlasFloat, checksquare, chkstride1, require_one_ba
 using LinearAlgebra.LAPACK: chkargsok, chklapackerror, chktrans, chkside, chkdiag, chkuplo
 
 using CUDA
-using CUDA: @allowscalar
+using CUDA: @allowscalar, i32
 using CUDA.CUSOLVER
 
 # QR methods are implemented with full access to allocated arrays, so we do not need to redo this:
@@ -306,6 +306,73 @@ function Xgesvdr!(A::StridedCuMatrix{T},
     return S, U, Vᴴ
 end
 
+# Wrapper for general eigensolver 
+for (celty, elty) in ((:ComplexF32, :Float32), (:ComplexF64, :Float64), (:ComplexF32, :ComplexF32), (:ComplexF64, :ComplexF64))
+    @eval begin
+        function Xgeev!(A::StridedCuMatrix{$elty}, D::StridedCuVector{$celty}, V::StridedCuMatrix{$celty})
+            require_one_based_indexing(A, V, D)
+            chkstride1(A, V, D)
+            n = checksquare(A)
+            # TODO GPU appropriate version
+            #chkfinite(A) # balancing routines don't support NaNs and Infs
+            n == length(D) || throw(DimensionMismatch("length mismatch between A and D"))
+            if length(V) == 0
+                jobvr = 'N'
+            elseif length(V) == n*n
+                jobvr = 'V'
+            else
+                throw(DimensionMismatch("size of VR must match size of A"))
+            end
+            jobvl = 'N' # required by API for now (https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdnxgeev)
+            #=if length(VL) == 0
+                jobvl = 'N'
+            elseif length(VL) == n*n
+                jobvl = 'V'
+            else
+                throw(DimensionMismatch("size of VL must match size of A"))
+            end=#
+            VL   = similar(A, n, 0)
+            lda  = max(1, stride(A, 2))
+            ldvl = max(1, stride(VL, 2))
+            params = CUSOLVER.CuSolverParameters()
+            dh = CUSOLVER.dense_handle()
+            
+            if $elty <: Real
+                D2 = reinterpret($elty, D)
+                # reuse memory, we will have to reorder afterwards to bring real and imaginary
+                # components in the order as required for the Complex type
+                VR = reinterpret($elty, V)
+            else
+                D2 = D
+                VR = V
+            end
+            ldvr = max(1, stride(VR, 2))
+            function bufferSize()
+                out_cpu = Ref{Csize_t}(0)
+                out_gpu = Ref{Csize_t}(0)
+                CUSOLVER.cusolverDnXgeev_bufferSize(dh, params, jobvl, jobvr, n, $elty, A,
+                                           lda, $elty, D2, $elty, VL, ldvl, $elty, VR, ldvr,
+                                           $elty, out_gpu, out_cpu)
+                out_gpu[], out_cpu[]
+            end
+            CUDA.with_workspaces(dh.workspace_gpu, dh.workspace_cpu, bufferSize()...) do buffer_gpu, buffer_cpu
+                CUSOLVER.cusolverDnXgeev(dh, params, jobvl, jobvr, n, $elty, A, lda, $elty,
+                                         D2, $elty, VL, ldvl, $elty, VR, ldvr, $elty, buffer_gpu,
+                                         sizeof(buffer_gpu), buffer_cpu, sizeof(buffer_cpu), dh.info)
+            end
+            flag = @allowscalar dh.info[1]
+            CUSOLVER.chkargsok(BlasInt(flag))
+            if eltype(A) <: Real
+                work = CuVector{$elty}(undef, n)
+                DR = view(D2, 1:n)
+                DI = view(D2, (n + 1):(2n))
+                _reorder_realeigendecomposition!(D, DR, DI, work, VR, jobvr)
+            end
+            return D, V
+        end
+    end
+end
+
 # for (jname, bname, fname, elty, relty) in
 #     ((:sygvd!, :cusolverDnSsygvd_bufferSize, :cusolverDnSsygvd, :Float32, :Float32),
 #      (:sygvd!, :cusolverDnDsygvd_bufferSize, :cusolverDnDsygvd, :Float64, :Float64),
@@ -650,4 +717,57 @@ end
 #     end
 # end
 
+# device code is unreachable by coverage right now
+# COV_EXCL_START
+# TODO use a shmem array here
+function _reorder_kernel_real(real_ev_ixs, VR::CuDeviceArray{T}, n::Int) where {T}
+    grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
+    @inbounds if grid_idx <= length(real_ev_ixs)
+        i = real_ev_ixs[grid_idx]
+        for j in n:-1:1
+            VR[2 * j, i] = zero(T) 
+            VR[2 * j - 1, i] = VR[j, i]
+        end
+    end
+    return
+end
+
+function _reorder_kernel_complex(complex_ev_ixs, VR::CuDeviceArray{T}, n::Int) where {T}
+    grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
+    @inbounds if grid_idx <= length(complex_ev_ixs)
+        i = complex_ev_ixs[grid_idx]
+        for j in n:-1:1
+            VR[2 * j, i] = VR[j, i + 1]
+            VR[2 * j - 1, i] = VR[j, i]
+            VR[2 * j, i + 1] = -VR[j, i + 1]
+            VR[2 * j - 1, i + 1] = VR[j, i]
+        end
+    end
+    return
+end
+# COV_EXCL_STOP
+
+function _reorder_realeigendecomposition!(W, WR, WI, work, VR, jobvr)
+# first reorder eigenvalues and recycle work as temporary buffer to efficiently implement the permutation
+    copy!(work, WI)
+    n = size(W, 1)
+    @. W[1:n] = WR[1:n] + im * work[1:n]
+    T = eltype(WR)
+    if jobvr == 'V' # also reorganise vectors
+        real_ev_ixs    = findall(isreal, W)
+        _cmplx_ev_ixs  = findall(!isreal, W) # these come in pairs, choose only the first of each pair
+        complex_ev_ixs = view(_cmplx_ev_ixs, 1:2:length(_cmplx_ev_ixs))
+        if !isempty(real_ev_ixs)
+            real_threads = 128
+            real_blocks = max(1, div(length(real_ev_ixs), real_threads))
+            @cuda threads=real_threads blocks=real_blocks _reorder_kernel_real(real_ev_ixs, VR, n)
+        end
+        if !isempty(complex_ev_ixs)
+            complex_threads = 128
+            complex_blocks = max(1, div(length(complex_ev_ixs), complex_threads))
+            @cuda threads=complex_threads blocks=complex_blocks _reorder_kernel_complex(complex_ev_ixs, VR, n)
+        end
+    end
+end
+
 end
diff --git a/src/MatrixAlgebraKit.jl b/src/MatrixAlgebraKit.jl
@@ -33,6 +33,7 @@ export LAPACK_HouseholderQR, LAPACK_HouseholderLQ,
        LAPACK_QRIteration, LAPACK_Bisection, LAPACK_MultipleRelativelyRobustRepresentations,
        LAPACK_DivideAndConquer, LAPACK_Jacobi,
        LQViaTransposedQR,
+       CUSOLVER_Simple,
        CUSOLVER_HouseholderQR, CUSOLVER_QRIteration, CUSOLVER_SVDPolar, CUSOLVER_Jacobi, CUSOLVER_Randomized,
        ROCSOLVER_HouseholderQR, ROCSOLVER_QRIteration, ROCSOLVER_Jacobi
 export truncrank, trunctol, truncabove, TruncationKeepSorted, TruncationKeepFiltered
diff --git a/src/implementations/eig.jl b/src/implementations/eig.jl
@@ -82,3 +82,29 @@ function eig_trunc!(A::AbstractMatrix, DV, alg::TruncatedAlgorithm)
     D, V = eig_full!(A, DV, alg.alg)
     return truncate!(eig_trunc!, (D, V), alg.trunc)
 end
+
+_gpu_geev!(A::AbstractMatrix, D, V) = throw(MethodError(_gpu_geev!, (A, D, V)))
+
+function eig_full!(A::AbstractMatrix, DV, alg::GPU_EigAlgorithm)
+    check_input(eig_full!, A, DV, alg)
+    D, V = DV
+    if alg isa GPU_Simple
+        isempty(alg.kwargs) ||
+            throw(ArgumentError("GPU_Simple (geev) does not accept any keyword arguments"))
+        _gpu_geev!(A, D.diag, V)
+    end
+    # TODO: make this controllable using a `gaugefix` keyword argument
+    V = gaugefix!(V)
+    return D, V
+end
+
+function eig_vals!(A::AbstractMatrix, D, alg::GPU_EigAlgorithm)
+    check_input(eig_vals!, A, D, alg)
+    V  = similar(A, complex(eltype(A)), (size(A, 1), 0))
+    if alg isa GPU_Simple
+        isempty(alg.kwargs) ||
+            throw(ArgumentError("LAPACK_Simple (geev) does not accept any keyword arguments"))
+        _gpu_geev!(A, D, V)
+    end
+    return D
+end
diff --git a/src/interface/decompositions.jl b/src/interface/decompositions.jl
@@ -164,6 +164,15 @@ a general matrix using the randomized SVD algorithm.
 """
 @algdef CUSOLVER_Randomized
 
+"""
+    CUSOLVER_Simple()
+
+Algorithm type to denote the simple CUSOLVER driver for computing the non-Hermitian
+eigenvalue decomposition of a matrix.
+"""
+@algdef CUSOLVER_Simple
+
+const CUSOLVER_EigAlgorithm = Union{CUSOLVER_Simple}
 # =========================
 # ROCSOLVER ALGORITHMS
 # =========================
@@ -192,3 +201,6 @@ Algorithm type to denote the ROCSOLVER driver for computing the singular value d
 a general matrix using the Jacobi algorithm.
 """
 @algdef ROCSOLVER_Jacobi
+
+const GPU_Simple = Union{CUSOLVER_Simple}
+const GPU_EigAlgorithm = Union{GPU_Simple}
diff --git a/test/cuda/eig.jl b/test/cuda/eig.jl
@@ -0,0 +1,63 @@
+using MatrixAlgebraKit
+using LinearAlgebra: Diagonal
+using Test
+using TestExtras
+using StableRNGs
+using CUDA
+
+include(joinpath("..", "utilities.jl"))
+
+@testset "eig_full! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64)
+    rng = StableRNG(123)
+    m = 54
+    for alg in (CUSOLVER_Simple(), :CUSOLVER_Simple, CUSOLVER_Simple)
+        A = CuArray(randn(rng, T, m, m))
+        Tc = complex(T)
+
+        D, V = @constinferred eig_full(A; alg=($alg))
+        @test eltype(D) == eltype(V) == Tc
+        @test A * V ≈ V * D
+
+        alg′ = @constinferred MatrixAlgebraKit.select_algorithm(eig_full!, A, $alg)
+
+        Ac = similar(A)
+        D2, V2 = @constinferred eig_full!(copy!(Ac, A), (D, V), alg′)
+        @test D2 === D
+        @test V2 === V
+        @test A * V ≈ V * D
+
+        Dc = @constinferred eig_vals(A, alg′)
+        @test eltype(Dc) == Tc
+        @test parent(D) ≈ Dc
+    end
+end
+
+#=
+@testset "eig_trunc! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64)
+    rng = StableRNG(123)
+    m = 54
+    for alg in (CUSOLVER_Simple(),)
+        A = CuArray(randn(rng, T, m, m))
+        A *= A' # TODO: deal with eigenvalue ordering etc
+        # eigenvalues are sorted by ascending real component...
+        D₀ = sort!(eig_vals(A); by=abs, rev=true)
+        rmin = findfirst(i -> abs(D₀[end - i]) != abs(D₀[end - i - 1]), 1:(m - 2))
+        r = length(D₀) - rmin
+
+        D1, V1 = @constinferred eig_trunc(A; alg, trunc=truncrank(r))
+        @test length(D1.diag) == r
+        @test A * V1 ≈ V1 * D1
+
+        s = 1 + sqrt(eps(real(T)))
+        trunc = trunctol(s * abs(D₀[r + 1]))
+        D2, V2 = @constinferred eig_trunc(A; alg, trunc)
+        @test length(diagview(D2)) == r
+        @test A * V2 ≈ V2 * D2
+
+        # trunctol keeps order, truncrank might not
+        # test for same subspace
+        @test V1 * ((V1' * V1) \ (V1' * V2)) ≈ V2
+        @test V2 * ((V2' * V2) \ (V2' * V1)) ≈ V1
+    end
+end
+=#
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -63,6 +63,9 @@ if CUDA.functional()
     @safetestset "CUDA SVD" begin
         include("cuda/svd.jl")
     end
+    @safetestset "CUDA General Eigenvalue Decomposition" begin
+        include("cuda/eig.jl")
+    end
 end
 
 using AMDGPU