fix: improvements to bypass scalar indexing and improve GPU support (QuantumKitHub#375)

kshyatt · lkdvos · web-flow · commit e3bdab4aee8b · 2026-05-19T09:29:38.000-04:00
* More tweaks for GPU support

* fix typo

* Fix TC once again

* Remove unneeded Adjoint methods

* Remove unneeded TensorMapWithStorage?

* Death to to_cpu

* Remove unneeded similarstoragetype method

* Add in TensorMap constructor

* Restore former braiding tensor methods

* Fix type issue for sortperm

* Remove stale type params

* Apply suggestions from code review

Co-authored-by: Lukas Devos &lt;ldevos98@gmail.com&gt;

* Fix bad result of suggestion

* Another fix?

* Force inds to move back to the CPU

* Return to glorious scalartype

* Restore DiagonalTensorMap ctor

* Resolve trunc ambiguity

* Remove extra CUDA ctor

* Restore chopped argument

* Also remove no longer needed method

* Remove forced Int eltype

* Get rid of no-op ctor

* Try to resolve ambiguity

* Cover all truncation strategies

* Short-circuit logic in `findtruncated`

Co-authored-by: Lukas Devos &lt;ldevos98@gmail.com&gt;

* Formatter

---------

Co-authored-by: Lukas Devos &lt;ldevos98@gmail.com&gt;
diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -101,18 +101,6 @@ function TensorKit.scalar(t::CuTensorMap{T, S, 0, 0}) where {T, S}
     return isempty(inds) ? zero(scalartype(t)) : @allowscalar @inbounds t.data[only(inds)]
 end
 
-function Base.convert(
-        TT::Type{CuTensorMap{T, S, N₁, N₂}},
-        t::AbstractTensorMap{<:Any, S, N₁, N₂}
-    ) where {T, S, N₁, N₂}
-    if typeof(t) === TT
-        return t
-    else
-        tnew = TT(undef, space(t))
-        return copy!(tnew, t)
-    end
-end
-
 function LinearAlgebra.isposdef(t::CuTensorMap)
     domain(t) == codomain(t) ||
         throw(SpaceMismatch("`isposdef` requires domain and codomain to be the same"))
@@ -138,10 +126,9 @@ function Base.promote_rule(
     return CuTensorMap{T, S, N₁, N₂}
 end
 
-TensorKit.promote_storage_rule(::Type{CuArray{T, N}}, ::Type{<:CuArray{T, N}}) where {T, N} =
+TensorKit.promote_storage_rule(::Type{<:CuArray{T, N}}, ::Type{<:CuArray{T, N}}) where {T, N} =
     CuArray{T, N, CUDA.default_memory}
 
-
 # CuTensorMap exponentation:
 function TensorKit.exp!(t::CuTensorMap)
     domain(t) == codomain(t) ||
diff --git a/ext/TensorKitCUDAExt/truncation.jl b/ext/TensorKitCUDAExt/truncation.jl
@@ -10,6 +10,8 @@ function MatrixAlgebraKit.findtruncated(
         fill!(v, dim(c))
     end
 
+    isempty(parent(values)) && return similar(values, Bool)
+
     perm = sortperm(parent(values); strategy.by, strategy.rev)
     cumulative_dim = cumsum(Base.permute!(parent(dims), perm))
 
@@ -36,6 +38,8 @@ function MatrixAlgebraKit.findtruncated(
         end
     end
 
+    isempty(parent(values)) && return similar(values, Bool)
+
     perm = sortperm(parent(values); by = abs, rev = false)
     cumulative_err = cumsum(Base.permute!(parent(ϵᵖ), perm))
 
@@ -44,6 +48,26 @@ function MatrixAlgebraKit.findtruncated(
     return result
 end
 
+function MatrixAlgebraKit.findtruncated_svd(values::CuSectorVector, strategy::S) where {S <: MatrixAlgebraKit.TruncationStrategy}
+    # returning a CuSectorVector wrecks things in truncate_{co}domain
+    # because of scalar indexing
+    return CUDA.CUDACore.Adapt.adapt(Vector, MatrixAlgebraKit.findtruncated(values, strategy))
+end
+
+for strat in (:(MatrixAlgebraKit.TruncationByOrder), :(MatrixAlgebraKit.TruncationByError), :(MatrixAlgebraKit.TruncationIntersection), :(TensorKit.Factorizations.TruncationSpace))
+    @eval function MatrixAlgebraKit.findtruncated_svd(values::CuSectorVector, strategy::$strat)
+        # returning a CuSectorVector wrecks things in truncate_{co}domain
+        # because of scalar indexing
+        return CUDA.CUDACore.Adapt.adapt(Vector, MatrixAlgebraKit.findtruncated(values, strategy))
+    end
+end
+
+function MatrixAlgebraKit.findtruncated_svd(values::CuSectorVector, strategy::MatrixAlgebraKit.TruncationByValue)
+    atol = TensorKit.Factorizations.rtol_to_atol(values, strategy.p, strategy.atol, strategy.rtol)
+    strategy′ = trunctol(; atol, strategy.by, strategy.keep_below)
+    return SectorDict(c => CUDA.CUDACore.Adapt.adapt(Vector, MatrixAlgebraKit.findtruncated_svd(d, strategy′)) for (c, d) in pairs(values))
+end
+
 # Needed until MatrixAlgebraKit patch hits...
 function MatrixAlgebraKit._ind_intersect(A::CuVector{Bool}, B::CuVector{Int})
     result = fill!(similar(A), false)
diff --git a/src/tensors/abstracttensor.jl b/src/tensors/abstracttensor.jl
@@ -53,9 +53,7 @@ storagetype(t) = storagetype(typeof(t))
 function storagetype(::Type{T}) where {T <: AbstractTensorMap}
     if T isa Union
         # attempt to be slightly more specific by promoting unions
-        Ma = storagetype(T.a)
-        Mb = storagetype(T.b)
-        return promote_storagetype(Ma, Mb)
+        return promote_storagetype(T.a, T.b)
     else
         # fallback definition by using scalartype
         return similarstoragetype(scalartype(T))
@@ -103,8 +101,9 @@ similarstoragetype(X::Type, ::Type{T}) where {T <: Number} =
 
 # implement on tensors
 similarstoragetype(::Type{TT}) where {TT <: AbstractTensorMap} = similarstoragetype(storagetype(TT))
-similarstoragetype(::Type{TT}, ::Type{T}) where {TT <: AbstractTensorMap, T <: Number} =
-    similarstoragetype(storagetype(TT), T)
+function similarstoragetype(::Type{TT}, ::Type{T}) where {TT <: AbstractTensorMap, T <: Number}
+    return similarstoragetype(storagetype(TT), T)
+end
 
 # implement on arrays
 similarstoragetype(::Type{A}) where {A <: DenseVector{<:Number}} = A
diff --git a/src/tensors/adjoint.jl b/src/tensors/adjoint.jl
@@ -50,8 +50,6 @@ Base.@propagate_inbounds function subblock(t::AdjointTensorMap, (f₁, f₂)::Tu
     return permutedims(conj(data), (domainind(tp)..., codomainind(tp)...))
 end
 
-to_cpu(t::AdjointTensorMap) = adjoint(to_cpu(adjoint(t)))
-
 # Show
 #------
 function Base.showarg(io::IO, t::AdjointTensorMap, toplevel::Bool)
diff --git a/src/tensors/tensor.jl b/src/tensors/tensor.jl
@@ -21,7 +21,6 @@ struct TensorMap{T, S <: IndexSpace, N₁, N₂, A <: DenseVector{T}} <: Abstrac
         end
         return TensorMap{T, S, N₁, N₂, A}(data, space)
     end
-
     # constructors from data
     function TensorMap{T, S, N₁, N₂, A}(
             data::A, space::TensorMapSpace{S, N₁, N₂}
@@ -34,6 +33,7 @@ struct TensorMap{T, S <: IndexSpace, N₁, N₂, A <: DenseVector{T}} <: Abstrac
         return new{T, S, N₁, N₂, A}(data, space)
     end
 end
+TensorMap{T, S, N₁, N₂, A}(t::TensorMap{T, S, N₁, N₂}) where {T, S <: IndexSpace, N₁, N₂, A <: DenseVector{T}} = TensorMap(A(t.data), space(t))
 
 """
     Tensor{T, S, N, A<:DenseVector{T}} = TensorMap{T, S, N, 0, A}
@@ -407,11 +407,6 @@ for randf in (:rand, :randn, :randexp, :randisometry)
     end
 end
 
-# Moving arbitrary TensorMaps to CPU
-#-----------------------------
-to_cpu(t::TensorMapWithStorage{T, Vector{T}}) where {T} = t # no op
-to_cpu(t::TensorMap) = convert(TensorMapWithStorage{scalartype(t), similarstoragetype(scalartype(t))}, t)
-
 # Efficient copy constructors
 #-----------------------------
 Base.copy(t::TensorMap) = typeof(t)(copy(t.data), t.space)
diff --git a/test/amd/tensors.jl b/test/amd/tensors.jl
@@ -97,7 +97,7 @@ for V in spacelist
             for T in (Int, Float32, ComplexF64)
                 t = @constinferred AMDGPU.rand(T, W)
                 d = convert(Dict, t)
-                @test TensorKit.to_cpu(t) == convert(TensorMap, d)
+                @test adapt(Array, t) == convert(TensorMap, d)
             end
         end
         symmetricbraiding && @timedtestset "Basic linear algebra" begin
@@ -189,10 +189,10 @@ for V in spacelist
                     t = AMDGPU.rand(T, W)
                     t2 = @constinferred AMDGPU.rand!(similar(t))
                     α = rand(T)
-                    @test norm(t, 2) ≈ norm(TensorKit.to_cpu(t), 2)
-                    @test dot(t2, t) ≈ dot(TensorKit.to_cpu(t2), TensorKit.to_cpu(t))
-                    @test TensorKit.to_cpu(α * t) ≈ α * TensorKit.to_cpu(t)
-                    @test TensorKit.to_cpu(t + t) ≈ 2 * TensorKit.to_cpu(t)
+                    @test norm(t, 2) ≈ norm(adapt(Array, t), 2)
+                    @test dot(t2, t) ≈ dot(adapt(Array, t2), adapt(Array, t))
+                    @test adapt(Array, α * t) ≈ α * adapt(Array, t)
+                    @test adapt(Array, t + t) ≈ 2 * adapt(Array, t)
                 end
             end
             @timedtestset "Real and imaginary parts" begin
@@ -202,17 +202,17 @@ for V in spacelist
 
                     tr = @constinferred real(t)
                     @test scalartype(tr) <: Real
-                    @test real(TensorKit.to_cpu(t)) == TensorKit.to_cpu(tr)
+                    @test real(adapt(Array, t)) == adapt(Array, tr)
                     @test storagetype(tr) == ROCVector{real(T), AMDGPU.Mem.HIPBuffer}
 
                     ti = @constinferred imag(t)
                     @test scalartype(ti) <: Real
-                    @test imag(TensorKit.to_cpu(t)) == TensorKit.to_cpu(ti)
+                    @test imag(adapt(Array, t)) == adapt(Array, ti)
                     @test storagetype(ti) == ROCVector{real(T), AMDGPU.Mem.HIPBuffer}
 
                     tc = @inferred complex(t)
                     @test scalartype(tc) <: Complex
-                    @test complex(TensorKit.to_cpu(t)) == TensorKit.to_cpu(tc)
+                    @test complex(adapt(Array, t)) == adapt(Array, tc)
                     @test storagetype(tc) == ROCVector{complex(T), AMDGPU.Mem.HIPBuffer}
 
                     tc2 = @inferred complex(tr, ti)
@@ -275,13 +275,13 @@ for V in spacelist
                     p1 = ntuple(n -> p[n], k)
                     p2 = ntuple(n -> p[k + n], 5 - k)
                     dt2 = AMDGPU.@allowscalar permute(t, (p1, p2))
-                    ht2 = permute(TensorKit.to_cpu(t), (p1, p2))
-                    @test ht2 == TensorKit.to_cpu(dt2)
+                    ht2 = permute(adapt(Array, t), (p1, p2))
+                    @test ht2 == adapt(Array, dt2)
                 end
 
                 dt3 = AMDGPU.@allowscalar repartition(t, k)
-                ht3 = repartition(TensorKit.to_cpu(t), k)
-                @test ht3 == TensorKit.to_cpu(dt3)
+                ht3 = repartition(adapt(Array, t), k)
+                @test ht3 == adapt(Array, dt3)
             end
         end
         symmetricbraiding && @timedtestset "Full trace: test self-consistency" begin
@@ -339,10 +339,10 @@ for V in spacelist
                 @tensor dHrA12[a, s1, s2, c] := drhoL[a, a'] * conj(dA1[a', t1, b]) *
                     dA2[b, t2, c'] * drhoR[c', c] *
                     dH[s1, s2, t1, t2]
-                @tensor hHrA12[a, s1, s2, c] := TensorKit.to_cpu(drhoL)[a, a'] * conj(TensorKit.to_cpu(dA1)[a', t1, b]) *
-                    TensorKit.to_cpu(dA2)[b, t2, c'] * TensorKit.to_cpu(drhoR)[c', c] *
-                    TensorKit.to_cpu(dH)[s1, s2, t1, t2]
-                @test TensorKit.to_cpu(dHrA12) ≈ hHrA12
+                @tensor hHrA12[a, s1, s2, c] := adapt(Array, drhoL)[a, a'] * conj(adapt(Array, dA1)[a', t1, b]) *
+                    adapt(Array, dA2)[b, t2, c'] * adapt(Array, drhoR)[c', c] *
+                    adapt(Array, dH)[s1, s2, t1, t2]
+                @test adapt(Array, dHrA12) ≈ hHrA12
             end
         end=# # doesn't yet work because of AdjointTensor
         BraidingStyle(I) isa HasBraiding && @timedtestset "Index flipping: test flipping inverse" begin
@@ -422,31 +422,31 @@ for V in spacelist
                 t1 = AMDGPU.rand(T, W1, W1)
                 t2 = AMDGPU.rand(T, W2, W2)
                 t = AMDGPU.rand(T, W1, W2)
-                ht1 = TensorKit.to_cpu(t1)
-                ht2 = TensorKit.to_cpu(t2)
-                ht = TensorKit.to_cpu(t)
-                @test TensorKit.to_cpu(t1 * t) ≈ ht1 * ht
-                @test TensorKit.to_cpu(t1' * t) ≈ ht1' * ht
-                @test TensorKit.to_cpu(t2 * t') ≈ ht2 * ht'
-                @test TensorKit.to_cpu(t2' * t') ≈ ht2' * ht'
+                ht1 = adapt(Array, t1)
+                ht2 = adapt(Array, t2)
+                ht = adapt(Array, t)
+                @test adapt(Array, t1 * t) ≈ ht1 * ht
+                @test adapt(Array, t1' * t) ≈ ht1' * ht
+                @test adapt(Array, t2 * t') ≈ ht2 * ht'
+                @test adapt(Array, t2' * t') ≈ ht2' * ht'
 
                 #=AMDGPU.@allowscalar begin
-                    @test TensorKit.to_cpu(inv(t1)) ≈ inv(ht1)
-                    @test TensorKit.to_cpu(pinv(t)) ≈ pinv(ht)
+                    @test adapt(Array, inv(t1)) ≈ inv(ht1)
+                    @test adapt(Array, pinv(t)) ≈ pinv(ht)
 
                     if T == Float32 || T == ComplexF32
                         continue
                     end
 
-                    @test TensorKit.to_cpu(t1 \ t) ≈ ht1 \ ht
-                    @test TensorKit.to_cpu(t1' \ t) ≈ ht1' \ ht
-                    @test TensorKit.to_cpu(t2 \ t') ≈ ht2 \ ht'
-                    @test TensorKit.to_cpu(t2' \ t') ≈ ht2' \ ht'
+                    @test adapt(Array, t1 \ t) ≈ ht1 \ ht
+                    @test adapt(Array, t1' \ t) ≈ ht1' \ ht
+                    @test adapt(Array, t2 \ t') ≈ ht2 \ ht'
+                    @test adapt(Array, t2' \ t') ≈ ht2' \ ht'
 
-                    @test TensorKit.to_cpu(t2 / t) ≈ ht2 / ht
-                    @test TensorKit.to_cpu(t2' / t) ≈ ht2' / ht
-                    @test TensorKit.to_cpu(t1 / t') ≈ ht1 / ht'
-                    @test TensorKit.to_cpu(t1' / t') ≈ ht1' / ht'
+                    @test adapt(Array, t2 / t) ≈ ht2 / ht
+                    @test adapt(Array, t2' / t) ≈ ht2' / ht
+                    @test adapt(Array, t1 / t') ≈ ht1 / ht'
+                    @test adapt(Array, t1' / t') ≈ ht1' / ht'
                 end=#
             end
         end
@@ -456,11 +456,11 @@ for V in spacelist
                 #=t = project_hermitian!(AMDGPU.randn(T, W, W))
                 s = dim(W)
                 @test (@constinferred sqrt(t))^2 ≈ t
-                @test TensorKit.to_cpu(sqrt(t)) ≈ sqrt(TensorKit.to_cpu(t))
+                @test adapt(Array, sqrt(t)) ≈ sqrt(adapt(Array, t))
                 expt = @constinferred exp(t)
-                @test TensorKit.to_cpu(expt) ≈ exp(TensorKit.to_cpu(t))
+                @test adapt(Array, expt) ≈ exp(adapt(Array, t))
                 @test exp(@constinferred log(project_hermitian!(expt))) ≈ expt
-                @test TensorKit.to_cpu(log(project_hermitian!(expt))) ≈ log(TensorKit.to_cpu(expt))
+                @test adapt(Array, log(project_hermitian!(expt))) ≈ log(adapt(Array, expt))
 
                 @test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈
                       id(storagetype(t), W)
diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl