Add GPU support for Strided backends (#264)

lkdvos · kshyatt · web-flow · commit 77a90083e44f · 2026-05-05T17:56:57.000-04:00
* simplify strided implementations

* add JLArrays extension

* add CUDA extension

* add testing infrastructure

* Update Project.toml

Strided tag has hit

* incremental

* Update test file

---------

Co-authored-by: Katharine Hyatt &lt;kshyatt@users.noreply.github.com&gt;
Co-authored-by: Katharine Hyatt &lt;khyatt@flatironinstitute.org&gt;
diff --git a/Project.toml b/Project.toml
@@ -19,25 +19,32 @@ VectorInterface = "409d34a3-91d5-4945-b6ec-7529ddf182d8"
 [weakdeps]
 Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
+CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
+JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
+Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
 cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [extensions]
 TensorOperationsBumperExt = "Bumper"
 TensorOperationsChainRulesCoreExt = "ChainRulesCore"
 TensorOperationsMooncakeExt = "Mooncake"
+TensorOperationsCUDACoreExt = "CUDACore"
 TensorOperationsEnzymeExt = "Enzyme"
 TensorOperationscuTENSORExt = "cuTENSOR"
+TensorOperationsJLArraysExt = "JLArrays"
 
 [compat]
 Aqua = "0.6, 0.7, 0.8"
+Adapt = "4"
 Bumper = "0.6, 0.7"
+CUDACore = "6"
 ChainRulesCore = "1"
 ChainRulesTestUtils = "1"
 DynamicPolynomials = "0.5, 0.6"
 Enzyme = "0.13.115"
 EnzymeTestUtils = "0.2"
+JLArrays = "0.3"
 LRUCache = "1"
 LinearAlgebra = "1.6"
 Logging = "1.6"
@@ -47,7 +54,7 @@ PrecompileTools = "1.1"
 Preferences = "1.4"
 PtrArrays = "1.2"
 Random = "1"
-Strided = "2.4"
+Strided = "2.5"
 StridedViews = "0.5"
 Test = "1"
 TupleTools = "1.6"
@@ -57,8 +64,10 @@ cuTENSOR = "6"
 julia = "1.10"
 
 [extras]
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e"
+CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
 cuRAND = "20fd9a0b-12d5-4c2f-a8af-7c34e9e60431"
@@ -72,4 +81,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [targets]
-test = ["Test", "Random", "DynamicPolynomials", "ChainRulesTestUtils", "ChainRulesCore", "cuRAND", "cuTENSOR", "Aqua", "Logging", "Bumper", "Mooncake", "Enzyme", "EnzymeTestUtils"]
+test = ["Test", "Random", "DynamicPolynomials", "ChainRulesTestUtils", "ChainRulesCore", "cuRAND", "CUDACore", "cuTENSOR", "Aqua", "Logging", "Bumper", "Mooncake", "Enzyme", "EnzymeTestUtils", "Adapt", "JLArrays"]
diff --git a/ext/TensorOperationsCUDACoreExt.jl b/ext/TensorOperationsCUDACoreExt.jl
@@ -0,0 +1,57 @@
+module TensorOperationsCUDACoreExt
+
+using CUDACore
+using TensorOperations
+using TensorOperations: TensorOperations as TO
+
+#-------------------------------------------------------------------------------------------
+# Allocator
+#-------------------------------------------------------------------------------------------
+
+TO.tensoradd_type(TC, A::CuArray, pA::Index2Tuple, conjA::Bool) =
+    CuArray{TC, TO.numind(pA)}
+
+function TO.CUDAAllocator()
+    Mout = CUDACore.UnifiedMemory
+    Min = CUDACore.default_memory
+    Mtemp = CUDACore.default_memory
+    return TO.CUDAAllocator{Mout, Min, Mtemp}()
+end
+
+function TO.tensoralloc_add(
+        TC, A::AbstractArray, pA::Index2Tuple, conjA::Bool,
+        istemp::Val, allocator::TO.CUDAAllocator
+    )
+    ttype = CuArray{TC, TO.numind(pA)}
+    structure = TO.tensoradd_structure(A, pA, conjA)
+    return TO.tensoralloc(ttype, structure, istemp, allocator)::ttype
+end
+
+function TO.tensoralloc_contract(
+        TC,
+        A::AbstractArray, pA::Index2Tuple, conjA::Bool,
+        B::AbstractArray, pB::Index2Tuple, conjB::Bool,
+        pAB::Index2Tuple,
+        istemp::Val, allocator::TO.CUDAAllocator
+    )
+    ttype = CuArray{TC, TO.numind(pAB)}
+    structure = TO.tensorcontract_structure(A, pA, conjA, B, pB, conjB, pAB)
+    return TO.tensoralloc(ttype, structure, istemp, allocator)::ttype
+end
+
+# NOTE: the general implementation in the `DefaultAllocator` case works just fine, without
+# selecting an explicit memory model
+function TO.tensoralloc(
+        ::Type{CuArray{T, N}}, structure,
+        ::Val{istemp}, allocator::TO.CUDAAllocator{Mout, Min, Mtemp}
+    ) where {T, N, istemp, Mout, Min, Mtemp}
+    M = istemp ? Mtemp : Mout
+    return CuArray{T, N, M}(undef, structure)
+end
+
+function TO.tensorfree!(C::CuArray, ::TO.CUDAAllocator)
+    CUDACore.unsafe_free!(C)
+    return nothing
+end
+
+end
diff --git a/ext/TensorOperationsJLArraysExt.jl b/ext/TensorOperationsJLArraysExt.jl
@@ -0,0 +1,9 @@
+module TensorOperationsJLArraysExt
+
+using JLArrays
+using TensorOperations
+
+TensorOperations.tensoradd_type(TC, A::JLArray, pA::Index2Tuple, conjA::Bool) =
+    JLArray{TC, sum(length.(pA))}
+
+end
diff --git a/ext/TensorOperationscuTENSORExt.jl b/ext/TensorOperationscuTENSORExt.jl
@@ -144,57 +144,6 @@ function _custrided(
     end
 end
 
-#-------------------------------------------------------------------------------------------
-# Allocator
-#-------------------------------------------------------------------------------------------
-function TO.CUDAAllocator()
-    Mout = CUDACore.UnifiedMemory
-    Min = CUDACore.default_memory
-    Mtemp = CUDACore.default_memory
-    return CUDAAllocator{Mout, Min, Mtemp}()
-end
-
-function TO.tensoralloc_add(
-        TC, A::AbstractArray, pA::Index2Tuple, conjA::Bool,
-        istemp::Val, allocator::CUDAAllocator
-    )
-    ttype = CuArray{TC, TO.numind(pA)}
-    structure = TO.tensoradd_structure(A, pA, conjA)
-    return TO.tensoralloc(ttype, structure, istemp, allocator)::ttype
-end
-
-function TO.tensoralloc_contract(
-        TC,
-        A::AbstractArray, pA::Index2Tuple, conjA::Bool,
-        B::AbstractArray, pB::Index2Tuple, conjB::Bool,
-        pAB::Index2Tuple,
-        istemp::Val, allocator::CUDAAllocator
-    )
-    ttype = CuArray{TC, TO.numind(pAB)}
-    structure = TO.tensorcontract_structure(A, pA, conjA, B, pB, conjB, pAB)
-    return tensoralloc(ttype, structure, istemp, allocator)::ttype
-end
-
-# Overwrite tensoradd_type
-function TO.tensoradd_type(TC, A::CuArray, pA::Index2Tuple, conjA::Bool)
-    return CuArray{TC, sum(length.(pA))}
-end
-
-# NOTE: the general implementation in the `DefaultAllocator` case works just fine, without
-# selecting an explicit memory model
-function TO.tensoralloc(
-        ::Type{CuArray{T, N}}, structure,
-        ::Val{istemp}, allocator::CUDAAllocator{Mout, Min, Mtemp}
-    ) where {T, N, istemp, Mout, Min, Mtemp}
-    M = istemp ? Mtemp : Mout
-    return CuArray{T, N, M}(undef, structure)
-end
-
-function TO.tensorfree!(C::CuArray, ::CUDAAllocator)
-    CUDACore.unsafe_free!(C)
-    return nothing
-end
-
 #-------------------------------------------------------------------------------------------
 # Implementation
 #-------------------------------------------------------------------------------------------
diff --git a/src/implementation/allocator.jl b/src/implementation/allocator.jl
@@ -166,6 +166,9 @@ end
 function tensoradd_type(TC, A::Base.PermutedDimsArray, pA::Index2Tuple, conjA::Bool)
     return tensoradd_type(TC, A.parent, pA, conjA)
 end
+function tensoradd_type(TC, A::StridedView, pA::Index2Tuple, conjA::Bool)
+    return tensoradd_type(TC, parent(A), pA, conjA)
+end
 
 function tensoradd_structure(A::AbstractArray, pA::Index2Tuple, conjA::Bool)
     return size.(Ref(A), linearize(pA))
diff --git a/src/implementation/strided.jl b/src/implementation/strided.jl
@@ -1,42 +1,12 @@
 const StridedViewOrDiagonal = Union{StridedView, Diagonal}
 
-_ishostarray(x::StridedView) = (pointer(x) isa Ptr)
-_ishostarray(x::Diagonal) = (pointer(x.diag) isa Ptr)
+select_backend(::typeof(tensoradd!), C::StridedView, A::StridedView) = StridedNative()
+select_backend(::typeof(tensortrace!), C::StridedView, A::StridedView) = StridedNative()
 
-function select_backend(::typeof(tensoradd!), C::StridedView, A::StridedView)
-    if _ishostarray(C) && _ishostarray(A)
-        return StridedNative()
-    else
-        return NoBackend()
-    end
-end
-function select_backend(::typeof(tensortrace!), C::StridedView, A::StridedView)
-    if _ishostarray(C) && _ishostarray(A)
-        return StridedNative()
-    else
-        return NoBackend()
-    end
-end
-
-function select_backend(
-        ::typeof(tensorcontract!), C::StridedView, A::StridedView, B::StridedView
-    )
-    if _ishostarray(C) && _ishostarray(A) && _ishostarray(B)
-        return eltype(C) <: LinearAlgebra.BlasFloat ? StridedBLAS() : StridedNative()
-    else
-        return NoBackend()
-    end
-end
-function select_backend(
-        ::typeof(tensorcontract!), C::StridedViewOrDiagonal,
-        A::StridedViewOrDiagonal, B::StridedViewOrDiagonal
-    )
-    if _ishostarray(C) && _ishostarray(A) && _ishostarray(B)
-        return StridedNative()
-    else
-        return NoBackend()
-    end
-end
+select_backend(::typeof(tensorcontract!), C::StridedView, A::StridedView, B::StridedView) =
+    eltype(C) <: LinearAlgebra.BlasFloat ? StridedBLAS() : StridedNative()
+select_backend(::typeof(tensorcontract!), C::StridedViewOrDiagonal, A::StridedViewOrDiagonal, B::StridedViewOrDiagonal) =
+    StridedNative()
 
 #-------------------------------------------------------------------------------------------
 # Force strided implementation on AbstractArray instances with Strided backend
diff --git a/test/gpu.jl b/test/gpu.jl
@@ -0,0 +1,124 @@
+using TensorOperations
+using TensorOperations: StridedBLAS, StridedNative, linearize, numout
+using Test
+using Adapt
+using TupleTools
+using JLArrays
+using VectorInterface
+using CUDACore
+
+test_result(a::AbstractArray, b::AbstractArray; kwargs...) =
+    isapprox(collect(a), collect(b); kwargs...)
+
+function compare(f, AT::Type, xs...; kwargs...)
+    cpu_in = map(deepcopy, xs) # copy on CPU
+    gpu_in = map(adapt(AT), xs) # adapt on GPU
+
+    cpu_out = f(cpu_in...)
+    gpu_out = f(gpu_in...)
+
+    return test_result(cpu_out, gpu_out; kwargs...)
+end
+
+# types to test for
+ATs = []
+!is_buildkite && push!(ATs, JLArray)
+CUDACore.functional() && push!(ATs, CuArray)
+
+backends = [StridedBLAS(), StridedNative()]
+
+@testset "tensoradd! ($AT)" for AT in ATs
+    sz = (3, 5, 4, 6)
+    p = (3, 1, 4, 2)
+    for backend in backends, T in (Float32, ComplexF32)
+        A = randn(T, sz)
+        C = randn(T, TupleTools.getindices(sz, p))
+
+        @test compare(AT, C, A) do c, a
+            tensoradd!(c, a, (p, ()), false, One(), Zero(), backend)
+        end
+
+        α = rand(T)
+        @test compare(AT, C, A) do c, a
+            tensoradd!(c, a, (p, ()), false, α, Zero(), backend)
+        end
+
+        β = rand(T)
+        @test compare(AT, C, A) do c, a
+            tensoradd!(c, a, (p, ()), false, α, β, backend)
+        end
+
+        T <: Real || @test compare(AT, C, A) do c, a
+            tensoradd!(c, a, (p, ()), true, α, β, backend)
+        end
+    end
+end
+
+@testset "tensortrace! ($AT)" for AT in ATs
+    sz = (2, 4, 3, 2)
+    p = (2, 3)
+    q = ((1,), (4,))
+
+    for backend in backends, T in (Float32, ComplexF32)
+        A = randn(T, sz)
+        C = randn(T, TupleTools.getindices(sz, p))
+
+        @test compare(AT, C, A) do c, a
+            tensortrace!(c, a, (p, ()), q, false, One(), Zero(), backend)
+        end
+
+        α = rand(T)
+        @test compare(AT, C, A) do c, a
+            tensortrace!(c, a, (p, ()), q, false, α, Zero(), backend)
+        end
+
+        β = rand(T)
+        @test compare(AT, C, A) do c, a
+            tensortrace!(c, a, (p, ()), q, false, α, β, backend)
+        end
+
+        T <: Real || @test compare(AT, C, A) do c, a
+            tensortrace!(c, a, (p, ()), q, true, α, β, backend)
+        end
+    end
+end
+
+@testset "tensorcontract! ($AT)" for AT in ATs
+    sz = (2, 4, 3, 4, 2, 5)
+    pA = ((4, 1), (2, 3))
+    pB = ((3, 1), (2,))
+    pAB = ((1, 2, 3), ())
+
+    for backend in backends, T in (Float32, ComplexF32)
+        A = randn(T, (2, 4, 3, 2))
+        B = randn(T, (3, 3, 4))
+        C = randn(T, (2, 2, 3))
+
+        @test compare(AT, C, A, B) do c, a, b
+            tensorcontract!(c, a, pA, false, b, pB, false, pAB, One(), Zero(), backend)
+        end
+
+        α = rand(T)
+        @test compare(AT, C, A, B) do c, a, b
+            tensorcontract!(c, a, pA, false, b, pB, false, pAB, α, Zero(), backend)
+        end
+
+        β = rand(T)
+        @test compare(AT, C, A, B) do c, a, b
+            tensorcontract!(c, a, pA, false, b, pB, false, pAB, α, β, backend)
+        end
+
+        if !(T <: Real)
+            @test compare(AT, C, A, B) do c, a, b
+                tensorcontract!(c, a, pA, true, b, pB, false, pAB, α, β, backend)
+            end
+            @test compare(AT, C, A, B) do c, a, b
+                tensorcontract!(c, a, pA, false, b, pB, true, pAB, α, β, backend)
+            end
+            @test compare(AT, C, A, B) do c, a, b
+                tensorcontract!(c, a, pA, true, b, pB, true, pAB, α, β, backend)
+            end
+        end
+    end
+
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -50,12 +50,13 @@ if !is_buildkite
     end
 end
 
-if is_buildkite
-    # note: cuTENSOR should not be loaded before this point
-    # as there is a test which requires it to be loaded after
-    @testset "cuTENSOR extension" verbose = true begin
-        include("cutensor.jl")
-    end
+# note: cuTENSOR should not be loaded before this point
+# as there is a test which requires it to be loaded after
+@testset "cuTENSOR extension" verbose = true begin
+    include("cutensor.jl")
+end
+@testset "GPUArrays" verbose = true begin
+    include("gpu.jl")
 end
 
 if !is_buildkite