QuantumKitHub · kshyatt · May 11, 2026 · May 11, 2026 · May 13, 2026 · Jun 9, 2026
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -2,7 +2,7 @@ env:
   SECRET_CODECOV_TOKEN: "NsHKj2ZxqUDfErNc+zlH6erC00pk0XRZeNAaU+hyRg6oHlIuSUVL53Z0/MW6Xeq8mBsYsfdG3rmE+h0hoGXj6swpmtjCnLI0CAHUSVOTKNHQ4R6VmKuNnLkNQX7+GO6PEcnV+sCMDSt/nhci0lUl/9qo+6uT/VA+9E6XiKOsKV8nL+kb/GDNJqrG8u2JJzd9EcrFG9Vf4p7tLgsafhQq+yQeVdeYxPWKPx2x6+K2w2WrGel0RlVfyYFLEGHo4TW4+OPPoMOJBCA+kkE2I8OlqzzMUMkULhwhWujHyOrWBZ74EFY2zbwYD/iiYTlGJW8UWaOn561uJp3J7+nab4nEYA==;U2FsdGVkX1/EACeMbht8x2ar6VrhBrcGZUtM4/B4viOz590nUZNIUkWPkjpmdriAAP3t1KEj2LlRg+z/FK+CSQ=="
 
 steps:
-  - label: "Julia v1"
+  - label: "Julia v1 -- CUDA"
     plugins:
       - JuliaCI/julia#v1:
           version: "1"
@@ -16,7 +16,7 @@ steps:
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 30
 
-  - label: "Julia LTS"
+  - label: "Julia LTS -- CUDA"
     plugins:
       - JuliaCI/julia#v1:
           version: "1.10" # "lts" isn't valid
@@ -29,3 +29,31 @@ steps:
       queue: "cuda"
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 30
+
+  - label: "Julia v1 -- AMDGPU"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "rocm"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 30
+
+  - label: "Julia LTS -- AMDGPU"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10" # "lts" isn't valid
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "rocm"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 30
diff --git a/Project.toml b/Project.toml
@@ -17,6 +17,7 @@ TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 VectorInterface = "409d34a3-91d5-4945-b6ec-7529ddf182d8"
 
 [weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
@@ -26,6 +27,7 @@ JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
 Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
 
 [extensions]
+TensorOperationsAMDGPUExt = "AMDGPU"
 TensorOperationsBumperExt = "Bumper"
 TensorOperationsChainRulesCoreExt = "ChainRulesCore"
 TensorOperationsMooncakeExt = "Mooncake"
@@ -35,6 +37,7 @@ TensorOperationscuTENSORExt = "cuTENSOR"
 TensorOperationsJLArraysExt = "JLArrays"
 
 [compat]
+AMDGPU = "2"
 Aqua = "0.6, 0.7, 0.8"
 Adapt = "4"
 Bumper = "0.6, 0.7"
@@ -54,7 +57,7 @@ PrecompileTools = "1.1"
 Preferences = "1.4"
 PtrArrays = "1.2"
 Random = "1"
-Strided = "2.5"
+Strided = "2.6"
 StridedViews = "0.5"
 Test = "1"
 TupleTools = "1.6"
@@ -82,4 +85,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [targets]
-test = ["Test", "Random", "DynamicPolynomials", "ChainRulesTestUtils", "ChainRulesCore", "cuRAND", "CUDACore", "cuTENSOR", "Aqua", "Logging", "Bumper", "Mooncake", "Enzyme", "EnzymeTestUtils", "Adapt", "JLArrays"]
+test = ["Test", "Random", "DynamicPolynomials", "ChainRulesTestUtils", "ChainRulesCore", "cuRAND", "CUDACore", "cuTENSOR", "Aqua", "Logging", "Bumper", "Mooncake", "Enzyme", "EnzymeTestUtils", "Adapt", "JLArrays", "AMDGPU"]
diff --git a/ext/TensorOperationsAMDGPUExt.jl b/ext/TensorOperationsAMDGPUExt.jl
@@ -0,0 +1,49 @@
+module TensorOperationsAMDGPUExt
+
+using AMDGPU
+using TensorOperations
+using TensorOperations: TensorOperations as TO
+
+# Allocator support: `TO.AMDAllocator` makes TensorOperations hand out `ROCArray`s.
+
+function TO.tensoradd_type(TC, A::AnyROCArray, pA::Index2Tuple, conjA::Bool)
+    return ROCArray{TC, TO.numind(pA)}
+end
+
+# Destination of an addition: a `ROCArray` with eltype `TC`, whatever array type `A` is.
+function TO.tensoralloc_add(
+        TC, A::AbstractArray, pA::Index2Tuple, conjA::Bool,
+        istemp::Val, allocator::TO.AMDAllocator
+    )
+    RT = ROCArray{TC, TO.numind(pA)}
+    dims = TO.tensoradd_structure(A, pA, conjA)
+    return TO.tensoralloc(RT, dims, istemp, allocator)::RT
+end
+
+# Destination of a contraction: a `ROCArray` with eltype `TC` and `numind(pAB)` indices.
+function TO.tensoralloc_contract(
+        TC,
+        A::AbstractArray, pA::Index2Tuple, conjA::Bool,
+        B::AbstractArray, pB::Index2Tuple, conjB::Bool,
+        pAB::Index2Tuple,
+        istemp::Val, allocator::TO.AMDAllocator
+    )
+    RT = ROCArray{TC, TO.numind(pAB)}
+    dims = TO.tensorcontract_structure(A, pA, conjA, B, pB, conjB, pAB)
+    return TO.tensoralloc(RT, dims, istemp, allocator)::RT
+end
+
+# NOTE: `DefaultAllocator` needs no method of its own: the general implementation works
+# just fine there, without selecting an explicit memory model; `istemp` is unused here
+function TO.tensoralloc(
+        ::Type{<:ROCArray{T, N}}, structure, ::Val, allocator::TO.AMDAllocator
+    ) where {T, N}
+    return ROCArray{T, N}(undef, structure)
+end
+
+function TO.tensorfree!(C::ROCArray, ::TO.AMDAllocator)
+    AMDGPU.unsafe_free!(C) # explicit release through AMDGPU; result is discarded
+    return nothing
+end
+
+end
diff --git a/src/implementation/allocator.jl b/src/implementation/allocator.jl
@@ -30,6 +30,13 @@ parameters `Min`, `Mout`, `Mtemp` can be any of the CUDA.jl memory types, i.e.
 """
 struct CUDAAllocator{Mout, Min, Mtemp} end
 
+"""
+    AMDAllocator()
+
+Allocator for AMD GPUs: once AMDGPU.jl is loaded, allocations made with it are `ROCArray`s.
+"""
+struct AMDAllocator end
+
 """
     ManualAllocator()
 

diff --git a/src/implementation/strided.jl b/src/implementation/strided.jl
@@ -20,7 +20,9 @@ function tensoradd!(
     )
     # resolve conj flags and absorb into StridedView constructor to avoid type instabilities later on
+    # NOTE: keep the lazy `conj` here; `conj!` would overwrite the input `A` with its
+    # conjugate as a side effect, whereas only `C` may be written to
     if conjA
         stridedtensoradd!(SV(C), conj(SV(A)), pA, α, β, backend, allocator)
     else
         stridedtensoradd!(SV(C), SV(A), pA, α, β, backend, allocator)
     end
@@ -35,7 +35,9 @@ function tensortrace!(
     )
     # resolve conj flags and absorb into StridedView constructor to avoid type instabilities later on
+    # NOTE: keep the lazy `conj` here; `conj!` would overwrite the input `A` with its
+    # conjugate as a side effect, whereas only `C` may be written to
     if conjA
         stridedtensortrace!(SV(C), conj(SV(A)), p, q, α, β, backend, allocator)
     else
         stridedtensortrace!(SV(C), SV(A), p, q, α, β, backend, allocator)
     end

diff --git a/test/gpu.jl b/test/gpu.jl
@@ -5,7 +5,7 @@ using Adapt
 using TupleTools
 using JLArrays
 using VectorInterface
-using CUDACore
+using CUDACore, AMDGPU
 
 test_result(a::AbstractArray, b::AbstractArray; kwargs...) =
     isapprox(collect(a), collect(b); kwargs...)
@@ -24,6 +24,7 @@ end
 ATs = []
 !is_buildkite && push!(ATs, JLArray)
 CUDACore.functional() && push!(ATs, CuArray)
+AMDGPU.functional() && push!(ATs, ROCArray)
 
 backends = [StridedBLAS(), StridedNative()]