diff --git a/Project.toml b/Project.toml index 4fe4ac21b..199a0708d 100644 --- a/Project.toml +++ b/Project.toml @@ -36,7 +36,7 @@ SparseArraysExt = "SparseArrays" Adapt = "0.4, 1.0, 2.0, 3.0, 4" Atomix = "0.1, 1" EnzymeCore = "0.7, 0.8.1" -GPUCompiler = "1.13.3" +GPUCompiler = "1.23" InteractiveUtils = "1.6" LLVM = "9.4.1" LinearAlgebra = "1.6" diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 3881da55c..d6414ee22 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -12,7 +12,6 @@ import PrecompileTools import Atomix: @atomic, @atomicswap, @atomicreplace using MacroTools -using StaticArrays using Adapt """ diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index f23e20b0f..ffa385f2f 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -7,8 +7,6 @@ using ..POCL: device, clconvert, clfunction import KernelAbstractions as KA import KernelAbstractions.KernelIntrinsics as KI -import StaticArrays - import Adapt @@ -222,7 +220,11 @@ end end @device_override @inline function KA.Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims} - StaticArrays.MArray{KA.__size(Dims), T}(undef) + # Private per-workitem scratch: a stack `alloca` of `prod(Dims)` elements of type `T` + # (lowered by GPUCompiler), wrapped in a `CLDeviceArray` built from `Dims`. The slot lives in + # `AS.Function` storage (LLVM addrspace 0), which is where the SPIR-V target places allocas. + ptr = POCL.GPUCompiler.alloca(T, Val(prod(Dims))) + CLDeviceArray(Dims, reinterpret(POCL.LLVMPtr{T, POCL.AS.Function}, ptr)) end