diff --git a/Project.toml b/Project.toml
index 4fe4ac21b..199a0708d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -36,7 +36,7 @@ SparseArraysExt = "SparseArrays"
 Adapt = "0.4, 1.0, 2.0, 3.0, 4"
 Atomix = "0.1, 1"
 EnzymeCore = "0.7, 0.8.1"
-GPUCompiler = "1.13.3"
+GPUCompiler = "1.23"
 InteractiveUtils = "1.6"
 LLVM = "9.4.1"
 LinearAlgebra = "1.6"
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 3881da55c..d6414ee22 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -12,7 +12,6 @@ import PrecompileTools
 import Atomix: @atomic, @atomicswap, @atomicreplace
 
 using MacroTools
-using StaticArrays
 using Adapt
 
 """
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index f23e20b0f..ffa385f2f 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -7,8 +7,6 @@ using ..POCL: device, clconvert, clfunction
 import KernelAbstractions as KA
 import KernelAbstractions.KernelIntrinsics as KI
 
-import StaticArrays
-
 import Adapt
 
 
@@ -222,7 +220,11 @@ end
 end
 
 @device_override @inline function KA.Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
-    StaticArrays.MArray{KA.__size(Dims), T}(undef)
+    # Private per-work-item scratch: reserve `prod(Dims)` elements of `T` with
+    # `GPUCompiler.alloca` and wrap the pointer in a `CLDeviceArray` of shape `Dims`. The
+    # pointer is tagged `AS.Function` (LLVM addrspace 0), where SPIR-V places allocas.
+    ptr = POCL.GPUCompiler.alloca(T, Val(prod(Dims)))
+    CLDeviceArray(Dims, reinterpret(POCL.LLVMPtr{T, POCL.AS.Function}, ptr))
 end