From c33f8bb6a84acd469bad1ac09ff80a94ac005d01 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 22 Jun 2026 17:28:40 +0200 Subject: [PATCH 1/2] remove StaticArrays --- Project.toml | 2 -- src/KernelAbstractions.jl | 1 - src/pocl/backend.jl | 2 -- 3 files changed, 5 deletions(-) diff --git a/Project.toml b/Project.toml index 4fe4ac21b..356e85a38 100644 --- a/Project.toml +++ b/Project.toml @@ -18,7 +18,6 @@ RandomNumbers = "e6cf234a-135c-5ec9-84dd-332b85af5143" SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8" SPIRV_LLVM_Backend_jll = "4376b9bf-cff8-51b6-bb48-39421dff0d0c" SPIRV_Tools_jll = "6ac6d60f-d740-5983-97d7-a4482c0689f4" -StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" pocl_standalone_jll = "54f56a70-6062-5590-a942-1226658f6c83" @@ -49,7 +48,6 @@ SPIRVIntrinsics = "1" SPIRV_LLVM_Backend_jll = "22" SPIRV_Tools_jll = "2024.4, 2025.1" SparseArrays = "<0.0.1, 1.6" -StaticArrays = "0.12, 1.0" UUIDs = "<0.0.1, 1.6" julia = "1.10" pocl_standalone_jll = "7.1.2" diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 3881da55c..d6414ee22 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -12,7 +12,6 @@ import PrecompileTools import Atomix: @atomic, @atomicswap, @atomicreplace using MacroTools -using StaticArrays using Adapt """ diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index f23e20b0f..247b3466c 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -7,8 +7,6 @@ using ..POCL: device, clconvert, clfunction import KernelAbstractions as KA import KernelAbstractions.KernelIntrinsics as KI -import StaticArrays - import Adapt From 0b58f9ba7f9b5bb309d0288955698078e46301f6 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 22 Jun 2026 17:38:41 +0200 Subject: [PATCH 2/2] [pocl] Use GPUCompiler.alloca for Scratchpad instead of MArray Back the POCL `Scratchpad` (`@private`) with `GPUCompiler.alloca`, a direct per-workitem stack allocation, 
instead of a StaticArrays `MArray`. The returned `Ptr` is wrapped in a `CLDeviceArray` over OpenCL "Function" storage (LLVM addrspace 0), where the SPIR-V target places allocas. Its alignment (`Base.datatype_alignment(T)`) matches `CLDeviceArray`'s element accesses. Requires GPUCompiler 1.23 (JuliaGPU/GPUCompiler.jl#859), which adds the `alloca` intrinsic. The StaticArrays import in the POCL back-end was already removed by the preceding patch; this patch restores the StaticArrays entries in Project.toml (StaticArrays is still used by the CPU back-end). Co-Authored-By: Claude Opus 4.8 --- Project.toml | 4 +++- src/pocl/backend.jl | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 356e85a38..199a0708d 100644 --- a/Project.toml +++ b/Project.toml @@ -18,6 +18,7 @@ RandomNumbers = "e6cf234a-135c-5ec9-84dd-332b85af5143" SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8" SPIRV_LLVM_Backend_jll = "4376b9bf-cff8-51b6-bb48-39421dff0d0c" SPIRV_Tools_jll = "6ac6d60f-d740-5983-97d7-a4482c0689f4" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" pocl_standalone_jll = "54f56a70-6062-5590-a942-1226658f6c83" @@ -35,7 +36,7 @@ SparseArraysExt = "SparseArrays" Adapt = "0.4, 1.0, 2.0, 3.0, 4" Atomix = "0.1, 1" EnzymeCore = "0.7, 0.8.1" -GPUCompiler = "1.13.3" +GPUCompiler = "1.23" InteractiveUtils = "1.6" LLVM = "9.4.1" LinearAlgebra = "1.6" @@ -48,6 +49,7 @@ SPIRVIntrinsics = "1" SPIRV_LLVM_Backend_jll = "22" SPIRV_Tools_jll = "2024.4, 2025.1" SparseArrays = "<0.0.1, 1.6" +StaticArrays = "0.12, 1.0" UUIDs = "<0.0.1, 1.6" julia = "1.10" pocl_standalone_jll = "7.1.2" diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 247b3466c..ffa385f2f 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -220,7 +220,11 @@ end end @device_override @inline function KA.Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims} - StaticArrays.MArray{KA.__size(Dims), T}(undef) + # private per-workitem scratch: a stack `alloca` (lowered by GPUCompiler) wrapped 
in a + # device array. the slot lives in OpenCL "Function" storage (LLVM addrspace 0), which is + # where the SPIR-V target places allocas. + ptr = POCL.GPUCompiler.alloca(T, Val(prod(Dims))) + CLDeviceArray(Dims, reinterpret(POCL.LLVMPtr{T, POCL.AS.Function}, ptr)) end