From 1b67f8302ac86874cd69424722354292c38b1303 Mon Sep 17 00:00:00 2001 From: hzarei4 Date: Thu, 23 Apr 2026 13:45:28 +0200 Subject: [PATCH 1/3] added CUDA support for local extrema --- Project.toml | 8 +- ext/CUDASupportExt.jl | 197 ++++++++++++++++++++++++++++++++++++++++++ test/cuda/runtests.jl | 30 +++++++ 3 files changed, 234 insertions(+), 1 deletion(-) create mode 100644 ext/CUDASupportExt.jl diff --git a/Project.toml b/Project.toml index 1e5f3e43..66759375 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "ImageFiltering" uuid = "6a3955dd-da59-5b1f-98d4-e7296123deb5" -author = ["Tim Holy ", "Jan Weidner "] version = "0.7.12" +author = ["Tim Holy ", "Jan Weidner "] [deps] CatIndices = "aafaddc9-749c-510e-ac4f-586e18779b91" @@ -20,6 +20,12 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" TiledIteration = "06e1c1a7-607b-532d-9fad-de7d9aa2abac" +[weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" + +[extensions] +CUDASupportExt = ["CUDA"] + [compat] CatIndices = "0.2" ComputationalResources = "0.3" diff --git a/ext/CUDASupportExt.jl b/ext/CUDASupportExt.jl new file mode 100644 index 00000000..573b78ec --- /dev/null +++ b/ext/CUDASupportExt.jl @@ -0,0 +1,197 @@ +module CUDASupportExt + +using CUDA +using Base +using ImageFiltering + +export findlocalextrema, findlocalmaxima, findlocalminima + +import ImageFiltering: findlocalextrema, findlocalmaxima, findlocalminima + + +findlocalextrema(f, img::CuArray{T,N}, window, edges::Bool) where {T,N} = + findlocalextrema(f, img, window, ntuple(_ -> edges, N)) + +function findlocalextrema( + f, + img::CuArray{T,N}, + window::Dims{N}, + edges::NTuple{N,Bool}, +) where {T,N} + + @assert all(isodd, window) "window entries must be odd" + + mode = _extrema_mode(f) + + mask = similar(img, Bool) + _localextrema_mask!(mask, img, window, edges, mode) + + return findall(Array(mask)) +end + +function findlocalmaxima(img::CuArray; window=_default_window_cuda(img), edges=true) + findlocalextrema(>, img, window, edges) +end + +function findlocalminima(img::CuArray; window=_default_window_cuda(img), edges=true) + findlocalextrema(<, img, window, edges) +end + +# -------------------------------------------------- +# Default window from coords_spatial +# -------------------------------------------------- + +function _default_window_cuda(img) + spatial = ImageFiltering.coords_spatial(img) + spatial_set = Set(spatial) + return ntuple(d -> (d in spatial_set ? 3 : 1), ndims(img)) +end + +# -------------------------------------------------- +# Comparison mode +# -------------------------------------------------- + +@inline function _extrema_mode(f) + if f === > + return Int32(1) + elseif f === < + return Int32(-1) + else + throw(ArgumentError("CUDA findlocalextrema currently supports only `>` and `<`")) + end +end + +@inline _cmp(mode::Int32, a, b) = mode == 1 ? (a > b) : (a < b) + +# -------------------------------------------------- +# Metadata preparation on host +# -------------------------------------------------- + +function _compute_strides(dims::NTuple{N,Int}) where {N} + s = Vector{Int}(undef, N) + stride = 1 + for d in 1:N + s[d] = stride + stride *= dims[d] + end + return s +end + +function _localextrema_mask!( + mask::CuArray{Bool,N}, + img::CuArray{T,N}, + window::Dims{N}, + edges::NTuple{N,Bool}, + mode::Int32, +) where {T,N} + + dims_h = collect(Int, size(img)) + strides_h = _compute_strides(size(img)) + halfw_h = [window[d] ÷ 2 for d in 1:N] + clip_h = [edges[d] ? 0 : max(1, halfw_h[d]) for d in 1:N] + + dims_d = CuArray(dims_h) + strides_d = CuArray(strides_h) + halfw_d = CuArray(halfw_h) + clip_d = CuArray(clip_h) + + len = length(img) + threads = 256 + blocks = cld(len, threads) + + @cuda threads=threads blocks=blocks _localextrema_nd!( + mask, img, dims_d, strides_d, halfw_d, clip_d, Int32(N), mode, len + ) + + return mask +end + +# -------------------------------------------------- +# ND kernel with runtime dimension metadata +# -------------------------------------------------- + +function _localextrema_nd!( + mask, + A, + dims, + strides, + halfw, + clip, + nd::Int32, + mode::Int32, + len::Int, +) + idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + if idx > len + return + end + + # Recover coordinates from linear index + # coord[d] is computed on the fly; no tuple creation. + center_ok = true + linear0 = idx - 1 + + # First pass: check whether this center is eligible + for d in 1:nd + coord_d = (linear0 ÷ strides[d]) % dims[d] + 1 + if !(1 + clip[d] <= coord_d <= dims[d] - clip[d]) + center_ok = false + break + end + end + + if !center_ok + @inbounds mask[idx] = false + return + end + + c = @inbounds A[idx] + + # Total number of offsets in the window + total_offsets = 1 + for d in 1:nd + total_offsets *= (2 * halfw[d] + 1) + end + + isext = true + + # Enumerate offsets in mixed radix + for k in 0:(total_offsets - 1) + t = k + neigh_idx = idx + allzero = true + inbounds = true + + for d in 1:nd + width_d = 2 * halfw[d] + 1 + off_d = (t % width_d) - halfw[d] + t ÷= width_d + + allzero &= (off_d == 0) + + coord_d = (linear0 ÷ strides[d]) % dims[d] + 1 + neigh_coord_d = coord_d + off_d + + if !(1 <= neigh_coord_d <= dims[d]) + inbounds = false + break + end + + neigh_idx += off_d * strides[d] + end + + if !allzero && inbounds + v = @inbounds A[neigh_idx] + if !_cmp(mode, c, v) + isext = false + break + end + end + end + + @inbounds mask[idx] = isext + return +end + +end # module \ No newline at end of file diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl index 5789d00e..1b5a7034 100644 --- a/test/cuda/runtests.jl +++ b/test/cuda/runtests.jl @@ -8,6 +8,36 @@ using ImageQualityIndexes using Test using Random +@testset "extrema CUDA" begin + @testset "local extrema" begin + a = zeros(Int, 9, 9) + a[[1:2; 5], 5] .= 1 + ca = cu(a) + + @test findlocalmaxima(ca) == [CartesianIndex((5, 5))] + @test findlocalmaxima(ca; window=(1, 3)) == + [CartesianIndex((1, 5)), CartesianIndex((2, 5)), CartesianIndex((5, 5))] + @test findlocalmaxima(ca; window=(1, 3), edges=false) == + [CartesianIndex((2, 5)), CartesianIndex((5, 5))] + + a = zeros(Int, 9, 9, 9) + a[[1:2; 5], 5, 5] .= 1 + ca = cu(a) + + @test findlocalmaxima(ca) == [CartesianIndex((5, 5, 5))] + @test findlocalmaxima(ca; window=(1, 3, 1)) == + [CartesianIndex((1, 5, 5)), CartesianIndex((2, 5, 5)), CartesianIndex((5, 5, 5))] + @test findlocalmaxima(ca; window=(1, 3, 1), edges=false) == + [CartesianIndex((2, 5, 5)), CartesianIndex((5, 5, 5))] + + a = zeros(Int, 9, 9) + a[[1:2; 5], 5] .= -1 + ca = cu(a) + + @test findlocalminima(ca) == [CartesianIndex((5, 5))] + end +end + CUDA.allowscalar(false) @testset "ImageFiltering" begin From 091d2e01e746e464fd737ac7eb3f8d5ff5b3fee6 Mon Sep 17 00:00:00 2001 From: hzarei4 Date: Thu, 23 Apr 2026 14:12:20 +0200 Subject: [PATCH 2/3] added CUDA support for local extrema --- ext/CUDASupportExt.jl | 197 ++++++++++++++++++++++++++++++++++++++++++ test/cuda/runtests.jl | 30 +++++++ 2 files changed, 227 insertions(+) create mode 100644 ext/CUDASupportExt.jl diff --git a/ext/CUDASupportExt.jl b/ext/CUDASupportExt.jl new file mode 100644 index 00000000..573b78ec --- /dev/null +++ b/ext/CUDASupportExt.jl @@ -0,0 +1,197 @@ +module CUDASupportExt + +using CUDA +using Base +using ImageFiltering + +export findlocalextrema, findlocalmaxima, findlocalminima + +import ImageFiltering: findlocalextrema, findlocalmaxima, findlocalminima + + +findlocalextrema(f, img::CuArray{T,N}, window, edges::Bool) where {T,N} = + findlocalextrema(f, img, window, ntuple(_ -> edges, N)) + +function findlocalextrema( + f, + img::CuArray{T,N}, + window::Dims{N}, + edges::NTuple{N,Bool}, +) where {T,N} + + @assert all(isodd, window) "window entries must be odd" + + mode = _extrema_mode(f) + + mask = similar(img, Bool) + _localextrema_mask!(mask, img, window, edges, mode) + + return findall(Array(mask)) +end + +function findlocalmaxima(img::CuArray; window=_default_window_cuda(img), edges=true) + findlocalextrema(>, img, window, edges) +end + +function findlocalminima(img::CuArray; window=_default_window_cuda(img), edges=true) + findlocalextrema(<, img, window, edges) +end + +# -------------------------------------------------- +# Default window from coords_spatial +# -------------------------------------------------- + +function _default_window_cuda(img) + spatial = ImageFiltering.coords_spatial(img) + spatial_set = Set(spatial) + return ntuple(d -> (d in spatial_set ? 3 : 1), ndims(img)) +end + +# -------------------------------------------------- +# Comparison mode +# -------------------------------------------------- + +@inline function _extrema_mode(f) + if f === > + return Int32(1) + elseif f === < + return Int32(-1) + else + throw(ArgumentError("CUDA findlocalextrema currently supports only `>` and `<`")) + end +end + +@inline _cmp(mode::Int32, a, b) = mode == 1 ? (a > b) : (a < b) + +# -------------------------------------------------- +# Metadata preparation on host +# -------------------------------------------------- + +function _compute_strides(dims::NTuple{N,Int}) where {N} + s = Vector{Int}(undef, N) + stride = 1 + for d in 1:N + s[d] = stride + stride *= dims[d] + end + return s +end + +function _localextrema_mask!( + mask::CuArray{Bool,N}, + img::CuArray{T,N}, + window::Dims{N}, + edges::NTuple{N,Bool}, + mode::Int32, +) where {T,N} + + dims_h = collect(Int, size(img)) + strides_h = _compute_strides(size(img)) + halfw_h = [window[d] ÷ 2 for d in 1:N] + clip_h = [edges[d] ? 0 : max(1, halfw_h[d]) for d in 1:N] + + dims_d = CuArray(dims_h) + strides_d = CuArray(strides_h) + halfw_d = CuArray(halfw_h) + clip_d = CuArray(clip_h) + + len = length(img) + threads = 256 + blocks = cld(len, threads) + + @cuda threads=threads blocks=blocks _localextrema_nd!( + mask, img, dims_d, strides_d, halfw_d, clip_d, Int32(N), mode, len + ) + + return mask +end + +# -------------------------------------------------- +# ND kernel with runtime dimension metadata +# -------------------------------------------------- + +function _localextrema_nd!( + mask, + A, + dims, + strides, + halfw, + clip, + nd::Int32, + mode::Int32, + len::Int, +) + idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + if idx > len + return + end + + # Recover coordinates from linear index + # coord[d] is computed on the fly; no tuple creation. + center_ok = true + linear0 = idx - 1 + + # First pass: check whether this center is eligible + for d in 1:nd + coord_d = (linear0 ÷ strides[d]) % dims[d] + 1 + if !(1 + clip[d] <= coord_d <= dims[d] - clip[d]) + center_ok = false + break + end + end + + if !center_ok + @inbounds mask[idx] = false + return + end + + c = @inbounds A[idx] + + # Total number of offsets in the window + total_offsets = 1 + for d in 1:nd + total_offsets *= (2 * halfw[d] + 1) + end + + isext = true + + # Enumerate offsets in mixed radix + for k in 0:(total_offsets - 1) + t = k + neigh_idx = idx + allzero = true + inbounds = true + + for d in 1:nd + width_d = 2 * halfw[d] + 1 + off_d = (t % width_d) - halfw[d] + t ÷= width_d + + allzero &= (off_d == 0) + + coord_d = (linear0 ÷ strides[d]) % dims[d] + 1 + neigh_coord_d = coord_d + off_d + + if !(1 <= neigh_coord_d <= dims[d]) + inbounds = false + break + end + + neigh_idx += off_d * strides[d] + end + + if !allzero && inbounds + v = @inbounds A[neigh_idx] + if !_cmp(mode, c, v) + isext = false + break + end + end + end + + @inbounds mask[idx] = isext + return +end + +end # module \ No newline at end of file diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl index 5789d00e..1b5a7034 100644 --- a/test/cuda/runtests.jl +++ b/test/cuda/runtests.jl @@ -8,6 +8,36 @@ using ImageQualityIndexes using Test using Random +@testset "extrema CUDA" begin + @testset "local extrema" begin + a = zeros(Int, 9, 9) + a[[1:2; 5], 5] .= 1 + ca = cu(a) + + @test findlocalmaxima(ca) == [CartesianIndex((5, 5))] + @test findlocalmaxima(ca; window=(1, 3)) == + [CartesianIndex((1, 5)), CartesianIndex((2, 5)), CartesianIndex((5, 5))] + @test findlocalmaxima(ca; window=(1, 3), edges=false) == + [CartesianIndex((2, 5)), CartesianIndex((5, 5))] + + a = zeros(Int, 9, 9, 9) + a[[1:2; 5], 5, 5] .= 1 + ca = cu(a) + + @test findlocalmaxima(ca) == [CartesianIndex((5, 5, 5))] + @test findlocalmaxima(ca; window=(1, 3, 1)) == + [CartesianIndex((1, 5, 5)), CartesianIndex((2, 5, 5)), CartesianIndex((5, 5, 5))] + @test findlocalmaxima(ca; window=(1, 3, 1), edges=false) == + [CartesianIndex((2, 5, 5)), CartesianIndex((5, 5, 5))] + + a = zeros(Int, 9, 9) + a[[1:2; 5], 5] .= -1 + ca = cu(a) + + @test findlocalminima(ca) == [CartesianIndex((5, 5))] + end +end + CUDA.allowscalar(false) @testset "ImageFiltering" begin From 8ebbd409bb350f9d91dbfb6727af0a88aab7a810 Mon Sep 17 00:00:00 2001 From: hzarei4 Date: Thu, 23 Apr 2026 14:17:47 +0200 Subject: [PATCH 3/3] corrected a small mistake --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 66759375..8ca8886e 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "ImageFiltering" uuid = "6a3955dd-da59-5b1f-98d4-e7296123deb5" -version = "0.7.12" author = ["Tim Holy ", "Jan Weidner "] +version = "0.7.12" [deps] CatIndices = "aafaddc9-749c-510e-ac4f-586e18779b91"