From a0b109d0b0d7a5cc1e4957a9154551e16535fc0b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 19 Mar 2026 19:46:33 +0100 Subject: [PATCH 1/6] Add tiled broadcast via Base.Broadcast integration Implements `ct.Tiled(B) .= A .+ A .* B` syntax that leverages Julia's broadcast fusion machinery and dispatches to cuTile kernels. - `Tiled` wrapper type with `TiledCuArrayStyle` that wins over `CuArrayStyle` and `DefaultArrayStyle` - `materialize!` converts the fused `Broadcasted` tree: CuArrays become TileArrays, style/axes are stripped - Generic 1D/2D kernels recursively evaluate the `Broadcasted` tree on tiles, using `broadcast(bc.f, args...)` for element-wise semantics - Supports arbitrarily nested fused expressions (e.g. `A .+ A .* B`) Type-constructor broadcasts (e.g. `BFloat16.(A)`) are not yet supported due to `Type{T}` fields causing compilation issues. Co-Authored-By: Claude Opus 4.6 (1M context) --- ext/CUDAExt.jl | 121 ++++++++++++++++++++++++++++++++++++++++++++++++- src/cuTile.jl | 5 +- 2 files changed, 124 insertions(+), 2 deletions(-) diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl index 0b80c06a..114cae43 100644 --- a/ext/CUDAExt.jl +++ b/ext/CUDAExt.jl @@ -9,9 +9,12 @@ using CompilerCaching: CacheView, method_instance, results import Core.Compiler as CC -using CUDA: CuModule, CuFunction, cudacall, device, capability +using CUDA: CuArray, CuModule, CuFunction, cudacall, device, capability using CUDA_Compiler_jll +import Base.Broadcast: BroadcastStyle, Broadcasted, DefaultArrayStyle +import CUDA: CuArrayStyle + public launch function run_and_collect(cmd) @@ -255,4 +258,120 @@ Other values pass through unchanged. to_tile_arg(x) = x to_tile_arg(arr::AbstractArray) = TileArray(arr) +#============================================================================= + Tiled Broadcast via Base.Broadcast +=============================================================================# + +""" + Tiled{A <: AbstractArray} + +Wrapper that routes broadcast expressions through cuTile kernels. + + Tiled(B) .= A .+ A + +Uses Julia's `Base.Broadcast` fusion machinery to build a `Broadcasted` tree, +then dispatches to a generic cuTile kernel that evaluates the tree on tiles. +""" +struct _Tiled{A <: AbstractArray} + parent::A +end +Base.parent(t::_Tiled) = t.parent +Base.size(t::_Tiled) = size(parent(t)) +Base.size(t::_Tiled, d) = size(parent(t), d) +Base.axes(t::_Tiled) = axes(parent(t)) +Base.axes(t::_Tiled, d) = axes(parent(t), d) +Base.ndims(::_Tiled{A}) where A = ndims(A) +Base.eltype(::_Tiled{A}) where A = eltype(A) +Base.length(t::_Tiled) = length(parent(t)) +Base.similar(t::_Tiled, args...) = _Tiled(similar(parent(t), args...)) +Base.setindex!(t::_Tiled, v, i...) = setindex!(parent(t), v, i...) + +cuTile.Tiled(arr::AbstractArray) = _Tiled(arr) + +struct TiledCuArrayStyle{N} <: BroadcastStyle end +TiledCuArrayStyle{M}(::Val{N}) where {N,M} = TiledCuArrayStyle{N}() + +BroadcastStyle(::Type{<:_Tiled{<:CuArray{T,N}}}) where {T,N} = TiledCuArrayStyle{N}() + +# TiledCuArrayStyle wins over CuArrayStyle and DefaultArrayStyle +BroadcastStyle(::TiledCuArrayStyle{N}, ::CuArrayStyle{M}) where {N,M} = TiledCuArrayStyle{max(N,M)}() +BroadcastStyle(::TiledCuArrayStyle{N}, ::DefaultArrayStyle{M}) where {N,M} = TiledCuArrayStyle{max(N,M)}() +BroadcastStyle(::TiledCuArrayStyle{N}, ::TiledCuArrayStyle{M}) where {N,M} = TiledCuArrayStyle{max(N,M)}() + +# materialize! dispatch: Tiled(B) .= expr +function Base.Broadcast.materialize!(dest::_Tiled, bc::Broadcasted) + _tiled_broadcast!(parent(dest), bc) + return dest +end + +""" + _to_tiled_bc(bc) + +Walk a Broadcasted tree, converting leaf CuArrays to TileArrays and stripping +style/axes (replacing with nothing). Scalars and other leaves pass through. +""" +_to_tiled_bc(arr::CuArray) = TileArray(arr) +_to_tiled_bc(t::_Tiled) = TileArray(parent(t)) +_to_tiled_bc(x::Number) = x +_to_tiled_bc(x) = x # fallback for other types +function _to_tiled_bc(bc::Broadcasted) + new_args = map(_to_tiled_bc, bc.args) + Broadcasted{Nothing}(bc.f, new_args, nothing) +end + +# The generic broadcast kernel: evaluates the Broadcasted tree on tiles +function _tiled_bc_kernel_1d(dest::TileArray{T, 1}, bc, tile_size) where T + bid = cuTile.bid(1) + result = _eval_bc(bc, bid, tile_size) + result_converted = convert(cuTile.Tile{T}, result) + cuTile.store(dest, bid, result_converted) + return +end + +function _tiled_bc_kernel_2d(dest::TileArray{T, 2}, bc, tile_size) where T + bid_x = cuTile.bid(1) + bid_y = cuTile.bid(2) + result = _eval_bc(bc, (bid_x, bid_y), tile_size) + result_converted = convert(cuTile.Tile{T}, result) + cuTile.store(dest, (bid_x, bid_y), result_converted) + return +end + +# Recursive tree evaluation inside kernel +@inline _eval_bc(arr::TileArray, bid, tile_size) = cuTile.load(arr, bid, tile_size) +@inline _eval_bc(x::Number, bid, tile_size) = x + +@inline function _eval_bc(bc::Broadcasted, bid, tile_size) + args = _eval_bc_args(bc.args, bid, tile_size) + # Use broadcast to get element-wise semantics (not direct call, which + # would dispatch to e.g. matmul for * on tiles) + broadcast(bc.f, args...) +end + +@inline _eval_bc_args(::Tuple{}, bid, tile_size) = () +@inline _eval_bc_args(args::Tuple, bid, tile_size) = + (_eval_bc(args[1], bid, tile_size), _eval_bc_args(Base.tail(args), bid, tile_size)...) + +""" + _tiled_broadcast!(dest, bc; tile_size=64) + +Launch a tiled broadcast kernel for the fused expression `bc` writing to `dest`. +""" +function _tiled_broadcast!(dest::CuArray{T,N}, bc::Broadcasted; tile_size::Int=64) where {T, N} + dest_ta = TileArray(dest) + tiled_bc = _to_tiled_bc(bc) + + if N == 1 + ts = (tile_size,) + grid = (cld(size(dest, 1), tile_size),) + cuTile.launch(_tiled_bc_kernel_1d, grid, dest_ta, tiled_bc, Constant(ts)) + elseif N == 2 + ts = (tile_size, tile_size) + grid = (cld(size(dest, 1), tile_size), cld(size(dest, 2), tile_size)) + cuTile.launch(_tiled_bc_kernel_2d, grid, dest_ta, tiled_bc, Constant(ts)) + else + error("Tiled broadcast not yet supported for $N dimensions") + end +end + end diff --git a/src/cuTile.jl b/src/cuTile.jl index 2e5fe398..4ea4bb24 100644 --- a/src/cuTile.jl +++ b/src/cuTile.jl @@ -38,7 +38,10 @@ include("language/math.jl") include("language/operations.jl") include("language/atomics.jl") -public launch, ByTarget, @compiler_options +public launch, Tiled, ByTarget, @compiler_options launch(args...) = error("Please import CUDA.jl before using `cuTile.launch`.") +# Tiled(arr) is defined in CUDAExt; provide a function stub for the public API +function Tiled end + end # module cuTile From 56abc22d73697d35c72c509b6e1c71e5026d4613 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 19 Mar 2026 22:31:10 +0100 Subject: [PATCH 2/6] Generalize tiled broadcast kernel to N dimensions Replace separate 1D/2D broadcast kernels with a single @generated kernel that handles arbitrary dimensionality, matching the @fuse macro's bid-construction pattern for N>3 grid delinearization. Co-Authored-By: Claude Opus 4.6 (1M context) --- ext/CUDAExt.jl | 60 +++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl index 114cae43..70abf015 100644 --- a/ext/CUDAExt.jl +++ b/ext/CUDAExt.jl @@ -320,21 +320,34 @@ function _to_tiled_bc(bc::Broadcasted) end # The generic broadcast kernel: evaluates the Broadcasted tree on tiles -function _tiled_bc_kernel_1d(dest::TileArray{T, 1}, bc, tile_size) where T - bid = cuTile.bid(1) - result = _eval_bc(bc, bid, tile_size) - result_converted = convert(cuTile.Tile{T}, result) - cuTile.store(dest, bid, result_converted) - return -end +@generated function _tiled_bc_kernel(dest::TileArray{T, N}, bc, tile_size, overflow_grids) where {T, N} + body = Expr[] + bid_vars = [Symbol("bid_$d") for d in 1:N] -function _tiled_bc_kernel_2d(dest::TileArray{T, 2}, bc, tile_size) where T - bid_x = cuTile.bid(1) - bid_y = cuTile.bid(2) - result = _eval_bc(bc, (bid_x, bid_y), tile_size) - result_converted = convert(cuTile.Tile{T}, result) - cuTile.store(dest, (bid_x, bid_y), result_converted) - return + if N <= 3 + for d in 1:N + push!(body, :($(bid_vars[d]) = cuTile.bid($d))) + end + else + push!(body, :($(bid_vars[1]) = cuTile.bid(1))) + push!(body, :($(bid_vars[2]) = cuTile.bid(2))) + push!(body, :(_rem = cuTile.bid(3) - Int32(1))) + for d in 3:N + if d < N + push!(body, :($(bid_vars[d]) = rem(_rem, Int32(overflow_grids[$(d-2)])) + Int32(1))) + push!(body, :(_rem = fld(_rem, Int32(overflow_grids[$(d-2)])))) + else + push!(body, :($(bid_vars[d]) = _rem + Int32(1))) + end + end + end + + idx = N == 1 ? bid_vars[1] : Expr(:tuple, bid_vars...) + push!(body, :(result = _eval_bc(bc, $idx, tile_size))) + push!(body, :(result_converted = convert(cuTile.Tile{$T}, result))) + push!(body, :(cuTile.store(dest, $idx, result_converted))) + push!(body, :(return)) + Expr(:block, body...) end # Recursive tree evaluation inside kernel @@ -361,17 +374,14 @@ function _tiled_broadcast!(dest::CuArray{T,N}, bc::Broadcasted; tile_size::Int=6 dest_ta = TileArray(dest) tiled_bc = _to_tiled_bc(bc) - if N == 1 - ts = (tile_size,) - grid = (cld(size(dest, 1), tile_size),) - cuTile.launch(_tiled_bc_kernel_1d, grid, dest_ta, tiled_bc, Constant(ts)) - elseif N == 2 - ts = (tile_size, tile_size) - grid = (cld(size(dest, 1), tile_size), cld(size(dest, 2), tile_size)) - cuTile.launch(_tiled_bc_kernel_2d, grid, dest_ta, tiled_bc, Constant(ts)) - else - error("Tiled broadcast not yet supported for $N dimensions") - end + ts = ntuple(i -> i <= min(N, 2) ? tile_size : 1, N) + grid = ntuple(i -> cld(size(dest, i), ts[i]), N) + + launch_grid = N <= 3 ? grid : (grid[1], grid[2], prod(grid[i] for i in 3:N)) + overflow = N > 3 ? grid[3:end] : () + + cuTile.launch(_tiled_bc_kernel, launch_grid, dest_ta, tiled_bc, + Constant(ts), Constant(overflow)) end end From ad2c10f16124bae673f66368609586c91f94db58 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 20 Mar 2026 11:22:44 +0100 Subject: [PATCH 3/6] Simplify Tiled and add macro interface. --- ext/CUDAExt.jl | 43 ++++++------------ src/cuTile.jl | 54 ++++++++++++++++++++-- test/execution/broadcast.jl | 89 +++++++++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 33 deletions(-) diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl index 70abf015..b6dc5bd7 100644 --- a/ext/CUDAExt.jl +++ b/ext/CUDAExt.jl @@ -1,7 +1,7 @@ module CUDAExt using cuTile -using cuTile: TileArray, Constant, CuTileResults, +using cuTile: Tiled, TileArray, Constant, CuTileResults, emit_code, sanitize_name, constant_eltype, flatten, resolve_hint, format_sm_arch @@ -12,6 +12,7 @@ import Core.Compiler as CC using CUDA: CuArray, CuModule, CuFunction, cudacall, device, capability using CUDA_Compiler_jll +import Base.Broadcast import Base.Broadcast: BroadcastStyle, Broadcasted, DefaultArrayStyle import CUDA: CuArrayStyle @@ -262,36 +263,10 @@ to_tile_arg(arr::AbstractArray) = TileArray(arr) Tiled Broadcast via Base.Broadcast =============================================================================# -""" - Tiled{A <: AbstractArray} - -Wrapper that routes broadcast expressions through cuTile kernels. - - Tiled(B) .= A .+ A - -Uses Julia's `Base.Broadcast` fusion machinery to build a `Broadcasted` tree, -then dispatches to a generic cuTile kernel that evaluates the tree on tiles. -""" -struct _Tiled{A <: AbstractArray} - parent::A -end -Base.parent(t::_Tiled) = t.parent -Base.size(t::_Tiled) = size(parent(t)) -Base.size(t::_Tiled, d) = size(parent(t), d) -Base.axes(t::_Tiled) = axes(parent(t)) -Base.axes(t::_Tiled, d) = axes(parent(t), d) -Base.ndims(::_Tiled{A}) where A = ndims(A) -Base.eltype(::_Tiled{A}) where A = eltype(A) -Base.length(t::_Tiled) = length(parent(t)) -Base.similar(t::_Tiled, args...) = _Tiled(similar(parent(t), args...)) -Base.setindex!(t::_Tiled, v, i...) = setindex!(parent(t), v, i...) - -cuTile.Tiled(arr::AbstractArray) = _Tiled(arr) - struct TiledCuArrayStyle{N} <: BroadcastStyle end TiledCuArrayStyle{M}(::Val{N}) where {N,M} = TiledCuArrayStyle{N}() -BroadcastStyle(::Type{<:_Tiled{<:CuArray{T,N}}}) where {T,N} = TiledCuArrayStyle{N}() +BroadcastStyle(::Type{<:Tiled{<:CuArray{T,N}}}) where {T,N} = TiledCuArrayStyle{N}() # TiledCuArrayStyle wins over CuArrayStyle and DefaultArrayStyle BroadcastStyle(::TiledCuArrayStyle{N}, ::CuArrayStyle{M}) where {N,M} = TiledCuArrayStyle{max(N,M)}() @@ -299,11 +274,19 @@ BroadcastStyle(::TiledCuArrayStyle{N}, ::DefaultArrayStyle{M}) where {N,M} = Til BroadcastStyle(::TiledCuArrayStyle{N}, ::TiledCuArrayStyle{M}) where {N,M} = TiledCuArrayStyle{max(N,M)}() # materialize! dispatch: Tiled(B) .= expr -function Base.Broadcast.materialize!(dest::_Tiled, bc::Broadcasted) +function Base.Broadcast.materialize!(dest::Tiled, bc::Broadcasted) _tiled_broadcast!(parent(dest), bc) return dest end +# copy dispatch: C = Tiled(A) .+ B (allocating form) +function Base.copy(bc::Broadcasted{TiledCuArrayStyle{N}}) where N + ElType = Broadcast.combine_eltypes(bc.f, bc.args) + dest = similar(CuArray{ElType}, axes(bc)) + _tiled_broadcast!(dest, bc) + return dest +end + """ _to_tiled_bc(bc) @@ -311,7 +294,7 @@ Walk a Broadcasted tree, converting leaf CuArrays to TileArrays and stripping style/axes (replacing with nothing). Scalars and other leaves pass through. """ _to_tiled_bc(arr::CuArray) = TileArray(arr) -_to_tiled_bc(t::_Tiled) = TileArray(parent(t)) +_to_tiled_bc(t::Tiled) = TileArray(parent(t)) _to_tiled_bc(x::Number) = x _to_tiled_bc(x) = x # fallback for other types function _to_tiled_bc(bc::Broadcasted) diff --git a/src/cuTile.jl b/src/cuTile.jl index 4ea4bb24..efe914b0 100644 --- a/src/cuTile.jl +++ b/src/cuTile.jl @@ -38,10 +38,58 @@ include("language/math.jl") include("language/operations.jl") include("language/atomics.jl") -public launch, Tiled, ByTarget, @compiler_options +public launch, Tiled, ByTarget, @compiler_options, var"@." launch(args...) = error("Please import CUDA.jl before using `cuTile.launch`.") -# Tiled(arr) is defined in CUDAExt; provide a function stub for the public API -function Tiled end +""" + Tiled(x) + +Wrapper that routes broadcast expressions through cuTile kernels. + + Tiled(B) .= A .+ A + +Uses Julia's `Base.Broadcast` fusion machinery to build a `Broadcasted` tree, +then dispatches to a generic cuTile kernel that evaluates the tree on tiles. +""" +struct Tiled{A <: AbstractArray} + parent::A +end +Tiled(x) = x # passthrough for non-arrays (Numbers, etc.) +Base.parent(t::Tiled) = t.parent +Base.axes(t::Tiled) = axes(parent(t)) +Base.size(t::Tiled) = size(parent(t)) +Base.ndims(::Tiled{A}) where A = ndims(A) +Base.eltype(::Tiled{A}) where A = eltype(A) +Base.Broadcast.broadcastable(t::Tiled) = t + +# Walk dotted AST, wrap value-position leaves in Tiled() +_wrap_tiled(x) = x # literals pass through +_wrap_tiled(s::Symbol) = :($Tiled($s)) +function _wrap_tiled(ex::Expr) + if ex.head === :.= + Expr(:.=, _wrap_tiled(ex.args[1]), _wrap_tiled(ex.args[2])) + elseif ex.head === :. && length(ex.args) == 2 && + ex.args[2] isa Expr && ex.args[2].head === :tuple + # f.(args...) — wrap args, NOT function position + new_args = map(_wrap_tiled, ex.args[2].args) + Expr(:., ex.args[1], Expr(:tuple, new_args...)) + else + Expr(ex.head, map(_wrap_tiled, ex.args)...) + end +end + +""" + @. expr + +Like `Base.@.` but wraps every value-position leaf in `Tiled()`, routing +the broadcast through cuTile kernels. + + using cuTile; const ct = cuTile + ct.@. C = A + sin(B) + # equivalent to: Tiled(C) .= Tiled(A) .+ sin.(Tiled(B)) +""" +macro var"."(ex) + esc(_wrap_tiled(Base.Broadcast.__dot__(ex))) +end end # module cuTile diff --git a/test/execution/broadcast.jl b/test/execution/broadcast.jl index 00055ae7..c49f79ea 100644 --- a/test/execution/broadcast.jl +++ b/test/execution/broadcast.jl @@ -839,3 +839,92 @@ end # type argument broadcasting @test Array(c) ≈ max.(Array(a), Array(b)) end end + +@testset "Tiled broadcast" begin + using CUDA + + @testset "1D element-wise" begin + n = 1024 + A = CUDA.rand(Float32, n) + B = CUDA.rand(Float32, n) + C = CUDA.zeros(Float32, n) + ct.Tiled(C) .= ct.Tiled(A) .+ ct.Tiled(B) + @test Array(C) ≈ Array(A) .+ Array(B) + end + + @testset "fused multi-op" begin + n = 1024 + A = CUDA.rand(Float32, n) .+ 0.1f0 + C = CUDA.zeros(Float32, n) + ct.Tiled(C) .= ct.Tiled(A) .+ ct.Tiled(A) .* sin.(ct.Tiled(A)) + @test Array(C) ≈ Array(A) .+ Array(A) .* sin.(Array(A)) rtol=1e-5 + end + + @testset "scalar broadcast" begin + n = 1024 + A = CUDA.rand(Float32, n) + C = CUDA.zeros(Float32, n) + ct.Tiled(C) .= ct.Tiled(A) .+ 1.0f0 + @test Array(C) ≈ Array(A) .+ 1.0f0 + end + + @testset "2D element-wise" begin + m, n = 128, 256 + A = CUDA.rand(Float32, m, n) + B = CUDA.rand(Float32, m, n) + C = CUDA.zeros(Float32, m, n) + ct.Tiled(C) .= ct.Tiled(A) .+ ct.Tiled(B) + @test Array(C) ≈ Array(A) .+ Array(B) + end + + @testset "3D element-wise" begin + A = CUDA.rand(Float32, 64, 64, 4) + B = CUDA.rand(Float32, 64, 64, 4) + C = CUDA.zeros(Float32, 64, 64, 4) + ct.Tiled(C) .= ct.Tiled(A) .+ ct.Tiled(B) + @test Array(C) ≈ Array(A) .+ Array(B) + end + + @testset "ct.@. in-place" begin + n = 1024 + A = CUDA.rand(Float32, n) + B = CUDA.rand(Float32, n) + C = CUDA.zeros(Float32, n) + ct.@. C = A + B + @test Array(C) ≈ Array(A) .+ Array(B) + end + + @testset "ct.@. with function" begin + n = 1024 + A = CUDA.rand(Float32, n) .+ 0.1f0 + C = CUDA.zeros(Float32, n) + ct.@. C = A + sin(A) + @test Array(C) ≈ Array(A) .+ sin.(Array(A)) rtol=1e-5 + end + + @testset "ct.@. with scalar" begin + n = 1024 + A = CUDA.rand(Float32, n) + C = CUDA.zeros(Float32, n) + ct.@. C = A + 2.0f0 + @test Array(C) ≈ Array(A) .+ 2.0f0 + end + + @testset "allocating copy" begin + n = 1024 + A = CUDA.rand(Float32, n) + B = CUDA.rand(Float32, n) + C = ct.Tiled(A) .+ ct.Tiled(B) + @test C isa CuArray + @test Array(C) ≈ Array(A) .+ Array(B) + end + + @testset "allocating ct.@." begin + n = 1024 + A = CUDA.rand(Float32, n) + B = CUDA.rand(Float32, n) + C = ct.@. A + B + @test C isa CuArray + @test Array(C) ≈ Array(A) .+ Array(B) + end +end From 12aa00d4a20558c71bd105d51d066c6f351c6d4f Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 20 Mar 2026 12:09:58 +0100 Subject: [PATCH 4/6] Use budget-based tile sizing for tiled broadcast MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the hardcoded 64×64 tile sizes with a greedy budget-based approach (4096 elements) that skips singleton dimensions and caps each tile dim at the array size. This fixes tiled broadcast for arrays with leading singleton or small dimensions (e.g. (1, 1024), (4, 1024)). Co-Authored-By: Claude Opus 4.6 (1M context) --- ext/CUDAExt.jl | 26 +++++++++++++++++++++++--- test/execution/broadcast.jl | 21 +++++++++++++++++++++ 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl index b6dc5bd7..f6441148 100644 --- a/ext/CUDAExt.jl +++ b/ext/CUDAExt.jl @@ -349,15 +349,35 @@ end (_eval_bc(args[1], bid, tile_size), _eval_bc_args(Base.tail(args), bid, tile_size)...) """ - _tiled_broadcast!(dest, bc; tile_size=64) + _compute_tile_sizes(dest_size; budget=4096) + +Distribute a total element budget greedily across dimensions, skipping singletons. +Each tile dimension is a power of 2, capped by the array size in that dimension. +""" +function _compute_tile_sizes(dest_size::NTuple{N,Int}; budget::Int=4096) where N + ts = ones(Int, N) + remaining = budget + for i in 1:N + s = dest_size[i] + s == 1 && continue + t = prevpow(2, min(remaining, s)) + ts[i] = t + remaining = remaining ÷ t + remaining < 2 && break + end + return NTuple{N,Int}(ts) +end + +""" + _tiled_broadcast!(dest, bc) Launch a tiled broadcast kernel for the fused expression `bc` writing to `dest`. """ -function _tiled_broadcast!(dest::CuArray{T,N}, bc::Broadcasted; tile_size::Int=64) where {T, N} +function _tiled_broadcast!(dest::CuArray{T,N}, bc::Broadcasted) where {T, N} dest_ta = TileArray(dest) tiled_bc = _to_tiled_bc(bc) - ts = ntuple(i -> i <= min(N, 2) ? tile_size : 1, N) + ts = _compute_tile_sizes(size(dest)) grid = ntuple(i -> cld(size(dest, i), ts[i]), N) launch_grid = N <= 3 ? grid : (grid[1], grid[2], prod(grid[i] for i in 3:N)) diff --git a/test/execution/broadcast.jl b/test/execution/broadcast.jl index c49f79ea..d23da4f4 100644 --- a/test/execution/broadcast.jl +++ b/test/execution/broadcast.jl @@ -927,4 +927,25 @@ end @test C isa CuArray @test Array(C) ≈ Array(A) .+ Array(B) end + + @testset "leading singleton dim" begin + A = CUDA.rand(Float32, 1, 1024) + B = similar(A) + ct.Tiled(B) .= ct.Tiled(A) .+ 1.0f0 + @test Array(B) ≈ Array(A) .+ 1.0f0 + end + + @testset "double leading singleton" begin + A = CUDA.rand(Float32, 1, 1, 512) + B = similar(A) + ct.Tiled(B) .= ct.Tiled(A) .* 2.0f0 + @test Array(B) ≈ Array(A) .* 2.0f0 + end + + @testset "small leading dim" begin + A = CUDA.rand(Float32, 4, 1024) + B = similar(A) + ct.Tiled(B) .= ct.Tiled(A) .+ ct.Tiled(A) + @test Array(B) ≈ 2 .* Array(A) + end end From 0f5815d9584d95a1c7b5c87a056bc2ea4db66d15 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 20 Mar 2026 15:06:24 +0100 Subject: [PATCH 5/6] Fixes from review. --- src/cuTile.jl | 4 ++-- test/execution/broadcast.jl | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/cuTile.jl b/src/cuTile.jl index efe914b0..76105277 100644 --- a/src/cuTile.jl +++ b/src/cuTile.jl @@ -38,7 +38,7 @@ include("language/math.jl") include("language/operations.jl") include("language/atomics.jl") -public launch, Tiled, ByTarget, @compiler_options, var"@." +public launch, Tiled, ByTarget, @compiler_options, @. launch(args...) = error("Please import CUDA.jl before using `cuTile.launch`.") """ @@ -88,7 +88,7 @@ the broadcast through cuTile kernels. ct.@. C = A + sin(B) # equivalent to: Tiled(C) .= Tiled(A) .+ sin.(Tiled(B)) """ -macro var"."(ex) +macro __dot__(ex) esc(_wrap_tiled(Base.Broadcast.__dot__(ex))) end diff --git a/test/execution/broadcast.jl b/test/execution/broadcast.jl index d23da4f4..3e2316d7 100644 --- a/test/execution/broadcast.jl +++ b/test/execution/broadcast.jl @@ -885,6 +885,12 @@ end @test Array(C) ≈ Array(A) .+ Array(B) end + @testset "ct.@. expands to Tiled" begin + ex = @macroexpand ct.@. C = A + B + # The macro should produce Tiled() wrapping, not plain dotted calls + @test occursin("Tiled", string(ex)) + end + @testset "ct.@. in-place" begin n = 1024 A = CUDA.rand(Float32, n) From 5d7dfdc9eab9084ed9676f78098dd41b11303d5a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 20 Mar 2026 16:50:43 +0100 Subject: [PATCH 6/6] Reorganize. --- README.md | 33 +++- ext/CUDAExt.jl | 134 +------------- src/broadcast.jl | 204 +++++++++++++++++++++ src/cuTile.jl | 54 +----- test/{execution => device}/atomics.jl | 0 test/{execution => device}/broadcast.jl | 116 ------------ test/{execution => device}/control_flow.jl | 0 test/{execution => device}/core.jl | 0 test/{execution => device}/hints.jl | 0 test/{execution => device}/integration.jl | 0 test/{execution => device}/math.jl | 0 test/{execution => device}/reductions.jl | 0 test/{execution => device}/tile.jl | 0 test/{execution => device}/types.jl | 0 test/host/broadcast.jl | 115 ++++++++++++ test/runtests.jl | 2 +- 16 files changed, 358 insertions(+), 300 deletions(-) create mode 100644 src/broadcast.jl rename test/{execution => device}/atomics.jl (100%) rename test/{execution => device}/broadcast.jl (88%) rename test/{execution => device}/control_flow.jl (100%) rename test/{execution => device}/core.jl (100%) rename test/{execution => device}/hints.jl (100%) rename test/{execution => device}/integration.jl (100%) rename test/{execution => device}/math.jl (100%) rename test/{execution => device}/reductions.jl (100%) rename test/{execution => device}/tile.jl (100%) rename test/{execution => device}/types.jl (100%) create mode 100644 test/host/broadcast.jl diff --git a/README.md b/README.md index aad1066c..5fb8705a 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ vector_size = 2^20 tile_size = 16 blocks = cld(vector_size, tile_size) -grid = (blocks, 1, 1) +grid = (blocks, 1, 1) a, b = CUDA.rand(Float32, vector_size), CUDA.rand(Float32, vector_size) c = CUDA.zeros(Float32, vector_size) @@ -232,7 +232,6 @@ uses standard Julia syntax and is overlaid on `Base`. cuTile.jl follows Julia conventions, which differ from the Python API in several ways: - ### Kernel definition syntax Kernels don't need a decorator, but do have to return `nothing`: @@ -511,6 +510,36 @@ ct.store(arr, (i, j), t) ``` +## Host-level operations + +cuTile.jl also provides a limited set of host-level APIs to use cuTile without +writing custom kernels. For example, for element-wise operations on `CuArray`s, +cuTile can automatically generate and launch a fused kernel using Julia's +broadcast machinery: + +```julia +using CUDA +import cuTile as ct + +A = CUDA.rand(Float32, 1024) +B = CUDA.rand(Float32, 1024) +C = CUDA.zeros(Float32, 1024) + +# Wrap arrays in Tiled() to route through cuTile +ct.Tiled(C) .= ct.Tiled(A) .+ ct.Tiled(B) + +# Or use the @. macro for convenience +ct.@. C = A + sin(B) + +# Allocating form (returns a new CuArray) +D = ct.@. A + B +``` + +The entire broadcast expression is fused into a single cuTile kernel. Tile sizes +are automatically chosen based on array dimensions (power-of-2, budget-based). +Works with 1D through N-dimensional arrays. + + ## Acknowledgments cuTile.jl is inspired by [cuTile-Python](https://github.com/NVIDIA/cutile-python/), diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl index f6441148..6814b878 100644 --- a/ext/CUDAExt.jl +++ b/ext/CUDAExt.jl @@ -1,7 +1,7 @@ module CUDAExt using cuTile -using cuTile: Tiled, TileArray, Constant, CuTileResults, +using cuTile: TileArray, Constant, CuTileResults, emit_code, sanitize_name, constant_eltype, flatten, resolve_hint, format_sm_arch @@ -12,8 +12,7 @@ import Core.Compiler as CC using CUDA: CuArray, CuModule, CuFunction, cudacall, device, capability using CUDA_Compiler_jll -import Base.Broadcast -import Base.Broadcast: BroadcastStyle, Broadcasted, DefaultArrayStyle +import Base.Broadcast: BroadcastStyle import CUDA: CuArrayStyle public launch @@ -259,132 +258,7 @@ Other values pass through unchanged. to_tile_arg(x) = x to_tile_arg(arr::AbstractArray) = TileArray(arr) -#============================================================================= - Tiled Broadcast via Base.Broadcast -=============================================================================# - -struct TiledCuArrayStyle{N} <: BroadcastStyle end -TiledCuArrayStyle{M}(::Val{N}) where {N,M} = TiledCuArrayStyle{N}() - -BroadcastStyle(::Type{<:Tiled{<:CuArray{T,N}}}) where {T,N} = TiledCuArrayStyle{N}() - -# TiledCuArrayStyle wins over CuArrayStyle and DefaultArrayStyle -BroadcastStyle(::TiledCuArrayStyle{N}, ::CuArrayStyle{M}) where {N,M} = TiledCuArrayStyle{max(N,M)}() -BroadcastStyle(::TiledCuArrayStyle{N}, ::DefaultArrayStyle{M}) where {N,M} = TiledCuArrayStyle{max(N,M)}() -BroadcastStyle(::TiledCuArrayStyle{N}, ::TiledCuArrayStyle{M}) where {N,M} = TiledCuArrayStyle{max(N,M)}() - -# materialize! dispatch: Tiled(B) .= expr -function Base.Broadcast.materialize!(dest::Tiled, bc::Broadcasted) - _tiled_broadcast!(parent(dest), bc) - return dest -end - -# copy dispatch: C = Tiled(A) .+ B (allocating form) -function Base.copy(bc::Broadcasted{TiledCuArrayStyle{N}}) where N - ElType = Broadcast.combine_eltypes(bc.f, bc.args) - dest = similar(CuArray{ElType}, axes(bc)) - _tiled_broadcast!(dest, bc) - return dest -end - -""" - _to_tiled_bc(bc) - -Walk a Broadcasted tree, converting leaf CuArrays to TileArrays and stripping -style/axes (replacing with nothing). Scalars and other leaves pass through. -""" -_to_tiled_bc(arr::CuArray) = TileArray(arr) -_to_tiled_bc(t::Tiled) = TileArray(parent(t)) -_to_tiled_bc(x::Number) = x -_to_tiled_bc(x) = x # fallback for other types -function _to_tiled_bc(bc::Broadcasted) - new_args = map(_to_tiled_bc, bc.args) - Broadcasted{Nothing}(bc.f, new_args, nothing) -end - -# The generic broadcast kernel: evaluates the Broadcasted tree on tiles -@generated function _tiled_bc_kernel(dest::TileArray{T, N}, bc, tile_size, overflow_grids) where {T, N} - body = Expr[] - bid_vars = [Symbol("bid_$d") for d in 1:N] - - if N <= 3 - for d in 1:N - push!(body, :($(bid_vars[d]) = cuTile.bid($d))) - end - else - push!(body, :($(bid_vars[1]) = cuTile.bid(1))) - push!(body, :($(bid_vars[2]) = cuTile.bid(2))) - push!(body, :(_rem = cuTile.bid(3) - Int32(1))) - for d in 3:N - if d < N - push!(body, :($(bid_vars[d]) = rem(_rem, Int32(overflow_grids[$(d-2)])) + Int32(1))) - push!(body, :(_rem = fld(_rem, Int32(overflow_grids[$(d-2)])))) - else - push!(body, :($(bid_vars[d]) = _rem + Int32(1))) - end - end - end - - idx = N == 1 ? bid_vars[1] : Expr(:tuple, bid_vars...) - push!(body, :(result = _eval_bc(bc, $idx, tile_size))) - push!(body, :(result_converted = convert(cuTile.Tile{$T}, result))) - push!(body, :(cuTile.store(dest, $idx, result_converted))) - push!(body, :(return)) - Expr(:block, body...) -end - -# Recursive tree evaluation inside kernel -@inline _eval_bc(arr::TileArray, bid, tile_size) = cuTile.load(arr, bid, tile_size) -@inline _eval_bc(x::Number, bid, tile_size) = x - -@inline function _eval_bc(bc::Broadcasted, bid, tile_size) - args = _eval_bc_args(bc.args, bid, tile_size) - # Use broadcast to get element-wise semantics (not direct call, which - # would dispatch to e.g. matmul for * on tiles) - broadcast(bc.f, args...) -end - -@inline _eval_bc_args(::Tuple{}, bid, tile_size) = () -@inline _eval_bc_args(args::Tuple, bid, tile_size) = - (_eval_bc(args[1], bid, tile_size), _eval_bc_args(Base.tail(args), bid, tile_size)...) - -""" - _compute_tile_sizes(dest_size; budget=4096) - -Distribute a total element budget greedily across dimensions, skipping singletons. -Each tile dimension is a power of 2, capped by the array size in that dimension. -""" -function _compute_tile_sizes(dest_size::NTuple{N,Int}; budget::Int=4096) where N - ts = ones(Int, N) - remaining = budget - for i in 1:N - s = dest_size[i] - s == 1 && continue - t = prevpow(2, min(remaining, s)) - ts[i] = t - remaining = remaining ÷ t - remaining < 2 && break - end - return NTuple{N,Int}(ts) -end - -""" - _tiled_broadcast!(dest, bc) - -Launch a tiled broadcast kernel for the fused expression `bc` writing to `dest`. -""" -function _tiled_broadcast!(dest::CuArray{T,N}, bc::Broadcasted) where {T, N} - dest_ta = TileArray(dest) - tiled_bc = _to_tiled_bc(bc) - - ts = _compute_tile_sizes(size(dest)) - grid = ntuple(i -> cld(size(dest, i), ts[i]), N) - - launch_grid = N <= 3 ? grid : (grid[1], grid[2], prod(grid[i] for i in 3:N)) - overflow = N > 3 ? grid[3:end] : () - - cuTile.launch(_tiled_bc_kernel, launch_grid, dest_ta, tiled_bc, - Constant(ts), Constant(overflow)) -end +# Tiled Broadcast — TiledStyle wins over CuArrayStyle +BroadcastStyle(::cuTile.TiledStyle{N}, ::CuArrayStyle{M}) where {N,M} = cuTile.TiledStyle{max(N,M)}() end diff --git a/src/broadcast.jl b/src/broadcast.jl new file mode 100644 index 00000000..12c84e8a --- /dev/null +++ b/src/broadcast.jl @@ -0,0 +1,204 @@ +import Base.Broadcast: BroadcastStyle, Broadcasted + +#============================================================================= + Tiled wrapper — routes broadcast expressions through cuTile kernels +=============================================================================# + +""" + Tiled(x) + +Wrapper that routes broadcast expressions through cuTile kernels. + + Tiled(B) .= A .+ A + +Uses Julia's `Base.Broadcast` fusion machinery to build a `Broadcasted` tree, +then dispatches to a generic cuTile kernel that evaluates the tree on tiles. +""" +struct Tiled{A <: AbstractArray} + parent::A +end +Tiled(x) = x # passthrough for non-arrays (Numbers, etc.) +Base.parent(t::Tiled) = t.parent +Base.axes(t::Tiled) = axes(parent(t)) +Base.size(t::Tiled) = size(parent(t)) +Base.ndims(::Tiled{A}) where A = ndims(A) +Base.eltype(::Tiled{A}) where A = eltype(A) +Base.Broadcast.broadcastable(t::Tiled) = t + +# Walk dotted AST, wrap value-position leaves in Tiled() +_wrap_tiled(x) = x # literals pass through +_wrap_tiled(s::Symbol) = :($Tiled($s)) +function _wrap_tiled(ex::Expr) + if ex.head === :.= + Expr(:.=, _wrap_tiled(ex.args[1]), _wrap_tiled(ex.args[2])) + elseif ex.head === :. && length(ex.args) == 2 && + ex.args[2] isa Expr && ex.args[2].head === :tuple + # f.(args...) — wrap args, NOT function position + new_args = map(_wrap_tiled, ex.args[2].args) + Expr(:., ex.args[1], Expr(:tuple, new_args...)) + else + Expr(ex.head, map(_wrap_tiled, ex.args)...) + end +end + +""" + @. expr + +Like `Base.@.` but wraps every value-position leaf in `Tiled()`, routing +the broadcast through cuTile kernels. + + using cuTile; const ct = cuTile + ct.@. C = A + sin(B) + # equivalent to: Tiled(C) .= Tiled(A) .+ sin.(Tiled(B)) +""" +macro __dot__(ex) + esc(_wrap_tiled(Base.Broadcast.__dot__(ex))) +end + +#============================================================================= + TiledStyle — routes broadcast through cuTile kernels +=============================================================================# + +struct TiledStyle{N} <: BroadcastStyle end +TiledStyle{M}(::Val{N}) where {N,M} = TiledStyle{N}() + +BroadcastStyle(::Type{<:Tiled{A}}) where A = TiledStyle{ndims(A)}() + +# TiledStyle wins over DefaultArrayStyle +BroadcastStyle(::TiledStyle{N}, ::Base.Broadcast.DefaultArrayStyle{M}) where {N,M} = TiledStyle{max(N,M)}() +BroadcastStyle(::TiledStyle{N}, ::TiledStyle{M}) where {N,M} = TiledStyle{max(N,M)}() + +#============================================================================= + materialize! and copy — dispatch to _tiled_broadcast! +=============================================================================# + +function Base.Broadcast.materialize!(dest::Tiled, bc::Broadcasted) + _tiled_broadcast!(parent(dest), bc) + return dest +end + +function Base.copy(bc::Broadcasted{TiledStyle{N}}) where N + arr = @something _find_tiled_array(bc) error("tiled broadcast requires at least one Tiled() argument") + ElType = Base.Broadcast.combine_eltypes(bc.f, bc.args) + dest = similar(arr, ElType, axes(bc)) + _tiled_broadcast!(dest, bc) + return dest +end + +"""Find the first underlying array from a Tiled leaf in a Broadcasted tree.""" +_find_tiled_array(t::Tiled) = parent(t) +_find_tiled_array(x) = nothing +function _find_tiled_array(bc::Broadcasted) + for arg in bc.args + arr = _find_tiled_array(arg) + arr !== nothing && return arr + end + return nothing +end + +#============================================================================= + _tiled_broadcast! — generic AbstractArray implementation +=============================================================================# + +function _tiled_broadcast!(dest::AbstractArray{T,N}, bc::Broadcasted) where {T, N} + dest_ta = TileArray(dest) + tiled_bc = _to_tiled_bc(bc) + + ts = _compute_tile_sizes(size(dest)) + grid = ntuple(i -> cld(size(dest, i), ts[i]), N) + + launch_grid = N <= 3 ? grid : (grid[1], grid[2], prod(grid[i] for i in 3:N)) + overflow = N > 3 ? grid[3:end] : () + + launch(_tiled_bc_kernel, launch_grid, dest_ta, tiled_bc, + Constant(ts), Constant(overflow)) +end + +#============================================================================= + Generic tree walk — convert leaves to TileArrays +=============================================================================# + +_to_tiled_bc(t::Tiled) = TileArray(parent(t)) +_to_tiled_bc(arr::AbstractArray) = TileArray(arr) +_to_tiled_bc(x::Number) = x +_to_tiled_bc(x) = x # fallback for other types +function _to_tiled_bc(bc::Broadcasted) + new_args = map(_to_tiled_bc, bc.args) + Broadcasted{Nothing}(bc.f, new_args, nothing) +end + +#============================================================================= + Broadcast kernel — evaluates Broadcasted tree on tiles +=============================================================================# + +@generated function _tiled_bc_kernel(dest::TileArray{T, N}, bc, tile_size, overflow_grids) where {T, N} + body = Expr[] + bid_vars = [Symbol("bid_$d") for d in 1:N] + + if N <= 3 + for d in 1:N + push!(body, :($(bid_vars[d]) = cuTile.bid($d))) + end + else + push!(body, :($(bid_vars[1]) = cuTile.bid(1))) + push!(body, :($(bid_vars[2]) = cuTile.bid(2))) + push!(body, :(_rem = cuTile.bid(3) - Int32(1))) + for d in 3:N + if d < N + push!(body, :($(bid_vars[d]) = rem(_rem, Int32(overflow_grids[$(d-2)])) + Int32(1))) + push!(body, :(_rem = fld(_rem, Int32(overflow_grids[$(d-2)])))) + else + push!(body, :($(bid_vars[d]) = _rem + Int32(1))) + end + end + end + + idx = N == 1 ? bid_vars[1] : Expr(:tuple, bid_vars...) + push!(body, :(result = _eval_bc(bc, $idx, tile_size))) + push!(body, :(result_converted = convert(cuTile.Tile{$T}, result))) + push!(body, :(cuTile.store(dest, $idx, result_converted))) + push!(body, :(return)) + Expr(:block, body...) +end + +#============================================================================= + Recursive tree evaluation inside kernel +=============================================================================# + +@inline _eval_bc(arr::TileArray, bid, tile_size) = cuTile.load(arr, bid, tile_size) +@inline _eval_bc(x::Number, bid, tile_size) = x + +@inline function _eval_bc(bc::Broadcasted, bid, tile_size) + args = _eval_bc_args(bc.args, bid, tile_size) + # Use broadcast to get element-wise semantics (not direct call, which + # would dispatch to e.g. matmul for * on tiles) + broadcast(bc.f, args...) +end + +@inline _eval_bc_args(::Tuple{}, bid, tile_size) = () +@inline _eval_bc_args(args::Tuple, bid, tile_size) = + (_eval_bc(args[1], bid, tile_size), _eval_bc_args(Base.tail(args), bid, tile_size)...) + +#============================================================================= + Tile sizing +=============================================================================# + +""" + _compute_tile_sizes(dest_size; budget=4096) + +Distribute a total element budget greedily across dimensions, skipping singletons. +Each tile dimension is a power of 2, capped by the array size in that dimension. +""" +function _compute_tile_sizes(dest_size::NTuple{N,Int}; budget::Int=4096) where N + ts = ones(Int, N) + remaining = budget + for i in 1:N + s = dest_size[i] + s == 1 && continue + t = prevpow(2, min(remaining, s)) + ts[i] = t + remaining = remaining ÷ t + remaining < 2 && break + end + return NTuple{N,Int}(ts) +end diff --git a/src/cuTile.jl b/src/cuTile.jl index 76105277..e2d75cf3 100644 --- a/src/cuTile.jl +++ b/src/cuTile.jl @@ -38,58 +38,10 @@ include("language/math.jl") include("language/operations.jl") include("language/atomics.jl") +# Host-level abstractions +include("broadcast.jl") + public launch, Tiled, ByTarget, @compiler_options, @. launch(args...) = error("Please import CUDA.jl before using `cuTile.launch`.") -""" - Tiled(x) - -Wrapper that routes broadcast expressions through cuTile kernels. - - Tiled(B) .= A .+ A - -Uses Julia's `Base.Broadcast` fusion machinery to build a `Broadcasted` tree, -then dispatches to a generic cuTile kernel that evaluates the tree on tiles. -""" -struct Tiled{A <: AbstractArray} - parent::A -end -Tiled(x) = x # passthrough for non-arrays (Numbers, etc.) -Base.parent(t::Tiled) = t.parent -Base.axes(t::Tiled) = axes(parent(t)) -Base.size(t::Tiled) = size(parent(t)) -Base.ndims(::Tiled{A}) where A = ndims(A) -Base.eltype(::Tiled{A}) where A = eltype(A) -Base.Broadcast.broadcastable(t::Tiled) = t - -# Walk dotted AST, wrap value-position leaves in Tiled() -_wrap_tiled(x) = x # literals pass through -_wrap_tiled(s::Symbol) = :($Tiled($s)) -function _wrap_tiled(ex::Expr) - if ex.head === :.= - Expr(:.=, _wrap_tiled(ex.args[1]), _wrap_tiled(ex.args[2])) - elseif ex.head === :. && length(ex.args) == 2 && - ex.args[2] isa Expr && ex.args[2].head === :tuple - # f.(args...) — wrap args, NOT function position - new_args = map(_wrap_tiled, ex.args[2].args) - Expr(:., ex.args[1], Expr(:tuple, new_args...)) - else - Expr(ex.head, map(_wrap_tiled, ex.args)...) - end -end - -""" - @. expr - -Like `Base.@.` but wraps every value-position leaf in `Tiled()`, routing -the broadcast through cuTile kernels. - - using cuTile; const ct = cuTile - ct.@. C = A + sin(B) - # equivalent to: Tiled(C) .= Tiled(A) .+ sin.(Tiled(B)) -""" -macro __dot__(ex) - esc(_wrap_tiled(Base.Broadcast.__dot__(ex))) -end - end # module cuTile diff --git a/test/execution/atomics.jl b/test/device/atomics.jl similarity index 100% rename from test/execution/atomics.jl rename to test/device/atomics.jl diff --git a/test/execution/broadcast.jl b/test/device/broadcast.jl similarity index 88% rename from test/execution/broadcast.jl rename to test/device/broadcast.jl index 3e2316d7..00055ae7 100644 --- a/test/execution/broadcast.jl +++ b/test/device/broadcast.jl @@ -839,119 +839,3 @@ end # type argument broadcasting @test Array(c) ≈ max.(Array(a), Array(b)) end end - -@testset "Tiled broadcast" begin - using CUDA - - @testset "1D element-wise" begin - n = 1024 - A = CUDA.rand(Float32, n) - B = CUDA.rand(Float32, n) - C = CUDA.zeros(Float32, n) - ct.Tiled(C) .= ct.Tiled(A) .+ ct.Tiled(B) - @test Array(C) ≈ Array(A) .+ Array(B) - end - - @testset "fused multi-op" begin - n = 1024 - A = CUDA.rand(Float32, n) .+ 0.1f0 - C = CUDA.zeros(Float32, n) - ct.Tiled(C) .= ct.Tiled(A) .+ ct.Tiled(A) .* sin.(ct.Tiled(A)) - @test Array(C) ≈ Array(A) .+ Array(A) .* sin.(Array(A)) rtol=1e-5 - end - - @testset "scalar broadcast" begin - n = 1024 - A = CUDA.rand(Float32, n) - C = CUDA.zeros(Float32, n) - ct.Tiled(C) .= ct.Tiled(A) .+ 1.0f0 - @test Array(C) ≈ Array(A) .+ 1.0f0 - end - - @testset "2D element-wise" begin - m, n = 128, 256 - A = CUDA.rand(Float32, m, n) - B = CUDA.rand(Float32, m, n) - C = CUDA.zeros(Float32, m, n) - ct.Tiled(C) .= ct.Tiled(A) .+ ct.Tiled(B) - @test Array(C) ≈ Array(A) .+ Array(B) - end - - @testset "3D element-wise" begin - A = CUDA.rand(Float32, 64, 64, 4) - B = CUDA.rand(Float32, 64, 64, 4) - C = CUDA.zeros(Float32, 64, 64, 4) - ct.Tiled(C) .= ct.Tiled(A) .+ ct.Tiled(B) - @test Array(C) ≈ Array(A) .+ Array(B) - end - - @testset "ct.@. expands to Tiled" begin - ex = @macroexpand ct.@. C = A + B - # The macro should produce Tiled() wrapping, not plain dotted calls - @test occursin("Tiled", string(ex)) - end - - @testset "ct.@. in-place" begin - n = 1024 - A = CUDA.rand(Float32, n) - B = CUDA.rand(Float32, n) - C = CUDA.zeros(Float32, n) - ct.@. C = A + B - @test Array(C) ≈ Array(A) .+ Array(B) - end - - @testset "ct.@. with function" begin - n = 1024 - A = CUDA.rand(Float32, n) .+ 0.1f0 - C = CUDA.zeros(Float32, n) - ct.@. C = A + sin(A) - @test Array(C) ≈ Array(A) .+ sin.(Array(A)) rtol=1e-5 - end - - @testset "ct.@. with scalar" begin - n = 1024 - A = CUDA.rand(Float32, n) - C = CUDA.zeros(Float32, n) - ct.@. C = A + 2.0f0 - @test Array(C) ≈ Array(A) .+ 2.0f0 - end - - @testset "allocating copy" begin - n = 1024 - A = CUDA.rand(Float32, n) - B = CUDA.rand(Float32, n) - C = ct.Tiled(A) .+ ct.Tiled(B) - @test C isa CuArray - @test Array(C) ≈ Array(A) .+ Array(B) - end - - @testset "allocating ct.@." begin - n = 1024 - A = CUDA.rand(Float32, n) - B = CUDA.rand(Float32, n) - C = ct.@. A + B - @test C isa CuArray - @test Array(C) ≈ Array(A) .+ Array(B) - end - - @testset "leading singleton dim" begin - A = CUDA.rand(Float32, 1, 1024) - B = similar(A) - ct.Tiled(B) .= ct.Tiled(A) .+ 1.0f0 - @test Array(B) ≈ Array(A) .+ 1.0f0 - end - - @testset "double leading singleton" begin - A = CUDA.rand(Float32, 1, 1, 512) - B = similar(A) - ct.Tiled(B) .= ct.Tiled(A) .* 2.0f0 - @test Array(B) ≈ Array(A) .* 2.0f0 - end - - @testset "small leading dim" begin - A = CUDA.rand(Float32, 4, 1024) - B = similar(A) - ct.Tiled(B) .= ct.Tiled(A) .+ ct.Tiled(A) - @test Array(B) ≈ 2 .* Array(A) - end -end diff --git a/test/execution/control_flow.jl b/test/device/control_flow.jl similarity index 100% rename from test/execution/control_flow.jl rename to test/device/control_flow.jl diff --git a/test/execution/core.jl b/test/device/core.jl similarity index 100% rename from test/execution/core.jl rename to test/device/core.jl diff --git a/test/execution/hints.jl b/test/device/hints.jl similarity index 100% rename from test/execution/hints.jl rename to test/device/hints.jl diff --git a/test/execution/integration.jl b/test/device/integration.jl similarity index 100% rename from test/execution/integration.jl rename to test/device/integration.jl diff --git a/test/execution/math.jl b/test/device/math.jl similarity index 100% rename from test/execution/math.jl rename to test/device/math.jl diff --git a/test/execution/reductions.jl b/test/device/reductions.jl similarity index 100% rename from test/execution/reductions.jl rename to test/device/reductions.jl diff --git a/test/execution/tile.jl b/test/device/tile.jl similarity index 100% rename from test/execution/tile.jl rename to test/device/tile.jl diff --git a/test/execution/types.jl b/test/device/types.jl similarity index 100% rename from test/execution/types.jl rename to test/device/types.jl diff --git a/test/host/broadcast.jl b/test/host/broadcast.jl new file mode 100644 index 00000000..64a625cc --- /dev/null +++ b/test/host/broadcast.jl @@ -0,0 +1,115 @@ +using CUDA + +@testset "Tiled broadcast" begin + @testset "1D element-wise" begin + n = 1024 + A = CUDA.rand(Float32, n) + B = CUDA.rand(Float32, n) + C = CUDA.zeros(Float32, n) + ct.Tiled(C) .= ct.Tiled(A) .+ ct.Tiled(B) + @test Array(C) ≈ Array(A) .+ Array(B) + end + + @testset "fused multi-op" begin + n = 1024 + A = CUDA.rand(Float32, n) .+ 0.1f0 + C = CUDA.zeros(Float32, n) + ct.Tiled(C) .= ct.Tiled(A) .+ ct.Tiled(A) .* sin.(ct.Tiled(A)) + @test Array(C) ≈ Array(A) .+ Array(A) .* sin.(Array(A)) rtol=1e-5 + end + + @testset "scalar broadcast" begin + n = 1024 + A = CUDA.rand(Float32, n) + C = CUDA.zeros(Float32, n) + ct.Tiled(C) .= ct.Tiled(A) .+ 1.0f0 + @test Array(C) ≈ Array(A) .+ 1.0f0 + end + + @testset "2D element-wise" begin + m, n = 128, 256 + A = CUDA.rand(Float32, m, n) + B = CUDA.rand(Float32, m, n) + C = CUDA.zeros(Float32, m, n) + ct.Tiled(C) .= ct.Tiled(A) .+ ct.Tiled(B) + @test Array(C) ≈ Array(A) .+ Array(B) + end + + @testset "3D element-wise" begin + A = CUDA.rand(Float32, 64, 64, 4) + B = CUDA.rand(Float32, 64, 64, 4) + C = CUDA.zeros(Float32, 64, 64, 4) + ct.Tiled(C) .= ct.Tiled(A) .+ ct.Tiled(B) + @test Array(C) ≈ Array(A) .+ Array(B) + end + + @testset "ct.@. expands to Tiled" begin + ex = @macroexpand ct.@. C = A + B + # The macro should produce Tiled() wrapping, not plain dotted calls + @test occursin("Tiled", string(ex)) + end + + @testset "ct.@. in-place" begin + n = 1024 + A = CUDA.rand(Float32, n) + B = CUDA.rand(Float32, n) + C = CUDA.zeros(Float32, n) + ct.@. C = A + B + @test Array(C) ≈ Array(A) .+ Array(B) + end + + @testset "ct.@. with function" begin + n = 1024 + A = CUDA.rand(Float32, n) .+ 0.1f0 + C = CUDA.zeros(Float32, n) + ct.@. C = A + sin(A) + @test Array(C) ≈ Array(A) .+ sin.(Array(A)) rtol=1e-5 + end + + @testset "ct.@. with scalar" begin + n = 1024 + A = CUDA.rand(Float32, n) + C = CUDA.zeros(Float32, n) + ct.@. C = A + 2.0f0 + @test Array(C) ≈ Array(A) .+ 2.0f0 + end + + @testset "allocating copy" begin + n = 1024 + A = CUDA.rand(Float32, n) + B = CUDA.rand(Float32, n) + C = ct.Tiled(A) .+ ct.Tiled(B) + @test C isa CuArray + @test Array(C) ≈ Array(A) .+ Array(B) + end + + @testset "allocating ct.@." begin + n = 1024 + A = CUDA.rand(Float32, n) + B = CUDA.rand(Float32, n) + C = ct.@. A + B + @test C isa CuArray + @test Array(C) ≈ Array(A) .+ Array(B) + end + + @testset "leading singleton dim" begin + A = CUDA.rand(Float32, 1, 1024) + B = similar(A) + ct.Tiled(B) .= ct.Tiled(A) .+ 1.0f0 + @test Array(B) ≈ Array(A) .+ 1.0f0 + end + + @testset "double leading singleton" begin + A = CUDA.rand(Float32, 1, 1, 512) + B = similar(A) + ct.Tiled(B) .= ct.Tiled(A) .* 2.0f0 + @test Array(B) ≈ Array(A) .* 2.0f0 + end + + @testset "small leading dim" begin + A = CUDA.rand(Float32, 4, 1024) + B = similar(A) + ct.Tiled(B) .= ct.Tiled(A) .+ ct.Tiled(A) + @test Array(B) ≈ 2 .* Array(A) + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 9330bab9..7e8a444d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -43,7 +43,7 @@ args = parse_args(ARGS) if filter_tests!(testsuite, args) cuda_functional = CUDA.functional() filter!(testsuite) do (test, _) - if startswith(test, "execution/") || startswith(test, "examples/") + if startswith(test, "device/") || startswith(test, "host/") || startswith(test, "examples/") return cuda_functional else return true