JuliaGPU
diff --git a/test/execution.jl b/test/execution.jl
Lines changed: 219 additions & 62 deletions
@@ -896,6 +896,118 @@ end
     @test Array(b) ≈ sqrt.(Array(a))
 end
 
+@testset "1D abs" begin
+    # Kernel: write abs.() of the (16,) tile of `a` at index ct.bid(1) into `b`.
+    function vabs_1d(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ct.store(b, pid, abs.(ct.load(a, pid, (16,))))
+        return
+    end
+
+    n = 1024
+    a = CUDA.rand(Float32, n) .- 0.5f0  # Mix of positive and negative
+    b = CUDA.zeros(Float32, n)
+
+    ct.launch(vabs_1d, cld(n, 16), a, b)
+    # abs is an exact operation (no rounding), so require equality, not a tolerance.
+    @test Array(b) == abs.(Array(a))
+end
+
+@testset "1D cos" begin
+    # Kernel: write cos.() of the (16,) tile of `a` at index ct.bid(1) into `b`.
+    function vcos_1d(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1})
+        blk = ct.bid(1)
+        ct.store(b, blk, cos.(ct.load(a, blk, (16,))))
+        return
+    end
+
+    len = 1024
+    angles = CUDA.rand(Float32, len) .* 6.28f0  # Scaled to span about one period
+    result = CUDA.zeros(Float32, len)
+    ct.launch(vcos_1d, cld(len, 16), angles, result)
+
+    # Reference values come from Base's cos on the host copy of the input.
+    @test Array(result) ≈ cos.(Array(angles)) rtol=1e-4
+end
+
+@testset "1D sin" begin
+    # Kernel: write sin.() of the (16,) tile of `a` at index ct.bid(1) into `b`.
+    function vsin_1d(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1})
+        blk = ct.bid(1)
+        ct.store(b, blk, sin.(ct.load(a, blk, (16,))))
+        return
+    end
+
+    len = 1024
+    angles = CUDA.rand(Float32, len) .* 6.28f0  # Scaled to span about one period
+    result = CUDA.zeros(Float32, len)
+    ct.launch(vsin_1d, cld(len, 16), angles, result)
+
+    # Reference values come from Base's sin on the host copy of the input.
+    @test Array(result) ≈ sin.(Array(angles)) rtol=1e-4
+end
+
+@testset "1D exp" begin
+    # Kernel: write exp.() of the (16,) tile of `a` at index ct.bid(1) into `b`.
+    function vexp_1d(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1})
+        blk = ct.bid(1)
+        ct.store(b, blk, exp.(ct.load(a, blk, (16,))))
+        return
+    end
+
+    len = 1024
+    xs = CUDA.rand(Float32, len) .* 4.0f0  # Small arguments keep exp far from overflow
+    ys = CUDA.zeros(Float32, len)
+    ct.launch(vexp_1d, cld(len, 16), xs, ys)
+
+    # Reference values come from Base's exp on the host copy of the input.
+    @test Array(ys) ≈ exp.(Array(xs)) rtol=1e-4
+end
+
+@testset "1D log" begin
+    # Kernel: write log.() of the (16,) tile of `a` at index ct.bid(1) into `b`.
+    function vlog_1d(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1})
+        blk = ct.bid(1)
+        ct.store(b, blk, log.(ct.load(a, blk, (16,))))
+        return
+    end
+
+    len = 1024
+    xs = CUDA.rand(Float32, len) .+ 0.1f0  # Offset keeps every argument positive
+    ys = CUDA.zeros(Float32, len)
+    ct.launch(vlog_1d, cld(len, 16), xs, ys)
+
+    # Reference values come from Base's log on the host copy of the input.
+    @test Array(ys) ≈ log.(Array(xs)) rtol=1e-4
+end
+
+@testset "1D ceil and floor" begin
+    # Kernels: round the (16,) tile of `a` at index ct.bid(1) up / down into `b`.
+    function vceil_1d(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ct.store(b, pid, ceil.(ct.load(a, pid, (16,))))
+        return
+    end
+
+    function vfloor_1d(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ct.store(b, pid, floor.(ct.load(a, pid, (16,))))
+        return
+    end
+
+    n = 1024
+    a = CUDA.rand(Float32, n) .* 10.0f0 .- 5.0f0  # Range [-5, 5]
+    b_ceil = CUDA.zeros(Float32, n)
+    b_floor = CUDA.zeros(Float32, n)
+
+    ct.launch(vceil_1d, cld(n, 16), a, b_ceil)
+    ct.launch(vfloor_1d, cld(n, 16), a, b_floor)
+
+    # Rounding to an integer is exact, so demand equality rather than norm-based `≈`.
+    @test Array(b_ceil) == ceil.(Array(a))
+    @test Array(b_floor) == floor.(Array(a))
+end
+
 end
 
 @testset "reduction operations" begin
@@ -1307,94 +1419,139 @@ end
     end
 end
 
-@testset "mismatched shapes with + throws MethodError" begin
-    # Verify that + with different tile shapes throws MethodError (Julia-idiomatic)
-    # Note: This tests the type system, not kernel execution
-    tile_a = ct.Tile{Float32, (1, 128)}()
-    tile_b = ct.Tile{Float32, (64, 1)}()
+end
 
-    # + should require same shapes, so this should fail
-    @test_throws MethodError tile_a + tile_b
+@testset "comparison operations" begin
 
-    # But .+ should work (broadcasting)
-    result = tile_a .+ tile_b
-    @test result isa ct.Tile{Float32, (64, 128)}
-end
+@testset "float .< and .>" begin
+    function cmp_lt_gt_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1},
+                              out_lt::ct.TileArray{Float32,1}, out_gt::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        tb = ct.load(b, pid, (16,))
+        ct.store(out_lt, pid, ct.where(ta .< tb, 1.0f0, 0.0f0))  # mask as 1.0 / 0.0
+        ct.store(out_gt, pid, ct.where(ta .> tb, 1.0f0, 0.0f0))  # mask as 1.0 / 0.0
+        return
+    end
 
+    n = 1024
+    # Quantize to a few values so ties occur; otherwise `<` and `<=` are indistinguishable.
+    a = floor.(CUDA.rand(Float32, n) .* 4.0f0)
+    b = floor.(CUDA.rand(Float32, n) .* 4.0f0)
+    out_lt = CUDA.zeros(Float32, n)
+    out_gt = CUDA.zeros(Float32, n)
+    ct.launch(cmp_lt_gt_kernel, cld(n, 16), a, b, out_lt, out_gt)
+
+    @test Array(out_lt) == Float32.(Array(a) .< Array(b))
+    @test Array(out_gt) == Float32.(Array(a) .> Array(b))
 end
 
-@testset "comparison operations" begin
+@testset "float .<= and .>=" begin
+    function cmp_le_ge_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1},
+                              out_le::ct.TileArray{Float32,1}, out_ge::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        tb = ct.load(b, pid, (16,))
+        ct.store(out_le, pid, ct.where(ta .<= tb, 1.0f0, 0.0f0))  # mask as 1.0 / 0.0
+        ct.store(out_ge, pid, ct.where(ta .>= tb, 1.0f0, 0.0f0))  # mask as 1.0 / 0.0
+        return
+    end
+
+    n = 1024
+    a = floor.(CUDA.rand(Float32, n) .* 4.0f0)  # few distinct values => ties exist,
+    b = floor.(CUDA.rand(Float32, n) .* 4.0f0)  # so `<=`/`>=` differ from `<`/`>`
+    out_le = CUDA.zeros(Float32, n)
+    out_ge = CUDA.zeros(Float32, n)
 
-@testset "float comparison operators" begin
-    # Test all broadcast comparison operators with Float32 tiles
-    tile = ct.Tile{Float32, (16,)}()
+    ct.launch(cmp_le_ge_kernel, cld(n, 16), a, b, out_le, out_ge)
 
-    @test (tile .< tile) isa ct.Tile{Bool, (16,)}
-    @test (tile .> tile) isa ct.Tile{Bool, (16,)}
-    @test (tile .<= tile) isa ct.Tile{Bool, (16,)}
-    @test (tile .>= tile) isa ct.Tile{Bool, (16,)}
-    @test (tile .== tile) isa ct.Tile{Bool, (16,)}
-    @test (tile .!= tile) isa ct.Tile{Bool, (16,)}
+    @test Array(out_le) == Float32.(Array(a) .<= Array(b))
+    @test Array(out_ge) == Float32.(Array(a) .>= Array(b))
 end
 
-@testset "integer comparison operators" begin
-    # Test all broadcast comparison operators with Int tiles
-    int_tile = ct.arange((16,), Int)
+@testset "float .== and .!=" begin
+    function cmp_eq_ne_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1},
+                              out_eq::ct.TileArray{Float32,1}, out_ne::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        tb = ct.load(b, pid, (16,))
+        ct.store(out_eq, pid, ct.where(ta .== tb, 1.0f0, 0.0f0))  # mask as 1.0 / 0.0
+        ct.store(out_ne, pid, ct.where(ta .!= tb, 1.0f0, 0.0f0))  # mask as 1.0 / 0.0
+        return
+    end
+
+    n = 1024
+    # Use integer-valued floats so equality is meaningful
+    a = CUDA.fill(Float32(1), n)
+    b = CUDA.fill(Float32(1), n)
+    # Make the first half differ; a broadcast into a range needs no @allowscalar
+    b[1:512] .= 2.0f0
+    out_eq = CUDA.zeros(Float32, n)
+    out_ne = CUDA.zeros(Float32, n)
 
-    @test (int_tile .< int_tile) isa ct.Tile{Bool, (16,)}
-    @test (int_tile .> int_tile) isa ct.Tile{Bool, (16,)}
-    @test (int_tile .<= int_tile) isa ct.Tile{Bool, (16,)}
-    @test (int_tile .>= int_tile) isa ct.Tile{Bool, (16,)}
-    @test (int_tile .== int_tile) isa ct.Tile{Bool, (16,)}
-    @test (int_tile .!= int_tile) isa ct.Tile{Bool, (16,)}
+    ct.launch(cmp_eq_ne_kernel, cld(n, 16), a, b, out_eq, out_ne)
+    # The masks hold exactly 0.0 or 1.0, so compare with == instead of ≈.
+    @test Array(out_eq) == Float32.(Array(a) .== Array(b))
+    @test Array(out_ne) == Float32.(Array(a) .!= Array(b))
 end
 
 @testset "tile vs scalar comparison" begin
-    int_tile = ct.arange((16,), Int)
-    float_tile = ct.Tile{Float32, (16,)}()
-
-    # Int tile vs Int scalar
-    @test (int_tile .< 10) isa ct.Tile{Bool, (16,)}
-    @test (5 .< int_tile) isa ct.Tile{Bool, (16,)}
+    function cmp_scalar_kernel(a::ct.TileArray{Float32,1},
+                               out::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        ct.store(out, pid, ct.where(ta .> 0.5f0, 1.0f0, 0.0f0))  # mask as 1.0 / 0.0
+        return
+    end
 
-    # Float32 tile vs Float32 scalar
-    @test (float_tile .< 2.0f0) isa ct.Tile{Bool, (16,)}
-    @test (1.0f0 .> float_tile) isa ct.Tile{Bool, (16,)}
-end
+    n = 1024
+    a = floor.(CUDA.rand(Float32, n) .* 4.0f0) ./ 4.0f0  # multiples of 0.25: some hit 0.5
+    out = CUDA.zeros(Float32, n)
 
-@testset "broadcast comparison shapes" begin
-    tile_a = ct.Tile{Float32, (1, 16)}()
-    tile_b = ct.Tile{Float32, (8, 1)}()
+    ct.launch(cmp_scalar_kernel, cld(n, 16), a, out)
 
-    # (1, 16) .< (8, 1) -> (8, 16)
-    result = tile_a .< tile_b
-    @test result isa ct.Tile{Bool, (8, 16)}
+    @test Array(out) == Float32.(Array(a) .> 0.5f0)
 end
 
 end
 
 @testset "power operations" begin
 
-@testset "float tile .^ float tile" begin
-    tile = ct.Tile{Float32, (16,)}()
-    @test (tile .^ tile) isa ct.Tile{Float32, (16,)}
-end
+@testset "tile .^ tile" begin
+    function pow_tt_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1},
+                           c::ct.TileArray{Float32,1})
+        blk = ct.bid(1)
+        base = ct.load(a, blk, (16,))
+        expo = ct.load(b, blk, (16,))
+        ct.store(c, blk, base .^ expo)  # element-wise power of the two tiles
+        return
+    end
 
-@testset "float tile .^ scalar" begin
-    tile = ct.Tile{Float32, (16,)}()
-    @test (tile .^ 2.0f0) isa ct.Tile{Float32, (16,)}
-    @test (2.0f0 .^ tile) isa ct.Tile{Float32, (16,)}
-end
+    len = 1024
+    bases = CUDA.rand(Float32, len) .+ 0.5f0  # Offset keeps the base positive
+    exponents = CUDA.rand(Float32, len) .+ 0.5f0
+    powers = CUDA.zeros(Float32, len)
 
-@testset "broadcast power shapes" begin
-    tile_a = ct.Tile{Float32, (1, 16)}()
-    tile_b = ct.Tile{Float32, (8, 1)}()
-    @test (tile_a .^ tile_b) isa ct.Tile{Float32, (8, 16)}
+    ct.launch(pow_tt_kernel, cld(len, 16), bases, exponents, powers)
+    # Host-side reference: Julia's broadcasted `^` on copies of the inputs.
+    @test Array(powers) ≈ Array(bases) .^ Array(exponents) rtol=1e-4
 end
 
-@testset "integer power not supported" begin
-    int_tile = ct.arange((16,), Int)
-    @test_throws MethodError int_tile .^ int_tile
+@testset "tile .^ scalar" begin
+    # Kernel: square the (16,) tile of `a` at index ct.bid(1) via `.^ 2.0f0` into `c`.
+    function pow_ts_kernel(a::ct.TileArray{Float32,1}, c::ct.TileArray{Float32,1})
+        blk = ct.bid(1)
+        ct.store(c, blk, ct.load(a, blk, (16,)) .^ 2.0f0)
+        return
+    end
+
+    len = 1024
+    bases = CUDA.rand(Float32, len) .+ 0.1f0
+    squares = CUDA.zeros(Float32, len)
+    ct.launch(pow_ts_kernel, cld(len, 16), bases, squares)
+
+    # Host-side reference: Julia's broadcasted `^` on a copy of the input.
+    @test Array(squares) ≈ Array(bases) .^ 2.0f0 rtol=1e-4
 end
 
 end