Skip to content

Commit be4265f

Browse files
committed
Fix SLEB128 zigzag encoding for 64-bit and small integer types
- writer.jl: Changed IntegerIdentityOp.value from Int64 to UInt128 to store full 64-bit unsigned values. Added mask_to_width() to mask values to correct bit width before zigzag encoding.
- core.jl: Added to_uint128() helper to convert signed/unsigned values to UInt128 via bit reinterpretation for proper identity value storage.
- examples/reducekernel.jl: Added comprehensive tests for all reduce operations (min, max, sum, xor, or, and) on UInt16/32/64, Int16/32/64, and Float16/32/64.

Fixes:
- UInt64 reduce_min and reduce_and now work correctly
- Int16 reduce_max and reduce_and now work correctly
- All small integer types (Int8, Int16, Int32) now encode properly with SLEB128
1 parent 6a1b21d commit be4265f

3 files changed

Lines changed: 209 additions & 24 deletions

File tree

examples/reducekernel.jl

Lines changed: 146 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,151 @@ using CUDA
33
using cuTile
44
import cuTile as ct
55

6-
elType = UInt16
7-
function reduceKernel(a::ct.TileArray{elType,1}, b::ct.TileArray{elType,1}, tileSz::ct.Constant{Int})
8-
bid = ct.bid(1)
9-
tile = ct.load(a, bid, (tileSz[],))
10-
result = ct.reduce_min(tile, Val(1))
11-
ct.store(b, bid, result)
12-
return nothing
6+
# Kernel factory to properly capture element type and operation
7+
function makeReduceKernel(::Type{T}, op::Symbol) where {T}
8+
reduceFunc = if op == :reduce_min
9+
ct.reduce_min
10+
elseif op == :reduce_max
11+
ct.reduce_max
12+
elseif op == :reduce_sum
13+
ct.reduce_sum
14+
elseif op == :reduce_xor
15+
ct.reduce_xor
16+
elseif op == :reduce_or
17+
ct.reduce_or
18+
elseif op == :reduce_and
19+
ct.reduce_and
20+
end
21+
22+
@inline function kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int})
23+
ct.store(b, ct.bid(1), reduceFunc(ct.load(a, ct.bid(1), (tileSz[],)), Val(1)))
24+
return nothing
25+
end
26+
return kernel
27+
end
28+
29+
# Test with UInt types
30+
@testset for elType in [UInt16, UInt32, UInt64]
31+
@testset for op in [:reduce_min, :reduce_max, :reduce_sum, :reduce_xor, :reduce_or, :reduce_and]
32+
sz = 32
33+
N = 2^15
34+
35+
# Create kernel using factory
36+
reduceKernel = try
37+
makeReduceKernel(elType, op)
38+
catch e
39+
@test_broken false
40+
rethrow()
41+
end
42+
43+
# Create data and run kernel
44+
a_gpu = CUDA.rand(elType, N)
45+
b_gpu = CUDA.zeros(elType, cld(N, sz))
46+
try
47+
CUDA.@sync ct.launch(reduceKernel, cld(length(a_gpu), sz), a_gpu, b_gpu, ct.Constant(sz))
48+
catch e
49+
@test_broken false
50+
rethrow()
51+
end
52+
res = Array(b_gpu)
53+
54+
# CPU computation
55+
a_cpu = Array(a_gpu)
56+
a_reshaped = reshape(a_cpu, sz, :)
57+
58+
if op == :reduce_min
59+
cpu_result = minimum(a_reshaped, dims=1)[:]
60+
elseif op == :reduce_max
61+
cpu_result = maximum(a_reshaped, dims=1)[:]
62+
elseif op == :reduce_sum
63+
raw_sum = sum(a_reshaped, dims=1)[:]
64+
cpu_result = raw_sum .& typemax(elType)
65+
elseif op == :reduce_xor
66+
cpu_result = mapslices(x -> reduce(⊻, x), a_reshaped, dims=1)[:]
67+
elseif op == :reduce_or
68+
cpu_result = mapslices(x -> reduce(|, x), a_reshaped, dims=1)[:]
69+
elseif op == :reduce_and
70+
cpu_result = mapslices(x -> reduce(&, x), a_reshaped, dims=1)[:]
71+
end
72+
73+
@test cpu_result == res
74+
end
75+
end
76+
77+
# Test with signed Int types
78+
@testset for elType in [Int16, Int32, Int64]
79+
@testset for op in [:reduce_min, :reduce_max, :reduce_sum, :reduce_xor, :reduce_or, :reduce_and]
80+
sz = 32
81+
N = 2^15
82+
83+
# Create kernel using factory
84+
reduceKernel = try
85+
makeReduceKernel(elType, op)
86+
catch e
87+
@test_broken false
88+
rethrow()
89+
end
90+
91+
# Create data and run kernel - use range to get negative values too
92+
a_gpu = CuArray{elType}(rand(-1000:1000, N))
93+
b_gpu = CUDA.zeros(elType, cld(N, sz))
94+
try
95+
CUDA.@sync ct.launch(reduceKernel, cld(length(a_gpu), sz), a_gpu, b_gpu, ct.Constant(sz))
96+
catch e
97+
@test_broken false
98+
rethrow()
99+
end
100+
res = Array(b_gpu)
101+
102+
# CPU computation
103+
a_cpu = Array(a_gpu)
104+
a_reshaped = reshape(a_cpu, sz, :)
105+
106+
if op == :reduce_min
107+
cpu_result = minimum(a_reshaped, dims=1)[:]
108+
elseif op == :reduce_max
109+
cpu_result = maximum(a_reshaped, dims=1)[:]
110+
elseif op == :reduce_sum
111+
cpu_result = sum(a_reshaped, dims=1)[:]
112+
elseif op == :reduce_xor
113+
cpu_result = mapslices(x -> reduce(⊻, x), a_reshaped, dims=1)[:]
114+
elseif op == :reduce_or
115+
cpu_result = mapslices(x -> reduce(|, x), a_reshaped, dims=1)[:]
116+
elseif op == :reduce_and
117+
cpu_result = mapslices(x -> reduce(&, x), a_reshaped, dims=1)[:]
118+
end
119+
120+
@test cpu_result == res
121+
end
13122
end
14123

15-
sz = 32
16-
N = 2^15
17-
a = CUDA.rand(elType, N)
18-
b = CUDA.zeros(elType, cld(N, sz))
19-
CUDA.@sync ct.launch(reduceKernel, cld(length(a), sz), a, b, ct.Constant(sz))
20-
res = Array(b)
124+
# Test with Float types
125+
@testset for elType in [Float16, Float32, Float64]
126+
@testset for op in [:reduce_min, :reduce_max, :reduce_sum]
127+
sz = 32
128+
N = 2^15
129+
130+
# Create kernel using factory
131+
reduceKernel = makeReduceKernel(elType, op)
132+
133+
# Create data and run kernel
134+
a_gpu = CUDA.rand(elType, N)
135+
b_gpu = CUDA.zeros(elType, cld(N, sz))
136+
CUDA.@sync ct.launch(reduceKernel, cld(length(a_gpu), sz), a_gpu, b_gpu, ct.Constant(sz))
137+
res = Array(b_gpu)
138+
139+
# CPU computation
140+
a_cpu = Array(a_gpu)
141+
a_reshaped = reshape(a_cpu, sz, :)
142+
143+
if op == :reduce_min
144+
cpu_result = minimum(a_reshaped, dims=1)[:]
145+
elseif op == :reduce_max
146+
cpu_result = maximum(a_reshaped, dims=1)[:]
147+
elseif op == :reduce_sum
148+
cpu_result = sum(a_reshaped, dims=1)[:]
149+
end
150+
151+
@test isapprox(cpu_result, res)
152+
end
153+
end

src/bytecode/writer.jl

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ end
257257
Integer identity value for binary operations.
258258
"""
259259
struct IntegerIdentityOp <: IdentityOp
260-
value::Int64 # Store as signed Int64, will be reinterpreted as unsigned
260+
value::UInt128 # Store as UInt128 to handle all unsigned values up to 64 bits
261261
type_id::TypeId
262262
dtype::Type # Int8, Int16, Int32, Int64, UInt8, etc.
263263
signed::Bool # true for signed, false for unsigned
@@ -291,13 +291,47 @@ function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentityOp)
291291
# Type ID
292292
encode_typeid!(cb.buf, identity.type_id)
293293
# Value: signed uses zigzag varint, unsigned uses plain varint
294+
# Mask value to correct bit width and apply zigzag for signed types
295+
masked_value = mask_to_width(identity.value, identity.dtype, identity.signed)
294296
if identity.signed
295-
encode_signed_varint!(cb.buf, identity.value)
297+
encode_signed_varint!(cb.buf, masked_value)
296298
else
297-
encode_varint!(cb.buf, UInt64(identity.value))
299+
encode_varint!(cb.buf, masked_value)
298300
end
299301
end
300302

303+
"""
304+
mask_to_width(value, dtype, signed)
305+
306+
Mask a UInt128 value to the correct bit width for the given type and apply zigzag if signed.
307+
For signed types, this masks first, then applies zigzag encoding.
308+
"""
309+
# Signed Int64: mask to 64 bits first, then zigzag encode
310+
mask_to_width(value::UInt128, ::Type{Int64}, signed::Bool) =
311+
let masked = UInt64(value & 0xFFFFFFFFFFFFFFFF)
312+
UInt64((masked << 1) ⊻ (masked >>> 63))
313+
end
314+
# Signed Int32: mask to 32 bits first, then zigzag encode
315+
mask_to_width(value::UInt128, ::Type{Int32}, signed::Bool) =
316+
let masked = UInt32(value & 0xFFFFFFFF)
317+
UInt32((masked << 1) ⊻ (masked >>> 31))
318+
end
319+
# Signed Int16: mask to 16 bits first, then zigzag encode
320+
mask_to_width(value::UInt128, ::Type{Int16}, signed::Bool) =
321+
let masked = UInt16(value & 0xFFFF)
322+
UInt16((masked << 1) ⊻ (masked >>> 15))
323+
end
324+
# Signed Int8: mask to 8 bits first, then zigzag encode
325+
mask_to_width(value::UInt128, ::Type{Int8}, signed::Bool) =
326+
let masked = UInt8(value & 0xFF)
327+
UInt8((masked << 1) ⊻ (masked >>> 7))
328+
end
329+
# Unsigned types: just mask to bit width, no zigzag
330+
mask_to_width(value::UInt128, ::Type{UInt64}, signed::Bool) = UInt64(value & 0xFFFFFFFFFFFFFFFF)
331+
mask_to_width(value::UInt128, ::Type{UInt32}, signed::Bool) = UInt32(value & 0xFFFFFFFF)
332+
mask_to_width(value::UInt128, ::Type{UInt16}, signed::Bool) = UInt16(value & 0xFFFF)
333+
mask_to_width(value::UInt128, ::Type{UInt8}, signed::Bool) = UInt8(value & 0xFF)
334+
301335
"""
302336
float_to_bits(value, dtype)
303337
@@ -585,7 +619,7 @@ function finalize_function!(func_buf::Vector{UInt8}, cb::CodeBuilder,
585619
end
586620

587621
#=============================================================================
588-
Optimization Hints
622+
Optimization Hints
589623
=============================================================================#
590624

591625
"""

src/compiler/intrinsics/core.jl

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -667,43 +667,61 @@ is_signed(::Type{T}) where T <: AbstractFloat = false
667667
Return the identity value for a binary operation (reduce, scan, etc.).
668668
Identity must satisfy: identity ⊕ x = x for the operation.
669669
"""
670+
671+
"""
672+
to_uint128(value)
673+
674+
Convert an integer value to UInt128 for storage in IntegerIdentityOp.
675+
For signed types, this returns the two's complement bit representation.
676+
"""
677+
# Unsigned types: directly convert
678+
to_uint128(value::UInt64) = UInt128(value)
679+
to_uint128(value::UInt32) = UInt128(value)
680+
to_uint128(value::UInt16) = UInt128(value)
681+
to_uint128(value::UInt8) = UInt128(value)
682+
# Signed types: reinterpret as unsigned first, then convert
683+
to_uint128(value::Int64) = UInt128(reinterpret(UInt64, value))
684+
to_uint128(value::Int32) = UInt128(reinterpret(UInt32, value))
685+
to_uint128(value::Int16) = UInt128(reinterpret(UInt16, value))
686+
to_uint128(value::Int8) = UInt128(reinterpret(UInt8, value))
687+
670688
# Addition identity: 0 + x = x
671689
operation_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat =
672690
FloatIdentityOp(zero(T), dtype, T)
673691
operation_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer =
674-
IntegerIdentityOp(zero(T), dtype, T, is_signed(T))
692+
IntegerIdentityOp(to_uint128(zero(T)), dtype, T, is_signed(T))
675693

676694
# Maximum identity: max(typemin(T), x) = x
677695
operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat =
678696
FloatIdentityOp(typemin(T), dtype, T)
679697
operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer =
680-
IntegerIdentityOp(typemin(T), dtype, T, is_signed(T))
698+
IntegerIdentityOp(to_uint128(typemin(T)), dtype, T, is_signed(T))
681699

682700
# Multiplication identity: 1 * x = x
683701
operation_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat =
684702
FloatIdentityOp(one(T), dtype, T)
685703
operation_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer =
686-
IntegerIdentityOp(one(T), dtype, T, is_signed(T))
704+
IntegerIdentityOp(to_uint128(one(T)), dtype, T, is_signed(T))
687705

688706
# Minimum identity: min(typemax(T), x) = x
689707
operation_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat =
690708
FloatIdentityOp(typemax(T), dtype, T)
691709
operation_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer =
692-
IntegerIdentityOp(typemax(T), dtype, T, is_signed(T))
710+
IntegerIdentityOp(to_uint128(typemax(T)), dtype, T, is_signed(T))
693711

694712
# AND identity: all bits set (x & identity == x)
695713
# For signed: -one(T) has all bits set in two's complement
696714
# For unsigned: typemax(T) has all bits set
697715
operation_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer =
698-
IntegerIdentityOp(is_signed(T) ? -one(T) : typemax(T), dtype, T, is_signed(T))
716+
IntegerIdentityOp(to_uint128(is_signed(T) ? -one(T) : typemax(T)), dtype, T, is_signed(T))
699717

700718
# OR identity: 0 | x = x
701719
operation_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer =
702-
IntegerIdentityOp(zero(T), dtype, T, is_signed(T))
720+
IntegerIdentityOp(to_uint128(zero(T)), dtype, T, is_signed(T))
703721

704722
# XOR identity: 0 ⊕ x = x
705723
operation_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer =
706-
IntegerIdentityOp(zero(T), dtype, T, is_signed(T))
724+
IntegerIdentityOp(to_uint128(zero(T)), dtype, T, is_signed(T))
707725

708726
#=============================================================================
709727
Reduce Body Operations - dispatch on Val{fn} and elem_type

0 commit comments

Comments
 (0)