@@ -223,201 +223,6 @@ end
223223end
224224
225225
# Shared fixtures for the randn/randexp tests below, one named tuple per
# distribution:
#   label    - name interpolated into testset titles
#   f, f!    - allocating and in-place samplers from `Random`
#   edges    - three CDF cut points (quartiles or an equivalent split) that
#              divide the support into four bins
#   probs    - probability mass of each of those four bins
#   mean     - population mean
#   in_range - per-element validity predicate (exponential support is `[0, ∞)`)
const RAND_DISTS = let
    normal = (
        label = " randn",
        f = Random.randn,
        f! = Random.randn!,
        edges = (-1.0, 0.0, 1.0),
        probs = (0.1587, 0.3413, 0.3413, 0.1587),
        mean = 0.0,
        in_range = isfinite,
    )

    exponential = (
        label = " randexp",
        f = Random.randexp,
        f! = Random.randexp!,
        edges = (-log(0.75), -log(0.5), -log(0.25)),
        probs = (0.25, 0.25, 0.25, 0.25),
        mean = 1.0,
        in_range = >=(0.0),
    )

    (normal, exponential)
end
247-
# Histogram the elements of `v` into `length(edges) + 1` bins.
#
# An element lands in the bin of the first edge that is greater than or equal
# to it (compared as `Float64`); elements above every edge go into the final
# bin. Returns the per-bin counts as a `Vector{Int}`.
function bin_counts(v, edges)
    nbins = length(edges) + 1
    tally = zeros(Int, nbins)
    for sample in v
        value = Float64(sample)
        slot = findfirst(edge -> edge >= value, edges)
        if slot === nothing
            # No edge is at or above this value: overflow bin.
            tally[nbins] += 1
        else
            tally[slot] += 1
        end
    end
    return tally
end
256-
# Check that the empirical bin occupancy of `v` matches `dist.probs`.
#
# Each bin's count must lie within `N * tol_factor` of its expectation `N * p`
# (10% of N by default). The smallest bin under either CDF (p≈0.16 for the
# normal, p=0.25 for the exponential) clears 5σ at N≥1024, so the check stays
# flake-free across the per-launch-randomized default-stream seed.
function check_dist(v, dist; N = length(v), tol_factor = 0.10)
    observed = bin_counts(v, dist.edges)
    for bin in eachindex(dist.probs)
        expected = N * dist.probs[bin]
        @test abs(observed[bin] - expected) < N * tol_factor
    end
end
266-
# Absolute tolerance on the sample mean for element type `T`.
#
# At N=1024 the standard error σ/√N is ≤ 0.031, so 5 standard errors ≈ 0.16.
# Narrow floats lose precision in the BM/log path, so they get a wider band.
function mean_tol(::Type{T}) where {T}
    if T <: Union{Float16, ct.BFloat16}
        return 0.20
    else
        return 0.15
    end
end
271-
# Device-side sampling tests, repeated for every entry of `RAND_DISTS`. Each
# nested testset defines a kernel `k` that calls the sampler `f`, launches it
# through `@cuda backend=cuTile`, and checks the values copied back to the host.
@testset " device $(dist.label) " for dist in RAND_DISTS
    f, in_range = dist.f, dist.in_range

    @testset " typed surfaces (T=$T , dims=$dims )" for (T, dims) in
            ((Float32, (16,)), (Float32, (32,)), (Float32, (64,)),
             (Float64, (16,)), (Float16, (16,)), (ct.BFloat16, (16,)))
        # `o1`: scalar form. `o2`: tile form via the default stream.
        # `o3`: tile form via an explicit `DeviceRNG` (different stream).
        function k(o1, o2, o3, ::Type{T_}, dims_::NTuple{N, Int}) where {T_, N}
            pid = ct.bid(1)
            rng = ct.DeviceRNG(); Random.seed!(rng, 1)
            o1[pid] = f(T_)
            ct.store(o2, pid, f(T_, dims_))
            ct.store(o3, pid, f(rng, T_, prod(dims_)))
            return
        end

        # `o1` is sized for one value per block; `o2`/`o3` are sized for
        # `m = prod(dims)` values per block.
        n_blocks = 64
        m = prod(dims)
        o1 = CUDA.zeros(T, n_blocks)
        o2 = CUDA.zeros(T, n_blocks * m)
        o3 = CUDA.zeros(T, n_blocks * m)
        @cuda backend=cuTile blocks=n_blocks k(o1, o2, o3, T, ct.Constant(dims))

        # Every output must contain only finite values inside the
        # distribution's support.
        # NOTE(review): `eltype(v)` is fixed by the `CUDA.zeros(T, ...)`
        # allocations above, so the eltype check looks unable to fail on its
        # own; presumably the kernel storing into a `T` buffer is what
        # exercises the typed surface. Confirm.
        for v in (Array(o1), Array(o2), Array(o3))
            @test eltype(v) === T
            @test all(isfinite, v)
            @test all(in_range, v)
        end

        # Distribution shape + mean on the larger draws (o1's N=64 is too
        # small for shape testing).
        for v in (Array(o2), Array(o3))
            check_dist(v, dist)
            @test abs(sum(Float64, v) / length(v) - dist.mean) < mean_tol(T)
        end
    end

    @testset " untyped surface defaults to Float32" begin
        # `f` is called with dimensions only (no element type); the 4x4 result
        # is reshaped to 16 elements and stored into a `Float32` output.
        function k(o::ct.TileArray{Float32, 1})
            pid = ct.bid(1)
            ct.store(o, pid, ct.reshape(f(4, 4), (16,)))
            return
        end
        n_blocks = 64
        o = CUDA.zeros(Float32, n_blocks * 16)
        @cuda backend=cuTile blocks=n_blocks k(o)
        v = Array(o)
        @test eltype(v) === Float32
        @test all(in_range, v)
        check_dist(v, dist)
    end

    @testset " in-kernel `Random.seed!` matches host RNG output" begin
        # Single-block draw with an in-kernel `Random.seed!(default_rng(),s)`
        # plumbs the same `(seed, counter)` as `cuTile.RNG(s, 0)`, so the
        # outputs must be byte-identical.
        function k(out::ct.TileArray{Float32, 1})
            Random.seed!(Random.default_rng(), UInt32(42))
            pid = ct.bid(1)
            ct.store(out, pid, f(Float32, (512,)))
            return
        end
        out = CUDA.zeros(Float32, 512); @cuda backend=cuTile k(out)
        # Exact `==` is intentional: both sides should be the same 512 samples.
        @test Array(out) == Array(f(ct.RNG(UInt32(42), UInt32(0)), Float32, 512))
    end
end
339-
340-
# Host-side API tests, repeated for every entry of `RAND_DISTS`: `f`/`f!` are
# called with `ct.RNG` instances and the results are copied back with `Array`
# for inspection.
@testset " host $(dist.label) " for dist in RAND_DISTS
    f, f!, in_range = dist.f, dist.f!, dist.in_range
    N = 4096

    @testset " $(dist.label) ! basics + counter advance" begin
        # Fill an N-element array in place, then check support, the RNG
        # counter (expected to equal N afterwards), and the sample mean.
        rng = ct.RNG(42)
        A = CUDA.zeros(Float32, N)
        f!(rng, A)
        v = Array(A)
        @test all(isfinite, v)
        @test all(in_range, v)
        @test rng.counter == UInt32(N)
        @test abs(sum(v) / N - dist.mean) < 0.1
    end

    @testset " determinism + Philox re-keying" begin
        # Same seed → byte-identical output.
        @test Array(f(ct.RNG(123), Float32, N)) == Array(f(ct.RNG(123), Float32, N))

        # Different seeds → uncorrelated streams. Set-disjoint isn't safe
        # for either distribution at this N due to Float32 birthday flake
        # (especially randexp, whose output concentrates near 0); use
        # element-wise equality instead — uncorrelated streams collide in
        # ≤ N²/2^24 ≈ 1 position.
        c = Array(f(ct.RNG(42), Float32, N))
        d = Array(f(ct.RNG(100), Float32, N))
        # Allow fewer than 1% position-wise matches.
        @test sum(c .== d) < N ÷ 100
    end

    @testset " consecutive disjoint; seed! resets counter" begin
        # Two back-to-back draws on the same RNG use disjoint counter
        # ranges, so the underlying Philox outputs (and post-transform
        # samples) are disjoint up to Float32 birthday flake.
        rng = ct.RNG(7)
        a = Array(f(rng, Float32, N))
        b = Array(f(rng, Float32, N))
        @test sum(a .== b) < N ÷ 100

        # Re-seeding is expected to rewind the counter to zero.
        Random.seed!(rng, 7)
        @test rng.counter == UInt32(0)
    end

    @testset " T-coverage" for T in (Float16, ct.BFloat16, Float32, Float64)
        # Same support, finiteness, mean and bin-shape checks for each
        # element type.
        v = Array(f(ct.RNG(13), T, N))
        @test eltype(v) === T
        @test all(in_range, v)
        @test all(isfinite, v)
        @test abs(sum(Float64, v) / N - dist.mean) < mean_tol(T)
        check_dist(v, dist)
    end

    @testset " ct.$(dist.label) / ct.$(dist.label) ! aliases default to Float32" begin
        # Pick the `ct` aliases by label; the string must match the `label`
        # field in `RAND_DISTS` exactly.
        ct_f, ct_f! = dist.label == " randn" ? (ct.randn, ct.randn!) :
                                                (ct.randexp, ct.randexp!)
        @test eltype(ct_f(N)) === Float32
        @test eltype(ct_f(Float32, N)) === Float32
        B = CUDA.zeros(Float32, N); ct_f!(B)
        @test all(in_range, Array(B))
    end

    @testset " arbitrary length (partial last tile)" begin
        # `store_partition_view` clips OOB writes, so `length(A)` can be
        # any value. The host advances by `n_blocks * RAND_FILL_TILE`,
        # not `length(A)`, so consecutive partial-length calls remain
        # disjoint up to Float32 birthday flake.
        rng = ct.RNG(0)
        A = CUDA.zeros(Float32, 513)
        f!(rng, A)
        v = Array(A)
        @test all(isfinite, v)
        # The assertion expects 513 elements to span exactly two fill tiles.
        @test rng.counter == UInt32(2 * cuTile.RAND_FILL_TILE)

        # Two consecutive 100-element draws from one RNG: allow fewer than
        # 10 position-wise matches.
        rng2 = ct.RNG(0)
        a = Array(f(rng2, Float32, 100))
        b = Array(f(rng2, Float32, 100))
        @test sum(a .== b) < 100 ÷ 10
    end
end
419-
420-
421226@testset " host rand" begin
422227 N = 2048
423228
0 commit comments