@@ -1098,3 +1098,40 @@ end
10981098
10991099 @test Array (b) ≈ Array (a)
11001100end
1101+
@testset "scalar indexing as loop bound" begin
    # Kernel under test: reads one scalar out of `lengths` (indexed by the
    # value returned from `ct.bid(1)`) and uses it as the upper bound of a
    # `while` loop. Each iteration loads the j-th 16-element tile of `data`
    # and adds it to an accumulator; the accumulator is finally stored into
    # `out` at the same index that was used to read `lengths`.
    function scalar_index_loop_kernel(data::ct.TileArray{Float32,1},
                                      lengths::ct.TileArray{Int32,1},
                                      out::ct.TileArray{Float32,1})
        bid = ct.bid(1)
        len = lengths[bid]  # scalar read that becomes the loop bound
        acc = ct.zeros((16,), Float32)
        j = Int32(1)  # Int32 counter, same element type as `lengths`
        while j <= len
            tile = ct.load(data, j, (16,))
            acc = acc .+ tile
            j += Int32(1)
        end
        ct.store(out, bid, acc)
        return
    end

    # Host-side mirror of the 16-element tile shape written literally in the
    # kernel above; the two must be kept in sync.
    tile_len = 16

    # 3 blocks, each sums a different number of tiles
    n_tiles = Int32[2, 3, 1]
    n_blocks = length(n_tiles)

    # `data` must hold as many tiles as the largest per-block count (3 tiles of 16)
    data = CUDA.rand(Float32, Int(maximum(n_tiles)) * tile_len)
    lengths = CuArray(n_tiles)
    # one output tile per block
    out = CUDA.zeros(Float32, n_blocks * tile_len)

    ct.launch(scalar_index_loop_kernel, n_blocks, data, lengths, out)

    data_cpu = Array(data)
    out_cpu = Array(out)

    # Element range covered by the i-th tile (1-based) of a flat array
    tile_range(i) = ((i - 1) * tile_len + 1):(i * tile_len)

    for bid in 1:n_blocks
        # CPU reference: sum of the first n_tiles[bid] tiles of `data`
        expected = zeros(Float32, tile_len)
        for j in 1:n_tiles[bid]
            expected .+= data_cpu[tile_range(j)]
        end
        @test out_cpu[tile_range(bid)] ≈ expected
    end
end
1137+
0 commit comments