-
Notifications
You must be signed in to change notification settings - Fork 137
Expand file tree
/
Copy pathtest_load_store_advanced_indexing.py
More file actions
323 lines (248 loc) · 13.6 KB
/
test_load_store_advanced_indexing.py
File metadata and controls
323 lines (248 loc) · 13.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
# SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch
import cuda.tile as ct
from cuda.tile._bytecode.version import BytecodeVersion
from cuda.tile._exception import TileTypeError
from util import assert_equal
from conftest import requires_tileiras
pytestmark = requires_tileiras(BytecodeVersion.V_13_3)
# ===========================================================================================
# ct.load_advanced_indexing / ct.store_advanced_indexing: basic load/store
# ===========================================================================================
@ct.kernel
def load_store_advanced_rows(x, y, ROWS: ct.Constant[int], COLS: ct.Constant[int]):
    # Round-trip kernel: read rows 0..ROWS-1 of `x` through advanced indexing and
    # write the loaded tile back to the same rows of `y`.  The column window starts
    # at column 0 and spans COLS columns for both the load and the store.
    row_ids = ct.arange(ROWS, dtype=ct.int32)
    loaded = ct.load_advanced_indexing(x, (row_ids, ct.Slice(0, COLS)))
    ct.store_advanced_indexing(y, (row_ids, ct.Slice(0, COLS)), loaded)
def test_store_basic():
    """Copy an 8x4 int32 array through the advanced-indexing load/store kernel."""
    n_rows, n_cols = 8, 4
    src = torch.arange(n_rows * n_cols, device='cuda', dtype=torch.int32)
    src = src.reshape(n_rows, n_cols)
    dst = torch.zeros(n_rows, n_cols, device='cuda', dtype=torch.int32)
    kernel_args = (src, dst, n_rows, n_cols)
    ct.launch(torch.cuda.current_stream(), (1,), load_store_advanced_rows, kernel_args)
    # The kernel covers every row and column, so the destination must equal the source.
    assert_equal(src, dst)
# ===========================================================================================
# ct.load_advanced_indexing/store_advanced_indexing: non-contiguous row indices (gather/scatter)
# ===========================================================================================
@ct.kernel
def gather_even_rows(x, y, ROWS: ct.Constant[int], COLS: ct.Constant[int]):
    # Gather rows 0, 2, ..., 2 * (ROWS - 1) of `x`, restricted to the first COLS
    # columns, and store the gathered tile into `y` at index (0, 0).
    even_rows = ct.arange(ROWS, dtype=ct.int32) * 2
    gathered = ct.load_advanced_indexing(x, (even_rows, ct.Slice(0, COLS)))
    ct.store(y, (0, 0), gathered)
def test_gather_non_contiguous():
    """Gather every second row, and the first half of the columns, of an 8x4 array."""
    src_rows, src_cols = 8, 4
    src = torch.arange(src_rows * src_cols, device='cuda', dtype=torch.int32)
    src = src.reshape(src_rows, src_cols)
    out_rows, out_cols = src_rows // 2, src_cols // 2
    out = torch.zeros(out_rows, out_cols, device='cuda', dtype=torch.int32)
    ct.launch(torch.cuda.current_stream(), (1,), gather_even_rows,
              (src, out, out_rows, out_cols))
    # Reference result computed with plain strided torch slicing.
    assert_equal(src[::2, :out_cols], out)
@ct.kernel
def scatter_even_rows(y, ROWS: ct.Constant[int], COLS: ct.Constant[int], col_start):
    # Scatter a ROWS x COLS tile filled with 99 into rows 0, 2, ..., 2 * (ROWS - 1)
    # of `y` (i.e. [0, 2, 4, 6] when ROWS == 4).  The column window begins at the
    # runtime value `col_start` and spans COLS columns.
    even_rows = ct.arange(ROWS, dtype=ct.int32) * 2
    payload = ct.full((ROWS, COLS), 99, dtype=y.dtype)
    ct.store_advanced_indexing(y, (even_rows, ct.Slice(col_start, COLS)), payload)
def test_scatter_non_contiguous():
    """Scatter a tile of 99s into the even rows of a zero-initialised 8x4 array.

    The kernel derives its row indices as ``arange(ROWS) * 2``, so ROWS must be half
    the number of array rows for the indices to be exactly [0, 2, 4, 6].  Passing the
    full row count would additionally generate indices 8..14, which lie outside the
    array, so the test would depend on how out-of-bounds sparse indices are handled
    on store instead of exercising a plain non-contiguous scatter.
    """
    y_rows, y_cols = 8, 4
    tile_rows = y_rows // 2  # one tile row per even array row
    y = torch.zeros(y_rows, y_cols, device='cuda', dtype=torch.int32)
    ct.launch(torch.cuda.current_stream(), (1,), scatter_even_rows,
              (y, tile_rows, y_cols, 0))
    # Even rows are overwritten with 99; odd rows must keep their initial zeros.
    expected = torch.zeros_like(y)
    expected[::2] = 99
    assert_equal(expected, y)
# ===========================================================================================
# ct.load_advanced_indexing: ct.Slice with dynamic start
# ===========================================================================================
@ct.kernel
def load_advanced_dynamic_col(x, y, ROWS: ct.Constant[int], COLS: ct.Constant[int],
                              col_start):
    # Load rows 0..ROWS-1 of `x` with a column window that begins at the runtime
    # value `col_start` and spans COLS columns, then store the tile into `y` at
    # index (0, 0).
    row_ids = ct.arange(ROWS, dtype=ct.int32)
    window = ct.load_advanced_indexing(x, (row_ids, ct.Slice(col_start, COLS)))
    ct.store(y, (0, 0), window)
@pytest.mark.parametrize("col_start", [0, 1, 2])
def test_load_dynamic_col_start(col_start):
    """Load a two-column window whose start column is passed as a kernel argument."""
    n_rows, n_cols = 8, 4
    window_cols = n_cols // 2
    src = torch.arange(n_rows * n_cols, device='cuda', dtype=torch.int32)
    src = src.reshape(n_rows, n_cols)
    out = torch.zeros(n_rows, window_cols, device='cuda', dtype=torch.int32)
    kernel_args = (src, out, n_rows, window_cols, col_start)
    ct.launch(torch.cuda.current_stream(), (1,), load_advanced_dynamic_col, kernel_args)
    # The loaded tile must match the same column window taken with torch slicing.
    window_end = col_start + window_cols
    assert_equal(src[:, col_start:window_end], out)
# ===========================================================================================
# ct.load_advanced_indexing: ct.Slice with constant start
# ===========================================================================================
@ct.kernel
def load_advanced_const_col_start(x, y, ROWS: ct.Constant[int], COLS: ct.Constant[int]):
    # Same as the dynamic-start kernel, except that the column window start is the
    # literal 2: load rows 0..ROWS-1, COLS columns starting at column 2, and store
    # the tile into `y` at index (0, 0).
    row_ids = ct.arange(ROWS, dtype=ct.int32)
    window = ct.load_advanced_indexing(x, (row_ids, ct.Slice(2, COLS)))
    ct.store(y, (0, 0), window)
def test_load_constant_col_start():
    """Load a four-column window starting at the constant column 2 of an 8x8 array."""
    n_rows, src_cols, window_cols = 8, 8, 4
    src = torch.arange(n_rows * src_cols, device='cuda', dtype=torch.int32)
    src = src.reshape(n_rows, src_cols)
    out = torch.zeros(n_rows, window_cols, device='cuda', dtype=torch.int32)
    kernel_args = (src, out, n_rows, window_cols)
    ct.launch(torch.cuda.current_stream(), (1,), load_advanced_const_col_start, kernel_args)
    # The kernel hard-codes a start column of 2, so compare against columns [2, 6).
    assert_equal(src[:, 2:2 + window_cols], out)
# ===========================================================================================
# ct.load_advanced_indexing: out-of-order sparse indices gather rows in specified order.
# ===========================================================================================
def test_load_out_of_order_sparse():
    """Rows are gathered in the order given by the index tile, not in sorted order."""
    @ct.kernel
    def kernel(x, y, ROWS: ct.Constant[int], COLS: ct.Constant[int]):
        lane = ct.arange(ROWS, dtype=ct.int32)
        sevens = ct.full((ROWS,), 7, dtype=ct.int32)
        fours = ct.full((ROWS,), 4, dtype=ct.int32)
        # lane 0 -> row 7, lane 1 -> row 4, every other lane maps to itself;
        # with ROWS == 4 the resulting index tile is [7, 4, 2, 3].
        tail = ct.where(lane == 1, fours, lane)
        row_ids = ct.where(lane == 0, sevens, tail)
        gathered = ct.load_advanced_indexing(x, (row_ids, ct.Slice(0, COLS)))
        ct.store(y, (0, 0), gathered)

    src = torch.arange(32, device='cuda', dtype=torch.int32).reshape(8, 4)
    out = torch.zeros(4, 4, device='cuda', dtype=torch.int32)
    ct.launch(torch.cuda.current_stream(), (1,), kernel, (src, out, 4, 4))
    # torch fancy indexing with the same row order gives the reference result.
    reference = src[[7, 4, 2, 3], :]
    assert_equal(out, reference)
# ===========================================================================================
# ct.load_advanced_indexing: out-of-bounds (OOB) zero padding and repeated sparse indices
# ===========================================================================================
def test_load_zero_padding():
    """Slice columns past the array edge are read as zero under PaddingMode.ZERO."""
    @ct.kernel
    def load_advanced_zero_padding(x, y, ROWS: ct.Constant[int], COLS: ct.Constant[int],
                                   col_start):
        row_ids = ct.arange(ROWS, dtype=ct.int32)
        padded = ct.load_advanced_indexing(x, (row_ids, ct.Slice(col_start, COLS)),
                                           padding_mode=ct.PaddingMode.ZERO)
        ct.store(y, (0, 0), padded)

    n_rows, n_cols = 4, 8
    window_cols = n_cols // 2
    # Source values start at 1 so that a padded zero can never be confused with data.
    src = torch.arange(1, n_rows * n_cols + 1, device='cuda', dtype=torch.int32)
    src = src.reshape(n_rows, n_cols)
    # Pre-fill the output with -1 so elements the kernel fails to write are detected.
    out = torch.full((n_rows, window_cols), -1, device='cuda', dtype=torch.int32)
    col_start = 6
    ct.launch(torch.cuda.current_stream(), (1,),
              load_advanced_zero_padding, (src, out, n_rows, window_cols, col_start))

    # Window [6, 10) on an 8-column array: only the first two columns are in bounds.
    in_bounds = n_cols - col_start
    expected = torch.zeros(n_rows, window_cols, device='cuda', dtype=torch.int32)
    expected[:, :in_bounds] = src[:, col_start:]
    assert_equal(expected, out)
def test_load_sparse_partial_oob_zero_padding():
    """Out-of-bounds entries of the sparse index tile load as zero rows with ZERO padding."""
    @ct.kernel
    def kernel(x, y, ROWS: ct.Constant[int], COLS: ct.Constant[int]):
        # With ROWS == 4 the index tile is [6, 7, 8, 9]; for an 8-row array the
        # first two entries are valid and the last two are out of bounds.
        row_ids = ct.arange(ROWS, dtype=ct.int32) + 6
        padded = ct.load_advanced_indexing(x, (row_ids, ct.Slice(0, COLS)),
                                           padding_mode=ct.PaddingMode.ZERO)
        ct.store(y, (0, 0), padded)

    src = torch.arange(32, device='cuda', dtype=torch.int32).reshape(8, 4)
    # Pre-fill with -1 so rows the kernel does not write would show up as a mismatch.
    out = torch.full((4, 4), -1, device='cuda', dtype=torch.int32)
    ct.launch(torch.cuda.current_stream(), (1,), kernel, (src, out, 4, 4))

    # Rows 6 and 7 are copied; the two out-of-bounds rows are expected to be zero.
    expected = torch.zeros(4, 4, device='cuda', dtype=torch.int32)
    expected[:2] = src[6:8]
    assert_equal(out, expected)
def test_load_repeated_sparse_correct():
    """A row index that appears more than once loads the same source row each time."""
    @ct.kernel
    def kernel(x, y, ROWS: ct.Constant[int], COLS: ct.Constant[int]):
        lane = ct.arange(ROWS, dtype=ct.int32)
        row_zero = ct.zeros((ROWS,), dtype=ct.int32)
        # Lanes 0 and 1 both point at row 0, the remaining lanes at 2 * lane;
        # with ROWS == 4 the index tile is [0, 0, 4, 6].
        row_ids = ct.where(lane < 2, row_zero, lane * 2)
        gathered = ct.load_advanced_indexing(x, (row_ids, ct.Slice(0, COLS)))
        ct.store(y, (0, 0), gathered)

    src = torch.arange(32, device='cuda', dtype=torch.int32).reshape(8, 4)
    out = torch.zeros(4, 4, device='cuda', dtype=torch.int32)
    ct.launch(torch.cuda.current_stream(), (1,), kernel, (src, out, 4, 4))
    # torch fancy indexing with the same repeated index gives the reference result.
    reference = src[[0, 0, 4, 6], :]
    assert_equal(reference, out)
# ===========================================================================================
# ct.store_advanced_indexing semantics
# ===========================================================================================
def test_store_repeated_sparse_ub():
    """Repeated sparse indices on store do not disturb rows written by distinct indices."""
    @ct.kernel
    def kernel(y, ROWS: ct.Constant[int], COLS: ct.Constant[int]):
        lane = ct.arange(ROWS, dtype=ct.int32)
        row_zero = ct.zeros((ROWS,), dtype=ct.int32)
        # With ROWS == 4 the index tile is [0, 0, 4, 6]: row 0 is targeted twice
        # (treated as undefined behaviour by this test), rows 4 and 6 once each.
        row_ids = ct.where(lane < 2, row_zero, lane * 2)
        payload = ct.full((ROWS, COLS), 99, dtype=y.dtype)
        ct.store_advanced_indexing(y, (row_ids, ct.Slice(0, COLS)), payload)

    y = torch.zeros(8, 4, device='cuda', dtype=torch.int32)
    ct.launch(torch.cuda.current_stream(), (1,), kernel, (y, 4, 4))
    torch.cuda.synchronize()

    # Row 0 is deliberately not checked because it is written by two tile rows.
    # Rows 4 and 6 each have a single writer and must hold the stored value.
    for row in (4, 6):
        assert_equal(y[row], torch.full((4,), 99, device='cuda', dtype=torch.int32))
def test_store_dense_oob_ignored():
    """Tile columns that fall past the array's last column are dropped on store."""
    n_rows, n_cols = 8, 4
    tile_rows, tile_cols = 4, 4
    # The column window is [2, 6) while the array only has columns [0, 4),
    # so the last two tile columns have no destination.
    col_start = 2
    y = torch.zeros(n_rows, n_cols, device='cuda', dtype=torch.int32)
    kernel_args = (y, tile_rows, tile_cols, col_start)
    ct.launch(torch.cuda.current_stream(), (1,), scatter_even_rows, kernel_args)

    # Only the in-bounds columns [2, 4) of the even rows receive the value 99.
    expected = torch.zeros(n_rows, n_cols, device='cuda', dtype=torch.int32)
    expected[::2, col_start:] = 99
    assert_equal(expected, y)
# ===========================================================================================
# Error cases
# ===========================================================================================
def test_error_2d_tile_as_sparse():
    """Using a 2D tile as the sparse index is rejected with a TileTypeError."""
    @ct.kernel
    def kernel(x):
        # A 4x4 index tile instead of the 1D tile the error message asks for.
        bad_indices = ct.zeros((4, 4), dtype=ct.int32)
        ct.load_advanced_indexing(x, (bad_indices, ct.Slice(0, 4)))

    arr = torch.zeros(8, 8, device='cuda', dtype=torch.int32)
    stream = torch.cuda.current_stream()
    with pytest.raises(TileTypeError, match="1D"):
        ct.launch(stream, (1,), kernel, (arr,))
def test_error_no_sparse_dim_load():
    """A load whose indices are all slices (no index tile) raises TileTypeError."""
    @ct.kernel
    def kernel(x, y, col_start):
        # Both positions are ct.Slice objects, so there is no sparse dimension.
        loaded = ct.load_advanced_indexing(x, (ct.Slice(0, 4), ct.Slice(col_start, 4)))
        ct.store(y, (0, 0), loaded)

    src = torch.arange(64, device='cuda', dtype=torch.int32).reshape(8, 8)
    out = torch.zeros(4, 4, device='cuda', dtype=torch.int32)
    stream = torch.cuda.current_stream()
    with pytest.raises(TileTypeError, match="exactly one index must be a 1D integer Tile"):
        ct.launch(stream, (1,), kernel, (src, out, 0))
def test_error_no_sparse_dim_store():
    """A store whose indices are all slices (no index tile) raises TileTypeError."""
    @ct.kernel
    def kernel(y):
        payload = ct.full((4, 4), 99, dtype=y.dtype)
        # Both positions are ct.Slice objects, so there is no sparse dimension.
        ct.store_advanced_indexing(y, (ct.Slice(2, 4), ct.Slice(1, 4)), payload)

    arr = torch.zeros(8, 8, device='cuda', dtype=torch.int32)
    stream = torch.cuda.current_stream()
    with pytest.raises(TileTypeError, match="exactly one index must be a 1D integer Tile"):
        ct.launch(stream, (1,), kernel, (arr,))
def test_error_multiple_sparse_dims_load():
    """A load that supplies an index tile for both dimensions raises TileTypeError."""
    @ct.kernel
    def kernel(x, y):
        row_ids = ct.arange(4, dtype=ct.int32) * 2
        col_ids = ct.arange(4, dtype=ct.int32) + 1
        # Two 1D index tiles, whereas the error message demands exactly one.
        loaded = ct.load_advanced_indexing(x, (row_ids, col_ids))
        ct.store(y, (0,), loaded)

    src = torch.arange(64, device='cuda', dtype=torch.int32).reshape(8, 8)
    out = torch.zeros(4, device='cuda', dtype=torch.int32)
    stream = torch.cuda.current_stream()
    with pytest.raises(TileTypeError, match="exactly one index must be a 1D integer Tile"):
        ct.launch(stream, (1,), kernel, (src, out))
def test_error_multiple_sparse_dims_store():
    """A store that supplies an index tile for both dimensions raises TileTypeError."""
    @ct.kernel
    def kernel(y):
        row_ids = ct.arange(4, dtype=ct.int32) * 2
        col_ids = ct.arange(4, dtype=ct.int32) + 1
        payload = ct.full((4,), 99, dtype=y.dtype)
        # Two 1D index tiles, whereas the error message demands exactly one.
        ct.store_advanced_indexing(y, (row_ids, col_ids), payload)

    arr = torch.zeros(8, 8, device='cuda', dtype=torch.int32)
    stream = torch.cuda.current_stream()
    with pytest.raises(TileTypeError, match="exactly one index must be a 1D integer Tile"):
        ct.launch(stream, (1,), kernel, (arr,))
def test_error_wrong_index_rank():
    """Passing three indices for a 2D array raises TileTypeError."""
    @ct.kernel
    def kernel(x):
        row_ids = ct.arange(4, dtype=ct.int32)
        # Three index entries are supplied although the array passed in is 8x8.
        ct.load_advanced_indexing(x, (row_ids, ct.Slice(0, 4), ct.Slice(0, 4)))

    arr = torch.zeros(8, 8, device='cuda', dtype=torch.int32)
    stream = torch.cuda.current_stream()
    with pytest.raises(TileTypeError, match="does not match array rank"):
        ct.launch(stream, (1,), kernel, (arr,))
def test_error_non_power_of_2_slice_length():
    """A slice length of 3 (not a power of two) raises TileTypeError."""
    @ct.kernel
    def kernel(x):
        row_ids = ct.arange(4, dtype=ct.int32)
        # Slice length 3 is the offending value; the expected message says "power of two".
        ct.load_advanced_indexing(x, (row_ids, ct.Slice(0, 3)))

    arr = torch.zeros(8, 8, device='cuda', dtype=torch.int32)
    stream = torch.cuda.current_stream()
    with pytest.raises(TileTypeError, match="power of two"):
        ct.launch(stream, (1,), kernel, (arr,))