Skip to content

Commit 27d88d2

Browse files
committed
Test reproducer with working LTS driver
1 parent 4a94fd7 commit 27d88d2

File tree

2 files changed

+328
-0
lines changed

2 files changed

+328
-0
lines changed

conda-recipe/build.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
#!/bin/bash
22

3+
# Test reproducer:
4+
echo "building ..."
5+
icpx -fsycl --gcc-install-dir=$BUILD_PREFIX/lib/gcc/x86_64-conda-linux-gnu/14.3.0 --sysroot=$BUILD_PREFIX/x86_64-conda-linux-gnu/sysroot test_minimal.cpp -o test_minimal
6+
echo "build is completed, run now ..."
7+
./test_minimal
8+
echo "run is done"
9+
310
# This is necessary to help DPC++ find Intel libraries such as SVML, IRNG, etc in build prefix
411
export LIBRARY_PATH="$LIBRARY_PATH:${BUILD_PREFIX}/lib"
512

test_minimal.cpp

Lines changed: 321 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,321 @@
1+
// Minimal reproducer
2+
//
3+
// Build: icpx -fsycl --gcc-install-dir=$CONDA_PREFIX/lib/gcc/x86_64-conda-linux-gnu/14.3.0 --sysroot=$CONDA_PREFIX/x86_64-conda-linux-gnu/sysroot test_minimal.cpp -o test_minimal
4+
// Run: ./test_minimal
5+
6+
#include <sycl/sycl.hpp>
7+
#include <iostream>
8+
#include <vector>
9+
#include <iomanip>
10+
11+
using namespace sycl;
12+
13+
// Print detailed device information
14+
void print_device_info(const device& dev) {
15+
std::cout << "========================================" << std::endl;
16+
std::cout << "DEVICE INFORMATION" << std::endl;
17+
std::cout << "========================================" << std::endl;
18+
std::cout << std::endl;
19+
20+
std::cout << "Device name: " << dev.get_info<info::device::name>() << std::endl;
21+
std::cout << "Vendor: " << dev.get_info<info::device::vendor>() << std::endl;
22+
std::cout << "Driver version: " << dev.get_info<info::device::driver_version>() << std::endl;
23+
std::cout << "Device version: " << dev.get_info<info::device::version>() << std::endl;
24+
25+
std::cout << std::endl;
26+
std::cout << "Device type: ";
27+
if (dev.is_cpu()) std::cout << "CPU";
28+
else if (dev.is_gpu()) std::cout << "GPU";
29+
else if (dev.is_accelerator()) std::cout << "Accelerator";
30+
else std::cout << "Unknown";
31+
std::cout << std::endl;
32+
33+
std::cout << std::endl;
34+
std::cout << "Max compute units: " << dev.get_info<info::device::max_compute_units>() << std::endl;
35+
std::cout << "Max work group size: " << dev.get_info<info::device::max_work_group_size>() << std::endl;
36+
std::cout << "Max work item dimensions: " << dev.get_info<info::device::max_work_item_dimensions>() << std::endl;
37+
38+
auto max_work_item_sizes = dev.get_info<info::device::max_work_item_sizes<3>>();
39+
std::cout << "Max work item sizes: ["
40+
<< max_work_item_sizes[0] << ", "
41+
<< max_work_item_sizes[1] << ", "
42+
<< max_work_item_sizes[2] << "]" << std::endl;
43+
44+
std::cout << std::endl;
45+
std::cout << "Global mem size: "
46+
<< (dev.get_info<info::device::global_mem_size>() / (1024*1024)) << " MB" << std::endl;
47+
std::cout << "Local mem size: "
48+
<< (dev.get_info<info::device::local_mem_size>() / 1024) << " KB" << std::endl;
49+
std::cout << "Max mem alloc size: "
50+
<< (dev.get_info<info::device::max_mem_alloc_size>() / (1024*1024)) << " MB" << std::endl;
51+
52+
std::cout << std::endl;
53+
std::cout << "Supports USM device: "
54+
<< (dev.has(aspect::usm_device_allocations) ? "YES" : "NO") << std::endl;
55+
std::cout << "Supports USM host: "
56+
<< (dev.has(aspect::usm_host_allocations) ? "YES" : "NO") << std::endl;
57+
std::cout << "Supports USM shared: "
58+
<< (dev.has(aspect::usm_shared_allocations) ? "YES" : "NO") << std::endl;
59+
60+
std::cout << std::endl;
61+
std::cout << "========================================" << std::endl;
62+
std::cout << std::endl;
63+
}
64+
65+
// Kernel with backward dimension writes
66+
template <typename cumsumT, typename indexT>
67+
class NonzeroIndexKernel;
68+
69+
template <typename cumsumT, typename indexT>
70+
sycl::event extract_nonzero_indices(
71+
queue &q,
72+
size_t n_elems,
73+
size_t nz_count,
74+
int ndim,
75+
const cumsumT* cumsum_data,
76+
indexT* indices_data,
77+
const size_t* shape
78+
)
79+
{
80+
constexpr size_t lws = 256;
81+
const size_t n_groups = (n_elems + lws - 1) / lws;
82+
83+
return q.submit([&](handler &cgh) {
84+
local_accessor<cumsumT, 1> local_cumsum(lws + 1, cgh);
85+
86+
cgh.parallel_for<NonzeroIndexKernel<cumsumT, indexT>>(
87+
nd_range<1>(n_groups * lws, lws),
88+
[=](nd_item<1> ndit) {
89+
const size_t gid = ndit.get_global_id(0);
90+
const size_t lid = ndit.get_local_id(0);
91+
const size_t group_id = ndit.get_group(0);
92+
const size_t group_start = group_id * lws;
93+
94+
// Load cumsum with halo
95+
if (lid == 0) {
96+
local_cumsum[0] = (group_start == 0) ? 0 : cumsum_data[group_start - 1];
97+
}
98+
if (group_start + lid < n_elems) {
99+
local_cumsum[lid + 1] = cumsum_data[group_start + lid];
100+
}
101+
102+
group_barrier(ndit.get_group());
103+
104+
if (gid < n_elems) {
105+
bool is_nonzero = (local_cumsum[lid + 1] != local_cumsum[lid]);
106+
107+
if (is_nonzero) {
108+
cumsumT output_pos = local_cumsum[lid + 1] - 1;
109+
size_t flat_idx = gid;
110+
111+
for (int dim = ndim - 1; dim >= 0; dim--) {
112+
indices_data[output_pos * ndim + dim] = flat_idx % shape[dim];
113+
flat_idx /= shape[dim];
114+
}
115+
}
116+
}
117+
}
118+
);
119+
});
120+
}
121+
122+
int main() {
123+
queue q;
124+
int64_t *cumsum_device = nullptr;
125+
size_t *indices_device = nullptr;
126+
size_t *shape_device = nullptr;
127+
size_t *indices_host = nullptr;
128+
129+
try {
130+
q = queue(default_selector_v);
131+
132+
auto device = q.get_device();
133+
print_device_info(device);
134+
135+
std::cout << "========================================" << std::endl;
136+
std::cout << "TEST CONFIGURATION" << std::endl;
137+
std::cout << "========================================" << std::endl;
138+
std::cout << std::endl;
139+
140+
// Test parameters
141+
const size_t n_elems = 6;
142+
const int ndim = 2;
143+
const size_t nz_count = 3;
144+
const std::vector<size_t> shape = {2, 3};
145+
146+
std::cout << "Input array (flat): [1, 0, 0, 4, 0, 6]" << std::endl;
147+
std::cout << "Input array (2D): [[1, 0, 0]," << std::endl;
148+
std::cout << " [4, 0, 6]]" << std::endl;
149+
std::cout << "Shape: [" << shape[0] << ", " << shape[1] << "]" << std::endl;
150+
std::cout << std::endl;
151+
152+
std::cout << "Cumsum (precomputed): [1, 1, 1, 2, 2, 3]" << std::endl;
153+
std::cout << "Nonzero elements: 3" << std::endl;
154+
std::cout << "Nonzero positions:" << std::endl;
155+
std::cout << " gid=0 → output[0] → row=0, col=0" << std::endl;
156+
std::cout << " gid=3 → output[1] → row=1, col=0" << std::endl;
157+
std::cout << " gid=5 → output[2] → row=1, col=2" << std::endl;
158+
std::cout << std::endl;
159+
160+
std::cout << "Kernel configuration:" << std::endl;
161+
std::cout << " Work group size: 256" << std::endl;
162+
std::cout << " Number of groups: 1" << std::endl;
163+
std::cout << " Total work items: 256" << std::endl;
164+
std::cout << " Active work items: 6 (processing 6 elements)" << std::endl;
165+
std::cout << " Local memory: (256 + 1) * 8 bytes = 2056 bytes" << std::endl;
166+
std::cout << std::endl;
167+
168+
std::cout << "========================================" << std::endl;
169+
std::cout << std::endl;
170+
171+
// Hardcoded cumsum values for input [[1, 0, 0], [4, 0, 6]]
172+
int64_t cumsum_values[] = {1, 1, 1, 2, 2, 3};
173+
174+
// Allocate device memory
175+
cumsum_device = malloc_device<int64_t>(n_elems, q);
176+
indices_device = malloc_device<size_t>(nz_count * ndim, q);
177+
shape_device = malloc_device<size_t>(ndim, q);
178+
179+
if (!cumsum_device || !indices_device || !shape_device) {
180+
throw std::runtime_error("Device allocation failed");
181+
}
182+
183+
// Copy data to device
184+
q.copy<int64_t>(cumsum_values, cumsum_device, n_elems).wait();
185+
q.copy<size_t>(shape.data(), shape_device, ndim).wait();
186+
187+
std::cout << "Running kernel..." << std::endl;
188+
std::cout << "(writes dim 1 first, then dim 0)" << std::endl;
189+
std::cout << std::endl;
190+
191+
// Run the kernel
192+
auto kernel_ev = extract_nonzero_indices<int64_t, size_t>(
193+
q, n_elems, nz_count, ndim,
194+
cumsum_device, indices_device, shape_device
195+
);
196+
kernel_ev.wait();
197+
198+
// Read results
199+
indices_host = malloc_host<size_t>(nz_count * ndim, q);
200+
if (!indices_host) {
201+
throw std::runtime_error("Host allocation failed");
202+
}
203+
q.copy<size_t>(indices_device, indices_host, nz_count * ndim).wait();
204+
205+
std::cout << "========================================" << std::endl;
206+
std::cout << "RESULTS" << std::endl;
207+
std::cout << "========================================" << std::endl;
208+
std::cout << std::endl;
209+
210+
// Print raw packed output
211+
std::cout << "Raw packed output: [";
212+
for (size_t i = 0; i < nz_count * ndim; i++) {
213+
std::cout << indices_host[i];
214+
if (i < nz_count * ndim - 1) std::cout << ", ";
215+
}
216+
std::cout << "]" << std::endl;
217+
std::cout << "Expected output: [0, 0, 1, 0, 1, 2]" << std::endl;
218+
std::cout << "Format: [row0, col0, row1, col1, row2, col2]" << std::endl;
219+
std::cout << std::endl;
220+
221+
// Unpack
222+
std::vector<size_t> rows(nz_count), cols(nz_count);
223+
for (size_t i = 0; i < nz_count; i++) {
224+
rows[i] = indices_host[i * ndim + 0];
225+
cols[i] = indices_host[i * ndim + 1];
226+
}
227+
228+
std::cout << "Row indices: [";
229+
for (auto v : rows) std::cout << v << " ";
230+
std::cout << "]" << std::endl;
231+
std::cout << "Expected rows: [0 1 1]" << std::endl;
232+
std::cout << std::endl;
233+
234+
std::cout << "Col indices: [";
235+
for (auto v : cols) std::cout << v << " ";
236+
std::cout << "]" << std::endl;
237+
std::cout << "Expected cols: [0 0 2]" << std::endl;
238+
std::cout << std::endl;
239+
240+
// Verify
241+
std::vector<size_t> expected_rows = {0, 1, 1};
242+
std::vector<size_t> expected_cols = {0, 0, 2};
243+
bool correct = (rows == expected_rows) && (cols == expected_cols);
244+
245+
std::cout << "========================================" << std::endl;
246+
if (correct) {
247+
std::cout << "✓ Test PASSED!" << std::endl;
248+
return 0;
249+
} else {
250+
std::cout << "✗ Test FAILED!" << std::endl;
251+
std::cout << std::endl;
252+
std::cout << "Analysis:" << std::endl;
253+
254+
// Detailed analysis
255+
bool rows_match = (rows == expected_rows);
256+
bool cols_match = (cols == expected_cols);
257+
258+
if (!rows_match) {
259+
std::cout << " - Row indices are WRONG" << std::endl;
260+
std::cout << " Expected: [0 1 1]" << std::endl;
261+
std::cout << " Got: [";
262+
for (auto v : rows) std::cout << v << " ";
263+
std::cout << "]" << std::endl;
264+
} else {
265+
std::cout << " - Row indices are correct" << std::endl;
266+
}
267+
268+
if (!cols_match) {
269+
std::cout << " - Column indices are WRONG" << std::endl;
270+
std::cout << " Expected: [0 0 2]" << std::endl;
271+
std::cout << " Got: [";
272+
for (auto v : cols) std::cout << v << " ";
273+
std::cout << "]" << std::endl;
274+
} else {
275+
std::cout << " - Column indices are correct" << std::endl;
276+
}
277+
278+
std::cout << std::endl;
279+
280+
// Cleanup
281+
if (cumsum_device) free(cumsum_device, q);
282+
if (indices_device) free(indices_device, q);
283+
if (shape_device) free(shape_device, q);
284+
if (indices_host) free(indices_host, q);
285+
286+
return 1;
287+
}
288+
289+
// Cleanup
290+
if (cumsum_device) free(cumsum_device, q);
291+
if (indices_device) free(indices_device, q);
292+
if (shape_device) free(shape_device, q);
293+
if (indices_host) free(indices_host, q);
294+
295+
return 0;
296+
297+
} catch (exception const& e) {
298+
std::cerr << std::endl;
299+
std::cerr << "========================================" << std::endl;
300+
std::cerr << "SYCL EXCEPTION" << std::endl;
301+
std::cerr << "========================================" << std::endl;
302+
std::cerr << e.what() << std::endl;
303+
304+
// Cleanup on error
305+
if (cumsum_device) free(cumsum_device, q);
306+
if (indices_device) free(indices_device, q);
307+
if (shape_device) free(shape_device, q);
308+
if (indices_host) free(indices_host, q);
309+
310+
return 1;
311+
} catch (std::exception const& e) {
312+
std::cerr << std::endl;
313+
std::cerr << "========================================" << std::endl;
314+
std::cerr << "STANDARD EXCEPTION" << std::endl;
315+
std::cerr << "========================================" << std::endl;
316+
std::cerr << e.what() << std::endl;
317+
318+
// Note: Can't cleanup here as we don't have queue reference
319+
return 1;
320+
}
321+
}

0 commit comments

Comments
 (0)