Skip to content

Commit 8b5e3b6

Browse files
authored
Merge pull request #2275 from IntelPython/add-w/a-to-custom_inclusive_scan_over_group
Add a workaround to `custom_inclusive_scan_over_group`
2 parents e8af8c9 + b495d8e commit 8b5e3b6

File tree

1 file changed

+15
-3
lines changed

1 file changed

+15
-3
lines changed

dpctl/tensor/libtensor/include/utils/sycl_utils.hpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -268,9 +268,21 @@ T custom_inclusive_scan_over_group(GroupT &&wg,
268268
const bool in_range = (lane_id < n_aggregates);
269269
const bool in_bounds = in_range && (lane_id > 0 || large_wg);
270270

271-
T __scan_val = (in_bounds)
272-
? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
273-
: identity;
271+
// Here is a bug where IGC incorrectly optimized the below code:
272+
// T __scan_val = (in_bounds)
273+
// ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
274+
// : identity;
275+
// That causes `__scan_val` is not initialized with `identity` value:
276+
// wgs = 256, max_sgSize = 16 => n_aggregates = 16
277+
// wi = 0: in_range = 1, in_bounds = 0 => __scan_val = identity
278+
// The w/s adds SYCL atomic fence, since the explicit memory fence
279+
// prevents reordering/elimination, while it will add slight overhead.
280+
T __scan_val = identity;
281+
sycl::atomic_fence(sycl::memory_order::relaxed,
282+
sycl::memory_scope::work_item);
283+
if (in_bounds) {
284+
__scan_val = local_mem_acc[(offset + lane_id) * max_sgSize - 1];
285+
}
274286
for (std::uint32_t step = 1; step < sgSize; step *= 2) {
275287
const bool advanced_lane = (lane_id >= step);
276288
const std::uint32_t src_lane_id =

0 commit comments

Comments
 (0)