Skip to content

Commit 4ffd5b5

Browse files
Apply w/a to custom_inclusive_scan_over_group from dpctl #2275
1 parent 8cc35f1 commit 4ffd5b5

File tree

1 file changed

+15
-3
lines changed

1 file changed

+15
-3
lines changed

dpnp/tensor/libtensor/include/utils/sycl_utils.hpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -283,9 +283,21 @@ T custom_inclusive_scan_over_group(GroupT &&wg,
283283
const bool in_range = (lane_id < n_aggregates);
284284
const bool in_bounds = in_range && (lane_id > 0 || large_wg);
285285

286-
T __scan_val = (in_bounds)
287-
? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
288-
: identity;
286+
// Here is a bug where IGC incorrectly optimized the below code:
287+
// T __scan_val = (in_bounds)
288+
// ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
289+
// : identity;
290+
// That causes `__scan_val` is not initialized with `identity` value:
291+
// wgs = 256, max_sgSize = 16 => n_aggregates = 16
292+
// wi = 0: in_range = 1, in_bounds = 0 => __scan_val = identity
293+
// The w/s adds SYCL atomic fence, since the explicit memory fence
294+
// prevents reordering/elimination, while it will add slight overhead.
295+
T __scan_val = identity;
296+
sycl::atomic_fence(sycl::memory_order::relaxed,
297+
sycl::memory_scope::work_item);
298+
if (in_bounds) {
299+
__scan_val = local_mem_acc[(offset + lane_id) * max_sgSize - 1];
300+
}
289301
for (std::uint32_t step = 1; step < sgSize; step *= 2) {
290302
const bool advanced_lane = (lane_id >= step);
291303
const std::uint32_t src_lane_id =

0 commit comments

Comments
 (0)