File tree Expand file tree Collapse file tree 1 file changed +15
-3
lines changed
dpnp/tensor/libtensor/include/utils Expand file tree Collapse file tree 1 file changed +15
-3
lines changed Original file line number Diff line number Diff line change @@ -283,9 +283,21 @@ T custom_inclusive_scan_over_group(GroupT &&wg,
283283 const bool in_range = (lane_id < n_aggregates);
284284 const bool in_bounds = in_range && (lane_id > 0 || large_wg);
285285
286- T __scan_val = (in_bounds)
287- ? local_mem_acc[(offset + lane_id) * max_sgSize - 1 ]
288- : identity;
286+ // Here is a bug where IGC incorrectly optimized the below code:
287+ // T __scan_val = (in_bounds)
288+ // ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
289+ // : identity;
290+ // That causes `__scan_val` is not initialized with `identity` value:
291+ // wgs = 256, max_sgSize = 16 => n_aggregates = 16
292+ // wi = 0: in_range = 1, in_bounds = 0 => __scan_val = identity
293+ // The w/s adds SYCL atomic fence, since the explicit memory fence
294+ // prevents reordering/elimination, while it will add slight overhead.
295+ T __scan_val = identity;
296+ sycl::atomic_fence (sycl::memory_order::relaxed,
297+ sycl::memory_scope::work_item);
298+ if (in_bounds) {
299+ __scan_val = local_mem_acc[(offset + lane_id) * max_sgSize - 1 ];
300+ }
289301 for (std::uint32_t step = 1 ; step < sgSize; step *= 2 ) {
290302 const bool advanced_lane = (lane_id >= step);
291303 const std::uint32_t src_lane_id =
You can’t perform that action at this time.
0 commit comments