File tree Expand file tree Collapse file tree 1 file changed +15
-3
lines changed
dpctl/tensor/libtensor/include/utils Expand file tree Collapse file tree 1 file changed +15
-3
lines changed Original file line number Diff line number Diff line change @@ -268,9 +268,21 @@ T custom_inclusive_scan_over_group(GroupT &&wg,
268268 const bool in_range = (lane_id < n_aggregates);
269269 const bool in_bounds = in_range && (lane_id > 0 || large_wg);
270270
271- T __scan_val = (in_bounds)
272- ? local_mem_acc[(offset + lane_id) * max_sgSize - 1 ]
273- : identity;
271+ // Here is a bug where IGC incorrectly optimized the below code:
272+ // T __scan_val = (in_bounds)
273+ // ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
274+ // : identity;
275+ // That causes `__scan_val` is not initialized with `identity` value:
276+ // wgs = 256, max_sgSize = 16 => n_aggregates = 16
277+ // wi = 0: in_range = 1, in_bounds = 0 => __scan_val = identity
278+ // The w/s adds SYCL atomic fence, since the explicit memory fence
279+ // prevents reordering/elimination, while it will add slight overhead.
280+ T __scan_val = identity;
281+ sycl::atomic_fence (sycl::memory_order::relaxed,
282+ sycl::memory_scope::work_item);
283+ if (in_bounds) {
284+ __scan_val = local_mem_acc[(offset + lane_id) * max_sgSize - 1 ];
285+ }
274286 for (std::uint32_t step = 1 ; step < sgSize; step *= 2 ) {
275287 const bool advanced_lane = (lane_id >= step);
276288 const std::uint32_t src_lane_id =
You can’t perform that action at this time.
0 commit comments