@@ -330,10 +330,15 @@ def get_kernel(self, max_ntargets_in_one_box):
330330 if=run_itgt}
331331 end
332332 end
333- result[iknl, itgt] = result[iknl, itgt] + result_temp[ \
333+ for itgt_offset
334+ <> itgt2 = itgt_start + itgt_offset {id=init_itgt_for_write}
335+ <> run_itgt2 = itgt_start + itgt_offset < itgt_end \
336+ {id=init_cond_for_write}
337+ result[iknl, itgt2] = result[iknl, itgt2] + result_temp[ \
334338 itgt_offset, iknl] * kernel_scaling \
335- {dep=update_result:init_result,id=write_result, \
336- dup=iknl,if=run_itgt}
339+ {dep=update_result:init_result,id=write_result, \
340+ dup=iknl,if=run_itgt2}
341+ end
337342 end
338343 """ ],
339344 [
@@ -386,7 +391,8 @@ def get_optimized_kernel(self, max_ntargets_in_one_box):
386391 knl = lp .privatize_temporaries_with_inames (knl ,
387392 "itgt_offset_outer" , "result_temp" )
388393 knl = lp .duplicate_inames (knl , "itgt_offset_outer" , "id:init_result" )
389- knl = lp .duplicate_inames (knl , "itgt_offset_outer" , "id:write_result" )
394+ knl = lp .duplicate_inames (knl , "itgt_offset_outer" ,
395+ "id:write_result or id:init_itgt_for_write or id:init_cond_for_write" )
390396 knl = lp .add_inames_to_insn (knl , "dummy" ,
391397 "id:init_box* or id:fetch_src_box or id:fetch_center "
392398 "or id:kernel_scaling" )
0 commit comments