
Commit 6ebaa05

SS-JIA authored and committed
[ET-VK] Fix missing memory barrier for first-use writes on aliased tensors
Pull Request resolved: #17309

Tensors sharing physical memory via SharedObject each track their own `last_access_` independently. When a tensor's first access is a write, `prev_stage` is `NO_STAGE`, causing `transition()` to use `TOP_OF_PIPE_BIT` as `srcStageMask` with no `srcAccessMask`, which is effectively a no-op barrier. If the same physical memory was previously written through a different aliased tensor handle, this creates a WAW hazard: the new write may execute before or concurrently with the prior write, producing non-deterministic results.

This was observed as non-deterministic q8ta_conv2d output in ResNet50: running the model twice with the same input produced slightly different quantized int8 values. Adding a debug print shader after each conv2d dispatch masked the issue, because the print node's read-after-write barrier serialized the GPU work.

The fix: when `prev_stage` is `NO_STAGE` and the current access is a write, use `COMPUTE_SHADER_BIT` with `SHADER_WRITE_BIT` instead of `TOP_OF_PIPE_BIT` with no access flags. This ensures all prior compute shader work completes and its writes are made visible before the new write begins.

Authored with Claude.

ghstack-source-id: 339884030
exported-using-ghexport

Differential Revision: [D92715369](https://our.internmc.facebook.com/intern/diff/D92715369/)
1 parent 90e6e4c commit 6ebaa05

1 file changed

Lines changed: 18 additions & 10 deletions

backends/vulkan/runtime/api/containers/Tensor.cpp
```diff
@@ -775,9 +775,22 @@ void vTensorStorage::transition(
   // RAR: no need for synchronization
   if (prev_written || cur_written || layout_changed) {
     VkPipelineStageFlags src_stage = vkapi::vk_stage(prev_stage);
+    VkAccessFlags src_access = vkapi::vk_access(prev_stage, prev_access);
+
     if (0u == src_stage) {
-      src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+      if (cur_written) {
+        // First access through this tensor handle, and it's a write. The
+        // underlying memory may have been previously written through a
+        // different aliased tensor handle (via SharedObject). Wait for all
+        // prior compute work and make those writes available to prevent WAW
+        // hazards on aliased memory.
+        src_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
+        src_access = VK_ACCESS_SHADER_WRITE_BIT;
+      } else {
+        src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+      }
     }
+
     VkPipelineStageFlags dst_stage = vkapi::vk_stage(cur_stage);
     if (0u == dst_stage) {
       dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
@@ -786,20 +799,15 @@ void vTensorStorage::transition(
     pipeline_barrier.stage.src |= src_stage;
     pipeline_barrier.stage.dst |= dst_stage;

+    VkAccessFlags dst_access = vkapi::vk_access(cur_stage, cur_access);
+
     if (image_) {
       pipeline_barrier.images.emplace_back(
-          vkapi::vk_access(prev_stage, prev_access),
-          vkapi::vk_access(cur_stage, cur_access),
-          cur_layout,
-          new_layout,
-          image_);
+          src_access, dst_access, cur_layout, new_layout, image_);

       image_.set_layout(new_layout);
     } else if (buffer_) {
-      pipeline_barrier.buffers.emplace_back(
-          vkapi::vk_access(prev_stage, prev_access),
-          vkapi::vk_access(cur_stage, cur_access),
-          buffer_);
+      pipeline_barrier.buffers.emplace_back(src_access, dst_access, buffer_);
     }
   }
```

0 commit comments
