Add grid-stride loop and ROCm cap to index_add_2d_with_unique_indices_kernel (#5934)

q10 · meta-codesync[bot] · commit fa211b0e157b · 2026-06-22T22:31:30.000-07:00
Summary: Pull Request resolved: #5934 X-link: https://github.com/facebookresearch/FBGEMM/pull/2852 Tier-2 fix for HIP grid-overflow in `sparse_ops/sparse_index_add.cu`. `index_add_2d_with_unique_indices_kernel` previously used `blockIdx.x` directly to index unique indices. Capping the host-side grid without first adding a grid-stride loop would silently drop work. Changes: - Add `const int num_unique_indices` as a new kernel parameter. - Convert kernel to a grid-stride loop over `u = blockIdx.x; u < num_unique_indices; u += gridDim.x` (Pattern C). All `blockIdx.x` references replaced with `u`. Hoist `start_D` and `has_remainder` outside the loop since they depend only on `blockIdx.y` / `threadIdx.x`. - RESET per-iteration register state at the top of each iteration: `sum[MAX_ELEMENTS_PER_THREAD]` re-zeroed and `sum_remainder = 0`. - Apply standard `#ifdef USE_ROCM min(blocks_x_uncapped, get_max_thread_blocks(stream)) #else blocks_x_uncapped #endif` cap to the x-dim of the launch grid. y dim is bounded by D/stride_D and needs no cap. Stacked on top of D105029028 (Tier-2 Diff 5/7). Plan: `/home/bensonma415/.llms/plans/sparse_ops_rocm_grid_overflow_tier2_fix.plan.md` (Diff 6/7). Reviewed By: henrylhtsang Differential Revision: D105029511 fbshipit-source-id: 2a33c6218d6b3d1c9c39ca301a1d451f09a39308
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_index_add.cu b/fbgemm_gpu/src/sparse_ops/sparse_index_add.cu
@@ -29,53 +29,59 @@ __launch_bounds__(kMaxThreads) void index_add_2d_with_unique_indices_kernel(
     const int rounded_D,
     const int remaining_D,
     const bool consecutive_indices,
-    const int consecutive_range_start) {
-  const auto start_offset = blockIdx.x == 0 ? 0 : offsets[blockIdx.x - 1];
-  const int end_offset = offsets[blockIdx.x];
-  index_t dst_idx = consecutive_indices ? blockIdx.x + consecutive_range_start
-                                        : unique_indices[blockIdx.x];
+    const int consecutive_range_start,
+    const int num_unique_indices) {
+  // Each thread block processes max of stride_D elements
+  const int start_D = (blockIdx.y * stride_D) + (threadIdx.x * UNROLL_FACTOR);
   const bool has_remainder = blockIdx.y == blockDim.y - 1 && remaining_D > 0 &&
       threadIdx.x < remaining_D;
 
-  // Buffer for storing temporary results
-  scalar_t sum[MAX_ELEMENTS_PER_THREAD];
-  for (int i = 0; i < MAX_ELEMENTS_PER_THREAD; i++) {
-    sum[i] = 0;
-  }
+  // Grid-stride over unique indices (the saturating x dim) so a capped grid
+  // (used on ROCm to avoid the 2^32 launch-side limit) still covers all
+  // unique indices. blockIdx.y is bounded by D/stride_D and needs no cap.
+  for (auto u = blockIdx.x; u < num_unique_indices; u += gridDim.x) {
+    const auto start_offset = u == 0 ? 0 : offsets[u - 1];
+    const int end_offset = offsets[u];
+    index_t dst_idx =
+        consecutive_indices ? u + consecutive_range_start : unique_indices[u];
+
+    // RESET per-iteration register state.
+    scalar_t sum[MAX_ELEMENTS_PER_THREAD];
+    for (int i = 0; i < MAX_ELEMENTS_PER_THREAD; i++) {
+      sum[i] = 0;
+    }
 
-  scalar_t sum_remainder = 0;
+    scalar_t sum_remainder = 0;
 
-  // Each thread block processes max of stride_D elements
-  int start_D = (blockIdx.y * stride_D) + (threadIdx.x * UNROLL_FACTOR);
+    // For each row
+    for (int row = start_offset; row < end_offset; row++) {
+      int64_t src_idx = orig_indices[row];
+      int col, i;
+      for (col = start_D, i = 0; col < start_D + stride_D && col < rounded_D;
+           col += blockDim.x * UNROLL_FACTOR, i += UNROLL_FACTOR) {
+#pragma unroll
+        for (int j = 0; j < UNROLL_FACTOR; j++) {
+          sum[i + j] += LDG(&out_grad[src_idx][col + j]);
+        }
+      }
+      if (has_remainder) {
+        sum_remainder += LDG(&out_grad[src_idx][rounded_D + threadIdx.x]);
+      }
+    } // for each row
 
-  // For each row
-  for (int row = start_offset; row < end_offset; row++) {
-    int64_t src_idx = orig_indices[row];
+    // Write results to global memory
     int col, i;
     for (col = start_D, i = 0; col < start_D + stride_D && col < rounded_D;
          col += blockDim.x * UNROLL_FACTOR, i += UNROLL_FACTOR) {
 #pragma unroll
       for (int j = 0; j < UNROLL_FACTOR; j++) {
-        sum[i + j] += LDG(&out_grad[src_idx][col + j]);
+        in_deduped_grad[dst_idx][col + j] = sum[i + j];
       }
     }
     if (has_remainder) {
-      sum_remainder += LDG(&out_grad[src_idx][rounded_D + threadIdx.x]);
-    }
-  } // for each row
-
-  // Write results to global memory
-  int col, i;
-  for (col = start_D, i = 0; col < start_D + stride_D && col < rounded_D;
-       col += blockDim.x * UNROLL_FACTOR, i += UNROLL_FACTOR) {
-#pragma unroll
-    for (int j = 0; j < UNROLL_FACTOR; j++) {
-      in_deduped_grad[dst_idx][col + j] = sum[i + j];
+      in_deduped_grad[dst_idx][rounded_D + threadIdx.x] += sum_remainder;
     }
   }
-  if (has_remainder) {
-    in_deduped_grad[dst_idx][rounded_D + threadIdx.x] += sum_remainder;
-  }
 }
 
 DLL_PUBLIC Tensor index_add_with_unique_indices_cuda(
@@ -146,10 +152,21 @@ DLL_PUBLIC Tensor index_add_with_unique_indices_cuda(
                 offsets = unique_count.cumsum(0);
               }
 
-              const dim3 grid_size(
+              const int num_y_blocks = (D + stride_D - 1) / stride_D;
+              // HIP enforces a hard limit of 2^32 total threads per launch
+              // (unlike CUDA, which silently wraps).
+              // index_add_2d_with_unique_indices_kernel grid-strides over the
+              // unique index (x) dim, so capping x is correctness-preserving.
+              // The y dim is not grid-strided, so fold num_y_blocks into the
+              // per-launch thread count used for the overflow check, keeping
+              // the cap accounting consistent with the launcher's total-thread
+              // check (grid.x * grid.y * block_size). See:
+              // https://github.com/ROCm/hip/issues/2253
+              const auto blocks_x = utils::cuda::cap_grid_dim_x(
                   cuda_calc_xblock_count(num_unique_indices, 1),
-                  (D + stride_D - 1) / stride_D,
-                  1);
+                  static_cast<int64_t>(block_size) * num_y_blocks,
+                  at::cuda::getCurrentCUDAStream());
+              const dim3 grid_size(blocks_x, num_y_blocks, 1);
 
               const auto unique_indices_ = consecutive_indices
                   ? at::empty(
@@ -177,7 +194,8 @@ DLL_PUBLIC Tensor index_add_with_unique_indices_cuda(
                   rounded_D,
                   remaining_D,
                   consecutive_indices,
-                  consecutive_range_start);
+                  consecutive_range_start,
+                  num_unique_indices);
             });
       });
   return input_grad.reshape(input_shape);
diff --git a/fbgemm_gpu/test/jagged/jagged_index_select_2d_test.py b/fbgemm_gpu/test/jagged/jagged_index_select_2d_test.py
@@ -262,5 +262,114 @@ def test_jagged_index_add_2d_forward_negative_rows_errors(self) -> None:
             )
 
 
+class JaggedIndexSelect2DLargeGridTest(unittest.TestCase):
+    """
+    Retro: regression tests for the HIP grid-overflow bug in
+    ``index_add_2d_with_unique_indices_kernel`` (D105029511 /
+    Subplan B Diff #10), which lacked its own test method when
+    landed.
+
+    Block: block_size threads in x (presumably stride_D / UNROLL_FACTOR).
+    Grid: dim3(blocks_x, num_y_blocks = ceil(D / stride_D), 1).
+    The production cap is `blocks_x = min(num_unique_indices,
+    get_max_thread_blocks(stream))` (~16384 on MI300/MI350); the
+    kernel grid-strides over the unique-index axis post-fix.
+    """
+
+    @classmethod
+    def _has_gpu(cls) -> bool:
+        return torch.cuda.is_available()
+
+    @classmethod
+    def _gpu_memory_lt(cls, gb: int) -> bool:
+        if not cls._has_gpu():
+            return True
+        return torch.cuda.get_device_properties(0).total_memory < gb * (1 << 30)
+
+    @unittest.skipUnless(torch.cuda.is_available(), "GPU not available")
+    def test_index_add_2d_with_unique_indices_correctness(self) -> None:
+        """
+        Multi-block correctness check at small scale via the autograd
+        backward of ``jagged_index_select`` (which dispatches to
+        ``index_add_2d_with_unique_indices_kernel``). Sentinel non-zero
+        lengths sit at the start / middle / end of the unique-index axis
+        so the checked gradient rows span that whole axis.
+        """
+        if self._gpu_memory_lt(4):
+            self.skipTest("Requires >= 4 GiB GPU memory")
+        device = torch.accelerator.current_accelerator()
+        # N > 2 * 1024; the kernel only grid-strides if grid.x is capped < N.
+        N = 2 * 1024 + 3
+        D = 16
+        # Sparse lengths: most entries 0, sentinel non-zero at start /
+        # middle / end so the kernel produces non-trivial backward grad.
+        lengths_cpu = torch.zeros(N, dtype=torch.int64)
+        lengths_cpu[0] = 1
+        lengths_cpu[N // 2] = 2
+        lengths_cpu[N - 1] = 3
+        total = int(lengths_cpu.sum().item())
+        # All unique inverse_lookup values so dedup keeps every batch.
+        inverse_lookup_cpu = torch.arange(N, dtype=torch.int64)
+
+        values_init = torch.arange(total * D, dtype=torch.float32).reshape(total, D)
+
+        # GPU forward + backward.
+        values_gpu = values_init.detach().clone().to(device).requires_grad_(True)
+        output_gpu, _ = torch.ops.fbgemm.jagged_index_select(
+            values_gpu, lengths_cpu.to(device), inverse_lookup_cpu.to(device)
+        )
+        output_gpu.sum().backward()
+
+        # CPU reference: backward of jagged_index_select with a permutation
+        # `inverse_lookup` is a scatter_add of grad over unique indices.
+        # With identity inverse_lookup and grad = ones, the expected
+        # gradient is `ones` for every selected row.
+        # pyre-ignore[16]
+        self.assertEqual(values_gpu.grad.shape, values_init.shape)
+        torch.testing.assert_close(
+            values_gpu.grad.cpu(),
+            torch.ones_like(values_init),
+        )
+
+    @unittest.skipUnless(torch.cuda.is_available(), "GPU not available")
+    def test_index_add_2d_with_unique_indices_large_grid(self) -> None:
+        """
+        Launch-survival regression test at the cap-trip scale.
+
+        Pre-fix, ``index_add_2d_with_unique_indices_kernel`` launches
+        with grid_x = num_unique_indices and per-block thread count
+        determined by stride_D / UNROLL_FACTOR. With D = 8 and
+        num_unique_indices = (1 << 22) + 1 the cap-trip path on ROCm
+        would TORCH_CHECK-fail; post-fix the host caps grid_x to
+        ``get_max_thread_blocks(stream)`` and the kernel grid-strides.
+
+        Memory budget: lengths and inverse_lookup are N * 8B = ~32 MiB
+        each; values is only (1, D) floats. Skip if HBM < 4 GiB.
+        """
+        if self._gpu_memory_lt(4):
+            self.skipTest("Requires >= 4 GiB GPU memory")
+        device = torch.accelerator.current_accelerator()
+        N = (1 << 22) + 1
+        D = 8
+        # All-zero lengths with one non-zero entry so the backward
+        # kernel still launches over all unique indices.
+        lengths_cpu = torch.zeros(N, dtype=torch.int64)
+        lengths_cpu[0] = 1
+        total = int(lengths_cpu.sum().item())
+        inverse_lookup_cpu = torch.arange(N, dtype=torch.int64)
+
+        values = torch.zeros(
+            (total, D), dtype=torch.float32, device=device, requires_grad=True
+        )
+        # Pre-fix this trips KernelLauncher::checkThreadCountNotExceeded
+        # on ROCm at the index_add_2d_with_unique_indices launch.
+        output, _ = torch.ops.fbgemm.jagged_index_select(
+            values, lengths_cpu.to(device), inverse_lookup_cpu.to(device)
+        )
+        output.sum().backward()
+        # pyre-ignore[16]
+        self.assertEqual(values.grad.shape, values.shape)
+
+
 if __name__ == "__main__":
     unittest.main()