Fix HIP grid overflow in permute_1D_sparse_data_cuda (#5763)

q10 · meta-codesync[bot] · commit e1cf8fb22f47 · 2026-05-18T10:54:52.000-07:00
Summary: X-link: https://github.com/facebookresearch/FBGEMM/pull/2693 Pull Request resolved: #5763 A MAST training job (`fire-zzt-ESFM-MI350X-20260508-1651-3488ad1a`) was failing on ROCm in `permute_1D_sparse_data_cuda` with: ``` sparse_permute_1d.hip(339:8) [(permute_1D_data_kernel_vec<false, offsets_t, indices_t, std::nullptr_t>)] [grid dim 6252106 x 1 x 1] [block dim 64 x 16 x 1]: Total number of threads 6402156544 is greater than the HIP limit of 2^32 ``` The launch site uses `dim3(64, BT_blocks=16)` (block size 1024) and `blocks = cuda_calc_xblock_count(permuted_lengths_size, BT_blocks)`, so once `permuted_lengths_size > 2^26 ≈ 67M` segments, total threads exceed `2^32` and HIP refuses the launch (CUDA's runtime silently handles the wrap; ROCm does not — see ROCm/hip#2253). The MAST log shows ~100M segments, well past the limit. The kernels `permute_1D_data_kernel_vec` and `permute_1D_data_kernel` already implement a grid-stride loop over `b_t`, so no kernel-side changes are needed — only the launch site needs to cap the grid. The lengths kernel uses `CUDA_KERNEL_LOOP`, which also already grid-strides. Cap the grid at both kernel launch sites in `permute_1D_sparse_data_cuda`, guarded by `#ifdef USE_ROCM`: - Keep the unconstrained result of `cuda_calc_xblock_count` as `blocks_1_uncapped` / `blocks_2_uncapped`. - On ROCm, always launch with `std::min<uint32_t>(blocks_uncapped, utils::cuda::get_max_thread_blocks(at::cuda::getCurrentCUDAStream()))`. Unlike the D94944619 conditional-cap pattern, there is no `uint64_t total_threads` comparison against `numeric_limits<uint32_t>::max()`; the cap is applied unconditionally on ROCm. - On non-ROCm builds pass through the existing value (no perf change on NVIDIA — the launch configuration is identical to before). Also in this change: `permute_1D_sparse_data_meta` now preserves the trailing dimensions of N-D `weights` so the meta output shape matches the concrete kernel, and the new `test_permute_1D_sparse_data_large_grid` reproduces the overflow with `N = 2^26 + 1` segments and compares the GPU output against the CPU dispatch of the same op. Same family of fix as: - D65009966 (bounds_check_indices) - D75543767 (TBE forward) - D94944619 (TBE forward V2 — conditional cap) Out of scope: `sparse_permute_2d.cu` (`permute_2D_data_kernel_vec`) has the same pattern at line 253 with `dim3(32, 32)` and is a candidate for the same fix as a follow-up.
Reviewed By: spcyppt Differential Revision: D104903707 fbshipit-source-id: 049a7f70ceacd6d7cfd63fa305976e9a95978e01
diff --git a/fbgemm_gpu/fbgemm_gpu/sparse_ops.py b/fbgemm_gpu/fbgemm_gpu/sparse_ops.py
@@ -297,8 +297,16 @@ def permute_1D_sparse_data_meta(
     permuted_indices = indices.new_empty(permuted_indices_size)
     permuted_weights = None
     if weights is not None:
-        # pyre-fixme
-        permuted_weights = weights.new_empty(permuted_indices_size)
+        # Preserve trailing dimensions for N-D weights so the meta function
+        # matches the concrete kernel's output shape (e.g. [total, W] for
+        # 2D weights consumed by the vec kernel). Previously this always
+        # returned a 1D tensor, which broke the faketensor opcheck on the
+        # 2D-weights tests.
+        permuted_weights = (
+            weights.new_empty(permuted_indices_size)
+            if weights.dim() <= 1
+            else weights.new_empty([permuted_indices_size, *weights.shape[1:]])
+        )
     return permuted_lengths, permuted_indices, permuted_weights
 
 
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_permute_1d.cu b/fbgemm_gpu/src/sparse_ops/sparse_permute_1d.cu
@@ -233,8 +233,19 @@ permute_1D_sparse_data_cuda(
   permuted_lengths = at::empty({permuted_lengths_size}, lengths.options());
 
   constexpr int32_t threads_1 = kMaxThreads;
-  const auto blocks_1 =
+  const auto blocks_1_uncapped =
       cuda_calc_xblock_count(permuted_lengths_size, threads_1);
+#ifdef USE_ROCM
+  // HIP enforces a hard limit of 2^32 total threads per launch (unlike CUDA,
+  // which silently wraps). Cap the grid unconditionally on ROCm;
+  // permute_1D_lengths_kernel uses CUDA_KERNEL_LOOP, which already
+  // grid-strides, so capping is correctness-preserving.
+  const auto blocks_1 = std::min<uint32_t>(
+      blocks_1_uncapped,
+      utils::cuda::get_max_thread_blocks(at::cuda::getCurrentCUDAStream()));
+#else
+  const auto blocks_1 = blocks_1_uncapped;
+#endif
   AT_DISPATCH_INDEX_TYPES(
       lengths.scalar_type(), "permute_1D_lengths_kernel", [&] {
         FBGEMM_LAUNCH_KERNEL(
@@ -262,8 +273,19 @@ permute_1D_sparse_data_cuda(
 
   constexpr int32_t BT_blocks = 16;
   dim3 threads_2(64, BT_blocks);
-  const auto blocks_2 =
+  const auto blocks_2_uncapped =
       cuda_calc_xblock_count(permuted_lengths_size, BT_blocks);
+#ifdef USE_ROCM
+  // HIP enforces a hard limit of 2^32 total threads per launch (unlike CUDA,
+  // which silently wraps). Cap the grid unconditionally on ROCm; the
+  // kernel's grid-striding loop over b_t handles the overflow, so capping is
+  // correctness-preserving.
+  const auto blocks_2 = std::min<uint32_t>(
+      blocks_2_uncapped,
+      utils::cuda::get_max_thread_blocks(at::cuda::getCurrentCUDAStream()));
+#else
+  const auto blocks_2 = blocks_2_uncapped;
+#endif
   permuted_indices = at::empty(permuted_indices_size, indices.options());
 
   AT_DISPATCH_INDEX_TYPES(
diff --git a/fbgemm_gpu/test/sparse/permute_indices_test.py b/fbgemm_gpu/test/sparse/permute_indices_test.py
@@ -27,10 +27,20 @@
 
 if open_source:
     # pyre-ignore[21]
-    from test_utils import gpu_available, gpu_unavailable, on_oss_clang
+    from test_utils import (
+        gpu_available,
+        gpu_memory_lt_gb,
+        gpu_unavailable,
+        on_oss_clang,
+    )
 else:
     import fbgemm_gpu.sparse_ops  # noqa: F401, E402
-    from fbgemm_gpu.test.test_utils import gpu_available, gpu_unavailable, on_oss_clang
+    from fbgemm_gpu.test.test_utils import (
+        gpu_available,
+        gpu_memory_lt_gb,
+        gpu_unavailable,
+        on_oss_clang,
+    )
 
 
 class PermuteIndicesTest(unittest.TestCase):
@@ -786,6 +796,77 @@ def test_permute_2D_indices_large_segments(
         else:
             self.assertIsNone(permuted_weights_gpu)
 
+    @unittest.skipIf(*gpu_unavailable)
+    # Skip on GPUs with insufficient HBM (need a few hundred MB for the
+    # int32 N-element tensors).
+    @unittest.skipIf(*gpu_memory_lt_gb(4))
+    def test_permute_1D_sparse_data_large_grid(self) -> None:
+        """
+        Reproduces the HIP grid-overflow bug in permute_1D_sparse_data_cuda
+        and verifies output correctness at the same scale.
+
+        With BT_blocks=16 and dim3(64, 16) (block size 1024), the launch grid
+        is cuda_calc_xblock_count(N, 16). For N > 2**26, total threads exceed
+        the HIP 2**32 limit, causing FBGEMM_LAUNCH_KERNEL ->
+        KernelLauncher::checkThreadCountNotExceeded to TORCH_CHECK-fail on
+        ROCm pre-fix. With the production fix in place, this test additionally
+        validates output correctness against the CPU dispatch of the same op
+        — the GPU output must match the CPU reference element-for-element.
+
+        ``lengths`` is sparse: all zero except for three known non-zero
+        positions (start / middle / end of the logical range), so HBM usage
+        stays bounded (~few hundred MB int32) while the permutation logic is
+        still exercised. ``permute`` is a deterministic non-identity circular
+        shift (``perm[i] != i`` everywhere), so any "kernel computed identity
+        instead of permutation" bug surfaces in the assertion below.
+        """
+
+        # Choose N so that total threads strictly exceeds 2**32:
+        # cuda_calc_xblock_count(N, 16) * 1024 ~= N * 64; need N > 2**26.
+        N = (1 << 26) + 1
+
+        device = torch.device(torch.accelerator.current_accelerator() or "cuda")
+
+        # Deterministic non-identity permute: circular shift by +1.
+        # perm_cpu[0] == N - 1 and perm_cpu[i] == i - 1 for i >= 1, so
+        # perm_cpu[i] != i for every i.
+        perm_cpu = torch.roll(torch.arange(N, dtype=torch.int32), 1)
+        permute = perm_cpu.to(device)
+
+        # Sparse non-zero lengths at start / middle / end. Total = 10.
+        lengths_cpu = torch.zeros(N, dtype=torch.int32)
+        lengths_cpu[0] = 3
+        lengths_cpu[N // 2] = 5
+        lengths_cpu[N - 1] = 2
+        lengths = lengths_cpu.to(device)
+
+        # Distinct indices per segment so the permutation is fully observable.
+        indices_cpu = torch.arange(10, dtype=torch.int32)
+        indices = indices_cpu.to(device)
+
+        # CPU reference oracle — same op, different dispatch.
+        (
+            permuted_lengths_cpu,
+            permuted_indices_cpu,
+            _permuted_weights_cpu,
+        ) = torch.ops.fbgemm.permute_1D_sparse_data(
+            perm_cpu, lengths_cpu, indices_cpu, None, None
+        )
+
+        # GPU op under test. Pre-fix, this launch trips
+        # KernelLauncher::checkThreadCountNotExceeded on ROCm.
+        (
+            permuted_lengths_gpu,
+            permuted_indices_gpu,
+            permuted_weights_gpu,
+        ) = torch.ops.fbgemm.permute_1D_sparse_data(
+            permute, lengths, indices, None, None
+        )
+
+        torch.testing.assert_close(permuted_lengths_gpu.cpu(), permuted_lengths_cpu)
+        torch.testing.assert_close(permuted_indices_gpu.cpu(), permuted_indices_cpu)
+        self.assertIsNone(permuted_weights_gpu)
+
 
 extend_test_class(PermuteIndicesTest)