Add INT8 (uint8) support to FBGEMM index_select_dim0 forward (#5782)

q10 · meta-codesync[bot] · commit 0b8730f34fce · 2026-05-27T10:57:38.000-07:00
Summary: X-link: https://github.com/facebookresearch/FBGEMM/pull/2712 Pull Request resolved: #5782 Adds INT8 (uint8/Byte and int8/Char) type support to the index_select_dim0 GPU forward path for UMIA inference use cases. The kernel template already handles arbitrary scalar_t types for the gather/copy operation, so this change switches the type dispatch macro from FBGEMM_DISPATCH_FLOAT_AND_HALF to FBGEMM_DISPATCH_FLOAT_HALF_AND_BYTE, which adds at::ScalarType::Byte (uint8_t) and is extended here with at::ScalarType::Char (int8_t). For these 8-bit integer types (INT8), the autograd wrapper is bypassed since gradient computation is meaningless for integer types. The forward call goes directly to index_select_cuda, as INT8 is expected to be used in inference-only scenarios; indices are still sorted first unless `skip_indices_sorting_fwd` is set outside inference mode. ## Detailed Changes ### `fbcode/deeplearning/fbgemm/fbgemm_gpu/src/sparse_ops/sparse_index_select.cu` - Swaps the dispatch macro on `index_add_2d_kernel_2` from `FBGEMM_DISPATCH_FLOAT_AND_HALF` to `FBGEMM_DISPATCH_FLOAT_HALF_AND_BYTE`, instantiating the existing scalar-generic gather/copy kernel template for `at::ScalarType::Byte` (`uint8_t`) and `at::ScalarType::Char` (`int8_t`) in addition to `float` and `half`. - No kernel-logic change — the kernel was already templated on `scalar_t`, so the addition is purely a type-dispatch extension. ### `fbcode/deeplearning/fbgemm/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp` - In `index_select_dim0_gpu`, adds an early-return fast path when `input.scalar_type()` is `at::kByte` or `at::kChar`: - Validates `indices` is on the same CUDA device as `input` and is 1-D. - Prepares the indices with the new `prepare_index_select_indices` helper (sorted via `indices.sort()`, or passed through unsorted with an empty `orig_indices` when sorting is skipped) and calls `index_select_cuda(input, indices_to_use, orig_indices, indices_sorted)` directly, bypassing `IndexSelectDim0GPUOp::apply` (the autograd-aware wrapper). - This is correct because integer dtypes have no gradient; the autograd `Function` wrapper is meaningless for INT8. 
- `consecutive_range_start` and `consecutive_range_length` are intentionally ignored on this path — they only affect the unused backward; `skip_indices_sorting_fwd` is still honored (outside inference mode) to skip the index sort. ### `fbcode/deeplearning/fbgemm/fbgemm_gpu/test/sparse/index_select_test.py` - Extends the `dtype` Hypothesis sampler in `IndexSelectTest.test_index_select_dim0` from `[torch.float, torch.half]` to `[torch.float, torch.half, torch.uint8, torch.int8]`. - Generates input via `torch.randint(iinfo.min, iinfo.max + 1, ...)` (with `iinfo = torch.iinfo(dtype)`) for non-floating-point dtypes (since `torch.rand` only supports floating dtypes); keeps `torch.rand` for float/half. - Tightens the equality check to `atol=0, rtol=0` — gather/copy is bit-exact, so any tolerance would mask correctness regressions. - Skips the gradcheck block entirely when `dtype` is non-floating-point (autograd is not applicable for INT8). Reviewed By: spcyppt Differential Revision: D103495542 fbshipit-source-id: c6ec6b5d5be02ad34f790d9e6904bbb09e39f7f9
diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/dispatch_macros.h b/fbgemm_gpu/include/fbgemm_gpu/utils/dispatch_macros.h
@@ -329,12 +329,13 @@
   AT_DISPATCH_SWITCH(                                   \
       TYPE, NAME, FBGEMM_DISPATCH_FLOAT_AND_HALF_CASE(__VA_ARGS__))
 
-#define FBGEMM_DISPATCH_FLOAT_HALF_AND_BYTE(TYPE, NAME, ...) \
-  AT_DISPATCH_SWITCH(                                        \
-      TYPE,                                                  \
-      NAME,                                                  \
-      FBGEMM_DISPATCH_FLOAT_AND_HALF_CASE(__VA_ARGS__)       \
-          AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__))
+#define FBGEMM_DISPATCH_FLOAT_HALF_AND_BYTE(TYPE, NAME, ...)  \
+  AT_DISPATCH_SWITCH(                                         \
+      TYPE,                                                   \
+      NAME,                                                   \
+      FBGEMM_DISPATCH_FLOAT_AND_HALF_CASE(__VA_ARGS__)        \
+          AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \
+              AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)) // int8_t
 
 #define FBGEMM_DISPATCH_FLOAT_HALF_FP8_AND_BYTE(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(                                            \
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_index_select.cu b/fbgemm_gpu/src/sparse_ops/sparse_index_select.cu
@@ -96,7 +96,7 @@ DLL_PUBLIC Tensor index_select_cuda(
   }
 
   AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "index_add_2d_kernel_1", [&] {
-    FBGEMM_DISPATCH_FLOAT_AND_HALF(
+    FBGEMM_DISPATCH_FLOAT_HALF_AND_BYTE(
         input_reshaped.scalar_type(), "index_add_2d_kernel_2", [&] {
           if (indices_sorted) {
             LAUNCH_INDEX_SELECT(true)
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp
@@ -653,21 +653,62 @@ std::tuple<Tensor, std::optional<Tensor>> pack_segments_cuda_v2(
       t_in, lengths, max_length, pad_minf, return_presence_mask);
 }
 
+namespace {
+
+// Chooses the index tensors handed to index_select_cuda for a forward pass.
+// Returns (indices_to_use, orig_indices, indices_sorted). Sorting is skipped
+// only when skip_indices_sorting_fwd holds true (unset counts as false) and
+// InferenceMode is off: that yields (indices, empty int64 tensor, false).
+// Otherwise returns indices.sort() as (sorted, source positions, true).
+std::tuple<Tensor, Tensor, bool> prepare_index_select_indices(
+    const Tensor& indices,
+    std::optional<bool> skip_indices_sorting_fwd) {
+  const bool skip_sort = skip_indices_sorting_fwd.value_or(false) &&
+      !c10::InferenceMode::is_enabled();
+
+  if (skip_sort) {
+    return {indices, at::empty({0}, indices.options().dtype(at::kLong)), false};
+  }
+  Tensor sorted_indices, orig_indices;
+  std::tie(sorted_indices, orig_indices) = indices.sort();
+  return {sorted_indices, orig_indices, true};
+}
+
+} // namespace
+
 Tensor index_select_dim0_gpu(
     const Tensor& input,
     const Tensor& indices,
     std::optional<int64_t> consecutive_range_start,
     std::optional<int64_t> consecutive_range_length,
     std::optional<bool> skip_indices_sorting_fwd) {
-  bool user_skip_indices_sorting_fwd =
-      skip_indices_sorting_fwd ? *skip_indices_sorting_fwd : false;
+  // 8-bit integer inputs (uint8/Byte, int8/Char) cannot carry gradients, so
+  // this path skips IndexSelectDim0GPUOp::apply (the autograd entry point)
+  // and calls index_select_cuda directly; skip_indices_sorting_fwd is still
+  // honored through prepare_index_select_indices. consecutive_range_start
+  // and consecutive_range_length are intentionally unused here: they tune
+  // the backward pass for consecutive indices, which this path never runs.
+  if (input.scalar_type() == at::kByte || input.scalar_type() == at::kChar) {
+    TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(input, indices);
+    TORCH_CHECK_VALUE(
+        indices.dim() == 1, "Index tensor must be 1D, but got ", indices.dim());
+
+    auto [indices_to_use, orig_indices, indices_sorted] =
+        prepare_index_select_indices(indices, skip_indices_sorting_fwd);
+    return index_select_cuda(
+        input, indices_to_use, orig_indices, indices_sorted);
+  }
+
   return IndexSelectDim0GPUOp::apply(
       input,
       indices,
-      consecutive_range_start ? *consecutive_range_start : 0,
-      consecutive_range_length ? *consecutive_range_length : 0,
-      // Always skip indices sorting if doing forward only
-      user_skip_indices_sorting_fwd && !c10::InferenceMode::is_enabled())[0];
+      consecutive_range_start.value_or(0),
+      consecutive_range_length.value_or(0),
+      // Skip sorting only if the caller asked for it AND InferenceMode is off
+      // (in inference mode we always sort for cache-friendlier gathers). Keep
+      // this predicate in sync with prepare_index_select_indices above.
+      skip_indices_sorting_fwd.value_or(false) &&
+          !c10::InferenceMode::is_enabled())[0];
 }
 
 } // namespace fbgemm_gpu
diff --git a/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_embeddings_cache_cuda.cu b/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_embeddings_cache_cuda.cu
@@ -74,6 +74,17 @@ vec4_copy(uint8_t* dst, const uint8_t* src, const int32_t D) {
   }
 }
 
+template <>
+DEVICE_INLINE void vec4_copy(int8_t* dst, const int8_t* src, const int32_t D) {
+  // Rows are padded to row_alignment (16 bytes on GPUs), so D is a multiple of
+  // 16 bytes and each loop step copies one uint4 (4 x 32 bit = 16 bytes).
+  const uint4* __restrict__ src_ = reinterpret_cast<const uint4*>(src);
+  uint4* __restrict__ dst_ = reinterpret_cast<uint4*>(dst);
+  for (auto d = threadIdx.x; d * sizeof(uint4) < D; d += blockDim.x) {
+    dst_[d] = src_[d];
+  }
+}
+
 template <typename value_t, typename index_t, bool is_index_put>
 __global__ __launch_bounds__(kMaxThreads) void masked_index_kernel(
     pta::PackedTensorAccessor64<value_t, 2, at::RestrictPtrTraits> self,
@@ -144,7 +155,9 @@ Tensor masked_index_impl(
       is_index_put ? "masked_index_put" : "masked_index_select",
       [&] {
         using value_t = scalar_t;
-        if constexpr (std::is_same_v<value_t, uint8_t>) {
+        if constexpr (
+            std::is_same_v<value_t, uint8_t> ||
+            std::is_same_v<value_t, int8_t>) {
           TORCH_CHECK(D % 16 == 0, "D needs to be padded to be multiple of 16");
         }
         FBGEMM_DISPATCH_INTEGRAL_TYPES(
diff --git a/fbgemm_gpu/test/sparse/index_select_test.py b/fbgemm_gpu/test/sparse/index_select_test.py
@@ -38,7 +38,7 @@ class IndexSelectTest(unittest.TestCase):
             st.lists(st.integers(1, 128), max_size=1),
             st.lists(st.integers(1, 16), min_size=2, max_size=2),
         ),
-        dtype=st.sampled_from([torch.float, torch.half]),
+        dtype=st.sampled_from([torch.float, torch.half, torch.uint8, torch.int8]),
         use_cpu=st.booleans() if gpu_available else st.just(True),
         consecutive_indices=st.booleans(),
         skip_indices_sorting_fwd=st.booleans(),
@@ -76,15 +76,25 @@ def test_index_select_dim0(
 
         kwargs["skip_indices_sorting_fwd"] = skip_indices_sorting_fwd
 
-        input = torch.rand((U,) + tuple(shape), dtype=dtype, device=device)
+        if dtype.is_floating_point:
+            input = torch.rand((U,) + tuple(shape), dtype=dtype, device=device)
+        else:
+            iinfo = torch.iinfo(dtype)
+            input = torch.randint(
+                iinfo.min,
+                iinfo.max + 1,
+                (U,) + tuple(shape),
+                dtype=dtype,
+                device=device,
+            )
 
         with torch.inference_mode() if use_inference_mode else contextlib.nullcontext():
             output_ref = torch.ops.fbgemm.index_select_dim0(input, indices, **kwargs)
             output = torch.index_select(input, 0, indices)
 
-            torch.testing.assert_close(output, output_ref)
+            torch.testing.assert_close(output, output_ref, atol=0, rtol=0)
 
-        if not use_inference_mode:
+        if not use_inference_mode and dtype.is_floating_point:
             gradcheck_args = [
                 input.clone().detach().float().requires_grad_(True),
                 indices,

Original file line number	Diff line number	Diff line change
`@@ -96,7 +96,7 @@ DLL_PUBLIC Tensor index_select_cuda(`
`96`	`96`	`}`
`97`	`97`
`98`	`98`	`AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "index_add_2d_kernel_1", [&] {`
`99`		`- FBGEMM_DISPATCH_FLOAT_AND_HALF(`
	`99`	`+ FBGEMM_DISPATCH_FLOAT_HALF_AND_BYTE(`
`100`	`100`	`input_reshaped.scalar_type(), "index_add_2d_kernel_2", [&] {`
`101`	`101`	`if (indices_sorted) {`
`102`	`102`	`LAUNCH_INDEX_SELECT(true)`