Make the AOTI CUDA rand shims produce values in [low, high), following the torch.rand pattern (#19468)

Gasoonjia · web-flow · commit ac9efa7dca18 · 2026-05-11T18:38:17.000-07:00
This diff makes the AOTI rand shims follow the torch.rand pattern so that
generated outputs fall in the half-open range [low, high).
Reviewed By: GregoryComer

Differential Revision: D104723400
diff --git a/backends/cuda/runtime/shims/rand.cu b/backends/cuda/runtime/shims/rand.cu
@@ -77,6 +77,10 @@ void ensure_rng_init(cudaStream_t stream) {
 // (populated by `advance_counter_kernel` immediately before this launch).
 // This replaces the previous per-element atomicAdd contention with a single
 // atomic per kernel launch.
+//
+// Matches PyTorch's `transformation::uniform_int_from_to` semantics: builds
+// a 64-bit random value from two 32-bit curand draws, then takes
+// `val % range + low` so the output lies in [low, high).
 __global__ void philox_randint_graph_kernel(
     int64_t* __restrict__ out,
     int64_t numel,
@@ -87,13 +91,27 @@ __global__ void philox_randint_graph_kernel(
   if (idx < numel) {
     curandStatePhilox4_32_10_t state;
     curand_init(rng->seed, idx, rng->base_scratch, &state);
-    double val = curand_uniform_double(&state);
-    int64_t ival = static_cast<int64_t>(val * range);
-    out[idx] = low + (ival >= range ? range - 1 : ival);
+    uint32_t hi = curand(&state);
+    uint32_t lo = curand(&state);
+    uint64_t rval = (static_cast<uint64_t>(hi) << 32) | static_cast<uint64_t>(lo);
+    uint64_t urange = static_cast<uint64_t>(range);
+    out[idx] = low + static_cast<int64_t>(rval % urange);
   }
 }
 
-// Philox-based uniform float32 generator (graph-safe version).
+// Maps a uniformly distributed uint32 to a float32 in [0, 1) following the
+// pattern used by PyTorch's `transformation::uniform_real` in
+// aten/src/ATen/native/cuda/DistributionTemplates.h: keep the low 24 bits
+// (the float32 significand width) and divide by 2^24.
+__device__ inline float uniform_real_from_uint32(uint32_t val) {
+  // std::numeric_limits<float>::digits == 24
+  constexpr uint32_t kMantissaMask = (1u << 24) - 1;
+  constexpr float kDivisor = 1.0f / static_cast<float>(1u << 24);
+  return static_cast<float>(val & kMantissaMask) * kDivisor;
+}
+
+// Philox-based uniform float32 generator (graph-safe version). Produces
+// values in [0, 1) to match torch.rand semantics.
 __global__ void philox_rand_float_graph_kernel(
     float* __restrict__ out,
     int64_t numel,
@@ -102,11 +120,12 @@ __global__ void philox_rand_float_graph_kernel(
   if (idx < numel) {
     curandStatePhilox4_32_10_t state;
     curand_init(rng->seed, idx, rng->base_scratch, &state);
-    out[idx] = curand_uniform(&state);
+    out[idx] = uniform_real_from_uint32(curand(&state));
   }
 }
 
-// Philox-based uniform bfloat16 generator (graph-safe version).
+// Philox-based uniform bfloat16 generator (graph-safe version). Produces a
+// float in [0, 1) and rounds to bfloat16 with round-to-nearest-even.
 __global__ void philox_rand_bf16_graph_kernel(
     uint16_t* __restrict__ out,
     int64_t numel,
@@ -115,7 +134,7 @@ __global__ void philox_rand_bf16_graph_kernel(
   if (idx < numel) {
     curandStatePhilox4_32_10_t state;
     curand_init(rng->seed, idx, rng->base_scratch, &state);
-    float val = curand_uniform(&state);
+    float val = uniform_real_from_uint32(curand(&state));
     uint32_t bits;
     memcpy(&bits, &val, sizeof(uint32_t));
     uint32_t lsb = (bits >> 16) & 1;
diff --git a/backends/cuda/runtime/shims/rand.h b/backends/cuda/runtime/shims/rand.h
@@ -25,13 +25,19 @@ using SlimTensor = executorch::backends::aoti::slim::SlimTensor;
 extern "C" {
 
 /**
- * Generates a tensor filled with uniform random values in [0, 1).
+ * Generates a tensor filled with uniform random values in [0, 1), matching
+ * the behavior of torch.rand / aten::rand (see
+ * aten/src/ATen/native/cuda/DistributionUniform.cu and the
+ * `transformation::uniform_real` helper in
+ * aten/src/ATen/native/cuda/DistributionTemplates.h).
  *
  * Implements the AOTI shim for aten::rand.default on CUDA. Uses cuRAND
- * Philox counter-based RNG with GPU-resident state. The counter is
- * atomically advanced by each kernel invocation on-device, making it
- * fully compatible with CUDA graph capture and replay — each replay
- * produces different values because the counter increments on the GPU.
+ * Philox counter-based RNG with GPU-resident state, then maps the random
+ * uint32 to [0, 1) using PyTorch's bit-mask + divisor formulation rather
+ * than curand_uniform (which returns (0, 1]). The counter is atomically
+ * advanced by each kernel invocation on-device, making it fully compatible
+ * with CUDA graph capture and replay — each replay produces different
+ * values because the counter increments on the GPU.
  *
  * Supports float32 and bfloat16 output dtypes.
  */
@@ -46,7 +52,10 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_cuda_rand(
     SlimTensor** ret0);
 
 /**
- * Fills a pre-allocated int64 tensor with random integers in [low, high).
+ * Fills a pre-allocated int64 tensor with random integers in [low, high),
+ * matching the behavior of torch.randint / aten::randint.low_out (see
+ * `transformation::uniform_int_from_to` in
+ * aten/src/ATen/native/cuda/DistributionTemplates.h).
  *
  * Implements the AOTI shim for aten::randint.low_out on CUDA. Used by
  * Inductor's Philox RNG to generate random seeds. Each thread atomically
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_cuda_rand.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_cuda_rand.cpp
@@ -91,7 +91,7 @@ class AOTITorchCudaRandTest : public ::testing::Test {
 // aoti_torch_cuda_rand tests
 // ----------------------------------------------------------------------------
 
-// Basic float32 rand: produces a tensor in [0, 1).
+// Basic float32 rand: produces a tensor in [0, 1) to match torch.rand.
 TEST_F(AOTITorchCudaRandTest, RandFloat32Basic) {
   std::vector<int64_t> sizes = {4, 8};
   int64_t numel = 4 * 8;
@@ -144,7 +144,9 @@ TEST_F(AOTITorchCudaRandTest, RandDefaultDtypeIsFloat) {
   EXPECT_EQ(out->numel(), 16);
 }
 
-// BFloat16 rand: values must lie in [0, 1).
+// BFloat16 rand: values must lie in [0, 1) to match torch.rand. Note that
+// bfloat16 has only 8 mantissa bits so a float in [0, 1) close to 1.0 may
+// round up to bfloat16 1.0; we accept that, as PyTorch does.
 TEST_F(AOTITorchCudaRandTest, RandBFloat16Basic) {
   std::vector<int64_t> sizes = {32};
   int64_t numel = 32;
@@ -171,7 +173,7 @@ TEST_F(AOTITorchCudaRandTest, RandBFloat16Basic) {
   for (int64_t i = 0; i < numel; ++i) {
     float v = bfloat16_bits_to_float(host[i]);
     EXPECT_GE(v, 0.0f) << "bf16 value at " << i << " < 0";
-    EXPECT_LT(v, 1.0f) << "bf16 value at " << i << " >= 1";
+    EXPECT_LE(v, 1.0f) << "bf16 value at " << i << " > 1";
   }
 }
 
@@ -287,7 +289,8 @@ TEST_F(AOTITorchCudaRandTest, RandTwoCallsProduceDifferentValues) {
 // aoti_torch_cuda_randint_low_out tests
 // ----------------------------------------------------------------------------
 
-// Basic randint into a pre-allocated int64 tensor; values lie in [low, high).
+// Basic randint into a pre-allocated int64 tensor; values lie in [low, high)
+// to match torch.randint semantics.
 TEST_F(AOTITorchCudaRandTest, RandintBasicRange) {
   std::vector<int64_t> sizes = {32};
   int64_t numel = 32;