Skip to content

Commit 5535d78

Browse files
authored
Merge branch 'gasoonjia/flashdecoding-pp-async-softmax' into fused-deltanet-decode
2 parents 2ca1b22 + 1a79d9d commit 5535d78

11 files changed

Lines changed: 469 additions & 97 deletions

File tree

backends/cuda/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,10 @@ set(_aoti_cuda_shim_sources runtime/shims/memory.cpp
107107
runtime/shims/cuda_guard.cpp
108108
)
109109

110-
# Only build int4mm shim when CUDA language/toolchain is available.
110+
# Only build CUDA shims when CUDA language/toolchain is available.
111111
if(CMAKE_CUDA_COMPILER)
112112
list(APPEND _aoti_cuda_shim_sources runtime/shims/int4mm.cu
113-
runtime/shims/sort.cu
113+
runtime/shims/sort.cu runtime/shims/rand.cu
114114
)
115115
endif()
116116

@@ -162,7 +162,7 @@ else()
162162
aoti_cuda_shims
163163
PRIVATE cuda_platform
164164
PUBLIC -Wl,--whole-archive aoti_common_shims_slim -Wl,--no-whole-archive
165-
CUDA::cudart ${CMAKE_DL_LIBS}
165+
CUDA::cudart CUDA::curand ${CMAKE_DL_LIBS}
166166
)
167167
endif()
168168

backends/cuda/cuda_backend.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ def get_supported_fallback_kernels(cls) -> Dict[str, Any]:
146146
return {
147147
"at::_ops::_weight_int4pack_mm::call": None,
148148
"at::_ops::sort_stable::call": None,
149+
"aoti_torch_cuda_randint_low_out": None,
149150
}
150151

151152
@classmethod

backends/cuda/runtime/cuda_backend.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,7 @@ class ET_EXPERIMENTAL CudaBackend final
693693

694694
gpu_inputs[i] = make_slimtensor_from_blob_with_etensor_metadata(
695695
static_ptr, cpu_tensor);
696+
696697
continue;
697698
}
698699

@@ -805,6 +806,7 @@ class ET_EXPERIMENTAL CudaBackend final
805806
// End capture → instantiate graph
806807
cudaError_t gerr =
807808
cudaStreamEndCapture(cuda_stream, &handle->cuda_graph_state.graph);
809+
808810
ET_CHECK_OR_RETURN_ERROR(
809811
gerr == cudaSuccess,
810812
Internal,
@@ -814,6 +816,7 @@ class ET_EXPERIMENTAL CudaBackend final
814816
gerr = cudaGraphInstantiate(
815817
&handle->cuda_graph_state.graph_exec,
816818
handle->cuda_graph_state.graph,
819+
817820
cudaGraphInstantiateFlagAutoFreeOnLaunch);
818821
ET_CHECK_OR_RETURN_ERROR(
819822
gerr == cudaSuccess,

backends/cuda/runtime/cuda_delegate_handle.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,44 @@ struct CudaDelegateHandle : public aoti::AOTIDelegateHandle {
149149

150150
// CUDA graph state (warmup, capture, replay, static buffers)
151151
CudaGraphState cuda_graph_state;
152+
// --- CUDA graph state ---
// NOTE(review): these fields appear to duplicate the `cuda_graph_state`
// member (CudaGraphState) declared just above — likely an artifact of the
// two-parent merge. Confirm which representation is authoritative and
// delete the other.
// Phase: 0=disabled, 1=warmup, 2=captured (replay mode)
int cuda_graph_phase = 0;
// Number of warmup executions still to run before graph capture starts.
int cuda_graph_warmup_remaining = 0;

// Captured graph and its instantiated executable (destroyed in ~CudaDelegateHandle).
cudaGraph_t cuda_graph = nullptr;
cudaGraphExec_t cuda_graph_exec = nullptr;

// Static input/output GPU buffers pinned during capture: fixed device
// addresses that CUDA graph replay writes to / reads from, plus the
// metadata (sizes/strides/dtype/nbytes) needed to rebuild tensor views
// over them.
// NOTE(review): an earlier comment said "SlimTensor pointers — owned by
// this handle", but these vectors hold raw `void*` data pointers, not
// SlimTensor*; only the *input* pointers are freed in the destructor.
std::vector<void*> static_input_ptrs; // raw GPU data pointers for inputs
std::vector<void*> static_output_ptrs; // raw GPU data pointers for outputs
std::vector<std::vector<int64_t>> static_input_sizes;
std::vector<std::vector<int64_t>> static_input_strides;
std::vector<std::vector<int64_t>> static_output_sizes;
std::vector<std::vector<int64_t>> static_output_strides;
std::vector<int> static_input_scalar_types;
std::vector<int> static_output_scalar_types;
std::vector<size_t> static_input_nbytes;
std::vector<size_t> static_output_nbytes;
175+
176+
// Releases the CUDA graph objects and the input GPU buffers this handle
// owns. Destroy order: executable instance first, then the graph itself.
~CudaDelegateHandle() {
  if (cuda_graph_exec != nullptr) {
    cudaGraphExecDestroy(cuda_graph_exec);
  }
  if (cuda_graph != nullptr) {
    cudaGraphDestroy(cuda_graph);
  }
  // Input buffers are owned by this handle. Output buffers are deliberately
  // NOT freed here: they belong to the AOTI runtime (allocated during graph
  // capture via the caching allocator).
  for (void* buf : static_input_ptrs) {
    if (buf != nullptr) {
      cudaFree(buf);
    }
  }
}
152190
};
153191

154192
} // namespace cuda
Lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/backends/cuda/runtime/shims/rand.h>
10+
11+
#include <executorch/backends/aoti/slim/cuda/guard.h>
12+
#include <executorch/backends/aoti/slim/factory/empty.h>
13+
#include <executorch/backends/aoti/slim/util/size_util.h>
14+
#include <executorch/runtime/platform/assert.h>
15+
#include <executorch/runtime/platform/log.h>
16+
17+
#include <cuda_runtime.h>
18+
#include <curand_kernel.h>
19+
20+
#include <cstdint>
21+
#include <ctime>
22+
#include <vector>
23+
24+
namespace executorch::backends::cuda {
25+
26+
namespace c10 = executorch::backends::aoti::slim::c10;
27+
using c10::Device;
28+
using c10::DeviceIndex;
29+
using c10::DeviceType;
30+
using c10::ScalarType;
31+
using executorch::backends::aoti::slim::empty_strided;
32+
using executorch::backends::aoti::slim::IntArrayRef;
33+
using executorch::backends::aoti::slim::makeArrayRef;
34+
35+
namespace {
36+
37+
// ---- GPU-resident RNG state ----
// Seed and counter live in device memory allocated during the first call
// (warmup phase, before CUDA graph capture). The counter is atomically
// advanced by each kernel invocation on-device, so it automatically
// produces different random sequences on every CUDA graph replay.

// Plain-old-data layout: this struct is copied host->device with a raw
// cudaMemcpyAsync in ensure_rng_init(), so do not reorder or add fields
// without updating that copy.
struct RngState {
  unsigned long long seed; // Philox seed, fixed at init time
  unsigned long long counter; // monotonically increasing Philox offset
};

// Device pointer to the singleton RngState (null until first init).
static RngState* d_rng = nullptr;
// Host-side "already initialized" latch.
// NOTE(review): plain bool, no locking — assumes single-threaded warmup;
// confirm callers never race into ensure_rng_init().
static bool g_rng_init_done = false;
50+
51+
// Lazily allocates and seeds the device-resident RNG state on `stream`.
// Must be called during warmup (before CUDA graph capture): it allocates
// device memory and performs a blocking stream synchronization, neither of
// which is legal inside an active capture.
// On failure the init flag is left unset and d_rng reset to null so a later
// call can retry (the original set the flag even when allocation failed,
// leaving later kernels to dereference a garbage pointer).
void ensure_rng_init(cudaStream_t stream) {
  if (g_rng_init_done) {
    return;
  }

  cudaError_t err = cudaMallocAsync(&d_rng, sizeof(RngState), stream);
  if (err != cudaSuccess) {
    ET_LOG(
        Error,
        "ensure_rng_init: cudaMallocAsync failed: %s",
        cudaGetErrorString(err));
    d_rng = nullptr;
    return;
  }

  RngState h;
  h.seed = static_cast<unsigned long long>(time(nullptr));
  h.counter = 0;
  err = cudaMemcpyAsync(
      d_rng, &h, sizeof(RngState), cudaMemcpyHostToDevice, stream);
  if (err == cudaSuccess) {
    // Block until the copy completes: the source `h` lives on this stack
    // frame and must outlive the asynchronous copy.
    err = cudaStreamSynchronize(stream);
  }
  if (err != cudaSuccess) {
    ET_LOG(
        Error,
        "ensure_rng_init: seeding RNG state failed: %s",
        cudaGetErrorString(err));
    cudaFreeAsync(d_rng, stream);
    d_rng = nullptr;
    return;
  }

  g_rng_init_done = true;
}
67+
68+
// Philox-based randint kernel writing int64 values in [low, low + range)
// to `out`. The seed is read from device-resident state and the counter is
// advanced atomically on-device; because the counter pointer (not its
// value) is what gets baked into a captured CUDA graph, every graph replay
// draws a fresh offset and therefore fresh random values.
__global__ void philox_randint_graph_kernel(
    int64_t* __restrict__ out,
    int64_t numel,
    int64_t low,
    int64_t range,
    RngState* __restrict__ rng) {
  // Flat 1-D global index; 64-bit to avoid overflow on large launches.
  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx < numel) {
    // Each thread atomically claims one Philox offset slot.
    // NOTE(review): per-thread atomicAdd on a single counter serializes
    // under contention; per the author's original note the typical numel
    // here is 1 (seed generation), so this is acceptable — revisit if
    // large fills are ever needed.
    unsigned long long my_offset = atomicAdd(&rng->counter, 1ULL);
    curandStatePhilox4_32_10_t state;
    curand_init(rng->seed, idx, my_offset, &state);
    // curand_uniform_double returns values in (0, 1], so val * range can
    // equal range exactly; clamp so the result stays inside [low, high).
    double val = curand_uniform_double(&state);
    int64_t ival = static_cast<int64_t>(val * range);
    out[idx] = low + (ival >= range ? range - 1 : ival);
  }
}
96+
97+
// Fills `out` with uniform float32 samples, one per element (graph-safe).
// Seed/counter live in device memory (`rng`) and the counter is advanced
// atomically on-device, so each CUDA graph replay yields a fresh sequence.
__global__ void philox_rand_float_graph_kernel(
    float* __restrict__ out,
    int64_t numel,
    RngState* __restrict__ rng) {
  const int64_t tid =
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (tid >= numel) {
    return;
  }
  // Claim a unique Philox offset so no two invocations share a subsequence.
  const unsigned long long offset = atomicAdd(&rng->counter, 1ULL);
  curandStatePhilox4_32_10_t philox;
  curand_init(rng->seed, tid, offset, &philox);
  out[tid] = curand_uniform(&philox);
}
110+
111+
// Fills `out` with uniform bfloat16 samples stored as raw uint16_t bit
// patterns (graph-safe; see philox_rand_float_graph_kernel for the
// counter/replay mechanics).
__global__ void philox_rand_bf16_graph_kernel(
    uint16_t* __restrict__ out,
    int64_t numel,
    RngState* __restrict__ rng) {
  const int64_t tid =
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (tid >= numel) {
    return;
  }
  const unsigned long long offset = atomicAdd(&rng->counter, 1ULL);
  curandStatePhilox4_32_10_t philox;
  curand_init(rng->seed, tid, offset, &philox);
  const float sample = curand_uniform(&philox);
  // Convert float32 -> bfloat16: add a rounding bias of 0x7FFF plus the
  // LSB of the kept mantissa (round-to-nearest-even), then truncate to the
  // top 16 bits.
  uint32_t bits;
  memcpy(&bits, &sample, sizeof(uint32_t));
  const uint32_t round_bias = 0x7FFFu + ((bits >> 16) & 1u);
  bits += round_bias;
  out[tid] = static_cast<uint16_t>(bits >> 16);
}
129+
130+
} // anonymous namespace
131+
132+
extern "C" {
133+
134+
// Allocates a new contiguous CUDA tensor of shape `size` on device
// `device_index_` and fills it with uniform random samples.
// Supported dtypes: float32 (default when `dtype` is null) and bfloat16;
// anything else returns Error::NotSupported. `layout`, `device`, and
// `pin_memory` hints are ignored. On success `*ret0` owns the new tensor.
AOTITorchError aoti_torch_cuda_rand(
    const int64_t* size,
    int64_t size_len_,
    int32_t* dtype,
    int32_t* layout,
    int32_t* device,
    int32_t device_index_,
    int32_t* pin_memory,
    SlimTensor** ret0) {
  (void)layout;
  (void)device;
  (void)pin_memory;

  ET_CHECK_OR_RETURN_ERROR(
      ret0 != nullptr,
      InvalidArgument,
      "aoti_torch_cuda_rand: ret0 is null");
  ET_CHECK_OR_RETURN_ERROR(
      size != nullptr || size_len_ == 0,
      InvalidArgument,
      "aoti_torch_cuda_rand: size is null");

  // Default to float32 if dtype not specified.
  ScalarType scalar_type = ScalarType::Float;
  if (dtype != nullptr) {
    scalar_type = static_cast<ScalarType>(*dtype);
  }

  // Validate the dtype BEFORE allocating the output tensor. The original
  // allocated first and leaked `*ret0` on the unsupported-dtype path.
  if (scalar_type != ScalarType::Float &&
      scalar_type != ScalarType::BFloat16) {
    ET_LOG(
        Error,
        "aoti_torch_cuda_rand: unsupported dtype %d",
        static_cast<int>(scalar_type));
    return Error::NotSupported;
  }

  // Compute contiguous (row-major) strides and total element count.
  std::vector<int64_t> strides(size_len_);
  int64_t numel = 1;
  for (int64_t i = size_len_ - 1; i >= 0; i--) {
    strides[i] = numel;
    numel *= size[i];
  }

  // Allocate the output tensor on the requested device.
  IntArrayRef sizes_ref(size, static_cast<size_t>(size_len_));
  *ret0 = new SlimTensor(empty_strided(
      sizes_ref,
      makeArrayRef(strides),
      scalar_type,
      Device(DeviceType::CUDA, static_cast<DeviceIndex>(device_index_))));

  if (numel == 0) {
    return Error::Ok;
  }

  // Use the stream belonging to the device the tensor lives on (the
  // original hard-coded device 0 while allocating on device_index_).
  auto stream_result = getCurrentCUDAStream(device_index_);
  ET_CHECK_OR_RETURN_ERROR(
      stream_result.ok(),
      Internal,
      "aoti_torch_cuda_rand: failed to get CUDA stream");
  cudaStream_t stream = stream_result.get();

  // Graph-safe RNG state must exist before the kernels reference d_rng.
  ensure_rng_init(stream);

  constexpr int kThreads = 256;
  const int blocks = static_cast<int>((numel + kThreads - 1) / kThreads);

  if (scalar_type == ScalarType::Float) {
    philox_rand_float_graph_kernel<<<blocks, kThreads, 0, stream>>>(
        static_cast<float*>((*ret0)->data_ptr()), numel, d_rng);
  } else {
    philox_rand_bf16_graph_kernel<<<blocks, kThreads, 0, stream>>>(
        static_cast<uint16_t*>((*ret0)->data_ptr()), numel, d_rng);
  }

  // Surface launch-configuration errors immediately; kernel launches do
  // not return a status on their own.
  const cudaError_t launch_err = cudaGetLastError();
  if (launch_err != cudaSuccess) {
    ET_LOG(
        Error,
        "aoti_torch_cuda_rand: kernel launch failed: %s",
        cudaGetErrorString(launch_err));
    return Error::Internal;
  }

  return Error::Ok;
}
207+
208+
// Fills the preallocated tensor `out` with uniform random integers in
// [low, high). `size`/`size_len_` describe the logical shape; only the
// element count derived from them is used.
// NOTE(review): `out` is written through an int64_t* — assumes the caller
// allocated it as int64; confirm against the AOTI call site.
AOTITorchError aoti_torch_cuda_randint_low_out(
    SlimTensor* out,
    int64_t low,
    int64_t high,
    const int64_t* size,
    int64_t size_len_) {
  ET_CHECK_OR_RETURN_ERROR(
      out != nullptr,
      InvalidArgument,
      "aoti_torch_cuda_randint_low_out: out tensor is null");
  ET_CHECK_OR_RETURN_ERROR(
      high > low,
      InvalidArgument,
      "aoti_torch_cuda_randint_low_out: requires high > low");
  ET_CHECK_OR_RETURN_ERROR(
      size != nullptr || size_len_ == 0,
      InvalidArgument,
      "aoti_torch_cuda_randint_low_out: size is null");

  int64_t numel = 1;
  for (int64_t i = 0; i < size_len_; i++) {
    numel *= size[i];
  }
  if (numel == 0) {
    return Error::Ok;
  }

  // Get the current CUDA stream.
  // TODO(review): hard-coded device 0 — if `out` can live on another
  // device, derive the index from `out` instead.
  auto stream_result = getCurrentCUDAStream(0);
  ET_CHECK_OR_RETURN_ERROR(
      stream_result.ok(),
      Internal,
      "aoti_torch_cuda_randint_low_out: failed to get CUDA stream");
  cudaStream_t stream = stream_result.get();

  // Graph-safe RNG state must exist before the kernel references d_rng.
  ensure_rng_init(stream);

  const int64_t range = high - low;
  int64_t* out_data = static_cast<int64_t*>(out->data_ptr());

  constexpr int kThreads = 256;
  const int blocks = static_cast<int>((numel + kThreads - 1) / kThreads);
  philox_randint_graph_kernel<<<blocks, kThreads, 0, stream>>>(
      out_data, numel, low, range, d_rng);

  // Surface launch-configuration errors immediately; the launch itself
  // returns no status.
  const cudaError_t launch_err = cudaGetLastError();
  if (launch_err != cudaSuccess) {
    ET_LOG(
        Error,
        "aoti_torch_cuda_randint_low_out: kernel launch failed: %s",
        cudaGetErrorString(launch_err));
    return Error::Internal;
  }

  return Error::Ok;
}
252+
253+
} // extern "C"
254+
255+
} // namespace executorch::backends::cuda

0 commit comments

Comments
 (0)