pytorch
diff --git a/backends/cuda/runtime/weight_offload/session.cpp b/backends/cuda/runtime/weight_offload/session.cpp
Lines changed: 270 additions & 8 deletions
@@ -497,6 +497,18 @@ Session::~Session() {
 
   for (auto& [fqn, e] : live_) {
     if (e.dev_ptr != nullptr && compute_stream_ != nullptr) {
+      // Stream-order the free behind the entry's H2D. Required
+      // since commit 8 introduced prefetched entries whose
+      // ready_event has not been waited on by any prior serve()
+      // — without this, cudaFreeAsync on compute_stream_ would
+      // queue the free before the in-flight cudaMemcpyAsync on
+      // copy_stream_ drains, freeing memory still being written
+      // to. cudaStreamWaitEvent is a queued cross-stream
+      // dependency (not a host sync), and a no-op if the event
+      // has already completed.
+      if (e.ready_event != nullptr) {
+        cudaStreamWaitEvent(compute_stream_, e.ready_event, 0);
+      }
       cudaFreeAsync(e.dev_ptr, compute_stream_);
     }
     if (e.ready_event != nullptr) {
@@ -538,15 +550,17 @@ Session::~Session() {
         stderr,
         "[ET_WEIGHT_OFFLOAD_STATS] method=%s hits=%llu misses=%llu "
         "evictions=%llu bytes_h2d=%llu peak_live_bytes=%llu budget=%llu "
-        "floor=%llu\n",
+        "floor=%llu prefetch_attempted=%llu prefetch_succeeded=%llu\n",
         method_name_.c_str(),
         static_cast<unsigned long long>(stats_.pool_hits),
         static_cast<unsigned long long>(stats_.pool_misses),
         static_cast<unsigned long long>(stats_.evictions),
         static_cast<unsigned long long>(stats_.bytes_h2d_copied),
         static_cast<unsigned long long>(peak_live_bytes_),
         static_cast<unsigned long long>(budget_bytes_),
-        static_cast<unsigned long long>(floor_bytes_));
+        static_cast<unsigned long long>(floor_bytes_),
+        static_cast<unsigned long long>(stats_.prefetch_attempted),
+        static_cast<unsigned long long>(stats_.prefetch_succeeded));
   }
 }
 
@@ -660,6 +674,10 @@ ::executorch::runtime::Error Session::serve(
       return err == Error::Ok ? Error::Internal : err;
     }
     *output = wrapped;
+    // Best-effort depth-1 prefetch. Errors are logged inside the
+    // helper and never propagated — the current probe is already
+    // populated in *output.
+    (void)opportunistic_prefetch(probe_id);
     return Error::Ok;
   }
 
@@ -687,6 +705,23 @@ ::executorch::runtime::Error Session::serve(
       return Error::Internal;
     }
     auto& v = victim_it->second;
+    // Stream-order the free behind the entry's H2D — required for
+    // prefetched entries whose ready_event hasn't been waited on
+    // yet (no prior serve has consumed them). Hit-path entries
+    // already had their ready_event waited on, so this is a no-op
+    // for them. See "Existing miss-path eviction + ~Session() need
+    // the same wait" in the commit-8 subplan.
+    cudaError_t wait_err =
+        cudaStreamWaitEvent(compute_stream_, v.ready_event, 0);
+    if (wait_err != cudaSuccess) {
+      std::fprintf(
+          stderr,
+          "[ET_WEIGHT_OFFLOAD][ERROR] cudaStreamWaitEvent on victim "
+          "'%s' before eviction failed: %s\n",
+          victim_it->first.c_str(),
+          cudaGetErrorString(wait_err));
+      return Error::Internal;
+    }
     // Check cudaFreeAsync — failure here means the device pointer is
     // NOT being freed, so we must NOT decrement bytes_in_flight_ or
     // erase from live_, otherwise our accounting diverges from the
@@ -715,38 +750,52 @@ ::executorch::runtime::Error Session::serve(
     // One event per eviction batch — all cudaFreeAsyncs are on
     // compute_stream_ and stream-ordered relative to each other,
     // so a single event after the batch covers them all.
+    //
+    // Any failure here is delicate: live_/bytes_in_flight_ have
+    // already been mutated to reflect the evictions, but if we
+    // don't establish the copy_stream_ ↔ compute_stream_ ordering
+    // a subsequent cudaMallocFromPoolAsync on copy_stream_ may
+    // race the still-pending cudaFreeAsyncs. Fallback on every
+    // failure: cudaStreamSynchronize(compute_stream_) so the
+    // frees are GUARANTEED physically complete before we return —
+    // the Session state stays consistent at the cost of a brief
+    // host block. Cheap insurance for a rare error path.
     cudaEvent_t evict_done = nullptr;
     cudaError_t ev_err =
         cudaEventCreateWithFlags(&evict_done, cudaEventDisableTiming);
     if (ev_err != cudaSuccess) {
       std::fprintf(
           stderr,
           "[ET_WEIGHT_OFFLOAD][ERROR] cudaEventCreate for eviction batch "
-          "failed: %s\n",
+          "failed: %s; falling back to cudaStreamSynchronize(compute) to "
+          "guarantee the frees physically complete before return\n",
           cudaGetErrorString(ev_err));
+      (void)cudaStreamSynchronize(compute_stream_);
       return Error::Internal;
     }
     cudaError_t rec_err = cudaEventRecord(evict_done, compute_stream_);
     if (rec_err != cudaSuccess) {
       std::fprintf(
           stderr,
           "[ET_WEIGHT_OFFLOAD][ERROR] cudaEventRecord for eviction batch "
-          "failed: %s; the subsequent cudaMallocFromPoolAsync would be "
-          "unordered against the cudaFreeAsync, risking reuse before free\n",
+          "failed: %s; falling back to cudaStreamSynchronize(compute) to "
+          "guarantee the frees physically complete before return\n",
           cudaGetErrorString(rec_err));
       cudaEventDestroy(evict_done);
+      (void)cudaStreamSynchronize(compute_stream_);
       return Error::Internal;
     }
     cudaError_t wait_err = cudaStreamWaitEvent(copy_stream_, evict_done, 0);
     if (wait_err != cudaSuccess) {
       std::fprintf(
           stderr,
           "[ET_WEIGHT_OFFLOAD][ERROR] cudaStreamWaitEvent for eviction "
-          "batch on copy_stream failed: %s; the subsequent "
-          "cudaMallocFromPoolAsync would be unordered against the "
-          "cudaFreeAsync\n",
+          "batch on copy_stream failed: %s; falling back to "
+          "cudaStreamSynchronize(compute) to guarantee the frees "
+          "physically complete before return\n",
           cudaGetErrorString(wait_err));
       cudaEventDestroy(evict_done);
+      (void)cudaStreamSynchronize(compute_stream_);
       return Error::Internal;
     }
     cudaEventDestroy(evict_done);
@@ -858,6 +907,219 @@ ::executorch::runtime::Error Session::serve(
   stats_.bytes_h2d_copied += need;
 
   *output = wrapped;
+  // Best-effort depth-1 prefetch. Errors are logged inside the
+  // helper and never propagated — the current probe is already
+  // populated in *output.
+  (void)opportunistic_prefetch(probe_id);
+  return Error::Ok;
+}
+
+// Best-effort depth-1 prefetch of the schedule entry that follows
+// `current_probe_id` (wrapping around at the end of schedule_).
+//
+// serve() calls this after *output is populated and discards the
+// return value, so every failure is logged here and reported as
+// Error::Internal without affecting the probe that was just served.
+//
+// Steps:
+//   1. Return Error::Ok if schedule_ is empty or the next FQN is
+//      already present in live_.
+//   2. Evict LRU entries on compute_stream_ until the next FQN fits
+//      within budget_bytes_, refusing to evict the just-served FQN.
+//   3. Allocate from pool_ and enqueue the H2D copy on copy_stream_,
+//      record a ready event, and publish the entry in live_. No wait
+//      on that event is queued on compute_stream_ here.
+//
+// @param current_probe_id index into schedule_ of the probe serve()
+//        just handled; must be in [0, schedule_.size()).
+// @return Error::Ok when nothing needed doing or the prefetch was
+//         enqueued; Error::Internal when the prefetch was skipped.
+::executorch::runtime::Error Session::opportunistic_prefetch(
+    int64_t current_probe_id) {
+  using ::executorch::runtime::Error;
+
+  if (schedule_.empty()) {
+    return Error::Ok;
+  }
+  // Both the modulo below and the current_fqn lookup index schedule_
+  // with current_probe_id. Reject out-of-range ids up front so a bad
+  // id can never become an out-of-bounds read.
+  const int64_t schedule_len = static_cast<int64_t>(schedule_.size());
+  if (current_probe_id < 0 || current_probe_id >= schedule_len) {
+    std::fprintf(
+        stderr,
+        "[ET_WEIGHT_OFFLOAD][WARN] prefetch skipped: probe id %lld is "
+        "outside the schedule (size %lld)\n",
+        static_cast<long long>(current_probe_id),
+        static_cast<long long>(schedule_len));
+    return Error::Internal;
+  }
+  const int64_t next_id = (current_probe_id + 1) % schedule_len;
+  const std::string& fqn = schedule_[static_cast<size_t>(next_id)];
+
+  // Step 1: already-live → no work needed.
+  if (live_.find(fqn) != live_.end()) {
+    return Error::Ok;
+  }
+
+  // Step 1b: defensive guard — never evict the FQN the c-shim is
+  // about to hand back to AOTI for kernel launch. The floor formula
+  // (budget >= bytes(current) + bytes(next)) should make this
+  // unreachable today; the guard catches the single-immediately-
+  // just-served-FQN case if a future commit ever allows budgets
+  // below the floor. Narrow protection — it does NOT cover
+  // multi-probe-before-one-launch (fused kernels with probes A, B
+  // before one launch could still have A evicted by a prefetch
+  // after B if the floor invariant were violated). The floor
+  // hard-fail at init is the real general contract.
+  const std::string& current_fqn =
+      schedule_[static_cast<size_t>(current_probe_id)];
+
+  auto host_it = host_entries_.find(fqn);
+  if (host_it == host_entries_.end()) {
+    std::fprintf(
+        stderr,
+        "[ET_WEIGHT_OFFLOAD][WARN] prefetch skipped: no host mirror "
+        "for FQN '%s'\n",
+        fqn.c_str());
+    return Error::Internal;
+  }
+  const HostEntry& host = host_it->second;
+  const uint64_t need = host.nbytes;
+
+  // From here on a real prefetch is attempted. Count the attempt
+  // regardless of whether it succeeds — `attempted - succeeded`
+  // = swallowed errors.
+  stats_.prefetch_attempted++;
+
+  // Step 2: eviction (same logic as the miss path, plus the
+  // current_fqn guard and the stream-order wait before each free).
+  bool evicted = false;
+  // The loop below can bail out after earlier iterations already
+  // queued a cudaFreeAsync and updated live_/bytes_in_flight_. Those
+  // early returns never reach the evict_done ordering further down,
+  // so apply the same fallback the batch-event failure paths use:
+  // block until compute_stream_ drains so the queued frees are
+  // complete before we hand control back to serve().
+  auto sync_partial_evictions = [&]() {
+    if (evicted) {
+      (void)cudaStreamSynchronize(compute_stream_);
+    }
+  };
+  while (bytes_in_flight_ + need > budget_bytes_) {
+    auto victim_it = pick_lru();
+    if (victim_it == live_.end()) {
+      std::fprintf(
+          stderr,
+          "[ET_WEIGHT_OFFLOAD][WARN] prefetch skipped: no evictable "
+          "entries for FQN '%s' (%llu bytes, budget %llu)\n",
+          fqn.c_str(),
+          static_cast<unsigned long long>(need),
+          static_cast<unsigned long long>(budget_bytes_));
+      sync_partial_evictions();
+      return Error::Internal;
+    }
+    if (victim_it->first == current_fqn) {
+      // Floor formula should prevent this; if it ever fires, skip
+      // the prefetch instead of risking corruption of the
+      // just-served entry.
+      std::fprintf(
+          stderr,
+          "[ET_WEIGHT_OFFLOAD][WARN] prefetch skipped: LRU victim "
+          "for FQN '%s' would be the just-served '%s'; floor "
+          "formula may be violated\n",
+          fqn.c_str(),
+          current_fqn.c_str());
+      sync_partial_evictions();
+      return Error::Internal;
+    }
+    auto& v = victim_it->second;
+    // Queue the free behind the victim's H2D copy: a prefetched
+    // entry may never have had its ready_event waited on.
+    cudaError_t wait_err =
+        cudaStreamWaitEvent(compute_stream_, v.ready_event, 0);
+    if (wait_err != cudaSuccess) {
+      std::fprintf(
+          stderr,
+          "[ET_WEIGHT_OFFLOAD][WARN] prefetch eviction "
+          "cudaStreamWaitEvent on victim '%s' failed: %s; prefetch "
+          "skipped\n",
+          victim_it->first.c_str(),
+          cudaGetErrorString(wait_err));
+      sync_partial_evictions();
+      return Error::Internal;
+    }
+    cudaError_t free_err = cudaFreeAsync(v.dev_ptr, compute_stream_);
+    if (free_err != cudaSuccess) {
+      // This victim was NOT freed, so its accounting stays untouched;
+      // only victims from earlier iterations need the sync.
+      std::fprintf(
+          stderr,
+          "[ET_WEIGHT_OFFLOAD][WARN] prefetch eviction cudaFreeAsync "
+          "for victim '%s' failed: %s; prefetch skipped\n",
+          victim_it->first.c_str(),
+          cudaGetErrorString(free_err));
+      sync_partial_evictions();
+      return Error::Internal;
+    }
+    cudaEventDestroy(v.ready_event);
+    bytes_in_flight_ -= v.nbytes;
+    stats_.evictions++;
+    live_.erase(victim_it);
+    evicted = true;
+  }
+  if (evicted) {
+    // Mirror the miss-path event ordering: one event per eviction
+    // batch, made copy_stream_ wait on it before allocating.
+    //
+    // Failure here is the same hazard as the miss-path equivalent:
+    // live_/bytes_in_flight_ have been mutated to reflect the
+    // evictions, but without the copy_stream_ ↔ compute_stream_
+    // ordering, a subsequent cudaMallocFromPoolAsync would race
+    // the pending cudaFreeAsyncs. Even though serve() ignores our
+    // return value (this is best-effort), Session state must stay
+    // consistent for the NEXT serve(). Fall back to
+    // cudaStreamSynchronize(compute_stream_) so the frees are
+    // guaranteed done before we return.
+    cudaEvent_t evict_done = nullptr;
+    cudaError_t ev_err =
+        cudaEventCreateWithFlags(&evict_done, cudaEventDisableTiming);
+    if (ev_err != cudaSuccess) {
+      std::fprintf(
+          stderr,
+          "[ET_WEIGHT_OFFLOAD][WARN] prefetch cudaEventCreate for "
+          "eviction batch failed: %s; syncing compute_stream and "
+          "skipping prefetch\n",
+          cudaGetErrorString(ev_err));
+      (void)cudaStreamSynchronize(compute_stream_);
+      return Error::Internal;
+    }
+    if (cudaEventRecord(evict_done, compute_stream_) != cudaSuccess ||
+        cudaStreamWaitEvent(copy_stream_, evict_done, 0) != cudaSuccess) {
+      cudaEventDestroy(evict_done);
+      std::fprintf(
+          stderr,
+          "[ET_WEIGHT_OFFLOAD][WARN] prefetch event-record/wait for "
+          "eviction batch failed; syncing compute_stream and skipping "
+          "prefetch\n");
+      (void)cudaStreamSynchronize(compute_stream_);
+      return Error::Internal;
+    }
+    cudaEventDestroy(evict_done);
+  }
+
+  // Step 3: allocate + copy on copy_stream_. SAME as miss path
+  // except we do NOT cudaStreamWaitEvent(compute_, ready) — the
+  // next serve() that consumes this entry as a hit does that.
+  void* dev = nullptr;
+  cudaError_t malloc_err =
+      cudaMallocFromPoolAsync(&dev, need, pool_, copy_stream_);
+  if (malloc_err != cudaSuccess) {
+    std::fprintf(
+        stderr,
+        "[ET_WEIGHT_OFFLOAD][WARN] prefetch cudaMallocFromPoolAsync "
+        "for FQN '%s' (%llu bytes) failed: %s; prefetch skipped\n",
+        fqn.c_str(),
+        static_cast<unsigned long long>(need),
+        cudaGetErrorString(malloc_err));
+    return Error::Internal;
+  }
+  // Releases the fresh allocation on the stream it was allocated on
+  // when a later step fails; nulls `dev` so a second call is a no-op.
+  auto free_on_error = [&]() {
+    if (dev != nullptr) {
+      cudaFreeAsync(dev, copy_stream_);
+      dev = nullptr;
+    }
+  };
+
+  if (cudaMemcpyAsync(
+          dev, host.host_ptr, need, cudaMemcpyHostToDevice, copy_stream_) !=
+      cudaSuccess) {
+    std::fprintf(
+        stderr,
+        "[ET_WEIGHT_OFFLOAD][WARN] prefetch cudaMemcpyAsync for FQN "
+        "'%s' failed; prefetch skipped\n",
+        fqn.c_str());
+    free_on_error();
+    return Error::Internal;
+  }
+  cudaEvent_t ready = nullptr;
+  if (cudaEventCreateWithFlags(&ready, cudaEventDisableTiming) != cudaSuccess) {
+    std::fprintf(
+        stderr,
+        "[ET_WEIGHT_OFFLOAD][WARN] prefetch cudaEventCreate for FQN "
+        "'%s' ready event failed; prefetch skipped\n",
+        fqn.c_str());
+    free_on_error();
+    return Error::Internal;
+  }
+  if (cudaEventRecord(ready, copy_stream_) != cudaSuccess) {
+    cudaEventDestroy(ready);
+    free_on_error();
+    std::fprintf(
+        stderr,
+        "[ET_WEIGHT_OFFLOAD][WARN] prefetch cudaEventRecord for FQN "
+        "'%s' failed; prefetch skipped\n",
+        fqn.c_str());
+    return Error::Internal;
+  }
+
+  bytes_in_flight_ += need;
+  peak_live_bytes_ = std::max(peak_live_bytes_, bytes_in_flight_);
+  // Treat the prefetched entry as "newest" — see commit-8 subplan
+  // for the option-(b) rationale (the FQN is about to be served
+  // next, so its expected next-use is sooner than the just-served).
+  live_.emplace(fqn, LiveAllocation{dev, need, ready, next_seq_++});
+  stats_.bytes_h2d_copied += need;
+  stats_.prefetch_succeeded++;
   return Error::Ok;
 }