pytorch
diff --git a/‎backends/cuda/passes/weight_offload_pass.py‎
Lines changed: 18 additions & 4 deletions b/‎backends/cuda/passes/weight_offload_pass.py‎
Lines changed: 18 additions & 4 deletions
diff --git a/‎backends/cuda/runtime/cuda_backend.cpp‎
Lines changed: 57 additions & 15 deletions b/‎backends/cuda/runtime/cuda_backend.cpp‎
Lines changed: 57 additions & 15 deletions
@@ -31,9 +31,12 @@
     the pinned host mirror + serves probes. No public partitioner
     kwarg yet; the only opt-in callers are this stack's own tests.
 
-Pinning (``pin_fqns``) is hard-failed at init for now and lands
-with the public partitioner kwarg. Multi-device offload is hard-
-failed at init (commit 7 only supports device 0).
+Pinning (``pin_fqns``) is supported as of commit 9a: the runtime
+allocates each pinned weight once via out-of-pool ``cudaMalloc``
+plus a synchronous H2D copy, then serves it through a resident
+fast path. The pass deduplicates ``pin_fqns`` before serialization,
+so payloads it emits never contain duplicates. Multi-device offload
+is still hard-failed at init (commit 7 only supports device 0).
 
 Schedule / cursor order -- RESOLVED:
     The probe op carries an explicit ``probe_id: int`` argument assigned
@@ -763,7 +766,18 @@ def _apply_weight_offload(
     stack's own tests. See the EXPERIMENTAL banner at the top of
     this module for the current wiring state.
     """
-    pin_fqns = list(pin_fqns or [])
+    # Canonicalize pin_fqns: dedupe while preserving first-seen
+    # order so the payload is stable. The commit-9a runtime
+    # hard-fails on duplicates at parse time; deduping here keeps
+    # harmless caller mistakes from reaching that hard-fail.
+    raw_pins = list(pin_fqns or [])
+    pin_fqns = []
+    _seen_pin = set()
+    for fqn in raw_pins:
+        if fqn in _seen_pin:
+            continue
+        _seen_pin.add(fqn)
+        pin_fqns.append(fqn)
     graph = exported_program.graph_module.graph
 
     # Re-entering the pass would wrap each probe's input (still a
 
@@ -439,18 +439,27 @@ class ET_EXPERIMENTAL CudaBackend final
       // fetching — no point allocating GPU state for a config we'd
       // throw away.
       //
-      // Pinning is deferred to commit 9. Session::create also
-      // hard-fails on non-empty pin_fqns, but pulling the check
-      // forward here saves the entire AOTI container + catalog walk
-      // + dummy install before we discover the failure.
-      if (!offload_payload.pin_fqns.empty()) {
-        std::fprintf(
-            stderr,
-            "[ET_WEIGHT_OFFLOAD][ERROR] method '%s' specifies pin_fqns "
-            "but pinning is deferred to a future commit; drop pin_fqns "
-            "or pass empty\n",
-            method_name.c_str());
-        return Error::InvalidArgument;
+      // Reject duplicate pin_fqns (commit 9a). The pass + partitioner
+      // are supposed to emit a canonical (deduplicated) list, but a
+      // corrupted or hand-rolled payload could repeat an FQN. Hard-fail
+      // at parse time so accounting / allocation downstream can rely
+      // on a 1:1 fqn↔allocation mapping. Session::create's
+      // pinned_.emplace() is the second-layer guard.
+      {
+        std::unordered_set<std::string> pin_seen;
+        pin_seen.reserve(offload_payload.pin_fqns.size());
+        for (const auto& fqn : offload_payload.pin_fqns) {
+          if (!pin_seen.insert(fqn).second) {
+            std::fprintf(
+                stderr,
+                "[ET_WEIGHT_OFFLOAD][ERROR] method '%s' has duplicate "
+                "FQN '%s' in payload.pin_fqns; the partitioner should "
+                "have deduplicated before serializing\n",
+                method_name.c_str(),
+                fqn.c_str());
+            return Error::InvalidArgument;
+          }
+        }
       }
 
       // Single-device constraint for commit 7. The CUDA backend's
@@ -1124,14 +1133,39 @@ class ET_EXPERIMENTAL CudaBackend final
         session_catalog.emplace(fqn, std::move(info));
       }
 
+      // Compute pinned_bytes_total from payload metadata BEFORE
+      // resolving the default budget. With non-empty pin_fqns, the
+      // no-spec default must be `floor + pinned` (not just floor),
+      // since the floor formula already excludes pinned weights —
+      // a `floor`-only default would leave no room for the pinned
+      // allocations and trip the streaming-vs-floor check.
+      uint64_t pinned_bytes_total = 0;
+      for (const auto& fqn : offload_payload.pin_fqns) {
+        // Coverage check above already verified the set; defensive.
+        auto m_it = fqn_to_meta.find(fqn);
+        if (m_it == fqn_to_meta.end()) {
+          std::fprintf(
+              stderr,
+              "[ET_WEIGHT_OFFLOAD][ERROR] pin_fqn '%s' missing from "
+              "payload metadata (should have been caught upstream)\n",
+              fqn.c_str());
+          delete handle;
+          return Error::Internal;
+        }
+        pinned_bytes_total += m_it->second->nbytes;
+      }
+
       // Resolve the per-load budget from the runtime-spec channel.
-      // Default = payload.floor_bytes. Override via the private
+      // Default = payload.floor_bytes + pinned_bytes_total when no
+      // spec is provided. Override via the private
       // ``_weight_offload_internal_budget_bytes`` runtime spec.
-      uint64_t resolved_budget = offload_payload.floor_bytes;
+      // (9b adds the public ``weight_offload_budget_mb`` spec.)
+      uint64_t resolved_budget = 0;
+      bool budget_explicitly_provided = false;
       auto budget_res = context.get_runtime_spec<const char*>(
           "_weight_offload_internal_budget_bytes");
       if (budget_res.error() == Error::NotFound) {
-        // Spec absent — keep the default.
+        // Spec absent — fall through to default.
       } else if (budget_res.error() == Error::InvalidArgument) {
         std::fprintf(
             stderr,
@@ -1179,6 +1213,14 @@ class ET_EXPERIMENTAL CudaBackend final
           return Error::InvalidArgument;
         }
         resolved_budget = static_cast<uint64_t>(parsed);
+        budget_explicitly_provided = true;
+      }
+
+      // Default: cover the streaming floor PLUS pinned bytes. With
+      // no pins, this matches the pre-9a behavior of
+      // ``resolved_budget = floor_bytes``.
+      if (!budget_explicitly_provided) {
+        resolved_budget = offload_payload.floor_bytes + pinned_bytes_total;
       }
 
       auto session_res = weight_offload::Session::create(