pytorch
diff --git a/‎backends/cuda/passes/weight_offload_pass.py‎
Lines changed: 7 additions & 22 deletions b/‎backends/cuda/passes/weight_offload_pass.py‎
Lines changed: 7 additions & 22 deletions
diff --git a/‎backends/cuda/runtime/cuda_backend.cpp‎
Lines changed: 17 additions & 144 deletions b/‎backends/cuda/runtime/cuda_backend.cpp‎
Lines changed: 17 additions & 144 deletions
diff --git a/‎backends/cuda/runtime/weight_offload/payload.h‎
Lines changed: 85 additions & 9 deletions b/‎backends/cuda/runtime/weight_offload/payload.h‎
Lines changed: 85 additions & 9 deletions
@@ -702,18 +702,7 @@ def _apply_weight_offload(
     contains probe nodes -- the second pass would insert probes on
     the probes' outputs.
     """
-    # Canonicalize pin_fqns: dedupe while preserving first-seen order
-    # so the payload is stable. The runtime hard-fails on duplicates
-    # at parse time; deduping here keeps harmless caller mistakes
-    # from reaching that hard-fail.
-    raw_pins = list(pin_fqns or [])
-    pin_fqns = []
-    _seen_pin = set()
-    for fqn in raw_pins:
-        if fqn in _seen_pin:
-            continue
-        _seen_pin.add(fqn)
-        pin_fqns.append(fqn)
+    pin_fqns = list(pin_fqns or [])
     graph = exported_program.graph_module.graph
 
     # Re-entering the pass would wrap each probe's input (still a
@@ -781,19 +770,15 @@ def _nbytes(fqn: str) -> int:
                 f"weight offload: FQN {fqn!r} appears as a placeholder but "
                 f"is missing from state_dict and constants"
             )
-        # The runtime serves weights by raw-byte H2D from a contiguous
-        # host mirror sized at numel * element_size. A non-contiguous
-        # parameter / buffer (e.g. a register_buffer holding a strided
-        # view) would have storage_nbytes > logical_nbytes; the runtime
-        # would later hard-fail in validate_scheduled_layout, but the
-        # error is far clearer if we name the offending FQN at export.
+        # The host mirror is sized at numel * element_size; a non-
+        # contiguous tensor would over-read its storage. The runtime
+        # parser also rejects non-contiguous metadata, but flagging
+        # here names the FQN with a Python stack trace.
         if not t.is_contiguous():
             raise ValueError(
-                f"weight offload: FQN {fqn!r} is a non-contiguous tensor "
+                f"weight offload: FQN {fqn!r} is non-contiguous "
                 f"(shape={tuple(t.shape)}, strides={tuple(t.stride())}); "
-                f"the offload runtime only supports contiguous "
-                f"parameters and buffers. Call .contiguous() on the "
-                f"source tensor before exporting."
+                f"call .contiguous() on the source tensor before exporting"
             )
         return t.numel() * t.element_size()
 
 
@@ -430,49 +430,12 @@ class ET_EXPERIMENTAL CudaBackend final
       offload_payload = std::move(parsed.get());
       offload_buf->Free();
 
-      // Fail-fast on payload-derived configurations we do NOT
-      // support yet, BEFORE container creation / .so load / blob
-      // fetching — no point allocating GPU state for a config we'd
-      // throw away.
+      // Fail-fast on incompatible runtime modes BEFORE container
+      // creation / .so load / blob fetching — no point allocating
+      // GPU state for a config we'd throw away. pin_fqns dedup,
+      // device_index==0, and per-FQN metadata invariants are all
+      // enforced by the payload parser.
       //
-      // Deduplicate pin_fqns. The pass + partitioner emit a canonical
-      // list, but a corrupted or hand-rolled payload could repeat.
-      // Hard-fail at parse time so accounting / allocation downstream
-      // can rely on a 1:1 fqn↔allocation mapping. Session::create's
-      // pinned_.emplace() is the second-layer guard.
-      {
-        std::unordered_set<std::string> pin_seen;
-        pin_seen.reserve(offload_payload.pin_fqns.size());
-        for (const auto& fqn : offload_payload.pin_fqns) {
-          if (!pin_seen.insert(fqn).second) {
-            std::fprintf(
-                stderr,
-                "[ET_WEIGHT_OFFLOAD][ERROR] method '%s': duplicate FQN "
-                "'%s' in payload.pin_fqns\n",
-                method_name.c_str(),
-                fqn.c_str());
-            return Error::InvalidArgument;
-          }
-        }
-      }
-
-      // Single-device constraint. ``create_with_device("cuda", nullptr)``
-      // doesn't take a per-method device index; dummies + stream + pool
-      // land on device 0 regardless of payload. The parser already
-      // validated device_index == 0 per-entry; this is a belt-and-
-      // braces re-check on the first entry (and a no-op for empty
-      // metadata).
-      if (!offload_payload.constants_metadata.empty() &&
-          offload_payload.constants_metadata[0].device_index != 0) {
-        std::fprintf(
-            stderr,
-            "[ET_WEIGHT_OFFLOAD][ERROR] method '%s' has device_index=%d "
-            "in payload metadata; only device 0 is supported\n",
-            method_name.c_str(),
-            offload_payload.constants_metadata[0].device_index);
-        return Error::InvalidArgument;
-      }
-
       // Disallow shared-stream mode with offload. The shared stream
       // (see create_shared_cuda_stream) is created on whichever
       // device happened to be current at the time of the first
@@ -876,115 +839,24 @@ class ET_EXPERIMENTAL CudaBackend final
       for (const auto& m : offload_payload.constants_metadata) {
         fqn_to_meta[m.fqn] = &m;
       }
+      // Cross-check AOTI's per-constant data_size against the
+      // payload's nbytes. This is the one check init still needs to
+      // do because the two sides are independent sources of truth:
+      // the parser validated payload internals (dtype + sizes ->
+      // nbytes consistency, contiguity, etc.), but AOTI's container
+      // is a separate origin and could disagree with the payload if
+      // the .pte and the .so were built from drifted sources.
       for (size_t i = 0; i < num_constants; ++i) {
         const auto& fqn = aoti_fqns[i];
-        auto m_it = fqn_to_meta.find(fqn);
-        if (m_it == fqn_to_meta.end()) {
-          // Parser's cross-field check should have caught this, but
-          // defensively re-verify.
-          std::fprintf(
-              stderr,
-              "[ET_WEIGHT_OFFLOAD][ERROR] FQN '%s' missing from payload "
-              "metadata\n",
-              fqn.c_str());
-          delete handle;
-          return Error::Internal;
-        }
-        const auto& m = *m_it->second;
-        // Validate the RAW int32 dtype against a supported code set
-        // BEFORE casting to slim ScalarType: ScalarType is int8_t-
-        // backed, so a corrupted dtype like 256 would silently
-        // truncate to 0 (Byte) on cast. Mirrors the pass-side
-        // _TORCH_DTYPE_TO_C10 map.
-        static constexpr int32_t kSupportedDtypeCodes[] = {
-            0, // Byte / uint8
-            1, // Char / int8
-            2, // Short / int16
-            3, // Int / int32
-            4, // Long / int64
-            5, // Half / float16
-            6, // Float / float32
-            11, // Bool
-            15, // BFloat16
-        };
-        bool dtype_supported = false;
-        for (int32_t code : kSupportedDtypeCodes) {
-          if (m.dtype == code) {
-            dtype_supported = true;
-            break;
-          }
-        }
-        if (!dtype_supported) {
-          std::fprintf(
-              stderr,
-              "[ET_WEIGHT_OFFLOAD][ERROR] FQN '%s': dtype=%d not in the "
-              "offload-supported set\n",
-              fqn.c_str(),
-              m.dtype);
-          delete handle;
-          return Error::InvalidArgument;
-        }
-        auto slim_dtype =
-            static_cast<::executorch::backends::aoti::slim::c10::ScalarType>(
-                m.dtype);
-        uint64_t logical =
-            ::executorch::backends::aoti::slim::c10::elementSize(slim_dtype);
-        for (int64_t s : m.sizes) {
-          if (s <= 0) {
-            std::fprintf(
-                stderr,
-                "[ET_WEIGHT_OFFLOAD][ERROR] FQN '%s': non-positive size "
-                "%lld in payload\n",
-                fqn.c_str(),
-                static_cast<long long>(s));
-            delete handle;
-            return Error::Internal;
-          }
-          // Portable overflow check (MSVC has no __builtin_mul_overflow
-          // for 64-bit). For unsigned a * b: overflow iff
-          // b != 0 && a > UINT64_MAX / b. Guard b != 0 first;
-          // logical starts at elementSize(dtype) which is > 0, and
-          // s > 0 from the check above, so the second condition is
-          // the actual safety net.
-          const uint64_t s_u = static_cast<uint64_t>(s);
-          if (s_u != 0 &&
-              logical > std::numeric_limits<uint64_t>::max() / s_u) {
-            std::fprintf(
-                stderr,
-                "[ET_WEIGHT_OFFLOAD][ERROR] FQN '%s': logical nbytes "
-                "overflow (dtype=%d)\n",
-                fqn.c_str(),
-                m.dtype);
-            delete handle;
-            return Error::Internal;
-          }
-          logical *= s_u;
-        }
-        if (logical != static_cast<uint64_t>(data_sizes[i])) {
+        const auto& m = *fqn_to_meta.at(fqn);
+        if (m.nbytes != static_cast<uint64_t>(data_sizes[i])) {
           std::fprintf(
               stderr,
               "[ET_WEIGHT_OFFLOAD][ERROR] FQN '%s': AOTI data_size=%zu "
-              "vs payload logical nbytes=%llu\n",
+              "vs payload nbytes=%llu (pass <-> AOTI drift)\n",
               fqn.c_str(),
               data_sizes[i],
-              static_cast<unsigned long long>(logical));
-          delete handle;
-          return Error::InvalidArgument;
-        }
-        // Payload also carries an explicit nbytes field for defense
-        // in depth. The pass computes it as dtype*product(sizes), so
-        // it must equal `logical` we just computed. Treat any
-        // mismatch as schema drift / corrupted payload — if v2
-        // promises this field, a stale value here means we can't
-        // trust the rest of the metadata either.
-        if (m.nbytes != logical) {
-          std::fprintf(
-              stderr,
-              "[ET_WEIGHT_OFFLOAD][ERROR] FQN '%s': payload nbytes=%llu "
-              "vs dtype+sizes logical=%llu\n",
-              fqn.c_str(),
-              static_cast<unsigned long long>(m.nbytes),
-              static_cast<unsigned long long>(logical));
+              static_cast<unsigned long long>(m.nbytes));
           delete handle;
           return Error::InvalidArgument;
         }
@@ -1274,6 +1146,7 @@ class ET_EXPERIMENTAL CudaBackend final
           session_catalog,
           handle->get_cuda_stream(),
           resolved_budget,
+          pinned_bytes_total,
           static_cast<const uint8_t*>(blob_buf->data()),
           fqn_offsets,
           std::move(dummy_guard),
 
@@ -35,7 +35,9 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
+#include <limits>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
 #include <executorch/runtime/core/error.h>
@@ -166,23 +168,53 @@ class Cursor {
   size_t offset_{0};
 };
 
-// Read a single ConstantMetadata entry with per-field bounds. Used in
-// the inner loop of parse_payload.
+// Maps a payload dtype code to its per-element size in bytes. Returns
+// 0 for any code outside the supported set; the caller treats 0 as an
+// invalid-payload signal. The supported set mirrors the pass-side
+// ``_TORCH_DTYPE_TO_C10`` map: a code the pass emits that is missing
+// here hard-fails at parse, which is the intended drift signal.
+inline uint64_t element_size(int32_t dtype) {
+  switch (dtype) {
+    case 0: // uint8
+    case 1: // int8
+    case 11: // bool
+      return 1;
+    case 2: // int16
+    case 5: // float16
+    case 15: // bfloat16
+      return 2;
+    case 3: // int32
+    case 6: // float32
+      return 4;
+    case 4: // int64
+      return 8;
+    default:
+      return 0;
+  }
+}
+
+// Read a single ConstantMetadata entry with per-field bounds + cross-
+// field consistency (dtype is supported, sizes positive, strides
+// describe a C-contiguous layout, storage_offset == 0, nbytes ==
+// element_size(dtype) * product(sizes)). Catching these at parse means
+// downstream code can trust the parsed struct directly.
 inline ::executorch::runtime::Error read_constant_metadata(
     Cursor& cur,
     ConstantMetadata& m) {
   using ::executorch::runtime::Error;
   if (cur.read_bounded_string(m.fqn, kMaxStrLen) != Error::Ok) {
     return Error::InvalidArgument;
   }
-  // Per-entry FQN must be non-empty — empty fqn means no addressable
-  // probe routing.
   if (m.fqn.empty()) {
     return Error::InvalidArgument;
   }
   if (cur.read_i32(m.dtype) != Error::Ok) {
     return Error::InvalidArgument;
   }
+  const uint64_t esize = element_size(m.dtype);
+  if (esize == 0) {
+    return Error::InvalidArgument;
+  }
   uint32_t ndim = 0;
   if (cur.read_u32(ndim) != Error::Ok) {
     return Error::InvalidArgument;
@@ -191,30 +223,57 @@ inline ::executorch::runtime::Error read_constant_metadata(
     return Error::InvalidArgument;
   }
   m.sizes.resize(ndim);
+  uint64_t logical = esize;
   for (uint32_t k = 0; k < ndim; ++k) {
     if (cur.read_i64(m.sizes[k]) != Error::Ok) {
       return Error::InvalidArgument;
     }
-    // Positive sizes only — zero-product constants are hard-failed
-    // per the v8 zero-byte policy. Scalars (ndim==0) skip this loop
-    // entirely and are accepted by construction: numel == 1, no
-    // dimension to validate.
+    // Positive sizes only. Scalars (ndim==0) skip this loop entirely
+    // and are accepted: logical stays at element_size, numel == 1.
     if (m.sizes[k] <= 0) {
       return Error::InvalidArgument;
     }
+    const uint64_t s_u = static_cast<uint64_t>(m.sizes[k]);
+    if (logical > std::numeric_limits<uint64_t>::max() / s_u) {
+      return Error::InvalidArgument;
+    }
+    logical *= s_u;
   }
   m.strides.resize(ndim);
   for (uint32_t k = 0; k < ndim; ++k) {
     if (cur.read_i64(m.strides[k]) != Error::Ok) {
       return Error::InvalidArgument;
     }
   }
+  // Strides must describe a C-contiguous layout: strides[i] ==
+  // product(sizes[i+1..]). The offload host mirror is sized for
+  // logical bytes and the H2D copy is dense, so any non-contiguous
+  // layout would over- or under-read.
+  {
+    int64_t expected = 1;
+    for (int64_t i = static_cast<int64_t>(ndim) - 1; i >= 0; --i) {
+      if (m.strides[i] != expected) {
+        return Error::InvalidArgument;
+      }
+      expected *= m.sizes[i];
+    }
+  }
   if (cur.read_i64(m.storage_offset) != Error::Ok) {
     return Error::InvalidArgument;
   }
+  if (m.storage_offset != 0) {
+    return Error::InvalidArgument;
+  }
   if (cur.read_u64(m.nbytes) != Error::Ok) {
     return Error::InvalidArgument;
   }
+  // nbytes must equal the logical byte count derived from dtype +
+  // sizes. The pass writes it as `element_size * product(sizes)`;
+  // catching drift here means downstream consumers can read either
+  // field interchangeably.
+  if (m.nbytes != logical) {
+    return Error::InvalidArgument;
+  }
   if (cur.read_i32(m.device_type) != Error::Ok) {
     return Error::InvalidArgument;
   }
@@ -331,7 +390,10 @@ inline ::executorch::runtime::Result<Payload> parse_payload(
   // Cross-field invariants for v2:
   //   - constants_metadata FQN set must equal unique(schedule).
   //   - No duplicate FQNs across metadata entries.
-  // Catching these at parse means we hard-fail before any GPU work.
+  //   - No duplicate FQNs in pin_fqns.
+  //   - Every pin_fqn must appear in the schedule.
+  // Catching these at parse means downstream code (init, Session)
+  // can trust the parsed struct without re-validating.
   if (!p.constants_metadata.empty() || !p.schedule.empty()) {
     std::vector<std::string> md_fqns;
     md_fqns.reserve(p.constants_metadata.size());
@@ -353,6 +415,20 @@ inline ::executorch::runtime::Result<Payload> parse_payload(
       return Error::InvalidArgument;
     }
   }
+  if (!p.pin_fqns.empty()) {
+    std::unordered_set<std::string> sched_set(
+        p.schedule.begin(), p.schedule.end());
+    std::unordered_set<std::string> pin_set;
+    pin_set.reserve(p.pin_fqns.size());
+    for (const auto& f : p.pin_fqns) {
+      if (!pin_set.insert(f).second) {
+        return Error::InvalidArgument; // duplicate in pin_fqns
+      }
+      if (sched_set.find(f) == sched_set.end()) {
+        return Error::InvalidArgument; // pin_fqn not in schedule
+      }
+    }
+  }
 
   return p;
 }