pytorch
diff --git a/backends/cuda/passes/tests/test_weight_offload_pass.py b/backends/cuda/passes/tests/test_weight_offload_pass.py
Lines changed: 4 additions & 15 deletions
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
Lines changed: 78 additions & 115 deletions
diff --git a/backends/cuda/runtime/weight_offload/constant_catalog.h b/backends/cuda/runtime/weight_offload/constant_catalog.h
Lines changed: 17 additions & 199 deletions
@@ -4,21 +4,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Contract tests for ``_apply_weight_offload``.
-
-The pass owns the export-time half of CUDA weight offloading: it
-rewrites parameter consumers to read through ``probe(w, probe_id)``
-nodes and returns the v1 offload payload (``version``,
-``method_name``, ``schedule``, ``floor_bytes``, ``pin_fqns``).
-
-These tests assert the public contract — what the returned payload
-contains, where probe nodes appear, how ``probe_id`` lines up with
-``schedule``, how view chains on weights are duplicated per
-consumer, the set-union semantics of the floor calculation, and
-which inputs hard-fail. They do NOT exercise the runtime serve path
-(covered by the dispatch test under ``backends/cuda/tests``) or any
-partitioner / opt-in plumbing (still unwired in this PR).
-"""
+"""Contract tests for ``_apply_weight_offload``: payload contents,
+probe-node placement, probe_id ↔ schedule alignment, view-chain
+duplication, floor set-union semantics, and hard-fail paths. Runtime
+serve and partitioner plumbing are covered by other test files."""
 
 import unittest
 
 
@@ -9,41 +9,26 @@
 #pragma once
 
 // ===========================================================================
-// EXPERIMENTAL -- PER-FQN AOTI CONSTANT METADATA SOURCE
+// EXPERIMENTAL -- per-FQN AOTI constant metadata
 // ===========================================================================
-// Builds a per-FQN view (dtype, sizes, nbytes, live data pointer) of
-// the AOTI container's constants by combining
-// ``get_num_constants`` / ``get_constant_original_fqn`` /
-// ``extract_constants_map`` (all existing AOTI APIs — no upstream
-// PyTorch changes required for offload to know what each constant
-// looks like).
+// Holds the canonical ConstantInfo struct used by the offload runtime
+// (Session, cuda_backend.cpp). The runtime builds its catalog inline
+// in CudaBackend::init from the payload's per-FQN metadata block --
+// not from AOTI's extract_constants_map, which after dummy installation
+// returns placeholder metadata for the installed dummies rather than
+// the originals.
 //
-// Scope: the catalog is the METADATA source for offload. It is NOT
-// the host-byte source for the eventual host mirror — relying on
-// ``data_ptr()`` here would imply "load every weight to GPU first,
-// then free", which is exactly the path the weight-offload feature
-// exists to avoid. The host-byte source (likely the
-// ``_weights_blob`` NamedData entry, indexed by per-constant
-// offsets sourced separately) is a problem the host-mirror commit
-// solves.
-//
-// Built once per (handle, method) AFTER constants are loaded so the
-// returned ``data_ptr`` reflects the FINAL active handles —
-// cross-method weight sharing in ``cuda_backend.cpp`` can swap the
-// container's constant pointers during load, so reading before load
-// gives stale data_ptrs.
+// Consumers that copy bytes (Session::serve) must use the LOGICAL size
+// (product(sizes) * elementSize(dtype)) for the H2D length, not
+// storage_nbytes -- view-style constants can have storage_nbytes >
+// logical, and Session::create hard-fails any scheduled FQN where
+// the two disagree.
 // ===========================================================================
 
 #include <cstdint>
 #include <string>
-#include <unordered_map>
 #include <vector>
 
-#include <executorch/backends/aoti/aoti_delegate_handle.h>
-#include <executorch/backends/aoti/common_shims_slim.h>
-#include <executorch/runtime/core/error.h>
-#include <executorch/runtime/core/result.h>
-
 namespace executorch::backends::cuda::weight_offload {
 
 struct ConstantInfo {
@@ -52,185 +37,18 @@ struct ConstantInfo {
   std::vector<int64_t> sizes;
   std::vector<int64_t> strides;
   int64_t storage_offset{0};
-  // ``storage_nbytes`` is what ``aoti_torch_get_storage_size``
-  // reports — the byte size of the underlying storage allocation,
-  // which CAN be larger than the logical tensor for view-style
-  // constants. Consumers that copy bytes (Session::serve) must
-  // use the LOGICAL size (``product(sizes) * elementSize(dtype)``)
-  // for the H2D / D2H length, otherwise an offset-zero contiguous
-  // view backed by larger storage will overrun the destination.
-  // Session::create hard-fails any scheduled FQN where
-  // ``storage_nbytes != logical_nbytes``.
   uint64_t storage_nbytes{0};
-  // Device type from ``aoti_torch_get_device_type``. Session::create
-  // hard-fails any scheduled FQN whose ``device_type != CUDA`` — the
+  // Device type from aoti_torch_get_device_type. Session::create
+  // hard-fails any scheduled FQN whose device_type != CUDA -- the
   // sync-H2D path has no model for host-resident or other-device
   // constants, and silently treating them as device 0 would corrupt
   // data on multi-GPU hosts.
   int32_t device_type{0};
   int32_t device_index{0};
-  // Live device pointer for the constant's bytes, valid until the
-  // AOTI container or its user-managed pair table swaps it out.
-  // METADATA OBSERVABILITY ONLY — see the file banner for why this
-  // is not the host-mirror byte source.
+  // Live device pointer (the installed dummy in the offload path).
+  // Used for ProbeRegistry registration; the runtime never reads
+  // bytes from this pointer.
   void* data_ptr{nullptr};
 };
 
-// Build the FQN -> ConstantInfo catalog for the AOTI container
-// associated with ``handle``. Caller MUST have already loaded
-// constants (``update_constants_from_blob`` or
-// ``update_user_managed_constant_buffer_pairs``) — see the file
-// banner.
-//
-// Constants whose ``get_constant_original_fqn`` returns null or an
-// empty string are SKIPPED — AOTI emits unnamed/internal constants
-// for some lowerings and they're not addressable through the FQN
-// the pass uses. Matches the existing ``load_constants_with_cache``
-// filter so opt-in init doesn't hard-fail on containers the
-// non-offload path accepts cleanly. The schedule ⊆ catalog
-// validation in ``CudaBackend::init`` then catches the case where
-// a probed parameter is missing because AOTI folded it (the FQN
-// won't be in the catalog at all).
-//
-// Returns ``Error::Internal`` if any of the required AOTI symbols
-// are unresolved on the handle, if any AOTI call fails, or if a
-// named constant from ``get_constant_original_fqn`` is missing
-// from ``extract_constants_map``. The offload contract is "loud at
-// init"; opt-in callers should hard-fail on a non-Ok result.
-inline ::executorch::runtime::Result<
-    std::unordered_map<std::string, ConstantInfo>>
-build_constant_catalog(
-    ::executorch::backends::aoti::AOTIDelegateHandle* handle) {
-  using ::executorch::backends::aoti::AOTInductorConstantMapHandle;
-  using ::executorch::backends::aoti::AtenTensorHandle;
-  using ::executorch::runtime::Error;
-  using SlimTensor = ::executorch::backends::aoti::Tensor;
-
-  if (handle == nullptr || handle->container_handle == nullptr ||
-      handle->get_num_constants == nullptr ||
-      handle->get_constant_original_fqn == nullptr ||
-      handle->extract_constants_map == nullptr) {
-    return Error::Internal;
-  }
-
-  size_t num_constants = 0;
-  if (handle->get_num_constants(handle->container_handle, &num_constants) !=
-      Error::Ok) {
-    return Error::Internal;
-  }
-
-  // idx -> FQN, skipping unnamed/internal constants whose original
-  // FQN is null or empty (mirrors ``load_constants_with_cache``).
-  std::vector<std::string> named_fqns;
-  named_fqns.reserve(num_constants);
-  for (size_t i = 0; i < num_constants; ++i) {
-    const char* fqn = nullptr;
-    if (handle->get_constant_original_fqn(handle->container_handle, i, &fqn) !=
-        Error::Ok) {
-      return Error::Internal;
-    }
-    if (fqn == nullptr || fqn[0] == '\0') {
-      continue;
-    }
-    named_fqns.emplace_back(fqn);
-  }
-
-  // The AtenTensorHandle values populated below are BORROWED from the
-  // AOTI container — they remain valid for the container's lifetime
-  // and are not owned by ``extracted``. The catalog stores only the
-  // derived dtype / sizes / strides / device fields (see ConstantInfo
-  // below); it never retains the handles past this function, so no
-  // separate teardown of ``extracted`` is required.
-  std::unordered_map<std::string, AtenTensorHandle> extracted;
-  if (handle->extract_constants_map(
-          handle->container_handle,
-          reinterpret_cast<AOTInductorConstantMapHandle>(&extracted),
-          /*use_inactive=*/false) != Error::Ok) {
-    return Error::Internal;
-  }
-
-  std::unordered_map<std::string, ConstantInfo> catalog;
-  catalog.reserve(named_fqns.size());
-  for (const auto& fqn : named_fqns) {
-    auto it = extracted.find(fqn);
-    if (it == extracted.end()) {
-      // get_constant_original_fqn reported this FQN but
-      // extract_constants_map did not surface it — schema drift in
-      // the AOTI container we're not equipped to handle.
-      return Error::Internal;
-    }
-    SlimTensor* tensor = reinterpret_cast<SlimTensor*>(it->second);
-
-    ConstantInfo info;
-    info.fqn = fqn;
-
-    int32_t dtype = 0;
-    if (::executorch::backends::aoti::aoti_torch_get_dtype(tensor, &dtype) !=
-        Error::Ok) {
-      return Error::Internal;
-    }
-    info.dtype = dtype;
-
-    int64_t ndim = 0;
-    if (::executorch::backends::aoti::aoti_torch_get_dim(tensor, &ndim) !=
-        Error::Ok) {
-      return Error::Internal;
-    }
-    int64_t* sizes_ptr = nullptr;
-    if (::executorch::backends::aoti::aoti_torch_get_sizes(
-            tensor, &sizes_ptr) != Error::Ok ||
-        (ndim > 0 && sizes_ptr == nullptr)) {
-      return Error::Internal;
-    }
-    info.sizes.assign(sizes_ptr, sizes_ptr + ndim);
-
-    int64_t* strides_ptr = nullptr;
-    if (::executorch::backends::aoti::aoti_torch_get_strides(
-            tensor, &strides_ptr) != Error::Ok ||
-        (ndim > 0 && strides_ptr == nullptr)) {
-      return Error::Internal;
-    }
-    info.strides.assign(strides_ptr, strides_ptr + ndim);
-
-    int64_t storage_offset = 0;
-    if (::executorch::backends::aoti::aoti_torch_get_storage_offset(
-            tensor, &storage_offset) != Error::Ok) {
-      return Error::Internal;
-    }
-    info.storage_offset = storage_offset;
-
-    int64_t storage_size = 0;
-    if (::executorch::backends::aoti::aoti_torch_get_storage_size(
-            tensor, &storage_size) != Error::Ok ||
-        storage_size < 0) {
-      return Error::Internal;
-    }
-    info.storage_nbytes = static_cast<uint64_t>(storage_size);
-
-    int32_t device_type = 0;
-    if (::executorch::backends::aoti::aoti_torch_get_device_type(
-            tensor, &device_type) != Error::Ok) {
-      return Error::Internal;
-    }
-    info.device_type = device_type;
-    int32_t device_index = 0;
-    if (::executorch::backends::aoti::aoti_torch_get_device_index(
-            tensor, &device_index) != Error::Ok) {
-      return Error::Internal;
-    }
-    info.device_index = device_index;
-
-    void* data_ptr = nullptr;
-    if (::executorch::backends::aoti::aoti_torch_get_data_ptr(
-            tensor, &data_ptr) != Error::Ok) {
-      return Error::Internal;
-    }
-    info.data_ptr = data_ptr;
-
-    catalog.emplace(fqn, std::move(info));
-  }
-
-  return catalog;
-}
-
 } // namespace executorch::backends::cuda::weight_offload