pytorch
diff --git a/backends/cuda/passes/weight_offload_pass.py b/backends/cuda/passes/weight_offload_pass.py
Lines changed: 29 additions & 138 deletions
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
Lines changed: 0 additions & 1 deletion
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
Lines changed: 0 additions & 1 deletion
diff --git a/backends/cuda/runtime/weight_offload/constant_catalog.h b/backends/cuda/runtime/weight_offload/constant_catalog.h
Lines changed: 0 additions & 54 deletions
@@ -672,144 +672,35 @@ def _apply_weight_offload(
 ) -> dict:
     """In-place graph rewrite + offload payload computation.
 
-    INTERNAL — leading underscore is the Python signal. The only
-    supported caller is
-    ``CudaBackend.pre_aoti_transform_and_collect_named_data`` (see
-    ``backends/cuda/cuda_backend.py``), which gates on the private
-    ``_weight_offload_internal_enable`` compile spec, sources
-    ``method_name`` from the AOTI method-name spec, and routes the
-    returned payload into ``NamedDataStore`` for ``CudaBackend::init``
-    to parse. ``method_name`` is REQUIRED (no default) precisely so a
-    direct caller importing this function for a multi-method model
-    cannot silently collide all methods on ``"forward"``.
-
-    Inserts ``probe(w, probe_id)`` in front of every consumer of every
-    parameter (or buffer) placeholder, rewriting the consumer's arg to
-    read the probe's output. One probe call per ``(consumer, weight)``
-    pair — not per weight — so the runtime can re-load a weight that
-    was evicted between two uses inside the same forward pass.
-    ``probe_id`` is assigned contiguously (0..N-1) in graph order; the
-    runtime keys the schedule lookup off ``probe_id`` directly, with
-    no cursor.
-
-    Pinned FQNs are included in the schedule like any other FQN — every
-    consumer of a pinned weight gets a probe with its own ``probe_id``,
-    and ``schedule[probe_id]`` returns the pinned FQN. The pin set
-    ships separately (``pin_fqns``) so the runtime can choose the
-    resident fast-path over the streaming path inside ``serve``. The
-    pin set is, however, EXCLUDED from the floor calculation (pinned
-    weights don't compete for the streaming pool). See
-    ``Session::Config::pin_fqns`` and ``Session::register_schedule`` in
-    ``backends/cuda/runtime/weight_offload/weight_offload.h``.
-
-    The pass is the single authoritative source for the pin set. The
-    runtime has NO pin-set option; it parses the list out of the
-    partition payload and passes it to the Session unchanged. Pin set
-    affects floor correctness (the floor is computed assuming pinned
-    FQNs do not stream), so the runtime cannot override it.
-
-    AOTI constant-folding contract:
-      The pass operates on parameter placeholders in the
-      ExportedProgram. AOTI knobs that fold parameters out of the
-      container at compile time (so they no longer appear in
-      ``get_constant_name(idx)``) break offload: the pass inserts
-      probes against the placeholders it sees pre-AOTI, but the
-      folded constants would be loaded eagerly through the normal
-      blob path at runtime — silently defeating offload and
-      reintroducing the OOM this feature exists to prevent. The pass
-      itself cannot observe AOTI folding (it runs before AOTI
-      compile, where all state_dict entries are still placeholders),
-      so the catch lives in the runtime: ``CudaBackend::init`` walks
-      ``get_constant_from_folded(i)`` for every catalog entry and
-      hard-fails on the first folded one (with dummies pre-installed,
-      ``run_const_fold`` would read other constants as garbage). The
-      set-equality coverage check
-      ``non_folded_catalog == unique(schedule)`` is the second half
-      of the defense. Exports that enable weight offload must also
-      configure ``torch._inductor.config.aot_inductor.use_runtime_constant_folding
-      = False``; the partitioner-side opt-in (future work) is the
-      right place to verify that.
-
-    Metadata transport:
-      The returned ``dict`` is the offload payload that
-      ``CudaBackend.preprocess`` serializes (via
-      ``_serialize_payload``) into the AOTI ``NamedDataStore`` under
-      the per-method ``_weight_offload_payload`` key (see
-      ``named_data_key_for_method``). ``cuda_backend.cpp::init``
-      retrieves and parses it.
-
-    Args:
-      exported_program: an ``ExportedProgram`` produced by
-        ``torch.export.export``. Mutated in place: probe nodes are
-        inserted, consumer args are rewritten.
-      method_name: the method this pass is being applied to. Returned
-        verbatim in the payload so the runtime can validate which
-        method the bytes belong to.
-      pin_fqns: FQNs to mark as always-resident. Optional. The list
-        is propagated verbatim into the payload AND used by this pass
-        to exclude those FQNs from the floor calculation (pinned
-        weights don't compete for the streaming pool). Pinned FQNs
-        DO appear in the schedule like any other FQN — keeping
-        ``probe_id`` dense is what lets the runtime do a single
-        ``schedule[probe_id]`` lookup per probe. Pinning an FQN that
-        does not appear as a parameter placeholder is a hard error.
-
-    Returns: a ``dict`` with the keys defined at module scope (all
-    internal payload, not opt-in signals):
-
-      - ``"version"``: ``int`` schema version (currently ``2``). Runtime
-        hard-fails on any version other than 2 (v1 is rejected with a
-        "rebuild required" message).
-      - ``"method_name"``: ``str``. Echoed for runtime validation.
-      - ``"schedule"``: ``list[str]`` of length N indexed by
-        ``probe_id`` — ``schedule[probe_id]`` is the FQN of the weight
-        that probe site reads. Pinned and non-pinned FQNs BOTH appear
-        here (every probe site contributes one entry regardless of pin
-        status); the runtime checks ``pin_fqns`` inside ``serve`` to
-        choose the resident fast-path over the streaming path. Keeping
-        ``probe_id`` dense and contiguous is what lets the runtime do
-        a single ``schedule[probe_id]`` lookup per probe.
-      - ``"floor_bytes"``: ``int`` — conservative FX fusion upper
-        bound on the streaming pool. NOT a tight kernel-level
-        estimate; that needs post-AOTI kernel grouping that a
-        future commit will land. Computed as ``max over consecutive
-        FX candidate pairs of (sum bytes of the UNION of non-pinned
-        working sets at each side) + max single non-pinned
-        weight``. FX candidates are non-view non-probe
-        ``call_function`` nodes plus the output sink (so Inductor
-        fusing independent final consumers into one multi-output
-        kernel still factors in). Each candidate's working set is
-        built by propagating probe FQNs forward through every
-        fusion-eligible edge (see ``_fusion_dependency_sets``).
-        Defaulting to "fusible" overestimates the floor — safe.
-        Claiming barrier where none exists underestimates it —
-        corruption. The runtime asserts
-        ``(weight_offload_budget_mb << 20) - pinned_bytes`` covers
-        this; below-floor budgets hard-fail at init with the
-        required minimum spelled out.
-      - ``"pin_fqns"``: ``list[str]`` of FQNs the runtime keeps
-        resident. Side set over the FQNs that appear in
-        ``schedule``; an FQN in ``pin_fqns`` must also appear in
-        ``schedule`` (at every site where it is read). Empty if
-        ``pin_fqns`` is unset.
-
-    Per-FQN AOTI constant metadata (dtype / sizes / strides /
-    storage_offset / nbytes / device_type / device_index) arrives in
-    the v2 payload via the ``constants_metadata`` block (one entry
-    per ``unique(schedule)`` FQN). The runtime cross-checks each
-    payload entry against AOTI's own ``get_constant_data_size`` and
-    drives both the source-blob copy length and the SlimTensor
-    metadata Session uses for borrowed wraps.
-
-    The opt-in signal is the private compile spec
-    ``_weight_offload_internal_enable`` (see ``COMPILE_SPEC_KEY_ENABLE``);
-    pin FQNs come in via ``_weight_offload_internal_pin_fqns``
-    (NUL-separated UTF-8). The enable signal lives in exactly one
-    place - the compile spec - rather than being duplicated across
-    compile spec + payload. End users opt in through the public
-    ``CudaPartitioner(weight_offload=True,
-    weight_offload_pin_fqns=[...])`` kwargs, which translate to
-    these internal specs.
+    Internal: the only supported caller is
+    ``CudaBackend.pre_aoti_transform_and_collect_named_data``. The
+    ``method_name`` arg is required (no default) so a direct caller
+    on a multi-method model cannot silently collide all methods on
+    ``"forward"``.
+
+    Mutates ``exported_program`` in place: inserts ``probe(w, probe_id)``
+    in front of every consumer of every parameter / buffer placeholder
+    and rewrites the consumer's arg to read the probe's output. One
+    probe per ``(consumer, weight)`` pair so the runtime can re-load
+    a weight evicted between two uses in the same forward pass.
+    ``probe_id`` is dense 0..N-1 in graph order.
+
+    Returns the offload payload dict (see ``PAYLOAD_KEY_*`` at module
+    scope for the schema):
+      * ``schedule[probe_id]`` is the FQN that probe site reads;
+        pinned FQNs appear here too (the runtime picks the resident
+        path inside ``serve``).
+      * ``floor_bytes`` is a conservative FX-fusion-aware upper bound
+        on the streaming pool, excluding pinned FQNs. The runtime
+        hard-fails if ``budget - pinned < floor``.
+      * ``pin_fqns`` is the resident set, deduplicated in first-seen order.
+      * ``constants_metadata`` carries per-FQN dtype / sizes / strides
+        / storage_offset / nbytes / device for runtime cross-check.
+
+    Ordering / re-entry: this pass MUST run before AOTI compile (it
+    operates on placeholders) and MUST NOT be re-run on a graph that
+    already contains probe nodes -- a second run would see the
+    existing probes as consumers and insert probes in front of them.
     """
     # Canonicalize pin_fqns: dedupe while preserving first-seen order
     # so the payload is stable. The runtime hard-fails on duplicates
 
@@ -86,7 +86,6 @@ runtime.cxx_library(
     ],
     headers = [
         "cuda_delegate_handle.h",
-        "weight_offload/constant_catalog.h",
         "weight_offload/payload.h",
         "weight_offload/session.h",
     ],
 
@@ -49,7 +49,6 @@
 #include <executorch/backends/cuda/runtime/platform/platform.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/utils.h>
-#include <executorch/backends/cuda/runtime/weight_offload/constant_catalog.h>
 #include <executorch/backends/cuda/runtime/weight_offload/payload.h>
 #include <executorch/backends/cuda/runtime/weight_offload/session.h>