solve commit and bring ci back

Gasoonjia · Gasoonjia · commit c8ffe28298ab · 2026-04-22T12:11:27.000-07:00
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
@@ -25,7 +25,6 @@
 
 class COMPILE_SPEC_KEYS(Enum):
     METHOD_NAME = "method_name"
-    SHARE_KV_CACHE_ACROSS_METHODS = "share_kv_cache_across_methods"
 
 
 @experimental(
@@ -287,13 +286,3 @@ def method_name_from_compile_specs(
         raise RuntimeError(
             f"Could not find method name in compile specs: {compile_specs}"
         )
-
-    @classmethod
-    def generate_share_kv_cache_compile_spec(cls) -> CompileSpec:
-        """
-        Generate a CompileSpec to enable cross-method KV cache sharing.
-        """
-        return CompileSpec(
-            COMPILE_SPEC_KEYS.SHARE_KV_CACHE_ACROSS_METHODS.value,
-            bytes([1]),
-        )
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
@@ -17,6 +17,7 @@
 #include <cstdio>
 
 #include <array>
+#include <atomic>
 #include <filesystem>
 #include <fstream>
 #include <mutex>
@@ -80,6 +81,7 @@ namespace {
 constexpr char kSkipCopyOutputToCpuForMethod[] =
     "skip_copy_output_to_cpu_for_method";
 constexpr char kUseSharedCudaStream[] = "use_shared_cuda_stream";
+constexpr char kWeightSharingAcrossMethods[] = "weight_sharing_across_methods";
 } // anonymous namespace
 
 class ET_EXPERIMENTAL CudaBackend final
@@ -173,6 +175,16 @@ class ET_EXPERIMENTAL CudaBackend final
     return shared_cuda_stream_ != nullptr;
   }
 
+  // Enable or disable cross-method per-FQN weight caching. Set via the
+  // kWeightSharingAcrossMethods runtime backend option.
+  void set_weight_sharing_across_methods(bool enabled) {
+    weight_sharing_across_methods_.store(enabled, std::memory_order_relaxed);
+  }
+
+  bool is_weight_sharing_across_methods_enabled() const {
+    return weight_sharing_across_methods_.load(std::memory_order_relaxed);
+  }
+
   Error load_function_pointers_into_handle(
       void* so_handle,
       AOTIDelegateHandle* handle) const {
@@ -264,6 +276,16 @@ class ET_EXPERIMENTAL CudaBackend final
           ET_LOG(Error, "Option %s must be a boolean.", kUseSharedCudaStream);
           return Error::InvalidArgument;
         }
+      } else if (std::strcmp(option.key, kWeightSharingAcrossMethods) == 0) {
+        if (auto* val = std::get_if<bool>(&option.value)) {
+          set_weight_sharing_across_methods(*val);
+        } else {
+          ET_LOG(
+              Error,
+              "Option %s must be a boolean.",
+              kWeightSharingAcrossMethods);
+          return Error::InvalidArgument;
+        }
       }
     }
     return Error::Ok;
@@ -362,11 +384,20 @@ class ET_EXPERIMENTAL CudaBackend final
 
     handle->container_handle = container_handle;
 
-    // Load constants with per-weight caching.
-    // This replaces the old update_constants_from_blob + cross-method sharing
-    // with a unified approach that avoids duplicate GPU allocations.
-    ET_CHECK_OK_OR_RETURN_ERROR(
-        load_constants_with_cache(handle, named_data_map, method_name));
+    // Load constants. When weight_sharing_across_methods is enabled (opt-in
+    // via the kWeightSharingAcrossMethods runtime backend option set by the
+    // runner), use the per-weight FQN cache so methods that share weights
+    // (e.g. prefill/decode) avoid duplicate GPU allocations. Otherwise fall
+    // back to the legacy per-method blob load — required for models whose
+    // methods are independent sub-graphs that may have FQN collisions
+    // (e.g. parakeet).
+    if (is_weight_sharing_across_methods_enabled()) {
+      ET_CHECK_OK_OR_RETURN_ERROR(
+          load_constants_with_cache(handle, named_data_map, method_name));
+    } else {
+      ET_CHECK_OK_OR_RETURN_ERROR(
+          load_constants_legacy(handle, named_data_map, method_name));
+    }
 
     // Use shared CUDA stream if enabled via options, otherwise create one.
     // A shared stream ensures proper ordering across multiple methods
@@ -630,6 +661,11 @@ class ET_EXPERIMENTAL CudaBackend final
   mutable std::mutex cuda_stream_mutex_;
   std::shared_ptr<cudaStream_t> shared_cuda_stream_ = nullptr;
 
+  // Whether to enable cross-method per-FQN weight caching at init time.
+  // Toggled by the kWeightSharingAcrossMethods runtime backend option. Default
+  // OFF: models whose methods are independent sub-graphs can collide on FQNs.
+  std::atomic<bool> weight_sharing_across_methods_{false};
+
   // Cached output tensors for skip-copy optimization.
   // When skip-copy is enabled, output SlimTensors are cached here to keep
   // the underlying GPU memory alive while the caller processes the results.
diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py
@@ -686,15 +686,13 @@ def _export_cuda(model, config, args):
                 CudaPartitioner(
                     [
                         CudaBackend.generate_method_name_compile_spec("decode"),
-                        CudaBackend.generate_share_kv_cache_compile_spec(),
                     ]
                 )
             ],
             "prefill": [
                 CudaPartitioner(
                     [
                         CudaBackend.generate_method_name_compile_spec("prefill"),
-                        CudaBackend.generate_share_kv_cache_compile_spec(),
                     ]
                 )
             ],
diff --git a/examples/models/qwen3_5_moe/main.cpp b/examples/models/qwen3_5_moe/main.cpp
@@ -13,6 +13,7 @@
 #include <executorch/extension/llm/sampler/util.h>
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/tensor/tensor.h>
+#include <executorch/runtime/backend/options.h>
 #include <executorch/runtime/platform/log.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
 
@@ -86,6 +87,34 @@ int main(int argc, char** argv) {
 
   printf("Loading methods...\n");
 
+  // Enable cross-method per-FQN weight sharing in the CUDA backend so that
+  // prefill and decode (which share KV cache and other mutable buffers /
+  // weights) avoid duplicate GPU allocations. This is critical for fitting
+  // Qwen 3.5 MoE on a single GPU. MUST be set BEFORE load_method, since the
+  // backend reads this flag during init() to decide between the per-weight
+  // cache path and the legacy per-method blob load.
+  {
+    executorch::runtime::BackendOptions<1> backend_options;
+    auto set_err =
+        backend_options.set_option("weight_sharing_across_methods", true);
+    if (set_err != Error::Ok) {
+      ET_LOG(
+          Error,
+          "Failed to construct weight_sharing_across_methods option: %d",
+          static_cast<int>(set_err));
+      return 1;
+    }
+    const auto opt_err =
+        executorch::runtime::set_option("CudaBackend", backend_options.view());
+    if (opt_err != Error::Ok) {
+      ET_LOG(
+          Error,
+          "Failed to enable weight_sharing_across_methods: %d",
+          static_cast<int>(opt_err));
+      return 1;
+    }
+  }
+
   // Try loading both methods; fall back to single "forward" method
   bool dual_method = true;
   std::string prefill_method = "prefill";

Original file line number	Diff line number	Diff line change
`@@ -686,15 +686,13 @@ def _export_cuda(model, config, args):`
`686`	`686`	`CudaPartitioner(`
`687`	`687`	`[`
`688`	`688`	`CudaBackend.generate_method_name_compile_spec("decode"),`
`689`		`- CudaBackend.generate_share_kv_cache_compile_spec(),`
`690`	`689`	`]`
`691`	`690`	`)`
`692`	`691`	`],`
`693`	`692`	`"prefill": [`
`694`	`693`	`CudaPartitioner(`
`695`	`694`	`[`
`696`	`695`	`CudaBackend.generate_method_name_compile_spec("prefill"),`
`697`		`- CudaBackend.generate_share_kv_cache_compile_spec(),`
`698`	`696`	`]`
`699`	`697`	`)`
`700`	`698`	`],`