Skip to content

Commit c8ffe28

Browse files
committed
solve commit and bring ci back
1 parent ab1cbe1 commit c8ffe28

4 files changed

Lines changed: 70 additions & 18 deletions

File tree

backends/aoti/aoti_backend.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525

2626
class COMPILE_SPEC_KEYS(Enum):
2727
METHOD_NAME = "method_name"
28-
SHARE_KV_CACHE_ACROSS_METHODS = "share_kv_cache_across_methods"
2928

3029

3130
@experimental(
@@ -287,13 +286,3 @@ def method_name_from_compile_specs(
287286
raise RuntimeError(
288287
f"Could not find method name in compile specs: {compile_specs}"
289288
)
290-
291-
@classmethod
292-
def generate_share_kv_cache_compile_spec(cls) -> CompileSpec:
293-
"""
294-
Generate a CompileSpec to enable cross-method KV cache sharing.
295-
"""
296-
return CompileSpec(
297-
COMPILE_SPEC_KEYS.SHARE_KV_CACHE_ACROSS_METHODS.value,
298-
bytes([1]),
299-
)

backends/cuda/runtime/cuda_backend.cpp

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <cstdio>
1818

1919
#include <array>
20+
#include <atomic>
2021
#include <filesystem>
2122
#include <fstream>
2223
#include <mutex>
@@ -80,6 +81,7 @@ namespace {
8081
constexpr char kSkipCopyOutputToCpuForMethod[] =
8182
"skip_copy_output_to_cpu_for_method";
8283
constexpr char kUseSharedCudaStream[] = "use_shared_cuda_stream";
84+
constexpr char kWeightSharingAcrossMethods[] = "weight_sharing_across_methods";
8385
} // anonymous namespace
8486

8587
class ET_EXPERIMENTAL CudaBackend final
@@ -173,6 +175,16 @@ class ET_EXPERIMENTAL CudaBackend final
173175
return shared_cuda_stream_ != nullptr;
174176
}
175177

178+
// Enable or disable cross-method per-FQN weight caching. Set via the
179+
// kWeightSharingAcrossMethods runtime backend option.
180+
void set_weight_sharing_across_methods(bool enabled) {
181+
weight_sharing_across_methods_.store(enabled, std::memory_order_relaxed);
182+
}
183+
184+
bool is_weight_sharing_across_methods_enabled() const {
185+
return weight_sharing_across_methods_.load(std::memory_order_relaxed);
186+
}
187+
176188
Error load_function_pointers_into_handle(
177189
void* so_handle,
178190
AOTIDelegateHandle* handle) const {
@@ -264,6 +276,16 @@ class ET_EXPERIMENTAL CudaBackend final
264276
ET_LOG(Error, "Option %s must be a boolean.", kUseSharedCudaStream);
265277
return Error::InvalidArgument;
266278
}
279+
} else if (std::strcmp(option.key, kWeightSharingAcrossMethods) == 0) {
280+
if (auto* val = std::get_if<bool>(&option.value)) {
281+
set_weight_sharing_across_methods(*val);
282+
} else {
283+
ET_LOG(
284+
Error,
285+
"Option %s must be a boolean.",
286+
kWeightSharingAcrossMethods);
287+
return Error::InvalidArgument;
288+
}
267289
}
268290
}
269291
return Error::Ok;
@@ -362,11 +384,20 @@ class ET_EXPERIMENTAL CudaBackend final
362384

363385
handle->container_handle = container_handle;
364386

365-
// Load constants with per-weight caching.
366-
// This replaces the old update_constants_from_blob + cross-method sharing
367-
// with a unified approach that avoids duplicate GPU allocations.
368-
ET_CHECK_OK_OR_RETURN_ERROR(
369-
load_constants_with_cache(handle, named_data_map, method_name));
387+
// Load constants. When weight_sharing_across_methods is enabled (opt-in
388+
// via the kWeightSharingAcrossMethods runtime backend option set by the
389+
// runner), use the per-weight FQN cache so methods that share weights
390+
// (e.g. prefill/decode) avoid duplicate GPU allocations. Otherwise fall
391+
// back to the legacy per-method blob load — required for models whose
392+
// methods are independent sub-graphs that may have FQN collisions
393+
// (e.g. parakeet).
394+
if (is_weight_sharing_across_methods_enabled()) {
395+
ET_CHECK_OK_OR_RETURN_ERROR(
396+
load_constants_with_cache(handle, named_data_map, method_name));
397+
} else {
398+
ET_CHECK_OK_OR_RETURN_ERROR(
399+
load_constants_legacy(handle, named_data_map, method_name));
400+
}
370401

371402
// Use shared CUDA stream if enabled via options, otherwise create one.
372403
// A shared stream ensures proper ordering across multiple methods
@@ -630,6 +661,11 @@ class ET_EXPERIMENTAL CudaBackend final
630661
mutable std::mutex cuda_stream_mutex_;
631662
std::shared_ptr<cudaStream_t> shared_cuda_stream_ = nullptr;
632663

664+
// Whether to enable cross-method per-FQN weight caching at init time.
665+
// Toggled by the kWeightSharingAcrossMethods runtime backend option. Default
666+
// OFF — unsafe for models whose methods are independent sub-graphs that may have FQN collisions (e.g. parakeet).
667+
std::atomic<bool> weight_sharing_across_methods_{false};
668+
633669
// Cached output tensors for skip-copy optimization.
634670
// When skip-copy is enabled, output SlimTensors are cached here to keep
635671
// the underlying GPU memory alive while the caller processes the results.

examples/models/qwen3_5_moe/export.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -686,15 +686,13 @@ def _export_cuda(model, config, args):
686686
CudaPartitioner(
687687
[
688688
CudaBackend.generate_method_name_compile_spec("decode"),
689-
CudaBackend.generate_share_kv_cache_compile_spec(),
690689
]
691690
)
692691
],
693692
"prefill": [
694693
CudaPartitioner(
695694
[
696695
CudaBackend.generate_method_name_compile_spec("prefill"),
697-
CudaBackend.generate_share_kv_cache_compile_spec(),
698696
]
699697
)
700698
],

examples/models/qwen3_5_moe/main.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <executorch/extension/llm/sampler/util.h>
1414
#include <executorch/extension/module/module.h>
1515
#include <executorch/extension/tensor/tensor.h>
16+
#include <executorch/runtime/backend/options.h>
1617
#include <executorch/runtime/platform/log.h>
1718
#include <pytorch/tokenizers/hf_tokenizer.h>
1819

@@ -86,6 +87,34 @@ int main(int argc, char** argv) {
8687

8788
printf("Loading methods...\n");
8889

90+
// Enable cross-method per-FQN weight sharing in the CUDA backend so that
91+
// prefill and decode (which share KV cache and other mutable buffers /
92+
// weights) avoid duplicate GPU allocations. This is critical for fitting
93+
// Qwen 3.5 MoE on a single GPU. MUST be set BEFORE load_method, since the
94+
// backend reads this flag during init() to decide between the per-weight
95+
// cache path and the legacy per-method blob load.
96+
{
97+
executorch::runtime::BackendOptions<1> backend_options;
98+
auto set_err =
99+
backend_options.set_option("weight_sharing_across_methods", true);
100+
if (set_err != Error::Ok) {
101+
ET_LOG(
102+
Error,
103+
"Failed to construct weight_sharing_across_methods option: %d",
104+
static_cast<int>(set_err));
105+
return 1;
106+
}
107+
const auto opt_err =
108+
executorch::runtime::set_option("CudaBackend", backend_options.view());
109+
if (opt_err != Error::Ok) {
110+
ET_LOG(
111+
Error,
112+
"Failed to enable weight_sharing_across_methods: %d",
113+
static_cast<int>(opt_err));
114+
return 1;
115+
}
116+
}
117+
89118
// Try loading both methods; fall back to single "forward" method
90119
bool dual_method = true;
91120
std::string prefill_method = "prefill";

0 commit comments

Comments
 (0)