pytorch
diff --git a/‎backends/xnnpack/runtime/XNNExecutor.h‎
Lines changed: 21 additions & 0 deletions b/‎backends/xnnpack/runtime/XNNExecutor.h‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎backends/xnnpack/runtime/XNNPACKBackend.cpp‎
Lines changed: 45 additions & 21 deletions b/‎backends/xnnpack/runtime/XNNPACKBackend.cpp‎
Lines changed: 45 additions & 21 deletions
diff --git a/‎backends/xnnpack/runtime/XNNWeightsCache.h‎
Lines changed: 20 additions & 8 deletions b/‎backends/xnnpack/runtime/XNNWeightsCache.h‎
Lines changed: 20 additions & 8 deletions
diff --git a/‎backends/xnnpack/runtime/XNNWeightsCacheManager.cpp‎
Lines changed: 91 additions & 0 deletions b/‎backends/xnnpack/runtime/XNNWeightsCacheManager.cpp‎
Lines changed: 91 additions & 0 deletions
diff --git a/‎backends/xnnpack/runtime/XNNWeightsCacheManager.h‎
Lines changed: 71 additions & 0 deletions b/‎backends/xnnpack/runtime/XNNWeightsCacheManager.h‎
Lines changed: 71 additions & 0 deletions
@@ -25,6 +25,9 @@ namespace backends {
 namespace xnnpack {
 namespace delegate {
 
+// Forward-declared to keep XNNWeightsCache.h out of this header.
+class XNNWeightsCache;
+
 class XNNExecutor {
  private:
   std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> runtime_{
@@ -37,6 +40,10 @@ class XNNExecutor {
   std::vector<xnn_external_value> externals_;
   std::vector<std::string> packed_data_names_;
   std::shared_ptr<XNNWorkspace> workspace_;
+  // Owned so the cache outlives delete_packed_data in destroy(),
+  // even when every other executor sharing it is gone. Empty when the
+  // weight cache is disabled for this executor.
+  std::shared_ptr<XNNWeightsCache> weights_cache_;
   std::atomic<bool> in_use_{false};
   std::atomic<bool> destroyed_{false};
 
@@ -71,6 +78,20 @@ class XNNExecutor {
     return workspace_;
   }
 
+  // Called once by XnnpackBackend::init after compileModel succeeds, and
+  // only when the weight cache is enabled for this PTE. Passing an empty
+  // shared_ptr is equivalent to never calling this.
+  inline void set_weights_cache(std::shared_ptr<XNNWeightsCache> cache) {
+    weights_cache_ = std::move(cache);
+  }
+
+  // Returns the weights cache held by this executor (may be empty).
+  // XnnpackBackend::execute and destroy() lock the cache's mutex through
+  // it; destroy() also uses it to call delete_packed_data.
+  inline std::shared_ptr<XNNWeightsCache> get_weights_cache() const {
+    return weights_cache_;
+  }
+
   /**
    * Initialize the XNNExecutor with a given runtime and input/output ids.
    * The input/output ids are expected to be sorted in order of their
 
@@ -91,18 +91,28 @@ class XnnpackBackend final
     auto workspace = workspace_result.get();
 
     bool use_weight_cache = options_.resolve_weight_cache(context);
-    // Hold the lock for the entire init-compile-finalize sequence to prevent
-    // concurrent inits from resetting is_finalized_ or overwriting
-    // named_data_map_ while compileModel is using the shared weights cache.
-    std::unique_lock<std::mutex> lock_weights_cache(
-        options_.weights_cache_mutex(), std::defer_lock);
+    // Per-path weights cache: same-path PTEs share an instance and
+    // serialize on its mutex; different paths run in parallel.
+    std::shared_ptr<xnnpack::delegate::XNNWeightsCache> weights_cache;
+    std::unique_lock<std::mutex> lock_weights_cache;
     if (use_weight_cache) {
-      lock_weights_cache.lock();
-
-      const auto& cache_path = options_.get_packed_cache_path();
-      options_.weights_cache().set_packed_cache_path(cache_path);
+      // Only honor a path coming through runtime_spec (per-PTE opt-in).
+      // Reading the backend-singleton global would let a non-opt-in PTE
+      // inherit another model's cache file.
+      std::string cache_path;
+      auto path_spec = context.get_runtime_spec<const char*>(
+          xnnpack::packed_cache_path_option_key);
+      if (path_spec.ok()) {
+        cache_path = path_spec.get();
+      }
+      auto wc_result = options_.get_or_create_weights_cache(cache_path);
+      if (!wc_result.ok()) {
+        return wc_result.error();
+      }
+      weights_cache = wc_result.get();
+      lock_weights_cache = std::unique_lock<std::mutex>(weights_cache->mutex());
 
-      options_.weights_cache().initialize_for_runtime(
+      weights_cache->initialize_for_runtime(
           context.get_runtime_allocator(), named_data_map);
       workspace->set_uses_weight_cache();
     }
@@ -118,7 +128,7 @@ class XnnpackBackend final
         processed->data(),
         processed->size(),
         executor,
-        &options_.weights_cache(),
+        weights_cache.get(),
         workspace_ptr,
         named_data_map,
         use_weight_cache);
@@ -135,6 +145,12 @@ class XnnpackBackend final
       return err;
     }
 
+    // Hand the cache to the executor (held by shared_ptr so it
+    // outlives any sibling executors that share it).
+    if (use_weight_cache) {
+      executor->set_weights_cache(std::move(weights_cache));
+    }
+
     return executor;
   }
 
@@ -146,10 +162,12 @@ class XnnpackBackend final
 
     auto workspace = executor->get_workspace();
 
-    std::unique_lock<std::mutex> lock_weights_cache(
-        options_.weights_cache_mutex(), std::defer_lock);
-    if (executor->uses_weight_cache() || workspace->uses_weight_cache()) {
-      lock_weights_cache.lock();
+    // Lock the cache shared with sibling executors at the same path.
+    // Empty cache → init() ran without the weight cache for this PTE.
+    auto cache = executor->get_weights_cache();
+    std::unique_lock<std::mutex> lock_weights_cache;
+    if (cache) {
+      lock_weights_cache = std::unique_lock<std::mutex>(cache->mutex());
     }
 
     auto [raii_lock, _] = workspace->acquire();
@@ -176,17 +194,21 @@ class XnnpackBackend final
     if (handle != nullptr) {
       auto executor = static_cast<xnnpack::delegate::XNNExecutor*>(handle);
       auto workspace = executor->get_workspace();
+      auto cache = executor->get_weights_cache();
 
-      const std::lock_guard<std::mutex> lock_weights_cache(
-          options_.weights_cache_mutex());
+      // Local shared_ptr keeps the instance alive through
+      // delete_packed_data even if the executor was the last holder.
+      std::unique_lock<std::mutex> lock_weights_cache;
+      if (cache) {
+        lock_weights_cache = std::unique_lock<std::mutex>(cache->mutex());
+      }
 
 #ifdef ENABLE_XNNPACK_PROFILING
       executor->print_avg_op_timings();
 #endif
 
-      if (executor->uses_weight_cache()) {
-        options_.weights_cache().delete_packed_data(
-            executor->get_packed_data_names());
+      if (cache && executor->uses_weight_cache()) {
+        cache->delete_packed_data(executor->get_packed_data_names());
       }
 
       // This is needed to serialize access to xnn_delete_runtime which is not
@@ -237,7 +259,9 @@ class XnnpackBackend final
   mutable xnnpack::XnnpackBackendOptions options_;
 
   // Lock hierarchy for mutexes:
-  //   options_.weights_cache_mutex()
+  //   weights_cache_manager_.meta_mutex_  (leaf — held only during
+  //                                        manager map ops, never nested)
+  //   XNNWeightsCache::instance_mutex_    (one per cache instance)
   //   workspace_meta_mutex_
   //   workspace_mutex_ (owned by executor)
 };
 
@@ -14,6 +14,7 @@
 #include <executorch/runtime/core/memory_allocator.h>
 #include <executorch/runtime/core/result.h>
 #include <executorch/runtime/executor/pte_data_map.h>
+#include <mutex>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -139,21 +140,28 @@ class XNNWeightsCache {
   Error delete_packed_data(const std::vector<std::string>& packed_names);
 
   /**
-   * Set the path for the file-backed packed weight storage.
-   * When set, reserve_space() allocates from a MAP_SHARED file instead
-   * of heap, and finalize_for_runtime() calls msync to make pages clean.
+   * Set the file-backed storage path. When set, reserve_space()
+   * allocates from a MAP_SHARED file instead of heap, and
+   * finalize_for_runtime() msyncs pages.
    *
-   * The path MUST be unique per XNNWeightsCache instance — sharing it
-   * across instances (or processes) would mean O_TRUNC corrupts the other
-   * holder's mappings (SIGBUS on access). initialize_for_runtime() takes
-   * an advisory exclusive flock on the file; if the lock fails the mmap
-   * path is disabled for this instance and allocations fall back to heap.
+   * Call once, before any other method, and never again. Two
+   * instances sharing the same path will corrupt each other on
+   * O_TRUNC (SIGBUS); the manager prevents this by per-path dedup.
    */
   void set_packed_cache_path(const std::string& path);
 
   /** Save packed weight index so subsequent loads skip packing. */
   Error save_packed_index();
 
+  /**
+   * @return the per-instance mutex. The cache never locks it itself, so
+   * callers must hold it around every other method call and every
+   * XNNPACK callback that touches the cache during xnn_create_runtime.
+   */
+  std::mutex& mutex() noexcept {
+    return instance_mutex_;
+  }
+
  private:
   static constexpr uint32_t kCacheMagic = 0x58505743; // "XPWC"
   // Bump when the on-disk layout (footer or per-entry record) changes.
@@ -215,6 +223,10 @@ class XNNWeightsCache {
   // in mmap_regions_, so delete_packed_data() can munmap when ref_count==0.
   std::unordered_map<void*, size_t> file_ptr_to_region_index_;
 
+  // See mutex() for the locking contract — caller-owned, no internal
+  // use within this class.
+  std::mutex instance_mutex_;
+
   // Function pointers to override XNNPACK's default xnn_weights_cache_provider
   // functions.
   static size_t look_up(
 
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/xnnpack/runtime/XNNWeightsCacheManager.h>
+
+#include <executorch/runtime/core/error.h>
+
+#include <utility>
+#include <vector>
+
+namespace executorch::backends::xnnpack {
+
+using executorch::runtime::Error;
+using executorch::runtime::Result;
+
+Result<std::shared_ptr<delegate::XNNWeightsCache>>
+XNNWeightsCacheManager::get_or_create(const std::string& cache_file_path) {
+  // No file requested: all such callers share a single heap-only cache.
+  if (cache_file_path.empty()) {
+    std::scoped_lock<std::mutex> guard(empty_path_mutex_);
+    auto shared = empty_path_cache_.lock();
+    if (!shared) {
+      shared = std::make_shared<delegate::XNNWeightsCache>();
+      empty_path_cache_ = shared;
+    }
+    return shared;
+  }
+
+  // File-backed: reuse the live cache registered under this exact path
+  // string; otherwise build a new one and (re)register it.
+  std::scoped_lock<std::mutex> guard(meta_mutex_);
+  std::shared_ptr<delegate::XNNWeightsCache> shared;
+  const auto found = caches_.find(cache_file_path);
+  if (found != caches_.end()) {
+    shared = found->second.lock();
+  }
+  if (!shared) {
+    shared = std::make_shared<delegate::XNNWeightsCache>();
+    // Configure the path before the instance is published in the map.
+    shared->set_packed_cache_path(cache_file_path);
+    caches_.insert_or_assign(cache_file_path, shared);
+  }
+  return shared;
+}
+
+Error XNNWeightsCacheManager::save_all() {
+  // Pin live caches (pruning expired entries) while holding meta_mutex_.
+  std::vector<std::shared_ptr<delegate::XNNWeightsCache>> pinned;
+  {
+    std::scoped_lock<std::mutex> guard(meta_mutex_);
+    pinned.reserve(caches_.size());
+    for (auto entry = caches_.begin(); entry != caches_.end();) {
+      auto strong = entry->second.lock();
+      if (!strong) {
+        entry = caches_.erase(entry);
+        continue;
+      }
+      pinned.push_back(std::move(strong));
+      ++entry;
+    }
+  }
+
+  // meta_mutex_ released: save each under its own mutex; first error wins.
+  Error result = Error::Ok;
+  for (const auto& cache : pinned) {
+    std::scoped_lock<std::mutex> guard(cache->mutex());
+    const Error status = cache->save_packed_index();
+    if (result == Error::Ok && status != Error::Ok) {
+      result = status;
+    }
+  }
+  return result;
+}
+
+size_t XNNWeightsCacheManager::live_count() const {
+  // Test-only helper: counts path-keyed entries whose cache is still
+  // alive. The separate empty-path slot is never included.
+  std::scoped_lock<std::mutex> guard(meta_mutex_);
+  size_t alive = 0;
+  for (const auto& kv : caches_) {
+    alive += kv.second.expired() ? 0u : 1u;
+  }
+  return alive;
+}
+
+} // namespace executorch::backends::xnnpack
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/result.h>
+
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+
+namespace executorch::backends::xnnpack {
+
+/**
+ * One `XNNWeightsCache` per cache file path, held by `weak_ptr` so it
+ * lives as long as its owners (cf. `XNNWorkspaceManager` PerModel).
+ * NOTE(review): raw-string keys; aliases of one file are not merged.
+ *
+ * Per-path keying prevents `initialize_for_runtime` from a second
+ * path tearing down the first path's fd / mmap regions (SIGBUS).
+ *
+ * Empty path returns one shared heap-only instance so callers
+ * without a file still get XNNPACK's in-memory name dedup.
+ *
+ * Lock order: `meta_mutex_` → `XNNWeightsCache::mutex()` →
+ * `XNNWorkspaceManager::workspace_meta_mutex_` → `XNNWorkspace::mutex_`.
+ */
+class XNNWeightsCacheManager {
+ public:
+  XNNWeightsCacheManager() = default;
+  ~XNNWeightsCacheManager() = default;
+
+  XNNWeightsCacheManager(const XNNWeightsCacheManager&) = delete;
+  XNNWeightsCacheManager& operator=(const XNNWeightsCacheManager&) = delete;
+  XNNWeightsCacheManager(XNNWeightsCacheManager&&) = delete;
+  XNNWeightsCacheManager& operator=(XNNWeightsCacheManager&&) = delete;
+
+  /** Returns the live cache for `cache_file_path`, creating one if none
+   * is alive; empty path → shared heap-only cache. Never null on success. */
+  runtime::Result<std::shared_ptr<delegate::XNNWeightsCache>> get_or_create(
+      const std::string& cache_file_path);
+
+  /** Calls `save_packed_index()` on every live path-keyed cache under
+   * its per-instance mutex (the empty-path cache is not visited).
+   * Returns the first error but keeps going so one failure doesn't
+   * strand the others. Opportunistically erases expired weak_ptrs. */
+  runtime::Error save_all();
+
+  /** Test-only: live path-keyed caches (empty-path slot excluded). */
+  size_t live_count() const;
+
+ private:
+  mutable std::mutex meta_mutex_;
+  std::unordered_map<std::string, std::weak_ptr<delegate::XNNWeightsCache>>
+      caches_;
+
+  // Own slot and mutex for the empty-path (heap-only) cache: no string
+  // hashing, no contention on meta_mutex_ (the two are never nested).
+  mutable std::mutex empty_path_mutex_;
+  std::weak_ptr<delegate::XNNWeightsCache> empty_path_cache_;
+};
+
+} // namespace executorch::backends::xnnpack