Skip to content

Commit 9b81951

Browse files
committed
Update base for Update on "[Executorch] Add non-flash SDPA for decode"
Add cpu_sdpa template function in op_sdpa_impl.h that provides a simpler SDPA implementation using standard GEMM (no tiling). This is useful as a baseline and for cases where flash attention is not optimal. The implementation uses a single SeqDim parameter for all tensors and supports causal masking, attention masks, GQA, and multi-threading. During decode (seq_len == 1), the tiled flash attention implementation has unnecessary overhead from its blocking/tiling logic. The simpler unfused SDPA path using direct GEMM is more efficient for single-query attention, yielding ~25-30% decode throughput improvement on S25 (41 -> 53 tok/s for 1.4B parameter model). This makes cpu_sdpa always available (previously gated behind ET_USE_UNFUSED_SDPA) and dispatches to it when seq_len == 1 and inputs are not quantized. Prefill continues to use flash attention. Differential Revision: [D96044318](https://our.internmc.facebook.com/intern/diff/D96044318/) [ghstack-poisoned]
2 parents cc2244c + fb1618e commit 9b81951

84 files changed

Lines changed: 4118 additions & 636 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.ci/scripts/export_model_artifact.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
358358
STREAMING_ARG=""
359359
PREPROCESSOR_ARGS="--feature_size 128 --output_file ${OUTPUT_DIR}/preprocessor.pte"
360360
if [ "$USE_STREAMING" = "true" ]; then
361-
STREAMING_ARG="--streaming"
361+
STREAMING_ARG="--streaming --sliding-window 2048"
362362
PREPROCESSOR_ARGS="$PREPROCESSOR_ARGS --streaming"
363363
else
364364
PREPROCESSOR_ARGS="$PREPROCESSOR_ARGS --stack_output --max_audio_len 300"

.github/workflows/android-release-artifacts.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ jobs:
165165
contents: read
166166
steps:
167167
- name: configure aws credentials
168-
uses: aws-actions/configure-aws-credentials@v1.7.0
168+
uses: aws-actions/configure-aws-credentials@v4
169169
with:
170170
role-to-assume: arn:aws:iam::308535385114:role/gha_executorch_upload-frameworks-android
171171
aws-region: us-east-1

.github/workflows/apple.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ jobs:
239239
python-version: '3.11'
240240
cache: pip
241241
- name: configure aws credentials
242-
uses: aws-actions/configure-aws-credentials@v1.7.0
242+
uses: aws-actions/configure-aws-credentials@v4
243243
with:
244244
role-to-assume: arn:aws:iam::308535385114:role/gha_executorch_upload-frameworks-ios
245245
aws-region: us-east-1

.lintrunner.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ exclude_patterns = [
223223
'**/*.gif',
224224
'extension/llm/tokenizers',
225225
'extension/llm/tokenizers/**',
226+
'backends/cadence/utils/FACTO',
226227
'examples/cuda',
227228
'kernels/portable',
228229
# File contains @generated

backends/aoti/utils.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <executorch/runtime/platform/log.h>
1616
#include <cstddef>
1717
#include <cstdint>
18+
#include <utility>
1819
#include <vector>
1920

2021
namespace executorch {
@@ -163,6 +164,27 @@ inline bool is_contiguous_tensor(
163164
return true;
164165
}
165166

167+
// Scope guard: invokes a callable on destruction. Similar in spirit to
168+
// std::experimental::scope_exit (Library Fundamentals TS v3, <experimental/scope>), which is not part of standard C++17/20.
169+
template <typename F>
170+
class ScopeGuard {
171+
public:
172+
static_assert(
173+
noexcept(std::declval<F&>()()),
174+
"ScopeGuard callable must be noexcept to avoid std::terminate "
175+
"if it throws during stack unwinding");
176+
177+
// Takes the callable by rvalue only: F is the class template parameter, so
// F&& is not a forwarding reference. The callable is moved into the guard.
explicit ScopeGuard(F&& fn) : fn_(std::move(fn)) {}
178+
// Runs the stored callable unconditionally; there is no dismiss/release hook.
~ScopeGuard() noexcept {
179+
fn_();
180+
}
181+
// Non-copyable (and, because the copy operations are user-declared, not
// implicitly movable), so each guard invokes its callable exactly once.
ScopeGuard(const ScopeGuard&) = delete;
182+
ScopeGuard& operator=(const ScopeGuard&) = delete;
183+
184+
private:
185+
F fn_;
186+
};
187+
166188
} // namespace aoti
167189
} // namespace backends
168190
} // namespace executorch

backends/apple/coreml/BUCK

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ runtime.cxx_library(
1818
"runtime/delegate/ETCoreMLDefaultModelExecutor.mm",
1919
"runtime/delegate/ETCoreMLLogging.mm",
2020
"runtime/delegate/ETCoreMLModel.mm",
21+
"runtime/delegate/ETCoreMLModelCache.mm",
2122
"runtime/delegate/ETCoreMLModelCompiler.mm",
2223
"runtime/delegate/ETCoreMLModelLoader.mm",
2324
"runtime/delegate/ETCoreMLModelManager.mm",

backends/apple/coreml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ set(DELEGATE_SOURCES
3232
runtime/delegate/ETCoreMLAsset.mm
3333
runtime/delegate/ETCoreMLAssetManager.mm
3434
runtime/delegate/ETCoreMLDefaultModelExecutor.mm
35+
runtime/delegate/ETCoreMLModelCache.mm
3536
runtime/delegate/ETCoreMLModelLoader.mm
3637
runtime/delegate/ETCoreMLModelCompiler.mm
3738
runtime/delegate/ETCoreMLLogging.mm
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
//
2+
// ETCoreMLCacheProtocol.h
3+
//
4+
// Copyright © 2024 Apple Inc. All rights reserved.
5+
//
6+
// Please refer to the license found in the LICENSE file in the root directory of the source tree.
7+
8+
#import <Foundation/Foundation.h>
9+
10+
NS_ASSUME_NONNULL_BEGIN
11+
12+
/// Protocol defining the interface for CoreML model caching.
13+
///
14+
/// This protocol abstracts the cache implementation so that callers depend only on the lookup, store, remove, purge, and temp-directory operations declared here rather than on a concrete cache class.
15+
@protocol ETCoreMLCache <NSObject>
16+
17+
/// Returns the URL of a cached model if it exists and is valid, otherwise nil.
18+
///
19+
/// @param identifier The unique identifier for the cached model.
20+
/// @param error On failure, error is filled with the failure information.
21+
/// @return The URL to the cached model bundle, or nil if not found or invalid.
22+
///
23+
/// @warning The returned URL may become invalid before the caller uses it if another
24+
/// process deletes or replaces the cached model. Callers MUST handle MLModel load
25+
/// failures gracefully by treating them as cache misses and recompiling.
26+
- (nullable NSURL*)cachedModelURLForIdentifier:(NSString*)identifier error:(NSError**)error;
27+
28+
/// Stores a compiled model in the cache. Returns the cached URL on success.
29+
///
30+
/// @param compiledModelURL The URL of the compiled model bundle to cache. Must exist.
31+
/// @param identifier The unique identifier for this model.
32+
/// @param error On failure, error is filled with the failure information.
33+
/// @return The URL of the cached model, or nil on failure.
34+
- (nullable NSURL*)storeModelAtURL:(NSURL*)compiledModelURL withIdentifier:(NSString*)identifier error:(NSError**)error;
35+
36+
/// Removes a specific cached model.
37+
///
38+
/// @param identifier The unique identifier for the cached model to remove.
39+
/// @param error On failure, error is filled with the failure information.
40+
/// @return YES if the model was removed or didn't exist. Returns NO only on I/O errors.
41+
- (BOOL)removeCachedModelWithIdentifier:(NSString*)identifier error:(NSError**)error;
42+
43+
/// Clears the entire cache, including all cached models.
44+
///
45+
/// @param error On failure, error is filled with the failure information.
46+
/// @return YES if the cache was purged successfully, otherwise NO.
47+
- (BOOL)purgeAndReturnError:(NSError**)error;
48+
49+
/// Returns a temp URL where intermediate files can be written during compilation.
50+
/// This is guaranteed to be on the same filesystem as the cache, ensuring atomic moves.
51+
///
52+
/// @param error On failure, error is filled with the failure information.
53+
/// @return A temp URL where intermediate files can be written, or nil on failure.
54+
///
55+
/// @note The temp URL is unique (UUID-based) to avoid conflicts.
56+
/// @note Temp entries are cleaned up automatically after 24 hours.
57+
- (nullable NSURL*)temporaryDirectoryWithError:(NSError**)error;
58+
59+
@end
60+
61+
NS_ASSUME_NONNULL_END
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
//
2+
// ETCoreMLModelCache.h
3+
//
4+
// Copyright © 2024 Apple Inc. All rights reserved.
5+
//
6+
// Please refer to the license found in the LICENSE file in the root directory of the source tree.
7+
8+
#import <Foundation/Foundation.h>
9+
10+
#import "ETCoreMLCacheProtocol.h"
11+
12+
NS_ASSUME_NONNULL_BEGIN
13+
14+
extern NSString* const ETCoreMLModelCacheErrorDomain;
15+
16+
/// Error codes describing why a model-cache operation failed.
/// NOTE(review): presumably these are the `code` values of NSErrors in
/// `ETCoreMLModelCacheErrorDomain` — confirm against the implementation.
typedef NS_ENUM(NSInteger, ETCoreMLModelCacheErrorCode) {
17+
ETCoreMLModelCacheErrorCodeUnknown = 0,
18+
ETCoreMLModelCacheErrorCodeInitializationFailed = 1,
19+
ETCoreMLModelCacheErrorCodeInvalidIdentifier = 2,
20+
ETCoreMLModelCacheErrorCodeSourceNotFound = 3,
21+
ETCoreMLModelCacheErrorCodeDiskFull = 4,
22+
ETCoreMLModelCacheErrorCodeIOError = 5,
23+
ETCoreMLModelCacheErrorCodeCorruptedCache = 6,
24+
};
25+
26+
/// A simplified, filesystem-based cache for compiled CoreML models.
27+
///
28+
/// This class provides a cache implementation that stores compiled models as directories
29+
/// in a versioned cache structure. It uses atomic writes (rename) to ensure cache integrity
30+
/// even in the presence of crashes or concurrent access.
31+
///
32+
/// Directory structure:
33+
/// ```
34+
/// cache_root/
35+
/// ├── version.txt (cache format version)
36+
/// ├── models/
37+
/// │ ├── {identifier}.mlmodelc/ (compiled model bundle)
38+
/// │ ├── {identifier}.accessed (last access time for LRU eviction)
39+
/// │ └── ...
40+
/// └── temp/
41+
/// └── {uuid}/ (mlpackage files awaiting compilation)
42+
/// ```
43+
///
44+
/// ## Thread Safety and Concurrency Guarantees
45+
///
46+
/// This class provides **NO internal synchronization**. It is designed to be used in one of
47+
/// two ways:
48+
///
49+
/// 1. **Single-threaded access**: All calls to a single instance from one thread/queue.
50+
///
51+
/// 2. **External serialization**: When used via `ETCoreMLModelManager`, access is serialized
52+
/// by the manager's per-identifier loading queue. This is the expected usage pattern.
53+
///
54+
/// **Multi-process safety** is provided by:
55+
/// - Atomic filesystem operations (`rename()`)
56+
/// - Unique temp paths (UUID-based) to avoid conflicts
57+
/// - "Last writer wins" semantics (acceptable since all writers produce identical output)
58+
///
59+
/// **Multiple instances** pointing to the same directory are safe because:
60+
/// - Each write uses a unique temp path
61+
/// - Final placement uses atomic `moveItemAtURL:toURL:error:` (POSIX `rename()`)
62+
/// - Concurrent writes result in "last writer wins" (both write identical data)
63+
/// - Cleanup only targets entries older than 24 hours
64+
///
65+
/// **Callers are responsible for**:
66+
/// - Handling `MLModel` load failures gracefully (cache entry may be replaced/deleted
67+
/// between URL retrieval and model load)
68+
/// - Not relying on returned URLs remaining valid indefinitely
69+
@interface ETCoreMLModelCache : NSObject <ETCoreMLCache>
70+
71+
/// Plain `init` and `new` are unavailable; create instances with
/// `-initWithCacheRootDirectory:`, the designated initializer.
- (instancetype)init NS_UNAVAILABLE;
72+
+ (instancetype)new NS_UNAVAILABLE;
73+
74+
/// The root directory for all cache data (contains models/, temp/, version.txt).
75+
@property (nonatomic, readonly) NSURL* cacheRootDirectory;
76+
77+
/// Whether the cache was initialized successfully and is ready for use.
78+
/// If NO, all operations will fail. Check this after initialization.
79+
@property (nonatomic, readonly, getter=isReady) BOOL ready;
80+
81+
/// If `ready` is NO, this contains the error that occurred during initialization.
82+
@property (nonatomic, readonly, nullable) NSError* initializationError;
83+
84+
/// Initializes the cache with the given root directory.
85+
/// Creates the directory structure if it doesn't exist.
86+
/// Check the `ready` property after initialization to verify success.
87+
/// If initialization fails, `initializationError` will contain the reason.
88+
///
89+
/// @param cacheRootDirectory The root directory for all cache data.
90+
- (instancetype)initWithCacheRootDirectory:(NSURL*)cacheRootDirectory NS_DESIGNATED_INITIALIZER;
91+
92+
/// Returns the URL of a cached model if it exists and is valid, otherwise nil.
93+
///
94+
/// @param identifier The unique identifier for the cached model.
95+
/// @param error On failure, error is filled with the failure information.
96+
/// @return The URL to the cached model bundle, or nil if not found or invalid.
97+
///
98+
/// @warning The returned URL may become invalid before the caller uses it if another
99+
/// process deletes or replaces the cached model. Callers MUST handle MLModel load
100+
/// failures gracefully by treating them as cache misses and recompiling.
101+
- (nullable NSURL*)cachedModelURLForIdentifier:(NSString*)identifier error:(NSError**)error;
102+
103+
/// Stores a compiled model in the cache. Returns the cached URL on success.
104+
///
105+
/// @param compiledModelURL The URL of the compiled model bundle to cache. Must exist.
106+
/// @param identifier The unique identifier for this model. Must not contain '/' or '..'.
107+
/// @param error On failure, contains the error. Check for ETCoreMLModelCacheErrorCodeDiskFull
108+
/// to handle out-of-space conditions specially.
109+
/// @return The URL of the cached model, or nil on failure.
110+
- (nullable NSURL*)storeModelAtURL:(NSURL*)compiledModelURL withIdentifier:(NSString*)identifier error:(NSError**)error;
111+
112+
/// Removes a specific cached model. This is a best-effort operation that removes
113+
/// the model bundle and access time files for the given identifier.
114+
///
115+
/// @param identifier The unique identifier for the cached model to remove.
116+
/// @param error On failure, error is filled with the failure information.
117+
/// @return YES on success (including if the model didn't exist), NO on validation errors.
/// NOTE(review): the ETCoreMLCache protocol documents NO as meaning an I/O error,
/// while this says NO means a validation error — confirm against the
/// implementation and align the two descriptions.
118+
- (BOOL)removeCachedModelWithIdentifier:(NSString*)identifier error:(NSError**)error;
119+
120+
/// Clears the entire cache, including all cached models.
121+
/// Recreates the empty directory structure after clearing.
122+
///
123+
/// @param error On failure, error is filled with the failure information.
124+
/// @return YES if the cache was purged successfully, otherwise NO.
125+
- (BOOL)purgeAndReturnError:(NSError**)error;
126+
127+
#pragma mark - Temp Directory (for mlpackage extraction before compilation)
128+
129+
/// Returns a temp URL where an mlpackage can be extracted before compilation.
130+
/// The caller is responsible for cleaning up this directory after compilation completes.
131+
///
132+
/// @param error On failure, error is filled with the failure information.
133+
/// @return A temp URL where the mlpackage can be extracted, or nil on failure.
134+
///
135+
/// @note The temp URL is unique and includes a UUID to avoid conflicts.
136+
/// @note Temp entries are automatically cleaned up after 24 hours if not removed.
137+
- (nullable NSURL*)temporaryDirectoryWithError:(NSError**)error;
138+
139+
@end
140+
141+
NS_ASSUME_NONNULL_END

0 commit comments

Comments
 (0)