pytorch
diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
Lines changed: 2 additions & 2 deletions
diff --git a/backends/aoti/common_shims_slim.cpp b/backends/aoti/common_shims_slim.cpp
Lines changed: 4 additions & 0 deletions
diff --git a/backends/aoti/common_shims_slim.h b/backends/aoti/common_shims_slim.h
Lines changed: 1 addition & 0 deletions
diff --git a/backends/aoti/slim/c10/core/ScalarType.h b/backends/aoti/slim/c10/core/ScalarType.h
Lines changed: 8 additions & 1 deletion
diff --git a/backends/apple/coreml/BUCK b/backends/apple/coreml/BUCK
Lines changed: 1 addition & 0 deletions
diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelCache.h b/backends/apple/coreml/runtime/delegate/ETCoreMLModelCache.h
Lines changed: 141 additions & 0 deletions
@@ -354,7 +354,7 @@ EOF
     fi
     ;;
   qwen3_5_moe)
-    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --prompt 'What is the capital of France?' --max_new_tokens 32"
+    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --prompt 'What is the capital of France?' --max_new_tokens 128 --temperature 0"
     ;;
   voxtral_realtime)
     RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0"
 
@@ -145,8 +145,8 @@ jobs:
         # Run CUDA backend Python tests
         python -m pytest backends/cuda/tests backends/cuda/passes/tests -v -o "addopts="
 
-        # Run quantize roundtrip tests (Qwen 3.5 MoE save/load prequantized)
-        python -m pytest examples/models/qwen3_5_moe/test_quantize_roundtrip.py -v -o "addopts="
+        # Run Qwen 3.5 MoE tests (quantize roundtrip + TurboQuant KV cache)
+        python -m pytest examples/models/qwen3_5_moe/test_quantize_roundtrip.py examples/models/qwen3_5_moe/test_turboquant.py -v -o "addopts="
 
   export-model-cuda-artifact:
     name: export-model-cuda-artifact
 
@@ -134,6 +134,10 @@ int32_t aoti_torch_dtype_int8() {
   return 1; // ScalarType::Char
 }
 
+int32_t aoti_torch_dtype_uint8() {
+  return 0; // ScalarType::Byte (uint8_t); value must stay in sync with c10::ScalarType
+}
+
 int32_t aoti_torch_dtype_bool() {
   return 11; // ScalarType::Bool
 }
 
@@ -76,6 +76,7 @@ AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int64();
 AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int32();
 AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16();
 AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8();
+AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_uint8();
 AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bool();
 
 // ============================================================
 
@@ -23,7 +23,7 @@ using BFloat16 = ::executorch::runtime::etensor::BFloat16;
 /// Enum representing the scalar type (dtype) of tensor elements.
 /// Note: Enum values must match PyTorch's c10::ScalarType for compatibility.
 enum class ScalarType : int8_t {
-  // Byte = 0,     // uint8_t - not currently needed
+  Byte = 0, // uint8_t
   Char = 1, // int8_t
   Short = 2, // int16_t
   Int = 3, // int32_t
@@ -43,6 +43,7 @@ enum class ScalarType : int8_t {
 };
 
 // Type alias constants for convenience
+constexpr ScalarType kByte = ScalarType::Byte;
 constexpr ScalarType kChar = ScalarType::Char;
 constexpr ScalarType kShort = ScalarType::Short;
 constexpr ScalarType kInt = ScalarType::Int;
@@ -56,6 +57,8 @@ constexpr ScalarType kBFloat16 = ScalarType::BFloat16;
 /// @return The size in bytes of a single element.
 inline size_t elementSize(ScalarType t) {
   switch (t) {
+    case ScalarType::Byte:
+      return sizeof(uint8_t);
     case ScalarType::Char:
       return sizeof(int8_t);
     case ScalarType::Short:
@@ -80,6 +83,8 @@ inline size_t elementSize(ScalarType t) {
 /// @return The name of the scalar type.
 inline const char* toString(ScalarType t) {
   switch (t) {
+    case ScalarType::Byte:
+      return "Byte";
     case ScalarType::Char:
       return "Char";
     case ScalarType::Short:
@@ -114,6 +119,7 @@ inline bool isFloatingType(ScalarType t) {
 /// @return true if the scalar type is integral, false otherwise.
 inline bool isIntegralType(ScalarType t, bool includeBool) {
   switch (t) {
+    case ScalarType::Byte:
     case ScalarType::Char:
     case ScalarType::Short:
     case ScalarType::Int:
@@ -138,6 +144,7 @@ inline bool isBoolType(ScalarType t) {
 /// @return true if the scalar type is valid, false otherwise.
 inline bool isValidScalarType(ScalarType t) {
   switch (t) {
+    case ScalarType::Byte:
     case ScalarType::Char:
     case ScalarType::Short:
     case ScalarType::Int:
 
@@ -18,6 +18,7 @@ runtime.cxx_library(
         "runtime/delegate/ETCoreMLDefaultModelExecutor.mm",
         "runtime/delegate/ETCoreMLLogging.mm",
         "runtime/delegate/ETCoreMLModel.mm",
+        "runtime/delegate/ETCoreMLModelCache.mm",
         "runtime/delegate/ETCoreMLModelCompiler.mm",
         "runtime/delegate/ETCoreMLModelLoader.mm",
         "runtime/delegate/ETCoreMLModelManager.mm",
 
@@ -32,6 +32,7 @@ set(DELEGATE_SOURCES
     runtime/delegate/ETCoreMLAsset.mm
     runtime/delegate/ETCoreMLAssetManager.mm
     runtime/delegate/ETCoreMLDefaultModelExecutor.mm
+    runtime/delegate/ETCoreMLModelCache.mm
     runtime/delegate/ETCoreMLModelLoader.mm
     runtime/delegate/ETCoreMLModelCompiler.mm
     runtime/delegate/ETCoreMLLogging.mm
 
@@ -0,0 +1,141 @@
+//
+// ETCoreMLModelCache.h
+//
+// Copyright © 2024 Apple Inc. All rights reserved.
+//
+// Please refer to the license found in the LICENSE file in the root directory of the source tree.
+
+#import <Foundation/Foundation.h>
+
+#import "ETCoreMLCacheProtocol.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+extern NSErrorDomain const ETCoreMLModelCacheErrorDomain; // Error domain that the ETCoreMLModelCacheErrorCode values belong to.
+
+typedef NS_ERROR_ENUM(ETCoreMLModelCacheErrorDomain, ETCoreMLModelCacheErrorCode) { // NSInteger-backed codes bound to the domain above.
+    ETCoreMLModelCacheErrorCodeUnknown = 0,
+    ETCoreMLModelCacheErrorCodeInitializationFailed = 1,
+    ETCoreMLModelCacheErrorCodeInvalidIdentifier = 2,
+    ETCoreMLModelCacheErrorCodeSourceNotFound = 3,
+    ETCoreMLModelCacheErrorCodeDiskFull = 4, // Out of space; -storeModelAtURL:withIdentifier:error: callers may special-case this.
+    ETCoreMLModelCacheErrorCodeIOError = 5,
+    ETCoreMLModelCacheErrorCodeCorruptedCache = 6,
+};
+
+/// A simplified, filesystem-based cache for compiled CoreML models.
+///
+/// This class provides a cache implementation that stores compiled models as directories
+/// in a versioned cache structure. It uses atomic writes (rename) to ensure cache integrity
+/// even in the presence of crashes or concurrent access.
+///
+/// Directory structure:
+/// ```
+/// cache_root/
+/// ├── version.txt                         (cache format version)
+/// ├── models/
+/// │   ├── {identifier}.mlmodelc/          (compiled model bundle)
+/// │   ├── {identifier}.accessed           (last access time for LRU eviction)
+/// │   └── ...
+/// └── temp/
+///     └── {uuid}/                         (mlpackage files awaiting compilation)
+/// ```
+///
+/// ## Thread Safety and Concurrency Guarantees
+///
+/// This class provides **NO internal synchronization**. It is designed to be used in one of
+/// two ways:
+///
+/// 1. **Single-threaded access**: All calls to a single instance from one thread/queue.
+///
+/// 2. **External serialization**: When used via `ETCoreMLModelManager`, access is serialized
+///    by the manager's per-identifier loading queue. This is the expected usage pattern.
+///
+/// **Multi-process safety** is provided by:
+/// - Atomic filesystem operations (`rename()`)
+/// - Unique temp paths (UUID-based) to avoid conflicts
+/// - "Last writer wins" semantics (acceptable since all writers produce identical output)
+///
+/// **Multiple instances** pointing to the same directory are safe because:
+/// - Each write uses a unique temp path
+/// - Final placement uses atomic `moveItemAtURL:toURL:error:` (POSIX `rename()`)
+/// - Concurrent writes result in "last writer wins" (both write identical data)
+/// - Cleanup only targets entries older than 24 hours
+///
+/// **Callers are responsible for**:
+/// - Handling `MLModel` load failures gracefully (cache entry may be replaced/deleted
+///   between URL retrieval and model load)
+/// - Not relying on returned URLs remaining valid indefinitely
+@interface ETCoreMLModelCache : NSObject <ETCoreMLCache>
+
+- (instancetype)init NS_UNAVAILABLE; // Use -initWithCacheRootDirectory: (the designated initializer) instead.
++ (instancetype)new NS_UNAVAILABLE; // Use -initWithCacheRootDirectory: (the designated initializer) instead.
+
+/// The root directory for all cache data (contains models/, temp/, version.txt).
+@property (nonatomic, readonly) NSURL* cacheRootDirectory;
+
+/// Whether the cache was initialized successfully and is ready for use.
+/// If NO, all operations will fail. Check this after initialization.
+@property (nonatomic, readonly, getter=isReady) BOOL ready;
+
+/// If `ready` is NO, this contains the error that occurred during initialization.
+@property (nonatomic, readonly, nullable) NSError* initializationError;
+
+/// Initializes the cache with the given root directory.
+/// Creates the directory structure if it doesn't exist.
+/// Check the `ready` property after initialization to verify success.
+/// If initialization fails, `initializationError` will contain the reason.
+///
+/// @param cacheRootDirectory The root directory for all cache data.
+- (instancetype)initWithCacheRootDirectory:(NSURL*)cacheRootDirectory NS_DESIGNATED_INITIALIZER;
+
+/// Returns the URL of a cached model if it exists and is valid, otherwise nil.
+///
+/// @param identifier The unique identifier for the cached model.
+/// @param error On failure, error is filled with the failure information.
+/// @return The URL to the cached model bundle, or nil if not found or invalid.
+///
+/// @warning The returned URL may become invalid before the caller uses it if another
+/// process deletes or replaces the cached model. Callers MUST handle MLModel load
+/// failures gracefully by treating them as cache misses and recompiling.
+- (nullable NSURL*)cachedModelURLForIdentifier:(NSString*)identifier error:(NSError**)error;
+
+/// Stores a compiled model in the cache. Returns the cached URL on success.
+///
+/// @param compiledModelURL The URL of the compiled model bundle to cache. Must exist.
+/// @param identifier The unique identifier for this model. Must not contain '/' or '..'.
+/// @param error On failure, contains the error. Check for ETCoreMLModelCacheErrorCodeDiskFull
+///              to handle out-of-space conditions specially.
+/// @return The URL of the cached model, or nil on failure.
+- (nullable NSURL*)storeModelAtURL:(NSURL*)compiledModelURL withIdentifier:(NSString*)identifier error:(NSError**)error;
+
+/// Removes a specific cached model. This is a best-effort operation that removes
+/// the model bundle and the access-time file for the given identifier.
+///
+/// @param identifier The unique identifier for the cached model to remove.
+/// @param error On failure, error is filled with the failure information.
+/// @return YES on success (including if the model didn't exist), NO on validation errors.
+- (BOOL)removeCachedModelWithIdentifier:(NSString*)identifier error:(NSError**)error;
+
+/// Clears the entire cache, including all cached models.
+/// Recreates the empty directory structure after clearing.
+///
+/// @param error On failure, error is filled with the failure information.
+/// @return YES if the cache was purged successfully, otherwise NO.
+- (BOOL)purgeAndReturnError:(NSError**)error;
+
+#pragma mark - Temp Directory (for mlpackage extraction before compilation)
+
+/// Returns a temp URL where an mlpackage can be extracted before compilation.
+/// The caller is responsible for cleaning up this directory after compilation completes.
+///
+/// @param error On failure, error is filled with the failure information.
+/// @return A temp URL where the mlpackage can be extracted, or nil on failure.
+///
+/// @note The temp URL is unique and includes a UUID to avoid conflicts.
+/// @note Temp entries are automatically cleaned up after 24 hours if not removed.
+- (nullable NSURL*)temporaryDirectoryWithError:(NSError**)error;
+
+@end
+
+NS_ASSUME_NONNULL_END
Original file line number	Diff line number	Diff line change
`@@ -134,6 +134,10 @@ int32_t aoti_torch_dtype_int8() {`
`134`	`134`	`return 1; // ScalarType::Char`
`135`	`135`	`}`
`136`	`136`
	`137`	`+int32_t aoti_torch_dtype_uint8() {`
	`138`	`+ return 0; // ScalarType::Byte`
	`139`	`+}`
	`140`	`+`
`137`	`141`	`int32_t aoti_torch_dtype_bool() {`
`138`	`142`	`return 11; // ScalarType::Bool`
`139`	`143`	`}`