up

metascroy · metascroy · commit 984de055ce9b · 2026-04-07T15:49:23.000-07:00
diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml
@@ -181,7 +181,7 @@ jobs:
 
         echo "::group::Install Voxtral requirements"
         ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0"
-        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        ${CONDA_RUN} python -c "from huggingface_hub import login; login(token='$SECRET_EXECUTORCH_HF_TOKEN')"
         ${CONDA_RUN} pip install mistral_common librosa soundfile datasets
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         ${CONDA_RUN} pip install "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
@@ -240,13 +240,13 @@ jobs:
 
         echo "::group::Install Voxtral Realtime requirements"
         ${CONDA_RUN} pip install -U "huggingface_hub[cli]" safetensors
-        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        ${CONDA_RUN} python -c "from huggingface_hub import login; login(token='$SECRET_EXECUTORCH_HF_TOKEN')"
         echo "::endgroup::"
 
         ${CONDA_RUN} pip list
 
         echo "::group::Download model"
-        ${CONDA_RUN} huggingface-cli download mistralai/Voxtral-Mini-4B-Realtime-2602
+        ${CONDA_RUN} python -c "from huggingface_hub import snapshot_download; snapshot_download('mistralai/Voxtral-Mini-4B-Realtime-2602')"
         MODEL_PATH=$(${CONDA_RUN} python -c "from huggingface_hub import snapshot_download; print(snapshot_download('mistralai/Voxtral-Mini-4B-Realtime-2602'))")
         echo "Model path: ${MODEL_PATH}"
         echo "::endgroup::"
@@ -313,7 +313,7 @@ jobs:
 
         echo "::group::Install Whisper requirements"
         ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0"
-        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        ${CONDA_RUN} python -c "from huggingface_hub import login; login(token='$SECRET_EXECUTORCH_HF_TOKEN')"
         ${CONDA_RUN} pip install transformers soundfile datasets librosa
         echo "::endgroup::"
 
@@ -447,7 +447,7 @@ jobs:
 
         echo "::group::Install LLM requirements"
         ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0"
-        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        ${CONDA_RUN} python -c "from huggingface_hub import login; login(token='$SECRET_EXECUTORCH_HF_TOKEN')"
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         ${CONDA_RUN} pip install transformers "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
         echo "::endgroup::"
diff --git a/backends/mlx/examples/llm/README.md b/backends/mlx/examples/llm/README.md
@@ -44,14 +44,14 @@ python -m executorch.backends.mlx.examples.llm.export_llm_hf \
     --use-custom-sdpa \
     --use-custom-kv-cache
 
-# With INT4 quantization
+# With 4-bit quantization
 python -m executorch.backends.mlx.examples.llm.export_llm_hf \
     --model-id "unsloth/Llama-3.2-1B-Instruct" \
     --output llama_hf_int4.pte \
     --use-custom-sdpa \
     --use-custom-kv-cache \
-    --quantize-linear int4 \
-    --quantize-embeddings int4
+    --qlinear 4w \
+    --qembedding 4w
 ```
 
 ### Options
@@ -62,8 +62,8 @@ python -m executorch.backends.mlx.examples.llm.export_llm_hf \
 | `--output` | *(required)* | Output .pte file path |
 | `--max-seq-len` | `1024` | Maximum sequence length for KV cache |
 | `--dtype` | `bf16` | Model dtype (`fp32`, `fp16`, `bf16`) |
-| `--quantize-linear` | None | Quantization for linear layers (`int4`, `int8`) |
-| `--quantize-embeddings` | None | Quantization for embedding layers (`int4`, `int8`) |
+| `--qlinear` | None | Quantization for linear layers (`4w`, `8w`, `nvfp4`) |
+| `--qembedding` | None | Quantization for embedding layers (`4w`, `8w`, `nvfp4`) |
 | `--no-tie-word-embeddings` | `False` | Disable re-tying lm_head to embedding after quantization |
 | `--use-custom-sdpa` | `False` | Use MLX custom SDPA (`mlx::custom_sdpa`) |
 | `--use-custom-kv-cache` | `False` | Use MLX custom KV cache (`mlx::kv_cache_update`) |
diff --git a/backends/mlx/examples/whisper/README.md b/backends/mlx/examples/whisper/README.md
@@ -34,6 +34,10 @@ python -m executorch.backends.mlx.examples.whisper.export_whisper \
 | `--output-dir` | `whisper_mlx` | Output directory for `.pte` files |
 | `--max-decoder-seq-len` | `256` | Maximum decoder sequence length |
 | `--dtype` | `bf16` | Model dtype (`fp32`, `fp16`, `bf16`) |
+| `--qlinear` | None | Quantization for linear layers (`4w`, `8w`, `nvfp4`) |
+| `--qembedding` | None | Quantization for embedding layers (`4w`, `8w`, `nvfp4`) |
+| `--qlinear-group-size` | auto | Group size for linear quantization |
+| `--qembedding-group-size` | auto | Group size for embedding quantization |
 
 
 ## Run
diff --git a/examples/models/voxtral_realtime/model.py b/examples/models/voxtral_realtime/model.py
@@ -538,11 +538,12 @@ def forward(
         return y.view(bsz, seqlen, self.dim)
 
 
-class MLXKVCache(nn.Module):
-    """Wrapper that adapts MLX BHSD KV cache for model's BSHD convention.
+class MLXStaticKVCache(nn.Module):
+    """Wrapper that adapts MLX static KV cache for the model's BSHD convention.
 
-    The model's QKV projections produce [B, S, H, D] tensors, but MLX's
-    KVCache expects [B, H, S, D]. This wrapper transposes on the way in.
+    For offline (non-streaming) mode. The model's QKV projections produce
+    [B, S, H, D] tensors, but MLX's KVCache expects [B, H, S, D].
+    This wrapper transposes on the way in.
     """
 
     def __init__(
@@ -569,12 +570,13 @@ def update(
         return self.cache.update(input_pos, k_val, v_val)
 
 
-class MLXEncoderRingKVCache(nn.Module):
-    """Wrapper that adapts MLX RingBufferKVCache for the encoder's BSHD convention.
+class MLXRingKVCache(nn.Module):
+    """Wrapper that adapts MLX RingBufferKVCache for the model's BSHD convention.
 
-    The encoder's QKV projections produce [B, S, H, D] tensors, but MLX's
-    RingBufferKVCache expects [B, H, S, D]. This wrapper transposes on the
-    way in and delegates ring buffer semantics to the MLX implementation.
+    For streaming mode (both encoder and decoder). The model's QKV projections
+    produce [B, S, H, D] tensors, but MLX's RingBufferKVCache expects
+    [B, H, S, D]. This wrapper transposes on the way in and delegates
+    ring buffer semantics to the MLX implementation.
     """
 
     def __init__(
@@ -603,7 +605,9 @@ def update(
         v_val = v_val.transpose(1, 2)
         return self.ring_cache.update(input_pos, k_val, v_val)
 
-    def create_causal_mask(self, start_pos, seq_len, bool_mask=False) -> torch.Tensor:
+    def create_causal_mask(
+        self, start_pos, seq_len, bool_mask=False, **kwargs
+    ) -> torch.Tensor:
         return self.ring_cache.create_sliding_window_mask(start_pos, seq_len)
 
 
@@ -637,9 +641,10 @@ def forward(
         return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)
 
 
-class MLXEncoderSDPA(nn.Module):
-    """SDPA for streaming encoder with MLX ring buffer KV cache.
+class MLXMaskedSDPA(nn.Module):
+    """SDPA with explicit mask for MLX ring buffer KV cache.
 
+    Used with MLXRingKVCache for streaming mode (both encoder and decoder).
     Uses F.scaled_dot_product_attention with explicit attn_mask from the
     ring buffer. KV cache is in BHSD layout, queries are in BSHD.
     """
@@ -662,7 +667,7 @@ def forward(
         Args:
             input_pos: (seq_len,) position indices (unused, kept for interface).
             q: (B, seq_len, n_heads, head_dim) in BSHD layout.
-            k, v: (B, n_heads, buf_size, head_dim) in BHSD from MLXEncoderRingKVCache.
+            k, v: (B, n_heads, buf_size, head_dim) in BHSD from MLXRingKVCache.
             bsz, seqlen: batch size and query length.
             mask: (1, 1, seq_len, buf_size) additive attention mask from ring buffer.
         """
@@ -699,7 +704,7 @@ def __init__(self, config: VoxtralRealtimeConfig):
             # Ring buffer KV cache for unlimited streaming.
             if self.backend == "mlx":
                 cache_dtype = self.wq.weight.dtype
-                self.kv_cache = MLXKVCache(
+                self.kv_cache = MLXRingKVCache(
                     config.sliding_window,
                     self.n_kv_heads,
                     self.head_dim,
@@ -723,7 +728,16 @@ def __init__(self, config: VoxtralRealtimeConfig):
                 self.sdpa = SDPA(self.n_heads, self.head_dim)
         else:
             # Flat KV cache for offline mode (capped at max_seq_len).
-            if self.backend == "metal":
+            if self.backend == "mlx":
+                cache_dtype = self.wq.weight.dtype
+                self.kv_cache = MLXStaticKVCache(
+                    config.max_seq_len,
+                    self.n_kv_heads,
+                    self.head_dim,
+                    dtype=cache_dtype,
+                )
+                self.sdpa = MLXSDPA(self.n_heads, self.head_dim)
+            elif self.backend == "metal":
                 self.kv_cache = StaticKVCache(
                     config.max_seq_len, self.n_kv_heads, self.head_dim
                 )
@@ -1160,7 +1174,7 @@ def __init__(self, model: VoxtralRealtimeModel, max_enc_len: int = 750):
             cache_dtype = self.layers[0].attention.wq.weight.dtype
             self.kv_caches = nn.ModuleList(
                 [
-                    MLXEncoderRingKVCache(
+                    MLXRingKVCache(
                         max_enc_len,
                         config.enc_n_heads,
                         config.enc_head_dim,
@@ -1169,7 +1183,7 @@ def __init__(self, model: VoxtralRealtimeModel, max_enc_len: int = 750):
                     for _ in range(config.enc_n_layers)
                 ]
             )
-            self.sdpa = MLXEncoderSDPA(config.enc_n_heads, config.enc_head_dim)
+            self.sdpa = MLXMaskedSDPA(config.enc_n_heads, config.enc_head_dim)
         elif config.backend == "metal":
             self.kv_caches = nn.ModuleList(
                 [