Commit ca37250

Pin Gemma 4 MLX flow to validated model revision
1 parent ee272c3 commit ca37250
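
Pins the Hugging Face download of google/gemma-4-E2B-it to commit b4a601102c3d45e2b7b50e2057a6d5ec8ed4adcf across the MLX LLM example flow: the export and run scripts gain an optional --revision flag, and the CI workflow and README commands pass the validated commit for Gemma 4.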

4 files changed: 35 additions & 4 deletions

.github/workflows/mlx.yml

Lines changed: 6 additions & 0 deletions
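
The CI job computes MODEL_REVISION for the Gemma 4 matrix entry only and forwards it to both the export and run steps; every other model keeps an empty revision and an unchanged command line.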
@@ -512,6 +512,10 @@ jobs:
           MODEL_NAME="${{ matrix.model.name }}"
           USE_CUSTOM="${{ matrix.use-custom }}"
           QCONFIG="${{ matrix.qconfig }}"
+          MODEL_REVISION=""
+          if [ "${MODEL_ID}" = "google/gemma-4-E2B-it" ]; then
+            MODEL_REVISION="b4a601102c3d45e2b7b50e2057a6d5ec8ed4adcf"
+          fi

           CUSTOM_ARGS=""
           if [ "${USE_CUSTOM}" = "true" ]; then

@@ -547,6 +551,7 @@ jobs:
           echo "::group::Export ${MODEL_NAME}"
           ${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.export_llm_hf \
             --model-id "${MODEL_ID}" \
+            ${MODEL_REVISION:+--revision "${MODEL_REVISION}"} \
             --output /tmp/${MODEL_NAME}.pte \
             --qlinear ${QCONFIG} \
             ${QEMBEDDING_ARGS} \

@@ -557,6 +562,7 @@
           OUTPUT=$(${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.run_llm_hf \
             --pte /tmp/${MODEL_NAME}.pte \
             --model-id "${MODEL_ID}" \
+            ${MODEL_REVISION:+--revision "${MODEL_REVISION}"} \
             --prompt "What is the capital of France?" \
             --max-new-tokens 50 2>&1)
           echo "$OUTPUT"

backends/mlx/examples/llm/README.md

Lines changed: 2 additions & 0 deletions
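
Both documented Gemma 4 commands now pin the checkpoint to the validated commit.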
@@ -57,6 +57,7 @@ python -m executorch.backends.mlx.examples.llm.export_llm_hf \
 # Gemma 4 text-only export
 python -m executorch.backends.mlx.examples.llm.export_llm_hf \
   --model-id "google/gemma-4-E2B-it" \
+  --revision "b4a601102c3d45e2b7b50e2057a6d5ec8ed4adcf" \
   --output gemma4_hf_int4.pte \
   --use-custom-sdpa \
   --use-custom-kv-cache \

@@ -108,6 +109,7 @@ Validated Gemma 4 run command:
 python -m executorch.backends.mlx.examples.llm.run_llm_hf \
   --pte gemma4_hf_int4.pte \
   --model-id google/gemma-4-E2B-it \
+  --revision b4a601102c3d45e2b7b50e2057a6d5ec8ed4adcf \
   --prompt "What is the capital of France?" \
   --max-new-tokens 50
 ```
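
On the Hugging Face Hub, `--revision` accepts a branch name, a tag, or a commit hash; only a full commit hash is immutable, so pinning to it keeps these commands reproducible even if the model repository's main branch moves.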

backends/mlx/examples/llm/export_llm_hf.py

Lines changed: 15 additions & 0 deletions
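
Both export paths gain the new parameter: the optimum-executorch pipeline forwards it to `load_causal_lm_model`, while the custom-components path adds it to the `from_pretrained` kwargs only when it is set.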
@@ -50,6 +50,7 @@

 def _export_with_optimum(
     model_id: str,
+    revision: Optional[str],
     output_path: str,
     max_seq_len: int,
     dtype: str,

@@ -73,6 +74,7 @@ def _export_with_optimum(
     logger.info(f"Loading model using optimum-executorch: {model_id}")
     exportable = load_causal_lm_model(
         model_id,
+        revision=revision,
         dtype=dtype_str,
         max_seq_len=max_seq_len,
     )

@@ -124,6 +126,7 @@ def _export_with_optimum(

 def _export_with_custom_components(
     model_id: str,
+    revision: Optional[str],
     output_path: str,
     max_seq_len: int,
     dtype: str,

@@ -171,6 +174,8 @@ def _export_with_custom_components(
         "torch_dtype": torch_dtype,
         "low_cpu_mem_usage": True,
     }
+    if revision is not None:
+        load_kwargs["revision"] = revision
     if attn_implementation:
         load_kwargs["attn_implementation"] = attn_implementation
     model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)

@@ -345,6 +350,7 @@ def _save_program(executorch_program, output_path: str) -> None:

 def export_llama_hf(
     model_id: str,
+    revision: Optional[str],
     output_path: str,
     max_seq_len: int = 1024,
     dtype: str = "bf16",

@@ -376,6 +382,7 @@ def export_llama_hf(
     )
     _export_with_custom_components(
         model_id=model_id,
+        revision=revision,
         output_path=output_path,
         max_seq_len=max_seq_len,
         dtype=dtype,

@@ -391,6 +398,7 @@
     logger.info("Using optimum-executorch pipeline (no custom components)")
     _export_with_optimum(
         model_id=model_id,
+        revision=revision,
         output_path=output_path,
         max_seq_len=max_seq_len,
         dtype=dtype,

@@ -412,6 +420,12 @@ def main():
         default="unsloth/Llama-3.2-1B-Instruct",
         help="HuggingFace model ID",
     )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        help="Optional HuggingFace model revision/commit to pin",
+    )
     parser.add_argument(
         "--output",
         type=str,

@@ -451,6 +465,7 @@ def main():

     export_llama_hf(
         model_id=args.model_id,
+        revision=args.revision,
         output_path=args.output,
         max_seq_len=args.max_seq_len,
         dtype=args.dtype,
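
The `revision` value is ultimately handled by the standard Hugging Face `from_pretrained` loaders. A minimal sketch of the pinned load, using identifiers from this diff and omitting the ExecuTorch export plumbing:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "google/gemma-4-E2B-it"
# Full commit SHA from this commit; unlike a branch or tag, it never moves.
REVISION = "b4a601102c3d45e2b7b50e2057a6d5ec8ed4adcf"

# Both loaders accept `revision`, so the tokenizer and the weights come
# from the same validated snapshot of the model repository.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, revision=REVISION)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, revision=REVISION)
```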

backends/mlx/examples/llm/run_llm_hf.py

Lines changed: 12 additions & 4 deletions
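
The runner threads the same `revision` through tokenizer and processor loading, so decoding uses the tokenizer files from the exact snapshot the model was exported against.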
@@ -47,7 +47,7 @@ def _get_max_input_seq_len(program) -> int:
     return sizes[1] if len(sizes) >= 2 else 1


-def _load_text_processor(model_id: str):
+def _load_text_processor(model_id: str, revision: str | None):
     """
     Load a text processor for the model.

@@ -58,13 +58,13 @@ def _load_text_processor(model_id: str):
     """
     logger.info(f"Loading tokenizer from HuggingFace: {model_id}...")
     try:
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
         return tokenizer, False
     except Exception as exc:
         logger.info(f"AutoTokenizer unavailable for {model_id}: {exc}")

     try:
-        processor = AutoProcessor.from_pretrained(model_id)
+        processor = AutoProcessor.from_pretrained(model_id, revision=revision)
         if hasattr(processor, "apply_chat_template") and hasattr(processor, "decode"):
             logger.info(f"Loaded processor from HuggingFace: {model_id}")
             return processor, True

@@ -101,11 +101,12 @@ def _get_eos_token_id(text_processor):
 def run_inference(
     pte_path: str,
     model_id: str,
+    revision: str | None,
     prompt: str,
     max_new_tokens: int = 50,
 ) -> str:
     """Run inference on the exported HuggingFace model."""
-    text_processor, uses_processor = _load_text_processor(model_id)
+    text_processor, uses_processor = _load_text_processor(model_id, revision)

     logger.info(f"Loading model from {pte_path}...")
     et_runtime = Runtime.get()

@@ -208,6 +209,12 @@ def main():
         default="unsloth/Llama-3.2-1B-Instruct",
         help="HuggingFace model ID (used to load tokenizer or processor)",
     )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        help="Optional HuggingFace model revision/commit to pin",
+    )
     parser.add_argument(
         "--prompt",
         type=str,

@@ -226,6 +233,7 @@
     generated_text = run_inference(
         pte_path=args.pte,
         model_id=args.model_id,
+        revision=args.revision,
         prompt=args.prompt,
         max_new_tokens=args.max_new_tokens,
     )
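
Going by the `run_inference` signature above, a programmatic equivalent of the validated README command could look like the following sketch (assuming the module is importable from an ExecuTorch checkout; the CLI shown in the README is the documented entry point):

```python
from executorch.backends.mlx.examples.llm.run_llm_hf import run_inference

# Mirrors the README's validated Gemma 4 run command, pinned revision included.
text = run_inference(
    pte_path="gemma4_hf_int4.pte",
    model_id="google/gemma-4-E2B-it",
    revision="b4a601102c3d45e2b7b50e2057a6d5ec8ed4adcf",
    prompt="What is the capital of France?",
    max_new_tokens=50,
)
print(text)
```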
