Commit 065b50e

Clean up Gemma 4 MLX path for macOS 15 CI
1 parent 719d2e8 commit 065b50e

4 files changed

Lines changed: 33 additions & 144 deletions

.github/workflows/mlx.yml

Lines changed: 9 additions & 8 deletions
@@ -489,17 +489,25 @@ jobs:
             name: "gemma3-1b"
         use-custom: [false, true]
         qconfig: ["4w", "nvfp4"]
+        runner: ["macos-14-xlarge"]
         include:
           - model:
               id: "google/gemma-4-E2B-it"
               name: "gemma4-e2b"
             use-custom: true
             qconfig: "4w"
+            runner: "macos-15-xlarge"
+          - model:
+              id: "google/gemma-4-E2B-it"
+              name: "gemma4-e2b"
+            use-custom: false
+            qconfig: "4w"
+            runner: "macos-15-xlarge"
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     secrets: inherit
     with:
       job-name: test-mlx-llm-${{ matrix.model.name }}${{ matrix.use-custom && '-custom' || '' }}-${{ matrix.qconfig }}
-      runner: macos-14-xlarge
+      runner: ${{ matrix.runner }}
       python-version: "3.12"
       submodules: recursive
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
@@ -512,11 +520,6 @@ jobs:
           MODEL_NAME="${{ matrix.model.name }}"
           USE_CUSTOM="${{ matrix.use-custom }}"
           QCONFIG="${{ matrix.qconfig }}"
-          MODEL_REVISION=""
-          if [ "${MODEL_ID}" = "google/gemma-4-E2B-it" ]; then
-            MODEL_REVISION="b4a601102c3d45e2b7b50e2057a6d5ec8ed4adcf"
-          fi
-
           CUSTOM_ARGS=""
           if [ "${USE_CUSTOM}" = "true" ]; then
            CUSTOM_ARGS="--use-custom-sdpa --use-custom-kv-cache"
@@ -551,7 +554,6 @@ jobs:
          echo "::group::Export ${MODEL_NAME}"
          ${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.export_llm_hf \
            --model-id "${MODEL_ID}" \
-           ${MODEL_REVISION:+--revision "${MODEL_REVISION}"} \
            --output /tmp/${MODEL_NAME}.pte \
            --qlinear ${QCONFIG} \
            ${QEMBEDDING_ARGS} \
@@ -562,7 +564,6 @@ jobs:
          OUTPUT=$(${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.run_llm_hf \
            --pte /tmp/${MODEL_NAME}.pte \
            --model-id "${MODEL_ID}" \
-           ${MODEL_REVISION:+--revision "${MODEL_REVISION}"} \
            --prompt "What is the capital of France?" \
            --max-new-tokens 50 2>&1)
          echo "$OUTPUT"
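For intuition, here is a minimal sketch in plain Python (not GitHub's actual matrix engine) of how the updated strategy expands: the base axes cross-multiply onto the shared macos-14-xlarge runner, while each `include` entry contributes a standalone combination pinned to macos-15-xlarge, and the job consumes whichever value lands in `matrix.runner`. Real `include` semantics also merge into matching base combinations, which this sketch simplifies away.

```python
# Sketch of the matrix expansion above; values are taken from the workflow diff.
from itertools import product

base = {
    "model": ["gemma3-1b"],
    "use-custom": [False, True],
    "qconfig": ["4w", "nvfp4"],
    "runner": ["macos-14-xlarge"],
}
include = [
    {"model": "gemma4-e2b", "use-custom": True, "qconfig": "4w", "runner": "macos-15-xlarge"},
    {"model": "gemma4-e2b", "use-custom": False, "qconfig": "4w", "runner": "macos-15-xlarge"},
]

# Cartesian product of the base axes, then the standalone include entries.
jobs = [dict(zip(base, values)) for values in product(*base.values())] + include
for job in jobs:
    print(f"{job['model']} qconfig={job['qconfig']} -> runs on {job['runner']}")
```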

backends/mlx/builder/program_builder.py

Lines changed: 16 additions & 40 deletions
@@ -444,50 +444,26 @@ def _make_io_slots(self): # noqa: C901
             else:
                 raise NotImplementedError(f"Support for input {arg} is not implemented")

-        placeholder_nodes = {
-            node.name: node for node in self.ep.graph.nodes if node.op == "placeholder"
-        }
-
-        # Allocate placeholder-backed slots in graph-signature order instead of
-        # raw FX node traversal order. This keeps lifted constant tids stable
-        # across equivalent exports, which matters for models like Gemma 4 that
-        # carry multiple rotary constant placeholders with similar structure.
-        for name in constant_tensors:
-            node = placeholder_nodes.get(name)
-            if node is None or node.users == {}:
-                continue
-            self.make_or_get_slot(node, id_space=IdSpace.Constant)
-
-        for name in user_inputs:
-            node = placeholder_nodes.get(name)
-            if node is None or node.users == {}:
-                continue
-            val = node.meta.get("val", None)
-            if isinstance(val, torch.Tensor) and not val.is_contiguous():
-                raise ValueError(
-                    f"MLX backend requires contiguous input tensors, "
-                    f"but input '{node.name}' has non-contiguous strides. "
-                    f"shape={list(val.shape)}, stride={list(val.stride())}. "
-                    f"Ensure example inputs passed to torch.export.export() "
-                    f"are contiguous (call .contiguous() on them)."
-                )
-            self.make_or_get_slot(node, id_space=IdSpace.Input)
-
-        for name in mutable_buffers:
-            node = placeholder_nodes.get(name)
-            if node is None or node.users == {}:
-                continue
-            self.make_or_get_slot(node, id_space=IdSpace.MutableBuffer)
-
-        classified_placeholders = (
-            set(constant_tensors) | set(user_inputs) | set(mutable_buffers)
-        )
-
         for node in self.ep.graph.nodes:
             if node.op == "placeholder":
                 if node.users == {}:
                     continue
-                if node.name not in classified_placeholders:
+                if node.name in constant_tensors:
+                    self.make_or_get_slot(node, id_space=IdSpace.Constant)
+                elif node.name in user_inputs:
+                    val = node.meta.get("val", None)
+                    if isinstance(val, torch.Tensor) and not val.is_contiguous():
+                        raise ValueError(
+                            f"MLX backend requires contiguous input tensors, "
+                            f"but input '{node.name}' has non-contiguous strides. "
+                            f"shape={list(val.shape)}, stride={list(val.stride())}. "
+                            f"Ensure example inputs passed to torch.export.export() "
+                            f"are contiguous (call .contiguous() on them)."
+                        )
+                    self.make_or_get_slot(node, id_space=IdSpace.Input)
+                elif node.name in mutable_buffers:
+                    self.make_or_get_slot(node, id_space=IdSpace.MutableBuffer)
+                else:
                     raise NotImplementedError(
                         f"Support for placeholder {node.name} is not implemented"
                     )
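The revert restores single-pass classification: placeholders are visited in FX graph order and bucketed by the graph-signature name sets, instead of being pre-allocated in signature order. A self-contained sketch of that pattern, with a dict standing in for `make_or_get_slot` bookkeeping and hypothetical name sets, looks like this:

```python
# Sketch of the single-pass placeholder classification restored above.
# `constant_tensors`, `user_inputs`, and `mutable_buffers` stand in for the
# name sets derived from the exported program's graph signature.
import torch

def classify_placeholders(ep, constant_tensors, user_inputs, mutable_buffers):
    slots = {}  # stand-in for the builder's slot bookkeeping
    for node in ep.graph.nodes:
        if node.op != "placeholder" or not node.users:
            continue  # skip non-placeholders and unused inputs
        if node.name in constant_tensors:
            slots[node.name] = "constant"
        elif node.name in user_inputs:
            val = node.meta.get("val", None)
            # Mirror the backend's contiguity check for user inputs.
            if isinstance(val, torch.Tensor) and not val.is_contiguous():
                raise ValueError(f"input '{node.name}' must be contiguous")
            slots[node.name] = "input"
        elif node.name in mutable_buffers:
            slots[node.name] = "mutable_buffer"
        else:
            raise NotImplementedError(f"placeholder {node.name}")
    return slots
```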

backends/mlx/examples/llm/README.md

Lines changed: 0 additions & 2 deletions
@@ -57,7 +57,6 @@ python -m executorch.backends.mlx.examples.llm.export_llm_hf \
 # Gemma 4 text-only export
 python -m executorch.backends.mlx.examples.llm.export_llm_hf \
     --model-id "google/gemma-4-E2B-it" \
-    --revision "b4a601102c3d45e2b7b50e2057a6d5ec8ed4adcf" \
     --output gemma4_hf_int4.pte \
     --use-custom-sdpa \
     --use-custom-kv-cache \
@@ -109,7 +108,6 @@ Validated Gemma 4 run command:
 python -m executorch.backends.mlx.examples.llm.run_llm_hf \
     --pte gemma4_hf_int4.pte \
     --model-id google/gemma-4-E2B-it \
-    --revision b4a601102c3d45e2b7b50e2057a6d5ec8ed4adcf \
     --prompt "What is the capital of France?" \
     --max-new-tokens 50
 ```

backends/mlx/examples/llm/export_llm_hf.py

Lines changed: 8 additions & 94 deletions
@@ -47,53 +47,6 @@
 logging.basicConfig(level=logging.INFO, format=FORMAT)
 logger = logging.getLogger(__name__)

-_GEMMA4_MODEL_ID = "google/gemma-4-E2B-it"
-_GEMMA4_PROBLEM_LAYER_FQN = "model.language_model.layers.22.mlp.down_proj"
-
-
-def _get_submodule_by_fqn(root: torch.nn.Module, fqn: str) -> torch.nn.Module:
-    cur = root
-    for part in fqn.split("."):
-        if part.isdigit():
-            cur = cur[int(part)]  # type: ignore[index]
-        else:
-            cur = getattr(cur, part)
-    return cur
-
-
-def _capture_gemma4_float_fallback_weight(
-    model_id: str,
-    qlinear: Optional[str],
-    model: torch.nn.Module,
-) -> Optional[torch.Tensor]:
-    if model_id != _GEMMA4_MODEL_ID or qlinear != "4w":
-        return None
-
-    layer = _get_submodule_by_fqn(model, _GEMMA4_PROBLEM_LAYER_FQN)
-    weight = layer.weight.detach().clone()
-    logger.info(
-        "Saving %s in floating point to avoid the current Gemma 4 4w mismatch",
-        _GEMMA4_PROBLEM_LAYER_FQN,
-    )
-    return weight
-
-
-def _restore_gemma4_float_fallback_weight(
-    model_id: str,
-    qlinear: Optional[str],
-    model: torch.nn.Module,
-    weight: Optional[torch.Tensor],
-) -> None:
-    if weight is None or model_id != _GEMMA4_MODEL_ID or qlinear != "4w":
-        return
-
-    layer = _get_submodule_by_fqn(model, _GEMMA4_PROBLEM_LAYER_FQN)
-    layer.weight = torch.nn.Parameter(weight, requires_grad=False)
-    logger.info(
-        "Restored %s in floating point after quantization",
-        _GEMMA4_PROBLEM_LAYER_FQN,
-    )
-

 def _export_with_optimum(
     model_id: str,
@@ -128,10 +81,6 @@ def _export_with_optimum(

     from executorch.backends.mlx.llm.quantization import quantize_model_

-    gemma4_float_weight = _capture_gemma4_float_fallback_weight(
-        model_id, qlinear, exportable.model
-    )
-
     quantize_model_(
         exportable.model,
         qlinear_config=qlinear,
@@ -143,9 +92,6 @@ def _export_with_optimum(
         )
         and not no_tie_word_embeddings,
     )
-    _restore_gemma4_float_fallback_weight(
-        model_id, qlinear, exportable.model, gemma4_float_weight
-    )

     logger.info("Exporting model with torch.export...")
     exported_progs = exportable.export()
@@ -215,24 +161,13 @@ def _export_with_custom_components(
     }
     torch_dtype = torch_dtype_map.get(dtype, torch.bfloat16)

-    effective_use_custom_sdpa = use_custom_sdpa
-    effective_use_custom_kv_cache = use_custom_kv_cache
-    if model_id == _GEMMA4_MODEL_ID and use_custom_sdpa:
-        logger.info(
-            "Disabling custom SDPA for Gemma 4 while keeping the custom cache path"
-        )
-        effective_use_custom_sdpa = False
-    if model_id == _GEMMA4_MODEL_ID and use_custom_kv_cache:
-        logger.info("Disabling custom KV cache for Gemma 4")
-        effective_use_custom_kv_cache = False
-
-    if effective_use_custom_sdpa:
+    if use_custom_sdpa:
         from executorch.backends.mlx.llm.hf_attention import register_mlx_attention

         register_mlx_attention()
         logger.info("Registered MLX custom SDPA attention")

-    attn_implementation = "mlx" if effective_use_custom_sdpa else None
+    attn_implementation = "mlx" if use_custom_sdpa else None

     logger.info(f"Loading HuggingFace model: {model_id}")
     load_kwargs = {
@@ -292,7 +227,7 @@ def _export_with_custom_components(
         max_cache_len=effective_cache_len,
     )

-    if effective_use_custom_kv_cache:
+    if use_custom_kv_cache:
         from executorch.backends.mlx.llm.source_transformation import (
             replace_hf_cache_with_mlx,
         )
@@ -316,10 +251,6 @@

     from executorch.backends.mlx.llm.quantization import quantize_model_

-    gemma4_float_weight = _capture_gemma4_float_fallback_weight(
-        model_id, qlinear, exportable.model
-    )
-
     quantize_model_(
         exportable.model,
         qlinear_config=qlinear,
@@ -329,9 +260,6 @@ def _export_with_custom_components(
         tie_word_embeddings=getattr(model.config, "tie_word_embeddings", False)
         and not no_tie_word_embeddings,
     )
-    _restore_gemma4_float_fallback_weight(
-        model_id, qlinear, exportable.model, gemma4_float_weight
-    )

     logger.info("Exporting model with torch.export...")
     seq_length = 3
@@ -421,24 +349,10 @@ def export_llama_hf(
         use_custom_sdpa: Use MLX custom SDPA (mlx::custom_sdpa)
         use_custom_kv_cache: Use MLX custom KV cache (mlx::kv_cache_update)
     """
-    effective_use_custom_sdpa = use_custom_sdpa
-    effective_use_custom_kv_cache = use_custom_kv_cache
-    if model_id == _GEMMA4_MODEL_ID:
-        if effective_use_custom_sdpa:
-            logger.info(
-                "Disabling custom SDPA for Gemma 4 and falling back to the baseline export path"
-            )
-            effective_use_custom_sdpa = False
-        if effective_use_custom_kv_cache:
-            logger.info(
-                "Disabling custom KV cache for Gemma 4 and falling back to the baseline export path"
-            )
-            effective_use_custom_kv_cache = False
-
-    if effective_use_custom_sdpa or effective_use_custom_kv_cache:
+    if use_custom_sdpa or use_custom_kv_cache:
         logger.info(
-            f"Using custom components: sdpa={effective_use_custom_sdpa}, "
-            f"kv_cache={effective_use_custom_kv_cache}"
+            f"Using custom components: sdpa={use_custom_sdpa}, "
+            f"kv_cache={use_custom_kv_cache}"
         )
         _export_with_custom_components(
             model_id=model_id,
@@ -448,8 +362,8 @@ def export_llama_hf(
             dtype=dtype,
             qlinear=qlinear,
             qembedding=qembedding,
-            use_custom_sdpa=effective_use_custom_sdpa,
-            use_custom_kv_cache=effective_use_custom_kv_cache,
+            use_custom_sdpa=use_custom_sdpa,
+            use_custom_kv_cache=use_custom_kv_cache,
             no_tie_word_embeddings=no_tie_word_embeddings,
             qlinear_group_size=qlinear_group_size,
             qembedding_group_size=qembedding_group_size,
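The deleted capture/restore pair implemented a "keep one layer in float" escape hatch around quantization. If such a fallback is ever needed again, note that `torch.nn.Module.get_submodule` already resolves dotted FQNs, including numeric `ModuleList` indices, so the hand-rolled `_get_submodule_by_fqn` walker is unnecessary. A minimal sketch under that assumption, with `quantize_with_float_fallback` and `quantize_fn` as hypothetical names:

```python
# Minimal sketch of a float-fallback around quantization using the built-in
# torch.nn.Module.get_submodule instead of a hand-rolled FQN walker.
import torch

def quantize_with_float_fallback(model: torch.nn.Module, fqn: str, quantize_fn) -> None:
    layer = model.get_submodule(fqn)       # resolves e.g. "layers.22.mlp.down_proj"
    saved = layer.weight.detach().clone()  # capture the float weight
    quantize_fn(model)                     # quantize the whole model
    layer = model.get_submodule(fqn)       # re-resolve in case modules were swapped
    layer.weight = torch.nn.Parameter(saved, requires_grad=False)  # restore in float
```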
