Add GGUF → MLX export support for Gemma 4 31B (pytorch#19829)

mergennachin · web-flow · commit 42581f1b0916 · 2026-05-28T17:44:19.000-04:00
Enable loading GGUF files (e.g. Q4_K_M) and exporting to the MLX backend. Four areas of change: GGUF loader (gguf_loader.py): - Add MLX backend support alongside CUDA - Keep embedding quantized for MLX (QuantizedEmbeddingHandler supports quantized gather natively, unlike CUDA's Int4Tensor) - Fix stale docstring references to Int4TilePackedTo4dTensor/tinygemm MLX backend (op_helpers.py, patterns.py): - Accept group_size=16 in parse_dequant_node for GGUF Q6_K tensors - For group_size < 32, emit DequantizeNode + TransposeNode + AddmmNode instead of QuantizedMatmulNode, since MLX Metal kernels are only instantiated for group_size >= 32. Weights stay packed as int8 in the .pte file and are dequantized on-device at runtime — same strategy CUDA/Inductor uses (separate Triton dequant + cuBLAS mm). This non-fused path is opt-in via ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS=1, which export.py sets for --gguf with --backend mlx. Packer (pack_mlx.py): - Add 16 to supported group sizes so Q6_K IntxUnpackedToInt8Tensor passes through to export unchanged Tests (test_ops.py): - Add group_size=16 configs for int8, int4, and no-bias variants Test Plan: Export and run this model https://huggingface.co/unsloth/gemma-4-31B-it-GGUF/blob/main/gemma-4-31B-it-Q4_K_M.gguf On M1 32GB machine (exported on Linux A100) ``` (executorch_dev) mnachin@mnachin-mbp executorch % ./cmake-out/examples/models/gemma4_31b/gemma4_31b_runner \ --model_path /Users/mnachin/repos/models/gemma-4-31B-it-GGUF/model.pte \ --tokenizer_path /Users/mnachin/repos/models/gemma-4-31B-it-HQQ-INT4/tokenizer.json \ --prompt "Tell me a joke about RAM usage" \ --max_new_tokens 128 \ --temperature 0.8 I tokenizers:regex.cpp:27] Registering override fallback regex WARNING: All log messages before absl::InitializeLog() is called are written to STDERR E0000 00:00:1779926968.603672 54889180 re2.cc:237] Error parsing '((\<pad\>|ool\|\>1\x00\x00\ �\<t|respo|\<tool_call\|\>|\<bos\>|\<\|tool_response\>|\<\|think\|\>|\x0...': invalid UTF-8 I tokenizers:re2_regex.cpp:27] Re2 failed to compile regex: ((\<pad\>|ool\|\>1\x00\x00\ 
�\<t|respo|\<tool_call\|\>|\<bos\>|\<\|tool_response\>|\<\|think\|\>|\x00\x00\\\<|\<tool_response\|\>|\<mask\>|\<\|\"\|\>|all\|\>j\x00\x00\\|\<channel\|\>|\<\|turn\>|\<turn\|\>|\<\|image\>|\<\|$ I tokenizers:regex_lookahead.cpp:27] Creating PCRE2 regex I tokenizers:pcre2_regex.cpp:48] PCRE2 UTF-8 validation failed at offset 27: UTF-8 error: byte 2 top bits not 0x80. Retrying without UTF flags. Loading model... Prompt tokens: 23 Why did the computer go to therapy? Because it had too many **unresolved dependencies** and it just couldn't stop **dwelling on the past**... but it forgot everything the moment it took a nap.<turn|> PyTorchObserver {"prefill_token_per_sec":2.49539,"decode_token_per_sec":0.0880671,"prompt_tokens":23,"generated_tokens":44,"model_load_start_ms":1779926968052,"model_load_end_ms":1779926982494,"inference_start_ms":1779926982497,"inference_end_ms":1779927491333,"prompt_eval_end_ms":1779926991714,"first_token_ms":1779926991714,"aggregate_sampling_time_ms":0,"SCALING_FACTOR_UNITS_PER_SECOND":1000} ``` For reference, here is the same run with this model: https://huggingface.co/SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4 ``` (executorch_dev) mnachin@mnachin-mbp executorch % ./cmake-out/examples/models/gemma4_31b/gemma4_31b_runner \ --model_path /Users/mnachin/repos/models/gemma-4-31B-it-HQQ-INT4/model.pte \ --tokenizer_path /Users/mnachin/repos/models/gemma-4-31B-it-HQQ-INT4/tokenizer.json \ --prompt "Tell me a joke about RAM usage" \ --max_new_tokens 128 \ --temperature 0.8 I tokenizers:regex.cpp:27] Registering override fallback regex WARNING: All log messages before absl::InitializeLog() is called are written to STDERR E0000 00:00:1779927592.109382 54914733 re2.cc:237] Error parsing '((\<pad\>|ool\|\>1\x00\x00\ �\<t|respo|\<tool_call\|\>|\<bos\>|\<\|tool_response\>|\<\|think\|\>|\x0...': invalid UTF-8 I tokenizers:re2_regex.cpp:27] Re2 failed to compile regex: ((\<pad\>|ool\|\>1\x00\x00\ 
�\<t|respo|\<tool_call\|\>|\<bos\>|\<\|tool_response\>|\<\|think\|\>|\x00\x00\\\<|\<tool_response\|\>|\<mask\>|\<\|\"\|\>|all\|\>j\x00\x00\\|\<channel\|\>|\<\|turn\>|\<turn\|\>|\<\|image\>|\<\|$ I tokenizers:regex_lookahead.cpp:27] Creating PCRE2 regex I tokenizers:pcre2_regex.cpp:48] PCRE2 UTF-8 validation failed at offset 27: UTF-8 error: byte 2 top bits not 0x80. Retrying without UTF flags. Loading model... Prompt tokens: 23 Why did the computer go to therapy? Because it had too many **unresolved dependencies** and couldn't stop **dwelling on the past**, but it still couldn't remember why it was there. *** Alternatively, a shorter one: **Why was the RAM so stressed?** Because it had too much on its mind, but it knew that as soon as it slept, it would forget everything.<turn|> PyTorchObserver {"prefill_token_per_sec":9.11975,"decode_token_per_sec":5.24998,"prompt_tokens":23,"generated_tokens":86,"model_load_start_ms":1779927591719,"model_load_end_ms":1779927603575,"inference_start_ms":1779927603579,"inference_end_ms":1779927622482,"prompt_eval_end_ms":1779927606101,"first_token_ms":1779927606101,"aggregate_sampling_time_ms":0,"SCALING_FACTOR_UNITS_PER_SECOND":1000} ``` There is a clear performance degradation when running the GGUF export: decode drops from about 5.25 tokens/s (HQQ-INT4) to about 0.09 tokens/s (GGUF Q4_K_M), and prefill from about 9.12 to about 2.50 tokens/s.
diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml
@@ -47,6 +47,10 @@ jobs:
 
         ${CONDA_RUN} pip list
 
+        echo "::group::Install Python test requirements"
+        ${CONDA_RUN} pip install gguf
+        echo "::endgroup::"
+
         echo "::group::Build test runners"
         ${CONDA_RUN} cmake --build cmake-out --target op_test_runner multi_thread_test_runner -j$(( $(sysctl -n hw.ncpu) - 1 ))
         echo "::endgroup::"
diff --git a/backends/mlx/builder/op_helpers.py b/backends/mlx/builder/op_helpers.py
@@ -334,7 +334,7 @@ def parse_dequant_node(
     if len(non_one) != 1:
         return None
     quantized_dim, group_size = non_one[0]
-    if group_size not in [32, 64, 128]:
+    if group_size not in [16, 32, 64, 128]:
         return None
 
     # TODO: MLX supports 3, 5, and 7, but we need to figure out the
diff --git a/backends/mlx/patterns.py b/backends/mlx/patterns.py
@@ -15,6 +15,7 @@
 
 from __future__ import annotations
 
+import os
 from typing import Any, List, Optional, Tuple
 
 import torch
@@ -37,6 +38,7 @@
 )
 from executorch.backends.mlx.serialization.mlx_graph_schema import (
     AddIntNode,
+    AddmmNode,
     AddNode,
     AsTypeNode,
     DequantizeNode,
@@ -52,6 +54,7 @@
     SubtractIntNode,
     SymSizeNode,
     TakeNode,
+    TransposeNode,
 )
 from torch.export.exported_program import ExportedProgram
 from torch.fx.node import Node
@@ -883,6 +886,18 @@ def maybe_create(
             out_dtype=out_dtype,
         )
 
+    # MLX's quantized_matmul Metal kernels are only instantiated for
+    # group_size in {32, 64, 128}. For smaller group sizes (e.g. GGUF
+    # Q6_K with group_size=16), emit DequantizeNode + matmul instead.
+    # Weights stay packed in the .pte file; dequantized on-device.
+    # This non-fused path is significantly slower and must be opted in
+    # via ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS=1.
+    _MIN_FUSED_GROUP_SIZE = 32
+
+    @staticmethod
+    def _allow_non_fused() -> bool:
+        """Report whether the non-fused dequantize+matmul path is opted in.
+
+        Reads ``ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS`` from the process
+        environment on every call; only the exact value ``"1"`` enables
+        the path, and an unset variable is treated as ``"0"``.
+        """
+        opt_in = os.getenv("ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS", "0")
+        return opt_in == "1"
+
     def __call__(self, P: MLXProgramBuilder, n: Node) -> Slot:
         assert n == self.head
 
@@ -908,19 +923,59 @@ def __call__(self, P: MLXProgramBuilder, n: Node) -> Slot:
         x_dtype = x_node.meta["val"].dtype
         needs_cast = self.out_dtype != x_dtype
 
-        P.emit(
-            QuantizedMatmulNode(
-                x=P.slot_to_tid(x_slot),
-                w=P.slot_to_tid(w),
-                scales=P.slot_to_tid(scale_slot),
-                out=P.slot_to_tid(out),
-                biases=P.slot_to_tid(biases),
-                group_size=self.group_size,
-                bits=self.bits,
-                mode="affine",
-                transpose=True,
+        if self.group_size >= self._MIN_FUSED_GROUP_SIZE:
+            P.emit(
+                QuantizedMatmulNode(
+                    x=P.slot_to_tid(x_slot),
+                    w=P.slot_to_tid(w),
+                    scales=P.slot_to_tid(scale_slot),
+                    out=P.slot_to_tid(out),
+                    biases=P.slot_to_tid(biases),
+                    group_size=self.group_size,
+                    bits=self.bits,
+                    mode="affine",
+                    transpose=True,
+                )
             )
-        )
+        else:
+            if not self._allow_non_fused():
+                raise ValueError(
+                    f"Quantized linear with group_size={self.group_size} requires "
+                    f"the non-fused dequantize+matmul path, which is significantly "
+                    f"slower than the fused QuantizedMatmulNode (group_size >= 32). "
+                    f"Set ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS=1 to allow this."
+                )
+            out_scalar_type = torch_dtype_to_scalar_type(self.out_dtype)
+            _, w_deq = P.make_tmp_slot()
+            P.emit(
+                DequantizeNode(
+                    w=P.slot_to_tid(w),
+                    scales=P.slot_to_tid(scale_slot),
+                    out=P.slot_to_tid(w_deq),
+                    biases=P.slot_to_tid(biases),
+                    group_size=self.group_size,
+                    bits=self.bits,
+                    mode="affine",
+                    dtype=out_scalar_type,
+                )
+            )
+            _, w_t = P.make_tmp_slot()
+            P.emit(
+                TransposeNode(
+                    x=P.slot_to_tid(w_deq),
+                    out=P.slot_to_tid(w_t),
+                    perm=[1, 0],
+                )
+            )
+            P.emit(
+                AddmmNode(
+                    mat1=P.slot_to_tid(x_slot),
+                    mat2=P.slot_to_tid(w_t),
+                    out=P.slot_to_tid(out),
+                )
+            )
+            # DequantizeNode already produces the correct dtype.
+            needs_cast = False
 
         if has_bias:
             P.emit(
diff --git a/backends/mlx/test/test_ops.py b/backends/mlx/test/test_ops.py
@@ -24,6 +24,7 @@
 See README.md in this directory for full documentation.
 """
 
+import os
 from typing import Callable, Dict, List, Optional, Tuple
 
 import torch
@@ -5621,8 +5622,21 @@ def get_test_configs(cls) -> List["QuantizedLinearTest"]:
             cls(group_size=128),
             cls(qdtype=torch.int2),
             cls(qdtype=torch.int8),
+            # group_size=16: exercises the non-fused dequantize+matmul path
+            # (requires ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS=1).
+            cls(qdtype=torch.int8, group_size=16),
+            cls(qdtype=torch.int4, group_size=16),
+            cls(qdtype=torch.int8, group_size=16, bias=False),
         ]
 
+    def generate_test_files(self, verbose=False):
+        """Generate test files, opting in to the non-fused path when needed.
+
+        Configs with ``group_size < 32`` exercise the non-fused
+        dequantize+matmul path, which requires
+        ``ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS=1``.  The variable is set
+        only for the duration of the call; afterwards the caller's previous
+        value (or its absence) is restored, so an opt-in configured outside
+        this test is not erased for later tests in the same process.
+
+        Args:
+            verbose: Forwarded unchanged to the parent implementation.
+
+        Returns:
+            Whatever the parent ``generate_test_files`` returns.
+        """
+        env_key = "ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS"
+        previous = os.environ.get(env_key)
+        if self.group_size < 32:
+            os.environ[env_key] = "1"
+        try:
+            return super().generate_test_files(verbose=verbose)
+        finally:
+            # Restore instead of unconditionally popping: the variable may
+            # have been set by the user before this test ran.
+            if previous is None:
+                os.environ.pop(env_key, None)
+            else:
+                os.environ[env_key] = previous
+
     def create_model(self) -> nn.Module:
         model = LinearModel(self.in_features, self.out_features, bias=self.bias)
         model = model.to(self.dtype)
diff --git a/examples/models/gemma4_31b/README.md b/examples/models/gemma4_31b/README.md
@@ -15,6 +15,7 @@ both export and eager inference:
 |---|---|---|
 | `quantize_and_save.py` | bf16 HF checkpoint → quantized checkpoint (one-time) | ~30 GB CPU |
 | `export.py --prequantized <dir>` | quantized checkpoint → `model.pte` + `model.ptd` | ~24 GB CPU + CUDA for packing |
+| `export.py --gguf <file> [--backend mlx]` | GGUF file (Q4_K_M, etc.) → `model.pte` + `model.ptd` | ~24 GB CPU |
 | `inference.py --prequantized <dir>` | quantized checkpoint → eager generation under `torch.compile` | ~24 GB GPU |
 | `inference.py --gguf <file>` | GGUF file (Q4_K_M, etc.) → eager generation | ~24 GB GPU |
 | `export.py --model-dir <hf>` | one-shot bf16 → quantize → export (no intermediate file) | ~30 GB CPU + CUDA for packing |
diff --git a/examples/models/gemma4_31b/export.py b/examples/models/gemma4_31b/export.py
@@ -443,7 +443,12 @@ def main() -> None:
             backend=args.backend,
         )
 
-    export_and_lower(model, config, args.output_dir, backend=args.backend)
+    if args.gguf and args.backend == "mlx":
+        os.environ["ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS"] = "1"
+    try:
+        export_and_lower(model, config, args.output_dir, backend=args.backend)
+    finally:
+        os.environ.pop("ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS", None)
 
 
 if __name__ == "__main__":
diff --git a/examples/models/gemma4_31b/gguf_loader.py b/examples/models/gemma4_31b/gguf_loader.py
@@ -12,6 +12,7 @@
 
 Usage:
     model, config = load_gguf_model("model.gguf", backend="cuda")
+    model, config = load_gguf_model("model.gguf", backend="mlx")
 """
 
 from typing import Optional
@@ -104,10 +105,11 @@ def load_gguf_model(
     Streams tensors one at a time for low peak memory.
 
     GGUF ties ``embed_tokens`` and ``lm_head`` into a single Q4_K tensor.
-    We untie them: the embedding is dequantized to bf16 (``nn.Embedding``
-    needs gather, which ``Int4TilePackedTo4dTensor`` does not support),
-    while ``lm_head`` keeps the original Q4_K quantization (``nn.Linear``
-    matmul via tinygemm).
+    We untie them so ``lm_head`` keeps the original Q4_K quantization.
+    On CUDA, the embedding is dequantized to bf16 because ``Int4Tensor``
+    does not support the gather op that ``nn.Embedding`` requires.  On
+    MLX, the embedding stays quantized — ``QuantizedEmbeddingHandler``
+    handles quantized gather natively.
 
     Returns ``(model, config)``.
     """
@@ -120,8 +122,12 @@ def load_gguf_model(
         from executorch.examples.models.gemma4_31b.quant import DEFAULT_CUDA_PACKERS
 
         packers = DEFAULT_CUDA_PACKERS
+    elif backend == "mlx":
+        from executorch.examples.models.gemma4_31b.quant import DEFAULT_MLX_PACKERS
+
+        packers = DEFAULT_MLX_PACKERS
     else:
-        raise ValueError(f"Unsupported backend: {backend!r}. Supported: 'cuda'.")
+        raise ValueError(f"Unsupported backend: {backend!r}. Supported: 'cuda', 'mlx'.")
 
     config = Gemma4_31BConfig(max_seq_len=max_seq_len)
 
@@ -143,7 +149,8 @@ def load_gguf_model(
 
         if model_key == "embed_tokens.weight" and isinstance(result, Int4Tensor):
             embed_quant = result
-            result = dequantize_weight(result, torch.bfloat16)
+            if backend == "cuda":
+                result = dequantize_weight(result, torch.bfloat16)
 
         pack_one(model, model_key, result, packers)
 
diff --git a/examples/models/gemma4_31b/quant/README.md b/examples/models/gemma4_31b/quant/README.md
@@ -50,5 +50,3 @@ The format is compatible with torchao's `save_pretrained` / `load_pretrained`.
 
 - `pack_metal.py` — Metal backend packer.
 - `gguf.py` — extend with Q5_K, Q8_0 GGUF quant types.
-- Upstream `Int4TilePackedTo4dTensor.from_int4_tensor()` to torchao
-  to replace the manual conversion in `pack_int4_for_cuda`.
diff --git a/examples/models/gemma4_31b/quant/pack_mlx.py b/examples/models/gemma4_31b/quant/pack_mlx.py
@@ -22,7 +22,7 @@
 
 from .pack import ModulePackerFn, pack_model  # noqa: F401
 
-_MLX_SUPPORTED_GROUP_SIZES = (128, 64, 32)
+_MLX_SUPPORTED_GROUP_SIZES = (128, 64, 32, 16)
 
 
 # ---------------------------------------------------------------------------
@@ -126,7 +126,9 @@ def pack_for_mlx(module: nn.Module, weights: dict[str, torch.Tensor]) -> None:
     default dispatch produces the ``dequantize_affine → linear`` pattern
     MLX expects.  Regroups to a compatible group_size when needed (e.g.
     per-axis group_size=5376 → group_size=128) since MLX's
-    ``parse_dequant_node`` only accepts group_size in {32, 64, 128}.
+    ``parse_dequant_node`` only accepts group_size in {16, 32, 64, 128}.
+    Group sizes ≥ 32 use the fused ``QuantizedMatmulNode``; group_size=16
+    (e.g. GGUF Q6_K) falls back to ``DequantizeNode`` + matmul at export.
     """
     from torchao.quantization import IntxUnpackedToInt8Tensor
     from torchao.quantization.quantize_.workflows.int4.int4_tensor import Int4Tensor
diff --git a/examples/models/gemma4_31b/quant/tests/test_pack_mlx.py b/examples/models/gemma4_31b/quant/tests/test_pack_mlx.py
@@ -146,7 +146,7 @@ def test_regroup_preserves_dequant(self):
 
 class TestMlxGroupSize(unittest.TestCase):
     def test_passthrough(self):
-        for gs in (32, 64, 128):
+        for gs in (16, 32, 64, 128):
             self.assertEqual(_mlx_group_size(gs, 256), gs)
 
     def test_regroup_5376(self):
@@ -157,7 +157,49 @@ def test_regroup_256(self):
 
     def test_rejects_indivisible(self):
         with self.assertRaises(ValueError):
-            _mlx_group_size(48, 48)
+            _mlx_group_size(7, 7)
+
+
+class TestPackLinearGroupSize16(unittest.TestCase):
+    """Packing group_size=16 weights (GGUF Q6_K) preserves semantics."""
+
+    def _make_gs16_tensor(self, N=64, K=128):
+        """Return a random ``(N, K)`` quantized weight with block_size ``(1, 16)``.
+
+        One bf16 scale and one zero-valued int8 zero point are created per
+        16-element group along ``K``.
+        """
+        from torchao.quantization import IntxUnpackedToInt8Tensor
+
+        return IntxUnpackedToInt8Tensor(
+            # torch.randint excludes ``high``; 32 makes the largest signed
+            # 6-bit value (31) reachable.
+            qdata=torch.randint(-32, 32, (N, K), dtype=torch.int8),
+            scale=torch.randn(N, K // 16, dtype=torch.bfloat16),
+            zero_point=torch.zeros(N, K // 16, dtype=torch.int8),
+            target_dtype=torch.int8,
+            block_size=(1, 16),
+            dtype=torch.bfloat16,
+            activation_quantization=None,
+        )
+
+    def test_dequant_preserves_values(self):
+        """Packing preserves the dequantized weight values."""
+        w = self._make_gs16_tensor(64, 128)
+        before = dequantize_weight(w, torch.float32)
+
+        module = nn.Linear(128, 64, bias=False)
+        pack_for_mlx(module, {"weight": w})
+        after = dequantize_weight(module.weight.data, torch.float32)
+
+        self.assertTrue(
+            torch.allclose(before, after, atol=1e-5),
+            f"max diff: {(before - after).abs().max():.6g}",
+        )
+
+    def test_forward_produces_valid_output(self):
+        """Packed gs=16 weight produces finite output in a linear forward."""
+        w = self._make_gs16_tensor(64, 128)
+        module = nn.Linear(128, 64, bias=False)
+        pack_for_mlx(module, {"weight": w})
+
+        x = torch.randn(1, 128, dtype=torch.bfloat16)
+        out = torch.nn.functional.linear(x, module.weight.data.dequantize())
+        self.assertEqual(out.shape, torch.Size([1, 64]))
+        # isfinite rejects +/-inf as well as NaN, matching the docstring's
+        # "finite output" promise (a NaN-only check lets inf through).
+        self.assertTrue(torch.isfinite(out).all())
 
 
 class TestPackEmbeddingForMlx(unittest.TestCase):
diff --git a/examples/models/gemma4_31b/tests/test_mlx_pipeline.py b/examples/models/gemma4_31b/tests/test_mlx_pipeline.py