pytorch
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
Lines changed: 1 addition & 1 deletion
diff --git a/examples/models/gemma4_31b/README.md b/examples/models/gemma4_31b/README.md
Lines changed: 10 additions & 0 deletions
diff --git a/examples/models/gemma4_31b/export.py b/examples/models/gemma4_31b/export.py
Lines changed: 13 additions & 1 deletion
diff --git a/examples/models/gemma4_31b/gguf_loader.py b/examples/models/gemma4_31b/gguf_loader.py
Lines changed: 165 additions & 0 deletions
diff --git a/examples/models/gemma4_31b/inference.py b/examples/models/gemma4_31b/inference.py
Lines changed: 51 additions & 18 deletions
diff --git a/examples/models/gemma4_31b/model.md b/examples/models/gemma4_31b/model.md
Lines changed: 11 additions & 12 deletions
@@ -149,7 +149,7 @@ jobs:
         python -m pytest examples/models/qwen3_5_moe/test_quantize_roundtrip.py examples/models/qwen3_5_moe/test_turboquant.py examples/models/qwen3_5_moe/test_sampler.py -v -o "addopts="
 
         # Run Gemma 4 31B tests (quant unit tests + pipeline integration tests)
-        python -m pytest examples/models/gemma4_31b/quant/ examples/models/gemma4_31b/test_pipeline.py examples/models/gemma4_31b/test_cuda_pipeline.py -v -o "addopts="
+        python -m pytest examples/models/gemma4_31b/quant/tests/ examples/models/gemma4_31b/tests/ -v -o "addopts="
 
   export-model-cuda-artifact:
     name: export-model-cuda-artifact
 
@@ -16,6 +16,7 @@ both export and eager inference:
 | `quantize_and_save.py` | bf16 HF checkpoint → quantized checkpoint (one-time) | ~30 GB CPU |
 | `export.py --prequantized <dir>` | quantized checkpoint → `model.pte` + `model.ptd` | ~24 GB CPU + CUDA for packing |
 | `inference.py --prequantized <dir>` | quantized checkpoint → eager generation under `torch.compile` | ~24 GB GPU |
+| `inference.py --gguf <file>` | GGUF file (Q4_K_M, etc.) → eager generation | ~24 GB GPU |
 | `export.py --model-dir <hf>` | one-shot bf16 → quantize → export (no intermediate file) | ~30 GB CPU + CUDA for packing |
 
 The quantized checkpoint is a safetensors file with int values + per-group
@@ -85,6 +86,15 @@ python examples/models/gemma4_31b/inference.py \
     --temperature 0.8
 ```
 
+GGUF files from the community (e.g., Q4_K_M) can also be used directly:
+
+```bash
+python examples/models/gemma4_31b/inference.py \
+    --gguf ./gemma-4-31B-it-Q4_K_M.gguf \
+    --tokenizer-path /path/to/tokenizer.json \
+    --prompt "Hello"
+```
+
 Useful before spending the export+lowering time to confirm the quantized
 model produces sensible text.
 
 
@@ -10,9 +10,10 @@
   - "decode":  T=1, static shape, returns the next sampled token.
   - "prefill": T>=2, dynamic shape, returns the next sampled token.
 
-Two input paths:
+Three input paths:
   --prequantized <dir>      Load a quantized checkpoint (from quantize_and_save.py)
                             and pack for the target backend. No re-quantization.
+  --gguf <file>             Load a GGUF file (e.g., Q4_K_M from the community).
   --model-dir <hf>          Load bf16 checkpoint, quantize, pack, and export
                             in one shot.
 
@@ -251,6 +252,11 @@ def main() -> None:
         default=None,
         help="Path to a quantized checkpoint directory. Skips quantization.",
     )
+    src.add_argument(
+        "--gguf",
+        default=None,
+        help="Path to a GGUF file (e.g., gemma-4-31B-it-Q4_K_M.gguf).",
+    )
     parser.add_argument(
         "--output-dir",
         default="./gemma4_31b_exports",
@@ -285,6 +291,12 @@ def main() -> None:
             max_seq_len=args.max_seq_len,
             backend=args.backend,
         )
+    elif args.gguf:
+        from executorch.examples.models.gemma4_31b.gguf_loader import load_gguf_model
+
+        model, config = load_gguf_model(
+            args.gguf, max_seq_len=args.max_seq_len, backend=args.backend
+        )
     else:
         model, config = load_and_quantize(
             args.model_dir,
 
@@ -0,0 +1,165 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Load a GGUF file into a Gemma 4 31B model.
+
+Streams tensors one at a time via ``iter_gguf_tensors`` for low peak
+memory, remaps GGUF names to model FQNs, handles tied embed/lm_head,
+and packs for the target backend.
+
+Usage:
+    model, config = load_gguf_model("model.gguf", backend="cuda")
+"""
+
+from typing import Optional
+
+import torch
+
+# GGUF tensor-name pattern → model FQN pattern. ``{}`` stands for the decimal
+# layer index; entries without ``{}`` are matched as exact names.
+_KEY_MAP = {
+    "token_embd.weight": "embed_tokens.weight",
+    "output_norm.weight": "norm.weight",
+    # Per-layer attention
+    "blk.{}.attn_q.weight": "layers.{}.self_attn.q_proj.weight",
+    "blk.{}.attn_k.weight": "layers.{}.self_attn.k_proj.weight",
+    "blk.{}.attn_v.weight": "layers.{}.self_attn.v_proj.weight",
+    "blk.{}.attn_output.weight": "layers.{}.self_attn.o_proj.weight",
+    "blk.{}.attn_q_norm.weight": "layers.{}.self_attn.q_norm.weight",
+    "blk.{}.attn_k_norm.weight": "layers.{}.self_attn.k_norm.weight",
+    # Per-layer norms
+    "blk.{}.attn_norm.weight": "layers.{}.input_layernorm.weight",
+    "blk.{}.post_attention_norm.weight": "layers.{}.post_attention_layernorm.weight",
+    "blk.{}.ffn_norm.weight": "layers.{}.pre_feedforward_layernorm.weight",
+    "blk.{}.post_ffw_norm.weight": "layers.{}.post_feedforward_layernorm.weight",
+    # Per-layer MLP
+    "blk.{}.ffn_gate.weight": "layers.{}.mlp.gate_proj.weight",
+    "blk.{}.ffn_up.weight": "layers.{}.mlp.up_proj.weight",
+    "blk.{}.ffn_down.weight": "layers.{}.mlp.down_proj.weight",
+    # Per-layer scalar
+    "blk.{}.layer_output_scale.weight": "layers.{}.layer_scalar",
+}
+
+# GGUF tensor names that ``gguf_to_model_key`` checks first and maps to ``None``.
+_IGNORED_KEYS = {"rope_freqs.weight"}
+
+
+def gguf_to_model_key(gguf_key: str) -> Optional[str]:
+    """Map a GGUF tensor name to a model FQN, or ``None`` to skip.
+
+    Args:
+        gguf_key: Tensor name as it appears in the GGUF file, e.g.
+            ``"blk.3.attn_q.weight"``.
+
+    Returns:
+        The model FQN with the layer index substituted (for the example
+        above, ``"layers.3.self_attn.q_proj.weight"``), or ``None`` when the
+        name is in ``_IGNORED_KEYS`` or matches no ``_KEY_MAP`` pattern.
+    """
+    if gguf_key in _IGNORED_KEYS:
+        return None
+
+    for gguf_pat, model_pat in _KEY_MAP.items():
+        # Patterns without a placeholder must equal the whole name.
+        if "{}" not in gguf_pat:
+            if gguf_key == gguf_pat:
+                return model_pat
+            continue
+        # Per-layer pattern: the text between prefix and suffix is the
+        # candidate layer index.
+        prefix, suffix = gguf_pat.split("{}")
+        if gguf_key.startswith(prefix) and gguf_key.endswith(suffix):
+            layer_str = gguf_key[len(prefix) : len(gguf_key) - len(suffix)]
+            # ``isdigit`` is False for the empty string and for any
+            # non-numeric middle, so only ``blk.<digits>.…`` names match.
+            if layer_str.isdigit():
+                return model_pat.replace("{}", layer_str)
+
+    return None
+
+
+def _resolve_tied_lm_head(model, embed_cw, packers):
+    """Handle tied embed/lm_head after streaming all tensors.
+
+    No-op when ``model.lm_head`` has no ``weight`` attribute or that weight is
+    no longer on the meta device. Otherwise ``lm_head.weight`` is filled via
+    ``pack_one`` from ``embed_cw`` when it is not ``None``, else from a clone
+    of ``model.embed_tokens.weight``.
+
+    Args:
+        model: Model whose ``lm_head`` and ``embed_tokens`` are inspected.
+        embed_cw: Embedding value captured by the caller, or ``None``.
+        packers: Passed through to ``pack_one`` unchanged.
+    """
+    from executorch.examples.models.gemma4_31b.quant import pack_one
+
+    lm_head = getattr(model.lm_head, "weight", None)
+    # Only act while lm_head.weight is still a meta-device placeholder.
+    if lm_head is None or lm_head.device.type != "meta":
+        return
+    if embed_cw is not None:
+        pack_one(model, "lm_head.weight", embed_cw, packers)
+    else:
+        # No captured value: fall back to a copy of the loaded embedding.
+        pack_one(
+            model,
+            "lm_head.weight",
+            model.embed_tokens.weight.data.clone(),
+            packers,
+        )
+
+
+def _validate_no_meta(model):
+    """Ensure all parameters have been loaded, then freeze them.
+
+    Only ``model.named_parameters()`` is checked; buffers are not inspected
+    here. On success, ``requires_grad_(False)`` is called on every parameter.
+
+    Raises:
+        RuntimeError: If a parameter is still on the meta device. The message
+            names the first such parameter encountered.
+    """
+    for fqn, p in model.named_parameters():
+        if p.device.type == "meta":
+            raise RuntimeError(
+                f"Weight '{fqn}' not found in GGUF file "
+                f"(model/checkpoint version mismatch?)"
+            )
+    for p in model.parameters():
+        p.requires_grad_(False)
+
+
+def load_gguf_model(
+    gguf_path: str,
+    max_seq_len: int = 4096,
+    backend: str = "cuda",
+) -> tuple:
+    """Load a GGUF file, remap keys, and pack for the target backend.
+
+    Streams tensors one at a time for low peak memory.
+
+    GGUF ties ``embed_tokens`` and ``lm_head`` into a single Q4_K tensor.
+    We untie them: the embedding is dequantized to bf16 (``nn.Embedding``
+    needs gather, which ``Int4TilePackedTo4dTensor`` does not support),
+    while ``lm_head`` keeps the original Q4_K quantization (``nn.Linear``
+    matmul via tinygemm).
+
+    Args:
+        gguf_path: Path to the GGUF file to stream.
+        max_seq_len: Forwarded to ``Gemma4_31BConfig(max_seq_len=...)``.
+        backend: Target backend; only ``"cuda"`` is accepted.
+
+    Returns:
+        ``(model, config)``. The model is returned after ``model.eval()``.
+
+    Raises:
+        ValueError: If ``backend`` is not ``"cuda"``.
+        RuntimeError: From ``_validate_no_meta`` if a parameter is still on
+            the meta device after streaming.
+    """
+    from executorch.examples.models.gemma4_31b.model import Gemma4_31B, Gemma4_31BConfig
+    from executorch.examples.models.gemma4_31b.quant import dequantize_weight, pack_one
+    from executorch.examples.models.gemma4_31b.quant.gguf import iter_gguf_tensors
+    from executorch.examples.models.gemma4_31b.quant.serialize import (
+        CanonicalQuantizedWeight,
+    )
+
+    # Resolve the packer table before building anything so an unsupported
+    # backend fails immediately.
+    if backend == "cuda":
+        from executorch.examples.models.gemma4_31b.quant import DEFAULT_CUDA_PACKERS
+
+        packers = DEFAULT_CUDA_PACKERS
+    else:
+        raise ValueError(f"Unsupported backend: {backend!r}. Supported: 'cuda'.")
+
+    config = Gemma4_31BConfig(max_seq_len=max_seq_len)
+
+    # Construct under the meta device; weights are filled in one by one below.
+    print("Building model on meta device...")
+    with torch.device("meta"):
+        model = Gemma4_31B(config)
+
+    # Quantized form of the embedding, kept so lm_head can reuse it afterwards.
+    embed_cw = None
+    # Counts only tensors that mapped to a model key (skipped names excluded).
+    n_processed = 0
+
+    print(f"Streaming GGUF from {gguf_path}...")
+    for gguf_name, result in iter_gguf_tensors(gguf_path):
+        model_key = gguf_to_model_key(gguf_name)
+        # Ignored and unrecognized GGUF names are skipped silently.
+        if model_key is None:
+            continue
+
+        # Plain fp32 tensors are downcast to bf16; everything else passes
+        # through unchanged.
+        if isinstance(result, torch.Tensor) and result.dtype == torch.float32:
+            result = result.to(torch.bfloat16)
+
+        # Quantized embedding: remember the quantized value for lm_head and
+        # give the embedding itself a bf16 dequantized copy.
+        if model_key == "embed_tokens.weight" and isinstance(
+            result, CanonicalQuantizedWeight
+        ):
+            embed_cw = result
+            result = dequantize_weight(result, torch.bfloat16)
+
+        pack_one(model, model_key, result, packers)
+
+        n_processed += 1
+        if n_processed % 100 == 0:
+            print(f"  Processed {n_processed} tensors...")
+
+    _resolve_tied_lm_head(model, embed_cw, packers)
+    # Drop the local reference to the quantized embedding.
+    del embed_cw
+
+    _validate_no_meta(model)
+    model.eval()
+
+    print(f"Model: {config.num_hidden_layers} layers, hidden={config.hidden_size}")
+    return model, config
@@ -4,20 +4,26 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Eager inference on a prequantized Gemma 4 31B-IT model (CUDA + torch.compile).
+"""Eager inference on Gemma 4 31B-IT (CUDA + torch.compile).
 
-Loads a quantized checkpoint (from ``quantize_and_save.py``), packs for CUDA,
-materializes runtime buffers, optionally compiles with ``torch.compile``, and
-generates text autoregressively. The model performs Gumbel-max sampling
-on-device, so each forward returns the next token ID as a float tensor of
-shape ``[B, 1]``.
+Two input paths:
+  --prequantized <dir>   Load a quantized checkpoint (from quantize_and_save.py).
+  --gguf <file>          Load a GGUF file (e.g., Q4_K_M from the community).
+
+Packs for the target backend (--backend cuda), materializes runtime buffers,
+optionally compiles with ``torch.compile``, and generates text autoregressively.
 
 Usage:
     python inference.py \\
         --prequantized ./gemma4_31b_int4 \\
         --prompt "Write a short joke about saving RAM." \\
         --max-new-tokens 128 \\
         --temperature 0.8
+
+    python inference.py \\
+        --gguf ./gemma-4-31B-it-Q4_K_M.gguf \\
+        --tokenizer-path ./tokenizer.json \\
+        --prompt "Hello"
 """
 
 import argparse
@@ -113,14 +119,23 @@ def generate(
 
 
 def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Eager inference on prequantized Gemma 4 31B-IT (CUDA)."
-    )
-    parser.add_argument(
+    parser = argparse.ArgumentParser(description="Eager inference on Gemma 4 31B-IT.")
+    src = parser.add_mutually_exclusive_group(required=True)
+    src.add_argument(
         "--prequantized",
-        required=True,
+        default=None,
         help="Path to a quantized checkpoint directory.",
     )
+    src.add_argument(
+        "--gguf",
+        default=None,
+        help="Path to a GGUF file (e.g., gemma-4-31B-it-Q4_K_M.gguf).",
+    )
+    parser.add_argument(
+        "--tokenizer-path",
+        default=None,
+        help="Path to tokenizer.json (required with --gguf, optional with --prequantized).",
+    )
     parser.add_argument("--prompt", default="Hello", help="Input prompt.")
     parser.add_argument(
         "--max-new-tokens",
@@ -145,23 +160,41 @@ def main() -> None:
         action="store_true",
         help="Skip torch.compile (slower, but easier to debug).",
     )
+    parser.add_argument(
+        "--backend",
+        default="cuda",
+        choices=["cuda"],
+        help="Target backend.",
+    )
     args = parser.parse_args()
 
-    if not torch.cuda.is_available():
-        parser.error("CUDA is required for inference.")
+    if args.backend == "cuda" and not torch.cuda.is_available():
+        parser.error("CUDA is required for the cuda backend.")
 
-    print(f"Loading prequantized model from {args.prequantized}...")
-    model, config = load_prequantized_model(
-        args.prequantized, max_seq_len=args.max_seq_len
-    )
+    if args.gguf:
+        from executorch.examples.models.gemma4_31b.gguf_loader import load_gguf_model
+
+        model, config = load_gguf_model(
+            args.gguf, args.max_seq_len, backend=args.backend
+        )
+    else:
+        print(f"Loading prequantized model from {args.prequantized}...")
+        model, config = load_prequantized_model(
+            args.prequantized, max_seq_len=args.max_seq_len, backend=args.backend
+        )
     _move_to_cuda(model, config)
     model.eval()
 
     if not args.no_compile:
         print("Compiling model with torch.compile...")
         model = torch.compile(model, mode="default")
 
-    tokenizer_path = os.path.join(args.prequantized, "tokenizer.json")
+    if args.tokenizer_path:
+        tokenizer_path = args.tokenizer_path
+    elif args.prequantized:
+        tokenizer_path = os.path.join(args.prequantized, "tokenizer.json")
+    else:
+        parser.error("--tokenizer-path is required with --gguf.")
     from tokenizers import Tokenizer
 
     tokenizer = Tokenizer.from_file(tokenizer_path)
 
@@ -121,20 +121,19 @@ identical logits to sequential one-token-at-a-time prefill.
 
 ## Quantization
 
-Three modules in `quant/`:
+Modules in `quant/`:
 
-- **Recipe** (`recipe.py`): `QuantConfig` (bits, group_size, symmetric,
-  method) + `QuantRule` (regex pattern, config, optional layer filter) +
-  `QuantRecipe` (ordered rules, first match wins). Declares what to
-  quantize and how — says nothing about packing or backends.
+- **Recipe** (`recipe.py`): `QuantConfig` + `QuantRule` + `QuantRecipe`.
+  Declares what to quantize — says nothing about packing or backends.
+- **Quantize** (`quantize.py`): `quantize_weight` / `dequantize_weight` /
+  `quantize_model`. Produces `CanonicalQuantizedWeight` from fp weights.
 - **Serialize** (`serialize.py`): `CanonicalQuantizedWeight` (int8 qdata +
-  bf16 scale + optional zero). `save()` / `load()` persist to safetensors
-  with a JSON header per weight. Packing-agnostic — any backend can read
-  the file.
-- **Packer** (`pack_cuda.py`): converts `CanonicalQuantizedWeight` to
-  backend runtime format at load time via `pack_model()`. Dispatches per
-  parent module type (`nn.Linear` → `Int4TilePackedTo4dTensor` for
-  tinygemm). Extensible via a packers dict.
+  bf16 scale + optional zero). `save()` / `load()` persist to safetensors.
+- **Pack** (`pack.py` + `pack_cuda.py`): `pack_model` groups weights by
+  parent module, `pack_one` handles single weights. Per-module packers
+  dispatch by module type (`nn.Linear`, `nn.Embedding`, extensible for MoE).
+- **GGUF** (`gguf.py`): `unpack_gguf_tensor` / `iter_gguf_tensors` for
+  loading community-quantized GGUF files (Q4_K, Q6_K).
 
 The quantize-once flow: