pytorch
diff --git a/‎examples/models/gemma4_31b/README.md‎
Lines changed: 5 additions & 4 deletions b/‎examples/models/gemma4_31b/README.md‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎examples/models/gemma4_31b/export.py‎
Lines changed: 2 additions & 2 deletions b/‎examples/models/gemma4_31b/export.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎examples/models/gemma4_31b/gguf_loader.py‎
Lines changed: 10 additions & 14 deletions b/‎examples/models/gemma4_31b/gguf_loader.py‎
Lines changed: 10 additions & 14 deletions
diff --git a/‎examples/models/gemma4_31b/model.md‎
Lines changed: 7 additions & 6 deletions b/‎examples/models/gemma4_31b/model.md‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎examples/models/gemma4_31b/quant/README.md‎
Lines changed: 24 additions & 57 deletions b/‎examples/models/gemma4_31b/quant/README.md‎
Lines changed: 24 additions & 57 deletions
diff --git a/‎examples/models/gemma4_31b/quant/__init__.py‎
Lines changed: 0 additions & 7 deletions b/‎examples/models/gemma4_31b/quant/__init__.py‎
Lines changed: 0 additions & 7 deletions
@@ -19,10 +19,11 @@ both export and eager inference:
 | `inference.py --gguf <file>` | GGUF file (Q4_K_M, etc.) → eager generation | ~24 GB GPU |
 | `export.py --model-dir <hf>` | one-shot bf16 → quantize → export (no intermediate file) | ~30 GB CPU + CUDA for packing |
 
-The quantized checkpoint is a safetensors file with int values + per-group
-scales and a JSON header describing each weight's `QuantConfig`. No tensor
-subclass or backend-specific packing — packing for the target backend happens
-at load time via `quant.pack_model()`.
+The quantized checkpoint is a safetensors file containing torchao tensor
+subclasses (`Int4Tensor`, `IntxUnpackedToInt8Tensor`) and plain tensors.
+Metadata records each subclass's type and attributes. No backend-specific
+packing — packing for the target backend happens at load time via
+`quant.pack_model()`.
 
 ## Quantization recipes
 
 
@@ -82,12 +82,12 @@ def load_and_quantize(
         model.lm_head.weight = nn.Parameter(model.embed_tokens.weight.clone())
 
     print(f"Quantizing with recipe '{recipe_name}'...")
-    quantized, unquantized = quantize_model(model, recipe)
+    state_dict = quantize_model(model, recipe)
 
     print(f"Packing for {backend}...")
     with torch.device("meta"):
         model = Gemma4_31B(config)
-    pack_model(model, quantized, unquantized, packers=_get_packers(backend))
+    pack_model(model, state_dict, packers=_get_packers(backend))
     model.eval()
 
     print(f"Model: {config.num_hidden_layers} layers, hidden={config.hidden_size}")
 
@@ -64,15 +64,15 @@ def gguf_to_model_key(gguf_key: str) -> Optional[str]:
     return None
 
 
-def _resolve_tied_lm_head(model, embed_cw, packers):
+def _resolve_tied_lm_head(model, embed_quant, packers):
     """Handle tied embed/lm_head after streaming all tensors."""
     from executorch.examples.models.gemma4_31b.quant import pack_one
 
     lm_head = getattr(model.lm_head, "weight", None)
     if lm_head is None or lm_head.device.type != "meta":
         return
-    if embed_cw is not None:
-        pack_one(model, "lm_head.weight", embed_cw, packers)
+    if embed_quant is not None:
+        pack_one(model, "lm_head.weight", embed_quant, packers)
     else:
         pack_one(
             model,
@@ -114,9 +114,7 @@ def load_gguf_model(
     from executorch.examples.models.gemma4_31b.model import Gemma4_31B, Gemma4_31BConfig
     from executorch.examples.models.gemma4_31b.quant import dequantize_weight, pack_one
     from executorch.examples.models.gemma4_31b.quant.gguf import iter_gguf_tensors
-    from executorch.examples.models.gemma4_31b.quant.serialize import (
-        CanonicalQuantizedWeight,
-    )
+    from torchao.quantization.quantize_.workflows.int4.int4_tensor import Int4Tensor
 
     if backend == "cuda":
         from executorch.examples.models.gemma4_31b.quant import DEFAULT_CUDA_PACKERS
@@ -131,7 +129,7 @@ def load_gguf_model(
     with torch.device("meta"):
         model = Gemma4_31B(config)
 
-    embed_cw = None
+    embed_quant = None
     n_processed = 0
 
     print(f"Streaming GGUF from {gguf_path}...")
@@ -140,13 +138,11 @@ def load_gguf_model(
         if model_key is None:
             continue
 
-        if isinstance(result, torch.Tensor) and result.dtype == torch.float32:
+        if type(result) is torch.Tensor and result.dtype == torch.float32:
             result = result.to(torch.bfloat16)
 
-        if model_key == "embed_tokens.weight" and isinstance(
-            result, CanonicalQuantizedWeight
-        ):
-            embed_cw = result
+        if model_key == "embed_tokens.weight" and isinstance(result, Int4Tensor):
+            embed_quant = result
             result = dequantize_weight(result, torch.bfloat16)
 
         pack_one(model, model_key, result, packers)
@@ -155,8 +151,8 @@ def load_gguf_model(
         if n_processed % 100 == 0:
             print(f"  Processed {n_processed} tensors...")
 
-    _resolve_tied_lm_head(model, embed_cw, packers)
-    del embed_cw
+    _resolve_tied_lm_head(model, embed_quant, packers)
+    del embed_quant
 
     _validate_no_meta(model)
     model.eval()
 
@@ -126,9 +126,10 @@ Modules in `quant/`:
 - **Recipe** (`recipe.py`): `QuantConfig` + `QuantRule` + `QuantRecipe`.
   Declares what to quantize — says nothing about packing or backends.
 - **Quantize** (`quantize.py`): `quantize_weight` / `dequantize_weight` /
-  `quantize_model`. Produces `CanonicalQuantizedWeight` from fp weights.
-- **Serialize** (`serialize.py`): `CanonicalQuantizedWeight` (int8 qdata +
-  bf16 scale + optional zero). `save()` / `load()` persist to safetensors.
+  `quantize_model`. Produces torchao tensor subclasses (`Int4Tensor`,
+  `IntxUnpackedToInt8Tensor`) from fp weights.
+- **Serialization**: callers use torchao's safetensors integration
+  (`torchao.prototype.safetensors`) directly — no wrapper module needed.
 - **Pack** (`pack.py` + `pack_cuda.py`): `pack_model` groups weights by
   parent module, `pack_one` handles single weights. Per-module packers
   dispatch by module type (`nn.Linear`, `nn.Embedding`, extensible for MoE).
@@ -142,11 +143,11 @@ quantize_and_save.py                    export.py / inference.py
      |                                       |
   bf16 weights                          quantized checkpoint (safetensors)
      |                                       |
-  quantize_weight()                     load()
+  quantize_weight()                     load (torchao safetensors)
      |                                       |
-  CanonicalQuantizedWeight              CanonicalQuantizedWeight
+  Int4Tensor / IntxUnpacked             Int4Tensor / IntxUnpacked
      |                                       |
-  save()                                pack_model()
+  save (torchao safetensors)            pack_model()
      |                                       |
   model.safetensors                     Int4TilePackedTo4dTensor (runtime)
 ```
 
@@ -1,27 +1,29 @@
 # quant/
 
-Packing-agnostic quantization framework: **recipe → quantize → serialize → pack**.
+Quantization framework: **recipe → quantize → pack**.
 
 ## Files
 
 | File | Concern | Depends on |
 |---|---|---|
 | `recipe.py` | **Policy** — what to quantize, what precision, which layers | nothing |
-| `quantize.py` | **Computation** — produces/dequantizes canonical weights | recipe, torchao |
-| `serialize.py` | **Data format** — saves/loads canonical weights to safetensors | recipe |
-| `pack.py` | **Packing dispatch** — `pack_model` (bulk) and `pack_one` (streaming) | serialize |
-| `pack_cuda.py` | **CUDA packing** — converts canonical to tinygemm/intx runtime format | pack, serialize |
-| `gguf.py` | **GGUF import** — unpacks Q4_K/Q6_K blocks to canonical form | recipe, serialize |
+| `quantize.py` | **Computation** — produces torchao subclass tensors | recipe, torchao |
+| `pack.py` | **Packing dispatch** — `pack_model` (bulk) and `pack_one` (streaming) | — |
+| `pack_cuda.py` | **CUDA packing** — converts Int4Tensor to tinygemm format | pack |
+| `gguf.py` | **GGUF import** — unpacks Q4_K/Q6_K blocks to torchao subclasses | torchao |
 
 ## Data flow
 
 ```
-QuantRecipe → quantize_model() → CanonicalQuantizedWeight → save() → file → load() → CanonicalQuantizedWeight → pack_model() → runtime model
+QuantRecipe → quantize_model() → state_dict{Int4Tensor, IntxUnpackedToInt8Tensor, Tensor} → safetensors → state_dict → pack_model() → runtime model
 ```
 
-`CanonicalQuantizedWeight` is the interchange point — int8 qdata + bf16
-scale + optional zero + config. Everything left of it is backend-agnostic.
-Everything right is backend-specific.
+Quantized weights are stored as torchao tensor subclasses:
+- **Int4Tensor** — 4-bit weights (nibble-packed qdata + transposed scale/zero_point)
+- **IntxUnpackedToInt8Tensor** — 8-bit weights (int8 qdata + scale + zero_point)
+
+These are the canonical interchange formats from torchao. Everything left of
+`pack_model()` is backend-agnostic; packing onward is backend-specific.
 
 ## Adding a new backend
 
@@ -32,56 +34,21 @@ def pack_linear_for_metal(module, weights): ...
 DEFAULT_METAL_PACKERS = {nn.Linear: pack_linear_for_metal}
 ```
 
-Call `pack_model(model, quantized, unquantized, packers=DEFAULT_METAL_PACKERS)`.
-No changes to recipe, quantize, or serialize.
-
-Things to consider:
-
-- **Recipes may need to be backend-aware.** Each backend's kernels have
-  different constraints (e.g., Metal's `fpa4w` is INT4-only — no INT8 linear
-  kernel, so the sensitive recipe's 8-bit edge layers would need to be INT4
-  or dequantized to bf16). Define per-backend recipes or validate recipe
-  compatibility at pack time.
-- **Source transforms before packing.** Some backends replace model modules
-  (e.g., MLX swaps `FusedMoEExperts` → `SwitchMLP`, Metal swaps to
-  `MetalMoEExperts`). These transforms change the module types that
-  packers dispatch on, so they must run before `pack_model()`. For dense
-  models (no MoE) this is not needed.
-- **Embedding quantization.** Not all backends have a quantized embedding
-  gather kernel. The packer can dequantize to bf16 at load time — the
-  disk savings from the canonical format still apply.
-
-## Adding a new model
-
-1. Define a `QuantRecipe` with rules for the model's FQN patterns.
-2. If the model has custom module types (e.g., `FusedMoEExperts`), write a
-   per-module packer and extend the packers dict:
-   ```python
-   packers = {**DEFAULT_CUDA_PACKERS, FusedMoEExperts: pack_moe_experts}
-   ```
-3. No changes to the quant package itself.
+Call `pack_model(model, state_dict, packers=DEFAULT_METAL_PACKERS)`.
+No changes to recipe or quantize.
 
 ## On-disk format
 
-Safetensors with a `format_version` in the header. Per quantized weight:
-`{fqn}.qdata` (int8, nibble-packed for 4-bit), `{fqn}.scale` (bf16),
-optionally `{fqn}.zero` (bf16). Header JSON records bits, group_size,
-symmetric, and method per weight. Unquantized weights stored as-is.
+Uses torchao's safetensors integration (`torchao.prototype.safetensors`).
+Each tensor subclass is decomposed into its inner tensors
+(e.g., `layer._weight_qdata`, `layer._weight_scale`) plus JSON metadata
+recording the subclass type and attributes. Plain tensors are stored as-is.
+The format is compatible with torchao's `save_pretrained` / `load_pretrained`.
 
 ## TODO
 
-- `pack_metal.py` — Metal backend packer. Convert canonical INT4 to
-  `UIntxWeightOnlyConfig` subclass (torchao experimental) for the
-  `torchao::_linear_fp_act_4bit_weight` kernel. For MoE models, pack
-  expert weights into Metal's `gather_qmv` format (asymmetric, unsigned
-  INT4 with scale + bias buffers).
-
-- `pack_mlx.py` — MLX backend packer. Convert canonical INT4 to
-  `IntxWeightOnlyConfig` subclass for the `mlx::gather_qmm` kernel.
-  For MoE models, stack per-expert weights into `SwitchLinear` format.
-
-- `gguf.py` — extend with Q5_K, Q8_0, and other GGUF quant types.
-  Currently supports Q4_K and Q6_K. Some Q4_K_M files also contain
-  Q5_K or Q8_0 tensors (for sensitive layers on certain architectures)
-  which will raise — add support as needed. Q6_K is widened to 8-bit
-  for CUDA packing since there is no 6-bit CUDA kernel.
+- `pack_metal.py` — Metal backend packer.
+- `pack_mlx.py` — MLX backend packer.
+- `gguf.py` — extend with the Q5_K and Q8_0 GGUF quant types.
+- Upstream `Int4TilePackedTo4dTensor.from_int4_tensor()` to torchao
+  to replace the manual conversion in `pack_int4_for_cuda`.
@@ -8,10 +8,3 @@
 from .pack_cuda import DEFAULT_CUDA_PACKERS, load_and_pack_for_cuda  # noqa: F401
 from .quantize import dequantize_weight, quantize_model, quantize_weight  # noqa: F401
 from .recipe import QuantConfig, QuantRecipe, QuantRule  # noqa: F401
-from .serialize import (  # noqa: F401
-    CanonicalQuantizedWeight,
-    deserialize,
-    load,
-    save,
-    serialize,
-)