Add streaming iter_load and tighten quant public API

mergennachin · mergennachin · commit 0ea68120e213 · 2026-04-30T14:25:16.000-07:00
- serialize.py: add iter_load() generator that streams weights one at a
  time from safetensors, keeping peak memory proportional to the largest
  single weight instead of loading all weights into memory at once.
- pack_cuda.py: rewrite load_and_pack_for_cuda to use iter_load for
  streaming — avoids ~40 GB peak memory when loading the 31B checkpoint.
- __init__.py: remove low-level CUDA packer internals (pack_int4_for_cuda,
  pack_int8_for_cuda, pack_linear_for_cuda, pack_embedding_for_cuda) from
  the public API. Tests import these directly from pack_cuda.py.
diff --git a/examples/models/gemma4_31b/quant/__init__.py b/examples/models/gemma4_31b/quant/__init__.py
@@ -5,14 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 from .pack import ModulePackerFn, pack_model, pack_one  # noqa: F401
-from .pack_cuda import (  # noqa: F401
-    DEFAULT_CUDA_PACKERS,
-    load_and_pack_for_cuda,
-    pack_embedding_for_cuda,
-    pack_int4_for_cuda,
-    pack_int8_for_cuda,
-    pack_linear_for_cuda,
-)
+from .pack_cuda import DEFAULT_CUDA_PACKERS, load_and_pack_for_cuda  # noqa: F401
 from .quantize import dequantize_weight, quantize_model, quantize_weight  # noqa: F401
 from .recipe import QuantConfig, QuantRecipe, QuantRule  # noqa: F401
 from .serialize import (  # noqa: F401
diff --git a/examples/models/gemma4_31b/quant/pack_cuda.py b/examples/models/gemma4_31b/quant/pack_cuda.py
@@ -17,7 +17,7 @@
 import torch.nn as nn
 
 from .pack import ModulePackerFn, pack_model  # noqa: F401
-from .serialize import CanonicalQuantizedWeight, load
+from .serialize import CanonicalQuantizedWeight
 
 
 # ---------------------------------------------------------------------------
@@ -202,9 +202,23 @@ def load_and_pack_for_cuda(
     model: nn.Module,
     packers: dict[type, ModulePackerFn] | None = None,
 ) -> None:
-    """Read a quantized safetensors file and pack into ``model`` for CUDA.
+    """Stream weights from a quantized safetensors file and pack for CUDA.
 
-    Thin wrapper: ``load`` + ``pack_model``.
+    Uses ``iter_load`` to process one weight at a time, keeping peak
+    memory proportional to the largest single weight instead of loading
+    all weights into memory at once.
     """
-    quantized, unquantized = load(path)
-    pack_model(model, quantized, unquantized, packers or DEFAULT_CUDA_PACKERS)
+    from .pack import pack_one
+    from .serialize import iter_load
+
+    _packers = packers or DEFAULT_CUDA_PACKERS
+
+    for fqn, value in iter_load(path):
+        pack_one(model, fqn, value, _packers)
+
+    for fqn, p in model.named_parameters():
+        if p.device.type == "meta":
+            raise RuntimeError(
+                f"Weight '{fqn}' not found in checkpoint "
+                f"(model/checkpoint version mismatch?)"
+            )
diff --git a/examples/models/gemma4_31b/quant/serialize.py b/examples/models/gemma4_31b/quant/serialize.py
@@ -22,7 +22,7 @@
 
 import json
 from dataclasses import dataclass
-from typing import Optional
+from typing import Iterator, Optional
 
 import torch
 from safetensors import safe_open
@@ -233,3 +233,49 @@ def load(
         header = f.metadata()
         tensors = {k: f.get_tensor(k) for k in f.keys()}
     return deserialize(tensors, header)
+
+
+def iter_load(
+    path: str,
+) -> Iterator[tuple[str, CanonicalQuantizedWeight | torch.Tensor]]:
+    """Stream weights from a safetensors file one at a time.
+
+    Yields ``(fqn, value)`` where *value* is a ``CanonicalQuantizedWeight``
+    for quantized weights or a plain ``torch.Tensor`` for unquantized ones.
+    Quantized weights come first (``quant`` header order), then the rest in
+    file key order.  Tensors are read lazily, so peak memory tracks the
+    largest single weight provided the caller drops each yielded value.
+    """
+    with safe_open(path, framework="pt", device="cpu") as f:
+        # ``metadata()`` returns None when the file carries no metadata.
+        header = f.metadata() or {}
+        quant_meta = json.loads(header.get("quant", "{}"))
+        keys = list(f.keys())
+        all_keys = set(keys)
+        consumed: set[str] = set()
+
+        for fqn, meta in quant_meta.items():
+            config = QuantConfig(
+                bits=meta["bits"],
+                group_size=meta["group_size"],
+                symmetric=meta["symmetric"],
+                method=meta["method"],
+            )
+            qdata = f.get_tensor(f"{fqn}.qdata")
+            if config.bits == 4:
+                # Unpack 4-bit data to the recorded last-dimension size.
+                qdata = _nibble_unpack(qdata, meta["shape"][-1])
+            scale = f.get_tensor(f"{fqn}.scale")
+            # The zero tensor is optional; absent keys leave it as None.
+            zero_key = f"{fqn}.zero"
+            zero = f.get_tensor(zero_key) if zero_key in all_keys else None
+            consumed.update((f"{fqn}.qdata", f"{fqn}.scale", zero_key))
+
+            yield fqn, CanonicalQuantizedWeight(
+                qdata=qdata, scale=scale, zero=zero, config=config
+            )
+
+        # Walk the key list, not the set, so yield order is deterministic.
+        for key in keys:
+            if key not in consumed:
+                yield key, f.get_tensor(key)
diff --git a/examples/models/gemma4_31b/quant/tests/test_serialize.py b/examples/models/gemma4_31b/quant/tests/test_serialize.py
@@ -26,6 +26,7 @@
     _nibble_unpack,
     CanonicalQuantizedWeight,
     deserialize,
+    iter_load,
     load,
     save,
     serialize,
@@ -264,5 +265,69 @@ def test_empty_quantized(self):
         self.assertTrue(torch.equal(unq["w"], u["w"]))
 
 
+class TestIterLoad(unittest.TestCase):
+    """``iter_load`` round-trips: streamed items versus what was saved."""
+
+    def test_yields_all_weights(self):
+        """Every saved weight, quantized or not, is yielded exactly once."""
+        int4 = QuantConfig(bits=4, group_size=32, symmetric=False, method="min_max")
+        int8 = QuantConfig(bits=8, group_size=32, symmetric=True, method="min_max")
+        quantized = {
+            "proj.weight": _make_cqw((64, 128), int4),
+            "embed.weight": _make_cqw((32, 64), int8),
+        }
+        plain = {"norm.weight": torch.randn(64, dtype=torch.bfloat16)}
+
+        with tempfile.TemporaryDirectory() as tmp:
+            ckpt = os.path.join(tmp, "m.safetensors")
+            save(quantized, plain, ckpt)
+            streamed = list(iter_load(ckpt))
+
+        names = {name for name, _ in streamed}
+        for expected in ("proj.weight", "embed.weight", "norm.weight"):
+            self.assertIn(expected, names)
+        self.assertEqual(len(streamed), 3)
+
+    def test_quantized_matches_load(self):
+        """A streamed CQW equals the one returned by batch ``load``."""
+        cfg = QuantConfig(bits=4, group_size=32, symmetric=False, method="min_max")
+        original = _make_cqw((64, 128), cfg)
+
+        with tempfile.TemporaryDirectory() as tmp:
+            ckpt = os.path.join(tmp, "m.safetensors")
+            save({"w": original}, {}, ckpt)
+
+            batch_quantized, _ = load(ckpt)
+            streamed = dict(iter_load(ckpt))
+
+        expected, actual = batch_quantized["w"], streamed["w"]
+        self.assertIsInstance(actual, CanonicalQuantizedWeight)
+        for field in ("qdata", "scale", "zero"):
+            want, got = getattr(expected, field), getattr(actual, field)
+            self.assertTrue(torch.equal(want, got))
+        self.assertEqual(expected.config, actual.config)
+
+    def test_unquantized_matches_load(self):
+        """A streamed plain tensor equals the one returned by batch ``load``."""
+        plain = {"a": torch.randn(8, 16, dtype=torch.bfloat16)}
+
+        with tempfile.TemporaryDirectory() as tmp:
+            ckpt = os.path.join(tmp, "m.safetensors")
+            save({}, plain, ckpt)
+
+            _, batch_plain = load(ckpt)
+            streamed = dict(iter_load(ckpt))
+
+        self.assertTrue(torch.equal(batch_plain["a"], streamed["a"]))
+
+    def test_empty_file(self):
+        """A checkpoint saved with no weights streams zero items."""
+        with tempfile.TemporaryDirectory() as tmp:
+            ckpt = os.path.join(tmp, "m.safetensors")
+            save({}, {}, ckpt)
+            streamed = list(iter_load(ckpt))
+        self.assertEqual(len(streamed), 0)
+
+
 if __name__ == "__main__":
     unittest.main()