# Source: GSQ/convert_to_humming.py (IST-DASLab/GSQ, branch main)
"""Convert an assembled GSQ compressed-tensors checkpoint into a Humming-format
checkpoint that `humming.layer.HummingLayer.from_safetensors` can load.

Input  (from `save_model.py`):
    <in_dir>/config.json               with quantization_config.format=pack-quantized
    <in_dir>/model-*.safetensors       per-shard tensors
    <in_dir>/model.safetensors.index.json

Per-Linear inputs in safetensors:
    <prefix>.weight_packed   int32 [N, K * num_bits // 32]
    <prefix>.weight_scale    bf16/fp32 [N, K // group_size]
    <prefix>.weight_shape    int64 [2]

Output:
    <out_dir>/config.json              with quantization_config.quant_method=humming
                                       and a per-layer `dynamic` map encoding
                                       the effective bits.
    <out_dir>/model-*.safetensors      every quantized Linear rewritten as:
        <prefix>.weight         int32 [N, K * eff_bits // 32]  (Humming packed)
        <prefix>.weight_scale   bf16  [N, K // group_size]
        <prefix>.zero_point     bf16  [N, K // group_size]
    All non-quantized tensors are copied through unchanged.

Verification:
    --verify-one <regex>   pick the first matching Linear, build a HummingLayer
                           from the converted tensors + config, run a forward
                           pass against a random bf16 activation, compare to
                           `x @ W_deq.T` where W_deq is the CT dequantization
                           of the same Linear. Reports max abs / rel error.

Usage:
    . ~/local/venvs/main/bin/activate
    export PATH=$CUDA_HOME/bin:$PATH

    # Convert + write a new checkpoint dir:
    python convert_to_humming.py \\
        --in-dir  /path/to/assembled \\
        --out-dir /path/to/assembled-humming \\
        --verify-one '.*\\.layers\\.0\\..*'

    # Verify only (no write):
    python convert_to_humming.py --in-dir /path/to/assembled --verify-only \\
        --verify-one '.*\\.layers\\.0\\..*'
"""

from __future__ import annotations

import argparse
import json
import os
import re
import shutil
import sys
import time
from pathlib import Path
from typing import Dict, List, Tuple

import torch
from safetensors import safe_open
from safetensors.torch import save_file

REPO_ROOT = Path(__file__).resolve().parent
sys.path.insert(0, str(REPO_ROOT))

from src.quantization.humming_pack import (  # noqa: E402
    ct_dequantize_reference,
    ct_to_humming,
)


CT_SUFFIXES = ("weight_packed", "weight_scale", "weight_shape")


def load_quant_config(in_dir: Path) -> Tuple[Dict, Dict]:
    """Load ``config.json`` from ``in_dir`` and validate its quantization section.

    Args:
        in_dir: Checkpoint directory containing ``config.json``.

    Returns:
        ``(full_config, weight_subconfig)`` where ``weight_subconfig`` is
        ``quantization_config.config_groups.group_0.weights``.

    Raises:
        ValueError: if ``quant_method`` is not ``compressed-tensors`` or
            ``format`` is not ``pack-quantized``.
        KeyError: if a required key is missing from the config.
    """
    config_path = in_dir / "config.json"
    full_config = json.loads(config_path.read_text())
    quant_section = full_config["quantization_config"]

    method = quant_section["quant_method"]
    if method != "compressed-tensors":
        raise ValueError(f"expected compressed-tensors, got {method}")

    fmt = quant_section.get("format")
    if fmt != "pack-quantized":
        raise ValueError(f"expected pack-quantized format, got {fmt}")

    return full_config, quant_section["config_groups"]["group_0"]["weights"]


def discover_layers(in_dir: Path) -> Dict[str, Dict[str, Tuple[str, str]]]:
    """Find every Linear whose three compressed-tensors entries are all present.

    The tensor-name -> shard mapping comes from ``model.safetensors.index.json``
    when it exists, otherwise from the keys of a single ``model.safetensors``.

    Returns:
        ``{prefix: {suffix: (key, shard_path)}}`` where ``prefix`` is the Linear
        module name (e.g. 'model.layers.0.mlp.gate_proj') and ``suffix`` is one
        of ``CT_SUFFIXES``. Prefixes missing any suffix are left out.

    Raises:
        FileNotFoundError: if neither the index nor the single file exists.
    """
    index_file = in_dir / "model.safetensors.index.json"
    single_file = in_dir / "model.safetensors"

    if index_file.exists():
        weight_map = json.loads(index_file.read_text())["weight_map"]
        key_to_shard = {name: str(in_dir / rel) for name, rel in weight_map.items()}
    elif single_file.exists():
        with safe_open(str(single_file), framework="pt", device="cpu") as handle:
            key_to_shard = dict.fromkeys(handle.keys(), str(single_file))
    else:
        raise FileNotFoundError(f"no safetensors index/file under {in_dir}")

    found: Dict[str, Dict[str, Tuple[str, str]]] = {}
    for tensor_name, shard_path in key_to_shard.items():
        # First CT suffix (in declaration order) that this key ends with, if any.
        suffix = next(
            (s for s in CT_SUFFIXES if tensor_name.endswith("." + s)), None
        )
        if suffix is None:
            continue
        # Strip ".<suffix>" to recover the module name.
        module_name = tensor_name[: -(len(suffix) + 1)]
        found.setdefault(module_name, {})[suffix] = (tensor_name, shard_path)

    required = set(CT_SUFFIXES)
    return {name: parts for name, parts in found.items() if set(parts) == required}


def load_layer_tensors(
    shard_map: Dict[str, Tuple[str, str]],
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Load one Linear's ``weight_packed``, ``weight_scale`` and ``weight_shape``.

    Args:
        shard_map: ``{suffix: (tensor_key, shard_path)}`` for a single Linear;
            the entries may point at different shard files.

    Returns:
        ``(weight_packed, weight_scale, weight_shape)`` read onto the CPU.

    Raises:
        KeyError: if any of the three suffixes is absent from ``shard_map``.
    """
    # Bucket the requested tensors by shard so each file is opened only once.
    wanted: Dict[str, List[Tuple[str, str]]] = {}
    for suffix, (tensor_key, shard_path) in shard_map.items():
        wanted.setdefault(shard_path, []).append((suffix, tensor_key))

    loaded: Dict[str, torch.Tensor] = {}
    for shard_path, entries in wanted.items():
        with safe_open(shard_path, framework="pt", device="cpu") as handle:
            for suffix, tensor_key in entries:
                loaded[suffix] = handle.get_tensor(tensor_key)

    packed = loaded["weight_packed"]
    scale = loaded["weight_scale"]
    shape = loaded["weight_shape"]
    return packed, scale, shape


def convert_layer(
    weight_packed: torch.Tensor,
    weight_scale: torch.Tensor,
    weight_shape: torch.Tensor,
    *,
    storage_bits: int,
    group_size: int,
    symmetric: bool,
    target_dtype: torch.dtype,
    symmetric_out: bool = False,
):
    """Convert one Linear's compressed-tensors triple by delegating to ``ct_to_humming``.

    The three tensors are passed positionally and every option by keyword.
    The result of ``ct_to_humming`` is returned unchanged; callers in this
    file unpack it as ``(tensors, cfg, info)``.
    """
    conversion_options = dict(
        storage_bits=storage_bits,
        group_size=group_size,
        symmetric=symmetric,
        target_dtype=target_dtype,
        symmetric_out=symmetric_out,
    )
    return ct_to_humming(
        weight_packed, weight_scale, weight_shape, **conversion_options
    )


def write_humming_checkpoint(
    in_dir: Path,
    out_dir: Path,
    layer_map: Dict[str, Dict[str, Tuple[str, str]]],
    storage_bits: int,
    group_size: int,
    symmetric: bool,
    target_dtype: torch.dtype,
    symmetric_out: bool = False,
):
    """Rewrite the checkpoint streaming, one input shard at a time.

    Peak RSS is meant to stay bounded by the largest single shard, not the
    whole model -- so this scales to Kimi-K2.5 (~32 GB packed) on a
    workstation.

    For every shard listed in the input index (or the single
    ``model.safetensors``), each quantized Linear in ``layer_map`` is converted
    with ``convert_layer`` and written as ``<prefix>.weight``,
    ``<prefix>.weight_scale`` and, when present, ``<prefix>.zero_point``. All
    other tensors are copied through. Afterwards a new index (when applicable),
    the tokenizer/auxiliary files and an updated ``config.json`` are written.

    Args:
        in_dir: Source checkpoint directory.
        out_dir: Destination directory (created if missing).
        layer_map: ``{prefix: {suffix: (key, shard_path)}}`` for the quantized
            Linears, as produced by ``discover_layers``.
        storage_bits, group_size, symmetric, target_dtype, symmetric_out:
            forwarded to ``convert_layer``; ``group_size`` and ``symmetric_out``
            are also recorded in the output ``config.json``.

    Raises:
        NotImplementedError: if a Linear's three CT tensors are not all in the
            same shard.
        RuntimeError: if converting a Linear fails (the original exception is
            chained as the cause).
    """
    import gc

    out_dir.mkdir(parents=True, exist_ok=True)

    # Load original index.
    idx_path = in_dir / "model.safetensors.index.json"
    if idx_path.exists():
        wmap_orig = json.loads(idx_path.read_text())["weight_map"]
    else:
        with safe_open(str(in_dir / "model.safetensors"), framework="pt", device="cpu") as f:
            wmap_orig = {k: "model.safetensors" for k in f.keys()}

    # Map shard filename -> list of original keys.
    by_shard: Dict[str, List[str]] = {}
    for k, shard in wmap_orig.items():
        by_shard.setdefault(shard, []).append(k)

    # Assert per-Linear colocation: all three CT suffixes for a prefix sit in
    # the same shard (true for GSQ save_model.py output, which writes one shard
    # per transformer layer). Streaming-per-shard relies on this.
    shard_of_prefix: Dict[str, str] = {}
    for prefix, sm in layer_map.items():
        shards = {Path(sm[s][1]).name for s in CT_SUFFIXES}
        if len(shards) != 1:
            raise NotImplementedError(
                f"prefix {prefix!r} has CT tensors split across {shards}; "
                f"streaming converter requires colocation. Re-run save_model.py "
                f"or fall back to a non-streaming converter."
            )
        shard_of_prefix[prefix] = shards.pop()

    prefixes_in_shard: Dict[str, List[str]] = {}
    for p, sh in shard_of_prefix.items():
        prefixes_in_shard.setdefault(sh, []).append(p)

    per_layer_cfg: Dict[str, Dict] = {}
    # effective bit-width -> number of converted Linears using it.
    total_eff_bits_counter: Dict[int, int] = {}
    new_wmap: Dict[str, str] = {}
    total_bytes = 0

    shards_sorted = sorted({Path(s).name for s in by_shard})
    t0 = time.perf_counter()
    progress_total = len(layer_map)
    done = 0

    print(f"streaming {len(shards_sorted)} shards from {in_dir} to {out_dir}...")
    for shard_idx, shard_name in enumerate(shards_sorted, start=1):
        src_path = in_dir / shard_name
        out_path = out_dir / shard_name
        prefixes_here = sorted(prefixes_in_shard.get(shard_name, []))
        # Original CT keys are replaced by the converted tensors, so they must
        # not be copied through in step 2.
        ct_keys_to_drop = {f"{p}.{suf}" for p in prefixes_here for suf in CT_SUFFIXES}

        tensors_out: Dict[str, torch.Tensor] = {}
        with safe_open(str(src_path), framework="pt", device="cpu") as f:
            # 1) Convert quantized Linears that live in this shard. Materialize
            #    only one Linear's CT tensors at a time so RSS stays low.
            for prefix in prefixes_here:
                wp = f.get_tensor(f"{prefix}.weight_packed")
                ws = f.get_tensor(f"{prefix}.weight_scale")
                wsh = f.get_tensor(f"{prefix}.weight_shape")
                try:
                    tensors, cfg, info = convert_layer(
                        wp, ws, wsh,
                        storage_bits=storage_bits, group_size=group_size,
                        symmetric=symmetric, target_dtype=target_dtype,
                        symmetric_out=symmetric_out,
                    )
                except Exception as e:
                    # Re-raise with the layer name so a failure is attributable.
                    raise RuntimeError(f"failed to convert {prefix}: {e}") from e
                # Drop the CT inputs immediately.
                del wp, ws, wsh

                per_layer_cfg[prefix] = cfg
                total_eff_bits_counter[info["effective_bits"]] = \
                    total_eff_bits_counter.get(info["effective_bits"], 0) + 1

                tensors_out[f"{prefix}.weight"] = tensors["weight"].contiguous()
                tensors_out[f"{prefix}.weight_scale"] = tensors["weight_scale"].contiguous()
                if "zero_point" in tensors:
                    tensors_out[f"{prefix}.zero_point"] = tensors["zero_point"].contiguous()
                done += 1

            # 2) Pass-through non-quantized tensors from the same shard.
            for k in f.keys():
                if k in ct_keys_to_drop:
                    continue
                tensors_out[k] = f.get_tensor(k).contiguous()

        # 3) Bookkeeping + write.
        for k, v in tensors_out.items():
            new_wmap[k] = shard_name
            total_bytes += v.numel() * v.element_size()
        save_file(tensors_out, str(out_path))

        dt = time.perf_counter() - t0
        print(f"  [{shard_idx}/{len(shards_sorted)}] {shard_name}: "
              f"{len(tensors_out)} tensors ({done}/{progress_total} converted)  "
              f"elapsed={dt:.1f}s", flush=True)

        # 4) Free everything so the next shard starts from a clean RSS.
        del tensors_out
        gc.collect()

    print(f"\neffective-bit histogram: {total_eff_bits_counter}")

    # Write a new index.
    if idx_path.exists() or len(by_shard) > 1:
        new_index = {"metadata": {"total_size": int(total_bytes)}, "weight_map": new_wmap}
        (out_dir / "model.safetensors.index.json").write_text(json.dumps(new_index, indent=2))

    # Copy tokenizer / generation_config / etc.
    keep_files = [
        "tokenizer.json", "tokenizer_config.json", "vocab.json", "merges.txt",
        "tokenizer.model", "tiktoken.model",
        "special_tokens_map.json", "generation_config.json", "preprocessor_config.json",
    ]
    for fn in os.listdir(in_dir):
        src = in_dir / fn
        if not src.is_file():
            continue
        if fn in keep_files or fn.endswith(".py"):
            shutil.copy2(src, out_dir / fn)

    # Write the updated config.json with humming quantization_config.
    cfg_full = json.loads((in_dir / "config.json").read_text())
    old_qc = cfg_full["quantization_config"]
    ignore = old_qc.get("ignore", [])

    # Build a `dynamic` map: regex -> per-layer humming config. Use the layer
    # prefix as an exact-match regex (escaped). The HummingLayer.from_safetensors
    # treats `regex[2:]` as the actual pattern (the first two chars are a flag).
    # HummingLayer.from_safetensors parses entries as:
    #   regex[0] != "-"  → assert prefix does NOT match (exclusion)
    #   regex[0] == "-"  → if match, override config with the linked entry
    # So we use "-:" for per-layer overrides.
    dynamic: Dict[str, Dict] = {}
    for prefix, cfg_l in per_layer_cfg.items():
        pat = "-:" + re.escape(prefix) + "$"
        dynamic[pat] = cfg_l

    # Pick a "default" config from the most-common bitwidth so the top-level
    # config_groups entry is at least sensible. HummingLayer.from_safetensors
    # uses `config_groups` when `dynamic` doesn't match; the per-layer dynamic
    # entries take precedence.
    if total_eff_bits_counter:
        most_common_bits = max(total_eff_bits_counter, key=total_eff_bits_counter.get)
    else:
        # No Linear was converted (empty layer_map): max() on an empty
        # histogram would raise after all shards are already on disk. Fall
        # back to the input storage width so config.json is still written.
        most_common_bits = storage_bits
    top_cfg = {
        "config_groups": {
            "group_0": {
                "input_activations": None,
                "output_activations": None,
                "targets": ["Linear"],
                "weights": {
                    "dtype": f"uint{most_common_bits}",
                    "group_size": group_size,
                    "has_zero_point": not symmetric_out,
                    "is_fp_zero_point": not symmetric_out,
                    "num_bits": most_common_bits,
                    "strategy": "group",
                    "symmetric": bool(symmetric_out),
                    "type": "int",
                },
            }
        },
        "format": "pack-quantized",
        "ignore": ignore,
        "quant_method": "humming",
        "quantization_status": "compressed",
        "dynamic": dynamic,
    }
    cfg_full["quantization_config"] = top_cfg
    (out_dir / "config.json").write_text(json.dumps(cfg_full, indent=2))

    print(f"\nwrote {len(per_layer_cfg)} quantized layers, "
          f"~{total_bytes / 1024**3:.2f} GB total")
    print(f"output: {out_dir}")


def verify_one(
    layer_map: Dict[str, Dict[str, Tuple[str, str]]],
    pattern: str,
    storage_bits: int,
    group_size: int,
    symmetric: bool,
    target_dtype: torch.dtype,
    symmetric_out: bool = False,
):
    """Pick the first Linear matching `pattern`, convert it, run a kernel
    forward, and compare to the CT-dequant matmul reference.

    Candidates are the keys of ``layer_map`` in sorted order; ``pattern`` is
    applied with ``re.search``. The forward pass runs on the ``cuda`` device
    with a seeded random ``(16, K)`` activation.

    Raises:
        SystemExit: if no layer matches ``pattern``, or if the max relative
            error exceeds 5e-2 or is NaN.
    """
    pat = re.compile(pattern)
    matches = [p for p in sorted(layer_map) if pat.search(p)]
    if not matches:
        raise SystemExit(f"no layer matched verify-one pattern {pattern!r}; "
                         f"first few layers: {sorted(layer_map)[:5]}")
    name = matches[0]
    print(f"verifying {name}  (symmetric_out={symmetric_out})")
    wp, ws, wsh = load_layer_tensors(layer_map[name])
    ref = ct_dequantize_reference(
        wp, ws, wsh, storage_bits=storage_bits, group_size=group_size,
        target_dtype=target_dtype,
    )
    tensors, cfg, info = ct_to_humming(
        wp, ws, wsh,
        storage_bits=storage_bits, group_size=group_size, symmetric=symmetric,
        target_dtype=target_dtype, symmetric_out=symmetric_out,
    )
    print(f"  info: {info}")
    print(f"  schema: {cfg}")

    # Build kernel layer and forward. Imported lazily so that conversion
    # without --verify-one does not need the humming package.
    from humming import dtypes
    from humming.layer import HummingLayer
    from humming.schema.humming import HummingWeightSchema

    schema = HummingWeightSchema(
        b_dtype=dtypes.DataType.from_str(cfg["dtype"]),
        weight_scale_group_size=cfg["group_size"],
        has_zero_point=cfg["has_zero_point"],
        is_fp_zero_point=cfg["is_fp_zero_point"],
    )
    N, K = info["N"], info["K"]
    device = "cuda"
    layer = HummingLayer(
        shape_n=N, shape_k=K, weight_config=schema, torch_dtype=target_dtype,
    ).to(device)
    layer.load_from_tensors({k: v.to(device) for k, v in tensors.items()})
    layer.transform()

    # Fixed seed so the reported error is reproducible between runs.
    torch.manual_seed(0)
    x = (torch.randn(16, K, dtype=target_dtype, device=device) * 0.05)
    y_hum = layer(x)
    y_ref = x @ ref.to(device).t()
    diff = (y_hum.float() - y_ref.float()).abs()
    # Max abs error normalized by the reference's max magnitude (floored to
    # avoid dividing by zero).
    rel = diff.max().item() / max(y_ref.float().abs().max().item(), 1e-6)
    print(f"  forward: max_abs={diff.max().item():.3e}  "
          f"mean={diff.mean().item():.3e}  rel_max={rel:.3e}")
    # Written as a negated `<=` so a NaN error (every comparison with NaN is
    # False) fails the check instead of being reported as OK.
    if not (rel <= 5e-2):
        raise SystemExit(f"forward error too large: rel_max={rel}")
    print("  OK")


def parse_args():
    """Parse the command-line arguments from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with ``in_dir``, ``out_dir``, ``verify_only``,
        ``verify_one``, ``target_dtype`` and ``symmetric``.
    """
    # The module docstring is the help *description*. ArgumentParser's first
    # positional parameter is `prog`, so it must be passed by keyword;
    # RawDescriptionHelpFormatter keeps the docstring's layout intact.
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--in-dir", required=True, help="Path to assembled GSQ checkpoint dir.")
    p.add_argument("--out-dir", default=None, help="Output dir for converted humming checkpoint.")
    p.add_argument("--verify-only", action="store_true",
                   help="Only verify, don't write anything.")
    p.add_argument("--verify-one", default=None,
                   help="Regex; verify the first matching Linear by running a kernel forward "
                        "pass and comparing to the dequant reference.")
    p.add_argument("--target-dtype", default="bfloat16", choices=["bfloat16", "float16"])
    p.add_argument("--symmetric", action="store_true",
                   help="Emit Humming's offset-binary symmetric format: no per-layer "
                        "zero_point tensor; the kernel applies an implicit "
                        "2^(eff_bits-1) offset. Requires the GSQ codebook to span the "
                        "full unsigned range (true for current 2/3/4-bit Gumbel quantizers).")
    return p.parse_args()


def main():
    """CLI entry point: optionally verify one layer, then convert the checkpoint.

    Reads the quantization parameters from the input ``config.json``, discovers
    the quantized Linears, runs ``verify_one`` when ``--verify-one`` is given,
    and unless ``--verify-only`` is set writes the converted checkpoint.

    Raises:
        SystemExit: if a write is requested without ``--out-dir``, or if
            ``--out-dir`` resolves to the same directory as ``--in-dir``.
    """
    args = parse_args()
    in_dir = Path(args.in_dir).resolve()
    target_dtype = getattr(torch, args.target_dtype)

    # Validate the output location up front, so a missing/invalid --out-dir
    # fails before config loading and the (GPU) verification pass.
    out_dir = None
    if not args.verify_only:
        if args.out_dir is None:
            raise SystemExit("pass --out-dir (or --verify-only)")
        out_dir = Path(args.out_dir).resolve()
        if out_dir == in_dir:
            # The writer saves each shard under its original filename, so
            # converting in place would overwrite the input checkpoint.
            raise SystemExit("--out-dir must be a different directory than --in-dir")

    _, weight_cfg = load_quant_config(in_dir)
    storage_bits = int(weight_cfg["num_bits"])
    group_size = int(weight_cfg["group_size"])
    symmetric = bool(weight_cfg["symmetric"])
    print(f"input checkpoint: {in_dir}")
    print(f"  storage_bits={storage_bits}  group_size={group_size}  symmetric={symmetric}")

    layer_map = discover_layers(in_dir)
    print(f"  found {len(layer_map)} quantized Linear modules")

    if args.verify_one:
        verify_one(layer_map, args.verify_one,
                   storage_bits=storage_bits, group_size=group_size, symmetric=symmetric,
                   target_dtype=target_dtype, symmetric_out=args.symmetric)

    if args.verify_only:
        return

    write_humming_checkpoint(
        in_dir, out_dir, layer_map,
        storage_bits=storage_bits, group_size=group_size, symmetric=symmetric,
        target_dtype=target_dtype, symmetric_out=args.symmetric,
    )


if __name__ == "__main__":
    main()