Skip to content

Commit 4f6157a

Browse files
make the quant only do what it should do, more modular
1 parent 4a1b6ea commit 4f6157a

File tree

3 files changed

+35
-46
lines changed

3 files changed

+35
-46
lines changed

llmc/__main__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from llmc.models import *
2121
from llmc.utils import (check_config, deploy_all_modality, get_modality,
2222
mkdirs, print_important_package_version, seed_all,
23+
collect_lightllm_kv_calib_json,
2324
update_autoawq_quant_config,
2425
update_lightx2v_quant_config, update_vllm_quant_config)
2526
from llmc.utils.registry_factory import ALGO_REGISTRY, MODEL_REGISTRY
@@ -74,9 +75,9 @@ def main(config):
7475
if int(os.environ['RANK']) == 0:
7576
if 'save' in config and config.save.get('save_lightllm_kv_cache_calib', False):
7677
calib_json_list = [
77-
blockwise_opt.collect_calib_json()
78+
collect_lightllm_kv_calib_json(blockwise_opt)
7879
for blockwise_opt in blockwise_opts
79-
if hasattr(blockwise_opt, 'collect_calib_json')
80+
if hasattr(blockwise_opt, 'quant_kvcache')
8081
]
8182
calib_json_payload = (
8283
calib_json_list[0] if len(calib_json_list) == 1 else calib_json_list

llmc/compression/quantization/base_blockwise_quantization.py

Lines changed: 0 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import copy
22
import functools
33
import gc
4-
import json
54
import os
65
import re
76
from collections import defaultdict
@@ -12,7 +11,6 @@
1211
import torch.nn as nn
1312
from loguru import logger
1413

15-
from llmc.utils.export_calib import collect_lightllm_kv_calib_json
1614
from llmc.utils.registry_factory import KV_REGISTRY, TOKEN_REDUCTION_REGISTRY
1715

1816
from ..blockwise_optimization import BlockwiseOpt
@@ -1011,45 +1009,6 @@ def contiguous_params(self):
10111009
if not param.is_contiguous():
10121010
param.data = param.data.contiguous()
10131011

1014-
# Convert tensors and similar objects into Python values that can be
1015-
# directly serialized into JSON.
1016-
def _to_jsonable(self, value):
1017-
if isinstance(value, torch.Tensor):
1018-
return value.detach().cpu().tolist()
1019-
return value
1020-
1021-
# Normalize inputs into CPU tensors so the following range computation
1022-
# and serialization logic can handle them consistently.
1023-
def _to_tensor(self, value, dtype=torch.float32):
1024-
if isinstance(value, torch.Tensor):
1025-
return value.detach().cpu().to(dtype)
1026-
return torch.as_tensor(value, dtype=dtype)
1027-
1028-
# LightLLM expects offline FP8 KV descales. Recover the real-value range
1029-
# from the qparams first, then convert it into per-layer K/V scales that
1030-
# align with torch.float8_e4m3fn.
1031-
def _collect_lightllm_kv_scale(self, scales, zeros, qmin, qmax):
1032-
if isinstance(scales, torch.Tensor) and scales.numel() == 0:
1033-
return None
1034-
1035-
scales_tensor = self._to_tensor(scales)
1036-
zeros_tensor = self._to_tensor(zeros, dtype=scales_tensor.dtype)
1037-
qmin_tensor = self._to_tensor(qmin, dtype=scales_tensor.dtype)
1038-
qmax_tensor = self._to_tensor(qmax, dtype=scales_tensor.dtype)
1039-
min_tensor = (qmin_tensor - zeros_tensor) * scales_tensor
1040-
max_tensor = (qmax_tensor - zeros_tensor) * scales_tensor
1041-
absmax_tensor = torch.maximum(min_tensor.abs(), max_tensor.abs())
1042-
fp8_qmax = torch.tensor(
1043-
torch.finfo(torch.float8_e4m3fn).max, dtype=absmax_tensor.dtype
1044-
)
1045-
return absmax_tensor / fp8_qmax
1046-
1047-
# Export calibration results in the LightLLM kv_cache_calib.json format.
1048-
# At the moment, only the per_tensor and per_head KV formats supported by
1049-
# LightLLM are handled here.
1050-
def collect_calib_json(self):
1051-
return collect_lightllm_kv_calib_json(self)
1052-
10531012
@torch.no_grad()
10541013
def save_model(self, path):
10551014
if int(os.environ['RANK']) != 0:

llmc/utils/export_calib.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,35 @@
11
import torch
22

33

4+
def _to_jsonable(value):
5+
if isinstance(value, torch.Tensor):
6+
return value.detach().cpu().tolist()
7+
return value
8+
9+
10+
def _to_tensor(value, dtype=torch.float32):
11+
if isinstance(value, torch.Tensor):
12+
return value.detach().cpu().to(dtype)
13+
return torch.as_tensor(value, dtype=dtype)
14+
15+
16+
def _collect_lightllm_kv_scale(scales, zeros, qmin, qmax):
17+
if isinstance(scales, torch.Tensor) and scales.numel() == 0:
18+
return None
19+
20+
scales_tensor = _to_tensor(scales)
21+
zeros_tensor = _to_tensor(zeros, dtype=scales_tensor.dtype)
22+
qmin_tensor = _to_tensor(qmin, dtype=scales_tensor.dtype)
23+
qmax_tensor = _to_tensor(qmax, dtype=scales_tensor.dtype)
24+
min_tensor = (qmin_tensor - zeros_tensor) * scales_tensor
25+
max_tensor = (qmax_tensor - zeros_tensor) * scales_tensor
26+
absmax_tensor = torch.maximum(min_tensor.abs(), max_tensor.abs())
27+
fp8_qmax = torch.tensor(
28+
torch.finfo(torch.float8_e4m3fn).max, dtype=absmax_tensor.dtype
29+
)
30+
return absmax_tensor / fp8_qmax
31+
32+
433
def collect_lightllm_kv_calib_json(blockwise_opt):
534
if not getattr(blockwise_opt, 'quant_kvcache', False):
635
raise ValueError(
@@ -24,13 +53,13 @@ def collect_lightllm_kv_calib_json(blockwise_opt):
2453
)
2554
scales = []
2655
for layer_idx in range(num_layers):
27-
key_scale = blockwise_opt._collect_lightllm_kv_scale(
56+
key_scale = _collect_lightllm_kv_scale(
2857
blockwise_opt.kv_module.k_scales_buffer[layer_idx],
2958
blockwise_opt.kv_module.k_zeros_buffer[layer_idx],
3059
blockwise_opt.kv_module.k_qmin_buffer[layer_idx],
3160
blockwise_opt.kv_module.k_qmax_buffer[layer_idx],
3261
)
33-
value_scale = blockwise_opt._collect_lightllm_kv_scale(
62+
value_scale = _collect_lightllm_kv_scale(
3463
blockwise_opt.kv_module.v_scales_buffer[layer_idx],
3564
blockwise_opt.kv_module.v_zeros_buffer[layer_idx],
3665
blockwise_opt.kv_module.v_qmin_buffer[layer_idx],
@@ -65,5 +94,5 @@ def collect_lightllm_kv_calib_json(blockwise_opt):
6594
'num_layers': num_layers,
6695
'num_head': num_head,
6796
'scales_shape': [num_layers, scale_width],
68-
'scales': scales,
97+
'scales': _to_jsonable(scales),
6998
}

0 commit comments

Comments (0)