Skip to content

Commit d13540b

Browse files
authored
convert : remove input_scale for dequantized fp8 modelopt (#22356)
1 parent f84270e commit d13540b

1 file changed

Lines changed: 21 additions & 30 deletions

File tree

convert_hf_to_gguf.py

Lines changed: 21 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,22 @@ def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Call
272272

273273
return tensors
274274

275+
@staticmethod
276+
def _scale_is_trivial(scale: Tensor) -> bool:
277+
return scale.numel() <= 1 and abs(float(scale.float().sum()) - 1.0) < 1e-6
278+
279+
def _write_scale_tensor(self, scale_name: str, scale: Tensor):
    """Write *scale* to the GGUF output as a flat F32 tensor named *scale_name*.

    Trivial scales (scalar == 1.0, see ``_scale_is_trivial``) are skipped
    entirely so the output file does not grow with no-op tensors.
    """
    # Guard clause: nothing to emit for a trivial scale.
    if self._scale_is_trivial(scale):
        return
    scale_f32 = scale.float().numpy().flatten()
    logger.info(f" + {scale_name} (per-tensor scale, shape [{scale_f32.size}])")
    self.gguf_writer.add_tensor(scale_name, scale_f32)
284+
285+
def _write_scales_tensor(self, scale_name: str, scales: list[float]):
    """Write a list of per-expert scales as one F32 tensor named *scale_name*.

    The tensor is skipped entirely when every entry is 1.0 within 1e-6,
    mirroring the per-tensor behavior of ``_write_scale_tensor``.
    """
    # All-ones scale lists carry no information; emit nothing.
    if np.allclose(scales, 1.0, atol=1e-6):
        return
    scale_vals = np.array(scales, dtype=np.float32)
    logger.info(f" + {scale_name} (per-expert scale, shape [{len(scales)}])")
    self.gguf_writer.add_tensor(scale_name, scale_vals)
290+
275291
def dequant_model(self):
276292
# If all quantized tensors were already handled (e.g. pure NVFP4), skip
277293
if self._is_nvfp4 and not any(k.endswith((".weight_scale", ".weight_scale_inv")) for k in self.model_tensors):
@@ -494,7 +510,7 @@ def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: T
494510
s = self.model_tensors[name]
495511
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
496512
tensors_to_remove.append(name)
497-
if name.endswith((".k_scale", ".v_scale")):
513+
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
498514
tensors_to_remove.append(name)
499515
elif quant_method is not None:
500516
raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
@@ -602,10 +618,6 @@ def _nvfp4_pack(weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]:
602618
raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36)
603619
return raw, [out_features, n_super * 64]
604620

605-
@staticmethod
606-
def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
607-
return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
608-
609621
def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
610622
if "language_model." in name:
611623
name = name.replace("language_model.", "")
@@ -616,19 +628,8 @@ def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor
616628
logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
617629
self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
618630

619-
# Emit per-tensor scale2 as a separate F32 tensor when non-trivial
620-
if not self._nvfp4_scale2_is_trivial(scale2):
621-
scale2_f32 = scale2.float().numpy().flatten()
622-
scale_name = new_name.replace(".weight", ".scale")
623-
logger.info(f" + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
624-
self.gguf_writer.add_tensor(scale_name, scale2_f32)
625-
626-
# Emit per-tensor input_scale as a separate F32 tensor when non-trivial
627-
if not self._nvfp4_scale2_is_trivial(input_scale):
628-
input_scale_f32 = input_scale.float().numpy().flatten()
629-
input_scale_name = new_name.replace(".weight", ".input_scale")
630-
logger.info(f" + {input_scale_name} (per-tensor NVFP4 input_scale, shape [{input_scale_f32.size}])")
631-
self.gguf_writer.add_tensor(input_scale_name, input_scale_f32)
631+
self._write_scale_tensor(new_name.replace(".weight", ".scale"), scale2)
632+
self._write_scale_tensor(new_name.replace(".weight", ".input_scale"), input_scale)
632633

633634
def _generate_nvfp4_tensors(self):
634635
# Per-layer expert merging to avoid holding all experts in memory
@@ -719,21 +720,11 @@ def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_s
719720
logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4")
720721
self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
721722

722-
# Emit per-expert scale2 tensor if any expert has non-trivial scale2
723723
scales.sort(key=lambda x: x[0])
724-
scale_vals = np.array([s[1] for s in scales], dtype=np.float32)
725-
if not np.allclose(scale_vals, 1.0, atol=1e-6):
726-
scale_name = new_name.replace(".weight", ".scale")
727-
logger.info(f" + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
728-
self.gguf_writer.add_tensor(scale_name, scale_vals)
724+
self._write_scales_tensor(new_name.replace(".weight", ".scale"), [s[1] for s in scales])
729725

730-
# Emit per-expert input_scale tensor if any expert has non-trivial input_scale
731726
input_scales.sort(key=lambda x: x[0])
732-
input_scale_vals = np.array([s[1] for s in input_scales], dtype=np.float32)
733-
if not np.allclose(input_scale_vals, 1.0, atol=1e-6):
734-
input_scale_name = new_name.replace(".weight", ".input_scale")
735-
logger.info(f" + {input_scale_name} (per-expert NVFP4 input_scale, shape [{len(input_scales)}])")
736-
self.gguf_writer.add_tensor(input_scale_name, input_scale_vals)
727+
self._write_scales_tensor(new_name.replace(".weight", ".input_scale"), [s[1] for s in input_scales])
737728

738729
del experts, merged
739730

0 commit comments

Comments (0)