@@ -272,6 +272,22 @@ def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Call
272272
273273 return tensors
274274
275+ @staticmethod
276+ def _scale_is_trivial(scale: Tensor) -> bool:
277+ return scale.numel() <= 1 and abs(float(scale.float().sum()) - 1.0) < 1e-6
278+
279+ def _write_scale_tensor(self, scale_name: str, scale: Tensor):
280+ if not self._scale_is_trivial(scale):
281+ scale_f32 = scale.float().numpy().flatten()
282+ logger.info(f" + {scale_name} (per-tensor scale, shape [{scale_f32.size}])")
283+ self.gguf_writer.add_tensor(scale_name, scale_f32)
284+
285+ def _write_scales_tensor(self, scale_name: str, scales: list[float]):
286+ if not np.allclose(scales, 1.0, atol=1e-6):
287+ scale_vals = np.array(scales, dtype=np.float32)
288+ logger.info(f" + {scale_name} (per-expert scale, shape [{len(scales)}])")
289+ self.gguf_writer.add_tensor(scale_name, scale_vals)
290+
275291 def dequant_model(self):
276292 # If all quantized tensors were already handled (e.g. pure NVFP4), skip
277293 if self._is_nvfp4 and not any(k.endswith((".weight_scale", ".weight_scale_inv")) for k in self.model_tensors):
@@ -494,7 +510,7 @@ def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: T
494510 s = self.model_tensors[name]
495511 self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
496512 tensors_to_remove.append(name)
497- if name.endswith((".k_scale", ".v_scale")):
513+        if name.endswith((".input_scale", ".k_scale", ".v_scale")):
498514 tensors_to_remove.append(name)
499515 elif quant_method is not None:
500516 raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
@@ -602,10 +618,6 @@ def _nvfp4_pack(weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]:
602618 raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36)
603619 return raw, [out_features, n_super * 64]
604620
605- @staticmethod
606- def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
607- return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
608-
609621 def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
610622 if "language_model." in name:
611623 name = name.replace("language_model.", "")
@@ -616,19 +628,8 @@ def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor
616628 logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
617629 self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
618630
619- # Emit per-tensor scale2 as a separate F32 tensor when non-trivial
620- if not self._nvfp4_scale2_is_trivial(scale2):
621- scale2_f32 = scale2.float().numpy().flatten()
622- scale_name = new_name.replace(".weight", ".scale")
623- logger.info(f" + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
624- self.gguf_writer.add_tensor(scale_name, scale2_f32)
625-
626- # Emit per-tensor input_scale as a separate F32 tensor when non-trivial
627- if not self._nvfp4_scale2_is_trivial(input_scale):
628- input_scale_f32 = input_scale.float().numpy().flatten()
629- input_scale_name = new_name.replace(".weight", ".input_scale")
630- logger.info(f" + {input_scale_name} (per-tensor NVFP4 input_scale, shape [{input_scale_f32.size}])")
631- self.gguf_writer.add_tensor(input_scale_name, input_scale_f32)
631+ self._write_scale_tensor(new_name.replace(".weight", ".scale"), scale2)
632+ self._write_scale_tensor(new_name.replace(".weight", ".input_scale"), input_scale)
632633
633634 def _generate_nvfp4_tensors(self):
634635 # Per-layer expert merging to avoid holding all experts in memory
@@ -719,21 +720,11 @@ def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_s
719720 logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4")
720721 self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
721722
722- # Emit per-expert scale2 tensor if any expert has non-trivial scale2
723723 scales.sort(key=lambda x: x[0])
724- scale_vals = np.array([s[1] for s in scales], dtype=np.float32)
725- if not np.allclose(scale_vals, 1.0, atol=1e-6):
726- scale_name = new_name.replace(".weight", ".scale")
727- logger.info(f" + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
728- self.gguf_writer.add_tensor(scale_name, scale_vals)
724+ self._write_scales_tensor(new_name.replace(".weight", ".scale"), [s[1] for s in scales])
729725
730- # Emit per-expert input_scale tensor if any expert has non-trivial input_scale
731726 input_scales.sort(key=lambda x: x[0])
732- input_scale_vals = np.array([s[1] for s in input_scales], dtype=np.float32)
733- if not np.allclose(input_scale_vals, 1.0, atol=1e-6):
734- input_scale_name = new_name.replace(".weight", ".input_scale")
735- logger.info(f" + {input_scale_name} (per-expert NVFP4 input_scale, shape [{len(input_scales)}])")
736- self.gguf_writer.add_tensor(input_scale_name, input_scale_vals)
727+ self._write_scales_tensor(new_name.replace(".weight", ".input_scale"), [s[1] for s in input_scales])
737728
738729 del experts, merged
739730
0 commit comments