ModelCloud
diff --git a/gptqmodel/looper/weight_only_looper.py b/gptqmodel/looper/weight_only_looper.py
Lines changed: 3 additions & 3 deletions
diff --git a/gptqmodel/looper/weight_only_processor.py b/gptqmodel/looper/weight_only_processor.py
Lines changed: 15 additions & 11 deletions
diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
Lines changed: 2 additions & 0 deletions
diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
Lines changed: 4 additions & 2 deletions
@@ -24,7 +24,7 @@
 from ..models import BaseQModel
 from ..models._const import CPU, SUPPORTS_MODULE_TYPES
 from ..nn_modules.converter import MODULE_CONVERTER_MAP
-from ..quantization.config import GGUFQuantizeConfig, RTNQuantizeConfig
+from ..quantization.config import FP8Config, GGUFQuantizeConfig, RTNQuantizeConfig
 from ..utils.logger import setup_logger
 from ..utils.model import find_modules, get_module, get_module_by_name_prefix, move_to
 from ..utils.offload import offload_to_disk
@@ -96,9 +96,9 @@ def _offload_quantized_module(self, module: NamedModule) -> None:
     def loop(self, **kwargs):
         """Quantize layers directly from weights without calibration forwards."""
         quant_config = self.gptq_model.quantize_config
-        if not isinstance(quant_config, (RTNQuantizeConfig, GGUFQuantizeConfig)):
+        if not isinstance(quant_config, (RTNQuantizeConfig, GGUFQuantizeConfig, FP8Config)):
             raise NotImplementedError(
-                "Weight-only looper only supports `RTNQuantizeConfig` and `GGUFQuantizeConfig` today."
+                "Weight-only looper only supports `RTNQuantizeConfig`, `GGUFQuantizeConfig`, and `FP8Config` today."
             )
 
         if quant_config.lm_head:
 
@@ -27,6 +27,7 @@
 )
 from ..quantization.config import (
     BaseQuantizeConfig,
+    FP8Config,
     GGUFQuantizeConfig,
     METHOD,
     RTNQuantizeConfig,
@@ -50,7 +51,7 @@ class WeightOnlyProcessor(LoopProcessor):
     def __init__(
         self,
         tokenizer,
-        qcfg: RTNQuantizeConfig | GGUFQuantizeConfig,
+        qcfg: RTNQuantizeConfig | GGUFQuantizeConfig | FP8Config,
     ):
         super().__init__(
             tokenizer=tokenizer,
@@ -67,8 +68,8 @@ def __init__(
         self.lock = threading.Lock()
 
     @staticmethod
-    def _uses_direct_gguf(qcfg: RTNQuantizeConfig | GGUFQuantizeConfig) -> bool:
-        return qcfg.quant_method == METHOD.GGUF
+    def _uses_direct_pack(qcfg: RTNQuantizeConfig | GGUFQuantizeConfig | FP8Config) -> bool:
+        return qcfg.quant_method in {METHOD.GGUF, METHOD.FP8}
 
     def _update_logged_loss(self, module: NamedModule, avg_loss: str) -> None:
         with self.lock:
@@ -94,15 +95,15 @@ def _annotate_tp_padding(self, module: NamedModule, qcfg: BaseQuantizeConfig) ->
             "original_columns": columns,
         }
 
-    def quantize_module(self, module: NamedModule) -> Optional[RTNQuantizeConfig | GGUFQuantizeConfig]:
+    def quantize_module(self, module: NamedModule) -> Optional[RTNQuantizeConfig | GGUFQuantizeConfig | FP8Config]:
         qcfg_clone = clone_weight_only_config_for_module(self.qcfg, module.full_name)
         if qcfg_clone is None:
             return None
 
-        if self._uses_direct_gguf(qcfg_clone):
+        if self._uses_direct_pack(qcfg_clone):
             start_time = time.time()
             duration = time.time() - start_time
-            avg_loss = "gguf: pending"
+            avg_loss = f"{qcfg_clone.quant_method.value}: pending"
             damp_percent = 0.0
             nsamples = 0
         else:
@@ -139,7 +140,7 @@ def quantize_module(self, module: NamedModule) -> Optional[RTNQuantizeConfig | G
             self.log.append(stat)
         self.log_new_row(stat)
 
-        if not self._uses_direct_gguf(qcfg_clone):
+        if not self._uses_direct_pack(qcfg_clone):
             module.weight.data = wq
         return qcfg_clone
 
@@ -148,11 +149,11 @@ def submodule_finalize(
         module: NamedModule,
         model: BaseQModel,
         *,
-        qcfg: Optional[RTNQuantizeConfig | GGUFQuantizeConfig] = None,
+        qcfg: Optional[RTNQuantizeConfig | GGUFQuantizeConfig | FP8Config] = None,
         **kwargs,
     ):
         active_qcfg = qcfg or self.qcfg
-        if not self._uses_direct_gguf(active_qcfg):
+        if not self._uses_direct_pack(active_qcfg):
             module.stream_sync()
             with self.lock:
                 q_zeros = module.state.pop("q_zeros").clone()
@@ -187,6 +188,7 @@ def submodule_finalize(
                     pack_dtype=active_qcfg.pack_dtype,
                     format=resolve_quant_format(active_qcfg.format, active_qcfg.quant_method),
                     register_buffers=False,
+                    init_kwargs=active_qcfg.quant_linear_init_kwargs(),
                 )
         if timer is not None and create_start is not None:
             timer.record("submodule_finalize_create", time.perf_counter() - create_start, source=module_label)
@@ -197,7 +199,7 @@ def submodule_finalize(
             if name == module.full_name
         }
 
-        if self._uses_direct_gguf(active_qcfg):
+        if self._uses_direct_pack(active_qcfg):
             pack_start = time.perf_counter() if timer is not None else None
             with log_time_block("module.pack_original", logger=log, module_name=module_label):
                 with parent_module_lock(parent_key):
@@ -219,7 +221,7 @@ def submodule_finalize(
             reference_weight = qmodule._weight_to_matrix(original_layer).detach().cpu().to(torch.float32)
             dequant_weight = qmodule.dequantize_weight().T.detach().cpu().to(torch.float32)
             mean_abs_err = (dequant_weight - reference_weight).abs().mean().item()
-            self._update_logged_loss(module, f"gguf: {mean_abs_err:.7f}")
+            self._update_logged_loss(module, f"{active_qcfg.quant_method.value}: {mean_abs_err:.7f}")
             module.unregister_parameter("weight")
             return
 
@@ -254,6 +256,8 @@ def finalize(self, model: BaseQModel, **kwargs):
     def name(self) -> str:
         if self.qcfg.quant_method == METHOD.GGUF:
             return "weight_only_gguf"
+        if self.qcfg.quant_method == METHOD.FP8:
+            return "weight_only_fp8"
         return "weight_only_rtn"
 
 __all__ = ["WeightOnlyProcessor"]
@@ -289,6 +289,7 @@ def _is_supported_quantization_config(config: AutoConfig) -> bool:
     if isinstance(quant_format, str) and quant_format.lower() in (
         METHOD.GPTQ,
         METHOD.GGUF,
+        METHOD.FP8,
         METHOD.AWQ,
         METHOD.QQQ,
         METHOD.EXL3,
@@ -299,6 +300,7 @@ def _is_supported_quantization_config(config: AutoConfig) -> bool:
     if isinstance(quant_method, str) and quant_method.lower() in (
         METHOD.GPTQ,
         METHOD.GGUF,
+        METHOD.FP8,
         METHOD.AWQ,
         METHOD.QQQ,
         METHOD.EXL3,
 
@@ -708,6 +708,8 @@ def quantize(
                 preferred_backend = BACKEND.EXLLAMA_V3
             elif self.quantize_config.quant_method == METHOD.GGUF:
                 preferred_backend = BACKEND.AUTO
+            elif self.quantize_config.quant_method == METHOD.FP8:
+                preferred_backend = BACKEND.TORCH
             else:
                 preferred_backend = BACKEND.TORCH
 
@@ -2015,8 +2017,8 @@ def __getattr__(self, item):
     def _auto_detect_module_tree(self, model: PreTrainedModel, quant_method: METHOD):
         log.warn("Model not yet support, attempting Module Tree AutoCompat...")
 
-        if quant_method not in {METHOD.GPTQ, METHOD.GGUF, METHOD.EXL3}:
-            log.warn(f"Module Tree AutoCompat: Failed, quant_method={quant_method}, only support GPTQ/GGUF/EXL3")
+        if quant_method not in {METHOD.GPTQ, METHOD.GGUF, METHOD.FP8, METHOD.EXL3}:
+            log.warn(f"Module Tree AutoCompat: Failed, quant_method={quant_method}, only support GPTQ/GGUF/FP8/EXL3")
             return None
 
         def _get(path):