fix(peft): gate FP8+PEFT config kwarg injection on is_hf_model (#2169)

HuiyingLi · claude · web-flow · commit 79218ca12c0d · 2026-05-08T10:10:28.000-07:00
* fix(peft): gate FP8+PEFT config kwarg injection on is_hf_model When PEFT is requested and the HF config has FP8 quantization, the FP8 dequantize path injected `kwargs["config"] = _hf_config` so that HF's from_pretrained would honor the in-memory `dequantize=True` mutation. But the same kwargs flow into the custom-model branch in model_init.py, which calls `model_cls(hf_config, *model_args, **kwargs)` — passing hf_config positionally — and the duplicate `config` produced: TypeError: MiniMaxM2ForCausalLM.__init__() got multiple values for argument 'config' Gate the kwargs injection on is_hf_model. The dequantize=True flag is mutated in place on the quantization_config dict, so the custom-model path still observes it via the positional hf_config argument; only the HF path needs it surfaced as a kwarg. Fixes: #2164 Signed-off-by: HuiyingLi <willwin.lee@gmail.com> Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: HuiyingLi <willwin.lee@gmail.com> * test: cover fp8 peft config kwarg gate Signed-off-by: HuiyingLi <willwin.lee@gmail.com> --------- Signed-off-by: HuiyingLi <willwin.lee@gmail.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/nemo_automodel/_transformers/auto_model.py b/nemo_automodel/_transformers/auto_model.py
@@ -353,7 +353,12 @@ def _retry(**override):
         )
         _hf_native_quant_cfg = getattr(_hf_config, "quantization_config", None)
         if _maybe_dequantize_fp8_for_peft(_hf_native_quant_cfg, peft_config, pretrained_model_name_or_path_or_config):
-            kwargs["config"] = _hf_config
+            # Only HF's from_pretrained needs `config` in kwargs (it would otherwise
+            # re-read config from disk and lose the in-memory dequantize=True mutation).
+            # Custom models receive _hf_config positionally in model_init.py and would
+            # collide with kwargs["config"] (issue #2164).
+            if is_hf_model:
+                kwargs["config"] = _hf_config
 
         # Use meta device initialization when:
         # - Not using MegatronFSDPManager or DDPManager (they handle their own initialization)
diff --git a/tests/unit_tests/_transformers/test_fp8_peft_dequantize.py b/tests/unit_tests/_transformers/test_fp8_peft_dequantize.py
@@ -14,9 +14,9 @@
 
 """Tests for FP8 model + PEFT dequantization logic."""
 
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, patch
 
-from nemo_automodel._transformers.auto_model import _maybe_dequantize_fp8_for_peft
+from nemo_automodel._transformers.auto_model import _BaseNeMoAutoModelClass, _maybe_dequantize_fp8_for_peft
 
 # ---------------------------------------------------------------------------
 # Tests: FP8 + PEFT auto-dequantize
@@ -173,3 +173,149 @@ def test_meta_device_disabled_single_gpu_hf_model(self):
             hf_native_quant_cfg=None,
         )
         assert result is False
+
+
+# ---------------------------------------------------------------------------
+# Tests: kwargs["config"] injection gated on is_hf_model (issue #2164)
+# ---------------------------------------------------------------------------
+
+
+class TestKwargsConfigInjectionGate:
+    """Tests for the is_hf_model gate on kwargs["config"] = _hf_config injection.
+
+    Custom models receive _hf_config positionally in model_init.py:783 via
+    model_cls(hf_config, *model_args, **kwargs); injecting config into kwargs
+    causes a TypeError ("got multiple values for argument 'config'"). The gate
+    suppresses the injection for the custom-model path while preserving the
+    in-place dequantize=True mutation needed by the HF path.
+    """
+
+    @staticmethod
+    def _make_build_kwargs(is_hf_model):
+        """Minimal kwargs for running _build_model through the FP8+PEFT gate."""
+        mesh = MagicMock()
+        mesh.tp_size = 1
+        mesh.cp_size = 1
+        return dict(
+            is_hf_model=is_hf_model,
+            use_liger_kernel=False,
+            use_sdpa_patching=False,
+            sdpa_method=None,
+            torch_dtype="auto",
+            attn_implementation="eager",
+            quantization_config=None,
+            force_hf=False,
+            model_wrapper=None,
+            autopipeline=None,
+            parallelize_fn=None,
+            qat_quantizer=None,
+            mesh=mesh,
+            loss_fn=None,
+            peft_config=MagicMock(),
+            fp8_config=None,
+            compile_config=None,
+            load_base_model=True,
+        )
+
+    @staticmethod
+    def _run_build_model_with_native_fp8(is_hf_model):
+        quant_cfg = {"quant_method": "fp8", "dequantize": False}
+        hf_config = MagicMock()
+        hf_config.quantization_config = quant_cfg
+        sentinel_model = MagicMock()
+
+        with (
+            patch("nemo_automodel._transformers.auto_model._apply_preload_overrides", return_value=("eager", False)),
+            patch("nemo_automodel._transformers.auto_model.get_hf_config", return_value=hf_config),
+            patch("nemo_automodel._transformers.auto_model._init_model") as mock_init,
+            patch("nemo_automodel._transformers.auto_model.get_world_size_safe", return_value=1),
+            patch("nemo_automodel._transformers.auto_model._verify_sdpa_support"),
+            patch(
+                "nemo_automodel._transformers.capabilities.attach_capabilities_and_validate",
+                return_value=sentinel_model,
+            ),
+            patch("nemo_automodel._transformers.auto_model.apply_model_infrastructure", return_value=sentinel_model),
+            patch("torch.cuda.current_device", return_value=0),
+        ):
+            mock_init.return_value = (not is_hf_model, sentinel_model)
+            result = _BaseNeMoAutoModelClass._build_model(
+                "some-model",
+                **TestKwargsConfigInjectionGate._make_build_kwargs(is_hf_model),
+            )
+
+        return quant_cfg, hf_config, result, sentinel_model, mock_init
+
+    def test_build_model_hf_fp8_peft_injects_config_kwarg(self):
+        """_build_model should pass mutated config through kwargs for HF from_pretrained."""
+        quant_cfg, hf_config, result, sentinel_model, mock_init = self._run_build_model_with_native_fp8(
+            is_hf_model=True
+        )
+
+        assert result is sentinel_model
+        assert mock_init.call_args.kwargs["config"] is hf_config
+        assert quant_cfg["dequantize"] is True
+
+    def test_build_model_custom_fp8_peft_does_not_inject_config_kwarg(self):
+        """_build_model should not pass duplicate config kwargs for custom model init."""
+        quant_cfg, _hf_config, result, sentinel_model, mock_init = self._run_build_model_with_native_fp8(
+            is_hf_model=False
+        )
+
+        assert result is sentinel_model
+        assert "config" not in mock_init.call_args.kwargs
+        assert quant_cfg["dequantize"] is True
+
+    @staticmethod
+    def _apply_gate(hf_native_quant_cfg, peft_config, pretrained_path, is_hf_model, hf_config_obj):
+        """Replicate the gated kwargs["config"] injection from _build_model."""
+        kwargs: dict = {}
+        if _maybe_dequantize_fp8_for_peft(hf_native_quant_cfg, peft_config, pretrained_path):
+            if is_hf_model:
+                kwargs["config"] = hf_config_obj
+        return kwargs
+
+    def test_hf_model_fp8_peft_injects_config_kwarg(self):
+        """HF path needs config in kwargs so HF.from_pretrained sees the dequantize mutation."""
+        quant_cfg = {"quant_method": "fp8", "dequantize": False}
+        hf_config = MagicMock()
+        hf_config.quantization_config = quant_cfg
+
+        kwargs = self._apply_gate(quant_cfg, MagicMock(), "some-model", is_hf_model=True, hf_config_obj=hf_config)
+
+        assert "config" in kwargs
+        assert kwargs["config"] is hf_config
+        assert quant_cfg["dequantize"] is True
+
+    def test_custom_model_fp8_peft_does_not_inject_config_kwarg(self):
+        """Custom-model path receives hf_config positionally; injecting config would TypeError (#2164)."""
+        quant_cfg = {"quant_method": "fp8", "dequantize": False}
+        hf_config = MagicMock()
+        hf_config.quantization_config = quant_cfg
+
+        kwargs = self._apply_gate(quant_cfg, MagicMock(), "some-model", is_hf_model=False, hf_config_obj=hf_config)
+
+        assert "config" not in kwargs
+        # Dequantize mutation must still be applied so the custom path sees it via the
+        # positional hf_config argument.
+        assert quant_cfg["dequantize"] is True
+
+    def test_no_peft_does_not_inject_regardless_of_is_hf_model(self):
+        """When PEFT is not configured, no injection happens on either path."""
+        quant_cfg = {"quant_method": "fp8", "dequantize": False}
+        hf_config = MagicMock()
+
+        kwargs_hf = self._apply_gate(quant_cfg, None, "some-model", is_hf_model=True, hf_config_obj=hf_config)
+        kwargs_custom = self._apply_gate(quant_cfg, None, "some-model", is_hf_model=False, hf_config_obj=hf_config)
+
+        assert "config" not in kwargs_hf
+        assert "config" not in kwargs_custom
+        assert quant_cfg["dequantize"] is False
+
+    def test_non_fp8_quant_does_not_inject(self):
+        """Non-FP8 quant configs (e.g. GPTQ) are not the FP8+PEFT case; no injection."""
+        quant_cfg = {"quant_method": "gptq", "bits": 4}
+        hf_config = MagicMock()
+
+        kwargs = self._apply_gate(quant_cfg, MagicMock(), "some-model", is_hf_model=True, hf_config_obj=hf_config)
+
+        assert "config" not in kwargs