fix(quant): make embedding input_quantizer absorb wildcard configs

ajrasane · ajrasane · commit a932284fe68f · 2026-05-19T20:25:55.000Z
The previous design raised a RuntimeError in _QuantEmbedding.forward
whenever input_quantizer.is_enabled was true, on the theory that any
config that left it enabled was an explicit user mistake. That assumption
was wrong for wildcard configs: the default QuantizeConfig is just
[{"quantizer_name": "*", "cfg": {"num_bits": 8, ...}}] (no embedding
opt-out), so the wildcard enables embed_tokens.input_quantizer in the
tiny Llama-style tests and the forward guard fires, breaking
test_peft_save_load and test_transformers_save_load.

Switch _UnsettableInputQuantizer.set_from_attribute_config to absorb the
incoming config like a normal quantizer, then force _disabled = True at
the end. The "throw on explicit set" semantics are preserved via the
.enable / .enable_quant / .enable_calib overrides, which catch the direct
mistakes users would actually make. The forward-time guard (and the
corresponding test) are removed since the invariant is now maintained at
the configure step.

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
diff --git a/modelopt/torch/quantization/nn/modules/quant_embedding.py b/modelopt/torch/quantization/nn/modules/quant_embedding.py
@@ -39,12 +39,13 @@ class _UnsettableInputQuantizer(TensorQuantizer):
     """TensorQuantizer slot for nn.Embedding.input — present but not enable-able.
 
     Embedding inputs are integer indices that cannot be fake-quantized. The attribute
-    is kept so introspection code (export, calibration helpers) can find it. Wildcard
-    configs (e.g. ``NVFP4_DEFAULT_CFG``'s ``*input_quantizer``) are accepted silently
-    so that the standard "deny-all → enable wildcards → opt-out specific layers"
-    pattern in the stock configs still works. Direct calls to ``enable*()`` raise
-    immediately, and ``_QuantEmbedding.forward`` raises if the final state ends up
-    enabled (e.g. a user explicitly targeted this quantizer).
+    is kept so introspection code (export, calibration helpers) can find it.
+
+    Wildcard configs (e.g. the default ``QuantizeConfig`` ``"*"`` rule or
+    ``NVFP4_DEFAULT_CFG``'s ``*input_quantizer``) are accepted silently, then the
+    quantizer is force-disabled — wildcards don't really mean "enable embedding
+    input quant", they mean "enable input quant in general". Direct, explicit
+    attempts (calling ``enable``/``enable_quant``/``enable_calib``) raise loudly.
     """
 
     def enable(self):
@@ -59,21 +60,34 @@ def enable_calib(self):
         """Disallowed for embedding inputs."""
         raise RuntimeError(_INPUT_QUANTIZER_ERR)
 
+    def set_from_attribute_config(self, attribute_cfg):
+        """Apply the config like any quantizer, then force-disable us.
+
+        This absorbs wildcard configs from stock recipes without raising. The
+        quantizer's other attributes (``num_bits``, ``axis``, etc.) take on the
+        config values for introspection, but ``_disabled`` is forced back to
+        ``True`` so forward is always a no-op.
+        """
+        super().set_from_attribute_config(attribute_cfg)
+        self._disabled = True
+
 
 @QuantModuleRegistry.register({nn.Embedding: "nn.Embedding"})
 class _QuantEmbedding(QuantModule):
     """Quantized version of ``nn.Embedding``.
 
     The literal input to ``nn.Embedding`` is integer indices, which cannot be
     fake-quantized. The ``input_quantizer`` attribute is kept (for symmetry with
-    other quant modules and for introspection by export/calibration code) but
-    configuring it raises — see ``_UnsettableInputQuantizer``. Only the embedding
+    other quant modules and for introspection by export/calibration code) but is
+    permanently disabled — see ``_UnsettableInputQuantizer``. Only the embedding
     table (weight) and the lookup output (an activation feeding downstream layers)
     are quantizable.
 
     Quantizer roles:
         - ``weight_quantizer``: quantizes the embedding table (``self.weight``).
-        - ``input_quantizer``: permanently disabled placeholder — raises on configure.
+        - ``input_quantizer``: permanently disabled placeholder — direct
+          ``enable*()`` calls raise; configs that target it are absorbed and the
+          quantizer is force-disabled.
         - ``output_quantizer``: optional activation quantizer for the lookup output,
           disabled by default.
     """
@@ -119,10 +133,13 @@ def _setup(self):
         self._register_dynamic_attribute("weight", self._get_quantized_weight)
 
     def forward(self, input, *args, **kwargs):
-        """Quantize the embedding table, look up, then optionally quantize the output."""
-        if self.input_quantizer.is_enabled:
-            # Caught any config or call that managed to flip _disabled to False.
-            raise RuntimeError(_INPUT_QUANTIZER_ERR)
+        """Quantize the embedding table, look up, then optionally quantize the output.
+
+        ``input_quantizer`` is intentionally never applied — embedding inputs are
+        integer indices. ``_UnsettableInputQuantizer.set_from_attribute_config``
+        keeps that quantizer disabled regardless of what configs target it, so we
+        rely on that invariant rather than a runtime check here.
+        """
         if is_torch_export_mode():
             # quantize_weight()'s attribute write is not allowed under torch.export;
             # weight quantization is still applied inline via _get_quantized_weight's
diff --git a/tests/unit/torch/quantization/test_quant_embedding.py b/tests/unit/torch/quantization/test_quant_embedding.py
@@ -92,24 +92,22 @@ def test_input_quantizer_mutators_raise(self, method):
         with pytest.raises(RuntimeError, match="nn.Embedding"):
             getattr(qemb.input_quantizer, method)()
 
-    def test_forward_raises_if_input_quantizer_enabled(self):
-        """Forward catches back-door flips of input_quantizer._disabled."""
-        qemb = _make_quant_embedding()
-        qemb.input_quantizer._disabled = False
-        with pytest.raises(RuntimeError, match="nn.Embedding"):
-            qemb(torch.randint(0, VOCAB_SIZE, (4, 6)))
+    def test_wildcard_config_keeps_input_quantizer_disabled(self):
+        """set_from_attribute_config absorbs any cfg but force-disables input_quantizer.
 
-    def test_wildcard_config_accepted_then_opt_out(self):
-        """Wildcard cfg on ``*input_quantizer`` must not raise — stock NVFP4_DEFAULT_CFG relies on it.
-        A follow-up ``enable: false`` rule restores the disabled state."""
+        Stock recipes' ``*input_quantizer`` wildcard (and the default ``QuantizeConfig``
+        ``"*"`` rule) target every quantizer including the embedding's input slot.
+        The quantizer must end up disabled regardless of what the cfg said.
+        """
         qemb = _make_quant_embedding()
         set_quantizer_attributes_partial(
             qemb,
             "*input_quantizer",
             QuantizerAttributeConfig(num_bits=8, axis=None).model_dump(),
         )
-        set_quantizer_attributes_partial(qemb, "*input_quantizer", {"enable": False})
-        qemb(torch.randint(0, VOCAB_SIZE, (4, 6)))  # forward succeeds
+        assert not qemb.input_quantizer.is_enabled
+        # Forward still works — input_quantizer is disabled and never applied.
+        qemb(torch.randint(0, VOCAB_SIZE, (4, 6)))
 
 
 def _embedding_nvfp4_cfg() -> dict: