Commit 3e8fc7b
Quant in checkpoint dtype (#18781)
Switches the order in etLLM so we quantize in the checkpoint dtype and then cast to the dtype override. This can prevent underflow of the quantization scales. Also exposes the ability to turn HQQ on/off.

Export:

```
python -m extension.llm.export.export_llm \
  base.model_class=phi_4_mini \
  base.params=examples/models/phi_4_mini/config/config.json \
  model.use_kv_cache=true \
  model.use_sdpa_with_kv_cache=true \
  model.dtype_override=fp32 \
  export.output_dir=/tmp/phi_4_mini_no_hqq \
  export.output_name=model.pte \
  export.max_seq_length=2048 \
  export.max_context_length=2048 \
  quantization.qmode=8da4w \
  quantization.group_size=32 \
  "quantization.embedding_quantize='8,0'" \
  quantization.use_hqq=False \
  backend.xnnpack.enabled=true \
  backend.xnnpack.extended_ops=true
```

Phi4 output:

```
<|im_start|>system
You are a highly capable, helpful, and honest AI assistant designed to provide clear, accurate, and thoughtful responses to a wide range of questions. Your primary goal is to assist users by offering information, explanations, and guidance in a manner that is respectful, unbiased, and safe. Always strive to be as helpful as possible, but never provide content that is harmful, unethical, offensive, or illegal. If a question is unclear, nonsensical, or based on incorrect premises, politely explain the issue rather than attempting to answer inaccurately. If you do not know the answer to a question, it is better to admit uncertainty than to provide false or misleading information. When appropriate, include examples, analogies, or step-by-step reasoning to enhance understanding. Your responses should be positive, inclusive, and supportive, fostering a constructive and informative interaction.<|im_end|>
<|im_start|>user
Please answer the following question in detail and provide relevant context, examples, and explanations where possible: What are some of the most important considerations when designing a machine learning system for real-world applications? Discuss potential challenges, best practices, and how to ensure ethical and responsible use.<|im_end|>
<|im_start|>assistant
Designing a machine learning system for real-world applications involves various considerations to ensure the system is effective, fair, and secure. Some of the most important considerations include data quality and sourcing, model choice and design, evaluation and validation, interpretability and transparency, and ensuring fairness and avoiding biases. Data quality and sourcing involve ensuring data is of high quality, representative of the target application, and properly curated and preprocessed to remove noise and biases. Model choice and design involve selecting an appropriate model for the application, understanding the strengths and limitations of different models, and understanding the application domain and data. Model evaluation and validation involve properly training and tuning the model on a training set and properly validating and testing the model on a separate validation set to avoid data leakage and
```

Related work: an improvement to torchao's HQQ algorithm that helps with Phi4's model distribution: pytorch/ao#4259
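The underflow the commit message refers to can be seen in isolation. The following is a standalone sketch (not ExecuTorch code, and the weight magnitude is an assumed illustrative value): symmetric int4 quantization computes `scale = max|w| / 7` per group, and if that division happens after the model has already been cast to fp16, the scale can flush to zero, which would then make `w / scale` produce inf/nan quantized values.

```python
import torch

# Plausible magnitude for a near-zero weight group (assumed for illustration).
max_abs = 1e-7

# New order: compute the scale in the checkpoint dtype (fp32) first.
s = torch.tensor(max_abs, dtype=torch.float32) / 7  # small but nonzero

# Old order: cast to the dtype override first, then quantize.
# 1e-7 is still representable as an fp16 subnormal, but dividing by 7
# lands below the smallest fp16 subnormal and flushes to zero.
s16 = torch.tensor(max_abs, dtype=torch.float16) / 7

print(s.item(), s16.item())
```

A zero scale is unrecoverable: the quantized integers computed from it are garbage, so computing scales in the checkpoint dtype before any cast sidesteps the problem.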
1 parent 74403e2 commit 3e8fc7b

File tree

3 files changed
+18
-9
lines changed

examples/models/llama/export_llama_lib.py

Lines changed: 9 additions & 6 deletions

```diff
@@ -743,10 +743,9 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
             f"Checkpoint dtype {checkpoint_dtype} precision is higher than dtype override {dtype_override.to_torch_dtype()}."
         )
 
-    edge_manager.model = edge_manager.model.to(dtype=dtype_override.to_torch_dtype())
-
-    # We want to quantize (in the source transforms) the weights of the model
-    # in the checkpoint dtype.
+    # Quantize weights in checkpoint dtype for accuracy, then cast to
+    # dtype_override afterward. IntxUnpackedToInt8Tensor.to() properly
+    # propagates the dtype change to scale/zero_point/output dtype.
     logging.info(f"Checkpoint dtype: {edge_manager.model.checkpoint_dtype}")
     edge_manager = edge_manager.set_output_dir(output_dir_path).source_transform(
         _get_source_transforms(
@@ -791,9 +790,14 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
             local_global_attention=llm_config.model.local_global_attention,
             use_torchao_kernels_linear=llm_config.backend.torchao.use_torchao_kernels_linear,
             use_torchao_kernels_tied_embedding=llm_config.backend.torchao.use_torchao_kernels_tied_embedding,
+            quantize_with_hqq=llm_config.quantization.use_hqq,
         )
     )
 
+    # Now cast to the dtype override after quantization, so non-quantized
+    # components use the desired computation dtype.
+    edge_manager.model = edge_manager.model.to(dtype=dtype_override.to_torch_dtype())
+
     return edge_manager
 
 
@@ -1736,8 +1740,7 @@ def _get_source_transforms(  # noqa
         get_quant_embedding_transform(
             embedding_quantize,
             use_shared_embedding,
-            checkpoint_dtype,
-            quantize_with_hqq,
+            quantize_with_hqq=quantize_with_hqq,
         )
     )
 
```
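The reordering in this file can be reduced to a small order-of-operations sketch. Here `quantize_` is a hypothetical stand-in for the real source transforms; the point is only that quantization must observe weights in the checkpoint dtype, with the cast to the dtype override happening afterward:

```python
import torch
import torch.nn as nn

def quantize_(model: nn.Module) -> None:
    # Placeholder for the real source transforms: scales would be computed
    # here, so every float weight should still be in the checkpoint dtype.
    for p in model.parameters():
        assert p.dtype == torch.float32, "quantize in checkpoint dtype"

checkpoint_dtype, dtype_override = torch.float32, torch.float16

model = nn.Linear(8, 8).to(checkpoint_dtype)
quantize_(model)                  # 1) quantize in checkpoint dtype
model = model.to(dtype_override)  # 2) only then cast to dtype_override
```

With the old order the `.to(dtype_override)` call came before `quantize_`, so scale computation ran in the (possibly lower-precision) override dtype.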

examples/models/llama/source_transformation/quantize.py

Lines changed: 8 additions & 3 deletions

```diff
@@ -755,14 +755,21 @@ def forward(self, indices: torch.Tensor) -> torch.Tensor:
             self.weight, self.scales, None, -8, 7, indices, dtype=self.dtype
         )
 
+    def _apply(self, fn, recurse=True):
+        """Override _apply to update self.dtype when the module is cast via .to(dtype)."""
+        super()._apply(fn, recurse)
+        # Probe the new dtype from the scales buffer, which gets cast by super()._apply.
+        if self.scales is not None:
+            self.dtype = self.scales.dtype
+        return self
+
 
 ############################ Source Transform Start #######################
 
 
 def get_quant_embedding_transform(
     embedding_quantize: str,
     use_shared_embedding: bool = False,
-    dtype_override: Optional[DType] = None,
     quantize_with_hqq: bool = True,
 ):
     if embedding_quantize.startswith("torchao:"):
@@ -817,13 +824,11 @@ def _torchao_embedding_quantizer(model):
     else:
         group_size = int(group_size)
         bitwidth = int(bitwidth)
-        torch_dtype = dtype_override.to_torch_dtype() if dtype_override else None
     return lambda model: EmbeddingQuantHandler(
         model,
         bitwidth=bitwidth,
         group_size=group_size,
         packed=(bitwidth in [2, 4]),
-        precision=torch_dtype,
         quantize_with_hqq=quantize_with_hqq,
     ).quantized_model()
 
```

extension/llm/export/config/llm_config.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -429,6 +429,7 @@ class QuantizationConfig:
     calibration_limit: Optional[int] = None
     calibration_seq_length: Optional[int] = None
     calibration_data: str = "Once upon a time"
+    use_hqq: bool = True
 
     def __post_init__(self):
         if self.qmode:
```
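The new flag is a plain dataclass field with a `True` default, so HQQ stays on unless explicitly disabled (as `quantization.use_hqq=False` does in the export command above). A reduced sketch of the config shape, using only field names visible in the diff (the surrounding `LlmConfig` wiring is omitted):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class QuantizationConfig:
    qmode: Optional[str] = None
    group_size: Optional[int] = None
    calibration_limit: Optional[int] = None
    calibration_seq_length: Optional[int] = None
    calibration_data: str = "Once upon a time"
    use_hqq: bool = True  # new: toggle HQQ during weight quantization

# Mirrors the export example: 8da4w quantization with HQQ turned off.
cfg = QuantizationConfig(qmode="8da4w", group_size=32, use_hqq=False)
print(cfg.use_hqq)
```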
