Skip to content

Commit f412e29

Browse files
committed
fix: address PR review feedback (realAsma)

- Fold pre_quant_scale on GPU before the .cpu() move (perf fix).
- Use torch.allclose instead of torch.equal in the test (nit).

Signed-off-by: Sungsoo Ha <sungsooh@nvidia.com>
1 parent c6b93b9 commit f412e29

2 files changed

Lines changed: 3 additions & 3 deletions

File tree

modelopt/torch/export/plugins/vllm_fakequant_hf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,11 +105,11 @@ def _process_weight(item: _WeightQuantWork) -> tuple[str, torch.Tensor, str | None]:
     Returns (sd_key, quantized_weight_on_cpu, inp_q_key_or_None).
     """
     w = item.weight
-    w_quant = item.quantizer(w.float()).to(w.dtype).cpu()
+    w_quant = item.quantizer(w.float()).to(w.dtype)
     if item.inp_q is not None:
         scale = item.inp_q._pre_quant_scale.squeeze().to(device=w_quant.device)
         w_quant = (w_quant * scale[None, :]).to(w_quant.dtype)
-    return item.sd_key, w_quant, item.inp_q_key
+    return item.sd_key, w_quant.cpu(), item.inp_q_key


 def _process_device_batch(items: list[_WeightQuantWork], device: torch.device):

tests/gpu/torch/export/test_vllm_fakequant_hf_parallel_export.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def forward_loop(model):

     assert seq_sd.keys() == par_sd.keys(), "Key mismatch between sequential and parallel export"
     for key in seq_sd:
-        assert torch.equal(seq_sd[key], par_sd[key]), (
+        assert torch.allclose(seq_sd[key], par_sd[key]), (
             f"Weight mismatch for {key}: max diff={torch.abs(seq_sd[key] - par_sd[key]).max()}"
         )

0 commit comments

Comments (0)