@@ -17,16 +17,17 @@
 
 import torch
 import torch.nn as nn
-from _test_utils.torch.export.utils import ToyModel, partial_w4a8_config
+from _test_utils.torch.export.utils import ToyModel, partial_nvfp4_config, partial_w4a8_config
 from torch.nn import functional as F
 from torch.nn import init
 
 import modelopt.torch.quantization as mtq
 from modelopt.torch.export.unified_export_hf import _export_quantized_weight
+from modelopt.torch.quantization.nn import NVFP4StaticQuantizer
 from modelopt.torch.quantization.nn.modules.quant_module import QuantModule, QuantModuleRegistry
 from modelopt.torch.quantization.nn.modules.tensor_quantizer import TensorQuantizer
 from modelopt.torch.quantization.tensor_quant import QUANT_DESC_8BIT_PER_TENSOR
-from modelopt.torch.quantization.utils import quantizer_attr_names
+from modelopt.torch.quantization.utils import quantizer_attr_names, reduce_block_amax
 
 
 class ToyLinear(nn.Module):
@@ -121,3 +122,65 @@ def test_export_per_block_quantized_weight():
     assert hasattr(model.linears[2], quantizer_attrs.output_quantizer)
     assert not getattr(model.linears[2], quantizer_attrs.output_quantizer).is_enabled
     assert not hasattr(model.linears[2], quantizer_attrs.output_scale)
+
+
+def test_export_nvfp4_static_weight_dynamic_vs_static_match():
+    """Dynamic and static NVFP4 export yield identical weights and scales even when amax buffers
+    are cleared on one layer (_ensure_weight_quantizer_calibrated lazily refills them from the weights).
+    """
+    device = "cuda"
+    dims = [32, 32, 32, 32]
+    block_size = 16
+    calib_input = torch.randn(1, 4, 32, device=device)
+    nvfp4_layer_indices = [1, 2]  # layers with NVFP4 enabled in partial_nvfp4_config
+
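+    # Two identically seeded models: the first keeps dynamic NVFP4 weight quantizers,
+    # the second is converted to NVFP4StaticQuantizer below.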
+    torch.manual_seed(42)
+    model_dynamic = ToyModel(dims=dims).to(device)
+    mtq.quantize(model_dynamic, partial_nvfp4_config, lambda x: x(calib_input))
+
+    torch.manual_seed(42)
+    model_static = ToyModel(dims=dims).to(device)
+    mtq.quantize(model_static, partial_nvfp4_config, lambda x: x(calib_input))
+
+    # Convert NVFP4 layers to NVFP4StaticQuantizer with per-block and global amax
+    for idx in nvfp4_layer_indices:
+        layer = model_static.linears[idx]
+        weight = layer.weight.data
+        per_block_amax = reduce_block_amax(weight, block_sizes={-1: block_size})
+        tq = layer.weight_quantizer
+        if hasattr(tq, "_amax"):
+            delattr(tq, "_amax")
+        tq.register_buffer("_amax", per_block_amax.detach().clone())
+        NVFP4StaticQuantizer.from_tensor_quantizer(tq, global_amax=per_block_amax.max())
+
+    # Clear amax buffers on layer 1 to exercise lazy calibration during export
+    for linear, is_static in [(model_dynamic.linears[1], False), (model_static.linears[1], True)]:
+        wq = linear.weight_quantizer
+        if hasattr(wq, "_amax"):
+            delattr(wq, "_amax")
+        if is_static and hasattr(wq, "_global_amax"):
+            delattr(wq, "_global_amax")
+
+    quantizer_attrs = quantizer_attr_names("weight")
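+    # Export both models; layer 1 must lazily re-derive its amax from the weight during export.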
+    for idx in nvfp4_layer_indices:
+        _export_quantized_weight(model_dynamic.linears[idx], torch.float32, "weight")
+        _export_quantized_weight(model_static.linears[idx], torch.float32, "weight")
+
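+    # Exported weights and both scale levels should match across the dynamic and static paths.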
+    for idx in nvfp4_layer_indices:
+        dyn_linear = model_dynamic.linears[idx]
+        sta_linear = model_static.linears[idx]
+        assert torch.equal(dyn_linear.weight, sta_linear.weight), (
+            f"Layer {idx}: exported NVFP4 weight should match (dynamic vs static)"
+        )
+        assert torch.allclose(
+            getattr(dyn_linear, quantizer_attrs.weight_scale).float(),
+            getattr(sta_linear, quantizer_attrs.weight_scale).float(),
+        ), f"Layer {idx}: weight_scale should match"
+        assert torch.allclose(
+            getattr(dyn_linear, quantizer_attrs.weight_scale_2).float(),
+            getattr(sta_linear, quantizer_attrs.weight_scale_2).float(),
+        ), f"Layer {idx}: weight_scale_2 should match"
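
For context, a minimal sketch of the per-block amax reduction the test relies on, assuming `reduce_block_amax(weight, block_sizes={-1: 16})` takes the absolute maximum over contiguous 16-element blocks of the last dimension (the function name below is hypothetical; the real helper lives in `modelopt.torch.quantization.utils`):

```python
import torch

def reduce_block_amax_sketch(weight: torch.Tensor, block_size: int) -> torch.Tensor:
    # Split the last dim into contiguous blocks and take the abs-max of each block.
    out_features, in_features = weight.shape
    blocks = weight.reshape(out_features, in_features // block_size, block_size)
    return blocks.abs().amax(dim=-1)

w = torch.randn(32, 32)
amax = reduce_block_amax_sketch(w, 16)  # shape (32, 2): one amax per 16-wide block
```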
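And a rough sketch of the two scale levels the asserts compare, assuming the usual NVFP4 scheme (FP4 E2M1 elements with max value 6, per-block scales stored in FP8 E4M3 with max value 448, plus one global FP32 scale). This illustrates the assumed math, not the library's export code:

```python
import torch

FP4_MAX = 6.0    # largest magnitude representable in FP4 (E2M1)
FP8_MAX = 448.0  # largest magnitude representable in FP8 (E4M3)

def nvfp4_scales_sketch(weight: torch.Tensor, block_size: int = 16):
    blocks = weight.reshape(weight.shape[0], -1, block_size)
    block_amax = blocks.abs().amax(dim=-1)
    # Global per-tensor scale (corresponds to weight_scale_2 in the test).
    scale_2 = block_amax.max() / (FP4_MAX * FP8_MAX)
    # Per-block scales relative to the global scale, stored in FP8
    # (corresponds to weight_scale in the test).
    scale = (block_amax / (FP4_MAX * scale_2)).to(torch.float8_e4m3fn)
    return scale, scale_2
```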