Skip to content

Commit c7e10f4

Browse files
committed
[6106576] Address PR review feedback on edgellm shim restoration

- fp4qdq_to_2dq: look up block_size by attribute name instead of position so the shim does not silently use the wrong attribute if TRT_FP4QDQ attribute ordering changes.
- _get_precision_dtype: use onnx.TensorProto.BFLOAT16 instead of the literal 16 for readability.
- nvfp4_exporter: note in the docstrings of _cast_fp4 and _replace_fp4qdq_with_2dq that they are reused by the deprecated qdq_utils.fp4qdq_to_2dq shim, so a future refactor does not silently drop them.
- Add direct smoke tests for quantize_weights_to_int4, quantize_weights_to_mxfp8, and fp4qdq_to_2dq that assert each shim emits a DeprecationWarning and produces the expected end-state graph (the existing tests only exercise the staged exporters).

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
1 parent c61ac69 commit c7e10f4

3 files changed

Lines changed: 105 additions & 2 deletions

File tree

modelopt/onnx/export/nvfp4_exporter.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ def _cast_fp4(array: np.ndarray) -> np.ndarray:
3939
4040
Note: The first dimension of the array must be divisible by 2
4141
as two FP4 values are packed into a single byte.
42+
43+
Also reused by the deprecated ``modelopt.onnx.quantization.qdq_utils.fp4qdq_to_2dq``
44+
compatibility shim. Do not rename or change the signature without updating that
45+
shim (it is a load-bearing re-export for TensorRT-Edge-LLM 0.6.1).
4246
"""
4347
array_f32_t = torch.from_numpy(array)
4448
array_f32_t_shape = array_f32_t.shape
@@ -76,6 +80,10 @@ def _replace_fp4qdq_with_2dq(
7680
):
7781
"""Replaces the given node in the ONNX graph with a subgraph consisting of two DequantizeLinear nodes.
7882
83+
Also reused by the deprecated ``modelopt.onnx.quantization.qdq_utils.fp4qdq_to_2dq``
84+
compatibility shim. Do not rename or change the signature without updating that
85+
shim (it is a load-bearing re-export for TensorRT-Edge-LLM 0.6.1).
86+
7987
Args:
8088
graph: The ONNX graph containing the node to replace.
8189
node: The node to be replaced.

modelopt/onnx/quantization/qdq_utils.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1555,7 +1555,7 @@ def _cast_input_dtypes(node: onnx.NodeProto, precision_dtype: str):
15551555
def _get_precision_dtype() -> str:
15561556
precision_dtype = "Half"
15571557
for initializer in graph.initializer:
1558-
if initializer.data_type == 16:
1558+
if initializer.data_type == onnx.TensorProto.BFLOAT16:
15591559
precision_dtype = "BFloat16"
15601560
break
15611561
return precision_dtype
@@ -1570,7 +1570,9 @@ def _get_precision_dtype() -> str:
15701570
for node in fp4_qdq_nodes:
15711571
idx1 = initializer_indices.get(node.input[0], None)
15721572
assert idx1 is not None, f"Initializer for weight '{node.input[0]}' not found."
1573-
block_size = node.attribute[0].i
1573+
block_size_attr = next((attr for attr in node.attribute if attr.name == "block_size"), None)
1574+
assert block_size_attr is not None, f"block_size attribute not found for {node.name}"
1575+
block_size = block_size_attr.i
15741576
initializers_to_delete.append(initializers[idx1].name)
15751577
logger.debug(
15761578
f"Processing FP4QDQ node for weight {node.input[0]} with block size {block_size}"

tests/unit/onnx/quantization/test_qdq_utils.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1108,3 +1108,96 @@ def test_constant_node_scale_path_still_patched(self):
11081108
scale_arr = numpy_helper.to_array(value_attr.t)
11091109
assert not (scale_arr == 0).any()
11101110
assert (scale_arr > 0).all()
1111+
1112+
1113+
class TestLegacyEdgeLLMShims:
1114+
"""Smoke tests for the deprecated top-level shims kept for TensorRT-Edge-LLM 0.6.1.
1115+
1116+
These are the functions edgellm 0.6.1 imports from
1117+
``modelopt.onnx.quantization.qdq_utils`` directly (not via the staged exporters).
1118+
Tests verify each shim runs end-to-end on the same fixtures used for the staged
1119+
exporters and emits a ``DeprecationWarning``.
1120+
"""
1121+
1122+
def test_quantize_weights_to_int4_shim(self):
1123+
import warnings
1124+
1125+
from modelopt.onnx.quantization.qdq_utils import quantize_weights_to_int4
1126+
1127+
model = create_test_model_with_int4_dq_reshape_transpose_matmul()
1128+
1129+
with warnings.catch_warnings(record=True) as caught:
1130+
warnings.simplefilter("always")
1131+
quantized_model = quantize_weights_to_int4(model)
1132+
1133+
assert any(
1134+
issubclass(w.category, DeprecationWarning)
1135+
and "quantize_weights_to_int4" in str(w.message)
1136+
for w in caught
1137+
)
1138+
1139+
weight_tensor = next(
1140+
init for init in quantized_model.graph.initializer if init.name == "weight"
1141+
)
1142+
assert weight_tensor.data_type == TensorProto.INT4
1143+
1144+
node_types = [node.op_type for node in quantized_model.graph.node]
1145+
assert "Reshape" not in node_types
1146+
assert "Transpose" not in node_types
1147+
1148+
def test_quantize_weights_to_mxfp8_shim(self):
1149+
import warnings
1150+
1151+
from modelopt.onnx.quantization.qdq_utils import quantize_weights_to_mxfp8
1152+
1153+
model = create_test_model_with_mxfp8_dq()
1154+
1155+
with warnings.catch_warnings(record=True) as caught:
1156+
warnings.simplefilter("always")
1157+
quantized_model = quantize_weights_to_mxfp8(model)
1158+
1159+
assert any(
1160+
issubclass(w.category, DeprecationWarning)
1161+
and "quantize_weights_to_mxfp8" in str(w.message)
1162+
for w in caught
1163+
)
1164+
1165+
weight_tensor = next(
1166+
init for init in quantized_model.graph.initializer if init.name == "linear.weight"
1167+
)
1168+
assert weight_tensor.data_type == TensorProto.FLOAT8E4M3FN
1169+
1170+
gelu_node = next(node for node in quantized_model.graph.node if node.op_type == "Gelu")
1171+
approximate_attr = next(attr for attr in gelu_node.attribute if attr.name == "approximate")
1172+
assert approximate_attr.s == b"tanh"
1173+
1174+
@pytest.mark.parametrize("with_transpose", [False, True])
1175+
def test_fp4qdq_to_2dq_shim(self, with_transpose):
1176+
import warnings
1177+
1178+
from modelopt.onnx.quantization.qdq_utils import fp4qdq_to_2dq
1179+
1180+
model = create_test_model_with_nvfp4_qdq(with_transpose=with_transpose)
1181+
1182+
with warnings.catch_warnings(record=True) as caught:
1183+
warnings.simplefilter("always")
1184+
converted_model = fp4qdq_to_2dq(model)
1185+
1186+
assert any(
1187+
issubclass(w.category, DeprecationWarning) and "fp4qdq_to_2dq" in str(w.message)
1188+
for w in caught
1189+
)
1190+
1191+
fp4qdq_nodes = [node for node in converted_model.graph.node if node.op_type == "TRT_FP4QDQ"]
1192+
assert len(fp4qdq_nodes) == 0
1193+
1194+
dq_nodes = [
1195+
node for node in converted_model.graph.node if node.op_type == "DequantizeLinear"
1196+
]
1197+
assert len(dq_nodes) == 2
1198+
1199+
initializer_names = {init.name for init in converted_model.graph.initializer}
1200+
assert "linear.weight_f4" in initializer_names
1201+
assert "linear.weight_f8_scale" in initializer_names
1202+
assert "linear.weight_f8_scale_f32_scale" in initializer_names
1203+
assert "linear.weight" not in initializer_names

0 commit comments

Comments (0)