Arm backend: Preserve MXFP Conv2d output dtype (#20513)

YufengShi-dudu · web-flow · commit 5a920c37f7e6 · 2026-06-25T14:57:19.000+01:00
Infer the MXFP Conv2d wrapper output dtype from the source Conv2d weight dtype, matching the MXFP linear path. Cast the custom op output back to bf16 when the original module is bf16, while keeping the MXFP TOSA op output in fp32.

Add AO, export, and rewrite pass tests covering the default fp32 constructor path and inferred bf16 output preservation.

Change-Id: I48fb70157439650b329d7db35fd794200fe1545d
cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani
Signed-off-by: Yufeng Shi <yufeng.shi@arm.com>
diff --git a/backends/arm/ao_ext/ops/mxfp_conv2d_op.py b/backends/arm/ao_ext/ops/mxfp_conv2d_op.py
@@ -32,6 +32,12 @@
 )
 
 
+_SUPPORTED_OUTPUT_DTYPES: set[torch.dtype] = {  # checked by transform_conv2d_to_mxfp
+    torch.float32,  # forward() returns the op result without a cast
+    torch.bfloat16,  # forward() casts the op result to this dtype
+}
+
+
 def _get_mx_elem_dtype(
     weight_qdata: torch.Tensor,
     weight_payload_dtype: str = "",
@@ -208,10 +214,12 @@ def __init__(
         groups: int,
         weight_dtype: MXFPDType,
         block_size: int,
+        output_dtype: torch.dtype = torch.float32,
     ) -> None:
         super().__init__()
         self.weight_dtype = mxfp_dtype_to_str(weight_dtype)
         self.block_size = block_size
+        self.output_dtype = output_dtype
 
         self.register_buffer("weight_qdata", weight_qdata, persistent=True)
         self.register_buffer("weight_scale", weight_scale, persistent=True)
@@ -233,7 +241,7 @@ def __init__(
         self.groups = groups
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return torch.ops.tosa_mxfp.conv2d.default(
+        output = torch.ops.tosa_mxfp.conv2d.default(
             x,
             self.weight_qdata,
             self.weight_scale,
@@ -245,6 +253,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             self.block_size,
             self.weight_dtype,
         )
+        if self.output_dtype != torch.float32:
+            output = output.to(self.output_dtype)
+        return output
 
 
 def transform_conv2d_to_mxfp(
@@ -276,6 +287,9 @@ def transform_conv2d_to_mxfp(
     )
 
     bias = module.bias.detach().to(torch.float32) if module.bias is not None else None
+    output_dtype = weight_ohwi.dtype
+    if output_dtype not in _SUPPORTED_OUTPUT_DTYPES:
+        raise ValueError(f"Unsupported output_dtype: {output_dtype}")
     return MXFPConv2dOp(
         weight_qdata,
         weight_scale,
@@ -286,4 +300,5 @@ def transform_conv2d_to_mxfp(
         module.groups,
         config.weight_dtype,
         config.block_size,
+        output_dtype,
     )
diff --git a/backends/arm/test/misc/test_mxfp_conv2d_ao.py b/backends/arm/test/misc/test_mxfp_conv2d_ao.py
@@ -159,6 +159,61 @@ def test_mxfp_conv2d_quantize_supports_fp4_weights() -> None:
     )
 
 
+def test_mxfp_conv2d_preserves_bfloat16_output_dtype() -> None:
+    model = Conv2dModule().eval().to(torch.bfloat16)  # bf16 source weights
+    to_mxfp(
+        model,
+        MXFPOpConfig(weight_dtype=torch.float8_e4m3fn),
+    )
+
+    output = model(torch.randn(1, IN_CHANNELS, 8, 8, dtype=torch.bfloat16))
+
+    assert isinstance(model.conv, MXFPConv2dOp)  # conv was replaced by to_mxfp
+    assert model.conv.output_dtype == torch.bfloat16  # inferred, never passed in
+    assert output.dtype == torch.bfloat16  # forward() cast the op result to bf16
+
+
+def test_mxfp_conv2d_op_output_dtype_constructor_arg() -> None:
+    model = Conv2dModule().eval()
+    config = MXFPOpConfig(weight_dtype=torch.float8_e4m3fn)
+    to_mxfp(
+        model,
+        config,
+    )
+    assert isinstance(model.conv, MXFPConv2dOp)  # its buffers are reused below
+
+    fp32_conv = MXFPConv2dOp(  # output_dtype omitted: defaults to torch.float32
+        model.conv.weight_qdata,
+        model.conv.weight_scale,
+        model.conv.bias,
+        model.conv.stride,
+        model.conv.padding,
+        model.conv.dilation,
+        model.conv.groups,
+        config.weight_dtype,
+        config.block_size,
+    )
+    bf16_conv = MXFPConv2dOp(  # same arguments plus an explicit bf16 output
+        model.conv.weight_qdata,
+        model.conv.weight_scale,
+        model.conv.bias,
+        model.conv.stride,
+        model.conv.padding,
+        model.conv.dilation,
+        model.conv.groups,
+        config.weight_dtype,
+        config.block_size,
+        output_dtype=torch.bfloat16,
+    )
+
+    test_input = torch.randn(1, IN_CHANNELS, 8, 8)  # shared by both ops
+
+    assert fp32_conv.output_dtype == torch.float32
+    assert fp32_conv(test_input).dtype == torch.float32  # no cast on this path
+    assert bf16_conv.output_dtype == torch.bfloat16
+    assert bf16_conv(test_input).dtype == torch.bfloat16  # cast in forward()
+
+
 def _test_mxfp_conv2d_export_preserves_custom_op(config: MXFPOpConfig) -> None:
     model = Conv2dModule().eval()
     to_mxfp(model, config)
@@ -198,6 +253,33 @@ def test_mxfp6_e3m2_conv2d_export_preserves_custom_op() -> None:
     )
 
 
+def test_mxfp_conv2d_export_preserves_inferred_bfloat16_output_dtype() -> None:
+    model = Conv2dModule().eval().to(torch.bfloat16)  # bf16 source weights
+    to_mxfp(
+        model,
+        MXFPOpConfig(weight_dtype=torch.float8_e4m3fn),
+    )
+
+    exported = export(
+        model,
+        (torch.randn(1, IN_CHANNELS, 8, 8, dtype=torch.bfloat16),),
+        strict=False,
+    )
+
+    cast_nodes = [  # all aten.to.dtype calls in the exported graph
+        node
+        for node in exported.graph_module.graph.nodes
+        if node.op == "call_function" and node.target == torch.ops.aten.to.dtype
+    ]
+
+    assert len(cast_nodes) == 1  # a single cast node is expected
+    assert cast_nodes[0].args[1] == torch.bfloat16  # requested cast dtype
+    assert cast_nodes[0].meta["val"].dtype == torch.bfloat16
+    cast_input = cast_nodes[0].args[0]  # tensor being cast
+    assert isinstance(cast_input, torch.fx.Node)
+    assert cast_input.target == torch.ops.tosa_mxfp.conv2d.default  # the MXFP op
+
+
 def test_mxfp_conv2d_cpu_impl_matches_ref() -> None:
     ref_model = Conv2dModule().eval()
     test_model = Conv2dModule().eval()
diff --git a/backends/arm/test/passes/test_rewrite_mxfp_conv2d_pass.py b/backends/arm/test/passes/test_rewrite_mxfp_conv2d_pass.py
@@ -59,10 +59,15 @@ def _nodes_from_target(
 def _rewrite_conv2d_module(
     config: MXFPOpConfig,
     bias: bool = True,
+    model_dtype: torch.dtype = torch.float32,
 ) -> tuple[torch.fx.GraphModule, list[torch.fx.Node], list[torch.fx.Node]]:
-    model = _Conv2dModule(bias=bias).eval()
+    model = _Conv2dModule(bias=bias).eval().to(model_dtype)
     to_mxfp(model, config, filter_fn=_is_conv2d)
-    exported = export(model, (torch.randn(1, 32, 10, 12),), strict=False)
+    exported = export(
+        model,
+        (torch.randn(1, 32, 10, 12, dtype=model_dtype),),
+        strict=False,
+    )
     tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp")
 
     with TosaLoweringContext(tosa_spec):
@@ -113,6 +118,19 @@ def test_rewrite_mxfp_conv2d_restores_output_shape() -> None:
     assert tuple(output_node.meta["val"].shape) == (1, 8, 5, 6)
 
 
+def test_rewrite_mxfp_conv2d_preserves_inferred_bfloat16_output_cast() -> None:
+    graph_module, _, conv_nodes = _rewrite_conv2d_module(
+        MXFPOpConfig(),
+        model_dtype=torch.bfloat16,  # helper casts the module and example input
+    )
+
+    output_node = graph_module.graph.output_node()
+
+    assert len(conv_nodes) == 1
+    assert conv_nodes[0].meta["val"].dtype == torch.float32  # conv node stays fp32
+    assert output_node.meta["val"][0].dtype == torch.bfloat16  # graph output is bf16
+
+
 def test_rewrite_mxfp4_conv2d_marks_payloads() -> None:
     model = _Conv2dModule(bias=True).eval()
     to_mxfp(