Arm backend: Preserve MXFP linear output dtype (#20487)

YufengShi-dudu · web-flow · commit e3d5de206f82 · 2026-06-24T16:35:53.000+01:00
Infer the output dtype of MXFP linear replacements from the weight dtype of the source nn.Linear module. Only FP32 and BF16 are supported; any other dtype raises a ValueError. Keep the internal MXFP custom op output in FP32, and insert a cast back to the inferred dtype when needed. This lets BF16 models keep BF16 outputs from MXFP linear layers, which keeps SDPA input and attention mask dtypes compatible during export.

Add Qwen3 VL layer coverage for MXFP8 BF16 attention, MLP, and decoder layers.

Change-Id: Id6143ff330aeeca0815756c5468efb9930ac185f
cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani
Signed-off-by: Yufeng Shi <yufeng.shi@arm.com>
diff --git a/backends/arm/ao_ext/ops/mxfp_linear_op.py b/backends/arm/ao_ext/ops/mxfp_linear_op.py
@@ -33,6 +33,12 @@
 )
 
 
+_SUPPORTED_OUTPUT_DTYPES: set[torch.dtype] = {
+    torch.float32,
+    torch.bfloat16,
+}
+
+
 def _get_mx_elem_dtype(
     weight_qdata: torch.Tensor,
     weight_payload_dtype: str = "",
@@ -139,10 +145,12 @@ def __init__(
         bias: torch.Tensor | None,
         weight_dtype: MXFPDType,
         block_size: int,
+        output_dtype: torch.dtype = torch.float32,
     ) -> None:
         super().__init__()
         self.weight_dtype = mxfp_dtype_to_str(weight_dtype)
         self.block_size = block_size
+        self.output_dtype = output_dtype
 
         self.register_buffer("weight_qdata", weight_qdata, persistent=True)
         self.register_buffer("weight_scale", weight_scale, persistent=True)
@@ -159,14 +167,17 @@ def __init__(
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return torch.ops.tosa_mxfp.linear.default(
+        output = torch.ops.tosa_mxfp.linear.default(
             x,
             self.weight_qdata,
             self.weight_scale,
             self.bias,
             self.block_size,
             self.weight_dtype,
         )
+        if self.output_dtype != torch.float32:
+            output = output.to(self.output_dtype)
+        return output
 
 
 def transform_linear_to_mxfp(
@@ -196,10 +207,14 @@ def transform_linear_to_mxfp(
     weight_scale = weight_scale.unsqueeze(0)
 
     bias = module.bias.detach().to(torch.float32) if module.bias is not None else None
+    output_dtype = weight.dtype
+    if output_dtype not in _SUPPORTED_OUTPUT_DTYPES:
+        raise ValueError(f"Unsupported output_dtype: {output_dtype}")
     return MXFPLinearOp(
         weight_qdata,
         weight_scale,
         bias,
         config.weight_dtype,
         config.block_size,
+        output_dtype,
     )
diff --git a/backends/arm/test/misc/test_mxfp_linear_ao.py b/backends/arm/test/misc/test_mxfp_linear_ao.py
@@ -98,6 +98,53 @@ def _is_selected_linear(module: torch.nn.Module, fqn: str) -> bool:
     assert isinstance(model.skipped, torch.nn.Linear)
 
 
+def test_mxfp_linear_preserves_bfloat16_output_dtype() -> None:
+    model = LinearModule().eval().to(torch.bfloat16)
+    to_mxfp(
+        model,
+        MXFPOpConfig(weight_dtype=torch.float8_e4m3fn),
+    )
+
+    output = model(torch.randn(4, 32, dtype=torch.bfloat16))
+
+    assert isinstance(model.linear, MXFPLinearOp)
+    assert model.linear.output_dtype == torch.bfloat16
+    assert output.dtype == torch.bfloat16
+
+
+def test_mxfp_linear_op_output_dtype_constructor_arg() -> None:
+    model = LinearModule().eval()
+    config = MXFPOpConfig(weight_dtype=torch.float8_e4m3fn)
+    to_mxfp(
+        model,
+        config,
+    )
+    assert isinstance(model.linear, MXFPLinearOp)
+
+    fp32_linear = MXFPLinearOp(
+        model.linear.weight_qdata,
+        model.linear.weight_scale,
+        model.linear.bias,
+        config.weight_dtype,
+        config.block_size,
+    )
+    bf16_linear = MXFPLinearOp(
+        model.linear.weight_qdata,
+        model.linear.weight_scale,
+        model.linear.bias,
+        config.weight_dtype,
+        config.block_size,
+        output_dtype=torch.bfloat16,
+    )
+
+    test_input = torch.randn(4, 32)
+
+    assert fp32_linear.output_dtype == torch.float32
+    assert fp32_linear(test_input).dtype == torch.float32
+    assert bf16_linear.output_dtype == torch.bfloat16
+    assert bf16_linear(test_input).dtype == torch.bfloat16
+
+
 def _test_mxfp_linear_export_preserves_custom_op(config: MXFPOpConfig) -> None:
     model = LinearModule().eval()
     to_mxfp(model, config)
@@ -135,3 +182,26 @@ def test_mxfp6_e3m2_linear_export_preserves_custom_op() -> None:
     _test_mxfp_linear_export_preserves_custom_op(
         MXFPOpConfig(weight_dtype=DTYPE_FP6_E3M2)
     )
+
+
+def test_mxfp_linear_export_preserves_inferred_bfloat16_output_dtype() -> None:
+    model = LinearModule().eval().to(torch.bfloat16)
+    to_mxfp(
+        model,
+        MXFPOpConfig(weight_dtype=torch.float8_e4m3fn),
+    )
+
+    exported = export(model, (torch.randn(4, 32, dtype=torch.bfloat16),), strict=False)
+
+    cast_nodes = [
+        node
+        for node in exported.graph_module.graph.nodes
+        if node.op == "call_function" and node.target == torch.ops.aten.to.dtype
+    ]
+
+    assert len(cast_nodes) == 1
+    assert cast_nodes[0].args[1] == torch.bfloat16
+    assert cast_nodes[0].meta["val"].dtype == torch.bfloat16
+    cast_input = cast_nodes[0].args[0]
+    assert isinstance(cast_input, torch.fx.Node)
+    assert cast_input.target == torch.ops.tosa_mxfp.linear.default
diff --git a/backends/arm/test/models/Qwen3_VL/test_qwen3_vl_layers.py b/backends/arm/test/models/Qwen3_VL/test_qwen3_vl_layers.py
@@ -560,6 +560,31 @@ def test_qwen3_vl_tosa_mxfp8_fp32(test_case: Qwen3VLTestCase):
         pipeline.run()
 
 
+@common.parametrize(
+    "test_case",
+    TOSA_MXFP8_TEST_CASES,
+)
+def test_qwen3_vl_tosa_mxfp8_bf16(test_case: Qwen3VLTestCase):
+    model, inputs = test_case.model_cls.prepare_model_and_inputs()
+    model, inputs = _to_bfloat16(model, inputs)
+    mxfp_config = MXFPOpConfig(weight_dtype=torch.float8_e4m3fn)
+
+    with torch.no_grad():
+        pipeline = MXFPTosaPipelineFP[input_t](
+            model,
+            inputs,
+            aten_op=aten_op_mxfp_linear,
+            exir_op=[],
+            filter_fn=_is_linear,
+            frobenius_threshold=0.05,
+            cosine_threshold=0.995,
+            mxfp_config=mxfp_config,
+            tosa_version="1.1",
+            tosa_extensions=["bf16", "mxfp"],
+        )
+        pipeline.run()
+
+
 @common.SkipIfNoModelConverter
 @common.parametrize(
     "test_case",
diff --git a/backends/arm/test/passes/test_rewrite_mxfp_linear_pass.py b/backends/arm/test/passes/test_rewrite_mxfp_linear_pass.py
@@ -54,10 +54,15 @@ def _get_nodes_from_target(
 
 def _rewrite_linear_module(
     config: MXFPOpConfig,
+    model_dtype: torch.dtype = torch.float32,
 ) -> tuple[torch.fx.GraphModule, list[torch.fx.Node], list[torch.fx.Node]]:
-    model = _LinearModule(bias=True).eval()
+    model = _LinearModule(bias=True).eval().to(model_dtype)
     to_mxfp(model, config, filter_fn=_is_linear)
-    exported = export(model, (torch.randn(4, 5, 32),), strict=False)
+    exported = export(
+        model,
+        (torch.randn(4, 5, 32, dtype=model_dtype),),
+        strict=False,
+    )
     tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+mxfp")
 
     with TosaLoweringContext(tosa_spec):
@@ -98,6 +103,19 @@ def test_rewrite_mxfp_linear_replaces_custom_op() -> None:
     assert tuple(output_node.meta["val"][0].shape) == (4, 5, 8)
 
 
+def test_rewrite_mxfp_linear_preserves_inferred_bfloat16_output_cast() -> None:
+    graph_module, _, matmul_nodes = _rewrite_linear_module(
+        MXFPOpConfig(),
+        model_dtype=torch.bfloat16,
+    )
+
+    output_node = graph_module.graph.output_node()
+
+    assert len(matmul_nodes) == 1
+    assert matmul_nodes[0].meta["val"].dtype == torch.float32
+    assert output_node.meta["val"][0].dtype == torch.bfloat16
+
+
 def test_rewrite_mxfp6_linear_marks_payload_dtype() -> None:
     graph_module, cast_nodes, matmul_nodes = _rewrite_linear_module(
         MXFPOpConfig(weight_dtype=DTYPE_FP6_E2M3)