pytorch
diff --git a/backends/arm/_passes/to_tosa_memory_format_pass.py b/backends/arm/_passes/to_tosa_memory_format_pass.py
Lines changed: 1 addition & 0 deletions
diff --git a/backends/arm/test/misc/test_const_shape.py b/backends/arm/test/misc/test_const_shape.py
Lines changed: 86 additions & 0 deletions
diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl
Lines changed: 4 additions & 4 deletions
diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py
Lines changed: 14 additions & 5 deletions
diff --git a/backends/cadence/aot/quantizer/fusion_pass.py b/backends/cadence/aot/quantizer/fusion_pass.py
Lines changed: 26 additions & 19 deletions
diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py
Lines changed: 31 additions & 4 deletions
diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
Lines changed: 11 additions & 8 deletions
@@ -465,6 +465,7 @@ def call(self, graph_module: torch.fx.GraphModule):
         Entry point for the pass: annotate spatial ranks, compute dim orders,
         insert bridging transposes, and forward to child passes.
         """
+        graph_module.graph.eliminate_dead_code()
         nodes = list(graph_module.graph.nodes)
         for node in nodes:
             if not self._is_ok_for_annotation(node):
 
@@ -5,9 +5,22 @@
 
 from typing import Set, Type
 
+import executorch.backends.arm.tosa.dialect  # noqa: F401
+import pytest
 import torch
+import tosa_serializer as ts
 from executorch.backends.arm._passes.arm_pass import ArmPass
+from executorch.backends.arm._passes.to_tosa_memory_format_pass import (
+    ToTosaMemoryFormatPass,
+)
+from executorch.backends.arm.operators.node_visitor import get_node_visitors
+from executorch.backends.arm.process_node import process_call_function
 from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
+from executorch.backends.arm.tosa.specification import (
+    TosaLoweringContext,
+    TosaSpecification,
+)
+from executorch.backends.test.graph_builder import GraphBuilder
 from executorch.exir import to_edge
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
@@ -54,3 +67,76 @@ def forward(self, x):
     assert const_shape_nodes
     for n in const_shape_nodes:
         assert n.meta[TosaSpecialDtype.meta_key()] == TosaSpecialDtype.SHAPE
+
+
+def _graph_module_with_unused_const_shape():
+    """Build a graph module holding one unused and one used CONST_SHAPE node.
+
+    The first CONST_SHAPE call's result is discarded (never assigned or
+    passed to the output); the second is the graph's only output. The built
+    graph is run through ``ExportPass`` and every ``call_function`` node is
+    then tagged with ``TosaSpecialDtype.SHAPE`` before the module is returned.
+    """
+    # Everything below runs inside a lowering context for this TOSA spec string.
+    with TosaLoweringContext(TosaSpecification.create_from_string("TOSA-1.1+FP+shape")):
+        builder = GraphBuilder()
+        # Result intentionally dropped: this becomes the node without users.
+        builder.call_operator(exir_ops.backend.tosa.CONST_SHAPE.default, ([1],))
+        live_const = builder.call_operator(
+            exir_ops.backend.tosa.CONST_SHAPE.default, ([3],)
+        )
+        builder.output([live_const])
+        graph_module = ExportPass().call(builder.get_graph_module()).graph_module
+        # Mark all call_function nodes (both CONST_SHAPE nodes) as SHAPE-typed.
+        for node in graph_module.graph.nodes:
+            if node.op == "call_function":
+                node.meta[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.SHAPE
+        return graph_module
+
+
+def _propagate_shape_dim_orders_from_users(graph_module: torch.fx.GraphModule) -> None:
+    """Seed the output node's dim order and propagate it to its shape args.
+
+    Sets ``meta["tosa_dim_order"] = (0,)`` on the graph's output node, then
+    calls ``ToTosaMemoryFormatPass._propagate_dim_order_to_shape_args`` on
+    that node. Mutates ``graph_module`` node metadata in place.
+    """
+    output_node = next(node for node in graph_module.graph.nodes if node.op == "output")
+    output_node.meta["tosa_dim_order"] = (0,)
+    # NOTE(review): the exported Identity program appears to exist only to
+    # satisfy the pass constructor; confirm it is not otherwise consulted.
+    dummy_exported = torch.export.export(torch.nn.Identity(), (torch.randn(1),))
+    tosa_memory_format_pass = ToTosaMemoryFormatPass(dummy_exported)
+    tosa_memory_format_pass._propagate_dim_order_to_shape_args(output_node)
+
+
+def _serialize_graph_module_to_tosa(graph_module: torch.fx.GraphModule):
+    """Feed every ``call_function`` node of ``graph_module`` to a TOSA serializer.
+
+    Creates a ``TosaSerializer`` for the "TOSA-1.1+FP+shape" specification,
+    calls ``process_call_function`` for each ``call_function`` node in graph
+    order, and returns the serializer object.
+    """
+    tosa_spec = TosaSpecification.create_from_string("TOSA-1.1+FP+shape")
+    # NOTE(review): first argument is None here; presumably no exported
+    # program is needed by the visitors exercised in this test - confirm.
+    node_visitors = get_node_visitors(None, tosa_spec)
+    tosa_graph = ts.TosaSerializer(
+        "",
+        targetMajor=tosa_spec.version.major,
+        targetMinor=tosa_spec.version.minor,
+        targetPatch=tosa_spec.version.micro,
+        targetDraft=True,
+    )
+
+    # Only call_function nodes are processed; other node kinds are skipped.
+    for node in graph_module.graph.nodes:
+        if node.op == "call_function":
+            process_call_function(node, tosa_graph, node_visitors, tosa_spec)
+
+    return tosa_graph
+
+
+def test_unused_shape_ops_miss_tosa_dim_order_and_must_be_removed_before_tosa_serialization():
+    """Check that an unused CONST_SHAPE node breaks serialization until removed.
+
+    Steps: build a graph with one unused and one used CONST_SHAPE node,
+    propagate dim orders from the output, confirm only the used node received
+    ``tosa_dim_order``, confirm serialization raises ``KeyError``, then run
+    dead-code elimination and confirm serialization goes through.
+    """
+    graph_module = _graph_module_with_unused_const_shape()
+    _propagate_shape_dim_orders_from_users(graph_module)
+
+    const_shape_nodes = [
+        node
+        for node in graph_module.graph.nodes
+        if node.op == "call_function"
+        and node.target == exir_ops.backend.tosa.CONST_SHAPE.default
+    ]
+    # Expects exactly two nodes; assumes graph iteration follows creation
+    # order, so the unused node comes first - TODO confirm.
+    dead_const_shape, live_const_shape = const_shape_nodes
+
+    assert dead_const_shape.users == {}
+    assert "tosa_dim_order" not in dead_const_shape.meta
+    assert live_const_shape.meta["tosa_dim_order"] == (0,)
+
+    # With the unused node still in the graph, serialization must fail on the
+    # missing "tosa_dim_order" key.
+    with pytest.raises(KeyError, match="tosa_dim_order"):
+        _serialize_graph_module_to_tosa(graph_module)
+
+    # Same cleanup call that the pass's call() runs at entry in this change.
+    graph_module.graph.eliminate_dead_code()
+    graph_module.recompile()
+
+    remaining_const_shape = next(
+        node
+        for node in graph_module.graph.nodes
+        if node.op == "call_function"
+        and node.target == exir_ops.backend.tosa.CONST_SHAPE.default
+    )
+    assert remaining_const_shape.meta["tosa_dim_order"] == (0,)
+    # Passes when serialization no longer raises and returns a truthy object.
+    assert _serialize_graph_module_to_tosa(graph_module)
@@ -3,7 +3,7 @@ load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest")
 load("@bazel_skylib//lib:paths.bzl", "paths")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
-_ENABLE_VGF = True
+_ENABLE_VGF = False  # Disabled: memfd_create blocked by seccomp on Sandcastle causes segfaults before Python pre-flight check can run
 
 def define_arm_tests():
     # TODO [fbonly] Add more tests
@@ -72,6 +72,7 @@ def define_arm_tests():
             resources = ["conftest.py"],
             compile = "with-source",
             typing = False,
+            skip_on_mode_mac = True,
             env = {} if runtime.is_oss else ({
                 "MODEL_CONVERTER_PATH": "$(location fbsource//third-party/pypi/ai-ml-sdk-model-converter/0.8.0:model-converter-bin)",
                 "MODEL_CONVERTER_LIB_DIR": "$(location fbsource//third-party/nvidia-nsight-systems:linux-x86_64)/host-linux-x64",
@@ -81,12 +82,11 @@ def define_arm_tests():
                 "EMULATION_LAYER_TENSOR_JSON": "$(location fbsource//third-party/arm-ml-emulation-layer/v0.9.0/src:VkLayer_Tensor_json)",
                 "EMULATION_LAYER_GRAPH_JSON": "$(location fbsource//third-party/arm-ml-emulation-layer/v0.9.0/src:VkLayer_Graph_json)",
             } if _ENABLE_VGF else {}),
-            preload_deps = [
+            preload_deps = [] if runtime.is_oss or not _ENABLE_VGF else [
                 "//executorch/kernels/quantized:custom_ops_generated_lib",
-            ] + ([] if runtime.is_oss or not _ENABLE_VGF else [
                 "fbsource//third-party/khronos:vulkan",
                 "//executorch/backends/arm/runtime:vgf_backend",
-            ]),
+            ],
             deps = [
                 "//executorch/backends/arm/test:arm_tester" if runtime.is_oss else "//executorch/backends/arm/test/tester/fb:arm_tester_fb",
                 "//executorch/backends/arm/test:conftest",
 
@@ -689,11 +689,11 @@ def register_fake(
 )
 
 lib.define(
-    "quantized_w8a32_gru(Tensor inputs, Tensor hidden, Tensor weights_inputs, float w_i_scale, Tensor weights_hidden, float w_h_scale, Tensor bias_inputs, float b_i_scale, Tensor bias_hidden, float b_h_scale) -> Tensor"
+    "quantized_w8a32_gru(Tensor inputs, Tensor hidden, Tensor weights_inputs, float w_i_scale, Tensor weights_hidden, float w_h_scale, Tensor bias_inputs, float b_scale, Tensor bias_hidden) -> Tensor"
 )
 
 lib.define(
-    "quantized_w8a32_gru.out(Tensor inputs, Tensor hidden, Tensor weights_inputs, float w_i_scale, Tensor weights_hidden, float w_h_scale, Tensor bias_inputs, float b_i_scale, Tensor bias_hidden, float b_h_scale, *, Tensor(a!) out) -> Tensor(a!)"
+    "quantized_w8a32_gru.out(Tensor inputs, Tensor hidden, Tensor weights_inputs, float w_i_scale, Tensor weights_hidden, float w_h_scale, Tensor bias_inputs, float b_scale, Tensor bias_hidden, *, Tensor(a!) out) -> Tensor(a!)"
 )
 
 lib.define(
@@ -3060,11 +3060,20 @@ def quantized_w8a32_gru_meta(
     weights_hidden: torch.Tensor,
     w_h_scale: float,
     bias_inputs: torch.Tensor,
-    b_i_scale: float,
+    b_scale: float,
     bias_hidden: torch.Tensor,
-    b_h_scale: float,
 ) -> torch.Tensor:
-    return hidden.new_empty((2, *hidden.shape), dtype=torch.float32)
+    seq_len = inputs.shape[1]
+    assert seq_len == 1
+    # inputs comes in shape [batch, seq_len, input_size]
+    # hidden comes in shape [batch, seq_len, hidden_size]
+    # weights_inputs comes in shape [3 * hidden_size, input_size]
+    # weights_hidden comes in shape [3 * hidden_size, hidden_size]
+    # output comes in empty with shape [2, batch, seq_len, hidden_size]
+    # The first dimension stacks the output and the new hidden state
+    return hidden.new_empty(
+        (2, inputs.shape[0], inputs.shape[1], hidden.shape[-1]), dtype=torch.float32
+    )
 
 
 @register_fake("cadence::slice_scatter_")
 
@@ -438,26 +438,36 @@ def get_args_and_kwargs_mixed_w8a32_conv(
         torch.ops.aten.permute.default,
         (other_inputs[0], [0, 2, 1]),  # NCL -> NLC
     )
-    assert "val" in other_inputs[0].meta, "Missing val metadata on input node"
-    original_val = other_inputs[0].meta["val"]
-    assert original_val.fake_mode is not None, "fake_mode is None on input node"
-    with original_val.fake_mode:
-        transposed_inputs.meta["val"] = torch.ops.aten.permute.default(
-            original_val, [0, 2, 1]
-        )
+    # Propagate val metadata for transposed_inputs
+    if "val" in other_inputs[0].meta:
+        original_val = other_inputs[0].meta["val"]
+        fake_mode = original_val.fake_mode
+        if fake_mode is not None:
+            with fake_mode:
+                transposed_val = torch.ops.aten.permute.default(original_val, [0, 2, 1])
+            transposed_inputs.meta["val"] = transposed_val
+        else:
+            transposed_inputs.meta["val"] = torch.ops.aten.permute.default(
+                original_val, [0, 2, 1]
+            )
     copy_node_metadata(transposed_inputs, other_inputs[0])
 
     transposed_weights = graph_module.graph.call_function(
         torch.ops.aten.permute.default,
         (weights_inputs[0], [2, 0, 1]),  # NCL -> LNC
     )
-    assert "val" in weights_inputs[0].meta, "Missing val metadata on weight node"
-    original_val = weights_inputs[0].meta["val"]
-    assert original_val.fake_mode is not None, "fake_mode is None on weight node"
-    with original_val.fake_mode:
-        transposed_weights.meta["val"] = torch.ops.aten.permute.default(
-            original_val, [2, 0, 1]
-        )
+    # Propagate val metadata for transposed_weights
+    if "val" in weights_inputs[0].meta:
+        original_val = weights_inputs[0].meta["val"]
+        fake_mode = original_val.fake_mode
+        if fake_mode is not None:
+            with fake_mode:
+                transposed_val = torch.ops.aten.permute.default(original_val, [2, 0, 1])
+            transposed_weights.meta["val"] = transposed_val
+        else:
+            transposed_weights.meta["val"] = torch.ops.aten.permute.default(
+                original_val, [2, 0, 1]
+            )
     copy_node_metadata(transposed_weights, weights_inputs[0])
 
     args = (
@@ -511,12 +521,10 @@ def get_args_and_kwargs_mixed_w8a32_gru(
 ) -> Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]:
     # Stride, padding, dilation, groups not supported yet
 
-    assert len(dequants_weights) == 2
     assert len(dequants_biases) == 2
     w_i_scale = dequants_weights[0].args[1]
     w_h_scale = dequants_weights[1].args[1]
-    b_i_scale = dequants_biases[0].args[1]
-    b_h_scale = dequants_biases[1].args[1]
+    b_scale = dequants_biases[0].args[1]
 
     args = (
         other_inputs[0],
@@ -526,9 +534,8 @@ def get_args_and_kwargs_mixed_w8a32_gru(
         weights_inputs[1],
         w_h_scale,
         bias_inputs[0],
-        b_i_scale,
+        b_scale,
         bias_inputs[1],
-        b_h_scale,
     )
     kwargs = {}
 
 
@@ -718,7 +718,7 @@ def get_anchors(
             )
 
         cnn_weights = conv_layer.args[1]
-        if hasattr(cnn_weights.meta, "tensor_meta"):
+        if "tensor_meta" in cnn_weights.meta:
             cnn_weights_shape = cnn_weights.meta["tensor_meta"].shape
             # Bail if the channels are not multiple of 4 (SIMD)
             if cnn_weights_shape[0] % 4 != 0:
@@ -744,6 +744,18 @@ def get_anchors(
                     conv_layer,
                 )
 
+            inputs = conv_layer.args[0]
+            if "tensor_meta" in inputs.meta:
+                inputs_shape = inputs.meta["tensor_meta"].shape
+                # Bail if length != kernel size - Not yet supported
+                if inputs_shape[-1] != cnn_weights_shape[2]:
+                    return (
+                        PartitionAnchors(
+                            empty=True,
+                        ),
+                        conv_layer,
+                    )
+
         return (
             PartitionAnchors(
                 inputs=[],
@@ -777,14 +789,16 @@ def get_anchors(
             )
 
         # Bail if input or states are not multiple of 4 (SIMD)
-        if gru_layer.args[0].meta["tensor_meta"].shape[-1] % 4 != 0:
+        tensor_meta_0 = gru_layer.args[0].meta.get("tensor_meta", None)
+        if tensor_meta_0 is None or tensor_meta_0.shape[-1] % 4 != 0:
             return (
                 PartitionAnchors(
                     empty=True,
                 ),
                 gru_layer,
             )
-        if gru_layer.args[1].meta["tensor_meta"].shape[-1] % 4 != 0:
+        tensor_meta_1 = gru_layer.args[1].meta.get("tensor_meta", None)
+        if tensor_meta_1 is None or tensor_meta_1.shape[-1] % 4 != 0:
             return (
                 PartitionAnchors(
                     empty=True,
@@ -799,13 +813,26 @@ def __init__(self, args, meta):
 
         wrapper = Wrapper(tuple(gru_layer.args[2]), gru_layer.meta)
 
+        # Using SharedQuantizationSpec so that bias_hh has the same observer as bias_ih
+        # Both biases get the same quantization scale to match the cpp operator
+        bias_ih_node = wrapper.args[2]
+        bias_ih_edge = (bias_ih_node, gru_layer)
+        shared_bias_qspec = SharedQuantizationSpec(edge_or_node=bias_ih_edge)
+
         return (
             PartitionAnchors(
                 inputs=[],
                 # pyre-fixme[6]: Expected `List[Tuple[Node, int]]` but got `List[Tuple[Wrapper, int]]`.
                 weights=[(wrapper, 0), (wrapper, 1)],
                 # pyre-fixme[6]: Expected `List[Union[Tuple[Node, int], Tuple[Node, int, DerivedQuantizationSpec]]]` but got `List[Tuple[Wrapper, int]]`.
-                biases=[(wrapper, 2), (wrapper, 3)],
+                biases=[
+                    (wrapper, 2),  # bias_ih gets normal qspec
+                    (
+                        wrapper,
+                        3,
+                        shared_bias_qspec,
+                    ),  # bias_hh shares observer with bias_ih
+                ],
                 output=[],
                 others=[(gru_layer, 0), (gru_layer, 1)],
             ),
 
@@ -1257,9 +1257,8 @@ def quantized_w8a32_gru(
     weights_hidden: torch.Tensor,
     w_h_scale: float,
     bias_inputs: torch.Tensor,
-    b_i_scale: float,
+    b_scale: float,
     bias_hidden: torch.Tensor,
-    b_h_scale: float,
 ) -> torch.Tensor:
     assert weights_inputs.dtype == torch.int8
     assert weights_hidden.dtype == torch.int8
@@ -1288,10 +1287,8 @@ def quantized_w8a32_gru(
     dequant_weights_inputs = weights_inputs.float() * w_i_scale
     dequant_weights_hidden = weights_hidden.float() * w_h_scale
 
-    # C++ implementation averages the two bias scales
-    avg_bias_scale = (b_i_scale + b_h_scale) / 2
-    dequant_bias_inputs = bias_inputs.float() * avg_bias_scale
-    dequant_bias_hidden = bias_hidden.float() * avg_bias_scale
+    dequant_bias_inputs = bias_inputs.float() * b_scale
+    dequant_bias_hidden = bias_hidden.float() * b_scale
 
     gi = F.linear(inputs, dequant_weights_inputs, dequant_bias_inputs)
     gh = F.linear(hidden, dequant_weights_hidden, dequant_bias_hidden)
@@ -1310,8 +1307,14 @@ def quantized_w8a32_gru(
 
     assert new_hidden.shape == original_hidden_shape
 
-    new_hidden = new_hidden.view(original_hidden_shape)
-    return torch.stack([new_hidden, new_hidden], dim=0)
+    batch_size = inputs.shape[0]
+    input_dim = inputs.shape[1]
+    hidden_dim = hidden.shape[-1]
+
+    new_hidden_expanded = new_hidden.unsqueeze(1).expand(
+        batch_size, input_dim, hidden_dim
+    )
+    return torch.stack([new_hidden_expanded, new_hidden_expanded], dim=0)
 
 
 @impl_tracked(m, "quantized_conv2d_nhwc.per_tensor")