Skip to content

Commit 6f6ff6a

Browse files
Marco Giordano and facebook-github-bot
authored and committed
Fix GRU w8a32 operator
Summary:

# Context
This diff fixes the reference implementation of the w8a32 GRU operator and enhances the operator's pattern matching.

# Mitigation
The reference implementation now has the right output dimension, and pattern matching now uses a safer check for the operator's parameters.

Reviewed By: hsharma35

Differential Revision: D90437262
1 parent 5bc996a commit 6f6ff6a

4 files changed

Lines changed: 33 additions & 8 deletions

File tree

backends/cadence/aot/ops_registrations.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2854,7 +2854,15 @@ def quantized_w8a32_gru_meta(
28542854
bias_hidden: torch.Tensor,
28552855
b_h_scale: float,
28562856
) -> torch.Tensor:
2857-
return hidden.new_empty((2, hidden.shape[-1]), dtype=torch.float32)
2857+
seq_len = inputs.shape[1]
2858+
assert seq_len == 1
2859+
# inputs comes in shape [batch, seq_len, input_size]
2860+
# hidden comes in shape [batch, seq_len, hidden_size]
2861+
# weights_inputs comes in shape [3 * hidden_size, input_size]
2862+
# weights_hidden comes in shape [3 * hidden_size, hidden_size]
2863+
# output comes in empty with shape [2, batch, seq_len, hidden_size]
2864+
# The first dimension stacks the output and the new hidden state
2865+
return hidden.new_empty((2, inputs.shape[0], inputs.shape[1], hidden.shape[-1]), dtype=torch.float32)
28582866

28592867

28602868
# Validate that all meta kernels have reference implementations

backends/cadence/aot/quantizer/patterns.py

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -696,14 +696,16 @@ def get_anchors(
696696
)
697697

698698
# Bail if input or states are not multiple of 4 (SIMD)
699-
if gru_layer.args[0].meta["tensor_meta"].shape[-1] % 4 != 0:
699+
tensor_meta_0 = gru_layer.args[0].meta.get("tensor_meta", None)
700+
if tensor_meta_0 is None or tensor_meta_0.shape[-1] % 4 != 0:
700701
return (
701702
PartitionAnchors(
702703
empty=True,
703704
),
704705
gru_layer,
705706
)
706-
if gru_layer.args[1].meta["tensor_meta"].shape[-1] % 4 != 0:
707+
tensor_meta_1 = gru_layer.args[1].meta.get("tensor_meta", None)
708+
if tensor_meta_1 is None or tensor_meta_1.shape[-1] % 4 != 0:
707709
return (
708710
PartitionAnchors(
709711
empty=True,
@@ -718,13 +720,22 @@ def __init__(self, args, meta):
718720

719721
wrapper = Wrapper(tuple(gru_layer.args[2]), gru_layer.meta)
720722

723+
# Using SharedQuantizationSpec so that bias_hh has the same observer as bias_ih
724+
# Both biases get the same quantization scale to match the cpp operator
725+
bias_ih_node = wrapper.args[2]
726+
bias_ih_edge = (bias_ih_node, gru_layer)
727+
shared_bias_qspec = SharedQuantizationSpec(edge_or_node=bias_ih_edge)
728+
721729
return (
722730
PartitionAnchors(
723731
inputs=[],
724732
# pyre-fixme[6]: Expected `List[Tuple[Node, int]]` but got `List[Tuple[Wrapper, int]]`.
725733
weights=[(wrapper, 0), (wrapper, 1)],
726734
# pyre-fixme[6]: Expected `List[Union[Tuple[Node, int], Tuple[Node, int, DerivedQuantizationSpec]]]` but got `List[Tuple[Wrapper, int]]`.
727-
biases=[(wrapper, 2), (wrapper, 3)],
735+
biases=[
736+
(wrapper, 2), # bias_ih gets normal qspec
737+
(wrapper, 3, shared_bias_qspec), # bias_hh shares observer with bias_ih
738+
],
728739
output=[],
729740
others=[(gru_layer, 0), (gru_layer, 1)],
730741
),

backends/cadence/aot/ref_implementations.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1060,8 +1060,13 @@ def quantized_w8a32_gru(
10601060

10611061
assert new_hidden.shape == original_hidden_shape
10621062

1063-
new_hidden = new_hidden.view(-1)
1064-
return torch.stack([new_hidden, new_hidden], dim=0)
1063+
batch_size = inputs.shape[0]
1064+
input_dim = inputs.shape[1]
1065+
hidden_dim = hidden.shape[-1]
1066+
1067+
new_hidden_expanded = new_hidden.unsqueeze(1).expand(batch_size, input_dim, hidden_dim)
1068+
1069+
return torch.stack([new_hidden_expanded, new_hidden_expanded], dim=0)
10651070

10661071

10671072
@impl_tracked(m, "quantized_conv2d_nhwc.per_tensor")

backends/cadence/aot/tests/test_ref_implementations.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2975,10 +2975,11 @@ def test_quantized_w8a32_gru(
29752975
torch.float32,
29762976
f"Output dtype should be float32 in {name}",
29772977
)
2978+
expected_shape = (2, inputs.shape[0], inputs.shape[1], hidden.shape[-1])
29782979
self.assertEqual(
29792980
output.shape,
2980-
(2, hidden.shape[-1]),
2981-
f"Output shape should match {(2, hidden.shape[-1])} in {name}",
2981+
expected_shape,
2982+
f"Output shape should match {expected_shape} in {name}",
29822983
)
29832984
assert isinstance(output, torch.Tensor)
29842985

0 commit comments

Comments (0)