
Commit cc6fedc

navsud authored and facebook-github-bot committed
Make ScalelessRMSNorm a torch.nn.RMSNorm; fix SDPACustom view -> reshape (#19376)
Summary: Two related changes that together unblock the QNN export path for VLM/STITO.

(1) ScalelessRMSNorm: re-implement as a torch.nn.RMSNorm subclass.

ScalelessRMSNorm was previously implemented as a hand-rolled RMS normalization (decomposed into mean / rsqrt / mul). On the QNN export path, this decomposition fails to lower for an LLM, while using torch.nn.RMSNorm() directly works. Re-implement ScalelessRMSNorm as a torch.nn.RMSNorm subclass whose weight is hardcoded to ones and frozen (requires_grad=False). This keeps the public interface (ScalelessRMSNorm(dim, eps)) unchanged while letting backends see a proper RMSNorm op, so it composes/decomposes cleanly for QNN.

(2) SDPACustom / QuantizedSDPA: replace .view() with .reshape().

Switching to torch.nn.RMSNorm changes how strides propagate through the export graph compared to the hand-rolled decomposition, exposing a latent bug in source_transformation/sdpa.py. The output of torch.ops.llama.custom_sdpa retains the non-contiguous (transposed) strides of its inputs, so output.view(bsz, seqlen, self.dim), which merges the last two dims (n_heads, head_dim), fails during torch.export with:

    Cannot view a tensor with shape (1, s0, 32, 64) and strides (2048*s0, 64, 64*s0, 1) as a tensor with shape (1, s0, 2048)

Switching to .reshape() inserts .contiguous() only when needed and matches the pattern already used elsewhere in this file (SDPASimple, SDPAFlex, SDPACoreML) and in attention.py.

Reviewed By: billmguo, telgamal-1

Differential Revision: D104258950
1 parent a49171d commit cc6fedc

2 files changed: 13 additions & 6 deletions

examples/models/llama/norm.py

Lines changed: 11 additions & 4 deletions
@@ -42,16 +42,23 @@ def forward(self, x):
 
 
 class ScalelessRMSNorm(torch.nn.Module):
+    """RMSNorm without learnable scaling.
+
+    Calls F.rms_norm with weight=None so the op composes/decomposes cleanly for
+    backends like QNN instead of being expressed as a hand-rolled decomposition
+    of mean / rsqrt / mul. Semantically equivalent to
+    torch.nn.RMSNorm(elementwise_affine=False), but implemented as a plain
+    Module to preserve the previous parameterless state_dict signature (no
+    `weight` attribute / parameter).
+    """
+
     def __init__(self, dim: int, eps: float = 1e-6):
         super().__init__()
         self.dim = dim
         self.eps = eps
 
     def forward(self, x):
-        x_float = x.float()
-        return (
-            x_float * torch.rsqrt((x_float * x_float).mean(-1, keepdim=True) + self.eps)
-        ).type_as(x)
+        return F.rms_norm(x, (self.dim,), None, self.eps)
 
 
 class RMSNormWithInputScale(torch.nn.Module):

examples/models/llama/source_transformation/sdpa.py

Lines changed: 2 additions & 2 deletions
@@ -69,7 +69,7 @@ def forward(
             0,  # dropout probability. Ignored by the code
             True,  # is_causal
         )
-        return output.view(bsz, seqlen, self.dim).to(dtype=input_dtype)
+        return output.reshape(bsz, seqlen, self.dim).to(dtype=input_dtype)
 
 
 def _replace_sdpa_with_custom_op(
@@ -198,7 +198,7 @@ def forward(
             v_scale_fp32,
         )
 
-        return output.view(bsz, seqlen, self.dim)
+        return output.reshape(bsz, seqlen, self.dim)
 
 
 def _update_attention_module_with_quantized_sdpa(
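The stride failure quoted in the summary can be reproduced in isolation with a transposed tensor, independent of the actual custom_sdpa op (a minimal repro; the shapes are illustrative):

```python
import torch

bsz, seqlen, n_heads, head_dim = 1, 8, 32, 64

# Mimic an SDPA output that kept the transposed strides of its inputs:
# contiguous (bsz, n_heads, seqlen, head_dim), then transpose(1, 2).
out = torch.randn(bsz, n_heads, seqlen, head_dim).transpose(1, 2)
assert not out.is_contiguous()

# .view() cannot merge (n_heads, head_dim) across these strides:
try:
    out.view(bsz, seqlen, n_heads * head_dim)
    view_failed = False
except RuntimeError:
    view_failed = True

# .reshape() falls back to a contiguous copy only when needed:
merged = out.reshape(bsz, seqlen, n_heads * head_dim)
print(view_failed, merged.shape)
```

Merging the last two dims via view() requires stride[2] == head_dim * stride[3], which the transposed layout violates; reshape() detects this and copies, which is why it is the safe pattern here.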
