
Commit 07aa7e5

navsudmeta-codesync[bot] authored and committed
Make ScalelessRMSNorm a torch.nn.RMSNorm; fix SDPACustom view -> reshape (#19376)
Summary:
Pull Request resolved: #19376

Two related changes that together unblock the QNN export path for VLM/STITO:

(1) ScalelessRMSNorm: re-implement as a torch.nn.RMSNorm subclass

ScalelessRMSNorm was previously implemented as a hand-rolled RMS normalization, decomposed into mean / rsqrt / mul. On the QNN export path, this decomposition fails to lower for an LLM; using torch.nn.RMSNorm directly works. Re-implement ScalelessRMSNorm as a torch.nn.RMSNorm subclass whose weight is hardcoded to ones and frozen (requires_grad=False). This keeps the public interface (ScalelessRMSNorm(dim, eps)) unchanged while letting backends see a proper RMSNorm op, so it composes/decomposes cleanly for QNN.

(2) SDPACustom / QuantizedSDPA: replace .view() with .reshape()

Switching to torch.nn.RMSNorm changes how strides propagate through the export graph compared to the hand-rolled decomposition, exposing a latent bug in source_transformation/sdpa.py. The output of torch.ops.llama.custom_sdpa retains the non-contiguous (transposed) strides of its inputs, so output.view(bsz, seqlen, self.dim), which merges the last two dims (n_heads, head_dim), fails during torch.export with:

    Cannot view a tensor with shape (1, s0, 32, 64) and strides (2048*s0, 64, 64*s0, 1) as a tensor with shape (1, s0, 2048)

Switching to .reshape() inserts .contiguous() only when needed, matching the pattern already used elsewhere in this file (SDPASimple, SDPAFlex, SDPACoreML) and in attention.py.

Reviewed By: billmguo, telgamal-1

Differential Revision: D104258950
1 parent 91aef57 commit 07aa7e5

2 files changed: 14 additions & 11 deletions


examples/models/llama/norm.py

Lines changed: 12 additions & 9 deletions
@@ -41,17 +41,20 @@ def forward(self, x):
         return output * self.weight.type_as(x)
 
 
-class ScalelessRMSNorm(torch.nn.Module):
+class ScalelessRMSNorm(torch.nn.RMSNorm):
+    """RMSNorm with weight hardcoded to ones and not trainable.
+
+    Equivalent to a scaleless RMSNorm (no learnable scaling) but implemented as a
+    torch.nn.RMSNorm so the op composes/decomposes cleanly for backends like QNN
+    instead of being expressed as a hand-rolled decomposition.
+    """
+
     def __init__(self, dim: int, eps: float = 1e-6):
-        super().__init__()
+        super().__init__(dim, eps)
         self.dim = dim
-        self.eps = eps
-
-    def forward(self, x):
-        x_float = x.float()
-        return (
-            x_float * torch.rsqrt((x_float * x_float).mean(-1, keepdim=True) + self.eps)
-        ).type_as(x)
+        with torch.no_grad():
+            self.weight.fill_(1.0)
+        self.weight.requires_grad = False
 
 
 class RMSNormWithInputScale(torch.nn.Module):
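
To make the equivalence in the summary concrete, here is a minimal sketch (not part of this commit; it assumes PyTorch >= 2.4, where torch.nn.RMSNorm is available) checking that an RMSNorm with a frozen all-ones weight computes the same values as the removed hand-rolled decomposition for float32 inputs:

    # Not from the PR: verify torch.nn.RMSNorm with a frozen all-ones weight
    # matches the removed mean / rsqrt / mul decomposition.
    import torch

    dim, eps = 2048, 1e-6
    norm = torch.nn.RMSNorm(dim, eps=eps)
    with torch.no_grad():
        norm.weight.fill_(1.0)
    norm.weight.requires_grad = False

    x = torch.randn(1, 16, dim)
    hand_rolled = x * torch.rsqrt((x * x).mean(-1, keepdim=True) + eps)
    torch.testing.assert_close(norm(x), hand_rolled)  # same math, single RMSNorm op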

examples/models/llama/source_transformation/sdpa.py

Lines changed: 2 additions & 2 deletions
@@ -69,7 +69,7 @@ def forward(
             0,  # dropout probability. Ignored by the code
             True,  # is_causal
         )
-        return output.view(bsz, seqlen, self.dim).to(dtype=input_dtype)
+        return output.reshape(bsz, seqlen, self.dim).to(dtype=input_dtype)
 
 
 def _replace_sdpa_with_custom_op(
@@ -198,7 +198,7 @@ def forward(
             v_scale_fp32,
         )
 
-        return output.view(bsz, seqlen, self.dim)
+        return output.reshape(bsz, seqlen, self.dim)
 
 
 def _update_attention_module_with_quantized_sdpa(
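
The stride failure described in the summary can be reproduced outside torch.export with a minimal sketch (not part of this commit): merging the last two dims of a transposed, non-contiguous tensor is exactly the case where .view() raises and .reshape() copies instead. The shapes mirror the quoted error (32 heads, head_dim 64, so dim = 2048):

    # Not from the PR: demonstrate why .view() fails on the transposed output.
    import torch

    bsz, seqlen, n_heads, head_dim = 1, 7, 32, 64

    # Like the custom_sdpa output: produced as (bsz, n_heads, seqlen, head_dim)
    # and transposed back without a copy, leaving non-contiguous strides.
    out = torch.randn(bsz, n_heads, seqlen, head_dim).transpose(1, 2)
    assert not out.is_contiguous()

    try:
        out.view(bsz, seqlen, n_heads * head_dim)  # merging dims 2 and 3 needs a copy
    except RuntimeError as e:
        print(e)  # view size is not compatible with input tensor's size and stride

    # .reshape() falls back to .contiguous() only because a plain view is impossible.
    merged = out.reshape(bsz, seqlen, n_heads * head_dim)
    print(merged.shape)  # torch.Size([1, 7, 2048])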
