Back out "Make ScalelessRMSNorm a torch.nn.RMSNorm; fix SDPACustom view -> reshape"

billmguo · web-flow · commit 0a82163a9181 · 2026-05-19T01:23:20.000Z
Differential Revision: D105623266
Pull Request resolved: #19655
diff --git a/examples/models/llama/norm.py b/examples/models/llama/norm.py
@@ -41,20 +41,17 @@ def forward(self, x):
         return output * self.weight.type_as(x)
 
 
-class ScalelessRMSNorm(torch.nn.RMSNorm):
-    """RMSNorm with weight hardcoded to ones and not trainable.
-
-    Equivalent to a scaleless RMSNorm (no learnable scaling) but implemented as a
-    torch.nn.RMSNorm so the op composes/decomposes cleanly for backends like QNN
-    instead of being expressed as a hand-rolled decomposition.
-    """
-
+class ScalelessRMSNorm(torch.nn.Module):
     def __init__(self, dim: int, eps: float = 1e-6):
-        super().__init__(dim, eps)
+        super().__init__()
         self.dim = dim
-        with torch.no_grad():
-            self.weight.fill_(1.0)
-        self.weight.requires_grad = False
+        self.eps = eps
+
+    def forward(self, x):
+        x_float = x.float()
+        return (
+            x_float * torch.rsqrt((x_float * x_float).mean(-1, keepdim=True) + self.eps)
+        ).type_as(x)
 
 
 class RMSNormCoreML(torch.nn.Module):
diff --git a/examples/models/llama/source_transformation/sdpa.py b/examples/models/llama/source_transformation/sdpa.py
@@ -69,7 +69,7 @@ def forward(
                 0,  # dropout probability. Ignored by the code
                 True,  # is_causal
             )
-        return output.reshape(bsz, seqlen, self.dim).to(dtype=input_dtype)
+        return output.view(bsz, seqlen, self.dim).to(dtype=input_dtype)
 
 
 def _replace_sdpa_with_custom_op(
@@ -198,7 +198,7 @@ def forward(
                 v_scale_fp32,
             )
 
-        return output.reshape(bsz, seqlen, self.dim)
+        return output.view(bsz, seqlen, self.dim)
 
 
 def _update_attention_module_with_quantized_sdpa(

Original file line number	Diff line number	Diff line change
`@@ -69,7 +69,7 @@ def forward(`
`69`	`69`	`0, # dropout probability. Ignored by the code`
`70`	`70`	`True, # is_causal`
`71`	`71`	`)`
`72`		`- return output.reshape(bsz, seqlen, self.dim).to(dtype=input_dtype)`
	`72`	`+ return output.view(bsz, seqlen, self.dim).to(dtype=input_dtype)`
`73`	`73`
`74`	`74`
`75`	`75`	`def _replace_sdpa_with_custom_op(`
`@@ -198,7 +198,7 @@ def forward(`
`198`	`198`	`v_scale_fp32,`
`199`	`199`	`)`
`200`	`200`
`201`		`- return output.reshape(bsz, seqlen, self.dim)`
	`201`	`+ return output.view(bsz, seqlen, self.dim)`
`202`	`202`
`203`	`203`
`204`	`204`	`def _update_attention_module_with_quantized_sdpa(`