Consolidate Gemma RMSNorm +1 offset into single block after load

psiddh · psiddh · commit f0386df63271 · 2026-06-02T12:27:49.000-07:00
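Address review feedback (shewu-quic): move the Gemma norm-weight
adjustment out of both if/else branches into a single block that
runs after the checkpoint is loaded, regardless of its source. Also
replace the misleading comment about float16 cast order with one that
describes the actual difference ((1 + w) * x vs. w * x), and simplify
`v.float() + torch.ones(v.shape, dtype=torch.float32)` to the
equivalent broadcast form `v.float() + 1.0`.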
diff --git a/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py b/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py
@@ -166,17 +166,6 @@ def _prepare_model(self):  # noqa: C901
             state_dict = torch.load(
                 checkpoint, weights_only=True, map_location="cpu", mmap=True
             )
-            if self.control_args.decoder_model in {
-                "gemma-2b",
-                "gemma2-2b",
-                "gemma3-1b",
-            }:
-                for k, v in state_dict.items():
-                    if "norm" not in k:
-                        continue
-                    # Llama does x.to(float16) * w whilst Gemma3 is (x * w).to(float16)
-                    # See https://github.com/huggingface/transformers/pull/29402
-                    state_dict[k] = v.float() + torch.ones(v.shape, dtype=torch.float32)
         else:
             state_dict = torch.load(
                 self.control_args.checkpoint,
@@ -192,18 +181,18 @@ def _prepare_model(self):  # noqa: C901
                     k.replace("_orig_mod.", ""): v for k, v in state_dict.items()
                 }
 
-            if self.control_args.decoder_model in {
-                "gemma-2b",
-                "gemma2-2b",
-                "gemma3-1b",
-            }:
-                for k, v in state_dict.items():
-                    if "norm" not in k:
-                        continue
-                    # Gemma RMSNorm uses (1 + w) * x, so converted checkpoints
-                    # that haven't been offset need +1 applied here.
-                    # See https://github.com/huggingface/transformers/pull/29402
-                    state_dict[k] = v.float() + torch.ones(v.shape, dtype=torch.float32)
+        # Gemma RMSNorm computes (1 + w) * x but ExecuTorch's RMSNorm computes
+        # w * x, so add the +1 offset to norm weights regardless of load path.
+        # See https://github.com/huggingface/transformers/pull/29402
+        if self.control_args.decoder_model in {
+            "gemma-2b",
+            "gemma2-2b",
+            "gemma3-1b",
+        }:
+            for k, v in state_dict.items():
+                if "norm" not in k:
+                    continue
+                state_dict[k] = v.float() + 1.0
 
         # change to HF weight to improve the performance of RoPE in HTP backend.
         if self.config.transform_weight: