fix checkpoint conversion for tied weights llama3 (#1370)

pstjohn · web-flow · commit 6e804c4be228 · 2025-12-09T23:03:11.000Z
Small fix and tests for llama3 TE -> HF conversion with tied weights

Signed-off-by: Peter St. John <pstjohn@nvidia.com>
diff --git a/bionemo-recipes/models/amplify/src/amplify/state.py b/bionemo-recipes/models/amplify/src/amplify/state.py
@@ -158,7 +158,7 @@ def scale_weights(ctx):
             _params[name] = nn.Parameter(target_param, requires_grad=param.requires_grad)
             target_state.pop(name)
         else:
-            print(f"Unexpected key: {name} not in checkpoint but in model.")
+            print(f"Unexpected key: {name} not in target model but is in source model.")
 
     for key, val in _params.items():
         _module, _key = target, key
@@ -190,7 +190,7 @@ def scale_weights(ctx):
     keys = list(filter(lambda x: x is not None and not x.endswith("_extra_state"), target_state.keys()))
     keys = [key for key in keys if key not in state_dict_ignored_entries]
     if len(keys) != 0:
-        raise RuntimeError(f"Additional keys: {keys} in checkpoint but not in model.")
+        raise RuntimeError(f"Additional keys: {keys} in target model but not in source model.")
 
     if hasattr(target, "tie_weights"):
         target.tie_weights()
diff --git a/bionemo-recipes/models/esm2/src/esm/state.py b/bionemo-recipes/models/esm2/src/esm/state.py
@@ -158,7 +158,7 @@ def scale_weights(ctx):
             _params[name] = nn.Parameter(target_param, requires_grad=param.requires_grad)
             target_state.pop(name)
         else:
-            print(f"Unexpected key: {name} not in checkpoint but in model.")
+            print(f"Unexpected key: {name} not in target model but is in source model.")
 
     for key, val in _params.items():
         _module, _key = target, key
@@ -190,7 +190,7 @@ def scale_weights(ctx):
     keys = list(filter(lambda x: x is not None and not x.endswith("_extra_state"), target_state.keys()))
     keys = [key for key in keys if key not in state_dict_ignored_entries]
     if len(keys) != 0:
-        raise RuntimeError(f"Additional keys: {keys} in checkpoint but not in model.")
+        raise RuntimeError(f"Additional keys: {keys} in target model but not in source model.")
 
     if hasattr(target, "tie_weights"):
         target.tie_weights()
diff --git a/bionemo-recipes/models/llama3/convert.py b/bionemo-recipes/models/llama3/convert.py
@@ -126,6 +126,7 @@ def convert_llama_te_to_hf(model_te: NVLlamaForCausalLM, **config_kwargs) -> Lla
                 fn=state.TransformFns.split_fc1,
             ),
         ],
+        state_dict_ignored_entries=model_hf._tied_weights_keys,
     )
 
     output_model.model.rotary_emb.inv_freq = model_te.model.rotary_emb.inv_freq.clone()
diff --git a/bionemo-recipes/models/llama3/state.py b/bionemo-recipes/models/llama3/state.py
@@ -158,7 +158,7 @@ def scale_weights(ctx):
             _params[name] = nn.Parameter(target_param, requires_grad=param.requires_grad)
             target_state.pop(name)
         else:
-            print(f"Unexpected key: {name} not in checkpoint but in model.")
+            print(f"Unexpected key: {name} not in target model but is in source model.")
 
     for key, val in _params.items():
         _module, _key = target, key
@@ -190,7 +190,7 @@ def scale_weights(ctx):
     keys = list(filter(lambda x: x is not None and not x.endswith("_extra_state"), target_state.keys()))
     keys = [key for key in keys if key not in state_dict_ignored_entries]
     if len(keys) != 0:
-        raise RuntimeError(f"Additional keys: {keys} in checkpoint but not in model.")
+        raise RuntimeError(f"Additional keys: {keys} in target model but not in source model.")
 
     if hasattr(target, "tie_weights"):
         target.tie_weights()
diff --git a/bionemo-recipes/models/llama3/tests/test_convert.py b/bionemo-recipes/models/llama3/tests/test_convert.py
@@ -72,6 +72,18 @@ def test_convert_hf_to_te_with_bf16():
     convert_llama_hf_to_te(model_hf)
 
 
+def test_convert_hf_to_te_with_bf16_tied_weights():
+    config = AutoConfig.from_pretrained(
+        "nvidia/Llama-3.1-8B-Instruct-FP8",
+        dtype=torch.bfloat16,
+        num_hidden_layers=2,
+        tie_word_embeddings=True,
+    )
+    model_hf = LlamaForCausalLM(config)
+    model_hf.to(dtype=torch.bfloat16)  # I think the original llama3 model doesn't initialize in bf16.
+    convert_llama_hf_to_te(model_hf)
+
+
 def test_convert_te_to_hf_with_bf16():
     config = NVLlamaConfig.from_pretrained(
         "nvidia/Llama-3.1-8B-Instruct-FP8", dtype=torch.bfloat16, num_hidden_layers=2
@@ -81,6 +93,18 @@ def test_convert_te_to_hf_with_bf16():
     convert_llama_te_to_hf(model_te)
 
 
+def test_convert_te_to_hf_with_bf16_tied_weights():
+    config = NVLlamaConfig.from_pretrained(
+        "nvidia/Llama-3.1-8B-Instruct-FP8",
+        dtype=torch.bfloat16,
+        num_hidden_layers=2,
+        tie_word_embeddings=True,
+    )
+    model_te = NVLlamaForCausalLM(config)
+    model_te.to(dtype=torch.float32)  # I think the original llama3 model doesn't initialize in bf16.
+    convert_llama_te_to_hf(model_te)
+
+
 @pytest.mark.skipif(os.getenv("CI", "false") == "true", reason="Skipping test in CI not download llama3 models.")
 @pytest.mark.parametrize(
     "upstream_model_name", ["meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Llama-3.1-8B-Instruct"]

Original file line number	Diff line number	Diff line change
`@@ -126,6 +126,7 @@ def convert_llama_te_to_hf(model_te: NVLlamaForCausalLM, **config_kwargs) -> Lla`
`126`	`126`	`fn=state.TransformFns.split_fc1,`
`127`	`127`	`),`
`128`	`128`	`],`
	`129`	`+ state_dict_ignored_entries=model_hf._tied_weights_keys,`
`129`	`130`	`)`
`130`	`131`
`131`	`132`	`output_model.model.rotary_emb.inv_freq = model_te.model.rotary_emb.inv_freq.clone()`