Add CP support to modeling_llama_te.py (#1407)

pstjohn · web-flow · commit 6321b0ccb81c · 2026-01-13T01:23:58.000Z
* Makes the attention_mask type more easily controllable through a config kwarg * passes the additional `_padded` kwargs through to the TE layers * simplifies some of the pre-processing logic and adds comments as to why they're needed Fixes BIO-12 Signed-off-by: Peter St. John <pstjohn@nvidia.com>
diff --git a/bionemo-recipes/models/llama3/modeling_llama_te.py b/bionemo-recipes/models/llama3/modeling_llama_te.py
@@ -43,6 +43,7 @@ class NVLlamaConfig(LlamaConfig):
     """NVLlama configuration."""
 
     attn_input_format: str = "thd"
+    self_attn_mask_type: str = "padding_causal"
 
 
 class NVLlamaPreTrainedModel(PreTrainedModel):
@@ -118,7 +119,7 @@ def _init_method(x):
                     normalization="RMSNorm",
                     activation="swiglu",
                     attn_input_format=config.attn_input_format,
-                    self_attn_mask_type="padding_causal",
+                    self_attn_mask_type=config.self_attn_mask_type,
                     num_gqa_groups=config.num_key_value_heads,
                     layer_number=layer_idx + 1,
                     params_dtype=config.dtype,
@@ -181,49 +182,32 @@ def forward(
 
         hidden_states = inputs_embeds
 
+        # TE-specific input handling.
         has_thd_input = [x in kwargs for x in ["cu_seq_lens_q", "cu_seq_lens_k", "max_length_q", "max_length_k"]]
         should_pack_inputs = not any(has_thd_input) and self.config.attn_input_format == "thd"
 
-        # This might be slower for BSHD + padding with fused attention backend. But it should be faster for the flash
-        # attention backend.
-        self_attn_mask_type = "padding_causal"
         if should_pack_inputs:
-            # Left-side padding is not supported in TE layers, so to make generation work with TE we dynamically convert
-            # to THD-style inputs in our forward pass, and then convert back to BSHD for the output. This lets the
-            # entire transformer stack run in THD mode.
+            # Left-side padding is not supported in TE layers, so to make huggingface-style generation work with TE we
+            # dynamically convert to THD-style inputs in our forward pass, and then convert back to BSHD for the output.
+            # This lets the entire transformer stack run in THD mode. This might be slower for BSHD + padding with fused
+            # attention backend, but it should be faster for the flash attention backend.
             assert attention_mask is not None, "Attention mask is required when packing BSHD inputs."
             batch_size = hidden_states.size(0)
             hidden_states, indices, cu_seqlens, max_seqlen, _ = _unpad_input(hidden_states, attention_mask)
-            cu_seq_lens_q = cu_seq_lens_k = cu_seqlens
-            max_length_q = max_length_k = max_seqlen
+            kwargs["cu_seq_lens_q"] = kwargs["cu_seq_lens_k"] = cu_seqlens
+            kwargs["max_length_q"] = kwargs["max_length_k"] = max_seqlen
 
-        elif self.config.attn_input_format == "thd":
-            # Here, we're providing THD-style inputs, so we can just grab the kwargs.
-            assert hidden_states.dim() == 3 and hidden_states.size(0) == 1, (
-                "THD expects embeddings shaped [1, total_tokens, hidden_size]."
-            )
+        if self.config.attn_input_format == "thd" and hidden_states.dim() == 3 and hidden_states.size(0) == 1:
+            # For THD, the embedding output is a 3-dimensional tensor with shape [1, total_tokens, hidden_size], but TE
+            # expects a 2-dimensional tensor with shape [total_tokens, hidden_size].
             hidden_states = hidden_states.squeeze(0)
-            cu_seq_lens_q = kwargs["cu_seq_lens_q"]
-            cu_seq_lens_k = kwargs["cu_seq_lens_k"]
-            max_length_q = kwargs["max_length_q"]
-            max_length_k = kwargs["max_length_k"]
-
-        else:
-            if attention_mask is not None:
-                attention_mask = attention_mask[:, None, None, :] < -1
-            else:
-                self_attn_mask_type = "causal"
-            cu_seq_lens_q = cu_seq_lens_k = None
-            max_length_q = max_length_k = hidden_states.size(1)
-
-        # If we're using kv-caching, we can't trust the max_length_q value as the true max length for rotary
-        # embeddings, since this will be 1 in generation. Instead we can take the max sequence length from the past
-        # key values object.
-        te_rope_emb = self.rotary_emb(
-            max_seq_len=max_length_q if past_key_values is None else past_key_values.max_ctx_len
-        )
 
-        if isinstance(past_key_values, InferenceParams):
+        if self.config.attn_input_format == "bshd" and attention_mask is not None and attention_mask.dim() == 2:
+            # If we're using padded BSHD inputs, we need to convert the 2-dimensional mask to a 4-dimensional mask in
+            # the expected boolean format for TE.
+            attention_mask = attention_mask[:, None, None, :] < -1
+
+        if isinstance(past_key_values, InferenceParams):  # InferenceParams is TE's way of managing kv-caching.
             # In generation mode, we set the length to 1 for each batch index. Otherwise, we use the attention mask to
             # compute the lengths of each sequence in the batch.
             lengths = (
@@ -233,6 +217,8 @@ def forward(
             )
             past_key_values.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths)))
 
+        te_rope_emb = self.rotary_emb(max_seq_len=self.config.max_position_embeddings)
+
         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
             if output_hidden_states:
                 all_hidden_states = (*all_hidden_states, hidden_states)
@@ -241,12 +227,14 @@ def forward(
                 hidden_states,
                 attention_mask=None if self.config.attn_input_format == "thd" else attention_mask,
                 rotary_pos_emb=te_rope_emb,
-                self_attn_mask_type=self_attn_mask_type,
                 inference_params=past_key_values,
-                cu_seqlens_q=cu_seq_lens_q,
-                cu_seqlens_kv=cu_seq_lens_k,
-                max_seqlen_q=max_length_q,
-                max_seqlen_kv=max_length_k,
+                cu_seqlens_q=kwargs.get("cu_seq_lens_q", None),
+                cu_seqlens_kv=kwargs.get("cu_seq_lens_k", None),
+                cu_seqlens_q_padded=kwargs.get("cu_seq_lens_q_padded", None),
+                cu_seqlens_kv_padded=kwargs.get("cu_seq_lens_k_padded", None),
+                max_seqlen_q=kwargs.get("max_length_q", None),
+                max_seqlen_kv=kwargs.get("max_length_k", None),
+                pad_between_seqs=kwargs.get("pad_between_seqs", None),
             )
 
         hidden_states = self.norm(hidden_states)
@@ -258,7 +246,7 @@ def forward(
 
         if should_pack_inputs:
             # If we've converted BSHD to THD for our TE layers, we need to convert back to BSHD for the output.
-            hidden_states = _pad_input(hidden_states, indices, batch_size, max_length_q)
+            hidden_states = _pad_input(hidden_states, indices, batch_size, max_seqlen)
 
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
diff --git a/bionemo-recipes/models/llama3/tests/test_lm_eval.py b/bionemo-recipes/models/llama3/tests/test_lm_eval.py
@@ -29,7 +29,10 @@
 def model_checkpoint(tmp_path: Path):
     tokenizer = AutoTokenizer.from_pretrained("nvidia/Llama-3.1-8B-Instruct-FP8")
     config = NVLlamaConfig.from_pretrained(
-        "nvidia/Llama-3.1-8B-Instruct-FP8", num_hidden_layers=2, attn_input_format="bshd"
+        "nvidia/Llama-3.1-8B-Instruct-FP8",
+        num_hidden_layers=2,
+        attn_input_format="bshd",
+        self_attn_mask_type="causal",
     )
     model = NVLlamaForCausalLM(config)
     model.save_pretrained(tmp_path / "checkpoint")
diff --git a/bionemo-recipes/models/llama3/tests/test_modeling_llama_te.py b/bionemo-recipes/models/llama3/tests/test_modeling_llama_te.py
@@ -69,7 +69,10 @@ def test_llama_model_forward_pass(input_text, attn_input_format):
 def test_llama_model_forward_pass_no_attention_mask():
     tokenizer = AutoTokenizer.from_pretrained("nvidia/Llama-3.1-8B-Instruct-FP8")
     config = NVLlamaConfig.from_pretrained(
-        "nvidia/Llama-3.1-8B-Instruct-FP8", num_hidden_layers=2, attn_input_format="bshd"
+        "nvidia/Llama-3.1-8B-Instruct-FP8",
+        num_hidden_layers=2,
+        attn_input_format="bshd",
+        self_attn_mask_type="causal",
     )
     model = NVLlamaForCausalLM(config)
 
@@ -269,7 +272,7 @@ def test_hf_llama_model_generate_bshd():
 def test_te_llama_model_generate_with_cache():
     tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
     model_hf = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", dtype=torch.bfloat16)
-    model_te = convert_llama_hf_to_te(model_hf)
+    model_te = convert_llama_hf_to_te(model_hf, self_attn_mask_type="padding_causal")
 
     prompt = """
    Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/bionemo-recipes/recipes/llama3_native_te/modeling_llama_te.py b/bionemo-recipes/recipes/llama3_native_te/modeling_llama_te.py
@@ -43,6 +43,7 @@ class NVLlamaConfig(LlamaConfig):
     """NVLlama configuration."""
 
     attn_input_format: str = "thd"
+    self_attn_mask_type: str = "padding_causal"
 
 
 class NVLlamaPreTrainedModel(PreTrainedModel):
@@ -118,7 +119,7 @@ def _init_method(x):
                     normalization="RMSNorm",
                     activation="swiglu",
                     attn_input_format=config.attn_input_format,
-                    self_attn_mask_type="padding_causal",
+                    self_attn_mask_type=config.self_attn_mask_type,
                     num_gqa_groups=config.num_key_value_heads,
                     layer_number=layer_idx + 1,
                     params_dtype=config.dtype,
@@ -181,49 +182,32 @@ def forward(
 
         hidden_states = inputs_embeds
 
+        # TE-specific input handling.
         has_thd_input = [x in kwargs for x in ["cu_seq_lens_q", "cu_seq_lens_k", "max_length_q", "max_length_k"]]
         should_pack_inputs = not any(has_thd_input) and self.config.attn_input_format == "thd"
 
-        # This might be slower for BSHD + padding with fused attention backend. But it should be faster for the flash
-        # attention backend.
-        self_attn_mask_type = "padding_causal"
         if should_pack_inputs:
-            # Left-side padding is not supported in TE layers, so to make generation work with TE we dynamically convert
-            # to THD-style inputs in our forward pass, and then convert back to BSHD for the output. This lets the
-            # entire transformer stack run in THD mode.
+            # Left-side padding is not supported in TE layers, so to make huggingface-style generation work with TE we
+            # dynamically convert to THD-style inputs in our forward pass, and then convert back to BSHD for the output.
+            # This lets the entire transformer stack run in THD mode. This might be slower for BSHD + padding with fused
+            # attention backend, but it should be faster for the flash attention backend.
             assert attention_mask is not None, "Attention mask is required when packing BSHD inputs."
             batch_size = hidden_states.size(0)
             hidden_states, indices, cu_seqlens, max_seqlen, _ = _unpad_input(hidden_states, attention_mask)
-            cu_seq_lens_q = cu_seq_lens_k = cu_seqlens
-            max_length_q = max_length_k = max_seqlen
+            kwargs["cu_seq_lens_q"] = kwargs["cu_seq_lens_k"] = cu_seqlens
+            kwargs["max_length_q"] = kwargs["max_length_k"] = max_seqlen
 
-        elif self.config.attn_input_format == "thd":
-            # Here, we're providing THD-style inputs, so we can just grab the kwargs.
-            assert hidden_states.dim() == 3 and hidden_states.size(0) == 1, (
-                "THD expects embeddings shaped [1, total_tokens, hidden_size]."
-            )
+        if self.config.attn_input_format == "thd" and hidden_states.dim() == 3 and hidden_states.size(0) == 1:
+            # For THD, the embedding output is a 3-dimensional tensor with shape [1, total_tokens, hidden_size], but TE
+            # expects a 2-dimensional tensor with shape [total_tokens, hidden_size].
             hidden_states = hidden_states.squeeze(0)
-            cu_seq_lens_q = kwargs["cu_seq_lens_q"]
-            cu_seq_lens_k = kwargs["cu_seq_lens_k"]
-            max_length_q = kwargs["max_length_q"]
-            max_length_k = kwargs["max_length_k"]
-
-        else:
-            if attention_mask is not None:
-                attention_mask = attention_mask[:, None, None, :] < -1
-            else:
-                self_attn_mask_type = "causal"
-            cu_seq_lens_q = cu_seq_lens_k = None
-            max_length_q = max_length_k = hidden_states.size(1)
-
-        # If we're using kv-caching, we can't trust the max_length_q value as the true max length for rotary
-        # embeddings, since this will be 1 in generation. Instead we can take the max sequence length from the past
-        # key values object.
-        te_rope_emb = self.rotary_emb(
-            max_seq_len=max_length_q if past_key_values is None else past_key_values.max_ctx_len
-        )
 
-        if isinstance(past_key_values, InferenceParams):
+        if self.config.attn_input_format == "bshd" and attention_mask is not None and attention_mask.dim() == 2:
+            # If we're using padded BSHD inputs, we need to convert the 2-dimensional mask to a 4-dimensional mask in
+            # the expected boolean format for TE.
+            attention_mask = attention_mask[:, None, None, :] < -1
+
+        if isinstance(past_key_values, InferenceParams):  # InferenceParams is TE's way of managing kv-caching.
             # In generation mode, we set the length to 1 for each batch index. Otherwise, we use the attention mask to
             # compute the lengths of each sequence in the batch.
             lengths = (
@@ -233,6 +217,8 @@ def forward(
             )
             past_key_values.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths)))
 
+        te_rope_emb = self.rotary_emb(max_seq_len=self.config.max_position_embeddings)
+
         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
             if output_hidden_states:
                 all_hidden_states = (*all_hidden_states, hidden_states)
@@ -241,12 +227,14 @@ def forward(
                 hidden_states,
                 attention_mask=None if self.config.attn_input_format == "thd" else attention_mask,
                 rotary_pos_emb=te_rope_emb,
-                self_attn_mask_type=self_attn_mask_type,
                 inference_params=past_key_values,
-                cu_seqlens_q=cu_seq_lens_q,
-                cu_seqlens_kv=cu_seq_lens_k,
-                max_seqlen_q=max_length_q,
-                max_seqlen_kv=max_length_k,
+                cu_seqlens_q=kwargs.get("cu_seq_lens_q", None),
+                cu_seqlens_kv=kwargs.get("cu_seq_lens_k", None),
+                cu_seqlens_q_padded=kwargs.get("cu_seq_lens_q_padded", None),
+                cu_seqlens_kv_padded=kwargs.get("cu_seq_lens_k_padded", None),
+                max_seqlen_q=kwargs.get("max_length_q", None),
+                max_seqlen_kv=kwargs.get("max_length_k", None),
+                pad_between_seqs=kwargs.get("pad_between_seqs", None),
             )
 
         hidden_states = self.norm(hidden_states)
@@ -258,7 +246,7 @@ def forward(
 
         if should_pack_inputs:
             # If we've converted BSHD to THD for our TE layers, we need to convert back to BSHD for the output.
-            hidden_states = _pad_input(hidden_states, indices, batch_size, max_length_q)
+            hidden_states = _pad_input(hidden_states, indices, batch_size, max_seqlen)
 
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,