fix: baichuan dynamic cache (#1865)

zyzhou5 · akoumpa · linnanwang · commit 1dcb225db36a · 2026-04-24T00:17:58.000-07:00
fix: Handle DynamicCache in Baichuan model for generation compatibility

Baichuan's forward() and prepare_inputs_for_generation() assumed
past_key_values is always a legacy tuple-of-tuples, but transformers
5.x passes DynamicCache objects during model.generate(). This caused
TypeError/AttributeError in the baichuan_2_7b_squad_vllm_deploy and
baichuan_2_7b_squad_peft_vllm_deploy CI tests.

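- Convert a non-empty DynamicCache to legacy tuples in BaichuanModel.forward(),
  and treat an empty one as no cache (None)
- Treat empty DynamicCache as None in prepare_inputs_for_generation()
- Fix position_ids truncation for transformers 5.x, which passes
  position_ids via kwargs instead of letting the model compute them:
  when a cache is present, position_ids are now cut to the last position
  whether they were passed in or computed from attention_mask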

Signed-off-by: Zeyu Zhou <zezhou@nvidia.com>
Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com>
diff --git a/nemo_automodel/components/models/baichuan/model.py b/nemo_automodel/components/models/baichuan/model.py
@@ -42,6 +42,7 @@
 from torch.nn import functional as F
 from transformers import GenerationMixin, PreTrainedModel
 from transformers.activations import ACT2FN
+from transformers.cache_utils import DynamicCache
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.utils import logging
 
@@ -364,6 +365,12 @@ def forward(
 
         seq_length_with_past = seq_length
         past_key_values_length = 0
+        if past_key_values is not None:
+            if isinstance(past_key_values, DynamicCache):
+                if past_key_values.get_seq_length() > 0:
+                    past_key_values = tuple((layer.keys, layer.values) for layer in past_key_values.layers)
+                else:
+                    past_key_values = None
         if past_key_values is not None:
             past_key_values_length = past_key_values[0][0].shape[2]
             seq_length_with_past = seq_length_with_past + past_key_values_length
@@ -558,15 +565,18 @@ def prepare_inputs_for_generation(
         inputs_embeds=None,
         **kwargs,
     ):
+        # Treat empty DynamicCache as no cache so inputs stay consistent with forward()
+        if isinstance(past_key_values, DynamicCache) and past_key_values.get_seq_length() == 0:
+            past_key_values = None
         if past_key_values:
             input_ids = input_ids[:, -1:]
 
         position_ids = kwargs.get("position_ids", None)
         if attention_mask is not None and position_ids is None:
             position_ids = attention_mask.long().cumsum(-1) - 1
             position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values:
-                position_ids = position_ids[:, -1].unsqueeze(-1)
+        if past_key_values and position_ids is not None:
+            position_ids = position_ids[:, -1].unsqueeze(-1)
 
         if inputs_embeds is not None and past_key_values is None:
             model_inputs = {"inputs_embeds": inputs_embeds}
diff --git a/tests/unit_tests/models/baichuan/test_baichuan_model.py b/tests/unit_tests/models/baichuan/test_baichuan_model.py
@@ -15,6 +15,7 @@
 
 import pytest
 import torch
+from transformers.cache_utils import DynamicCache
 
 from nemo_automodel.components.models.baichuan.configuration import BaichuanConfig
 from nemo_automodel.components.models.baichuan.model import (
@@ -483,6 +484,33 @@ def test_with_inputs_embeds_no_past(self):
         assert "input_ids" not in inputs
 
 
+class TestDynamicCacheCompat:
+    """Regression test for DynamicCache incompatibility (baichuan_2_7b_squad_vllm_deploy)."""
+
+    def test_forward_with_dynamic_cache(self):
+        cfg = _tiny_config(use_cache=True)
+        model = BaichuanModel(cfg)
+        model.eval()
+        bsz, seq_len = 1, 4
+        input_ids = torch.randint(0, cfg.vocab_size, (bsz, seq_len))
+
+        # Prefill with no cache; the returned past_key_values holds one (key, value) pair per layer
+        with torch.no_grad():
+            out = model(input_ids=input_ids, use_cache=True)
+        legacy_cache = out.past_key_values
+
+        # Repack it into a DynamicCache layer by layer (simulates what GenerationMixin does)
+        dynamic_cache = DynamicCache()
+        for layer_idx, (key, value) in enumerate(legacy_cache):
+            dynamic_cache.update(key, value, layer_idx)
+
+        # Decode one token with the DynamicCache — this was the failing path before the fix
+        next_token = torch.randint(0, cfg.vocab_size, (bsz, 1))
+        with torch.no_grad():
+            out2 = model(input_ids=next_token, past_key_values=dynamic_cache, use_cache=True)
+        assert out2.last_hidden_state.shape == (bsz, 1, cfg.hidden_size)
+
+
 class TestReorderCache:
     def test_reorders_correctly(self):
         past = tuple((torch.randn(3, 2, 4, 8), torch.randn(3, 2, 4, 8)) for _ in range(2))