fix: add is_compileable attribute to HFInferenceParams for transformers 5.4+

svc-bionemo · svc-bionemo · commit b8ea565d8915 · 2026-04-06T14:54:31.000-07:00
transformers >= 5.4 checks cache.is_compileable in generate(). The custom
HFInferenceParams class (TE-based cache) did not implement this attribute,
causing AttributeError during test_generate_with_cache tests.

Set is_compileable = False because this cache type is not compatible with
the torch.compile path of generate().

Tested locally:
- models/mixtral: 52 passed, 3 skipped, 26 xfailed (3 local-only OOM on 32GB GPU, pass on CI L4)
- recipes/mixtral_native_te: 7 passed
- recipes/opengenome2_mixtral_native_te: 20 passed

Signed-off-by: svc-bionemo <267129667+svc-bionemo@users.noreply.github.com>
diff --git a/bionemo-recipes/models/mixtral/modeling_mixtral_te.py b/bionemo-recipes/models/mixtral/modeling_mixtral_te.py
@@ -872,6 +872,10 @@ def _unpad_input(hidden_states, attention_mask, unused_mask=None):
 class HFInferenceParams(InferenceParams):
     """Extension of the InferenceParams class to support HF generate() and beam search."""
 
+    # Required by transformers >= 5.4 _valid_auto_compile_criteria(); this
+    # custom TE-based cache is not compatible with torch.compile generate().
+    is_compileable = False
+
     def get_seq_length(self, layer_idx: int = 0) -> int:
         """Return the current cached sequence length.
 
diff --git a/bionemo-recipes/recipes/mixtral_native_te/modeling_mixtral_te.py b/bionemo-recipes/recipes/mixtral_native_te/modeling_mixtral_te.py
@@ -878,6 +878,10 @@ def _unpad_input(hidden_states, attention_mask, unused_mask=None):
 class HFInferenceParams(InferenceParams):
     """Extension of the InferenceParams class to support HF generate() and beam search."""
 
+    # Required by transformers >= 5.4 _valid_auto_compile_criteria(); this
+    # custom TE-based cache is not compatible with torch.compile generate().
+    is_compileable = False
+
     def get_seq_length(self, layer_idx: int = 0) -> int:
         """Return the current cached sequence length.
 
diff --git a/bionemo-recipes/recipes/opengenome2_mixtral_native_te/modeling_mixtral_te.py b/bionemo-recipes/recipes/opengenome2_mixtral_native_te/modeling_mixtral_te.py
@@ -878,6 +878,10 @@ def _unpad_input(hidden_states, attention_mask, unused_mask=None):
 class HFInferenceParams(InferenceParams):
     """Extension of the InferenceParams class to support HF generate() and beam search."""
 
+    # Required by transformers >= 5.4 _valid_auto_compile_criteria(); this
+    # custom TE-based cache is not compatible with torch.compile generate().
+    is_compileable = False
+
     def get_seq_length(self, layer_idx: int = 0) -> int:
         """Return the current cached sequence length.