Fix CI failures: restore is_compileable and unify quantized_model_init

savitha-eng · savitha-eng · commit 0bfaceb2a29b · 2026-05-14T04:52:26.000Z
- Restore is_compileable property on HFInferenceParams (accidentally
  dropped from PR 1500), required by newer transformers generate().
- Unify the get_autocast_context init path so it works both standalone
  (model tests, no outer context) and inside an outer
  quantized_model_init (recipe training). FP8/FP4 layers use a
  per-layer quantized_model_init with
  preserve_high_precision_init_val=True; BF16 layers use
  quantized_model_init(enabled=False) to override any outer context.

Signed-off-by: Savitha Srinivasan <savithas@nvidia.com>
diff --git a/bionemo-recipes/models/llama3/modeling_llama_te.py b/bionemo-recipes/models/llama3/modeling_llama_te.py
@@ -409,11 +409,9 @@ def get_autocast_context(
 
         if init and self.config.use_quantized_model_init:
             if precision in ("fp8", "fp4"):
-                # Let the outer quantized_model_init context handle FP8/FP4 layers. Using nullcontext()
-                # preserves the outer context's settings (recipe, preserve_high_precision_init_val).
-                # A nested quantized_model_init would override preserve_high_precision_init_val to False.
-                return nullcontext()
-            # BF16 layers: explicitly disable quantized init to override any outer quantized_model_init context.
+                return transformer_engine.pytorch.quantized_model_init(
+                    recipe=recipe, preserve_high_precision_init_val=True
+                )
             return transformer_engine.pytorch.quantized_model_init(enabled=False)
 
         if precision == "fp8":
@@ -633,6 +631,11 @@ def get_seq_length(self, layer_idx: int = 0) -> int:
             return 0
         return max(self.sequences.values())
 
+    @property
+    def is_compileable(self) -> bool:
+        """Required by HuggingFace transformers generate() auto-compile check."""
+        return False
+
     def reorder_cache(self, beam_idx: torch.LongTensor):
         """Reorder the cache based on the beam indices."""
         if isinstance(self.cache_manager, PagedKVCacheManager):
diff --git a/bionemo-recipes/recipes/llama3_native_te/modeling_llama_te.py b/bionemo-recipes/recipes/llama3_native_te/modeling_llama_te.py
@@ -415,11 +415,9 @@ def get_autocast_context(
 
         if init and self.config.use_quantized_model_init:
             if precision in ("fp8", "fp4"):
-                # Let the outer quantized_model_init context handle FP8/FP4 layers. Using nullcontext()
-                # preserves the outer context's settings (recipe, preserve_high_precision_init_val).
-                # A nested quantized_model_init would override preserve_high_precision_init_val to False.
-                return nullcontext()
-            # BF16 layers: explicitly disable quantized init to override any outer quantized_model_init context.
+                return transformer_engine.pytorch.quantized_model_init(
+                    recipe=recipe, preserve_high_precision_init_val=True
+                )
             return transformer_engine.pytorch.quantized_model_init(enabled=False)
 
         if precision == "fp8":
@@ -639,6 +637,11 @@ def get_seq_length(self, layer_idx: int = 0) -> int:
             return 0
         return max(self.sequences.values())
 
+    @property
+    def is_compileable(self) -> bool:
+        """Required by HuggingFace transformers generate() auto-compile check."""
+        return False
+
     def reorder_cache(self, beam_idx: torch.LongTensor):
         """Reorder the cache based on the beam indices."""
         if isinstance(self.cache_manager, PagedKVCacheManager):
diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/modeling_llama_te.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/modeling_llama_te.py
@@ -415,11 +415,9 @@ def get_autocast_context(
 
         if init and self.config.use_quantized_model_init:
             if precision in ("fp8", "fp4"):
-                # Let the outer quantized_model_init context handle FP8/FP4 layers. Using nullcontext()
-                # preserves the outer context's settings (recipe, preserve_high_precision_init_val).
-                # A nested quantized_model_init would override preserve_high_precision_init_val to False.
-                return nullcontext()
-            # BF16 layers: explicitly disable quantized init to override any outer quantized_model_init context.
+                return transformer_engine.pytorch.quantized_model_init(
+                    recipe=recipe, preserve_high_precision_init_val=True
+                )
             return transformer_engine.pytorch.quantized_model_init(enabled=False)
 
         if precision == "fp8":
@@ -639,6 +637,11 @@ def get_seq_length(self, layer_idx: int = 0) -> int:
             return 0
         return max(self.sequences.values())
 
+    @property
+    def is_compileable(self) -> bool:
+        """Required by HuggingFace transformers generate() auto-compile check."""
+        return False
+
     def reorder_cache(self, beam_idx: torch.LongTensor):
         """Reorder the cache based on the beam indices."""
         if isinstance(self.cache_manager, PagedKVCacheManager):