Reduce the number of inner loop checks for compatibility

jstjohn · jstjohn · commit c372637172a8 · 2026-05-22T13:59:01.000-07:00
Signed-off-by: John St. John <jstjohn@nvidia.com>
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/megatron/hyena/engine.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/megatron/hyena/engine.py
@@ -19,11 +19,6 @@
 import torch.nn.functional as F  # noqa: N812
 from einops import rearrange
 
-from bionemo.evo2.models.megatron.hyena.subquadratic_safety import (
-    ensure_subquadratic_causal_conv1d_supported,
-    ensure_subquadratic_fft_causal_conv1d_supported,
-)
-
 
 try:
     from subquadratic_ops_torch.causal_conv1d import causal_conv1d as _subq_causal_conv1d
@@ -83,7 +78,6 @@ def parallel_fir(
     fir_length,
     compute_state,
     use_subquadratic_ops=False,
-    check_subquadratic_ops=True,
 ):
     """Compute parallel finite impulse response filtering with optional state computation."""
     L = u.shape[1]  # noqa: N806
@@ -95,8 +89,6 @@ def parallel_fir(
     if fir_length >= 128:
         if use_subquadratic_ops:
             # subq-ops fft_causal_conv1d expects [B, D, L] input and [D, L] filter; dtypes must match
-            if check_subquadratic_ops and u.is_cuda:
-                ensure_subquadratic_fft_causal_conv1d_supported()
             k = weight[:, :, :L].squeeze(1) if weight.dim() == 3 else weight[:, :L]
             u_fp32 = u.to(torch.float32)
             z = _subq_fft_causal_conv1d(u_fp32, k.to(torch.float32))
@@ -115,8 +107,6 @@ def parallel_fir(
             if _subq_causal_conv1d is None:
                 raise ImportError(_subq_error_msg)
             # subq-ops causal_conv1d expects pre-padded [B, D, L+pad] input and [D, K] weight.
-            if check_subquadratic_ops and u.is_cuda:
-                ensure_subquadratic_causal_conv1d_supported()
             pad_size = fir_length - 1
             x_padded = F.pad(u.to(torch.float32), (pad_size, 0))
             w = weight.squeeze(1) if weight.dim() == 3 else weight
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/megatron/hyena/hyena_utils.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/megatron/hyena/hyena_utils.py
@@ -33,11 +33,6 @@
 from torch.autograd.function import Function
 
 from bionemo.evo2.models.megatron.hyena.hyena_config import HyenaConfig
-from bionemo.evo2.models.megatron.hyena.subquadratic_safety import (
-    ensure_subquadratic_b2b_causal_conv1d_supported,
-    ensure_subquadratic_causal_conv1d_supported,
-    ensure_subquadratic_fft_causal_conv1d_supported,
-)
 
 
 try:
@@ -469,7 +464,6 @@ def fftconv_func(
     k_rev=None,
     bidirectional=False,
     use_subquadratic_ops=False,
-    check_subquadratic_ops=True,
 ):
     """Apply a 1D convolution to the input sequence u using the filter k and the shortcut D."""
     seqlen = u.shape[-1]
@@ -503,8 +497,6 @@ def fftconv_func(
     # causal
     else:
         if use_subquadratic_ops:
-            if check_subquadratic_ops and u.is_cuda:
-                ensure_subquadratic_fft_causal_conv1d_supported()
             y = fft_causal_conv1d(u, k.squeeze(0))
         else:
             fft_size = max(fft_size, 2 * k.shape[-1])
@@ -903,7 +895,6 @@ def __init__(
         self.zigzag = zigzag
 
         self.use_subquadratic_ops = transformer_config.use_subquadratic_ops
-        self._subquadratic_ops_checked = False
 
         self.model_parallel_size = self.pg_collection.tp.size() if self.pg_collection.tp is not None else 1
         self.model_parallel_rank = self.pg_collection.tp.rank() if self.pg_collection.tp is not None else 0
@@ -986,16 +977,6 @@ def reset_parameters(self):
             bounds = math.sqrt(1 / self.kernel_size)
             torch.nn.init.uniform_(self.conv_bias, a=-bounds, b=bounds)
 
-    def _ensure_subquadratic_ops_supported(self):
-        """Run expensive subquadratic-op CUDA self-tests once per operator instance."""
-        if self._subquadratic_ops_checked or not self.use_subquadratic_ops:
-            return
-        if self.operator_type == "hyena_medium_conv" and self.kernel_size < 128:
-            ensure_subquadratic_causal_conv1d_supported()
-        else:
-            ensure_subquadratic_fft_causal_conv1d_supported()
-        self._subquadratic_ops_checked = True
-
     def forward_long(self, *, x1, x2, v, h, bias, inference_context):
         """Forward pass long."""
         import bionemo.evo2.models.megatron.hyena.engine as engine
@@ -1086,7 +1067,6 @@ def get_filter_state(filter_name):
                 fir_length=self.kernel_size,  # self.short_filter_length,
                 compute_state=inference_context is not None,
                 use_subquadratic_ops=self.use_subquadratic_ops,
-                check_subquadratic_ops=False,
             )
             y = rearrange(y, "b d l -> b l d")
             y = y * x1
@@ -1112,8 +1092,6 @@ def forward(self, x1, x2, v, _hyena_use_cp=True, inference_context=None):
         Input shapes: bs, (num_groups, group_size), seq_length
         Output shapes: bs, (num_groups, group_size), seq_length
         """
-        if x1.is_cuda:
-            self._ensure_subquadratic_ops_supported()
         B, GDG, L = x1.shape  # noqa: N806
         x1, x2, v = x1[..., :L], x2[..., :L], v[..., :L]
 
@@ -1204,7 +1182,6 @@ def forward(self, x1, x2, v, _hyena_use_cp=True, inference_context=None):
                     gelu=False,
                     bidirectional=self.bidirectional,
                     use_subquadratic_ops=self.use_subquadratic_ops,
-                    check_subquadratic_ops=False,
                 )
                 z = z.to(v.dtype)
 
@@ -1404,7 +1381,6 @@ def __init__(
         self.num_groups = num_groups
         self.transformer_config = transformer_config
         self.use_subquadratic_ops = transformer_config.use_subquadratic_ops
-        self._subquadratic_ops_checked = False
         self.short_conv_L = hyena_config.short_conv_L
         self.local_init = local_init
         if pg_collection is None:
@@ -1496,9 +1472,6 @@ def forward(self, x, inference_context=None, _use_cp=True):
         # Projection conv is fused with SE/MR layers by B2BCausalConv1dModule when available.
         if self.use_fast_causal_conv:  # hyena_proj_conv case
             if self.use_subquadratic_ops:
-                if x.is_cuda and not self._subquadratic_ops_checked:
-                    ensure_subquadratic_causal_conv1d_supported()
-                    self._subquadratic_ops_checked = True
                 y = causal_conv1d(x, weight)[..., pad_size:]
             else:
                 y = causal_conv1d_fn(x, weight, bias=None, activation=None)[..., pad_size:]
@@ -1566,7 +1539,6 @@ def __init__(
         """
         super().__init__()
         self.b2b_causal_conv1d_fn = b2b_causal_conv1d
-        self._check_subquadratic_ops = b2b_causal_conv1d is globals()["b2b_causal_conv1d"]
         if pg_collection is None:
             pg_collection = ProcessGroupCollection.use_mpu_process_groups()
         self.pg_collection = pg_collection
@@ -1591,14 +1563,6 @@ def __init__(
             raise ValueError(f"Operator type {operator_type} not supported")
 
         self.effective_pad_size = (self._mixer_kernel_size - 1) + (self._proj_conv_kernel_size - 1)
-        self._subquadratic_ops_checked = False
-
-    def _ensure_subquadratic_ops_supported(self):
-        """Run the B2B CUDA self-test once per wrapper instance."""
-        if self._subquadratic_ops_checked or not self._check_subquadratic_ops:
-            return
-        ensure_subquadratic_b2b_causal_conv1d_supported()
-        self._subquadratic_ops_checked = True
 
     def forward(self, x, _use_cp=True):
         """Forward pass for the B2BCausalConv1dModule.
@@ -1612,8 +1576,6 @@ def forward(self, x, _use_cp=True):
         # Validate input dimensions
         if x.dim() != 3:
             raise ValueError("Input tensor must be 3D [batch_size, hidden_dim, seq_len]")
-        if x.is_cuda:
-            self._ensure_subquadratic_ops_supported()
 
         # Extract weights at runtime to avoid parameter registration
         proj_weight = self._proj_conv_module.short_conv_weight
@@ -1747,9 +1709,6 @@ def get_filter_state(filter_name):
         L = u.shape[1]  # noqa: N806
         fir_state = get_filter_state("fir")
         if fir_state is None:
-            if self.use_subquadratic_ops and u.is_cuda and not self._subquadratic_ops_checked:
-                ensure_subquadratic_causal_conv1d_supported()
-                self._subquadratic_ops_checked = True
             z_pre, fir_state = engine.parallel_fir(
                 u=u,
                 weight=torch.tensor(weight),  # self.short_filter_weight,
@@ -1759,7 +1718,6 @@ def get_filter_state(filter_name):
                 fir_length=self.kernel_size,  # self.short_filter_length,
                 compute_state=inference_context is not None,
                 use_subquadratic_ops=self.use_subquadratic_ops,
-                check_subquadratic_ops=False,
             )
         else:
             if len(u.shape) > 2:
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/megatron/hyena/subquadratic_safety.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/megatron/hyena/subquadratic_safety.py
@@ -41,6 +41,14 @@ def _assert_close_or_raise(op_name: str, actual: torch.Tensor, expected: torch.T
         _raise_subquadratic_self_test_error(op_name, f"max_diff={max_diff:.6g}, rel={rel:.6g}")
 
 
+@lru_cache(maxsize=None)
+def ensure_subquadratic_ops_supported(device_index: int | None = None) -> None:
+    """Validate all subquadratic_ops_torch CUDA kernels used by Evo2."""
+    ensure_subquadratic_causal_conv1d_supported(device_index)
+    ensure_subquadratic_fft_causal_conv1d_supported(device_index)
+    ensure_subquadratic_b2b_causal_conv1d_supported(device_index)
+
+
 @lru_cache(maxsize=None)
 def ensure_subquadratic_causal_conv1d_supported(device_index: int | None = None) -> None:
     """Validate subquadratic_ops_torch.causal_conv1d before using it for model data."""
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/infer.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/infer.py
@@ -107,6 +107,7 @@
 
 from bionemo.evo2.data.dataset_tokenizer import DEFAULT_HF_TOKENIZER_MODEL_PATH
 from bionemo.evo2.models.evo2_provider import HyenaInferenceContext
+from bionemo.evo2.models.megatron.hyena.subquadratic_safety import ensure_subquadratic_ops_supported
 from bionemo.evo2.run.predict import initialize_inference_distributed, resolve_checkpoint_path
 from bionemo.evo2.run.text_generation_controller import Evo2TextGenerationController
 
@@ -469,6 +470,8 @@ def setup_inference_engine(
         dist_config=dist_config,
     )
     logger.info("Initialized distributed environment")
+    if use_subquadratic_ops:
+        ensure_subquadratic_ops_supported()
 
     # -------------------------------------------------------------------------
     # Step 5: Create model and load weights
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/predict.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/predict.py
@@ -106,6 +106,7 @@
 
 from bionemo.evo2.data.dataset_tokenizer import DEFAULT_HF_TOKENIZER_MODEL_PATH
 from bionemo.evo2.data.fasta_dataset import SimpleFastaDataset
+from bionemo.evo2.models.megatron.hyena.subquadratic_safety import ensure_subquadratic_ops_supported
 from bionemo.recipeutils.inference.collation import batch_collator
 
 
@@ -1093,6 +1094,8 @@ def predict(
         dist_config=dist_config,
     )
     logger.info("Initialized distributed environment")
+    if use_subquadratic_ops:
+        ensure_subquadratic_ops_supported()
 
     # -------------------------------------------------------------------------
     # Step 5: Create model and load weights
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/train.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/train.py
@@ -35,10 +35,11 @@
 from megatron.bridge.training.mixed_precision import MIXED_PRECISION_RECIPES
 from megatron.bridge.training.post_training.checkpointing import has_modelopt_state
 from megatron.bridge.training.pretrain import pretrain
-from megatron.bridge.utils.common_utils import get_rank_safe
+from megatron.bridge.utils.common_utils import get_local_rank_preinit, get_rank_safe
 
 from bionemo.evo2.data.dataset_tokenizer import DEFAULT_HF_TOKENIZER_MODEL_PATH
 from bionemo.evo2.models.evo2_provider import MODEL_OPTIONS, hyena_forward_step, infer_model_type
+from bionemo.evo2.models.megatron.hyena.subquadratic_safety import ensure_subquadratic_ops_supported
 from bionemo.evo2.recipes.evo2 import evo2_1b_pretrain_config as pretrain_config
 
 
@@ -885,7 +886,9 @@ def train(args: argparse.Namespace) -> None:
     if args.num_layers:
         cfg.model.num_layers = args.num_layers
     if args.use_subquadratic_ops:
-        # TODO assert that it is installed
+        if torch.cuda.is_available():
+            torch.cuda.set_device(get_local_rank_preinit())
+        ensure_subquadratic_ops_supported()
         cfg.model.use_subquadratic_ops = True
 
     if args.no_activation_checkpointing:
diff --git a/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/models/megatron/hyena/test_engine.py b/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/models/megatron/hyena/test_engine.py
@@ -18,6 +18,7 @@
 import torch.nn.functional as F  # noqa: N812
 
 from bionemo.evo2.models.megatron.hyena import engine
+from bionemo.evo2.models.megatron.hyena.subquadratic_safety import ensure_subquadratic_ops_supported
 
 
 def test_fftconv_func_is_prefix_invariant_when_filter_is_longer_than_input():
@@ -83,6 +84,11 @@ def test_parallel_fir_short_cuda_path_matches_torch_depthwise_conv1d(use_subquad
     """Short FIR prefill should match F.conv1d or fail before returning bad subq output."""
     if not torch.cuda.is_available():
         pytest.skip("short FIR CUDA path requires CUDA")
+    if use_subquadratic_ops:
+        try:
+            ensure_subquadratic_ops_supported()
+        except RuntimeError as e:
+            pytest.xfail(str(e))
 
     torch.manual_seed(1234)
     batch_size = 2
@@ -95,21 +101,16 @@ def test_parallel_fir_short_cuda_path_matches_torch_depthwise_conv1d(use_subquad
     weight = torch.randn(hidden_size, 1, kernel_size, device=device)
     bias = torch.randn(hidden_size, device=device)
 
-    try:
-        actual, state = engine.parallel_fir(
-            u=u,
-            weight=weight,
-            bias=bias,
-            L=seq_len,
-            gated_bias=True,
-            fir_length=kernel_size,
-            compute_state=True,
-            use_subquadratic_ops=use_subquadratic_ops,
-        )
-    except RuntimeError as e:
-        if use_subquadratic_ops and "failed a CUDA self-test" in str(e):
-            pytest.xfail(str(e))
-        raise
+    actual, state = engine.parallel_fir(
+        u=u,
+        weight=weight,
+        bias=bias,
+        L=seq_len,
+        gated_bias=True,
+        fir_length=kernel_size,
+        compute_state=True,
+        use_subquadratic_ops=use_subquadratic_ops,
+    )
 
     u_bdl = u.transpose(1, 2).contiguous()
     expected = F.conv1d(
diff --git a/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/models/megatron/hyena/test_hyena_mixer_kernel.py b/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/models/megatron/hyena/test_hyena_mixer_kernel.py
@@ -26,6 +26,7 @@
 from bionemo.evo2.models.megatron.hyena.hyena_layer_specs import hyena_stack_spec_no_te
 from bionemo.evo2.models.megatron.hyena.hyena_mixer import HyenaMixer
 from bionemo.evo2.models.megatron.hyena.hyena_utils import ImplicitModalFilter
+from bionemo.evo2.models.megatron.hyena.subquadratic_safety import ensure_subquadratic_ops_supported
 
 from ....utils import distributed_model_parallel_state
 
@@ -254,6 +255,10 @@ def test_subquadratic_ops_kernel(  # noqa: D103
     # Skip bf16 with short convolution due to numerical instability
     if test_config.params_dtype == torch.bfloat16 and operator_type == "hyena_short_conv":
         pytest.skip("bf16 with short convolution is skipped due to numerical instability")
+    try:
+        ensure_subquadratic_ops_supported()
+    except RuntimeError as e:
+        pytest.xfail(str(e))
 
     with distributed_model_parallel_state():
         # Create both models inside the same distributed context
diff --git a/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/models/megatron/hyena/test_hyena_utils.py b/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/models/megatron/hyena/test_hyena_utils.py
@@ -37,6 +37,7 @@
     wang_init_method,
     zigzag_get_overlapping_patches,
 )
+from bionemo.evo2.models.megatron.hyena.subquadratic_safety import ensure_subquadratic_ops_supported
 
 
 class MockProcessGroup:
@@ -137,7 +138,6 @@ def test_parallel_causal_depthwise_conv1d_uses_subquadratic_fast_conv(
         pg_collection=types.SimpleNamespace(cp=None),
         use_fast_causal_conv=True,
         use_subquadratic_ops=True,
-        _subquadratic_ops_checked=False,
     )
 
     y = ParallelCausalDepthwiseConv1d.forward(module, x, _use_cp=False)
@@ -304,29 +304,6 @@ def test_b2b_causal_conv1d_module_device_handling():  # noqa: D103
         assert result_cuda.device == x_cuda.device, "Device mismatch on CUDA"
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required for subquadratic guard test")
-@patch("bionemo.evo2.models.megatron.hyena.hyena_utils.ensure_subquadratic_b2b_causal_conv1d_supported")
-@patch("bionemo.evo2.models.megatron.hyena.hyena_utils.b2b_causal_conv1d")
-def test_b2b_causal_conv1d_module_checks_subquadratic_kernel_once(mock_b2b, mock_ensure):  # noqa: D103
-    mock_b2b.side_effect = mock_b2b_causal_conv1d
-    proj_conv = MockProjConv(kernel_size=3)
-    mixer = MockMixer(kernel_size=5)
-    b2b_module = B2BCausalConv1dModule(
-        proj_conv,
-        mixer,
-        operator_type="hyena_short_conv",
-        b2b_causal_conv1d=mock_b2b,
-        pg_collection=MockProcessGroupCollection(),
-    )
-
-    x = torch.randn(2, 96, 32, device="cuda")
-    b2b_module(x)
-    b2b_module(x)
-
-    assert mock_ensure.call_count == 1
-    assert mock_b2b.call_count == 2
-
-
 def test_b2b_causal_conv1d_effective_padding_size():
     """Test the zigzag pattern for data distribution in context parallel mode."""
     proj_conv = MockProjConv(kernel_size=3)
@@ -344,14 +321,14 @@ def test_b2b_causal_conv1d_effective_padding_size():
     assert b2b_module.effective_pad_size == expected_pad_size
 
 
-@pytest.mark.xfail(
-    reason="subquadratic-ops fused B2B kernel may fail CUDA/PTX self-test on unsupported GPUs",
-    strict=True,
-)
 def test_b2b_causal_conv1d_module_matches_sequential_reference():
     """Document the isolated B2B CUDA kernel behavior before relying on the fused path."""
     if not torch.cuda.is_available():
         pytest.skip("B2B causal conv isolation test requires CUDA")
+    try:
+        ensure_subquadratic_ops_supported()
+    except RuntimeError as e:
+        pytest.xfail(str(e))
 
     torch.manual_seed(1234)
     batch_size = 2