Go back to the original subq version, assume it works on other GPUs, and fail loudly if the CUDA_ERROR_UNSUPPORTED_PTX_VERSION error comes up

jstjohn · jstjohn · commit 49be647490dc · 2026-05-22T12:51:22.000-07:00
Signed-off-by: John St. John <jstjohn@nvidia.com>
diff --git a/bionemo-recipes/recipes/evo2_megatron/pyproject.toml b/bionemo-recipes/recipes/evo2_megatron/pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
     # nvidia-resiliency-ext is pulled transitively by megatron-bridge.
     "emerging_optimizers",
     "subquadratic-ops-torch-cu13",
+    "email-validator",
 
     # These are dependencies for examples only, but are useful for actually doing analyses with this model
     "biopython",
@@ -88,6 +89,8 @@ override-dependencies = [
     "triton; sys_platform == 'never'",
     "transformer-engine; sys_platform == 'never'",
     "transformer-engine[pytorch]; sys_platform == 'never'",
+    # Avoid alpha Pydantic releases; langchain imports pulled by nvidia-resiliency-ext are not compatible.
+    "pydantic>=2.12,<2.14",
     # Avoid optional log-pattern-mining dependency conflicts from nvidia-resiliency-ext.
     "logsage; sys_platform == 'never'",
     "drain3; sys_platform == 'never'",
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/megatron/hyena/engine.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/megatron/hyena/engine.py
@@ -19,10 +19,17 @@
 import torch.nn.functional as F  # noqa: N812
 from einops import rearrange
 
+from bionemo.evo2.models.megatron.hyena.subquadratic_safety import (
+    ensure_subquadratic_causal_conv1d_supported,
+    ensure_subquadratic_fft_causal_conv1d_supported,
+)
+
 
 try:
+    from subquadratic_ops_torch.causal_conv1d import causal_conv1d as _subq_causal_conv1d
     from subquadratic_ops_torch.fft_causal_conv1d import fft_causal_conv1d as _subq_fft_causal_conv1d
 except ImportError as _subq_import_error:
+    _subq_causal_conv1d = None
     _subq_fft_causal_conv1d = None
     _subq_error_msg = f"subquadratic_ops_torch not available: {_subq_import_error}"
 
@@ -87,6 +94,7 @@ def parallel_fir(
     if fir_length >= 128:
         if use_subquadratic_ops:
             # subq-ops fft_causal_conv1d expects [B, D, L] input and [D, L] filter; dtypes must match
+            ensure_subquadratic_fft_causal_conv1d_supported()
             k = weight[:, :, :L].squeeze(1) if weight.dim() == 3 else weight[:, :L]
             u_fp32 = u.to(torch.float32)
             z = _subq_fft_causal_conv1d(u_fp32, k.to(torch.float32))
@@ -101,14 +109,24 @@ def parallel_fir(
                     D=bias,
                 ).to(dtype=u.dtype)
     else:
-        z = F.conv1d(
-            u.to(torch.float32),
-            weight.to(torch.float32),
-            bias=None,
-            stride=1,
-            padding=fir_length - 1,
-            groups=u.shape[1],  # always set to D, regardless of filter grouping
-        )[..., :L]
+        if use_subquadratic_ops:
+            if _subq_causal_conv1d is None:
+                raise ImportError(_subq_error_msg)
+            # subq-ops causal_conv1d expects pre-padded [B, D, L+pad] input and [D, K] weight.
+            ensure_subquadratic_causal_conv1d_supported()
+            pad_size = fir_length - 1
+            x_padded = F.pad(u.to(torch.float32), (pad_size, 0))
+            w = weight.squeeze(1) if weight.dim() == 3 else weight
+            z = _subq_causal_conv1d(x_padded, w.to(torch.float32))[..., pad_size:]
+        else:
+            z = F.conv1d(
+                u.to(torch.float32),
+                weight.to(torch.float32),
+                bias=None,
+                stride=1,
+                padding=fir_length - 1,
+                groups=u.shape[1],
+            )[..., :L]
 
         z = z.to(u.dtype)
 
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/megatron/hyena/hyena_mixer.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/megatron/hyena/hyena_mixer.py
@@ -119,9 +119,7 @@ def __init__(
         self.fast_conv_mixer = self.hyena_config.fast_conv_mixer
 
         self.use_subquadratic_ops = self.transformer_config.use_subquadratic_ops
-        # TODO: Re-enable B2BCausalConv1dModule for short/medium Hyena layers once
-        # subquadratic-ops updates it to support causal_conv1d 1.6+ semantics.
-        self.use_fused_b2b_causal_conv1d = False
+        self.use_fused_b2b_causal_conv1d = self.use_subquadratic_ops
 
         # Per attention head and per partition values.
         assert torch.distributed.is_initialized()
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/megatron/hyena/hyena_utils.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/megatron/hyena/hyena_utils.py
@@ -33,6 +33,11 @@
 from torch.autograd.function import Function
 
 from bionemo.evo2.models.megatron.hyena.hyena_config import HyenaConfig
+from bionemo.evo2.models.megatron.hyena.subquadratic_safety import (
+    ensure_subquadratic_b2b_causal_conv1d_supported,
+    ensure_subquadratic_causal_conv1d_supported,
+    ensure_subquadratic_fft_causal_conv1d_supported,
+)
 
 
 try:
@@ -50,10 +55,25 @@ def causal_conv1d_fn(*args, **kwargs):
 
 
 try:
-    from subquadratic_ops_torch.b2b_causal_conv1d import b2b_causal_conv1d
-    from subquadratic_ops_torch.causal_conv1d import causal_conv1d
-    from subquadratic_ops_torch.fft_causal_conv1d import fft_causal_conv1d
+    from subquadratic_ops_torch.b2b_causal_conv1d import b2b_causal_conv1d as _subq_b2b_causal_conv1d
+    from subquadratic_ops_torch.causal_conv1d import causal_conv1d as _subq_causal_conv1d
+    from subquadratic_ops_torch.fft_causal_conv1d import fft_causal_conv1d as _subq_fft_causal_conv1d
     from subquadratic_ops_torch.implicit_filter import implicit_filter
+
+    def causal_conv1d(*args, **kwargs):
+        """Run guarded subquadratic causal_conv1d.
+
+        Calls ``ensure_subquadratic_causal_conv1d_supported()`` first, then forwards all
+        positional and keyword arguments unchanged to ``subquadratic_ops_torch.causal_conv1d``
+        and returns its result.
+        """
+        ensure_subquadratic_causal_conv1d_supported()
+        return _subq_causal_conv1d(*args, **kwargs)
+
+    def b2b_causal_conv1d(*args, **kwargs):
+        """Run guarded subquadratic b2b_causal_conv1d.
+
+        Calls ``ensure_subquadratic_b2b_causal_conv1d_supported()`` first, then forwards all
+        positional and keyword arguments unchanged to ``subquadratic_ops_torch.b2b_causal_conv1d``
+        and returns its result.
+        """
+        ensure_subquadratic_b2b_causal_conv1d_supported()
+        return _subq_b2b_causal_conv1d(*args, **kwargs)
+
+    def fft_causal_conv1d(*args, **kwargs):
+        """Run guarded subquadratic fft_causal_conv1d.
+
+        Calls ``ensure_subquadratic_fft_causal_conv1d_supported()`` first, then forwards all
+        positional and keyword arguments unchanged to ``subquadratic_ops_torch.fft_causal_conv1d``
+        and returns its result.
+        """
+        ensure_subquadratic_fft_causal_conv1d_supported()
+        return _subq_fft_causal_conv1d(*args, **kwargs)
 except ImportError as e:
     msg_causal_conv1d = f"Problem importing subquadratic_ops: {e}. causal_conv1d is not available."
     msg_b2b_causal_conv1d = f"Problem importing subquadratic_ops: {e}. b2b_causal_conv1d is not available."
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/megatron/hyena/subquadratic_safety.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/megatron/hyena/subquadratic_safety.py
@@ -0,0 +1,158 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import lru_cache
+
+import torch
+import torch.nn.functional as F  # noqa: N812
+
+
+def _raise_subquadratic_self_test_error(op_name: str, detail: str) -> None:
+    """Raise the RuntimeError used to report a failed subquadratic_ops_torch self-test.
+
+    Args:
+        op_name: Op name interpolated after ``subquadratic_ops_torch.`` in the message.
+        detail: Short failure description shown in parentheses in the message.
+
+    Raises:
+        RuntimeError: Always; this function never returns normally.
+    """
+    raise RuntimeError(
+        f"subquadratic_ops_torch.{op_name} failed a CUDA self-test ({detail}). "
+        "This often happens with CUDA_ERROR_UNSUPPORTED_PTX_VERSION or unsupported GPU/toolchain "
+        "combinations. Refusing to run this subquadratic kernel because it can otherwise return "
+        "invalid outputs without raising."
+    )
+
+
+def _assert_close_or_raise(op_name: str, actual: torch.Tensor, expected: torch.Tensor) -> None:
+    """Check a self-test result against its reference and raise on any mismatch.
+
+    Args:
+        op_name: Op name forwarded to the error message.
+        actual: Output produced by the op under test.
+        expected: Reference output to compare against.
+
+    Raises:
+        RuntimeError: Via ``_raise_subquadratic_self_test_error`` when ``actual`` contains
+            non-finite values or is not within rtol=1e-4 / atol=1e-4 of ``expected``.
+    """
+    # Synchronize the device holding the output before inspecting its values.
+    torch.cuda.synchronize(actual.device)
+    if not torch.isfinite(actual).all():
+        _raise_subquadratic_self_test_error(op_name, "non-finite output")
+
+    if not torch.allclose(actual, expected, rtol=1e-4, atol=1e-4):
+        # Both diagnostics are computed in float32 and only on the failure path.
+        max_diff = (actual.float() - expected.float()).abs().max().item()
+        # Relative L2 error; the 1e-30 term keeps the division defined for an all-zero reference.
+        rel = (
+            (actual.float() - expected.float()).pow(2).sum().sqrt() / (expected.float().pow(2).sum().sqrt() + 1e-30)
+        ).item()
+        _raise_subquadratic_self_test_error(op_name, f"max_diff={max_diff:.6g}, rel={rel:.6g}")
+
+
+@lru_cache(maxsize=None)
+def ensure_subquadratic_causal_conv1d_supported(device_index: int | None = None) -> None:
+    """Validate subquadratic_ops_torch.causal_conv1d before using it for model data.
+
+    Runs the op on a small deterministic input and compares the result with a depthwise
+    ``F.conv1d`` reference. Returns without testing when CUDA is not available.
+
+    Args:
+        device_index: CUDA device index to test; ``None`` selects ``torch.cuda.current_device()``.
+
+    Raises:
+        ImportError: If ``subquadratic_ops_torch.causal_conv1d`` cannot be imported.
+        RuntimeError: Via ``_assert_close_or_raise`` when the op output is non-finite or does
+            not match the reference.
+
+    NOTE(review): ``lru_cache`` keys on the argument, so every call that omits ``device_index``
+    shares one cache entry regardless of which device is current at call time -- confirm that
+    callers use a single device per process.
+    """
+    if not torch.cuda.is_available():
+        return
+
+    device_index = torch.cuda.current_device() if device_index is None else device_index
+    device = torch.device("cuda", device_index)
+
+    # Imported here, after the CUDA check, rather than at module level.
+    from subquadratic_ops_torch.causal_conv1d import causal_conv1d as subq_causal_conv1d
+
+    batch_size = 1
+    hidden_size = 4
+    seq_len = 8
+    kernel_size = 3
+    pad_size = kernel_size - 1
+
+    # Deterministic inputs: u is [batch, hidden, seq_len], weight is [hidden, kernel_size].
+    u = torch.linspace(-1.0, 1.0, steps=batch_size * hidden_size * seq_len, device=device).reshape(
+        batch_size, hidden_size, seq_len
+    )
+    weight = torch.linspace(-0.5, 0.5, steps=hidden_size * kernel_size, device=device).reshape(
+        hidden_size, kernel_size
+    )
+
+    # Reference: depthwise conv (groups == channels) truncated to the first seq_len outputs.
+    expected = F.conv1d(
+        u,
+        weight.unsqueeze(1),
+        bias=None,
+        stride=1,
+        padding=pad_size,
+        groups=hidden_size,
+    )[..., :seq_len]
+    # The op is fed left-padded input; the first pad_size output positions are dropped.
+    actual = subq_causal_conv1d(F.pad(u, (pad_size, 0)), weight)[..., pad_size:]
+    _assert_close_or_raise("causal_conv1d", actual, expected)
+
+
+@lru_cache(maxsize=None)
+def ensure_subquadratic_fft_causal_conv1d_supported(device_index: int | None = None) -> None:
+    """Validate subquadratic_ops_torch.fft_causal_conv1d before using it for model data.
+
+    Runs the op on a small deterministic input and compares the result with a depthwise
+    ``F.conv1d`` reference built from the flipped kernel. Returns without testing when CUDA
+    is not available.
+
+    Args:
+        device_index: CUDA device index to test; ``None`` selects ``torch.cuda.current_device()``.
+
+    Raises:
+        ImportError: If ``subquadratic_ops_torch.fft_causal_conv1d`` cannot be imported.
+        RuntimeError: Via ``_assert_close_or_raise`` when the op output is non-finite or does
+            not match the reference.
+
+    NOTE(review): ``lru_cache`` keys on the argument, so every call that omits ``device_index``
+    shares one cache entry regardless of which device is current at call time -- confirm that
+    callers use a single device per process.
+    """
+    if not torch.cuda.is_available():
+        return
+
+    device_index = torch.cuda.current_device() if device_index is None else device_index
+    device = torch.device("cuda", device_index)
+
+    # Imported here, after the CUDA check, rather than at module level.
+    from subquadratic_ops_torch.fft_causal_conv1d import fft_causal_conv1d as subq_fft_causal_conv1d
+
+    batch_size = 1
+    hidden_size = 4
+    seq_len = 8
+    kernel_size = 5
+
+    # Deterministic inputs: u is [batch, hidden, seq_len], weight is [hidden, kernel_size].
+    u = torch.linspace(-1.0, 1.0, steps=batch_size * hidden_size * seq_len, device=device).reshape(
+        batch_size, hidden_size, seq_len
+    )
+    weight = torch.linspace(-0.5, 0.5, steps=hidden_size * kernel_size, device=device).reshape(
+        hidden_size, kernel_size
+    )
+
+    # The reference kernel is flipped along its last dim before F.conv1d -- presumably because
+    # F.conv1d cross-correlates while the op applies a true convolution; TODO confirm.
+    expected = F.conv1d(
+        u,
+        weight.flip(-1).unsqueeze(1),
+        bias=None,
+        stride=1,
+        padding=kernel_size - 1,
+        groups=hidden_size,
+    )[..., :seq_len]
+    # Unlike the causal_conv1d self-test, the op receives the unpadded input directly.
+    actual = subq_fft_causal_conv1d(u, weight)
+    _assert_close_or_raise("fft_causal_conv1d", actual, expected)
+
+
+@lru_cache(maxsize=None)
+def ensure_subquadratic_b2b_causal_conv1d_supported(device_index: int | None = None) -> None:
+    """Validate subquadratic_ops_torch.b2b_causal_conv1d before using it for model data.
+
+    Runs the fused op on a small deterministic input and compares the result with an unfused
+    reference: a depthwise projection conv, a stride-3 channel split into ``x1``/``x2``/``v``,
+    a depthwise mixer conv over ``x2 * v`` plus a per-channel bias term, and a final
+    multiplication by ``x1``. Returns without testing when CUDA is not available.
+
+    Args:
+        device_index: CUDA device index to test; ``None`` selects ``torch.cuda.current_device()``.
+
+    Raises:
+        ImportError: If ``subquadratic_ops_torch.b2b_causal_conv1d`` cannot be imported.
+        RuntimeError: Via ``_assert_close_or_raise`` when the op output is non-finite or does
+            not match the reference.
+
+    NOTE(review): ``lru_cache`` keys on the argument, so every call that omits ``device_index``
+    shares one cache entry regardless of which device is current at call time -- confirm that
+    callers use a single device per process.
+    """
+    if not torch.cuda.is_available():
+        return
+
+    device_index = torch.cuda.current_device() if device_index is None else device_index
+    device = torch.device("cuda", device_index)
+
+    # Imported here, after the CUDA check, rather than at module level.
+    from subquadratic_ops_torch.b2b_causal_conv1d import b2b_causal_conv1d as subq_b2b_causal_conv1d
+
+    batch_size = 1
+    hidden_size = 2
+    seq_len = 10
+    proj_kernel_size = 3
+    mixer_kernel_size = 7
+
+    # x carries 3 * hidden_size channels; the reference below de-interleaves them with stride 3.
+    x = torch.linspace(-1.0, 1.0, steps=batch_size * 3 * hidden_size * seq_len, device=device).reshape(
+        batch_size, 3 * hidden_size, seq_len
+    )
+    proj_weight = torch.linspace(-0.5, 0.5, steps=3 * hidden_size * proj_kernel_size, device=device).reshape(
+        3 * hidden_size, proj_kernel_size
+    )
+    mixer_weight = torch.linspace(-0.25, 0.25, steps=hidden_size * mixer_kernel_size, device=device).reshape(
+        hidden_size, mixer_kernel_size
+    )
+    bias = torch.linspace(-0.1, 0.1, steps=hidden_size, device=device)
+
+    actual = subq_b2b_causal_conv1d(x, proj_weight, mixer_weight, bias)
+
+    # Reference step 1: depthwise projection conv on left-padded input, with the kernel flipped.
+    projected = F.conv1d(
+        F.pad(x, (proj_kernel_size - 1, 0)),
+        proj_weight.flip(-1).unsqueeze(1),
+        groups=3 * hidden_size,
+    )
+    # Reference step 2: split interleaved channels and form the elementwise product x2 * v.
+    x1, x2, v = projected[:, ::3], projected[:, 1::3], projected[:, 2::3]
+    z = x2 * v
+    # Reference step 3: depthwise mixer conv on left-padded z, with the kernel flipped.
+    mixed = F.conv1d(
+        F.pad(z, (mixer_kernel_size - 1, 0)),
+        mixer_weight.flip(-1).unsqueeze(1),
+        groups=hidden_size,
+    )
+    # Reference step 4: add the per-channel bias times z, then multiply by x1.
+    expected = x1 * (mixed + bias[None, :, None] * z)
+    _assert_close_or_raise("b2b_causal_conv1d", actual, expected)
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/infer.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/infer.py
@@ -77,7 +77,14 @@
 )
 from megatron.bridge.training.config import DistributedInitConfig, RNGConfig
 from megatron.bridge.training.mixed_precision import get_mixed_precision_config
-from megatron.bridge.training.tokenizers.tokenizer import _HuggingFaceTokenizer
+
+
+try:
+    from megatron.bridge.training.tokenizers.tokenizer import _HuggingFaceTokenizer
+except ImportError:
+    from megatron.core.tokenizers.text.libraries.huggingface_tokenizer import (
+        HuggingFaceTokenizer as _HuggingFaceTokenizer,
+    )
 from megatron.bridge.training.utils.checkpoint_utils import (
     file_exists,
     get_checkpoint_run_config_filename,
diff --git a/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/models/megatron/hyena/test_engine.py b/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/models/megatron/hyena/test_engine.py
@@ -13,10 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: LicenseRef-Apache2
-
+import pytest
 import torch
+import torch.nn.functional as F  # noqa: N812
 
 from bionemo.evo2.models.megatron.hyena import engine
 
@@ -77,3 +76,51 @@ def test_parallel_iir_is_prefix_invariant_when_filter_is_longer_than_input():
     )
 
     torch.testing.assert_close(short_out, long_out[:, :short_len], rtol=1e-5, atol=1e-5)
+
+
+@pytest.mark.parametrize("use_subquadratic_ops", [False, True], ids=["torch", "subq"])
+def test_parallel_fir_short_cuda_path_matches_torch_depthwise_conv1d(use_subquadratic_ops):
+    """Short FIR prefill should match F.conv1d or fail before returning bad subq output.
+
+    Runs once per backend ("torch" and "subq"). The subq case is reported as xfail when
+    ``parallel_fir`` raises the CUDA self-test RuntimeError; any other exception propagates.
+    """
+    if not torch.cuda.is_available():
+        pytest.skip("short FIR CUDA path requires CUDA")
+
+    torch.manual_seed(1234)
+    batch_size = 2
+    seq_len = 17
+    hidden_size = 8
+    kernel_size = 7
+    device = torch.device("cuda")
+
+    # u is created as [batch, seq_len, hidden]; the reference below works on its transpose.
+    u = torch.randn(batch_size, seq_len, hidden_size, device=device)
+    weight = torch.randn(hidden_size, 1, kernel_size, device=device)
+    bias = torch.randn(hidden_size, device=device)
+
+    try:
+        actual, state = engine.parallel_fir(
+            u=u,
+            weight=weight,
+            bias=bias,
+            L=seq_len,
+            gated_bias=True,
+            fir_length=kernel_size,
+            compute_state=True,
+            use_subquadratic_ops=use_subquadratic_ops,
+        )
+    except RuntimeError as e:
+        # Only the self-test failure on the subq backend counts as an expected failure.
+        if use_subquadratic_ops and "failed a CUDA self-test" in str(e):
+            pytest.xfail(str(e))
+        raise
+
+    # Reference: depthwise F.conv1d on [batch, hidden, seq_len], truncated to seq_len outputs.
+    u_bdl = u.transpose(1, 2).contiguous()
+    expected = F.conv1d(
+        u_bdl.float(),
+        weight.float(),
+        bias=None,
+        stride=1,
+        padding=kernel_size - 1,
+        groups=hidden_size,
+    )[..., :seq_len]
+    # The expected output adds the per-channel bias multiplied by the input.
+    expected = expected.to(u.dtype) + bias[None, :, None] * u_bdl
+
+    torch.testing.assert_close(actual, expected, rtol=1e-5, atol=1e-5)
+    # The returned state is expected to equal the last kernel_size - 1 input positions.
+    torch.testing.assert_close(state, u_bdl[..., -(kernel_size - 1) :])
diff --git a/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/models/megatron/hyena/test_hyena_utils.py b/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/models/megatron/hyena/test_hyena_utils.py
@@ -296,11 +296,11 @@ def test_b2b_causal_conv1d_effective_padding_size():
 
 
 @pytest.mark.xfail(
-    reason="subquadratic-ops fused B2B kernel does not match causal_conv1d 1.6+ short-conv semantics",
+    reason="subquadratic-ops fused B2B kernel may fail CUDA/PTX self-test on unsupported GPUs",
     strict=True,
 )
 def test_b2b_causal_conv1d_module_matches_sequential_reference():
-    """Document the isolated B2B mismatch before re-enabling the fused path."""
+    """Document the isolated B2B CUDA kernel behavior before relying on the fused path."""
     if not torch.cuda.is_available():
         pytest.skip("B2B causal conv isolation test requires CUDA")
 
diff --git a/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/run/test_infer.py b/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/run/test_infer.py
diff --git a/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/run/test_predict.py b/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/run/test_predict.py