add infer_evo2 --use-subquadratic-ops flag with test for matching baseline

farhadrgh · farhadrgh · commit 8b4bd11ddad0 · 2026-04-29T23:43:44.000Z
Signed-off-by: Farhad Ramezanghorbani <farhadr@nvidia.com>
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/infer.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/infer.py
@@ -358,6 +358,7 @@ def setup_inference_engine(
     vortex_style_fp8: bool = False,
     random_seed: int = 1234,
     prompt_segmentation_threshold: Optional[int] = None,
+    use_subquadratic_ops: bool = False,
 ) -> Evo2InferenceComponents:
     """Setup the Evo2 inference engine and related components.
 
@@ -379,6 +380,9 @@ def setup_inference_engine(
             segmented during prefill to reduce peak memory. The first segment
             runs as a normal prefill; remaining tokens are processed one at a
             time before generation begins.
+        use_subquadratic_ops: Use fused subquadratic-ops kernels (b2b causal
+            conv1d in prefill, fft_causal_conv1d / causal_conv1d in
+            parallel_fir).
 
     Returns:
         Evo2InferenceComponents containing all inference components.
@@ -412,6 +416,7 @@ def setup_inference_engine(
     model_provider.sequence_parallel = False
 
     model_provider.flash_decode = True
+    model_provider.use_subquadratic_ops = use_subquadratic_ops
 
     if vortex_style_fp8:
         model_provider.vortex_style_fp8 = True
@@ -808,6 +813,14 @@ def parse_args() -> argparse.Namespace:
         "generation begins. Useful for long prompts that would otherwise OOM. "
         "Also settable via EVO2_PST env var.",
     )
+    ap.add_argument(
+        "--use-subquadratic-ops",
+        action="store_true",
+        default=False,
+        help="Use fused subquadratic-ops CUDA kernels (b2b causal conv1d in prefill, "
+        "fft_causal_conv1d / causal_conv1d in parallel_fir). Speeds up prompt processing "
+        "but has no effect on per-token decode throughput.",
+    )
 
     return ap.parse_args()
 
@@ -831,6 +844,7 @@ def infer(
     max_seq_length: int = 8192,
     max_batch_size: int = 1,
     prompt_segmentation_threshold: Optional[int] = None,
+    use_subquadratic_ops: bool = False,
 ) -> List[Dict[str, Any]]:
     """Run autoregressive text generation with Evo2 using MCore inference.
 
@@ -858,6 +872,7 @@ def infer(
             GPU memory proportional to this value. For large models, only 1 may fit.
         prompt_segmentation_threshold: If set, prompts longer than this are segmented
             during prefill to reduce peak memory.
+        use_subquadratic_ops: Use fused subquadratic-ops kernels in the inference path.
 
     Returns:
         List of JSONL-serialisable result dicts.
@@ -878,6 +893,7 @@ def infer(
         vortex_style_fp8=vortex_style_fp8,
         random_seed=random_seed,
         prompt_segmentation_threshold=prompt_segmentation_threshold,
+        use_subquadratic_ops=use_subquadratic_ops,
     )
 
     mem_after_setup_gb = torch.cuda.max_memory_allocated() / (1024**3)
@@ -1003,6 +1019,7 @@ def main() -> None:
         max_seq_length=max_seq_length,
         max_batch_size=args.max_batch_size,
         prompt_segmentation_threshold=prompt_segmentation_threshold,
+        use_subquadratic_ops=args.use_subquadratic_ops,
     )
 
 
diff --git a/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/run/test_infer.py b/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/run/test_infer.py
@@ -284,6 +284,7 @@ def run_infer_subprocess(
     temperature: float = 1.0,
     top_k: int = 1,
     seed: int = 42,
+    use_subquadratic_ops: bool = False,
 ):
     """Helper function to run inference as a subprocess.
 
@@ -295,6 +296,7 @@ def run_infer_subprocess(
         temperature: Sampling temperature
         top_k: Top-k sampling parameter (1 for greedy)
         seed: Random seed for reproducibility
+        use_subquadratic_ops: Pass --use-subquadratic-ops to the CLI.
 
     Returns:
         The generated completion text from the first JSONL record
@@ -326,6 +328,8 @@ def run_infer_subprocess(
         "--seed",
         str(seed),
     ]
+    if use_subquadratic_ops:
+        cmd.append("--use-subquadratic-ops")
 
     env = copy.deepcopy(PRETEST_ENV)
 
@@ -517,6 +521,47 @@ def test_identical_prompts_should_be_identical(mbridge_checkpoint_path, tmp_path
     )
 
 
+def test_subquadratic_ops_matches_baseline(mbridge_checkpoint_path, tmp_path):
+    """Greedy generation with --use-subquadratic-ops must match the standard path.
+
+    This is the end-to-end correctness check for the subq-ops inference path:
+    Phase 1 routes engine.parallel_fir through subq-ops kernels during prefill,
+    Phase 2 fuses proj+mixer convs via b2b_causal_conv1d during prefill and
+    populates FIR caches for the subsequent decode steps. With greedy decoding
+    (top_k=1) and the same seed, both paths must produce identical output.
+    """
+    output_baseline = tmp_path / "output_baseline.jsonl"
+    output_subq = tmp_path / "output_subq.jsonl"
+
+    generated_baseline = run_infer_subprocess(
+        mbridge_checkpoint_path,
+        prompt=PROMPT_1,
+        output_file=output_baseline,
+        max_new_tokens=20,
+        temperature=1.0,
+        top_k=1,
+        seed=42,
+        use_subquadratic_ops=False,
+    )
+
+    generated_subq = run_infer_subprocess(
+        mbridge_checkpoint_path,
+        prompt=PROMPT_1,
+        output_file=output_subq,
+        max_new_tokens=20,
+        temperature=1.0,
+        top_k=1,
+        seed=42,
+        use_subquadratic_ops=True,
+    )
+
+    assert len(generated_baseline) > 0, "Baseline generation produced empty output"
+    assert len(generated_subq) > 0, "Subq-ops generation produced empty output"
+    assert generated_baseline == generated_subq, (
+        f"Subq-ops path diverged from baseline:\nBaseline: {generated_baseline}\nSubq-ops: {generated_subq}"
+    )
+
+
 def test_different_prompts_produce_different_outputs(mbridge_checkpoint_path, tmp_path):
     """Test that different prompts produce different sequences.