Enable pipeline model parallelism for Evo2 inference (#1478)

kjaniknvidia · claude · web-flow · commit 6a60786a7e83 · 2026-02-24T19:20:58.000Z
Remove the PP > 1 guard, the argparse `choices=[1]` restriction, and the hardcoded `pre_process=True`/`post_process=True` arguments so the model provider auto-detects its pipeline stage. Tested with PP=1, PP=2, and PP=5.

### Description

For the most part I just removed the guards that forced PP=1. There is only one functional line change (item 2 below).

1. Line 257 — Removed the `if pipeline_model_parallel_size != 1: raise ValueError(...)` guard (3 lines deleted)
2. Line 334 — Changed `model_provider.provide(pre_process=True, post_process=True)` to `model_provider.provide()` so each pipeline stage auto-detects whether it needs embedding/output layers
3. Line 508 — Removed `choices=[1]` from the `--pipeline-model-parallel-size` argparse argument
4. Lines 245, 553 — Updated docstrings, removing "(must be 1)"

#### Usage

```bash
torchrun --nproc-per-node 2 /workspace/bionemo/src/bionemo/evo2/run/infer.py \
  --ckpt-dir /workspace/bionemo/evo2_1b_8k_bf16_mbridge \
  --prompt "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG" \
  --max-new-tokens 10 \
  --top-k 1 \
  --temperature 1.0 \
  --pipeline-model-parallel-size 2

torchrun --nproc-per-node 5 /workspace/bionemo/src/bionemo/evo2/run/infer.py \
  --ckpt-dir /workspace/bionemo/evo2_1b_8k_bf16_mbridge \
  --prompt "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG" \
  --max-new-tokens 10 \
  --top-k 1 \
  --temperature 1.0 \
  --pipeline-model-parallel-size 5
```

```text
│ PP=1 inference (1 GPU)   PASS  ATCGATCGAT │
│ PP=2 inference (2 GPUs)  PASS  ATCGATCGAT │
│ PP=5 inference (5 GPUs)  PASS  ATCGATCGAT │
```

### Type of changes

- [x] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Refactor
- [ ] Documentation update
- [ ] Other (please describe):

### CI Pipeline Configuration

Configure CI behavior by applying the relevant labels. By default, only basic unit tests are run.

- [ciflow:skip](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:skip) - Skip all CI tests for this PR

Unit tests marked as `@pytest.mark.multi_gpu` or `@pytest.mark.distributed` are not run in the PR pipeline.

For more details, see [CONTRIBUTING](CONTRIBUTING.md)

> [!NOTE]
> By default, only basic unit tests are run. Add appropriate labels to enable additional test coverage.

#### Authorizing CI Runs

We use [copy-pr-bot](https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/#automation) to manage authorization of CI runs on NVIDIA's compute resources.

- If a pull request is opened by a trusted user and contains only trusted changes, the pull request's code will automatically be copied to a `pull-request/`-prefixed branch in the source repository (e.g. `pull-request/123`).
- If a pull request is opened by an untrusted user or contains untrusted changes, an NVIDIA org member must leave an `/ok to test` comment on the pull request to trigger CI. This will need to be done for each new commit.

#### Triggering Code Rabbit AI Review

To trigger a code review from CodeRabbit, comment on a pull request with one of these commands:

- @coderabbitai review - Triggers a standard review
- @coderabbitai full review - Triggers a comprehensive review

See https://docs.coderabbit.ai/reference/review-commands for a full list of commands.

### Pre-submit Checklist

- [x] I have tested these changes locally
- [x] I have updated the documentation accordingly
- [ ] I have added/updated tests as needed
- [x] All existing tests pass successfully

---------

Signed-off-by: Ken Janik <kjanik@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/infer.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/infer.py
@@ -242,7 +242,7 @@ def setup_inference_engine(
         max_seq_length: Maximum sequence length for generation.
         max_batch_size: Maximum batch size for inference.
         tensor_parallel_size: Tensor parallelism degree.
-        pipeline_model_parallel_size: Pipeline parallelism degree (must be 1).
+        pipeline_model_parallel_size: Pipeline parallelism degree.
         context_parallel_size: Context parallelism degree.
         mixed_precision_recipe: Override mixed precision recipe.
         random_seed: Random seed for reproducibility.
@@ -254,9 +254,6 @@ def setup_inference_engine(
         >>> components = setup_inference_engine(Path("/path/to/checkpoint"), max_batch_size=4)
         >>> results = generate(components, prompts=["ATCG", "GCTA"], max_new_tokens=100)
     """
-    if pipeline_model_parallel_size != 1:
-        raise ValueError("Pipeline parallelism > 1 is not supported for inference.")
-
     # -------------------------------------------------------------------------
     # Step 1: Load configuration from checkpoint
     # -------------------------------------------------------------------------
@@ -334,7 +331,7 @@ def setup_inference_engine(
     logger.info("Creating model...")
     model_provider.finalize()
 
-    raw_model = model_provider.provide(pre_process=True, post_process=True).eval().cuda()
+    raw_model = model_provider.provide().eval().cuda()
 
     logger.info(f"Loading weights from: {resolved_ckpt_dir}")
     _load_model_weights_from_checkpoint(
@@ -505,7 +502,7 @@ def parse_args() -> argparse.Namespace:
 
     # Parallelism arguments
     ap.add_argument("--tensor-parallel-size", type=int, default=1, help="Tensor parallelism")
-    ap.add_argument("--pipeline-model-parallel-size", type=int, choices=[1], default=1, help="Pipeline parallelism")
+    ap.add_argument("--pipeline-model-parallel-size", type=int, default=1, help="Pipeline parallelism")
     ap.add_argument("--context-parallel-size", type=int, default=1, help="Context parallelism")
 
     # Output arguments
@@ -550,7 +547,7 @@ def infer(
         top_p: Nucleus sampling parameter (0 = disabled).
         seed: Random seed for reproducibility.
         tensor_parallel_size: Tensor parallelism degree.
-        pipeline_model_parallel_size: Pipeline parallelism degree (must be 1).
+        pipeline_model_parallel_size: Pipeline parallelism degree.
         context_parallel_size: Context parallelism degree.
         output_file: Optional path to save generated text.
         mixed_precision_recipe: Override mixed precision recipe.
diff --git a/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/run/test_infer.py b/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/run/test_infer.py
@@ -363,6 +363,7 @@ def run_infer_subprocess_parallel(
     top_k: int = 1,
     seed: int = 42,
     tensor_parallel_size: int = 1,
+    pipeline_model_parallel_size: int = 1,
     context_parallel_size: int = 1,
 ):
     """Helper to run inference as a subprocess with model parallelism.
@@ -379,12 +380,13 @@ def run_infer_subprocess_parallel(
         top_k: Top-k sampling parameter (1 for greedy).
         seed: Random seed for reproducibility.
         tensor_parallel_size: Tensor parallelism degree.
+        pipeline_model_parallel_size: Pipeline parallelism degree.
         context_parallel_size: Context parallelism degree.
 
     Returns:
         The generated text from the output file.
     """
-    nproc_per_node = tensor_parallel_size * context_parallel_size
+    nproc_per_node = tensor_parallel_size * pipeline_model_parallel_size * context_parallel_size
     open_port = find_free_network_port()
 
     cmd = [
@@ -412,6 +414,8 @@ def run_infer_subprocess_parallel(
         str(seed),
         "--tensor-parallel-size",
         str(tensor_parallel_size),
+        "--pipeline-model-parallel-size",
+        str(pipeline_model_parallel_size),
         "--context-parallel-size",
         str(context_parallel_size),
     ]
@@ -625,29 +629,39 @@ def mbridge_checkpoint_7b_1m_path(tmp_path_factory) -> Path:
 @pytest.mark.slow
 @pytest.mark.timeout(900)
 @pytest.mark.parametrize(
-    "tp, cp",
+    "tp, pp, cp",
     [
         # The 7b model has 32 attention heads, supporting TP=1, 2, 4, 8
-        pytest.param(1, 1, id="tp=1,cp=1"),
-        pytest.param(2, 1, id="tp=2,cp=1"),
-        pytest.param(4, 1, id="tp=4,cp=1"),
-        pytest.param(8, 1, id="tp=8,cp=1"),
+        # TP-only configs
+        pytest.param(1, 1, 1, id="tp=1,pp=1,cp=1"),
+        pytest.param(2, 1, 1, id="tp=2,pp=1,cp=1"),
+        pytest.param(4, 1, 1, id="tp=4,pp=1,cp=1"),
+        pytest.param(8, 1, 1, id="tp=8,pp=1,cp=1"),
+        # PP-only configs
+        pytest.param(1, 2, 1, id="tp=1,pp=2,cp=1"),
+        pytest.param(1, 4, 1, id="tp=1,pp=4,cp=1"),
+        pytest.param(1, 8, 1, id="tp=1,pp=8,cp=1"),
+        # Combined TP+PP configs
+        pytest.param(2, 2, 1, id="tp=2,pp=2,cp=1"),
+        pytest.param(4, 2, 1, id="tp=4,pp=2,cp=1"),
+        # CP>1 configs (known broken)
         pytest.param(
+            1,
             1,
             2,
-            id="tp=1,cp=2",
+            id="tp=1,pp=1,cp=2",
             marks=pytest.mark.xfail(reason="CP>1 is known broken for inference", strict=False),
         ),
     ],
 )
 @pytest.mark.skipif(bool(os.environ.get("CI")), reason="Skip in CI")
-def test_parallel_inference_accuracy_7b(mbridge_checkpoint_7b_1m_path, tmp_path, dna_sequences, tp, cp):
+def test_parallel_inference_accuracy_7b(mbridge_checkpoint_7b_1m_path, tmp_path, dna_sequences, tp, pp, cp):
     """Test that parallel inference with the 7b model produces accurate generation results.
 
-    Uses the 7b-1m checkpoint which supports TP>1 (32 attention heads), enabling
-    proper tensor parallel accuracy testing that the 1b model cannot support.
+    Uses the 7b-1m checkpoint which supports TP>1 (32 attention heads) and PP>1,
+    enabling proper tensor and pipeline parallel accuracy testing.
     """
-    num_gpus_required = tp * cp
+    num_gpus_required = tp * pp * cp
     if torch.cuda.device_count() < num_gpus_required:
         pytest.skip(f"Not enough GPUs: need {num_gpus_required}, have {torch.cuda.device_count()}")
 
@@ -672,6 +686,7 @@ def test_parallel_inference_accuracy_7b(mbridge_checkpoint_7b_1m_path, tmp_path,
             top_k=1,  # Greedy decoding
             seed=42,
             tensor_parallel_size=tp,
+            pipeline_model_parallel_size=pp,
             context_parallel_size=cp,
         )