Modify output path to include dp rank

jstjohn · jstjohn · commit 3059a27d50b4 · 2025-09-15T17:40:41.000Z
Signed-off-by: John St John <jstjohn@nvidia.com>
diff --git a/sub-packages/bionemo-amplify/tests/bionemo/amplify/test_infer_amplify.py b/sub-packages/bionemo-amplify/tests/bionemo/amplify/test_infer_amplify.py
@@ -104,7 +104,7 @@ def test_infer_epoch_mode(
 
     # Load and verify results
     results: Dict[str, torch.Tensor] = {}
-    results = cast(Dict[str, torch.Tensor], torch.load(f"{result_dir}/predictions__rank_0.pt"))
+    results = cast(Dict[str, torch.Tensor], torch.load(f"{result_dir}/predictions__rank_0__dp_rank_0.pt"))
 
     assert isinstance(results, dict)
     keys_included = ["token_logits", "hidden_states", "embeddings", "input_ids"]
diff --git a/sub-packages/bionemo-esm2/examples/finetune.ipynb b/sub-packages/bionemo-esm2/examples/finetune.ipynb
@@ -648,7 +648,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -667,7 +667,7 @@
     "import torch\n",
     "\n",
     "\n",
-    "results = torch.load(f\"{regression_results_path}/predictions__rank_0.pt\")\n",
+    "results = torch.load(f\"{regression_results_path}/predictions__rank_0__dp_rank_0.pt\")\n",
     "\n",
     "for key, val in results.items():\n",
     "    if val is not None:\n",
@@ -833,7 +833,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -853,7 +853,7 @@
     "import torch\n",
     "\n",
     "\n",
-    "results = torch.load(f\"{sequence_classification_results_path}/predictions__rank_0.pt\")\n",
+    "results = torch.load(f\"{sequence_classification_results_path}/predictions__rank_0__dp_rank_0.pt\")\n",
     "\n",
     "for key, val in results.items():\n",
     "    if val is not None:\n",
@@ -1044,7 +1044,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -1062,7 +1062,7 @@
     "import torch\n",
     "\n",
     "\n",
-    "results = torch.load(f\"{token_classification_results_path}/predictions__rank_0.pt\")\n",
+    "results = torch.load(f\"{token_classification_results_path}/predictions__rank_0__dp_rank_0.pt\")\n",
     "\n",
     "for key, val in results.items():\n",
     "    if val is not None:\n",
diff --git a/sub-packages/bionemo-esm2/examples/inference.ipynb b/sub-packages/bionemo-esm2/examples/inference.ipynb
@@ -357,12 +357,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Inference predictions are stored into `.pt` files for each device. Since we only used one device to run the inference (`--num-gpus 1`) in the previous step, the results were written to `{work_dir}/predictions__rank_0.pt` under the work directory of this notebook (defined above). The `.pt` file containes a dictionary of `{'result_key': torch.Tensor}` that be loaded with PyTorch:"
+    "Inference predictions are stored in `.pt` files for each device. Since we only used one device to run the inference (`--num-gpus 1`) in the previous step, the results were written to `{work_dir}/predictions__rank_0__dp_rank_0.pt` under the work directory of this notebook (defined above). The `.pt` file contains a dictionary of `{'result_key': torch.Tensor}` that can be loaded with PyTorch:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -377,7 +377,7 @@
     }
    ],
    "source": [
-    "results = torch.load(f\"{work_dir}/predictions__rank_0.pt\")\n",
+    "results = torch.load(f\"{work_dir}/predictions__rank_0__dp_rank_0.pt\")\n",
     "\n",
     "for key, val in results.items():\n",
     "    if val is not None:\n",
@@ -609,7 +609,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 64,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -625,7 +625,7 @@
     }
    ],
    "source": [
-    "results = torch.load(f\"{work_dir}/predictions__rank_0.pt\")\n",
+    "results = torch.load(f\"{work_dir}/predictions__rank_0__dp_rank_0.pt\")\n",
     "\n",
     "for key, val in results.items():\n",
     "    if val is not None:\n",
diff --git a/sub-packages/bionemo-esm2/examples/mutant-design.ipynb b/sub-packages/bionemo-esm2/examples/mutant-design.ipynb
@@ -400,12 +400,12 @@
    "id": "67d09581-e784-4ccc-be88-194c8909068c",
    "metadata": {},
    "source": [
-    "This will write the output of ESM-2 inference into a python dictionary and save that into `predictions__rank_0.pt` which can be loaded via PyTorch. DDP inference is supported in BioNeMo Framework and can be utilized by setting `--num-gpus n` to use `n` devices. The output predictions are then written to n distinct files `predictions__rank_<0...n-1>.pt`. Please refer to [ESM-2 Inference Tutorial](./inference.ipynb) for more information regarding the DDP support and how to interpret the prediction outputs."
+    "This will write the output of ESM-2 inference into a Python dictionary and save that into `predictions__rank_0__dp_rank_0.pt` which can be loaded via PyTorch. DDP inference is supported in BioNeMo Framework and can be utilized by setting `--num-gpus n` to use `n` devices. The output predictions are then written to n distinct files, one per device, named `predictions__rank_<rank>__dp_rank_<dp_rank>.pt`. Please refer to [ESM-2 Inference Tutorial](./inference.ipynb) for more information regarding the DDP support and how to interpret the prediction outputs."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "id": "2b48c5a7",
    "metadata": {},
    "outputs": [
@@ -421,7 +421,7 @@
     }
    ],
    "source": [
-    "results = torch.load(f\"{example_dir}/predictions__rank_0.pt\")\n",
+    "results = torch.load(f\"{example_dir}/predictions__rank_0__dp_rank_0.pt\")\n",
     "\n",
     "for key, val in results.items():\n",
     "    if val is not None:\n",
@@ -749,7 +749,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "id": "8ec1e825",
    "metadata": {},
    "outputs": [
@@ -762,7 +762,7 @@
     }
    ],
    "source": [
-    "results = torch.load(f\"{work_dir}/predictions__rank_0.pt\")\n",
+    "results = torch.load(f\"{work_dir}/predictions__rank_0__dp_rank_0.pt\")\n",
     "\n",
     "# cast to FP32 since BFloat16 is an unsupported ScalarType in numpy\n",
     "logits = results[\"token_logits\"].transpose(0, 1).to(dtype=torch.float32)  # s, b, h  -> b, s, h\n",
diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_esm2_lora.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_esm2_lora.py
@@ -115,20 +115,20 @@ def test_different_results_with_peft(
         )
 
     if prediction_interval == "epoch":
-        results_original = torch.load(f"{result_dir_original}/predictions__rank_0.pt")
-        results_peft = torch.load(f"{result_dir_peft}/predictions__rank_0.pt")
+        results_original = torch.load(f"{result_dir_original}/predictions__rank_0__dp_rank_0.pt")
+        results_peft = torch.load(f"{result_dir_peft}/predictions__rank_0__dp_rank_0.pt")
 
     elif prediction_interval == "batch":
         results_original = batch_collator(
             [
                 torch.load(f, map_location="cpu")
-                for f in glob.glob(f"{result_dir_original}/predictions__rank_0__batch_*.pt")
+                for f in glob.glob(f"{result_dir_original}/predictions__rank_0__dp_rank_0__batch_*.pt")
             ]
         )
         results_peft = batch_collator(
             [
                 torch.load(f, map_location="cpu")
-                for f in glob.glob(f"{result_dir_peft}/predictions__rank_0__batch_*.pt")
+                for f in glob.glob(f"{result_dir_peft}/predictions__rank_0__dp_rank_0__batch_*.pt")
             ]
         )
     assert (results_original["embeddings"] != results_peft["embeddings"]).any()
diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_infer_esm2.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_infer_esm2.py
@@ -99,10 +99,13 @@ def test_infer_without_finetune_head(
     assert result_dir.exists(), "Could not find test results directory."
 
     if prediction_interval == "epoch":
-        results = torch.load(f"{result_dir}/predictions__rank_0.pt")
+        results = torch.load(f"{result_dir}/predictions__rank_0__dp_rank_0.pt")
     elif prediction_interval == "batch":
         results = batch_collator(
-            [torch.load(f, map_location="cpu") for f in glob.glob(f"{result_dir}/predictions__rank_0__batch_*.pt")]
+            [
+                torch.load(f, map_location="cpu")
+                for f in glob.glob(f"{result_dir}/predictions__rank_0__dp_rank_0__batch_*.pt")
+            ]
         )
     assert isinstance(results, dict)
     keys_included = ["token_logits", "hidden_states", "embeddings", "binary_logits", "input_ids"]
diff --git a/sub-packages/bionemo-geneformer/examples/geneformer-celltype-classification.ipynb b/sub-packages/bionemo-geneformer/examples/geneformer-celltype-classification.ipynb
@@ -926,35 +926,35 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "import torch\n",
     "\n",
     "\n",
-    "infer_Xs_10m = torch.load(result_path_10m / \"predictions__rank_0.pt\")[\"embeddings\"].float().cpu().numpy()\n",
+    "infer_Xs_10m = torch.load(result_path_10m / \"predictions__rank_0__dp_rank_0.pt\")[\"embeddings\"].float().cpu().numpy()\n",
     "assert len(adata) == len(infer_Xs_10m), (len(adata), len(infer_Xs_10m))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "infer_Xs_106m = torch.load(result_path_106m / \"predictions__rank_0.pt\")[\"embeddings\"].float().cpu().numpy()\n",
+    "infer_Xs_106m = torch.load(result_path_106m / \"predictions__rank_0__dp_rank_0.pt\")[\"embeddings\"].float().cpu().numpy()\n",
     "assert len(adata) == len(infer_Xs_106m), (len(adata), len(infer_Xs_106m))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "infer_Xs_10m_random = (\n",
-    "    torch.load(results_path_10m_random / \"predictions__rank_0.pt\")[\"embeddings\"].float().cpu().numpy()\n",
+    "    torch.load(results_path_10m_random / \"predictions__rank_0__dp_rank_0.pt\")[\"embeddings\"].float().cpu().numpy()\n",
     ")\n",
     "assert len(adata) == len(infer_Xs_10m_random), (len(adata), len(infer_Xs_10m_random))"
    ]
diff --git a/sub-packages/bionemo-geneformer/examples/geneformer-gene-embedding-GRN.ipynb b/sub-packages/bionemo-geneformer/examples/geneformer-gene-embedding-GRN.ipynb
@@ -441,7 +441,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -456,7 +456,7 @@
     "import torch\n",
     "\n",
     "\n",
-    "predictions = torch.load(result_path_10m / \"predictions__rank_0.pt\", weights_only=False)\n",
+    "predictions = torch.load(result_path_10m / \"predictions__rank_0__dp_rank_0.pt\", weights_only=False)\n",
     "\n",
     "print(predictions.keys())"
    ]
diff --git a/sub-packages/bionemo-geneformer/examples/geneformer_cellxgene_tutorial.ipynb b/sub-packages/bionemo-geneformer/examples/geneformer_cellxgene_tutorial.ipynb
@@ -1345,7 +1345,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "id": "c64ebc1a",
    "metadata": {},
    "outputs": [
@@ -1363,7 +1363,7 @@
    ],
    "source": [
     "!ls -altrh {tutorial_output_dir}/\n",
-    "tutorial_output_inference_pickle = f\"{tutorial_output_dir}/predictions__rank_0.pt\"\n",
+    "tutorial_output_inference_pickle = f\"{tutorial_output_dir}/predictions__rank_0__dp_rank_0.pt\"\n",
     "!ls -altrh {tutorial_output_inference_pickle}"
    ]
   },
diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/celltype_classification_bench/bench.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/celltype_classification_bench/bench.py
@@ -154,7 +154,7 @@ def load_data_run_benchmark(result_path, adata_path, write_results=True):
 
     adata = read_h5ad(adata_path)
 
-    infer_Xs = torch.load(result_path / "predictions__rank_0.pt")["embeddings"].float().cpu().numpy()
+    infer_Xs = torch.load(result_path / "predictions__rank_0__dp_rank_0.pt")["embeddings"].float().cpu().numpy()
     assert len(adata) == len(infer_Xs), (len(adata), len(infer_Xs))
 
     infer_metadata = adata.obs

Original file line number	Diff line number	Diff line change
`@@ -357,12 +357,12 @@`
`357`	`357`	`"cell_type": "markdown",`
`358`	`358`	`"metadata": {},`
`359`	`359`	`"source": [`
`360`		- "Inference predictions are stored into `.pt` files for each device. Since we only used one device to run the inference (`--num-gpus 1`) in the previous step, the results were written to `{work_dir}/predictions__rank_0.pt` under the work directory of this notebook (defined above). The `.pt` file containes a dictionary of `{'result_key': torch.Tensor}` that be loaded with PyTorch:"
	`360`	+ "Inference predictions are stored in `.pt` files for each device. Since we only used one device to run the inference (`--num-gpus 1`) in the previous step, the results were written to `{work_dir}/predictions__rank_0__dp_rank_0.pt` under the work directory of this notebook (defined above). The `.pt` file contains a dictionary of `{'result_key': torch.Tensor}` that can be loaded with PyTorch:"
`361`	`361`	`]`
`362`	`362`	`},`
`363`	`363`	`{`
`364`	`364`	`"cell_type": "code",`
`365`		`- "execution_count": 54,`
	`365`	`+ "execution_count": null,`
`366`	`366`	`"metadata": {},`
`367`	`367`	`"outputs": [`
`368`	`368`	`{`
`@@ -377,7 +377,7 @@`
`377`	`377`	`}`
`378`	`378`	`],`
`379`	`379`	`"source": [`
`380`		`- "results = torch.load(f\"{work_dir}/predictions__rank_0.pt\")\n",`
	`380`	`+ "results = torch.load(f\"{work_dir}/predictions__rank_0__dp_rank_0.pt\")\n",`
`381`	`381`	`"\n",`
`382`	`382`	`"for key, val in results.items():\n",`
`383`	`383`	`" if val is not None:\n",`
`@@ -609,7 +609,7 @@`
`609`	`609`	`},`
`610`	`610`	`{`
`611`	`611`	`"cell_type": "code",`
`612`		`- "execution_count": 64,`
	`612`	`+ "execution_count": null,`
`613`	`613`	`"metadata": {},`
`614`	`614`	`"outputs": [`
`615`	`615`	`{`
`@@ -625,7 +625,7 @@`
`625`	`625`	`}`
`626`	`626`	`],`
`627`	`627`	`"source": [`
`628`		`- "results = torch.load(f\"{work_dir}/predictions__rank_0.pt\")\n",`
	`628`	`+ "results = torch.load(f\"{work_dir}/predictions__rank_0__dp_rank_0.pt\")\n",`
`629`	`629`	`"\n",`
`630`	`630`	`"for key, val in results.items():\n",`
`631`	`631`	`" if val is not None:\n",`
Original file line number	Diff line number	Diff line change
`@@ -115,20 +115,20 @@ def test_different_results_with_peft(`
`115`	`115`	`)`
`116`	`116`
`117`	`117`	`if prediction_interval == "epoch":`
`118`		`- results_original = torch.load(f"{result_dir_original}/predictions__rank_0.pt")`
`119`		`- results_peft = torch.load(f"{result_dir_peft}/predictions__rank_0.pt")`
	`118`	`+ results_original = torch.load(f"{result_dir_original}/predictions__rank_0__dp_rank_0.pt")`
	`119`	`+ results_peft = torch.load(f"{result_dir_peft}/predictions__rank_0__dp_rank_0.pt")`
`120`	`120`
`121`	`121`	`elif prediction_interval == "batch":`
`122`	`122`	`results_original = batch_collator(`
`123`	`123`	`[`
`124`	`124`	`torch.load(f, map_location="cpu")`
`125`		`- for f in glob.glob(f"{result_dir_original}/predictions__rank_0__batch_*.pt")`
	`125`	`+ for f in glob.glob(f"{result_dir_original}/predictions__rank_0__dp_rank_0__batch_*.pt")`
`126`	`126`	`]`
`127`	`127`	`)`
`128`	`128`	`results_peft = batch_collator(`
`129`	`129`	`[`
`130`	`130`	`torch.load(f, map_location="cpu")`
`131`		`- for f in glob.glob(f"{result_dir_peft}/predictions__rank_0__batch_*.pt")`
	`131`	`+ for f in glob.glob(f"{result_dir_peft}/predictions__rank_0__dp_rank_0__batch_*.pt")`
`132`	`132`	`]`
`133`	`133`	`)`
`134`	`134`	`assert (results_original["embeddings"] != results_peft["embeddings"]).any()`
Original file line number	Diff line number	Diff line change
`@@ -441,7 +441,7 @@`
`441`	`441`	`},`
`442`	`442`	`{`
`443`	`443`	`"cell_type": "code",`
`444`		`- "execution_count": 29,`
	`444`	`+ "execution_count": null,`
`445`	`445`	`"metadata": {},`
`446`	`446`	`"outputs": [`
`447`	`447`	`{`
`@@ -456,7 +456,7 @@`
`456`	`456`	`"import torch\n",`
`457`	`457`	`"\n",`
`458`	`458`	`"\n",`
`459`		`- "predictions = torch.load(result_path_10m / \"predictions__rank_0.pt\", weights_only=False)\n",`
	`459`	`+ "predictions = torch.load(result_path_10m / \"predictions__rank_0__dp_rank_0.pt\", weights_only=False)\n",`
`460`	`460`	`"\n",`
`461`	`461`	`"print(predictions.keys())"`
`462`	`462`	`]`