Address PR feedback

jstjohn · jstjohn · commit ea643253fbfc · 2025-09-10T22:31:15.000Z
Signed-off-by: John St John <jstjohn@nvidia.com>
diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/predict.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/predict.py
@@ -224,13 +224,10 @@ def predict_step(self, batch, batch_idx: int | None = None) -> Tensor | dict[str
             softmax_logprobs = torch.log_softmax(forward_out_gathered, dim=-1)
             softmax_logprobs = softmax_logprobs[:, :-1]
             input_ids = tokens_gathered[:, 1:]
-            try:
-                assert softmax_logprobs.shape[1] == input_ids.shape[1]
-            except Exception as e:
-                if torch.distributed.get_rank() == 0:
-                    breakpoint()
-                torch.distributed.barrier()
-                raise e
+            if softmax_logprobs.shape[1] != input_ids.shape[1]:
+                raise RuntimeError(
+                    f"Softmax logprobs shape {softmax_logprobs.shape} does not match input ids shape {input_ids.shape}"
+                )
 
             logprobs = torch.gather(
                 softmax_logprobs,  # Gather likelihoods...
@@ -404,6 +401,11 @@ def predict(
     """
     if work_dir is None:
         work_dir = Path(tempfile.mkdtemp())
+    if files_per_subdir is None and write_interval == "batch":
+        logger.warning(
+            "--files-per-subdir is not set with --write-interval batch, will write all predictions to a "
+            "single directory. This may cause problems if you are predicting on a very large dataset."
+        )
     sequence_parallel = tensor_parallel_size > 1 and not no_sequence_parallel
     output_dir.mkdir(parents=True, exist_ok=True)  # Make sure the output directory exists, files will be written here.
     model_parallel_size = tensor_parallel_size * pipeline_model_parallel_size * context_parallel_size
diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/utils/callbacks.py b/sub-packages/bionemo-llm/src/bionemo/llm/utils/callbacks.py
@@ -91,24 +91,35 @@ def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, *args, **kwa
                 "in the model's predictions as outputs are not ordered and batch indices do not track input order."
             )
 
+    @staticmethod
+    def _assert_initialized():
+        """Asserts that the environment is initialized."""
+        if not (
+            torch.distributed.is_available() and torch.distributed.is_initialized() and parallel_state.is_initialized()
+        ):
+            raise RuntimeError("This function is only defined within an initialized megatron parallel environment.")
+
     @property
     def data_parallel_world_size(self) -> int:
         """Returns the data parallel world size."""
+        self._assert_initialized()
         return torch.distributed.get_world_size(parallel_state.get_data_parallel_group(with_context_parallel=False))
 
     @property
     def data_parallel_rank(self) -> int:
         """Returns the data parallel rank."""
+        self._assert_initialized()
         return torch.distributed.get_rank(parallel_state.get_data_parallel_group(with_context_parallel=False))
 
     @property
     def should_write_predictions(self) -> bool:
-        """Returns the context parallel rank."""
+        """Returns whether this rank should write its predictions."""
         # TODO: handle expert parallelism and other kinds of parallelism
+        self._assert_initialized()
+        if not parallel_state.is_pipeline_last_stage():
+            return False
         return self.save_all_model_parallel_ranks or (
-            parallel_state.is_pipeline_last_stage()
-            and parallel_state.get_tensor_model_parallel_rank() == 0
-            and parallel_state.get_context_parallel_rank() == 0
+            parallel_state.get_tensor_model_parallel_rank() == 0 and parallel_state.get_context_parallel_rank() == 0
         )
 
     @override