Add train ddp to recipe

balvisio · balvisio · commit 1db0a0cfb6c0 · 2026-05-08T17:49:36.000Z
diff --git a/bionemo-recipes/recipes/codonfm_native_te/Dockerfile b/bionemo-recipes/recipes/codonfm_native_te/Dockerfile
@@ -1,9 +1,18 @@
 # syntax=docker/dockerfile:1.4
-FROM nvcr.io/nvidia/pytorch:26.04-py3
+FROM nvcr.io/nvidia/pytorch:26.02-py3
+
+# Extra OS packages; npm is needed for the Codex CLI install below.
+# Drop the apt package lists afterwards so they do not bloat the image layer.
+RUN apt-get update && apt-get install -y tmux npm && rm -rf /var/lib/apt/lists/*
 
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=requirements.txt,target=/requirements.txt \
     PIP_CONSTRAINT= pip install -r /requirements.txt
 
+# Install Claude CLI tool. Run the pipeline under bash with pipefail so a failed
+# download fails the build instead of piping an empty script into bash.
+RUN bash -o pipefail -c 'curl -fsSL https://claude.ai/install.sh | bash'
+RUN npm install -g @openai/codex
+
 WORKDIR /workspace/bionemo
 COPY . .
diff --git a/bionemo-recipes/recipes/codonfm_native_te/checkpoint.py b/bionemo-recipes/recipes/codonfm_native_te/checkpoint.py
@@ -221,3 +221,134 @@ def save_final_model_fsdp2(
     save_file(model_state_dict, os.path.join(save_directory, "model.safetensors"))
     config.to_json_file(os.path.join(save_directory, "config.json"))
     logger.info(f"Saved final FSDP2 model to {save_directory}")
+
+
+# ============================================================================
+# DDP Checkpointing
+# ============================================================================
+
+
+def load_checkpoint_ddp(
+    model: torch.nn.Module,
+    optimizer: torch.optim.Optimizer,
+    scheduler: torch.optim.lr_scheduler.LRScheduler,
+    ckpt_path: str | os.PathLike,
+    dist_config: DistributedConfig,
+) -> CheckpointOutput:
+    """Load the latest DDP checkpoint found under ``ckpt_path``.
+
+    Args:
+        model: Module whose weights are restored in place.
+        optimizer: Optimizer whose state is restored in place.
+        scheduler: LR scheduler whose state is restored in place.
+        ckpt_path: Directory passed to ``get_latest_checkpoint`` to locate the checkpoint.
+        dist_config: Supplies ``local_rank`` (target CUDA device) and the main-process check.
+
+    Returns:
+        ``CheckpointOutput(model, optimizer, scheduler, step, epoch)``. When no checkpoint is
+        found, step and epoch are both 0; otherwise step is the saved step plus one.
+    """
+    checkpoint_path, _ = get_latest_checkpoint(ckpt_path)
+    if not checkpoint_path:
+        logger.info("No DDP checkpoint found, starting from scratch")
+        return CheckpointOutput(model, optimizer, scheduler, 0, 0)
+
+    checkpoint = torch.load(
+        checkpoint_path / "checkpoint.pt",
+        map_location=f"cuda:{dist_config.local_rank}",
+        weights_only=True,
+    )
+
+    # strict=False tolerates key mismatches, so report them instead of dropping them silently:
+    # a wholesale mismatch would otherwise leave the model at its initial weights unnoticed.
+    incompatible = model.load_state_dict(checkpoint["model"], strict=False)
+    if incompatible.missing_keys or incompatible.unexpected_keys:
+        logger.warning(
+            f"DDP checkpoint key mismatch: missing={incompatible.missing_keys}, "
+            f"unexpected={incompatible.unexpected_keys}"
+        )
+    optimizer.load_state_dict(checkpoint["optimizer"])
+    scheduler.load_state_dict(checkpoint["scheduler"])
+
+    if dist_config.is_main_process():
+        logger.info(f"Loaded DDP checkpoint from step {checkpoint['step']}")
+
+    # Increment the step by one to avoid re-running the previous step.
+    return CheckpointOutput(model, optimizer, scheduler, checkpoint["step"] + 1, checkpoint["epoch"])
+
+
+def save_checkpoint_ddp(
+    model: torch.nn.Module,
+    optimizer: torch.optim.Optimizer,
+    scheduler: torch.optim.lr_scheduler.LRScheduler,
+    ckpt_path: str | os.PathLike,
+    step: int,
+    epoch: int,
+    dist_config: DistributedConfig,
+    max_checkpoints: int | None = None,
+) -> None:
+    """Save DDP checkpoint (rank-0 only since the model is replicated).
+
+    Writes model, optimizer and scheduler state plus ``step`` and ``epoch`` to
+    ``<ckpt_path>/step_<step>/checkpoint.pt``. Non-main processes return immediately.
+
+    Args:
+        model: Module whose ``state_dict()`` is saved.
+        optimizer: Optimizer whose ``state_dict()`` is saved.
+        scheduler: LR scheduler whose ``state_dict()`` is saved.
+        ckpt_path: Root directory that holds the per-step checkpoint folders.
+        step: Training step recorded in the checkpoint and used in the folder name.
+        epoch: Epoch recorded in the checkpoint.
+        dist_config: Used to decide whether this process is the main process.
+        max_checkpoints: If not None, ``prune_checkpoints`` is called with this limit after saving.
+    """
+    if not dist_config.is_main_process():
+        return
+
+    ckpt_path = Path(ckpt_path)
+    checkpoint_path = ckpt_path / f"step_{step}"
+    checkpoint_path.mkdir(parents=True, exist_ok=True)
+
+    torch.save(
+        {
+            "model": model.state_dict(),
+            "optimizer": optimizer.state_dict(),
+            "scheduler": scheduler.state_dict(),
+            "step": step,
+            "epoch": epoch,
+        },
+        checkpoint_path / "checkpoint.pt",
+    )
+    logger.info(f"Saved DDP checkpoint to {checkpoint_path}")
+
+    if max_checkpoints is not None:
+        prune_checkpoints(ckpt_path, max_checkpoints)
+
+
+def save_final_model_ddp(
+    model: torch.nn.Module,
+    config,
+    save_directory: str | os.PathLike,
+    dist_config: DistributedConfig,
+) -> None:
+    """Save final model for DDP - only on main process.
+
+    Writes the unwrapped model's weights to ``model.safetensors`` and the config to
+    ``config.json`` inside ``save_directory``. Non-main processes return immediately.
+
+    Args:
+        model: Module to export; if it has a ``module`` attribute (DDP wrapper), that is saved.
+        config: Object providing ``to_json_file(path)``.
+        save_directory: Output directory, created if missing.
+        dist_config: Used to decide whether this process is the main process.
+    """
+    if not dist_config.is_main_process():
+        return
+
+    # Unwrap DDP if wrapped.
+    underlying_model = model.module if hasattr(model, "module") else model
+
+    os.makedirs(save_directory, exist_ok=True)
+    save_file(underlying_model.state_dict(), os.path.join(save_directory, "model.safetensors"))
+    config.to_json_file(os.path.join(save_directory, "config.json"))
+    logger.info(f"Saved final DDP model to {save_directory}")
diff --git a/bionemo-recipes/recipes/codonfm_native_te/run_1b.sh b/bionemo-recipes/recipes/codonfm_native_te/run_1b.sh
@@ -0,0 +1,100 @@
+#!/usr/bin/env bash
+# Launch an encodon_1b training run with torchrun, using either the FSDP2 or the DDP script.
+# The settings exported below are passed to the training script as Hydra overrides.
+set -euo pipefail
+
+export CPATH=/usr/local/cuda/include
+export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
+
+# Run config
+export CONFIG_NAME=encodon_1b
+export NPROC_PER_NODE=8
+export DIST_STRATEGY=ddp  # fsdp or ddp
+
+# Training
+export NUM_TRAIN_STEPS=100
+export MICRO_BATCH_SIZE=31
+export NUM_WORKERS=1
+export USE_SEQUENCE_PACKING=True
+export USE_FP32_MASTER_WEIGHTS=True
+# NOTE(review): warmup (500) exceeds NUM_TRAIN_STEPS (100) - confirm this is intended.
+export NUM_WARMUP_STEPS=500
+
+# Logging / W&B
+export LOGGER_FREQUENCY=10
+# Keep a key supplied by the caller's environment instead of overwriting it with an empty string.
+export WANDB_API_KEY="${WANDB_API_KEY:-}"
+export WANDB_PROJECT=codon-fm-low-precision
+
+# Checkpointing
+export SAVE_FINAL_MODEL=False
+export SAVE_EVERY_N_STEPS=100000
+export CKPT_DIR=/tmp
+export RESUME_FROM_CHECKPOINT=False
+
+# Hydra
+export HYDRA_RUN_DIR=1b_test
+
+# Quantization / FP8
+export QUANT_STATS_ENABLED=False
+export FP8_ENABLED=True
+export FP8_RECIPE=transformer_engine.common.recipe.MXFP8BlockScaling
+export FP8_FORMAT=E4M3
+
+# Data
+export DATASET_DATA_PATH=/data/balvisio/codonfm/reference-dataset/codonfm/processed_unfiltered/
+
+# Derived: build wandb run name from model size, batch size, and precision recipe
+# With the values above this yields "1b_ddp_bs31_mxfp8_e4m3".
+MODEL_SIZE="${CONFIG_NAME##*_}"
+if [ "${FP8_ENABLED}" = "True" ]; then
+  RECIPE_SHORT="${FP8_RECIPE##*.}"
+  RECIPE_SHORT="${RECIPE_SHORT%BlockScaling}"
+  RECIPE_SHORT="${RECIPE_SHORT%Scaling}"
+  PRECISION_TAG="${RECIPE_SHORT,,}_${FP8_FORMAT,,}"
+else
+  PRECISION_TAG="bf16"
+fi
+export WANDB_RUN_NAME="${MODEL_SIZE}_${DIST_STRATEGY}_bs${MICRO_BATCH_SIZE}_${PRECISION_TAG}"
+
+# Pick training script based on distributed strategy.
+# DDP can't emulate FSDP's fp32-master / bf16-param split, so force fp32 master weights off.
+case "${DIST_STRATEGY}" in
+  fsdp)
+    TRAIN_SCRIPT=train_fsdp2.py
+    ;;
+  ddp)
+    TRAIN_SCRIPT=train_ddp.py
+    if [ "${USE_FP32_MASTER_WEIGHTS}" = "True" ]; then
+      echo "DIST_STRATEGY=ddp: overriding USE_FP32_MASTER_WEIGHTS=True -> False" >&2
+      export USE_FP32_MASTER_WEIGHTS=False
+    fi
+    ;;
+  *)
+    echo "DIST_STRATEGY must be 'fsdp' or 'ddp', got '${DIST_STRATEGY}'" >&2
+    exit 1
+    ;;
+esac
+
+# Quote every expansion so a value containing spaces or glob characters stays a single argument.
+torchrun --nproc_per_node="${NPROC_PER_NODE}" "${TRAIN_SCRIPT}" \
+  --config-name "${CONFIG_NAME}" \
+  quant_stats_config.enabled="${QUANT_STATS_ENABLED}" \
+  logger.frequency="${LOGGER_FREQUENCY}" \
+  num_train_steps="${NUM_TRAIN_STEPS}" \
+  dataset.micro_batch_size="${MICRO_BATCH_SIZE}" \
+  dataset.num_workers="${NUM_WORKERS}" \
+  dataset.data_path="${DATASET_DATA_PATH}" \
+  use_sequence_packing="${USE_SEQUENCE_PACKING}" \
+  use_fp32_master_weights="${USE_FP32_MASTER_WEIGHTS}" \
+  lr_scheduler_kwargs.num_warmup_steps="${NUM_WARMUP_STEPS}" \
+  wandb_init_args.name="${WANDB_RUN_NAME}" \
+  +wandb_init_args.project="${WANDB_PROJECT}" \
+  checkpoint.save_final_model="${SAVE_FINAL_MODEL}" \
+  checkpoint.save_every_n_steps="${SAVE_EVERY_N_STEPS}" \
+  checkpoint.ckpt_dir="${CKPT_DIR}" \
+  checkpoint.resume_from_checkpoint="${RESUME_FROM_CHECKPOINT}" \
+  hydra.run.dir="${HYDRA_RUN_DIR}" \
+  fp8_config.enabled="${FP8_ENABLED}" \
+  fp8_config.fp8_recipe="${FP8_RECIPE}" \
+  fp8_config.fp8_format="${FP8_FORMAT}"
diff --git a/bionemo-recipes/recipes/codonfm_native_te/train_ddp.py b/bionemo-recipes/recipes/codonfm_native_te/train_ddp.py