Added support for different precision modes

balvisio · balvisio · commit aab4267eea7b · 2026-05-15T19:22:37.000Z
diff --git a/bionemo-recipes/recipes/codonfm_native_te/hydra_config/L0_sanity.yaml b/bionemo-recipes/recipes/codonfm_native_te/hydra_config/L0_sanity.yaml
@@ -6,6 +6,8 @@ defaults:
 model_preset: encodon_200k
 num_train_steps: 250
 
+precision: fp32
+
 use_sequence_packing: false
 dataset:
   data_path: train.parquet
diff --git a/bionemo-recipes/recipes/codonfm_native_te/hydra_config/defaults.yaml b/bionemo-recipes/recipes/codonfm_native_te/hydra_config/defaults.yaml
@@ -81,4 +81,15 @@ quant_stats_config:
 # Note: The layers are going to come in 1 indexed and we convert them to be 0 indexed at runtime.
 fp8_layers: null
 fp4_layers: null
-use_fp32_master_weights: null
+
+# Precision mode (mandatory: `???` below means a config or CLI override must set it). One of:
+#   fp32       - params, compute, grads, and optimizer state all in fp32.
+#   bf16       - params, compute, grads, and optimizer state all in bf16 (pure bf16).
+#   bf16-mixed - fp32 master weights + bf16 compute (via autocast in DDP, via FSDP2
+#                MixedPrecisionPolicy.param_dtype=bf16 in FSDP2).
+precision: ???
+
+# Gradient reduce dtype for FSDP2 when precision=bf16-mixed. One of: fp32, bf16.
+# fp32 (default) is more conservative than PTL FSDP bf16-mixed (which reduces in bf16).
+# Ignored for other precision modes and for DDP.
+grad_reduce_type: fp32
diff --git a/bionemo-recipes/recipes/codonfm_native_te/hydra_config/encodon_1b.yaml b/bionemo-recipes/recipes/codonfm_native_te/hydra_config/encodon_1b.yaml
@@ -6,6 +6,8 @@ defaults:
 model_preset: encodon_1b
 num_train_steps: 500_000
 
+precision: bf16-mixed
+
 use_sequence_packing: true
 dataset:
   data_path: ???
diff --git a/bionemo-recipes/recipes/codonfm_native_te/hydra_config/encodon_5b.yaml b/bionemo-recipes/recipes/codonfm_native_te/hydra_config/encodon_5b.yaml
@@ -6,6 +6,8 @@ defaults:
 model_preset: encodon_5b
 num_train_steps: 500_000
 
+precision: bf16-mixed
+
 use_sequence_packing: true
 dataset:
   data_path: ???
diff --git a/bionemo-recipes/recipes/codonfm_native_te/run_1b.sh b/bionemo-recipes/recipes/codonfm_native_te/run_1b.sh
@@ -14,7 +14,10 @@ export NUM_TRAIN_STEPS=100
 export MICRO_BATCH_SIZE=31
 export NUM_WORKERS=1
 export USE_SEQUENCE_PACKING=True
-export USE_FP32_MASTER_WEIGHTS=True
+# Precision mode: one of fp32, bf16, bf16-mixed. bf16-mixed matches the reference codonfm `--bf16`.
+export PRECISION=bf16-mixed
+# Only used for FSDP2 + bf16-mixed. One of fp32, bf16.
+export GRAD_REDUCE_TYPE=fp32
 export NUM_WARMUP_STEPS=500
 
 # Logging / W&B
@@ -46,24 +49,19 @@ if [ "${FP8_ENABLED}" = "True" ]; then
   RECIPE_SHORT="${FP8_RECIPE##*.}"
   RECIPE_SHORT="${RECIPE_SHORT%BlockScaling}"
   RECIPE_SHORT="${RECIPE_SHORT%Scaling}"
-  PRECISION_TAG="${RECIPE_SHORT,,}_${FP8_FORMAT,,}"
+  PRECISION_TAG="${PRECISION}_${RECIPE_SHORT,,}_${FP8_FORMAT,,}"
 else
-  PRECISION_TAG="bf16"
+  PRECISION_TAG="${PRECISION}"
 fi
 export WANDB_RUN_NAME="${MODEL_SIZE}_${DIST_STRATEGY}_bs${MICRO_BATCH_SIZE}_${PRECISION_TAG}"
 
 # Pick training script based on distributed strategy.
-# DDP can't emulate FSDP's fp32-master / bf16-param split, so force fp32 master weights off.
 case "${DIST_STRATEGY}" in
   fsdp)
     TRAIN_SCRIPT=train_fsdp2.py
     ;;
   ddp)
     TRAIN_SCRIPT=train_ddp.py
-    if [ "${USE_FP32_MASTER_WEIGHTS}" = "True" ]; then
-      echo "DIST_STRATEGY=ddp: overriding USE_FP32_MASTER_WEIGHTS=True -> False" >&2
-      export USE_FP32_MASTER_WEIGHTS=False
-    fi
     ;;
   *)
     echo "DIST_STRATEGY must be 'fsdp' or 'ddp', got '${DIST_STRATEGY}'" >&2
@@ -80,7 +78,8 @@ torchrun --nproc_per_node=${NPROC_PER_NODE} ${TRAIN_SCRIPT} \
   dataset.num_workers=${NUM_WORKERS} \
   dataset.data_path=${DATASET_DATA_PATH} \
   use_sequence_packing=${USE_SEQUENCE_PACKING} \
-  use_fp32_master_weights=${USE_FP32_MASTER_WEIGHTS} \
+  precision=${PRECISION} \
+  grad_reduce_type=${GRAD_REDUCE_TYPE} \
   lr_scheduler_kwargs.num_warmup_steps=${NUM_WARMUP_STEPS} \
   wandb_init_args.name=${WANDB_RUN_NAME} \
   wandb_init_args.project=${WANDB_PROJECT} \
diff --git a/bionemo-recipes/recipes/codonfm_native_te/slurm/1b.sh b/bionemo-recipes/recipes/codonfm_native_te/slurm/1b.sh
@@ -0,0 +1,195 @@
+#!/bin/bash
+#SBATCH --account=
+#SBATCH --nodes=1
+#SBATCH --partition=
+#SBATCH --ntasks-per-node=1
+#SBATCH --time=03:55:00
+#SBATCH --mem=0
+#SBATCH --job-name=
+#SBATCH --mail-type=FAIL
+#SBATCH --overcommit
+#SBATCH --exclusive
+set -euo pipefail  # no -x here: xtrace would print WANDB_API_KEY / HUGGING_FACE_HUB_TOKEN into the job log
+
+# ============================================================================
+# Codon 1B
+# ============================================================================
+
+BASE_DIR=""
+CONTAINER=""
+DATA_DIR="${BASE_DIR}/data"
+CODE_MOUNT="/workspace/bionemo"
+
+
+: "${WANDB_API_KEY:?Set WANDB_API_KEY in ~/.bash_profile}"
+: "${HUGGING_FACE_HUB_TOKEN:?Set HUGGING_FACE_HUB_TOKEN in ~/.bash_profile}"
+: "${CLUSTER_NAME:?Set CLUSTER_NAME in ~/.bash_profile}"
+
+# Experiment parameters
+export CONFIG_NAME=encodon_1b
+export NPROC_PER_NODE=8
+export DIST_STRATEGY=ddp  # fsdp or ddp
+
+# Training
+export NUM_TRAIN_STEPS=1000
+export MICRO_BATCH_SIZE=31
+export LEARNING_RATE=7.5e-5
+export NUM_WORKERS=1
+export USE_SEQUENCE_PACKING=False
+# Precision mode: one of fp32, bf16, bf16-mixed. bf16-mixed matches the reference codonfm `--bf16`.
+export PRECISION=bf16-mixed
+# Only used for FSDP2 + bf16-mixed. One of fp32, bf16.
+export GRAD_REDUCE_TYPE=fp32
+export NUM_WARMUP_STEPS=50
+
+# Logging / W&B
+export LOGGER_FREQUENCY=10
+export WANDB_PROJECT=
+
+# Checkpointing
+export SAVE_FINAL_MODEL=True
+export SAVE_EVERY_N_STEPS=100000
+export RESUME_FROM_CHECKPOINT=True
+
+# Hydra
+export HYDRA_RUN_DIR=1b_test
+
+# Quantization / FP8
+export QUANT_STATS_ENABLED=False
+export FP8_ENABLED=False
+export FP8_RECIPE=transformer_engine.common.recipe.MXFP8BlockScaling
+export FP8_FORMAT=E4M3
+
+# Derived: build wandb run name from model size, strategy, batch layout, batch size, precision tag, node count, and cluster
+MODEL_SIZE="${CONFIG_NAME##*_}"
+if [ "${FP8_ENABLED}" = "True" ]; then
+  RECIPE_SHORT="${FP8_RECIPE##*.}"
+  RECIPE_SHORT="${RECIPE_SHORT%BlockScaling}"
+  RECIPE_SHORT="${RECIPE_SHORT%Scaling}"
+  PRECISION_TAG="${PRECISION}_${RECIPE_SHORT,,}_${FP8_FORMAT,,}"
+else
+  PRECISION_TAG="${PRECISION}"
+fi
+
+if [ "${USE_SEQUENCE_PACKING}" = "True" ]; then
+  BATCH_TYPE_TAG="thd"
+else
+  BATCH_TYPE_TAG="bshd"
+fi
+
+export WANDB_RUN_NAME="${MODEL_SIZE}_${DIST_STRATEGY}_${BATCH_TYPE_TAG}_bs${MICRO_BATCH_SIZE}_${PRECISION_TAG}_nodes_${SLURM_JOB_NUM_NODES}_${CLUSTER_NAME}"
+
+# Mounts
+RESULTS_DIR="${BASE_DIR}/results/${WANDB_RUN_NAME}"
+CKPT_DIR="${BASE_DIR}/checkpoints/${WANDB_RUN_NAME}"
+
+mkdir -p "${RESULTS_DIR}" "${CKPT_DIR}"
+
+MOUNTS="${DATA_DIR}:${CODE_MOUNT}/data,${RESULTS_DIR}:${CODE_MOUNT}/results,${CKPT_DIR}:${CODE_MOUNT}/checkpoints"
+
+
+read -r -d '' COMMAND <<'OUTER_EOF' || true
+set -euxo pipefail
+
+echo "========================================="
+echo "CodonFM ${CONFIG_NAME} - STRATEGY: ${DIST_STRATEGY} - PRECISION: ${PRECISION_TAG} - CLUSTER: ${CLUSTER_NAME}"
+echo "Job ID: ${SLURM_JOB_ID}"
+echo "Nodes: ${SLURM_JOB_NUM_NODES}"
+echo "========================================="
+
+# Pick training script based on distributed strategy.
+case "${DIST_STRATEGY}" in
+  fsdp)
+    TRAIN_SCRIPT=train_fsdp2.py
+    ;;
+  ddp)
+    TRAIN_SCRIPT=train_ddp.py
+    ;;
+  *)
+    echo "DIST_STRATEGY must be 'fsdp' or 'ddp', got '${DIST_STRATEGY}'" >&2
+    exit 1
+    ;;
+esac
+
+torchrun --nproc_per_node=${NPROC_PER_NODE} ${TRAIN_SCRIPT} \
+  --config-name ${CONFIG_NAME} \
+  quant_stats_config.enabled=${QUANT_STATS_ENABLED} \
+  logger.frequency=${LOGGER_FREQUENCY} \
+  num_train_steps=${NUM_TRAIN_STEPS} \
+  dataset.micro_batch_size=${MICRO_BATCH_SIZE} \
+  adamw_kwargs.lr=${LEARNING_RATE} \
+  dataset.num_workers=${NUM_WORKERS} \
+  dataset.data_path=/workspace/bionemo/data/processed_unfiltered/ \
+  use_sequence_packing=${USE_SEQUENCE_PACKING} \
+  precision=${PRECISION} \
+  grad_reduce_type=${GRAD_REDUCE_TYPE} \
+  lr_scheduler_kwargs.num_warmup_steps=${NUM_WARMUP_STEPS} \
+  wandb_init_args.name=${WANDB_RUN_NAME} \
+  ++wandb_init_args.id=${WANDB_RUN_NAME} \
+  ++wandb_init_args.project=${WANDB_PROJECT} \
+  checkpoint.save_final_model=${SAVE_FINAL_MODEL} \
+  checkpoint.save_every_n_steps=${SAVE_EVERY_N_STEPS} \
+  checkpoint.ckpt_dir=/workspace/bionemo/checkpoints \
+  checkpoint.resume_from_checkpoint=${RESUME_FROM_CHECKPOINT} \
+  hydra.run.dir=${HYDRA_RUN_DIR} \
+  fp8_config.enabled=${FP8_ENABLED} \
+  fp8_config.fp8_recipe=${FP8_RECIPE} \
+  fp8_config.fp8_format=${FP8_FORMAT} \
+  ++dataset.pad_to_multiple_of=32
+
+echo "========================================="
+echo "Training complete!"
+echo "========================================="
+OUTER_EOF
+
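+# Inject environment variables into the command: the heredoc above is quoted, so its ${VAR} references are resolved inside the container shell from these exports.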
+COMMAND="export DIST_STRATEGY=\"${DIST_STRATEGY}\"; ${COMMAND}"
+COMMAND="export PRECISION_TAG=\"${PRECISION_TAG}\"; ${COMMAND}"
+COMMAND="export CLUSTER_NAME=\"${CLUSTER_NAME}\"; ${COMMAND}"
+COMMAND="export NPROC_PER_NODE=\"${NPROC_PER_NODE}\"; ${COMMAND}"
+COMMAND="export CONFIG_NAME=\"${CONFIG_NAME}\"; ${COMMAND}"
+COMMAND="export QUANT_STATS_ENABLED=\"${QUANT_STATS_ENABLED}\"; ${COMMAND}"
+COMMAND="export LOGGER_FREQUENCY=\"${LOGGER_FREQUENCY}\"; ${COMMAND}"
+COMMAND="export NUM_TRAIN_STEPS=\"${NUM_TRAIN_STEPS}\"; ${COMMAND}"
+COMMAND="export MICRO_BATCH_SIZE=\"${MICRO_BATCH_SIZE}\"; ${COMMAND}"
+COMMAND="export LEARNING_RATE=\"${LEARNING_RATE}\"; ${COMMAND}"
+COMMAND="export NUM_WORKERS=\"${NUM_WORKERS}\"; ${COMMAND}"
+COMMAND="export USE_SEQUENCE_PACKING=\"${USE_SEQUENCE_PACKING}\"; ${COMMAND}"
+COMMAND="export PRECISION=\"${PRECISION}\"; ${COMMAND}"
+COMMAND="export GRAD_REDUCE_TYPE=\"${GRAD_REDUCE_TYPE}\"; ${COMMAND}"
+COMMAND="export NUM_WARMUP_STEPS=\"${NUM_WARMUP_STEPS}\"; ${COMMAND}"
+COMMAND="export WANDB_RUN_NAME=\"${WANDB_RUN_NAME}\"; ${COMMAND}"
+COMMAND="export WANDB_PROJECT=\"${WANDB_PROJECT}\"; ${COMMAND}"
+COMMAND="export SAVE_FINAL_MODEL=\"${SAVE_FINAL_MODEL}\"; ${COMMAND}"
+COMMAND="export SAVE_EVERY_N_STEPS=\"${SAVE_EVERY_N_STEPS}\"; ${COMMAND}"
+COMMAND="export RESUME_FROM_CHECKPOINT=\"${RESUME_FROM_CHECKPOINT}\"; ${COMMAND}"
+COMMAND="export HYDRA_RUN_DIR=\"${HYDRA_RUN_DIR}\"; ${COMMAND}"
+COMMAND="export FP8_ENABLED=\"${FP8_ENABLED}\"; ${COMMAND}"
+COMMAND="export FP8_RECIPE=\"${FP8_RECIPE}\"; ${COMMAND}"
+COMMAND="export FP8_FORMAT=\"${FP8_FORMAT}\"; ${COMMAND}"
+
+COMMAND="export WANDB_API_KEY=\"${WANDB_API_KEY}\"; ${COMMAND}"
+COMMAND="export HUGGING_FACE_HUB_TOKEN=\"${HUGGING_FACE_HUB_TOKEN}\"; ${COMMAND}"
+COMMAND="export HF_TOKEN=\"${HUGGING_FACE_HUB_TOKEN}\"; ${COMMAND}"
+
+echo "Launching: ${WANDB_RUN_NAME}"
+
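+# AUTO-CHAIN: resubmit when killed by SIGTERM (rc=143) or SIGKILL (rc=137), presumed to be a SLURM timeout. NOTE(review): an OOM kill also exits 137 and would be resubmitted too.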
+trap '
+    rc=$?
+    if [ "$rc" -eq 143 ] || [ "$rc" -eq 137 ]; then
+      echo "Killed by signal (rc=$rc) — assuming SLURM timeout, resubmitting..."
+      sbatch --dependency=singleton "${BASH_SOURCE[0]}"
+    elif [ "$rc" -eq 0 ]; then
+      echo "Clean exit — training finished, NOT resubmitting."
+    else
+      echo "Error exit (rc=$rc) — NOT resubmitting; investigate ${RESULTS_DIR}"
+    fi
+  ' EXIT
+
+srun \
+  --output "${RESULTS_DIR}/slurm-%j-%n.out" \
+  --error  "${RESULTS_DIR}/error-%j-%n.out" \
+  --container-image "${CONTAINER}" \
+  --container-mounts "${MOUNTS}" \
+  bash -c "${COMMAND}"
diff --git a/bionemo-recipes/recipes/codonfm_native_te/tests/test_train.py b/bionemo-recipes/recipes/codonfm_native_te/tests/test_train.py
@@ -130,15 +130,31 @@ def test_sanity_convergence_fsdp2_fp8(tmp_path, recipe_path):
     assert final_loss < 5.0, f"Final loss {final_loss} is too high"
 
 
-def test_sanity_convergence_fsdp2_fp32_master_weights(tmp_path, recipe_path):
-    """Test CodonFM with FP32 master weights."""
+def test_sanity_convergence_fsdp2_bf16_mixed(tmp_path, recipe_path):
+    """Test CodonFM with bf16-mixed precision (fp32 master weights + bf16 compute)."""
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
             config_name="L0_sanity",
             overrides=[
                 f"+wandb_init_args.dir={tmp_path}",
                 f"checkpoint.ckpt_dir={tmp_path}",
-                "use_fp32_master_weights=true",
+                "precision=bf16-mixed",
+            ],
+        )
+
+    final_loss = main_fsdp2(sanity_config)
+    assert final_loss < 5.0, f"Final loss {final_loss} is too high"
+
+
+def test_sanity_convergence_fsdp2_bf16(tmp_path, recipe_path):
+    """Test CodonFM with pure bf16 precision."""
+    with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
+        sanity_config = compose(
+            config_name="L0_sanity",
+            overrides=[
+                f"+wandb_init_args.dir={tmp_path}",
+                f"checkpoint.ckpt_dir={tmp_path}",
+                "precision=bf16",
             ],
         )
 
diff --git a/bionemo-recipes/recipes/codonfm_native_te/train_ddp.py b/bionemo-recipes/recipes/codonfm_native_te/train_ddp.py
diff --git a/bionemo-recipes/recipes/codonfm_native_te/train_fsdp2.py b/bionemo-recipes/recipes/codonfm_native_te/train_fsdp2.py