Skip to content

Commit e66c666

Browse files
authored
Jwilber/update conv schedule (#1559)
Update schedules to run less frequently, and update esm2 jobs: - `.github/workflows/scdl-performance-tests.yml` — scdl now runs weekly on Fridays at 2pm PST (`0 22 * * 5`) instead of daily. - `.github/workflows/convergence-tests.yml` — a single cron `0 8 * * 0` (Sun 00:00 PST = Sat night midnight), a new gate-biweekly job that skips odd ISO weeks, and a matrix that runs all three configs (esm2_650m, esm2_15b, codonfm_ptl_te) on every scheduled run that passes the gate. - `ci/lepton/.../esm2_native_te_15b.yaml` — num_train_steps: 20_000 → 500. - `ci/lepton/.../esm2_native_te_650m.yaml` — added top-level fp8_recipe/fp8_format, two new FP8 products (bshd+fp8, thd+fp8) mirroring the 15b config, and extended run_script to pass both fp8 params. --------- Signed-off-by: jwilber <jwilber@nvidia.com>
1 parent afd48cd commit e66c666

4 files changed

Lines changed: 49 additions & 7 deletions

File tree

.github/workflows/convergence-tests.yml

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,32 @@ on:
3232
required: false
3333
type: string
3434
schedule:
35-
- cron: "0 8 * * 1,3" # Mon/Wed at 1am PST (esm2)
36-
- cron: "0 8 * * 2,4" # Tue/Thu at 1am PST (codonfm)
35+
- cron: "0 8 * * 0" # Sun 00:00 PST (Sat night midnight), bi-weekly gated in job
3736

3837
jobs:
38+
# GitHub cron has no native bi-weekly schedule, so gate on an even ISO week number.
39+
gate-biweekly:
40+
runs-on: ubuntu-latest
41+
outputs:
42+
proceed: ${{ steps.check.outputs.proceed }}
43+
steps:
44+
- id: check
45+
run: |
46+
if [ "${{ github.event_name }}" != "schedule" ]; then
47+
echo "proceed=true" >> "$GITHUB_OUTPUT"
48+
elif [ $((10#$(date -u +%V) % 2)) -eq 0 ]; then
49+
echo "proceed=true" >> "$GITHUB_OUTPUT"
50+
else
51+
echo "proceed=false" >> "$GITHUB_OUTPUT"
52+
fi
53+
3954
submit-lepton-jobs:
55+
needs: gate-biweekly
56+
if: needs.gate-biweekly.outputs.proceed == 'true'
4057
runs-on: ubuntu-latest
4158
strategy:
4259
matrix:
43-
# Mon/Wed runs esm2, Tue/Thu runs codonfm
44-
model_config: ${{ github.event_name == 'schedule' && github.event.schedule == '0 8 * * 2,4' && fromJSON('["codonfm_ptl_te"]') || github.event_name == 'schedule' && github.event.schedule == '0 8 * * 1,3' && fromJSON('["esm2_native_te_650m", "esm2_native_te_15b"]') || fromJSON(format('["{0}"]', github.event.inputs.model_config)) }}
60+
model_config: ${{ github.event_name == 'schedule' && fromJSON('["esm2_native_te_650m", "esm2_native_te_15b", "codonfm_ptl_te"]') || fromJSON(format('["{0}"]', github.event.inputs.model_config)) }}
4561
fail-fast: false
4662
steps:
4763
- name: Checkout

.github/workflows/scdl-performance-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ on:
1313
required: false
1414
type: string
1515
schedule:
16-
- cron: "0 9 * * *" # everyday at 1am PST
16+
- cron: "0 22 * * 5" # Fridays at 2pm PST (weekly)
1717

1818
jobs:
1919
submit-lepton-jobs:

ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ use_torch_compile: false
6161
# these should match the keys in the recipe's config file
6262
model_tag: nvidia/esm2_t48_15B_UR50D
6363
# task_cmd: train_fsdp2 # mfsdp
64-
num_train_steps: 20_000
64+
num_train_steps: 500
6565
# dataset commands
6666
micro_batch_size: 8
6767
load_dataset_kwargs_path: nvidia/esm2_uniref_pretraining_data

ci/lepton/model_convergence/configs/recipes/esm2_native_te_650m.yaml

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ framework: native # native, accelerate
3232
precision: bf16 # likely bf16 or fp8
3333
te_enabled: true
3434
fp8_enabled: false
35+
fp8_recipe: ""
36+
fp8_format: ""
3537
# thd_enabled: false
3638

3739
# Catchall for additional features/configs
@@ -90,6 +92,28 @@ products:
9092
micro_batch_size: 48
9193
wandb_name: "esm2_native_650m__fsdp2__thd__${now:%Y%m%d-%H%M%S}__${gitsha:}"
9294
job_name: "esm2-native-650m-fsdp2-thd"
95+
# TE bshd perf, FSDP2, FP8
96+
- config: L1_650M
97+
task_cmd: train_fsdp2
98+
parallelism_strategy: fsdp2
99+
thd_enabled: false
100+
fp8_enabled: true
101+
fp8_recipe: transformer_engine.common.recipe.Float8BlockScaling
102+
fp8_format: E4M3
103+
micro_batch_size: 48
104+
wandb_name: "esm2_native_650m__fsdp2__fp8__${now:%Y%m%d-%H%M%S}__${gitsha:}"
105+
job_name: "esm2-native-650m-fsdp2-fp8"
106+
# TE thd perf, FSDP2, FP8
107+
- config: L1_650M
108+
task_cmd: train_fsdp2
109+
parallelism_strategy: fsdp2
110+
thd_enabled: true
111+
fp8_enabled: true
112+
fp8_recipe: transformer_engine.common.recipe.Float8BlockScaling
113+
fp8_format: E4M3
114+
micro_batch_size: 48
115+
wandb_name: "esm2_native_650m__fsdp2__thd__fp8__${now:%Y%m%d-%H%M%S}__${gitsha:}"
116+
job_name: "esm2-native-650m-fsdp2-thd-fp8"
93117
# OSS Convergence Baseline
94118
# - config: L1_650M
95119
# model_tag: facebook/esm2_t33_650M_UR50D
@@ -137,4 +161,6 @@ run_script: |
137161
checkpoint.resume_from_checkpoint=${resume_from_checkpoint} \
138162
+checkpoint.save_checkpoints=${save_checkpoints} \
139163
+checkpoint.use_distributed_checkpoint_fsdp2=${use_distributed_checkpoint_fsdp2} \
140-
fp8_config.enabled=${fp8_enabled}
164+
fp8_config.enabled=${fp8_enabled} \
165+
fp8_config.fp8_recipe=${fp8_recipe} \
166+
fp8_config.fp8_format=${fp8_format}

0 commit comments

Comments
 (0)