Skip to content

Commit e50e8a3

Browse files
committed
Add TIMM ViT recipe to BioNeMo with Megatron-FSDP.
Signed-off-by: Cory Ye <cye@nvidia.com>
1 parent fe8d7e6 commit e50e8a3

521 files changed

Lines changed: 98768 additions & 0 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

recipes/vit/.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Ignore archives, run artifacts (checkpoints, wandb, outputs) and tool caches.
*.tar*
2+
*.zip
3+
checkpoints/
4+
wandb/
5+
outputs/
6+
__pycache__/
7+
.ruff_cache/

recipes/vit/.ruff.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Inherit the Ruff settings from the parent directory, then relax linting for this recipe.
extend = "../.ruff.toml"
2+
[lint]
3+
# NOTE(review): no "tokenizer_auto" path is visible elsewhere in this recipe; confirm this entry is still needed.
per-file-ignores = { "tokenizer_auto" = ["ALL"] }
4+
# Rule-code prefixes that are ignored for every file in this recipe.
ignore = ["RUF","D","N","E","PLW","PERF","C","F"]

recipes/vit/Dockerfile

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Recipe image: installs requirements.txt on top of the NVIDIA PyTorch container and copies the build context into /workspace.
FROM nvcr.io/nvidia/pytorch:25.06-py3
2+
3+
# Install Python dependencies using build-time mounts: a netrc secret, a pip cache, and a bind mount of requirements.txt.
# PIP_CONSTRAINT is set to empty for this command only, presumably to bypass a constraints file configured by the base image (confirm).
RUN --mount=type=secret,id=netrc,target=/root/.netrc \
4+
--mount=type=cache,target=/root/.cache/pip \
5+
--mount=type=bind,source=requirements.txt,target=/requirements.txt \
6+
PIP_CONSTRAINT= pip install -r /requirements.txt
7+
8+
WORKDIR /workspace
9+
COPY . .

recipes/vit/README.md

Lines changed: 47 additions & 0 deletions

recipes/vit/config/defaults.yaml

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# Baseline model settings shared by the recipe configs that list `defaults` in their defaults list.
# The `vit` keys are presumably forwarded to the ViT constructor (the commit describes a TIMM ViT recipe); confirm against the training script.
model:
2+
vit:
3+
img_size: 224
4+
patch_size: 16
5+
in_chans: 3
6+
num_classes: 100000
7+
global_pool: "token"
8+
embed_dim: 768
9+
depth: 12
10+
num_heads: 12
11+
mlp_ratio: 4.0
12+
qkv_bias: true
13+
qk_norm: false
14+
scale_attn_norm: false
15+
scale_mlp_norm: false
16+
proj_bias: true
17+
init_values: null
18+
class_token: true
19+
pos_embed: true
20+
no_embed_class: false
21+
reg_tokens: 0
22+
pre_norm: false
23+
final_norm: true
24+
fc_norm: null
25+
pool_include_prefix: false
26+
# All dropout-style rates are disabled (0.0) in the baseline.
drop_rate: 0.0
27+
pos_drop_rate: 0.0
28+
patch_drop_rate: 0.0
29+
proj_drop_rate: 0.0
30+
attn_drop_rate: 0.0
31+
drop_path_rate: 0.0
32+
weight_init: "timm"
33+
init_variance_rescale: false
34+
# Off by default; the Transformer Engine variant config sets this to true.
transformer_engine: false
35+
channels_last: false
36+
37+
# Optimizer hyperparameters (the consumer is not shown here; betas/eps suggest an Adam-style optimizer, confirm).
# Exponent floats carry a decimal point (1.0e-4, not 1e-4) so that YAML 1.1 loaders such as PyYAML
# resolve them as floats instead of strings.
optimizer:
38+
lr: 1.0e-4
39+
betas: [0.9, 0.98]
40+
eps: 1.0e-8
41+
weight_decay: 0.01
42+
43+
# Parallelism sizes, all 1 by default.
# NOTE(review): dp_inter / dp_shard / cp / tp presumably mean inter-group data parallel, sharded data parallel,
# context parallel and tensor parallel; confirm against the training script.
distributed:
44+
dp_inter: 1
45+
dp_shard: 1
46+
cp: 1
47+
tp: 1
48+
49+
# FSDP settings (the commit describes this recipe as using Megatron-FSDP); option semantics are defined by that library.
fsdp:
50+
init_model_with_meta_device: true
51+
zero_dp_strategy: "optim_grads_params"
52+
# Module classes, given as dotted names, that are treated as FSDP units.
fsdp_unit_modules:
53+
- vit.Block
54+
- vit.PatchEmbed
55+
- torch.nn.LayerNorm
56+
- torch.nn.Linear
57+
use_hybrid_fsdp: true
58+
outer_dp_sharding_strategy: "optim"
59+
grad_reduce_in_fp32: false
60+
preserve_fp32_weights: true
61+
62+
# Training loop defaults. Checkpointing is left unset here (path and resume_from_metric are null);
# the model-specific configs supply a checkpoint path and a metric sign ("+" or "-").
training:
63+
steps: 500
64+
val_interval: 25
65+
log_interval: 5
66+
checkpoint:
67+
path: null
68+
resume_from_metric: null
69+
70+
# Dataset defaults. Locations (root, class_map, label_map) are null here and are set by the model-specific configs.
# Scalars are written canonically and consistently across train and val: lowercase true/false,
# floats with a trailing digit (0.0), and double-quoted strings.
dataset:
71+
num_workers: 0
72+
train:
73+
root: null
74+
class_map: null
75+
label_map: null
76+
class_filter: null
77+
batch_size: 1
78+
shuffle: false
79+
# Training-time image transform options (augmentation probabilities and modes).
transform_kwargs:
80+
img_size: 224
81+
scale: null
82+
ratio: null
83+
train_crop_mode: null
84+
hflip: 0.5
85+
vflip: 0.0
86+
color_jitter: 0.4
87+
color_jitter_prob: null
88+
grayscale_prob: 0.0
89+
gaussian_blur_prob: 0.0
90+
interpolation: "random"
91+
re_prob: 0.0
92+
re_mode: "const"
93+
re_count: 1
94+
re_num_splits: 0
95+
normalize: true
96+
separate: false
97+
patch_size: 16
98+
patchify: false
99+
val:
100+
root: null
101+
class_map: null
102+
label_map: null
103+
class_filter: null
104+
batch_size: 1
105+
shuffle: false
106+
# Validation-time image transform options (no random augmentation keys are set here).
transform_kwargs:
107+
img_size: 224
108+
crop_pct: null
109+
crop_mode: null
110+
crop_border_pixels: null
111+
interpolation: "bilinear"
112+
mean: [0.485, 0.456, 0.406]
113+
std: [0.229, 0.224, 0.225]
114+
normalize: true
115+
patch_size: 16
116+
patchify: false
117+
118+
# Fixed seed for the run (the consumer is not shown here).
random:
119+
seed: 42
120+
121+
# Optional instrumentation; both switches are off by default.
profiling:
122+
torch_memory_profile: false
123+
wandb: false
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# Hydra defaults list: load defaults.yaml first, then apply this file's own values on top (`_self_` is last).
defaults:
2+
- defaults
3+
- _self_
4+
5+
# Model settings for this config. Several values repeat defaults.yaml (img_size, patch_size, embed_dim, depth, num_heads, ...).
# The values that differ from defaults.yaml are: global_pool, qk_norm, scale_attn_norm, scale_mlp_norm, reg_tokens,
# pre_norm, fc_norm, every *_drop_rate / drop_path_rate value, weight_init and init_variance_rescale.
model:
6+
vit:
7+
img_size: 224
8+
patch_size: 16
9+
in_chans: 3
10+
num_classes: 100000
11+
global_pool: "map"
12+
embed_dim: 768
13+
depth: 12
14+
num_heads: 12
15+
mlp_ratio: 4.0
16+
qkv_bias: true
17+
qk_norm: true
18+
scale_attn_norm: true
19+
scale_mlp_norm: true
20+
proj_bias: true
21+
init_values: null
22+
class_token: true
23+
pos_embed: true
24+
no_embed_class: false
25+
reg_tokens: 8
26+
pre_norm: true
27+
final_norm: true
28+
fc_norm: true
29+
pool_include_prefix: false
30+
# All dropout-style rates are set to 0.05 here (0.0 in defaults.yaml).
drop_rate: 0.05
31+
pos_drop_rate: 0.05
32+
patch_drop_rate: 0.05
33+
proj_drop_rate: 0.05
34+
attn_drop_rate: 0.05
35+
drop_path_rate: 0.05
36+
weight_init: null
37+
init_variance_rescale: true
38+
transformer_engine: false
39+
channels_last: false
40+
41+
# Parallelism sizes; identical to the values in defaults.yaml (all 1).
distributed:
42+
dp_inter: 1
43+
dp_shard: 1
44+
cp: 1
45+
tp: 1
46+
47+
# FSDP settings. Same keys as defaults.yaml, but the two strategy options are given as integers instead of names.
fsdp:
48+
init_model_with_meta_device: true
49+
zero_dp_strategy: 3  # NOTE(review): presumably the integer form of "optim_grads_params" used in defaults.yaml; confirm
50+
# Module classes, given as dotted names, that are treated as FSDP units.
fsdp_unit_modules:
51+
- vit.Block
52+
- vit.PatchEmbed
53+
- torch.nn.LayerNorm
54+
- torch.nn.Linear
55+
use_hybrid_fsdp: true
56+
outer_dp_sharding_strategy: 1  # NOTE(review): presumably the integer form of "optim" used in defaults.yaml; confirm
57+
grad_reduce_in_fp32: false
58+
preserve_fp32_weights: true
59+
60+
# Training loop settings; steps and intervals match defaults.yaml, and checkpointing is given a path and a metric sign.
training:
61+
steps: 500
62+
val_interval: 25
63+
log_interval: 5
64+
checkpoint:
65+
path: "./checkpoints/vit"
66+
resume_from_metric: "-" # + = Highest Metric (Score), - = Lowest Metric (Loss)
67+
68+
# Dataset locations for the super-tiny-imagenet-5 sample data, with batch size 5 and 4 workers.
# No transform_kwargs are set here, so those come from defaults.yaml.
dataset:
69+
num_workers: 4
70+
train:
71+
root: "./data/super-tiny-imagenet-5/train"
72+
class_map: "./data/super-tiny-imagenet-5/words.txt"
73+
label_map: null # Not needed, training data is labeled by directory.
74+
class_filter: null
75+
batch_size: 5
76+
shuffle: true
77+
val:
78+
root: "./data/super-tiny-imagenet-5/val"
79+
class_map: "./data/super-tiny-imagenet-5/words.txt"
80+
# Validation labels come from an annotations file rather than from the directory layout.
label_map: "./data/super-tiny-imagenet-5/val/val_annotations.txt"
81+
class_filter: null
82+
batch_size: 5
83+
shuffle: false
84+
85+
# Fixed seed for the run; same value as defaults.yaml.
random:
86+
seed: 42
87+
88+
# Optional instrumentation. Both switches stay off; the *_kwargs mappings hold the options used when they are enabled.
profiling:
89+
torch_memory_profile: false
90+
torch_memory_profile_kwargs:
91+
max_entries: 250000
92+
wandb: false
93+
wandb_kwargs:
94+
# To use WandB, export WANDB_API_KEY=<your_api_key>!
95+
name: "bionemo-vit"
96+
project: "bionemo-recipes"
97+
dir: null
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Transformer Engine variant. The Hydra defaults list layers defaults, then vit_base_patch16_224, then this
# file's own values (`_self_` is last), which enable model.transformer_engine and use a separate checkpoint directory.
defaults:
2+
- defaults
3+
- vit_base_patch16_224
4+
- _self_
5+
6+
model:
7+
transformer_engine: true
8+
9+
training:
10+
checkpoint:
11+
path: "./checkpoints/vit_te"
12+
resume_from_metric: "-" # + = Highest Metric (Score), - = Lowest Metric (Loss)
2.3 KB
2.46 KB
2.04 KB

0 commit comments

Comments
 (0)