|
| 1 | +scope: partial-conv |
| 2 | +time_limit: 14400 |
| 3 | +key_segments: |
| 4 | + # Set a key to a string to rename it in the run identifier, or to False to exclude it from the run identifier. By default, all args under script_args are included. |
| 5 | + dataset_config: False |
| 6 | + dataset_dir: False |
| 7 | + data_base_path: False |
| 8 | + num_workers: False |
| 9 | + limit_val_batches: False |
| 10 | + val_check_interval: False |
| 11 | + experiment_name: False |
| 12 | + workspace: False |
| 13 | + restore_from_checkpoint_path: False |
| 14 | + activation_checkpoint_layers: False |
| 15 | + lora_enabled: False |
| 16 | + lr: False |
| 17 | + min_lr: False |
| 18 | + warmup_steps: False |
| 19 | + accumulate_grad_batches: False |
| 20 | + clip_grad: False |
| 21 | + weight_decay: False |
| 22 | + attention_dropout: False |
| 23 | + hidden_dropout: False |
| 24 | + precision: False |
| 25 | + seq_length: False |
| 26 | +script_args: |
| 27 | + # All arguments referenced in the script string must be specified here. |
| 28 | + # Arguments not referenced in the script string must have the 'arg' field specified. |
| 29 | + # See jet/core/configs.py for the specification of the configuration class |
| 30 | + workspace: /workspace/bionemo2 |
| 31 | + data_base_path: /data/evo2 |
| 32 | + restore_from_checkpoint_path: checkpoints/nemo2_evo2_1b_8k |
| 33 | + nodes: 1 |
| 34 | + model: evo2 |
| 35 | + config_name: 1b |
| 36 | + num_workers: 1 |
| 37 | + limit_val_batches: 20 |
| 38 | + dataset_config: training_data_config.yaml |
| 39 | + dataset_dir: preprocessed_data |
| 40 | + val_check_interval: 5 |
| 41 | + seq_length: 8192 |
| 42 | + warmup_steps: 10 |
| 43 | + activation_checkpoint_layers: 2 |
| 44 | + lr: 0.000015 |
| 45 | + min_lr: 0.0000149 |
| 46 | + accumulate_grad_batches: 4 |
| 47 | + max_steps: 1000 |
| 48 | + gpus: 1 |
| 49 | + clip_grad: 250 |
| 50 | + weight_decay: 0.001 |
| 51 | + attention_dropout: 0.01 |
| 52 | + hidden_dropout: 0.01 |
| 53 | + stop_steps: 100 |
| 54 | + batch_size: 2 |
| 55 | + variant: finetune |
| 56 | + precision: fp8 |
| 57 | + products: |
| 58 | + - variant: finetune |
| 59 | + lora_enabled: "" |
| 60 | + task: finetune_from_ckpt |
| 61 | + experiment_name: evo2-finetune |
| 62 | + - variant: lora_finetune |
| 63 | + lora_enabled: "--lora-finetune" |
| 64 | + task: lora_finetune_from_ckpt |
| 65 | + experiment_name: evo2-lora-finetune |
| 66 | +script: |- |
| 67 | + WANDB_API_KEY=$BIONEMO_WANDB_API_KEY train_${model} \ |
| 68 | + -d ${data_base_path}/${dataset_config} \ |
| 69 | + --dataset-dir=${data_base_path}/${dataset_dir} \ |
| 70 | + --ckpt-dir=${data_base_path}/${restore_from_checkpoint_path} \ |
| 71 | + ${lora_enabled} \ |
| 72 | + --model-size=${config_name} \ |
| 73 | + --max-steps=${max_steps} \ |
| 74 | + --experiment-name=${experiment_name}_${batch_size}bs_${nodes}node_${gpus}gpu_${max_steps}s \ |
| 75 | + --lr=${lr} \ |
| 76 | + --min-lr=${min_lr} \ |
| 77 | + --warmup-steps=${warmup_steps} \ |
| 78 | + --result-dir=${tensorboard_dir} \ |
| 79 | + --micro-batch-size=${batch_size} \ |
| 80 | + --grad-acc-batches=${accumulate_grad_batches} \ |
| 81 | + --limit-val-batches=${limit_val_batches} \ |
| 82 | + --seq-length=${seq_length} \ |
| 83 | + --clip-grad=${clip_grad} \ |
| 84 | + --wd=${weight_decay} \ |
| 85 | + --attention-dropout=${attention_dropout} \ |
| 86 | + --hidden-dropout=${hidden_dropout} \ |
| 87 | + --num-layers 4 \ |
| 88 | + --hybrid-override-pattern 'SDH*' \ |
| 89 | + --devices=${gpus} \ |
| 90 | + --num-nodes=${nodes} \ |
| 91 | + --val-check-interval=${val_check_interval} \ |
| 92 | + --wandb-project=${wandb_project_name} \ |
| 93 | + --wandb-group=${model}_${variant}_${config_name}_${task}_${target} \ |
| 94 | + --create-tensorboard-logger \ |
| 95 | + --activation-checkpoint-recompute-num-layers=${activation_checkpoint_layers} \ |
| 96 | + --disable-checkpointing \ |
| 97 | + --early-stop-on-step=${stop_steps} \ |
| 98 | + --garbage-collect-at-inference; |
0 commit comments