Skip to content

Commit 4a0f83b

Browse files
author
Donglai Wei
committed
Fix mito_mitoEM_H OOM training
1 parent b0a06f5 commit 4a0f83b

2 files changed

Lines changed: 32 additions & 2 deletions

File tree

justfile

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -121,13 +121,13 @@ slurm partition num_cpu num_gpu cmd constraint='' mem='32G':
121121
--output=slurm_outputs/slurm-%j.out \
122122
--error=slurm_outputs/slurm-%j.err \
123123
--nodes=1 \
124-
--ntasks={{num_gpu}} \
124+
--ntasks=1 \
125125
--gpus-per-node={{num_gpu}} \
126126
--cpus-per-task={{num_cpu}} \
127127
--mem={{mem}} \
128128
--time=$time_limit \
129129
$constraint_flag \
130-
--wrap="mkdir -p \$HOME/.just && export JUST_TEMPDIR=\$HOME/.just TMPDIR=\$HOME/.just && source /projects/weilab/weidf/lib/miniconda3/bin/activate pytc && cd $PWD && srun --ntasks={{num_gpu}} --ntasks-per-node={{num_gpu}} {{cmd}}"
130+
--wrap="mkdir -p \$HOME/.just && export JUST_TEMPDIR=\$HOME/.just TMPDIR=\$HOME/.just NCCL_SOCKET_FAMILY=AF_INET && source /projects/weilab/weidf/lib/miniconda3/bin/activate pytc && cd $PWD && srun --ntasks=1 {{cmd}}"
131131

132132
# Alias for slurm (kept for backward compatibility)
133133
slurm-sh partition num_cpu num_gpu cmd constraint='' mem='32G':

tutorials/mito_mitoEM_H.yaml

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,28 @@
11
_base_: mito_mitoEM_common.yaml
22

33
experiment_name: mitoem30h_mednext_sdt
4+
system:
5+
training:
6+
num_workers: 2
7+
batch_size: 2
8+
inference:
9+
batch_size: 1
10+
model:
11+
input_size:
12+
- 16
13+
- 256
14+
- 256
15+
output_size:
16+
- 16
17+
- 256
18+
- 256
419
data:
20+
use_preloaded_cache: false
21+
persistent_workers: false
22+
patch_size:
23+
- 16
24+
- 256
25+
- 256
526
train_image:
627
- EM30-H/im_train.h5
728
train_label:
@@ -10,6 +31,15 @@ data:
1031
- EM30-H/im_val.h5
1132
val_label:
1233
- EM30-H/mito_val-v2.h5
34+
optimization:
35+
accumulate_grad_batches: 4
36+
inference:
37+
sliding_window:
38+
window_size:
39+
- 16
40+
- 256
41+
- 256
42+
sw_batch_size: 1
1343
monitor:
1444
checkpoint:
1545
dirpath: outputs/mitoem30h_mednext_sdt/checkpoints/

0 commit comments

Comments
 (0)