Skip to content

Commit 14c8966

Browse files
author
Donglai Wei
committed
fix num_cpus=-1
1 parent 4f90c8b commit 14c8966

3 files changed

Lines changed: 42 additions & 16 deletions

File tree

connectomics/config/auto_config.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,37 @@ def _available_cpus_for_current_run() -> int:
4949
return max(os.cpu_count() or 1, 1)
5050

5151

52+
def _infer_local_process_count(
    *,
    requested_num_gpus: int,
    available_gpus: int,
) -> int:
    """
    Estimate how many trainer processes will run on this node for a config section.

    In this codebase, when running under Slurm with ``SLURM_NTASKS=1`` and
    ``num_gpus > 1``, Lightning uses local multi-GPU spawn (one process per GPU).
    For externally launched distributed jobs (``SLURM_NTASKS>1``), each task
    should use its own worker budget, so we keep process count at 1 here.

    Args:
        requested_num_gpus: GPU count from the config section. ``-1`` means
            "use every available GPU". ``None`` (a null config value) is
            tolerated and treated as 0, i.e. CPU-only.
        available_gpus: Number of GPUs detected on this node. ``None`` is
            treated as 0.

    Returns:
        The number of local trainer processes that will share this node's
        CPUs; always at least 1.
    """
    # A missing or malformed SLURM_NTASKS is treated like a single-task job.
    try:
        slurm_ntasks = int(os.environ.get("SLURM_NTASKS", "1"))
    except ValueError:
        slurm_ntasks = 1

    # Externally launched distributed job: no local spawn fan-out.
    if slurm_ntasks != 1:
        return 1

    # Resolve the ``-1`` sentinel to the detected GPU count.
    if requested_num_gpus == -1:
        resolved_num_gpus = available_gpus
    else:
        resolved_num_gpus = requested_num_gpus

    # A null GPU count would make the comparison below raise TypeError;
    # treat it as "no GPUs" instead of crashing worker auto-detection.
    if resolved_num_gpus is None:
        resolved_num_gpus = 0

    # CPU-only / single-GPU: a single local process.
    if resolved_num_gpus <= 1:
        return 1

    # Local spawn fan-out: one process per GPU.
    return int(resolved_num_gpus)
81+
82+
5283
def resolve_runtime_resource_sentinels(
5384
config: DictConfig,
5485
print_results: bool = True,
@@ -83,11 +114,16 @@ def resolve_runtime_resource_sentinels(
83114
)
84115

85116
if getattr(section, "num_workers", None) == -1:
86-
section.num_workers = available_cpus
117+
process_count = _infer_local_process_count(
118+
requested_num_gpus=getattr(section, "num_gpus", 0),
119+
available_gpus=available_gpus,
120+
)
121+
section.num_workers = max(1, available_cpus // process_count)
87122
if print_results:
88123
print(
89124
f"🔧 Auto-detected system.{section_name}.num_workers: "
90-
f"-1 → {section.num_workers}"
125+
f"-1 → {section.num_workers} "
126+
f"(available_cpus={available_cpus}, local_processes={process_count})"
91127
)
92128

93129
if getattr(section, "num_gpus", 0) < -1:

tutorials/mito_mitoEM_common.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ data:
6464
- 32
6565
- 256
6666
- 256
67-
iter_num_per_epoch: 2000
67+
iter_num_per_epoch: 200
6868
image_transform:
6969
clip_percentile_low: 0.005
7070
clip_percentile_high: 0.995
@@ -93,7 +93,7 @@ data:
9393
intensity:
9494
enabled: true
9595
optimization:
96-
max_epochs: 1000
96+
max_epochs: 500
9797
accumulate_grad_batches: 1
9898
val_check_interval: 10
9999
num_sanity_val_steps: 0

tutorials/neuron_nisb_base_40nm.yaml

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ data:
4949
train_label: seed*/data.zarr/seg_40-36-36nm.h5
5050
val_image: seed*/data.zarr/img_40-36-36nm.h5
5151
val_label: seed*/data.zarr/seg_40-36-36nm.h5
52-
iter_num_per_epoch: 1000
52+
iter_num_per_epoch: 200
5353

5454
train_resolution: [36, 36, 40]
5555
val_resolution: [36, 36, 40]
@@ -104,19 +104,9 @@ data:
104104
contrast_prob: 0.5
105105
contrast_range: [0.9, 1.1]
106106

107-
missing_section:
108-
enabled: true
109-
prob: 0.05
110-
num_sections: 2
111-
112-
misalignment:
113-
enabled: true
114-
prob: 0.05
115-
displacement: 10
116-
rotate_ratio: 0.0
117107

118108
optimization:
119-
max_epochs: 100
109+
max_epochs: 500
120110
gradient_clip_val: 1.0
121111
accumulate_grad_batches: 1
122112
precision: "16-mixed"

0 commit comments

Comments
 (0)