fix num_cpus=-1

Donglai Wei · Donglai Wei · commit 14c89663f035 · 2026-02-20T01:10:38.000-05:00
diff --git a/connectomics/config/auto_config.py b/connectomics/config/auto_config.py
@@ -49,6 +49,37 @@ def _available_cpus_for_current_run() -> int:
     return max(os.cpu_count() or 1, 1)
 
 
+def _infer_local_process_count(
+    *,
+    requested_num_gpus: int,
+    available_gpus: int,
+) -> int:
+    """
+    Estimate how many trainer processes will run on this node for a config section.
+
+    ``requested_num_gpus=-1`` resolves to ``available_gpus``. With ``SLURM_NTASKS=1``
+    (also assumed when unset or non-integer) and more than one GPU, Lightning uses
+    local multi-GPU spawn, so the result is one process per GPU. Externally launched
+    jobs (``SLURM_NTASKS>1``) give each task its own worker budget, so the result is 1.
+    """
+    slurm_ntasks = os.environ.get("SLURM_NTASKS", "1")
+    try:
+        slurm_ntasks_int = int(slurm_ntasks)
+    except ValueError:
+        slurm_ntasks_int = 1  # non-integer value: treat it like a single task
+
+    resolved_num_gpus = requested_num_gpus
+    if requested_num_gpus == -1:
+        resolved_num_gpus = available_gpus  # -1 is the "use every available GPU" sentinel
+
+    # CPU-only / single-GPU / externally launched distributed: no local spawn fan-out.
+    if resolved_num_gpus <= 1 or slurm_ntasks_int != 1:
+        return 1
+
+    # Local spawn fan-out: one process per GPU.
+    return int(resolved_num_gpus)
+
+
 def resolve_runtime_resource_sentinels(
     config: DictConfig,
     print_results: bool = True,
@@ -83,11 +114,16 @@ def resolve_runtime_resource_sentinels(
                 )
 
         if getattr(section, "num_workers", None) == -1:
-            section.num_workers = available_cpus
+            process_count = _infer_local_process_count(
+                requested_num_gpus=getattr(section, "num_gpus", 0),
+                available_gpus=available_gpus,
+            )
+            section.num_workers = max(1, available_cpus // process_count)
             if print_results:
                 print(
                     f"🔧 Auto-detected system.{section_name}.num_workers: "
-                    f"-1 → {section.num_workers}"
+                    f"-1 → {section.num_workers} "
+                    f"(available_cpus={available_cpus}, local_processes={process_count})"
                 )
 
         if getattr(section, "num_gpus", 0) < -1:
diff --git a/tutorials/mito_mitoEM_common.yaml b/tutorials/mito_mitoEM_common.yaml
@@ -64,7 +64,7 @@ data:
   - 32
   - 256
   - 256
-  iter_num_per_epoch: 2000
+  iter_num_per_epoch: 200
   image_transform:
     clip_percentile_low: 0.005
     clip_percentile_high: 0.995
@@ -93,7 +93,7 @@ data:
     intensity:
       enabled: true
 optimization:
-  max_epochs: 1000
+  max_epochs: 500
   accumulate_grad_batches: 1
   val_check_interval: 10
   num_sanity_val_steps: 0
diff --git a/tutorials/neuron_nisb_base_40nm.yaml b/tutorials/neuron_nisb_base_40nm.yaml
@@ -49,7 +49,7 @@ data:
   train_label: seed*/data.zarr/seg_40-36-36nm.h5
   val_image: seed*/data.zarr/img_40-36-36nm.h5
   val_label: seed*/data.zarr/seg_40-36-36nm.h5
-  iter_num_per_epoch: 1000
+  iter_num_per_epoch: 200
 
   train_resolution: [36, 36, 40]
   val_resolution: [36, 36, 40]
@@ -104,19 +104,9 @@ data:
       contrast_prob: 0.5
       contrast_range: [0.9, 1.1]
 
-    missing_section:
-      enabled: true
-      prob: 0.05
-      num_sections: 2
-
-    misalignment:
-      enabled: true
-      prob: 0.05
-      displacement: 10
-      rotate_ratio: 0.0
 
 optimization:
-  max_epochs: 100
+  max_epochs: 500
   gradient_clip_val: 1.0
   accumulate_grad_batches: 1
   precision: "16-mixed"