
Commit 4f90c8b

Author: Donglai Wei
Commit message: fix bug when non-deep-supervision
Parent: ac279bb
14 files changed: 933 additions, 53 deletions

.claude/repos_other/BANIS_SUMMARY.md (1 addition, 1 deletion)

```diff
@@ -4,7 +4,7 @@
 
 BANIS is a baseline implementation for the **Neuron Instance Segmentation Benchmark (NISB)**, providing an easily adaptable framework for neuron instance segmentation in electron microscopy (EM) images. The project combines affinity prediction with modern deep learning architectures (MedNeXt) and simple connected components for post-processing.
 
-**Repository Location**: `/projects/weilab/weidf/lib/banis`
+**Repository Location**: `/projects/weilab/weidf/lib/seg/banis`
 
 **Key Features**:
 - Affinity-based segmentation approach with short and long-range affinities
```

.gitignore (1 addition)

```diff
@@ -154,3 +154,4 @@ lib/
 
 # Development logs and documentation
 tmp/
+crackit/
```

connectomics/config/__init__.py (2 additions)

```diff
@@ -23,6 +23,7 @@
     auto_plan_config,
     AutoConfigPlanner,
     AutoPlanResult,
+    resolve_runtime_resource_sentinels,
 )
 
 # GPU utilities
@@ -53,6 +54,7 @@
     "auto_plan_config",
     "AutoConfigPlanner",
     "AutoPlanResult",
+    "resolve_runtime_resource_sentinels",
     # GPU utilities
     "get_gpu_info",
     "print_gpu_info",
```

connectomics/config/auto_config.py (73 additions)

```diff
@@ -16,6 +16,7 @@
 from dataclasses import dataclass, field
 from omegaconf import OmegaConf, DictConfig
 import warnings
+import os
 
 from .gpu_utils import (
     get_gpu_info,
@@ -25,6 +26,78 @@
 )
 
 
+def _available_cpus_for_current_run() -> int:
+    """
+    Detect CPU slots available to the current process (SLURM/cgroup aware).
+
+    Priority:
+    1) CPU affinity mask (best under cgroups/SLURM)
+    2) SLURM_CPUS_PER_TASK
+    3) os.cpu_count()
+    """
+    try:
+        affinity = os.sched_getaffinity(0)
+        if affinity:
+            return len(affinity)
+    except Exception:
+        pass
+
+    slurm_cpus_per_task = os.environ.get("SLURM_CPUS_PER_TASK")
+    if slurm_cpus_per_task and slurm_cpus_per_task.isdigit():
+        return max(int(slurm_cpus_per_task), 1)
+
+    return max(os.cpu_count() or 1, 1)
+
+
+def resolve_runtime_resource_sentinels(
+    config: DictConfig,
+    print_results: bool = True,
+) -> DictConfig:
+    """
+    Resolve runtime resource sentinels in system.{training,inference}.
+
+    Sentinel convention:
+    - num_gpus = -1    -> use all GPUs visible to this run
+    - num_workers = -1 -> use all CPU slots available to this run
+
+    This is runtime-oriented (SLURM/cgroup aware) and complements auto-planning.
+    """
+    if not hasattr(config, "system"):
+        return config
+
+    gpu_info = get_gpu_info()
+    available_gpus = gpu_info["num_gpus"] if gpu_info["cuda_available"] else 0
+    available_cpus = _available_cpus_for_current_run()
+
+    for section_name in ("training", "inference"):
+        section = getattr(config.system, section_name, None)
+        if section is None:
+            continue
+
+        if getattr(section, "num_gpus", None) == -1:
+            section.num_gpus = available_gpus
+            if print_results:
+                print(
+                    f"🔧 Auto-detected system.{section_name}.num_gpus: "
+                    f"-1 → {section.num_gpus}"
+                )
+
+        if getattr(section, "num_workers", None) == -1:
+            section.num_workers = available_cpus
+            if print_results:
+                print(
+                    f"🔧 Auto-detected system.{section_name}.num_workers: "
+                    f"-1 → {section.num_workers}"
+                )
+
+        if getattr(section, "num_gpus", 0) < -1:
+            raise ValueError(f"system.{section_name}.num_gpus must be >= -1")
+        if getattr(section, "num_workers", 0) < -1:
+            raise ValueError(f"system.{section_name}.num_workers must be >= -1")
+
+    return config
+
+
 @dataclass
 class AutoPlanResult:
     """Results from automatic planning."""
```

connectomics/training/deep_supervision.py (34 additions, 6 deletions)

```diff
@@ -34,6 +34,16 @@ def _loss_supports_weight(loss_fn: nn.Module) -> bool:
     return False
 
 
+def _is_class_index_loss(loss_fn: nn.Module) -> bool:
+    """Return True if loss expects class-index labels (1 channel target).
+
+    Cross-entropy style losses consume dense logits [B, C, ...] and class-index
+    targets [B, 1, ...] or [B, ...], unlike BCE/MSE-style losses that require
+    channel-aligned dense targets [B, C, ...].
+    """
+    return loss_fn.__class__.__name__ in {"CrossEntropyLoss", "CrossEntropyLossWrapper"}
+
+
 class DeepSupervisionHandler:
     """
     Handler for deep supervision and multi-task learning.
@@ -130,15 +140,33 @@ def compute_multitask_loss(
 
         # Extract channels for this task from outputs
         task_output = outputs[:, start_ch:end_ch, ...]
-        end_ch - start_ch
+        task_output_channels = end_ch - start_ch
+
+        # Determine label channel convention per task:
+        # - CE-style losses use class-index labels (1 channel)
+        # - Dense losses (BCE/MSE/MAE/Dice/etc.) use channel-aligned labels
+        task_loss_fns = [self.loss_functions[idx] for idx in loss_indices]
+        uses_class_index_targets = any(_is_class_index_loss(fn) for fn in task_loss_fns)
+        uses_dense_targets = any(not _is_class_index_loss(fn) for fn in task_loss_fns)
+
+        if uses_class_index_targets and uses_dense_targets:
+            raise ValueError(
+                f"Task '{task_name}' mixes class-index and dense target losses. "
+                "Use either CE-style losses only, or dense losses only, per task."
+            )
 
-        # Determine number of label channels needed
-        # For softmax-based losses (2+ output channels), label has 1 channel
-        # For sigmoid-based losses (1 output channel), label has 1 channel
-        # So labels always use 1 channel per task
-        num_label_channels = 1
+        num_label_channels = 1 if uses_class_index_targets else task_output_channels
 
         # Extract label channels
+        if label_ch_offset + num_label_channels > labels.shape[1]:
+            raise ValueError(
+                f"Label channel mismatch for task '{task_name}': expected "
+                f"{num_label_channels} channel(s) at offset {label_ch_offset}, "
+                f"but label tensor has {labels.shape[1]} total channels. "
+                f"Task output slice is [{start_ch}:{end_ch}] "
+                f"({task_output_channels} channel(s))."
+            )
+
         task_label = labels[:, label_ch_offset:label_ch_offset + num_label_channels, ...]
         label_ch_offset += num_label_channels
```
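To make the convention concrete, here is a toy sketch (not repo code) of the channel bookkeeping the handler now enforces. The task layout, shapes, and loop are invented for illustration; only the 1-channel-for-CE vs. channel-aligned-for-dense rule comes from the diff:

```python
# Toy illustration: CE-style tasks consume 1 class-index label channel,
# dense (BCE/MSE-style) tasks consume as many label channels as they output.
import torch

outputs = torch.randn(2, 5, 8, 8)  # task A -> output channels 0:3, task B -> 3:5
labels = torch.cat(
    [
        torch.randint(0, 3, (2, 1, 8, 8)).float(),  # task A: class indices (1 channel)
        torch.rand(2, 2, 8, 8),                     # task B: dense targets (2 channels)
    ],
    dim=1,
)

offset = 0
for name, (start, end), ce_style in [("A", (0, 3), True), ("B", (3, 5), False)]:
    n_label = 1 if ce_style else (end - start)   # the new num_label_channels rule
    task_out = outputs[:, start:end]
    task_lbl = labels[:, offset:offset + n_label]
    offset += n_label
    print(name, tuple(task_out.shape), tuple(task_lbl.shape))
```

Under the old code every task consumed exactly one label channel, so a dense 3-channel task would silently read the wrong slice; the new shape check turns that into an explicit error.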

connectomics/training/lit/data_factory.py (25 additions, 12 deletions)

```diff
@@ -299,7 +299,7 @@ def create_datamodule(
         train_json_empty = True
     else:
         # Check if JSON file is empty or has no images
-        with open(json_path, "r") as f:
+        with open(json_path) as f:
            json_data = json.load(f)
         image_files = json_data.get(cfg.data.train_image_key, [])
         if not image_files:
@@ -463,11 +463,23 @@ def create_datamodule(
     if val_data_dicts:
         print(f"  Val dataset size: {len(val_data_dicts)}")
 
-    # Auto-compute iter_num from volume size if not specified (only for training)
+    # Auto-compute iter_num from volume size if not specified (only for training).
+    # IMPORTANT: cfg.data.iter_num_per_epoch is interpreted as optimizer steps/epoch.
+    # Dataset iter_num is sample-count based, so we convert steps -> samples.
     iter_num = None
     if mode == "train":
-        iter_num = cfg.data.iter_num_per_epoch
-        if iter_num == -1 and dataset_type != "filename":
+        iter_num_cfg = cfg.data.iter_num_per_epoch
+        if iter_num_cfg > 0:
+            # Convert requested steps/epoch to per-epoch sample count expected by datasets.
+            # Account for per-device batch size and number of training devices.
+            num_devices = cfg.system.training.num_gpus if cfg.system.training.num_gpus > 0 else 1
+            iter_num = int(iter_num_cfg * cfg.system.training.batch_size * num_devices)
+            print(
+                f"  Requested iter_num_per_epoch={iter_num_cfg} steps -> "
+                f"dataset samples={iter_num} "
+                f"(batch_size={cfg.system.training.batch_size}, devices={num_devices})"
+            )
+        elif iter_num_cfg == -1 and dataset_type != "filename":
             # For filename datasets, iter_num is determined by the number of files
             print("📊 Auto-computing iter_num from volume size...")
             import h5py
@@ -506,8 +518,11 @@ def create_datamodule(
             print(f"  Stride: {cfg.data.stride}")
             print(f"  Samples per volume: {samples_per_vol}")
             print(f"  ✅ Total possible samples (iter_num): {iter_num:,}")
-            print(f"  ✅ Batches per epoch: {iter_num // cfg.system.training.batch_size:,}")
+            # Approximate steps/epoch for informational logging.
+            num_devices = cfg.system.training.num_gpus if cfg.system.training.num_gpus > 0 else 1
+            denom = max(1, cfg.system.training.batch_size * num_devices)
+            print(f"  ✅ Approx steps per epoch: {iter_num // denom:,}")
-        elif iter_num == -1 and dataset_type == "filename":
+        elif iter_num_cfg == -1 and dataset_type == "filename":
             # For filename datasets, iter_num will be determined by dataset length
             print("  Filename dataset: iter_num will be determined by number of files in JSON")
 
@@ -558,9 +573,9 @@ def create_datamodule(
         pad_size = getattr(cfg.data.image_transform, "pad_size", None) or getattr(
             cfg.data, "pad_size", None
         )
-        pad_mode = getattr(
-            cfg.data.image_transform, "pad_mode", None
-        ) or getattr(cfg.data, "pad_mode", "reflect")
+        pad_mode = getattr(cfg.data.image_transform, "pad_mode", None) or getattr(
+            cfg.data, "pad_mode", "reflect"
+        )
 
         # Create optimized cached datasets
         train_dataset = CachedVolumeDataset(
@@ -589,9 +604,7 @@ def create_datamodule(
             persistent_workers=preloaded_num_workers > 0,
         )
 
-        print(
-            f"  Preload policy: train={train_preload_cfg}, val={val_preload_cfg}"
-        )
+        print(f"  Preload policy: train={train_preload_cfg}, val={val_preload_cfg}")
 
         # Create validation dataset and loader if validation data exists
         val_loader = None
```
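A worked example of the steps-to-samples conversion above, with hypothetical numbers:

```python
# Hypothetical request: 250 optimizer steps/epoch, batch_size=8, 4 training GPUs.
iter_num_cfg = 250
batch_size = 8
num_devices = 4

# steps -> per-epoch sample count, as in create_datamodule above
iter_num = iter_num_cfg * batch_size * num_devices            # 250 * 8 * 4 = 8000 samples

# and back again, for the informational "Approx steps per epoch" log
approx_steps = iter_num // max(1, batch_size * num_devices)   # 8000 // 32 = 250
```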

connectomics/training/lit/utils.py (4 additions)

```diff
@@ -21,6 +21,7 @@
     Config,
     load_config,
     resolve_data_paths,
+    resolve_runtime_resource_sentinels,
     update_from_cli,
     validate_config,
 )
@@ -226,6 +227,9 @@ def setup_config(args) -> Config:
         cfg.data.cellmap["input_array_info"]["shape"] = [64, 64, 64]
         cfg.data.cellmap["target_array_info"]["shape"] = [64, 64, 64]
 
+    # Resolve -1 sentinels (auto-max resources for current runtime allocation).
+    cfg = resolve_runtime_resource_sentinels(cfg, print_results=True)
+
     # CPU-only fallback: avoid multiprocessing workers when no CUDA is available
     if not torch.cuda.is_available():
         if cfg.system.training.num_workers > 0:
```

justfile (78 additions, 21 deletions)

```diff
@@ -5,6 +5,29 @@
 default:
     @just --list
 
+# Resolve SLURM time limit for a partition (fallback to sensible defaults).
+_slurm-time-limit partition:
+    #!/usr/bin/env bash
+    set -euo pipefail
+    time_limit=$(sinfo -p {{partition}} -h -o "%l" | head -1)
+    if [ -z "$time_limit" ] || [ "$time_limit" = "infinite" ]; then
+        case "{{partition}}" in
+            short|interactive)
+                time_limit="12:00:00"
+                ;;
+            medium)
+                time_limit="2-00:00:00"
+                ;;
+            long)
+                time_limit="5-00:00:00"
+                ;;
+            *)
+                time_limit="7-00:00:00"
+                ;;
+        esac
+    fi
+    echo "$time_limit"
+
 # ============================================================================
 # Setup & Data
 # ============================================================================
@@ -89,31 +112,17 @@ tensorboard-run experiment timestamp port='6006':
 #   just slurm short 8 4 "python scripts/main.py --config tutorials/lucchi.yaml"
 #   just slurm short 8 4 "just train lucchi++" "" "64G"  # override memory
 # Time limits: short=12h, medium=2d, long=5d
+# CPU-only convenience wrapper for single-task jobs.
+#   just slurm-cpu short 8 0 "python scripts/downsample_nisb.py --splits train"
 slurm partition num_cpu num_gpu cmd constraint='' mem='32G':
     #!/usr/bin/env bash
     constraint_flag=""
     if [ -n "{{constraint}}" ]; then
         constraint_flag="--constraint={{constraint}}"
     fi
 
-    # Set time limit to partition maximum
-    time_limit=$(sinfo -p {{partition}} -h -o "%l" | head -1)
-    if [ -z "$time_limit" ] || [ "$time_limit" = "infinite" ]; then
-        case "{{partition}}" in
-            short|interactive)
-                time_limit="12:00:00"
-                ;;
-            medium)
-                time_limit="2-00:00:00"
-                ;;
-            long)
-                time_limit="5-00:00:00"
-                ;;
-            *)
-                time_limit="7-00:00:00"
-                ;;
-        esac
-    fi
+    # Resolve partition time limit (with fallback defaults)
+    time_limit=$(just _slurm-time-limit {{partition}})
 
     # Run the command exactly as provided (no auto "just" wrapping).
     sbatch --job-name="pytc_{{cmd}}" \
@@ -129,9 +138,57 @@ slurm partition num_cpu num_gpu cmd constraint='' mem='32G':
        $constraint_flag \
        --wrap="mkdir -p \$HOME/.just && export JUST_TEMPDIR=\$HOME/.just TMPDIR=\$HOME/.just NCCL_SOCKET_FAMILY=AF_INET && source /projects/weilab/weidf/lib/miniconda3/bin/activate pytc && cd $PWD && srun --ntasks=1 --gpus-per-task={{num_gpu}} --cpus-per-task={{num_cpu}} {{cmd}}"
 
-# Alias for slurm (kept for backward compatibility)
-slurm-sh partition num_cpu num_gpu cmd constraint='' mem='32G':
-    just slurm {{partition}} {{num_cpu}} {{num_gpu}} {{cmd}} {{constraint}} {{mem}}
+# Generic CPU-only multi-task launcher (single node, no GPU).
+# Example:
+#   just slurm-cpu-parallel short 7 1 "python scripts/downsample_nisb.py --task \$SLURM_PROCID"
+slurm-cpu-parallel partition num_tasks='7' cpu_per_task='4' cmd='' constraint='' mem='64G':
+    #!/usr/bin/env bash
+    set -euo pipefail
+    mkdir -p slurm_outputs
+    cmd_value='{{cmd}}'
+    if [ -z "$cmd_value" ]; then
+        echo "Error: cmd must be provided. Usage:"
+        echo "  just slurm-cpu-parallel <partition> <num_tasks> <cpu_per_task> \"<command>\" [constraint] [mem]"
+        exit 2
+    fi
+
+    constraint_value='{{constraint}}'
+    constraint_flag=""
+    if [ -n "$constraint_value" ]; then
+        constraint_flag="--constraint=$constraint_value"
+    fi
+
+    # Resolve partition time limit (with fallback defaults)
+    time_limit=$(just _slurm-time-limit {{partition}})
+
+    sbatch --job-name="pytc_cpu_{{num_tasks}}t" \
+        --partition={{partition}} \
+        --output=slurm_outputs/slurm-%j.out \
+        --error=slurm_outputs/slurm-%j.err \
+        --nodes=1 \
+        --ntasks={{num_tasks}} \
+        --gpus-per-task=0 \
+        --cpus-per-task={{cpu_per_task}} \
+        --mem={{mem}} \
+        --time=$time_limit \
+        $constraint_flag \
+        --wrap="mkdir -p \$HOME/.just && export JUST_TEMPDIR=\$HOME/.just TMPDIR=\$HOME/.just && source /projects/weilab/weidf/lib/miniconda3/bin/activate pytc && cd $PWD && srun --ntasks={{num_tasks}} --gpus-per-task=0 --cpus-per-task={{cpu_per_task}} bash -c '$cmd_value'"
+
+# Generic CPU-only multi-task launcher for sharded scripts.
+# Automatically appends:
+#   --num-shards $SLURM_NTASKS --shard-index $SLURM_PROCID
+# Example:
+#   just slurm-cpu-sharded short 7 1 "python scripts/downsample_nisb.py"
+slurm-cpu-sharded partition num_tasks='7' cpu_per_task='4' cmd='' constraint='' mem='64G':
+    #!/usr/bin/env bash
+    set -euo pipefail
+    cmd_value='{{cmd}}'
+    if [ -z "$cmd_value" ]; then
+        echo "Error: cmd must be provided. Usage:"
+        echo "  just slurm-cpu-sharded <partition> <num_tasks> <cpu_per_task> \"<command>\" [constraint] [mem]"
+        exit 2
+    fi
+    just slurm-cpu-parallel {{partition}} {{num_tasks}} {{cpu_per_task}} "{{cmd}} --num-shards \$SLURM_NTASKS --shard-index \$SLURM_PROCID" "{{constraint}}" "{{mem}}"
 
 # Launch parameter sweep from config (e.g., just sweep tutorials/sweep_example.yaml)
 sweep config:
```
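For reference, a hypothetical Python script that consumes the flags `slurm-cpu-sharded` appends. The script name, work list, and round-robin split are illustrative; only the `--num-shards`/`--shard-index` flag names come from the recipe:

```python
# Hypothetical consumer of the flags appended by `just slurm-cpu-sharded`.
import argparse


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-shards", type=int, default=1)
    parser.add_argument("--shard-index", type=int, default=0)
    args = parser.parse_args()

    items = [f"volume_{i:03d}" for i in range(100)]   # placeholder work list
    mine = items[args.shard_index::args.num_shards]   # round-robin sharding by rank
    for item in mine:
        print(f"shard {args.shard_index}/{args.num_shards}: processing {item}")


if __name__ == "__main__":
    main()
```

Each SLURM task sees its own `$SLURM_PROCID`, so the tasks partition the work list without any coordination.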
