PytorchConnectomics
diff --git a/‎.claude/optuna/optuna_decoding_tuning.yaml‎
Lines changed: 0 additions & 1 deletion b/‎.claude/optuna/optuna_decoding_tuning.yaml‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎connectomics/config/hydra_config.py‎
Lines changed: 6 additions & 7 deletions b/‎connectomics/config/hydra_config.py‎
Lines changed: 6 additions & 7 deletions
diff --git a/‎connectomics/data/dataset/dataset_volume_cached.py‎
Lines changed: 79 additions & 56 deletions b/‎connectomics/data/dataset/dataset_volume_cached.py‎
Lines changed: 79 additions & 56 deletions
diff --git a/‎connectomics/data/process/distance.py‎
Lines changed: 4 additions & 0 deletions b/‎connectomics/data/process/distance.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎connectomics/training/lit/data_factory.py‎
Lines changed: 4 additions & 0 deletions b/‎connectomics/training/lit/data_factory.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎connectomics/training/lit/trainer.py‎
Lines changed: 24 additions & 2 deletions b/‎connectomics/training/lit/trainer.py‎
Lines changed: 24 additions & 2 deletions
@@ -22,7 +22,6 @@ description: Automated optimization of decoding parameters using Optuna
 # ============================================================================
 system:
   num_gpus: 1
-  num_cpus: 8
   seed: 42
 
 # ============================================================================
 
@@ -54,13 +54,11 @@ class SystemTrainingConfig:
 
     Attributes:
         num_gpus: Number of GPUs to use for training (0 for CPU-only)
-        num_cpus: Number of CPU cores available for data loading
         num_workers: Number of parallel data loading workers
         batch_size: Training batch size (per GPU)
     """
 
     num_gpus: int = 1
-    num_cpus: int = 4
     num_workers: int = 8
     batch_size: int = 4
 
@@ -74,13 +72,11 @@ class SystemInferenceConfig:
 
     Attributes:
         num_gpus: Number of GPUs to use for inference (0 for CPU-only)
-        num_cpus: Number of CPU cores available for data loading
         num_workers: Number of parallel data loading workers
         batch_size: Inference batch size (usually 1 for large volumes)
     """
 
     num_gpus: int = 1
-    num_cpus: int = 1
     num_workers: int = 1
     batch_size: int = 1
 
@@ -422,7 +418,7 @@ class DataConfig:
 
     # Data properties
     patch_size: List[int] = field(default_factory=lambda: [128, 128, 128])
-    pad_size: List[int] = field(default_factory=lambda: [8, 32, 32])
+    pad_size: List[int] = field(default_factory=lambda: [0, 0, 0])
     pad_mode: str = "reflect"  # Padding mode: 'reflect', 'replicate', 'constant', 'edge'
     stride: List[int] = field(default_factory=lambda: [1, 1, 1])  # Sampling stride (z, y, x)
 
@@ -476,6 +472,10 @@ class DataConfig:
     use_preloaded_cache: bool = (
         True  # Preload volumes into memory for fast random cropping (default: True)
     )
+    cached_sampling_max_attempts: int = 10  # Retry attempts for foreground-aware sampling
+    cached_sampling_foreground_threshold: float = (
+        0.0  # Minimum (label > 0) fraction required for training crops; 0 disables foreground sampling
+    )
 
     # Reject sampling configuration (for volumetric patch sampling)
     reject_sampling: Optional[Dict[str, Any]] = None  # Dict with 'size_thres' and 'p' keys
@@ -572,7 +572,7 @@ class OptimizationConfig:
     benchmark: bool = True
 
     # Validation and logging
-    val_check_interval: Union[int, float] = 1.0
+    val_check_interval: Union[int, float] = 1.0  # Validate every N epochs (legacy key name)
     log_every_n_steps: int = 50
     num_sanity_val_steps: int = 0
 
@@ -1137,7 +1137,6 @@ class InferenceConfig:
     # Inference-specific overrides (override system settings during inference)
     # Use -1 to keep training values, or >= 0 to override
     num_gpus: int = -1  # Override system.training.num_gpus if >= 0
-    num_cpus: int = -1  # Override system.training.num_cpus if >= 0
     batch_size: int = -1  # Override system.training.batch_size if >= 0 (typically 1 for inference)
     num_workers: int = -1  # Override system.training.num_workers if >= 0
 
 
@@ -162,6 +162,8 @@ def __init__(
         mode: str = "train",
         pad_size: Optional[Tuple[int, ...]] = None,
         pad_mode: str = "reflect",
+        max_attempts: int = 10,
+        foreground_threshold: float = 0.05,
     ):
         self.image_paths = image_paths
         self.label_paths = label_paths if label_paths else [None] * len(image_paths)
@@ -194,6 +196,8 @@ def __init__(
         self._d2_rejected_patches = 0
         self._d2_foreground_fracs = []
         self._d2_last_report_step = 0
+        self.max_attempts = max_attempts
+        self.foreground_threshold = foreground_threshold
 
         # Load all volumes into memory
         print(f"  Loading {len(image_paths)} volumes into memory...")
@@ -271,13 +275,16 @@ def __init__(
         # Support both 2D and 3D: get last N dimensions matching patch_size
         ndim = len(self.patch_size)
         self.volume_sizes = [img.shape[-ndim:] for img in self.cached_images]  # (Z, Y, X) or (Y, X)
-        
+
         # [D2 DIAGNOSTIC] Print foreground sampling configuration
         if self.mode == "train":
-            print(f"  [D2] Foreground sampling ENABLED:")
-            print(f"    - Minimum foreground threshold: 5.0%")
-            print(f"    - Max retry attempts: 10")
-            print(f"    - Will report statistics every 100 batches")
+            if self.foreground_threshold > 0:
+                print("  [D2] Foreground sampling ENABLED:")
+                print(f"    - Minimum foreground threshold: {self.foreground_threshold * 100:.1f}%")
+                print(f"    - Max retry attempts: {self.max_attempts}")
+                print("    - Will report statistics every 100 batches")
+            else:
+                print("  [D2] Foreground sampling DISABLED (threshold <= 0)")
 
     def _apply_padding(
         self, volume: np.ndarray, mode: Optional[str] = None, constant_values: float = 0
@@ -366,15 +373,15 @@ def __len__(self) -> int:
     def set_epoch(self, epoch: int, base_seed: int = 0):
         """
         Set current epoch for epoch-based validation reseeding.
-        
+
         This method enables validation to sample different patches each epoch
         while maintaining determinism. For training, this has no effect since
         training already uses random sampling.
-        
+
         Args:
             epoch: Current training epoch
             base_seed: Base random seed (typically from cfg.system.seed)
-        
+
         Usage:
             Called by ValidationReseedingCallback at the start of each validation epoch:
                 if hasattr(dataset, 'set_epoch'):
@@ -387,32 +394,36 @@ def set_epoch(self, epoch: int, base_seed: int = 0):
             self.current_epoch = epoch
             effective_seed = self.base_seed + epoch
             random.seed(effective_seed)
-            
+
             # IMPORTANT: Print to verify reseeding is happening
             # This should appear in logs at the start of EACH validation epoch
-            print(f"[Validation] Set epoch={epoch}, base_seed={base_seed}, effective_seed={effective_seed}")
-            print(f"[Validation] Dataset: {type(self).__name__}@{id(self)}, mode={self.mode}, iter_num={self.iter_num}")
-    
+            print(
+                f"[Validation] Set epoch={epoch}, base_seed={base_seed}, effective_seed={effective_seed}"
+            )
+            print(
+                f"[Validation] Dataset: {type(self).__name__}@{id(self)}, mode={self.mode}, iter_num={self.iter_num}"
+            )
+
     def get_sampling_fingerprint(self, num_samples: int = 5) -> str:
         """
         Generate a deterministic fingerprint of validation sampling.
-        
+
         This allows verification that validation patches change across epochs.
         The fingerprint is based on the first N random samples that would be
         generated with the current RNG state.
-        
+
         Args:
             num_samples: Number of random samples to include in fingerprint
-        
+
         Returns:
             String representing the sampling fingerprint
         """
         if self.mode != "val":
             return "N/A (training mode)"
-        
+
         # Save current RNG state
         state = random.getstate()
-        
+
         try:
             # Generate deterministic samples
             samples = []
@@ -422,11 +433,11 @@ def get_sampling_fingerprint(self, num_samples: int = 5) -> str:
                 # Sample patch position
                 pos = self._get_random_crop_position(vol_idx)
                 samples.append((vol_idx, pos))
-            
+
             # Create fingerprint string
             fingerprint = ", ".join([f"v{v}@{p}" for v, p in samples])
             return fingerprint
-        
+
         finally:
             # Restore RNG state
             random.setstate(state)
@@ -488,25 +499,46 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
         label = self.cached_labels[vol_idx]
         mask = self.cached_masks[vol_idx]
 
-        # [D2] Foreground-aware patch sampling: ensure patches contain sufficient mitochondria
-        # This prevents SDT collapse by avoiding background-only patches
-        max_attempts = 10
-        foreground_threshold = 0.05  # Require at least 5% foreground (SDT > 0)
-        
-        # [D2 DIAGNOSTIC] Track sampling attempts
-        attempts_used = 0
+        # [D2] Foreground-aware patch sampling: optional retry loop for training.
+        # Disabled whenever foreground_threshold <= 0 (a single random crop is taken instead).
+        max_attempts = self.max_attempts
+        foreground_threshold = self.foreground_threshold
+        use_foreground_sampling = (
+            self.mode == "train" and label is not None and foreground_threshold > 0
+        )
+
+        # [D2 DIAGNOSTIC] Attempt counter and final foreground fraction; these are only
+        # recorded in the sampling stats when foreground sampling is active.
+        attempts_used = 1
         final_foreground_frac = 0.0
-        
-        for attempt in range(max_attempts):
-            attempts_used = attempt + 1
-            
-            # Get crop position
+
+        if use_foreground_sampling:
+            for attempt in range(max_attempts):
+                attempts_used = attempt + 1
+                pos = self._get_random_crop_position(vol_idx)
+
+                # Crop using fast numpy slicing (like v1)
+                image_crop = crop_volume(image, self.patch_size, pos)
+                label_crop = crop_volume(label, self.patch_size, pos)
+                if mask is not None:
+                    mask_crop = crop_volume(mask, self.patch_size, pos)
+                else:
+                    mask_crop = np.zeros_like(image_crop)
+
+                foreground_frac = (label_crop > 0).sum() / label_crop.size
+                final_foreground_frac = foreground_frac
+
+                if foreground_frac >= foreground_threshold:
+                    break
+
+                # [D2 DIAGNOSTIC] Patch rejected, increment counter
+                self._d2_rejected_patches += 1
+        else:
+            # Standard single-crop behavior (no foreground-based retry)
             if self.mode == "train":
                 pos = self._get_random_crop_position(vol_idx)
             else:
                 pos = self._get_center_crop_position(vol_idx)
 
-            # Crop using fast numpy slicing (like v1)
             image_crop = crop_volume(image, self.patch_size, pos)
             if label is not None:
                 label_crop = crop_volume(label, self.patch_size, pos)
@@ -518,39 +550,30 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
             else:
                 mask_crop = np.zeros_like(image_crop)
 
-            # [D2] Check if patch has sufficient foreground (only during training)
-            if self.mode == "train" and label is not None:
-                foreground_frac = (label_crop > 0).sum() / label_crop.size
-                final_foreground_frac = foreground_frac
-                
-                if foreground_frac >= foreground_threshold:
-                    # [D2 DIAGNOSTIC] Good patch found
-                    break
-                else:
-                    # [D2 DIAGNOSTIC] Patch rejected, increment counter
-                    self._d2_rejected_patches += 1
-            else:
-                # Val/test mode or no label: accept any patch
-                break
-
-        # [D2 DIAGNOSTIC] Record statistics
-        self._d2_total_samples += 1
-        self._d2_total_attempts += attempts_used
-        self._d2_foreground_fracs.append(final_foreground_frac * 100)  # Convert to percentage
-        
+        # [D2 DIAGNOSTIC] Record/report sampling stats only when enabled.
+        if use_foreground_sampling:
+            self._d2_total_samples += 1
+            self._d2_total_attempts += attempts_used
+            self._d2_foreground_fracs.append(final_foreground_frac * 100)  # percentage
+
         # [D2 DIAGNOSTIC] Print report every 100 samples (not too verbose)
-        if self.mode == "train" and self._d2_total_samples % 100 == 0:
+        if use_foreground_sampling and self._d2_total_samples % 100 == 0:
             avg_attempts = self._d2_total_attempts / self._d2_total_samples
             reject_rate = (self._d2_rejected_patches / self._d2_total_attempts) * 100
             avg_fg = sum(self._d2_foreground_fracs) / len(self._d2_foreground_fracs)
             min_fg = min(self._d2_foreground_fracs)
             max_fg = max(self._d2_foreground_fracs)
-            
+
             print(f"[D2 Sampling Stats after {self._d2_total_samples} batches]")
             print(f"  Avg attempts per patch: {avg_attempts:.2f}/{max_attempts}")
-            print(f"  Patches rejected: {self._d2_rejected_patches}/{self._d2_total_attempts} ({reject_rate:.1f}%)")
+            print(
+                f"  Patches rejected: {self._d2_rejected_patches}/{self._d2_total_attempts} ({reject_rate:.1f}%)"
+            )
             print(f"  Final foreground %: avg={avg_fg:.1f}%, min={min_fg:.1f}%, max={max_fg:.1f}%")
-            print(f"  Threshold: {foreground_threshold*100:.1f}% (5% minimum)")
+            print(
+                f"  Threshold: {foreground_threshold * 100:.1f}% "
+                f"({self.foreground_threshold * 100:.1f}% minimum)"
+            )
 
         # Create data dict
         data = {
 
@@ -306,6 +306,10 @@ def skeleton_aware_distance_transform(
     """
     eps = 1e-6
 
+    # Fast path: a label with no foreground (no element > 0) returns an array filled with bg_value.
+    if np.sum(label > 0) == 0:
+        return np.full(label.shape, bg_value, dtype=np.float32)
+
     # Configure bbox processor
     config = BBoxProcessorConfig(
         bg_value=bg_value,
 
@@ -568,6 +568,8 @@ def create_datamodule(
             mode="train",
             pad_size=tuple(pad_size) if pad_size else None,
             pad_mode=pad_mode,
+            max_attempts=cfg.data.cached_sampling_max_attempts,
+            foreground_threshold=cfg.data.cached_sampling_foreground_threshold,
         )
 
         # Use fewer workers since we're loading from memory
@@ -623,6 +625,8 @@ def create_datamodule(
                 mode="val",
                 pad_size=tuple(pad_size) if pad_size else None,
                 pad_mode=pad_mode,
+                max_attempts=cfg.data.cached_sampling_max_attempts,
+                foreground_threshold=cfg.data.cached_sampling_foreground_threshold,
             )
 
             # Create validation dataloader
 
@@ -176,7 +176,7 @@ def create_trainer(
                 f"  EMA: Enabled (decay={ema_cfg.decay}, warmup_steps={ema_cfg.warmup_steps}, "
                 f"validate_with_ema={ema_cfg.validate_with_ema})"
             )
-        
+
         # [FIX 1 - PROPER IMPLEMENTATION] Validation reseeding callback
         # This ensures validation datasets are reseeded at the start of EACH validation epoch
         # Previous fix in val_dataloader() only ran once during setup
@@ -272,6 +272,27 @@ def create_trainer(
         max_steps = -1  # -1 means unlimited steps
         training_mode = f"epoch-based ({max_epochs} epochs)"
 
+    # Treat optimization.val_check_interval as epoch interval (legacy key name).
+    # Accept values like 1.0 from existing YAMLs, but reject non-integer floats.
+    val_check_cfg = cfg.optimization.val_check_interval
+    if isinstance(val_check_cfg, float):
+        if not val_check_cfg.is_integer():
+            raise ValueError(
+                "optimization.val_check_interval must be an integer number of epochs "
+                f"(got {val_check_cfg})."
+            )
+        check_val_every_n_epoch = int(val_check_cfg)
+    else:
+        check_val_every_n_epoch = int(val_check_cfg)
+
+    if check_val_every_n_epoch < 1:
+        raise ValueError(
+            "optimization.val_check_interval must be >= 1 "
+            f"(got {check_val_every_n_epoch})."
+        )
+
+    print(f"  Validation: every {check_val_every_n_epoch} epoch(s)")
+
     trainer = pl.Trainer(
         max_epochs=max_epochs,
         max_steps=max_steps,
@@ -282,7 +303,8 @@ def create_trainer(
         precision=cfg.optimization.precision,
         gradient_clip_val=cfg.optimization.gradient_clip_val,
         accumulate_grad_batches=cfg.optimization.accumulate_grad_batches,
-        val_check_interval=cfg.optimization.val_check_interval,
+        val_check_interval=1.0,
+        check_val_every_n_epoch=check_val_every_n_epoch,
         log_every_n_steps=cfg.optimization.log_every_n_steps,
         callbacks=callbacks,
         logger=logger,