Skip to content

Commit 652142d

Browse files
author
Alex J Lennon
committed
Add --no-download flag: run training without download/prepare steps
Made-with: Cursor
1 parent eeb2950 commit 652142d

3 files changed

Lines changed: 32 additions & 10 deletions

File tree

training/modules/data_pipeline.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ class LibriSpeechDataset(Dataset):
320320
"""
321321

322322
def __init__(self, config: TrainingConfiguration, split: str,
323-
is_training: bool = True, data_root: Optional[str] = None, interactive: bool = True):
323+
is_training: bool = True, data_root: Optional[str] = None, interactive: bool = True, no_download: bool = False):
324324
"""
325325
Initialize LibriSpeech dataset
326326
@@ -330,6 +330,7 @@ def __init__(self, config: TrainingConfiguration, split: str,
330330
is_training: Whether this is for training (affects augmentation)
331331
data_root: Root directory for LibriSpeech data
332332
interactive: If False, auto-confirm dataset download/prepare (--yes)
333+
no_download: If True, use only existing prepared data; fail if missing (--no-download)
333334
"""
334335
self.config = config
335336
self.split = split
@@ -348,7 +349,7 @@ def __init__(self, config: TrainingConfiguration, split: str,
348349
self.dataset_manager = DatasetManager()
349350

350351
# Ensure dataset is prepared
351-
if not self.dataset_manager.prepare_datasets([split], interactive=interactive):
352+
if not self.dataset_manager.prepare_datasets([split], interactive=interactive, no_download=no_download):
352353
raise RuntimeError(f"Failed to prepare dataset: {split}")
353354

354355
# Load prepared data file list
@@ -690,7 +691,8 @@ def collate_audio_samples(batch: List[AudioSample]) -> Dict[str, torch.Tensor]:
690691
def create_data_loaders(config: TrainingConfiguration,
691692
data_root: Optional[str] = None,
692693
pin_memory: Optional[bool] = None,
693-
interactive: bool = True) -> Tuple[DataLoader, DataLoader, DataLoader]:
694+
interactive: bool = True,
695+
no_download: bool = False) -> Tuple[DataLoader, DataLoader, DataLoader]:
694696
"""
695697
Create training, validation, and test data loaders
696698
@@ -699,6 +701,7 @@ def create_data_loaders(config: TrainingConfiguration,
699701
data_root: Root directory for data (optional)
700702
pin_memory: Override pin_memory setting (optional)
701703
interactive: If False, auto-confirm dataset download/prepare (--yes)
704+
no_download: If True, use only existing prepared data; fail if any missing (--no-download)
702705
703706
Returns:
704707
Tuple of (train_loader, val_loader, test_loader)
@@ -714,7 +717,8 @@ def create_data_loaders(config: TrainingConfiguration,
714717
split=split,
715718
is_training=True,
716719
data_root=data_root,
717-
interactive=interactive
720+
interactive=interactive,
721+
no_download=no_download
718722
)
719723
train_datasets.append(dataset)
720724

@@ -727,15 +731,17 @@ def create_data_loaders(config: TrainingConfiguration,
727731
split=config.data.val_split,
728732
is_training=False,
729733
data_root=data_root,
730-
interactive=interactive
734+
interactive=interactive,
735+
no_download=no_download
731736
)
732737

733738
test_dataset = LibriSpeechDataset(
734739
config=config,
735740
split=config.data.test_split,
736741
is_training=False,
737742
data_root=data_root,
738-
interactive=interactive
743+
interactive=interactive,
744+
no_download=no_download
739745
)
740746

741747
# Create data loaders

training/modules/dataset_manager.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,13 +115,14 @@ def _check_prepared_data(self, dataset: str) -> bool:
115115
return False
116116
return True
117117

118-
def prepare_datasets(self, datasets: List[str], interactive: bool = True) -> bool:
118+
def prepare_datasets(self, datasets: List[str], interactive: bool = True, no_download: bool = False) -> bool:
119119
"""
120120
Ensure all requested datasets are prepared and ready for training
121121
122122
Args:
123123
datasets: List of dataset names to prepare
124124
interactive: Whether to prompt user for missing datasets
125+
no_download: If True, do not download or prepare; fail if any dataset missing
125126
126127
Returns:
127128
True if all datasets are ready, False if preparation failed
@@ -153,6 +154,14 @@ def prepare_datasets(self, datasets: List[str], interactive: bool = True) -> boo
153154
logger.info(f"Dataset {dataset} is missing completely")
154155
missing_datasets.append(dataset)
155156

157+
if no_download:
158+
if missing_datasets or needs_preparation:
159+
logger.error("--no-download: Refusing to download or prepare. Missing or not ready: %s",
160+
", ".join(missing_datasets + needs_preparation))
161+
return False
162+
logger.info("All datasets are ready for training (no-download mode).")
163+
return True
164+
156165
# Handle completely missing datasets
157166
if missing_datasets:
158167
if interactive:

training/train.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ class TCNTrainer:
4949
clear error handling and progress monitoring.
5050
"""
5151

52-
def __init__(self, config: TrainingConfiguration, resume_from: Optional[str] = None, data_root: Optional[str] = None, interactive: bool = True):
52+
def __init__(self, config: TrainingConfiguration, resume_from: Optional[str] = None, data_root: Optional[str] = None, interactive: bool = True, no_download: bool = False):
5353
"""
5454
Initialize trainer with configuration
5555
@@ -58,6 +58,7 @@ def __init__(self, config: TrainingConfiguration, resume_from: Optional[str] = N
5858
resume_from: Path to checkpoint to resume from (optional)
5959
data_root: Root directory for data (optional)
6060
interactive: If False, auto-confirm dataset download/prepare (--yes)
61+
no_download: If True, use only existing prepared data; fail if any dataset missing (--no-download)
6162
"""
6263
self.config = config
6364

@@ -90,7 +91,7 @@ def __init__(self, config: TrainingConfiguration, resume_from: Optional[str] = N
9091
# Create data loaders
9192
self.logger.info("Setting up data pipeline...")
9293
self.train_loader, self.val_loader, self.test_loader = create_data_loaders(
93-
config, data_root=data_root, pin_memory=self.pin_memory, interactive=interactive)
94+
config, data_root=data_root, pin_memory=self.pin_memory, interactive=interactive, no_download=no_download)
9495

9596
# Create model
9697
self.logger.info("Creating TCN model...")
@@ -625,6 +626,12 @@ def parse_arguments():
625626
help='Non-interactive: auto-confirm download/prepare for all missing datasets (no prompts)'
626627
)
627628

629+
parser.add_argument(
630+
'--no-download',
631+
action='store_true',
632+
help='Use only existing prepared data; exit with error if any requested dataset is missing or not prepared (no download or MFA)'
633+
)
634+
628635
return parser.parse_args()
629636

630637

@@ -648,7 +655,7 @@ def main():
648655

649656
try:
650657
# Create trainer
651-
trainer = TCNTrainer(config, resume_from=args.resume, data_root=args.data_root, interactive=not args.yes)
658+
trainer = TCNTrainer(config, resume_from=args.resume, data_root=args.data_root, interactive=not args.yes, no_download=args.no_download)
652659

653660
if args.test_only:
654661
# Test only mode

0 commit comments

Comments (0)