Skip to content

Commit 652142d

Browse files
author
Alex J Lennon
committed
Add --no-download flag: run training without download/prepare steps
Made-with: Cursor
1 parent eeb2950 commit 652142d

3 files changed

Lines changed: 32 additions & 10 deletions

File tree

training/modules/data_pipeline.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ class LibriSpeechDataset(Dataset):
320320
"""
321321

322322
def __init__(self, config: TrainingConfiguration, split: str,
323-
is_training: bool = True, data_root: Optional[str] = None, interactive: bool = True):
323+
is_training: bool = True, data_root: Optional[str] = None, interactive: bool = True, no_download: bool = False):
324324
"""
325325
Initialize LibriSpeech dataset
326326
@@ -330,6 +330,7 @@ def __init__(self, config: TrainingConfiguration, split: str,
330330
is_training: Whether this is for training (affects augmentation)
331331
data_root: Root directory for LibriSpeech data
332332
interactive: If False, auto-confirm dataset download/prepare (--yes)
333+
no_download: If True, use only existing prepared data; fail if missing (--no-download)
333334
"""
334335
self.config = config
335336
self.split = split
@@ -348,7 +349,7 @@ def __init__(self, config: TrainingConfiguration, split: str,
348349
self.dataset_manager = DatasetManager()
349350

350351
# Ensure dataset is prepared
351-
if not self.dataset_manager.prepare_datasets([split], interactive=interactive):
352+
if not self.dataset_manager.prepare_datasets([split], interactive=interactive, no_download=no_download):
352353
raise RuntimeError(f"Failed to prepare dataset: {split}")
353354

354355
# Load prepared data file list
@@ -690,7 +691,8 @@ def collate_audio_samples(batch: List[AudioSample]) -> Dict[str, torch.Tensor]:
690691
def create_data_loaders(config: TrainingConfiguration,
691692
data_root: Optional[str] = None,
692693
pin_memory: Optional[bool] = None,
693-
interactive: bool = True) -> Tuple[DataLoader, DataLoader, DataLoader]:
694+
interactive: bool = True,
695+
no_download: bool = False) -> Tuple[DataLoader, DataLoader, DataLoader]:
694696
"""
695697
Create training, validation, and test data loaders
696698
@@ -699,6 +701,7 @@ def create_data_loaders(config: TrainingConfiguration,
699701
data_root: Root directory for data (optional)
700702
pin_memory: Override pin_memory setting (optional)
701703
interactive: If False, auto-confirm dataset download/prepare (--yes)
704+
no_download: If True, use only existing prepared data; fail if any missing (--no-download)
702705
703706
Returns:
704707
Tuple of (train_loader, val_loader, test_loader)
@@ -714,7 +717,8 @@ def create_data_loaders(config: TrainingConfiguration,
714717
split=split,
715718
is_training=True,
716719
data_root=data_root,
717-
interactive=interactive
720+
interactive=interactive,
721+
no_download=no_download
718722
)
719723
train_datasets.append(dataset)
720724

@@ -727,15 +731,17 @@ def create_data_loaders(config: TrainingConfiguration,
727731
split=config.data.val_split,
728732
is_training=False,
729733
data_root=data_root,
730-
interactive=interactive
734+
interactive=interactive,
735+
no_download=no_download
731736
)
732737

733738
test_dataset = LibriSpeechDataset(
734739
config=config,
735740
split=config.data.test_split,
736741
is_training=False,
737742
data_root=data_root,
738-
interactive=interactive
743+
interactive=interactive,
744+
no_download=no_download
739745
)
740746

741747
# Create data loaders

training/modules/dataset_manager.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,13 +115,14 @@ def _check_prepared_data(self, dataset: str) -> bool:
115115
return False
116116
return True
117117

118-
def prepare_datasets(self, datasets: List[str], interactive: bool = True) -> bool:
118+
def prepare_datasets(self, datasets: List[str], interactive: bool = True, no_download: bool = False) -> bool:
119119
"""
120120
Ensure all requested datasets are prepared and ready for training
121121
122122
Args:
123123
datasets: List of dataset names to prepare
124124
interactive: Whether to prompt user for missing datasets
125+
no_download: If True, do not download or prepare; fail if any dataset missing
125126
126127
Returns:
127128
True if all datasets are ready, False if preparation failed
@@ -153,6 +154,14 @@ def prepare_datasets(self, datasets: List[str], interactive: bool = True) -> boo
153154
logger.info(f"Dataset {dataset} is missing completely")
154155
missing_datasets.append(dataset)
155156

157+
if no_download:
158+
if missing_datasets or needs_preparation:
159+
logger.error("--no-download: Refusing to download or prepare. Missing or not ready: %s",
160+
", ".join(missing_datasets + needs_preparation))
161+
return False
162+
logger.info("All datasets are ready for training (no-download mode).")
163+
return True
164+
156165
# Handle completely missing datasets
157166
if missing_datasets:
158167
if interactive:

training/train.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ class TCNTrainer:
4949
clear error handling and progress monitoring.
5050
"""
5151

52-
def __init__(self, config: TrainingConfiguration, resume_from: Optional[str] = None, data_root: Optional[str] = None, interactive: bool = True):
52+
def __init__(self, config: TrainingConfiguration, resume_from: Optional[str] = None, data_root: Optional[str] = None, interactive: bool = True, no_download: bool = False):
5353
"""
5454
Initialize trainer with configuration
5555
@@ -58,6 +58,7 @@ def __init__(self, config: TrainingConfiguration, resume_from: Optional[str] = N
5858
resume_from: Path to checkpoint to resume from (optional)
5959
data_root: Root directory for data (optional)
6060
interactive: If False, auto-confirm dataset download/prepare (--yes)
61+
no_download: If True, use only existing prepared data; fail if any dataset missing (--no-download)
6162
"""
6263
self.config = config
6364

@@ -90,7 +91,7 @@ def __init__(self, config: TrainingConfiguration, resume_from: Optional[str] = N
9091
# Create data loaders
9192
self.logger.info("Setting up data pipeline...")
9293
self.train_loader, self.val_loader, self.test_loader = create_data_loaders(
93-
config, data_root=data_root, pin_memory=self.pin_memory, interactive=interactive)
94+
config, data_root=data_root, pin_memory=self.pin_memory, interactive=interactive, no_download=no_download)
9495

9596
# Create model
9697
self.logger.info("Creating TCN model...")
@@ -625,6 +626,12 @@ def parse_arguments():
625626
help='Non-interactive: auto-confirm download/prepare for all missing datasets (no prompts)'
626627
)
627628

629+
parser.add_argument(
630+
'--no-download',
631+
action='store_true',
632+
help='Use only existing prepared data; exit with error if any requested dataset is missing or not prepared (no download or MFA)'
633+
)
634+
628635
return parser.parse_args()
629636

630637

@@ -648,7 +655,7 @@ def main():
648655

649656
try:
650657
# Create trainer
651-
trainer = TCNTrainer(config, resume_from=args.resume, data_root=args.data_root, interactive=not args.yes)
658+
trainer = TCNTrainer(config, resume_from=args.resume, data_root=args.data_root, interactive=not args.yes, no_download=args.no_download)
652659

653660
if args.test_only:
654661
# Test only mode

0 commit comments

Comments (0)