Skip to content

Commit 89728dd

Browse files
authored
Refactor data preparation for GigaSpeech recipe (#1986)
1 parent 9293edc commit 89728dd

13 files changed

Lines changed: 246 additions & 244 deletions
Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
../../../librispeech/ASR/local/compile_lg.py

egs/gigaspeech/ASR/local/compute_fbank_gigaspeech.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,13 +32,21 @@
3232

3333
def compute_fbank_gigaspeech():
3434
in_out_dir = Path("data/fbank")
35+
3536
# number of workers in dataloader
3637
num_workers = 20
3738

3839
# number of seconds in a batch
3940
batch_duration = 1000
4041

41-
subsets = ("L", "M", "S", "XS", "DEV", "TEST")
42+
subsets = (
43+
"DEV",
44+
"TEST",
45+
# "L",
46+
# "M",
47+
# "S",
48+
# "XS",
49+
)
4250

4351
device = torch.device("cpu")
4452
if torch.cuda.is_available():

egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py

Lines changed: 15 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
import argparse
2020
import logging
21-
from datetime import datetime
21+
import os
2222
from pathlib import Path
2323

2424
import torch
@@ -32,7 +32,7 @@
3232
torch.set_num_interop_threads(1)
3333

3434

35-
def get_parser():
35+
def get_args():
3636
parser = argparse.ArgumentParser(
3737
formatter_class=argparse.ArgumentDefaultsHelpFormatter
3838
)
@@ -71,17 +71,15 @@ def get_parser():
7171
default=-1,
7272
help="Stop processing pieces until this number (exclusive).",
7373
)
74-
return parser
74+
return parser.parse_args()
7575

7676

7777
def compute_fbank_gigaspeech_splits(args):
7878
num_splits = args.num_splits
79-
output_dir = f"data/fbank/XL_split"
79+
output_dir = "data/fbank/gigaspeech_XL_split"
8080
output_dir = Path(output_dir)
8181
assert output_dir.exists(), f"{output_dir} does not exist!"
8282

83-
num_digits = 8 # num_digits is fixed by lhotse split-lazy
84-
8583
start = args.start
8684
stop = args.stop
8785
if stop < start:
@@ -95,6 +93,7 @@ def compute_fbank_gigaspeech_splits(args):
9593
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
9694
logging.info(f"device: {device}")
9795

96+
num_digits = 8 # num_digits is fixed by lhotse split-lazy
9897
for i in range(start, stop):
9998
idx = f"{i}".zfill(num_digits)
10099
logging.info(f"Processing {idx}/{num_splits}")
@@ -105,15 +104,22 @@ def compute_fbank_gigaspeech_splits(args):
105104
continue
106105

107106
raw_cuts_path = output_dir / f"gigaspeech_cuts_XL_raw.{idx}.jsonl.gz"
107+
if not raw_cuts_path.is_file():
108+
logging.info(f"{raw_cuts_path} does not exist - skipping it")
109+
continue
108110

109111
logging.info(f"Loading {raw_cuts_path}")
110112
cut_set = CutSet.from_file(raw_cuts_path)
111113

112114
logging.info("Computing features")
115+
filename = output_dir / f"gigaspeech_feats_XL_{idx}.lca"
116+
if filename.exists():
117+
logging.info(f"Removing {filename}")
118+
os.remove(str(filename))
113119

114120
cut_set = cut_set.compute_and_store_features_batch(
115121
extractor=extractor,
116-
storage_path=f"{output_dir}/gigaspeech_feats_{idx}",
122+
storage_path=f"{output_dir}/gigaspeech_feats_XL_{idx}",
117123
num_workers=args.num_workers,
118124
batch_duration=args.batch_duration,
119125
overwrite=True,
@@ -130,29 +136,10 @@ def compute_fbank_gigaspeech_splits(args):
130136

131137

132138
def main():
133-
now = datetime.now()
134-
date_time = now.strftime("%Y-%m-%d-%H-%M-%S")
135-
136-
log_filename = "log-compute_fbank_gigaspeech_splits"
137139
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
138-
log_filename = f"{log_filename}-{date_time}"
139-
140-
logging.basicConfig(
141-
filename=log_filename,
142-
format=formatter,
143-
level=logging.INFO,
144-
filemode="w",
145-
)
146-
147-
console = logging.StreamHandler()
148-
console.setLevel(logging.INFO)
149-
console.setFormatter(logging.Formatter(formatter))
150-
logging.getLogger("").addHandler(console)
151-
152-
parser = get_parser()
153-
args = parser.parse_args()
154-
logging.info(vars(args))
140+
logging.basicConfig(format=formatter, level=logging.INFO)
155141

142+
args = get_args()
156143
compute_fbank_gigaspeech_splits(args)
157144

158145

egs/gigaspeech/ASR/local/convert_transcript_words_to_tokens.py

Lines changed: 0 additions & 1 deletion
This file was deleted.

egs/gigaspeech/ASR/local/preprocess_gigaspeech.py

Lines changed: 7 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -30,18 +30,6 @@
3030
# https://github.com/SpeechColab/GigaSpeech/blob/main/toolkits/kaldi/gigaspeech_data_prep.sh
3131

3232

33-
def get_args():
34-
parser = argparse.ArgumentParser()
35-
parser.add_argument(
36-
"--perturb-speed",
37-
type=str2bool,
38-
default=False,
39-
help="Whether to use speed perturbation.",
40-
)
41-
42-
return parser.parse_args()
43-
44-
4533
def normalize_text(
4634
utt: str,
4735
punct_pattern=re.compile(r"<(COMMA|PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"),
@@ -57,7 +45,7 @@ def has_no_oov(
5745
return oov_pattern.search(sup.text) is None
5846

5947

60-
def preprocess_giga_speech(args):
48+
def preprocess_gigaspeech():
6149
src_dir = Path("data/manifests")
6250
output_dir = Path("data/fbank")
6351
output_dir.mkdir(exist_ok=True)
@@ -66,10 +54,10 @@ def preprocess_giga_speech(args):
6654
"DEV",
6755
"TEST",
6856
"XL",
69-
"L",
70-
"M",
71-
"S",
72-
"XS",
57+
# "L",
58+
# "M",
59+
# "S",
60+
# "XS",
7361
)
7462

7563
logging.info("Loading manifest (may take 4 minutes)")
@@ -110,17 +98,7 @@ def preprocess_giga_speech(args):
11098
recordings=m["recordings"],
11199
supervisions=m["supervisions"],
112100
)
113-
# Run data augmentation that needs to be done in the
114-
# time domain.
115-
if partition not in ["DEV", "TEST"]:
116-
if args.perturb_speed:
117-
logging.info(
118-
f"Speed perturb for {partition} with factors 0.9 and 1.1 "
119-
"(Perturbing may take 8 minutes and saving may take 20 minutes)"
120-
)
121-
cut_set = (
122-
cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
123-
)
101+
124102
logging.info(f"Saving to {raw_cuts_path}")
125103
cut_set.to_file(raw_cuts_path)
126104

@@ -129,8 +107,7 @@ def main():
129107
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
130108
logging.basicConfig(format=formatter, level=logging.INFO)
131109

132-
args = get_args()
133-
preprocess_giga_speech(args)
110+
preprocess_gigaspeech()
134111

135112

136113
if __name__ == "__main__":
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../librispeech/ASR/local/validate_bpe_lexicon.py

0 commit comments

Comments
 (0)