1818
1919import argparse
2020import logging
21- from datetime import datetime
21+ import os
2222from pathlib import Path
2323
2424import torch
3232torch .set_num_interop_threads (1 )
3333
3434
35- def get_parser ():
35+ def get_args ():
3636 parser = argparse .ArgumentParser (
3737 formatter_class = argparse .ArgumentDefaultsHelpFormatter
3838 )
@@ -71,17 +71,15 @@ def get_parser():
7171 default = - 1 ,
7272 help = "Stop processing pieces until this number (exclusive)." ,
7373 )
74- return parser
74+ return parser . parse_args ()
7575
7676
7777def compute_fbank_gigaspeech_splits (args ):
7878 num_splits = args .num_splits
79- output_dir = f "data/fbank/XL_split "
79+ output_dir = "data/fbank/gigaspeech_XL_split "
8080 output_dir = Path (output_dir )
8181 assert output_dir .exists (), f"{ output_dir } does not exist!"
8282
83- num_digits = 8 # num_digits is fixed by lhotse split-lazy
84-
8583 start = args .start
8684 stop = args .stop
8785 if stop < start :
@@ -95,6 +93,7 @@ def compute_fbank_gigaspeech_splits(args):
9593 extractor = KaldifeatFbank (KaldifeatFbankConfig (device = device ))
9694 logging .info (f"device: { device } " )
9795
96+ num_digits = 8 # num_digits is fixed by lhotse split-lazy
9897 for i in range (start , stop ):
9998 idx = f"{ i } " .zfill (num_digits )
10099 logging .info (f"Processing { idx } /{ num_splits } " )
@@ -105,15 +104,22 @@ def compute_fbank_gigaspeech_splits(args):
105104 continue
106105
107106 raw_cuts_path = output_dir / f"gigaspeech_cuts_XL_raw.{ idx } .jsonl.gz"
107+ if not raw_cuts_path .is_file ():
108+ logging .info (f"{ raw_cuts_path } does not exist - skipping it" )
109+ continue
108110
109111 logging .info (f"Loading { raw_cuts_path } " )
110112 cut_set = CutSet .from_file (raw_cuts_path )
111113
112114 logging .info ("Computing features" )
115+ filename = output_dir / f"gigaspeech_feats_XL_{ idx } .lca"
116+ if filename .exists ():
117+ logging .info (f"Removing { filename } " )
118+ os .remove (str (filename ))
113119
114120 cut_set = cut_set .compute_and_store_features_batch (
115121 extractor = extractor ,
116- storage_path = f"{ output_dir } /gigaspeech_feats_ { idx } " ,
122+ storage_path = f"{ output_dir } /gigaspeech_feats_XL_ { idx } " ,
117123 num_workers = args .num_workers ,
118124 batch_duration = args .batch_duration ,
119125 overwrite = True ,
@@ -130,29 +136,10 @@ def compute_fbank_gigaspeech_splits(args):
130136
131137
def main():
    """Script entry point: set up logging, parse arguments, compute features.

    Logging is configured on the root logger at INFO level using a format
    that includes the timestamp, level, source file name and line number.
    Command-line arguments are obtained from ``get_args()`` and passed
    unchanged to ``compute_fbank_gigaspeech_splits()``.
    """
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    # No filename is given, so records go to basicConfig's default stream
    # handler instead of a timestamped per-run log file.
    logging.basicConfig(format=formatter, level=logging.INFO)

    args = get_args()
    compute_fbank_gigaspeech_splits(args)
157144
158145
0 commit comments