k2-fsa
diff --git a/egs/gigaspeech/ASR/local/compile_lg.py b/egs/gigaspeech/ASR/local/compile_lg.py
Lines changed: 1 addition & 0 deletions
diff --git a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech.py b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech.py
Lines changed: 9 additions & 1 deletion
diff --git a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py
Lines changed: 15 additions & 28 deletions
diff --git a/egs/gigaspeech/ASR/local/convert_transcript_words_to_tokens.py b/egs/gigaspeech/ASR/local/convert_transcript_words_to_tokens.py
Lines changed: 0 additions & 1 deletion
diff --git a/egs/gigaspeech/ASR/local/preprocess_gigaspeech.py b/egs/gigaspeech/ASR/local/preprocess_gigaspeech.py
Lines changed: 7 additions & 30 deletions
diff --git a/egs/gigaspeech/ASR/local/validate_bpe_lexicon.py b/egs/gigaspeech/ASR/local/validate_bpe_lexicon.py
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/compile_lg.py
@@ -32,13 +32,21 @@
 
 def compute_fbank_gigaspeech():
     in_out_dir = Path("data/fbank")
+
     # number of workers in dataloader
     num_workers = 20
 
     # number of seconds in a batch
     batch_duration = 1000
 
-    subsets = ("L", "M", "S", "XS", "DEV", "TEST")
+    subsets = (
+        "DEV",
+        "TEST",
+        # "L",
+        # "M",
+        # "S",
+        # "XS",
+    )
 
     device = torch.device("cpu")
     if torch.cuda.is_available():
 
@@ -18,7 +18,7 @@
 
 import argparse
 import logging
-from datetime import datetime
+import os
 from pathlib import Path
 
 import torch
@@ -32,7 +32,7 @@
 torch.set_num_interop_threads(1)
 
 
-def get_parser():
+def get_args():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
@@ -71,17 +71,15 @@ def get_parser():
         default=-1,
         help="Stop processing pieces until this number (exclusive).",
     )
-    return parser
+    return parser.parse_args()
 
 
 def compute_fbank_gigaspeech_splits(args):
     num_splits = args.num_splits
-    output_dir = f"data/fbank/XL_split"
+    output_dir = "data/fbank/gigaspeech_XL_split"
     output_dir = Path(output_dir)
     assert output_dir.exists(), f"{output_dir} does not exist!"
 
-    num_digits = 8  # num_digits is fixed by lhotse split-lazy
-
     start = args.start
     stop = args.stop
     if stop < start:
@@ -95,6 +93,7 @@ def compute_fbank_gigaspeech_splits(args):
     extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
     logging.info(f"device: {device}")
 
+    num_digits = 8  # num_digits is fixed by lhotse split-lazy
     for i in range(start, stop):
         idx = f"{i}".zfill(num_digits)
         logging.info(f"Processing {idx}/{num_splits}")
@@ -105,15 +104,22 @@ def compute_fbank_gigaspeech_splits(args):
             continue
 
         raw_cuts_path = output_dir / f"gigaspeech_cuts_XL_raw.{idx}.jsonl.gz"
+        if not raw_cuts_path.is_file():
+            logging.info(f"{raw_cuts_path} does not exist - skipping it")
+            continue
 
         logging.info(f"Loading {raw_cuts_path}")
         cut_set = CutSet.from_file(raw_cuts_path)
 
         logging.info("Computing features")
+        filename = output_dir / f"gigaspeech_feats_XL_{idx}.lca"
+        if filename.exists():
+            logging.info(f"Removing {filename}")
+            os.remove(str(filename))
 
         cut_set = cut_set.compute_and_store_features_batch(
             extractor=extractor,
-            storage_path=f"{output_dir}/gigaspeech_feats_{idx}",
+            storage_path=f"{output_dir}/gigaspeech_feats_XL_{idx}",
             num_workers=args.num_workers,
             batch_duration=args.batch_duration,
             overwrite=True,
@@ -130,29 +136,10 @@ def compute_fbank_gigaspeech_splits(args):
 
 
 def main():
-    now = datetime.now()
-    date_time = now.strftime("%Y-%m-%d-%H-%M-%S")
-
-    log_filename = "log-compute_fbank_gigaspeech_splits"
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-    log_filename = f"{log_filename}-{date_time}"
-
-    logging.basicConfig(
-        filename=log_filename,
-        format=formatter,
-        level=logging.INFO,
-        filemode="w",
-    )
-
-    console = logging.StreamHandler()
-    console.setLevel(logging.INFO)
-    console.setFormatter(logging.Formatter(formatter))
-    logging.getLogger("").addHandler(console)
-
-    parser = get_parser()
-    args = parser.parse_args()
-    logging.info(vars(args))
+    logging.basicConfig(format=formatter, level=logging.INFO)
 
+    args = get_args()
     compute_fbank_gigaspeech_splits(args)
 
 
 
@@ -30,18 +30,6 @@
 # https://github.com/SpeechColab/GigaSpeech/blob/main/toolkits/kaldi/gigaspeech_data_prep.sh
 
 
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--perturb-speed",
-        type=str2bool,
-        default=False,
-        help="Whether to use speed perturbation.",
-    )
-
-    return parser.parse_args()
-
-
 def normalize_text(
     utt: str,
     punct_pattern=re.compile(r"<(COMMA|PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>"),
@@ -57,7 +45,7 @@ def has_no_oov(
     return oov_pattern.search(sup.text) is None
 
 
-def preprocess_giga_speech(args):
+def preprocess_gigaspeech():
     src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
     output_dir.mkdir(exist_ok=True)
@@ -66,10 +54,10 @@ def preprocess_giga_speech(args):
         "DEV",
         "TEST",
         "XL",
-        "L",
-        "M",
-        "S",
-        "XS",
+        # "L",
+        # "M",
+        # "S",
+        # "XS",
     )
 
     logging.info("Loading manifest (may take 4 minutes)")
@@ -110,17 +98,7 @@ def preprocess_giga_speech(args):
             recordings=m["recordings"],
             supervisions=m["supervisions"],
         )
-        # Run data augmentation that needs to be done in the
-        # time domain.
-        if partition not in ["DEV", "TEST"]:
-            if args.perturb_speed:
-                logging.info(
-                    f"Speed perturb for {partition} with factors 0.9 and 1.1 "
-                    "(Perturbing may take 8 minutes and saving may take 20 minutes)"
-                )
-                cut_set = (
-                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
-                )
+
         logging.info(f"Saving to {raw_cuts_path}")
         cut_set.to_file(raw_cuts_path)
 
@@ -129,8 +107,7 @@ def main():
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
     logging.basicConfig(format=formatter, level=logging.INFO)
 
-    args = get_args()
-    preprocess_giga_speech(args)
+    preprocess_gigaspeech()
 
 
 if __name__ == "__main__":
 
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/validate_bpe_lexicon.py
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+../../../librispeech/ASR/local/compile_lg.py`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+../../../librispeech/ASR/local/validate_bpe_lexicon.py`