k2-fsa
diff --git a/‎egs/iwslt22_ta/ASR/README.md‎
Lines changed: 26 additions & 0 deletions b/‎egs/iwslt22_ta/ASR/README.md‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎egs/iwslt22_ta/ASR/RESULTS.md‎
Lines changed: 110 additions & 0 deletions b/‎egs/iwslt22_ta/ASR/RESULTS.md‎
Lines changed: 110 additions & 0 deletions
diff --git a/‎egs/iwslt22_ta/ASR/local/__init__.py‎ b/‎egs/iwslt22_ta/ASR/local/__init__.py‎
diff --git a/‎egs/iwslt22_ta/ASR/local/cer.py‎
Lines changed: 1 addition & 0 deletions b/‎egs/iwslt22_ta/ASR/local/cer.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎egs/iwslt22_ta/ASR/local/compute_fbank_gpu.py‎
Lines changed: 168 additions & 0 deletions b/‎egs/iwslt22_ta/ASR/local/compute_fbank_gpu.py‎
Lines changed: 168 additions & 0 deletions
diff --git a/‎egs/iwslt22_ta/ASR/local/compute_fbank_musan.py‎
Lines changed: 1 addition & 0 deletions b/‎egs/iwslt22_ta/ASR/local/compute_fbank_musan.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎egs/iwslt22_ta/ASR/local/cuts_validate.py‎
Lines changed: 1 addition & 0 deletions b/‎egs/iwslt22_ta/ASR/local/cuts_validate.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎egs/iwslt22_ta/ASR/local/display_manifest_statistics.py‎
Lines changed: 1 addition & 0 deletions b/‎egs/iwslt22_ta/ASR/local/display_manifest_statistics.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎egs/iwslt22_ta/ASR/local/filter_cuts.py‎
Lines changed: 1 addition & 0 deletions b/‎egs/iwslt22_ta/ASR/local/filter_cuts.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎egs/iwslt22_ta/ASR/local/generate_unique_lexicon.py‎
Lines changed: 1 addition & 0 deletions b/‎egs/iwslt22_ta/ASR/local/generate_unique_lexicon.py‎
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1,26 @@
+# IWSLT_Ta
+
+The IWSLT Tunisian dataset is a 3-way parallel dataset consisting of approximately 160 hours
+and 200,000 lines of aligned audio, Tunisian transcripts, and English translations. This dataset
+comprises conversational telephone speech recorded at a sampling rate of 8kHz. The train, dev,
+and test1 splits of the iwslt2022 shared task correspond to catalog number LDC2022E01. Please
+note that access to this data requires an LDC subscription from your institution. To obtain this
+dataset, you should download the predefined splits by running the following command:
+`git clone https://github.com/kevinduh/iwslt22-dialect.git`. For more detailed information about
+the shared task, please refer to the task paper available at this link:
+https://aclanthology.org/2022.iwslt-1.10/.
+
+## Stateless Pruned Transducer Performance Record (after 20 epochs)
+
+|    Decoding method                 |     dev WER     |    test WER    | comment                                  |
+|------------------------------------|------------|------------|------------------------------------------|
+| modified beam search               | 47.6      | 51.2       | --epoch 20, --avg 10  |
+
+## Zipformer Performance Record (after 20 epochs)
+
+|    Decoding method                 |     dev WER     |    test WER    | comment                                  |
+|------------------------------------|------------|------------|------------------------------------------|
+| modified beam search               | 40.8      | 44.4        | --epoch 20, --avg 10  |
+
+
+See [RESULTS](RESULTS.md) for details.
@@ -0,0 +1,110 @@
+# Results
+
+
+
+### IWSLT Tunisian training results (Stateless Pruned Transducer)
+
+#### 2023-06-01
+
+
+|    Decoding method                 |     dev WER     |    test WER    | comment                                  |
+|------------------------------------|------------|------------|------------------------------------------|
+| modified beam search               | 47.6      | 51.2       | --epoch 20, --avg 13  |
+
+The training command for reproducing is given below:
+
+```
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+
+  
+./pruned_transducer_stateless5/train.py \
+  --world-size 4 \
+  --num-epochs 20 \
+  --start-epoch 1 \
+  --exp-dir pruned_transducer_stateless5/exp \
+  --max-duration 300 \
+  --num-buckets 50
+```
+
+The tensorboard training log can be found at
+https://tensorboard.dev/experiment/yBijWJSPSGuBqMwTZ509lA/
+
+The decoding command is:
+```
+for method in modified_beam_search; do
+    ./pruned_transducer_stateless5/decode.py \
+    --epoch 15 \
+    --beam-size 20 \
+    --avg 5 \
+    --exp-dir ./pruned_transducer_stateless5/exp \
+    --max-duration 400 \
+    --decoding-method modified_beam_search \
+    --max-sym-per-frame 1 \
+    --num-encoder-layers 12 \
+    --dim-feedforward 1024 \
+    --nhead 8 \
+    --encoder-dim 256 \
+    --decoder-dim 256 \
+    --joiner-dim 256 \
+    --use-averaged-model true
+done
+```
+
+### IWSLT Tunisian training results  (Zipformer)
+
+#### 2023-06-01
+
+You can find a pretrained model, training logs, decoding logs, and decoding results at: 
+<https://huggingface.co/AmirHussein/zipformer-iwslt22-Ta>
+
+
+
+|    Decoding method                 |     dev WER     |    test WER    | comment                                  |
+|------------------------------------|------------|------------|------------------------------------------|
+| modified beam search               | 40.8      | 44.1       | --epoch 20, --avg 13  |
+
+To reproduce the above result, use the following commands for training:
+
+Note: the model was trained on V-100 32GB GPUs.
+
+```
+export CUDA_VISIBLE_DEVICES="0,1"
+./zipformer/train.py \
+  --world-size 4 \
+  --num-epochs 20 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir zipformer/exp \
+  --causal 0 \
+  --num-encoder-layers 2,2,2,2,2,2 \
+  --feedforward-dim 512,768,1024,1536,1024,768 \
+  --encoder-dim 192,256,384,512,384,256 \
+  --encoder-unmasked-dim 192,192,256,256,256,192 \
+  --max-duration 800 \
+  --prune-range 10
+
+```
+
+The decoding command is:
+
+```
+for method in modified_beam_search; do
+  ./zipformer/decode.py \
+  --epoch 20 \
+  --beam-size 20 \
+  --avg 13 \
+  --exp-dir ./zipformer/exp \
+  --max-duration 800 \
+  --decoding-method $method \
+ 	--num-encoder-layers 2,2,2,2,2,2 \
+ 	--feedforward-dim 512,768,1024,1536,1024,768 \
+ 	--encoder-dim 192,256,384,512,384,256 \
+ 	--encoder-unmasked-dim 192,192,256,256,256,192 \
+  --use-averaged-model true
+ done
+```
+
+
+
+
@@ -0,0 +1 @@
+../../ST/local/cer.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+# Johns Hopkins University  (authors: Amir Hussein)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file computes fbank features of the IWSLT22 Tunisian (iwslt-ta) dataset
+on the GPU. It looks for manifests in the directory data/manifests.
+
+The generated fbank features are saved in data/fbank.
+"""
+
+import logging
+import os
+from pathlib import Path
+import argparse
+
+import torch
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
+from lhotse.recipes.utils import read_manifests_if_cached
+
+from icefall.utils import get_executor
+
+from lhotse.features.kaldifeat import (
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+    KaldifeatFrameOptions,
+    KaldifeatMelOptions,
+)
+
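+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slows things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+# NOTE(review): no torch.set_num_threads()/torch.set_num_interop_threads()
+# call follows this comment in this file, so torch threading is currently
+# left at its defaults - confirm whether the calls were dropped by mistake.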
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--num-splits",
+        type=int,
+        default=20,
+        help="Number of splits for the train set.",
+    )
+    parser.add_argument(
+        "--start",
+        type=int,
+        default=0,
+        help="Start index of the train set split.",
+    )
+    parser.add_argument(
+        "--stop",
+        type=int,
+        default=-1,
+        help="Stop index of the train set split.",
+    )
+    parser.add_argument(
+        "--test",
+        action="store_true",
+        help="If set, only compute features for the dev and val set.",
+    )
+
+    return parser.parse_args()
+
+
+def compute_fbank_gpu(args):
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+    output_dir.mkdir(parents=True, exist_ok=True)
+    num_jobs = os.cpu_count()
+    num_mel_bins = 80
+    sampling_rate = 16000
+    sr = 16000
+
+    dataset_parts = ("dev", "test1") if args.test else ("train", "test1", "dev")
+    manifests = read_manifests_if_cached(
+        prefix="iwslt-ta", dataset_parts=dataset_parts, output_dir=src_dir
+    )
+    assert manifests is not None
+
+    extractor = KaldifeatFbank(
+        KaldifeatFbankConfig(
+            frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate),
+            mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins),
+            device="cuda",
+        )
+    )
+
+    for partition, m in manifests.items():
+        if (output_dir / f"cuts_{partition}.jsonl.gz").is_file():
+            logging.info(f"{partition} already exists - skipping.")
+            continue
+        logging.info(f"Processing {partition}")
+        cut_set = CutSet.from_manifests(
+            recordings=m["recordings"],
+            supervisions=m["supervisions"],
+        )
+
+        logging.info("About to split cuts into smaller chunks.")
+        if sr != None:
+            logging.info(f"Resampling to {sr}")
+            cut_set = cut_set.resample(sr)
+
+        cut_set = cut_set.trim_to_supervisions(
+                    keep_overlapping=False, 
+                    keep_all_channels=False)
+        cut_set = cut_set.filter(lambda c: c.duration >= .2 and c.duration <= 30)
+        if "train" in partition:
+            cut_set = (
+                cut_set
+                + cut_set.perturb_speed(0.9)
+                + cut_set.perturb_speed(1.1)
+            )
+            cut_set = cut_set.to_eager()
+            chunk_size = len(cut_set) // args.num_splits
+            cut_sets = cut_set.split_lazy(
+                output_dir=src_dir / f"cuts_train_raw_split{args.num_splits}",
+                chunk_size=chunk_size,)
+            start = args.start
+            stop = min(args.stop, args.num_splits) if args.stop > 0 else args.num_splits
+            num_digits = len(str(args.num_splits))
+
+            for i in range(start, stop):
+                idx = f"{i + 1}".zfill(num_digits)
+                cuts_train_idx_path = src_dir / f"cuts_train_{idx}.jsonl.gz"
+                logging.info(f"Processing train split {i}")
+                cs = cut_sets[i].compute_and_store_features_batch(
+                    extractor=extractor,
+                    storage_path=output_dir / f"feats_train_{idx}",
+                    batch_duration=1000,
+                    num_workers=8,
+                    storage_type=LilcomChunkyWriter,
+                    overwrite=True,
+                )
+                cs.to_file(cuts_train_idx_path)
+        else:
+            logging.info(f"Processing {partition}")
+            cut_set = cut_set.compute_and_store_features_batch(
+                extractor=extractor,
+                storage_path=output_dir / f"feats_{partition}",
+                batch_duration=1000,
+                num_workers=10,
+                storage_type=LilcomChunkyWriter,
+                overwrite=True,
+            )
+            cut_set.to_file(output_dir / f"cuts_{partition}.jsonl.gz")
+
+if __name__ == "__main__":
+    formatter = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    args = get_args()
+
+    compute_fbank_gpu(args)
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/compute_fbank_musan.py
@@ -0,0 +1 @@
+../../ST/local/cuts_validate.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/display_manifest_statistics.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/filter_cuts.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/generate_unique_lexicon.py
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+../../../librispeech/ASR/local/compute_fbank_musan.py`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+../../../librispeech/ASR/local/display_manifest_statistics.py`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+../../../librispeech/ASR/local/filter_cuts.py`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+../../../librispeech/ASR/local/generate_unique_lexicon.py`