|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# Johns Hopkins University (authors: Amir Hussein) |
| 3 | +# |
| 4 | +# See ../../../../LICENSE for clarification regarding multiple authors |
| 5 | +# |
| 6 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 7 | +# you may not use this file except in compliance with the License. |
| 8 | +# You may obtain a copy of the License at |
| 9 | +# |
| 10 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | +# |
| 12 | +# Unless required by applicable law or agreed to in writing, software |
| 13 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | +# See the License for the specific language governing permissions and |
| 16 | +# limitations under the License. |
| 17 | + |
| 18 | + |
| 19 | +""" |
| 20 | +This file computes fbank features for the "iwslt-ta" manifests (train, dev, test1). |
| 21 | +It looks for manifests in the directory data/manifests. |
| 22 | +
|
| 23 | +The generated fbank features are saved in data/fbank. |
| 24 | +""" |
| 25 | + |
| 26 | +import logging |
| 27 | +import os |
| 28 | +from pathlib import Path |
| 29 | +import argparse |
| 30 | + |
| 31 | +import torch |
| 32 | +from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter |
| 33 | +from lhotse.recipes.utils import read_manifests_if_cached |
| 34 | + |
| 35 | +from icefall.utils import get_executor |
| 36 | + |
| 37 | +from lhotse.features.kaldifeat import ( |
| 38 | + KaldifeatFbank, |
| 39 | + KaldifeatFbankConfig, |
| 40 | + KaldifeatFrameOptions, |
| 41 | + KaldifeatMelOptions, |
| 42 | +) |
| 43 | + |
| 44 | +# Torch's multithreaded behavior needs to be disabled, or it wastes a lot |
| 45 | +# of CPU and slows things down. Do this outside of main() in case it needs |
| 46 | +# to take effect even when main() is not invoked (e.g. in spawned subprocesses). |
| 47 | +# NOTE(review): no torch.set_num_threads(1) / set_num_interop_threads(1) call follows here. |
| 48 | + |
def get_args() -> argparse.Namespace:
    """Parse the command-line options of this script.

    Returns:
      A namespace with the attributes ``num_splits`` (int, default 20),
      ``start`` (int, default 0), ``stop`` (int, default -1) and
      ``test`` (bool, default False).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--num-splits",
        type=int,
        default=20,
        help="Number of splits for the train set.",
    )
    parser.add_argument(
        "--start",
        type=int,
        default=0,
        help="Start index (0-based, inclusive) of the train set split.",
    )
    parser.add_argument(
        "--stop",
        type=int,
        default=-1,
        help="Stop index (exclusive) of the train set split. "
        "A non-positive value means: process up to the last split.",
    )
    parser.add_argument(
        "--test",
        action="store_true",
        # The feature-extraction code selects ("dev", "test1") when this is set.
        help="If set, only compute features for the dev and test1 sets.",
    )

    return parser.parse_args()
| 76 | + |
| 77 | + |
def compute_fbank_gpu(args):
    """Compute 80-bin fbank features on a CUDA device for the iwslt-ta manifests.

    Manifests are read from ``data/manifests`` and features are written to
    ``data/fbank``. Every partition is resampled to 16 kHz, trimmed to its
    supervisions and filtered to cuts whose duration lies in [0.2, 30].

    The train partition is additionally speed-perturbed (0.9 and 1.1), split
    into at most ``args.num_splits`` chunks, and only the chunks with index in
    ``[args.start, stop)`` are processed; each processed chunk gets its own
    ``cuts_train_<idx>.jsonl.gz`` manifest in ``data/manifests``.
    Other partitions are processed in one go and skipped when
    ``data/fbank/cuts_<partition>.jsonl.gz`` already exists.

    Args:
      args: Parsed command-line namespace providing ``num_splits``, ``start``,
        ``stop`` and ``test``.

    Raises:
      ValueError: if the train partition is requested and ``args.num_splits``
        is not a positive integer.
    """
    if not args.test and args.num_splits < 1:
        # Fail early with a clear message instead of a ZeroDivisionError
        # after the expensive speed-perturbation step.
        raise ValueError(f"--num-splits must be >= 1, got {args.num_splits}")

    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    output_dir.mkdir(parents=True, exist_ok=True)
    num_mel_bins = 80
    # Single source of truth for both the extractor and the resampling step.
    sampling_rate = 16000

    dataset_parts = ("dev", "test1") if args.test else ("train", "test1", "dev")
    manifests = read_manifests_if_cached(
        prefix="iwslt-ta", dataset_parts=dataset_parts, output_dir=src_dir
    )
    assert manifests is not None

    extractor = KaldifeatFbank(
        KaldifeatFbankConfig(
            frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate),
            mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins),
            device="cuda",
        )
    )

    for partition, m in manifests.items():
        # NOTE: train output is written per split (see below), so this check
        # only ever short-circuits the non-train partitions.
        if (output_dir / f"cuts_{partition}.jsonl.gz").is_file():
            logging.info(f"{partition} already exists - skipping.")
            continue
        logging.info(f"Processing {partition}")
        cut_set = CutSet.from_manifests(
            recordings=m["recordings"],
            supervisions=m["supervisions"],
        )

        logging.info("About to split cuts into smaller chunks.")
        logging.info(f"Resampling to {sampling_rate}")
        cut_set = cut_set.resample(sampling_rate)

        cut_set = cut_set.trim_to_supervisions(
            keep_overlapping=False,
            keep_all_channels=False,
        )
        cut_set = cut_set.filter(lambda c: 0.2 <= c.duration <= 30)

        if "train" in partition:
            cut_set = (
                cut_set
                + cut_set.perturb_speed(0.9)
                + cut_set.perturb_speed(1.1)
            )
            # Materialize so that len() is available for the chunk size.
            cut_set = cut_set.to_eager()
            # Ceiling division: floor division would yield an extra trailing
            # chunk (index == num_splits) that the loop below never visits,
            # silently dropping cuts, and a chunk size of 0 for tiny sets.
            chunk_size = max(
                1, (len(cut_set) + args.num_splits - 1) // args.num_splits
            )
            cut_sets = cut_set.split_lazy(
                output_dir=src_dir / f"cuts_train_raw_split{args.num_splits}",
                chunk_size=chunk_size,
            )
            start = args.start
            requested_stop = args.stop if args.stop > 0 else args.num_splits
            # With ceiling division there may be fewer chunks than
            # num_splits, so never index past what was actually produced.
            stop = min(requested_stop, args.num_splits, len(cut_sets))
            num_digits = len(str(args.num_splits))

            for i in range(start, stop):
                idx = f"{i + 1}".zfill(num_digits)
                cuts_train_idx_path = src_dir / f"cuts_train_{idx}.jsonl.gz"
                logging.info(f"Processing train split {i}")
                cs = cut_sets[i].compute_and_store_features_batch(
                    extractor=extractor,
                    storage_path=output_dir / f"feats_train_{idx}",
                    batch_duration=1000,
                    num_workers=8,
                    storage_type=LilcomChunkyWriter,
                    overwrite=True,
                )
                cs.to_file(cuts_train_idx_path)
        else:
            logging.info(f"Processing {partition}")
            cut_set = cut_set.compute_and_store_features_batch(
                extractor=extractor,
                storage_path=output_dir / f"feats_{partition}",
                batch_duration=1000,
                num_workers=10,
                storage_type=LilcomChunkyWriter,
                overwrite=True,
            )
            cut_set.to_file(output_dir / f"cuts_{partition}.jsonl.gz")
| 159 | + |
if __name__ == "__main__":
    # Configure logging first so that every later message carries a
    # timestamp, level and the emitting file/line.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )

    # Parse the command line, then run the feature extraction.
    compute_fbank_gpu(get_args())
0 commit comments