Skip to content

Commit 73a5e7d

Browse files
naymaraq and naymaraq authored
Remove tail margin algorithms from all ASR pipelines (#15605)
* Clean up tail padding algorithms
  Signed-off-by: naymaraq <dkaramyan@nvidia.com>
* Remove tail margin algorithms entirely
  Signed-off-by: naymaraq <dkaramyan@nvidia.com>
* Apply isort and black reformatting
  Signed-off-by: naymaraq <naymaraq@users.noreply.github.com>
---------
Signed-off-by: naymaraq <dkaramyan@nvidia.com>
Co-authored-by: naymaraq <dkaramyan@nvidia.com>
1 parent 06d60fc commit 73a5e7d

4 files changed

Lines changed: 14 additions & 28 deletions

File tree

nemo/collections/asr/inference/pipelines/buffered_ctc_pipeline.py

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -100,8 +100,6 @@ def init_parameters(self, cfg: DictConfig) -> None:
100100
self.right_padding = self.padding_mode is FeatureBufferPaddingMode.RIGHT
101101
self.return_tail_result = cfg.return_tail_result
102102

103-
# Keep small amount of extra padding
104-
self.tail_padding_in_samples = max(int(self.chunk_size * self.sample_rate * 0.45), 6400)
105103
self.zero_log_probs = self.init_zero_log_probs() if self.right_padding else None
106104

107105
def init_endpointer(self) -> None:
@@ -238,10 +236,10 @@ def get_logprobs_given_raw_signals(
238236
buffers.append(buffer.unsqueeze_(0))
239237

240238
# Only final frames have right padding
241-
# Keep some amount of extra padding to avoid the performance degradation
242-
right_paddings = torch.tensor(
243-
[frame.size - frame.valid_size - self.tail_padding_in_samples for frame in frames], device=self.device
244-
).clamp(min=0)
239+
# Calculate right paddings
240+
right_paddings = torch.tensor([frame.size - frame.valid_size for frame in frames], device=self.device).clamp(
241+
min=0
242+
)
245243

246244
# Create and adjust the buffer lens
247245
buffer_lens = torch.tensor([buffers[0].size(1)] * len(buffers), device=self.device)
@@ -441,6 +439,5 @@ def get_request_generator(self) -> ContinuousBatchedRequestStreamer:
441439
device=self.device,
442440
pad_last_frame=True,
443441
right_pad_features=self.right_padding,
444-
tail_padding_in_samples=self.tail_padding_in_samples,
445442
)
446443
return request_generator

nemo/collections/asr/inference/pipelines/buffered_rnnt_pipeline.py

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -132,8 +132,6 @@ def init_parameters(self, cfg: DictConfig) -> None:
132132
self.return_tail_result = cfg.return_tail_result
133133
self.tokens_to_move = self.punctuation_ids.union(self.language_token_ids)
134134

135-
# Keep small amount of extra padding
136-
self.tail_padding_in_samples = max(int(self.chunk_size * self.sample_rate * 0.45), 6400)
137135
self.zero_encoded = self.init_zero_enc() if self.right_padding else None
138136

139137
def init_endpointer(self) -> None:
@@ -313,10 +311,10 @@ def encode_raw_signals(
313311
buffers.append(buffer.unsqueeze_(0))
314312

315313
# Only final frames have right padding
316-
# Keep some amount of extra padding to avoid the performance degradation
317-
right_paddings = torch.tensor(
318-
[frame.size - frame.valid_size - self.tail_padding_in_samples for frame in frames], device=self.device
319-
).clamp(min=0)
314+
# Calculate right paddings
315+
right_paddings = torch.tensor([frame.size - frame.valid_size for frame in frames], device=self.device).clamp(
316+
min=0
317+
)
320318

321319
# Create and adjust the buffer lens
322320
buffer_lens = torch.tensor([buffers[0].size(1)] * len(buffers), device=self.device)
@@ -807,6 +805,5 @@ def get_request_generator(self) -> ContinuousBatchedRequestStreamer:
807805
device=self.device,
808806
pad_last_frame=True,
809807
right_pad_features=self.right_padding,
810-
tail_padding_in_samples=self.tail_padding_in_samples,
811808
)
812809
return request_generator

nemo/collections/asr/inference/streaming/buffering/cache_feature_bufferer.py

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,6 @@ def __init__(
4444
preprocessor_cfg: DictConfig,
4545
device: torch.device,
4646
fill_value: float = LOG_MEL_ZERO,
47-
right_padding_ratio: float = 0.8,
4847
):
4948
"""
5049
Args:
@@ -55,7 +54,6 @@ def __init__(
5554
preprocessor_cfg (DictConfig): preprocessor configuration
5655
device (torch.device): device
5756
fill_value (float): fill value for the feature buffer
58-
right_padding_ratio (float): right padding ratio
5957
"""
6058
if buffer_size_in_secs < chunk_size_in_secs:
6159
raise ValueError(
@@ -68,7 +66,6 @@ def __init__(
6866
self.chunk_size_in_secs = chunk_size_in_secs
6967
self.preprocessor_cfg = preprocessor_cfg
7068
self.device = device
71-
self.right_padding_ratio = right_padding_ratio
7269

7370
self.is_buffer_size_equal_to_chunk_size = math.isclose(self.buffer_size_in_secs, self.chunk_size_in_secs)
7471
self.plus_one = 0 if self.is_buffer_size_equal_to_chunk_size else 1
@@ -142,12 +139,12 @@ def preprocess(
142139
"""
143140
signals = torch.vstack(audio_buffers).to(self.device) # B x T
144141
signals_len = torch.tensor([signals.shape[1]] * signals.shape[0], device=self.device, dtype=torch.long) # B
145-
right_paddings = right_paddings * self.right_padding_ratio
146142
signals_len = signals_len - right_paddings.long()
147-
features, _ = self.preprocessor(input_signal=signals, length=signals_len)
143+
features, feature_lens = self.preprocessor(input_signal=signals, length=signals_len)
148144
if features.shape[2] > expected_feat_len:
149145
features = features[:, :, :expected_feat_len] # B x F x T
150-
right_padding = torch.floor(right_paddings / self.sample_rate / self.timestep_duration) # B
146+
feature_lens = feature_lens.clamp(max=expected_feat_len)
147+
right_padding = (features.shape[2] - feature_lens).clamp(min=0).to(torch.long)
151148
return features, right_padding
152149

153150
def _update_feature_buffer(self, slot_ids: list[int], feat_chunk: Tensor) -> None:

nemo/collections/asr/inference/streaming/framing/multi_stream.py

Lines changed: 3 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -243,7 +243,6 @@ def __init__(
243243
device: torch.device = None,
244244
pad_last_frame: bool = False,
245245
right_pad_features: bool = False,
246-
tail_padding_in_samples: int = 0,
247246
):
248247
"""
249248
Args:
@@ -257,7 +256,6 @@ def __init__(
257256
device (torch.device): The device to use, required for request type FEATURE_BUFFER
258257
pad_last_frame (bool): Whether to pad the last frame
259258
right_pad_features (bool): Whether to right pad the features, optional for request type FEATURE_BUFFER
260-
tail_padding_in_samples (int): The tail padding in samples, optional for request type FEATURE_BUFFER
261259
"""
262260

263261
if request_type is RequestType.FEATURE_BUFFER:
@@ -284,7 +282,6 @@ def __init__(
284282
sample_rate=sample_rate, buffer_size_in_secs=buffer_size_in_secs
285283
)
286284
self.right_pad_features = right_pad_features
287-
self.tail_padding_in_samples = tail_padding_in_samples
288285

289286
def set_audio_filepaths(self, audio_filepaths: list[str], options: list[RequestOptions]) -> None:
290287
"""
@@ -351,11 +348,9 @@ def to_feature_buffers(self, frames: list[Frame]) -> list[FeatureBuffer]:
351348
buffer_lens = torch.tensor([buffers[0].size(1)] * len(buffers), device=self.device)
352349

353350
# Calculate right paddings and subtract from buffer lens
354-
# tail_padding_in_samples is used to keep some amount of padding at the end of the buffer
355-
# some models perform better with this padding
356-
right_paddings = torch.tensor(
357-
[frame.size - frame.valid_size - self.tail_padding_in_samples for frame in frames], device=self.device
358-
).clamp(min=0)
351+
right_paddings = torch.tensor([frame.size - frame.valid_size for frame in frames], device=self.device).clamp(
352+
min=0
353+
)
359354

360355
# Subtract right paddings from buffer lens
361356
buffer_lens = buffer_lens - right_paddings

0 commit comments

Comments
 (0)