Remove tail margin algorithms from all ASR pipelines (#15605)

naymaraq · naymaraq · web-flow · commit 73a5e7d69359 · 2026-04-17T15:38:58.000+05:30
* cleanup tail padding algorithms

Signed-off-by: naymaraq <dkaramyan@nvidia.com>

* remove tail margin algorithms entirely

Signed-off-by: naymaraq <dkaramyan@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: naymaraq <naymaraq@users.noreply.github.com>

---------

Signed-off-by: naymaraq <dkaramyan@nvidia.com>
Co-authored-by: naymaraq <dkaramyan@nvidia.com>
diff --git a/nemo/collections/asr/inference/pipelines/buffered_ctc_pipeline.py b/nemo/collections/asr/inference/pipelines/buffered_ctc_pipeline.py
@@ -100,8 +100,6 @@ def init_parameters(self, cfg: DictConfig) -> None:
         self.right_padding = self.padding_mode is FeatureBufferPaddingMode.RIGHT
         self.return_tail_result = cfg.return_tail_result
 
-        # Keep small amount of extra padding
-        self.tail_padding_in_samples = max(int(self.chunk_size * self.sample_rate * 0.45), 6400)
         self.zero_log_probs = self.init_zero_log_probs() if self.right_padding else None
 
     def init_endpointer(self) -> None:
@@ -238,10 +236,10 @@ def get_logprobs_given_raw_signals(
             buffers.append(buffer.unsqueeze_(0))
 
         # Only final frames have right padding
-        # Keep some amount of extra padding to avoid the performance degradation
-        right_paddings = torch.tensor(
-            [frame.size - frame.valid_size - self.tail_padding_in_samples for frame in frames], device=self.device
-        ).clamp(min=0)
+        # Calculate right paddings
+        right_paddings = torch.tensor([frame.size - frame.valid_size for frame in frames], device=self.device).clamp(
+            min=0
+        )
 
         # Create and adjust the buffer lens
         buffer_lens = torch.tensor([buffers[0].size(1)] * len(buffers), device=self.device)
@@ -441,6 +439,5 @@ def get_request_generator(self) -> ContinuousBatchedRequestStreamer:
             device=self.device,
             pad_last_frame=True,
             right_pad_features=self.right_padding,
-            tail_padding_in_samples=self.tail_padding_in_samples,
         )
         return request_generator
diff --git a/nemo/collections/asr/inference/pipelines/buffered_rnnt_pipeline.py b/nemo/collections/asr/inference/pipelines/buffered_rnnt_pipeline.py
@@ -132,8 +132,6 @@ def init_parameters(self, cfg: DictConfig) -> None:
         self.return_tail_result = cfg.return_tail_result
         self.tokens_to_move = self.punctuation_ids.union(self.language_token_ids)
 
-        # Keep small amount of extra padding
-        self.tail_padding_in_samples = max(int(self.chunk_size * self.sample_rate * 0.45), 6400)
         self.zero_encoded = self.init_zero_enc() if self.right_padding else None
 
     def init_endpointer(self) -> None:
@@ -313,10 +311,10 @@ def encode_raw_signals(
             buffers.append(buffer.unsqueeze_(0))
 
         # Only final frames have right padding
-        # Keep some amount of extra padding to avoid the performance degradation
-        right_paddings = torch.tensor(
-            [frame.size - frame.valid_size - self.tail_padding_in_samples for frame in frames], device=self.device
-        ).clamp(min=0)
+        # Calculate right paddings
+        right_paddings = torch.tensor([frame.size - frame.valid_size for frame in frames], device=self.device).clamp(
+            min=0
+        )
 
         # Create and adjust the buffer lens
         buffer_lens = torch.tensor([buffers[0].size(1)] * len(buffers), device=self.device)
@@ -807,6 +805,5 @@ def get_request_generator(self) -> ContinuousBatchedRequestStreamer:
             device=self.device,
             pad_last_frame=True,
             right_pad_features=self.right_padding,
-            tail_padding_in_samples=self.tail_padding_in_samples,
         )
         return request_generator
diff --git a/nemo/collections/asr/inference/streaming/buffering/cache_feature_bufferer.py b/nemo/collections/asr/inference/streaming/buffering/cache_feature_bufferer.py
@@ -44,7 +44,6 @@ def __init__(
         preprocessor_cfg: DictConfig,
         device: torch.device,
         fill_value: float = LOG_MEL_ZERO,
-        right_padding_ratio: float = 0.8,
     ):
         """
         Args:
@@ -55,7 +54,6 @@ def __init__(
             preprocessor_cfg (DictConfig): preprocessor configuration
             device (torch.device): device
             fill_value (float): fill value for the feature buffer
-            right_padding_ratio (float): right padding ratio
         """
         if buffer_size_in_secs < chunk_size_in_secs:
             raise ValueError(
@@ -68,7 +66,6 @@ def __init__(
         self.chunk_size_in_secs = chunk_size_in_secs
         self.preprocessor_cfg = preprocessor_cfg
         self.device = device
-        self.right_padding_ratio = right_padding_ratio
 
         self.is_buffer_size_equal_to_chunk_size = math.isclose(self.buffer_size_in_secs, self.chunk_size_in_secs)
         self.plus_one = 0 if self.is_buffer_size_equal_to_chunk_size else 1
@@ -142,12 +139,12 @@ def preprocess(
         """
         signals = torch.vstack(audio_buffers).to(self.device)  # B x T
         signals_len = torch.tensor([signals.shape[1]] * signals.shape[0], device=self.device, dtype=torch.long)  # B
-        right_paddings = right_paddings * self.right_padding_ratio
         signals_len = signals_len - right_paddings.long()
-        features, _ = self.preprocessor(input_signal=signals, length=signals_len)
+        features, feature_lens = self.preprocessor(input_signal=signals, length=signals_len)
         if features.shape[2] > expected_feat_len:
             features = features[:, :, :expected_feat_len]  # B x F x T
-        right_padding = torch.floor(right_paddings / self.sample_rate / self.timestep_duration)  # B
+            feature_lens = feature_lens.clamp(max=expected_feat_len)
+        right_padding = (features.shape[2] - feature_lens).clamp(min=0).to(torch.long)
         return features, right_padding
 
     def _update_feature_buffer(self, slot_ids: list[int], feat_chunk: Tensor) -> None:
diff --git a/nemo/collections/asr/inference/streaming/framing/multi_stream.py b/nemo/collections/asr/inference/streaming/framing/multi_stream.py
@@ -243,7 +243,6 @@ def __init__(
         device: torch.device = None,
         pad_last_frame: bool = False,
         right_pad_features: bool = False,
-        tail_padding_in_samples: int = 0,
     ):
         """
         Args:
@@ -257,7 +256,6 @@ def __init__(
             device (torch.device): The device to use, required for request type FEATURE_BUFFER
             pad_last_frame (bool): Whether to pad the last frame
             right_pad_features (bool): Whether to right pad the features, optional for request type FEATURE_BUFFER
-            tail_padding_in_samples (int): The tail padding in samples, optional for request type FEATURE_BUFFER
         """
 
         if request_type is RequestType.FEATURE_BUFFER:
@@ -284,7 +282,6 @@ def __init__(
                 sample_rate=sample_rate, buffer_size_in_secs=buffer_size_in_secs
             )
             self.right_pad_features = right_pad_features
-            self.tail_padding_in_samples = tail_padding_in_samples
 
     def set_audio_filepaths(self, audio_filepaths: list[str], options: list[RequestOptions]) -> None:
         """
@@ -351,11 +348,9 @@ def to_feature_buffers(self, frames: list[Frame]) -> list[FeatureBuffer]:
         buffer_lens = torch.tensor([buffers[0].size(1)] * len(buffers), device=self.device)
 
         # Calculate right paddings and subtract from buffer lens
-        # tail_padding_in_samples is used to keep some amount of padding at the end of the buffer
-        # some models perform better with this padding
-        right_paddings = torch.tensor(
-            [frame.size - frame.valid_size - self.tail_padding_in_samples for frame in frames], device=self.device
-        ).clamp(min=0)
+        right_paddings = torch.tensor([frame.size - frame.valid_size for frame in frames], device=self.device).clamp(
+            min=0
+        )
 
         # Subtract right paddings from buffer lens
         buffer_lens = buffer_lens - right_paddings