
Commit 63b30fb

Fix transcribe when nbest hypotheses are returned (NVIDIA-NeMo#13540)

Squashed commits:

* fix transcribe when nbest
* minor fix
* fix in process_aed_timestamp_outputs to return list
* Apply isort and black reformatting
* minor fix
* Apply isort and black reformatting
* clean up
* clean up
* clean up
* Apply isort and black reformatting
* restore canary not from hf

Signed-off-by: lilithgrigoryan <lgrigoryan@nvidia.com>
Signed-off-by: lilithgrigoryan <lilithgrigoryan@users.noreply.github.com>
Co-authored-by: lilithgrigoryan <lilithgrigoryan@users.noreply.github.com>
1 parent b912a85 commit 63b30fb

4 files changed, 123 additions and 56 deletions
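In short: when beam decoding is configured with `return_best_hypothesis: false`, `transcribe()` now returns one list of n-best Hypothesis objects per utterance instead of failing while packing results. A minimal usage sketch adapted from the tests added below; the checkpoint name and audio paths are placeholders, not part of this commit:

import copy
from omegaconf import open_dict
from nemo.collections.asr.models import ASRModel

model = ASRModel.from_pretrained("nvidia/canary-1b-flash")  # placeholder checkpoint

decoding_config = copy.deepcopy(model.cfg.decoding)
with open_dict(decoding_config):
    decoding_config["beam"]["beam_size"] = 4
    decoding_config["beam"]["return_best_hypothesis"] = False  # request n-best output
model.change_decoding_strategy(decoding_config)

outputs = model.transcribe(["audio1.wav", "audio2.wav"], batch_size=1, timestamps=False)
# One inner list of Hypothesis objects (up to beam_size) per input file.
assert len(outputs) == 2 and all(isinstance(o, list) for o in outputs)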


nemo/collections/asr/parts/mixins/transcription.py

Lines changed: 3 additions & 13 deletions
@@ -29,10 +29,11 @@
 from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations
 from nemo.collections.asr.parts.preprocessing.segment import AudioSegment, ChannelSelectorType
 from nemo.collections.asr.parts.utils import manifest_utils
+from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
 from nemo.collections.common.data.utils import move_data_to_device
 from nemo.utils import logging, logging_mode

-TranscriptionReturnType = Union[List[str], List['Hypothesis'], Tuple[List[str]], Tuple[List['Hypothesis']]]
+TranscriptionReturnType = Union[List[str], List[Hypothesis], Tuple[List[str]], Tuple[List[Hypothesis]]]
 GenericTranscriptionType = Union[List[Any], List[List[Any]], Tuple[Any], Tuple[List[Any]], Dict[str, List[Any]]]
@@ -273,18 +274,7 @@ def transcribe(
             if results is None:
                 results = []

-            # if list of inner list of results, copy structure
-            if isinstance(processed_outputs[0], list):
-                for _ in processed_outputs:
-                    results.append([])
-
-            # If nested list structure
-            if isinstance(processed_outputs[0], list):
-                for i, processed_output in enumerate(processed_outputs):
-                    results[i].extend(processed_output)
-            else:
-                # If flat list structure
-                results.extend(processed_outputs)
+            results.extend(processed_outputs)

         elif isinstance(processed_outputs, dict):
             # Create a results of the same type as each element in processed_outputs
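With this change, each per-batch `processed_outputs` entry is appended as-is: in n-best mode it is already a list of Hypothesis objects for one utterance, so a plain `extend` keeps one entry per utterance instead of rebuilding (and previously mangling) the nesting. A minimal sketch of the accumulation, using plain lists as stand-ins for Hypothesis objects (names are illustrative):

# Illustrative only: inner lists stand in for n-best lists of Hypothesis objects.
results = None
for processed_outputs in [
    [["utt1-best", "utt1-alt"], ["utt2-best", "utt2-alt"]],  # batch 1: two utterances, n-best each
    [["utt3-best", "utt3-alt"]],                             # batch 2: one utterance
]:
    if results is None:
        results = []
    results.extend(processed_outputs)

assert len(results) == 3       # one entry per utterance
assert len(results[0]) == 2    # n-best structure preserved, not flattened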

nemo/collections/asr/parts/submodules/multitask_beam_decoding.py

Lines changed: 26 additions & 17 deletions
@@ -220,8 +220,8 @@ def forward(
                 hypotheses = [Hypothesis(score=0.0, y_sequence=[], timestamp=[]) for _ in range(self.beam_size)]
                 # Pack results into Hypotheses
                 hypotheses = pack_hypotheses(hypotheses, topk_hypotheses[i], beam_scores[i])
-                self.format_hypotheses(hypotheses, decoder_input_ids)
                 packed_result.append(NBestHypotheses(hypotheses))
+            self.format_hypotheses(packed_result, decoder_input_ids)
         else:
             beam_scores = [None for _ in range(len(best_hypo))]
             best_hypo = best_hypo.detach().cpu()
@@ -234,7 +234,9 @@ def forward(

         return (packed_result,)

-    def format_hypotheses(self, packed_result: List[Hypothesis], decoder_input_ids: Union[torch.Tensor, None]) -> None:
+    def format_hypotheses(
+        self, packed_result: List[Hypothesis | NBestHypotheses], decoder_input_ids: Union[torch.Tensor, None]
+    ) -> None:
         """
         For each hypothesis in the mini-batch:
         * Remove the decoder input ids (prompt) from the predictions
@@ -246,21 +248,28 @@ def format_hypotheses(self, packed_result: List[Hypothesis], decoder_input_ids:
             len(packed_result) == decoder_input_ids.shape[0]
         ), f"Mismatching number of examples {len(packed_result)=} {decoder_input_ids.shape[0]=}"
         decoder_input_ids = decoder_input_ids.detach().cpu()
-        for hyp, prefix in zip(packed_result, decoder_input_ids):
-            assert (
-                hyp.y_sequence[: prefix.shape[0]] == prefix
-            ).all(), f"The decoder input IDs were not found at the beginning of prediction: {hyp.y_sequence=} {prefix=})"
-            hyp.y_sequence = hyp.y_sequence[prefix.shape[0] :]
-        for hyp in packed_result:
-            ids = hyp.y_sequence
-            ids_len = ids.shape[0]
-            pos = -1
-            while ids[pos] == self.pad or ids[pos] == self.eos:
-                pos -= 1
-                if ids_len + pos == -1:
-                    break  # empty sequence
-            if pos < -1:
-                hyp.y_sequence = ids[: pos + 1]
+
+        for h, prefix in zip(packed_result, decoder_input_ids):
+            hypotheses = h.n_best_hypotheses if isinstance(h, NBestHypotheses) else [h]
+            for hyp in hypotheses:
+                assert (hyp.y_sequence[: prefix.shape[0]] == prefix).all(), (
+                    f"The decoder input IDs were not found at the beginning of prediction: "
+                    f"{hyp.y_sequence=} {prefix=}"
+                )
+                hyp.y_sequence = hyp.y_sequence[prefix.shape[0] :]
+
+        for h in packed_result:
+            hyps = h.n_best_hypotheses if isinstance(h, NBestHypotheses) else [h]
+            for hyp in hyps:
+                ids = hyp.y_sequence
+                ids_len = ids.shape[0]
+                pos = -1
+                while ids[pos] == self.pad or ids[pos] == self.eos:
+                    pos -= 1
+                    if ids_len + pos == -1:
+                        break  # empty sequence
+                if pos < -1:
+                    hyp.y_sequence = ids[: pos + 1]


 @dataclass
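In the n-best branch, `packed_result` now holds one NBestHypotheses wrapper per utterance, and `format_hypotheses` strips the prompt and trailing pad/EOS tokens from every hypothesis inside each wrapper. A minimal sketch of a downstream consumer, assuming only the attributes used above (`NBestHypotheses.n_best_hypotheses`, `Hypothesis.y_sequence`); the import path and best-first ordering are assumptions, so treat it as illustrative rather than the library's API:

from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis, NBestHypotheses  # assumed import path

def top_token_ids(packed_result):
    """Return the highest-ranked token-id sequence for each utterance in packed_result."""
    best = []
    for h in packed_result:
        # n-best path: unwrap the wrapper (assumed ordered best-first);
        # best-only path: the element is already a Hypothesis.
        hyp = h.n_best_hypotheses[0] if isinstance(h, NBestHypotheses) else h
        best.append(hyp.y_sequence)
    return best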

nemo/collections/asr/parts/utils/timestamp_utils.py

Lines changed: 44 additions & 25 deletions
@@ -22,22 +22,13 @@ def process_aed_timestamp_outputs(outputs, subsampling_factor: int = 1, window_s
     """
     Processes AED timestamp outputs and extracts word-level timestamps.
     Args:
-        outputs (list or Hypothesis): The hypothesis outputs to process. Can be a single Hypothesis object or a list of Hypothesis objects.
+        outputs (Hypothesis, list of Hypothesis, or list of list of Hypothesis): The hypothesis outputs to process.
        subsampling_factor (int, optional): The subsampling factor used in the model. Default is 1.
        window_stride (float, optional): The window stride used in the model. Default is 0.01.
     Returns:
-        list or Hypothesis: The processed hypothesis outputs with word-level timestamps added.
+        list of Hypothesis or list of list of Hypothesis: The processed hypothesis outputs with word-level timestamps added.
     """

-    if outputs is None:
-        return outputs
-
-    if isinstance(outputs, Hypothesis):
-        outputs = [outputs]
-
-    if not isinstance(outputs[0], Hypothesis):
-        raise ValueError(f"Expected Hypothesis object, got {type(outputs[0])}")
-
     def extract_words_with_timestamps(text, subsampling_factor: int = 1, window_stride: float = 0.01):
         text = text.strip()  # remove leading and trailing whitespaces - training data artifact
4334

@@ -77,24 +68,52 @@ def segments_offset_to_time(segments, window_stride, subsampling_factor):
         segment['end'] = segment['end_offset'] * window_stride * subsampling_factor
         return segments

-    for idx, hyp in enumerate(outputs):
+    def process_hypothesis(hyp, subsampling_factor: int, window_stride: float):
+        """
+        Processes a single Hypothesis object to extract timestamps.
+        """
         timestamp, text = extract_words_with_timestamps(hyp.text, subsampling_factor, window_stride)
+        hyp.text = text
         if timestamp is not None:
-            if len(outputs[idx].timestamp) == 0:
-                outputs[idx].timestamp = {}
-            outputs[idx].timestamp['char'] = []  # not supported for AED
-            outputs[idx].timestamp['word'] = timestamp
-            outputs[idx].text = text
+            if len(hyp.timestamp) == 0:
+                hyp.timestamp = {}
+
+            hyp.timestamp.update(
+                {
+                    'word': timestamp,
+                    'segment': [],
+                    'char': [],  # not supported for AED
+                }
+            )
+
             segments = AbstractCTCDecoding._get_segment_offsets(timestamp, segment_delimiter_tokens=['.', '?', '!'])
-            segments = segments_offset_to_time(segments, window_stride, subsampling_factor)
-            outputs[idx].timestamp['segment'] = segments
+            hyp.timestamp['segment'] = segments_offset_to_time(segments, window_stride, subsampling_factor)
         else:
-            outputs[idx].text = text
-            outputs[idx].timestamp = {}
-            outputs[idx].timestamp['word'] = []
-            outputs[idx].timestamp['segment'] = []
-            outputs[idx].timestamp['char'] = []
-    return outputs
+            hyp.timestamp = {
+                'word': [],
+                'segment': [],
+                'char': [],
+            }
+
+        return hyp
+
+    if outputs is None:
+        return outputs
+
+    if isinstance(outputs, Hypothesis):
+        return [process_hypothesis(outputs, subsampling_factor, window_stride)]
+    elif isinstance(outputs, list) and isinstance(outputs[0], Hypothesis):
+        # list of Hypothesis
+        return [process_hypothesis(hyp, subsampling_factor, window_stride) for hyp in outputs]
+    elif isinstance(outputs, list) and isinstance(outputs[0], list) and isinstance(outputs[0][0], Hypothesis):
+        # list of list of Hypothesis (for beam decoding)
+        return [
+            [process_hypothesis(hyp, subsampling_factor, window_stride) for hyp in hyps_list] for hyps_list in outputs
+        ]
+    else:
+        raise ValueError(
+            f"Expected Hypothesis, list of Hypothesis or list of list of Hypothesis object, got {type(outputs)}"
+        )


 def process_timestamp_outputs(outputs, subsampling_factor: int = 1, window_stride: float = 0.01):
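The refactor moves the per-hypothesis work into a nested `process_hypothesis` helper and dispatches on the input shape, so the nested (n-best) case produced by beam decoding keeps its nesting. A generic restatement of that dispatch as a standalone sketch (the function name and callable are illustrative, not part of NeMo):

from typing import Any, Callable, List, Union

def dispatch_like_process_aed(
    outputs: Union[Any, List[Any], List[List[Any]], None],
    process_one: Callable[[Any], Any],
):
    """Mirror of the shape dispatch above: single item, flat list, or nested (n-best) list."""
    if outputs is None:
        return outputs
    if not isinstance(outputs, list):
        return [process_one(outputs)]                                   # single item -> wrapped in a list
    if outputs and isinstance(outputs[0], list):
        return [[process_one(x) for x in inner] for inner in outputs]   # nesting preserved
    return [process_one(x) for x in outputs]                            # flat list preserved

# e.g. dispatch_like_process_aed([["a", "b"], ["c"]], str.upper) == [["A", "B"], ["C"]]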

tests/collections/asr/mixins/test_transcription.py

Lines changed: 50 additions & 1 deletion
@@ -12,14 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
+import copy
 import json
 import os
 from dataclasses import dataclass
 from typing import Any, Dict, List

 import pytest
 import torch
+from omegaconf import open_dict
 from torch.utils.data import DataLoader, Dataset

 from nemo.collections.asr.data.audio_to_text import _speech_collate_fn
@@ -366,6 +367,54 @@ def test_transcribe_dataloader(self, audio_files, fast_conformer_ctc_model):
         assert isinstance(outputs[0], Hypothesis)
         assert isinstance(outputs[1], Hypothesis)

+    @pytest.mark.unit
+    def test_transcribe_return_nbest_rnnt(self, audio_files, fast_conformer_transducer_model):
+        fast_conformer_transducer_model.eval()
+        audio1, audio2 = audio_files
+
+        orig_decoding_config = copy.deepcopy(fast_conformer_transducer_model.cfg.decoding)
+
+        decoding_config = copy.deepcopy(fast_conformer_transducer_model.cfg.decoding)
+        with open_dict(decoding_config):
+            decoding_config["strategy"] = "malsd_batch"
+            decoding_config["beam"]["beam_size"] = 4
+            decoding_config["beam"]["return_best_hypothesis"] = False
+            decoding_config["beam"]["allow_cuda_graphs"] = False
+        fast_conformer_transducer_model.change_decoding_strategy(decoding_config)
+
+        outputs = fast_conformer_transducer_model.transcribe([audio1, audio2], batch_size=1, timestamps=False)
+
+        assert len(outputs) == 2
+        assert all(len(output) >= 1 for output in outputs)
+        assert all(isinstance(output, list) for output in outputs)
+        assert all(isinstance(hyp, Hypothesis) for output in outputs for hyp in output)
+
+        # Reset the decoding strategy to original
+        fast_conformer_transducer_model.change_decoding_strategy(orig_decoding_config)
+
+    @pytest.mark.unit
+    def test_transcribe_return_nbest_canary(self, audio_files, canary_1b_flash):
+        canary_1b_flash.eval()
+        audio1, audio2 = audio_files
+
+        orig_decoding_config = copy.deepcopy(canary_1b_flash.cfg.decoding)
+
+        decoding_config = copy.deepcopy(canary_1b_flash.cfg.decoding)
+        with open_dict(decoding_config):
+            decoding_config["beam"]["beam_size"] = 4
+            decoding_config["beam"]["return_best_hypothesis"] = False
+        canary_1b_flash.change_decoding_strategy(decoding_config)
+
+        outputs = canary_1b_flash.transcribe([audio1, audio2], batch_size=1, timestamps=False)
+
+        assert len(outputs) == 2
+        assert all(len(output) >= 1 for output in outputs)
+        assert all(isinstance(output, list) for output in outputs)
+        assert all(isinstance(hyp, Hypothesis) for output in outputs for hyp in output)
+
+        # Reset the decoding strategy to original
+        canary_1b_flash.change_decoding_strategy(orig_decoding_config)
+
     @pytest.mark.with_downloads()
     @pytest.mark.unit
     def test_timestamps_with_transcribe(self, audio_files, fast_conformer_ctc_model):
