Skip to content

Commit d3db700

Browse files
Always send prefill before audio streaming; fix bfloat16 audio output
Signed-off-by: Elena Rastorgueva <erastorgueva@nvidia.com>
1 parent 8b849c1 commit d3db700

2 files changed

Lines changed: 37 additions & 34 deletions

File tree

examples/speechlm2/nemo_inference_pipelines/triton/client_streaming.py

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -134,42 +134,43 @@ def send_sequence_end(client, sequence_id):
134134
sequence_id = random.randint(1, 2**63 - 1) # Generate random uint64 value
135135

136136
try:
137-
# If a system prompt is provided, send a separate prefill request first:
138-
# zero-length audio + system_prompt, with sequence_start=True.
139-
prefill_sent = False
140-
if args.system_prompt is not None:
141-
logger.info(f"Sending prefill request with system_prompt ({len(args.system_prompt)} chars)")
142-
empty_audio = np.zeros((1, 0), dtype=np.float32)
143-
prefill_inputs = [
144-
grpcclient.InferInput(
145-
"audio_signal", empty_audio.shape, np_to_triton_dtype(empty_audio.dtype)
146-
),
147-
]
148-
prefill_inputs[0].set_data_from_numpy(empty_audio)
137+
# Always send a prefill request first (zero-length audio, sequence_start=True).
138+
# This initializes the TTS speaker embedding and system prompt for the session.
139+
# If --system_prompt is provided, it is included; otherwise the server uses
140+
# its configured default.
141+
logger.info("Sending prefill request%s",
142+
f" with system_prompt ({len(args.system_prompt)} chars)" if args.system_prompt else "")
143+
empty_audio = np.zeros((1, 0), dtype=np.float32)
144+
prefill_inputs = [
145+
grpcclient.InferInput(
146+
"audio_signal", empty_audio.shape, np_to_triton_dtype(empty_audio.dtype)
147+
),
148+
]
149+
prefill_inputs[0].set_data_from_numpy(empty_audio)
149150

151+
if args.system_prompt is not None:
150152
prompt_np = np.array([args.system_prompt.encode("utf-8")], dtype=object)
151153
prompt_input = grpcclient.InferInput("system_prompt", prompt_np.shape, "BYTES")
152154
prompt_input.set_data_from_numpy(prompt_np)
153155
prefill_inputs.append(prompt_input)
154156

155-
prefill_outputs = [
156-
grpcclient.InferRequestedOutput("output_text"),
157-
grpcclient.InferRequestedOutput("output_asr_text"),
158-
grpcclient.InferRequestedOutput("output_audio"),
159-
]
157+
prefill_outputs = [
158+
grpcclient.InferRequestedOutput("output_text"),
159+
grpcclient.InferRequestedOutput("output_asr_text"),
160+
grpcclient.InferRequestedOutput("output_audio"),
161+
]
160162

161-
prefill_start = time.time()
162-
client.infer(
163-
model_name,
164-
prefill_inputs,
165-
request_id=str(uuid.uuid4()),
166-
outputs=prefill_outputs,
167-
sequence_id=sequence_id,
168-
sequence_start=True,
169-
sequence_end=False,
170-
)
171-
logger.info(f"Prefill completed in {time.time() - prefill_start:.3f}s")
172-
prefill_sent = True
163+
prefill_start = time.time()
164+
client.infer(
165+
model_name,
166+
prefill_inputs,
167+
request_id=str(uuid.uuid4()),
168+
outputs=prefill_outputs,
169+
sequence_id=sequence_id,
170+
sequence_start=True,
171+
sequence_end=False,
172+
)
173+
logger.info(f"Prefill completed in {time.time() - prefill_start:.3f}s")
173174

174175
for idx, audio_chunk in tqdm(enumerate(audio_signal_chunks)):
175176
inputs = [
@@ -193,7 +194,7 @@ def send_sequence_end(client, sequence_id):
193194
request_id=str(uuid.uuid4()),
194195
outputs=outputs,
195196
sequence_id=sequence_id,
196-
sequence_start=(idx == 0 and not prefill_sent),
197+
sequence_start=False,
197198
sequence_end=idx == len(audio_signal_chunks) - 1,
198199
)
199200
end_time = time.time()

examples/speechlm2/nemo_inference_pipelines/triton/model_repo_s2s/voicechat/1/infer_streaming.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -307,9 +307,11 @@ def get_generations(self, frames: List[Frame]) -> List[Tuple]:
307307
def execute(self, requests: Iterable) -> List[pb_utils.InferenceResponse]:
308308
"""Execute the model and return the responses.
309309
310-
Zero-length audio with ``sequence_start=True`` and a ``system_prompt``
311-
is treated as a prefill-only request by the pipeline (no fake audio
312-
needed). All other requests are normal audio generation.
310+
Clients MUST send a prefill request (zero-length audio with
311+
``sequence_start=True``) before streaming audio. The prefill
312+
initializes the TTS speaker embedding and system prompt for the
313+
session. Sending audio on the first request without a prefill
314+
will produce degraded speaker voice quality.
313315
314316
Returns:
315317
- output_audio: float32 array of generated audio samples
@@ -329,7 +331,7 @@ def execute(self, requests: Iterable) -> List[pb_utils.InferenceResponse]:
329331
responses = []
330332
for audio, text, asr_text in generations:
331333
if isinstance(audio, torch.Tensor):
332-
audio_np = audio.detach().cpu().numpy().astype(np.float32)
334+
audio_np = audio.detach().cpu().float().numpy()
333335
if audio_np.ndim == 1:
334336
audio_np = audio_np.reshape(1, -1)
335337
else:

0 commit comments

Comments (0)