fix(aero_realtime): use silence fallback for videos without audio track

kcz358 · kcz358 · commit 4dba19a2dd28 · 2026-04-14T23:33:38.000-07:00
Videos without audio tracks caused NCCL allgather deadlocks because
the audio tower was skipped on some ranks but not others. Audio
extraction now returns 1 s of silence instead of None whenever it
fails (logging a warning), so the audio tower always participates in
the forward pass on every rank.

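Also adds aero_realtime to FlopsCounter, using the same FLOPs estimator
as qwen3_vl (_estimate_qwen2_flops) and reading the model dimensions
from config.text_config.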
diff --git a/src/lmms_engine/datasets/iterable/aero_realtime_iterable_dataset.py b/src/lmms_engine/datasets/iterable/aero_realtime_iterable_dataset.py
@@ -30,6 +30,7 @@
 import warnings
 
 import librosa
+from loguru import logger
 
 warnings.filterwarnings("ignore", message=".*PySoundFile.*")
 warnings.filterwarnings("ignore", message=".*__audioread_load.*", category=FutureWarning)
@@ -100,8 +101,7 @@ def load_from_json(self, data, data_folder=None) -> Dict[str, torch.Tensor]:
         if video_paths:
             for video_path in video_paths:
                 audio = self._extract_audio_from_video(video_path)
-                if audio is not None:
-                    audios.append(audio)
+                audios.append(audio)
 
         # Convert messages to HF format (realtime_text items are passed through)
         hf_messages = TrainUtilities.convert_open_to_hf(messages)
@@ -187,18 +187,20 @@ def _extract_audio_from_video(
         self,
         video_path: str,
         target_sr: Optional[int] = None,
-    ) -> Optional[np.ndarray]:
+    ) -> np.ndarray:
         """Extract audio waveform from a video file.
 
         Uses librosa to load the audio track from the video file.
-        Returns None if audio extraction fails (e.g. no audio track).
+        If extraction fails (e.g. no audio track), returns 1 second of
+        silence so that the audio tower always participates in the forward
+        pass across all ranks, avoiding NCCL allgather deadlocks.
 
         Args:
             video_path: Full path to the video file.
             target_sr: Target sampling rate. Defaults to processor's rate.
 
         Returns:
-            Audio waveform as float32 numpy array, or None on failure.
+            Audio waveform as float32 numpy array.
         """
         if target_sr is None:
             target_sr = self.processor.sampling_rate
@@ -207,8 +209,10 @@ def _extract_audio_from_video(
             audio, _ = librosa.load(video_path, sr=target_sr, mono=True)
             return audio
         except Exception:
-            # Video may not have an audio track
-            return None
+            # Video has no audio track — return silence to keep audio tower
+            # in the forward pass (required for DDP/FSDP gradient sync)
+            logger.warning(f"Failed to extract audio from {video_path}, using 1s silence fallback")
+            return np.zeros(target_sr, dtype=np.float32)
 
     def get_collator(self):
         return AeroRealtimeCollator(self.processor)
diff --git a/src/lmms_engine/models/utils.py b/src/lmms_engine/models/utils.py
@@ -36,6 +36,7 @@
     "qwen3_5",
     "qwen3_vl",
     "qwen3_vl_moe",
+    "aero_realtime",
     "deepseek_v3",
     "minicpmv",
     "minicpmo",
@@ -74,6 +75,7 @@ def __init__(self, config: PretrainedConfig):
             "qwen3_omni_moe_thinker": self._estimate_qwen2_moe_flops,
             "qwen3_vl": self._estimate_qwen2_flops,
             "qwen3_vl_moe": self._estimate_qwen2_moe_flops,
+            "aero_realtime": self._estimate_qwen2_flops,
             "deepseek_v3": self._estimate_deepseek_v3_flops,
             "minicpmv": self._estimate_qwen2_flops,
             "minicpmo": self._estimate_qwen2_flops,
@@ -90,6 +92,7 @@ def __init__(self, config: PretrainedConfig):
             "qwen2_5_omni_thinker",
             "qwen3_omni_moe",
             "qwen3_omni_moe_thinker",
+            "aero_realtime",
         ]:
             self.config = config.text_config
             self.config.model_type = config.model_type