Skip to content

Commit 4dba19a

Browse files
committed
fix(aero_realtime): use silence fallback for videos without audio track
Videos without audio tracks caused NCCL allgather deadlocks because the audio tower was skipped on some ranks but not on others. Audio extraction now returns 1 second of silence instead of None when it fails, so the audio tower always participates in the forward pass on every rank. Also adds aero_realtime to FlopsCounter (using the same FLOPs estimator as qwen3_vl).
1 parent a776374 commit 4dba19a

2 files changed

Lines changed: 14 additions & 7 deletions

File tree

src/lmms_engine/datasets/iterable/aero_realtime_iterable_dataset.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import warnings
3131

3232
import librosa
33+
from loguru import logger
3334

3435
warnings.filterwarnings("ignore", message=".*PySoundFile.*")
3536
warnings.filterwarnings("ignore", message=".*__audioread_load.*", category=FutureWarning)
@@ -100,8 +101,7 @@ def load_from_json(self, data, data_folder=None) -> Dict[str, torch.Tensor]:
100101
if video_paths:
101102
for video_path in video_paths:
102103
audio = self._extract_audio_from_video(video_path)
103-
if audio is not None:
104-
audios.append(audio)
104+
audios.append(audio)
105105

106106
# Convert messages to HF format (realtime_text items are passed through)
107107
hf_messages = TrainUtilities.convert_open_to_hf(messages)
@@ -187,18 +187,20 @@ def _extract_audio_from_video(
187187
self,
188188
video_path: str,
189189
target_sr: Optional[int] = None,
190-
) -> Optional[np.ndarray]:
190+
) -> np.ndarray:
191191
"""Extract audio waveform from a video file.
192192
193193
Uses librosa to load the audio track from the video file.
194-
Returns None if audio extraction fails (e.g. no audio track).
194+
If extraction fails (e.g. no audio track), returns 1 second of
195+
silence so that the audio tower always participates in the forward
196+
pass across all ranks, avoiding NCCL allgather deadlocks.
195197
196198
Args:
197199
video_path: Full path to the video file.
198200
target_sr: Target sampling rate. Defaults to processor's rate.
199201
200202
Returns:
201-
Audio waveform as float32 numpy array, or None on failure.
203+
Audio waveform as float32 numpy array.
202204
"""
203205
if target_sr is None:
204206
target_sr = self.processor.sampling_rate
@@ -207,8 +209,10 @@ def _extract_audio_from_video(
207209
audio, _ = librosa.load(video_path, sr=target_sr, mono=True)
208210
return audio
209211
except Exception:
210-
# Video may not have an audio track
211-
return None
212+
# Video has no audio track — return silence to keep audio tower
213+
# in the forward pass (required for DDP/FSDP gradient sync)
214+
logger.warning(f"Failed to extract audio from {video_path}, using 1s silence fallback")
215+
return np.zeros(target_sr, dtype=np.float32)
212216

213217
def get_collator(self):
214218
return AeroRealtimeCollator(self.processor)

src/lmms_engine/models/utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
"qwen3_5",
3737
"qwen3_vl",
3838
"qwen3_vl_moe",
39+
"aero_realtime",
3940
"deepseek_v3",
4041
"minicpmv",
4142
"minicpmo",
@@ -74,6 +75,7 @@ def __init__(self, config: PretrainedConfig):
7475
"qwen3_omni_moe_thinker": self._estimate_qwen2_moe_flops,
7576
"qwen3_vl": self._estimate_qwen2_flops,
7677
"qwen3_vl_moe": self._estimate_qwen2_moe_flops,
78+
"aero_realtime": self._estimate_qwen2_flops,
7779
"deepseek_v3": self._estimate_deepseek_v3_flops,
7880
"minicpmv": self._estimate_qwen2_flops,
7981
"minicpmo": self._estimate_qwen2_flops,
@@ -90,6 +92,7 @@ def __init__(self, config: PretrainedConfig):
9092
"qwen2_5_omni_thinker",
9193
"qwen3_omni_moe",
9294
"qwen3_omni_moe_thinker",
95+
"aero_realtime",
9396
]:
9497
self.config = config.text_config
9598
self.config.model_type = config.model_type

0 commit comments

Comments
 (0)