
Commit 6c0ce0b

refactor(aero_realtime): use audio realtime stream
1 parent 761d6b7

5 files changed: 93 additions & 81 deletions


examples/aero_realtime/example_config.yaml

Lines changed: 3 additions & 3 deletions
@@ -1,9 +1,9 @@
 # AeroRealtime Training Configuration
 # Trains the AeroRealtime model on LLaVA-Video-178K data (normal video QA mode).
 #
-# The dual-stream additive design is active: during video regions, the model
-# receives additive vision+text embeddings and learns to stay silent (rt_pad)
-# until spoken to (rt_speak boundary at delay_seconds).
+# The realtime text stream is conditioned on audio positions only. Video
+# placeholders receive pure vision features, while audio placeholders carry
+# rt_pad / rt_speak / realtime text tokens along the audio timeline.
 #
 # Audio is auto-extracted from video files by the dataset.
 #

src/lmms_engine/datasets/processor/aero_realtime_processor.py

Lines changed: 14 additions & 17 deletions
@@ -21,11 +21,11 @@
     video playback).
 
 The processor builds ``text_stream_ids`` with the delay mechanism:
-- ``<|rt_start|>`` at position 0 of the video region
-- ``<|rt_pad|>`` for silence positions before the delay boundary
+- ``<|rt_start|>`` at the first audio position
+- ``<|rt_pad|>`` for audio silence positions before the delay boundary
 - ``<|rt_speak|>`` at the delay boundary
 - After ``<|rt_speak|>``: ``<|rt_pad|>`` for normal QA, or actual text tokens
-  at the appropriate temporal positions for realtime data
+  at the appropriate audio positions for realtime data
 """
 
 from typing import Dict, List, Optional
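
For intuition, the delay logic reduces to marker placement on a one-dimensional audio timeline. A minimal sketch, assuming a fixed seconds_per_token hop per audio placeholder (the helper name and its defaults are illustrative, not repository code):

# Minimal sketch (not repo code) of the delay mechanism described above,
# assuming each audio placeholder covers a fixed hop of `seconds_per_token`.
def toy_text_stream(num_audio_tokens: int,
                    delay_seconds: float,
                    seconds_per_token: float = 0.5) -> list:
    stream = ["<|rt_pad|>"] * num_audio_tokens  # silence by default
    stream[0] = "<|rt_start|>"                  # first audio position
    # First audio position whose start time reaches the delay boundary.
    speak_idx = min(int(delay_seconds / seconds_per_token), num_audio_tokens - 1)
    if speak_idx > 0:
        stream[speak_idx] = "<|rt_speak|>"      # delay boundary
    return stream

# toy_text_stream(8, delay_seconds=2.0) ->
# ['<|rt_start|>', '<|rt_pad|>', '<|rt_pad|>', '<|rt_pad|>',
#  '<|rt_speak|>', '<|rt_pad|>', '<|rt_pad|>', '<|rt_pad|>']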
@@ -50,9 +50,9 @@ class AeroRealtimeDataProcessor(Qwen3_VLDataProcessor):
     """Data processor for AeroRealtime training.
 
     Builds ``input_ids``, ``text_stream_ids``, and ``labels`` for the
-    dual-stream additive training design. Handles:
-    - Normal video QA: video region filled with ``<|rt_pad|>`` after delay
-    - Realtime training: text tokens placed at temporal positions in video region
+    dual-stream training design. Handles:
+    - Normal video QA: audio timeline filled with ``<|rt_pad|>`` after delay
+    - Realtime training: text tokens placed at temporal positions on the audio timeline
     - Image-only: standard scatter (no text_stream_ids)
     - Audio extraction from video for audio-vision fusion
     """
@@ -334,14 +334,13 @@ def _build_normal_qa_ids_and_labels(
         """Build input_ids, text_stream_ids, and labels from HF messages.
 
         For normal video QA the text_stream_ids only differ from input_ids
-        in the multimodal pad regions:
-        - all ``<|video_pad|>`` and ``<|audio_pad|>`` slots → ``<|rt_pad|>``
-        - first chunk's first ``<|video_pad|>`` → ``<|rt_start|>``
-        - speak chunk's first ``<|audio_pad|>`` → ``<|rt_speak|>``
-
-        Envelope boundary tokens (timestamps, vision_start/end,
-        audio_start/end) keep their original ids in text_stream_ids so the
-        LM sees the same special tokens it would in input_ids.
+        on audio pad positions:
+        - all ``<|audio_pad|>`` slots -> ``<|rt_pad|>``
+        - first ``<|audio_pad|>`` -> ``<|rt_start|>``
+        - delayed ``<|audio_pad|>`` -> ``<|rt_speak|>``
+
+        Video placeholders and envelope boundary tokens keep their original
+        ids; vision features replace video placeholder embeddings in the model.
         """
         results = self.get_qwen_template_labels(
             hf_messages,

@@ -364,7 +363,7 @@
         text_stream_id = list(input_id)  # start as a copy of input_ids
 
         if has_video and has_audio:
-            # video + audio: per-chunk envelope filler
+            # video + audio: only audio pads carry realtime stream tokens
             self.processor._fill_text_stream_video_audio(
                 stream=text_stream_id,
                 video_grid_thw=video_grid_thw,

@@ -374,7 +373,6 @@
                 vision_end_id=self.tokenizer.convert_tokens_to_ids(self.processor.vision_end_token),
                 audio_start_id=self.tokenizer.convert_tokens_to_ids(self.processor.audio_start_token),
                 audio_end_id=self.tokenizer.convert_tokens_to_ids(self.processor.audio_end_token),
-                video_pad_id=self.video_token_id,
                 audio_pad_id=self.audio_token_id,
                 rt_start_id=self.rt_start_id,
                 rt_pad_id=self.rt_pad_id,

@@ -460,7 +458,6 @@ def _build_realtime_ids_and_labels(
                 vision_end_id=self.tokenizer.convert_tokens_to_ids(self.processor.vision_end_token),
                 audio_start_id=self.tokenizer.convert_tokens_to_ids(self.processor.audio_start_token),
                 audio_end_id=self.tokenizer.convert_tokens_to_ids(self.processor.audio_end_token),
-                video_pad_id=self.video_token_id,
                 audio_pad_id=self.audio_token_id,
                 rt_start_id=self.rt_start_id,
                 rt_pad_id=self.rt_pad_id,

src/lmms_engine/models/aero_realtime/aero_realtime_liger.py

Lines changed: 9 additions & 10 deletions
@@ -70,9 +70,9 @@ def aero_realtime_lce_forward(
 ):
     """RMPad-aware forward for AeroRealtime with LigerCE loss.
 
-    Same pipeline as the original forward (embed → vision → audio → add at
-    video/audio token positions independently — time alignment comes from
-    the per-chunk envelope token order, not from feature-level fusion).
+    Same pipeline as the original forward (embed → scatter vision → add audio
+    on audio token positions — realtime conditioning lives on the audio
+    timeline).
     Adds:
     - Proper mrope position_ids via ``qwen3_vl_get_rope_index``
     - Unpadding of inputs_embeds/position_ids/labels before the language model

@@ -126,9 +126,9 @@
     else:
         audio_features_flat = audio_features.reshape(-1, audio_features.shape[-1])
 
-    # ---- 5. Add vision/audio features (independent paths) ----
+    # ---- 5. Scatter video features and add audio features ----
 
-    # 5a. Add video features at video_token_index positions
+    # 5a. Scatter video features at video_token_index positions
     if video_features is not None:
         video_mask = original_input_ids == self.config.video_token_index
         n_video_tokens = video_mask.sum().item()

@@ -137,12 +137,11 @@
             raise ValueError(
                 f"Video token count ({n_video_tokens}) does not match " f"video feature count ({n_video_features})."
             )
-        video_mask_flat = video_mask.reshape(-1)
-        inputs_embeds_flat = inputs_embeds.reshape(-1, inputs_embeds.shape[-1])
-        inputs_embeds_flat[video_mask_flat] = inputs_embeds_flat[video_mask_flat] + video_features.to(
-            inputs_embeds.dtype
+        video_mask_expanded = video_mask.unsqueeze(-1).expand_as(inputs_embeds)
+        inputs_embeds = inputs_embeds.masked_scatter(
+            video_mask_expanded,
+            video_features.to(inputs_embeds.dtype),
         )
-        inputs_embeds = inputs_embeds_flat.reshape(inputs_embeds.shape)
 
     # 5b. Add audio features at audio_token_index positions
     if audio_features_flat is not None:
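
The behavioral change in this hunk is easy to miss: the removed code *added* video features to the placeholder embeddings, while masked_scatter *overwrites* them. A standalone sketch with toy shapes and a hypothetical placeholder id (not repo code):

import torch

B, T, H = 1, 6, 4                   # toy batch, sequence, hidden sizes
VIDEO_TOKEN_INDEX = 99              # hypothetical placeholder id

input_ids = torch.tensor([[1, 99, 99, 99, 2, 3]])
inputs_embeds = torch.randn(B, T, H)
video_features = torch.randn(3, H)  # one row per video placeholder

video_mask = input_ids == VIDEO_TOKEN_INDEX

# Old behavior: placeholder embedding + video feature (additive fusion).
added = inputs_embeds.clone()
added[video_mask] += video_features

# New behavior: the video feature replaces the placeholder embedding;
# masked_scatter fills True positions from `video_features` in row order.
mask_expanded = video_mask.unsqueeze(-1).expand_as(inputs_embeds)
scattered = inputs_embeds.masked_scatter(mask_expanded, video_features)

assert torch.equal(scattered[video_mask], video_features)

This is also why text_stream_ids can leave video placeholders untouched: whatever id sits at a video slot, its embedding is discarded by the scatter.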

src/lmms_engine/models/aero_realtime/modeling_aero_realtime.py

Lines changed: 21 additions & 24 deletions
@@ -518,8 +518,8 @@ def forward(
         Audio and video are kept as **separate** token streams in the input
         sequence (per-chunk envelope ``[VS][AS][video_pad×S][audio_pad×N]
         [AE][VE]``) so time alignment is expressed entirely through token
-        order and RoPE. Each modality's features are simply added at the
-        positions of their corresponding placeholder tokens.
+        order and RoPE. Vision features replace vision placeholders; audio
+        features are added to the realtime text stream on audio placeholders.
 
         Modality combinations:

@@ -528,22 +528,22 @@
             positions. ``text_stream_ids`` is not used.
 
         **Video mode** (``pixel_values_videos`` + ``video_grid_thw``):
-            Video features are **added** to embeddings at
+            Video features are scattered (replace) at
             ``video_token_index`` positions.
 
         **Audio mode** (``input_features``):
             Audio features are **added** to embeddings at
             ``audio_token_index`` positions.
 
-        **Video + Audio**: both add paths run independently on their own
-            token positions. ``text_stream_ids`` carries the realtime markers
-            (``<|rt_start|>``, ``<|rt_pad|>``, ``<|rt_speak|>``) at the
-            envelope positions and is used for the input embedding lookup.
+        **Video + Audio**: video placeholders receive pure vision features.
+            ``text_stream_ids`` carries realtime markers (``<|rt_start|>``,
+            ``<|rt_pad|>``, ``<|rt_speak|>``) only at audio positions, where audio
+            features are added to the realtime text embeddings.
 
         Pipeline:
         1. Embed ``text_stream_ids`` (if provided) or ``input_ids``.
         2. Image features → scatter at ``image_token_index``.
-        3. Video features → add at ``video_token_index``.
+        3. Video features → scatter at ``video_token_index``.
         4. Audio features → add at ``audio_token_index``.
         5. Forward through the language model.
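
To make the five-step pipeline concrete, here is a compact fusion sketch under the same conventions (toy stand-ins, image step omitted; a sketch of the described behavior, not the repository implementation):

import torch
import torch.nn as nn

def fuse_streams(embed: nn.Embedding,
                 input_ids: torch.Tensor,        # [B, T] placeholder layout
                 text_stream_ids: torch.Tensor,  # [B, T] rt markers at audio slots
                 video_features: torch.Tensor,   # [n_video_pads, H]
                 audio_features: torch.Tensor,   # [n_audio_pads, H]
                 video_token_index: int,
                 audio_token_index: int) -> torch.Tensor:
    # Step 1: embed the realtime text stream, not input_ids.
    inputs_embeds = embed(text_stream_ids)

    # Step 3: video features *replace* video placeholder embeddings.
    video_mask = input_ids == video_token_index
    expanded = video_mask.unsqueeze(-1).expand_as(inputs_embeds)
    inputs_embeds = inputs_embeds.masked_scatter(
        expanded, video_features.to(inputs_embeds.dtype)
    )

    # Step 4: audio features are *added* on top of the rt_* embeddings,
    # so the realtime text conditioning survives at audio positions.
    audio_mask = input_ids == audio_token_index
    inputs_embeds[audio_mask] = (
        inputs_embeds[audio_mask] + audio_features.to(inputs_embeds.dtype)
    )
    return inputs_embeds

The asymmetry is the point of the commit: replacement at video slots keeps vision features pure, while addition at audio slots superimposes audio evidence on the rt_pad/rt_speak token embeddings.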
@@ -552,9 +552,9 @@ def forward(
                 Shape ``[batch_size, seq_len]``. Used to determine the
                 position masks for image/video/audio features.
             text_stream_ids: Parallel text-stream token ids.
-                Shape ``[batch_size, seq_len]``. At video/audio positions
-                contains ``<|rt_start|>``, ``<|rt_pad|>``, ``<|rt_speak|>``,
-                or actual text tokens; mirrors ``input_ids`` elsewhere.
+                Shape ``[batch_size, seq_len]``. At audio positions contains
+                ``<|rt_start|>``, ``<|rt_pad|>``, ``<|rt_speak|>``, or actual
+                text tokens; mirrors ``input_ids`` elsewhere.
                 If not provided, falls back to ``input_ids``.
             pixel_values: Image pixel values (flat across batch).
             image_grid_thw: Grid info per image. ``[num_images, 3]``.

@@ -569,9 +569,8 @@ def forward(
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         # Determine which token ids to use for embedding
-        # text_stream_ids provides the actual text tokens (including rt_start,
-        # rt_pad, rt_speak) at video/audio positions. input_ids is only used
-        # for determining the position mask.
+        # text_stream_ids provides the realtime text tokens at audio positions.
+        # input_ids is used for determining modality placeholder masks.
         embed_ids = text_stream_ids if text_stream_ids is not None else input_ids
 
         # ----------------------------------------------------------------

@@ -602,7 +601,7 @@ def forward(
         )
 
         # ----------------------------------------------------------------
-        # 3. Video features — extract (additive fusion happens below)
+        # 3. Video features — extract (scatter happens below)
         # ----------------------------------------------------------------
         video_features = None
         if pixel_values_videos is not None:

@@ -625,12 +624,11 @@ def forward(
             audio_features_flat = audio_features.reshape(-1, audio_features.shape[-1])
 
         # ----------------------------------------------------------------
-        # 5. Add video / audio features to text-stream embeddings
-        #    (independent paths — time alignment is handled by token order
-        #    within each per-chunk envelope, not by feature-level fusion)
+        # 5. Scatter video features and add audio features to text-stream embeddings.
+        #    Realtime text conditioning lives only on the audio timeline.
         # ----------------------------------------------------------------
 
-        # 5a. Video features -> add at video_token_index positions
+        # 5a. Video features -> scatter at video_token_index positions
         if video_features is not None:
             video_mask = input_ids == self.config.video_token_index
             n_video_tokens = video_mask.sum().item()

@@ -640,12 +638,11 @@ def forward(
                 raise ValueError(
                     f"Video token count ({n_video_tokens}) does not match " f"video feature count ({n_video_features})."
                 )
-            video_mask_flat = video_mask.reshape(-1)
-            inputs_embeds_flat = inputs_embeds.reshape(-1, inputs_embeds.shape[-1])
-            inputs_embeds_flat[video_mask_flat] = inputs_embeds_flat[video_mask_flat] + video_features.to(
-                inputs_embeds.dtype
+            video_mask_expanded = video_mask.unsqueeze(-1).expand_as(inputs_embeds)
+            inputs_embeds = inputs_embeds.masked_scatter(
+                video_mask_expanded,
+                video_features.to(inputs_embeds.dtype),
             )
-            inputs_embeds = inputs_embeds_flat.reshape(inputs_embeds.shape)
 
         # 5b. Audio features -> add at audio_token_index positions
         if audio_features_flat is not None:

src/lmms_engine/models/aero_realtime/processing_aero_realtime.py

Lines changed: 46 additions & 27 deletions
@@ -74,8 +74,8 @@ class AeroRealtimeProcessor(ProcessorMixin):
     - Text tokenization with placeholder expansion for images, videos, and
       audio tokens.
     - Construction of ``text_stream_ids`` carrying the realtime markers
-      (``<|rt_start|>``, ``<|rt_pad|>``, ``<|rt_speak|>``) when audio
-      is present (streaming mode).
+      (``<|rt_start|>``, ``<|rt_pad|>``, ``<|rt_speak|>``) on audio
+      positions when audio is present (streaming mode).
 
     Args:
         image_processor: Image processor instance (e.g. ``Qwen2VLImageProcessor``).

@@ -630,18 +630,17 @@ def _build_text_stream_ids(
     ) -> Union[list, torch.Tensor]:
         """Build ``text_stream_ids`` for the realtime dual-stream design.
 
-        ``text_stream_ids`` mirrors ``input_ids`` everywhere except inside
-        the multimodal regions, where it carries the realtime-text-stream
+        ``text_stream_ids`` mirrors ``input_ids`` everywhere except audio
+        placeholder positions, where it carries the realtime-text-stream
         markers (``<|rt_start|>``, ``<|rt_pad|>``, ``<|rt_speak|>``).
 
         Streaming mode is gated on the presence of audio. Two layouts:
 
         - **video + audio (interleave)**: input contains per-chunk envelopes
-          ``[VS][AS][video_pad×S][audio_pad×N][AE][VE]``. All envelope and
-          pad positions become ``<|rt_pad|>`` (model stays silent over
-          vision); the very first ``video_pad`` of the first chunk becomes
-          ``<|rt_start|>``; the first audio_pad of the first chunk whose
-          start time ``>= delay_seconds`` becomes ``<|rt_speak|>``.
+          ``[VS][AS][video_pad×S][audio_pad×N][AE][VE]``. Video placeholders
+          stay as ``<|video_pad|>``; only audio placeholders carry
+          ``<|rt_pad|>``, with the first audio placeholder as ``<|rt_start|>``
+          and the first delayed audio placeholder as ``<|rt_speak|>``.
         - **audio-only**: ``[AS][audio_pad×N][AE]``. First ``audio_pad``
           becomes ``<|rt_start|>``; the first audio_pad whose timestamp
          ``>= delay_seconds`` becomes ``<|rt_speak|>``.

@@ -656,7 +655,6 @@ def _build_text_stream_ids(
         rt_pad_id = self.tokenizer.convert_tokens_to_ids(self.rt_pad_token)
         rt_speak_id = self.tokenizer.convert_tokens_to_ids(self.rt_speak_token)
 
-        video_pad_id = self.tokenizer.convert_tokens_to_ids(self.video_token)
         audio_pad_id = self.tokenizer.convert_tokens_to_ids(self.audio_token)
         vision_start_id = self.tokenizer.convert_tokens_to_ids(self.vision_start_token)
         vision_end_id = self.tokenizer.convert_tokens_to_ids(self.vision_end_token)

@@ -681,7 +679,6 @@ def _build_text_stream_ids(
             vision_end_id=vision_end_id,
             audio_start_id=audio_start_id,
             audio_end_id=audio_end_id,
-            video_pad_id=video_pad_id,
             audio_pad_id=audio_pad_id,
             rt_start_id=rt_start_id,
             rt_pad_id=rt_pad_id,

@@ -721,19 +718,20 @@ def _fill_text_stream_video_audio(
         vision_end_id: int,
         audio_start_id: int,
         audio_end_id: int,
-        video_pad_id: int,
         audio_pad_id: int,
         rt_start_id: int,
         rt_pad_id: int,
         rt_speak_id: int,
     ) -> None:
         """In-place fill of text_stream for the interleaved video+audio mode.
 
-        Only ``<|video_pad|>`` and ``<|audio_pad|>`` positions (which receive
-        added vision / audio features in the model) are overwritten:
-        - all video_pad / audio_pad slots → ``<|rt_pad|>``
-        - first chunk's first video_pad → ``<|rt_start|>``
-        - speak chunk's first audio_pad → ``<|rt_speak|>``
+        Only ``<|audio_pad|>`` positions are overwritten:
+        - all audio_pad slots -> ``<|rt_pad|>``
+        - first audio_pad -> ``<|rt_start|>``
+        - first delayed audio_pad -> ``<|rt_speak|>``
+
+        ``<|video_pad|>`` positions keep their original ids because video
+        features replace those embeddings in the model.
 
         Envelope boundary tokens (``<t.t seconds>``, ``<|vision_start|>``,
         ``<|audio_start|>``, ``<|audio_end|>``, ``<|vision_end|>``) keep
@@ -788,22 +786,43 @@ def _fill_text_stream_video_audio(
             #   as_+spatial+1 .. ae-1: <|audio_pad|> × N_t
             #   ae: <|audio_end|>
             #   ve: <|vision_end|>
-            video_pad_start = as_ + 1
-            video_pad_end = as_ + spatial  # inclusive
             audio_pad_start = as_ + spatial + 1
             audio_pad_end = ae - 1  # inclusive
 
-            for k in range(video_pad_start, video_pad_end + 1):
-                stream[k] = rt_pad_id
             for k in range(audio_pad_start, audio_pad_end + 1):
                 stream[k] = rt_pad_id
 
-            # rt_start: first chunk's first video_pad
-            if c_idx == 0 and video_pad_start <= video_pad_end:
-                stream[video_pad_start] = rt_start_id
-            # rt_speak: speak chunk's first audio_pad
-            if c_idx == speak_chunk and audio_pad_start <= audio_pad_end:
-                stream[audio_pad_start] = rt_speak_id
+        audio_ranges = []
+        for c_idx, ((_, as_, ae, _), (_, _, _, spatial)) in enumerate(zip(envelopes, chunks)):
+            audio_pad_start = as_ + spatial + 1
+            audio_pad_end = ae - 1
+            if audio_pad_start <= audio_pad_end:
+                audio_ranges.append((c_idx, audio_pad_start, audio_pad_end))
+
+        if not audio_ranges:
+            return
+
+        first_audio_pos = audio_ranges[0][1]
+        speak_pos = None
+        for c_idx, audio_pad_start, _ in audio_ranges:
+            if c_idx >= speak_chunk:
+                speak_pos = audio_pad_start
+                break
+        if speak_pos is None:
+            speak_pos = audio_ranges[-1][1]
+
+        if speak_pos == first_audio_pos:
+            for _, audio_pad_start, audio_pad_end in audio_ranges:
+                if audio_pad_start <= first_audio_pos < audio_pad_end:
+                    speak_pos = first_audio_pos + 1
+                    break
+                if audio_pad_start > first_audio_pos:
+                    speak_pos = audio_pad_start
+                    break
+
+        stream[first_audio_pos] = rt_start_id
+        if speak_pos != first_audio_pos:
+            stream[speak_pos] = rt_speak_id
 
     def _fill_text_stream_audio_only(
         self,
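
A toy walkthrough of the marker placement above (standalone, not repo code): two envelopes shaped [VS][AS][video_pad×2][audio_pad×3][AE][VE], with speak_chunk = 1.

# Indices within one chunk: 0:VS 1:AS 2-3:video_pad 4-6:audio_pad 7:AE 8:VE
chunk = ["VS", "AS", "v", "v", "a", "a", "a", "AE", "VE"]
stream = chunk + chunk
spatial, speak_chunk = 2, 1            # 2 video pads per chunk; speak in chunk 1

audio_ranges = []                      # (c_idx, audio_pad_start, audio_pad_end)
for c_idx in range(2):
    base = c_idx * len(chunk)
    as_, ae = base + 1, base + 7       # <|audio_start|> / <|audio_end|> positions
    start, end = as_ + spatial + 1, ae - 1
    audio_ranges.append((c_idx, start, end))
    for k in range(start, end + 1):    # all audio pads -> rt_pad
        stream[k] = "<|rt_pad|>"

first_audio_pos = audio_ranges[0][1]   # position 4
speak_pos = next((s for c, s, _ in audio_ranges if c >= speak_chunk),
                 audio_ranges[-1][1])  # position 13: chunk 1's first audio pad

stream[first_audio_pos] = "<|rt_start|>"
if speak_pos != first_audio_pos:
    stream[speak_pos] = "<|rt_speak|>"

# Video pads ("v") and envelope tokens are untouched, matching the docstring.

The extra speak_pos == first_audio_pos branch in the real code handles the degenerate case where the delay boundary lands on the very first audio pad, nudging rt_speak one position forward so rt_start is never overwritten.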
