@@ -146,7 +146,6 @@ def process(
         audios: Optional[List[np.ndarray]] = None,
         sampling_rate: Optional[int] = None,
         videos=None,
-        video_metadata=None,
         realtime_segments: Optional[List[Dict]] = None,
         system_message: str = "You are a helpful assistant",
         add_system_prompt=True,
@@ -162,14 +161,13 @@ def process(
             audios: List of audio waveforms (mono, float32, at sampling_rate).
             sampling_rate: Audio sampling rate.
             videos: List of video frames (numpy arrays, TCHW format).
-            video_metadata: Video metadata for timestamp computation.
-                If not provided, computed from video processor.
             realtime_segments: List of ``{"start_sec": float, "text": str}``
                 dicts extracted from assistant ``realtime_text`` content items.
                 If None, this is treated as normal video QA.
             system_message: System prompt text.
             add_system_prompt: Whether to add a system prompt.
-            **kwargs: Additional kwargs (e.g. ``fps``, ``do_sample_frames``).
+            **kwargs: Forwarded to the model processor (e.g. ``fps``,
+                ``do_sample_frames``, ``video_metadata``).
 
         Returns:
             Dict with ``input_ids``, ``text_stream_ids``, ``labels``, and
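Reviewer note: for orientation, a minimal call sketch of the updated signature. The variable names (`collator`, `messages`, `frames`, `waveform`) are illustrative and not from this PR; `video_metadata` is now simply forwarded through `**kwargs` when needed.

```python
# Illustrative usage only; assumes `collator` is an instance of this data
# collator and that `messages`, `frames`, `waveform` were prepared upstream.
batch = collator.process(
    messages,                       # HF-style chat messages
    audios=[waveform],              # mono float32 waveform at `sampling_rate`
    sampling_rate=16000,
    videos=[frames],                # one video as a TCHW numpy array
    system_message="You are a helpful assistant",
    fps=2.0,                        # forwarded to the model processor via **kwargs
)
# batch["input_ids"], batch["labels"], and (with audio) batch["text_stream_ids"]
```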
@@ -200,9 +198,6 @@ def process(
         _video_metadata = None
         if videos is not None:
             videos_kwargs = output_kwargs.get("videos_kwargs", {})
-            videos_kwargs["return_metadata"] = True
-            if video_metadata is not None:
-                videos_kwargs["video_metadata"] = video_metadata
             video_inputs = self.processor.video_processor(videos=videos, return_tensors="pt", **videos_kwargs)
             video_grid_thw = video_inputs["video_grid_thw"]
             _video_metadata = video_inputs.pop("video_metadata")
@@ -239,6 +234,35 @@ def process(
             num_video_tokens = None
 
         has_video = video_grid_thw is not None
+        has_audio = bool(audio_inputs)
+
+        # Per-video audio token splits across video temporal chunks.
+        # Required for envelope construction when both video and audio are
+        # present (the inner ``<|audio_pad|>`` count of each per-chunk envelope).
+        audio_per_chunk_per_video = None
+        if has_video and has_audio:
+            mel_lengths = audio_inputs["audio_attention_mask"].sum(-1)
+            num_audio_tokens_list = [self.processor._get_num_audio_tokens(int(m.item())) for m in mel_lengths]
+            temporal_patch_size = getattr(self.processor.video_processor, "temporal_patch_size", 2)
+            audio_per_chunk_per_video = []
+            for v_idx in range(len(video_grid_thw)):
+                metadata = _video_metadata[v_idx]
+                fps = metadata.fps if metadata.fps is not None else 24.0
+                grid_t = int(video_grid_thw[v_idx][0])
+                second_per_grid = temporal_patch_size / fps
+                # Audio sample paired with this video by positional index
+                a_idx = v_idx if v_idx < len(num_audio_tokens_list) else 0
+                n_audio = num_audio_tokens_list[a_idx]
+                audio_duration = self.processor._get_audio_duration_seconds(audio_inputs["audio_attention_mask"][a_idx])
+                audio_rate = (n_audio / audio_duration) if audio_duration > 0 else 0.0
+                audio_per_chunk_per_video.append(
+                    self.processor._split_audio_across_chunks(
+                        n_audio=n_audio,
+                        grid_t=grid_t,
+                        second_per_grid=second_per_grid,
+                        audio_rate=audio_rate,
+                    )
+                )
 
         # ==============================================================
         # 5. Build input_ids, text_stream_ids, labels
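`_split_audio_across_chunks` itself is not part of this diff. As a rough mental model (an assumption, not the actual implementation), each temporal chunk spans `second_per_grid` seconds and should receive about `second_per_grid * audio_rate` audio tokens, with the leftover folded into the last chunk so the counts sum to `n_audio`:

```python
from typing import List

def split_audio_across_chunks_sketch(
    n_audio: int, grid_t: int, second_per_grid: float, audio_rate: float
) -> List[int]:
    """Hypothetical stand-in for processor._split_audio_across_chunks."""
    per_chunk, remaining = [], n_audio
    for t in range(grid_t):
        if t == grid_t - 1:
            take = remaining  # last chunk absorbs any leftover audio tokens
        else:
            take = min(remaining, round(second_per_grid * audio_rate))
        per_chunk.append(int(take))
        remaining -= take
    return per_chunk

# e.g. 100 audio tokens over 4 chunks of 2 s at 12.5 tokens/s -> [25, 25, 25, 25]
```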
@@ -250,6 +274,8 @@ def process(
             num_video_tokens=num_video_tokens,
             video_grid_thw=video_grid_thw,
             video_metadata=_video_metadata,
+            audio_per_chunk_per_video=audio_per_chunk_per_video,
+            audio_attention_mask=audio_inputs.get("audio_attention_mask") if has_audio else None,
             system_message=system_message,
             add_system_prompt=add_system_prompt,
         )
@@ -259,17 +285,7 @@ def process(
             raise RuntimeError("Not implemented yet")
 
         # ==============================================================
-        # 6. Compute video_timestep and audio_timestep
-        # ==============================================================
-        if video_grid_thw is not None and _video_metadata is not None:
-            inputs["video_timestep"] = self.processor._compute_video_timestep(video_grid_thw, _video_metadata)
-
-        if audio_inputs:
-            audio_mask = audio_inputs["audio_attention_mask"]
-            inputs["audio_timestep"] = self.processor._compute_audio_timestep(audio_mask)
-
-        # ==============================================================
-        # 7. Pack vision/audio tensors into output
+        # 6. Pack vision/audio tensors into output
         # ==============================================================
         if images is not None:
             inputs["pixel_values"] = image_inputs["pixel_values"]
@@ -297,26 +313,33 @@ def _build_normal_qa_ids_and_labels(
         num_video_tokens: Optional[List[int]],
         video_grid_thw=None,
         video_metadata=None,
+        audio_per_chunk_per_video: Optional[List[List[int]]] = None,
+        audio_attention_mask: Optional[torch.Tensor] = None,
         realtime_segments: Optional[List[Dict]] = None,
         system_message: str = "You are a helpful assistant",
         add_system_prompt: bool = True,
     ) -> dict:
         """Build input_ids, text_stream_ids, and labels from HF messages.
 
-        For normal video QA: text_stream_ids has rt_start/rt_pad/rt_speak
-        with all rt_pad after rt_speak (model learns to stay silent).
+        For normal video QA the text_stream_ids only differ from input_ids
+        in the multimodal pad regions:
+            - all ``<|video_pad|>`` and ``<|audio_pad|>`` slots → ``<|rt_pad|>``
+            - first chunk's first ``<|video_pad|>`` → ``<|rt_start|>``
+            - speak chunk's first ``<|audio_pad|>`` → ``<|rt_speak|>``
 
-        For realtime training: text_stream_ids has actual text tokens placed
-        at the right temporal positions after rt_speak.
+        Envelope boundary tokens (timestamps, vision_start/end,
+        audio_start/end) keep their original ids in text_stream_ids so the
+        LM sees the same special tokens it would in input_ids.
         """
         results = self.get_qwen_template_labels(
             hf_messages,
             num_image_tokens,
             num_video_tokens,
             video_metadata,
             video_grid_thw,
-            system_message,
-            add_system_prompt,
+            audio_per_chunk_per_video=audio_per_chunk_per_video,
+            system_message=system_message,
+            add_system_prompt=add_system_prompt,
         )
         input_id = results["input_ids"].tolist()
         target = results["labels"].tolist()
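To make the docstring above concrete, a toy example with token strings instead of ids (sizes are made up: 2 spatial video slots and 3 audio slots in a single chunk that is also the speak chunk):

```python
# Toy illustration only; the real code operates on token ids, not strings.
ids = (
    ["<3.0 seconds>", "<|vision_start|>", "<|audio_start|>"]
    + ["<|video_pad|>"] * 2
    + ["<|audio_pad|>"] * 3
    + ["<|audio_end|>", "<|vision_end|>"]
)
stream = list(ids)
vp = [i for i, t in enumerate(ids) if t == "<|video_pad|>"]
ap = [i for i, t in enumerate(ids) if t == "<|audio_pad|>"]
for i in vp + ap:                 # every pad slot becomes <|rt_pad|> ...
    stream[i] = "<|rt_pad|>"
stream[vp[0]] = "<|rt_start|>"    # ... except the first video pad of chunk 0
stream[ap[0]] = "<|rt_speak|>"    # ... and the first audio pad of the speak chunk
# Timestamp and start/end tokens keep their original values in `stream`.
```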
@@ -325,59 +348,53 @@ def _build_normal_qa_ids_and_labels(
         # ==============================================================
         # Build text_stream_ids
         has_video = video_grid_thw is not None
+        has_audio = audio_attention_mask is not None
         text_stream_id = list(input_id)  # start as a copy of input_ids
 
-        if has_video:
-            vision_start_id = self.tokenizer.convert_tokens_to_ids(self.processor.vision_start_token)
-            vision_end_id = self.tokenizer.convert_tokens_to_ids(self.processor.vision_end_token)
-            temporal_patch_size = getattr(self.processor.video_processor, "temporal_patch_size", 2)
-
-            # Pre-compute per-frame timestamps for all videos
-            all_frame_timestamps = []
-            for v_idx in range(len(video_grid_thw)):
-                metadata = video_metadata[v_idx]
-                fps = metadata.fps if metadata.fps is not None else 24.0
-                timestamps = self.processor._calculate_timestamps(metadata.frames_indices, fps, temporal_patch_size)
-                all_frame_timestamps.extend(timestamps)
-
-            input_id_t = torch.tensor(input_id)
-            vs_positions = (input_id_t == vision_start_id).nonzero(as_tuple=True)[0].tolist()
-            ve_positions = (input_id_t == vision_end_id).nonzero(as_tuple=True)[0].tolist()
-
-            assert len(all_frame_timestamps) == len(vs_positions), "The timestamps and frame number should be equal"
-
-            # Find the first frame whose timestamp >= delay_seconds
-            speak_frame = len(all_frame_timestamps) - 1  # fallback to last frame
-            for idx, ts in enumerate(all_frame_timestamps):
-                if ts >= self.processor.delay_seconds:
-                    speak_frame = idx
-                    break
-
-            # Fill text_stream_id for each frame's [VS][VP*N][VE] region
-            for idx, (vs, ve) in enumerate(zip(vs_positions, ve_positions)):
-                # VS and VE → rt_pad
-                text_stream_id[vs] = self.rt_pad_id
-                text_stream_id[ve] = self.rt_pad_id
-                # VP region (vs+1 to ve-1) → rt_pad
-                for k in range(vs + 1, ve):
-                    text_stream_id[k] = self.rt_pad_id
-                # First frame: place rt_start at first VP position
-                if idx == 0:
-                    text_stream_id[vs + 1] = self.rt_start_id
-                # Delay frame: place rt_speak at first VP position
-                if idx == speak_frame:
-                    text_stream_id[vs + 1] = self.rt_speak_id
+        if has_video and has_audio:
+            # video + audio: per-chunk envelope filler
+            self.processor._fill_text_stream_video_audio(
+                stream=text_stream_id,
+                video_grid_thw=video_grid_thw,
+                video_metadata=video_metadata,
+                temporal_patch_size=getattr(self.processor.video_processor, "temporal_patch_size", 2),
+                vision_start_id=self.tokenizer.convert_tokens_to_ids(self.processor.vision_start_token),
+                vision_end_id=self.tokenizer.convert_tokens_to_ids(self.processor.vision_end_token),
+                audio_start_id=self.tokenizer.convert_tokens_to_ids(self.processor.audio_start_token),
+                audio_end_id=self.tokenizer.convert_tokens_to_ids(self.processor.audio_end_token),
+                video_pad_id=self.video_token_id,
+                audio_pad_id=self.audio_token_id,
+                rt_start_id=self.rt_start_id,
+                rt_pad_id=self.rt_pad_id,
+                rt_speak_id=self.rt_speak_id,
+            )
+        elif has_audio:
+            # audio-only: single envelope per audio sample
+            n_samples = audio_attention_mask.shape[0]
+            for s_idx in range(n_samples):
+                self.processor._fill_text_stream_audio_only(
+                    stream=text_stream_id,
+                    sample_idx=s_idx,
+                    audio_attention_mask=audio_attention_mask,
+                    audio_start_id=self.tokenizer.convert_tokens_to_ids(self.processor.audio_start_token),
+                    audio_end_id=self.tokenizer.convert_tokens_to_ids(self.processor.audio_end_token),
+                    audio_pad_id=self.audio_token_id,
+                    rt_start_id=self.rt_start_id,
+                    rt_pad_id=self.rt_pad_id,
+                    rt_speak_id=self.rt_speak_id,
+                )
+        # video-only (no audio): no text_stream_ids (matches processor)
 
         input_id = torch.tensor(input_id, dtype=torch.long)
         target = torch.tensor(target, dtype=torch.long)
-        text_stream_id = torch.tensor(text_stream_id, dtype=torch.long)
 
         result = dict(
             input_ids=input_id,
             labels=target,
         )
-        if has_video:
-            result["text_stream_ids"] = text_stream_id
+        # text_stream_ids only when audio is present (= streaming mode)
+        if has_audio:
+            result["text_stream_ids"] = torch.tensor(text_stream_id, dtype=torch.long)
 
         return result
 
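A small invariant check that follows from the rules above may be useful in tests; `out` is the dict returned by `_build_normal_qa_ids_and_labels`, and the id arguments are the collator's pad/rt token ids (a sketch, not part of this PR):

```python
import torch

def check_stream_alignment(out, video_pad_id, audio_pad_id, rt_ids):
    """Sketch of a test-time invariant: text_stream_ids may only differ from
    input_ids inside video/audio pad regions, and only with rt_* ids."""
    ids, stream = out["input_ids"], out["text_stream_ids"]
    assert ids.shape == stream.shape
    diff = ids != stream
    pad_mask = (ids == video_pad_id) | (ids == audio_pad_id)
    assert bool(torch.all(~diff | pad_mask)), "stream differs outside pad regions"
    assert set(stream[diff].tolist()) <= set(rt_ids), "non-rt token in stream"
```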
@@ -388,6 +405,7 @@ def get_qwen_template_labels(
         num_video_tokens: List[int],
         video_metadata: List[dict],
         video_grid_thw=None,
+        audio_per_chunk_per_video: Optional[List[List[int]]] = None,
         system_message: str = "You are a helpful assistant",
         add_system_prompt: bool = True,
         add_generation_prompt: bool = False,
@@ -426,6 +444,7 @@ def get_qwen_template_labels(
                     video_start_from,
                     curr_timestamp,
                     video_grid_thw,
+                    audio_per_chunk_per_video=audio_per_chunk_per_video,
                 )
                 video_start_from += used_video
 
@@ -449,6 +468,8 @@ def get_qwen_template_labels(
                 target[idx] = -100
             if encode_id == self.video_token_id:
                 target[idx] = -100
+            if encode_id == self.audio_token_id:
+                target[idx] = -100
 
         input_id = torch.tensor(input_id, dtype=torch.long)
         target = torch.tensor(target, dtype=torch.long)
@@ -458,6 +479,77 @@ def get_qwen_template_labels(
             labels=target,
         )
 
+    def _expand_encode_id_video_tokens(
+        self,
+        encode_id: List[int],
+        video_token_num: List[int],
+        start_from: int = 0,
+        curr_timestamp: List[float] = None,
+        video_grid_thw=None,
+        audio_per_chunk_per_video: Optional[List[List[int]]] = None,
+    ):
+        """Expand ``<|video_pad|>`` placeholders.
+
+        - Without audio: per-frame Qwen3VL legacy expansion (delegated to
+          parent).
+        - With audio: per-chunk envelope expansion matching the model
+          processor's path 5b layout::
+
+            <t.t seconds><|vision_start|><|audio_start|>
+            <|video_pad|>×spatial <|audio_pad|>×N_t
+            <|audio_end|><|vision_end|>
+        """
+        if audio_per_chunk_per_video is None:
+            return super()._expand_encode_id_video_tokens(
+                encode_id, video_token_num, start_from, curr_timestamp, video_grid_thw
+            )
+
+        merge_length = self.processor.video_processor.merge_size**2
+        vision_start_id = self.processor.vision_start_token_id
+        vision_end_id = self.processor.vision_end_token_id
+        audio_start_id = self.tokenizer.convert_tokens_to_ids(self.processor.audio_start_token)
+        audio_end_id = self.tokenizer.convert_tokens_to_ids(self.processor.audio_end_token)
+        temporal_patch_size = getattr(self.processor.video_processor, "temporal_patch_size", 2)
+
+        video_pos = [i for i, x in enumerate(encode_id) if x == self.video_token_id]
+        expanded_encode_id = []
+        prev = 0
+        for idx, pos in enumerate(video_pos):
+            v_global = idx + start_from
+            grid = video_grid_thw[v_global]
+            grid_t = int(grid[0])
+            spatial = int(grid[1:].prod() // merge_length)
+
+            # Per-chunk audio counts were computed upstream; curr_timestamp
+            # already holds per-frame timestamps in seconds, so use them
+            # directly as the chunk start times.
+            audio_per_chunk = audio_per_chunk_per_video[v_global]
+            assert len(audio_per_chunk) == grid_t, f"audio_per_chunk len {len(audio_per_chunk)} != grid_t {grid_t}"
+
+            # Strip surrounding <|vision_start|> / <|vision_end|> from the
+            # template (positions pos-1 and pos+1) -- we will emit our own.
+            expanded_encode_id.extend(encode_id[prev : pos - 1])
+
+            for t in range(grid_t):
+                # Per-frame timestamp (seconds) from the video metadata
+                t_sec = curr_timestamp[t] if t < len(curr_timestamp) else (t * temporal_patch_size)
+                timestamp_token_ids = self.processor.tokenizer.encode(f"<{t_sec:.1f} seconds>")
+                n_audio_t = audio_per_chunk[t]
+                expanded_encode_id.extend(timestamp_token_ids)
+                expanded_encode_id.append(vision_start_id)
+                expanded_encode_id.append(audio_start_id)
+                expanded_encode_id.extend([self.video_token_id] * spatial)
+                expanded_encode_id.extend([self.audio_token_id] * n_audio_t)
+                expanded_encode_id.append(audio_end_id)
+                expanded_encode_id.append(vision_end_id)
+
+            prev = pos + 2  # skip past original <|vision_end|>
+
+            if idx == len(video_pos) - 1:
+                expanded_encode_id.extend(encode_id[prev:])
+
+        return expanded_encode_id, len(video_pos)
+
     # ------------------------------------------------------------------
     # Chat template
     # ------------------------------------------------------------------
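Worked size example for the expansion above, with made-up values: `grid_t = 2`, `spatial = 4`, `audio_per_chunk = [5, 4]`, and assuming the encoded "<t.t seconds>" string costs 3 tokens:

```python
# Per chunk: timestamp + <|vision_start|> + <|audio_start|>
#            + spatial x <|video_pad|> + N_t x <|audio_pad|>
#            + <|audio_end|> + <|vision_end|>
timestamp_len = 3                      # assumed token length of "<t.t seconds>"
spatial = 4
audio_per_chunk = [5, 4]
chunk_lens = [timestamp_len + 2 + spatial + n + 2 for n in audio_per_chunk]
print(chunk_lens)                      # [16, 15] -> 31 tokens replace the original
                                       # <|vision_start|><|video_pad|><|vision_end|> triple
```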
@@ -484,7 +576,7 @@ def chat_template(self):
484576 "{% for content in message['content'] %}"
485577 "{% if 'audio' in content or 'audio_url' in content %}"
486578 "{% set audio_count.value = audio_count.value + 1 %}"
487- "<|AUDIO |>"
579+ "<|audio_pad |>"
488580 "{% elif content['type'] == 'image' or 'image' in content or 'image_url' in content %}"
489581 "{% set image_count.value = image_count.value + 1 %}"
490582 "<|vision_start|><|image_pad|><|vision_end|>"