@@ -74,8 +74,8 @@ class AeroRealtimeProcessor(ProcessorMixin):
7474 - Text tokenization with placeholder expansion for images, videos, and
7575 audio tokens.
7676 - Construction of ``text_stream_ids`` carrying the realtime markers
77- (``<|rt_start|>``, ``<|rt_pad|>``, ``<|rt_speak|>``) when audio
78- is present (streaming mode).
77+ (``<|rt_start|>``, ``<|rt_pad|>``, ``<|rt_speak|>``) on audio
78+ positions when audio is present (streaming mode).
7979
8080 Args:
8181 image_processor: Image processor instance (e.g. ``Qwen2VLImageProcessor``).
@@ -630,18 +630,17 @@ def _build_text_stream_ids(
630630 ) -> Union [list , torch .Tensor ]:
631631 """Build ``text_stream_ids`` for the realtime dual-stream design.
632632
633- ``text_stream_ids`` mirrors ``input_ids`` everywhere except inside
634- the multimodal regions , where it carries the realtime-text-stream
633+ ``text_stream_ids`` mirrors ``input_ids`` everywhere except at audio
634+ placeholder positions, where it carries the realtime-text-stream
635635 markers (``<|rt_start|>``, ``<|rt_pad|>``, ``<|rt_speak|>``).
636636
637637 Streaming mode is gated on the presence of audio. Two layouts:
638638
639639 - **video + audio (interleave)**: input contains per-chunk envelopes
640- ``[VS][AS][video_pad×S][audio_pad×N][AE][VE]``. All envelope and
641- pad positions become ``<|rt_pad|>`` (model stays silent over
642- vision); the very first ``video_pad`` of the first chunk becomes
643- ``<|rt_start|>``; the first audio_pad of the first chunk whose
644- start time ``>= delay_seconds`` becomes ``<|rt_speak|>``.
640+ ``[VS][AS][video_pad×S][audio_pad×N][AE][VE]``. Video placeholders
641+ stay as ``<|video_pad|>``; only audio placeholders carry
642+ ``<|rt_pad|>``, with the first audio placeholder as ``<|rt_start|>``
643+ and the first audio placeholder at time ``>= delay_seconds`` as ``<|rt_speak|>``.
645644 - **audio-only**: ``[AS][audio_pad×N][AE]``. First ``audio_pad``
646645 becomes ``<|rt_start|>``; the first audio_pad whose timestamp
647646 ``>= delay_seconds`` becomes ``<|rt_speak|>``.
@@ -656,7 +655,6 @@ def _build_text_stream_ids(
656655 rt_pad_id = self .tokenizer .convert_tokens_to_ids (self .rt_pad_token )
657656 rt_speak_id = self .tokenizer .convert_tokens_to_ids (self .rt_speak_token )
658657
659- video_pad_id = self .tokenizer .convert_tokens_to_ids (self .video_token )
660658 audio_pad_id = self .tokenizer .convert_tokens_to_ids (self .audio_token )
661659 vision_start_id = self .tokenizer .convert_tokens_to_ids (self .vision_start_token )
662660 vision_end_id = self .tokenizer .convert_tokens_to_ids (self .vision_end_token )
@@ -681,7 +679,6 @@ def _build_text_stream_ids(
681679 vision_end_id = vision_end_id ,
682680 audio_start_id = audio_start_id ,
683681 audio_end_id = audio_end_id ,
684- video_pad_id = video_pad_id ,
685682 audio_pad_id = audio_pad_id ,
686683 rt_start_id = rt_start_id ,
687684 rt_pad_id = rt_pad_id ,
@@ -721,19 +718,20 @@ def _fill_text_stream_video_audio(
721718 vision_end_id : int ,
722719 audio_start_id : int ,
723720 audio_end_id : int ,
724- video_pad_id : int ,
725721 audio_pad_id : int ,
726722 rt_start_id : int ,
727723 rt_pad_id : int ,
728724 rt_speak_id : int ,
729725 ) -> None :
730726 """In-place fill of text_stream for the interleaved video+audio mode.
731727
732- Only ``<|video_pad|>`` and ``<|audio_pad|>`` positions (which receive
733- added vision / audio features in the model) are overwritten:
734- - all video_pad / audio_pad slots → ``<|rt_pad|>``
735- - first chunk's first video_pad → ``<|rt_start|>``
736- - speak chunk's first audio_pad → ``<|rt_speak|>``
728+ Only ``<|audio_pad|>`` positions are overwritten:
729+ - all audio_pad slots -> ``<|rt_pad|>``
730+ - first audio_pad -> ``<|rt_start|>``
731+ - first audio_pad at or after ``delay_seconds`` -> ``<|rt_speak|>``
732+
733+ ``<|video_pad|>`` positions keep their original ids because video
734+ features replace those embeddings in the model.
737735
738736 Envelope boundary tokens (``<t.t seconds>``, ``<|vision_start|>``,
739737 ``<|audio_start|>``, ``<|audio_end|>``, ``<|vision_end|>``) keep
@@ -788,22 +786,43 @@ def _fill_text_stream_video_audio(
788786 # as_+spatial+1 .. ae-1: <|audio_pad|> × N_t
789787 # ae: <|audio_end|>
790788 # ve: <|vision_end|>
791- video_pad_start = as_ + 1
792- video_pad_end = as_ + spatial # inclusive
793789 audio_pad_start = as_ + spatial + 1
794790 audio_pad_end = ae - 1 # inclusive
795791
796- for k in range (video_pad_start , video_pad_end + 1 ):
797- stream [k ] = rt_pad_id
798792 for k in range (audio_pad_start , audio_pad_end + 1 ):
799793 stream [k ] = rt_pad_id
800794
801- # rt_start: first chunk's first video_pad
802- if c_idx == 0 and video_pad_start <= video_pad_end :
803- stream [video_pad_start ] = rt_start_id
804- # rt_speak: speak chunk's first audio_pad
805- if c_idx == speak_chunk and audio_pad_start <= audio_pad_end :
806- stream [audio_pad_start ] = rt_speak_id
795+ audio_ranges = []
796+ for c_idx , ((_ , as_ , ae , _ ), (_ , _ , _ , spatial )) in enumerate (zip (envelopes , chunks )):
797+ audio_pad_start = as_ + spatial + 1
798+ audio_pad_end = ae - 1
799+ if audio_pad_start <= audio_pad_end :
800+ audio_ranges .append ((c_idx , audio_pad_start , audio_pad_end ))
801+
802+ if not audio_ranges :
803+ return
804+
805+ first_audio_pos = audio_ranges [0 ][1 ]
806+ speak_pos = None
807+ for c_idx , audio_pad_start , _ in audio_ranges :
808+ if c_idx >= speak_chunk :
809+ speak_pos = audio_pad_start
810+ break
811+ if speak_pos is None :
812+ speak_pos = audio_ranges [- 1 ][1 ]
813+
814+ if speak_pos == first_audio_pos :
815+ for _ , audio_pad_start , audio_pad_end in audio_ranges :
816+ if audio_pad_start <= first_audio_pos < audio_pad_end :
817+ speak_pos = first_audio_pos + 1
818+ break
819+ if audio_pad_start > first_audio_pos :
820+ speak_pos = audio_pad_start
821+ break
822+
823+ stream [first_audio_pos ] = rt_start_id
824+ if speak_pos != first_audio_pos :
825+ stream [speak_pos ] = rt_speak_id
807826
808827 def _fill_text_stream_audio_only (
809828 self ,
0 commit comments