Skip to content

Commit 3fbd6b9

Browse files
committed
feat(aero_realtime): initial implementation of realtime multimodal model
Add aero_realtime model with dual-stream additive design for realtime training, including: - Model architecture with dual-stream additive design - Training pipeline: processor, collator, dataset, monkey patch - LigerCE rmpad forward for memory-efficient training - Video token format aligned with Qwen3 VL - Silence fallback for videos without audio track - Weight init and forward test scripts
1 parent b928344 commit 3fbd6b9

20 files changed

Lines changed: 3874 additions & 0 deletions
Lines changed: 267 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
# AeroRealtime Training Configuration
# Trains the AeroRealtime model on LLaVA-Video-178K data (normal video QA mode).
#
# The dual-stream additive design is active: during video regions, the model
# receives additive vision+text embeddings and learns to stay silent (rt_pad)
# until spoken to (rt_speak boundary at delay_seconds).
#
# Audio is auto-extracted from video files by the dataset.
#
# Override paths at launch time:
#   model_config.load_from_pretrained_path=/path/to/aero_realtime_init
#   dataset_config.processor_config.processor_name=/path/to/aero_realtime_init
#   dataset_config.datasets.0.path=/path/to/parquet
#   dataset_config.datasets.0.data_folder=/path/to/video/root

trainer_type: fsdp2_trainer

dataset_config:
  dataset_type: aero_realtime_iterable
  dataset_format: yaml
  dataset_path: null
  processor_config:
    processor_name: null  # Override: path to aero_realtime_init checkpoint
    processor_type: aero_realtime
    extra_kwargs:
      video_max_pixels: 360448  # ~602*600 — moderate resolution for 4B model
      video_min_pixels: 28800
      image_max_pixels: 360448
      image_min_pixels: 28800

  # Inline dataset configuration — override paths at launch time
  datasets:
    - path: null          # Override: path to parquet file
      data_folder: null   # Override: root directory for video files
      data_type: parquet

  shuffle: true
  eval_dataset_path: null
  object_storage: none
  bucket_name: null

  # Packing disabled for initial training (video lengths vary)
  packing: false
  packing_strategy: first_fit
  packing_length: 8192
  filter_overlong: true
  filter_overlong_workers: 8
  max_length: null

  # Video sampling: 1 fps for 0-30s videos
  video_sampling_strategy: fps
  video_max_pixels: 360448
  video_max_frames: 64
  frame_num: 32
  fps: 1
  video_backend: qwen_vl_utils

  extra_kwargs: {}

trainer_args:
  output_dir: ./output/aero_realtime_training
  overwrite_output_dir: false
  do_train: true
  do_eval: false
  do_predict: false
  eval_strategy: 'no'
  prediction_loss_only: false

  # Batch size — start conservative for 4B+ model with video+audio
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 4
  eval_accumulation_steps: null
  eval_delay: 0
  torch_empty_cache_steps: null

  # Optimizer
  learning_rate: 1.0e-05
  weight_decay: 0.0
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  max_grad_norm: 1.0

  # Training schedule
  num_train_epochs: 1
  max_steps: -1
  lr_scheduler_type: cosine
  lr_scheduler_kwargs: {}
  warmup_ratio: 0.03
  warmup_steps: 0

  # Logging
  log_level: passive
  log_level_replica: warning
  log_on_each_node: true
  logging_dir: ./output/aero_realtime_training/runs
  logging_strategy: steps
  logging_first_step: false
  logging_steps: 1
  logging_nan_inf_filter: true

  # Checkpointing
  save_strategy: steps
  save_steps: 500
  save_total_limit: 2
  save_safetensors: true
  save_on_each_node: false
  save_only_model: false
  restore_callback_states_from_checkpoint: false

  # Device
  no_cuda: false
  use_cpu: false
  use_mps_device: false
  seed: 42
  data_seed: null
  jit_mode_eval: false

  # Precision
  bf16: true
  fp16: false
  fp16_opt_level: O1
  half_precision_backend: auto
  bf16_full_eval: false
  fp16_full_eval: false
  tf32: true

  # Distributed
  local_rank: 0
  ddp_backend: null
  tpu_num_cores: null
  tpu_metrics_debug: false
  debug: []

  # Dataloader
  dataloader_drop_last: true
  eval_steps: null
  dataloader_num_workers: 4
  dataloader_prefetch_factor: null
  past_index: -1
  run_name: aero_realtime_training
  disable_tqdm: false
  remove_unused_columns: false
  label_names: null
  load_best_model_at_end: false
  metric_for_best_model: null
  greater_is_better: null
  ignore_data_skip: false

  # FSDP config
  fsdp: []
  fsdp_min_num_params: 0
  fsdp_config:
    transformer_layer_cls_to_wrap:
      - Qwen3VLTextDecoderLayer
    reshard_after_forward: false
    min_num_params: 0
    xla: false
    xla_fsdp_v2: false
    xla_fsdp_grad_ckpt: false
  fsdp_transformer_layer_cls_to_wrap: null

  accelerator_config:
    split_batches: false
    dispatch_batches: null
    even_batches: true
    use_seedable_sampler: true
    non_blocking: false
    gradient_accumulation_kwargs: null
  parallelism_config: null
  deepspeed: null

  # Optimization
  label_smoothing_factor: 0.0
  optim: adamw_torch_fused
  optim_args: null
  adafactor: false
  group_by_length: false
  length_column_name: length

  # Reporting
  report_to:
    - wandb
  project: huggingface
  trackio_space_id: trackio

  # Advanced
  ddp_find_unused_parameters: null
  ddp_bucket_cap_mb: null
  ddp_broadcast_buffers: null
  dataloader_pin_memory: true
  dataloader_persistent_workers: false
  skip_memory_metrics: true
  use_legacy_prediction_loop: false
  push_to_hub: false
  resume_from_checkpoint: null
  hub_model_id: null
  hub_strategy: every_save
  hub_token: null
  hub_private_repo: null
  hub_always_push: false
  hub_revision: null

  # Memory optimization
  gradient_checkpointing: true
  gradient_checkpointing_kwargs: null
  include_inputs_for_metrics: false
  include_for_metrics: []
  eval_do_concat_batches: true
  fp16_backend: auto
  push_to_hub_model_id: null
  push_to_hub_organization: null
  mp_parameters: ''
  auto_find_batch_size: false
  full_determinism: false
  torchdynamo: null
  ray_scope: last
  ddp_timeout: 1800

  # Compilation
  torch_compile: false
  torch_compile_backend: null
  torch_compile_mode: null

  include_tokens_per_second: false
  include_num_input_tokens_seen: 'no'
  neftune_noise_alpha: null
  optim_target_modules: null
  batch_eval_metrics: false
  eval_on_start: false

  # Liger kernel for memory efficiency
  use_liger_kernel: true
  liger_kernel_config: null
  eval_use_gather_object: false
  average_tokens_across_devices: true
  use_muon: false

  # Freeze vision tower initially (train language model + audio + fusion)
  freeze_modules: null

  # Remove padding for efficiency
  use_rmpad: true

  # FSDP2 configuration
  fsdp2: true
  sp_ulysses_degree: 1
  reduce_dtype: bfloat16
  output_dtype: bfloat16
  print_batch_input_steps: 5
  enable_profiler: false
  profiler_config:
    start_step: 1
    end_step: 3

model_config:
  extra_kwargs: {}
  load_from_pretrained_path: null  # Override: path to aero_realtime_init checkpoint
  load_from_config: null
  attn_implementation: flash_attention_2
  model_type: aero_realtime
  torch_dtype: bfloat16
  overwrite_config: null
  monkey_patch_kwargs: null

extra_kwargs: null

examples/aero_realtime/run.sh

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/bin/bash

################################################################################
# AeroRealtime Training — LLaVA-Video-178K (Normal Video QA)
################################################################################
#
# DESCRIPTION:
#   Train the AeroRealtime model (5.09B params) on LLaVA-Video-178K data.
#   Uses Hydra --config-path / --config-name so all settings can be overridden
#   from the command line.
#
# MODEL:
#   - Vision: Qwen3-VL-4B vision tower (with built-in merger)
#   - Audio: Qwen2-Audio-7B encoder (auto-extracted from video)
#   - Language: Qwen3-VL-4B text model + lm_head
#   - Fusion: timestep-aligned mean pooling (audio -> vision bins)
#   - Design: additive dual-stream (text_stream_ids + vision features)
#
# PREREQUISITES:
#   1. Prepare checkpoint: python tools/prepare_init_weight/prepare_aero_realtime.py
#   2. Convert data: python tools/convert_data/convert_llava_video_to_parquet.py
#   3. Install deps: pip install flash-attn --no-build-isolation && pip install liger-kernel librosa
#
# USAGE:
#   Set MODEL_PATH, DATA_PATH, DATA_FOLDER below, then run:
#     bash scripts/launch/aero_realtime_train.sh
#   NOTE(review): this file lives at examples/aero_realtime/run.sh — confirm
#   the path in the usage line above matches the actual checkout location.
#
################################################################################

# ----- Paths (edit these) ----------------------------------------------------
MODEL_PATH=/path/to/aero_realtime_init
DATA_PATH=/path/to/llava_video_0_30_s_cap_oe.parquet
DATA_FOLDER=/path/to/LLaVA-Video-178K
# -----------------------------------------------------------------------------

NGPUS=8
# Resolve the config directory relative to this script so the launch works
# from any working directory.
CONFIG_DIR=$(cd "$(dirname "$0")/../config" && pwd)

torchrun --nproc_per_node="${NGPUS}" \
    --nnodes="1" \
    --node_rank="0" \
    --master_addr="127.0.0.1" \
    --master_port="8000" \
    -m lmms_engine.launch.cli \
    --config-path "${CONFIG_DIR}" \
    --config-name aero_realtime \
    model_config.load_from_pretrained_path="${MODEL_PATH}" \
    dataset_config.processor_config.processor_name="${MODEL_PATH}" \
    dataset_config.datasets.0.path="${DATA_PATH}" \
    dataset_config.datasets.0.data_folder="${DATA_FOLDER}"

################################################################################
# EXAMPLES:
#
# Quick debug (5 steps, single GPU):
#   python -m lmms_engine.launch.cli \
#     --config-path scripts/config \
#     --config-name aero_realtime \
#     model_config.load_from_pretrained_path=${MODEL_PATH} \
#     dataset_config.processor_config.processor_name=${MODEL_PATH} \
#     dataset_config.datasets.0.path=${DATA_PATH} \
#     dataset_config.datasets.0.data_folder=${DATA_FOLDER} \
#     trainer_args.max_steps=5 \
#     trainer_args.print_batch_input_steps=1
#
# Freeze vision tower:
#   ... trainer_args.freeze_modules='["visual"]'
#
# Change learning rate:
#   ... trainer_args.learning_rate=2e-5
#
# Multi-node (2 nodes):
#   torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 \
#     --master_addr=<RANK_0_IP> --master_port=8000 \
#     -m lmms_engine.launch.cli \
#     --config-path scripts/config \
#     --config-name aero_realtime \
#     model_config.load_from_pretrained_path=${MODEL_PATH} \
#     ...
#
################################################################################

src/lmms_engine/datasets/collator/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1+
from .aero_realtime_collator import AeroRealtimeCollator
12
from .bagel_collator import BagelCollator
23
from .llava_collator import LLaVACollator
34
from .vision_collator import VisionCollator
45

56
__all__ = [
7+
"AeroRealtimeCollator",
68
"VisionCollator",
79
"BagelCollator",
810
"LLaVACollator",

0 commit comments

Comments
 (0)