Skip to content

Commit 3fbd6b9

Browse files
committed
feat(aero_realtime): initial implementation of realtime multimodal model
Add aero_realtime model with dual-stream additive design for realtime training, including: - Model architecture with dual-stream additive design - Training pipeline: processor, collator, dataset, monkey patch - LigerCE rmpad forward for memory-efficient training - Video token format aligned with Qwen3 VL - Silence fallback for videos without audio track - Weight init and forward test scripts
1 parent b928344 commit 3fbd6b9

20 files changed

Lines changed: 3874 additions & 0 deletions
Lines changed: 267 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
# AeroRealtime Training Configuration
# Trains the AeroRealtime model on LLaVA-Video-178K data (normal video QA mode).
#
# The dual-stream additive design is active: during video regions, the model
# receives additive vision+text embeddings and learns to stay silent (rt_pad)
# until spoken to (rt_speak boundary at delay_seconds).
#
# Audio is auto-extracted from video files by the dataset.
#
# Override paths at launch time:
#   model_config.load_from_pretrained_path=/path/to/aero_realtime_init
#   dataset_config.processor_config.processor_name=/path/to/aero_realtime_init
#   dataset_config.datasets.0.path=/path/to/parquet
#   dataset_config.datasets.0.data_folder=/path/to/video/root

trainer_type: fsdp2_trainer

dataset_config:
  dataset_type: aero_realtime_iterable
  dataset_format: yaml
  dataset_path: null
  processor_config:
    processor_name: null  # Override: path to aero_realtime_init checkpoint
    processor_type: aero_realtime
    extra_kwargs:
      video_max_pixels: 360448  # ~602*600 — moderate resolution for 4B model
      video_min_pixels: 28800
      image_max_pixels: 360448
      image_min_pixels: 28800

  # Inline dataset configuration — override paths at launch time
  datasets:
    - path: null          # Override: path to parquet file
      data_folder: null   # Override: root directory for video files
      data_type: parquet

  shuffle: true
  eval_dataset_path: null
  object_storage: none
  bucket_name: null

  # Packing disabled for initial training (video lengths vary)
  packing: false
  packing_strategy: first_fit
  packing_length: 8192
  filter_overlong: true
  filter_overlong_workers: 8
  max_length: null

  # Video sampling: 1 fps for 0-30s videos
  video_sampling_strategy: fps
  video_max_pixels: 360448
  video_max_frames: 64
  frame_num: 32
  fps: 1
  video_backend: qwen_vl_utils

  extra_kwargs: {}

trainer_args:
  output_dir: ./output/aero_realtime_training
  overwrite_output_dir: false
  do_train: true
  do_eval: false
  do_predict: false
  eval_strategy: 'no'
  prediction_loss_only: false

  # Batch size — start conservative for 4B+ model with video+audio
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 4
  eval_accumulation_steps: null
  eval_delay: 0
  torch_empty_cache_steps: null

  # Optimizer
  learning_rate: 1.0e-05
  weight_decay: 0.0
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  max_grad_norm: 1.0

  # Training schedule
  num_train_epochs: 1
  max_steps: -1
  lr_scheduler_type: cosine
  lr_scheduler_kwargs: {}
  warmup_ratio: 0.03
  warmup_steps: 0

  # Logging
  log_level: passive
  log_level_replica: warning
  log_on_each_node: true
  logging_dir: ./output/aero_realtime_training/runs
  logging_strategy: steps
  logging_first_step: false
  logging_steps: 1
  logging_nan_inf_filter: true

  # Checkpointing
  save_strategy: steps
  save_steps: 500
  save_total_limit: 2
  save_safetensors: true
  save_on_each_node: false
  save_only_model: false
  restore_callback_states_from_checkpoint: false

  # Device
  no_cuda: false
  use_cpu: false
  use_mps_device: false
  seed: 42
  data_seed: null
  jit_mode_eval: false

  # Precision
  bf16: true
  fp16: false
  fp16_opt_level: O1
  half_precision_backend: auto
  bf16_full_eval: false
  fp16_full_eval: false
  tf32: true

  # Distributed
  local_rank: 0
  ddp_backend: null
  tpu_num_cores: null
  tpu_metrics_debug: false
  debug: []

  # Dataloader
  dataloader_drop_last: true
  eval_steps: null
  dataloader_num_workers: 4
  dataloader_prefetch_factor: null
  past_index: -1
  run_name: aero_realtime_training
  disable_tqdm: false
  remove_unused_columns: false
  label_names: null
  load_best_model_at_end: false
  metric_for_best_model: null
  greater_is_better: null
  ignore_data_skip: false

  # FSDP config
  fsdp: []
  fsdp_min_num_params: 0
  fsdp_config:
    transformer_layer_cls_to_wrap:
      - Qwen3VLTextDecoderLayer
    reshard_after_forward: false
    min_num_params: 0
    xla: false
    xla_fsdp_v2: false
    xla_fsdp_grad_ckpt: false
  fsdp_transformer_layer_cls_to_wrap: null

  accelerator_config:
    split_batches: false
    dispatch_batches: null
    even_batches: true
    use_seedable_sampler: true
    non_blocking: false
    gradient_accumulation_kwargs: null
  parallelism_config: null
  deepspeed: null

  # Optimization
  label_smoothing_factor: 0.0
  optim: adamw_torch_fused
  optim_args: null
  adafactor: false
  group_by_length: false
  length_column_name: length

  # Reporting
  report_to:
    - wandb
  project: huggingface
  trackio_space_id: trackio

  # Advanced
  ddp_find_unused_parameters: null
  ddp_bucket_cap_mb: null
  ddp_broadcast_buffers: null
  dataloader_pin_memory: true
  dataloader_persistent_workers: false
  skip_memory_metrics: true
  use_legacy_prediction_loop: false
  push_to_hub: false
  resume_from_checkpoint: null
  hub_model_id: null
  hub_strategy: every_save
  hub_token: null
  hub_private_repo: null
  hub_always_push: false
  hub_revision: null

  # Memory optimization
  gradient_checkpointing: true
  gradient_checkpointing_kwargs: null
  include_inputs_for_metrics: false
  include_for_metrics: []
  eval_do_concat_batches: true
  fp16_backend: auto
  push_to_hub_model_id: null
  push_to_hub_organization: null
  mp_parameters: ''
  auto_find_batch_size: false
  full_determinism: false
  torchdynamo: null
  ray_scope: last
  ddp_timeout: 1800

  # Compilation
  torch_compile: false
  torch_compile_backend: null
  torch_compile_mode: null

  include_tokens_per_second: false
  include_num_input_tokens_seen: 'no'
  neftune_noise_alpha: null
  optim_target_modules: null
  batch_eval_metrics: false
  eval_on_start: false

  # Liger kernel for memory efficiency
  use_liger_kernel: true
  liger_kernel_config: null
  eval_use_gather_object: false
  average_tokens_across_devices: true
  use_muon: false

  # Freeze vision tower initially (train language model + audio + fusion)
  freeze_modules: null

  # Remove padding for efficiency
  use_rmpad: true

  # FSDP2 configuration
  fsdp2: true
  sp_ulysses_degree: 1
  reduce_dtype: bfloat16
  output_dtype: bfloat16
  print_batch_input_steps: 5
  enable_profiler: false
  profiler_config:
    start_step: 1
    end_step: 3

model_config:
  extra_kwargs: {}
  load_from_pretrained_path: null  # Override: path to aero_realtime_init checkpoint
  load_from_config: null
  attn_implementation: flash_attention_2
  model_type: aero_realtime
  torch_dtype: bfloat16
  overwrite_config: null
  monkey_patch_kwargs: null

extra_kwargs: null

examples/aero_realtime/run.sh

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/bin/bash

################################################################################
# AeroRealtime Training — LLaVA-Video-178K (Normal Video QA)
################################################################################
#
# DESCRIPTION:
#   Train the AeroRealtime model (5.09B params) on LLaVA-Video-178K data.
#   Uses Hydra --config-path / --config-name so all settings can be overridden
#   from the command line.
#
# MODEL:
#   - Vision: Qwen3-VL-4B vision tower (with built-in merger)
#   - Audio: Qwen2-Audio-7B encoder (auto-extracted from video)
#   - Language: Qwen3-VL-4B text model + lm_head
#   - Fusion: timestep-aligned mean pooling (audio -> vision bins)
#   - Design: additive dual-stream (text_stream_ids + vision features)
#
# PREREQUISITES:
#   1. Prepare checkpoint: python tools/prepare_init_weight/prepare_aero_realtime.py
#   2. Convert data: python tools/convert_data/convert_llava_video_to_parquet.py
#   3. Install deps: pip install flash-attn --no-build-isolation && pip install liger-kernel librosa
#
# USAGE:
#   Set MODEL_PATH, DATA_PATH, DATA_FOLDER below, then run:
#     bash scripts/launch/aero_realtime_train.sh
#   NOTE(review): this file lives at examples/aero_realtime/run.sh — confirm
#   the path in the usage line above matches the actual checkout location.
#
################################################################################

# ----- Paths (edit these) ----------------------------------------------------
MODEL_PATH=/path/to/aero_realtime_init
DATA_PATH=/path/to/llava_video_0_30_s_cap_oe.parquet
DATA_FOLDER=/path/to/LLaVA-Video-178K
# -----------------------------------------------------------------------------

NGPUS=8
# Resolve the config directory relative to this script so the launch works
# from any working directory.
CONFIG_DIR=$(cd "$(dirname "$0")/../config" && pwd)

torchrun --nproc_per_node="${NGPUS}" \
    --nnodes="1" \
    --node_rank="0" \
    --master_addr="127.0.0.1" \
    --master_port="8000" \
    -m lmms_engine.launch.cli \
    --config-path "${CONFIG_DIR}" \
    --config-name aero_realtime \
    model_config.load_from_pretrained_path="${MODEL_PATH}" \
    dataset_config.processor_config.processor_name="${MODEL_PATH}" \
    dataset_config.datasets.0.path="${DATA_PATH}" \
    dataset_config.datasets.0.data_folder="${DATA_FOLDER}"

################################################################################
# EXAMPLES:
#
# Quick debug (5 steps, single GPU):
#   python -m lmms_engine.launch.cli \
#     --config-path scripts/config \
#     --config-name aero_realtime \
#     model_config.load_from_pretrained_path=${MODEL_PATH} \
#     dataset_config.processor_config.processor_name=${MODEL_PATH} \
#     dataset_config.datasets.0.path=${DATA_PATH} \
#     dataset_config.datasets.0.data_folder=${DATA_FOLDER} \
#     trainer_args.max_steps=5 \
#     trainer_args.print_batch_input_steps=1
#
# Freeze vision tower:
#   ... trainer_args.freeze_modules='["visual"]'
#
# Change learning rate:
#   ... trainer_args.learning_rate=2e-5
#
# Multi-node (2 nodes):
#   torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 \
#     --master_addr=<RANK_0_IP> --master_port=8000 \
#     -m lmms_engine.launch.cli \
#     --config-path scripts/config \
#     --config-name aero_realtime \
#     model_config.load_from_pretrained_path=${MODEL_PATH} \
#     ...
#
################################################################################

src/lmms_engine/datasets/collator/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1+
from .aero_realtime_collator import AeroRealtimeCollator
12
from .bagel_collator import BagelCollator
23
from .llava_collator import LLaVACollator
34
from .vision_collator import VisionCollator
45

56
__all__ = [
7+
"AeroRealtimeCollator",
68
"VisionCollator",
79
"BagelCollator",
810
"LLaVACollator",

0 commit comments

Comments
 (0)