
Commit 5c794b1

Commit message: updated

1 parent 5996db6 commit 5c794b1

14 files changed
Lines changed: 85 additions & 260 deletions

.gitignore

Lines changed: 7 additions & 0 deletions
```diff
@@ -498,3 +498,10 @@ cache/
 list_*
 tmp
 *.jsonmodel_factory/vit_aim_v2_packing_hf_old.py
+
+
+ckpts/
+ckpts/**
+ckpts
+.gitginore
+
```
README.md

Lines changed: 17 additions & 25 deletions
```diff
@@ -64,11 +64,15 @@ Coupled with global contrastive learning over a 2M-scale concept memory bank, On
 
 ### Video Processing Pipeline
 
-The visualization below illustrates four different video processing pipelines.
-(1) **Original Video**: a continuous 64-frame sequence that preserves the complete temporal context.
-(2) **Uniform Frame Sampling**: a conventional strategy that selects 4–8 evenly spaced frames; while simple and efficient, it is inherently lossy and fails to capture fine-grained inter-frame motion.
-(3) **Temporal Saliency Detection**: a global analysis of all 64 frames to identify regions rich in temporal information, including motion patterns, appearance variations, and semantic events.
-(4) **Codec-Style Patch Extraction**: selective extraction of the temporally salient patches in a zigzag order, achieving 75–98% compression while retaining critical temporal dynamics.
+The visualization below illustrates four different video processing pipelines.
+
+**1. Original Video**: a continuous 64-frame sequence that preserves the complete temporal context.
+
+**2. Uniform Frame Sampling**: a conventional strategy that selects 4–8 evenly spaced frames; while simple and efficient, it is inherently lossy and fails to capture fine-grained inter-frame motion.
+
+**3. Temporal Saliency Detection**: a global analysis of all 64 frames to identify regions rich in temporal information, including motion patterns, appearance variations, and semantic events.
+
+**4. Codec-Style Patch Extraction**: selective extraction of the temporally salient patches in a zigzag order, achieving 75–98% compression while retaining critical temporal dynamics.
 
 <div align="center">
 <table style="width: 100%; max-width: 1200px; table-layout: fixed;">
```
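Editor's note: steps (3) and (4) are concrete enough to sketch. The snippet below is not the repository's implementation, only a minimal Python illustration that scores each spatial patch position by temporal variance (a crude stand-in for the saliency detector) and keeps the top-K positions while visiting them in JPEG-style zigzag order. All names (`zigzag_indices`, `select_salient_patches`) are hypothetical.

```python
import torch

def zigzag_indices(h: int, w: int) -> list:
    # JPEG-style zigzag traversal of an h x w patch grid:
    # walk anti-diagonals, alternating direction on each one.
    return sorted(
        ((r, c) for r in range(h) for c in range(w)),
        key=lambda rc: (rc[0] + rc[1],
                        -rc[0] if (rc[0] + rc[1]) % 2 == 0 else rc[0]),
    )

def select_salient_patches(video: torch.Tensor, patch: int = 16, k_keep: int = 48):
    """video: [T, C, H, W] -> list of (row, col) patch positions to keep.

    Temporal variance stands in for the real temporal saliency detector,
    which also considers motion patterns and semantic events.
    """
    T, C, H, W = video.shape
    gh, gw = H // patch, W // patch
    # Cut every frame into a gh x gw grid: [T, C, gh, gw, patch, patch]
    patches = video.unfold(2, patch, patch).unfold(3, patch, patch)
    # Variance over time, averaged over channels and pixels -> [gh, gw]
    saliency = patches.var(dim=0).mean(dim=(0, 3, 4))
    order = zigzag_indices(gh, gw)
    scores = torch.stack([saliency[r, c] for r, c in order])
    keep = torch.topk(scores, min(k_keep, scores.numel())).indices
    return [order[i] for i in sorted(keep.tolist())]  # keep zigzag order
```

With a 224-pixel frame and 16-pixel patches (a 14 × 14 grid), keeping 48 of 196 positions is roughly 75% compression, the low end of the quoted range.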
````diff
@@ -272,25 +276,14 @@ Training configurations and hyperparameters will be documented soon. For now, pl
 To evaluate the encoder with uniform frame sampling, first navigate to the evaluation directory:
 
 ```bash
+pip install -e .
 cd eval_encoder
 ```
 
 Then run the following command:
 
 ```bash
-torchrun --nproc_per_node=8 --master_port=29507 attentive_probe.py \
-    --eval_freq 1 \
-    --default_lr_list 0.0001 \
-    --batch_size 32 \
-    --default_weight_decay 0 \
-    --dali_py_num_workers 8 \
-    --model_family llava_vit_sampling \
-    --dataset diving48 \
-    --num_frames 8 \
-    --model_weight lmms-lab-encoder/onevision-encoder-large \
-    --model_name hf_llava_vit_large_ln \
-    --embedding_size 1024 \
-    --frames_token_num 256
+bash eval_encoder/shells_eval_ap/eval_ov_encoder_large_16frames.sh
 ```
 
 **Sampling-Specific Parameters:**
````
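The shell script presumably wraps the torchrun invocation that used to be inlined above. For context, the uniform frame sampling being probed here reduces to picking evenly spaced frame indices; a one-function sketch (hypothetical name, not the repo's code):

```python
import torch

def uniform_sample_indices(total_frames: int, num_frames: int) -> torch.Tensor:
    # e.g. 64 frames sampled down to 8:
    # tensor([ 0,  9, 18, 27, 36, 45, 54, 63])
    return torch.linspace(0, total_frames - 1, steps=num_frames).round().long()
```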
````diff
@@ -313,22 +306,21 @@ torchrun --nproc_per_node=8 --master_port=29512 attentive_probe_codec.py \
     --batch_size 4 \
     --default_weight_decay 0 \
     --dali_py_num_workers 8 \
-    --model_family llava_vit_codec \
+    --model_family ov_encoder_codec \
     --dataset diving48 \
-    --num_frames 64 \
-    --model_weight lmms-lab/onevision-encoder-large \
-    --model_name hf_llava_vit_large_ln \
+    --model_weight lmms-lab-encoder/onevision-encoder-large \
+    --model_name ov_encoder_large \
     --embedding_size 1024 \
     --default_epoch 30 \
-    --data_root /path/to/your/data_attentive_probe/ \
-    --cache_dir /path/to/your/cache_residuals/ \
     --K_keep 2048 \
+    --num_frames 64 \
     --mv_compensate median
+
 ```
 
 **Codec-Specific Parameters:**
+- `K_keep`: Number of patches to keep.
 - `cache_dir`: Directory for cached codec patches. This is where the codec-selected patches will be stored/loaded.
-- `K_keep`: Number of patches to keep. For example, 256 patches per frame × 8 frames = 2048 total patches. Adjust based on your frame count and desired compression ratio.
 - `mv_compensate`: Motion vector compensation method (e.g., `median`).
 
 #### Shared Parameters
````
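A sizing note for `K_keep`: with `--num_frames 64` and `--K_keep 2048`, an average of 2048 / 64 = 32 patches survives per frame. Assuming the 256-patches-per-frame grid that the removed wording used as its example, that keeps 12.5% of all patches, i.e. 87.5% compression, within the 75–98% range quoted earlier in the README.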

dockerfile

Lines changed: 8 additions & 1 deletion
```diff
@@ -1,10 +1,17 @@
-FROM pytorch/pytorch:2.7.0-cuda11.8-cudnn9-runtime
+FROM nvcr.io/nvidia/pytorch:25.04-py3
 
 # Set up environment variables
 ENV DEBIAN_FRONTEND=noninteractive \
     PYTHONUNBUFFERED=1 \
     PIP_NO_CACHE_DIR=1
 
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libgl1 \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+
+
 # Install system dependencies and ffmpeg in one layer
 RUN set -eux; \
     apt-get update && apt-get install -y --no-install-recommends \
```
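`libgl1` and `libglib2.0-0` are the shared libraries most often missing when `import cv2` fails inside slim images (`libGL.so.1` and `libgthread-2.0.so.0` respectively), which is presumably why they are installed here. After this change, a plain rebuild such as `docker build -t onevision-encoder .` (tag name illustrative) pulls the NGC PyTorch 25.04 base instead of the old CUDA 11.8 runtime image.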

eval_encoder/attentive_probe.py

Lines changed: 6 additions & 7 deletions
```diff
@@ -8,7 +8,6 @@
 import torch
 import torch.nn.functional as F
 import torchmetrics
-from dataloader.ap_dataloader_dali import get_dali_dataloader
 from timm.loss import LabelSmoothingCrossEntropy
 from timm.models import create_model
 from timm.models.layers import trunc_normal_
@@ -19,7 +18,9 @@
 
 # Ensure custom models and layers are registered
 import model_factory
-from model_factory.layers import Siglip2MultiheadAttentionPoolingHead, Siglip2TransformerAttentionPoolingHead
+from dataloader.ap_dataloader_dali import get_dali_dataloader
+from model_factory.layers import (Siglip2MultiheadAttentionPoolingHead,
+                                  Siglip2TransformerAttentionPoolingHead)
 
 warnings.filterwarnings("ignore")
 
@@ -33,7 +34,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--dataset", default="ssv2")
 
     # Model
-    parser.add_argument("--model_family", default="llava_vit_sampling")
+    parser.add_argument("--model_family", default="chunk_wise_sampling")
     parser.add_argument("--model_name", default="ov_encoder_large")
     parser.add_argument("--model_weight", default="NULL")
     parser.add_argument("--num_frames", type=int, default=8)
@@ -42,7 +43,6 @@
     parser.add_argument("--tubelet_size", type=int, default=1)
    parser.add_argument("--embedding_size", type=int, default=768)
     parser.add_argument("--num_classes", type=int, default=0)
-    # ===> New: target frame number parameter <===
     parser.add_argument("--target_frames", type=int, default=64,
                         help="Target number of frames to interpolate to (default: 64)")
 
```
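A `--target_frames` option like this implies temporal resampling of clips to a fixed length. The call site is not part of this diff, so the following is only a sketch of the usual approach, interpolating along the time axis with `F.interpolate` (the name `resample_time` is hypothetical):

```python
import torch
import torch.nn.functional as F

def resample_time(videos: torch.Tensor, target_frames: int = 64) -> torch.Tensor:
    """[B, C, T, H, W] -> [B, C, target_frames, H, W]; H and W untouched."""
    B, C, T, H, W = videos.shape
    return F.interpolate(videos, size=(target_frames, H, W),
                         mode="trilinear", align_corners=False)
```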
```diff
@@ -155,7 +155,6 @@ def video_to_images(videos: torch.Tensor) -> torch.Tensor:
         "dinov2",
         "dinov3",
         "metaclip",
-        "llava_vit_si",
         "aimv2"
     ]
     if args.model_family in list_vit_single_image:
@@ -183,7 +182,7 @@
         else:
             raise ValueError("SigLIP2 only supports image input with 4 dimensions [B, C, H, W].")
 
-    elif args.model_family == "llava_vit_sampling":
+    elif args.model_family == "chunk_wise_sampling":
         with torch.cuda.amp.autocast(dtype=torch.bfloat16):
             with torch.no_grad():
                 bs, C, T, H, W = videos.shape
```
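Note that the renamed `chunk_wise_sampling` branch runs the encoder under `torch.cuda.amp.autocast(dtype=torch.bfloat16)` combined with `torch.no_grad()`, i.e. bf16 inference with no autograd graph, which keeps the memory cost of long clips down. On recent PyTorch, `torch.autocast("cuda", dtype=torch.bfloat16)` is the preferred spelling of the same context manager.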
```diff
@@ -410,7 +409,7 @@ def get_model(args: argparse.Namespace) -> nn.Module:
         return model
 
     model = create_model(args.model_name, pretrained=False)
-    if args.model_family in ["llava_vit_sampling"]:
+    if args.model_family in ["chunk_wise_sampling"]:
         state_dict = torch.load(args.model_weight, map_location="cpu")
         state_dict = {k.replace("_orig_mod.", "").replace("module.", ""): v for k, v in state_dict.items()}
         model.load_state_dict(state_dict, strict=True)
```
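The key-rewriting line handles two common checkpoint artifacts: `torch.compile` saves the wrapped module's parameters under an `_orig_mod.` prefix, and `DistributedDataParallel` saves them under `module.`. Stripping both lets `load_state_dict(..., strict=True)` accept checkpoints saved from either wrapper.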
