Skip to content

Commit f77eee3

Browse files
Initial commit
1 parent 0166712 commit f77eee3

237 files changed

Lines changed: 56463 additions & 0 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Encoder_Eval/README.md

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Encoder_Eval: A Unified Evaluation Suite for Video and Image Encoders
2+
3+
This repository provides a unified evaluation framework for benchmarking **video** and **image** encoders across diverse tasks, including **linear probing**, **attentive probing**, **dense segmentation**, and **object detection**.
4+
5+
---
6+
7+
## 📅 Project Progress
8+
9+
### ✅ Completed
10+
- [x] `video_attentive_probe`: Attention-based probing for video encoders
11+
- [x] `video_linear_probe`: Linear probing for video encoders
12+
13+
### ⬜ Upcoming / In Development
14+
- [ ] `image_attentive_probe`: Attention-based probing for image encoders
15+
- [ ] `image_linear_probe`: Linear probing for image encoders
16+
- [ ] `dense_segmentation`: Dense prediction benchmarking (image/video)
17+
- [ ] `object_detection`: Detection task evaluation (image/video)
18+
19+
---
20+
21+
## 💡 Key Features
22+
- Support for both **video** and **image** modalities.
23+
- Modular design for easy integration of new probing techniques.
24+
- Standardized evaluation pipelines for encoder representations.
25+
- Designed to benchmark both **frozen** and **fine-tuned** encoders.
26+
27+
## 🔧 Setup
28+
29+
```bash
30+
# Clone the repo
31+
git clone git@github.com:FeilongTangmonash/Encoder_Eval.git
32+
cd Encoder_Eval
33+
34+
# Install dependencies
35+
pip install -r requirements.txt
36+
```
37+
## 🧱 Code Structure
38+
39+
<pre>
40+
video_vit/
41+
└── video_encoder_eval/
42+
└── video_linear_probe/
43+
└── checkpoint/
44+
└── mlcd_base/
45+
└── backbone_base224.pt
46+
</pre>
47+
48+
49+
## 🚀 Usage
50+
We provide example scripts to perform a full evaluation of the UMT model using both the attentive probe and the linear probe methods. Simply run the commands below:
51+
```
52+
bash src/video_attentive_probe.sh
53+
bash src/video_linear_probe.sh
54+
```
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
export NUM_GPUS=8
2+
export NNODES=1
3+
export RANK=0
4+
export ADDR="127.0.0.1"
5+
export PORT="32509"
6+
#pt=pretrain ppt=post-pretrain ft=finetune
7+
8+
TRAIN_DATA_ROOT_PATH=/path/to/train/video
9+
TRAIN_DATA_CSV_PATH=/path/to/train/video
10+
VAL_DATA_ROOT_PATH=/path/to/val/video
11+
VAL_DATA_CSV_PATH=/path/to/val/csv
12+
OUTPUT=/path/to/output
13+
MODEL_NAME='umt'
14+
15+
FINETUNE=/path/to/ckpt
16+
model='vit_large_patch16_224'
17+
EMBEDDING_SIZE=1024
18+
PATCH_SIZE=16
19+
NUM_FRAMES=8
20+
INPUT_SIZE=224
21+
TUBELET_SIZE=1
22+
BATCH_SIZE=32
23+
24+
for SEED in 1
25+
do
26+
for DATASET in ssv2 k400 k600 k700 hmdb51 ucf101 epic_verb epic_noun perception_test diving48 CharadesEgo CharadesEgo_v1_only1st CharadesEgo_v1_only3rd
27+
do
28+
for NUM_SHOTS in 50
29+
do
30+
echo "SEED: $SEED"
31+
echo "DATASET: $DATASET"
32+
echo "NUM_SHOTS: $NUM_SHOTS"
33+
34+
FLASH=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" \
35+
--node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
36+
ac_export_feature_and_attentive_probe.py \
37+
--embedding_size ${EMBEDDING_SIZE} \
38+
--data_set ${DATASET} \
39+
--seed ${SEED} \
40+
--num_shots ${NUM_SHOTS} \
41+
--num_step 8 \
42+
--train_data_root_path ${TRAIN_DATA_ROOT_PATH} \
43+
--train_data_csv_path ${TRAIN_DATA_CSV_PATH} \
44+
--val_data_root_path ${VAL_DATA_ROOT_PATH} \
45+
--val_data_csv_path ${VAL_DATA_CSV_PATH} \
46+
--save_report ${OUTPUT} \
47+
--batch_size ${BATCH_SIZE} \
48+
--model_name ${MODEL_NAME} \
49+
--model ${model} \
50+
--finetune ${FINETUNE} \
51+
--num_frames ${NUM_FRAMES} \
52+
--input_size ${INPUT_SIZE} \
53+
--tubelet_size ${TUBELET_SIZE} \
54+
--patch_size ${PATCH_SIZE}
55+
done
56+
done
57+
done
58+
59+
# Due to the small dataset size, the following dataset raises errors when using 8 GPUs with a large batch size.
60+
export NUM_GPUS=1
61+
export NNODES=1
62+
export RANK=0
63+
export ADDR="127.0.0.1"
64+
export PORT="32509"
65+
#pt=pretrain ppt=post-pretrain ft=finetune
66+
67+
TRAIN_DATA_ROOT_PATH=/path/to/train/video
68+
TRAIN_DATA_CSV_PATH=/path/to/train/video
69+
VAL_DATA_ROOT_PATH=/path/to/val/video
70+
VAL_DATA_CSV_PATH=/path/to/val/csv
71+
OUTPUT=/path/to/output
72+
MODEL_NAME='umt'
73+
74+
FINETUNE=/path/to/ckpt
75+
model='vit_large_patch16_224'
76+
EMBEDDING_SIZE=1024
77+
PATCH_SIZE=16
78+
NUM_FRAMES=8
79+
INPUT_SIZE=224
80+
TUBELET_SIZE=1
81+
BATCH_SIZE=32
82+
83+
for SEED in 1
84+
do
85+
for DATASET in RareAct Drone_Action
86+
do
87+
for NUM_SHOTS in 50
88+
do
89+
echo "SEED: $SEED"
90+
echo "DATASET: $DATASET"
91+
echo "NUM_SHOTS: $NUM_SHOTS"
92+
93+
FLASH=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" \
94+
--node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
95+
ac_export_feature_and_attentive_probe.py \
96+
--embedding_size ${EMBEDDING_SIZE} \
97+
--data_set ${DATASET} \
98+
--seed ${SEED} \
99+
--num_shots ${NUM_SHOTS} \
100+
--num_step 8 \
101+
--train_data_root_path ${TRAIN_DATA_ROOT_PATH} \
102+
--train_data_csv_path ${TRAIN_DATA_CSV_PATH} \
103+
--val_data_root_path ${VAL_DATA_ROOT_PATH} \
104+
--val_data_csv_path ${VAL_DATA_CSV_PATH} \
105+
--save_report ${OUTPUT} \
106+
--batch_size ${BATCH_SIZE} \
107+
--model_name ${MODEL_NAME} \
108+
--model ${model} \
109+
--finetune ${FINETUNE} \
110+
--num_frames ${NUM_FRAMES} \
111+
--input_size ${INPUT_SIZE} \
112+
--tubelet_size ${TUBELET_SIZE} \
113+
--patch_size ${PATCH_SIZE}
114+
done
115+
done
116+
done
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
#!/usr/bin/env bash
# Linear-probe evaluation launcher for the UMT video encoder.
# Iterates over seeds / datasets / shot counts and launches
# ac_export_feature_and_linear_probe.py via torchrun.
# Replace the /path/to/... placeholders before running.
set -euo pipefail

export NUM_GPUS=8
export NNODES=1
export RANK=0
export ADDR="127.0.0.1"
export PORT="32509"
# pt=pretrain  ppt=post-pretrain  ft=finetune

TRAIN_DATA_ROOT_PATH=/path/to/train/video
TRAIN_DATA_CSV_PATH=/path/to/train/video
VAL_DATA_ROOT_PATH=/path/to/val/video
VAL_DATA_CSV_PATH=/path/to/val/csv
OUTPUT=/path/to/output
MODEL_NAME='umt'

FINETUNE=/path/to/ckpt
model='vit_large_patch16_224'
EMBEDDING_SIZE=768
PATCH_SIZE=16
NUM_FRAMES=8
INPUT_SIZE=224
TUBELET_SIZE=1
BATCH_SIZE=32

#######################################
# Run one linear-probe evaluation.
# Globals:   all launcher settings above (read)
# Arguments: $1 - seed, $2 - dataset name, $3 - number of shots
#######################################
run_probe() {
  local seed=$1 dataset=$2 num_shots=$3
  echo "SEED: ${seed}"
  echo "DATASET: ${dataset}"
  echo "NUM_SHOTS: ${num_shots}"

  FLASH=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" \
    --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
    ac_export_feature_and_linear_probe.py \
    --embedding_size "${EMBEDDING_SIZE}" \
    --data_set "${dataset}" \
    --seed "${seed}" \
    --num_shots "${num_shots}" \
    --num_step 8 \
    --train_data_root_path "${TRAIN_DATA_ROOT_PATH}" \
    --train_data_csv_path "${TRAIN_DATA_CSV_PATH}" \
    --val_data_root_path "${VAL_DATA_ROOT_PATH}" \
    --val_data_csv_path "${VAL_DATA_CSV_PATH}" \
    --save_report "${OUTPUT}" \
    --batch_size "${BATCH_SIZE}" \
    --model_name "${MODEL_NAME}" \
    --model "${model}" \
    --finetune "${FINETUNE}" \
    --num_frames "${NUM_FRAMES}" \
    --input_size "${INPUT_SIZE}" \
    --tubelet_size "${TUBELET_SIZE}" \
    --patch_size "${PATCH_SIZE}"
}

for SEED in 1; do
  for DATASET in ssv2 k400 k600 k700 hmdb51 ucf101 epic_verb epic_noun \
      perception_test diving48 CharadesEgo CharadesEgo_v1_only1st \
      CharadesEgo_v1_only3rd; do
    for NUM_SHOTS in 50; do
      run_probe "${SEED}" "${DATASET}" "${NUM_SHOTS}"
    done
  done
done

# The following small datasets raise errors with 8 GPUs and a large batch
# size. BUGFIX: the original script stated this but never reduced the GPU
# count; drop to a single GPU here, matching video_attentive_probe.sh.
export NUM_GPUS=1

for SEED in 1; do
  for DATASET in RareAct Drone_Action; do
    # NOTE(review): this section uses 10 shots while the attentive-probe
    # script uses 50 for the same datasets — presumably intentional; confirm.
    for NUM_SHOTS in 10; do
      run_probe "${SEED}" "${DATASET}" "${NUM_SHOTS}"
    done
  done
done

0 commit comments

Comments
 (0)