EvolvingLMMs-Lab
diff --git a/‎llava_next/checkpoints/date1220_llavanext-llavavit_-2hid-qwen2.5-1.5b-sigvid-8nodes/finetune.sh‎
Lines changed: 62 additions & 0 deletions b/‎llava_next/checkpoints/date1220_llavanext-llavavit_-2hid-qwen2.5-1.5b-sigvid-8nodes/finetune.sh‎
Lines changed: 62 additions & 0 deletions
diff --git a/‎llava_next/llava/mm_utils.py‎
Lines changed: 36 additions & 4 deletions b/‎llava_next/llava/mm_utils.py‎
Lines changed: 36 additions & 4 deletions
diff --git a/‎llava_next/llava/model/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎llava_next/llava/model/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎llava_next/llava/model/builder.py‎
Lines changed: 10 additions & 0 deletions b/‎llava_next/llava/model/builder.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎llava_next/llava/model/language_model/llava_gemma.py‎
Lines changed: 0 additions & 122 deletions b/‎llava_next/llava/model/language_model/llava_gemma.py‎
Lines changed: 0 additions & 122 deletions
@@ -0,0 +1,62 @@
+# Fine-tune Qwen2.5-1.5B-Instruct with the onevision-encoder-large vision tower via DeepSpeed; run from the directory holding llava/ and scripts/.
+set -o pipefail  # let a deepspeed failure propagate through the "| tee" at the end of the launch command
+export OMP_NUM_THREADS=8
+export NCCL_IB_DISABLE=0
+export NCCL_IB_GID_INDEX=3
+export NCCL_SOCKET_IFNAME=eth0
+export PYTHONPATH=$(pwd)
+export CUDA_VISIBLE_DEVICES=6,7
+
+LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-1.5B-Instruct"
+LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
+VISION_MODEL_VERSION="/video_vit/pretrain_models/deepglint/onevision-encoder-large"
+VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
+export WANDB_MODE=disabled
+
+export PORT=29502  # passed to deepspeed --master_port below (previously exported but ignored)
+PROMPT_VERSION="qwen_1_5"
+
+BASE_RUN_NAME="./checkpoints/date1220_llavanext-llavavit_-2hid-qwen2.5-1.5b-sigvid-8nodes"
+echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
+mkdir -p "$BASE_RUN_NAME"
+cp "$0" "$BASE_RUN_NAME/$(basename "$0")"  # keep a copy of this script inside the run/output directory
+
+deepspeed --master_port "${PORT}" \
+    llava/train/train_mem.py \
+    --deepspeed scripts/zero3.json \
+    --model_name_or_path "${LLM_VERSION}" \
+    --version "${PROMPT_VERSION}" \
+    --data_path /rice_vl/llava_video_8f_imgs_1027/video_800k_llavanextsig_740k_shuffled.jsonl \
+    --image_folder /rice_vl/llava_video_8f_imgs_1027 \
+    --pretrain_mm_mlp_adapter="/vlm/yinxie/code/checkpoints/projectors/llavanext-llavavit_-2hid-qwen2.5-1.5b-instruct-pretrain_blip558k_plain-1220-dist/mm_projector.bin" \
+    --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
+    --mm_vision_tower_lr=2e-6 \
+    --vision_tower "${VISION_MODEL_VERSION}" \
+    --mm_projector_type mlp2x_gelu \
+    --mm_vision_select_layer -2 \
+    --mm_use_im_start_end False \
+    --mm_use_im_patch_token False \
+    --group_by_modality_length True \
+    --image_aspect_ratio anyres \
+    --image_grid_pinpoints "[(574, 1120), (1120, 574), (1120, 1120), (1694, 574), (574, 1694)]" \
+    --mm_patch_merge_type flat \
+    --bf16 True \
+    --run_name "$BASE_RUN_NAME" \
+    --output_dir "$BASE_RUN_NAME" \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 1 \
+    --gradient_accumulation_steps 2 \
+    --save_strategy "steps" \
+    --save_steps 500 \
+    --save_total_limit 20 \
+    --learning_rate 1e-5 \
+    --logging_steps 1 \
+    --tf32 True \
+    --model_max_length 321120 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers 1 \
+    --lazy_preprocess True \
+    --dataloader_drop_last True \
+    --attn_implementation flash_attention_2 | tee "$BASE_RUN_NAME/train.log"
+
+# --attn_implementation is set to flash_attention_2 above; switch it to sdpa if flash-attn is not installed.
@@ -274,6 +274,14 @@ def process_anyres_image(image, processor, grid_pinpoints):
         possible_resolutions = ast.literal_eval(grid_pinpoints)
     best_resolution = select_best_resolution(image.size, possible_resolutions)
     image_padded = resize_and_pad_image(image, best_resolution)
+    if 'siglip' in processor.__class__.__name__.lower():
+        image_patches = [processor.preprocess(image_padded, return_tensors="pt", do_resize=False)["pixel_values"]]
+        grid_thw = [1, best_resolution[1] // 16, best_resolution[0] // 16]
+        return {'pixel_values': torch.cat(image_patches, dim=0), 'grid_thw': grid_thw}
+    else: # FIXME: for onevision encoder
+        image_patches = [processor.preprocess(image_padded, return_tensors="pt", do_resize=False)["pixel_values"]]
+        grid_thw = [1, best_resolution[1] // 14, best_resolution[0] // 14]
+        return {'pixel_values': torch.cat(image_patches, dim=0), 'grid_thw': grid_thw}
 
     patches = divide_to_patches(image_padded, processor.crop_size["height"])
 
@@ -314,6 +322,9 @@ def expand2square(pil_img, background_color):
 def process_images(images, image_processor, model_cfg):
     image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
     new_images = []
+    if len(images) == 8: #FIXME hardcoded for 8 images input as video sample
+        image_aspect_ratio = 'pad'
+
     if image_aspect_ratio == "highres":
         for image in images:
             image = process_highres_image(image, image_processor, model_cfg.image_grid_pinpoints)
@@ -322,15 +333,36 @@ def process_images(images, image_processor, model_cfg):
         for image in images:
             image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
             new_images.append(image)
+        return {'image_patchs': [img['pixel_values'] for img in new_images], 'grid_thw': [img['grid_thw'] for img in new_images]}
     elif image_aspect_ratio == "crop_split":
         for image in images:
             image = process_highres_image_crop_split(image, model_cfg, image_processor)
             new_images.append(image)
     elif image_aspect_ratio == "pad":
-        for image in images:
-            image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean))
-            image = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
-            new_images.append(image)
+        if 'siglip' in image_processor.__class__.__name__.lower():
+            image_patchs = []
+            grid_thw = []
+            for image in images:
+                image = expand2square(image, tuple(int(0 * 255) for x in [0,0,0]))
+                image = image.resize((512, 512))
+                image_patchs.append(image_processor.preprocess(image, return_tensors="pt", do_resize=False)["pixel_values"])
+                grid_thw.append([1, 32, 32])
+            return {'image_patchs': image_patchs, 'grid_thw': torch.tensor(grid_thw)}
+
+        else: # FIXME: for onevision encoder video
+            image_patchs = []
+            grid_thw = []
+            for image in images:
+                image = expand2square(image, tuple(int(0 * 255) for x in [0,0,0]))
+                image = image.resize((504, 504))
+                image_patchs.append(image_processor.preprocess(image, return_tensors="pt", do_resize=False)["pixel_values"])
+                grid_thw.append([1, 36, 36])
+            return {'image_patchs': image_patchs, 'grid_thw': torch.tensor(grid_thw)}
+
+        image = image.resize((504, 504))
+        # image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean))
+        image = image_processor.preprocess(image, return_tensors="pt", do_resize=False)["pixel_values"]
+        new_images.append(image)
     else:
         return image_processor.preprocess(images, return_tensors="pt")["pixel_values"]
     if all(x.shape == new_images[0].shape for x in new_images):
 
@@ -17,3 +17,4 @@
 
 
 from .language_model.llava_qwen import LlavaQwenForCausalLM, LlavaQwenConfig
+from .language_model.llava_qwen3 import LlavaQwen3ForCausalLM, LlavaQwen3Config
@@ -221,6 +221,16 @@ def load_from_hf(repo_id, filename, subfolder=None):
                         model = LlavaQwenMoeForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, attn_implementation=attn_implementation, config=llava_cfg, **kwargs)
                     else:
                         model = LlavaQwenMoeForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, attn_implementation=attn_implementation, **kwargs)
+                elif "qwen3" in model_name.lower():
+                    from llava.model.language_model.llava_qwen3 import LlavaQwen3Config
+                    if overwrite_config is not None:
+                        llava_cfg = LlavaQwen3Config.from_pretrained(model_path)
+                        rank0_print(f"Overwriting config with {overwrite_config}")
+                        for k, v in overwrite_config.items():
+                            setattr(llava_cfg, k, v)
+                        model = LlavaQwen3ForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, attn_implementation=attn_implementation, config=llava_cfg, **kwargs)
+                    else:
+                        model = LlavaQwen3ForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, attn_implementation=attn_implementation, **kwargs)
 
                 else:
                     from llava.model.language_model.llava_qwen import LlavaQwenConfig
Original file line number	Diff line number	Diff line change
`@@ -17,3 +17,4 @@`
`17`	`17`
`18`	`18`
`19`	`19`	`from .language_model.llava_qwen import LlavaQwenForCausalLM, LlavaQwenConfig`
	`20`	`+from .language_model.llava_qwen3 import LlavaQwen3ForCausalLM, LlavaQwen3Config`