diff --git a/.gitignore b/.gitignore
index 682f9d5a..33314713 100644
--- a/.gitignore
+++ b/.gitignore
@@ -188,6 +188,7 @@ ray_results/
 comet_ml/
 neptune/
 optuna/
+checkpoints/
 
 # Common data file formats (uncomment any you DO want to track)
 *.csv
diff --git a/llava_next/llava/mm_utils.py b/llava_next/llava/mm_utils.py
index 62a3e509..8421dd18 100755
--- a/llava_next/llava/mm_utils.py
+++ b/llava_next/llava/mm_utils.py
@@ -115,6 +115,75 @@ def process_highres_image(image, processor, grid_pinpoints):
     image_patches = [processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0] for image_patch in image_patches]
     return torch.stack(image_patches, dim=0)
 
+def smart_resize(
+    height: int,
+    width: int,
+    patch_size: int = 16,
+    min_pixels: int = 32 * 32,
+):
+    """
+    Rescales the image dimensions so that:
+    1. Both dimensions (height and width) are divisible by 'patch_size' (16 for Siglip2).
+    2. The total number of pixels is at least 'min_pixels'.
+    3. The aspect ratio of the image is maintained as closely as possible.
+
+    This is similar to Qwen2VL's smart_resize but adapted for Siglip2's requirements.
+
+    Args:
+        height: Original image height
+        width: Original image width
+        patch_size: Factor that both dimensions must be divisible by (default: 16)
+        min_pixels: Minimum number of pixels (default: 1024 = 32*32)
+            Note: unlike Qwen2VL's version, no 'max_pixels' upper bound is enforced here.
+
+    Returns:
+        Tuple of (resized_height, resized_width)
+    """
+    if max(height, width) / min(height, width) > 200:
+        raise ValueError(
+            f"Absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
+        )
+
+    # Round each dimension to the nearest multiple of patch_size
+    h_bar = round(height / patch_size) * patch_size
+    w_bar = round(width / patch_size) * patch_size
+
+    # Ensure at least one patch per side
+    h_bar = max(patch_size, h_bar)
+    w_bar = max(patch_size, w_bar)
+
+    if h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = math.ceil(height * beta / patch_size) * patch_size
+        w_bar = math.ceil(width * beta / patch_size) * patch_size
+
+    return h_bar, w_bar
+
+
+def process_native_image(image, processor):
+    orig_width, orig_height = image.size
+    if 'siglip' in processor.__class__.__name__.lower():
+        target_height, target_width = smart_resize(
+            height=orig_height,
+            width=orig_width,
+            patch_size=16,
+            min_pixels=16*4,
+        )
+        image = image.resize((target_width, target_height), Image.BICUBIC)
+        image_patches = [processor.preprocess(image, return_tensors="pt", do_resize=False)["pixel_values"]]
+        grid_thw = [1, target_height // 16, target_width // 16]
+        return {'pixel_values': torch.cat(image_patches, dim=0), 'grid_thw': grid_thw}
+    else:
+        target_height, target_width = smart_resize(
+            height=orig_height,
+            width=orig_width,
+            patch_size=14,
+            min_pixels=14*4,
+        )
+        image = image.resize((target_width, target_height), Image.BICUBIC)
+        image_patches = [processor.preprocess(image, return_tensors="pt", do_resize=False, do_center_crop=False)["pixel_values"]]
+        grid_thw = [1, target_height // 14, target_width // 14]
+        return {'pixel_values': torch.cat(image_patches, dim=0), 'grid_thw': grid_thw}
 
 def select_best_resolution(original_size, possible_resolutions):
     """
@@ -274,6 +343,14 @@ def process_anyres_image(image, processor, grid_pinpoints):
         possible_resolutions = ast.literal_eval(grid_pinpoints)
     best_resolution = select_best_resolution(image.size, possible_resolutions)
     image_padded = resize_and_pad_image(image, best_resolution)
+    if 'siglip' in processor.__class__.__name__.lower():
+        image_patches = [processor.preprocess(image_padded, return_tensors="pt", do_resize=False)["pixel_values"]]
+        grid_thw = [1, best_resolution[1] // 16, best_resolution[0] // 16]
+        return {'pixel_values': torch.cat(image_patches, dim=0), 'grid_thw': grid_thw}
+    else:  # FIXME: for onevision encoder
+        image_patches = [processor.preprocess(image_padded, return_tensors="pt", do_resize=False, do_center_crop=False)["pixel_values"]]
+        grid_thw = [1, best_resolution[1] // 14, best_resolution[0] // 14]
+        return {'pixel_values': torch.cat(image_patches, dim=0), 'grid_thw': grid_thw}
 
     patches = divide_to_patches(image_padded, processor.crop_size["height"])
 
@@ -314,23 +391,51 @@ def expand2square(pil_img, background_color):
 def process_images(images, image_processor, model_cfg):
     image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
     new_images = []
+    if len(images) == 8:  # FIXME: hardcoded; an 8-image input is assumed to be a video sample
+        image_aspect_ratio = 'pad'
+
     if image_aspect_ratio == "highres":
         for image in images:
             image = process_highres_image(image, image_processor, model_cfg.image_grid_pinpoints)
             new_images.append(image)
+    elif image_aspect_ratio == "native":
+        for image in images:
+            image = process_native_image(image, image_processor)
+            new_images.append(image)
     elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
         for image in images:
             image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
             new_images.append(image)
+        return {'image_patchs': [img['pixel_values'] for img in new_images], 'grid_thw': [img['grid_thw'] for img in new_images]}
     elif image_aspect_ratio == "crop_split":
         for image in images:
            image = process_highres_image_crop_split(image, model_cfg, image_processor)
            new_images.append(image)
     elif image_aspect_ratio == "pad":
-        for image in images:
-            image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean))
-            image = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
-            new_images.append(image)
+        if 'siglip' in image_processor.__class__.__name__.lower():
+            image_patchs = []
+            grid_thw = []
+            for image in images:
+                image = expand2square(image, tuple(int(0 * 255) for x in [0,0,0]))
+                image = image.resize((512, 512))
+                image_patchs.append(image_processor.preprocess(image, return_tensors="pt", do_resize=False)["pixel_values"])
+                grid_thw.append([1, 32, 32])
+            return {'image_patchs': image_patchs, 'grid_thw': torch.tensor(grid_thw)}
+
+        else:  # FIXME: for onevision encoder video
+            image_patchs = []
+            grid_thw = []
+            for image in images:
+                image = expand2square(image, tuple(int(0 * 255) for x in [0,0,0]))
+                image = image.resize((504, 504))
+                image_patchs.append(image_processor.preprocess(image, return_tensors="pt", do_resize=False)["pixel_values"])
+                grid_thw.append([1, 36, 36])
+            return {'image_patchs': image_patchs, 'grid_thw': torch.tensor(grid_thw)}
+
+        image = image.resize((504, 504))
+        # image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean))
+        image = image_processor.preprocess(image, return_tensors="pt", do_resize=False)["pixel_values"]
+        new_images.append(image)
     else:
         return image_processor.preprocess(images, return_tensors="pt")["pixel_values"]
     if all(x.shape == new_images[0].shape for x in new_images):
diff --git a/llava_next/llava/model/__init__.py b/llava_next/llava/model/__init__.py
index eb13de0e..c16448ba 100755
--- a/llava_next/llava/model/__init__.py
+++ b/llava_next/llava/model/__init__.py
@@ -17,3 +17,4 @@ from
.language_model.llava_qwen import LlavaQwenForCausalLM, LlavaQwenConfig +from .language_model.llava_qwen3 import LlavaQwen3ForCausalLM, LlavaQwen3Config diff --git a/llava_next/llava/model/builder.py b/llava_next/llava/model/builder.py index faa5d7b6..69ef70d7 100755 --- a/llava_next/llava/model/builder.py +++ b/llava_next/llava/model/builder.py @@ -221,6 +221,16 @@ def load_from_hf(repo_id, filename, subfolder=None): model = LlavaQwenMoeForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, attn_implementation=attn_implementation, config=llava_cfg, **kwargs) else: model = LlavaQwenMoeForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, attn_implementation=attn_implementation, **kwargs) + elif "qwen3" in model_name.lower(): + from llava.model.language_model.llava_qwen3 import LlavaQwen3Config + if overwrite_config is not None: + llava_cfg = LlavaQwen3Config.from_pretrained(model_path) + rank0_print(f"Overwriting config with {overwrite_config}") + for k, v in overwrite_config.items(): + setattr(llava_cfg, k, v) + model = LlavaQwen3ForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, attn_implementation=attn_implementation, config=llava_cfg, **kwargs) + else: + model = LlavaQwen3ForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, attn_implementation=attn_implementation, **kwargs) else: from llava.model.language_model.llava_qwen import LlavaQwenConfig diff --git a/llava_next/llava/model/language_model/llava_gemma.py b/llava_next/llava/model/language_model/llava_gemma.py deleted file mode 100755 index 5c0ac173..00000000 --- a/llava_next/llava/model/language_model/llava_gemma.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright 2024 Duc Q. Nguyen, Haotian Liu and Bo Li -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn as nn -from torch.nn import CrossEntropyLoss - -from transformers import AutoConfig, AutoModelForCausalLM, GemmaConfig, GemmaModel, GemmaForCausalLM - -from transformers.modeling_outputs import CausalLMOutputWithPast -from transformers.generation.utils import GenerateOutput - -from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM - - -class LlavaGemmaConfig(GemmaConfig): - model_type = "llava_gemma" - - -class LlavaGemmaModel(LlavaMetaModel, GemmaModel): - config_class = LlavaGemmaConfig - - def __init__(self, config: GemmaConfig): - super(LlavaGemmaModel, self).__init__(config) - - -class LlavaGemmaForCausalLM(GemmaForCausalLM, LlavaMetaForCausalLM): - config_class = LlavaGemmaConfig - - def __init__(self, config): - super(GemmaForCausalLM, self).__init__(config) - self.model = LlavaGemmaModel(config) - - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_model(self): - return self.model - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - images: Optional[torch.FloatTensor] = None, - image_sizes: Optional[List[List[int]]] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - - if inputs_embeds is None: - (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes) - - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - labels=labels, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - ) - - @torch.no_grad() - def generate( - self, - inputs: Optional[torch.Tensor] = None, - images: Optional[torch.Tensor] = None, - image_sizes: Optional[torch.Tensor] = None, - **kwargs, - ) -> Union[GenerateOutput, torch.LongTensor]: - position_ids = kwargs.pop("position_ids", None) - attention_mask = kwargs.pop("attention_mask", None) - if "inputs_embeds" in kwargs: - raise NotImplementedError("`inputs_embeds` is not supported") - - if images is not None: - (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, image_sizes=image_sizes) - else: - inputs_embeds = self.get_model().embed_tokens(inputs) - - return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) - - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): - images = kwargs.pop("images", None) - image_sizes = kwargs.pop("image_sizes", None) - inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, 
**kwargs) - if images is not None: - inputs["images"] = images - if image_sizes is not None: - inputs["image_sizes"] = image_sizes - return inputs - - -AutoConfig.register("llava_gemma", LlavaGemmaConfig) -AutoModelForCausalLM.register(LlavaGemmaConfig, LlavaGemmaForCausalLM) diff --git a/llava_next/llava/model/language_model/llava_llama.py b/llava_next/llava/model/language_model/llava_llama.py deleted file mode 100755 index 1a930e97..00000000 --- a/llava_next/llava/model/language_model/llava_llama.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright 2023 Haotian Liu -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn as nn - -from transformers import AutoConfig, AutoModelForCausalLM, LlamaConfig - -from torch.nn import CrossEntropyLoss - - -# , LlamaModel, LlamaForCausalLM, GenerationConfig -# from .modeling_llama import LlamaModel, LlamaForCausalLM -from transformers import LlamaModel, LlamaForCausalLM -from transformers.modeling_outputs import CausalLMOutputWithPast -from transformers.generation.utils import GenerateOutput - -from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM - - -class LlavaConfig(LlamaConfig): - model_type = "llava_llama" - temperature: float = 0.0 # reset to 0.0, previously 0.9 for Vicuna - max_new_tokens: int = 1024 - do_sample: bool = False - top_p: Optional[float] = None - # rope_scaling: Optional[dict] = {} - - -class LlavaLlamaModel(LlavaMetaModel, LlamaModel): - config_class = LlavaConfig - - def __init__(self, config: LlamaConfig): - super(LlavaLlamaModel, self).__init__(config) - - -class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM): - config_class = LlavaConfig - - def __init__(self, config): - LlamaForCausalLM.__init__(self, config) - - # configure default generation settings - config.model_type = "llava_llama" - # config.rope_scaling = None - - self.model = LlavaLlamaModel(config) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - # Initialize weights and apply final processing - self.post_init() - - def get_model(self): - return self.model - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - images: Optional[torch.FloatTensor] = None, - image_sizes: Optional[List[List[int]]] = None, - return_dict: Optional[bool] = None, - modalities: Optional[List[str]] = ["image"], - dpo_forward: Optional[bool] = None, - cache_position=None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - - if inputs_embeds is None: - (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = 
self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes) - - if dpo_forward: - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - return logits, labels - - else: - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - labels=labels, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - @torch.no_grad() - def generate( - self, - inputs: Optional[torch.Tensor] = None, - images: Optional[torch.Tensor] = None, - image_sizes: Optional[torch.Tensor] = None, - modalities: Optional[List[str]] = ["image"], - **kwargs, - ) -> Union[GenerateOutput, torch.LongTensor]: - modalities = kwargs.pop("modalities", None) if "modalities" in kwargs and modalities is None else modalities - position_ids = kwargs.pop("position_ids", None) - attention_mask = kwargs.pop("attention_mask", None) - if "inputs_embeds" in kwargs: - raise NotImplementedError("`inputs_embeds` is not supported") - - if images is not None: - (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes) - else: - inputs_embeds = self.get_model().embed_tokens(inputs) - - return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) - - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): - images = kwargs.pop("images", None) - image_sizes = kwargs.pop("image_sizes", None) - inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) - if images is not None: - inputs["images"] = images - if image_sizes is not None: - inputs["image_sizes"] = image_sizes - return inputs - - -AutoConfig.register("llava_llama", LlavaConfig) -AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM) diff --git a/llava_next/llava/model/language_model/llava_mistral.py b/llava_next/llava/model/language_model/llava_mistral.py deleted file mode 100755 index 2cc3b015..00000000 --- a/llava_next/llava/model/language_model/llava_mistral.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright 2023 Haotian Liu -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn as nn -from torch.nn import CrossEntropyLoss - -from transformers import AutoConfig, AutoModelForCausalLM, MistralConfig, MistralModel, MistralForCausalLM, GenerationConfig - -from transformers.modeling_outputs import CausalLMOutputWithPast -from transformers.generation.utils import GenerateOutput - -from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM - - -class LlavaMistralConfig(MistralConfig): - model_type = "llava_mistral" - temperature: float = 0.0 # reset to 0.0, previously 0.9 for Vicuna - max_new_tokens: int = 1024 - do_sample: bool = False - top_p: Optional[float] = None - - -class LlavaMistralModel(LlavaMetaModel, MistralModel): - config_class = LlavaMistralConfig - - def __init__(self, config: MistralConfig): - super(LlavaMistralModel, self).__init__(config) - - -class LlavaMistralForCausalLM(MistralForCausalLM, LlavaMetaForCausalLM): - config_class = LlavaMistralConfig - - def __init__(self, config): - super(MistralForCausalLM, self).__init__(config) - - config.model_type = "llava_mistral" - config.rope_scaling = None - - self.model = LlavaMistralModel(config) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - # Initialize weights and apply final processing - self.post_init() - - def get_model(self): - return self.model - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - images: Optional[torch.FloatTensor] = None, - image_sizes: Optional[List[List[int]]] = None, - return_dict: Optional[bool] = None, - cache_position=None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - - if inputs_embeds is None: - (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes) - - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - labels=labels, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - @torch.no_grad() - def generate( - self, - inputs: Optional[torch.Tensor] = None, - images: Optional[torch.Tensor] = None, - image_sizes: Optional[torch.Tensor] = None, - **kwargs, - ) -> Union[GenerateOutput, torch.LongTensor]: - position_ids = kwargs.pop("position_ids", None) - attention_mask = kwargs.pop("attention_mask", None) - if "inputs_embeds" in kwargs: - raise NotImplementedError("`inputs_embeds` is not supported") - - if images is not None: - (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, image_sizes=image_sizes) - else: - inputs_embeds = self.get_model().embed_tokens(inputs) - - return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) - - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): - 
images = kwargs.pop("images", None) - image_sizes = kwargs.pop("image_sizes", None) - inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) - if images is not None: - inputs["images"] = images - if image_sizes is not None: - inputs["image_sizes"] = image_sizes - return inputs - - -AutoConfig.register("llava_mistral", LlavaMistralConfig) -AutoModelForCausalLM.register(LlavaMistralConfig, LlavaMistralForCausalLM) diff --git a/llava_next/llava/model/language_model/llava_mixtral.py b/llava_next/llava/model/language_model/llava_mixtral.py deleted file mode 100755 index ca6c25da..00000000 --- a/llava_next/llava/model/language_model/llava_mixtral.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright 2023 Haotian Liu -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn as nn -from torch.nn import CrossEntropyLoss - -from transformers import AutoConfig, AutoModelForCausalLM, MixtralConfig, MixtralModel, MixtralForCausalLM, GenerationConfig - -from transformers.modeling_outputs import CausalLMOutputWithPast -from transformers.generation.utils import GenerateOutput - -from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM - - -class LlavaMixtralConfig(MixtralConfig): - model_type = "llava_mixtral" - - -class LlavaMixtralModel(LlavaMetaModel, MixtralModel): - config_class = LlavaMixtralConfig - - def __init__(self, config: MixtralConfig): - super(LlavaMixtralModel, self).__init__(config) - - -class LlavaMixtralForCausalLM(MixtralForCausalLM, LlavaMetaForCausalLM): - config_class = LlavaMixtralConfig - - def __init__(self, config): - super(MixtralForCausalLM, self).__init__(config) - - config.model_type = "llava_mixtral" - config.rope_scaling = None - self.model = LlavaMixtralModel(config) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - # Initialize weights and apply final processing - self.post_init() - - def get_model(self): - return self.model - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - images: Optional[torch.FloatTensor] = None, - image_sizes: Optional[List[List[int]]] = None, - return_dict: Optional[bool] = None, - modalities: Optional[List[str]] = ["image"], - dpo_forward: Optional[bool] = None, - cache_position=None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - - if inputs_embeds is None: - (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes) 
- - if dpo_forward: - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - return logits, labels - - else: - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - labels=labels, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - @torch.no_grad() - def generate( - self, - inputs: Optional[torch.Tensor] = None, - images: Optional[torch.Tensor] = None, - image_sizes: Optional[torch.Tensor] = None, - modalities: Optional[List[str]] = ["image"], - **kwargs, - ) -> Union[GenerateOutput, torch.LongTensor]: - position_ids = kwargs.pop("position_ids", None) - attention_mask = kwargs.pop("attention_mask", None) - if "inputs_embeds" in kwargs: - raise NotImplementedError("`inputs_embeds` is not supported") - - if images is not None: - (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes) - else: - inputs_embeds = self.get_model().embed_tokens(inputs) - - return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs) - - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): - images = kwargs.pop("images", None) - image_sizes = kwargs.pop("image_sizes", None) - inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) - if images is not None: - inputs["images"] = images - if image_sizes is not None: - inputs["image_sizes"] = image_sizes - return inputs - - -AutoConfig.register("llava_mixtral", LlavaMixtralConfig) -AutoModelForCausalLM.register(LlavaMixtralConfig, LlavaMixtralForCausalLM) diff --git a/llava_next/llava/model/language_model/llava_mpt.py b/llava_next/llava/model/language_model/llava_mpt.py deleted file mode 100755 index c3bce7d3..00000000 --- a/llava_next/llava/model/language_model/llava_mpt.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright 2023 Haotian Liu -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from typing import Optional, Tuple - -import torch - -from transformers import AutoConfig, AutoModelForCausalLM, MptConfig, MptForCausalLM, MptModel, GenerationConfig -from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM - - -class LlavaMptConfig(MptConfig): - model_type = "llava_mpt" - - -class LlavaMptModel(LlavaMetaModel, MptModel): - config_class = LlavaMptConfig - - def __init__(self, config: MptConfig): - config.hidden_size = config.d_model - super(LlavaMptModel, self).__init__(config) - - def embed_tokens(self, x): - return self.wte(x) - - -class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM): - config_class = LlavaMptConfig - supports_gradient_checkpointing = True - - def __init__(self, config): - super(MptForCausalLM, self).__init__(config) - - config.model_type = "llava_mpt" - config.rope_scaling = None - self.generation_config = GenerationConfig( - temperature=0.0, - max_new_tokens=1024, - do_sample=False, - top_p=None, - ) - - self.transformer = LlavaMptModel(config) - self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_model(self): - return self.transformer - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, LlavaMptModel): - module.gradient_checkpointing = value - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position=None, - images=None, - ): - - input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) - - return super().forward( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - labels=labels, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): - images = kwargs.pop("images", None) - _inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) - _inputs["images"] = images - return _inputs - - -AutoConfig.register("llava_mpt", LlavaMptConfig) -AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM) diff --git a/llava_next/llava/model/language_model/llava_qwen.py b/llava_next/llava/model/language_model/llava_qwen.py index c1947476..131ffc54 100755 --- a/llava_next/llava/model/language_model/llava_qwen.py +++ b/llava_next/llava/model/language_model/llava_qwen.py @@ -50,7 +50,6 @@ def __init__(self, config): # super(Qwen2ForCausalLM, self).__init__(config) Qwen2ForCausalLM.__init__(self, config) config.model_type = "llava_qwen" - config.rope_scaling = None self.model = LlavaQwenModel(config) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) @@ -77,10 +76,12 @@ def forward( modalities: Optional[List[str]] = ["image"], dpo_forward: Optional[bool] = False, cache_position=None, + grid_thw: Optional[torch.Tensor] = None, + 
visible_indices: Optional[torch.Tensor] = None, ) -> Union[Tuple, CausalLMOutputWithPast]: if inputs_embeds is None: - (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes) + (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes, grid_thw=grid_thw, visible_indices=visible_indices) if dpo_forward: outputs = self.model( @@ -120,6 +121,7 @@ def generate( images: Optional[torch.Tensor] = None, image_sizes: Optional[torch.Tensor] = None, modalities: Optional[List[str]] = ["image"], + grid_thw: Optional[torch.Tensor] = None, **kwargs, ) -> Union[GenerateOutput, torch.LongTensor]: position_ids = kwargs.pop("position_ids", None) @@ -128,7 +130,7 @@ def generate( raise NotImplementedError("`inputs_embeds` is not supported") if images is not None: - (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes) + (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes, grid_thw=grid_thw, visible_indices=kwargs.pop("visible_indices", None)) else: inputs_embeds = self.get_model().embed_tokens(inputs) diff --git a/llava_next/llava/model/language_model/llava_qwen_moe.py b/llava_next/llava/model/language_model/llava_qwen3.py similarity index 79% rename from llava_next/llava/model/language_model/llava_qwen_moe.py rename to llava_next/llava/model/language_model/llava_qwen3.py index 618a4822..dded8319 100755 --- a/llava_next/llava/model/language_model/llava_qwen_moe.py +++ b/llava_next/llava/model/language_model/llava_qwen3.py @@ -19,40 +19,38 @@ from torch.nn import CrossEntropyLoss import transformers -from transformers import AutoConfig, AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM, LlamaConfig, LlamaModel, LlamaForCausalLM from transformers.modeling_outputs import CausalLMOutputWithPast from transformers.generation.utils import GenerateOutput # from ...constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM -from transformers import Qwen2MoeConfig, Qwen2MoeModel, Qwen2MoeForCausalLM +from transformers import Qwen3Config, Qwen3Model, Qwen3ForCausalLM -# from .qwen.modeling_qwen import QWenLMHeadModel, QWenModel -# from .qwen.configuration_qwen import QWenConfig -class LlavaQwenMoeConfig(Qwen2MoeConfig): - model_type = "llava_qwen_moe" +class LlavaQwen3Config(Qwen3Config): + model_type = "llava_qwen3" -class LlavaQwenMoeModel(LlavaMetaModel, Qwen2MoeModel): - config_class = LlavaQwenMoeConfig +class LlavaQwen3Model(LlavaMetaModel, Qwen3Model): + config_class = LlavaQwen3Config - def __init__(self, config: Qwen2MoeConfig): - super(LlavaQwenMoeModel, self).__init__(config) + def __init__(self, config: Qwen3Config): + super(LlavaQwen3Model, self).__init__(config) -class LlavaQwenMoeForCausalLM(Qwen2MoeForCausalLM, LlavaMetaForCausalLM): - config_class = LlavaQwenMoeConfig +class LlavaQwen3ForCausalLM(Qwen3ForCausalLM, 
LlavaMetaForCausalLM): + config_class = LlavaQwen3Config def __init__(self, config): - # super(Qwen2MoeForCausalLM, self).__init__(config) - Qwen2MoeForCausalLM.__init__(self, config) - config.model_type = "llava_qwen_moe" - config.rope_scaling = None + # super(Qwen2ForCausalLM, self).__init__(config) + Qwen3ForCausalLM.__init__(self, config) + config.model_type = "llava_qwen3" + # config.rope_scaling = None # Commented out, newer transformers requires rope_parameters - self.model = LlavaQwenMoeModel(config) + self.model = LlavaQwen3Model(config) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) # Initialize weights and apply final processing self.post_init() @@ -77,11 +75,12 @@ def forward( modalities: Optional[List[str]] = ["image"], dpo_forward: Optional[bool] = False, cache_position=None, + grid_thw: Optional[torch.Tensor] = None, + visible_indices: Optional[torch.Tensor] = None, ) -> Union[Tuple, CausalLMOutputWithPast]: if inputs_embeds is None: - (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes) - + (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes, grid_thw=grid_thw, visible_indices=visible_indices) if dpo_forward: outputs = self.model( input_ids=input_ids, @@ -120,6 +119,7 @@ def generate( images: Optional[torch.Tensor] = None, image_sizes: Optional[torch.Tensor] = None, modalities: Optional[List[str]] = ["image"], + grid_thw: Optional[torch.Tensor] = None, **kwargs, ) -> Union[GenerateOutput, torch.LongTensor]: position_ids = kwargs.pop("position_ids", None) @@ -128,7 +128,7 @@ def generate( raise NotImplementedError("`inputs_embeds` is not supported") if images is not None: - (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes) + (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes, grid_thw=grid_thw, visible_indices=kwargs.pop("visible_indices", None)) else: inputs_embeds = self.get_model().embed_tokens(inputs) @@ -145,5 +145,5 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_ return inputs -AutoConfig.register("llava_qwen_moe", LlavaQwenMoeConfig) -AutoModelForCausalLM.register(LlavaQwenMoeConfig, LlavaQwenMoeForCausalLM) +AutoConfig.register("llava_qwen3", LlavaQwen3Config) +AutoModelForCausalLM.register(LlavaQwen3Config, LlavaQwen3ForCausalLM) diff --git a/llava_next/llava/model/llava_arch.py b/llava_next/llava/model/llava_arch.py index feb70026..a71ddd81 100755 --- a/llava_next/llava/model/llava_arch.py +++ b/llava_next/llava/model/llava_arch.py @@ -189,19 +189,19 @@ def get_2dPool(self, image_feature, stride=2): image_feature = image_feature.view(num_frames, -1, num_dim) return image_feature - def encode_images(self, images): + def encode_images(self, images, grid_thw=None, visible_indices=None): # Check if we need spatial dimensions for spatial_merge projector projector_type = getattr(self.config, "mm_projector_type", "linear") vision_tower = self.get_model().get_vision_tower() if 
projector_type == "spatial_merge": # Request spatial dimensions from vision tower for spatial_merge - image_features, h, w = vision_tower(images, return_spatial_dims=True) + image_features, h, w = vision_tower(images, return_spatial_dims=True, visible_indices=visible_indices) # Pass h and w to the projector image_features = self.get_model().mm_projector(image_features, height=h, width=w) else: # Standard flow for other projector types - image_features = vision_tower(images) + image_features = vision_tower(images, grid_thw=grid_thw, visible_indices=visible_indices) # image_features = self.get_model().vision_resampler(image_features, images=images) image_features = self.get_model().mm_projector(image_features) return image_features @@ -272,7 +272,7 @@ def add_token_per_frame(self, image_feature): image_feature = image_feature.permute(1, 2, 0).contiguous() return image_feature - def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities=["image"], image_sizes=None): + def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities=["image"], image_sizes=None, grid_thw=None, visible_indices=None): vision_tower = self.get_vision_tower() # rank_print(modalities) if vision_tower is None or images is None or input_ids.shape[1] == 1: @@ -284,7 +284,7 @@ def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attentio # import pdb; pdb.set_trace() if type(images) is list or images.ndim == 5: if type(images) is list: - images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images] + images = [x.unsqueeze(0) if x.ndim == 2 else x for x in images] video_idx_in_batch = [] for _ in range(len(modalities)): @@ -293,14 +293,16 @@ def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attentio images_list = [] for image in images: - if image.ndim == 4: + if image.ndim == 3 or image.ndim == 4: images_list.append(image) else: images_list.append(image.unsqueeze(0)) - - concat_images = torch.cat([image for image in images_list], dim=0) - split_sizes = [image.shape[0] for image in images_list] - encoded_image_features = self.encode_images(concat_images) + try: + concat_images = torch.cat([image for image in images_list], dim=0) + split_sizes = [image.shape[0] for image in images_list] + except : + concat_images = images_list + encoded_image_features = self.encode_images(concat_images, grid_thw=grid_thw, visible_indices=visible_indices) # image_features,all_faster_video_features = self.encode_multimodals(concat_images, video_idx_in_batch, split_sizes) # This is a list, each element is [num_images, patch * patch, dim] @@ -442,7 +444,6 @@ def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attentio raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}") else: image_features = self.encode_images(images) - # TODO: image start / end is not implemented here to support pretraining. 
if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(self.config, "mm_use_im_start_end", False): raise NotImplementedError diff --git a/llava_next/llava/model/multimodal_encoder/aim_v2_448_encoder.py b/llava_next/llava/model/multimodal_encoder/aim_v2_448_encoder.py deleted file mode 100755 index 4b4392ff..00000000 --- a/llava_next/llava/model/multimodal_encoder/aim_v2_448_encoder.py +++ /dev/null @@ -1,174 +0,0 @@ -import torch -import torch.nn as nn -from llava.utils import rank0_print -from transformers import CLIPImageProcessor -from .aimv2.modeling_aimv2 import AIMv2Model -from .aimv2.configuration_aimv2 import AIMv2Config -try: - from s2wrapper import forward as multiscale_forward -except: - pass - - -class AIMv2448pxVisionTower(nn.Module): - def __init__(self, vision_tower, args, delay_load=False): - super().__init__() - - self.is_loaded = False - - self.vision_tower_name = vision_tower - self.select_layer = args.mm_vision_select_layer - self.select_feature = getattr(args, "mm_vision_select_feature", "patch") - - if not delay_load: - rank0_print(f"Loading vision tower: {vision_tower}") - self.load_model() - elif getattr(args, "unfreeze_mm_vision_tower", False): - # TODO: better detector is needed. - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") - self.load_model() - elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") - self.load_model() - else: - self.cfg_only = AIMv2Config.from_pretrained(self.vision_tower_name) - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - self.vision_tower = AIMv2Model.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - - self.is_loaded = True - - def feature_select(self, image_forward_outs): - select_feature_type = self.select_feature - - if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: - select_every_k_layer = len(image_forward_outs.hidden_states) // 4 - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1) - select_feature_type = select_feature_type.replace("slicefour_", "") - elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]: - select_layers = [-2, -5, -8, -11, 6] - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in select_layers], dim=-1) - select_feature_type = select_feature_type.replace("slice_m25811_f6_", "") - else: - image_features = image_forward_outs.hidden_states[self.select_layer] - - # if select_feature_type == "patch": - # image_features = image_features[:, 1:] - # elif select_feature_type == "cls_patch": - # image_features = image_features - # else: - # raise ValueError(f"Unexpected select feature: {select_feature_type}") - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) - image_feature = 
self.feature_select(image_forward_out).to(image.dtype) - image_features.append(image_feature) - else: - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - - return image_features - - @property - def dummy_feature(self): - return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) - - @property - def dtype(self): - return self.vision_tower.dtype - - @property - def device(self): - return self.vision_tower.device - - @property - def config(self): - if self.is_loaded: - return self.vision_tower.config - else: - return self.cfg_only - - @property - def hidden_size(self): - _hidden_size = self.config.hidden_size - if "slicefour" in self.select_feature: - _hidden_size *= 4 - if "slice_m25811_f6" in self.select_feature: - _hidden_size *= 5 - return _hidden_size - - @property - def num_patches_per_side(self): - return self.config.image_size // self.config.patch_size - - @property - def num_patches(self): - _num_patches = (self.config.image_size // self.config.patch_size) ** 2 - # if "cls_patch" in self.select_feature: - # _num_patches += 1 - return _num_patches - - @property - def image_size(self): - return self.config.image_size - - -class AIMv2448pxVisionTowerS2(AIMv2448pxVisionTower): - def __init__(self, vision_tower, args, delay_load=False): - - self.s2_scales = getattr(args, "s2_scales", "336,672,1008") - self.s2_scales = list(map(int, self.s2_scales.split(","))) - self.s2_scales.sort() - self.s2_split_size = self.s2_scales[0] - self.s2_image_size = self.s2_scales[-1] - - super().__init__(vision_tower, args, delay_load) - - # change resize/crop size in preprocessing to the largest image size in s2_scale - if not delay_load or getattr(args, "unfreeze_mm_vision_tower", False): - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - self.vision_tower = AIMv2Model.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - self.is_loaded = True - - def forward_feature(self, images): - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_feature = multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - image_features.append(image_feature) - else: - image_features = multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - - return image_features - - @property - def hidden_size(self): - return self.config.hidden_size * len(self.s2_scales) diff --git a/llava_next/llava/model/multimodal_encoder/aim_v2_560_encoder.py 
b/llava_next/llava/model/multimodal_encoder/aim_v2_560_encoder.py deleted file mode 100755 index 1c0c5efd..00000000 --- a/llava_next/llava/model/multimodal_encoder/aim_v2_560_encoder.py +++ /dev/null @@ -1,176 +0,0 @@ -import torch -import torch.nn as nn -from llava.utils import rank0_print -from transformers import CLIPImageProcessor -from .aimv2.modeling_aimv2 import AIMv2Model -from .aimv2.configuration_aimv2 import AIMv2Config -try: - from s2wrapper import forward as multiscale_forward -except: - pass - - -class AIMv2560pxVisionTower(nn.Module): - def __init__(self, vision_tower, args, delay_load=False): - super().__init__() - - self.is_loaded = False - - self.vision_tower_name = vision_tower - self.select_layer = args.mm_vision_select_layer - self.select_feature = getattr(args, "mm_vision_select_feature", "patch") - - if not delay_load: - rank0_print(f"Loading vision tower: {vision_tower}") - self.load_model() - elif getattr(args, "unfreeze_mm_vision_tower", False): - # TODO: better detector is needed. - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") - self.load_model() - elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") - self.load_model() - else: - self.cfg_only = AIMv2Config.from_pretrained(self.vision_tower_name) - - self.num_patches_per_side = 560 // 14 - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - self.vision_tower = AIMv2Model.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - - self.is_loaded = True - - def feature_select(self, image_forward_outs): - select_feature_type = self.select_feature - - if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: - select_every_k_layer = len(image_forward_outs.hidden_states) // 4 - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1) - select_feature_type = select_feature_type.replace("slicefour_", "") - elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]: - select_layers = [-2, -5, -8, -11, 6] - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in select_layers], dim=-1) - select_feature_type = select_feature_type.replace("slice_m25811_f6_", "") - else: - image_features = image_forward_outs.hidden_states[self.select_layer] - - # if select_feature_type == "patch": - # image_features = image_features[:, 1:] - # elif select_feature_type == "cls_patch": - # image_features = image_features - # else: - # raise ValueError(f"Unexpected select feature: {select_feature_type}") - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) - image_feature = self.feature_select(image_forward_out).to(image.dtype) - image_features.append(image_feature) - else: - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - 
image_features = self.feature_select(image_forward_outs).to(images.dtype) - - return image_features - - @property - def dummy_feature(self): - return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) - - @property - def dtype(self): - return self.vision_tower.dtype - - @property - def device(self): - return self.vision_tower.device - - @property - def config(self): - if self.is_loaded: - return self.vision_tower.config - else: - return self.cfg_only - - @property - def hidden_size(self): - _hidden_size = self.config.hidden_size - if "slicefour" in self.select_feature: - _hidden_size *= 4 - if "slice_m25811_f6" in self.select_feature: - _hidden_size *= 5 - return _hidden_size - - # @property - # def num_patches_per_side(self): - # return self.config.image_size // self.config.patch_size - - @property - def num_patches(self): - # _num_patches = (self.config.image_size // self.config.patch_size) ** 2 - # if "cls_patch" in self.select_feature: - # _num_patches += 1 - return 1600 - - @property - def image_size(self): - return 560 - - -class AIMv2560pxVisionTowerS2(AIMv2560pxVisionTower): - def __init__(self, vision_tower, args, delay_load=False): - - self.s2_scales = getattr(args, "s2_scales", "336,672,1008") - self.s2_scales = list(map(int, self.s2_scales.split(","))) - self.s2_scales.sort() - self.s2_split_size = self.s2_scales[0] - self.s2_image_size = self.s2_scales[-1] - - super().__init__(vision_tower, args, delay_load) - - # change resize/crop size in preprocessing to the largest image size in s2_scale - if not delay_load or getattr(args, "unfreeze_mm_vision_tower", False): - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - self.vision_tower = AIMv2Model.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - self.is_loaded = True - - def forward_feature(self, images): - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_feature = multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - image_features.append(image_feature) - else: - image_features = multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - - return image_features - - @property - def hidden_size(self): - return self.config.hidden_size * len(self.s2_scales) diff --git a/llava_next/llava/model/multimodal_encoder/aim_v2_encoder.py b/llava_next/llava/model/multimodal_encoder/aim_v2_encoder.py deleted file mode 100755 index 393a6c7c..00000000 --- a/llava_next/llava/model/multimodal_encoder/aim_v2_encoder.py +++ /dev/null @@ -1,174 +0,0 @@ -import torch -import torch.nn as nn 
-from llava.utils import rank0_print -from transformers import CLIPImageProcessor -from .aimv2.modeling_aimv2 import AIMv2Model -from .aimv2.configuration_aimv2 import AIMv2Config -try: - from s2wrapper import forward as multiscale_forward -except: - pass - - -class AIMv2VisionTower(nn.Module): - def __init__(self, vision_tower, args, delay_load=False): - super().__init__() - - self.is_loaded = False - - self.vision_tower_name = vision_tower - self.select_layer = args.mm_vision_select_layer - self.select_feature = getattr(args, "mm_vision_select_feature", "patch") - - if not delay_load: - rank0_print(f"Loading vision tower: {vision_tower}") - self.load_model() - elif getattr(args, "unfreeze_mm_vision_tower", False): - # TODO: better detector is needed. - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") - self.load_model() - elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") - self.load_model() - else: - self.cfg_only = AIMv2Config.from_pretrained(self.vision_tower_name) - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - self.vision_tower = AIMv2Model.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - - self.is_loaded = True - - def feature_select(self, image_forward_outs): - select_feature_type = self.select_feature - - if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: - select_every_k_layer = len(image_forward_outs.hidden_states) // 4 - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1) - select_feature_type = select_feature_type.replace("slicefour_", "") - elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]: - select_layers = [-2, -5, -8, -11, 6] - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in select_layers], dim=-1) - select_feature_type = select_feature_type.replace("slice_m25811_f6_", "") - else: - image_features = image_forward_outs.hidden_states[self.select_layer] - - # if select_feature_type == "patch": - # image_features = image_features[:, 1:] - # elif select_feature_type == "cls_patch": - # image_features = image_features - # else: - # raise ValueError(f"Unexpected select feature: {select_feature_type}") - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) - image_feature = self.feature_select(image_forward_out).to(image.dtype) - image_features.append(image_feature) - else: - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - - return image_features - - @property - def dummy_feature(self): - return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) - - @property - def dtype(self): - return self.vision_tower.dtype - - @property - 
def device(self): - return self.vision_tower.device - - @property - def config(self): - if self.is_loaded: - return self.vision_tower.config - else: - return self.cfg_only - - @property - def hidden_size(self): - _hidden_size = self.config.hidden_size - if "slicefour" in self.select_feature: - _hidden_size *= 4 - if "slice_m25811_f6" in self.select_feature: - _hidden_size *= 5 - return _hidden_size - - @property - def num_patches_per_side(self): - return self.config.image_size // self.config.patch_size - - @property - def num_patches(self): - _num_patches = (self.config.image_size // self.config.patch_size) ** 2 - # if "cls_patch" in self.select_feature: - # _num_patches += 1 - return _num_patches - - @property - def image_size(self): - return self.config.image_size - - -class AIMv2VisionTowerS2(AIMv2VisionTower): - def __init__(self, vision_tower, args, delay_load=False): - - self.s2_scales = getattr(args, "s2_scales", "336,672,1008") - self.s2_scales = list(map(int, self.s2_scales.split(","))) - self.s2_scales.sort() - self.s2_split_size = self.s2_scales[0] - self.s2_image_size = self.s2_scales[-1] - - super().__init__(vision_tower, args, delay_load) - - # change resize/crop size in preprocessing to the largest image size in s2_scale - if not delay_load or getattr(args, "unfreeze_mm_vision_tower", False): - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - self.vision_tower = AIMv2Model.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - self.is_loaded = True - - def forward_feature(self, images): - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_feature = multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - image_features.append(image_feature) - else: - image_features = multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - - return image_features - - @property - def hidden_size(self): - return self.config.hidden_size * len(self.s2_scales) diff --git a/llava_next/llava/model/multimodal_encoder/aimv2/configuration_aimv2.py b/llava_next/llava/model/multimodal_encoder/aimv2/configuration_aimv2.py deleted file mode 100755 index dfafbf69..00000000 --- a/llava_next/llava/model/multimodal_encoder/aimv2/configuration_aimv2.py +++ /dev/null @@ -1,61 +0,0 @@ -from typing import Any - -from transformers.configuration_utils import PretrainedConfig - -__all__ = ["AIMv2Config"] - - -class AIMv2Config(PretrainedConfig): - """This is the configuration class to store the configuration of an [`AIMv2Model`]. 
- Instantiating a configuration with the defaults will yield a similar configuration - to that of the [apple/aimv2-large-patch14-native](https://huggingface.co/apple/aimv2-large-patch14-native) - Args: - hidden_size: Dimension of the hidden representations. - intermediate_size: Dimension of the SwiGLU representations. - num_hidden_layers: Number of hidden layers in the Transformer. - num_attention_heads: Number of attention heads for each attention layer - in the Transformer. - num_channels: Number of input channels. - num_queries: Number of learnable queries in the head. - patch_size: Patch size. - rms_norm_eps: Epsilon value used for the RMS normalization layer. - attention_dropout: Dropout ratio for attention probabilities. - projection_dropout: Dropout ratio for the projection layer after the attention. - qkv_bias: Whether to add a bias to the queries, keys and values. - use_bias: Whether to add a bias in the feed-forward and projection layers. - kwargs: Keyword arguments for the [`PretrainedConfig`]. - """ - - model_type: str = "aimv2" - - def __init__( - self, - hidden_size: int = 1024, - intermediate_size: int = 2816, - num_hidden_layers: int = 24, - num_attention_heads: int = 8, - num_channels: int = 3, - num_queries: int = 256, - patch_size: int = 14, - rms_norm_eps: float = 1e-5, - attention_dropout: float = 0.0, - projection_dropout: float = 0.0, - qkv_bias: bool = False, - use_bias: bool = False, - **kwargs: Any, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.num_queries = num_queries - self.patch_size = patch_size - self.attention_dropout = attention_dropout - self.rms_norm_eps = rms_norm_eps - - self.projection_dropout = projection_dropout - self.qkv_bias = qkv_bias - self.use_bias = use_bias - self.image_size = 560 \ No newline at end of file diff --git a/llava_next/llava/model/multimodal_encoder/aimv2/modeling_aimv2.py b/llava_next/llava/model/multimodal_encoder/aimv2/modeling_aimv2.py deleted file mode 100755 index b457b18a..00000000 --- a/llava_next/llava/model/multimodal_encoder/aimv2/modeling_aimv2.py +++ /dev/null @@ -1,221 +0,0 @@ -from typing import Optional, Tuple, Union - -import torch -from .configuration_aimv2 import AIMv2Config -from torch import nn -from torch.nn import functional as F -from transformers.modeling_outputs import BaseModelOutputWithNoAttention -from transformers.modeling_utils import PreTrainedModel - -__all__ = ["AIMv2Model"] - - -def _get_1d_sincos_pos_embed_from_grid( - embed_dim: int, pos: torch.Tensor -) -> torch.Tensor: - omega = torch.arange(float(embed_dim) // 2).bfloat16() - omega /= embed_dim / 2.0 - omega = 1.0 / 10000**omega # (D / 2,) - pos = pos.reshape(-1) # (M,) - out = pos[:, None] * omega[None, :] # (M, D / 2), outer product - emb_sin, emb_cos = torch.sin(out), torch.cos(out) # (M, D / 2) - emb = torch.concatenate([emb_sin, emb_cos], dim=1) # (M, D) - return emb - - -def get_sincos_pos_embed(h: int, w: int, embed_dim: int) -> torch.Tensor: - assert embed_dim % 2 == 0, embed_dim - grid_h = torch.arange(h) - grid_w = torch.arange(w) - grid = torch.meshgrid(grid_w, grid_h, indexing="xy") - grid = torch.stack(grid, dim=0) - grid = grid.reshape([2, 1, h, w]) - emb_h = _get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) - emb_w = _get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) - pos_embed = torch.concatenate([emb_h, 
emb_w], dim=1) # (H * W, D) - return pos_embed - - -class RMSNorm(nn.Module): - def __init__(self, dim: int, eps: float = 1e-6): - super().__init__() - self.weight = nn.Parameter(torch.ones(dim)) - self.eps = eps - - def forward(self, x: torch.Tensor) -> torch.Tensor: - # output = self._norm(x.float()).type_as(x) - output = self._norm(x).type_as(x) - return output * self.weight - - def extra_repr(self) -> str: - return f"{tuple(self.weight.shape)}, eps={self.eps}" - - def _norm(self, x: torch.Tensor) -> torch.Tensor: - return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) - - -class AIMv2SwiGLUFFN(nn.Module): - def __init__(self, config: AIMv2Config): - super().__init__() - hidden_features = config.intermediate_size - in_features = config.hidden_size - bias = config.use_bias - - self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) - self.fc2 = nn.Linear(hidden_features, in_features, bias=bias) - self.fc3 = nn.Linear(in_features, hidden_features, bias=bias) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = F.silu(self.fc1(x)) * self.fc3(x) - x = self.fc2(x) - return x - - -class AIMv2PatchEmbed(nn.Module): - def __init__(self, config: AIMv2Config): - super().__init__() - self.proj = nn.Conv2d( - config.num_channels, - config.hidden_size, - kernel_size=(config.patch_size, config.patch_size), - stride=(config.patch_size, config.patch_size), - ) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.proj(x).flatten(2).transpose(1, 2) - x = self.norm(x) - return x - - -class AIMv2ViTPreprocessor(nn.Module): - def __init__(self, config: AIMv2Config): - super().__init__() - self.patch_h = config.patch_size - self.patch_w = config.patch_size - self.embed_dim = config.hidden_size - - self.patchifier = AIMv2PatchEmbed(config) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - _, _, H, W = x.shape - tokens = self.patchifier(x) - pos_embed = get_sincos_pos_embed( - H // self.patch_h, W // self.patch_w, embed_dim=self.embed_dim - ) - tokens = tokens + pos_embed.to(tokens.device) - return tokens - - -class AIMv2Attention(nn.Module): - def __init__(self, config: AIMv2Config): - super().__init__() - dim = config.hidden_size - - self.num_heads = config.num_attention_heads - self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias) - self.attn_drop = nn.Dropout(config.attention_dropout) - self.proj = nn.Linear(dim, dim, bias=config.use_bias) - self.proj_drop = nn.Dropout(config.projection_dropout) - - def forward( - self, x: torch.Tensor, mask: Optional[torch.Tensor] = None - ) -> torch.Tensor: - B, N, C = x.shape - qkv = ( - self.qkv(x) - .reshape(B, N, 3, self.num_heads, C // self.num_heads) - .permute(2, 0, 3, 1, 4) - ) - q, k, v = qkv.unbind(0) - - x = F.scaled_dot_product_attention(q, k, v, attn_mask=mask) - x = x.transpose(1, 2).contiguous().reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class AIMv2Block(nn.Module): - def __init__(self, config: AIMv2Config): - super().__init__() - self.attn = AIMv2Attention(config) - self.norm_1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.mlp = AIMv2SwiGLUFFN(config) - self.norm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, x: torch.Tensor, mask: Optional[torch.Tensor] = None - ) -> torch.Tensor: - x = x + self.attn(self.norm_1(x), mask) - x = x + self.mlp(self.norm_2(x)) - return x - - -class AIMv2Transformer(nn.Module): - def __init__(self, config: AIMv2Config): - 
super().__init__() - self.blocks = nn.ModuleList( - [AIMv2Block(config) for _ in range(config.num_hidden_layers)] - ) - self.post_trunk_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - tokens: torch.Tensor, - mask: Optional[torch.Tensor] = None, - output_hidden_states: bool = False, - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, ...]]]: - hidden_states = () if output_hidden_states else None - for block in self.blocks: - tokens = block(tokens, mask) - if output_hidden_states: - hidden_states += (tokens,) - tokens = self.post_trunk_norm(tokens) - return tokens, hidden_states - - -class AIMv2PretrainedModel(PreTrainedModel): - config_class = AIMv2Config - base_model_prefix = "aimv2" - main_input_name = "pixel_values" - _no_split_modules = ["AIMv2ViTPreprocessor", "AIMv2Block"] - _supports_sdpa = True - - -class AIMv2Model(AIMv2PretrainedModel): - def __init__(self, config: AIMv2Config): - super().__init__(config) - self.preprocessor = AIMv2ViTPreprocessor(config) - self.trunk = AIMv2Transformer(config) - - def forward( - self, - pixel_values: torch.Tensor, - mask: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[ - Tuple[torch.Tensor], - Tuple[torch.Tensor, Tuple[torch.Tensor, ...]], - BaseModelOutputWithNoAttention, - ]: - if output_hidden_states is None: - output_hidden_states = self.config.output_hidden_states - if return_dict is None: - return_dict = self.config.use_return_dict - - x = self.preprocessor(pixel_values) - x, hidden_states = self.trunk( - x, mask, output_hidden_states=output_hidden_states - ) - - if not return_dict: - res = (x,) - res += (hidden_states,) if output_hidden_states else () - return res - - return BaseModelOutputWithNoAttention( - last_hidden_state=x, - hidden_states=hidden_states, - ) diff --git a/llava_next/llava/model/multimodal_encoder/builder.py b/llava_next/llava/model/multimodal_encoder/builder.py index 9c0a48b0..aba5b1be 100755 --- a/llava_next/llava/model/multimodal_encoder/builder.py +++ b/llava_next/llava/model/multimodal_encoder/builder.py @@ -1,81 +1,16 @@ import os from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 -from .imagebind import ImageBindWrapper -from .open_clip_encoder import OpenCLIPVisionTower -from .hf_vision import HFVisionTower -from .siglip_encoder import SigLipVisionTower -from .mlcd_encoder import MLCDVisionTower, MLCDVisionTowerS2 -from .internViT_300M_448px_encoder import InternViT_300M_448px_VisionTower, InternViT_300M_448px_VisionTowerS2 -from .eva_8b_448px_encoder import EVA_8B_448px_VisionTower, EVA_8B_448px_VisionTowerS2 -from .hevc_vit_tower import HEVCViTVisionTower -from .hevc_vit_packing_tower import HEVCViTPackingVisionTower +from .siglip2_naflex import SigLip2NaflexVisionTower +from .onevision_encoder import OneVisionEncoderTower def build_vision_tower(vision_tower_cfg, **kwargs): vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None)) use_s2 = getattr(vision_tower_cfg, "s2", False) - # 1. 
HEVC-ViT (Your New Model) - priority match - # Check for packing mode first (more specific match) - if "hevc_vit_packing" in vision_tower.lower() or "packing" in vision_tower.lower(): - return HEVCViTPackingVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) - elif "hevc_vit" in vision_tower.lower(): - return HEVCViTVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) + if "siglip2" in vision_tower: + return SigLip2NaflexVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs) - # 2. MLCD Vision Towers - list_mlcd_vision_towers = [ - "rice-vit-huge-patch14", - "mlcd-vit-bigG-patch14", - "rice-vit-bigG-patch14", - "rice-vit-large-patch14" - ] - for _mlcd_tower_name in list_mlcd_vision_towers: - if _mlcd_tower_name in vision_tower: - if use_s2: - return MLCDVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) - else: - return MLCDVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) - - # 3. Specific CLIP variants - if "rice-vit-large-patch14-378" in vision_tower: - if use_s2: - return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) - else: - return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) - - # 4. EVA-8B - elif "EVA_8B_448px" in vision_tower: - if use_s2: - return EVA_8B_448px_VisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) - else: - return EVA_8B_448px_VisionTower(vision_tower, args=vision_tower_cfg, **kwargs) - - # 5. InternViT - elif "InternViT-300M-448px" in vision_tower: - if use_s2: - return InternViT_300M_448px_VisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) - else: - return InternViT_300M_448px_VisionTower(vision_tower, args=vision_tower_cfg, **kwargs) - - # 6. General CLIP / DFN / ShareGPT4V / DeepGlint / MLCD (generic match) - # Put the more generic matches after the model-specific ones here - elif any(x in vision_tower for x in ["clip", "mlcd", "unicom", "ShareGPT4V"]) or \ - any(vision_tower.lower().startswith(x) for x in ["dfn", "openai", "laion", "deepglint"]): - if use_s2: - return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) - else: - return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) - - # 7.
Other Specific Architectures - elif "siglip" in vision_tower: - return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs) - - elif vision_tower.startswith("hf:"): - return HFVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) - - elif vision_tower in ["imagebind_huge"]: - return ImageBindWrapper(vision_tower, args=vision_tower_cfg, **kwargs) - - elif vision_tower.startswith("open_clip_hub"): - return OpenCLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) + elif "onevision" in vision_tower: + return OneVisionEncoderTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs) raise ValueError(f"Unknown vision tower: {vision_tower}") diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/__init__.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/__init__.py deleted file mode 100755 index ede69005..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD -from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer -from .factory import list_models, add_model_config, get_model_config, load_checkpoint -from .loss import ClipLoss -from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg, convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype -from .openai import load_openai_model, list_openai_models -from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained -from .tokenizer import SimpleTokenizer, tokenize -from .transform import image_transform diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/constants.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/constants.py deleted file mode 100755 index a670bb3f..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/constants.py +++ /dev/null @@ -1,2 +0,0 @@ -OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) -OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/eva_vit_model.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/eva_vit_model.py deleted file mode 100755 index 23cb38c9..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/eva_vit_model.py +++ /dev/null @@ -1,571 +0,0 @@ -# -------------------------------------------------------- -# Adapted from https://github.com/microsoft/unilm/tree/master/beit -# -------------------------------------------------------- -import math -import os -import torch -import torch.nn as nn -import torch.nn.functional as F - -try: - from timm.models.layers import drop_path, to_2tuple, trunc_normal_ -except: - from timm.layers import drop_path, to_2tuple, trunc_normal_ - -from .transformer import PatchDropout -from .rope import VisionRotaryEmbedding, VisionRotaryEmbeddingFast - -if os.getenv("ENV_TYPE") == "deepspeed": - try: - from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint - except: - from torch.utils.checkpoint import checkpoint -else: - from torch.utils.checkpoint import checkpoint - -try: - import xformers.ops as xops -except ImportError: - xops = None - # print("Please 'pip install xformers'") - - -class DropPath(nn.Module): - """Drop paths 
(Stochastic Depth) per sample (when applied in main path of residual blocks).""" - - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - def extra_repr(self) -> str: - return "p={}".format(self.drop_prob) - - -class Mlp(nn.Module): - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - drop=0.0, - subln=False, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - - self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity() - - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - # x = self.drop(x) - # commit this for the orignal BERT implement - x = self.ffn_ln(x) - - x = self.fc2(x) - x = self.drop(x) - return x - - -class SwiGLU(nn.Module): - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0.0, norm_layer=nn.LayerNorm, subln=False): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - - self.w1 = nn.Linear(in_features, hidden_features) - self.w2 = nn.Linear(in_features, hidden_features) - - self.act = act_layer() - self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity() - self.w3 = nn.Linear(hidden_features, out_features) - - self.drop = nn.Dropout(drop) - - def forward(self, x): - x1 = self.w1(x) - x2 = self.w2(x) - hidden = self.act(x1) * x2 - x = self.ffn_ln(hidden) - x = self.w3(x) - x = self.drop(x) - return x - - -class Attention(nn.Module): - def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.0, proj_drop=0.0, window_size=None, attn_head_dim=None, xattn=False, rope=None, subln=False, norm_layer=nn.LayerNorm): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - if attn_head_dim is not None: - head_dim = attn_head_dim - all_head_dim = head_dim * self.num_heads - self.scale = qk_scale or head_dim**-0.5 - - self.subln = subln - if self.subln: - self.q_proj = nn.Linear(dim, all_head_dim, bias=False) - self.k_proj = nn.Linear(dim, all_head_dim, bias=False) - self.v_proj = nn.Linear(dim, all_head_dim, bias=False) - else: - self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) - - if qkv_bias: - self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) - self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) - else: - self.q_bias = None - self.v_bias = None - - if window_size: - self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 - self.relative_position_bias_table = nn.Parameter(torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH - # cls to token & token 2 cls & cls to cls - - # get pair-wise relative position index for each token inside the window - coords_h = torch.arange(window_size[0]) - coords_w = torch.arange(window_size[1]) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += 
window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 - - self.register_buffer("relative_position_index", relative_position_index) - else: - self.window_size = None - self.relative_position_bias_table = None - self.relative_position_index = None - - self.attn_drop = nn.Dropout(attn_drop) - self.inner_attn_ln = norm_layer(all_head_dim) if subln else nn.Identity() - # self.proj = nn.Linear(all_head_dim, all_head_dim) - self.proj = nn.Linear(all_head_dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - self.xattn = xattn - self.xattn_drop = attn_drop - - self.rope = rope - - def forward(self, x, rel_pos_bias=None, attn_mask=None): - B, N, C = x.shape - if self.subln: - q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias) - k = F.linear(input=x, weight=self.k_proj.weight, bias=None) - v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias) - - q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) # B, num_heads, N, C - k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) - v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) - else: - - qkv_bias = None - if self.q_bias is not None: - qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias)) - - qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) - qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) # 3, B, num_heads, N, C - q, k, v = qkv[0], qkv[1], qkv[2] - - if self.rope: - # slightly fast impl - q_t = q[:, :, 1:, :] - ro_q_t = self.rope(q_t) - q = torch.cat((q[:, :, :1, :], ro_q_t), -2).type_as(v) - - k_t = k[:, :, 1:, :] - ro_k_t = self.rope(k_t) - k = torch.cat((k[:, :, :1, :], ro_k_t), -2).type_as(v) - - if self.xattn: - q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C - k = k.permute(0, 2, 1, 3) - v = v.permute(0, 2, 1, 3) - - x = xops.memory_efficient_attention( - q, - k, - v, - p=self.xattn_drop, - scale=self.scale, - ) - x = x.reshape(B, N, -1) - x = self.inner_attn_ln(x) - x = self.proj(x) - x = self.proj_drop(x) - else: - q = q * self.scale - attn = q @ k.transpose(-2, -1) - - if self.relative_position_bias_table is not None: - relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0).type_as(attn) - - if rel_pos_bias is not None: - attn = attn + rel_pos_bias.type_as(attn) - - if attn_mask is not None: - attn_mask = attn_mask.bool() - attn = attn.masked_fill(~attn_mask[:, None, None, :], float("-inf")) - - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, -1) - x = self.inner_attn_ln(x) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class Block(nn.Module): - - def __init__( - self, - dim, - num_heads, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, 
- drop=0.0, - attn_drop=0.0, - drop_path=0.0, - init_values=None, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - window_size=None, - attn_head_dim=None, - xattn=False, - rope=None, - postnorm=False, - subln=False, - naiveswiglu=False, - ): - super().__init__() - self.norm1 = norm_layer(dim) - self.attn = Attention( - dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim, xattn=xattn, rope=rope, subln=subln, norm_layer=norm_layer - ) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - - if naiveswiglu: - self.mlp = SwiGLU( - in_features=dim, - hidden_features=mlp_hidden_dim, - subln=subln, - norm_layer=norm_layer, - ) - else: - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, subln=subln, drop=drop) - - if init_values is not None and init_values > 0: - self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True) - self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True) - else: - self.gamma_1, self.gamma_2 = None, None - - self.postnorm = postnorm - - def forward(self, x, rel_pos_bias=None, attn_mask=None): - if self.gamma_1 is None: - if self.postnorm: - x = x + self.drop_path(self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))) - x = x + self.drop_path(self.norm2(self.mlp(x))) - else: - x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)) - x = x + self.drop_path(self.mlp(self.norm2(x))) - else: - if self.postnorm: - x = x + self.drop_path(self.gamma_1 * self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))) - x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x))) - else: - x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)) - x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - return x - - -class PatchEmbed(nn.Module): - """Image to Patch Embedding""" - - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) - self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) - self.img_size = img_size - self.patch_size = patch_size - self.num_patches = num_patches - - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - - def forward(self, x, **kwargs): - B, C, H, W = x.shape - # FIXME look at relaxing size constraints - assert H == self.img_size[0] and W == self.img_size[1], f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
- x = self.proj(x).flatten(2).transpose(1, 2) - return x - - -class RelativePositionBias(nn.Module): - - def __init__(self, window_size, num_heads): - super().__init__() - self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 - self.relative_position_bias_table = nn.Parameter(torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH - # cls to token & token 2 cls & cls to cls - - # get pair-wise relative position index for each token inside the window - coords_h = torch.arange(window_size[0]) - coords_w = torch.arange(window_size[1]) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 - - self.register_buffer("relative_position_index", relative_position_index) - - def forward(self): - relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH - return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - - -class EVAVisionTransformer(nn.Module): - """Vision Transformer with support for patch or hybrid CNN input stage""" - - def __init__( - self, - img_size=224, - patch_size=16, - in_chans=3, - num_classes=1000, - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - drop_rate=0.0, - attn_drop_rate=0.0, - drop_path_rate=0.0, - norm_layer=nn.LayerNorm, - init_values=None, - patch_dropout=0.0, - use_abs_pos_emb=True, - use_rel_pos_bias=False, - use_shared_rel_pos_bias=False, - rope=False, - use_mean_pooling=True, - init_scale=0.001, - grad_checkpointing=False, - xattn=False, - postnorm=False, - pt_hw_seq_len=16, - intp_freq=False, - naiveswiglu=False, - subln=False, - ): - super().__init__() - self.image_size = img_size - self.num_classes = num_classes - self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models - - self.patch_embed = PatchEmbed(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) - num_patches = self.patch_embed.num_patches - - self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) - # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) - if use_abs_pos_emb: - self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) - else: - self.pos_embed = None - self.pos_drop = nn.Dropout(p=drop_rate) - - if use_shared_rel_pos_bias: - self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads) - else: - self.rel_pos_bias = None - - if rope: - half_head_dim = embed_dim // num_heads // 2 - hw_seq_len = img_size // patch_size - self.rope = 
VisionRotaryEmbeddingFast( - dim=half_head_dim, - pt_seq_len=pt_hw_seq_len, - ft_seq_len=hw_seq_len if intp_freq else None, - # patch_dropout=patch_dropout - ) - else: - self.rope = None - - self.naiveswiglu = naiveswiglu - - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule - self.use_rel_pos_bias = use_rel_pos_bias - self.blocks = nn.ModuleList( - [ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - init_values=init_values, - window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None, - xattn=xattn, - rope=self.rope, - postnorm=postnorm, - subln=subln, - naiveswiglu=naiveswiglu, - ) - for i in range(depth) - ] - ) - self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim) - self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None - self.head = nn.Linear(embed_dim, num_classes, bias=qkv_bias) if num_classes > 0 else nn.Identity() - - if self.pos_embed is not None: - trunc_normal_(self.pos_embed, std=0.02) - - trunc_normal_(self.cls_token, std=0.02) - - self.apply(self._init_weights) - self.fix_init_weight() - - if isinstance(self.head, nn.Linear): - trunc_normal_(self.head.weight, std=0.02) - self.head.weight.data.mul_(init_scale) - if self.head.bias is not None: - self.head.bias.data.mul_(init_scale) - - # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn - self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0.0 else nn.Identity() - - self.grad_checkpointing = grad_checkpointing - - def fix_init_weight(self): - def rescale(param, layer_id): - param.div_(math.sqrt(2.0 * layer_id)) - - for layer_id, layer in enumerate(self.blocks): - rescale(layer.attn.proj.weight.data, layer_id + 1) - if self.naiveswiglu: - rescale(layer.mlp.w3.weight.data, layer_id + 1) - else: - rescale(layer.mlp.fc2.weight.data, layer_id + 1) - - def get_cast_dtype(self) -> torch.dtype: - return self.blocks[0].mlp.fc2.weight.dtype - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def get_num_layers(self): - return len(self.blocks) - - def lock(self, unlocked_groups=0, freeze_bn_stats=False): - assert unlocked_groups == 0, "partial locking not currently supported for this model" - for param in self.parameters(): - param.requires_grad = False - - @torch.jit.ignore - def set_grad_checkpointing(self, enable=True): - self.grad_checkpointing = enable - - @torch.jit.ignore - def no_weight_decay(self): - return {"pos_embed", "cls_token"} - - def get_classifier(self): - return self.head - - def reset_classifier(self, num_classes, global_pool=""): - self.num_classes = num_classes - self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - - def forward_features(self, x, return_all_features=False): - - x = self.patch_embed(x) - batch_size, seq_len, _ = x.size() - - cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks - x = torch.cat((cls_tokens, x), dim=1) - if self.pos_embed is not None: - x = x + self.pos_embed - x = self.pos_drop(x) - - # a patch_dropout of 0. 
would mean it is disabled and this function would do nothing but return what was passed in - # if os.getenv("RoPE") == "1": - # if self.training and not isinstance(self.patch_dropout, nn.Identity): - # x, patch_indices_keep = self.patch_dropout(x) - # self.rope.forward = partial(self.rope.forward, patch_indices_keep=patch_indices_keep) - # else: - # self.rope.forward = partial(self.rope.forward, patch_indices_keep=None) - # x = self.patch_dropout(x) - # else: - x = self.patch_dropout(x) - - rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None - for blk in self.blocks: - if self.grad_checkpointing: - x = checkpoint(blk, x, (rel_pos_bias,)) - else: - x = blk(x, rel_pos_bias=rel_pos_bias) - - if not return_all_features: - x = self.norm(x) - if self.fc_norm is not None: - return self.fc_norm(x.mean(1)) - else: - return x[:, 0] - return x - - def forward(self, x, return_all_features=False): - if return_all_features: - return self.forward_features(x, return_all_features) - x = self.forward_features(x) - x = self.head(x) - return x diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/factory.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/factory.py deleted file mode 100755 index 535eeacb..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/factory.py +++ /dev/null @@ -1,528 +0,0 @@ -import json -import logging -import os -import pathlib -import re -from copy import deepcopy -from pathlib import Path -from typing import Optional, Tuple, Union, Dict, Any -import torch - -try: - import deepspeed -except ImportError: - deepspeed = None - -from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD -from .model import CLIP, CustomCLIP, convert_weights_to_lp, convert_to_custom_text_state_dict, get_cast_dtype -from .openai import load_openai_model -from .pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained, list_pretrained_tags_by_model -from .transform import image_transform -from .tokenizer import HFTokenizer, tokenize -from .utils import resize_clip_pos_embed, resize_evaclip_pos_embed, resize_visual_pos_embed, resize_eva_pos_embed - - -_MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"] -_MODEL_CONFIGS = {} # directory (model_name: config) of model architecture configs - - -def _natural_key(string_): - return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_.lower())] - - -def _rescan_model_configs(): - global _MODEL_CONFIGS - - config_ext = (".json",) - config_files = [] - for config_path in _MODEL_CONFIG_PATHS: - if config_path.is_file() and config_path.suffix in config_ext: - config_files.append(config_path) - elif config_path.is_dir(): - for ext in config_ext: - config_files.extend(config_path.glob(f"*{ext}")) - - for cf in config_files: - with open(cf, "r", encoding="utf8") as f: - model_cfg = json.load(f) - if all(a in model_cfg for a in ("embed_dim", "vision_cfg", "text_cfg")): - _MODEL_CONFIGS[cf.stem] = model_cfg - - _MODEL_CONFIGS = dict(sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0]))) - - -_rescan_model_configs() # initial populate of model config registry - - -def list_models(): - """enumerate available model architectures based on config files""" - return list(_MODEL_CONFIGS.keys()) - - -def add_model_config(path): - """add model config path or file and update registry""" - if not isinstance(path, Path): - path = Path(path) - _MODEL_CONFIG_PATHS.append(path) - _rescan_model_configs() - - -def 
get_model_config(model_name): - if model_name in _MODEL_CONFIGS: - return deepcopy(_MODEL_CONFIGS[model_name]) - else: - return None - - -def get_tokenizer(model_name): - config = get_model_config(model_name) - tokenizer = HFTokenizer(config["text_cfg"]["hf_tokenizer_name"]) if "hf_tokenizer_name" in config["text_cfg"] else tokenize - return tokenizer - - -# loading openai CLIP weights when is_openai=True for training -def load_state_dict(checkpoint_path: str, map_location: str = "cpu", model_key: str = "model|module|state_dict", is_openai: bool = False, skip_list: list = []): - if is_openai: - model = torch.jit.load(checkpoint_path, map_location="cpu").eval() - state_dict = model.state_dict() - for key in ["input_resolution", "context_length", "vocab_size"]: - state_dict.pop(key, None) - else: - checkpoint = torch.load(checkpoint_path, map_location=map_location) - for mk in model_key.split("|"): - if isinstance(checkpoint, dict) and mk in checkpoint: - state_dict = checkpoint[mk] - break - else: - state_dict = checkpoint - if next(iter(state_dict.items()))[0].startswith("module"): - state_dict = {k[7:]: v for k, v in state_dict.items()} - - for k in skip_list: - if k in list(state_dict.keys()): - logging.info(f"Removing key {k} from pretrained checkpoint") - del state_dict[k] - - if os.getenv("RoPE") == "1": - for k in list(state_dict.keys()): - if "freqs_cos" in k or "freqs_sin" in k: - del state_dict[k] - return state_dict - - -def load_checkpoint(model, checkpoint_path, model_key="model|module|state_dict", strict=True): - state_dict = load_state_dict(checkpoint_path, model_key=model_key, is_openai=False) - # detect old format and make compatible with new format - if "positional_embedding" in state_dict and not hasattr(model, "positional_embedding"): - state_dict = convert_to_custom_text_state_dict(state_dict) - if "text.logit_scale" in state_dict and hasattr(model, "logit_scale"): - state_dict["logit_scale"] = state_dict["text.logit_scale"] - del state_dict["text.logit_scale"] - - # resize_clip_pos_embed for CLIP and open CLIP - if "visual.positional_embedding" in state_dict: - resize_clip_pos_embed(state_dict, model) - # specified to eva_vit_model - elif "visual.pos_embed" in state_dict: - resize_evaclip_pos_embed(state_dict, model) - - # resize_clip_pos_embed(state_dict, model) - incompatible_keys = model.load_state_dict(state_dict, strict=strict) - logging.info(f"incompatible_keys.missing_keys: {incompatible_keys.missing_keys}") - return incompatible_keys - - -def load_clip_visual_state_dict(checkpoint_path: str, map_location: str = "cpu", is_openai: bool = False, skip_list: list = []): - state_dict = load_state_dict(checkpoint_path, map_location=map_location, is_openai=is_openai, skip_list=skip_list) - - for k in list(state_dict.keys()): - if not k.startswith("visual."): - del state_dict[k] - for k in list(state_dict.keys()): - if k.startswith("visual."): - new_k = k[7:] - state_dict[new_k] = state_dict[k] - del state_dict[k] - return state_dict - - -def load_clip_text_state_dict(checkpoint_path: str, map_location: str = "cpu", is_openai: bool = False, skip_list: list = []): - state_dict = load_state_dict(checkpoint_path, map_location=map_location, is_openai=is_openai, skip_list=skip_list) - - for k in list(state_dict.keys()): - if k.startswith("visual."): - del state_dict[k] - return state_dict - - -def get_pretrained_tag(pretrained_model): - pretrained_model = pretrained_model.lower() - if "laion" in pretrained_model or "open_clip" in pretrained_model: - return "open_clip" - elif 
"openai" in pretrained_model: - return "clip" - elif "eva" in pretrained_model and "clip" in pretrained_model: - return "eva_clip" - else: - return "other" - - -def load_zero_partitions(model, state_dict, is_deepspeed_zero3_enabled, pretrained_model_path, ignore_mismatched_sizes=False): - """ - adept from pytorch lightning and transformers - with deepspeed.zero.Init(): - model = MyModel() - state_dict = torch.load(model_path, map_location="cpu") - load_zero_partitions(model, prefix="") - """ - - # because zero3 puts placeholders in model params, this context - # manager gathers (unpartitions) the params of the current layer, then loads from - # the state dict and then re-partitions them again - model_state_dict = model.state_dict() - expected_keys = list(model_state_dict.keys()) - loaded_keys = list(state_dict.keys()) - missing_keys = list(set(expected_keys) - set(loaded_keys)) - unexpected_keys = list(set(loaded_keys) - set(expected_keys)) - - # Mistmatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not - # matching the weights in the model. - mismatched_keys = [] - if ignore_mismatched_sizes: - for checkpoint_key in loaded_keys: - model_key = checkpoint_key - - if model_key in model_state_dict and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape: - mismatched_keys.append((checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)) - del state_dict[checkpoint_key] - # copy state_dict so _load_from_state_dict can modify it - metadata = getattr(state_dict, "_metadata", None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - - error_msgs = [] - - # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants - # so we need to apply the function recursively. - def load(module, prefix=""): - local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - args = (state_dict, prefix, local_metadata, True, [], [], error_msgs) - if is_deepspeed_zero3_enabled: - # because zero3 puts placeholders in model params, this context - # manager gathers (unpartitions) the params of the current layer, then loads from - # the state dict and then re-partitions them again - with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0): - if torch.distributed.get_rank() == 0: - module._load_from_state_dict(*args) - else: - module._load_from_state_dict(*args) - - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + ".") - - # Make sure we are able to load base models as well as derived models (with heads) - start_prefix = "" - model_to_load = model - load(model_to_load, prefix=start_prefix) - del state_dict - if len(error_msgs) > 0: - error_msg = "\n\t".join(error_msgs) - if "size mismatch" in error_msg: - error_msg += "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." - raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}") - if len(unexpected_keys) > 0: - logging.warning( - f"Some weights of the model checkpoint at {pretrained_model_path} were not used when" - f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are" - f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or" - " with another architecture (e.g. 
initializing a BertForSequenceClassification model from a" - " BertForPreTraining model).\n- This IS NOT expected if you are initializing" - f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly identical" - " (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." - ) - else: - logging.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") - if len(missing_keys) > 0: - logging.warning( - f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" - f" {pretrained_model_path} and are newly initialized: {missing_keys}\nYou should probably" - " TRAIN this model on a down-stream task to be able to use it for predictions and inference." - ) - elif len(mismatched_keys) == 0: - logging.info( - f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" - f" {pretrained_model_path}.\nIf your task is similar to the task the model of the checkpoint" - f" was trained on, you can already use {model.__class__.__name__} for predictions without further" - " training." - ) - if len(mismatched_keys) > 0: - mismatched_warning = "\n".join([f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" for key, shape1, shape2 in mismatched_keys]) - logging.warning( - f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" - f" {pretrained_model_path} and are newly initialized because the shapes did not" - f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able" - " to use it for predictions and inference." - ) - - -def load_pretrained_checkpoint(model, visual_checkpoint_path, text_checkpoint_path, strict=True, visual_model=None, text_model=None, model_key="model|module|state_dict", skip_list=[]): - visual_tag = get_pretrained_tag(visual_model) - text_tag = get_pretrained_tag(text_model) - - logging.info(f"num of model state_dict keys: {len(model.state_dict().keys())}") - visual_incompatible_keys, text_incompatible_keys = None, None - if visual_checkpoint_path: - if visual_tag == "eva_clip" or visual_tag == "open_clip": - visual_state_dict = load_clip_visual_state_dict(visual_checkpoint_path, is_openai=False, skip_list=skip_list) - elif visual_tag == "clip": - visual_state_dict = load_clip_visual_state_dict(visual_checkpoint_path, is_openai=True, skip_list=skip_list) - else: - visual_state_dict = load_state_dict(visual_checkpoint_path, model_key=model_key, is_openai=False, skip_list=skip_list) - - # resize_clip_pos_embed for CLIP and open CLIP - if "positional_embedding" in visual_state_dict: - resize_visual_pos_embed(visual_state_dict, model) - # specified to EVA model - elif "pos_embed" in visual_state_dict: - resize_eva_pos_embed(visual_state_dict, model) - - visual_incompatible_keys = model.visual.load_state_dict(visual_state_dict, strict=strict) - logging.info(f"num of loaded visual_state_dict keys: {len(visual_state_dict.keys())}") - logging.info(f"visual_incompatible_keys.missing_keys: {visual_incompatible_keys.missing_keys}") - - if text_checkpoint_path: - if text_tag == "eva_clip" or text_tag == "open_clip": - text_state_dict = load_clip_text_state_dict(text_checkpoint_path, is_openai=False, skip_list=skip_list) - elif text_tag == "clip": - text_state_dict = load_clip_text_state_dict(text_checkpoint_path, is_openai=True, skip_list=skip_list) - else: - text_state_dict = 
load_state_dict(visual_checkpoint_path, model_key=model_key, is_openai=False, skip_list=skip_list) - - text_incompatible_keys = model.text.load_state_dict(text_state_dict, strict=strict) - - logging.info(f"num of loaded text_state_dict keys: {len(text_state_dict.keys())}") - logging.info(f"text_incompatible_keys.missing_keys: {text_incompatible_keys.missing_keys}") - - return visual_incompatible_keys, text_incompatible_keys - - -def create_model( - model_name: str, - pretrained: Optional[str] = None, - precision: str = "fp32", - device: Union[str, torch.device] = "cpu", - jit: bool = False, - force_quick_gelu: bool = False, - force_custom_clip: bool = False, - force_patch_dropout: Optional[float] = None, - pretrained_image: str = "", - pretrained_text: str = "", - pretrained_hf: bool = True, - pretrained_visual_model: str = None, - pretrained_text_model: str = None, - cache_dir: Optional[str] = None, - skip_list: list = [], -): - model_name = model_name.replace("/", "-") # for callers using old naming with / in ViT names - if isinstance(device, str): - device = torch.device(device) - - if pretrained and pretrained.lower() == "openai": - logging.info(f"Loading pretrained {model_name} from OpenAI.") - model = load_openai_model( - model_name, - precision=precision, - device=device, - jit=jit, - cache_dir=cache_dir, - ) - else: - model_cfg = get_model_config(model_name) - if model_cfg is not None: - logging.info(f"Loaded {model_name} model config.") - else: - logging.error(f"Model config for {model_name} not found; available models {list_models()}.") - raise RuntimeError(f"Model config for {model_name} not found.") - - if "rope" in model_cfg.get("vision_cfg", {}): - if model_cfg["vision_cfg"]["rope"]: - os.environ["RoPE"] = "1" - else: - os.environ["RoPE"] = "0" - - if force_quick_gelu: - # override for use of QuickGELU on non-OpenAI transformer models - model_cfg["quick_gelu"] = True - - if force_patch_dropout is not None: - # override the default patch dropout value - model_cfg["vision_cfg"]["patch_dropout"] = force_patch_dropout - - cast_dtype = get_cast_dtype(precision) - custom_clip = model_cfg.pop("custom_text", False) or force_custom_clip or ("hf_model_name" in model_cfg["text_cfg"]) - - if custom_clip: - if "hf_model_name" in model_cfg.get("text_cfg", {}): - model_cfg["text_cfg"]["hf_model_pretrained"] = pretrained_hf - model = CustomCLIP(**model_cfg, cast_dtype=cast_dtype) - else: - model = CLIP(**model_cfg, cast_dtype=cast_dtype) - - pretrained_cfg = {} - if pretrained: - checkpoint_path = "" - pretrained_cfg = get_pretrained_cfg(model_name, pretrained) - if pretrained_cfg: - checkpoint_path = download_pretrained(pretrained_cfg, cache_dir=cache_dir) - elif os.path.exists(pretrained): - checkpoint_path = pretrained - - if checkpoint_path: - logging.info(f"Loading pretrained {model_name} weights ({pretrained}).") - load_checkpoint(model, checkpoint_path, model_key="model|module|state_dict", strict=False) - else: - error_str = f"Pretrained weights ({pretrained}) not found for model {model_name}." f"Available pretrained tags ({list_pretrained_tags_by_model(model_name)}." 
- logging.warning(error_str) - raise RuntimeError(error_str) - else: - visual_checkpoint_path = "" - text_checkpoint_path = "" - - if pretrained_image: - pretrained_visual_model = pretrained_visual_model.replace("/", "-") # for callers using old naming with / in ViT names - pretrained_image_cfg = get_pretrained_cfg(pretrained_visual_model, pretrained_image) - if "timm_model_name" in model_cfg.get("vision_cfg", {}): - # pretrained weight loading for timm models set via vision_cfg - model_cfg["vision_cfg"]["timm_model_pretrained"] = True - elif pretrained_image_cfg: - visual_checkpoint_path = download_pretrained(pretrained_image_cfg, cache_dir=cache_dir) - elif os.path.exists(pretrained_image): - visual_checkpoint_path = pretrained_image - else: - logging.warning(f"Pretrained weights ({visual_checkpoint_path}) not found for model {model_name}.visual.") - raise RuntimeError(f"Pretrained weights ({visual_checkpoint_path}) not found for model {model_name}.visual.") - - if pretrained_text: - pretrained_text_model = pretrained_text_model.replace("/", "-") # for callers using old naming with / in ViT names - pretrained_text_cfg = get_pretrained_cfg(pretrained_text_model, pretrained_text) - if pretrained_image_cfg: - text_checkpoint_path = download_pretrained(pretrained_text_cfg, cache_dir=cache_dir) - elif os.path.exists(pretrained_text): - text_checkpoint_path = pretrained_text - else: - logging.warning(f"Pretrained weights ({text_checkpoint_path}) not found for model {model_name}.text.") - raise RuntimeError(f"Pretrained weights ({text_checkpoint_path}) not found for model {model_name}.text.") - - if visual_checkpoint_path: - logging.info(f"Loading pretrained {model_name}.visual weights ({visual_checkpoint_path}).") - if text_checkpoint_path: - logging.info(f"Loading pretrained {model_name}.text weights ({text_checkpoint_path}).") - - if visual_checkpoint_path or text_checkpoint_path: - load_pretrained_checkpoint(model, visual_checkpoint_path, text_checkpoint_path, strict=False, visual_model=pretrained_visual_model, text_model=pretrained_text_model, model_key="model|module|state_dict", skip_list=skip_list) - - if "fp16" in precision or "bf16" in precision: - logging.info(f"convert precision to {precision}") - model = model.to(torch.bfloat16) if "bf16" in precision else model.to(torch.float16) - - # model.to(device=device) - - # set image / mean metadata from pretrained_cfg if available, or use default - model.visual.image_mean = pretrained_cfg.get("mean", None) or OPENAI_DATASET_MEAN - model.visual.image_std = pretrained_cfg.get("std", None) or OPENAI_DATASET_STD - - if jit: - model = torch.jit.script(model) - - return model - - -def create_model_and_transforms( - model_name: str, - pretrained: Optional[str] = None, - precision: str = "fp32", - device: Union[str, torch.device] = "cpu", - jit: bool = False, - force_quick_gelu: bool = False, - force_custom_clip: bool = False, - force_patch_dropout: Optional[float] = None, - pretrained_image: str = "", - pretrained_text: str = "", - pretrained_hf: bool = True, - pretrained_visual_model: str = None, - pretrained_text_model: str = None, - image_mean: Optional[Tuple[float, ...]] = None, - image_std: Optional[Tuple[float, ...]] = None, - cache_dir: Optional[str] = None, - skip_list: list = [], -): - model = create_model( - model_name, - pretrained, - precision=precision, - device=device, - jit=jit, - force_quick_gelu=force_quick_gelu, - force_custom_clip=force_custom_clip, - force_patch_dropout=force_patch_dropout, - pretrained_image=pretrained_image, 
- pretrained_text=pretrained_text, - pretrained_hf=pretrained_hf, - pretrained_visual_model=pretrained_visual_model, - pretrained_text_model=pretrained_text_model, - cache_dir=cache_dir, - skip_list=skip_list, - ) - - image_mean = image_mean or getattr(model.visual, "image_mean", None) - image_std = image_std or getattr(model.visual, "image_std", None) - preprocess_train = image_transform(model.visual.image_size, is_train=True, mean=image_mean, std=image_std) - preprocess_val = image_transform(model.visual.image_size, is_train=False, mean=image_mean, std=image_std) - - return model, preprocess_train, preprocess_val - - -def create_model_from_pretrained( - model_name: str, - pretrained: str, - precision: str = "fp32", - device: Union[str, torch.device] = "cpu", - jit: bool = False, - force_quick_gelu: bool = False, - force_custom_clip: bool = False, - force_patch_dropout: Optional[float] = None, - return_transform: bool = True, - image_mean: Optional[Tuple[float, ...]] = None, - image_std: Optional[Tuple[float, ...]] = None, - cache_dir: Optional[str] = None, - is_frozen: bool = False, -): - if not is_pretrained_cfg(model_name, pretrained) and not os.path.exists(pretrained): - raise RuntimeError(f"{pretrained} is not a valid pretrained cfg or checkpoint for {model_name}." f" Use open_clip.list_pretrained() to find one.") - - model = create_model( - model_name, - pretrained, - precision=precision, - device=device, - jit=jit, - force_quick_gelu=force_quick_gelu, - force_custom_clip=force_custom_clip, - force_patch_dropout=force_patch_dropout, - cache_dir=cache_dir, - ) - - if is_frozen: - for param in model.parameters(): - param.requires_grad = False - - if not return_transform: - return model - - image_mean = image_mean or getattr(model.visual, "image_mean", None) - image_std = image_std or getattr(model.visual, "image_std", None) - preprocess = image_transform(model.visual.image_size, is_train=False, mean=image_mean, std=image_std) - - return model, preprocess diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/hf_configs.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/hf_configs.py deleted file mode 100755 index ddd2c672..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/hf_configs.py +++ /dev/null @@ -1,57 +0,0 @@ -# HF architecture dict: -arch_dict = { - # https://huggingface.co/docs/transformers/model_doc/roberta#roberta - "roberta": { - "config_names": { - "context_length": "max_position_embeddings", - "vocab_size": "vocab_size", - "width": "hidden_size", - "heads": "num_attention_heads", - "layers": "num_hidden_layers", - "layer_attr": "layer", - "token_embeddings_attr": "embeddings", - }, - "pooler": "mean_pooler", - }, - # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig - "xlm-roberta": { - "config_names": { - "context_length": "max_position_embeddings", - "vocab_size": "vocab_size", - "width": "hidden_size", - "heads": "num_attention_heads", - "layers": "num_hidden_layers", - "layer_attr": "layer", - "token_embeddings_attr": "embeddings", - }, - "pooler": "mean_pooler", - }, - # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 - "mt5": { - "config_names": { - # unlimited seqlen - # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 - # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 - "context_length": "", - "vocab_size": "vocab_size", - "width": "d_model", - "heads": 
"num_heads", - "layers": "num_layers", - "layer_attr": "block", - "token_embeddings_attr": "embed_tokens", - }, - "pooler": "mean_pooler", - }, - "bert": { - "config_names": { - "context_length": "max_position_embeddings", - "vocab_size": "vocab_size", - "width": "hidden_size", - "heads": "num_attention_heads", - "layers": "num_hidden_layers", - "layer_attr": "layer", - "token_embeddings_attr": "embeddings", - }, - "pooler": "mean_pooler", - }, -} diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/hf_model.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/hf_model.py deleted file mode 100755 index a156624b..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/hf_model.py +++ /dev/null @@ -1,240 +0,0 @@ -""" huggingface model adapter - -Wraps HuggingFace transformers (https://github.com/huggingface/transformers) models for use as a text tower in CLIP model. -""" - -import re - -import torch -import torch.nn as nn -from torch.nn import functional as F -from torch import TensorType - -try: - import transformers - from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer, AutoConfig, PretrainedConfig - from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions -except ImportError as e: - transformers = None - - class BaseModelOutput: - pass - - class PretrainedConfig: - pass - - -from .hf_configs import arch_dict - - -# utils -def _camel2snake(s): - return re.sub(r"(? TensorType: - # image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(x.device) - # attn_mask = (x != self.config.pad_token_id).long() - # out = self.transformer( - # input_ids=x, - # attention_mask=attn_mask, - # encoder_hidden_states = image_embeds, - # encoder_attention_mask = image_atts, - # ) - # pooled_out = self.pooler(out, attn_mask) - - # return self.itm_proj(pooled_out) - - def mask(self, input_ids, vocab_size, device, targets=None, masked_indices=None, probability_matrix=None): - if masked_indices is None: - masked_indices = torch.bernoulli(probability_matrix).bool() - - masked_indices[input_ids == self.tokenizer.pad_token_id] = False - masked_indices[input_ids == self.tokenizer.cls_token_id] = False - - if targets is not None: - targets[~masked_indices] = -100 # We only compute loss on masked tokens - - # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) - indices_replaced = torch.bernoulli(torch.full(input_ids.shape, 0.8)).bool() & masked_indices - input_ids[indices_replaced] = self.tokenizer.mask_token_id - - # 10% of the time, we replace masked input tokens with random word - indices_random = torch.bernoulli(torch.full(input_ids.shape, 0.5)).bool() & masked_indices & ~indices_replaced - random_words = torch.randint(vocab_size, input_ids.shape, dtype=torch.long).to(device) - input_ids[indices_random] = random_words[indices_random] - # The rest of the time (10% of the time) we keep the masked input tokens unchanged - - if targets is not None: - return input_ids, targets - else: - return input_ids - - def forward_mlm(self, input_ids, image_embeds, mlm_probability=0.25): - labels = input_ids.clone() - attn_mask = (input_ids != self.config.pad_token_id).long() - image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(input_ids.device) - vocab_size = getattr(self.config, arch_dict[self.config.model_type]["config_names"]["vocab_size"]) - probability_matrix = torch.full(labels.shape, mlm_probability) - 
input_ids, labels = self.mask(input_ids, vocab_size, input_ids.device, targets=labels, probability_matrix=probability_matrix) - mlm_output = self.transformer( - input_ids, - attention_mask=attn_mask, - encoder_hidden_states=image_embeds, - encoder_attention_mask=image_atts, - return_dict=True, - labels=labels, - ) - return mlm_output.loss - # mlm_output = self.transformer(input_ids, - # attention_mask = attn_mask, - # encoder_hidden_states = image_embeds, - # encoder_attention_mask = image_atts, - # return_dict = True, - # ).last_hidden_state - # logits = self.mlm_proj(mlm_output) - - # # logits = logits[:, :-1, :].contiguous().view(-1, vocab_size) - # logits = logits[:, 1:, :].contiguous().view(-1, vocab_size) - # labels = labels[:, 1:].contiguous().view(-1) - - # mlm_loss = F.cross_entropy( - # logits, - # labels, - # # label_smoothing=0.1, - # ) - # return mlm_loss - - def forward(self, x: TensorType) -> TensorType: - attn_mask = (x != self.config.pad_token_id).long() - out = self.transformer(input_ids=x, attention_mask=attn_mask) - pooled_out = self.pooler(out, attn_mask) - - return self.proj(pooled_out) - - def lock(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True): - if not unlocked_layers: # full freezing - for n, p in self.transformer.named_parameters(): - p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False - return - - encoder = self.transformer.encoder if hasattr(self.transformer, "encoder") else self.transformer - layer_list = getattr(encoder, arch_dict[self.config.model_type]["config_names"]["layer_attr"]) - print(f"Unlocking {unlocked_layers}/{len(layer_list) + 1} layers of hf model") - embeddings = getattr(self.transformer, arch_dict[self.config.model_type]["config_names"]["token_embeddings_attr"]) - modules = [embeddings, *layer_list][:-unlocked_layers] - # freeze layers - for module in modules: - for n, p in module.named_parameters(): - p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False - - @torch.jit.ignore - def set_grad_checkpointing(self, enable=True): - self.transformer.gradient_checkpointing_enable() - - def get_num_layers(self): - encoder = self.transformer.encoder if hasattr(self.transformer, "encoder") else self.transformer - layer_list = getattr(encoder, arch_dict[self.config.model_type]["config_names"]["layer_attr"]) - return len(layer_list) - - def init_parameters(self): - pass diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/loss.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/loss.py deleted file mode 100755 index ec4c9950..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/loss.py +++ /dev/null @@ -1,123 +0,0 @@ -import math -import torch -import torch.nn as nn -from torch.nn import functional as F - -try: - import torch.distributed.nn - from torch import distributed as dist - - has_distributed = True -except ImportError: - has_distributed = False - -try: - import horovod.torch as hvd -except ImportError: - hvd = None - -from timm.loss import LabelSmoothingCrossEntropy - - -def gather_features(image_features, text_features, local_loss=False, gather_with_grad=False, rank=0, world_size=1, use_horovod=False): - assert has_distributed, "torch.distributed did not import correctly, please use a PyTorch version with support." 
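# Illustrative shapes (numbers assumed): with world_size=4 and a per-GPU batch of 8, image_features /
# text_features are [8, D] on each rank and the gathered all_image_features / all_text_features returned
# below are [32, D]. In the branches that gather without gradients, the local rank's slot is re-assigned
# from the original tensors (when local_loss is False) so gradients can still reach this rank's features.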
- if use_horovod: - assert hvd is not None, "Please install horovod" - if gather_with_grad: - all_image_features = hvd.allgather(image_features) - all_text_features = hvd.allgather(text_features) - else: - with torch.no_grad(): - all_image_features = hvd.allgather(image_features) - all_text_features = hvd.allgather(text_features) - if not local_loss: - # ensure grads for local rank when all_* features don't have a gradient - gathered_image_features = list(all_image_features.chunk(world_size, dim=0)) - gathered_text_features = list(all_text_features.chunk(world_size, dim=0)) - gathered_image_features[rank] = image_features - gathered_text_features[rank] = text_features - all_image_features = torch.cat(gathered_image_features, dim=0) - all_text_features = torch.cat(gathered_text_features, dim=0) - else: - # We gather tensors from all gpus - if gather_with_grad: - all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0) - all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0) - # all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features, async_op=True), dim=0) - # all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features, async_op=True), dim=0) - else: - gathered_image_features = [torch.zeros_like(image_features) for _ in range(world_size)] - gathered_text_features = [torch.zeros_like(text_features) for _ in range(world_size)] - dist.all_gather(gathered_image_features, image_features) - dist.all_gather(gathered_text_features, text_features) - if not local_loss: - # ensure grads for local rank when all_* features don't have a gradient - gathered_image_features[rank] = image_features - gathered_text_features[rank] = text_features - all_image_features = torch.cat(gathered_image_features, dim=0) - all_text_features = torch.cat(gathered_text_features, dim=0) - - return all_image_features, all_text_features - - -class ClipLoss(nn.Module): - - def __init__( - self, - local_loss=False, - gather_with_grad=False, - cache_labels=False, - rank=0, - world_size=1, - use_horovod=False, - smoothing=0.0, - ): - super().__init__() - self.local_loss = local_loss - self.gather_with_grad = gather_with_grad - self.cache_labels = cache_labels - self.rank = rank - self.world_size = world_size - self.use_horovod = use_horovod - self.label_smoothing_cross_entropy = LabelSmoothingCrossEntropy(smoothing=smoothing) if smoothing > 0 else None - - # cache state - self.prev_num_logits = 0 - self.labels = {} - - def forward(self, image_features, text_features, logit_scale=1.0): - device = image_features.device - if self.world_size > 1: - all_image_features, all_text_features = gather_features(image_features, text_features, self.local_loss, self.gather_with_grad, self.rank, self.world_size, self.use_horovod) - - if self.local_loss: - logits_per_image = logit_scale * image_features @ all_text_features.T - logits_per_text = logit_scale * text_features @ all_image_features.T - else: - logits_per_image = logit_scale * all_image_features @ all_text_features.T - logits_per_text = logits_per_image.T - else: - logits_per_image = logit_scale * image_features @ text_features.T - logits_per_text = logit_scale * text_features @ image_features.T - # calculated ground-truth and cache if enabled - num_logits = logits_per_image.shape[0] - if self.prev_num_logits != num_logits or device not in self.labels: - labels = torch.arange(num_logits, device=device, dtype=torch.long) - if self.world_size > 1 and self.local_loss: - labels = labels + 
num_logits * self.rank - if self.cache_labels: - self.labels[device] = labels - self.prev_num_logits = num_logits - else: - labels = self.labels[device] - - if self.label_smoothing_cross_entropy: - total_loss = (self.label_smoothing_cross_entropy(logits_per_image, labels) + self.label_smoothing_cross_entropy(logits_per_text, labels)) / 2 - else: - total_loss = (F.cross_entropy(logits_per_image, labels) + F.cross_entropy(logits_per_text, labels)) / 2 - - acc = None - i2t_acc = (logits_per_image.argmax(-1) == labels).sum() / len(logits_per_image) - t2i_acc = (logits_per_text.argmax(-1) == labels).sum() / len(logits_per_text) - acc = {"i2t": i2t_acc, "t2i": t2i_acc} - return total_loss, acc diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model.py deleted file mode 100755 index b2f3a231..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model.py +++ /dev/null @@ -1,429 +0,0 @@ -""" CLIP Model - -Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. -""" - -import os -from dataclasses import dataclass -from typing import Optional, Tuple, Union -from functools import partial - -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn - -try: - from .hf_model import HFTextEncoder -except: - HFTextEncoder = None -from .modified_resnet import ModifiedResNet -from .timm_model import TimmModel -from .eva_vit_model import EVAVisionTransformer -from .transformer import LayerNorm, QuickGELU, Attention, VisionTransformer, TextTransformer - -try: - from apex.normalization import FusedLayerNorm -except: - FusedLayerNorm = LayerNorm - # print("Please 'pip install apex'") - -try: - import xformers.ops as xops -except ImportError: - xops = None - # print("Please 'pip install xformers'") - - -class RMSnorm(nn.Module): - """ - adepted from transformers T5LayerNorm - """ - - def __init__(self, hidden_size, eps=1e-6): - """ - Construct a layernorm module in the T5 style. No bias and no subtraction of mean. - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated - # w/o mean and there is no bias. 
Additionally we want to make sure that the accumulation for - # half-precision inputs is done in fp32 - - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states - - -@dataclass -class CLIPVisionCfg: - layers: Union[Tuple[int, int, int, int], int] = 12 - width: int = 768 - head_width: int = 64 - mlp_ratio: float = 4.0 - patch_size: int = 16 - image_size: Union[Tuple[int, int], int] = 224 - ls_init_value: Optional[float] = None # layer scale initial value - patch_dropout: float = 0.0 # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results - global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580) - drop_path_rate: Optional[float] = None # drop path rate - timm_model_name: str = None # a valid model name overrides layers, width, patch_size - timm_model_pretrained: bool = False # use (imagenet) pretrained weights for named model - timm_pool: str = "avg" # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '') - timm_proj: str = "linear" # linear projection for timm model output ('linear', 'mlp', '') - timm_proj_bias: bool = False # enable bias final projection - eva_model_name: str = None # a valid eva model name overrides layers, width, patch_size - qkv_bias: bool = True - fusedLN: bool = False - xattn: bool = False - postnorm: bool = False - rope: bool = False - pt_hw_seq_len: int = 16 # 224/14 - intp_freq: bool = False - naiveswiglu: bool = False - subln: bool = False - use_rms_norm: bool = False - - -@dataclass -class CLIPTextCfg: - context_length: int = 77 - vocab_size: int = 49408 - width: int = 512 - heads: int = 8 - layers: int = 12 - ls_init_value: Optional[float] = None # layer scale initial value - hf_model_name: str = None - hf_tokenizer_name: str = None - hf_model_pretrained: bool = True - proj: str = "mlp" - pooler_type: str = "mean_pooler" - masked_language_modeling: bool = False - fusedLN: bool = False - xattn: bool = False - attn_mask: bool = True - - -def get_cast_dtype(precision: str): - cast_dtype = None - if precision == "bf16": - cast_dtype = torch.bfloat16 - elif precision == "fp16": - cast_dtype = torch.float16 - return cast_dtype - - -def _build_vision_tower(embed_dim: int, vision_cfg: CLIPVisionCfg, quick_gelu: bool = False, cast_dtype: Optional[torch.dtype] = None): - if isinstance(vision_cfg, dict): - vision_cfg = CLIPVisionCfg(**vision_cfg) - - # OpenAI models are pretrained w/ QuickGELU but native nn.GELU is both faster and more - # memory efficient in recent PyTorch releases (>= 1.10). - # NOTE: timm models always use native GELU regardless of quick_gelu flag. 
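# For reference, QuickGELU (imported from .transformer above) is the sigmoid approximation that the
# original OpenAI CLIP weights were trained with; a minimal stand-alone sketch of it:
import torch
import torch.nn as nn

class QuickGELUSketch(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * torch.sigmoid(1.702 * x)  # close to, but not identical to, nn.GELU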
- act_layer = QuickGELU if quick_gelu else nn.GELU - - if vision_cfg.eva_model_name: - vision_heads = vision_cfg.width // vision_cfg.head_width - - norm_layer = RMSnorm if vision_cfg.use_rms_norm else LayerNorm - - visual = EVAVisionTransformer( - img_size=vision_cfg.image_size, - patch_size=vision_cfg.patch_size, - num_classes=embed_dim, - use_mean_pooling=vision_cfg.global_average_pool, # False - init_values=vision_cfg.ls_init_value, - patch_dropout=vision_cfg.patch_dropout, - embed_dim=vision_cfg.width, - depth=vision_cfg.layers, - num_heads=vision_heads, - mlp_ratio=vision_cfg.mlp_ratio, - qkv_bias=vision_cfg.qkv_bias, - drop_path_rate=vision_cfg.drop_path_rate, - norm_layer=partial(norm_layer, eps=1e-6), - xattn=vision_cfg.xattn, - rope=vision_cfg.rope, - postnorm=vision_cfg.postnorm, - pt_hw_seq_len=vision_cfg.pt_hw_seq_len, # 224/14 - intp_freq=vision_cfg.intp_freq, - naiveswiglu=vision_cfg.naiveswiglu, - subln=vision_cfg.subln, - ) - elif vision_cfg.timm_model_name: - visual = TimmModel( - vision_cfg.timm_model_name, pretrained=vision_cfg.timm_model_pretrained, pool=vision_cfg.timm_pool, proj=vision_cfg.timm_proj, proj_bias=vision_cfg.timm_proj_bias, embed_dim=embed_dim, image_size=vision_cfg.image_size - ) - act_layer = nn.GELU # so that text transformer doesn't use QuickGELU w/ timm models - elif isinstance(vision_cfg.layers, (tuple, list)): - vision_heads = vision_cfg.width * 32 // vision_cfg.head_width - visual = ModifiedResNet(layers=vision_cfg.layers, output_dim=embed_dim, heads=vision_heads, image_size=vision_cfg.image_size, width=vision_cfg.width) - else: - vision_heads = vision_cfg.width // vision_cfg.head_width - norm_layer = LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm - visual = VisionTransformer( - image_size=vision_cfg.image_size, - patch_size=vision_cfg.patch_size, - width=vision_cfg.width, - layers=vision_cfg.layers, - heads=vision_heads, - mlp_ratio=vision_cfg.mlp_ratio, - ls_init_value=vision_cfg.ls_init_value, - patch_dropout=vision_cfg.patch_dropout, - global_average_pool=vision_cfg.global_average_pool, - output_dim=embed_dim, - act_layer=act_layer, - norm_layer=norm_layer, - ) - - return visual - - -def _build_text_tower( - embed_dim: int, - text_cfg: CLIPTextCfg, - quick_gelu: bool = False, - cast_dtype: Optional[torch.dtype] = None, -): - if isinstance(text_cfg, dict): - text_cfg = CLIPTextCfg(**text_cfg) - - if text_cfg.hf_model_name: - text = HFTextEncoder(text_cfg.hf_model_name, output_dim=embed_dim, tokenizer_name=text_cfg.hf_tokenizer_name, proj=text_cfg.proj, pooler_type=text_cfg.pooler_type, masked_language_modeling=text_cfg.masked_language_modeling) - else: - act_layer = QuickGELU if quick_gelu else nn.GELU - norm_layer = LayerNorm - - text = TextTransformer( - context_length=text_cfg.context_length, - vocab_size=text_cfg.vocab_size, - width=text_cfg.width, - heads=text_cfg.heads, - layers=text_cfg.layers, - ls_init_value=text_cfg.ls_init_value, - output_dim=embed_dim, - act_layer=act_layer, - norm_layer=FusedLayerNorm if text_cfg.fusedLN else norm_layer, - xattn=text_cfg.xattn, - attn_mask=text_cfg.attn_mask, - ) - return text - - -class CLIP(nn.Module): - def __init__( - self, - embed_dim: int, - vision_cfg: CLIPVisionCfg, - text_cfg: CLIPTextCfg, - quick_gelu: bool = False, - cast_dtype: Optional[torch.dtype] = None, - ): - super().__init__() - self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype) - - text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype) - self.transformer 
= text.transformer - self.vocab_size = text.vocab_size - self.token_embedding = text.token_embedding - self.positional_embedding = text.positional_embedding - self.ln_final = text.ln_final - self.text_projection = text.text_projection - self.register_buffer("attn_mask", text.attn_mask, persistent=False) - - self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) - - def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False): - # lock image tower as per LiT - https://arxiv.org/abs/2111.07991 - self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats) - - @torch.jit.ignore - def set_grad_checkpointing(self, enable=True): - self.visual.set_grad_checkpointing(enable) - self.transformer.grad_checkpointing = enable - - @torch.jit.ignore - def no_weight_decay(self): - return {"logit_scale"} - - def encode_image(self, image, normalize: bool = False): - features = self.visual(image) - return F.normalize(features, dim=-1) if normalize else features - - def encode_text(self, text, normalize: bool = False): - cast_dtype = self.transformer.get_cast_dtype() - - x = self.token_embedding(text).to(cast_dtype) # [batch_size, n_ctx, d_model] - - x = x + self.positional_embedding.to(cast_dtype) - x = x.permute(1, 0, 2) # NLD -> LND - x = self.transformer(x, attn_mask=self.attn_mask) - x = x.permute(1, 0, 2) # LND -> NLD - x = self.ln_final(x) # [batch_size, n_ctx, transformer.width] - # take features from the eot embedding (eot_token is the highest number in each sequence) - x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection - return F.normalize(x, dim=-1) if normalize else x - - def forward(self, image, text): - image_features = self.encode_image(image, normalize=True) - text_features = self.encode_text(text, normalize=True) - return image_features, text_features, self.logit_scale.exp() - - -class CustomCLIP(nn.Module): - def __init__( - self, - embed_dim: int, - vision_cfg: CLIPVisionCfg, - text_cfg: CLIPTextCfg, - quick_gelu: bool = False, - cast_dtype: Optional[torch.dtype] = None, - itm_task: bool = False, - ): - super().__init__() - self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype) - self.text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype) - self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) - - def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False): - # lock image tower as per LiT - https://arxiv.org/abs/2111.07991 - self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats) - - def lock_text_tower(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True): - self.text.lock(unlocked_layers, freeze_layer_norm) - - @torch.jit.ignore - def set_grad_checkpointing(self, enable=True): - self.visual.set_grad_checkpointing(enable) - self.text.set_grad_checkpointing(enable) - - @torch.jit.ignore - def no_weight_decay(self): - return {"logit_scale"} - - def encode_image(self, image, normalize: bool = False): - features = self.visual(image) - return F.normalize(features, dim=-1) if normalize else features - - def encode_text(self, text, normalize: bool = False): - features = self.text(text) - return F.normalize(features, dim=-1) if normalize else features - - def forward(self, image, text): - image_features = self.encode_image(image, normalize=True) - text_features = self.encode_text(text, normalize=True) - return image_features, text_features, self.logit_scale.exp() - - -def convert_weights_to_lp(model: nn.Module, dtype=torch.float16): 
- """Convert applicable model parameters to low-precision (bf16 or fp16)""" - - def _convert_weights(l): - - if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)): - l.weight.data = l.weight.data.to(dtype) - if l.bias is not None: - l.bias.data = l.bias.data.to(dtype) - - if isinstance(l, (nn.MultiheadAttention, Attention)): - for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]: - tensor = getattr(l, attr, None) - if tensor is not None: - tensor.data = tensor.data.to(dtype) - - if isinstance(l, nn.Parameter): - l.data = l.data.to(dtype) - - for name in ["text_projection", "proj"]: - if hasattr(l, name) and isinstance(l, nn.Parameter): - attr = getattr(l, name, None) - if attr is not None: - attr.data = attr.data.to(dtype) - - model.apply(_convert_weights) - - -convert_weights_to_fp16 = convert_weights_to_lp # backwards compat - - -# used to maintain checkpoint compatibility -def convert_to_custom_text_state_dict(state_dict: dict): - if "text_projection" in state_dict: - # old format state_dict, move text tower -> .text - new_state_dict = {} - for k, v in state_dict.items(): - if any(k.startswith(p) for p in ("text_projection", "positional_embedding", "token_embedding", "transformer", "ln_final", "logit_scale")): - k = "text." + k - new_state_dict[k] = v - return new_state_dict - return state_dict - - -def build_model_from_openai_state_dict( - state_dict: dict, - quick_gelu=True, - cast_dtype=torch.float16, -): - vit = "visual.proj" in state_dict - - if vit: - vision_width = state_dict["visual.conv1.weight"].shape[0] - vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")]) - vision_patch_size = state_dict["visual.conv1.weight"].shape[-1] - grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5) - image_size = vision_patch_size * grid_size - else: - counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]] - vision_layers = tuple(counts) - vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0] - output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5) - vision_patch_size = None - assert output_width**2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0] - image_size = output_width * 32 - - embed_dim = state_dict["text_projection"].shape[1] - context_length = state_dict["positional_embedding"].shape[0] - vocab_size = state_dict["token_embedding.weight"].shape[0] - transformer_width = state_dict["ln_final.weight"].shape[0] - transformer_heads = transformer_width // 64 - transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks"))) - - vision_cfg = CLIPVisionCfg( - layers=vision_layers, - width=vision_width, - patch_size=vision_patch_size, - image_size=image_size, - ) - text_cfg = CLIPTextCfg(context_length=context_length, vocab_size=vocab_size, width=transformer_width, heads=transformer_heads, layers=transformer_layers) - model = CLIP( - embed_dim, - vision_cfg=vision_cfg, - text_cfg=text_cfg, - quick_gelu=quick_gelu, # OpenAI models were trained with QuickGELU - cast_dtype=cast_dtype, - ) - - for key in ["input_resolution", "context_length", "vocab_size"]: - state_dict.pop(key, None) - - convert_weights_to_fp16(model) # OpenAI state dicts are partially converted to float16 - model.load_state_dict(state_dict) - return model.eval() - - -def trace_model(model, batch_size=256, 
device=torch.device("cpu")): - model.eval() - image_size = model.visual.image_size - example_images = torch.ones((batch_size, 3, image_size, image_size), device=device) - example_text = torch.zeros((batch_size, model.context_length), dtype=torch.int, device=device) - model = torch.jit.trace_module(model, inputs=dict(forward=(example_images, example_text), encode_text=(example_text,), encode_image=(example_images,))) - model.visual.image_size = image_size - return model diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/modified_resnet.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/modified_resnet.py deleted file mode 100755 index 9f29f845..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/modified_resnet.py +++ /dev/null @@ -1,179 +0,0 @@ -from collections import OrderedDict - -import torch -from torch import nn -from torch.nn import functional as F - -from .utils import freeze_batch_norm_2d - - -class Bottleneck(nn.Module): - expansion = 4 - - def __init__(self, inplanes, planes, stride=1): - super().__init__() - - # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1 - self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - self.act1 = nn.ReLU(inplace=True) - - self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.act2 = nn.ReLU(inplace=True) - - self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() - - self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) - self.bn3 = nn.BatchNorm2d(planes * self.expansion) - self.act3 = nn.ReLU(inplace=True) - - self.downsample = None - self.stride = stride - - if stride > 1 or inplanes != planes * Bottleneck.expansion: - # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 - self.downsample = nn.Sequential(OrderedDict([("-1", nn.AvgPool2d(stride)), ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)), ("1", nn.BatchNorm2d(planes * self.expansion))])) - - def forward(self, x: torch.Tensor): - identity = x - - out = self.act1(self.bn1(self.conv1(x))) - out = self.act2(self.bn2(self.conv2(out))) - out = self.avgpool(out) - out = self.bn3(self.conv3(out)) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.act3(out) - return out - - -class AttentionPool2d(nn.Module): - def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None): - super().__init__() - self.positional_embedding = nn.Parameter(torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5) - self.k_proj = nn.Linear(embed_dim, embed_dim) - self.q_proj = nn.Linear(embed_dim, embed_dim) - self.v_proj = nn.Linear(embed_dim, embed_dim) - self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) - self.num_heads = num_heads - - def forward(self, x): - x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC - x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC - x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC - x, _ = F.multi_head_attention_forward( - query=x, - key=x, - value=x, - embed_dim_to_check=x.shape[-1], - num_heads=self.num_heads, - q_proj_weight=self.q_proj.weight, - k_proj_weight=self.k_proj.weight, - v_proj_weight=self.v_proj.weight, - in_proj_weight=None, - in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, 
self.v_proj.bias]), - bias_k=None, - bias_v=None, - add_zero_attn=False, - dropout_p=0.0, - out_proj_weight=self.c_proj.weight, - out_proj_bias=self.c_proj.bias, - use_separate_proj_weight=True, - training=self.training, - need_weights=False, - ) - - return x[0] - - -class ModifiedResNet(nn.Module): - """ - A ResNet class that is similar to torchvision's but contains the following changes: - - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. - - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 - - The final pooling layer is a QKV attention instead of an average pool - """ - - def __init__(self, layers, output_dim, heads, image_size=224, width=64): - super().__init__() - self.output_dim = output_dim - self.image_size = image_size - - # the 3-layer stem - self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(width // 2) - self.act1 = nn.ReLU(inplace=True) - self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(width // 2) - self.act2 = nn.ReLU(inplace=True) - self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False) - self.bn3 = nn.BatchNorm2d(width) - self.act3 = nn.ReLU(inplace=True) - self.avgpool = nn.AvgPool2d(2) - - # residual layers - self._inplanes = width # this is a *mutable* variable used during construction - self.layer1 = self._make_layer(width, layers[0]) - self.layer2 = self._make_layer(width * 2, layers[1], stride=2) - self.layer3 = self._make_layer(width * 4, layers[2], stride=2) - self.layer4 = self._make_layer(width * 8, layers[3], stride=2) - - embed_dim = width * 32 # the ResNet feature dimension - self.attnpool = AttentionPool2d(image_size // 32, embed_dim, heads, output_dim) - - self.init_parameters() - - def _make_layer(self, planes, blocks, stride=1): - layers = [Bottleneck(self._inplanes, planes, stride)] - - self._inplanes = planes * Bottleneck.expansion - for _ in range(1, blocks): - layers.append(Bottleneck(self._inplanes, planes)) - - return nn.Sequential(*layers) - - def init_parameters(self): - if self.attnpool is not None: - std = self.attnpool.c_proj.in_features**-0.5 - nn.init.normal_(self.attnpool.q_proj.weight, std=std) - nn.init.normal_(self.attnpool.k_proj.weight, std=std) - nn.init.normal_(self.attnpool.v_proj.weight, std=std) - nn.init.normal_(self.attnpool.c_proj.weight, std=std) - - for resnet_block in [self.layer1, self.layer2, self.layer3, self.layer4]: - for name, param in resnet_block.named_parameters(): - if name.endswith("bn3.weight"): - nn.init.zeros_(param) - - def lock(self, unlocked_groups=0, freeze_bn_stats=False): - assert unlocked_groups == 0, "partial locking not currently supported for this model" - for param in self.parameters(): - param.requires_grad = False - if freeze_bn_stats: - freeze_batch_norm_2d(self) - - @torch.jit.ignore - def set_grad_checkpointing(self, enable=True): - # FIXME support for non-transformer - pass - - def stem(self, x): - x = self.act1(self.bn1(self.conv1(x))) - x = self.act2(self.bn2(self.conv2(x))) - x = self.act3(self.bn3(self.conv3(x))) - x = self.avgpool(x) - return x - - def forward(self, x): - x = self.stem(x) - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - x = self.attnpool(x) - - return x diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/openai.py 
b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/openai.py deleted file mode 100755 index 9fbf6fc7..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/openai.py +++ /dev/null @@ -1,144 +0,0 @@ -""" OpenAI pretrained model functions - -Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. -""" - -import os -import warnings -from typing import List, Optional, Union - -import torch - -from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype -from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url - -__all__ = ["list_openai_models", "load_openai_model"] - - -def list_openai_models() -> List[str]: - """Returns the names of available CLIP models""" - return list_pretrained_models_by_tag("openai") - - -def load_openai_model( - name: str, - precision: Optional[str] = None, - device: Optional[Union[str, torch.device]] = None, - jit: bool = True, - cache_dir: Optional[str] = None, -): - """Load a CLIP model - - Parameters - ---------- - name : str - A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict - precision: str - Model precision, if None defaults to 'fp32' if device == 'cpu' else 'fp16'. - device : Union[str, torch.device] - The device to put the loaded model - jit : bool - Whether to load the optimized JIT model (default) or more hackable non-JIT model. - cache_dir : Optional[str] - The directory to cache the downloaded model weights - - Returns - ------- - model : torch.nn.Module - The CLIP model - preprocess : Callable[[PIL.Image], torch.Tensor] - A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input - """ - if device is None: - device = "cuda" if torch.cuda.is_available() else "cpu" - if precision is None: - precision = "fp32" if device == "cpu" else "fp16" - - if get_pretrained_url(name, "openai"): - model_path = download_pretrained_from_url(get_pretrained_url(name, "openai"), cache_dir=cache_dir) - elif os.path.isfile(name): - model_path = name - else: - raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}") - - try: - # loading JIT archive - model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() - state_dict = None - except RuntimeError: - # loading saved state dict - if jit: - warnings.warn(f"File {model_path} is not a JIT archive. 
Loading as a state dict instead") - jit = False - state_dict = torch.load(model_path, map_location="cpu") - - if not jit: - # Build a non-jit model from the OpenAI jitted model state dict - cast_dtype = get_cast_dtype(precision) - try: - model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype) - except KeyError: - sd = {k[7:]: v for k, v in state_dict["state_dict"].items()} - model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype) - - # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use - model = model.to(device) - if precision.startswith("amp") or precision == "fp32": - model.float() - elif precision == "bf16": - convert_weights_to_lp(model, dtype=torch.bfloat16) - - return model - - # patch the device names - device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) - device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1] - - def patch_device(module): - try: - graphs = [module.graph] if hasattr(module, "graph") else [] - except RuntimeError: - graphs = [] - - if hasattr(module, "forward1"): - graphs.append(module.forward1.graph) - - for graph in graphs: - for node in graph.findAllNodes("prim::Constant"): - if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"): - node.copyAttributes(device_node) - - model.apply(patch_device) - patch_device(model.encode_image) - patch_device(model.encode_text) - - # patch dtype to float32 (typically for CPU) - if precision == "fp32": - float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[]) - float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] - float_node = float_input.node() - - def patch_float(module): - try: - graphs = [module.graph] if hasattr(module, "graph") else [] - except RuntimeError: - graphs = [] - - if hasattr(module, "forward1"): - graphs.append(module.forward1.graph) - - for graph in graphs: - for node in graph.findAllNodes("aten::to"): - inputs = list(node.inputs()) - for i in [1, 2]: # dtype can be the second or third argument to aten::to() - if inputs[i].node()["value"] == 5: - inputs[i].node().copyAttributes(float_node) - - model.apply(patch_float) - patch_float(model.encode_image) - patch_float(model.encode_text) - model.float() - - # ensure image_size attr available at consistent location for both jit and non-jit - model.visual.image_size = model.input_resolution.item() - return model diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/pretrained.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/pretrained.py deleted file mode 100755 index a603b604..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/pretrained.py +++ /dev/null @@ -1,314 +0,0 @@ -import hashlib -import os -import urllib -import warnings -from typing import Dict, Union - -from tqdm import tqdm - -try: - from huggingface_hub import hf_hub_download - - _has_hf_hub = True -except ImportError: - hf_hub_download = None - _has_hf_hub = False - - -def _pcfg(url="", hf_hub="", filename="", mean=None, std=None): - return dict( - url=url, - hf_hub=hf_hub, - mean=mean, - std=std, - ) - - -_VITB32 = dict( - openai=_pcfg("https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"), - 
laion400m_e31=_pcfg("https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e31-d867053b.pt"), - laion400m_e32=_pcfg("https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e32-46683a32.pt"), - laion2b_e16=_pcfg("https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-laion2b_e16-af8dbd0c.pth"), - laion2b_s34b_b79k=_pcfg(hf_hub="laion/CLIP-ViT-B-32-laion2B-s34B-b79K/"), -) - -_VITB32_quickgelu = dict( - openai=_pcfg("https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"), - laion400m_e31=_pcfg("https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e31-d867053b.pt"), - laion400m_e32=_pcfg("https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e32-46683a32.pt"), -) - -_VITB16 = dict( - openai=_pcfg("https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt"), - laion400m_e31=_pcfg("https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16-laion400m_e31-00efa78f.pt"), - laion400m_e32=_pcfg("https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16-laion400m_e32-55e67d44.pt"), - laion2b_s34b_b88k=_pcfg(hf_hub="laion/CLIP-ViT-B-16-laion2B-s34B-b88K/"), -) - -_EVAB16 = dict( - eva=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_B_psz14to16.pt"), - eva02=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_B_psz14to16.pt"), - eva_clip=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_CLIP_B_psz16_s8B.pt"), - eva02_clip=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_CLIP_B_psz16_s8B.pt"), -) - -_VITB16_PLUS_240 = dict( - laion400m_e31=_pcfg("https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16_plus_240-laion400m_e31-8fb26589.pt"), - laion400m_e32=_pcfg("https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16_plus_240-laion400m_e32-699c4b84.pt"), -) - -_VITL14 = dict( - openai=_pcfg("https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt"), - laion400m_e31=_pcfg("https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_l_14-laion400m_e31-69988bb6.pt"), - laion400m_e32=_pcfg("https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_l_14-laion400m_e32-3d133497.pt"), - laion2b_s32b_b82k=_pcfg(hf_hub="laion/CLIP-ViT-L-14-laion2B-s32B-b82K/", mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), -) - -_EVAL14 = dict( - eva=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_L_psz14.pt"), - eva02=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_L_psz14.pt"), - eva_clip=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_s4B.pt"), - eva02_clip=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_s4B.pt"), -) - -_VITL14_336 = dict( - openai=_pcfg("https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt"), -) - -_EVAL14_336 = dict( - eva_clip=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_CLIP_L_336_psz14_s6B.pt"), - eva02_clip=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_CLIP_L_336_psz14_s6B.pt"), - eva_clip_224to336=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_224to336.pt"), - eva02_clip_224to336=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_224to336.pt"), -) - -_VITH14 = dict( - laion2b_s32b_b79k=_pcfg(hf_hub="laion/CLIP-ViT-H-14-laion2B-s32B-b79K/"), -) - -_VITg14 
= dict( - laion2b_s12b_b42k=_pcfg(hf_hub="laion/CLIP-ViT-g-14-laion2B-s12B-b42K/"), - laion2b_s34b_b88k=_pcfg(hf_hub="laion/CLIP-ViT-g-14-laion2B-s34B-b88K/"), -) - -_EVAg14 = dict( - eva=_pcfg(hf_hub="QuanSun/EVA-CLIP/"), - eva01=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA01_g_psz14.pt"), - eva_clip=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA01_CLIP_g_14_psz14_s11B.pt"), - eva01_clip=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA01_CLIP_g_14_psz14_s11B.pt"), -) - -_EVAg14_PLUS = dict( - eva=_pcfg(hf_hub="QuanSun/EVA-CLIP/"), - eva01=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA01_g_psz14.pt"), - eva_clip=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA01_CLIP_g_14_plus_psz14_s11B.pt"), - eva01_clip=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA01_CLIP_g_14_plus_psz14_s11B.pt"), -) - -_VITbigG14 = dict( - laion2b_s39b_b160k=_pcfg(hf_hub="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k/"), -) - -_EVAbigE14 = dict( - eva=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_E_psz14.pt"), - eva02=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_E_psz14.pt"), - eva_clip=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_s4B.pt"), - eva02_clip=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_s4B.pt"), -) - -_EVAbigE14_PLUS = dict( - eva=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_E_psz14.pt"), - eva02=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_E_psz14.pt"), - eva_clip=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt"), - eva02_clip=_pcfg(hf_hub="QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt"), -) - -_EVA_8B = dict( - eva=_pcfg(hf_hub="BAAI/EVA-CLIP-8B/EVA_8B_psz14.bin"), - eva_clip=_pcfg(hf_hub="BAAI/EVA-CLIP-8B/EVA_CLIP_8B_psz14_s9B.pt"), -) - -_EVA_8B_PLUS = dict( - eva_clip=_pcfg(hf_hub="BAAI/EVA-CLIP-8B-448/EVA_CLIP_8B_psz14_plus_s0.6B.pt"), -) - - -_PRETRAINED = { - # "ViT-B-32": _VITB32, - "OpenaiCLIP-B-32": _VITB32, - "OpenCLIP-B-32": _VITB32, - # "ViT-B-32-quickgelu": _VITB32_quickgelu, - "OpenaiCLIP-B-32-quickgelu": _VITB32_quickgelu, - "OpenCLIP-B-32-quickgelu": _VITB32_quickgelu, - # "ViT-B-16": _VITB16, - "OpenaiCLIP-B-16": _VITB16, - "OpenCLIP-B-16": _VITB16, - "EVA02-B-16": _EVAB16, - "EVA02-CLIP-B-16": _EVAB16, - # "ViT-B-16-plus-240": _VITB16_PLUS_240, - "OpenCLIP-B-16-plus-240": _VITB16_PLUS_240, - # "ViT-L-14": _VITL14, - "OpenaiCLIP-L-14": _VITL14, - "OpenCLIP-L-14": _VITL14, - "EVA02-L-14": _EVAL14, - "EVA02-CLIP-L-14": _EVAL14, - # "ViT-L-14-336": _VITL14_336, - "OpenaiCLIP-L-14-336": _VITL14_336, - "EVA02-CLIP-L-14-336": _EVAL14_336, - # "ViT-H-14": _VITH14, - # "ViT-g-14": _VITg14, - "OpenCLIP-H-14": _VITH14, - "OpenCLIP-g-14": _VITg14, - "EVA01-CLIP-g-14": _EVAg14, - "EVA01-CLIP-g-14-plus": _EVAg14_PLUS, - # "ViT-bigG-14": _VITbigG14, - "OpenCLIP-bigG-14": _VITbigG14, - "EVA02-CLIP-bigE-14": _EVAbigE14, - "EVA02-CLIP-bigE-14-plus": _EVAbigE14_PLUS, - "EVA-CLIP-8B": _EVA_8B, - "EVA-CLIP-8B-448": _EVA_8B_PLUS, - "EVA-CLIP-8B-plus": _EVA_8B_PLUS, -} - - -def _clean_tag(tag: str): - # normalize pretrained tags - return tag.lower().replace("-", "_") - - -def list_pretrained(as_str: bool = False): - """returns list of pretrained models - Returns a tuple (model_name, pretrain_tag) by default or 'name:tag' if as_str == True - """ - return [":".join([k, t]) if as_str else (k, t) for k in _PRETRAINED.keys() for t in _PRETRAINED[k].keys()] - - -def list_pretrained_models_by_tag(tag: str): - """return all models having the specified pretrain tag""" - models = [] - tag = _clean_tag(tag) - for k in _PRETRAINED.keys(): - if tag in _PRETRAINED[k]: - models.append(k) - return models - - -def list_pretrained_tags_by_model(model: str): - """return all pretrain tags for the specified model 
architecture""" - tags = [] - if model in _PRETRAINED: - tags.extend(_PRETRAINED[model].keys()) - return tags - - -def is_pretrained_cfg(model: str, tag: str): - if model not in _PRETRAINED: - return False - return _clean_tag(tag) in _PRETRAINED[model] - - -def get_pretrained_cfg(model: str, tag: str): - if model not in _PRETRAINED: - return {} - model_pretrained = _PRETRAINED[model] - return model_pretrained.get(_clean_tag(tag), {}) - - -def get_pretrained_url(model: str, tag: str): - cfg = get_pretrained_cfg(model, _clean_tag(tag)) - return cfg.get("url", "") - - -def download_pretrained_from_url( - url: str, - cache_dir: Union[str, None] = None, -): - if not cache_dir: - cache_dir = os.path.expanduser("~/.cache/clip") - os.makedirs(cache_dir, exist_ok=True) - filename = os.path.basename(url) - - if "openaipublic" in url: - expected_sha256 = url.split("/")[-2] - elif "mlfoundations" in url: - expected_sha256 = os.path.splitext(filename)[0].split("-")[-1] - else: - expected_sha256 = "" - - download_target = os.path.join(cache_dir, filename) - - if os.path.exists(download_target) and not os.path.isfile(download_target): - raise RuntimeError(f"{download_target} exists and is not a regular file") - - if os.path.isfile(download_target): - if expected_sha256: - if hashlib.sha256(open(download_target, "rb").read()).hexdigest().startswith(expected_sha256): - return download_target - else: - warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file") - else: - return download_target - - with urllib.request.urlopen(url) as source, open(download_target, "wb") as output: - with tqdm(total=int(source.headers.get("Content-Length")), ncols=80, unit="iB", unit_scale=True) as loop: - while True: - buffer = source.read(8192) - if not buffer: - break - - output.write(buffer) - loop.update(len(buffer)) - - if expected_sha256 and not hashlib.sha256(open(download_target, "rb").read()).hexdigest().startswith(expected_sha256): - raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match") - - return download_target - - -def has_hf_hub(necessary=False): - if not _has_hf_hub and necessary: - # if no HF Hub module installed, and it is necessary to continue, raise error - raise RuntimeError("Hugging Face hub model specified but package not installed. Run `pip install huggingface_hub`.") - return _has_hf_hub - - -def download_pretrained_from_hf( - model_id: str, - filename: str = "open_clip_pytorch_model.bin", - revision=None, - cache_dir: Union[str, None] = None, -): - has_hf_hub(True) - cached_file = hf_hub_download(model_id, filename, revision=revision, cache_dir=cache_dir) - return cached_file - - -def download_pretrained( - cfg: Dict, - force_hf_hub: bool = False, - cache_dir: Union[str, None] = None, -): - target = "" - if not cfg: - return target - - download_url = cfg.get("url", "") - download_hf_hub = cfg.get("hf_hub", "") - if download_hf_hub and force_hf_hub: - # use HF hub even if url exists - download_url = "" - - if download_url: - target = download_pretrained_from_url(download_url, cache_dir=cache_dir) - elif download_hf_hub: - has_hf_hub(True) - # we assume the hf_hub entries in pretrained config combine model_id + filename in - # 'org/model_name/filename.pt' form. To specify just the model id w/o filename and - # use 'open_clip_pytorch_model.bin' default, there must be a trailing slash 'org/model_name/'. 
- model_id, filename = os.path.split(download_hf_hub) - if filename: - target = download_pretrained_from_hf(model_id, filename=filename, cache_dir=cache_dir) - else: - target = download_pretrained_from_hf(model_id, cache_dir=cache_dir) - - return target diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/rope.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/rope.py deleted file mode 100755 index 5fb3cce5..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/rope.py +++ /dev/null @@ -1,131 +0,0 @@ -from math import pi -import torch -from torch import nn -from einops import rearrange, repeat -import logging - - -def broadcat(tensors, dim=-1): - num_tensors = len(tensors) - shape_lens = set(list(map(lambda t: len(t.shape), tensors))) - assert len(shape_lens) == 1, "tensors must all have the same number of dimensions" - shape_len = list(shape_lens)[0] - dim = (dim + shape_len) if dim < 0 else dim - dims = list(zip(*map(lambda t: list(t.shape), tensors))) - expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim] - assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), "invalid dimensions for broadcastable concatentation" - max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims)) - expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims)) - expanded_dims.insert(dim, (dim, dims[dim])) - expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims))) - tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes))) - return torch.cat(tensors, dim=dim) - - -def rotate_half(x): - x = rearrange(x, "... (d r) -> ... d r", r=2) - x1, x2 = x.unbind(dim=-1) - x = torch.stack((-x2, x1), dim=-1) - return rearrange(x, "... d r -> ... (d r)") - - -class VisionRotaryEmbedding(nn.Module): - def __init__( - self, - dim, - pt_seq_len, - ft_seq_len=None, - custom_freqs=None, - freqs_for="lang", - theta=10000, - max_freq=10, - num_freqs=1, - ): - super().__init__() - if custom_freqs: - freqs = custom_freqs - elif freqs_for == "lang": - freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) - elif freqs_for == "pixel": - freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi - elif freqs_for == "constant": - freqs = torch.ones(num_freqs).float() - else: - raise ValueError(f"unknown modality {freqs_for}") - - if ft_seq_len is None: - ft_seq_len = pt_seq_len - t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len - - freqs_h = torch.einsum("..., f -> ... f", t, freqs) - freqs_h = repeat(freqs_h, "... n -> ... (n r)", r=2) - - freqs_w = torch.einsum("..., f -> ... f", t, freqs) - freqs_w = repeat(freqs_w, "... n -> ... 
(n r)", r=2) - - freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim=-1) - - self.register_buffer("freqs_cos", freqs.cos()) - self.register_buffer("freqs_sin", freqs.sin()) - - logging.info(f"Shape of rope freq: {self.freqs_cos.shape}") - - def forward(self, t, start_index=0): - rot_dim = self.freqs_cos.shape[-1] - end_index = start_index + rot_dim - assert rot_dim <= t.shape[-1], f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}" - t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:] - t = (t * self.freqs_cos) + (rotate_half(t) * self.freqs_sin) - - return torch.cat((t_left, t, t_right), dim=-1) - - -class VisionRotaryEmbeddingFast(nn.Module): - def __init__(self, dim, pt_seq_len, ft_seq_len=None, custom_freqs=None, freqs_for="lang", theta=10000, max_freq=10, num_freqs=1, patch_dropout=0.0): - super().__init__() - if custom_freqs: - freqs = custom_freqs - elif freqs_for == "lang": - freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) - elif freqs_for == "pixel": - freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi - elif freqs_for == "constant": - freqs = torch.ones(num_freqs).float() - else: - raise ValueError(f"unknown modality {freqs_for}") - - if ft_seq_len is None: - ft_seq_len = pt_seq_len - t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len - - freqs = torch.einsum("..., f -> ... f", t, freqs) - freqs = repeat(freqs, "... n -> ... (n r)", r=2) - freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim=-1) - - freqs_cos = freqs.cos().view(-1, freqs.shape[-1]) - freqs_sin = freqs.sin().view(-1, freqs.shape[-1]) - - self.patch_dropout = patch_dropout - - self.register_buffer("freqs_cos", freqs_cos) - self.register_buffer("freqs_sin", freqs_sin) - - logging.info(f"Shape of rope freq: {self.freqs_cos.shape}") - - def forward(self, t, patch_indices_keep=None): - if patch_indices_keep is not None: - batch = t.size()[0] - batch_indices = torch.arange(batch) - batch_indices = batch_indices[..., None] - - freqs_cos = repeat(self.freqs_cos, "i j -> n i m j", n=t.shape[0], m=t.shape[1]) - freqs_sin = repeat(self.freqs_sin, "i j -> n i m j", n=t.shape[0], m=t.shape[1]) - - freqs_cos = freqs_cos[batch_indices, patch_indices_keep] - freqs_cos = rearrange(freqs_cos, "n i m j -> n m i j") - freqs_sin = freqs_sin[batch_indices, patch_indices_keep] - freqs_sin = rearrange(freqs_sin, "n i m j -> n m i j") - - return t * freqs_cos + rotate_half(t) * freqs_sin - - return t * self.freqs_cos + rotate_half(t) * self.freqs_sin diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/timm_model.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/timm_model.py deleted file mode 100755 index 65de78df..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/timm_model.py +++ /dev/null @@ -1,114 +0,0 @@ -""" timm model adapter - -Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in CLIP model. 
-""" - -import logging -from collections import OrderedDict - -import torch -import torch.nn as nn - -try: - import timm - from timm.models.layers import Mlp, to_2tuple - - try: - # old timm imports < 0.8.1 - from timm.models.layers.attention_pool2d import RotAttentionPool2d - from timm.models.layers.attention_pool2d import AttentionPool2d as AbsAttentionPool2d - except ImportError: - # new timm imports >= 0.8.1 - from timm.layers import RotAttentionPool2d - from timm.layers import AttentionPool2d as AbsAttentionPool2d -except ImportError: - timm = None - -from .utils import freeze_batch_norm_2d - - -class TimmModel(nn.Module): - """timm model adapter - # FIXME this adapter is a work in progress, may change in ways that break weight compat - """ - - def __init__(self, model_name, embed_dim, image_size=224, pool="avg", proj="linear", proj_bias=False, drop=0.0, pretrained=False): - super().__init__() - if timm is None: - raise RuntimeError("Please `pip install timm` to use timm models.") - - self.image_size = to_2tuple(image_size) - self.trunk = timm.create_model(model_name, pretrained=pretrained) - feat_size = self.trunk.default_cfg.get("pool_size", None) - feature_ndim = 1 if not feat_size else 2 - if pool in ("abs_attn", "rot_attn"): - assert feature_ndim == 2 - # if attn pooling used, remove both classifier and default pool - self.trunk.reset_classifier(0, global_pool="") - else: - # reset global pool if pool config set, otherwise leave as network default - reset_kwargs = dict(global_pool=pool) if pool else {} - self.trunk.reset_classifier(0, **reset_kwargs) - prev_chs = self.trunk.num_features - - head_layers = OrderedDict() - if pool == "abs_attn": - head_layers["pool"] = AbsAttentionPool2d(prev_chs, feat_size=feat_size, out_features=embed_dim) - prev_chs = embed_dim - elif pool == "rot_attn": - head_layers["pool"] = RotAttentionPool2d(prev_chs, out_features=embed_dim) - prev_chs = embed_dim - else: - assert proj, "projection layer needed if non-attention pooling is used." 
- - # NOTE attention pool ends with a projection layer, so proj should usually be set to '' if such pooling is used - if proj == "linear": - head_layers["drop"] = nn.Dropout(drop) - head_layers["proj"] = nn.Linear(prev_chs, embed_dim, bias=proj_bias) - elif proj == "mlp": - head_layers["mlp"] = Mlp(prev_chs, 2 * embed_dim, embed_dim, drop=drop, bias=(True, proj_bias)) - - self.head = nn.Sequential(head_layers) - - def lock(self, unlocked_groups=0, freeze_bn_stats=False): - """lock modules - Args: - unlocked_groups (int): leave last n layer groups unlocked (default: 0) - """ - if not unlocked_groups: - # lock full model - for param in self.trunk.parameters(): - param.requires_grad = False - if freeze_bn_stats: - freeze_batch_norm_2d(self.trunk) - else: - # NOTE: partial freeze requires latest timm (master) branch and is subject to change - try: - # FIXME import here until API stable and in an official release - from timm.models.helpers import group_parameters, group_modules - except ImportError: - raise RuntimeError("Please install latest timm `pip install git+https://github.com/rwightman/pytorch-image-models`") - matcher = self.trunk.group_matcher() - gparams = group_parameters(self.trunk, matcher) - max_layer_id = max(gparams.keys()) - max_layer_id = max_layer_id - unlocked_groups - for group_idx in range(max_layer_id + 1): - group = gparams[group_idx] - for param in group: - self.trunk.get_parameter(param).requires_grad = False - if freeze_bn_stats: - gmodules = group_modules(self.trunk, matcher, reverse=True) - gmodules = {k for k, v in gmodules.items() if v <= max_layer_id} - freeze_batch_norm_2d(self.trunk, gmodules) - - @torch.jit.ignore - def set_grad_checkpointing(self, enable=True): - try: - self.trunk.set_grad_checkpointing(enable) - except Exception as e: - logging.warning("grad checkpointing not supported for this timm image tower, continuing without...") - - def forward(self, x): - x = self.trunk(x) - x = self.head(x) - return x diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/tokenizer.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/tokenizer.py deleted file mode 100755 index 5f753e69..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/tokenizer.py +++ /dev/null @@ -1,205 +0,0 @@ -""" CLIP tokenizer - -Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. -""" - -import gzip -import html -import os -from functools import lru_cache -from typing import Union, List - -import ftfy -import regex as re -import torch - -# https://stackoverflow.com/q/62691279 -import os - -os.environ["TOKENIZERS_PARALLELISM"] = "false" - - -@lru_cache() -def default_bpe(): - return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") - - -@lru_cache() -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a signficant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. 
- """ - bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -def get_pairs(word): - """Return set of symbol pairs in a word. - Word is represented as tuple of symbols (symbols being variable-length strings). - """ - pairs = set() - prev_char = word[0] - for char in word[1:]: - pairs.add((prev_char, char)) - prev_char = char - return pairs - - -def basic_clean(text): - text = ftfy.fix_text(text) - text = html.unescape(html.unescape(text)) - return text.strip() - - -def whitespace_clean(text): - text = re.sub(r"\s+", " ", text) - text = text.strip() - return text - - -class SimpleTokenizer(object): - def __init__(self, bpe_path: str = default_bpe(), special_tokens=None): - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - merges = gzip.open(bpe_path).read().decode("utf-8").split("\n") - merges = merges[1 : 49152 - 256 - 2 + 1] - merges = [tuple(merge.split()) for merge in merges] - vocab = list(bytes_to_unicode().values()) - vocab = vocab + [v + "" for v in vocab] - for merge in merges: - vocab.append("".join(merge)) - if not special_tokens: - special_tokens = ["", ""] - else: - special_tokens = ["", ""] + special_tokens - vocab.extend(special_tokens) - self.encoder = dict(zip(vocab, range(len(vocab)))) - self.decoder = {v: k for k, v in self.encoder.items()} - self.bpe_ranks = dict(zip(merges, range(len(merges)))) - self.cache = {t: t for t in special_tokens} - special = "|".join(special_tokens) - self.pat = re.compile(special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) - - self.vocab_size = len(self.encoder) - self.all_special_ids = [self.encoder[t] for t in special_tokens] - - def bpe(self, token): - if token in self.cache: - return self.cache[token] - word = tuple(token[:-1]) + (token[-1] + "",) - pairs = get_pairs(word) - - if not pairs: - return token + "" - - while True: - bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except: - new_word.extend(word[i:]) - break - - if word[i] == first and i < len(word) - 1 and word[i + 1] == second: - new_word.append(first + second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = " ".join(word) - self.cache[token] = word - return word - - def encode(self, text): - bpe_tokens = [] - text = whitespace_clean(basic_clean(text)).lower() - for token in re.findall(self.pat, text): - token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) - bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")) - return bpe_tokens - - def decode(self, tokens): - text = "".join([self.decoder[token] for token in tokens]) - text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors="replace").replace("", " ") - return text - - -_tokenizer = SimpleTokenizer() - - -def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor: - """ - Returns the tokenized representation of given input string(s) - - Parameters - ---------- - texts 
: Union[str, List[str]] - An input string or a list of input strings to tokenize - context_length : int - The context length to use; all CLIP models use 77 as the context length - - Returns - ------- - A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] - """ - if isinstance(texts, str): - texts = [texts] - - sot_token = _tokenizer.encoder[""] - eot_token = _tokenizer.encoder[""] - all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] - result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) - - for i, tokens in enumerate(all_tokens): - if len(tokens) > context_length: - tokens = tokens[:context_length] # Truncate - tokens[-1] = eot_token - result[i, : len(tokens)] = torch.tensor(tokens) - - return result - - -class HFTokenizer: - "HuggingFace tokenizer wrapper" - - def __init__(self, tokenizer_name: str): - from transformers import AutoTokenizer - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - - def __call__(self, texts: Union[str, List[str]], context_length: int = 77) -> torch.Tensor: - # same cleaning as for default tokenizer, except lowercasing - # adding lower (for case-sensitive tokenizers) will make it more robust but less sensitive to nuance - if isinstance(texts, str): - texts = [texts] - texts = [whitespace_clean(basic_clean(text)) for text in texts] - input_ids = self.tokenizer(texts, return_tensors="pt", max_length=context_length, padding="max_length", truncation=True).input_ids - return input_ids diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/transform.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/transform.py deleted file mode 100755 index 8cad45a1..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/transform.py +++ /dev/null @@ -1,104 +0,0 @@ -from typing import Optional, Sequence, Tuple - -import torch -import torch.nn as nn -import torchvision.transforms.functional as F - -from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, CenterCrop - -from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD - - -class ResizeMaxSize(nn.Module): - - def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn="max", fill=0): - super().__init__() - if not isinstance(max_size, int): - raise TypeError(f"Size should be int. 
Got {type(max_size)}") - self.max_size = max_size - self.interpolation = interpolation - self.fn = min if fn == "min" else min - self.fill = fill - - def forward(self, img): - if isinstance(img, torch.Tensor): - height, width = img.shape[:2] - else: - width, height = img.size - scale = self.max_size / float(max(height, width)) - if scale != 1.0: - new_size = tuple(round(dim * scale) for dim in (height, width)) - img = F.resize(img, new_size, self.interpolation) - pad_h = self.max_size - new_size[0] - pad_w = self.max_size - new_size[1] - img = F.pad(img, padding=[pad_w // 2, pad_h // 2, pad_w - pad_w // 2, pad_h - pad_h // 2], fill=self.fill) - return img - - -def _convert_to_rgb(image): - return image.convert("RGB") - - -# class CatGen(nn.Module): -# def __init__(self, num=4): -# self.num = num -# def mixgen_batch(image, text): -# batch_size = image.shape[0] -# index = np.random.permutation(batch_size) - -# cat_images = [] -# for i in range(batch_size): -# # image mixup -# image[i,:] = lam * image[i,:] + (1 - lam) * image[index[i],:] -# # text concat -# text[i] = tokenizer((str(text[i]) + " " + str(text[index[i]])))[0] -# text = torch.stack(text) -# return image, text - - -def image_transform( - image_size: int, - is_train: bool, - mean: Optional[Tuple[float, ...]] = None, - std: Optional[Tuple[float, ...]] = None, - resize_longest_max: bool = False, - fill_color: int = 0, -): - mean = mean or OPENAI_DATASET_MEAN - if not isinstance(mean, (list, tuple)): - mean = (mean,) * 3 - - std = std or OPENAI_DATASET_STD - if not isinstance(std, (list, tuple)): - std = (std,) * 3 - - if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]: - # for square size, pass size as int so that Resize() uses aspect preserving shortest edge - image_size = image_size[0] - - normalize = Normalize(mean=mean, std=std) - if is_train: - return Compose( - [ - RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC), - _convert_to_rgb, - ToTensor(), - normalize, - ] - ) - else: - if resize_longest_max: - transforms = [ResizeMaxSize(image_size, fill=fill_color)] - else: - transforms = [ - Resize(image_size, interpolation=InterpolationMode.BICUBIC), - CenterCrop(image_size), - ] - transforms.extend( - [ - _convert_to_rgb, - ToTensor(), - normalize, - ] - ) - return Compose(transforms) diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/transformer.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/transformer.py deleted file mode 100755 index bd5ce4b6..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/transformer.py +++ /dev/null @@ -1,683 +0,0 @@ -import os -import logging -from collections import OrderedDict -import math -from typing import Callable, Optional, Sequence -import numpy as np -import torch -from torch import nn -from torch.nn import functional as F - -try: - from timm.models.layers import trunc_normal_ -except: - from timm.layers import trunc_normal_ - -from .rope import VisionRotaryEmbedding, VisionRotaryEmbeddingFast -from .utils import to_2tuple - -if os.getenv("ENV_TYPE") == "deepspeed": - try: - import deepspeed - from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint - except: - print("Please 'pip install deepspeed'") - deepspeed = None - from torch.utils.checkpoint import checkpoint -else: - from torch.utils.checkpoint import checkpoint - -try: - import xformers.ops as xops -except ImportError: - xops = None - # print("Please 'pip install xformers'") - 
- -class LayerNormFp32(nn.LayerNorm): - """Subclass torch's LayerNorm to handle fp16 (by casting to float32 and back).""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def forward(self, x: torch.Tensor): - output = F.layer_norm( - x.float(), - self.normalized_shape, - self.weight.float() if self.weight is not None else None, - self.bias.float() if self.bias is not None else None, - self.eps, - ) - return output.type_as(x) - - -class LayerNorm(nn.LayerNorm): - """Subclass torch's LayerNorm (with cast back to input dtype).""" - - def forward(self, x: torch.Tensor): - orig_type = x.dtype - x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) - return x.to(orig_type) - - -class QuickGELU(nn.Module): - # NOTE This is slower than nn.GELU or nn.SiLU and uses more GPU memory - def forward(self, x: torch.Tensor): - return x * torch.sigmoid(1.702 * x) - - -class LayerScale(nn.Module): - def __init__(self, dim, init_values=1e-5, inplace=False): - super().__init__() - self.inplace = inplace - self.gamma = nn.Parameter(init_values * torch.ones(dim)) - - def forward(self, x): - return x.mul_(self.gamma) if self.inplace else x * self.gamma - - -class PatchDropout(nn.Module): - """ - https://arxiv.org/abs/2212.00794 - """ - - def __init__(self, prob, exclude_first_token=True): - super().__init__() - assert 0 <= prob < 1.0 - self.prob = prob - self.exclude_first_token = exclude_first_token # exclude CLS token - logging.info(f"os.getenv('RoPE')={os.getenv('RoPE')}") - - def forward(self, x): - if not self.training or self.prob == 0.0: - return x - - if self.exclude_first_token: - cls_tokens, x = x[:, :1], x[:, 1:] - else: - cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1]) - - batch = x.size()[0] - num_tokens = x.size()[1] - - batch_indices = torch.arange(batch) - batch_indices = batch_indices[..., None] - - keep_prob = 1 - self.prob - num_patches_keep = max(1, int(num_tokens * keep_prob)) - - rand = torch.randn(batch, num_tokens) - patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices - - x = x[batch_indices, patch_indices_keep] - - if self.exclude_first_token: - x = torch.cat((cls_tokens, x), dim=1) - - if self.training and os.getenv("RoPE") == "1": - return x, patch_indices_keep - - return x - - -def _in_projection_packed( - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - w: torch.Tensor, - b: Optional[torch.Tensor] = None, -): - """ - https://github.com/pytorch/pytorch/blob/db2a237763eb8693a20788be94f8c192e762baa8/torch/nn/functional.py#L4726 - """ - E = q.size(-1) - if k is v: - if q is k: - # self-attention - return F.linear(q, w, b).chunk(3, dim=-1) - else: - # encoder-decoder attention - w_q, w_kv = w.split([E, E * 2]) - if b is None: - b_q = b_kv = None - else: - b_q, b_kv = b.split([E, E * 2]) - return (F.linear(q, w_q, b_q),) + F.linear(k, w_kv, b_kv).chunk(2, dim=-1) - else: - w_q, w_k, w_v = w.chunk(3) - if b is None: - b_q = b_k = b_v = None - else: - b_q, b_k, b_v = b.chunk(3) - return F.linear(q, w_q, b_q), F.linear(k, w_k, b_k), F.linear(v, w_v, b_v) - - -class Attention(nn.Module): - def __init__(self, dim, num_heads=8, qkv_bias=True, scaled_cosine=False, scale_heads=False, logit_scale_max=math.log(1.0 / 0.01), attn_drop=0.0, proj_drop=0.0, xattn=False, rope=False): - super().__init__() - self.scaled_cosine = scaled_cosine - self.scale_heads = scale_heads - assert dim % num_heads == 0, "dim should be divisible by num_heads" - self.num_heads = num_heads - self.head_dim = dim // num_heads - self.scale = 
self.head_dim**-0.5 - self.logit_scale_max = logit_scale_max - - # keeping in_proj in this form (instead of nn.Linear) to match weight scheme of original - self.in_proj_weight = nn.Parameter(torch.randn((dim * 3, dim)) * self.scale) - if qkv_bias: - self.in_proj_bias = nn.Parameter(torch.zeros(dim * 3)) - else: - self.in_proj_bias = None - - if self.scaled_cosine: - self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1)))) - else: - self.logit_scale = None - self.attn_drop = nn.Dropout(attn_drop) - if self.scale_heads: - self.head_scale = nn.Parameter(torch.ones((num_heads, 1, 1))) - else: - self.head_scale = None - self.out_proj = nn.Linear(dim, dim) - self.out_drop = nn.Dropout(proj_drop) - self.xattn = xattn - self.xattn_drop = attn_drop - self.rope = rope - - def forward(self, x, attn_mask: Optional[torch.Tensor] = None): - L, N, C = x.shape - q, k, v = F.linear(x, self.in_proj_weight, self.in_proj_bias).chunk(3, dim=-1) - if self.xattn: - q = q.contiguous().view(L, N, self.num_heads, -1).transpose(0, 1) - k = k.contiguous().view(L, N, self.num_heads, -1).transpose(0, 1) - v = v.contiguous().view(L, N, self.num_heads, -1).transpose(0, 1) - - x = xops.memory_efficient_attention( - q, - k, - v, - p=self.xattn_drop, - scale=self.scale if self.logit_scale is None else None, - attn_bias=xops.LowerTriangularMask() if attn_mask is not None else None, - ) - else: - q = q.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1) - k = k.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1) - v = v.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1) - - if self.logit_scale is not None: - attn = torch.bmm(F.normalize(q, dim=-1), F.normalize(k, dim=-1).transpose(-1, -2)) - logit_scale = torch.clamp(self.logit_scale, max=self.logit_scale_max).exp() - attn = attn.view(N, self.num_heads, L, L) * logit_scale - attn = attn.view(-1, L, L) - else: - q = q * self.scale - attn = torch.bmm(q, k.transpose(-1, -2)) - - if attn_mask is not None: - if attn_mask.dtype == torch.bool: - new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype) - new_attn_mask.masked_fill_(attn_mask, float("-inf")) - attn_mask = new_attn_mask - attn += attn_mask - - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = torch.bmm(attn, v) - - if self.head_scale is not None: - x = x.view(N, self.num_heads, L, C) * self.head_scale - x = x.view(-1, L, C) - x = x.transpose(0, 1).reshape(L, N, C) - x = self.out_proj(x) - x = self.out_drop(x) - return x - - -class CustomAttention(nn.Module): - def __init__(self, dim, num_heads=8, qkv_bias=True, scaled_cosine=True, scale_heads=False, logit_scale_max=math.log(1.0 / 0.01), attn_drop=0.0, proj_drop=0.0, xattn=False): - super().__init__() - self.scaled_cosine = scaled_cosine - self.scale_heads = scale_heads - assert dim % num_heads == 0, "dim should be divisible by num_heads" - self.num_heads = num_heads - self.head_dim = dim // num_heads - self.scale = self.head_dim**-0.5 - self.logit_scale_max = logit_scale_max - - # keeping in_proj in this form (instead of nn.Linear) to match weight scheme of original - self.in_proj_weight = nn.Parameter(torch.randn((dim * 3, dim)) * self.scale) - if qkv_bias: - self.in_proj_bias = nn.Parameter(torch.zeros(dim * 3)) - else: - self.in_proj_bias = None - - if self.scaled_cosine: - self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1)))) - else: - self.logit_scale = None - self.attn_drop = nn.Dropout(attn_drop) - if self.scale_heads: - self.head_scale = 
nn.Parameter(torch.ones((num_heads, 1, 1))) - else: - self.head_scale = None - self.out_proj = nn.Linear(dim, dim) - self.out_drop = nn.Dropout(proj_drop) - self.xattn = xattn - self.xattn_drop = attn_drop - - def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attn_mask: Optional[torch.Tensor] = None): - q, k, v = _in_projection_packed(query, key, value, self.in_proj_weight, self.in_proj_bias) - N_q, B_q, C_q = q.shape - N_k, B_k, C_k = k.shape - N_v, B_v, C_v = v.shape - if self.xattn: - # B, N, C -> B, N, num_heads, C - q = q.permute(1, 0, 2).reshape(B_q, N_q, self.num_heads, -1) - k = k.permute(1, 0, 2).reshape(B_k, N_k, self.num_heads, -1) - v = v.permute(1, 0, 2).reshape(B_v, N_v, self.num_heads, -1) - - x = xops.memory_efficient_attention(q, k, v, p=self.xattn_drop, scale=self.scale if self.logit_scale is None else None, attn_bias=xops.LowerTriangularMask() if attn_mask is not None else None) - else: - # B*H, L, C - q = q.contiguous().view(N_q, B_q * self.num_heads, -1).transpose(0, 1) - k = k.contiguous().view(N_k, B_k * self.num_heads, -1).transpose(0, 1) - v = v.contiguous().view(N_v, B_v * self.num_heads, -1).transpose(0, 1) - - if self.logit_scale is not None: - # B*H, N_q, N_k - attn = torch.bmm(F.normalize(q, dim=-1), F.normalize(k, dim=-1).transpose(-1, -2)) - logit_scale = torch.clamp(self.logit_scale, max=self.logit_scale_max).exp() - attn = attn.view(B_q, self.num_heads, N_q, N_k) * logit_scale - attn = attn.view(-1, N_q, N_k) - else: - q = q * self.scale - attn = torch.bmm(q, k.transpose(-1, -2)) - - if attn_mask is not None: - if attn_mask.dtype == torch.bool: - new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype) - new_attn_mask.masked_fill_(attn_mask, float("-inf")) - attn_mask = new_attn_mask - attn += attn_mask - - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = torch.bmm(attn, v) - - if self.head_scale is not None: - x = x.view(B_q, self.num_heads, N_q, C_q) * self.head_scale - x = x.view(-1, N_q, C_q) - x = x.transpose(0, 1).reshape(N_q, B_q, C_q) - x = self.out_proj(x) - x = self.out_drop(x) - return x - - -class CustomResidualAttentionBlock(nn.Module): - def __init__( - self, - d_model: int, - n_head: int, - mlp_ratio: float = 4.0, - ls_init_value: float = None, - act_layer: Callable = nn.GELU, - norm_layer: Callable = LayerNorm, - scale_cosine_attn: bool = False, - scale_heads: bool = False, - scale_attn: bool = False, - scale_fc: bool = False, - cross_attn: bool = False, - xattn: bool = False, - ): - super().__init__() - - self.ln_1 = norm_layer(d_model) - self.ln_1_k = norm_layer(d_model) if cross_attn else self.ln_1 - self.ln_1_v = norm_layer(d_model) if cross_attn else self.ln_1 - self.attn = CustomAttention(d_model, n_head, qkv_bias=True, attn_drop=0.0, proj_drop=0.0, scaled_cosine=scale_cosine_attn, scale_heads=scale_heads, xattn=xattn) - - self.ln_attn = norm_layer(d_model) if scale_attn else nn.Identity() - self.ls_1 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity() - - self.ln_2 = norm_layer(d_model) - mlp_width = int(d_model * mlp_ratio) - self.mlp = nn.Sequential(OrderedDict([("c_fc", nn.Linear(d_model, mlp_width)), ("ln", norm_layer(mlp_width) if scale_fc else nn.Identity()), ("gelu", act_layer()), ("c_proj", nn.Linear(mlp_width, d_model))])) - - self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity() - - def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, attn_mask: Optional[torch.Tensor] = None): - q = q + 
self.ls_1(self.ln_attn(self.attn(self.ln_1(q), self.ln_1_k(k), self.ln_1_v(v), attn_mask=attn_mask))) - q = q + self.ls_2(self.mlp(self.ln_2(q))) - return q - - -class CustomTransformer(nn.Module): - def __init__( - self, - width: int, - layers: int, - heads: int, - mlp_ratio: float = 4.0, - ls_init_value: float = None, - act_layer: Callable = nn.GELU, - norm_layer: Callable = LayerNorm, - scale_cosine_attn: bool = True, - scale_heads: bool = False, - scale_attn: bool = False, - scale_fc: bool = False, - cross_attn: bool = False, - xattn: bool = False, - ): - super().__init__() - self.width = width - self.layers = layers - self.grad_checkpointing = False - self.xattn = xattn - - self.resblocks = nn.ModuleList( - [ - CustomResidualAttentionBlock( - width, - heads, - mlp_ratio, - ls_init_value=ls_init_value, - act_layer=act_layer, - norm_layer=norm_layer, - scale_cosine_attn=scale_cosine_attn, - scale_heads=scale_heads, - scale_attn=scale_attn, - scale_fc=scale_fc, - cross_attn=cross_attn, - xattn=xattn, - ) - for _ in range(layers) - ] - ) - - def get_cast_dtype(self) -> torch.dtype: - return self.resblocks[0].mlp.c_fc.weight.dtype - - def forward(self, q: torch.Tensor, k: torch.Tensor = None, v: torch.Tensor = None, attn_mask: Optional[torch.Tensor] = None): - if k is None and v is None: - k = v = q - for r in self.resblocks: - if self.grad_checkpointing and not torch.jit.is_scripting(): - q = checkpoint(r, q, k, v, attn_mask) - else: - q = r(q, k, v, attn_mask=attn_mask) - return q - - -class ResidualAttentionBlock(nn.Module): - def __init__( - self, - d_model: int, - n_head: int, - mlp_ratio: float = 4.0, - ls_init_value: float = None, - act_layer: Callable = nn.GELU, - norm_layer: Callable = LayerNorm, - xattn: bool = False, - ): - super().__init__() - - self.ln_1 = norm_layer(d_model) - if xattn: - self.attn = Attention(d_model, n_head, xattn=True) - else: - self.attn = nn.MultiheadAttention(d_model, n_head) - self.ls_1 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity() - - self.ln_2 = norm_layer(d_model) - mlp_width = int(d_model * mlp_ratio) - self.mlp = nn.Sequential(OrderedDict([("c_fc", nn.Linear(d_model, mlp_width)), ("gelu", act_layer()), ("c_proj", nn.Linear(mlp_width, d_model))])) - - self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity() - self.xattn = xattn - - def attention(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None): - attn_mask = attn_mask.to(x.dtype) if attn_mask is not None else None - if self.xattn: - return self.attn(x, attn_mask=attn_mask) - return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask)[0] - - def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None): - x = x + self.ls_1(self.attention(self.ln_1(x), attn_mask=attn_mask)) - x = x + self.ls_2(self.mlp(self.ln_2(x))) - return x - - -class Transformer(nn.Module): - def __init__( - self, - width: int, - layers: int, - heads: int, - mlp_ratio: float = 4.0, - ls_init_value: float = None, - act_layer: Callable = nn.GELU, - norm_layer: Callable = LayerNorm, - xattn: bool = False, - ): - super().__init__() - self.width = width - self.layers = layers - self.grad_checkpointing = False - - self.resblocks = nn.ModuleList([ResidualAttentionBlock(width, heads, mlp_ratio, ls_init_value=ls_init_value, act_layer=act_layer, norm_layer=norm_layer, xattn=xattn) for _ in range(layers)]) - - def get_cast_dtype(self) -> torch.dtype: - return self.resblocks[0].mlp.c_fc.weight.dtype - - def forward(self, x: 
torch.Tensor, attn_mask: Optional[torch.Tensor] = None): - for r in self.resblocks: - if self.grad_checkpointing and not torch.jit.is_scripting(): - x = checkpoint(r, x, attn_mask) - else: - x = r(x, attn_mask=attn_mask) - return x - - -class VisionTransformer(nn.Module): - def __init__( - self, - image_size: int, - patch_size: int, - width: int, - layers: int, - heads: int, - mlp_ratio: float, - ls_init_value: float = None, - patch_dropout: float = 0.0, - global_average_pool: bool = False, - output_dim: int = 512, - act_layer: Callable = nn.GELU, - norm_layer: Callable = LayerNorm, - xattn: bool = False, - ): - super().__init__() - self.image_size = to_2tuple(image_size) - self.patch_size = to_2tuple(patch_size) - self.grid_size = (self.image_size[0] // self.patch_size[0], self.image_size[1] // self.patch_size[1]) - self.output_dim = output_dim - self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False) - - scale = width**-0.5 - self.class_embedding = nn.Parameter(scale * torch.randn(width)) - self.positional_embedding = nn.Parameter(scale * torch.randn(self.grid_size[0] * self.grid_size[1] + 1, width)) - - # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn - self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0.0 else nn.Identity() - self.ln_pre = norm_layer(width) - - self.transformer = Transformer(width, layers, heads, mlp_ratio, ls_init_value=ls_init_value, act_layer=act_layer, norm_layer=norm_layer, xattn=xattn) - - self.global_average_pool = global_average_pool - self.ln_post = norm_layer(width) - self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) - - def lock(self, unlocked_groups=0, freeze_bn_stats=False): - for param in self.parameters(): - param.requires_grad = False - - if unlocked_groups != 0: - groups = [ - [ - self.conv1, - self.class_embedding, - self.positional_embedding, - self.ln_pre, - ], - *self.transformer.resblocks[:-1], - [ - self.transformer.resblocks[-1], - self.ln_post, - ], - self.proj, - ] - - def _unlock(x): - if isinstance(x, Sequence): - for g in x: - _unlock(g) - else: - if isinstance(x, torch.nn.Parameter): - x.requires_grad = True - else: - for p in x.parameters(): - p.requires_grad = True - - _unlock(groups[-unlocked_groups:]) - - def get_num_layers(self): - return self.transformer.layers - - @torch.jit.ignore - def set_grad_checkpointing(self, enable=True): - self.transformer.grad_checkpointing = enable - - @torch.jit.ignore - def no_weight_decay(self): - return {"positional_embedding", "class_embedding"} - - def forward(self, x: torch.Tensor, return_all_features: bool = False): - x = self.conv1(x) # shape = [*, width, grid, grid] - x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] - x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] - x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width] - x = x + self.positional_embedding.to(x.dtype) - - # a patch_dropout of 0. 
would mean it is disabled and this function would do nothing but return what was passed in - x = self.patch_dropout(x) - x = self.ln_pre(x) - - x = x.permute(1, 0, 2) # NLD -> LND - x = self.transformer(x) - x = x.permute(1, 0, 2) # LND -> NLD - - if not return_all_features: - if self.global_average_pool: - x = x.mean(dim=1) # x = x[:,1:,:].mean(dim=1) - else: - x = x[:, 0] - - x = self.ln_post(x) - - if self.proj is not None: - x = x @ self.proj - - return x - - -class TextTransformer(nn.Module): - def __init__( - self, - context_length: int = 77, - vocab_size: int = 49408, - width: int = 512, - heads: int = 8, - layers: int = 12, - ls_init_value: float = None, - output_dim: int = 512, - act_layer: Callable = nn.GELU, - norm_layer: Callable = LayerNorm, - xattn: bool = False, - attn_mask: bool = True, - ): - super().__init__() - self.context_length = context_length - self.vocab_size = vocab_size - self.width = width - self.output_dim = output_dim - - self.token_embedding = nn.Embedding(vocab_size, width) - self.positional_embedding = nn.Parameter(torch.empty(self.context_length, width)) - self.transformer = Transformer(width=width, layers=layers, heads=heads, ls_init_value=ls_init_value, act_layer=act_layer, norm_layer=norm_layer, xattn=xattn) - - self.xattn = xattn - self.ln_final = norm_layer(width) - self.text_projection = nn.Parameter(torch.empty(width, output_dim)) - - if attn_mask: - self.register_buffer("attn_mask", self.build_attention_mask(), persistent=False) - else: - self.attn_mask = None - - self.init_parameters() - - def init_parameters(self): - nn.init.normal_(self.token_embedding.weight, std=0.02) - nn.init.normal_(self.positional_embedding, std=0.01) - - proj_std = (self.transformer.width**-0.5) * ((2 * self.transformer.layers) ** -0.5) - attn_std = self.transformer.width**-0.5 - fc_std = (2 * self.transformer.width) ** -0.5 - for block in self.transformer.resblocks: - nn.init.normal_(block.attn.in_proj_weight, std=attn_std) - nn.init.normal_(block.attn.out_proj.weight, std=proj_std) - nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) - nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) - - if self.text_projection is not None: - nn.init.normal_(self.text_projection, std=self.transformer.width**-0.5) - - @torch.jit.ignore - def set_grad_checkpointing(self, enable=True): - self.transformer.grad_checkpointing = enable - - @torch.jit.ignore - def no_weight_decay(self): - # return {'positional_embedding', 'token_embedding'} - return {"positional_embedding"} - - def get_num_layers(self): - return self.transformer.layers - - def build_attention_mask(self): - # lazily create causal attention mask, with full attention between the vision tokens - # pytorch uses additive attention mask; fill with -inf - mask = torch.empty(self.context_length, self.context_length) - mask.fill_(float("-inf")) - mask.triu_(1) # zero out the lower diagonal - return mask - - def forward(self, text, return_all_features: bool = False): - cast_dtype = self.transformer.get_cast_dtype() - x = self.token_embedding(text).to(cast_dtype) # [batch_size, n_ctx, d_model] - - x = x + self.positional_embedding.to(cast_dtype) - x = x.permute(1, 0, 2) # NLD -> LND - x = self.transformer(x, attn_mask=self.attn_mask) - # x = self.transformer(x) # no attention mask is applied - x = x.permute(1, 0, 2) # LND -> NLD - x = self.ln_final(x) - - if not return_all_features: - # x.shape = [batch_size, n_ctx, transformer.width] - # take features from the eot embedding (eot_token is the highest number in each sequence) - x = 
x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection - return x diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/utils.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/utils.py deleted file mode 100755 index 73b6d662..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/utils.py +++ /dev/null @@ -1,321 +0,0 @@ -from itertools import repeat -import collections.abc -import logging -import math -import numpy as np - -import torch -from torch import nn as nn -from torchvision.ops.misc import FrozenBatchNorm2d -import torch.nn.functional as F - - -# open CLIP -def resize_clip_pos_embed(state_dict, model, interpolation: str = "bicubic", seq_dim=1): - # Rescale the grid of position embeddings when loading from state_dict - old_pos_embed = state_dict.get("visual.positional_embedding", None) - if old_pos_embed is None or not hasattr(model.visual, "grid_size"): - return - grid_size = to_2tuple(model.visual.grid_size) - extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more) - new_seq_len = grid_size[0] * grid_size[1] + extra_tokens - if new_seq_len == old_pos_embed.shape[0]: - return - - if extra_tokens: - pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:] - else: - pos_emb_tok, pos_emb_img = None, old_pos_embed - old_grid_size = to_2tuple(int(math.sqrt(len(pos_emb_img)))) - - logging.info("Resizing position embedding grid-size from %s to %s", old_grid_size, grid_size) - pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2) - pos_emb_img = F.interpolate( - pos_emb_img, - size=grid_size, - mode=interpolation, - align_corners=True, - ) - pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0] - if pos_emb_tok is not None: - new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0) - else: - new_pos_embed = pos_emb_img - state_dict["visual.positional_embedding"] = new_pos_embed - - -def resize_visual_pos_embed(state_dict, model, interpolation: str = "bicubic", seq_dim=1): - # Rescale the grid of position embeddings when loading from state_dict - old_pos_embed = state_dict.get("positional_embedding", None) - if old_pos_embed is None or not hasattr(model.visual, "grid_size"): - return - grid_size = to_2tuple(model.visual.grid_size) - extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more) - new_seq_len = grid_size[0] * grid_size[1] + extra_tokens - if new_seq_len == old_pos_embed.shape[0]: - return - - if extra_tokens: - pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:] - else: - pos_emb_tok, pos_emb_img = None, old_pos_embed - old_grid_size = to_2tuple(int(math.sqrt(len(pos_emb_img)))) - - logging.info("Resizing position embedding grid-size from %s to %s", old_grid_size, grid_size) - pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2) - pos_emb_img = F.interpolate( - pos_emb_img, - size=grid_size, - mode=interpolation, - align_corners=True, - ) - pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0] - if pos_emb_tok is not None: - new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0) - else: - new_pos_embed = pos_emb_img - state_dict["positional_embedding"] = new_pos_embed - - -def resize_evaclip_pos_embed(state_dict, model, interpolation: str = "bicubic", seq_dim=1): - all_keys = list(state_dict.keys()) - # 
interpolate position embedding - if "visual.pos_embed" in state_dict: - pos_embed_checkpoint = state_dict["visual.pos_embed"] - embedding_size = pos_embed_checkpoint.shape[-1] - num_patches = model.visual.patch_embed.num_patches - # num_extra_tokens = model.visual.pos_embed.shape[-2] - num_patches - num_extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more) - # height (== width) for the checkpoint position embedding - orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) - # height (== width) for the new position embedding - new_size = int(num_patches**0.5) - # class_token and dist_token are kept unchanged - if orig_size != new_size: - print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) - extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] - # only the position tokens are interpolated - pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] - pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) - pos_tokens = torch.nn.functional.interpolate(pos_tokens, size=(new_size, new_size), mode="bicubic", align_corners=False) - pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) - new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) - state_dict["visual.pos_embed"] = new_pos_embed - - patch_embed_proj = state_dict["visual.patch_embed.proj.weight"] - patch_size = model.visual.patch_embed.patch_size - state_dict["visual.patch_embed.proj.weight"] = torch.nn.functional.interpolate(patch_embed_proj.float(), size=patch_size, mode="bicubic", align_corners=False) - - -def resize_eva_pos_embed(state_dict, model, interpolation: str = "bicubic", seq_dim=1): - all_keys = list(state_dict.keys()) - # interpolate position embedding - if "pos_embed" in state_dict: - pos_embed_checkpoint = state_dict["pos_embed"] - embedding_size = pos_embed_checkpoint.shape[-1] - num_patches = model.visual.patch_embed.num_patches - # num_extra_tokens = model.visual.pos_embed.shape[-2] - num_patches - num_extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more) - # height (== width) for the checkpoint position embedding - orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) - # height (== width) for the new position embedding - new_size = int(num_patches**0.5) - # class_token and dist_token are kept unchanged - if orig_size != new_size: - print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) - extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] - # only the position tokens are interpolated - pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] - pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) - pos_tokens = torch.nn.functional.interpolate(pos_tokens, size=(new_size, new_size), mode="bicubic", align_corners=False) - pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) - new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) - state_dict["pos_embed"] = new_pos_embed - - patch_embed_proj = state_dict["patch_embed.proj.weight"] - patch_size = model.visual.patch_embed.patch_size - state_dict["patch_embed.proj.weight"] = torch.nn.functional.interpolate(patch_embed_proj.float(), size=patch_size, mode="bicubic", align_corners=False) - - -def resize_rel_pos_embed(state_dict, model, interpolation: str = "bicubic", seq_dim=1): - all_keys = list(state_dict.keys()) - for key in all_keys: - if "relative_position_index" in key: - 
state_dict.pop(key) - - if "relative_position_bias_table" in key: - rel_pos_bias = state_dict[key] - src_num_pos, num_attn_heads = rel_pos_bias.size() - dst_num_pos, _ = model.visual.state_dict()[key].size() - dst_patch_shape = model.visual.patch_embed.patch_shape - if dst_patch_shape[0] != dst_patch_shape[1]: - raise NotImplementedError() - num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * (dst_patch_shape[1] * 2 - 1) - src_size = int((src_num_pos - num_extra_tokens) ** 0.5) - dst_size = int((dst_num_pos - num_extra_tokens) ** 0.5) - if src_size != dst_size: - print("Position interpolate for %s from %dx%d to %dx%d" % (key, src_size, src_size, dst_size, dst_size)) - extra_tokens = rel_pos_bias[-num_extra_tokens:, :] - rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] - - def geometric_progression(a, r, n): - return a * (1.0 - r**n) / (1.0 - r) - - left, right = 1.01, 1.5 - while right - left > 1e-6: - q = (left + right) / 2.0 - gp = geometric_progression(1, q, src_size // 2) - if gp > dst_size // 2: - right = q - else: - left = q - - # if q > 1.090307: - # q = 1.090307 - - dis = [] - cur = 1 - for i in range(src_size // 2): - dis.append(cur) - cur += q ** (i + 1) - - r_ids = [-_ for _ in reversed(dis)] - - x = r_ids + [0] + dis - y = r_ids + [0] + dis - - t = dst_size // 2.0 - dx = np.arange(-t, t + 0.1, 1.0) - dy = np.arange(-t, t + 0.1, 1.0) - - print("Original positions = %s" % str(x)) - print("Target positions = %s" % str(dx)) - - all_rel_pos_bias = [] - - for i in range(num_attn_heads): - z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy() - f = F.interpolate.interp2d(x, y, z, kind="cubic") - all_rel_pos_bias.append(torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to(rel_pos_bias.device)) - - rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1) - - new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), dim=0) - state_dict[key] = new_rel_pos_bias - - # interpolate position embedding - if "pos_embed" in state_dict: - pos_embed_checkpoint = state_dict["pos_embed"] - embedding_size = pos_embed_checkpoint.shape[-1] - num_patches = model.visual.patch_embed.num_patches - num_extra_tokens = model.visual.pos_embed.shape[-2] - num_patches - # height (== width) for the checkpoint position embedding - orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) - # height (== width) for the new position embedding - new_size = int(num_patches**0.5) - # class_token and dist_token are kept unchanged - if orig_size != new_size: - print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) - extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] - # only the position tokens are interpolated - pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] - pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) - pos_tokens = torch.nn.functional.interpolate(pos_tokens, size=(new_size, new_size), mode="bicubic", align_corners=False) - pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) - new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) - state_dict["pos_embed"] = new_pos_embed - - patch_embed_proj = state_dict["patch_embed.proj.weight"] - patch_size = model.visual.patch_embed.patch_size - state_dict["patch_embed.proj.weight"] = torch.nn.functional.interpolate(patch_embed_proj.float(), size=patch_size, mode="bicubic", align_corners=False) - - -def freeze_batch_norm_2d(module, module_match={}, name=""): - """ - Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided 
module into `FrozenBatchNorm2d`. If `module` is - itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and - returned. Otherwise, the module is walked recursively and submodules are converted in place. - - Args: - module (torch.nn.Module): Any PyTorch module. - module_match (dict): Dictionary of full module names to freeze (all if empty) - name (str): Full module name (prefix) - - Returns: - torch.nn.Module: Resulting module - - Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762 - """ - res = module - is_match = True - if module_match: - is_match = name in module_match - if is_match and isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)): - res = FrozenBatchNorm2d(module.num_features) - res.num_features = module.num_features - res.affine = module.affine - if module.affine: - res.weight.data = module.weight.data.clone().detach() - res.bias.data = module.bias.data.clone().detach() - res.running_mean.data = module.running_mean.data - res.running_var.data = module.running_var.data - res.eps = module.eps - else: - for child_name, child in module.named_children(): - full_child_name = ".".join([name, child_name]) if name else child_name - new_child = freeze_batch_norm_2d(child, module_match, full_child_name) - if new_child is not child: - res.add_module(child_name, new_child) - return res - - -# From PyTorch internals -def _ntuple(n): - def parse(x): - if isinstance(x, collections.abc.Iterable): - return x - return tuple(repeat(x, n)) - - return parse - - -to_1tuple = _ntuple(1) -to_2tuple = _ntuple(2) -to_3tuple = _ntuple(3) -to_4tuple = _ntuple(4) -to_ntuple = lambda n, x: _ntuple(n)(x) - - -def is_logging(args): - def is_global_master(args): - return args.rank == 0 - - def is_local_master(args): - return args.local_rank == 0 - - def is_master(args, local=False): - return is_local_master(args) if local else is_global_master(args) - - return is_master - - -class AllGather(torch.autograd.Function): - """An autograd function that performs allgather on a tensor. - Performs all_gather operation on the provided tensors. - *** Warning ***: torch.distributed.all_gather has no gradient. 
- """ - - @staticmethod - def forward(ctx, tensor, rank, world_size): - tensors_gather = [torch.empty_like(tensor) for _ in range(world_size)] - torch.distributed.all_gather(tensors_gather, tensor) - ctx.rank = rank - ctx.batch_size = tensor.shape[0] - return torch.cat(tensors_gather, 0) - - @staticmethod - def backward(ctx, grad_output): - return (grad_output[ctx.batch_size * ctx.rank : ctx.batch_size * (ctx.rank + 1)], None, None) - - -allgather = AllGather.apply diff --git a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_vit.py b/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_vit.py deleted file mode 100755 index 9bd74655..00000000 --- a/llava_next/llava/model/multimodal_encoder/dev_eva_clip/eva_vit.py +++ /dev/null @@ -1,141 +0,0 @@ -# Based on EVA, BEIT, timm and DeiT code bases -# https://github.com/baaivision/EVA -# https://github.com/rwightman/pytorch-image-models/tree/master/timm -# https://github.com/microsoft/unilm/tree/master/beit -# https://github.com/facebookresearch/deit/ -# https://github.com/facebookresearch/dino -# --------------------------------------------------------' -# not tested yet -import math -from transformers import CLIPImageProcessor - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint as checkpoint -from timm.models.layers import drop_path, to_2tuple, trunc_normal_ -from .eva_clip import create_model_and_transforms, get_model_config -import torch -import torchvision -import time - -from llava.utils import rank0_print - - -class EvaViTWrapper(nn.Module): - def __init__(self, vision_tower, args, delay_load=False): - super().__init__() - - self.is_loaded = False - self.vision_tower_name = vision_tower - self.pretrained = args.vision_tower_pretrained - self.args = args - - self.select_layer = args.mm_vision_select_layer - if self.select_layer < -1: - self.select_layer += 1 - self.select_feature = getattr(args, "mm_vision_select_feature", "patch") - - self.model_config = get_model_config(self.vision_tower_name) - - if not delay_load: - rank0_print(f"Loading vision tower: {vision_tower}") - self.load_model() - elif getattr(args, "unfreeze_mm_vision_tower", False): - # TODO: better detector is needed. 
- rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") - self.load_model() - elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") - self.load_model() - - def load_model(self): - rank0_print(f"Loading: {self.vision_tower_name}") - rank0_print(f"Pretrained: {self.pretrained}") - time_start = time.time() - model, _, image_processor = create_model_and_transforms(self.vision_tower_name, self.pretrained, force_custom_clip=True, precision="fp16") - time_end = time.time() - rank0_print(f"Loaded: {self.vision_tower_name} in {time_end - time_start:.2f}s") - self.device = next(model.parameters()).device - self.dtype = next(model.parameters()).dtype - if self.device.type != "meta": - model = model.to("cuda") - self.vision_tower = model.visual - resize_transform = [t for t in image_processor.transforms if isinstance(t, torchvision.transforms.Resize)][0] - normalize_transform = [t for t in image_processor.transforms if isinstance(t, torchvision.transforms.Normalize)][0] - self.resize_transform_size = resize_transform.size - self.image_processor = CLIPImageProcessor.from_pretrained( - "openai/clip-vit-large-patch14", - crop_size=resize_transform.size, - size={"shortest_edge": resize_transform.size}, - image_mean=list(normalize_transform.mean), - image_std=list(normalize_transform.std), - ) - rank0_print(f"Loaded image processor: {self.image_processor}") - self.vision_tower.requires_grad_(False) - self.is_loaded = True - - def feature_select(self, image_features): - select_feature_type = self.select_feature - - # if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: - # select_every_k_layer = len(image_features) // 4 - # image_features = torch.cat([image_features[i] for i in range(select_every_k_layer + self.select_layer, len(image_features), select_every_k_layer)], dim=-1) - # select_feature_type = select_feature_type.replace("slicefour_", "") - # elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]: - # select_layers = [-1, -4, -7, -10, 6] - # image_features = torch.cat([image_features[i] for i in select_layers], dim=-1) - # select_feature_type = select_feature_type.replace("slice_m25811_f6_", "") - # else: - # image_features = image_features[self.select_layer] - - if select_feature_type == "patch": - image_features = image_features[:, 1:] - elif select_feature_type == "cls_patch": - image_features = image_features - else: - raise ValueError(f"Unexpected select feature: {select_feature_type}") - return image_features - - def train(self, mode=True): - self.training = mode - - if self.is_loaded: - self.vision_tower.eval() - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_features = self.vision_tower.forward_features(image.to(self.dtype), return_all_features=True) - image_features = self.feature_select(image_features).to(self.dtype) - image_features.append(image_features) - else: - image_features = self.vision_tower.forward_features(images.to(self.dtype), return_all_features=True) - image_features = self.feature_select(image_features).to(self.dtype) - - return image_features - - @property - def dummy_feature(self): - return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) - - @property - def hidden_size(self): - return self.model_config["vision_cfg"]["width"] - - @property - 
def num_patches(self): - return (self.model_config["vision_cfg"]["image_size"] // self.model_config["vision_cfg"]["patch_size"]) ** 2 - - @property - def num_patches_per_side(self): - return self.model_config["vision_cfg"]["image_size"] // self.model_config["vision_cfg"]["patch_size"] - - @property - def config(self): - return self.model_config - - @property - def image_size(self): - return self.model_config["vision_cfg"]["image_size"] diff --git a/llava_next/llava/model/multimodal_encoder/eva_8B_448px/configuration_evaclip.py b/llava_next/llava/model/multimodal_encoder/eva_8B_448px/configuration_evaclip.py deleted file mode 100755 index 29115cb0..00000000 --- a/llava_next/llava/model/multimodal_encoder/eva_8B_448px/configuration_evaclip.py +++ /dev/null @@ -1,411 +0,0 @@ -# coding=utf-8 -""" EvaCLIP model configuration""" -# Code mainly copied here: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/configuration_clip.py -# and adjusted for evaclip - -import copy -import os -from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union - - -if TYPE_CHECKING: - from transformers.processing_utils import ProcessorMixin - from transformers.utils import TensorType - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - - -class EvaCLIPTextConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP - text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the text encoder of the CLIP - [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - vocab_size (`int`, *optional*, defaults to 49408): - Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by - the `inputs_ids` passed when calling [`CLIPModel`]. - hidden_size (`int`, *optional*, defaults to 512): - Dimensionality of the encoder layers and the pooler layer. - intermediate_size (`int`, *optional*, defaults to 2048): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 8): - Number of attention heads for each attention layer in the Transformer encoder. - max_position_embeddings (`int`, *optional*, defaults to 77): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): - The epsilon used by the layer normalization layers. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. 
- initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float`, *optional*, defaults to 1): - A factor for initializing all weight matrices (should be kept to 1, used internally for initialization - testing). - - Example: - - ```python - >>> from transformers import CLIPTextConfig, CLIPTextModel - - >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration - >>> configuration = CLIPTextConfig() - - >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration - >>> model = CLIPTextModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "clip_text_model" - - def __init__( - self, - vocab_size=49408, - hidden_size=512, - intermediate_size=2048, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=8, - max_position_embeddings=77, - hidden_act="gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - q_bias=True, - k_bias=True, - v_bias=True, - post_layernorm=False, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - use_rms_norm=False, - **kwargs, - ): - super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.q_bias=q_bias - self.k_bias=k_bias - self.v_bias=v_bias - self.post_layernorm = post_layernorm - self.attention_dropout = attention_dropout - self.use_rms_norm = use_rms_norm - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from CLIPConfig - if config_dict.get("model_type") == "clip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - - -class EvaCLIPVisionConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a - CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP - [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. 
- - Args: - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - image_size (`int`, *optional*, defaults to 224): - The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 32): - The size (resolution) of each patch. - hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): - The epsilon used by the layer normalization layers. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float`, *optional*, defaults to 1): - A factor for initializing all weight matrices (should be kept to 1, used internally for initialization - testing). - - Example: - - ```python - >>> from transformers import CLIPVisionConfig, CLIPVisionModel - - >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration - >>> configuration = CLIPVisionConfig() - - >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration - >>> model = CLIPVisionModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "clip_vision_model" - - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=32, - hidden_act="gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - q_bias=True, - k_bias=True, - v_bias=True, - post_layernorm=False, - use_rms_norm=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.q_bias=q_bias - self.k_bias=k_bias - self.v_bias=v_bias - self.post_layernorm = post_layernorm - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.use_rms_norm = use_rms_norm - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from CLIPConfig - if config_dict.get("model_type") == "clip": - config_dict = config_dict["vision_config"] - - if "model_type" in 
config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - - -class EvaCLIPConfig(PretrainedConfig): - r""" - [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate - a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating - a configuration with the defaults will yield a similar configuration to that of the CLIP - [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - text_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`CLIPTextConfig`]. - vision_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`CLIPVisionConfig`]. - projection_dim (`int`, *optional*, defaults to 512): - Dimentionality of text and vision projection layers. - logit_scale_init_value (`float`, *optional*, defaults to 2.6592): - The inital value of the *logit_scale* paramter. Default is used as per the original CLIP implementation. - kwargs (*optional*): - Dictionary of keyword arguments. - - Example: - - ```python - >>> from transformers import CLIPConfig, CLIPModel - - >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration - >>> configuration = CLIPConfig() - - >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration - >>> model = CLIPModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - - >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig - >>> from transformers import CLIPTextConfig, CLIPVisionConfig - - >>> # Initializing a CLIPText and CLIPVision configuration - >>> config_text = CLIPTextConfig() - >>> config_vision = CLIPVisionConfig() - - >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision) - ```""" - - model_type = "clip" - is_composition = True - - def __init__( - self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs - ): - # If `_config_dict` exist, we use them for the backward compatibility. - # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot - # of confusion!). - text_config_dict = kwargs.pop("text_config_dict", None) - vision_config_dict = kwargs.pop("vision_config_dict", None) - - super().__init__(**kwargs) - - # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in - # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most - # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. - if text_config_dict is not None: - if text_config is None: - text_config = {} - - # This is the complete result when using `text_config_dict`. 
- _text_config_dict = EvaCLIPTextConfig(**text_config_dict).to_dict() - - # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different. - for key, value in _text_config_dict.items(): - if key in text_config and value != text_config[key] and key not in ["transformers_version"]: - # If specified in `text_config_dict` - if key in text_config_dict: - message = ( - f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. " - f'The value `text_config_dict["{key}"]` will be used instead.' - ) - # If inferred from default argument values (just to be super careful) - else: - message = ( - f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The " - f'value `text_config["{key}"]` will be overriden.' - ) - logger.warning(message) - - # Update all values in `text_config` with the ones in `_text_config_dict`. - text_config.update(_text_config_dict) - - if vision_config_dict is not None: - if vision_config is None: - vision_config = {} - - # This is the complete result when using `vision_config_dict`. - _vision_config_dict = EvaCLIPVisionConfig(**vision_config_dict).to_dict() - # convert keys to string instead of integer - if "id2label" in _vision_config_dict: - _vision_config_dict["id2label"] = { - str(key): value for key, value in _vision_config_dict["id2label"].items() - } - - # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different. - for key, value in _vision_config_dict.items(): - if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]: - # If specified in `vision_config_dict` - if key in vision_config_dict: - message = ( - f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different " - f'values. The value `vision_config_dict["{key}"]` will be used instead.' - ) - # If inferred from default argument values (just to be super careful) - else: - message = ( - f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. " - f'The value `vision_config["{key}"]` will be overriden.' - ) - logger.warning(message) - - # Update all values in `vision_config` with the ones in `_vision_config_dict`. - vision_config.update(_vision_config_dict) - - if text_config is None: - text_config = {} - logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.") - - if vision_config is None: - vision_config = {} - logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.") - - self.text_config = EvaCLIPTextConfig(**text_config) - self.vision_config = EvaCLIPVisionConfig(**vision_config) - - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value - self.initializer_factor = 1.0 - - @classmethod - def from_text_vision_configs(cls, text_config: EvaCLIPTextConfig, vision_config: EvaCLIPVisionConfig, **kwargs): - r""" - Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model - configuration. - - Returns: - [`CLIPConfig`]: An instance of a configuration object - """ - - return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) - - def to_dict(self): - """ - Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. 
- - Returns: - `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, - """ - output = copy.deepcopy(self.__dict__) - output["text_config"] = self.text_config.to_dict() - output["vision_config"] = self.vision_config.to_dict() - output["model_type"] = self.__class__.model_type - return output \ No newline at end of file diff --git a/llava_next/llava/model/multimodal_encoder/eva_8B_448px/model.py b/llava_next/llava/model/multimodal_encoder/eva_8B_448px/model.py deleted file mode 100755 index 1ed61dce..00000000 --- a/llava_next/llava/model/multimodal_encoder/eva_8B_448px/model.py +++ /dev/null @@ -1,1059 +0,0 @@ -# coding=utf-8 -""" EvaCLIP model configuration""" -# Code mainly copied here: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/configuration_clip.py -# and adjusted for evaclip - -from dataclasses import dataclass -from typing import Any, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ( - ModelOutput, - logging, -) -from .configuration_evaclip import EvaCLIPConfig, EvaCLIPTextConfig, EvaCLIPVisionConfig - -# try: -# from xformers import ops as xops -# except ImportError: -# xops = None - - -logger = logging.get_logger(__name__) - - -class RMSNorm(nn.Module): - """ - adepted from transformers T5LayerNorm - """ - def __init__(self, hidden_size, eps=1e-6): - """ - Construct a layernorm module in the T5 style. No bias and no subtraction of mean. - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated - # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for - # half-precision inputs is done in fp32 - - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
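The `RMSNorm` class defined above scales each hidden vector by the reciprocal of its root-mean-square (no mean subtraction, no bias), computing the statistic in fp32 before casting back. A minimal functional sketch of the same computation, with made-up tensor shapes rather than the model's real dimensions:

```python
import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Variance is computed without subtracting the mean, in fp32 for stability.
    variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)
    x = x * torch.rsqrt(variance + eps)
    return weight * x.to(weight.dtype)

x = torch.randn(2, 5, 8)
out = rms_norm(x, torch.ones(8))
# After scaling, each hidden vector has a root-mean-square of roughly 1.
assert torch.allclose(out.pow(2).mean(-1).sqrt(), torch.ones(2, 5), atol=1e-3)
```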
- """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -# contrastive loss function, adapted from -# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html -def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: - return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) - -def clip_loss(similarity: torch.Tensor) -> torch.Tensor: - caption_loss = contrastive_loss(similarity) - image_loss = contrastive_loss(similarity.t()) - return (caption_loss + image_loss) / 2.0 - - -@dataclass -class EvaCLIPVisionModelOutput(ModelOutput): - image_embeds: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class EvaCLIPTextModelOutput(ModelOutput): - text_embeds: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class EvaCLIPOutput(ModelOutput): - loss: Optional[torch.FloatTensor] = None - logits_per_image: torch.FloatTensor = None - logits_per_text: torch.FloatTensor = None - text_embeds: torch.FloatTensor = None - image_embeds: torch.FloatTensor = None - text_model_output: BaseModelOutputWithPooling = None - vision_model_output: BaseModelOutputWithPooling = None - - def to_tuple(self) -> Tuple[Any]: - return tuple( - self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() - for k in self.keys() - ) - - -class EvaCLIPVisionEmbeddings(nn.Module): - def __init__(self, config: EvaCLIPVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.image_size = config.image_size - self.patch_size = config.patch_size - - self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) - - self.patch_embedding = nn.Conv2d( - in_channels=config.num_channels, - out_channels=self.embed_dim, - kernel_size=self.patch_size, - stride=self.patch_size, - bias=True, - ) - - self.num_patches = (self.image_size // self.patch_size) ** 2 - self.num_positions = self.num_patches + 1 - self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent = False) - - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] - patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] - patch_embeds = patch_embeds.flatten(2).transpose(1, 2) - - class_embeds = self.class_embedding.expand(batch_size, 1, -1) - embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) - return embeddings - - -class EvaCLIPTextEmbeddings(nn.Module): - def __init__(self, config: EvaCLIPTextConfig): - super().__init__() - embed_dim = config.hidden_size - - self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) - self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) - - # position_ids (1, len position emb) is contiguous in memory and 
exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False) - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - ) -> torch.Tensor: - seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] - - if position_ids is None: - position_ids = self.position_ids[:, :seq_length] - - if inputs_embeds is None: - inputs_embeds = self.token_embedding(input_ids) - - position_embeddings = self.position_embedding(position_ids) - embeddings = inputs_embeds + position_embeddings - - return embeddings - - -class EvaCLIPAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." - ) - self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.k_bias) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.v_bias) - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.q_bias) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - bsz, tgt_len, embed_dim = hidden_states.size() - - query_states = self.q_proj(hidden_states) * self.scale - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) - key_states = key_states.view(*proj_shape) - value_states = value_states.view(*proj_shape) - - src_len = key_states.size(1) - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" - f" {attn_weights.size()}" - ) - - # apply the causal_attention_mask first - if causal_attention_mask is not None: - if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" - f" {causal_attention_mask.size()}" - ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but 
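The attention layers here consume padding masks in the additive form produced by the `_expand_mask` helper defined earlier: a `[bsz, src_len]` keep/pad mask is broadcast to `[bsz, 1, tgt_len, src_len]`, inverted, and filled with the dtype's most negative value at padded positions. A small sketch under the simplifying assumption that `tgt_len == src_len`:

```python
import torch

def expand_mask(mask: torch.Tensor, dtype: torch.dtype = torch.float32) -> torch.Tensor:
    # [bsz, src_len] with 1 = keep, 0 = pad  ->  additive [bsz, 1, src_len, src_len]
    bsz, src_len = mask.size()
    expanded = mask[:, None, None, :].expand(bsz, 1, src_len, src_len).to(dtype)
    inverted = 1.0 - expanded
    return inverted.masked_fill(inverted.to(torch.bool), torch.finfo(dtype).min)

attention_mask = torch.tensor([[1, 1, 1, 0]])   # last position is padding
additive = expand_mask(attention_mask)
assert additive.shape == (1, 1, 4, 4)
# Non-padded key positions add 0; the padded column adds a large negative value.
assert (additive[0, 0, :, :3] == 0).all() and (additive[0, 0, :, 3] < 0).all()
```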
is {attention_mask.size()}" - ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if output_attentions: - # this operation is a bit akward, but it's required to - # make sure that attn_weights keeps its gradient. - # In order to do so, attn_weights have to reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) - else: - attn_weights_reshaped = None - - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped - -class EvaCLIPTextAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." 
- ) - self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.k_bias) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.v_bias) - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.q_bias) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - bsz, tgt_len, embed_dim = hidden_states.size() - - # get query proj - query_states = self.q_proj(hidden_states) - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) - key_states = key_states.view(*proj_shape) - value_states = value_states.view(*proj_shape) - - src_len = key_states.size(1) - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" - f" {attn_weights.size()}" - ) - - # apply the causal_attention_mask first - if causal_attention_mask is not None: - if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" - f" {causal_attention_mask.size()}" - ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if output_attentions: - # this operation is a bit akward, but it's required to - # make sure that attn_weights keeps its gradient. 
- # In order to do so, attn_weights have to reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) - else: - attn_weights_reshaped = None - - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped - -class EvaCLIPMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.activation_fn = ACT2FN[config.hidden_act] - self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) - self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.fc1(hidden_states) - hidden_states = self.activation_fn(hidden_states) - hidden_states = self.fc2(hidden_states) - return hidden_states - - -class EvaCLIPEncoderLayer(nn.Module): - def __init__(self, config: EvaCLIPConfig): - super().__init__() - self.config = config - norm_layer = RMSNorm if config.use_rms_norm else nn.LayerNorm - self.embed_dim = config.hidden_size - self.post_layernorm = config.post_layernorm if config.post_layernorm is not None else False - self.self_attn = EvaCLIPAttention(config) - self.layer_norm1 = norm_layer(self.embed_dim, eps=config.layer_norm_eps) - self.mlp = EvaCLIPMLP(config) - self.layer_norm2 = norm_layer(self.embed_dim, eps=config.layer_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: torch.Tensor, - causal_attention_mask: torch.Tensor, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor]: - residual = hidden_states - - if not self.post_layernorm: - hidden_states = self.layer_norm1(hidden_states) - hidden_states, attn_weights = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, - ) - if self.post_layernorm: - hidden_states = self.layer_norm1(hidden_states) - hidden_states = residual + hidden_states - residual = hidden_states - if not self.post_layernorm: - hidden_states = self.layer_norm2(hidden_states) - hidden_states = self.mlp(hidden_states) - if self.post_layernorm: - hidden_states = self.layer_norm2(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs - - -class EvaCLIPPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
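Both attention classes above fold the batch and head dimensions together before the batched matmuls, which is why `attn_weights` is expected to have shape `(bsz * num_heads, tgt_len, src_len)`. A shape-only sketch with dummy sizes (and no learned projections), just to make the reshapes concrete:

```python
import torch

bsz, seq_len, num_heads, head_dim = 2, 5, 4, 16
embed_dim = num_heads * head_dim
hidden_states = torch.randn(bsz, seq_len, embed_dim)

def shape(tensor: torch.Tensor) -> torch.Tensor:
    # [bsz, seq, embed] -> [bsz, heads, seq, head_dim]
    return tensor.view(bsz, seq_len, num_heads, head_dim).transpose(1, 2).contiguous()

scale = head_dim ** -0.5
q = shape(hidden_states * scale).view(bsz * num_heads, -1, head_dim)
k = shape(hidden_states).view(bsz * num_heads, -1, head_dim)
v = shape(hidden_states).view(bsz * num_heads, -1, head_dim)

attn_weights = torch.bmm(q, k.transpose(1, 2)).softmax(dim=-1)
assert attn_weights.shape == (bsz * num_heads, seq_len, seq_len)

attn_output = torch.bmm(attn_weights, v)
# Fold heads back: [bsz*heads, seq, head_dim] -> [bsz, seq, embed]
attn_output = attn_output.view(bsz, num_heads, seq_len, head_dim).transpose(1, 2).reshape(bsz, seq_len, embed_dim)
assert attn_output.shape == (bsz, seq_len, embed_dim)
```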
- """ - - config_class = EvaCLIPConfig - base_model_prefix = "clip" - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] - - def _init_weights(self, module): - """Initialize the weights""" - factor = self.config.initializer_factor - if isinstance(module, EvaCLIPTextEmbeddings): - module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) - module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) - elif isinstance(module, EvaCLIPVisionEmbeddings): - factor = self.config.initializer_factor - nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) - nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) - nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) - elif isinstance(module, EvaCLIPAttention): - factor = self.config.initializer_factor - in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - out_proj_std = (module.embed_dim**-0.5) * factor - nn.init.normal_(module.q_proj.weight, std=in_proj_std) - nn.init.normal_(module.k_proj.weight, std=in_proj_std) - nn.init.normal_(module.v_proj.weight, std=in_proj_std) - nn.init.normal_(module.out_proj.weight, std=out_proj_std) - elif isinstance(module, EvaCLIPMLP): - factor = self.config.initializer_factor - in_proj_std = ( - (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - ) - fc_std = (2 * module.config.hidden_size) ** -0.5 * factor - nn.init.normal_(module.fc1.weight, std=fc_std) - nn.init.normal_(module.fc2.weight, std=in_proj_std) - elif isinstance(module, EvaCLIPModel): - nn.init.normal_( - module.text_projection.weight, - std=module.text_embed_dim**-0.5 * self.config.initializer_factor, - ) - nn.init.normal_( - module.visual_projection.weight, - std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, - ) - elif isinstance(module, EvaCLIPVisionModelWithProjection): - nn.init.normal_( - module.visual_projection.weight, - std=self.config.hidden_size**-0.5 * self.config.initializer_factor, - ) - elif isinstance(module, EvaCLIPTextModelWithProjection): - nn.init.normal_( - module.text_projection.weight, - std=self.config.hidden_size**-0.5 * self.config.initializer_factor, - ) - - if isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, EvaCLIPEncoder): - module.gradient_checkpointing = value - -class EvaCLIPEncoder(nn.Module): - """ - Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a - [`CLIPEncoderLayer`]. 
- Args: - config: CLIPConfig - """ - - def __init__(self, config: EvaCLIPConfig): - super().__init__() - self.config = config - self.layers = nn.ModuleList([EvaCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - inputs_embeds, - attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - - hidden_states = inputs_embeds - for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(encoder_layer), - hidden_states, - attention_mask, - causal_attention_mask, - ) - else: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - causal_attention_mask, - output_attentions=output_attentions, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions - ) - - -class EvaCLIPTextTransformer(EvaCLIPPreTrainedModel): - def __init__(self, config: EvaCLIPTextConfig): - super().__init__(config) - self.config = config - embed_dim = config.hidden_size - norm_layer = RMSNorm if config.use_rms_norm else nn.LayerNorm - self.embeddings = EvaCLIPTextEmbeddings(config) - self.encoder = EvaCLIPEncoder(config) - self.final_layer_norm = norm_layer(embed_dim, eps=config.layer_norm_eps) - - def gradient_checkpointing_enable(self): - self.encoder.gradient_checkpointing = True - - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is None: - raise ValueError("You have to specify input_ids") - - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - - hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) - - bsz, seq_len 
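The encoder's gradient-checkpointing branch above wraps each layer in a `create_custom_forward` closure before handing it to `torch.utils.checkpoint.checkpoint`, so layer activations are recomputed during backward instead of being stored. A minimal standalone sketch of that pattern, using a single `nn.Linear` as a stand-in layer (`use_reentrant=False` assumes a reasonably recent PyTorch):

```python
import torch
from torch.utils.checkpoint import checkpoint

layer = torch.nn.Linear(8, 8)                     # stand-in for one encoder layer
x = torch.randn(2, 8, requires_grad=True)

def create_custom_forward(module):
    def custom_forward(*inputs):
        return module(*inputs)
    return custom_forward

# Activations inside `layer` are recomputed in the backward pass, saving memory.
y = checkpoint(create_custom_forward(layer), x, use_reentrant=False)
y.sum().backward()
assert x.grad is not None
```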
= input_shape - # CLIP's text model uses causal mask, prepare it here. - # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 - causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to( - hidden_states.device - ) - # expand attention_mask - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - attention_mask = _expand_mask(attention_mask, hidden_states.dtype) - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - attention_mask=attention_mask, - causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - last_hidden_state = self.final_layer_norm(last_hidden_state) - - # text_embeds.shape = [batch_size, sequence_length, transformer.width] - # take features from the eot embedding (eot_token is the highest number in each sequence) - # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14 - pooled_output = last_hidden_state[ - torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), - input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1), - ] - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - def _build_causal_attention_mask(self, bsz, seq_len, dtype): - # lazily create causal attention mask, with full attention between the vision tokens - # pytorch uses additive attention mask; fill with -inf - mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype) - mask.fill_(torch.tensor(torch.finfo(dtype).min)) - mask.triu_(1) # zero out the lower diagonal - mask = mask.unsqueeze(1) # expand mask - return mask - -class EvaCLIPTextModel(EvaCLIPPreTrainedModel): - config_class = EvaCLIPTextConfig - - _no_split_modules = ["EvaCLIPEncoderLayer"] - - def __init__(self, config: EvaCLIPTextConfig): - super().__init__(config) - self.text_model = EvaCLIPTextTransformer(config) - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Module: - return self.text_model.embeddings.token_embedding - - def set_input_embeddings(self, value): - self.text_model.embeddings.token_embedding = value - - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - return self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - -class EvaCLIPVisionTransformer(EvaCLIPPreTrainedModel): - def __init__(self, config: EvaCLIPVisionConfig): - super().__init__(config) - self.config = config - embed_dim = config.hidden_size - norm_layer = RMSNorm if config.use_rms_norm else nn.LayerNorm - self.embeddings = EvaCLIPVisionEmbeddings(config) - self.encoder = 
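`_build_causal_attention_mask` above produces the additive mask for the text encoder: entries strictly above the diagonal hold the dtype's most negative value (so softmax assigns them essentially zero probability), while the diagonal and everything below stay at 0, meaning each token can attend to itself and to earlier tokens. A small sketch with a toy sequence length:

```python
import torch

def build_causal_attention_mask(bsz: int, seq_len: int, dtype: torch.dtype = torch.float32) -> torch.Tensor:
    mask = torch.full((bsz, seq_len, seq_len), torch.finfo(dtype).min, dtype=dtype)
    mask.triu_(1)               # zero the diagonal and below; keep the fill strictly above
    return mask.unsqueeze(1)    # [bsz, 1, seq_len, seq_len], broadcast across heads

mask = build_causal_attention_mask(1, 4)[0, 0]
# Token i may only attend to tokens 0..i.
allowed = mask == 0
expected = torch.tril(torch.ones(4, 4)).bool()
assert torch.equal(allowed, expected)
```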
EvaCLIPEncoder(config) - self.post_layernorm = norm_layer(embed_dim, eps=config.layer_norm_eps) - - def gradient_checkpointing_enable(self): - self.encoder.gradient_checkpointing = True - - def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - - hidden_states = self.embeddings(pixel_values) - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - pooled_output = last_hidden_state[:, 0, :] - pooled_output = self.post_layernorm(pooled_output) - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - -class EvaCLIPVisionModel(PreTrainedModel): - config_class = EvaCLIPVisionConfig - main_input_name = "pixel_values" - - def __init__(self, config: EvaCLIPVisionConfig): - super().__init__(config) - # super().__init__() - self.vision_model = EvaCLIPVisionTransformer(config) - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Module: - return self.vision_model.embeddings.patch_embedding - - def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - return self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - -class EvaCLIPModel(EvaCLIPPreTrainedModel): - config_class = EvaCLIPConfig - - def __init__(self, config: EvaCLIPConfig): - super().__init__(config) - - if not (type(config.text_config).__name__ == "EvaCLIPTextConfig"): - raise ValueError( - "config.text_config is expected to be of type EvaCLIPTextConfig but is of type" - f" {type(config.text_config)}." - ) - - if not (type(config.vision_config).__name__ == "EvaCLIPVisionConfig"): - raise ValueError( - "config.vision_config is expected to be of type EvaCLIPVisionConfig but is of type" - f" {type(config.vision_config)}." 
- ) - - text_config = config.text_config - vision_config = config.vision_config - - self.projection_dim = config.projection_dim - self.text_embed_dim = text_config.hidden_size - self.vision_embed_dim = vision_config.hidden_size - - self.text_model = EvaCLIPTextTransformer(text_config) - self.vision_model = EvaCLIPVisionTransformer(vision_config) - - self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) - self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) - self.logit_scale = torch.tensor(100., requires_grad=False) - - # Initialize weights and apply final processing - self.post_init() - - def encode_text( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: - # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - text_outputs = self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = text_outputs[1] - text_features = self.text_projection(pooled_output) - - return text_features - - def encode_image( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: - - # Use EvaCLIP model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - vision_outputs = self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = vision_outputs[1] # pooled_output - image_features = self.visual_projection(pooled_output) - - return image_features - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - pixel_values: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - return_loss: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, EvaCLIPOutput]: - # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - vision_outputs = self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - text_outputs = self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - image_embeds = vision_outputs[1] - image_embeds = self.visual_projection(image_embeds) - - text_embeds = text_outputs[1] - text_embeds = self.text_projection(text_embeds) - - # normalized features - image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) - text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) - - # cosine similarity as logits - logit_scale = self.logit_scale.exp() - logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale - logits_per_image = logits_per_text.t() - - loss = None - if return_loss: - loss = clip_loss(logits_per_text) - - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - - return EvaCLIPOutput( - loss=loss, - logits_per_image=logits_per_image, - logits_per_text=logits_per_text, - text_embeds=text_embeds, - image_embeds=image_embeds, - text_model_output=text_outputs, - vision_model_output=vision_outputs, - ) - - -class EvaCLIPTextModelWithProjection(EvaCLIPPreTrainedModel): - config_class = EvaCLIPTextConfig - - _no_split_modules = ["EvaCLIPEncoderLayer"] - - def __init__(self, config: EvaCLIPTextConfig): - super().__init__(config) - - self.text_model = EvaCLIPTextTransformer(config) - - self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) - - def get_input_embeddings(self) -> nn.Module: - return self.text_model.embeddings.token_embedding - - def set_input_embeddings(self, value): - self.text_model.embeddings.token_embedding = value - - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, EvaCLIPTextModelOutput]: - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - text_outputs = self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = text_outputs[1] - - text_embeds = self.text_projection(pooled_output) - - if not return_dict: - outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] - return tuple(output for output in outputs if output is not None) - - return EvaCLIPTextModelOutput( - text_embeds=text_embeds, - last_hidden_state=text_outputs.last_hidden_state, - hidden_states=text_outputs.hidden_states, - attentions=text_outputs.attentions, - ) - -class EvaCLIPVisionModelWithProjection(EvaCLIPPreTrainedModel): - config_class = 
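In `EvaCLIPModel.forward` above, image and text embeddings are L2-normalized so their dot products become cosine similarities, which are then scaled into logits. (The deleted implementation stores `logit_scale` as a fixed tensor of 100.0 and still calls `.exp()` on it, which looks unintended; the sketch below simply treats the scale as the final multiplier.) A toy sketch with made-up embedding sizes:

```python
import torch

def clip_similarity(image_embeds: torch.Tensor, text_embeds: torch.Tensor, logit_scale: float = 100.0):
    # L2-normalize so the dot product is a cosine similarity, then scale.
    image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
    logits_per_text = text_embeds @ image_embeds.t() * logit_scale
    return logits_per_text.t(), logits_per_text   # (logits_per_image, logits_per_text)

image_embeds = torch.randn(4, 512)   # hypothetical projection dim
text_embeds = torch.randn(4, 512)
logits_per_image, logits_per_text = clip_similarity(image_embeds, text_embeds)
assert logits_per_image.shape == (4, 4)
assert torch.allclose(logits_per_image, logits_per_text.t())
```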
EvaCLIPVisionConfig - main_input_name = "pixel_values" - - def __init__(self, config: EvaCLIPVisionConfig): - super().__init__(config) - - self.vision_model = EvaCLIPVisionTransformer(config) - - self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Module: - return self.vision_model.embeddings.patch_embedding - - def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, EvaCLIPVisionModelOutput]: - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - vision_outputs = self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = vision_outputs[1] # pooled_output - - image_embeds = self.visual_projection(pooled_output) - - if not return_dict: - outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:] - return tuple(output for output in outputs if output is not None) - - return EvaCLIPVisionModelOutput( - image_embeds=image_embeds, - last_hidden_state=vision_outputs.last_hidden_state, - hidden_states=vision_outputs.hidden_states, - attentions=vision_outputs.attentions, - ) diff --git a/llava_next/llava/model/multimodal_encoder/eva_8b_448px_encoder.py b/llava_next/llava/model/multimodal_encoder/eva_8b_448px_encoder.py deleted file mode 100755 index 009a4d1e..00000000 --- a/llava_next/llava/model/multimodal_encoder/eva_8b_448px_encoder.py +++ /dev/null @@ -1,174 +0,0 @@ -import torch -import torch.nn as nn -from llava.utils import rank0_print -from transformers import CLIPImageProcessor -from .eva_8B_448px.model import EvaCLIPVisionModel, EvaCLIPVisionConfig -try: - from s2wrapper import forward as multiscale_forward -except: - pass - - - -class EVA_8B_448px_VisionTower(nn.Module): - def __init__(self, vision_tower, args, delay_load=False): - super().__init__() - - self.is_loaded = False - - self.vision_tower_name = vision_tower - self.select_layer = args.mm_vision_select_layer - self.select_feature = getattr(args, "mm_vision_select_feature", "patch") - - if not delay_load: - rank0_print(f"Loading vision tower: {vision_tower}") - self.load_model() - elif getattr(args, "unfreeze_mm_vision_tower", False): - # TODO: better detector is needed. 
- rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") - self.load_model() - elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") - self.load_model() - else: - self.cfg_only = EvaCLIPVisionConfig.from_pretrained(self.vision_tower_name) - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - self.vision_tower = EvaCLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - - self.is_loaded = True - - def feature_select(self, image_forward_outs): - select_feature_type = self.select_feature - - if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: - select_every_k_layer = len(image_forward_outs.hidden_states) // 4 - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1) - select_feature_type = select_feature_type.replace("slicefour_", "") - elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]: - select_layers = [-2, -5, -8, -11, 6] - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in select_layers], dim=-1) - select_feature_type = select_feature_type.replace("slice_m25811_f6_", "") - else: - image_features = image_forward_outs.hidden_states[self.select_layer] - - if select_feature_type == "patch": - image_features = image_features[:, 1:] - elif select_feature_type == "cls_patch": - image_features = image_features - else: - raise ValueError(f"Unexpected select feature: {select_feature_type}") - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) - image_feature = self.feature_select(image_forward_out).to(image.dtype) - image_features.append(image_feature) - else: - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - - return image_features - - @property - def dummy_feature(self): - return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) - - @property - def dtype(self): - return self.vision_tower.dtype - - @property - def device(self): - return self.vision_tower.device - - @property - def config(self): - if self.is_loaded: - return self.vision_tower.config - else: - return self.cfg_only - - @property - def hidden_size(self): - _hidden_size = self.config.hidden_size - if "slicefour" in self.select_feature: - _hidden_size *= 4 - if "slice_m25811_f6" in self.select_feature: - _hidden_size *= 5 - return _hidden_size - - @property - def num_patches_per_side(self): - return self.config.image_size // self.config.patch_size - - @property - def num_patches(self): - _num_patches = (self.config.image_size // self.config.patch_size) ** 2 - if "cls_patch" in self.select_feature: - _num_patches += 1 - return _num_patches - - @property - def image_size(self): - return 
self.config.image_size - - -class EVA_8B_448px_VisionTowerS2(EVA_8B_448px_VisionTower): - def __init__(self, vision_tower, args, delay_load=False): - - self.s2_scales = getattr(args, "s2_scales", "336,672,1008") - self.s2_scales = list(map(int, self.s2_scales.split(","))) - self.s2_scales.sort() - self.s2_split_size = self.s2_scales[0] - self.s2_image_size = self.s2_scales[-1] - - super().__init__(vision_tower, args, delay_load) - - # change resize/crop size in preprocessing to the largest image size in s2_scale - if not delay_load or getattr(args, "unfreeze_mm_vision_tower", False): - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - self.vision_tower = EvaCLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - self.is_loaded = True - - def forward_feature(self, images): - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_feature = multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - image_features.append(image_feature) - else: - image_features = multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - - return image_features - - @property - def hidden_size(self): - return self.config.hidden_size * len(self.s2_scales) diff --git a/llava_next/llava/model/multimodal_encoder/eva_clip/eva_clip_encoder.py b/llava_next/llava/model/multimodal_encoder/eva_clip/eva_clip_encoder.py deleted file mode 100755 index aaef516a..00000000 --- a/llava_next/llava/model/multimodal_encoder/eva_clip/eva_clip_encoder.py +++ /dev/null @@ -1,74 +0,0 @@ -import torch -import torch.nn as nn - -from .eva_clip_processors import EvaClipImageTrainProcessor -from .eva_vit import EVAEncoderWrapper -from .factory import list_models, add_model_config, get_model_config - -from llava.utils import rank0_print - - -class EvaClipVisionTower(nn.Module): - def __init__(self, vision_tower, args, delay_load=False): - super().__init__() - - self.is_loaded = False - self.vision_tower_name = vision_tower - self.vision_tower_pretrained = args.vision_tower_pretrained - self.config = get_model_config(vision_tower) - - if not delay_load: - rank0_print(f"Loading EVA ViT: {self.vision_tower_name}") - self.load_model() - elif getattr(args, "unfreeze_mm_vision_tower", False): - # TODO: better detector is needed. 
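`EVA_8B_448px_VisionTowerS2` relies on `s2wrapper`'s `multiscale_forward` to run the same backbone at several image sizes and concatenate the per-scale features, which is why its `hidden_size` is multiplied by `len(self.s2_scales)`. The sketch below is a simplified stand-in for that idea, not the s2wrapper implementation: the real wrapper splits large scales into crops of `max_split_size` instead of pooling, and `toy_forward_feature` is a dummy backbone used only to make the shapes concrete.

```python
import torch
import torch.nn.functional as F

def toy_forward_feature(images):
    # Dummy stand-in for forward_feature: [B, 3, S, S] -> [B, (S/14)^2, C]
    patch, hidden = 14, 32
    b, _, s, _ = images.shape
    return torch.randn(b, (s // patch) ** 2, hidden)

def toy_multiscale_forward(forward_fn, images, img_sizes, base_tokens):
    # Rough sketch of the S2 idea: run one backbone at several resolutions,
    # bring every scale back to the base token grid, and concat on the channel dim.
    feats = []
    for size in img_sizes:
        scaled = F.interpolate(images, size=(size, size), mode="bicubic", align_corners=False)
        f = forward_fn(scaled)                                   # [B, N_s, C]
        side = int(f.shape[1] ** 0.5)
        f = f.transpose(1, 2).reshape(f.shape[0], -1, side, side)
        f = F.interpolate(f, size=base_tokens, mode="area")      # pool back to the base grid
        feats.append(f.flatten(2).transpose(1, 2))               # [B, N_base, C]
    return torch.cat(feats, dim=-1)                              # hidden_size * len(img_sizes)

imgs = torch.randn(2, 3, 336, 336)
out = toy_multiscale_forward(toy_forward_feature, imgs, img_sizes=[336, 672, 1008], base_tokens=(24, 24))
print(out.shape)  # torch.Size([2, 576, 96])
```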
- rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") - self.load_model() - elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") - self.load_model() - else: - self.cfg_only = self.config - - def load_model(self, device_map=None): - rank0_print(f"Pretrained: {self.vision_tower_pretrained}") - self.image_processor = EvaClipImageTrainProcessor(self.config["vision_cfg"]["image_size"]) - self.vision_tower = EVAEncoderWrapper(self.vision_tower_pretrained, self.config) - rank0_print(f"Loaded image processor: {self.image_processor}") - self.vision_tower.requires_grad_(False) - self.is_loaded = True - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_feature = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0)).to(image.dtype) - image_features.append(image_feature) - else: - image_features = self.vision_tower(images.to(device=self.device, dtype=self.dtype)).to(images.dtype) - - return image_features - - @property - def dtype(self): - return self.vision_tower.dtype - - @property - def device(self): - return self.vision_tower.device - - @property - def hidden_size(self): - return self.config["vision_cfg"]["width"] - - @property - def num_patches(self): - return (self.config["vision_cfg"]["image_size"] // self.config["vision_cfg"]["patch_size"]) ** 2 - - @property - def num_patches_per_side(self): - return self.config["vision_cfg"]["image_size"] // self.config["vision_cfg"]["patch_size"] - - @property - def image_size(self): - return self.config["vision_cfg"]["image_size"] diff --git a/llava_next/llava/model/multimodal_encoder/eva_clip/eva_clip_processors.py b/llava_next/llava/model/multimodal_encoder/eva_clip/eva_clip_processors.py deleted file mode 100755 index 7ee12731..00000000 --- a/llava_next/llava/model/multimodal_encoder/eva_clip/eva_clip_processors.py +++ /dev/null @@ -1,72 +0,0 @@ -""" -# Adapted from https://github.com/baaivision/EVA/tree/master/EVA-CLIP -""" - -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode -from transformers.image_processing_utils import BatchFeature -from PIL import Image -from transformers.image_transforms import convert_to_rgb - - -class BaseProcessor: - def __init__(self): - self.transform = lambda x: x - return - - def __call__(self, item): - return self.transform(item) - - -class EvaClipImageBaseProcessor(BaseProcessor): - def __init__(self, mean=None, std=None): - self.mean = (0.48145466, 0.4578275, 0.40821073) if mean is None else mean - self.std = (0.26862954, 0.26130258, 0.27577711) if std is None else std - - self.normalize = transforms.Normalize(self.mean, self.std) - - @property - def image_mean(self): - return self.mean - - -class EvaClipImageTrainProcessor(EvaClipImageBaseProcessor): - def __init__(self, image_size=224, mean=None, std=None, min_scale=0.5, max_scale=1.0): - super().__init__(mean=mean, std=std) - - self.transform = transforms.Compose( - [ - convert_to_rgb, - transforms.Resize( - image_size, - interpolation=InterpolationMode.BICUBIC, - ), - transforms.CenterCrop(image_size), - transforms.ToTensor(), - self.normalize, - ] - ) - - self.image_size = image_size - - def preprocess(self, images, return_tensors): - if isinstance(images, Image.Image): - images = [images] - else: - assert isinstance(images, 
list) - - transformed_images = [self.transform(image).numpy() for image in images] - data = {"pixel_values": transformed_images} - - return BatchFeature(data=data, tensor_type=return_tensors) - - def __call__(self, item): - return self.transform(item) - - @property - def crop_size(self): - return {"height": self.image_size, "width": self.image_size} - - @property - def size(self): - return {"shortest_edge": self.image_size} diff --git a/llava_next/llava/model/multimodal_encoder/eva_clip/eva_vit.py b/llava_next/llava/model/multimodal_encoder/eva_clip/eva_vit.py deleted file mode 100755 index d2330c32..00000000 --- a/llava_next/llava/model/multimodal_encoder/eva_clip/eva_vit.py +++ /dev/null @@ -1,856 +0,0 @@ -""" -# Adapted from https://github.com/baaivision/EVA/tree/master/EVA-CLIP -""" - -from math import pi -import torch -from torch import nn -from einops import rearrange, repeat -import logging -from llava.utils import rank0_print - - -def broadcat(tensors, dim=-1): - num_tensors = len(tensors) - shape_lens = set(list(map(lambda t: len(t.shape), tensors))) - assert len(shape_lens) == 1, "tensors must all have the same number of dimensions" - shape_len = list(shape_lens)[0] - dim = (dim + shape_len) if dim < 0 else dim - dims = list(zip(*map(lambda t: list(t.shape), tensors))) - expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim] - assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), "invalid dimensions for broadcastable concatentation" - max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims)) - expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims)) - expanded_dims.insert(dim, (dim, dims[dim])) - expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims))) - tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes))) - return torch.cat(tensors, dim=dim) - - -def rotate_half(x): - x = rearrange(x, "... (d r) -> ... d r", r=2) - x1, x2 = x.unbind(dim=-1) - x = torch.stack((-x2, x1), dim=-1) - return rearrange(x, "... d r -> ... (d r)") - - -class VisionRotaryEmbeddingFast(nn.Module): - def __init__(self, dim, pt_seq_len, ft_seq_len=None, custom_freqs=None, freqs_for="lang", theta=10000, max_freq=10, num_freqs=1, patch_dropout=0.0): - super().__init__() - if custom_freqs: - freqs = custom_freqs - elif freqs_for == "lang": - freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) - elif freqs_for == "pixel": - freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi - elif freqs_for == "constant": - freqs = torch.ones(num_freqs).float() - else: - raise ValueError(f"unknown modality {freqs_for}") - - if ft_seq_len is None: - ft_seq_len = pt_seq_len - t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len - - freqs = torch.einsum("..., f -> ... f", t, freqs) - freqs = repeat(freqs, "... n -> ... 
(n r)", r=2) - freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim=-1) - - freqs_cos = freqs.cos().view(-1, freqs.shape[-1]) - freqs_sin = freqs.sin().view(-1, freqs.shape[-1]) - - self.patch_dropout = patch_dropout - - self.register_buffer("freqs_cos", freqs_cos) - self.register_buffer("freqs_sin", freqs_sin) - - logging.info(f"Shape of rope freq: {self.freqs_cos.shape}") - - def forward(self, t, patch_indices_keep=None): - if patch_indices_keep is not None: - batch = t.size()[0] - batch_indices = torch.arange(batch) - batch_indices = batch_indices[..., None] - - freqs_cos = repeat(self.freqs_cos, "i j -> n i m j", n=t.shape[0], m=t.shape[1]) - freqs_sin = repeat(self.freqs_sin, "i j -> n i m j", n=t.shape[0], m=t.shape[1]) - - freqs_cos = freqs_cos[batch_indices, patch_indices_keep] - freqs_cos = rearrange(freqs_cos, "n i m j -> n m i j") - freqs_sin = freqs_sin[batch_indices, patch_indices_keep] - freqs_sin = rearrange(freqs_sin, "n i m j -> n m i j") - - return t * freqs_cos + rotate_half(t) * freqs_sin - - return t * self.freqs_cos + rotate_half(t) * self.freqs_sin - - -class LayerNorm(nn.LayerNorm): - """Subclass torch's LayerNorm (with cast back to input dtype).""" - - def forward(self, x: torch.Tensor): - orig_type = x.dtype - x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) - return x.to(orig_type) - - -class PatchDropout(nn.Module): - """ - https://arxiv.org/abs/2212.00794 - """ - - def __init__(self, prob, exclude_first_token=True): - super().__init__() - assert 0 <= prob < 1.0 - self.prob = prob - self.exclude_first_token = exclude_first_token # exclude CLS token - logging.info(f"os.getenv('RoPE')={os.getenv('RoPE')}") - - def forward(self, x): - if not self.training or self.prob == 0.0: - return x - - if self.exclude_first_token: - cls_tokens, x = x[:, :1], x[:, 1:] - else: - cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1]) - - batch = x.size()[0] - num_tokens = x.size()[1] - - batch_indices = torch.arange(batch) - batch_indices = batch_indices[..., None] - - keep_prob = 1 - self.prob - num_patches_keep = max(1, int(num_tokens * keep_prob)) - - rand = torch.randn(batch, num_tokens) - patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices - - x = x[batch_indices, patch_indices_keep] - - if self.exclude_first_token: - x = torch.cat((cls_tokens, x), dim=1) - - if self.training and os.getenv("RoPE") == "1": - return x, patch_indices_keep - - return x - - -# -------------------------------------------------------- -# Adapted from https://github.com/microsoft/unilm/tree/master/beit -# -------------------------------------------------------- -import math -import os -import torch.nn as nn -import torch.nn.functional as F - -try: - from timm.models.layers import drop_path, to_2tuple, trunc_normal_ -except: - from timm.layers import drop_path, to_2tuple, trunc_normal_ - -if os.getenv("ENV_TYPE") == "deepspeed": - try: - from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint - except: - from torch.utils.checkpoint import checkpoint -else: - from torch.utils.checkpoint import checkpoint - -try: - import xformers.ops as xops -except ImportError: - xops = None - # print("Please 'pip install xformers'") - - -class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" - - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - 
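`VisionRotaryEmbeddingFast.forward` applies the rotation as `t * freqs_cos + rotate_half(t) * freqs_sin`. The snippet below restates that operation on a toy 1D sequence (the real module precomputes a 2D grid of frequencies for the patch layout); the sizes and base frequency are illustrative assumptions, and the final check simply confirms the rotation is norm-preserving.

```python
import torch
from einops import rearrange

def rotate_half(x):
    # Same helper as above: pair up channels and rotate each (x1, x2) -> (-x2, x1).
    x = rearrange(x, "... (d r) -> ... d r", r=2)
    x1, x2 = x.unbind(dim=-1)
    return rearrange(torch.stack((-x2, x1), dim=-1), "... d r -> ... (d r)")

seq_len, dim = 16, 8  # toy sizes
pos = torch.arange(seq_len).float()
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
# Duplicate each frequency so consecutive channel pairs share one rotation angle.
angles = torch.einsum("n,f->nf", pos, inv_freq).repeat_interleave(2, dim=-1)  # [seq, dim]

t = torch.randn(seq_len, dim)
rotated = t * angles.cos() + rotate_half(t) * angles.sin()

# Rotation only mixes each channel pair, so per-token norms are unchanged.
print(torch.allclose(t.norm(dim=-1), rotated.norm(dim=-1), atol=1e-5))  # True
```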
def extra_repr(self) -> str: - return "p={}".format(self.drop_prob) - - -class Mlp(nn.Module): - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - drop=0.0, - subln=False, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - - self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity() - - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - # x = self.drop(x) - # commit this for the orignal BERT implement - x = self.ffn_ln(x) - - x = self.fc2(x) - x = self.drop(x) - return x - - -class SwiGLU(nn.Module): - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0.0, norm_layer=nn.LayerNorm, subln=False): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - - self.w1 = nn.Linear(in_features, hidden_features) - self.w2 = nn.Linear(in_features, hidden_features) - - self.act = act_layer() - self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity() - self.w3 = nn.Linear(hidden_features, out_features) - - self.drop = nn.Dropout(drop) - - def forward(self, x): - x1 = self.w1(x) - x2 = self.w2(x) - hidden = self.act(x1) * x2 - x = self.ffn_ln(hidden) - x = self.w3(x) - x = self.drop(x) - return x - - -class Attention(nn.Module): - def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.0, proj_drop=0.0, window_size=None, attn_head_dim=None, xattn=False, rope=None, subln=False, norm_layer=nn.LayerNorm): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - if attn_head_dim is not None: - head_dim = attn_head_dim - all_head_dim = head_dim * self.num_heads - self.scale = qk_scale or head_dim**-0.5 - - self.subln = subln - if self.subln: - self.q_proj = nn.Linear(dim, all_head_dim, bias=False) - self.k_proj = nn.Linear(dim, all_head_dim, bias=False) - self.v_proj = nn.Linear(dim, all_head_dim, bias=False) - else: - self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) - - if qkv_bias: - self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) - self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) - else: - self.q_bias = None - self.v_bias = None - - if window_size: - self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 - self.relative_position_bias_table = nn.Parameter(torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH - # cls to token & token 2 cls & cls to cls - - # get pair-wise relative position index for each token inside the window - coords_h = torch.arange(window_size[0]) - coords_w = torch.arange(window_size[1]) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) - 
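The `SwiGLU` block defined just above gates one linear branch with the SiLU of another before the output projection. A stripped-down restatement for reference (dropout and the optional sub-LayerNorm omitted; the hidden width is an arbitrary toy choice):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinySwiGLU(nn.Module):
    # Minimal restatement of the SwiGLU MLP above: out = W3( SiLU(W1 x) * W2 x )
    def __init__(self, dim, hidden):
        super().__init__()
        self.w1 = nn.Linear(dim, hidden)
        self.w2 = nn.Linear(dim, hidden)
        self.w3 = nn.Linear(hidden, dim)

    def forward(self, x):
        return self.w3(F.silu(self.w1(x)) * self.w2(x))

x = torch.randn(2, 5, 64)
print(TinySwiGLU(64, 171)(x).shape)  # torch.Size([2, 5, 64]); 171 ~ 64 * 8/3
```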
relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 - - self.register_buffer("relative_position_index", relative_position_index) - else: - self.window_size = None - self.relative_position_bias_table = None - self.relative_position_index = None - - self.attn_drop = nn.Dropout(attn_drop) - self.inner_attn_ln = norm_layer(all_head_dim) if subln else nn.Identity() - # self.proj = nn.Linear(all_head_dim, all_head_dim) - self.proj = nn.Linear(all_head_dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - self.xattn = xattn - self.xattn_drop = attn_drop - - self.rope = rope - - def forward(self, x, rel_pos_bias=None, attn_mask=None): - B, N, C = x.shape - if self.subln: - q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias) - k = F.linear(input=x, weight=self.k_proj.weight, bias=None) - v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias) - - q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) # B, num_heads, N, C - k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) - v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) - else: - - qkv_bias = None - if self.q_bias is not None: - qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias)) - - qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) - qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) # 3, B, num_heads, N, C - q, k, v = qkv[0], qkv[1], qkv[2] - - if self.rope: - # slightly fast impl - q_t = q[:, :, 1:, :] - ro_q_t = self.rope(q_t) - q = torch.cat((q[:, :, :1, :], ro_q_t), -2).type_as(v) - - k_t = k[:, :, 1:, :] - ro_k_t = self.rope(k_t) - k = torch.cat((k[:, :, :1, :], ro_k_t), -2).type_as(v) - - if self.xattn and xops is not None: - q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C - k = k.permute(0, 2, 1, 3) - v = v.permute(0, 2, 1, 3) - - x = xops.memory_efficient_attention( - q, - k, - v, - p=self.xattn_drop, - scale=self.scale, - ) - x = x.reshape(B, N, -1) - x = self.inner_attn_ln(x) - x = self.proj(x) - x = self.proj_drop(x) - else: - q = q * self.scale - attn = q @ k.transpose(-2, -1) - - if self.relative_position_bias_table is not None: - relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0).type_as(attn) - - if rel_pos_bias is not None: - attn = attn + rel_pos_bias.type_as(attn) - - if attn_mask is not None: - attn_mask = attn_mask.bool() - attn = attn.masked_fill(~attn_mask[:, None, None, :], float("-inf")) - - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, -1) - x = self.inner_attn_ln(x) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class Block(nn.Module): - - def __init__( - self, - dim, - num_heads, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - drop_path=0.0, - init_values=None, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - window_size=None, - attn_head_dim=None, - xattn=False, - rope=None, - postnorm=False, - subln=False, - naiveswiglu=False, - ): - 
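The relative-position-index construction above (shared between `Attention` here and `RelativePositionBias` below) is easiest to see on a tiny window. This standalone snippet re-derives the index table for a 2x2 window; the extra three table entries are reserved for cls-to-patch, patch-to-cls, and cls-to-cls positions.

```python
import torch

def toy_relative_position_index(window_size=(2, 2)):
    # Re-derives the index table built in Attention / RelativePositionBias for a tiny window.
    num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
    coords = torch.stack(torch.meshgrid(
        torch.arange(window_size[0]), torch.arange(window_size[1]), indexing="ij"))
    coords_flatten = torch.flatten(coords, 1)                        # 2, Wh*Ww
    rel = coords_flatten[:, :, None] - coords_flatten[:, None, :]    # 2, Wh*Ww, Wh*Ww
    rel = rel.permute(1, 2, 0).contiguous()
    rel[:, :, 0] += window_size[0] - 1                               # shift to start from 0
    rel[:, :, 1] += window_size[1] - 1
    rel[:, :, 0] *= 2 * window_size[1] - 1
    index = torch.zeros((window_size[0] * window_size[1] + 1,) * 2, dtype=rel.dtype)
    index[1:, 1:] = rel.sum(-1)                 # patch-to-patch entries
    index[0, 0:] = num_relative_distance - 3    # cls-to-patch
    index[0:, 0] = num_relative_distance - 2    # patch-to-cls
    index[0, 0] = num_relative_distance - 1     # cls-to-cls
    return index

# Each entry selects a row of relative_position_bias_table; shape is (Wh*Ww+1, Wh*Ww+1).
print(toy_relative_position_index())
```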
super().__init__() - self.norm1 = norm_layer(dim) - self.attn = Attention( - dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim, xattn=xattn, rope=rope, subln=subln, norm_layer=norm_layer - ) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - - if naiveswiglu: - self.mlp = SwiGLU( - in_features=dim, - hidden_features=mlp_hidden_dim, - subln=subln, - norm_layer=norm_layer, - ) - else: - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, subln=subln, drop=drop) - - if init_values is not None and init_values > 0: - self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True) - self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True) - else: - self.gamma_1, self.gamma_2 = None, None - - self.postnorm = postnorm - - def forward(self, x, rel_pos_bias=None, attn_mask=None): - if self.gamma_1 is None: - if self.postnorm: - x = x + self.drop_path(self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))) - x = x + self.drop_path(self.norm2(self.mlp(x))) - else: - x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)) - x = x + self.drop_path(self.mlp(self.norm2(x))) - else: - if self.postnorm: - x = x + self.drop_path(self.gamma_1 * self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))) - x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x))) - else: - x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)) - x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - return x - - -class PatchEmbed(nn.Module): - """Image to Patch Embedding""" - - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) - self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) - self.img_size = img_size - self.patch_size = patch_size - self.num_patches = num_patches - - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - - def forward(self, x, **kwargs): - B, C, H, W = x.shape - # FIXME look at relaxing size constraints - assert H == self.img_size[0] and W == self.img_size[1], f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
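`PatchEmbed` tokenizes an image with a single strided convolution whose kernel and stride both equal the patch size, so each output location corresponds to one non-overlapping patch. A minimal shape check with toy inputs:

```python
import torch
import torch.nn as nn

# One strided conv turns [B, C, H, W] into a sequence of patch tokens [B, (H/ps)*(W/ps), embed_dim].
patch_size, embed_dim = 16, 768
proj = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size)

x = torch.randn(2, 3, 224, 224)
tokens = proj(x).flatten(2).transpose(1, 2)
print(tokens.shape)  # torch.Size([2, 196, 768]); 196 = (224 // 16) ** 2
```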
- x = self.proj(x).flatten(2).transpose(1, 2) - return x - - -class RelativePositionBias(nn.Module): - - def __init__(self, window_size, num_heads): - super().__init__() - self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 - self.relative_position_bias_table = nn.Parameter(torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH - # cls to token & token 2 cls & cls to cls - - # get pair-wise relative position index for each token inside the window - coords_h = torch.arange(window_size[0]) - coords_w = torch.arange(window_size[1]) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 - - self.register_buffer("relative_position_index", relative_position_index) - - def forward(self): - relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH - return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - - -class EVAVisionTransformer(nn.Module): - """Vision Transformer with support for patch or hybrid CNN input stage""" - - def __init__( - self, - img_size=224, - patch_size=16, - in_chans=3, - num_classes=1000, - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - drop_rate=0.0, - attn_drop_rate=0.0, - drop_path_rate=0.0, - norm_layer=nn.LayerNorm, - init_values=None, - patch_dropout=0.0, - use_abs_pos_emb=True, - use_rel_pos_bias=False, - use_shared_rel_pos_bias=False, - rope=False, - use_mean_pooling=True, - init_scale=0.001, - grad_checkpointing=False, - xattn=False, - postnorm=False, - pt_hw_seq_len=16, - intp_freq=False, - naiveswiglu=False, - subln=False, - ): - super().__init__() - self.image_size = img_size - self.num_classes = num_classes - self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models - - self.patch_embed = PatchEmbed(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) - num_patches = self.patch_embed.num_patches - - self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) - # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) - if use_abs_pos_emb: - self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) - else: - self.pos_embed = None - self.pos_drop = nn.Dropout(p=drop_rate) - - if use_shared_rel_pos_bias: - self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads) - else: - self.rel_pos_bias = None - - if rope: - half_head_dim = embed_dim // num_heads // 2 - hw_seq_len = img_size // patch_size - self.rope = 
VisionRotaryEmbeddingFast( - dim=half_head_dim, - pt_seq_len=pt_hw_seq_len, - ft_seq_len=hw_seq_len if intp_freq else None, - # patch_dropout=patch_dropout - ) - else: - self.rope = None - - self.naiveswiglu = naiveswiglu - - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule - self.use_rel_pos_bias = use_rel_pos_bias - self.blocks = nn.ModuleList( - [ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - init_values=init_values, - window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None, - xattn=xattn, - rope=self.rope, - postnorm=postnorm, - subln=subln, - naiveswiglu=naiveswiglu, - ) - for i in range(depth) - ] - ) - self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim) - self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None - self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() - - if self.pos_embed is not None: - trunc_normal_(self.pos_embed, std=0.02) - - trunc_normal_(self.cls_token, std=0.02) - # trunc_normal_(self.mask_token, std=.02) - - self.apply(self._init_weights) - self.fix_init_weight() - - if isinstance(self.head, nn.Linear): - trunc_normal_(self.head.weight, std=0.02) - self.head.weight.data.mul_(init_scale) - self.head.bias.data.mul_(init_scale) - - # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn - self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0.0 else nn.Identity() - - self.grad_checkpointing = grad_checkpointing - - def fix_init_weight(self): - def rescale(param, layer_id): - param.div_(math.sqrt(2.0 * layer_id)) - - for layer_id, layer in enumerate(self.blocks): - rescale(layer.attn.proj.weight.data, layer_id + 1) - if self.naiveswiglu: - rescale(layer.mlp.w3.weight.data, layer_id + 1) - else: - rescale(layer.mlp.fc2.weight.data, layer_id + 1) - - def get_cast_dtype(self) -> torch.dtype: - return self.blocks[0].mlp.fc2.weight.dtype - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def get_num_layers(self): - return len(self.blocks) - - def lock(self, unlocked_groups=0, freeze_bn_stats=False): - assert unlocked_groups == 0, "partial locking not currently supported for this model" - for param in self.parameters(): - param.requires_grad = False - - @torch.jit.ignore - def set_grad_checkpointing(self, enable=True): - self.grad_checkpointing = enable - - @torch.jit.ignore - def no_weight_decay(self): - return {"pos_embed", "cls_token"} - - def get_classifier(self): - return self.head - - def reset_classifier(self, num_classes, global_pool=""): - self.num_classes = num_classes - self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - - def forward_features(self, x, return_all_features=False): - - x = self.patch_embed(x) - batch_size, seq_len, _ = x.size() - - cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks - x = torch.cat((cls_tokens, x), dim=1) - if self.pos_embed is not None: - x = x + self.pos_embed - x = self.pos_drop(x) - - # a patch_dropout of 0. 
would mean it is disabled and this function would do nothing but return what was passed in - if os.getenv("RoPE") == "1": - if self.training and not isinstance(self.patch_dropout, nn.Identity): - x, patch_indices_keep = self.patch_dropout(x) - # Directly pass patch_indices_keep to self.rope.forward - x = self.rope.forward(x, patch_indices_keep=patch_indices_keep) - else: - # Pass None or omit the patch_indices_keep argument for default behavior - x = self.rope.forward(x, patch_indices_keep=None) - x = self.patch_dropout(x) - else: - x = self.patch_dropout(x) - - rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None - for i, blk in enumerate(self.blocks): - if i == len(self.blocks) - 1: - continue - if self.grad_checkpointing: - x = checkpoint(blk, x, (rel_pos_bias,)) - else: - x = blk(x, rel_pos_bias=rel_pos_bias) - - if not return_all_features: - x = self.norm(x) - if self.fc_norm is not None: - return self.fc_norm(x.mean(1)) - else: - return x[:, 0] - return x - - def forward(self, x, return_all_features=False): - if return_all_features: - return self.forward_features(x, return_all_features) - x = self.forward_features(x) - x = self.head(x) - return x - - -def load_state_dict(checkpoint_path: str, map_location: str = "cpu", model_key: str = "model|module|state_dict", is_openai: bool = False, skip_list: list = []): - if is_openai: - model = torch.jit.load(checkpoint_path, map_location="cpu").eval() - state_dict = model.state_dict() - for key in ["input_resolution", "context_length", "vocab_size"]: - state_dict.pop(key, None) - else: - checkpoint = torch.load(checkpoint_path, map_location=map_location) - for mk in model_key.split("|"): - if isinstance(checkpoint, dict) and mk in checkpoint: - state_dict = checkpoint[mk] - break - else: - state_dict = checkpoint - if next(iter(state_dict.items()))[0].startswith("module"): - state_dict = {k[7:]: v for k, v in state_dict.items()} - - for k in skip_list: - if k in list(state_dict.keys()): - logging.info(f"Removing key {k} from pretrained checkpoint") - del state_dict[k] - - if os.getenv("RoPE") == "1": - for k in list(state_dict.keys()): - if "freqs_cos" in k or "freqs_sin" in k: - del state_dict[k] - return state_dict - - -def load_clip_visual_state_dict(checkpoint_path: str, map_location: str = "cpu", is_openai: bool = False, skip_list: list = []): - state_dict = load_state_dict(checkpoint_path, map_location=map_location, is_openai=is_openai, skip_list=skip_list) - # for k in list(state_dict.keys()): - # if not k.startswith("visual."): - # del state_dict[k] - # for k in list(state_dict.keys()): - # if k.startswith("visual."): - # new_k = k[7:] - # state_dict[new_k] = state_dict[k] - # del state_dict[k] - return state_dict - - -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -try: - from apex.normalization import FusedLayerNorm -except: - FusedLayerNorm = LayerNorm - # print("Please build and install Nvidia apex package with option '--cuda_ext' according to https://github.com/NVIDIA/apex#from-source .") - - -@dataclass -class CLIPVisionCfg: - layers: Union[Tuple[int, int, int, int], int] = 12 - width: int = 768 - head_width: int = 64 - mlp_ratio: float = 4.0 - patch_size: int = 16 - image_size: Union[Tuple[int, int], int] = 224 - ls_init_value: Optional[float] = None # layer scale initial value - patch_dropout: float = 0.0 # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results - 
global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580) - drop_path_rate: Optional[float] = None # drop path rate - timm_model_name: str = None # a valid model name overrides layers, width, patch_size - timm_model_pretrained: bool = False # use (imagenet) pretrained weights for named model - timm_pool: str = "avg" # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '') - timm_proj: str = "linear" # linear projection for timm model output ('linear', 'mlp', '') - timm_proj_bias: bool = False # enable bias final projection - eva_model_name: str = None # a valid eva model name overrides layers, width, patch_size - qkv_bias: bool = True - fusedLN: bool = False - xattn: bool = False - postnorm: bool = False - rope: bool = False - pt_hw_seq_len: int = 16 # 224/14 - intp_freq: bool = False - naiveswiglu: bool = False - subln: bool = False - - -def create_norm_layer_factory(use_fused_ln, eps=1e-6): - # Otherwise, use the standard LayerNorm - return lambda num_features: nn.LayerNorm(num_features, eps=eps) - - -def _build_vision_tower(vision_tower_path: str, embed_dim: int, vision_cfg: CLIPVisionCfg, **kwargs): - if isinstance(vision_cfg, dict): - vision_cfg = CLIPVisionCfg(**vision_cfg) - - if vision_cfg.eva_model_name: - vision_heads = vision_cfg.width // vision_cfg.head_width - # Determine the appropriate norm layer factory based on the configuration - norm_layer_factory = create_norm_layer_factory(vision_cfg.fusedLN, eps=1e-6) - - visual = EVAVisionTransformer( - img_size=vision_cfg.image_size, - patch_size=vision_cfg.patch_size, - num_classes=embed_dim, - use_mean_pooling=vision_cfg.global_average_pool, # False - init_values=vision_cfg.ls_init_value, - patch_dropout=vision_cfg.patch_dropout, - embed_dim=vision_cfg.width, - depth=vision_cfg.layers, - num_heads=vision_heads, - mlp_ratio=vision_cfg.mlp_ratio, - qkv_bias=vision_cfg.qkv_bias, - drop_path_rate=vision_cfg.drop_path_rate, - norm_layer=norm_layer_factory, - xattn=vision_cfg.xattn, - rope=vision_cfg.rope, - postnorm=vision_cfg.postnorm, - pt_hw_seq_len=vision_cfg.pt_hw_seq_len, # 224/14 - intp_freq=vision_cfg.intp_freq, - naiveswiglu=vision_cfg.naiveswiglu, - subln=vision_cfg.subln, - ) - - state_dict = load_clip_visual_state_dict(vision_tower_path) - incompatible_keys = visual.load_state_dict(state_dict, strict=False) - rank0_print("EVA-CLIP incompatible_keys:", incompatible_keys) - - return visual - - -class EVAEncoderWrapper(nn.Module): - def __init__(self, vision_tower_pretrained, config): - super(EVAEncoderWrapper, self).__init__() - self.config = config - self.config["vision_tower_path"] = vision_tower_pretrained - self.model = _build_vision_tower(**self.config) - - def forward(self, image, **kwargs): - encode = self.model(image, return_all_features=True)[:, 1:, :] # remove the CLS token - return encode - - @property - def dtype(self): - return list(self.parameters())[-1].dtype - - @property - def device(self): - return list(self.parameters())[-1].device diff --git a/llava_next/llava/model/multimodal_encoder/eva_clip/factory.py b/llava_next/llava/model/multimodal_encoder/eva_clip/factory.py deleted file mode 100755 index 6d3fafcf..00000000 --- a/llava_next/llava/model/multimodal_encoder/eva_clip/factory.py +++ /dev/null @@ -1,60 +0,0 @@ -import json -import logging -import os -import pathlib -import re -from copy import deepcopy -from pathlib import Path -from typing import Optional, Tuple, Union, Dict, Any -import torch - 
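`_build_vision_tower` above derives most constructor arguments directly from `CLIPVisionCfg` (for example `vision_heads = width // head_width`), and `EVAEncoderWrapper.forward` drops the leading CLS token from the `return_all_features` output. The snippet below traces those two steps with hypothetical config values chosen purely for illustration; it does not load a real checkpoint.

```python
import torch

# Hypothetical EVA-style vision config (values for illustration only).
vision_cfg = dict(width=1408, layers=40, head_width=88, patch_size=14,
                  image_size=448, eva_model_name="eva-clip-toy")

vision_heads = vision_cfg["width"] // vision_cfg["head_width"]                   # 16
tokens_per_image = (vision_cfg["image_size"] // vision_cfg["patch_size"]) ** 2   # 1024

# EVAEncoderWrapper.forward requests all features and slices off index 0 (the CLS token),
# so downstream projectors see [B, tokens_per_image, width].
dummy_all_features = torch.randn(2, tokens_per_image + 1, vision_cfg["width"])
patch_features = dummy_all_features[:, 1:, :]
print(vision_heads, patch_features.shape)  # 16 torch.Size([2, 1024, 1408])
```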
-_MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"] -_MODEL_CONFIGS = {} # directory (model_name: config) of model architecture configs - - -def _natural_key(string_): - return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_.lower())] - - -def _rescan_model_configs(): - global _MODEL_CONFIGS - - config_ext = (".json",) - config_files = [] - for config_path in _MODEL_CONFIG_PATHS: - if config_path.is_file() and config_path.suffix in config_ext: - config_files.append(config_path) - elif config_path.is_dir(): - for ext in config_ext: - config_files.extend(config_path.glob(f"*{ext}")) - - for cf in config_files: - with open(cf, "r", encoding="utf8") as f: - model_cfg = json.load(f) - if all(a in model_cfg for a in ("embed_dim", "vision_cfg", "text_cfg")): - _MODEL_CONFIGS[cf.stem] = model_cfg - - _MODEL_CONFIGS = dict(sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0]))) - - -_rescan_model_configs() # initial populate of model config registry - - -def list_models(): - """enumerate available model architectures based on config files""" - return list(_MODEL_CONFIGS.keys()) - - -def add_model_config(path): - """add model config path or file and update registry""" - if not isinstance(path, Path): - path = Path(path) - _MODEL_CONFIG_PATHS.append(path) - _rescan_model_configs() - - -def get_model_config(model_name): - if model_name in _MODEL_CONFIGS: - return deepcopy(_MODEL_CONFIGS[model_name]) - else: - return None diff --git a/llava_next/llava/model/multimodal_encoder/example_packing_tower_usage.py b/llava_next/llava/model/multimodal_encoder/example_packing_tower_usage.py deleted file mode 100644 index eaf69ebb..00000000 --- a/llava_next/llava/model/multimodal_encoder/example_packing_tower_usage.py +++ /dev/null @@ -1,183 +0,0 @@ -""" -Example usage of HEVCViTPackingVisionTower - -This example demonstrates how to use the packing mode vision tower. 
-""" - -# Example 1: Using with builder (recommended) -# ============================================ -# In your config file or code, specify a model path containing "packing": - -config_example = { - "mm_vision_tower": "/path/to/hevc_vit_packing_model", # Contains "packing" in name - "mm_projector_type": "patch_merger", - "mm_vision_select_layer": -1, - "mm_vision_select_feature": "patch", -} - -# The builder will automatically use HEVCViTPackingVisionTower -# from llava_next.llava.model.multimodal_encoder.builder import build_vision_tower -# tower = build_vision_tower(config) - - -# Example 2: Direct usage (for testing) -# ===================================== -def example_batch_processing(): - """Example of processing a batch of images""" - import torch - from llava_next.llava.model.multimodal_encoder.hevc_vit_packing_tower import HEVCViTPackingVisionTower - - # Mock args (in real usage, these come from your config) - class MockArgs: - mm_projector_type = "patch_merger" - mm_vision_select_layer = -1 - mm_vision_select_feature = "patch" - - # Initialize tower (requires a pretrained packing model) - tower = HEVCViTPackingVisionTower( - vision_tower="/path/to/hevc_vit_packing_model", - args=MockArgs(), - delay_load=False # Set to True if loading later - ) - - # Prepare batch of images [B, C, H, W] - batch_images = torch.randn(4, 3, 224, 224).cuda() - - # Forward pass - # Input: [4, 3, 224, 224] - # Internally converted to packing format: [784, 768] - # where 784 = 4 images × 196 patches/image - # and 768 = 16×16×3 (patch_dim) - # Output: [4, 196, hidden_size] - features = tower(batch_images) - - print(f"Input shape: {batch_images.shape}") - print(f"Output shape: {features.shape}") - return features - - -def example_list_processing(): - """Example of processing a list of images with different sizes""" - import torch - from llava_next.llava.model.multimodal_encoder.hevc_vit_packing_tower import HEVCViTPackingVisionTower - - class MockArgs: - mm_projector_type = "patch_merger" - mm_vision_select_layer = -1 - mm_vision_select_feature = "patch" - - tower = HEVCViTPackingVisionTower( - vision_tower="/path/to/hevc_vit_packing_model", - args=MockArgs(), - delay_load=False - ) - - # Prepare list of images with different sizes - images = [ - torch.randn(3, 224, 224).cuda(), # 196 patches - torch.randn(3, 224, 224).cuda(), # 196 patches - torch.randn(3, 448, 448).cuda(), # 784 patches - ] - - # Forward pass - # Internally converted to packing format: [1176, 768] - # where 1176 = 196 + 196 + 784 patches - # Output: List of [196, hidden_size], [196, hidden_size], [784, hidden_size] - features_list = tower(images) - - print(f"Input: {len(images)} images") - for i, feat in enumerate(features_list): - print(f" Image {i}: {feat.shape}") - return features_list - - -def example_with_spatial_dims(): - """Example of getting spatial dimensions (for spatial_merge projector)""" - import torch - from llava_next.llava.model.multimodal_encoder.hevc_vit_packing_tower import HEVCViTPackingVisionTower - - class MockArgs: - mm_projector_type = "spatial_merge" # Using spatial merge - mm_vision_select_layer = -1 - mm_vision_select_feature = "patch" - - tower = HEVCViTPackingVisionTower( - vision_tower="/path/to/hevc_vit_packing_model", - args=MockArgs(), - delay_load=False - ) - - batch_images = torch.randn(2, 3, 224, 224).cuda() - - # Forward pass with spatial dimensions - features, h, w = tower(batch_images, return_spatial_dims=True) - - print(f"Input shape: {batch_images.shape}") - print(f"Output shape: 
{features.shape}") - print(f"Spatial dims: h={h}, w={w}") - return features, h, w - - -# Example 3: Understanding the conversion -# ======================================= -def show_conversion_details(): - """ - Show how the conversion works step by step - """ - print("=" * 60) - print("CONVERSION DETAILS") - print("=" * 60) - - # Configuration - batch_size = 4 - channels = 3 - height = 224 - width = 224 - patch_size = 16 - - # Calculate dimensions - h_patches = height // patch_size # 14 - w_patches = width // patch_size # 14 - num_patches_per_image = h_patches * w_patches # 196 - total_patches = batch_size * num_patches_per_image # 784 - patch_dim = patch_size * patch_size * channels # 768 - - print(f"\nInput Format:") - print(f" Shape: [{batch_size}, {channels}, {height}, {width}]") - print(f" Batch size: {batch_size}") - print(f" Image size: {height}x{width}") - - print(f"\nPacking Format (Internal):") - print(f" hidden_states shape: [{total_patches}, {patch_dim}]") - print(f" - total_patches = {batch_size} × {h_patches} × {w_patches} = {total_patches}") - print(f" - patch_dim = {patch_size} × {patch_size} × {channels} = {patch_dim}") - print(f" grid_thw: {[[1, h_patches, w_patches]] * batch_size}") - - print(f"\nOutput Format:") - print(f" Shape: [{batch_size}, {num_patches_per_image}, hidden_size]") - print(f" - Reshaped from packing output [{total_patches}, hidden_size]") - - print("\n" + "=" * 60) - - -if __name__ == "__main__": - print("HEVCViTPackingVisionTower Usage Examples") - print("=" * 60) - print("\nNOTE: These examples require:") - print("1. A converted packing model checkpoint") - print("2. FlashAttention 2 installed") - print("3. CUDA-compatible GPU") - print("\n" + "=" * 60) - - # Show conversion details (doesn't require actual model) - show_conversion_details() - - # Uncomment to run actual examples (requires model and dependencies) - # print("\nExample 1: Batch Processing") - # example_batch_processing() - # - # print("\nExample 2: List Processing") - # example_list_processing() - # - # print("\nExample 3: With Spatial Dimensions") - # example_with_spatial_dims() diff --git a/llava_next/llava/model/multimodal_encoder/hevc_vit_packing_tower.py b/llava_next/llava/model/multimodal_encoder/hevc_vit_packing_tower.py deleted file mode 100644 index 7fc67874..00000000 --- a/llava_next/llava/model/multimodal_encoder/hevc_vit_packing_tower.py +++ /dev/null @@ -1,340 +0,0 @@ -import torch -import torch.nn as nn -from llava.utils import rank0_print -from transformers import CLIPImageProcessor - -from model_factory.vit_preview_v0_packing_hf import LlavaViTPackingConfig as HEVCViTPackingConfig -from model_factory.vit_preview_v0_packing_hf import LlavaViTPackingModel as HEVCViTPackingModel -from model_factory.vit_preview_v0_packing_hf import compute_patch_positions_from_grid_thw - - -class HEVCViTPackingVisionTower(nn.Module): - def __init__(self, vision_tower, args, delay_load=False): - super().__init__() - - self.is_loaded = False - self.projector_type = getattr(args, "mm_projector_type", "patch_merger") - - self.vision_tower_name = vision_tower - self.select_layer = args.mm_vision_select_layer - self.select_feature = getattr(args, "mm_vision_select_feature", "patch") - - if not delay_load: - rank0_print(f"Loading vision tower (packing mode): {vision_tower}") - self.load_model() - elif getattr(args, "unfreeze_mm_vision_tower", False): - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") - self.load_model() - elif hasattr(args, 
"mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") - self.load_model() - else: - self.cfg_only = HEVCViTPackingConfig.from_pretrained(self.vision_tower_name) - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - # Load CLIPImageProcessor (saved in conversion script) - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - - # Load HEVCViTPackingModel - self.vision_tower = HEVCViTPackingModel.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - self.vision_tower.head = None - - - self.is_loaded = True - - def feature_select(self, image_forward_outs): - select_feature_type = self.select_feature - - if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: - select_every_k_layer = len(image_forward_outs.hidden_states) // 4 - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1) - select_feature_type = select_feature_type.replace("slicefour_", "") - elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]: - select_layers = [-2, -5, -8, -11, 6] - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in select_layers], dim=-1) - select_feature_type = select_feature_type.replace("slice_m25811_f6_", "") - else: - # Use select_layer to pick a specific hidden state - # hidden_states is a tuple where the last element is the final layer output - # For select_layer=-1, we get last_hidden_state; for -2, we get second-to-last, etc. 
- image_features = image_forward_outs.hidden_states[self.select_layer] - - # Note: HEVC ViT does not have a cls token, so we just return all patch features - # Both "patch" and "cls_patch" return the same features for HEVC ViT - if select_feature_type not in ["patch", "cls_patch"]: - raise ValueError(f"Unexpected select feature: {select_feature_type}") - return image_features - - def forward(self, images, return_spatial_dims=False): - """ - Args: - images: Input images in standard format [B, C, H, W] or list of images - return_spatial_dims: If True, return (features, h, w) tuple for spatial_merge projector - """ - # Calculate spatial dimensions from input images - if type(images) is list: - # For list of images, process each separately and combine - sample_image = images[0] - height, width = sample_image.shape[-2:] - - # ============================================================ - # 【INPUT CONVERSION】: Convert list of images to packing format - # Standard format: List of [C, H, W] tensors - # Packing format: [total_num_patches, patch_dim] where - # patch_dim = patch_size * patch_size * in_channels - # ============================================================ - all_hidden_states = [] - all_grid_thw = [] - - for image in images: - # Convert single image [C, H, W] to packing format - hidden_states, grid_thw = self._image_to_packing_input( - image.to(device=self.device, dtype=self.dtype) - ) - all_hidden_states.append(hidden_states) - all_grid_thw.append(grid_thw) - - # Concatenate all patches along sequence dimension - packed_hidden_states = torch.cat(all_hidden_states, dim=0) # [total_seq_len, patch_dim] - packed_grid_thw = torch.cat(all_grid_thw, dim=0) # [num_images, 3] - # ============================================================ - # 【END INPUT CONVERSION】 - # ============================================================ - - # Generate patch_positions from grid_thw for RoPE calculation - patch_positions = compute_patch_positions_from_grid_thw(packed_grid_thw) - - # Forward pass through packing model - image_forward_outs = self.vision_tower( - hidden_states=packed_hidden_states, - grid_thw=packed_grid_thw, - patch_positions=patch_positions, - output_hidden_states=True - ) - - # ============================================================ - # 【OUTPUT CONVERSION】: Convert packing output back to feature format - # Packing output: [total_seq_len, hidden_size] - all patches concatenated - # Target format: List of [num_patches, hidden_size] per image - # ============================================================ - image_features = self.feature_select(image_forward_outs) - - # Split the packed output back to individual images - image_features_list = [] - start_idx = 0 - # Optimized for distributed training - extract scalars efficiently - for i in range(packed_grid_thw.shape[0]): - # Extract values with minimal synchronization (single sync per row) - thw = packed_grid_thw[i] - t, h, w = thw.tolist() - seq_len = t * h * w - image_features_list.append(image_features[start_idx:start_idx + seq_len]) - start_idx += seq_len - - image_features = image_features_list - # ============================================================ - # 【END OUTPUT CONVERSION】 - # ============================================================ - - # For list processing, use the first image's dimensions for spatial dims - # (Note: in list mode, all images should ideally have the same size) - # height and width already set from sample_image at line 80 - else: - # Extract height and width from batch of images - height, 
width = images.shape[-2:] - - # ============================================================ - # 【INPUT CONVERSION】: Convert batch images to packing format - # Standard format: [B, C, H, W] - # Packing format: [total_num_patches, patch_dim] where - # total_num_patches = B * h_patches * w_patches - # patch_dim = patch_size * patch_size * in_channels - # ============================================================ - batch_size = images.shape[0] - - # Convert batch to packing format - packed_hidden_states, packed_grid_thw = self._batch_images_to_packing_input(images) - # ============================================================ - # 【END INPUT CONVERSION】 - # ============================================================ - - # Generate patch_positions from grid_thw for RoPE calculation - patch_positions = compute_patch_positions_from_grid_thw(packed_grid_thw) - - # Forward pass through packing model - image_forward_outs = self.vision_tower( - hidden_states=packed_hidden_states, - grid_thw=packed_grid_thw, - patch_positions=patch_positions, - output_hidden_states=True - ) - - # ============================================================ - # 【OUTPUT CONVERSION】: Convert packing output back to feature format - # Packing output: [total_seq_len, hidden_size] - all patches from all images concatenated - # Target format: [B, num_patches, hidden_size] - # ============================================================ - raw_features = self.feature_select(image_forward_outs) - # Split the packed output back to batch format - # Calculate num_patches per image - # Optimized for distributed training - extract scalars efficiently - thw = packed_grid_thw[0] - t, h_patches, w_patches = thw.tolist() - num_patches_per_image = t * h_patches * w_patches - - # Reshape from [total_seq_len, hidden_size] to [B, num_patches, hidden_size] - image_features = raw_features.view(batch_size, num_patches_per_image, -1) - # print(image_features.size()) - # ============================================================ - # 【END OUTPUT CONVERSION】 - # ============================================================ - - # Calculate h and w in patch coordinates - h = height // self.config.patch_size - w = width // self.config.patch_size - - if return_spatial_dims: - return image_features, h, w - return image_features - - def _image_to_packing_input(self, image_tensor): - """ - Convert a single image tensor to packing model input format. - - Args: - image_tensor: [C, H, W] tensor - - Returns: - hidden_states: [seq_len, patch_dim] tensor - grid_thw: [1, 3] tensor with [t, h, w] patches - """ - patch_size = self.config.patch_size - channels, height, width = image_tensor.shape - - # Calculate patch dimensions - h_patches = height // patch_size - w_patches = width // patch_size - t_frames = 1 # Images have t=1 - - # Reshape to patches: (C, H, W) -> (h_patches, w_patches, C, patch_size, patch_size) - patches = image_tensor.view( - channels, h_patches, patch_size, w_patches, patch_size - ) - patches = patches.permute(1, 3, 0, 2, 4).contiguous() # (h, w, C, pH, pW) - - # Flatten to (seq_len, patch_dim) - seq_len = t_frames * h_patches * w_patches - patch_dim = patch_size * patch_size * channels - hidden_states = patches.view(seq_len, patch_dim) - - # Create grid_thw: [t, h, w] - grid_thw = torch.tensor( - [[t_frames, h_patches, w_patches]], - dtype=torch.long, - device=image_tensor.device - ) - - return hidden_states, grid_thw - - def _batch_images_to_packing_input(self, images): - """ - Convert a batch of images to packing model input format. 
- - Args: - images: [B, C, H, W] tensor - - Returns: - hidden_states: [total_seq_len, patch_dim] tensor - grid_thw: [B, 3] tensor with [t, h, w] patches for each image - """ - batch_size, channels, height, width = images.shape - patch_size = self.config.patch_size - - # Calculate patch dimensions - h_patches = height // patch_size - w_patches = width // patch_size - t_frames = 1 # Images have t=1 - - # Reshape batch to patches - # [B, C, H, W] -> [B, C, h_patches, patch_size, w_patches, patch_size] - patches = images.view( - batch_size, channels, h_patches, patch_size, w_patches, patch_size - ) - # [B, C, h_patches, patch_size, w_patches, patch_size] -> [B, h_patches, w_patches, C, patch_size, patch_size] - patches = patches.permute(0, 2, 4, 1, 3, 5).contiguous() - - # Flatten to (total_seq_len, patch_dim) - seq_len_per_image = t_frames * h_patches * w_patches - patch_dim = patch_size * patch_size * channels - hidden_states = patches.view(batch_size * seq_len_per_image, patch_dim) - - # Create grid_thw for each image in batch - grid_thw_values = torch.tensor( - [t_frames, h_patches, w_patches], - dtype=torch.long, - device=images.device - ) - grid_thw = grid_thw_values.unsqueeze(0).expand(batch_size, 3).contiguous() - - return hidden_states, grid_thw - - @property - def dummy_feature(self): - return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) - - @property - def dtype(self): - if self.is_loaded: - return self.vision_tower.dtype - else: - return torch.float32 # Default dtype when not loaded - - @property - def device(self): - if self.is_loaded: - return self.vision_tower.device - else: - return torch.device("cpu") # Default device when not loaded - - @property - def config(self): - if self.is_loaded: - return self.vision_tower.config - else: - return self.cfg_only - - @property - def hidden_size(self): - _hidden_size = self.config.hidden_size - if "slicefour" in self.select_feature: - _hidden_size *= 4 - if "slice_m25811_f6" in self.select_feature: - _hidden_size *= 5 - return _hidden_size - - @property - def num_patches_per_side(self): - # Base patch count per side - base_patches_per_side = self.config.image_size // self.config.patch_size - # If using spatial_merge projector, reduce each side by 2x (merge_size) - if self.projector_type == "spatial_merge": - return base_patches_per_side // 2 - return base_patches_per_side - - @property - def num_patches(self): - # Base total patch count - base_patches = (self.config.image_size // self.config.patch_size) ** 2 - # If using spatial_merge projector, reduce total patches by 4x (merge_size^2 = 2^2) - if self.projector_type == "spatial_merge": - return base_patches // 4 - return base_patches - - @property - def image_size(self): - return self.config.image_size diff --git a/llava_next/llava/model/multimodal_encoder/hevc_vit_tower.py b/llava_next/llava/model/multimodal_encoder/hevc_vit_tower.py deleted file mode 100755 index f76d6d64..00000000 --- a/llava_next/llava/model/multimodal_encoder/hevc_vit_tower.py +++ /dev/null @@ -1,160 +0,0 @@ -import torch -import torch.nn as nn -from llava.utils import rank0_print -from transformers import CLIPImageProcessor - -from model_factory.vit_ov_encoder import LlavaViTConfig as HEVCViTConfig -from model_factory.vit_ov_encoder import LlavaViTModel as HEVCViTModel - - -class HEVCViTVisionTower(nn.Module): - def __init__(self, vision_tower, args, delay_load=False): - super().__init__() - - self.is_loaded = False - self.projector_type = getattr(args, "mm_projector_type", "patch_merger") 
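To make the packing layout described in the deleted forward/_batch_images_to_packing_input above concrete, here is a minimal standalone sketch (illustrative names, not the repository's API): [B, C, H, W] images are cut into rows of patch_size*patch_size*C values plus a per-image grid_thw of [t, h, w], and the encoder output of shape [total_seq_len, hidden] can later be regrouped per image with .view(B, h*w, -1).

import torch

def pack_images(images: torch.Tensor, patch_size: int):
    # [B, C, H, W] -> ([B * h * w, patch_size * patch_size * C], grid_thw of shape [B, 3])
    B, C, H, W = images.shape
    h, w = H // patch_size, W // patch_size           # assumes H, W divisible by patch_size
    patches = images.view(B, C, h, patch_size, w, patch_size)
    patches = patches.permute(0, 2, 4, 1, 3, 5).contiguous()   # B, h, w, C, pH, pW
    hidden_states = patches.view(B * h * w, patch_size * patch_size * C)
    grid_thw = torch.tensor([[1, h, w]] * B, dtype=torch.long)  # t = 1 for still images
    return hidden_states, grid_thw

# Round trip on dummy data: 2 images of 64x64 with 16x16 patches -> 32 rows of 768 values.
imgs = torch.randn(2, 3, 64, 64)
hidden_states, grid_thw = pack_images(imgs, patch_size=16)
assert hidden_states.shape == (2 * 4 * 4, 16 * 16 * 3) and grid_thw.tolist()[0] == [1, 4, 4]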
- - self.vision_tower_name = vision_tower - self.select_layer = args.mm_vision_select_layer - self.select_feature = getattr(args, "mm_vision_select_feature", "patch") - - if not delay_load: - rank0_print(f"Loading vision tower: {vision_tower}") - self.load_model() - elif getattr(args, "unfreeze_mm_vision_tower", False): - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") - self.load_model() - elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") - self.load_model() - else: - self.cfg_only = HEVCViTConfig.from_pretrained(self.vision_tower_name) - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - # Load the CLIPImageProcessor (this config was saved by the conversion script) - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - - # Load the HEVCViTModel - self.vision_tower = HEVCViTModel.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - - self.is_loaded = True - - def feature_select(self, image_forward_outs): - select_feature_type = self.select_feature - - if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: - select_every_k_layer = len(image_forward_outs.hidden_states) // 4 - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1) - select_feature_type = select_feature_type.replace("slicefour_", "") - elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]: - select_layers = [-2, -5, -8, -11, 6] - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in select_layers], dim=-1) - select_feature_type = select_feature_type.replace("slice_m25811_f6_", "") - else: - # Use select_layer to pick a specific hidden state - # hidden_states is a tuple where the last element is the final layer output - # For select_layer=-1, we get last_hidden_state; for -2, we get second-to-last, etc.
- image_features = image_forward_outs.hidden_states[self.select_layer] - - # Note: HEVC ViT does not have a cls token, so we just return all patch features - # Both "patch" and "cls_patch" return the same features for HEVC ViT - if select_feature_type not in ["patch", "cls_patch"]: - raise ValueError(f"Unexpected select feature: {select_feature_type}") - return image_features - - def forward(self, images, return_spatial_dims=False): - """ - Args: - images: Input images - return_spatial_dims: If True, return (features, h, w) tuple for spatial_merge projector - """ - # Calculate spatial dimensions from input images - if type(images) is list: - # For list of images, use the first image to determine dimensions - sample_image = images[0] - # Extract height and width (works for both image and video) - height, width = sample_image.shape[-2:] - - image_features = [] - for image in images: - image_forward_out = self.vision_tower( - image.to(device=self.device, dtype=self.dtype).unsqueeze(0), - output_hidden_states=True - ) - image_feature = self.feature_select(image_forward_out).to(image.dtype) - image_features.append(image_feature) - else: - # Extract height and width from batch of images - if images.ndim == 5: # (B, C, T, H, W) - video batch - height, width = images.shape[-2:] - else: # (B, C, H, W) - image batch - height, width = images.shape[-2:] - - image_forward_outs = self.vision_tower( - images.to(device=self.device, dtype=self.dtype), - output_hidden_states=True - ) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - - # Calculate h and w in patch coordinates - h = height // self.config.patch_size - w = width // self.config.patch_size - - if return_spatial_dims: - return image_features, h, w - return image_features - - @property - def dummy_feature(self): - return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) - - @property - def dtype(self): - return self.vision_tower.dtype - - @property - def device(self): - return self.vision_tower.device - - @property - def config(self): - if self.is_loaded: - return self.vision_tower.config - else: - return self.cfg_only - - @property - def hidden_size(self): - _hidden_size = self.config.hidden_size - if "slicefour" in self.select_feature: - _hidden_size *= 4 - if "slice_m25811_f6" in self.select_feature: - _hidden_size *= 5 - return _hidden_size - - @property - def num_patches_per_side(self): - # Base patch count per side - base_patches_per_side = self.config.image_size // self.config.patch_size - # If using spatial_merge projector, reduce each side by 2x (merge_size) - if self.projector_type == "spatial_merge": - return base_patches_per_side // 2 - return base_patches_per_side - - @property - def num_patches(self): - # Base total patch count - base_patches = (self.config.image_size // self.config.patch_size) ** 2 - # If using spatial_merge projector, reduce total patches by 4x (merge_size^2 = 2^2) - if self.projector_type == "spatial_merge": - return base_patches // 4 - return base_patches - - @property - def image_size(self): - return self.config.image_size diff --git a/llava_next/llava/model/multimodal_encoder/hf_vision.py b/llava_next/llava/model/multimodal_encoder/hf_vision.py deleted file mode 100755 index a413208e..00000000 --- a/llava_next/llava/model/multimodal_encoder/hf_vision.py +++ /dev/null @@ -1,111 +0,0 @@ -import torch -import torch.nn as nn - -from transformers import AutoModel, AutoImageProcessor, AutoConfig, CLIPImageProcessor -from llava.utils import rank0_print - - -class 
HFVisionTower(nn.Module): - def __init__(self, vision_tower, args, delay_load=False): - super().__init__() - - self.is_loaded = False - - self.vision_tower_name = vision_tower.replace("hf:", "", 1) - self.select_layer = args.mm_vision_select_layer - self.select_feature = getattr(args, "mm_vision_select_feature", "patch") - - if not delay_load: - self.load_model() - else: - self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name) - - def load_model(self): - try: - self.image_processor = AutoImageProcessor.from_pretrained(self.vision_tower_name) - except Exception as e: - if "448" in self.vision_tower_name: - image_size = 448 - # use image processor with conig - self.image_processor = CLIPImageProcessor(size={"shortest_edge": image_size}, do_center_crop=True, crop_size=image_size) - else: - self.image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") - rank0_print(f"Loaded image processor: {self.image_processor}") - self.vision_tower = AutoModel.from_pretrained(self.vision_tower_name, torch_dtype=torch.bfloat16, trust_remote_code=True).to("cuda") - self.device = self.vision_tower.device - self.dtype = self.vision_tower.dtype - self.config = self.vision_tower.config - - if hasattr(self.vision_tower, "vision_model"): - self.vision_tower = self.vision_tower.vision_model - self.vision_tower.requires_grad_(False) - # self.vision_tower.eval() - self.is_loaded = True - - def feature_select(self, image_forward_outs): - select_feature_type = self.select_feature - - if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: - select_every_k_layer = len(image_forward_outs.hidden_states) // 4 - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1) - select_feature_type = select_feature_type.replace("slicefour_", "") - else: - image_features = image_forward_outs.hidden_states[self.select_layer] - - if select_feature_type == "patch": - image_features = image_features[:, 1:] - elif select_feature_type == "cls_patch": - image_features = image_features - else: - raise ValueError(f"Unexpected select feature: {select_feature_type}") - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) - image_feature = self.feature_select(image_forward_out).to(image.dtype) - image_features.append(image_feature) - else: - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - - return image_features - - @property - def dummy_feature(self): - return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) - - # @property - # def dtype(self): - # return self.vision_tower.dtype - - # @property - # def device(self): - # return self.vision_tower.device - - @property - def hidden_size(self): - try: - _hidden_size = self.config.hidden_size - except: - _hidden_size = self.config.vision_config.hidden_size - if "slicefour" in self.select_feature: - _hidden_size *= 4 - return _hidden_size - - @property - def num_patches(self): - _num_patches = (self.config.image_size // self.config.patch_size) ** 2 - if "cls_patch" in self.select_feature: - _num_patches += 1 - return _num_patches - - @property - def 
num_patches_per_side(self): - return self.config.image_size // self.config.patch_size - - @property - def image_size(self): - return self.config.image_size diff --git a/llava_next/llava/model/multimodal_encoder/imagebind.py b/llava_next/llava/model/multimodal_encoder/imagebind.py deleted file mode 100755 index 8bbe71c7..00000000 --- a/llava_next/llava/model/multimodal_encoder/imagebind.py +++ /dev/null @@ -1,73 +0,0 @@ -import torch -import torch.nn as nn - -from transformers import CLIPImageProcessor - -try: - from imagebind.models import imagebind_model - from imagebind.models.imagebind_model import ModalityType - from imagebind.data import load_and_transform_audio_data -except ImportError: - pass - - -class ImageBindWrapper(nn.Module): - def __init__(self, vision_tower, select_layer, select_feature="patch", delay_load=False): - super().__init__() - - self.is_loaded = False - - self.vision_tower_name = vision_tower - self.select_layer = select_layer - self.select_feature = select_feature - - if not delay_load: - self.load_model() - - def load_model(self): - self.image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") - self.vision_tower = imagebind_model.imagebind_huge(pretrained=True) - for p in self.vision_tower.parameters(): - p.requires_grad = False - self.vision_tower.eval() - self.is_loaded = True - - def train(self, mode=True): - self.training = mode - - if self.is_loaded: - self.vision_tower.eval() - - @torch.no_grad() - def forward(self, x): - if type(x) == dict: - if x["audios"] is not None: - inputs = {ModalityType.AUDIO: load_and_transform_audio_data(x["audios"], device=self.device).half()} - embeddings = self.vision_tower(inputs) - audio_embedding = embeddings[ModalityType.AUDIO] - return audio_embedding.unsqueeze(1) - else: - inputs = {ModalityType.VISION: x.to(dtype=self.dtype)} - embeddings = self.vision_tower(inputs) - vision_embedding = embeddings[ModalityType.VISION] - if vision_embedding.ndim == 2: - return vision_embedding.unsqueeze(1) - if vision_embedding.shape[1] == 257: - return vision_embedding[:, 1:] - raise ValueError(f"Unexpected shape: {vision_embedding.shape}") - - @property - def dummy_feature(self): - return torch.zeros(1, 1024, device=self.device, dtype=self.dtype) - - @property - def dtype(self): - return self.vision_tower.modality_preprocessors.vision.cls_token.dtype - - @property - def device(self): - return self.vision_tower.modality_preprocessors.vision.cls_token.device - - @property - def hidden_size(self): - return 1024 diff --git a/llava_next/llava/model/multimodal_encoder/internViT_300M_448px_encoder.py b/llava_next/llava/model/multimodal_encoder/internViT_300M_448px_encoder.py deleted file mode 100755 index 2406a568..00000000 --- a/llava_next/llava/model/multimodal_encoder/internViT_300M_448px_encoder.py +++ /dev/null @@ -1,173 +0,0 @@ -import torch -import torch.nn as nn -from llava.utils import rank0_print -from transformers import CLIPImageProcessor, AutoModel - -try: - from s2wrapper import forward as multiscale_forward -except: - pass - - -class InternViT_300M_448px_VisionTower(nn.Module): - def __init__(self, vision_tower, args, delay_load=False): - super().__init__() - - self.is_loaded = False - - self.vision_tower_name = vision_tower - self.select_layer = args.mm_vision_select_layer - self.select_feature = getattr(args, "mm_vision_select_feature", "patch") - - if not delay_load: - rank0_print(f"Loading vision tower: {vision_tower}") - self.load_model() - elif getattr(args, "unfreeze_mm_vision_tower", 
False): - # TODO: better detector is needed. - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") - self.load_model() - elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") - self.load_model() - else: - raise - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - self.vision_tower = AutoModel.from_pretrained(self.vision_tower_name, device_map=device_map, trust_remote_code=True) - self.vision_tower.requires_grad_(False) - - self.is_loaded = True - - def feature_select(self, image_forward_outs): - select_feature_type = self.select_feature - - if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: - select_every_k_layer = len(image_forward_outs.hidden_states) // 4 - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1) - select_feature_type = select_feature_type.replace("slicefour_", "") - elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]: - select_layers = [-2, -5, -8, -11, 6] - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in select_layers], dim=-1) - select_feature_type = select_feature_type.replace("slice_m25811_f6_", "") - else: - image_features = image_forward_outs.hidden_states[self.select_layer] - - if select_feature_type == "patch": - image_features = image_features[:, 1:] - elif select_feature_type == "cls_patch": - image_features = image_features - else: - raise ValueError(f"Unexpected select feature: {select_feature_type}") - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) - image_feature = self.feature_select(image_forward_out).to(image.dtype) - image_features.append(image_feature) - else: - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - - return image_features - - @property - def dummy_feature(self): - return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) - - @property - def dtype(self): - return self.vision_tower.dtype - - @property - def device(self): - return self.vision_tower.device - - @property - def config(self): - if self.is_loaded: - return self.vision_tower.config - else: - return self.cfg_only - - @property - def hidden_size(self): - _hidden_size = self.config.hidden_size - if "slicefour" in self.select_feature: - _hidden_size *= 4 - if "slice_m25811_f6" in self.select_feature: - _hidden_size *= 5 - return _hidden_size - - @property - def num_patches_per_side(self): - return self.config.image_size // self.config.patch_size - - @property - def num_patches(self): - _num_patches = (self.config.image_size // self.config.patch_size) ** 2 - if "cls_patch" in self.select_feature: - _num_patches += 1 - return _num_patches - - @property - def image_size(self): - return 
self.config.image_size - - -class InternViT_300M_448px_VisionTowerS2(InternViT_300M_448px_VisionTower): - def __init__(self, vision_tower, args, delay_load=False): - - self.s2_scales = getattr(args, "s2_scales", "336,672,1008") - self.s2_scales = list(map(int, self.s2_scales.split(","))) - self.s2_scales.sort() - self.s2_split_size = self.s2_scales[0] - self.s2_image_size = self.s2_scales[-1] - - super().__init__(vision_tower, args, delay_load) - - # change resize/crop size in preprocessing to the largest image size in s2_scale - if not delay_load or getattr(args, "unfreeze_mm_vision_tower", False): - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - self.vision_tower = AutoModel.from_pretrained(self.vision_tower_name, device_map=device_map, trust_remote_code=True) - self.vision_tower.requires_grad_(False) - - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - self.is_loaded = True - - def forward_feature(self, images): - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_feature = multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - image_features.append(image_feature) - else: - image_features = multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - - return image_features - - @property - def hidden_size(self): - return self.config.hidden_size * len(self.s2_scales) diff --git a/llava_next/llava/model/multimodal_encoder/mlcd/vit_rope2d_hf.py b/llava_next/llava/model/multimodal_encoder/mlcd/vit_rope2d_hf.py deleted file mode 100755 index 9fe79be5..00000000 --- a/llava_next/llava/model/multimodal_encoder/mlcd/vit_rope2d_hf.py +++ /dev/null @@ -1,452 +0,0 @@ -from typing import Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import functional as F -from transformers.models.clip.modeling_clip import (CLIPMLP, BaseModelOutput, - BaseModelOutputWithPooling, - CLIPVisionConfig, - PreTrainedModel) - - -def rotate_half(x): - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: - orig_dtype = tensor.dtype - tensor = tensor.float() - cos = freqs.cos() - sin = freqs.sin() - cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() - sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() - output = (tensor * cos) + (rotate_half(tensor) * sin) - output = output.to(orig_dtype) - return output - - -class VisionRotaryEmbedding(nn.Module): - def __init__(self, dim: int, theta: float = 10000.0) -> None: - super().__init__() - inv_freq = 1.0 / (theta ** 
(torch.arange(0, dim, 2, dtype=torch.float) / dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - def forward(self, seqlen: int) -> torch.Tensor: - seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype) - freqs = torch.outer(seq, self.inv_freq) - return freqs - - -class MLCDVisionConfig(CLIPVisionConfig): - - model_type = "mlcd_vision_model" - - def __init__(self,**kwargs): - super().__init__(**kwargs) - - -class MLCDMLP(CLIPMLP): - def __init__(self, config: MLCDVisionConfig): - super().__init__(config) - - -class MLCDVisionEmbeddings(torch.nn.Module): - def __init__(self, config: MLCDVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.image_size = config.image_size - self.patch_size = config.patch_size - - self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) - - self.patch_embedding = nn.Conv2d( - in_channels=config.num_channels, - out_channels=self.embed_dim, - kernel_size=self.patch_size, - stride=self.patch_size, - bias=False, - ) - - self.num_patches = (self.image_size // self.patch_size) ** 2 - self.num_positions = self.num_patches + 1 - - - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] - target_dtype = self.patch_embedding.weight.dtype - patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] - patch_embeds = patch_embeds.flatten(2).transpose(1, 2) - - class_embeds = self.class_embedding.expand(batch_size, 1, -1) - embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - - return embeddings - - -class MLCDSdpaAttention(torch.nn.Module): - """Multi-headed attention from these papers - - - Attention is all you need: - https://arxiv.org/abs/1706.03762 - - - RoFormer: Enhanced Transformer with Rotary Position Embedding: - https://arxiv.org/abs/2104.09864 - """ - - def __init__(self, config: MLCDVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." 
- ) - self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) - - - def forward( - self, - hidden_states: torch.Tensor, - rotary_pos_emb: torch.Tensor, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - """Input shape: Batch x Seq x Hidden Size""" - batch_size, seq_length , hidden_size = hidden_states.size() - # Each of shape: [batch_size, seq_length, num_heads, head_dim] - q = self.q_proj(hidden_states).reshape((batch_size, seq_length, self.num_heads, self.head_dim)) - k = self.k_proj(hidden_states).reshape((batch_size, seq_length, self.num_heads, self.head_dim)) - v = self.v_proj(hidden_states).reshape((batch_size, seq_length, self.num_heads, self.head_dim)) - q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) - k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) - q = q.permute(0, 2, 1, 3).contiguous() - k = k.permute(0, 2, 1, 3).contiguous() - v = v.permute(0, 2, 1, 3).contiguous() - # q (batch_size, num_heads, seq_length, head_dim) - # k (batch_size, num_heads, seq_length, head_dim) - # v (batch_size, num_heads, seq_length, head_dim) - attn_output = F.scaled_dot_product_attention(q, k, v, None, dropout_p=0.0) - attn_output = attn_output.permute(2, 0, 1, 3).contiguous() # [seq_length, batch_size, num_heads, head_dim] - attn_output = attn_output.view(seq_length, batch_size, -1) # [seq_length, batch_size, embedding_dim] - attn_output = self.out_proj(attn_output) - attn_output = attn_output.permute(1, 0, 2).contiguous() # [batch_size, seq_length, embedding_dim] - return attn_output, None - - -class MLCDEncoderLayer(nn.Module): - def __init__(self, config: MLCDVisionConfig): - super().__init__() - self.embed_dim = config.hidden_size - self.self_attn = MLCDSdpaAttention(config) - self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - self.mlp = MLCDMLP(config) - self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - rotary_pos_emb: torch.Tensor, - ) -> Tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - `(config.encoder_attention_heads,)`. - """ - residual = hidden_states - - hidden_states = self.layer_norm1(hidden_states) - - hidden_states = self.self_attn( - hidden_states=hidden_states, - rotary_pos_emb=rotary_pos_emb, - )[0] - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.layer_norm2(hidden_states) - - hidden_states = self.mlp(hidden_states) - - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - return outputs - - -class MLCDEncoder(nn.Module): - """ - Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a - [`MLCDEncoderLayer`]. 
- - Args: - config: MLCDVisionConfig - """ - - def __init__(self, config: MLCDVisionConfig): - super().__init__() - self.config = config - self.layers = nn.ModuleList([MLCDEncoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - inputs_embeds, - rotary_pos_emb, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: - r""" - Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Causal mask for the text model. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
- """ - - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - - hidden_states = inputs_embeds - for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - encoder_layer.__call__, - hidden_states, - rotary_pos_emb - ) - else: - layer_outputs = encoder_layer( - hidden_states, - rotary_pos_emb - ) - - hidden_states = layer_outputs[0] - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, None] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=None, - ) - - -class MLCDVisionTransformer(nn.Module): - def __init__(self, config: MLCDVisionConfig): - super().__init__() - self.config = config - embed_dim = config.hidden_size - - self.embeddings = MLCDVisionEmbeddings(config) - self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - self.encoder = MLCDEncoder(config) - self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - - self.vision_rotary_embedding = VisionRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2) - self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2)) - - - def rot_pos_emb(self, grid_thw): - pos_ids = [] - for t, h, w in grid_thw: - hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) - hpos_ids = hpos_ids.reshape(h, 1, w, 1) - hpos_ids = hpos_ids.permute(0, 2, 1, 3) - hpos_ids = hpos_ids.flatten() - - wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) - wpos_ids = wpos_ids.reshape(h, 1, w, 1) - wpos_ids = wpos_ids.permute(0, 2, 1, 3) - wpos_ids = wpos_ids.flatten() - pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) - pos_ids = torch.cat(pos_ids, dim=0) - max_grid_size = grid_thw[:, 1:].max() - rotary_pos_emb_full = self.vision_rotary_embedding(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb - - - def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - # output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - - """ - - twh = (1, pixel_values.size(3) // self.config.patch_size, pixel_values.size(2) // self.config.patch_size) - rotary_pos_emb = self.rot_pos_emb(torch.tensor([twh], device=pixel_values.device)) - rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0) - - # output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - - hidden_states = self.embeddings(pixel_values) - hidden_states = self.pre_layrnorm(hidden_states) - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - rotary_pos_emb=rotary_pos_emb, 
- # output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - pooled_output = last_hidden_state[:, 0, :] - pooled_output = self.post_layernorm(pooled_output) - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - # attentions=encoder_outputs.attentions, - ) - - -class MLCDPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - config_class = MLCDVisionConfig - base_model_prefix = "mlcd" - supports_gradient_checkpointing = True - _supports_sdpa = True - # _supports_flash_attn_2 = True - - def _init_weights(self, module): - """Initialize the weights""" - factor = self.config.initializer_factor - if isinstance(module, MLCDVisionEmbeddings): - factor = self.config.initializer_factor - nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) - nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) - elif isinstance(module, MLCDSdpaAttention): - factor = self.config.initializer_factor - in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - out_proj_std = (module.embed_dim**-0.5) * factor - nn.init.normal_(module.q_proj.weight, std=in_proj_std) - nn.init.normal_(module.k_proj.weight, std=in_proj_std) - nn.init.normal_(module.v_proj.weight, std=in_proj_std) - nn.init.normal_(module.out_proj.weight, std=out_proj_std) - elif isinstance(module, MLCDMLP): - factor = self.config.initializer_factor - in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - fc_std = (2 * module.config.hidden_size) ** -0.5 * factor - nn.init.normal_(module.fc1.weight, std=fc_std) - nn.init.normal_(module.fc2.weight, std=in_proj_std) - - - if isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - - -class MLCDVisionModel(MLCDPreTrainedModel): - config_class = MLCDVisionConfig - main_input_name = "pixel_values" - _no_split_modules = ["MLCDEncoderLayer"] - - def __init__(self, config: MLCDVisionConfig): - super().__init__(config) - self.vision_model = MLCDVisionTransformer(config) - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Module: - return self.vision_model.embeddings.patch_embedding - - def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - - Examples: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, MLCDVisionModel - - >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14") - >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14") - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, return_tensors="pt") - - >>> outputs = 
model(**inputs) - >>> last_hidden_state = outputs.last_hidden_state - >>> pooled_output = outputs.pooler_output # pooled CLS states - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - return self.vision_model( - pixel_values=pixel_values, - # output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) diff --git a/llava_next/llava/model/multimodal_encoder/mlcd/vit_rope2d_hf_wo_class_token.py b/llava_next/llava/model/multimodal_encoder/mlcd/vit_rope2d_hf_wo_class_token.py deleted file mode 100755 index dbae803b..00000000 --- a/llava_next/llava/model/multimodal_encoder/mlcd/vit_rope2d_hf_wo_class_token.py +++ /dev/null @@ -1,453 +0,0 @@ -from typing import Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import functional as F -from transformers.models.clip.modeling_clip import (CLIPMLP, BaseModelOutput, - BaseModelOutputWithPooling, - CLIPVisionConfig, - PreTrainedModel) - - -def rotate_half(x): - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: - orig_dtype = tensor.dtype - tensor = tensor.float() - cos = freqs.cos() - sin = freqs.sin() - cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() - sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() - output = (tensor * cos) + (rotate_half(tensor) * sin) - output = output.to(orig_dtype) - return output - - -class VisionRotaryEmbedding(nn.Module): - def __init__(self, dim: int, theta: float = 10000.0) -> None: - super().__init__() - inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - def forward(self, seqlen: int) -> torch.Tensor: - seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype) - freqs = torch.outer(seq, self.inv_freq) - return freqs - - -class MLCDVisionConfig(CLIPVisionConfig): - - model_type = "mlcd_vision_model" - - def __init__(self,**kwargs): - super().__init__(**kwargs) - - -class MLCDMLP(CLIPMLP): - def __init__(self, config: MLCDVisionConfig): - super().__init__(config) - - -class MLCDVisionEmbeddings(torch.nn.Module): - def __init__(self, config: MLCDVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.image_size = config.image_size - self.patch_size = config.patch_size - - self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) - - self.patch_embedding = nn.Conv2d( - in_channels=config.num_channels, - out_channels=self.embed_dim, - kernel_size=self.patch_size, - stride=self.patch_size, - bias=False, - ) - - self.num_patches = (self.image_size // self.patch_size) ** 2 - self.num_positions = self.num_patches + 1 - - - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] - target_dtype = self.patch_embedding.weight.dtype - patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] - patch_embeds = patch_embeds.flatten(2).transpose(1, 2) - - class_embeds = self.class_embedding.expand(batch_size, 1, -1) - # embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = patch_embeds - - return embeddings - - -class MLCDSdpaAttention(torch.nn.Module): - """Multi-headed attention from these papers - - - Attention is all you 
need: - https://arxiv.org/abs/1706.03762 - - - RoFormer: Enhanced Transformer with Rotary Position Embedding: - https://arxiv.org/abs/2104.09864 - """ - - def __init__(self, config: MLCDVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." - ) - self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) - - - def forward( - self, - hidden_states: torch.Tensor, - rotary_pos_emb: torch.Tensor, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - """Input shape: Batch x Seq x Hidden Size""" - batch_size, seq_length , hidden_size = hidden_states.size() - # Each of shape: [batch_size, seq_length, num_heads, head_dim] - q = self.q_proj(hidden_states).reshape((batch_size, seq_length, self.num_heads, self.head_dim)) - k = self.k_proj(hidden_states).reshape((batch_size, seq_length, self.num_heads, self.head_dim)) - v = self.v_proj(hidden_states).reshape((batch_size, seq_length, self.num_heads, self.head_dim)) - q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) - k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) - q = q.permute(0, 2, 1, 3).contiguous() - k = k.permute(0, 2, 1, 3).contiguous() - v = v.permute(0, 2, 1, 3).contiguous() - # q (batch_size, num_heads, seq_length, head_dim) - # k (batch_size, num_heads, seq_length, head_dim) - # v (batch_size, num_heads, seq_length, head_dim) - attn_output = F.scaled_dot_product_attention(q, k, v, None, dropout_p=0.0) - attn_output = attn_output.permute(2, 0, 1, 3).contiguous() # [seq_length, batch_size, num_heads, head_dim] - attn_output = attn_output.view(seq_length, batch_size, -1) # [seq_length, batch_size, embedding_dim] - attn_output = self.out_proj(attn_output) - attn_output = attn_output.permute(1, 0, 2).contiguous() # [batch_size, seq_length, embedding_dim] - return attn_output, None - - -class MLCDEncoderLayer(nn.Module): - def __init__(self, config: MLCDVisionConfig): - super().__init__() - self.embed_dim = config.hidden_size - self.self_attn = MLCDSdpaAttention(config) - self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - self.mlp = MLCDMLP(config) - self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - rotary_pos_emb: torch.Tensor, - ) -> Tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - `(config.encoder_attention_heads,)`. 
- """ - residual = hidden_states - - hidden_states = self.layer_norm1(hidden_states) - - hidden_states = self.self_attn( - hidden_states=hidden_states, - rotary_pos_emb=rotary_pos_emb, - )[0] - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.layer_norm2(hidden_states) - - hidden_states = self.mlp(hidden_states) - - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - return outputs - - -class MLCDEncoder(nn.Module): - """ - Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a - [`MLCDEncoderLayer`]. - - Args: - config: MLCDVisionConfig - """ - - def __init__(self, config: MLCDVisionConfig): - super().__init__() - self.config = config - self.layers = nn.ModuleList([MLCDEncoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - inputs_embeds, - rotary_pos_emb, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: - r""" - Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Causal mask for the text model. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
- """ - - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - - hidden_states = inputs_embeds - for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - encoder_layer.__call__, - hidden_states, - rotary_pos_emb - ) - else: - layer_outputs = encoder_layer( - hidden_states, - rotary_pos_emb - ) - - hidden_states = layer_outputs[0] - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, None] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=None, - ) - - -class MLCDVisionTransformer(nn.Module): - def __init__(self, config: MLCDVisionConfig): - super().__init__() - self.config = config - embed_dim = config.hidden_size - - self.embeddings = MLCDVisionEmbeddings(config) - self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - self.encoder = MLCDEncoder(config) - self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - - self.vision_rotary_embedding = VisionRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2) - self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2)) - - - def rot_pos_emb(self, grid_thw): - pos_ids = [] - for t, h, w in grid_thw: - hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) - hpos_ids = hpos_ids.reshape(h, 1, w, 1) - hpos_ids = hpos_ids.permute(0, 2, 1, 3) - hpos_ids = hpos_ids.flatten() - - wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) - wpos_ids = wpos_ids.reshape(h, 1, w, 1) - wpos_ids = wpos_ids.permute(0, 2, 1, 3) - wpos_ids = wpos_ids.flatten() - pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) - pos_ids = torch.cat(pos_ids, dim=0) - max_grid_size = grid_thw[:, 1:].max() - rotary_pos_emb_full = self.vision_rotary_embedding(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb - - - def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - # output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - - """ - - twh = (1, pixel_values.size(3) // self.config.patch_size, pixel_values.size(2) // self.config.patch_size) - rotary_pos_emb = self.rot_pos_emb(torch.tensor([twh], device=pixel_values.device)) - # rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0) - - # output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - - hidden_states = self.embeddings(pixel_values) - hidden_states = self.pre_layrnorm(hidden_states) - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - 
rotary_pos_emb=rotary_pos_emb, - # output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - pooled_output = last_hidden_state[:, 0, :] - pooled_output = self.post_layernorm(pooled_output) - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - # attentions=encoder_outputs.attentions, - ) - - -class MLCDPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - config_class = MLCDVisionConfig - base_model_prefix = "mlcd" - supports_gradient_checkpointing = True - _supports_sdpa = True - # _supports_flash_attn_2 = True - - def _init_weights(self, module): - """Initialize the weights""" - factor = self.config.initializer_factor - if isinstance(module, MLCDVisionEmbeddings): - factor = self.config.initializer_factor - nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) - nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) - elif isinstance(module, MLCDSdpaAttention): - factor = self.config.initializer_factor - in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - out_proj_std = (module.embed_dim**-0.5) * factor - nn.init.normal_(module.q_proj.weight, std=in_proj_std) - nn.init.normal_(module.k_proj.weight, std=in_proj_std) - nn.init.normal_(module.v_proj.weight, std=in_proj_std) - nn.init.normal_(module.out_proj.weight, std=out_proj_std) - elif isinstance(module, MLCDMLP): - factor = self.config.initializer_factor - in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - fc_std = (2 * module.config.hidden_size) ** -0.5 * factor - nn.init.normal_(module.fc1.weight, std=fc_std) - nn.init.normal_(module.fc2.weight, std=in_proj_std) - - - if isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - - -class MLCDVisionModel(MLCDPreTrainedModel): - config_class = MLCDVisionConfig - main_input_name = "pixel_values" - _no_split_modules = ["MLCDEncoderLayer"] - - def __init__(self, config: MLCDVisionConfig): - super().__init__(config) - self.vision_model = MLCDVisionTransformer(config) - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Module: - return self.vision_model.embeddings.patch_embedding - - def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - - Examples: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, MLCDVisionModel - - >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14") - >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14") - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, 
return_tensors="pt") - - >>> outputs = model(**inputs) - >>> last_hidden_state = outputs.last_hidden_state - >>> pooled_output = outputs.pooler_output # pooled CLS states - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - return self.vision_model( - pixel_values=pixel_values, - # output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) diff --git a/llava_next/llava/model/multimodal_encoder/mlcd_encoder.py b/llava_next/llava/model/multimodal_encoder/mlcd_encoder.py deleted file mode 100755 index 55f3c809..00000000 --- a/llava_next/llava/model/multimodal_encoder/mlcd_encoder.py +++ /dev/null @@ -1,196 +0,0 @@ -import torch -import torch.nn as nn -from llava.utils import rank0_print -# from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig -from transformers import CLIPImageProcessor -from .mlcd.vit_rope2d_hf import MLCDVisionModel, MLCDVisionConfig - -try: - from s2wrapper import forward as multiscale_forward -except: - pass - - -# class CLIPVisionTower(nn.Module): -class MLCDVisionTower(nn.Module): - def __init__(self, vision_tower, args, delay_load=False): - super().__init__() - - self.is_loaded = False - self.projector_type = getattr(args, "mm_projector_type", "patch_merger") - - self.vision_tower_name = vision_tower - self.select_layer = args.mm_vision_select_layer - self.select_feature = getattr(args, "mm_vision_select_feature", "patch") - - if not delay_load: - rank0_print(f"Loading vision tower: {vision_tower}") - self.load_model() - elif getattr(args, "unfreeze_mm_vision_tower", False): - # TODO: better detector is needed. - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") - self.load_model() - elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") - self.load_model() - else: - # self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) - self.cfg_only = MLCDVisionConfig.from_pretrained(self.vision_tower_name) - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - # self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower = MLCDVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - - self.is_loaded = True - - def feature_select(self, image_forward_outs): - select_feature_type = self.select_feature - - if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: - select_every_k_layer = len(image_forward_outs.hidden_states) // 4 - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1) - select_feature_type = select_feature_type.replace("slicefour_", "") - elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]: - select_layers = [-2, -5, -8, -11, 6] - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in select_layers], dim=-1) - select_feature_type = select_feature_type.replace("slice_m25811_f6_", "") - else: - 
image_features = image_forward_outs.hidden_states[self.select_layer] - - if select_feature_type == "patch": - image_features = image_features[:, 1:] - - elif select_feature_type == "patch_interpolate": - image_features = image_features[:, 1:] - # Add bilinear interpolation here - B, N, C = image_features.shape - H = W = int(N**0.5) - image_features = image_features.reshape(B, H, W, C).permute(0, 3, 1, 2) # B, H, W, C -> B, C, H, W - image_features = torch.nn.functional.interpolate(image_features, size=(H // 2, W // 2), mode='bilinear', align_corners=False) # B, C, H, W -> B, C, H/2, W/2 - image_features = image_features.permute(0, 2, 3, 1).reshape(B, (H // 2) * (W // 2), C) # B, C, H/2, W/2 -> B, H/2, W/2, C -> B, H/2*W/2, C - - elif select_feature_type == "cls_patch": - image_features = image_features - else: - raise ValueError(f"Unexpected select feature: {select_feature_type}") - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) - image_feature = self.feature_select(image_forward_out).to(image.dtype) - image_features.append(image_feature) - else: - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - - return image_features - - @property - def dummy_feature(self): - return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) - - @property - def dtype(self): - return self.vision_tower.dtype - - @property - def device(self): - return self.vision_tower.device - - @property - def config(self): - if self.is_loaded: - return self.vision_tower.config - else: - return self.cfg_only - - @property - def hidden_size(self): - _hidden_size = self.config.hidden_size - if "slicefour" in self.select_feature: - _hidden_size *= 4 - if "slice_m25811_f6" in self.select_feature: - _hidden_size *= 5 - return _hidden_size - - @property - def num_patches_per_side(self): - if self.select_feature == "patch_interpolate" or self.projector_type == "patch_merger": - return self.config.image_size // (self.config.patch_size * 2) - else: - return self.config.image_size // self.config.patch_size - - @property - def num_patches(self): - if self.select_feature == "patch_interpolate" or self.projector_type == "patch_merger": - _num_patches = (self.config.image_size // (self.config.patch_size * 2)) ** 2 - else: - _num_patches = (self.config.image_size // self.config.patch_size) ** 2 - if "cls_patch" in self.select_feature: - _num_patches += 1 - return _num_patches - - @property - def image_size(self): - return self.config.image_size - - -# class CLIPVisionTowerS2(CLIPVisionTower): -class MLCDVisionTowerS2(MLCDVisionTower): - def __init__(self, vision_tower, args, delay_load=False): - - self.s2_scales = getattr(args, "s2_scales", "336,672,1008") - self.s2_scales = list(map(int, self.s2_scales.split(","))) - self.s2_scales.sort() - self.s2_split_size = self.s2_scales[0] - self.s2_image_size = self.s2_scales[-1] - - super().__init__(vision_tower, args, delay_load) - - # change resize/crop size in preprocessing to the largest image size in s2_scale - if not delay_load or getattr(args, "unfreeze_mm_vision_tower", False): - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = 
self.s2_image_size - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - self.vision_tower = MLCDVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - self.is_loaded = True - - def forward_feature(self, images): - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_feature = multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - image_features.append(image_feature) - else: - image_features = multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - - return image_features - - @property - def hidden_size(self): - return self.config.hidden_size * len(self.s2_scales) diff --git a/llava_next/llava/model/multimodal_encoder/mlcd_encoder_wo_class_token.py b/llava_next/llava/model/multimodal_encoder/mlcd_encoder_wo_class_token.py deleted file mode 100755 index c562d6c2..00000000 --- a/llava_next/llava/model/multimodal_encoder/mlcd_encoder_wo_class_token.py +++ /dev/null @@ -1,182 +0,0 @@ -import torch -import torch.nn as nn -from llava.utils import rank0_print -# from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig -from transformers import CLIPImageProcessor -from .mlcd.vit_rope2d_hf_wo_class_token import MLCDVisionModel, MLCDVisionConfig -# from mlcd.vit_rope2d_hf import MLCDVisionModel, MLCDVisionConfig - -try: - from s2wrapper import forward as multiscale_forward -except: - pass - - -# class CLIPVisionTower(nn.Module): -class MLCDVisionTower(nn.Module): - def __init__(self, vision_tower, args, delay_load=False): - super().__init__() - - self.is_loaded = False - - self.vision_tower_name = vision_tower - self.select_layer = args.mm_vision_select_layer - self.select_feature = getattr(args, "mm_vision_select_feature", "patch") - - if not delay_load: - rank0_print(f"Loading vision tower: {vision_tower}") - self.load_model() - elif getattr(args, "unfreeze_mm_vision_tower", False): - # TODO: better detector is needed. 
- rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") - self.load_model() - elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") - self.load_model() - else: - # self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) - self.cfg_only = MLCDVisionConfig.from_pretrained(self.vision_tower_name) - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - # self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower = MLCDVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - - self.is_loaded = True - - def feature_select(self, image_forward_outs): - select_feature_type = self.select_feature - - if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: - select_every_k_layer = len(image_forward_outs.hidden_states) // 4 - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1) - select_feature_type = select_feature_type.replace("slicefour_", "") - elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]: - select_layers = [-2, -5, -8, -11, 6] - image_features = torch.cat([image_forward_outs.hidden_states[i] for i in select_layers], dim=-1) - select_feature_type = select_feature_type.replace("slice_m25811_f6_", "") - else: - image_features = image_forward_outs.hidden_states[self.select_layer] - - if select_feature_type == "patch": - # image_features = image_features[:, 1:] - image_features = image_features - - elif select_feature_type == "cls_patch": - image_features = image_features - else: - raise ValueError(f"Unexpected select feature: {select_feature_type}") - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) - image_feature = self.feature_select(image_forward_out).to(image.dtype) - image_features.append(image_feature) - else: - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - - return image_features - - @property - def dummy_feature(self): - return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) - - @property - def dtype(self): - return self.vision_tower.dtype - - @property - def device(self): - return self.vision_tower.device - - @property - def config(self): - if self.is_loaded: - return self.vision_tower.config - else: - return self.cfg_only - - @property - def hidden_size(self): - _hidden_size = self.config.hidden_size - if "slicefour" in self.select_feature: - _hidden_size *= 4 - if "slice_m25811_f6" in self.select_feature: - _hidden_size *= 5 - return _hidden_size - - @property - def num_patches_per_side(self): - return self.config.image_size // self.config.patch_size - - @property - def num_patches(self): 
- _num_patches = (self.config.image_size // self.config.patch_size) ** 2 - if "cls_patch" in self.select_feature: - _num_patches += 1 - return _num_patches - - @property - def image_size(self): - return self.config.image_size - - -# class CLIPVisionTowerS2(CLIPVisionTower): -class MLCDVisionTowerS2(MLCDVisionTower): - def __init__(self, vision_tower, args, delay_load=False): - - self.s2_scales = getattr(args, "s2_scales", "336,672,1008") - self.s2_scales = list(map(int, self.s2_scales.split(","))) - self.s2_scales.sort() - self.s2_split_size = self.s2_scales[0] - self.s2_image_size = self.s2_scales[-1] - - super().__init__(vision_tower, args, delay_load) - - # change resize/crop size in preprocessing to the largest image size in s2_scale - if not delay_load or getattr(args, "unfreeze_mm_vision_tower", False): - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - self.vision_tower = MLCDVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - self.is_loaded = True - - def forward_feature(self, images): - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_feature = multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - image_features.append(image_feature) - else: - image_features = multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - - return image_features - - @property - def hidden_size(self): - return self.config.hidden_size * len(self.s2_scales) diff --git a/llava_next/llava/model/multimodal_encoder/onevision_encoder.py b/llava_next/llava/model/multimodal_encoder/onevision_encoder.py new file mode 100644 index 00000000..48419dbb --- /dev/null +++ b/llava_next/llava/model/multimodal_encoder/onevision_encoder.py @@ -0,0 +1,149 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch OneVision Encoder model.""" + +import torch +import torch.nn as nn + +from transformers import AutoConfig, AutoImageProcessor, AutoModel +from transformers.utils import logging + +from llava.utils import rank0_print + +logger = logging.get_logger(__name__) + + +class OneVisionEncoderTower(nn.Module): + """ + Vision Tower wrapper for LlavaViT model, compatible with LLaVA framework. + """ + def __init__(self, vision_tower, vision_tower_cfg=None, delay_load=False): + super().__init__() + + self.is_loaded = False + self.vision_tower_name = vision_tower + self.select_layer = vision_tower_cfg.mm_vision_select_layer if vision_tower_cfg is not None else None + + # Default config - will be updated after loading + self.config = AutoConfig.from_pretrained(self.vision_tower_name, trust_remote_code=True) + + self.image_processor = AutoImageProcessor.from_pretrained(self.vision_tower_name, trust_remote_code=True) + + if not delay_load: + rank0_print(f"Loading vision tower: {vision_tower}") + self.load_model() + elif getattr(vision_tower_cfg, "unfreeze_mm_vision_tower", False): + rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") + self.load_model() + elif hasattr(vision_tower_cfg, "mm_tunable_parts") and "mm_vision_tower" in vision_tower_cfg.mm_tunable_parts: + rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") + self.load_model() + else: + self.cfg_only = self.config + + def load_model(self, device_map=None): + if self.is_loaded: + rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) + return + + self.vision_tower = AutoModel.from_pretrained(self.vision_tower_name, trust_remote_code=True) + + # Update config from loaded model + self.config = self.vision_tower.config + + self.is_loaded = True + + def forward(self, images, grid_thw=None, visible_indices=None): + """ + Forward pass for the vision tower. 
+ + Args: + images: Can be: + - Tensor of shape (B, C, H, W) for single images + - Tensor of shape (B, C, T, H, W) for video + - List of tensors + grid_thw: Optional grid info for variable resolution + + Returns: + image_features: Tensor of shape (B, num_patches, hidden_size) + """ + if isinstance(images, list): + image_features = [] + for image in images: + image_forward_out = self.vision_tower( + image.to(device=self.device, dtype=self.dtype).unsqueeze(0), + output_hidden_states=True, + visible_indices=visible_indices + ) + image_feature = image_forward_out.hidden_states[-1].to(image.dtype) + image_features.append(image_feature) + image_features = torch.cat(image_features, dim=0) + else: + # Handle tensor input + pixel_values = images.to(device=self.device, dtype=self.dtype) + + # Ensure correct shape: (B, C, H, W) or (B, C, T, H, W) + if pixel_values.dim() == 3: + # (C, H, W) -> (1, C, H, W) + pixel_values = pixel_values.unsqueeze(0) + bs = pixel_values.shape[0] + if bs == 8: # FIXME hardcoded for 8 images input as video sample + # (B, C, T, H, W) -> (1, C, B*T, H, W) + pixel_values = pixel_values.unsqueeze(0).permute(0, 2, 1, 3, 4) + image_forward_outs = self.vision_tower( + pixel_values, + output_hidden_states=True, + visible_indices=visible_indices + ) + + # Get last hidden state (before head if exists) + if self.select_layer is not None: + image_features = image_forward_outs.hidden_states[self.select_layer] + else: + image_features = image_forward_outs.hidden_states[-2] + if bs == 8: # FIXME hardcoded for 8 images input as video sample + image_features = image_features.squeeze(0).reshape(8, -1, self.hidden_size) + + return image_features + + @property + def dummy_feature(self): + return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) + + @property + def dtype(self): + for p in self.vision_tower.parameters(): + return p.dtype + + @property + def device(self): + for p in self.vision_tower.parameters(): + return p.device + + @property + def hidden_size(self): + return self.config.hidden_size + + @property + def num_patches(self): + return (self.config.image_size // self.config.patch_size) ** 2 + + @property + def num_patches_per_side(self): + return self.config.image_size // self.config.patch_size + + @property + def image_size(self): + return self.config.image_size diff --git a/llava_next/llava/model/multimodal_encoder/open_clip_encoder.py b/llava_next/llava/model/multimodal_encoder/open_clip_encoder.py deleted file mode 100755 index 17a3277f..00000000 --- a/llava_next/llava/model/multimodal_encoder/open_clip_encoder.py +++ /dev/null @@ -1,163 +0,0 @@ -import torch -import torch.nn as nn -from transformers import CLIPImageProcessor -from llava.utils import rank0_print - -try: - import open_clip - import torchvision - from open_clip.transformer import _expand_token -except ImportError: - print("OpenCLIP not installed") - open_clip = None - -HIDDEN_SIZE_DICT = { - "ViT-H-14-378-quickgelu": 1280, -} - - -class OpenCLIPVisionTower(nn.Module): - def __init__(self, vision_tower, args, delay_load=False): - super().__init__() - - self.is_loaded = False - self.model_name = vision_tower.replace("open_clip_hub:", "") - self.pretrained = args.vision_tower_pretrained - self.select_layer = args.mm_vision_select_layer - self.select_feature = getattr(args, "mm_vision_select_feature", "patch") - - if not delay_load: - rank0_print(f"Loading vision tower: {vision_tower}") - self.load_model() - elif getattr(args, "unfreeze_mm_vision_tower", False): - # TODO: better detector is 
needed. - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") - self.load_model() - elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") - self.load_model() - - def load_model(self, device_map="auto"): - rank0_print(f"Loading OpenCLIP model: {self.model_name}") - rank0_print(f"Pretrained: {self.pretrained}") - vision_tower, _, image_processor = open_clip.create_model_and_transforms(model_name=self.model_name, pretrained=self.pretrained, precision="fp32", device="cuda") - - resize_transform = [t for t in image_processor.transforms if isinstance(t, torchvision.transforms.Resize)][0] - normalize_transform = [t for t in image_processor.transforms if isinstance(t, torchvision.transforms.Normalize)][0] - self.resize_transform_size = resize_transform.size # 224 or 384 - self.patch_size = vision_tower.visual.conv1.kernel_size[0] # 14 or 16 - - self.image_processor = CLIPImageProcessor.from_pretrained( - "openai/clip-vit-large-patch14", - crop_size=resize_transform.size, - size={"shortest_edge": resize_transform.size}, - image_mean=list(normalize_transform.mean), - image_std=list(normalize_transform.std), - ) - rank0_print(f"Loaded image processor: {self.image_processor}") - self.vision_tower = vision_tower.visual - self.vision_tower.requires_grad_(False) - - self.is_loaded = True - - def feature_select(self, image_forward_outs): - image_features = image_forward_outs[self.select_layer] - if self.select_feature == "patch": - image_features = image_features[:, 1:] - elif self.select_feature == "cls_patch": - image_features = image_features - elif self.select_feature == "conv_flatten": - image_features = image_features.flatten(2).transpose(1, 2) - else: - raise ValueError(f"Unexpected select feature: {self.select_feature}") - return image_features - - def forward_visual(self, x, output_hidden_states=False): - if hasattr(self.vision_tower, "trunk") and hasattr(self.vision_tower.trunk, "_intermediate_layers"): - return self.vision_tower.trunk._intermediate_layers(x, abs(self.select_layer)) - else: - - def forward_openclip(self, x: torch.Tensor): - features = [] - x = self.conv1(x) # shape = [*, width, grid, grid] - x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] - x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] - - # class embeddings and positional embeddings - x = torch.cat( - [_expand_token(self.class_embedding, x.shape[0]).to(x.dtype), x], - dim=1, - ) - # shape = [*, grid ** 2 + 1, width] - x = x + self.positional_embedding.to(x.dtype) - - x = self.patch_dropout(x) - x = self.ln_pre(x) - - x = x.permute(1, 0, 2) # NLD -> LND - for r in self.transformer.resblocks: - x = r(x, attn_mask=None) - features.append(x) - return features - - return forward_openclip(self.vision_tower, x) - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_forward_out = self.forward_visual(image.to(self.dtype).unsqueeze(0), output_hidden_states=True) - image_feature = self.feature_select(image_forward_out).to(image.dtype) - image_features.append(image_feature) - else: - image_forward_outs = self.forward_visual(images.to(self.dtype), output_hidden_states=True) - image_features = self.feature_select(image_forward_outs).to(images.dtype) - - return image_features - - @property - def dummy_feature(self): - return torch.zeros(1, 
self.hidden_size, device=self.device, dtype=self.dtype) - - @property - def dtype(self): - if hasattr(self.vision_tower, "conv1"): - return self.vision_tower.conv1.weight.dtype - if hasattr(self.vision_tower, "trunk"): - return self.vision_tower.trunk.patch_embed.proj.weight.dtype - raise NotImplementedError - - @property - def device(self): - if hasattr(self.vision_tower, "conv1"): - return self.vision_tower.conv1.weight.device - if hasattr(self.vision_tower, "trunk"): - return self.vision_tower.trunk.patch_embed.proj.weight.device - raise NotImplementedError - - @property - def config(self): - return None - - @property - def hidden_size(self): - if self.model_name in HIDDEN_SIZE_DICT: - return HIDDEN_SIZE_DICT[self.model_name] - else: - raise NotImplementedError - - @property - def num_patches(self): - image_size = self.resize_transform_size if isinstance(self.resize_transform_size, int) else self.resize_transform_size[0] - _num_patches = (image_size // self.patch_size) ** 2 - if "cls_patch" in self.select_feature: - _num_patches += 1 - return _num_patches - - @property - def image_size(self): - return self.resize_transform_size - - @property - def num_patches_per_side(self): - return self.resize_transform_size // self.patch_size diff --git a/llava_next/llava/model/multimodal_encoder/siglip2_naflex.py b/llava_next/llava/model/multimodal_encoder/siglip2_naflex.py index a400d193..ce3b6547 100644 --- a/llava_next/llava/model/multimodal_encoder/siglip2_naflex.py +++ b/llava_next/llava/model/multimodal_encoder/siglip2_naflex.py @@ -1,37 +1,24 @@ -""" -# Adapted from https://huggingface.co/MILVLG/imp-v1-3b/blob/main/vision_encoder.py -""" - -from typing import Optional, Tuple, Union, Dict, List, Callable -from collections import defaultdict -from transformers.utils import TensorType, logging - +from typing import Optional, Tuple, Union, List, Callable from dataclasses import dataclass from functools import lru_cache -from functools import partial, reduce -from PIL import Image + +import math +import os + +import numpy as np import torch +import torch.nn.functional as F import torch.utils.checkpoint from torch import nn -import numpy as np -import math -import os -from transformers.image_processing_utils import BatchFeature, get_size_dict, BaseImageProcessor + +from transformers import PretrainedConfig +from transformers.activations import ACT2FN +from transformers.image_processing_utils import BatchFeature, BaseImageProcessor from transformers.image_transforms import ( convert_to_rgb, - normalize, - rescale, resize, to_channel_dimension_format, ) -import torch.nn.functional as F -from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask -from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling -from transformers.modeling_utils import PreTrainedModel, ALL_ATTENTION_FUNCTIONS -from transformers import PretrainedConfig -from transformers.utils import ModelOutput -from llava.utils import rank0_print from transformers.image_utils import ( ChannelDimension, ImageInput, @@ -43,6 +30,13 @@ valid_images, validate_preprocess_arguments, ) +from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask +from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from transformers.modeling_utils import PreTrainedModel, ALL_ATTENTION_FUNCTIONS +from transformers.utils import TensorType, logging, ModelOutput + +from llava.utils import rank0_print + logger = 
logging.get_logger(__name__) @lru_cache(maxsize=256) @@ -285,7 +279,8 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) + if hasattr(cls, '_set_token_in_kwargs'): + cls._set_token_in_kwargs(kwargs) config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) @@ -581,6 +576,8 @@ class SigLip2PreTrainedModel(PreTrainedModel): config_class = SigLip2VisionConfig base_model_prefix = "siglip2" supports_gradient_checkpointing = True + _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module): """Initialize the weights""" @@ -682,7 +679,7 @@ def __init__(self, config: SigLip2VisionConfig): self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" def forward( - self, + self, pixel_values: torch.FloatTensor, attention_mask: torch.Tensor, spatial_shapes: torch.LongTensor, @@ -690,8 +687,8 @@ def forward( output_hidden_states: Optional[bool] = None, ) -> BaseModelOutputWithPooling: r""" - Returns: - + spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`): + Tensor containing the spatial dimensions (height, width) of the input images. """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -821,6 +818,7 @@ def __init__(self, vision_tower, vision_tower_cfg, delay_load=False): self.config = SigLip2VisionConfig() self.vision_tower_name = vision_tower + self.select_layer = vision_tower_cfg.mm_vision_select_layer self.image_processor = SigLip2ImageProcessor() @@ -842,15 +840,15 @@ def load_model(self, device_map=None): rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) return - self.vision_tower = SigLip2VisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) - - del self.vision_tower.vision_model.encoder.layers[-1:] - # self.vision_tower.vision_model.head = nn.Identity() - self.vision_tower.requires_grad_(False) + self.vision_tower = SigLip2VisionModel.from_pretrained( + self.vision_tower_name, + device_map=device_map, + attn_implementation="flash_attention_2" + ) self.is_loaded = True - def forward(self, images): + def forward(self, images, grid_thw=None, visible_indices=None): if type(images) is list: image_features = [] for image in images: @@ -859,7 +857,7 @@ def forward(self, images): assert image_features.shape[-2] == 729 image_features.append(image_feature) elif hasattr(images, 'keys'): - # 原有的字典处理逻辑 + # Handle dictionary input format pixel_values = images['pixel_values'].to(device=self.device, dtype=self.dtype) spatial_shapes = images['spatial_shapes'].to(device=self.device) pixel_attention_mask = images.get('pixel_attention_mask', None) @@ -874,24 +872,31 @@ def forward(self, images): ) image_features = image_forward_outs.hidden_states[-1] else: - # 新增:处理直接传入张量的情况 - # 假定张量形状为 [batch_size, height, width, channels] 或 [batch_size, sequence_length, hidden_size] + # Handle direct tensor input + # Assumes tensor shape is [batch_size, height, width, channels] or [batch_size, sequence_length, hidden_size] batch_size = images.shape[0] - # assert 1==2, f'images.shape:{images.shape}' - # 如果是原始图像张量,需要进行预处理 - if len(images.shape) == 4: # [B, H, W, C] 格式 - # 这里可能需要调用图像处理器进行处理 + # If raw image tensor, preprocessing is needed + if len(images.shape) == 4: # [B, H, W, C] format + # Call image processor to preprocess processed = 
self.image_processor.preprocess(images, return_tensors="pt") pixel_values = processed['pixel_values'].to(device=self.device, dtype=self.dtype) spatial_shapes = processed['spatial_shapes'].to(device=self.device) pixel_attention_mask = None - else: # 假设已经是特征或标记化的张量 - # 直接使用传入的张量作为像素值 + else: pixel_values = images.to(device=self.device, dtype=self.dtype) - feat_w, feat_h = int(pixel_values.shape[1]**0.5), int(pixel_values.shape[1]**0.5) - # 为每个批次项目估计标准形状 (35, 35) - spatial_shapes = torch.tensor([[feat_w, feat_h]] * batch_size, device=self.device) + if grid_thw is not None: + spatial_shapes = [] + for b in range(batch_size): + thw = grid_thw[b] + feat_h = thw[1] + feat_w = thw[2] + spatial_shapes.append([feat_h, feat_w]) + spatial_shapes = torch.tensor(spatial_shapes, device=self.device) + else: + feat_w, feat_h = int(pixel_values.shape[1]**0.5), int(pixel_values.shape[1]**0.5) + # Estimate standard shape for each batch item + spatial_shapes = torch.tensor([[feat_h, feat_w]] * batch_size, device=self.device) pixel_attention_mask = None image_forward_outs = self.vision_tower( @@ -900,7 +905,10 @@ def forward(self, images): spatial_shapes, output_hidden_states=True ) - image_features = image_forward_outs.hidden_states[-1] + if self.select_layer is not None: + image_features = image_forward_outs.hidden_states[self.select_layer] + else: + image_features = image_forward_outs.hidden_states[-2] return image_features @@ -935,19 +943,3 @@ def num_patches_per_side(self): @property def image_size(self): return 560 - - -# vision_tower = SigLip2NaflexVisionTower("/vlm/pretrain_models/SigLIP2/siglip2-so400m-patch16-naflex", vision_tower_cfg={}) - -# image_path = '/vlm/yinxie/code/sa_514193.jpg' -# from PIL import Image -# image = Image.open(image_path).convert("RGB") -# image = image.resize((560, 560), Image.BICUBIC) # Resize to 256x256 -# image = vision_tower.image_processor.preprocess(image, do_resize=False, return_tensors="pt") -# print('image.shape', image['pixel_values'][0].shape) # Should be (1, 256, 256, 3) -# vision_tower.to(vision_tower.device) -# vision_tower.eval() -# with torch.no_grad(): -# image_features = vision_tower(image) -# print(image_features.shape) # Should be (1, 768, 729) -# print(vision_tower) \ No newline at end of file diff --git a/llava_next/llava/model/multimodal_encoder/siglip_encoder.py b/llava_next/llava/model/multimodal_encoder/siglip_encoder.py deleted file mode 100755 index f1e101a2..00000000 --- a/llava_next/llava/model/multimodal_encoder/siglip_encoder.py +++ /dev/null @@ -1,620 +0,0 @@ -""" -# Adapted from https://huggingface.co/MILVLG/imp-v1-3b/blob/main/vision_encoder.py -""" - -from typing import Optional, Tuple, Union, Dict -from dataclasses import dataclass -from functools import partial, reduce -from PIL import Image -import torch -import torch.utils.checkpoint -from torch import nn -import os -from transformers.image_processing_utils import BatchFeature, get_size_dict -from transformers.image_transforms import ( - convert_to_rgb, - normalize, - rescale, - resize, - to_channel_dimension_format, -) -from transformers.image_utils import ( - ChannelDimension, - PILImageResampling, - to_numpy_array, -) -from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling -from transformers.modeling_utils import PreTrainedModel -from transformers import PretrainedConfig -from transformers.utils import ModelOutput -from llava.utils import rank0_print - - -class SigLipImageProcessor: - def __init__(self, 
image_mean=(0.5, 0.5, 0.5), image_std=(0.5, 0.5, 0.5), size=(384, 384), crop_size: Dict[str, int] = None, resample=PILImageResampling.BICUBIC, rescale_factor=1 / 255, data_format=ChannelDimension.FIRST): - crop_size = crop_size if crop_size is not None else {"height": 384, "width": 384} - crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") - - self.image_mean = image_mean - self.image_std = image_std - self.size = size - self.resample = resample - self.rescale_factor = rescale_factor - self.data_format = data_format - self.crop_size = crop_size - - def preprocess(self, images, return_tensors): - if isinstance(images, Image.Image): - images = [images] - else: - # to adapt video data - images = [to_numpy_array(image) for image in images] - assert isinstance(images, list) - - transforms = [ - convert_to_rgb, - to_numpy_array, - partial(resize, size=self.size, resample=self.resample, data_format=self.data_format), - partial(rescale, scale=self.rescale_factor, data_format=self.data_format), - partial(normalize, mean=self.image_mean, std=self.image_std, data_format=self.data_format), - partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format), - ] - - images = reduce(lambda x, f: [*map(f, x)], transforms, images) - data = {"pixel_values": images} - - return BatchFeature(data=data, tensor_type=return_tensors) - - -class SigLipVisionConfig(PretrainedConfig): - model_type = "siglip_vision_model" - - def __init__( - self, - hidden_size=1152, - image_mean=(0.5, 0.5, 0.5), - intermediate_size=4304, - num_hidden_layers=27, - num_attention_heads=16, - num_channels=3, - image_size=384, - patch_size=14, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.image_mean = image_mean - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from SigLipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - print(f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors.") - - return cls.from_dict(config_dict, **kwargs) - - -@dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->SigLip -class SigLipVisionModelOutput(ModelOutput): - """ - Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. - - Args: - image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): - The image embeddings obtained by applying the projection layer to the pooler_output. 
- last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - image_embeds: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -class SigLipVisionEmbeddings(nn.Module): - def __init__(self, config: SigLipVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.image_size = config.image_size - self.patch_size = config.patch_size - - self.patch_embedding = nn.Conv2d( - in_channels=config.num_channels, - out_channels=self.embed_dim, - kernel_size=self.patch_size, - stride=self.patch_size, - padding="valid", - ) - - self.num_patches = (self.image_size // self.patch_size) ** 2 - self.num_positions = self.num_patches - self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] - embeddings = patch_embeds.flatten(2).transpose(1, 2) - - embeddings = embeddings + self.position_embedding(self.position_ids) - return embeddings - - -class SigLipAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__ - def __init__(self, config): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError(f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" f" {self.num_heads}).") - self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout - - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - batch_size, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - 
key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - k_v_seq_len = key_states.shape[-2] - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale - - if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len): - raise ValueError(f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" f" {attn_weights.size()}") - - if attention_mask is not None: - if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): - raise ValueError(f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}") - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim): - raise ValueError(f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is" f" {attn_output.size()}") - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights - - -# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->SigLip -class SigLipMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.activation_fn = ACT2FN[config.hidden_act] - self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) - self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.fc1(hidden_states) - hidden_states = self.activation_fn(hidden_states) - hidden_states = self.fc2(hidden_states) - return hidden_states - - -# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->SigLip -class SigLipEncoderLayer(nn.Module): - def __init__(self, config: SigLipVisionConfig): - super().__init__() - self.embed_dim = config.hidden_size - self.self_attn = SigLipAttention(config) - self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - self.mlp = SigLipMLP(config) - self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - - # Ignore copy - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: torch.Tensor, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): - Input to the layer of shape `(batch, seq_len, embed_dim)`. - attention_mask (`torch.FloatTensor`): - Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. 
- """ - residual = hidden_states - - hidden_states = self.layer_norm1(hidden_states) - hidden_states, attn_weights = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - ) - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.layer_norm2(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs - - -class SigLipPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = SigLipVisionConfig - base_model_prefix = "siglip" - supports_gradient_checkpointing = True - - def _init_weights(self, module): - """Initialize the weights""" - pass - - -# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->SigLip -class SigLipEncoder(nn.Module): - """ - Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a - [`SigLipEncoderLayer`]. - - Args: - config: SigLipVisionConfig - """ - - def __init__(self, config: SigLipVisionConfig): - super().__init__() - self.config = config - self.layers = nn.ModuleList([SigLipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - # Ignore copy - def forward( - self, - inputs_embeds, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: - r""" - Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
- """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - - hidden_states = inputs_embeds - for encoder_layer in self.layers: - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - encoder_layer.__call__, - hidden_states, - attention_mask, - output_attentions, - ) - else: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - output_attentions=output_attentions, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions) - - -class SigLipVisionTransformer(nn.Module): - def __init__(self, config: SigLipVisionConfig): - super().__init__() - self.config = config - embed_dim = config.hidden_size - - self.embeddings = SigLipVisionEmbeddings(config) - self.encoder = SigLipEncoder(config) - self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - self.head = SigLipMultiheadAttentionPoolingHead(config) - - def forward( - self, - pixel_values, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - hidden_states = self.embeddings(pixel_values) - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - last_hidden_state = self.post_layernorm(last_hidden_state) - - pooled_output = self.head(last_hidden_state) - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - -class SigLipMultiheadAttentionPoolingHead(nn.Module): - """Multihead Attention Pooling.""" - - def __init__(self, config: SigLipVisionConfig): - super().__init__() - - self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) - self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True) - self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.mlp = SigLipMLP(config) - - def forward(self, hidden_state): - batch_size = hidden_state.shape[0] - probe = self.probe.repeat(batch_size, 1, 1) - - 
hidden_state = self.attention(probe, hidden_state, hidden_state)[0] - - residual = hidden_state - hidden_state = self.layernorm(hidden_state) - hidden_state = residual + self.mlp(hidden_state) - - return hidden_state[:, 0] - - -class SigLipVisionModel(SigLipPreTrainedModel): - config_class = SigLipVisionConfig - main_input_name = "pixel_values" - _no_split_modules = ["SigLipEncoderLayer"] - - def __init__(self, config: SigLipVisionConfig): - super().__init__(config) - - self.vision_model = SigLipVisionTransformer(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Module: - return self.vision_model.embeddings.patch_embedding - - def forward( - self, - pixel_values, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - - Examples: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, SigLipVisionModel - - >>> model = SigLipVisionModel.from_pretrained("google/siglip-base-patch16-224") - >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224") - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, return_tensors="pt") - - >>> outputs = model(**inputs) - >>> last_hidden_state = outputs.last_hidden_state - >>> pooled_output = outputs.pooler_output # pooled features - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - return self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - -class SigLipVisionTower(nn.Module): - def __init__(self, vision_tower, vision_tower_cfg, delay_load=False): - super().__init__() - - self.is_loaded = False - - self.config = SigLipVisionConfig() - - self.vision_tower_name = vision_tower - - self.image_processor = SigLipImageProcessor() - - if not delay_load: - rank0_print(f"Loading vision tower: {vision_tower}") - self.load_model() - elif getattr(vision_tower_cfg, "unfreeze_mm_vision_tower", False): - # TODO: better detector is needed. 
- rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") - self.load_model() - elif hasattr(vision_tower_cfg, "mm_tunable_parts") and "mm_vision_tower" in vision_tower_cfg.mm_tunable_parts: - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") - self.load_model() - else: - self.cfg_only = self.config - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.vision_tower = SigLipVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) - - del self.vision_tower.vision_model.encoder.layers[-1:] - self.vision_tower.vision_model.head = nn.Identity() - self.vision_tower.requires_grad_(False) - - self.is_loaded = True - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) - image_feature = image_forward_out.hidden_states[-1].to(image.dtype) - assert image_features.shape[-2] == 729 - image_features.append(image_feature) - else: - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - image_features = image_forward_outs.hidden_states[-1].to(images.dtype) - assert image_features.shape[-2] == 729 - - return image_features - - @property - def dummy_feature(self): - return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) - - @property - def dtype(self): - for p in self.vision_tower.parameters(): - return p.dtype - - @property - def device(self): - for p in self.vision_tower.parameters(): - return p.device - - @property - def hidden_size(self): - return self.config.hidden_size - - @property - def num_patches(self): - return (self.config.image_size // self.config.patch_size) ** 2 - - @property - def num_patches_per_side(self): - return self.config.image_size // self.config.patch_size - # return self.model_config["vision_cfg"]["image_size"] // self.model_config["vision_cfg"]["patch_size"] - - @property - def image_size(self): - return self.config.image_size diff --git a/llava_next/llava/model/multimodal_encoder/unit/LICENSE b/llava_next/llava/model/multimodal_encoder/unit/LICENSE deleted file mode 100755 index 261eeb9e..00000000 --- a/llava_next/llava/model/multimodal_encoder/unit/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. 
- - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/llava_next/llava/model/multimodal_encoder/unit/README.md b/llava_next/llava/model/multimodal_encoder/unit/README.md deleted file mode 100755 index 2524c60c..00000000 --- a/llava_next/llava/model/multimodal_encoder/unit/README.md +++ /dev/null @@ -1,80 +0,0 @@ -## UNIT: Unifying Image and Text Recognition in One Vision Encoder - -- Paper: [Arxiv](https://arxiv.org/abs/2409.04095) [NeurIPS 2024] -- Model: [UNIT_600M](https://huggingface.co/yeeaa/UNIT_600M/tree/main), [UNIT_1B](https://huggingface.co/yeeaa/UNIT_1B/tree/main) - - -## Install -This project supports both NVIDIA and Ascend GPUs. - -- Dependencies & Environment - - Python >= 3.9 - - NVIDIA GPU, CUDA >= 11.7 - - ASCEND NPU (Recommend to use 910B), CANN 8.0.0 RC1, torch-npu = 2.1.0 - -- Install pytorch packages -```Shell -pip install torch==2.1.0 -pip install timm==0.9.12 -pip install transformers==4.32.1 -``` - - -## Usage - -```Python -import torch -from PIL import Image -from transformers import CLIPImageProcessor - -from unit import UNITModel - -### uncomment to use Ascend NPU -# import torch_npu -# from torch_npu.npu import amp -# from torch_npu.contrib import transfer_to_npu - -# use UNIT_600M model -model_path = "/path/to/UNIT_600M/" -### uncomment to use UNIT_1B model -# model_path = "/path/to/UNIT_1B/" - -model = UNITModel.from_pretrained(model_path) - -model.to(device='cuda') -model.eval() - -image_processor = CLIPImageProcessor.from_pretrained(model_path) - -image = Image.open("test.jpg").convert('RGB') - -image_input = image_processor(image)['pixel_values'][0] -image_tensor = torch.tensor(image_input).unsqueeze(0).to(torch.bfloat16).cuda() - -with torch.set_grad_enabled(False): - cls_tokens, spatial_tokens = model(image_tensor) - -### Note: Applying a LayerNorm layer to these tokens is crucial before feeding them into LLMs. -``` - -## Results -- MLLM downstrean tasks - -| Method | GQA | OKVQA | ChartQA | DocVQA | InfoVQA | OCRBench | POPE | MME | SEED-Image | MathVista | -| ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | -| CLIP-L | 62.34 | 56.97 | 51.96 | 57.19 | 29.31 | 382 | 84.67 | 1503.60| 69.79 | 42.7 | -| SigLIP | 63.02 | 61.06 | 56.48 | 61.97 | 29.70 | 429 | 85.93 | 1489.37 | 71.63 | 44.2 | -| UNIT-600M | 63.89 | 61.52| 61.0 | 65.49 | 31.92 | 480| 85.81 | 1529.76 | 72.81 | 44.6 | -| UNIT-1B | 64.90 | 56.78 | 66.64 | 71.34 | 34.81 | 540 | 87.54 | 1531.92 | 73.15 | 44.3 | - -## Citation -If you use the code in your research, please cite: - -```bib -@INPROCEEDINGS{Zhu2024UNIT, - author = {Zhu, Yi and Zhou, Yanpeng and Wang, Chunwei and Cao, Yang and Han, Jianhua and Hou, Lu and Xu, Hang.}, - title = {UNIT: Unifying Image and Text Recognition in One Vision Encoder}, - booktitle = {The Thirty-eighth Annual Conference on Neural Information Processing Systems (NeurIPS)}, - year = {2024} -} -``` diff --git a/llava_next/llava/model/multimodal_encoder/unit/__init__.py b/llava_next/llava/model/multimodal_encoder/unit/__init__.py deleted file mode 100755 index 4497bc50..00000000 --- a/llava_next/llava/model/multimodal_encoder/unit/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .hf_model import UNITModel \ No newline at end of file diff --git a/llava_next/llava/model/multimodal_encoder/unit/cls_token.py b/llava_next/llava/model/multimodal_encoder/unit/cls_token.py deleted file mode 100755 index 1d3376c1..00000000 --- a/llava_next/llava/model/multimodal_encoder/unit/cls_token.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. 
-# -# NVIDIA CORPORATION and its licensors retain all intellectual property -# and proprietary rights in and to this software, related documentation -# and any modifications thereto. Any use, reproduction, disclosure or -# distribution of this software and related documentation without an express -# license agreement from NVIDIA CORPORATION is strictly prohibited. - -import torch -from torch import nn - - -class ClsToken(nn.Module): - def __init__(self, ndim: int, - num_tokens: int = 1, - enabled: bool = True, - register_multiple: int = 0, - ): - super().__init__() - - self.ndim = ndim - self.enabled = enabled - self.num_registers = 0 - self.num_tokens = num_tokens - if enabled: - if register_multiple > 0: - self.num_registers = register_multiple - (num_tokens % register_multiple) - - scale = ndim ** -0.5 - self.token = nn.Parameter(torch.randn(num_tokens + self.num_registers, ndim) * scale) - else: - self.token = None - - self.num_patches = self.num_tokens + self.num_registers - - def disable(self): - self.token = None - self.enabled = False - - def forward(self, x: torch.Tensor): - if self.token is None: - return x - - token = self.token.unsqueeze(0).expand(x.shape[0], -1, -1) - x = torch.cat([ - token, - x, - ], dim=1) - - return x - - def no_weight_decay(self): - return [ - 'token', - ] \ No newline at end of file diff --git a/llava_next/llava/model/multimodal_encoder/unit/hf_model.py b/llava_next/llava/model/multimodal_encoder/unit/hf_model.py deleted file mode 100755 index 988c6aac..00000000 --- a/llava_next/llava/model/multimodal_encoder/unit/hf_model.py +++ /dev/null @@ -1,146 +0,0 @@ -from collections import namedtuple -from typing import Optional, Union, Tuple - -import torch -from torch import nn - -from timm.models import VisionTransformer, create_model, checkpoint_seq - -from transformers import PretrainedConfig, PreTrainedModel - -import timm -from timm.models.vision_transformer import _create_vision_transformer -from timm.models.registry import register_model - -from .input_conditioner import get_default_conditioner, InputConditioner -from .vit_patch_generator import ViTPatchGenerator - -@register_model -def vit_huge_patch14_224_1B(pretrained: bool = False, **kwargs) -> VisionTransformer: - model_args = dict(patch_size=14, embed_dim=1280, depth=50, num_heads=16) - model = _create_vision_transformer('vit_huge_patch14_224_1B', pretrained=pretrained, **dict(model_args, **kwargs)) - return model - - -class UNITConfig(PretrainedConfig): - """Pretrained Hugging Face configuration for RADIO models.""" - - def __init__( - self, - args: Optional[dict] = None, - version: Optional[str] = "v1", - **kwargs, - ): - self.args = args - self.version = version - super().__init__(**kwargs) - -class UNITModelEncoder(nn.Module): - def __init__( - self, - model: nn.Module, - max_img_size: Union[int, Tuple[int, int]] = 1024, - num_cls_tokens: int = 1, - register_multiple: int = 0, - pos_dropout: float = 0.1, - ): - super().__init__() - - self.model = model - - input_conditioner: InputConditioner = get_default_conditioner() - self.input_conditioner = input_conditioner - - patch_size = model.patch_embed.patch_size[0] - embed_dim = model.embed_dim - input_dims = model.patch_embed.img_size - normalize_patches = not isinstance(model.patch_embed.norm, nn.Identity) - cls_token = model.cls_token is not None - - max_img_size = int(round(max_img_size / patch_size) * patch_size) - - patch_generator = ViTPatchGenerator( - patch_size=patch_size, - embed_dim=embed_dim, - input_dims=input_dims, - 
normalize_patches=normalize_patches, - cls_token=cls_token, - max_input_dims=max_img_size, - pos_dropout=pos_dropout, - num_cls_tokens=num_cls_tokens, - register_multiple=register_multiple, - - ) - - model.patch_generator = patch_generator - model.patch_embed = None - model.cls_token = None - model.pos_embed = None - model.pos_drop = None - model.num_cls_tokens = num_cls_tokens - model.num_registers = patch_generator.num_registers - self.ln_final = torch.nn.LayerNorm(embed_dim, eps=1e-6) - - - - def forward(self, x: torch.Tensor): - x = self.input_conditioner(x) - x = self.model.patch_generator(x) - - if self.model.grad_checkpointing and not torch.jit.is_scripting(): - x = checkpoint_seq(self.model.blocks, x) - else: - x = self.model.blocks(x) - x = self.model.norm(x) - - cls_tokens = x[:, 0] - spatial_tokens = x[:, self.model.patch_generator.num_skip:] - - # Apply layer normalization to the cls_tokens and spatial_tokens - spatial_tokens = self.ln_final(spatial_tokens) - return cls_tokens, spatial_tokens - - -class UNITModel(PreTrainedModel): - """Pretrained Hugging Face model for UNIT. - - This class inherits from PreTrainedModel, which provides - HuggingFace's functionality for loading and saving models. - """ - - config_class = UNITConfig - - def __init__(self, config: UNITConfig): - super().__init__(config) - - UNITArgs = namedtuple("UNITArgs", config.args.keys()) - args = UNITArgs(**config.args) - self.config = config - - weight_init = args.model_kwargs.pop("weight_init", "skip") - - model = create_model( - args.model, - pretrained=args.pretrained, - in_chans=args.in_chans, - num_classes=args.num_classes, - drop_rate=args.drop, - drop_path_rate=args.drop_path, - drop_block_rate=args.drop_block, - global_pool=args.gp, - bn_momentum=args.bn_momentum, - bn_eps=args.bn_eps, - scriptable=args.torchscript, - checkpoint_path=args.initial_checkpoint, - weight_init=weight_init, - **args.model_kwargs, - ) - - self.unit_model = UNITModelEncoder(model, - args.cpe_max_size, - args.num_cls_tokens, - args.register_multiple - ) - - def forward(self, x: torch.Tensor): - return self.unit_model.forward(x) diff --git a/llava_next/llava/model/multimodal_encoder/unit/input_conditioner.py b/llava_next/llava/model/multimodal_encoder/unit/input_conditioner.py deleted file mode 100755 index 2cf57c37..00000000 --- a/llava_next/llava/model/multimodal_encoder/unit/input_conditioner.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. -# -# NVIDIA CORPORATION and its licensors retain all intellectual property -# and proprietary rights in and to this software, related documentation -# and any modifications thereto. Any use, reproduction, disclosure or -# distribution of this software and related documentation without an express -# license agreement from NVIDIA CORPORATION is strictly prohibited. 
- -from typing import Union, Tuple - -import torch -from torch import nn - - -norm_t = Union[Tuple[float, float, float], torch.Tensor] - -class InputConditioner(nn.Module): - def __init__(self, - input_scale: float, - norm_mean: norm_t, - norm_std: norm_t, - dtype: torch.dtype = None, - ): - super().__init__() - - self.dtype = dtype - - self.register_buffer("norm_mean", _to_tensor(norm_mean) / input_scale) - self.register_buffer("norm_std", _to_tensor(norm_std) / input_scale) - - def forward(self, x: torch.Tensor): - y = (x - self.norm_mean) / self.norm_std - if self.dtype is not None: - y = y.to(self.dtype) - return y - - -def get_default_conditioner(): - from timm.data.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD - - return InputConditioner( - input_scale=1.0, - norm_mean=OPENAI_CLIP_MEAN, - norm_std=OPENAI_CLIP_STD, - ) - - -def _to_tensor(v: norm_t): - return torch.as_tensor(v, ).view(-1, 1, 1) \ No newline at end of file diff --git a/llava_next/llava/model/multimodal_encoder/unit/vit_patch_generator.py b/llava_next/llava/model/multimodal_encoder/unit/vit_patch_generator.py deleted file mode 100755 index 70cd9644..00000000 --- a/llava_next/llava/model/multimodal_encoder/unit/vit_patch_generator.py +++ /dev/null @@ -1,318 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. -# -# NVIDIA CORPORATION and its licensors retain all intellectual property -# and proprietary rights in and to this software, related documentation -# and any modifications thereto. Any use, reproduction, disclosure or -# distribution of this software and related documentation without an express -# license agreement from NVIDIA CORPORATION is strictly prohibited. - -import math -from typing import Union, Tuple, Optional - -import torch -import torch.nn.functional as F -from torch import nn -from einops import rearrange - -from .cls_token import ClsToken - -input_dim_t = Union[int, Tuple[int, int]] - -try: - # raise ImportError() - from indirect_grid_sample import indirect_grid_sample -except ImportError: - indirect_grid_sample = None - -class ViTPatchGenerator(nn.Module): - def __init__(self, - patch_size: int, - embed_dim: int, - input_dims: input_dim_t, - abs_pos: bool = True, - normalize_patches: bool = False, - cls_token: bool = False, - max_input_dims: Optional[input_dim_t] = None, - pos_dropout: float = 0.0, - return_pos_enc: bool = False, - num_cls_tokens: int = 1, - register_multiple: int = 0, - device=None, dtype=None, - ): - super().__init__() - - if isinstance(input_dims, int): - input_dims = (input_dims, input_dims) - - if max_input_dims is None: - max_input_dims = input_dims - if isinstance(max_input_dims, int): - max_input_dims = (max_input_dims, max_input_dims) - - max_input_dims = tuple( - int(math.ceil(d / patch_size) * patch_size) - for d in max_input_dims - ) - - self.cpe_mode = max_input_dims != input_dims - self.pos_dropout = pos_dropout - self.return_pos_enc = return_pos_enc - - factory = dict(device=device, dtype=dtype) - - self.patch_size = patch_size - self.abs_pos = abs_pos - self.embed_dim = embed_dim - - self.num_rows = max_input_dims[0] // patch_size - self.num_cols = max_input_dims[1] // patch_size - self.input_dims = tuple(d // patch_size for d in input_dims) - self.num_patches = self.num_rows * self.num_cols - self.max_input_dims = max_input_dims - - self.im_to_patches = Im2Patches(patch_size) - self.embedder = ViTPatchLinear(patch_size, embed_dim, **factory) - - if abs_pos: - scale = embed_dim ** -0.5 - self.pos_embed = nn.Parameter(torch.randn(1, 
self.num_patches, embed_dim, **factory) * scale) - - self.cls_token = ClsToken( - embed_dim, - num_tokens=num_cls_tokens, - enabled=cls_token, - register_multiple=register_multiple, - ) - - self.patch_normalizer = nn.LayerNorm(embed_dim) if normalize_patches else nn.Identity() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - patches = self.embed_patches(x) - patches, pos_enc = self.apply_pos_enc(patches, input_size=x.shape[2:]) - patches = self.cls_token(patches) - patches = self.patch_normalizer(patches) - if self.return_pos_enc: - return patches, pos_enc - return patches - - @property - def apply_cls_token(self): - return self.cls_token.enabled - - @property - def num_cls_tokens(self): - return self.cls_token.num_tokens - - @property - def num_registers(self): - return self.cls_token.num_registers - - @property - def num_skip(self): - return self.num_cls_tokens + self.num_registers - - def no_weight_decay(self): - return [ - 'pos_embed', - ] - - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): - if self.abs_pos: - self._load_embed(state_dict[f'{prefix}pos_embed'], self.pos_embed) - - def _load_embed(self, src_embed: torch.Tensor, targ_embed: nn.Parameter): - if src_embed.shape != targ_embed.shape: - src_size = int(math.sqrt(src_embed.shape[1])) - - assert src_size ** 2 == src_embed.shape[1], 'Unable to interpolate non-square embedding' - - src_embed = rearrange(src_embed, 'b (h w) c -> b c h w', h=src_size, w=src_size) - src_embed = F.interpolate(src_embed, size=(self.num_rows, self.num_cols), mode='bicubic', align_corners=True, antialias=False) - src_embed = rearrange(src_embed, 'b c h w -> b (h w) c') - targ_embed.data.copy_(src_embed) - - def _load_projection(self, src_proj_weight: torch.Tensor, targ_proj_weight: torch.Tensor): - if src_proj_weight.shape != targ_proj_weight.shape: - src_patch_size = int(math.sqrt(src_proj_weight.shape[1] // 3)) - - assert (src_patch_size ** 2) * 3 == src_proj_weight.shape[1], 'Unable to interpolate non-square patch size' - - src_proj_weight = rearrange(src_proj_weight, 'b (c h w) -> b c h w', c=3, h=src_patch_size, w=src_patch_size) - src_proj_weight = F.interpolate(src_proj_weight, size=(self.patch_size, self.patch_size), mode='bicubic', align_corners=True, antialias=False) - src_proj_weight = rearrange(src_proj_weight, 'b c h w -> b (c h w)') - targ_proj_weight.data.copy_(src_proj_weight) - - def embed_patches(self, x: torch.Tensor) -> torch.Tensor: - patches = self.im_to_patches(x) - dtype = patches.dtype - # patches = self.embedder(patches.to(torch.float32)).to(dtype) - patches = self.embedder(patches) - return patches - - def apply_pos_enc(self, - patches: torch.Tensor, - patch_idxs: Optional[torch.Tensor] = None, - input_size: Optional[Tuple[int, int]] = None, - ) -> torch.Tensor: - if not self.abs_pos: - return patches - - pos_enc = self.get_pos_enc(patches.shape[0], patch_idxs, input_size) - - if self.training and self.pos_dropout > 0: - keeps = torch.rand(patches.shape[0], 1, 1, dtype=pos_enc.dtype, device=pos_enc.device) > self.pos_dropout - pos_enc_drop = torch.where(keeps, pos_enc, 0) - else: - pos_enc_drop = pos_enc - - return patches + pos_enc_drop, pos_enc - - def get_pos_enc(self, - batch_size: int, - patch_idxs: Optional[torch.Tensor] = None, - input_size: Optional[Tuple[int, int]] = None, - ) -> torch.Tensor: - if input_size is None: - input_dims = self.input_dims - else: - input_dims = tuple(d // self.patch_size for d in input_size) - - pos_embed = 
self._get_pos_embeddings(batch_size, input_dims) - - if patch_idxs is None: - return pos_embed - - exp_patch_idxs = patch_idxs.unsqueeze(-1).expand(-1, -1, pos_embed.shape[-1]) - - pos_embed = torch.gather(pos_embed.expand(patch_idxs.shape[0], -1, -1), dim=1, index=exp_patch_idxs) - return pos_embed - - - def _get_pos_embeddings(self, batch_size: int, input_dims: Tuple[int, int]): - if (self.num_rows, self.num_cols) == input_dims: - return self.pos_embed - - pos_embed = self.pos_embed.reshape(1, self.num_rows, self.num_cols, -1).permute(0, 3, 1, 2) - dtype = pos_embed.dtype - # pos_embed = pos_embed.to(dtype=torch.float32) - # pos_embed = pos_embed.to(dtype=torch.float32) - - def window_select(pos_embed): - if input_dims[0] < pos_embed.shape[-2]: - pos_embed = pos_embed[..., :input_dims[0], :] - if input_dims[1] < pos_embed.shape[-1]: - pos_embed = pos_embed[..., :, :input_dims[1]] - return pos_embed - - if self.cpe_mode: - if self.training: - min_scale = math.sqrt(0.1) - # Ensure all intermediate tensors use the same dtype as pos_embed - scale = torch.rand(batch_size, 1, 1, device=pos_embed.device, dtype=dtype) * (1 - min_scale) + min_scale - aspect_min = math.log(3 / 4) - aspect_max = -aspect_min - aspect = torch.exp(torch.rand(batch_size, 1, 1, device=pos_embed.device, dtype=dtype) * (aspect_max - aspect_min) + aspect_min) - - scale_x = scale * aspect - scale_y = scale * (1 / aspect) - scale_xy = torch.stack([scale_x, scale_y], dim=-1).clamp_(0, 1) - - pos_xy = torch.rand(batch_size, 1, 1, 2, device=pos_embed.device, dtype=dtype) * (1 - scale_xy) - - lin_x = torch.linspace(0, 1, steps=input_dims[1], device=pos_embed.device, dtype=dtype)[None, None].expand(batch_size, input_dims[0], -1) - lin_y = torch.linspace(0, 1, steps=input_dims[0], device=pos_embed.device, dtype=dtype)[None, :, None].expand(batch_size, -1, input_dims[1]) - - lin_xy = torch.stack([lin_x, lin_y], dim=-1) - - grid_xy = lin_xy * scale_xy + pos_xy - - # Convert to [-1, 1] range - grid_xy.mul_(2).sub_(1) - - # Make sure grid_xy has the same dtype as pos_embed - grid_xy = grid_xy.to(dtype=dtype) - - # Use grid_sample with consistent dtypes - - dtype = pos_embed.dtype - # 修改后的代码 - pos_embed = F.grid_sample( - pos_embed.expand(batch_size, -1, -1, -1).to(dtype=torch.float32), - grid=grid_xy.to(dtype=torch.float32), - mode='bilinear', - padding_mode='zeros', - align_corners=True, - ) - pos_embed = pos_embed.to(dtype=dtype) - grid_xy = grid_xy.to(dtype=dtype) - - else: - # i_rows, i_cols = input_dims - # p_rows, p_cols = pos_embed.shape[2:] - # if i_rows <= p_rows and i_cols <= p_cols: - # left = (p_cols - i_cols) // 2 - # top = (p_rows - i_rows) // 2 - # pos_embed = pos_embed[..., top:top+i_rows, left:left+i_cols] - # else: - max_dim = max(input_dims) - pos_embed = F.interpolate(pos_embed, size=(max_dim, max_dim), align_corners=True, mode='bilinear').to(pos_embed.dtype) - - pos_embed = window_select(pos_embed) - else: - pos_embed = window_select(pos_embed) - - if pos_embed.shape[-2:] != input_dims: - # pos_embed = F.interpolate(pos_embed.float(), size=input_dims, align_corners=True, mode='bilinear').to(pos_embed.dtype) - pos_embed = F.interpolate(pos_embed, size=input_dims, align_corners=True, mode='bilinear') - - pos_embed = pos_embed.flatten(2).permute(0, 2, 1) - - # pos_embed = pos_embed.to(dtype=torch.float32) - - return pos_embed - - -class Im2Patches(nn.Module): - def __init__(self, patch_size: int): - super().__init__() - self.patch_size = patch_size - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if 
self.patch_size == 1: - patches = x.flatten(2) - patches = patches.permute(0, 2, 1) - return patches - - py = x.shape[-2] // self.patch_size - px = x.shape[-1] // self.patch_size - patches = rearrange(x, 'b c (py yy) (px xx) -> b (py px) (c yy xx)', - py=py, yy=self.patch_size, - px=px, xx=self.patch_size, - ) - return patches - - -class ViTPatchLinear(nn.Linear): - def __init__(self, patch_size: int, embed_dim: int, **factory): - super().__init__( - 3 * (patch_size ** 2), - embed_dim, - bias=False, - **factory - ) - self.patch_size = patch_size - - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): - if self.bias is not None: - self.bias.data.copy_(state_dict[f'{prefix}bias']) - - chk_weight = state_dict[f'{prefix}weight'] - if chk_weight.shape != self.weight.shape: - src_patch_size = int(math.sqrt(chk_weight.shape[1] // 3)) - - assert (src_patch_size ** 2) * 3 == chk_weight.shape[1], 'Unable to interpolate non-square patch size' - - chk_weight = rearrange(chk_weight, 'b (c h w) -> b c h w', c=3, h=src_patch_size, w=src_patch_size) - chk_weight = F.interpolate(chk_weight, size=(self.patch_size, self.patch_size), mode='bicubic', align_corners=True, antialias=False) - chk_weight = rearrange(chk_weight, 'b c h w -> b (c h w)') - self.weight.data.copy_(chk_weight) \ No newline at end of file diff --git a/llava_next/llava/model/multimodal_encoder/unit448_encoder.py b/llava_next/llava/model/multimodal_encoder/unit448_encoder.py deleted file mode 100755 index c524f39f..00000000 --- a/llava_next/llava/model/multimodal_encoder/unit448_encoder.py +++ /dev/null @@ -1,155 +0,0 @@ -import torch -import torch.nn as nn -from llava.utils import rank0_print -from transformers import CLIPImageProcessor -from .unit import UNITModel -from .unit.hf_model import UNITConfig - -try: - from s2wrapper import forward as multiscale_forward -except: - pass - - -class UNITVisionTower(nn.Module): - def __init__(self, vision_tower, args, delay_load=False): - super().__init__() - - self.is_loaded = False - - self.vision_tower_name = vision_tower - self.select_layer = args.mm_vision_select_layer - self.select_feature = getattr(args, "mm_vision_select_feature", "patch") - - if not delay_load: - rank0_print(f"Loading vision tower: {vision_tower}") - self.load_model() - elif getattr(args, "unfreeze_mm_vision_tower", False): - # TODO: better detector is needed. 
- rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") - self.load_model() - elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") - self.load_model() - else: - # self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) - self.cfg_only = UNITConfig.from_pretrained(self.vision_tower_name) - - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - self.vision_tower = UNITModel.from_pretrained(self.vision_tower_name, device_map=device_map) - print(self.vision_tower) - self.vision_tower.requires_grad_(False) - - self.is_loaded = True - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_feature = self.vision_tower(image.to(device=self.device).unsqueeze(0))[1] - image_feature = image_feature - image_features.append(image_feature) - else: - image_features = self.vision_tower(images.to(device=self.device))[1] - image_features = image_features - - return image_features - - @property - def dummy_feature(self): - return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) - - @property - def dtype(self): - return self.vision_tower.dtype - - @property - def device(self): - return self.vision_tower.device - - @property - def config(self): - if self.is_loaded: - return self.vision_tower.config - else: - return self.cfg_only - - @property - def hidden_size(self): - return 1280 - - @property - def num_patches_per_side(self): - return 32 - - @property - def num_patches(self): - _num_patches = (self.config.image_size // self.config.patch_size) ** 2 - if "cls_patch" in self.select_feature: - _num_patches += 1 - return _num_patches - - @property - def image_size(self): - return 448 - - -# class CLIPVisionTowerS2(CLIPVisionTower): -class UNITVisionTowerS2(UNITVisionTower): - def __init__(self, vision_tower, args, delay_load=False): - - self.s2_scales = getattr(args, "s2_scales", "336,672,1008") - self.s2_scales = list(map(int, self.s2_scales.split(","))) - self.s2_scales.sort() - self.s2_split_size = self.s2_scales[0] - self.s2_image_size = self.s2_scales[-1] - - super().__init__(vision_tower, args, delay_load) - - # change resize/crop size in preprocessing to the largest image size in s2_scale - if not delay_load or getattr(args, "unfreeze_mm_vision_tower", False): - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - self.vision_tower = UNITModel.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - self.is_loaded = True - - def forward_feature(self, images): - # image_forward_outs = 
self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - # image_features = self.feature_select(image_forward_outs).to(images.dtype) - print(images.dtype) - image_forward_outs = self.vision_tower(images.to(device=self.device, ))[1] - image_features = image_forward_outs - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_feature = multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - image_features.append(image_feature) - else: - image_features = multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - - return image_features - - @property - def hidden_size(self): - return self.config.hidden_size * len(self.s2_scales) diff --git a/llava_next/llava/model/multimodal_encoder/unit_encoder.py b/llava_next/llava/model/multimodal_encoder/unit_encoder.py deleted file mode 100755 index f18e990c..00000000 --- a/llava_next/llava/model/multimodal_encoder/unit_encoder.py +++ /dev/null @@ -1,192 +0,0 @@ -import torch -import torch.nn as nn -from llava.utils import rank0_print -from transformers import CLIPImageProcessor -from .unit import UNITModel -from .unit.hf_model import UNITConfig - -try: - from s2wrapper import forward as multiscale_forward -except: - pass - - -class UNITVisionTower(nn.Module): - def __init__(self, vision_tower, args, delay_load=False): - super().__init__() - - self.is_loaded = False - - self.vision_tower_name = vision_tower - self.select_layer = args.mm_vision_select_layer - self.select_feature = getattr(args, "mm_vision_select_feature", "patch") - - if not delay_load: - rank0_print(f"Loading vision tower: {vision_tower}") - self.load_model() - elif getattr(args, "unfreeze_mm_vision_tower", False): - # TODO: better detector is needed. 
- rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") - self.load_model() - elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: - rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") - self.load_model() - else: - # self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) - self.cfg_only = UNITConfig.from_pretrained(self.vision_tower_name) - - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - # self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower = UNITModel.from_pretrained(self.vision_tower_name, device_map=device_map) - print(self.vision_tower) - self.vision_tower.requires_grad_(False) - - self.is_loaded = True - - # def feature_select(self, image_forward_outs): - # select_feature_type = self.select_feature - - # if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]: - # select_every_k_layer = len(image_forward_outs.hidden_states) // 4 - # image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1) - # select_feature_type = select_feature_type.replace("slicefour_", "") - # elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]: - # select_layers = [-2, -5, -8, -11, 6] - # image_features = torch.cat([image_forward_outs.hidden_states[i] for i in select_layers], dim=-1) - # select_feature_type = select_feature_type.replace("slice_m25811_f6_", "") - # else: - # image_features = image_forward_outs.hidden_states[self.select_layer] - - # if select_feature_type == "patch": - # image_features = image_features[:, 1:] - # elif select_feature_type == "cls_patch": - # image_features = image_features - # else: - # raise ValueError(f"Unexpected select feature: {select_feature_type}") - # return image_features - - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - # image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) - # image_feature = self.feature_select(image_forward_out).to(image.dtype) - # image_feature = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))[1] - image_feature = self.vision_tower(image.to(device=self.device).unsqueeze(0))[1] - image_feature = image_feature - image_features.append(image_feature) - else: - # image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - # image_features = self.feature_select(image_forward_outs).to(images.dtype) - image_features = self.vision_tower(images.to(device=self.device))[1] - image_features = image_features - - return image_features - - @property - def dummy_feature(self): - return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) - - @property - def dtype(self): - return self.vision_tower.dtype - - @property - def device(self): - return self.vision_tower.device - - @property - def config(self): - if self.is_loaded: - return self.vision_tower.config - else: - return self.cfg_only - - @property - def 
hidden_size(self): - return 1280 - # _hidden_size = self.config.hidden_size - # if "slicefour" in self.select_feature: - # _hidden_size *= 4 - # if "slice_m25811_f6" in self.select_feature: - # _hidden_size *= 5 - # return _hidden_size - - @property - def num_patches_per_side(self): - # return self.config.image_size // self.config.patch_size - return 27 - - @property - def num_patches(self): - _num_patches = (self.config.image_size // self.config.patch_size) ** 2 - if "cls_patch" in self.select_feature: - _num_patches += 1 - return _num_patches - - @property - def image_size(self): - # return self.config.image_size - return 378 - - -# class CLIPVisionTowerS2(CLIPVisionTower): -class UNITVisionTowerS2(UNITVisionTower): - def __init__(self, vision_tower, args, delay_load=False): - - self.s2_scales = getattr(args, "s2_scales", "336,672,1008") - self.s2_scales = list(map(int, self.s2_scales.split(","))) - self.s2_scales.sort() - self.s2_split_size = self.s2_scales[0] - self.s2_image_size = self.s2_scales[-1] - - super().__init__(vision_tower, args, delay_load) - - # change resize/crop size in preprocessing to the largest image size in s2_scale - if not delay_load or getattr(args, "unfreeze_mm_vision_tower", False): - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - def load_model(self, device_map=None): - if self.is_loaded: - rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name)) - return - - self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) - self.vision_tower = UNITModel.from_pretrained(self.vision_tower_name, device_map=device_map) - self.vision_tower.requires_grad_(False) - - self.image_processor.size["shortest_edge"] = self.s2_image_size - self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size - - self.is_loaded = True - - def forward_feature(self, images): - # image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) - # image_features = self.feature_select(image_forward_outs).to(images.dtype) - print(images.dtype) - image_forward_outs = self.vision_tower(images.to(device=self.device, ))[1] - image_features = image_forward_outs - return image_features - - def forward(self, images): - if type(images) is list: - image_features = [] - for image in images: - image_feature = multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - image_features.append(image_feature) - else: - image_features = multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True) - - return image_features - - @property - def hidden_size(self): - return self.config.hidden_size * len(self.s2_scales) diff --git a/llava_next/llava/model/multimodal_resampler/builder.py b/llava_next/llava/model/multimodal_resampler/builder.py index 7a4b207f..992c0140 100755 --- a/llava_next/llava/model/multimodal_resampler/builder.py +++ b/llava_next/llava/model/multimodal_resampler/builder.py @@ -1,9 +1,14 @@ import torch +import warnings from .masked_drop import MaskedDrop from .spatial_pool import SpatialPool from .perceiver import PerceiverResampler -from .qformer import Qformer +try: + from .qformer import Qformer +except ImportError as e: + warnings.warn(f"Failed to import 
Qformer: {e}. Qformer resampler will not be available.") + Qformer = None class IdentityMap(torch.nn.Module): @@ -27,6 +32,8 @@ def build_vision_resampler(model_args, delay_load=False, **kwargs): elif resampler_type == "perceiver": return PerceiverResampler(model_args, **kwargs) elif resampler_type == "qformer": + if Qformer is None: + raise ImportError("Qformer is not available due to import error. Please check the warning message.") return Qformer(model_args, **kwargs) elif resampler_type is None: return IdentityMap() diff --git a/llava_next/llava/train/train.py b/llava_next/llava/train/train.py index 9becc78d..a30072ca 100755 --- a/llava_next/llava/train/train.py +++ b/llava_next/llava/train/train.py @@ -45,7 +45,7 @@ from llava import conversation as conversation_lib from llava.model import * -from llava.mm_utils import process_highres_image, process_anyres_image, process_highres_image_crop_split, tokenizer_image_token +from llava.mm_utils import process_highres_image, process_native_image, process_anyres_image, process_highres_image_crop_split, tokenizer_image_token from llava.utils import rank0_print, process_video_with_pyav, process_video_with_decord torch.multiprocessing.set_sharing_strategy("file_system") @@ -958,6 +958,7 @@ def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, super(LazySupervisedDataset, self).__init__() self.tokenizer = tokenizer self.list_data_dict = [] + self.weights = torch.tensor([1296, 36, 1], dtype=torch.int64) # for codec input # Handle multiple JSON files specified in the data_path if "{" in data_path and "}" in data_path: @@ -1025,10 +1026,17 @@ def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, else: data_args.dataset_paths = [data_path] rank0_print(f"Loading {data_path}") - with open(data_path, "r") as file: - cur_data_dict = json.load(file) - rank0_print(f"Loaded {len(cur_data_dict)} samples from {data_path}") - self.list_data_dict.extend(cur_data_dict) + if data_path.endswith(".jsonl"): + with open(data_path, "r") as file: + cur_data_dict = [] + for line in file: + if line.strip(): + cur_data_dict.append(json.loads(line.strip())) + else: + with open(data_path, "r") as file: + cur_data_dict = json.load(file) + rank0_print(f"Loaded {len(cur_data_dict)} samples from {data_path}") + self.list_data_dict.extend(cur_data_dict) rank0_print(f"Loaded {len(self.list_data_dict)} samples from {data_path}") rank0_print("Formatting inputs...Skip in lazy mode") @@ -1073,12 +1081,21 @@ def process_image(self, image_file: Union[bytes, str], overwrite_image_aspect_ra image_size = image.size image_aspect_ratio = self.data_args.image_aspect_ratio + grid_thw = None if overwrite_image_aspect_ratio is not None: image_aspect_ratio = overwrite_image_aspect_ratio if image_aspect_ratio == "highres": image = process_highres_image(image, self.data_args.image_processor, self.data_args.image_grid_pinpoints) + elif image_aspect_ratio == "native": + image = process_native_image(image, self.data_args.image_processor) + if type(image) is dict: + grid_thw = image['grid_thw'] + image = image['pixel_values'] elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio: image = process_anyres_image(image, self.data_args.image_processor, self.data_args.image_grid_pinpoints) + if type(image) is dict: + grid_thw = image['grid_thw'] + image = image['pixel_values'] elif image_aspect_ratio == "crop_split": image = process_highres_image_crop_split(image, self.data_args) elif image_aspect_ratio == "pad": @@ -1095,12 +1112,26 @@ 
def expand2square(pil_img, background_color): result = Image.new(pil_img.mode, (height, height), background_color) result.paste(pil_img, ((height - width) // 2, 0)) return result - - image = expand2square(image, tuple(int(x * 255) for x in processor.image_mean)) - image = processor.preprocess(image, return_tensors="pt")["pixel_values"][0] + image = expand2square(image, tuple(int(0 * 255) for x in [0,0,0])) + if 'siglip' in str(processor).lower(): + image = image.resize((512, 512)) + grid_thw = [1,32,32] + else: + image = image.resize((504, 504)) + grid_thw = [1,36,36] + + image = processor.preprocess(image, return_tensors="pt", do_resize=False)["pixel_values"] else: - image = processor.preprocess(image, return_tensors="pt")["pixel_values"][0] - return image, image_size, "image" + if 'siglip' in str(processor).lower(): + image = image.resize((512, 512)) + image = processor.preprocess(image, return_tensors="pt", do_resize=False)["pixel_values"] + else: + image = image.resize((504, 504)) + image = processor.preprocess(image, return_tensors="pt", do_resize=False, do_center_crop=False)["pixel_values"] + + if grid_thw is None: + return image, image_size, "image" + return image, image_size, "image", grid_thw def __getitem__(self, i) -> Dict[str, torch.Tensor]: # TODO: define number of retries somewhere else @@ -1140,6 +1171,11 @@ def _get_item(self, i) -> Dict[str, torch.Tensor]: if isinstance(i, int): sources = [sources] assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME + if 'positions_thw' in sources[0]: + visible_indices = (torch.tensor(np.load(sources[0]['positions_thw'])) * self.weights).sum(dim=1) + else: + visible_indices = None + if "images" in sources[0] and len(sources[0]['images']) < 9: image_file = self.list_data_dict[i]["images"] @@ -1149,7 +1185,7 @@ def _get_item(self, i) -> Dict[str, torch.Tensor]: # overwrite to process with simple pad if len(image_file) > 1: image = [self.process_image(f, "pad") for f in image_file] - image = [[im[0], im[1], "image"] for im in image] + image = [[im[0], im[1], "image", im[3]] for im in image] else: image = [self.process_image(image_file)] sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args) @@ -1223,7 +1259,7 @@ def _get_item(self, i) -> Dict[str, torch.Tensor]: # overwrite to process with simple pad if len(image_file) > 1: image = [self.process_image(f, "pad") for f in image_file] - image = [[im[0], im[1], "image"] for im in image] + image = [[im[0], im[1], "image", im[3]] for im in image] else: image = [self.process_image(image_file)] sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args) @@ -1240,17 +1276,18 @@ def _get_item(self, i) -> Dict[str, torch.Tensor]: if isinstance(i, int): data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0]) + data_dict['visible_indices'] = visible_indices # image exist in the data if "images" in self.list_data_dict[i] or 'image' in self.list_data_dict[i]: - data_dict["image"] = image + data_dict["image"] = image elif "video" in self.list_data_dict[i]: data_dict["image"] = image elif self.data_args.is_multimodal: # image does not exist in the data, but the model is multimodal - crop_size = self.data_args.image_processor.crop_size + crop_size = 224 data_dict["image"] = [ - (torch.zeros(1, 3, crop_size["height"], crop_size["width"]), (crop_size["width"], crop_size["height"]), "text"), + (torch.zeros(196, 768), (224,224), "text", [[1,14,14]]), ] # prompt exist in the
data if prompt is not None: @@ -1403,6 +1440,8 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: batch["image_sizes"] = [im[1] for im_list in images for im in im_list] batch["modalities"] = [im[2] for im_list in images for im in im_list] + if all(len(im) == 4 for im_list in images for im in im_list): + batch["grid_thw"] = torch.tensor([im[3] for im_list in images for im in im_list]) images = [im[0] for im_list in images for im in im_list] # if all(x is not None and x.shape == images[0].shape for x in images): @@ -1414,6 +1453,8 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: if "prompt" in instances[0]: batch["prompts"] = [instance["prompt"] for instance in instances] + if 'visible_indices' in instances[0]: + batch['visible_indices'] = [instance['visible_indices'] for instance in instances] return batch @@ -1546,6 +1587,15 @@ def get_model(model_args, training_args, bnb_model_from_pretrained_args): from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock deepspeed.utils.set_z3_leaf_modules(model, [Qwen2MoeSparseMoeBlock]) + elif "qwen3" in model_args.model_name_or_path.lower(): + model = LlavaQwen3ForCausalLM.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + attn_implementation=training_args.attn_implementation, + torch_dtype=(torch.bfloat16 if training_args.bf16 else None), + low_cpu_mem_usage=False, + **customized_kwargs, + ) else: model = LlavaQwenForCausalLM.from_pretrained( model_args.model_name_or_path, @@ -1823,7 +1873,7 @@ def make_inputs_require_grad(module, input, output): module = module.to(torch.bfloat16) data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args) - trainer = LLaVATrainer(model=model, tokenizer=tokenizer, args=training_args, **data_module) + trainer = LLaVATrainer(model=model, processing_class=tokenizer, args=training_args, **data_module) if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")): trainer.train(resume_from_checkpoint=True) diff --git a/llava_next/scripts/deprecated/560px/dist_AIMv2_l_14_560_finetune.sh b/llava_next/scripts/deprecated/560px/dist_AIMv2_l_14_560_finetune.sh deleted file mode 100755 index 33270a0f..00000000 --- a/llava_next/scripts/deprecated/560px/dist_AIMv2_l_14_560_finetune.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/aimv2-large-patch14-native" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_aimv2-large-patch14-native-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v10 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path 
${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 1120), (1120, 560), (1120, 1120), (1680, 560), (560, 1680)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/560px/dist_AIMv2_l_14_560_pretrain.sh b/llava_next/scripts/deprecated/560px/dist_AIMv2_l_14_560_pretrain.sh deleted file mode 100755 index 98d76c2c..00000000 --- a/llava_next/scripts/deprecated/560px/dist_AIMv2_l_14_560_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/aimv2-large-patch14-native" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v10 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/560px/dist_rice_L_14_560_v1_finetune_4x4.sh b/llava_next/scripts/deprecated/560px/dist_rice_L_14_560_v1_finetune_4x4.sh deleted file mode 100755 index 1af77e7a..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_L_14_560_v1_finetune_4x4.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v1" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-560-v1-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-14nodes-4x4" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 560), (1120, 560), (1680, 560), (2240, 560), (560, 1120), (1120, 1120), (1680, 1120), (2240, 1120), (560, 1680), (1120, 1680), (1680, 1680), (2240, 1680), (560, 2240), (1120, 2240), (1680, 2240), (2240, 2240)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/560px/dist_rice_L_14_560_v1_finetune_4x4_from_stage_1.5_3B.sh b/llava_next/scripts/deprecated/560px/dist_rice_L_14_560_v1_finetune_4x4_from_stage_1.5_3B.sh deleted file mode 100755 index a83ac06e..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_L_14_560_v1_finetune_4x4_from_stage_1.5_3B.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-3B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v1" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-560-v1-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -mm_projector_type="patch_merger" -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-${mm_projector_type}-pretrain_blip558k-emova-alignment-7m-finetune_llavanext780k-14nodes-4x4" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path checkpoints/emova-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-560-v1-_vlm_pretrain_models_Qwen_Qwen2.5-3B-Instruct-mlp2x_gelu-pretrain_7M_vqa_stage_1_5/ \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type patch_merger \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 560), (1120, 560), (1680, 560), (2240, 560), (560, 1120), (1120, 1120), (1680, 1120), (2240, 1120), (560, 1680), (1120, 1680), (1680, 1680), (2240, 1680), (560, 2240), (1120, 2240), (1680, 2240), (2240, 2240)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 16000 \ - --dataloader_num_workers 2 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v0_finetune.sh b/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v0_finetune.sh deleted file mode 100755 index f6161bfd..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v0_finetune.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-bigG-patch14-560-v0" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-bigG-patch14-560-v0-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v11 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 1120), (1120, 560), (1120, 1120), (1680, 560), (560, 1680)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v0_pretrain.sh b/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v0_pretrain.sh deleted file mode 100755 index fe1c60dc..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v0_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-bigG-patch14-560-v0" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v11 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v1_finetune.sh b/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v1_finetune.sh deleted file mode 100755 index f309196d..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v1_finetune.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-bigG-patch14-560-v0" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-bigG-patch14-560-v0-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 1120), (1120, 560), (1120, 1120), (1680, 560), (560, 1680)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v1_pretrain.sh b/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v1_pretrain.sh deleted file mode 100755 index 94d3db4c..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v1_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-bigG-patch14-560-v0" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v2_align_wemova_finetune.sh b/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v2_align_wemova_finetune.sh deleted file mode 100755 index 3277d54f..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v2_align_wemova_finetune.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-bigG-patch14-560-v2-align-from-emova" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-bigG-patch14-560-v2-align-from-emova-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile_14nodes.txt \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 1120), (1120, 560), (1120, 1120), (1680, 560), (560, 1680)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v2_align_wemova_pretrain.sh b/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v2_align_wemova_pretrain.sh deleted file mode 100755 index b178e29f..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v2_align_wemova_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-bigG-patch14-560-v2-align-from-emova" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile_14nodes.txt \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 2 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v2_finetune.sh b/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v2_finetune.sh deleted file mode 100755 index 02986178..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v2_finetune.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-bigG-patch14-560-v2" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-bigG-patch14-560-v2-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 1120), (1120, 560), (1120, 1120), (1680, 560), (560, 1680)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v2_finetune_4x4.sh b/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v2_finetune_4x4.sh deleted file mode 100755 index 4a94ea17..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v2_finetune_4x4.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-bigG-patch14-560-v2" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-bigG-patch14-560-v2-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-14nodes-plus" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 560), (1120, 560), (1680, 560), (2240, 560), (560, 1120), (1120, 1120), (1680, 1120), (2240, 1120), (560, 1680), (1120, 1680), (1680, 1680), (2240, 1680), (560, 2240), (1120, 2240), (1680, 2240), (2240, 2240)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 200 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v2_pretrain.sh b/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v2_pretrain.sh deleted file mode 100755 index db3f28cf..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_bigG_14_560_v2_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-bigG-patch14-560-v2" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_no_class_token_finetune.sh b/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_no_class_token_finetune.sh deleted file mode 100755 index 43cb37b5..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_no_class_token_finetune.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v1-wo-class-token/" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-560-v1-wo-class-token_-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 1120), (1120, 560), (1120, 1120), (1680, 560), (560, 1680)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_no_class_token_pretrain.sh b/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_no_class_token_pretrain.sh deleted file mode 100755 index d2fecbfe..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_no_class_token_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v1-wo-class-token/" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_finetune.sh b/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_finetune.sh deleted file mode 100755 index 2380211a..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_finetune.sh +++ /dev/null @@ -1,71 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-3B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v1" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-560-v1-_vlm_pretrain_models_Qwen_Qwen2.5-3B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="emova_llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-14nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path /vlm/data/train_images/Emova-ollm/emova-sft-4m/llava_format_vqa_txt.json \ - --image_folder "/" \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 560), (1120, 560), (1680, 560), (2240, 560), (560, 1120), (1120, 1120), (1680, 1120), (2240, 1120), (560, 1680), (1120, 1680), (1680, 1680), (2240, 1680), (560, 2240), (1120, 2240), (1680, 2240), (2240, 2240)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 14000 \ - --dataloader_num_workers 4 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True \ - --mm_vision_select_feature patch_interpolate diff --git a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_pretrain.sh b/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_pretrain.sh deleted file mode 100755 index 7fa9f243..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_pretrain.sh +++ /dev/null @@ -1,58 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-3B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v1" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --mm_vision_select_feature patch_interpolate \ No newline at end of file diff --git a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_pretrain_700M_caption.sh b/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_pretrain_700M_caption.sh deleted file mode 100755 index dddd1b44..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_pretrain_700M_caption.sh +++ /dev/null @@ -1,63 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-3B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v1" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_7M_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 560), (1120, 560), (560, 1120), (1120, 1120), (1680, 560), (560, 1680)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --mm_vision_tower_lr 1e-5 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 5000 \ - --gradient_checkpointing True \ - --dataloader_num_workers 4 \ - --lazy_preprocess True \ - --mm_vision_select_feature patch_interpolate \ No newline at end of file diff --git a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_pretrain_v2_stage_1.5.sh b/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_pretrain_v2_stage_1.5.sh deleted file mode 100755 index 4204b678..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_pretrain_v2_stage_1.5.sh +++ /dev/null @@ -1,65 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-3B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v1" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=qwen_1_5 - -BASE_RUN_NAME="emova-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_7M_vqa_stage_1_5" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path /vlm/data/train_images/Emova-ollm/emova-alignment-7m/llava_format_vqa.json \ - --image_folder / \ - --vision_tower ${VISION_MODEL_VERSION} \ - --pretrain_mm_mlp_adapter "/vlm/xiangan/unicom_unit/checkpoints/projectors/emova-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-560-v1-_vlm_pretrain_models_Qwen_Qwen2.5-3B-Instruct-mlp2x_gelu-pretrain_blip558k_plain_stage_1/mm_projector.bin" \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_select_layer -2 \ - --mm_projector_type patch_merger \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 560), (1120, 560), (560, 1120), (1120, 1120), (1680, 560), (560, 1680)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --output_dir ./checkpoints/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --mm_vision_tower_lr 1e-5 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 5000 \ - --gradient_checkpointing True \ - --dataloader_num_workers 4 \ - --lazy_preprocess True \ - --mm_vision_select_feature patch \ No newline at end of file diff --git a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_pretrain_v2_stage_1.sh b/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_pretrain_v2_stage_1.sh deleted file mode 100755 index 866f6b18..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_pretrain_v2_stage_1.sh +++ /dev/null @@ -1,58 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-3B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v1" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="emova-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain_stage_1" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type patch_merger \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 2 \ - --lazy_preprocess True \ - --mm_vision_select_feature patch \ No newline at end of file diff --git a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_pretrain_v2_stage_2.sh b/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_pretrain_v2_stage_2.sh deleted file mode 100755 index f4aa6dcf..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_emova_pretrain_v2_stage_2.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-3B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v1" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="" - - -PROJECTOR_NAME="emova-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-560-v1-_vlm_pretrain_models_Qwen_Qwen2.5-3B-Instruct-mlp2x_gelu-3.4M-stage2" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="emova_llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_3.4M-14nodes-stage2" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path checkpoints/emova-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-560-v1-_vlm_pretrain_models_Qwen_Qwen2.5-3B-Instruct-mlp2x_gelu-pretrain_7M_vqa_stage_1_5/ \ - --version ${PROMPT_VERSION} \ - --data_path /vlm/data/train_images/Emova-ollm/emova-sft-4m/llava_format_vqa_txt.json \ - --image_folder "/" \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type patch_merger \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 560), (1120, 560), (1680, 560), (2240, 560), (560, 1120), (1120, 1120), (1680, 1120), (2240, 1120), (560, 1680), (1120, 1680), (1680, 1680), (2240, 1680), (560, 2240), (1120, 2240), (1680, 2240), (2240, 2240)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 14000 \ - --dataloader_num_workers 4 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True \ - --mm_vision_select_feature patch diff --git a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_ov_finetune.sh b/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_ov_finetune.sh deleted file mode 100755 index 9f029332..00000000 --- a/llava_next/scripts/deprecated/560px/dist_rice_l_14_560_v1_qwen25_3b_ov_finetune.sh +++ /dev/null @@ -1,71 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-3B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v1" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-560-v1-_vlm_pretrain_models_Qwen_Qwen2.5-3B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="one_vision_llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-14nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v12 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path /mnt/vlmdata/data/finetune_data/onevision_robot_3871k.json \ - --image_folder "/mnt/vlmdata/data/train_images" \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 560), (1120, 560), (1680, 560), (2240, 560), (560, 1120), (1120, 1120), (1680, 1120), (2240, 1120), (560, 1680), (1120, 1680), (1680, 1680), (2240, 1680), (560, 2240), (1120, 2240), (1680, 2240), (2240, 2240)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 14000 \ - --dataloader_num_workers 4 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True \ - --mm_vision_select_feature patch_interpolate diff --git a/llava_next/scripts/deprecated/backup/dist_finetune_AIMv2.sh b/llava_next/scripts/deprecated/backup/dist_finetune_AIMv2.sh deleted file mode 100755 index fae911cf..00000000 --- a/llava_next/scripts/deprecated/backup/dist_finetune_AIMv2.sh +++ /dev/null @@ -1,71 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/pretrain_models/apple/aimv2-1B-patch14-448" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images" -PROJECTOR_NAME="llavanext-_vlm_pretrain_models_apple_aimv2-1B-patch14-448-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" - -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-apple_aimv2-1B-patch14-448-qwen2.5-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 1000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/dist_finetune_EVA8B_448px.sh b/llava_next/scripts/deprecated/backup/dist_finetune_EVA8B_448px.sh deleted file mode 100755 index 7bb6c151..00000000 --- a/llava_next/scripts/deprecated/backup/dist_finetune_EVA8B_448px.sh +++ /dev/null @@ -1,71 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/pretrain_models/EVA_8B_448px" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images" -PROJECTOR_NAME="llavanext-_vlm_pretrain_models_EVA_8B_448px-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" - -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-_vlm_pretrain_models_EVA_8B_448px-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 1000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/dist_finetune_InternViT_300M_448px.sh b/llava_next/scripts/deprecated/backup/dist_finetune_InternViT_300M_448px.sh deleted file mode 100755 index 8867f0d1..00000000 --- a/llava_next/scripts/deprecated/backup/dist_finetune_InternViT_300M_448px.sh +++ /dev/null @@ -1,71 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/pretrain_models/InternViT-300M-448px" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images" -PROJECTOR_NAME="llavanext-_vlm_pretrain_models_InternViT-300M-448px-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" - -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-_vlm_pretrain_models_InternViT-300M-448px-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 1000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/dist_finetune_rice_384.sh b/llava_next/scripts/deprecated/backup/dist_finetune_rice_384.sh deleted file mode 100755 index dacbc4e9..00000000 --- a/llava_next/scripts/deprecated/backup/dist_finetune_rice_384.sh +++ /dev/null @@ -1,71 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/RICE/rice-vit-large-patch14-378" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images" -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_RICE_rice-vit-large-patch14-378-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" - -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-RICE-vit-l-14-378px-qwen2.5-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(378, 756), (756, 378), (756, 756), (1134, 378), (378, 1134)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 1000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/dist_finetune_rice_384_v2.sh b/llava_next/scripts/deprecated/backup/dist_finetune_rice_384_v2.sh deleted file mode 100755 index 0acabcc3..00000000 --- a/llava_next/scripts/deprecated/backup/dist_finetune_rice_384_v2.sh +++ /dev/null @@ -1,71 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/RICE/rice-vit-large-patch14-378-v2" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images" -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_RICE_rice-vit-large-patch14-378-v2-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" - -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-RICE-vit-l-14-378px-v2-qwen2.5-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(378, 756), (756, 378), (756, 756), (1134, 378), (378, 1134)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 1000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/dist_finetune_rice_384_v3.sh b/llava_next/scripts/deprecated/backup/dist_finetune_rice_384_v3.sh deleted file mode 100755 index efd883a8..00000000 --- a/llava_next/scripts/deprecated/backup/dist_finetune_rice_384_v3.sh +++ /dev/null @@ -1,71 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/RICE/rice-vit-large-patch14-378-v3" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images" -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_RICE_rice-vit-large-patch14-378-v3-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" - -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-RICE-vit-l-14-378px-v3-qwen2.5-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(378, 756), (756, 378), (756, 756), (1134, 378), (378, 1134)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 1000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/dist_finetune_rice_384_v4.sh b/llava_next/scripts/deprecated/backup/dist_finetune_rice_384_v4.sh deleted file mode 100755 index 739de5f5..00000000 --- a/llava_next/scripts/deprecated/backup/dist_finetune_rice_384_v4.sh +++ /dev/null @@ -1,71 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/RICE/rice-vit-large-patch14-378-v4" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images" -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_RICE_rice-vit-large-patch14-378-v4-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" - -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-RICE-vit-l-14-378px-v4-qwen2.5-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(378, 756), (756, 378), (756, 756), (1134, 378), (378, 1134)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 1000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/dist_finetune_unit.sh b/llava_next/scripts/deprecated/backup/dist_finetune_unit.sh deleted file mode 100755 index 15e126a9..00000000 --- a/llava_next/scripts/deprecated/backup/dist_finetune_unit.sh +++ /dev/null @@ -1,71 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/pretrain_models/UNIT/UNIT_600M_448px" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images" -PROJECTOR_NAME="llavanext-_vlm_pretrain_models_UNIT_UNIT_600M_448px-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" - -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-unit-qwen2.5-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 1000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/dist_finetune_unit_1b_448px.sh b/llava_next/scripts/deprecated/backup/dist_finetune_unit_1b_448px.sh deleted file mode 100755 index c1031c7c..00000000 --- a/llava_next/scripts/deprecated/backup/dist_finetune_unit_1b_448px.sh +++ /dev/null @@ -1,71 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/pretrain_models/UNIT/UNIT_1B_448px" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images" -PROJECTOR_NAME="llavanext-_vlm_pretrain_models_UNIT_UNIT_1B_448px-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" - -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-unit-448px-qwen2.5-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 1000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/dist_mlcd_bigG_14_448_finetune.sh b/llava_next/scripts/deprecated/backup/dist_mlcd_bigG_14_448_finetune.sh deleted file mode 100755 index d67de090..00000000 --- a/llava_next/scripts/deprecated/backup/dist_mlcd_bigG_14_448_finetune.sh +++ /dev/null @@ -1,72 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/mlcd-vit-bigG-patch14-448" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_mlcd-vit-bigG-patch14-448-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v1 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/dist_mlcd_bigG_14_448_pretrain.sh b/llava_next/scripts/deprecated/backup/dist_mlcd_bigG_14_448_pretrain.sh deleted file mode 100755 index a3e6f005..00000000 --- a/llava_next/scripts/deprecated/backup/dist_mlcd_bigG_14_448_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/mlcd-vit-bigG-patch14-448" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v1 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_pretrain_AIMv2.sh b/llava_next/scripts/deprecated/backup/dist_pretrain_AIMv2.sh deleted file mode 100755 index 2b6e4809..00000000 --- a/llava_next/scripts/deprecated/backup/dist_pretrain_AIMv2.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/pretrain_models/apple/aimv2-1B-patch14-448" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain/images \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_pretrain_EVA8B_448px.sh b/llava_next/scripts/deprecated/backup/dist_pretrain_EVA8B_448px.sh deleted file mode 100755 index eb9fb776..00000000 --- a/llava_next/scripts/deprecated/backup/dist_pretrain_EVA8B_448px.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/pretrain_models/EVA_8B_448px" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain/images \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_pretrain_InternViT_300M_448px.sh b/llava_next/scripts/deprecated/backup/dist_pretrain_InternViT_300M_448px.sh deleted file mode 100755 index c8a5c517..00000000 --- a/llava_next/scripts/deprecated/backup/dist_pretrain_InternViT_300M_448px.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/pretrain_models/InternViT-300M-448px" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain/images \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_pretrain_rice_384.sh b/llava_next/scripts/deprecated/backup/dist_pretrain_rice_384.sh deleted file mode 100755 index 4375a6c8..00000000 --- a/llava_next/scripts/deprecated/backup/dist_pretrain_rice_384.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/RICE/rice-vit-large-patch14-378" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain/images \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_pretrain_rice_384_v2.sh b/llava_next/scripts/deprecated/backup/dist_pretrain_rice_384_v2.sh deleted file mode 100755 index 20c49cfb..00000000 --- a/llava_next/scripts/deprecated/backup/dist_pretrain_rice_384_v2.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/RICE/rice-vit-large-patch14-378-v2" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain/images \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_pretrain_rice_384_v3.sh b/llava_next/scripts/deprecated/backup/dist_pretrain_rice_384_v3.sh deleted file mode 100755 index 2862544e..00000000 --- a/llava_next/scripts/deprecated/backup/dist_pretrain_rice_384_v3.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/RICE/rice-vit-large-patch14-378-v3" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain/images \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_pretrain_rice_384_v4.sh b/llava_next/scripts/deprecated/backup/dist_pretrain_rice_384_v4.sh deleted file mode 100755 index 7a34a055..00000000 --- a/llava_next/scripts/deprecated/backup/dist_pretrain_rice_384_v4.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/RICE/rice-vit-large-patch14-378-v4" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain/images \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_pretrain_unit.sh b/llava_next/scripts/deprecated/backup/dist_pretrain_unit.sh deleted file mode 100755 index d4c6028c..00000000 --- a/llava_next/scripts/deprecated/backup/dist_pretrain_unit.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/pretrain_models/UNIT/UNIT_600M_448px" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain/images \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_pretrain_unit_1b_448px.sh b/llava_next/scripts/deprecated/backup/dist_pretrain_unit_1b_448px.sh deleted file mode 100755 index 641d0398..00000000 --- a/llava_next/scripts/deprecated/backup/dist_pretrain_unit_1b_448px.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/pretrain_models/UNIT/UNIT_1B_448px" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile.txt \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain/images \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3300000_finetune.sh b/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3300000_finetune.sh deleted file mode 100755 index 780b6db4..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3300000_finetune.sh +++ /dev/null @@ -1,72 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-huge-patch14-448-3300000" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-huge-patch14-448-3300000-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v1 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 1000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3300000_pretrain.sh b/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3300000_pretrain.sh deleted file mode 100755 index bbdc1fcd..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3300000_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-huge-patch14-448-3300000" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v0 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3500000_finetune.sh b/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3500000_finetune.sh deleted file mode 100755 index 16050549..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3500000_finetune.sh +++ /dev/null @@ -1,72 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-huge-patch14-448-3500000" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-huge-patch14-448-3500000-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v0 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 1000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3500000_pretrain.sh b/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3500000_pretrain.sh deleted file mode 100755 index 9a94a37d..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3500000_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-huge-patch14-448-3500000" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v0 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True diff --git a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3500000_pretrain_sdpa.sh b/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3500000_pretrain_sdpa.sh deleted file mode 100755 index 6bd27a2d..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3500000_pretrain_sdpa.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-huge-patch14-448-3500000" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain_sdpa" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v0 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True diff --git a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3700000_finetune.sh b/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3700000_finetune.sh deleted file mode 100755 index 050c4737..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3700000_finetune.sh +++ /dev/null @@ -1,72 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-huge-patch14-448-3700000" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-huge-patch14-448-3700000-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v1 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 1000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3700000_pretrain.sh b/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3700000_pretrain.sh deleted file mode 100755 index 0c6fab94..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3700000_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-huge-patch14-448-3700000" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v1 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3700000_pretrain_sdpa.sh b/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3700000_pretrain_sdpa.sh deleted file mode 100755 index c126c25b..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_w_ocr_3700000_pretrain_sdpa.sh +++ /dev/null @@ -1,56 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-huge-patch14-448-3700000" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain_sdpa" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v1 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_wo_ocr_finetune.sh b/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_wo_ocr_finetune.sh deleted file mode 100755 index e1fb3fe7..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_wo_ocr_finetune.sh +++ /dev/null @@ -1,69 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-huge-patch14-448-wo-ocr" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-huge-patch14-448-wo-ocr-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v0 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --dataloader_drop_last True \ - --attn_implementation sdpa diff --git a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_wo_ocr_pretrain-00410000.sh b/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_wo_ocr_pretrain-00410000.sh deleted file mode 100755 index f72fbbc4..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_wo_ocr_pretrain-00410000.sh +++ /dev/null @@ -1,58 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-huge-patch14-448-wo-ocr-00410000" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain_sdpa" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v0 \ - --master_port 12345 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True diff --git a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_wo_ocr_pretrain.sh b/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_wo_ocr_pretrain.sh deleted file mode 100755 index da139b04..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_h_14_448_wo_ocr_pretrain.sh +++ /dev/null @@ -1,58 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-huge-patch14-448-wo-ocr" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain_sdpa" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v0 \ - --master_port 12345 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True diff --git a/llava_next/scripts/deprecated/backup/dist_rice_l_14_336_v0_finetune.sh b/llava_next/scripts/deprecated/backup/dist_rice_l_14_336_v0_finetune.sh deleted file mode 100755 index 99ebaf9c..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_l_14_336_v0_finetune.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-336-v0" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-336-v0-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v1 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/backup/dist_rice_l_14_336_v0_pretrain.sh b/llava_next/scripts/deprecated/backup/dist_rice_l_14_336_v0_pretrain.sh deleted file mode 100755 index 7ca92a68..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_l_14_336_v0_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-336-v0" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v1 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_rice_l_14_378_v0_finetune.sh b/llava_next/scripts/deprecated/backup/dist_rice_l_14_378_v0_finetune.sh deleted file mode 100755 index 8eab17e7..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_l_14_378_v0_finetune.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-378-v0" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-378-v0-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v0 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(378, 756), (756, 378), (756, 756), (1134, 378), (378, 1134)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/backup/dist_rice_l_14_378_v0_pretrain.sh b/llava_next/scripts/deprecated/backup/dist_rice_l_14_378_v0_pretrain.sh deleted file mode 100755 index 4461d6b0..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_l_14_378_v0_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-378-v0" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v0 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_rice_l_14_378_v1_finetune.sh b/llava_next/scripts/deprecated/backup/dist_rice_l_14_378_v1_finetune.sh deleted file mode 100755 index 3da89e78..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_l_14_378_v1_finetune.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-378-v1" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-378-v1-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v0 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(378, 756), (756, 378), (756, 756), (1134, 378), (378, 1134)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/backup/dist_rice_l_14_378_v1_pretrain.sh b/llava_next/scripts/deprecated/backup/dist_rice_l_14_378_v1_pretrain.sh deleted file mode 100755 index 3963594b..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_l_14_378_v1_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-378-v1" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v0 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_rice_l_14_448_v0_finetune.sh b/llava_next/scripts/deprecated/backup/dist_rice_l_14_448_v0_finetune.sh deleted file mode 100755 index ee8a8e83..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_l_14_448_v0_finetune.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-448-v0" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-448-v0-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v1 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/backup/dist_rice_l_14_448_v0_pretrain.sh b/llava_next/scripts/deprecated/backup/dist_rice_l_14_448_v0_pretrain.sh deleted file mode 100755 index e6a5e6b3..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_l_14_448_v0_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-448-v0" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v1 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_rice_l_14_560_v1_finetune.sh b/llava_next/scripts/deprecated/backup/dist_rice_l_14_560_v1_finetune.sh deleted file mode 100755 index 9dcbb867..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_l_14_560_v1_finetune.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v1" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-560-v1-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v2 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 1120), (1120, 560), (1120, 1120), (1680, 560), (560, 1680)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/backup/dist_rice_l_14_560_v1_pretrain.sh b/llava_next/scripts/deprecated/backup/dist_rice_l_14_560_v1_pretrain.sh deleted file mode 100755 index d2369250..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_l_14_560_v1_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v1" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v2 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_rice_l_14_560_v2_finetune.sh b/llava_next/scripts/deprecated/backup/dist_rice_l_14_560_v2_finetune.sh deleted file mode 100755 index 16852838..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_l_14_560_v2_finetune.sh +++ /dev/null @@ -1,70 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v2" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-560-v2-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-8nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v2 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 1120), (1120, 560), (1120, 1120), (1680, 560), (560, 1680)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/backup/dist_rice_l_14_560_v2_pretrain.sh b/llava_next/scripts/deprecated/backup/dist_rice_l_14_560_v2_pretrain.sh deleted file mode 100755 index 28cb4c8f..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_l_14_560_v2_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v2" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v2 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 2 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/dist_rice_l_14_672_v0_pretrain.sh b/llava_next/scripts/deprecated/backup/dist_rice_l_14_672_v0_pretrain.sh deleted file mode 100755 index f0147583..00000000 --- a/llava_next/scripts/deprecated/backup/dist_rice_l_14_672_v0_pretrain.sh +++ /dev/null @@ -1,57 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-672-v0" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile list_host_v0 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/finetune_AIMv2_l14_336px.sh b/llava_next/scripts/deprecated/backup/finetune_AIMv2_l14_336px.sh deleted file mode 100755 index 88d43af8..00000000 --- a/llava_next/scripts/deprecated/backup/finetune_AIMv2_l14_336px.sh +++ /dev/null @@ -1,69 +0,0 @@ -export OMP_NUM_THREADS=8 -export NCCL_IB_DISABLE=0 -export NCCL_IB_GID_INDEX=3 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_DEBUG=INFO -export NUM_GPUS=8 -export NNODES=1 -export RANK=0 -export ADDR="localhost" -export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/pretrain_models/apple/aimv2-large-patch14-336" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images" -PROJECTOR_NAME="llavanext-_vlm_pretrain_models_apple_aimv2-large-patch14-336-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - - -ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 8 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 2000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/finetune_RICE_l14_378px.sh b/llava_next/scripts/deprecated/backup/finetune_RICE_l14_378px.sh deleted file mode 100755 index eadd1073..00000000 --- a/llava_next/scripts/deprecated/backup/finetune_RICE_l14_378px.sh +++ /dev/null @@ -1,69 +0,0 @@ -export OMP_NUM_THREADS=8 -export NCCL_IB_DISABLE=0 -export NCCL_IB_GID_INDEX=3 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_DEBUG=INFO -export NUM_GPUS=8 -export NNODES=1 -export RANK=0 -export ADDR="localhost" -export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/RICE/rice-vit-large-patch14-378-v4" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images" -PROJECTOR_NAME="llavanext-_vlm_xiangan_pretrain_models_deepglint_RICE_rice-vit-large-patch14-378-v4-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain_single_node" -PROMPT_VERSION="qwen_1_5" - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain_single_node" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - - -ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(378, 756), (756, 378), (756, 756), (1134, 378), (378, 1134)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 8 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 2000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/finetune_clip.sh b/llava_next/scripts/deprecated/backup/finetune_clip.sh deleted file mode 100755 index 80e8bf72..00000000 --- a/llava_next/scripts/deprecated/backup/finetune_clip.sh +++ /dev/null @@ -1,68 +0,0 @@ -export OMP_NUM_THREADS=8 -export NCCL_IB_DISABLE=0 -export NCCL_IB_GID_INDEX=3 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_DEBUG=INFO -export NUM_GPUS=8 -export NNODES=1 -export RANK=0 -export ADDR="localhost" -export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="openai/clip-vit-large-patch14-336" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="llava-next data root" -PROJECTOR_NAME="pretrained_projector name" - -PROMPT_VERSION="qwen_1_5" - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 8 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 3000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/backup/finetune_mlcd.sh b/llava_next/scripts/deprecated/backup/finetune_mlcd.sh deleted file mode 100755 index 7014540d..00000000 --- a/llava_next/scripts/deprecated/backup/finetune_mlcd.sh +++ /dev/null @@ -1,70 +0,0 @@ -export OMP_NUM_THREADS=8 -export NCCL_IB_DISABLE=0 -export NCCL_IB_GID_INDEX=3 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_DEBUG=INFO -export NUM_GPUS=8 -export NNODES=1 -export RANK=0 -export ADDR="localhost" -export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="DeepGlint-AI/mlcd-vit-bigG-patch14-448" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images" -PROJECTOR_NAME="pretrained_projector name" - -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter ./checkpoints/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "./checkpoints/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 8 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 2000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True - diff --git a/llava_next/scripts/deprecated/backup/finetune_ov.sh b/llava_next/scripts/deprecated/backup/finetune_ov.sh deleted file mode 100755 index d0d7de16..00000000 --- a/llava_next/scripts/deprecated/backup/finetune_ov.sh +++ /dev/null @@ -1,75 +0,0 @@ -export OMP_NUM_THREADS=8 -export NCCL_IB_DISABLE=0 -export NCCL_IB_GID_INDEX=3 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_DEBUG=INFO - -LLM_VERSION="Qwen/Qwen2.5-7B-Instruct" -# for 7b model we recommend bs=1, accum=2, 16 nodes, 128 gpus, lr=1e-5, warmup=0.03 -# for 72b model we recommend bs=1, accum=1, 32 nodes, 256 gpus, lr=1e-5, warmup=0.03 -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="DeepGlint-AI/mlcd-vit-large-patch14-336" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" - -############### Pretrain ################ - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-ov-robo" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -############### Finetune ################ - -# Stage 2 -PROMPT_VERSION="qwen_1_5" -RUN_NAME="llava-onevision-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-ov_stage_am9" -PREV_STAGE_CHECKPOINT="/path/to/ckpt/" # replace it with your last checkpoint training from single image collection -DATA_ROOT="/vlm/data/train_images" -echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}" -echo "MID_RUN_NAME: ${RUN_NAME}" - -ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path $PREV_STAGE_CHECKPOINT \ - --version $PROMPT_VERSION \ - --data_path ${DATA_ROOT}/onevision/onevision.yaml \ - --image_folder ${DATA_ROOT}/onevision \ - --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ - --mm_vision_tower_lr=2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[[336, 336],[336,672],[336,1008],[336,1344],[336,1680],[336,2016],[672,336],[672,672],[672,1008],[672,1344],[672,1680],[672,2016],[1008,336],[1008,672],[1008,1008],[1008,1344],[1008,1680],[1008,2016],[1344,336],[1344,672],[1344,1008],[1344,1344],[1344,1680],[1344,2016],[1680,336],[1680,672],[1680,1008],[1680,1344],[1680,1680],[1680,2016],[2016,336],[2016,672],[2016,1008],[2016,1344],[2016,1680],[2016,2016]]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $RUN_NAME \ - --output_dir ./checkpoints/onevision/$RUN_NAME \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 1000 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --gradient_checkpointing True \ - --dataloader_num_workers 4 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True \ - --frames_upbound 32 -exit 0; - -# You can delete the sdpa attn_implementation if you want to use flash attn \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/pretrain_AIMv2_l14_336px.sh b/llava_next/scripts/deprecated/backup/pretrain_AIMv2_l14_336px.sh deleted file mode 100755 index 74dad297..00000000 --- a/llava_next/scripts/deprecated/backup/pretrain_AIMv2_l14_336px.sh +++ /dev/null @@ -1,59 +0,0 @@ -export OMP_NUM_THREADS=8 -export NCCL_IB_DISABLE=0 -export NCCL_IB_GID_INDEX=3 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_DEBUG=INFO -export NUM_GPUS=8 -export NNODES=1 -export RANK=0 -export ADDR="localhost" -export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/pretrain_models/apple/aimv2-large-patch14-336" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain/images \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --run_name $BASE_RUN_NAME \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/pretrain_RICE_L14_378px.sh b/llava_next/scripts/deprecated/backup/pretrain_RICE_L14_378px.sh deleted file mode 100755 index 23cf17db..00000000 --- a/llava_next/scripts/deprecated/backup/pretrain_RICE_L14_378px.sh +++ /dev/null @@ -1,59 +0,0 @@ -export OMP_NUM_THREADS=8 -export NCCL_IB_DISABLE=0 -export NCCL_IB_GID_INDEX=3 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_DEBUG=INFO -export NUM_GPUS=8 -export NNODES=1 -export RANK=0 -export ADDR="localhost" -export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/RICE/rice-vit-large-patch14-378-v4" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain/images \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --run_name $BASE_RUN_NAME \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/pretrain_RICE_h14_448px.sh b/llava_next/scripts/deprecated/backup/pretrain_RICE_h14_448px.sh deleted file mode 100755 index bf03a1d3..00000000 --- a/llava_next/scripts/deprecated/backup/pretrain_RICE_h14_448px.sh +++ /dev/null @@ -1,60 +0,0 @@ -export OMP_NUM_THREADS=8 -export NCCL_IB_DISABLE=0 -export NCCL_IB_GID_INDEX=3 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_DEBUG=INFO -export NUM_GPUS=8 -export NNODES=1 -export RANK=0 -export ADDR="localhost" -export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-huge-patch14-448-init" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --run_name $BASE_RUN_NAME \ - --attn_implementation sdpa \ No newline at end of file diff --git a/llava_next/scripts/deprecated/backup/pretrain_mlcd.sh b/llava_next/scripts/deprecated/backup/pretrain_mlcd.sh deleted file mode 100755 index e8cd1591..00000000 --- a/llava_next/scripts/deprecated/backup/pretrain_mlcd.sh +++ /dev/null @@ -1,59 +0,0 @@ -export OMP_NUM_THREADS=8 -export NCCL_IB_DISABLE=0 -export NCCL_IB_GID_INDEX=3 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_DEBUG=INFO -export NUM_GPUS=8 -export NNODES=1 -export RANK=0 -export ADDR="localhost" -export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="DeepGlint-AI/mlcd-vit-bigG-patch14-448" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="llava 558k data root" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder ${DATA_ROOT}/LLaVA-Pretrain/images \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir ./checkpoints/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 16 \ - --lazy_preprocess True \ - --report_to wandb \ - --run_name $BASE_RUN_NAME \ No newline at end of file diff --git a/llava_next/scripts/deprecated/bigG/stage_1.5.sh b/llava_next/scripts/deprecated/bigG/stage_1.5.sh deleted file mode 100755 index 59561360..00000000 --- a/llava_next/scripts/deprecated/bigG/stage_1.5.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - -# ========================================================================= -# 1. 
Environment & Path Configuration -# ========================================================================= -export PYTHONPATH=$(pwd) - -# --- Model & Data Path Definitions --- -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-bigG-patch14-560-v2" -PRETRAINED_PROJECTOR="/vlm/xiangan/checkpoints_rice_vl/projectors/_vlm_xiangan_pretrain_models_deepglint_rice-vit-bigG-patch14-560-v2-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct_patch_merger_558k_stage_1/mm_projector.bin" - -# --- Run Naming --- -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -BASE_RUN_NAME="emova-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-patch_merger-pretrain_7M_vqa_stage_1_5" - - -# ========================================================================= -# 2. Launch Training -# ========================================================================= -echo "Starting Training Run: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile_14nodes.txt \ - llava/train/train_mem.py \ - --model_name_or_path ${LLM_VERSION} \ - --vision_tower ${VISION_MODEL_VERSION} \ - --pretrain_mm_mlp_adapter "${PRETRAINED_PROJECTOR}" \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --version qwen_1_5 \ - --mm_vision_select_layer -2 \ - --mm_projector_type patch_merger \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --mm_vision_select_feature patch \ - --data_path /vlm/data/train_images/Emova-ollm/emova-alignment-7m/llava_format_vqa.json \ - --image_folder / \ - --lazy_preprocess True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 560), (1120, 560), (560, 1120), (1120, 1120), (1680, 560), (560, 1680)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --output_dir /vlm/xiangan/checkpoints_rice_vl/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --mm_vision_tower_lr 2e-6 \ - --learning_rate 1e-5 \ - --weight_decay 0. \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 4500 \ - --gradient_checkpointing True \ - --dataloader_num_workers 4 \ - --deepspeed scripts/zero3.json diff --git a/llava_next/scripts/deprecated/bigG/stage_1.sh b/llava_next/scripts/deprecated/bigG/stage_1.sh deleted file mode 100644 index 06783ad2..00000000 --- a/llava_next/scripts/deprecated/bigG/stage_1.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash - -# ================================================================= -# Multimodal Model Pre-training Script -# ================================================================= -# -# This script handles the setup and execution of a DeepSpeed-based -# pre-training job for a large multimodal model. -# - -# --- 1. Environment & Node Configuration --- -# Sets up the environment for distributed training. -export OMP_NUM_THREADS=8 -export NCCL_IB_DISABLE=0 -export NCCL_IB_GID_INDEX=3 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_DEBUG=INFO -export NUM_GPUS=8 -export NNODES=1 -export RANK=0 -export ADDR="localhost" -export PORT="29500" -export PYTHONPATH=$(pwd) - - -# --- 2. Model & Data Paths --- -# Define the locations for models and datasets.
-LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-bigG-patch14-560-v2" -DATA_ROOT="/vlm/data/pretrain_data" - - -# --- 3. Run Configuration --- -# Create a unique run name based on model versions for easy tracking. -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -PROMPT_VERSION=plain -BASE_RUN_NAME="${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}_patch_merger_558k_stage_1" - -echo "=================================================" -echo "Starting Pre-training Run: ${BASE_RUN_NAME}" -echo "=================================================" - - -# --- 4. DeepSpeed Training Launch --- -# Execute the training script with specified hyperparameters. -deepspeed --hostfile hostfile_14nodes.txt llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - \ - --model_name_or_path ${LLM_VERSION} \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type patch_merger \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_vision_select_layer -2 \ - --mm_vision_select_feature patch \ - \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --version ${PROMPT_VERSION} \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - \ - --bf16 True \ - --output_dir /vlm/xiangan/checkpoints_rice_vl/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 2 \ - --lazy_preprocess True - -echo "=================================================" -echo "Run ${BASE_RUN_NAME} Finished." 
-echo "=================================================" diff --git a/llava_next/scripts/deprecated/bigG/stage_2.sh b/llava_next/scripts/deprecated/bigG/stage_2.sh deleted file mode 100755 index d4adadd5..00000000 --- a/llava_next/scripts/deprecated/bigG/stage_2.sh +++ /dev/null @@ -1,69 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-bigG-patch14-560-v2" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROMPT_VERSION="qwen_1_5" - - -mm_projector_type="patch_merger" -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-${mm_projector_type}-pretrain_blip558k-emova-alignment-7m-finetune_llavanext780k-14nodes-4x4" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile_14nodes.txt \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path /vlm/xiangan/checkpoints_rice_vl/emova-_vlm_xiangan_pretrain_models_deepglint_rice-vit-bigG-patch14-560-v2-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-patch_merger-pretrain_7M_vqa_stage_1_5 \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type patch_merger \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 560), (1120, 560), (1680, 560), (2240, 560), (560, 1120), (1120, 1120), (1680, 1120), (2240, 1120), (560, 1680), (1120, 1680), (1680, 1680), (2240, 1680), (560, 2240), (1120, 2240), (1680, 2240), (2240, 2240)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "/vlm/xiangan/checkpoints_rice_vl/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 16000 \ - --dataloader_num_workers 2 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/bigG/stage_2_ov.sh b/llava_next/scripts/deprecated/bigG/stage_2_ov.sh deleted file mode 100755 index dfc4668b..00000000 --- a/llava_next/scripts/deprecated/bigG/stage_2_ov.sh +++ /dev/null @@ -1,68 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-bigG-patch14-560-v2" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROMPT_VERSION="qwen_1_5" - - -mm_projector_type="patch_merger" -BASE_RUN_NAME="llava-one-vision-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-${mm_projector_type}-pretrain_blip558k-emova-alignment-7m-finetune_llavanext780k-14nodes-4x4-no-group" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile hostfile_14nodes.txt \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path /vlm/xiangan/checkpoints_rice_vl/emova-_vlm_xiangan_pretrain_models_deepglint_rice-vit-bigG-patch14-560-v2-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-patch_merger-pretrain_7M_vqa_stage_1_5 \ - --version ${PROMPT_VERSION} \ - --data_path /vlm/data/train_images/lmms-lab/LLaVA-OneVision-Data/llava_format.json \ - --image_folder / \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type patch_merger \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 560), (1120, 560), (1680, 560), (2240, 560), (560, 1120), (1120, 1120), (1680, 1120), (2240, 1120), (560, 1680), (1120, 1680), (1680, 1680), (2240, 1680), (560, 2240), (1120, 2240), (1680, 2240), (2240, 2240)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "/vlm/xiangan/checkpoints_rice_vl/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 16000 \ - --dataloader_num_workers 2 \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True diff --git a/llava_next/scripts/deprecated/llava_one_vision/rice_L_14_560_v1_qwen25_3b_2x2_from_stage_1.5_3B.sh b/llava_next/scripts/deprecated/llava_one_vision/rice_L_14_560_v1_qwen25_3b_2x2_from_stage_1.5_3B.sh deleted file mode 100755 index 463ea342..00000000 --- a/llava_next/scripts/deprecated/llava_one_vision/rice_L_14_560_v1_qwen25_3b_2x2_from_stage_1.5_3B.sh +++ /dev/null @@ -1,119 +0,0 @@ -#!/bin/bash - -# ============================================================================== -# Configuration -# ============================================================================== - -# --- Environment Setup --- -# Set the Python path to the current directory -export PYTHONPATH=$(pwd) - -# (Optional) Multi-node/GPU environment variables - uncomment to use -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" - -# --- Core Model & Data Configuration --- -# Paths to the pre-trained models and initial checkpoint -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-3B-Instruct" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v1" -MODEL_CHECKPOINT="checkpoints/emova-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-560-v1-_vlm_pretrain_models_Qwen_Qwen2.5-3B-Instruct-mlp2x_gelu-pretrain_7M_vqa_stage_1_5/" - -# Specific dataset for this training run -TRAIN_DATA_PATH="/vlm/data/train_images/lmms-lab/LLaVA-OneVision-Data/llava_format.json" -TRAIN_IMAGE_FOLDER="/" # Root directory for images, as paths in JSON are absolute - -# --- Run & Naming Configuration --- -# Clean up model version strings for use in names (replaces '/' with '_') -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" - -# Define the type of projector and the prompt version -mm_projector_type="patch_merger" -PROMPT_VERSION="qwen_1_5" - -# Construct a descriptive name for the training run and define the output directory -BASE_RUN_NAME="LLaVA-OV-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-${mm_projector_type}-Align-EMOVA3.5M-Finetune-OV3.5M-14nodes-4x4" -OUTPUT_DIR="./checkpoints/${BASE_RUN_NAME}" - -# --- Training Hyperparameters --- -EPOCHS=1 -PER_DEVICE_BATCH_SIZE=2 -PER_DEVICE_EVAL_BATCH_SIZE=4 -GRAD_ACCUM_STEPS=1 -LEARNING_RATE=1e-5 -WARMUP_RATIO=0.03 -SAVE_STEPS=500 -MAX_SEQ_LEN=12000 -DATALOADER_WORKERS=2 - -# ============================================================================== -# Pre-flight Check -# ============================================================================== -# Print key configuration parameters for debugging before starting the run -echo "================ Training Configuration ================" -echo "RUN NAME: ${BASE_RUN_NAME}" -echo "MODEL CHECKPOINT:${MODEL_CHECKPOINT}" -echo "DATA PATH: ${TRAIN_DATA_PATH}" -echo "IMAGE FOLDER: ${TRAIN_IMAGE_FOLDER}" -echo "BATCH SIZE: ${PER_DEVICE_BATCH_SIZE} (per device)" -echo "LEARNING RATE: ${LEARNING_RATE}" -echo "MAX SEQ LENGTH: ${MAX_SEQ_LEN}" -echo "OUTPUT DIR: ${OUTPUT_DIR}" -echo "========================================================" - - -# 
============================================================================== -# Training Execution -# ============================================================================== -# Launch the multi-node training script using DeepSpeed -deepspeed --hostfile hostfile_12nodes.txt \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path "${MODEL_CHECKPOINT}" \ - --version "${PROMPT_VERSION}" \ - --data_path "${TRAIN_DATA_PATH}" \ - --image_folder "${TRAIN_IMAGE_FOLDER}" \ - --vision_tower "${VISION_MODEL_VERSION}" \ - --mm_projector_type "${mm_projector_type}" \ - --mm_tunable_parts "mm_vision_tower,mm_mlp_adapter,mm_language_model" \ - --mm_vision_tower_lr 2e-6 \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --image_aspect_ratio "anyres" \ - --image_grid_pinpoints "[(560, 560), (1120, 560), (1680, 560), (2240, 560), (560, 1120), (1120, 1120), (1680, 1120), (2240, 1120), (560, 1680), (1120, 1680), (1680, 1680), (2240, 1680), (560, 2240), (1120, 2240), (1680, 2240), (2240, 2240)]" \ - --mm_patch_merge_type "spatial_unpad" \ - --group_by_modality_length True \ - --bf16 True \ - --run_name "${BASE_RUN_NAME}" \ - --output_dir "${OUTPUT_DIR}" \ - --num_train_epochs "${EPOCHS}" \ - --per_device_train_batch_size "${PER_DEVICE_BATCH_SIZE}" \ - --per_device_eval_batch_size "${PER_DEVICE_EVAL_BATCH_SIZE}" \ - --gradient_accumulation_steps "${GRAD_ACCUM_STEPS}" \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps "${SAVE_STEPS}" \ - --save_total_limit 1 \ - --learning_rate "${LEARNING_RATE}" \ - --weight_decay 0. \ - --warmup_ratio "${WARMUP_RATIO}" \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length "${MAX_SEQ_LEN}" \ - --dataloader_num_workers "${DATALOADER_WORKERS}" \ - --lazy_preprocess True \ - --dataloader_drop_last True \ - --report_to "wandb" \ - --torch_compile True \ - --torch_compile_backend "inductor" diff --git a/llava_next/scripts/deprecated/quick_start/rice_L_14_560_v1_qwen25_3b_2x2_from_stage_1.5_3B.sh b/llava_next/scripts/deprecated/quick_start/rice_L_14_560_v1_qwen25_3b_2x2_from_stage_1.5_3B.sh deleted file mode 100755 index 70e8c1ef..00000000 --- a/llava_next/scripts/deprecated/quick_start/rice_L_14_560_v1_qwen25_3b_2x2_from_stage_1.5_3B.sh +++ /dev/null @@ -1,81 +0,0 @@ -export PYTHONPATH=$(pwd) - -# Basic configuration parameters - organized for clarity -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-3B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/vlm/xiangan/pretrain_models/deepglint/rice-vit-large-patch14-560-v1" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" -PROMPT_VERSION="qwen_1_5" - -# Model configuration -MODEL_PATH="/vlm/xiangan/unicom_unit/checkpoints/emova-_vlm_xiangan_pretrain_models_deepglint_rice-vit-large-patch14-560-v1-_vlm_pretrain_models_Qwen_Qwen2.5-3B-Instruct-mlp2x_gelu-pretrain_7M_vqa_stage_1_5/" -mm_projector_type="patch_merger" -BASE_RUN_NAME="llava_next-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-${mm_projector_type}-pretrain_blip558k_emova_alignment_7m_finetune_llavanext7805_4x4" -OUTPUT_DIR="./checkpoints/${BASE_RUN_NAME}" - -# Training parameters - easy to adjust -EPOCHS=1 -BATCH_SIZE=4 -EVAL_BATCH_SIZE=4 -GRAD_ACCUM=4 -LR=1e-5 -WARMUP_RATIO=0.03 -SAVE_STEPS=500 -MAX_SEQ_LEN=8192 -NUM_WORKERS=4 # increased dataloader worker threads - -# Print key configuration parameters for debugging -echo "================ Training Configuration ================" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" -echo "MODEL_PATH: ${MODEL_PATH}" -echo 
"DATA_PATH: ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json" -echo "BATCH_SIZE: ${BATCH_SIZE} (x num_nodes x GPUs/node)" -echo "LEARNING_RATE: ${LR}" -echo "MAX_SEQ_LEN: ${MAX_SEQ_LEN}" -echo "=========================================" - -# Launch training - use localhost instead of a hostfile -deepspeed --include localhost \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${MODEL_PATH} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type ${mm_projector_type} \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(560, 560), (1120, 560), (1680, 560), (560, 1120), (1120, 1120), (560, 1680)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name ${BASE_RUN_NAME} \ - --output_dir ${OUTPUT_DIR} \ - --num_train_epochs ${EPOCHS} \ - --per_device_train_batch_size ${BATCH_SIZE} \ - --per_device_eval_batch_size ${EVAL_BATCH_SIZE} \ - --gradient_accumulation_steps ${GRAD_ACCUM} \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps ${SAVE_STEPS} \ - --save_total_limit 1 \ - --learning_rate ${LR} \ - --weight_decay 0. \ - --warmup_ratio ${WARMUP_RATIO} \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length ${MAX_SEQ_LEN} \ - --dataloader_num_workers ${NUM_WORKERS} \ - --lazy_preprocess True \ - --report_to wandb \ - --torch_compile True \ - --torch_compile_backend "inductor" \ - --dataloader_drop_last True \ No newline at end of file diff --git a/llava_next/scripts/eval/__init__.py b/llava_next/scripts/eval/__init__.py deleted file mode 100755 index e69de29b..00000000 diff --git a/llava_next/scripts/eval/compute_scores.py b/llava_next/scripts/eval/compute_scores.py deleted file mode 100755 index d36553ab..00000000 --- a/llava_next/scripts/eval/compute_scores.py +++ /dev/null @@ -1,43 +0,0 @@ -import sys, os, json -import glob -from collections import defaultdict -from scripts.eval.robo_metric import ( - get_robovqa_score, - get_openeqa_score - ) - -def print_scores(title, named_num_scores): # dict[name: str]: (num: int, score: float or str) - if len(named_num_scores) > 0: - max_name_len = max([len(name) for name in named_num_scores.keys()]) - name_width = max(max_name_len, len(title)) + 2 - print(f"\n{title:<{name_width}} {'NUM.':<10}{'SCORE':<10}") - for name, (num, score) in named_num_scores.items(): - if isinstance(score, float): - score = f'{score:.2f}' - else: - assert isinstance(score, str) - print(f"{name:<{name_width}}{num:<10}{score:<10}") - -def compute_scores(in_dir): - result_json_path = glob.glob(os.path.join(in_dir, "*.json")) - metric_info_all = {} - - for bmk_path in result_json_path: - bmk_name = os.path.basename(bmk_path).split(".")[0] - if "robovqa" in bmk_path: - score_info, contents = get_robovqa_score(bmk_path) - metric_info_all[f"{bmk_name}"] = (len(contents), score_info["score"] * 100) - metric_info_all[f"{bmk_name}_bleu1"] = (len(contents), score_info["bleu1"] * 100) - metric_info_all[f"{bmk_name}_bleu2"] = (len(contents), score_info["bleu2"] * 100) - metric_info_all[f"{bmk_name}_bleu3"] = (len(contents), score_info["bleu3"] * 
100) - metric_info_all[f"{bmk_name}_bleu4"] = (len(contents), score_info["bleu4"] * 100) - elif "openeqa" in bmk_path: - score_info, contents = get_openeqa_score(bmk_path) - for type_name, score_item in score_info.items(): - metric_info_all[f"{bmk_name}_{type_name}"] = (len(contents), score_item * 100) - - print_scores('GENERATIVE', metric_info_all) - -if __name__ == '__main__': - in_dir = sys.argv[1] - compute_scores(in_dir) diff --git a/llava_next/scripts/eval/eval_robo.sh b/llava_next/scripts/eval/eval_robo.sh deleted file mode 100755 index f5109200..00000000 --- a/llava_next/scripts/eval/eval_robo.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -e -export PYTHONPATH=$(pwd) - -export YOUR_API_KEY="YOUR_API_KEY" -export YOUR_ENDPOINT="YOUR_ENDPOINT" - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 <model_dir>" - exit 1 -fi - -model_dir=$1 -bmk_root=/vlm/data/benchmarks -image_folder=/vlm/data/eval_data/eval_images - -if [ ! -d "$model_dir" ]; then - echo "Error: $model_dir does not exist" - exit 1 -fi - -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -export MASTER_PORT=29501 - -generative_benchmarks="[ -'robovqa', -'openeqa' -]" - -python llava/benchmark/eval_robo.py \ - --model_dir=$model_dir \ - --benchmarks="$(echo $generative_benchmarks | tr -d '\n')" \ - --image_folder=$image_folder \ - --bmk_root=$bmk_root \ - --batch_size=1 \ - --max_new_tokens=128 \ - --num_workers=1 - -python scripts/eval/compute_scores.py $model_dir/eval_robo diff --git a/llava_next/scripts/eval/robo_metric.py b/llava_next/scripts/eval/robo_metric.py deleted file mode 100755 index a5443250..00000000 --- a/llava_next/scripts/eval/robo_metric.py +++ /dev/null @@ -1,200 +0,0 @@ -import json -import numpy as np -import re -from collections import defaultdict -from tqdm import tqdm -import sys, os -from third_party.openeqa.evaluation.llm_match import get_llm_match_score -from nltk.translate.bleu_score import sentence_bleu - -# ---------------------------------- Metrics For RoboVQA ----------------------------------- - -def robovqa_process_results(doc, results): - pred = results.replace("\n", "").lower() - gt = doc["answer"].replace("\n", "").lower() - if gt in ['yes', 'no']: - pred = re.sub(r'\b\w*yes\w*\b', 'yes', pred) - pred = re.sub(r'\b\w*no\w*\b', 'no', pred) - score, bleu1, bleu2, bleu3, bleu4 = get_bleu_score(pred, gt) - return_dict = { - "score": score, - "bleu1": bleu1, - "bleu2": bleu2, - "bleu3": bleu3, - "bleu4": bleu4 - } - return return_dict - -def get_bleu_score(prediction, target): - bleu1, bleu2, bleu3, bleu4 = 0, 0, 0, 0 - candidate = list(prediction.split(" ")) - reference = [list(target.split(" "))] - if target is not None: - # print(f"pred:{pred}, gt:{gt}, bleu:{sentence_bleu(reference, candidate)}") - if len(reference[0]) <= 1: - bleu1 = sentence_bleu(reference, candidate, weights=(1.00, 0.00, 0.00, 0.00)) - bleu2 = sentence_bleu(reference, candidate, weights=(1.00, 0.00, 0.00, 0.00)) - bleu3 = sentence_bleu(reference, candidate, weights=(1.00, 0.00, 0.00, 0.00)) - bleu4 = sentence_bleu(reference, candidate, weights=(1.00, 0.00, 0.00, 0.00)) - elif len(reference[0]) == 2: - bleu1 = sentence_bleu(reference, candidate, weights=(1.00, 0.00, 0.00, 0.00)) - bleu2 = sentence_bleu(reference, candidate, weights=(0.50, 0.50, 0.00, 0.00)) - bleu3 = sentence_bleu(reference, candidate, weights=(0.50, 0.50, 0.00, 0.00)) - bleu4 = sentence_bleu(reference, candidate, weights=(0.50, 0.50, 0.00, 0.00)) - elif len(reference[0]) == 3: - bleu1 = sentence_bleu(reference, candidate, weights=(1.00, 0.00, 0.00, 0.00)) - bleu2 = 
sentence_bleu(reference, candidate, weights=(0.50, 0.50, 0.00, 0.00)) - bleu3 = sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0.00)) - bleu4 = sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0.00)) - else: - bleu1 = sentence_bleu(reference, candidate, weights=(1.00, 0.00, 0.00, 0.00)) - bleu2 = sentence_bleu(reference, candidate, weights=(0.50, 0.50, 0.00, 0.00)) - bleu3 = sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0.00)) - bleu4 = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25)) - - score = (bleu1 + bleu2 + bleu3 + bleu4) / 4 - return score, bleu1, bleu2, bleu3, bleu4 - -def reformat_robovqa_result(result_path): - contents = json.loads(open(result_path).read()) - res_all = [] - for c in contents: - elem = { - "resps":c["pred"], - "doc":{ - "dataset":c["type_level_2"], - "question":c["question"], - "question_type":c["type_level_1"], - "answer":c["gt"] - } - } - res_all.append(elem) - return res_all - -def get_robovqa_score(result_path): - - contents = reformat_robovqa_result(result_path) - res_all = [] - - for item in contents: - res = robovqa_process_results(item["doc"], item["resps"]) - res_all.append(res) - res_score = [one["score"] for one in res_all] - res_bleu1 = [one["bleu1"] for one in res_all] - res_bleu2 = [one["bleu2"] for one in res_all] - res_bleu3 = [one["bleu3"] for one in res_all] - res_bleu4 = [one["bleu4"] for one in res_all] - return { - "score": np.mean(res_score), - "bleu1": np.mean(res_bleu1), - "bleu2": np.mean(res_bleu2), - "bleu3": np.mean(res_bleu3), - "bleu4": np.mean(res_bleu4) - }, contents - -# ---------------------------------- Metrics For OpenEQA ----------------------------------- - -def get_output_filename(path): - dir_name = os.path.dirname(path) - base_name = os.path.basename(path) - name, ext = os.path.splitext(base_name) - new_filename = f"data_with_llm_score{ext}" - new_path = os.path.join(dir_name, new_filename) - return new_path - -def req_openeqa_llm_score(result_path): - - with open(result_path, "r") as f: - contents = json.load(f) - - output_file = get_output_filename(result_path) - temp_data, error_data = [], [] - resume_flag = False - - try: - with open(output_file, "r") as f: - temp_data = json.load(f) - resume_id = temp_data[-1]["unique_id"] - except FileNotFoundError: - resume_id = -1 - resume_flag = True - print("No json file to be loaded.") - - for i, item in enumerate(tqdm(contents)): - - if item["unique_id"] == resume_id: - resume_flag = True - continue - - if resume_flag: - question = item["question"] - answer = item["gt"] - extra_answers = item["extra_gt"] - prediction = item["pred"] - score = get_llm_match_score(question, answer, prediction, extra_answers=extra_answers, endpoint=True) - if isinstance(score, str): - print("*" * 40) - print("Item ID: {}".format(i)) - print("Example question: {}".format(question)) - print("Ground-truth extra-answers: {}".format(extra_answers)) - print("Ground-truth answer: {}".format(answer)) - print("Predicted answer: {}".format(prediction)) - print(f"Saving temporary result into {output_file}") - error_data.append(item) - else: - llm_score = (score - 1) / 4 - item["llm_score"] = llm_score - - print("*" * 40) - print("Item ID: {}".format(i)) - print("Example question: {}".format(question)) - print("Ground-truth extra-answers: {}".format(extra_answers)) - print("Ground-truth answer: {}".format(answer)) - print("Predicted answer: {}".format(prediction)) - print("LLM-match score: {}".format(llm_score)) - - temp_data.append(item) - - 
if (i + 1) % 50 == 0 or (i + 1) == len(contents): - print(f"Saving temporary result into {output_file}") - with open(output_file, 'w') as outfile: - json.dump(temp_data, outfile, indent=4) - - if len(error_data) > 0: - print("*" * 40) - dir_path = os.path.dirname(output_file) - error_path = os.path.join(dir_path, 'score_error.json') - with open(error_path, 'a') as outfile: - json.dump(error_data, outfile, indent=4) - print(f"Saving error response log into {error_path}") - - return output_file - -def get_openeqa_score(result_path): - - scores = defaultdict(list) - scores_avg = defaultdict(float) - output_file = req_openeqa_llm_score(result_path) - - with open(output_file, "r") as f: - contents = json.load(f) - - for item in contents: - type_level_1 = item['type_level_1'] - llm_score = item['llm_score'] - scores[type_level_1].append(llm_score) - - for item in contents: - type_level_2 = item['type_level_2'] - llm_score = item['llm_score'] - scores[type_level_2].append(llm_score) - - for type_name, score_list in scores.items(): - average_score = sum(score_list) / len(score_list) - scores_avg[type_name] = round(average_score, 4) - - return scores_avg, contents - - - - diff --git a/llava_next/scripts/hevc_vit_448/2025_12_04_dist_hevc_l_14_448_finetune_packing.sh b/llava_next/scripts/hevc_vit_448/2025_12_04_dist_hevc_l_14_448_finetune_packing.sh deleted file mode 100644 index 985037af..00000000 --- a/llava_next/scripts/hevc_vit_448/2025_12_04_dist_hevc_l_14_448_finetune_packing.sh +++ /dev/null @@ -1,67 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/pretrain_models/deepglint/hevc/hevc_vit_packing_12_04_00210000_l14_flash_attn_freeze" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="hevc_vit_packing_flashattn_qwen25_select_layer_m2" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-select_layer_m2" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_80 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter /video_vit/xiangan/checkpoint_llava_next/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "/video_vit/xiangan/checkpoint_llava_next/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - 
--per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 2 \ - --lazy_preprocess True \ - --report_to wandb \ - --dataloader_drop_last True diff --git a/llava_next/scripts/hevc_vit_448/2025_12_04_dist_hevc_l_14_448_pretrain_packing.sh b/llava_next/scripts/hevc_vit_448/2025_12_04_dist_hevc_l_14_448_pretrain_packing.sh deleted file mode 100644 index 8addbbb2..00000000 --- a/llava_next/scripts/hevc_vit_448/2025_12_04_dist_hevc_l_14_448_pretrain_packing.sh +++ /dev/null @@ -1,56 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/pretrain_models/deepglint/hevc/hevc_vit_packing_12_04_00210000_l14_flash_attn_freeze" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain_select_layer_m2" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_80 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir /video_vit/xiangan/checkpoint_llava_next/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 2 \ - --lazy_preprocess True diff --git a/llava_next/scripts/hevc_vit_448/2025_12_06_dist_hevc_l_14_448_ocr_finetune_packing.sh b/llava_next/scripts/hevc_vit_448/2025_12_06_dist_hevc_l_14_448_ocr_finetune_packing.sh deleted file mode 100644 index ece98219..00000000 --- a/llava_next/scripts/hevc_vit_448/2025_12_06_dist_hevc_l_14_448_ocr_finetune_packing.sh +++ /dev/null @@ -1,67 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/pretrain_models/deepglint/hevc/hevc_vit_ocr_packing_12_06_00068000_l14_flash_attn" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="hevc_vit_ocr_packing_flashattn_qwen25_select_layer_m2" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-select_layer_m2" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_80 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter /video_vit/xiangan/checkpoint_llava_next/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "/video_vit/xiangan/checkpoint_llava_next/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 2 \ - --lazy_preprocess True \ - --report_to wandb \ - --dataloader_drop_last True diff --git a/llava_next/scripts/hevc_vit_448/2025_12_06_dist_hevc_l_14_448_ocr_pretrain_packing.sh b/llava_next/scripts/hevc_vit_448/2025_12_06_dist_hevc_l_14_448_ocr_pretrain_packing.sh deleted file mode 100644 index 011e6f9f..00000000 --- a/llava_next/scripts/hevc_vit_448/2025_12_06_dist_hevc_l_14_448_ocr_pretrain_packing.sh +++ /dev/null @@ -1,56 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/pretrain_models/deepglint/hevc/hevc_vit_ocr_packing_12_06_00068000_l14_flash_attn" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain_select_layer_m2" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_80 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir /video_vit/xiangan/checkpoint_llava_next/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 2 \ - --lazy_preprocess True diff --git a/llava_next/scripts/hevc_vit_448/2025_12_07_dist_hevc_l_14_448_ocr_finetune_packing.sh b/llava_next/scripts/hevc_vit_448/2025_12_07_dist_hevc_l_14_448_ocr_finetune_packing.sh deleted file mode 100644 index 9cc7f14c..00000000 --- a/llava_next/scripts/hevc_vit_448/2025_12_07_dist_hevc_l_14_448_ocr_finetune_packing.sh +++ /dev/null @@ -1,67 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/pretrain_models/deepglint/hevc/hevc_vit_ocr_packing_12_06_00118000_l14_flash_attn" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="hevc_vit_ocr_packing_12_06_00118000_qwen25_select_layer_m2" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-select_layer_m2" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_80 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter /video_vit/xiangan/checkpoint_llava_next/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "/video_vit/xiangan/checkpoint_llava_next/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 2 \ - --lazy_preprocess True \ - --report_to wandb \ - --dataloader_drop_last True diff --git a/llava_next/scripts/hevc_vit_448/2025_12_07_dist_hevc_l_14_448_ocr_pretrain_packing.sh b/llava_next/scripts/hevc_vit_448/2025_12_07_dist_hevc_l_14_448_ocr_pretrain_packing.sh deleted file mode 100644 index 8811653f..00000000 --- a/llava_next/scripts/hevc_vit_448/2025_12_07_dist_hevc_l_14_448_ocr_pretrain_packing.sh +++ /dev/null @@ -1,56 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/pretrain_models/deepglint/hevc/hevc_vit_ocr_packing_12_06_00118000_l14_flash_attn" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain_select_layer_m2" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_80 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir /video_vit/xiangan/checkpoint_llava_next/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 2 \ - --lazy_preprocess True diff --git a/llava_next/scripts/hevc_vit_448/2025_12_08_dist_hevc_l_14_448_ocr_finetune_packing.sh b/llava_next/scripts/hevc_vit_448/2025_12_08_dist_hevc_l_14_448_ocr_finetune_packing.sh deleted file mode 100644 index 81e848cf..00000000 --- a/llava_next/scripts/hevc_vit_448/2025_12_08_dist_hevc_l_14_448_ocr_finetune_packing.sh +++ /dev/null @@ -1,67 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/pretrain_models/deepglint/hevc/hevc_vit_ocr_packing_12_08_00140000_l14_flash_attn" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="hevc_vit_ocr_packing_12_08_00140000_qwen25_select_layer_m2" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-select_layer_m2" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_80 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter /video_vit/xiangan/checkpoint_llava_next/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "/video_vit/xiangan/checkpoint_llava_next/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 2 \ - --lazy_preprocess True \ - --report_to wandb \ - --dataloader_drop_last True diff --git a/llava_next/scripts/hevc_vit_448/2025_12_08_dist_hevc_l_14_448_ocr_pretrain_packing.sh b/llava_next/scripts/hevc_vit_448/2025_12_08_dist_hevc_l_14_448_ocr_pretrain_packing.sh deleted file mode 100644 index 7b849c98..00000000 --- a/llava_next/scripts/hevc_vit_448/2025_12_08_dist_hevc_l_14_448_ocr_pretrain_packing.sh +++ /dev/null @@ -1,56 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/pretrain_models/deepglint/hevc/hevc_vit_ocr_packing_12_08_00140000_l14_flash_attn" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain_select_layer_m2" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_80 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir /video_vit/xiangan/checkpoint_llava_next/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 2 \ - --lazy_preprocess True diff --git a/llava_next/scripts/hevc_vit_448/2025_12_20_dist_hevc_l_14_448_ocr_pretrain_2b.sh b/llava_next/scripts/hevc_vit_448/2025_12_20_dist_hevc_l_14_448_ocr_pretrain_2b.sh deleted file mode 100644 index 989967a6..00000000 --- a/llava_next/scripts/hevc_vit_448/2025_12_20_dist_hevc_l_14_448_ocr_pretrain_2b.sh +++ /dev/null @@ -1,56 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-1.5B" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/xiangan/checkpoint_llava_vit/2025_12_08_new_l14_continue_128gpus_all_residual/hevc_vit_hf" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain_select_layer_m2" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_32 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_projector_type spatial_merge \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir /video_vit/xiangan/checkpoint_llava_next/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 2 \ - --lazy_preprocess True diff --git a/llava_next/scripts/hevc_vit_448/2025_12_20_dist_hevc_l_14_448_ocr_pretrain_2b_mlp.sh b/llava_next/scripts/hevc_vit_448/2025_12_20_dist_hevc_l_14_448_ocr_pretrain_2b_mlp.sh deleted file mode 100644 index 4fb8674f..00000000 --- a/llava_next/scripts/hevc_vit_448/2025_12_20_dist_hevc_l_14_448_ocr_pretrain_2b_mlp.sh +++ /dev/null @@ -1,56 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-1.5B" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/xiangan/checkpoint_llava_vit/2025_12_08_new_l14_continue_128gpus_all_residual/hevc_vit_hf" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain_select_layer_m2" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_32 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir /video_vit/xiangan/checkpoint_llava_next/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 2 \ - --lazy_preprocess True diff --git a/llava_next/scripts/hevc_vit_448/2025_12_20_dist_hevc_l_14_448_ocr_sft_2b.sh b/llava_next/scripts/hevc_vit_448/2025_12_20_dist_hevc_l_14_448_ocr_sft_2b.sh deleted file mode 100644 index 471ab199..00000000 --- a/llava_next/scripts/hevc_vit_448/2025_12_20_dist_hevc_l_14_448_ocr_sft_2b.sh +++ /dev/null @@ -1,67 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-1.5B" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/xiangan/checkpoint_llava_vit/2025_12_08_new_l14_continue_128gpus_all_residual/hevc_vit_hf" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="qwen25_1_5_hevc_vit_12_20" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-select_layer_m2" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_32 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter /video_vit/xiangan/checkpoint_llava_next/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type spatial_merge \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "/video_vit/xiangan/checkpoint_llava_next/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 2 \ - --lazy_preprocess True \ - --report_to wandb \ - --dataloader_drop_last True diff --git a/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_finetune.sh b/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_finetune.sh deleted file mode 100755 index a19ceb2b..00000000 --- a/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_finetune.sh +++ /dev/null @@ -1,66 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/xiangan/checkpoint_llava_vit/2025_11_22_new_l14_continue_128gpus_how_to_100m_448px_224px/00148000/backbone_hevc_vit_hf" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="llavanext-_video_vit_xiangan_checkpoint_llava_vit_2025_11_22_new_l14_continue_128gpus_how_to_100m_448px_224px_00148000_backbone_hevc_vit_hf-_vlm_pretrain_models_Qwen_Qwen2.5-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-10nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_80 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter /video_vit/xiangan/checkpoint_llava_next/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 1e-5 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "/video_vit/xiangan/checkpoint_llava_next/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 2 \ - --lazy_preprocess True \ - --report_to wandb \ - --dataloader_drop_last True diff --git a/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_finetune_fixed_flashattn.sh b/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_finetune_fixed_flashattn.sh deleted file mode 100755 index ef54daba..00000000 --- a/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_finetune_fixed_flashattn.sh +++ /dev/null @@ -1,66 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/pretrain_models/deepglint/hevc/backbone_hevc_vit_flash_attn_hf" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="hevc_vit_flashattn_qwen25" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-10nodes" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_80 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter /video_vit/xiangan/checkpoint_llava_next/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 1e-5 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "/video_vit/xiangan/checkpoint_llava_next/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 2 \ - --lazy_preprocess True \ - --report_to wandb \ - --dataloader_drop_last True diff --git a/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_finetune_fixed_flashattn_select_layer_m2.sh b/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_finetune_fixed_flashattn_select_layer_m2.sh deleted file mode 100644 index 3be85a97..00000000 --- a/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_finetune_fixed_flashattn_select_layer_m2.sh +++ /dev/null @@ -1,67 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/pretrain_models/deepglint/hevc/backbone_hevc_vit_hf_version_12_01_version_00192000" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="hevc_vit_flashattn_qwen25_select_layer_m2" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-select_layer_m2" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_80 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter /video_vit/xiangan/checkpoint_llava_next/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "/video_vit/xiangan/checkpoint_llava_next/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 2 \ - --lazy_preprocess True \ - --report_to wandb \ - --dataloader_drop_last True diff --git a/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_finetune_fixed_flashattn_v2.sh b/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_finetune_fixed_flashattn_v2.sh deleted file mode 100755 index a0bbd8f4..00000000 --- a/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_finetune_fixed_flashattn_v2.sh +++ /dev/null @@ -1,66 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/pretrain_models/deepglint/hevc/backbone_hevc_vit_flash_attn_hf" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/train_images/LLaVA-NeXT-Data" - - -PROJECTOR_NAME="hevc_vit_flashattn_qwen25" -PROMPT_VERSION="qwen_1_5" - - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k-finetune_llavanext780k-10nodes_v2" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_80 \ - --master_port 65534 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/llava_next_raw_format/llava_next_raw_format_processed.json \ - --image_folder ${DATA_ROOT}/llava_next_raw_format \ - --pretrain_mm_mlp_adapter /video_vit/xiangan/checkpoint_llava_next/projectors/${PROJECTOR_NAME}/mm_projector.bin \ - --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \ - --mm_vision_tower_lr 2e-6 \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --group_by_modality_length True \ - --image_aspect_ratio anyres \ - --image_grid_pinpoints "[(448, 896), (896, 448), (896, 896), (1344, 448), (448, 1344)]" \ - --mm_patch_merge_type spatial_unpad \ - --bf16 True \ - --run_name $BASE_RUN_NAME \ - --output_dir "/video_vit/xiangan/checkpoint_llava_next/${BASE_RUN_NAME}" \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 1 \ - --learning_rate 1e-5 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 32768 \ - --dataloader_num_workers 2 \ - --lazy_preprocess True \ - --report_to wandb \ - --dataloader_drop_last True diff --git a/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_pretrain.sh b/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_pretrain.sh deleted file mode 100755 index 8e3349f8..00000000 --- a/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_pretrain.sh +++ /dev/null @@ -1,55 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/xiangan/checkpoint_llava_vit/2025_11_22_new_l14_continue_128gpus_how_to_100m_448px_224px/00148000/backbone_hevc_vit_hf" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_80 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir /video_vit/xiangan/checkpoint_llava_next/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 2 \ - --lazy_preprocess True diff --git a/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_pretrain_fixed_flashattn.sh b/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_pretrain_fixed_flashattn.sh deleted file mode 100755 index 8ff33c39..00000000 --- a/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_pretrain_fixed_flashattn.sh +++ /dev/null @@ -1,55 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/xiangan/checkpoint_llava_vit/2025_11_22_new_l14_continue_128gpus_how_to_100m_448px_224px/00148000/backbone_hevc_vit_flash_attn_hf/" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_80 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_projector_type mlp2x_gelu \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir /video_vit/xiangan/checkpoint_llava_next/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 2 \ - --lazy_preprocess True diff --git a/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_pretrain_fixed_flashattn_select_layer_m2.sh b/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_pretrain_fixed_flashattn_select_layer_m2.sh deleted file mode 100644 index e2656721..00000000 --- a/llava_next/scripts/hevc_vit_448/dist_hevc_l_14_448_pretrain_fixed_flashattn_select_layer_m2.sh +++ /dev/null @@ -1,56 +0,0 @@ -# export OMP_NUM_THREADS=8 -# export NCCL_IB_DISABLE=0 -# export NCCL_IB_GID_INDEX=3 -# export NCCL_SOCKET_IFNAME=eth0 -# export NCCL_DEBUG=INFO -# export NUM_GPUS=8 -# export NNODES=1 -# export RANK=0 -# export ADDR="localhost" -# export PORT="29500" -export PYTHONPATH=$(pwd) - -LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct" -LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" -VISION_MODEL_VERSION="/video_vit/pretrain_models/deepglint/hevc/backbone_hevc_vit_hf_version_12_01_version_00192000" -VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" -DATA_ROOT="/vlm/data/pretrain_data" - -############### Pretrain ################ - -PROMPT_VERSION=plain - -BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain_select_layer_m2" -echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" - -deepspeed --hostfile host_80 \ - llava/train/train_mem.py \ - --deepspeed scripts/zero3.json \ - --model_name_or_path ${LLM_VERSION} \ - --version ${PROMPT_VERSION} \ - --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \ - --image_folder /vlm/data/train_images/LLaVA-Pretrain \ - --vision_tower ${VISION_MODEL_VERSION} \ - --mm_tunable_parts mm_mlp_adapter \ - --mm_projector_type mlp2x_gelu \ - --mm_vision_select_layer -2 \ - --mm_use_im_start_end False \ - --mm_use_im_patch_token False \ - --bf16 True \ - --output_dir /video_vit/xiangan/checkpoint_llava_next/projectors/${BASE_RUN_NAME} \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --save_strategy "no" \ - --save_steps 50000 \ - --learning_rate 1e-3 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --tf32 True \ - --model_max_length 8192 \ - --gradient_checkpointing True \ - --dataloader_num_workers 2 \ - --lazy_preprocess True
diff --git a/llava_next/scripts/onevision_encoder/finetune.sh b/llava_next/scripts/onevision_encoder/finetune.sh
new file mode 100644
index 00000000..de6b1679
--- /dev/null
+++ b/llava_next/scripts/onevision_encoder/finetune.sh
@@ -0,0 +1,59 @@
+export OMP_NUM_THREADS=8
+export NCCL_IB_DISABLE=0
+export NCCL_IB_GID_INDEX=3
+export NCCL_SOCKET_IFNAME=eth0
+export PYTHONPATH=$(pwd)
+
+LLM_VERSION="Qwen/Qwen2.5-1.5B-Instruct"
+LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
+VISION_MODEL_VERSION="lmms-lab/onevision-encoder-large"
+VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
+export WANDB_MODE=disabled
+
+
+export PORT=29502
+PROMPT_VERSION="qwen_1_5"
+
+BASE_RUN_NAME="./checkpoints/date1220_llavanext-llavavit_-2hid-qwen2.5-1.5b-sigvid-8nodes"
+echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
+
+mkdir -p $BASE_RUN_NAME
+cp $0 $BASE_RUN_NAME/$(basename $0)
+
+deepspeed --master_port 65535 \
+    llava/train/train_mem.py \
+    --deepspeed scripts/zero3.json \
+    --model_name_or_path ${LLM_VERSION} \
+    --version ${PROMPT_VERSION} \
+    --data_path /path/to/your/training_data.jsonl \
+    --image_folder /path/to/your/images \
+    --pretrain_mm_mlp_adapter="./checkpoints/projectors/your_pretrained_projector/mm_projector.bin" \
+    --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
+    --mm_vision_tower_lr=2e-6 \
+    --vision_tower ${VISION_MODEL_VERSION} \
+    --mm_projector_type mlp2x_gelu \
+    --mm_vision_select_layer -2 \
+    --mm_use_im_start_end False \
+    --mm_use_im_patch_token False \
+    --group_by_modality_length True \
+    --image_aspect_ratio anyres \
+    --image_grid_pinpoints "[(574, 1120), (1120, 574), (1120, 1120), (1694, 574), (574, 1694)]" \
+    --mm_patch_merge_type flat \
+    --bf16 True \
+    --run_name $BASE_RUN_NAME \
+    --output_dir $BASE_RUN_NAME \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 1 \
+    --gradient_accumulation_steps 2 \
+    --save_strategy "steps" \
+    --save_steps 500 \
+    --save_total_limit 20 \
+    --learning_rate 1e-5 \
+    --logging_steps 1 \
+    --tf32 True \
+    --model_max_length 321120 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers 1 \
+    --lazy_preprocess True \
+    --dataloader_drop_last True \
+    --attn_implementation flash_attention_2 | tee $BASE_RUN_NAME/train.log
\ No newline at end of file
diff --git a/llava_next/scripts/deprecated/backup/pretrain_clip.sh b/llava_next/scripts/onevision_encoder/pretrain.sh
old mode 100755
new mode 100644
similarity index 61%
rename from llava_next/scripts/deprecated/backup/pretrain_clip.sh
rename to llava_next/scripts/onevision_encoder/pretrain.sh
index e96aa269..f4fa470f
--- a/llava_next/scripts/deprecated/backup/pretrain_clip.sh
+++ b/llava_next/scripts/onevision_encoder/pretrain.sh
@@ -3,35 +3,37 @@ export NCCL_IB_DISABLE=0
 export NCCL_IB_GID_INDEX=3
 export NCCL_SOCKET_IFNAME=eth0
 export NCCL_DEBUG=INFO
-export NUM_GPUS=8
-export NNODES=1
+export PYTHONPATH=$(pwd)
+
+
 export RANK=0
+export NUM_GPUS=2
+export NNODES=1
 export ADDR="localhost"
-export PORT="29500"
-export PYTHONPATH=$(pwd)
+export PORT=29502
+PROMPT_VERSION="qwen_1_5"
 
-LLM_VERSION="Qwen/Qwen2.5-7B-Instruct"
+LLM_VERSION="Qwen/Qwen2.5-1.5B-Instruct"
 LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
-VISION_MODEL_VERSION="openai/clip-vit-large-patch14-336"
+VISION_MODEL_VERSION="lmms-lab/onevision-encoder-large"
 VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
-DATA_ROOT="llava 558k data root"
 
 ############### Pretrain ################
 
 PROMPT_VERSION=plain
 
-BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain"
+BASE_RUN_NAME="llava-onevision_-2hid-qwen2.5-1.5b-instruct-pretrain_blip558k_plain-$(date +%m%d)"
 echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
 
-ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
+deepspeed --master_port 65535 \
     llava/train/train_mem.py \
-    --deepspeed scripts/zero3.json \
+    --deepspeed scripts/zero2.json \
     --model_name_or_path ${LLM_VERSION} \
     --version ${PROMPT_VERSION} \
-    --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \
-    --image_folder ${DATA_ROOT}/LLaVA-Pretrain/images \
+    --data_path pretrain_data/blip_laion_cc_sbu_558k.json \
+    --image_folder LLaVA-Pretrain/images \
     --vision_tower ${VISION_MODEL_VERSION} \
-    --mm_tunable_parts mm_mlp_adapter \
+    --mm_tunable_parts="mm_mlp_adapter" \
     --mm_vision_select_layer -2 \
     --mm_projector_type mlp2x_gelu \
     --mm_use_im_start_end False \
@@ -41,8 +43,8 @@ ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NN
     --num_train_epochs 1 \
     --per_device_train_batch_size 16 \
     --per_device_eval_batch_size 4 \
-    --gradient_accumulation_steps 1 \
-    --evaluation_strategy "no" \
+    --gradient_accumulation_steps 4 \
+    --mm_patch_merge_type flat \
     --save_strategy "no" \
     --save_steps 50000 \
     --learning_rate 1e-3 \
@@ -55,5 +57,4 @@ ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NN
     --gradient_checkpointing True \
     --dataloader_num_workers 16 \
     --lazy_preprocess True \
-    --report_to wandb \
-    --run_name $BASE_RUN_NAME
\ No newline at end of file
+    --run_name $BASE_RUN_NAME
\ No newline at end of file
diff --git a/llava_next/scripts/siglip2/finetune.sh b/llava_next/scripts/siglip2/finetune.sh
new file mode 100644
index 00000000..c5cd2d26
--- /dev/null
+++ b/llava_next/scripts/siglip2/finetune.sh
@@ -0,0 +1,58 @@
+export OMP_NUM_THREADS=8
+export NCCL_IB_DISABLE=0
+export NCCL_IB_GID_INDEX=3
+export NCCL_SOCKET_IFNAME=eth0
+export PYTHONPATH=$(pwd)
+
+LLM_VERSION="Qwen/Qwen2.5-1.5B-Instruct"
+LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
+VISION_MODEL_VERSION="google/siglip2-so400m-patch16-naflex"
+VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
+export WANDB_MODE=disabled
+
+export PORT=29502
+PROMPT_VERSION="qwen_1_5"
+
+BASE_RUN_NAME="./checkpoints/llava-siglip2naflex-qwen2.5-1.5b-sigvid-fixpos"
+echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
+
+mkdir -p $BASE_RUN_NAME
+cp $0 $BASE_RUN_NAME/$(basename $0)
+
+deepspeed --master_port 65535 \
+    llava/train/train_mem.py \
+    --deepspeed scripts/zero3.json \
+    --model_name_or_path ${LLM_VERSION} \
+    --version ${PROMPT_VERSION} \
+    --data_path video_800k_llavanextsig_740k_shuffled.jsonl \
+    --image_folder train_images_root \
+    --pretrain_mm_mlp_adapter="projector/mm_projector.bin" \
+    --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
+    --mm_vision_tower_lr=2e-6 \
+    --vision_tower ${VISION_MODEL_VERSION} \
+    --mm_projector_type mlp2x_gelu \
+    --mm_vision_select_layer -2 \
+    --mm_use_im_start_end False \
+    --mm_use_im_patch_token False \
+    --group_by_modality_length True \
+    --image_aspect_ratio anyres \
+    --image_grid_pinpoints "[(576, 1120), (1120, 576), (1120, 1120), (1696, 576), (576, 1696)]" \
+    --mm_patch_merge_type flat \
+    --bf16 True \
+    --run_name $BASE_RUN_NAME \
+    --output_dir $BASE_RUN_NAME \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 1 \
+    --gradient_accumulation_steps 2 \
+    --save_strategy "steps" \
+    --save_steps 500 \
+    --save_total_limit 20 \
+    --learning_rate 1e-5 \
+    --logging_steps 1 \
+    --tf32 True \
+    --model_max_length 321120 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers 1 \
+    --lazy_preprocess True \
+    --dataloader_drop_last True \
+    --attn_implementation flash_attention_2 | tee $BASE_RUN_NAME/train.log
diff --git a/llava_next/scripts/deprecated/backup/pretrain_AIMv2.sh b/llava_next/scripts/siglip2/pretrain.sh
old mode 100755
new mode 100644
similarity index 61%
rename from llava_next/scripts/deprecated/backup/pretrain_AIMv2.sh
rename to llava_next/scripts/siglip2/pretrain.sh
index 7702b8d5..8ed1f64a
--- a/llava_next/scripts/deprecated/backup/pretrain_AIMv2.sh
+++ b/llava_next/scripts/siglip2/pretrain.sh
@@ -3,35 +3,35 @@ export NCCL_IB_DISABLE=0
 export NCCL_IB_GID_INDEX=3
 export NCCL_SOCKET_IFNAME=eth0
 export NCCL_DEBUG=INFO
-export NUM_GPUS=8
-export NNODES=1
+export PYTHONPATH=$(pwd)
+
 export RANK=0
+export NUM_GPUS=2
+export NNODES=1
 export ADDR="localhost"
-export PORT="29500"
-export PYTHONPATH=$(pwd)
+export PORT=29502
+PROMPT_VERSION="qwen_1_5"
 
-LLM_VERSION="/vlm/pretrain_models/Qwen/Qwen2.5-7B-Instruct"
+LLM_VERSION="Qwen/Qwen2.5-1.5B-Instruct"
 LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
-VISION_MODEL_VERSION="/vlm/pretrain_models/apple/aimv2-1B-patch14-448"
+VISION_MODEL_VERSION="google/siglip2-so400m-patch16-naflex"
 VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
-DATA_ROOT="/vlm/data/pretrain_data"
 
-############### Pretrain ################
 
 PROMPT_VERSION=plain
 
-BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain"
+BASE_RUN_NAME="llava-siglip2naflex_-2hid-qwen2.5-1.5b-instruct-pretrain_blip558k_plain"
 echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
 
-ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
+deepspeed --master_port 65535 \
     llava/train/train_mem.py \
-    --deepspeed scripts/zero3.json \
+    --deepspeed scripts/zero2.json \
     --model_name_or_path ${LLM_VERSION} \
     --version ${PROMPT_VERSION} \
-    --data_path ${DATA_ROOT}/blip_laion_cc_sbu_558k.json \
-    --image_folder /vlm/data/train_images/LLaVA-Pretrain/images \
+    --data_path /path/to/pretrain_data/blip_laion_cc_sbu_558k.json \
+    --image_folder /path/to/LLaVA-Pretrain/images \
     --vision_tower ${VISION_MODEL_VERSION} \
-    --mm_tunable_parts mm_mlp_adapter \
+    --mm_tunable_parts="mm_mlp_adapter" \
     --mm_vision_select_layer -2 \
     --mm_projector_type mlp2x_gelu \
     --mm_use_im_start_end False \
@@ -42,7 +42,7 @@ ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NN
     --per_device_train_batch_size 16 \
     --per_device_eval_batch_size 4 \
     --gradient_accumulation_steps 1 \
-    --evaluation_strategy "no" \
+    --mm_patch_merge_type flat \
     --save_strategy "no" \
     --save_steps 50000 \
     --learning_rate 1e-3 \
@@ -55,5 +55,4 @@ ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NN
     --gradient_checkpointing True \
     --dataloader_num_workers 16 \
     --lazy_preprocess True \
-    --report_to wandb \
     --run_name $BASE_RUN_NAME
\ No newline at end of file
diff --git a/llava_next/scripts/zero2.json b/llava_next/scripts/zero2.json
index b5ba7ebe..a072ab33 100755
--- a/llava_next/scripts/zero2.json
+++ b/llava_next/scripts/zero2.json
@@ -10,15 +10,6 @@
"bf16": { "enabled": "auto" }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto" - } - }, "zero_optimization": { "stage": 2, "offload_optimizer": {