foundation-model-stack · willmj · Apr 22, 2025 · Apr 17, 2025 · Apr 18, 2025 · Apr 18, 2025
@@ -919,12 +919,12 @@ For information on supported dataset formats and how to tune a vision-language m
 
   ? May be supported, but not tested
 
-Model Name & Size  | Model Architecture | Full Finetuning |
--------------------- | ---------------- | --------------- |
-Llama 3.2-11B Vision  | MllamaForConditionalGeneration | ✅* |
-Llava 1.5-7B  | LlavaForConditionalGeneration | ✅* |
-Granite 3.1-2B Vision  | LlavaNextForConditionalGeneration | ✅* |
-Llava Mistral 1.6-7B  | LlavaNextForConditionalGeneration | ✅* |
+Model Name & Size  | Model Architecture | LoRA Tuning | Full Finetuning |
+-------------------- | ---------------- | --------------- | --------------- |
+Llama 3.2-11B Vision  | MllamaForConditionalGeneration | ✅* | ✅* |
+Llava 1.5-7B  | LlavaForConditionalGeneration | ✅* | ✅* |
+Granite 3.1-2B Vision  | LlavaNextForConditionalGeneration | ✅* | ✅* |
+Llava Mistral 1.6-7B  | LlavaNextForConditionalGeneration | ✅* | ✅* |
 
 (*) - Supported with `fms-hf-tuning` v2.8.0 or later.
 

@@ -74,6 +74,7 @@
 CHAT_DATA_MULTI_TURN_GRANITE_3_1B = os.path.join(
     JSONL_DATA_DIR, "multi_turn_chat_granite_instruct.jsonl"
 )
+IMAGE_DATASET = os.path.join(JSONL_DATA_DIR, "image_dataset.jsonl")
 EMPTY_DATA = os.path.join(JSON_DATA_DIR, "empty_data.json")
 MALFORMATTED_DATA = os.path.join(JSON_DATA_DIR, "malformatted_data.json")
 

@@ -0,0 +1,27 @@
+# Copyright The FMS HF Tuning Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helpful saved vision models for unit tests.
+"""
+# Standard
+import os
+
+### Constants used for model paths
+PREDEFINED_MODEL_PATH = os.path.join(os.path.dirname(__file__))
+TINY_LLAMA_VISION_MODEL_NAME = os.path.join(
+    PREDEFINED_MODEL_PATH, "tiny_llama_vision_model"
+)
+TINY_GRANITE_VISION_MODEL_NAME = os.path.join(
+    PREDEFINED_MODEL_PATH, "tiny_granite_vision_model"
+)
@@ -0,0 +1,6 @@
+{
+  "<image>": 49155,
+  "<|end_of_role|>": 49153,
+  "<|start_of_role|>": 49152,
+  "<|tool_call|>": 49154
+}
@@ -0,0 +1,3 @@
+{
+  "chat_template": "{%- if tools %}\n    {{- '<|start_of_role|>available_tools<|end_of_role|>\n' }}\n    {%- for tool in tools %}\n    {{- tool | tojson(indent=4) }}\n    {%- if not loop.last %}\n        {{- '\n\n' }}\n    {%- endif %}\n    {%- endfor %}\n    {{- '<|end_of_text|>\n' }}\n{%- endif %}\n{%- for message in messages if message['role'] == 'system'%}{% else %}<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n{% endfor %}{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n    {{- '<|system|>\n' + message['content'][0]['text'] + '\n' }}\n    {%- elif message['role'] == 'user' %}<|user|>\n {# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + '\n' }}{% endfor %}\n{%- elif message['role'] == 'assistant' %}\n    {{- '<|assistant|>\n'  + message['content'][0]['text']  + '<|end_of_text|>' }}\n    {%- elif message['role'] == 'assistant_tool_call' %}\n    {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'][0]['text']  + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'tool_response' %}\n    {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'][0]['text'] + '<|end_of_text|>\n' }}\n    {%- endif %}\n    {%- if loop.last and add_generation_prompt %}\n    {{- '<|assistant|>\n' }}\n    {%- endif %}\n{%- endfor %}"
+}
@@ -0,0 +1,175 @@
+{
+  "architectures": [
+    "LlavaNextForConditionalGeneration"
+  ],
+  "image_grid_pinpoints": [
+    [
+      384,
+      384
+    ],
+    [
+      384,
+      768
+    ],
+    [
+      384,
+      1152
+    ],
+    [
+      384,
+      1536
+    ],
+    [
+      384,
+      1920
+    ],
+    [
+      384,
+      2304
+    ],
+    [
+      384,
+      2688
+    ],
+    [
+      384,
+      3072
+    ],
+    [
+      384,
+      3456
+    ],
+    [
+      384,
+      3840
+    ],
+    [
+      768,
+      384
+    ],
+    [
+      768,
+      768
+    ],
+    [
+      768,
+      1152
+    ],
+    [
+      768,
+      1536
+    ],
+    [
+      768,
+      1920
+    ],
+    [
+      1152,
+      384
+    ],
+    [
+      1152,
+      768
+    ],
+    [
+      1152,
+      1152
+    ],
+    [
+      1536,
+      384
+    ],
+    [
+      1536,
+      768
+    ],
+    [
+      1920,
+      384
+    ],
+    [
+      1920,
+      768
+    ],
+    [
+      2304,
+      384
+    ],
+    [
+      2688,
+      384
+    ],
+    [
+      3072,
+      384
+    ],
+    [
+      3456,
+      384
+    ],
+    [
+      3840,
+      384
+    ]
+  ],
+  "image_seq_length": 576,
+  "image_token_index": 49155,
+  "model_type": "llava_next",
+  "multimodal_projector_bias": true,
+  "projector_hidden_act": "gelu",
+  "text_config": {
+    "architectures": [
+      "GraniteForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.1,
+    "attention_multiplier": 0.015625,
+    "bos_token_id": 0,
+    "embedding_multiplier": 12.0,
+    "eos_token_id": 0,
+    "hidden_act": "silu",
+    "hidden_size": 256,
+    "initializer_range": 0.02,
+    "intermediate_size": 1024,
+    "logits_scaling": 8.0,
+    "max_position_embeddings": 131072,
+    "mlp_bias": false,
+    "model_type": "granite",
+    "num_attention_heads": 4,
+    "num_hidden_layers": 4,
+    "num_key_value_heads": 8,
+    "pad_token_id": 0,
+    "residual_multiplier": 0.22,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": null,
+    "rope_theta": 300000,
+    "tie_word_embeddings": true,
+    "torch_dtype": "bfloat16",
+    "use_cache": true,
+    "vocab_size": 49156
+  },
+  "tie_word_embeddings": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.50.3",
+  "use_image_newline_parameter": true,
+  "vision_config": {
+    "attention_dropout": 0.0,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 256,
+    "image_size": 384,
+    "intermediate_size": 512,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 4,
+    "num_channels": 3,
+    "num_hidden_layers": 4,
+    "patch_size": 14
+  },
+  "vision_feature_layer": [
+    -24,
+    -20,
+    -12,
+    -1
+  ],
+  "vision_feature_select_strategy": "full"
+}
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "pad_token_id": 0,
+  "transformers_version": "4.50.3"
+}