Commit 7c987f9

PR Changes
Signed-off-by: Abhishek <maurya.abhishek@ibm.com>
1 parent b43741d commit 7c987f9

6 files changed

Lines changed: 65 additions & 58 deletions


tests/data/test_data_preprocessing.py

Lines changed: 54 additions & 1 deletion
@@ -20,9 +20,11 @@

 # Third Party
 from datasets import Dataset, IterableDataset
-from transformers import AutoTokenizer, DataCollatorForSeq2Seq
+from PIL import Image
+from transformers import AutoProcessor, AutoTokenizer, DataCollatorForSeq2Seq
 from trl import DataCollatorForCompletionOnlyLM
 import datasets
+import numpy as np
 import pyarrow
 import pytest
 import yaml
@@ -62,6 +64,7 @@
 # Local
 from tuning.config import configs
 from tuning.config.acceleration_configs import AttentionAndDistributedPackingConfig
+from tuning.data.collators import VisionDataCollator
 from tuning.data.data_config import DataPreProcessorConfig, DataSetConfig
 from tuning.data.data_preprocessing_utils import get_data_collator
 from tuning.data.data_processors import DataPreProcessor, get_datapreprocessor
@@ -71,6 +74,8 @@
     process_dataargs,
 )

+LLAMA_VISION_MODEL_NAME = "tests/artifacts/tiny-llama-vision-model"
+

 @pytest.mark.parametrize(
     "datafile, column_names",
@@ -1831,3 +1836,51 @@ def test_get_processed_dataset(datafile, datasetconfigname):
         "train_dataset",
     )
     assert len(os.listdir(train_dataset_dir)) == num_dataset_shards
+
+
+def test_vision_data_collator():
+    """Test the VisionDataCollator with dummy Image data."""
+
+    processor = AutoProcessor.from_pretrained(LLAMA_VISION_MODEL_NAME)
+    collator = VisionDataCollator(processor)
+    processor_kwargs = {}
+    processor_kwargs["return_tensors"] = "pt"
+    processor_kwargs["padding"] = True
+    image_size = (32, 32)
+
+    def generate_pil_image(size=image_size):
+        """Generate a dummy image array of the specified size and return PIL Image."""
+        image_array = np.random.randint(0, 256, size=(*size, 3), dtype=np.uint8)
+        return Image.fromarray(image_array)
+
+    image1 = generate_pil_image()
+    image2 = generate_pil_image()
+
+    features = [
+        {
+            "processor_kwargs": processor_kwargs,
+            "fields_name": {
+                "dataset_text_field": "text",
+                "dataset_image_field": "image",
+            },
+            "text": "Describe the image.",
+            "image": [image1],
+        },
+        {
+            "processor_kwargs": processor_kwargs,
+            "fields_name": {
+                "dataset_text_field": "text",
+                "dataset_image_field": "image",
+            },
+            "text": "What is in the image?",
+            "image": [image2],
+        },
+    ]
+
+    # Call the collator which returns a batch dictionary containing "input_ids" and "labels"
+    batch = collator(features)
+
+    assert "input_ids" in batch
+    assert "labels" in batch
+    assert "attention_mask" in batch
+    assert batch["input_ids"].shape == batch["labels"].shape
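For context on what the moved test exercises: a processor-backed vision collator of this kind typically extracts the text and image fields named in each example's fields_name mapping, runs them through the Hugging Face processor, and derives labels from input_ids. The sketch below only illustrates that pattern under those assumptions; it is not the actual tuning.data.collators.VisionDataCollator implementation, and the class name SketchVisionCollator is made up for this example.

# Illustrative sketch only -- approximates what a processor-backed vision collator
# usually does; the real VisionDataCollator in tuning.data.collators may differ.
class SketchVisionCollator:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, features):
        # Every feature carries the field mapping and processor kwargs it was prepared with.
        fields = features[0]["fields_name"]
        kwargs = features[0]["processor_kwargs"]
        texts = [f[fields["dataset_text_field"]] for f in features]
        images = [f[fields["dataset_image_field"]] for f in features]

        # The processor tokenizes the text and preprocesses the images into one padded batch.
        batch = self.processor(text=texts, images=images, **kwargs)

        # Labels mirror input_ids; padding positions are masked out of the loss.
        labels = batch["input_ids"].clone()
        pad_id = self.processor.tokenizer.pad_token_id
        if pad_id is not None:
            labels[labels == pad_id] = -100
        batch["labels"] = labels
        return batch

This matches the keys the test asserts on: the processor produces "input_ids" and "attention_mask", and the collator adds "labels" with the same shape as "input_ids".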

tests/test_sft_trainer.py

Lines changed: 0 additions & 53 deletions
@@ -27,10 +27,7 @@

 # Third Party
 from datasets.exceptions import DatasetGenerationError, DatasetNotFoundError
-from PIL import Image
-from transformers import AutoProcessor
 from transformers.trainer_callback import TrainerCallback
-import numpy as np
 import pytest
 import torch
 import transformers
@@ -93,7 +90,6 @@
     DataHandlerType,
     add_tokenizer_eos_token,
 )
-from tuning.utils.collators import VisionDataCollator
 from tuning.utils.import_utils import is_fms_accelerate_available

 MODEL_ARGS = configs.ModelArguments(
@@ -127,7 +123,6 @@
 )

 PEFT_LORA_ARGS = peft_config.LoraConfig(r=8, lora_alpha=32, lora_dropout=0.05)
-LLAMA_VISION_MODEL_NAME = "tests/artifacts/tiny-llama-vision-model"


 @pytest.mark.parametrize(
@@ -1974,51 +1969,3 @@ def test_handler(element, tokenizer, **kwargs):
         },
     )
     _validate_training(tempdir)
-
-
-def test_vision_data_collator():
-    """Test the VisionDataCollator with dummy Image data."""
-
-    processor = AutoProcessor.from_pretrained(LLAMA_VISION_MODEL_NAME)
-    collator = VisionDataCollator(processor)
-    processor_kwargs = {}
-    processor_kwargs["return_tensors"] = "pt"
-    processor_kwargs["padding"] = True
-    image_size = (32, 32)
-
-    def generate_pil_image(size=image_size):
-        """Generate a dummy image array of the specified size and return PIL Image."""
-        image_array = np.random.randint(0, 256, size=(*size, 3), dtype=np.uint8)
-        return Image.fromarray(image_array)
-
-    image1 = generate_pil_image()
-    image2 = generate_pil_image()
-
-    features = [
-        {
-            "processor_kwargs": processor_kwargs,
-            "fields_name": {
-                "dataset_text_field": "text",
-                "dataset_image_field": "image",
-            },
-            "text": "Describe the image.",
-            "image": [image1],
-        },
-        {
-            "processor_kwargs": processor_kwargs,
-            "fields_name": {
-                "dataset_text_field": "text",
-                "dataset_image_field": "image",
-            },
-            "text": "What is in the image?",
-            "image": [image2],
-        },
-    ]
-
-    # Call the collator which returns a batch dictionary containing "input_ids" and "labels"
-    batch = collator(features)
-
-    assert "input_ids" in batch
-    assert "labels" in batch
-    assert "attention_mask" in batch
-    assert batch["input_ids"].shape == batch["labels"].shape

tests/utils/test_embedding_resize.py

Lines changed: 8 additions & 1 deletion
@@ -235,7 +235,14 @@ def test_resize_llama_vision_model():
     assert "<unk>" in tokenizer.get_vocab()
     assert resize_result["num_new_tokens"] == 1

-    # 2 new tokens were added: <unk> and <image>
+    # Resizing adds 2 tokens (<unk> and <image>) because the tokenizer vocab size (128257)
+    # is 1 greater than the output embedding size (128256),
+    # i.e., len(tokenizer) == model.get_output_embeddings().weight.shape[0] + 1.
+
+    # When special_tokens_dict only contains <unk>, the embedding size calculation
+    # increases the embedding size from 128256 to 128258 (adding 2 tokens in total).
+    # Consequently, the model's input embeddings are resized with an increase of 2 tokens as well.
+
     assert (
         resized_output_embeddings.weight.shape[0]
         == current_output_embeddings.weight.shape[0] + 2
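As a quick sanity check of the arithmetic in the new comment, the sizes 128256/128257/128258 below are the ones the comment quotes; the variable names are illustrative and not part of the test:

# Worked example of the resize arithmetic described in the comment above.
# The concrete sizes come from that comment; none of this is part of the test.
tokenizer_len = 128257           # len(tokenizer): vocab already includes <image>
output_embedding_rows = 128256   # model.get_output_embeddings().weight.shape[0]

num_new_tokens = 1               # only <unk> is added via special_tokens_dict
new_vocab_size = tokenizer_len + num_new_tokens              # 128258
embedding_growth = new_vocab_size - output_embedding_rows    # 128258 - 128256

assert embedding_growth == 2     # matches the "+ 2" assertion in the test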

tuning/data/data_preprocessing_utils.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@

 # Local
 from tuning.config import configs
-from tuning.utils.collators import VisionDataCollator
+from tuning.data.collators import VisionDataCollator

 logger = logging.getLogger(__name__)

tuning/data/setup_dataprocessor.py

Lines changed: 2 additions & 2 deletions
@@ -252,15 +252,15 @@ def _get_vision_dataset_handlers(data_args, processor_kwargs):
     )

     # Second data handler configuration
-    fn_kwargs2 = {
+    handler_fn_kwargs2 = {
         "fields_name": {
             "dataset_text_field": data_args.dataset_text_field,
             "dataset_image_field": data_args.dataset_image_field,
         },
         "processor_kwargs": processor_kwargs,
     }
     kwargs2 = {
-        "fn_kwargs": fn_kwargs2,
+        "fn_kwargs": handler_fn_kwargs2,
     }
     handlers.append(
         DataHandlerConfig("prepare_multimodal_data_processor", arguments=kwargs2)
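Note: the fields_name/processor_kwargs structure passed here as fn_kwargs is the same per-example structure that the relocated test_vision_data_collator in tests/data/test_data_preprocessing.py builds by hand, so the rename from fn_kwargs2 to handler_fn_kwargs2 appears to be a readability-only change that leaves the handler arguments untouched.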
