Commit 09810e3

feat: upgrade to transformers v5 (#659)
* accommodate renaming of AutoModelForVision2Seq to AutoModelForImageTextToText
* accommodate renaming of include_tokens_per_second to include_num_input_tokens_seen
* accommodate the mllama model.language_model issue and the GPTNeoX and Llama tokenizer issues
* replace additional_special_tokens with extra_special_tokens
* change library versions in pyproject.toml
* fix pyproject.toml after the rebase
* add more handling for the FileNotFoundError exception in the multi-rank case
* merge existing extra_special_tokens with new ones to prevent replacement
* remove the old commented lines, keeping only the new renamed lines, and add justification for what was renamed
* remove the old commented lines
* remove the old commented lines
* complete housekeeping, removing old and repetitive comments
* put upper bounds on dependencies in pyproject.toml
* fix input_ids
* apply linting and formatting
* remove multiple excepts
* add messages to the disable list in .pylintrc
* upgrade support to trl 0.27+

Signed-off-by: Yash Mehan <yashmehan@gmail.com>
Signed-off-by: Yash Mehan <71321431+yash4242@users.noreply.github.com>
Co-authored-by: Dushyant Behl <dushyantbehl@users.noreply.github.com>
1 parent 4716bfe · commit 09810e3

12 files changed

Lines changed: 317 additions & 45 deletions

.pylintrc

Lines changed: 4 additions & 1 deletion
@@ -447,7 +447,10 @@ disable=raw-checker-failed,
         duplicate-code,
         unbalanced-tuple-unpacking,
         unspecified-encoding,
-        too-many-lines
+        too-many-lines,
+        no-name-in-module,
+        unexpected-keyword-arg,
+        unused-argument

 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
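
For reference, the same checks can also be silenced at a single site instead of repo-wide; a minimal sketch using pylint's standard inline pragma (the import shown is an illustrative target, not part of this diff):

# Illustrative: suppress one of the newly disabled checks at a single
# import site rather than globally in .pylintrc.
from transformers import AutoModelForImageTextToText  # pylint: disable=no-name-in-module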

build/accelerate_launch.py

Lines changed: 6 additions & 1 deletion
@@ -110,14 +110,19 @@ def main():
         # message to termination log.
         logging.error(traceback.format_exc())
         # The exit code that sft_trainer.py threw is captured in e.returncode
-
         return_code = e.returncode
         if return_code not in [INTERNAL_ERROR_EXIT_CODE, USER_ERROR_EXIT_CODE]:
             return_code = INTERNAL_ERROR_EXIT_CODE
         write_termination_log(f"Unhandled exception during training. {e}")
         sys.exit(return_code)
     except Exception as e:  # pylint: disable=broad-except
         logging.error(traceback.format_exc())
+        # v5: torch.distributed raises ChildFailedError with per-rank exit codes
+        # Check if the root cause was a user error
+        if hasattr(e, "failures"):
+            root_codes = [f.exitcode for f in e.failures.values()]
+            if any(c == USER_ERROR_EXIT_CODE for c in root_codes):
+                sys.exit(USER_ERROR_EXIT_CODE)
         write_termination_log(f"Unhandled exception during training. {e}")
         sys.exit(INTERNAL_ERROR_EXIT_CODE)
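
The new branch duck-types on the `failures` attribute rather than importing the exception class. A minimal sketch of what it matches, assuming torch's elastic launcher raised the error; `run_training` and the exit-code value are placeholders, not this repo's real names:

from torch.distributed.elastic.multiprocessing.errors import ChildFailedError

USER_ERROR_EXIT_CODE = 1  # assumed value; the real constant lives in this repo


def run_training():
    """Hypothetical stand-in for the launch call wrapped by main()."""
    raise NotImplementedError


try:
    run_training()
except ChildFailedError as e:
    # e.failures maps rank -> ProcessFailure; each entry carries the exit
    # code of the failed child, letting the parent classify the root cause.
    if any(f.exitcode == USER_ERROR_EXIT_CODE for f in e.failures.values()):
        raise SystemExit(USER_ERROR_EXIT_CODE)
    raise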

pyproject.toml

Lines changed: 8 additions & 7 deletions
@@ -28,18 +28,19 @@ classifiers=[
 dependencies = [
     "numpy>=1.26.4,<2.2.0",
     "accelerate>=1.9.0,<2.0.0",
-    "transformers>=4.55.0,<=4.55.4",
-    "torch>2.7.0,<2.9.0",
-    "torchvision<0.24",
+    "transformers>=5.2.0,<5.3.0",
+    "torch>2.7.0,<=2.9.0",
+    "torchvision<=0.24.0",
     "sentencepiece>=0.1.99,<0.3",
-    "tokenizers<=0.22",
+    "tokenizers<=0.23.0",
     "tqdm>=4.66.2,<5.0",
-    "trl>=0.19.1,<0.20.0",
-    "peft>=0.18.0,< 0.19.0",
+    "trl>=0.27.0,<0.29.0",
+    "peft>=0.18.1,<0.19.0",
     "datasets>=4.0.0,<5.0.0",
     "simpleeval>=0.9.13,<2.0",
     "pillow>=12.1.1",
-    "kernels<=0.9.0",
+    "kernels>=0.12.1,<0.13.0",
+    "huggingface_hub>=1.3.0,<1.4.0",
 ]

 [project.optional-dependencies]
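
One way to confirm an environment actually satisfies the tightened pins is a spot-check with the `packaging` library; a minimal sketch covering two of the pins from this diff:

from importlib.metadata import version
from packaging.specifiers import SpecifierSet

# Spot-check installed versions against the new pins in pyproject.toml.
pins = {
    "transformers": SpecifierSet(">=5.2.0,<5.3.0"),
    "trl": SpecifierSet(">=0.27.0,<0.29.0"),
}
for pkg, spec in pins.items():
    installed = version(pkg)
    assert installed in spec, f"{pkg} {installed} does not satisfy {spec}"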

tests/build/test_launch_script.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@
     "warmup_ratio": 0.03,
     "lr_scheduler_type": "cosine",
     "logging_steps": 1,
-    "include_tokens_per_second": True,
+    "include_num_input_tokens_seen": True,
     "packing": False,
     "response_template": "\n### Label:",
     "dataset_text_field": "output",

tests/data/test_data_preprocessing.py

Lines changed: 1 addition & 2 deletions
@@ -22,7 +22,6 @@
 from datasets import Dataset, DatasetDict, IterableDataset
 from PIL import Image
 from transformers import AutoProcessor, AutoTokenizer, DataCollatorForSeq2Seq
-from trl import DataCollatorForCompletionOnlyLM
 import datasets
 import numpy as np
 import pyarrow
@@ -69,7 +68,7 @@
 # Local
 from tuning.config import configs
 from tuning.config.acceleration_configs import AttentionAndDistributedPackingConfig
-from tuning.data.collators import VisionDataCollator
+from tuning.data.collators import DataCollatorForCompletionOnlyLM, VisionDataCollator
 from tuning.data.data_config import (
     DataHandlerConfig,
     DataPreProcessorConfig,
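
The collator import moves from trl to a vendored copy in tuning.data.collators, consistent with the trl 0.27+ bump. Code that must run against both old and new environments could hedge with a shim like this (illustrative; assumes the vendored class keeps the trl name, as this diff suggests):

# Prefer the vendored collator; fall back to trl's export where it still exists.
try:
    from tuning.data.collators import DataCollatorForCompletionOnlyLM
except ImportError:
    from trl import DataCollatorForCompletionOnlyLM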

tests/test_sft_trainer.py

Lines changed: 2 additions & 2 deletions
@@ -124,7 +124,7 @@
     warmup_ratio=0.03,
     lr_scheduler_type="cosine",
     logging_steps=1,
-    include_tokens_per_second=True,
+    include_num_input_tokens_seen=True,
     packing=False,
     max_seq_length=4096,
     save_strategy="epoch",
@@ -140,7 +140,7 @@
     warmup_ratio=0.03,
     lr_scheduler_type="cosine",
     logging_steps=1,
-    include_tokens_per_second=True,
+    include_num_input_tokens_seen=True,
     packing=False,
     max_seq_length=4096,
     save_strategy="epoch",

tests/utils/test_embedding_resize.py

Lines changed: 11 additions & 12 deletions
@@ -20,11 +20,9 @@

 # Third Party
 from transformers import (
-    AutoModelForCausalLM,
-    AutoModelForVision2Seq,
-    AutoProcessor,
-    AutoTokenizer,
+    AutoModelForImageTextToText,  # AutoModelForVision2Seq was renamed to this in transformers v5
 )
+from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
 import torch

 # First Party
@@ -128,16 +126,17 @@ def test_special_tokens_before_and_after():
     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

     input_tokenizer_len = len(tokenizer.get_vocab())
-    addn_spl_tokens_before = tokenizer.special_tokens_map.get(
-        "additional_special_tokens"
-    )
+    addn_spl_tokens_before = list(tokenizer.extra_special_tokens)
     assert (
         len(addn_spl_tokens_before) > 0
     ), "this test needs tokenizer special tokens to not be empty before testing"

     special_tokens_dict = {"sep_token": "<SEP>", "pad_token": "<PAD>"}
     addn_spl_tokens_added = ["<NotSeenTokenA>", "<NotSeenTokenB>", "<NotSeenTokenC>"]
-    special_tokens_dict["additional_special_tokens"] = addn_spl_tokens_added
+    # for transformers v5: merge existing extra_special_tokens with new ones to prevent replacement
+    special_tokens_dict["additional_special_tokens"] = (
+        list(tokenizer.extra_special_tokens) + addn_spl_tokens_added
+    )

     resize_result = tokenizer_and_embedding_resize(
         special_tokens_dict=special_tokens_dict,
@@ -150,9 +149,7 @@ def test_special_tokens_before_and_after():
     addn_spl_tokens_before.extend(addn_spl_tokens_added)
     expected_addn_special_tokens = addn_spl_tokens_before
     expected_embedding_size = input_tokenizer_len + len(addn_spl_tokens_added) + 2
-    addn_spl_tokens_after = tokenizer.special_tokens_map.get(
-        "additional_special_tokens"
-    )
+    addn_spl_tokens_after = list(tokenizer.extra_special_tokens)

     assert "<SEP>" in tokenizer.get_vocab()
     assert "<PAD>" in tokenizer.get_vocab()
@@ -212,7 +209,9 @@ def test_resize_with_multiple_of():


 def test_resize_llama_vision_model():
-    model = AutoModelForVision2Seq.from_pretrained(TINY_LLAMA_VISION_MODEL_NAME)
+    model = AutoModelForImageTextToText.from_pretrained(
+        TINY_LLAMA_VISION_MODEL_NAME
+    )  # AutoModelForVision2Seq was renamed to AutoModelForImageTextToText in transformers v5
     processor = AutoProcessor.from_pretrained(TINY_LLAMA_VISION_MODEL_NAME)
     tokenizer = processor.tokenizer
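
The merge exists because, per this commit, assigning additional_special_tokens in v5 replaces a tokenizer's existing extras rather than appending to them. A minimal sketch of the pattern outside the test (the model id is a placeholder):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("some-org/some-model")  # placeholder id

# Merge the tokenizer's current extras in first so they survive the update.
new_tokens = ["<NotSeenTokenA>", "<NotSeenTokenB>"]
tok.add_special_tokens(
    {"additional_special_tokens": list(tok.extra_special_tokens) + new_tokens}
)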
