Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
ee02546
accommodate renaming of AutoModelForVision2Seq to AutoModelForImageTe…
yash4242 Feb 24, 2026
6b797d2
accommodate renaming of include_tokens_per_second to include_num_input…
yash4242 Feb 24, 2026
b749727
accommodate mllama model.language_model issue and gptneoxtokenizer an…
yash4242 Feb 24, 2026
a3bc638
replace additional_special_tokens with extra_special_tokens
yash4242 Feb 24, 2026
f823261
Merge branch 'main' into yash-tfv5
dushyantbehl Feb 24, 2026
28605d4
changed library versions in pyproject.toml
yash4242 Feb 26, 2026
ba6ffb0
fix pyproject toml after the rebase
yash4242 Feb 26, 2026
6c28f16
add more handling for the filenotfound exception in multi ran…
yash4242 Feb 26, 2026
800b288
merge existing extra_special_tokens with new ones to prevent replacement
yash4242 Feb 26, 2026
c7c9d89
remove the old commented lines, only letting the new renamed lines st…
yash4242 Feb 26, 2026
d5e1862
remove the old commented lines
yash4242 Feb 26, 2026
407a43d
remove the old commented lines
yash4242 Feb 26, 2026
35033ec
Merge branch 'main' into yash-tfv5
yash4242 Feb 26, 2026
68ca44c
complete housekeeping and removing old comments and repetitive comments
yash4242 Feb 27, 2026
3168e54
put upperbounds on dependencies in pyproject
yash4242 Feb 27, 2026
c507131
fix input_ids
yash4242 Feb 27, 2026
eb7b243
Merge branch 'main' into yash-tfv5
yash4242 Feb 27, 2026
4f482f1
linting and formatting
yash4242 Mar 2, 2026
e14d2ce
remove multiple excepts
yash4242 Mar 6, 2026
fd3b977
add messages control to remove in pylintrc
yash4242 Mar 6, 2026
c854702
upgrade support to trl 0.27+
yash4242 Mar 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions build/accelerate_launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,16 @@ def main():
return_code = INTERNAL_ERROR_EXIT_CODE
write_termination_log(f"Unhandled exception during training. {e}")
sys.exit(return_code)
except Exception as e: # pylint: disable=broad-except
logging.error(traceback.format_exc())
# v5: torch.distributed raises ChildFailedError with per-rank exit codes
# Check if the root cause was a user error
if hasattr(e, 'failures'):
root_codes = [f.exitcode for f in e.failures.values()]
if any(c == USER_ERROR_EXIT_CODE for c in root_codes):
sys.exit(USER_ERROR_EXIT_CODE)
write_termination_log(f"Unhandled exception during training. {e}")
sys.exit(INTERNAL_ERROR_EXIT_CODE)
except Exception as e: # pylint: disable=broad-except
logging.error(traceback.format_exc())
write_termination_log(f"Unhandled exception during training. {e}")
Expand Down
13 changes: 7 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,19 @@ classifiers=[
dependencies = [
"numpy>=1.26.4,<2.2.0",
"accelerate>=1.9.0,<2.0.0",
"transformers>=4.55.0,<=4.55.4",
"torch>2.7.0,<2.9.0",
"torchvision<0.24",
"transformers==5.2.0",
Comment thread
yash4242 marked this conversation as resolved.
Outdated
"torch==2.10.0",
"torchvision>=0.25.0",
Comment thread
yash4242 marked this conversation as resolved.
Outdated
"sentencepiece>=0.1.99,<0.3",
"tokenizers<=0.22",
"tokenizers==0.22.2",
Comment thread
yash4242 marked this conversation as resolved.
Outdated
"tqdm>=4.66.2,<5.0",
"trl>=0.19.1,<0.20.0",
"peft>=0.18.0,< 0.19.0",
"peft>=0.18.1,<0.19.0",
Comment thread
dushyantbehl marked this conversation as resolved.
"datasets>=4.0.0,<5.0.0",
"simpleeval>=0.9.13,<2.0",
"pillow>=12.1.1",
"kernels<=0.9.0",
"kernels==0.12.1",
Comment thread
yash4242 marked this conversation as resolved.
Outdated
"huggingface_hub>=1.3.0"
]

[project.optional-dependencies]
Expand Down
2 changes: 1 addition & 1 deletion tests/build/test_launch_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"warmup_ratio": 0.03,
"lr_scheduler_type": "cosine",
"logging_steps": 1,
"include_tokens_per_second": True,
"include_num_input_tokens_seen": True,
"packing": False,
"response_template": "\n### Label:",
"dataset_text_field": "output",
Expand Down
6 changes: 4 additions & 2 deletions tests/test_sft_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,8 @@
warmup_ratio=0.03,
lr_scheduler_type="cosine",
logging_steps=1,
include_tokens_per_second=True,
# include_tokens_per_second=True,
Comment thread
dushyantbehl marked this conversation as resolved.
Outdated
include_num_input_tokens_seen=True,
packing=False,
max_seq_length=4096,
save_strategy="epoch",
Expand All @@ -140,7 +141,8 @@
warmup_ratio=0.03,
lr_scheduler_type="cosine",
logging_steps=1,
include_tokens_per_second=True,
# include_tokens_per_second=True,
Comment thread
yash4242 marked this conversation as resolved.
Outdated
include_num_input_tokens_seen=True,
packing=False,
max_seq_length=4096,
save_strategy="epoch",
Expand Down
16 changes: 7 additions & 9 deletions tests/utils/test_embedding_resize.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
# Third Party
from transformers import (
AutoModelForCausalLM,
AutoModelForVision2Seq,
AutoModelForImageTextToText, #AutoModelForVision2Seq was renamed to this in transformers v5
AutoProcessor,
AutoTokenizer,
)
Expand Down Expand Up @@ -128,16 +128,16 @@ def test_special_tokens_before_and_after():
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

input_tokenizer_len = len(tokenizer.get_vocab())
addn_spl_tokens_before = tokenizer.special_tokens_map.get(
"additional_special_tokens"
)
addn_spl_tokens_before = list(tokenizer.extra_special_tokens) # "additional_special_tokens" was renamed to extra_special_tokens in transformers v5
assert (
len(addn_spl_tokens_before) > 0
), "this test needs tokenizer special tokens to not be empty before testing"

special_tokens_dict = {"sep_token": "<SEP>", "pad_token": "<PAD>"}
addn_spl_tokens_added = ["<NotSeenTokenA>", "<NotSeenTokenB>", "<NotSeenTokenC>"]
special_tokens_dict["additional_special_tokens"] = addn_spl_tokens_added
# for transformers v5: merge existing extra_special_tokens with new ones to prevent replacement
special_tokens_dict["additional_special_tokens"] = list(tokenizer.extra_special_tokens) + addn_spl_tokens_added # "additional_special_tokens" was renamed to extra_special_tokens in transformers v5
Comment thread
yash4242 marked this conversation as resolved.
Outdated


resize_result = tokenizer_and_embedding_resize(
special_tokens_dict=special_tokens_dict,
Expand All @@ -150,9 +150,7 @@ def test_special_tokens_before_and_after():
addn_spl_tokens_before.extend(addn_spl_tokens_added)
expected_addn_special_tokens = addn_spl_tokens_before
expected_embedding_size = input_tokenizer_len + len(addn_spl_tokens_added) + 2
addn_spl_tokens_after = tokenizer.special_tokens_map.get(
"additional_special_tokens"
)
addn_spl_tokens_after = list(tokenizer.extra_special_tokens) # "additional_special_tokens" was renamed to extra_special_tokens in transformers v5
Comment thread
yash4242 marked this conversation as resolved.
Outdated

assert "<SEP>" in tokenizer.get_vocab()
assert "<PAD>" in tokenizer.get_vocab()
Expand Down Expand Up @@ -212,7 +210,7 @@ def test_resize_with_multiple_of():


def test_resize_llama_vision_model():
model = AutoModelForVision2Seq.from_pretrained(TINY_LLAMA_VISION_MODEL_NAME)
model = AutoModelForImageTextToText.from_pretrained(TINY_LLAMA_VISION_MODEL_NAME) # AutoModelForVision2Seq was renamed to AutoModelForImageTextToText in transformers v5
processor = AutoProcessor.from_pretrained(TINY_LLAMA_VISION_MODEL_NAME)
tokenizer = processor.tokenizer

Expand Down
43 changes: 34 additions & 9 deletions tuning/data/tokenizer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,21 +44,42 @@ def get_special_tokens_dict(

special_tokens_dict = {}
if not tokenizer_name_or_path:
# TODO: understand if we need to hardcode these here or just use defaults in model
if isinstance(
tokenizer, (transformers.LlamaTokenizer, transformers.LlamaTokenizerFast)
):
# # TODO: understand if we need to hardcode these here or just use defaults in model
# if isinstance(
Comment thread
yash4242 marked this conversation as resolved.
Outdated
# tokenizer, (transformers.LlamaTokenizer, transformers.LlamaTokenizerFast)
# ):
llama_classes = tuple(
cls for cls in [
getattr(transformers, "LlamaTokenizer", None),
getattr(transformers, "LlamaTokenizerFast", None),
] if cls is not None
)
is_llama_tokenizer = (
(bool(llama_classes) and isinstance(tokenizer, llama_classes))
or "llama" in (getattr(tokenizer, "name_or_path", "") or "").lower()
)

gpt_neox_classes = tuple(
cls for cls in [
getattr(transformers, "GPTNeoXTokenizerFast", None),
getattr(transformers, "GPTNeoXTokenizer", None),
] if cls is not None
)

if is_llama_tokenizer:
special_tokens_dict["bos_token"] = "<s>"
special_tokens_dict["eos_token"] = "</s>"
special_tokens_dict["unk_token"] = "<unk>"
special_tokens_dict["pad_token"] = "<pad>"
elif isinstance(
tokenizer, (transformers.GPT2Tokenizer, transformers.GPTNeoXTokenizerFast)
# tokenizer, (transformers.GPT2Tokenizer, transformers.GPTNeoXTokenizerFast)
tokenizer, (transformers.GPT2Tokenizer, *gpt_neox_classes)
):
special_tokens_dict["pad_token"] = "<pad>"

# Add special tokens only when a custom tokenizer is not passed
if tokenizer.pad_token is None:
# if tokenizer.pad_token is None:
if tokenizer.pad_token is None or "pad_token" in special_tokens_dict:
logger.warning("PAD token set to default, missing in tokenizer")
special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
Expand Down Expand Up @@ -102,7 +123,8 @@ def tokenizer_and_embedding_resize(
dict: Metadata on number of added tokens.
"""
num_new_tokens = tokenizer.add_special_tokens(
special_tokens_dict=special_tokens_dict, replace_additional_special_tokens=False
special_tokens_dict=special_tokens_dict,
# replace_additional_special_tokens=False
)
embedding_size = int(multiple_of * math.ceil(len(tokenizer) / multiple_of))
num_new_tokens = num_new_tokens + embedding_size - len(tokenizer)
Expand All @@ -119,8 +141,11 @@ def tokenizer_and_embedding_resize(
model.set_input_embeddings(resized_input_embeddings)

# Resize vocab size when embeddings updated for Mllama models
if model.language_model.vocab_size != embedding_size:
model.language_model.vocab_size = embedding_size
# if model.language_model.vocab_size != embedding_size:
# model.language_model.vocab_size = embedding_size
if model.model.vocab_size != embedding_size:
model.model.vocab_size = embedding_size

else:
model.resize_token_embeddings(embedding_size)

Expand Down
5 changes: 3 additions & 2 deletions tuning/sft_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from torch.cuda import OutOfMemoryError
from transformers import (
AutoModelForCausalLM,
AutoModelForVision2Seq,
AutoModelForImageTextToText, # AutoModelForVision2Seq was renamed in transformers v5
AutoProcessor,
AutoTokenizer,
TrainerCallback,
Expand Down Expand Up @@ -292,7 +292,8 @@ def train(
)
)
# try to load model as a vision model
model = AutoModelForVision2Seq.from_pretrained(
# in transformers v5, AutoModelForVision2Seq was renamed to AutoModelForImageTextToText
model = AutoModelForImageTextToText.from_pretrained(
model_args.model_name_or_path, **model_kwargs
)
try:
Expand Down
Loading