diff --git a/.pylintrc b/.pylintrc
index 612fa0e8f..570205ae4 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -447,10 +447,7 @@ disable=raw-checker-failed,
         duplicate-code,
         unbalanced-tuple-unpacking,
         unspecified-encoding,
-        too-many-lines,
-        no-name-in-module,
-        unexpected-keyword-arg,
-        unused-argument
+        too-many-lines
 
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py
index 968753bfd..43cf8dda0 100644
--- a/build/accelerate_launch.py
+++ b/build/accelerate_launch.py
@@ -110,6 +110,7 @@ def main():
         # message to termination log.
         logging.error(traceback.format_exc())
         # The exit code that sft_trainer.py threw is captured in e.returncode
+
         return_code = e.returncode
         if return_code not in [INTERNAL_ERROR_EXIT_CODE, USER_ERROR_EXIT_CODE]:
             return_code = INTERNAL_ERROR_EXIT_CODE
@@ -117,12 +118,6 @@ def main():
         sys.exit(return_code)
     except Exception as e:  # pylint: disable=broad-except
         logging.error(traceback.format_exc())
-        # v5: torch.distributed raises ChildFailedError with per-rank exit codes
-        # Check if the root cause was a user error
-        if hasattr(e, "failures"):
-            root_codes = [f.exitcode for f in e.failures.values()]
-            if any(c == USER_ERROR_EXIT_CODE for c in root_codes):
-                sys.exit(USER_ERROR_EXIT_CODE)
         write_termination_log(f"Unhandled exception during training. {e}")
         sys.exit(INTERNAL_ERROR_EXIT_CODE)
 
diff --git a/pyproject.toml b/pyproject.toml
index 27fdaaa7b..334ee79d2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,19 +28,18 @@ classifiers=[
 dependencies = [
 "numpy>=1.26.4,<2.2.0",
 "accelerate>=1.9.0,<2.0.0",
-"transformers>=5.2.0,<5.3.0",
-"torch>2.7.0,<=2.9.0",
-"torchvision<=0.24.0",
+"transformers>=4.55.0,<=4.55.4",
+"torch>2.7.0,<2.9.0",
+"torchvision<0.24",
 "sentencepiece>=0.1.99,<0.3",
-"tokenizers<=0.23.0",
+"tokenizers<=0.22",
 "tqdm>=4.66.2,<5.0",
-"trl>=0.27.0,<0.29.0",
-"peft>=0.18.1,<0.19.0",
+"trl>=0.19.1,<0.20.0",
+"peft>=0.18.0,< 0.19.0",
 "datasets>=4.0.0,<5.0.0",
 "simpleeval>=0.9.13,<2.0",
 "pillow>=12.1.1",
-"kernels>=0.12.1,<0.13.0",
-"huggingface_hub>=1.3.0,<1.4.0",
+"kernels<=0.9.0",
 ]
 
 [project.optional-dependencies]
diff --git a/tests/build/test_launch_script.py b/tests/build/test_launch_script.py
index 400ecc105..322fe5998 100644
--- a/tests/build/test_launch_script.py
+++ b/tests/build/test_launch_script.py
@@ -51,7 +51,7 @@
     "warmup_ratio": 0.03,
     "lr_scheduler_type": "cosine",
     "logging_steps": 1,
-    "include_num_input_tokens_seen": True,
+    "include_tokens_per_second": True,
     "packing": False,
     "response_template": "\n### Label:",
     "dataset_text_field": "output",
diff --git a/tests/data/test_data_preprocessing.py b/tests/data/test_data_preprocessing.py
index 5732ae06c..7d0dd2eee 100644
--- a/tests/data/test_data_preprocessing.py
+++ b/tests/data/test_data_preprocessing.py
@@ -22,6 +22,7 @@
 from datasets import Dataset, DatasetDict, IterableDataset
 from PIL import Image
 from transformers import AutoProcessor, AutoTokenizer, DataCollatorForSeq2Seq
+from trl import DataCollatorForCompletionOnlyLM
 import datasets
 import numpy as np
 import pyarrow
@@ -68,7 +69,7 @@
 # Local
 from tuning.config import configs
 from tuning.config.acceleration_configs import AttentionAndDistributedPackingConfig
-from tuning.data.collators import DataCollatorForCompletionOnlyLM, VisionDataCollator
+from tuning.data.collators import VisionDataCollator
 from tuning.data.data_config import (
     DataHandlerConfig,
     DataPreProcessorConfig,
diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py
index 029cf63ad..10ed33a09 100644
--- a/tests/test_sft_trainer.py
+++ b/tests/test_sft_trainer.py
@@ -124,7 +124,7 @@
     warmup_ratio=0.03,
     lr_scheduler_type="cosine",
     logging_steps=1,
-    include_num_input_tokens_seen=True,
+    include_tokens_per_second=True,
     packing=False,
     max_seq_length=4096,
     save_strategy="epoch",
@@ -140,7 +140,7 @@
     warmup_ratio=0.03,
     lr_scheduler_type="cosine",
     logging_steps=1,
-    include_num_input_tokens_seen=True,
+    include_tokens_per_second=True,
     packing=False,
     max_seq_length=4096,
     save_strategy="epoch",
diff --git a/tests/utils/test_embedding_resize.py b/tests/utils/test_embedding_resize.py
index 2ef4513b8..5ec7e7ab7 100644
--- a/tests/utils/test_embedding_resize.py
+++ b/tests/utils/test_embedding_resize.py
@@ -20,9 +20,11 @@
 
 # Third Party
 from transformers import (
-    AutoModelForImageTextToText,  # AutoModelForVision2Seq was renamed to this in transformers v5
+    AutoModelForCausalLM,
+    AutoModelForVision2Seq,
+    AutoProcessor,
+    AutoTokenizer,
 )
-from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
 import torch
 
 # First Party
@@ -126,17 +128,16 @@ def test_special_tokens_before_and_after():
     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
 
     input_tokenizer_len = len(tokenizer.get_vocab())
-    addn_spl_tokens_before = list(tokenizer.extra_special_tokens)
+    addn_spl_tokens_before = tokenizer.special_tokens_map.get(
+        "additional_special_tokens"
+    )
     assert (
         len(addn_spl_tokens_before) > 0
     ), "this test needs tokenizer special tokens to not be empty before testing"
 
     special_tokens_dict = {"sep_token": "<SEP>", "pad_token": "<PAD>"}
     addn_spl_tokens_added = ["<NotSeenTokenA>", "<NotSeenTokenB>", "<NotSeenTokenC>"]
-    # for transformers v5: merge existing extra_special_tokens with new ones to prevent replacement
-    special_tokens_dict["additional_special_tokens"] = (
-        list(tokenizer.extra_special_tokens) + addn_spl_tokens_added
-    )
+    special_tokens_dict["additional_special_tokens"] = addn_spl_tokens_added
 
     resize_result = tokenizer_and_embedding_resize(
         special_tokens_dict=special_tokens_dict,
@@ -149,7 +150,9 @@ def test_special_tokens_before_and_after():
     addn_spl_tokens_before.extend(addn_spl_tokens_added)
     expected_addn_special_tokens = addn_spl_tokens_before
     expected_embedding_size = input_tokenizer_len + len(addn_spl_tokens_added) + 2
-    addn_spl_tokens_after = list(tokenizer.extra_special_tokens)
+    addn_spl_tokens_after = tokenizer.special_tokens_map.get(
+        "additional_special_tokens"
+    )
 
     assert "<SEP>" in tokenizer.get_vocab()
     assert "<PAD>" in tokenizer.get_vocab()
@@ -209,9 +212,7 @@ def test_resize_with_multiple_of():
 
 
 def test_resize_llama_vision_model():
-    model = AutoModelForImageTextToText.from_pretrained(
-        TINY_LLAMA_VISION_MODEL_NAME
-    )  # AutoModelForVision2Seq was renamed to AutoModelForImageTextToText in transformers v5
+    model = AutoModelForVision2Seq.from_pretrained(TINY_LLAMA_VISION_MODEL_NAME)
     processor = AutoProcessor.from_pretrained(TINY_LLAMA_VISION_MODEL_NAME)
     tokenizer = processor.tokenizer
 
diff --git a/tuning/data/collators.py b/tuning/data/collators.py
index b9947e3ab..c7f63a99c 100644
--- a/tuning/data/collators.py
+++ b/tuning/data/collators.py
@@ -12,15 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Standard
-from typing import Any, Optional, Union
-import logging
-
-# Third Party
-from transformers import DataCollatorForLanguageModeling
-import numpy as np
-import torch
-
 # Local
 from tuning.data.utils import try_convert_bytes_dict_to_pil
 
@@ -100,239 +91,3 @@ def __call__(self, features):
         batch["labels"] = labels
 
         return batch
-
-
-class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
-    """
-    Data collator used for completion tasks.
-    It ensures that all the tokens of the labels
-    are set to an 'ignore_index'
-    when they do not come from the assistant.
-    This ensure that the loss is only calculated on the completion made by
-    the assistant.
-
-    Args:
-        response_template (`Union[str, list[int]]`):
-            the template form that indicates the
-            start of the response, typically
-            something like '### Response:\n'. It
-            can also be passed as tokenized ids,
-            which can be useful when using a tokenizer
-            that encodes the response
-            differently if it does not have proper context.
-        instruction_template (`Union[str, list[int]]`):
-            the template form that indicates the start
-            of the human instruction, typically
-            something like '###
-            Human:\n'. Useful for assistant-style
-            conversation datasets. It can also be passed
-            as tokenized ids.
-        mlm (`bool`, *optional*, defaults to `False`): Whether
-        to use masked language modeling in the underlying
-            `DataCollatorForLanguageModeling` class.
-            Note that this option currently has no effect but is present
-             for flexibility and backwards-compatibility.
-        ignore_index (`int`, *optional*, defaults to `-100`):
-            The index to use to ignore the initial tokens with
-    """
-
-    def __init__(
-        self,
-        *args,
-        response_template: Union[str, list[int]],
-        instruction_template: Optional[Union[str, list[int]]] = None,
-        mlm: bool = False,
-        ignore_index: int = -100,
-        padding_free: bool = False,
-        **kwargs,
-    ):
-        super().__init__(*args, mlm=mlm, **kwargs)
-
-        self.instruction_template = instruction_template
-        if isinstance(instruction_template, str):
-            # The user provides a string, must tokenize
-            self.instruction_token_ids = self.tokenizer.encode(
-                self.instruction_template, add_special_tokens=False
-            )
-        else:
-            # The user already provides the token ids
-            self.instruction_token_ids = instruction_template
-
-        self.response_template = response_template
-        if isinstance(response_template, str):
-            # The user provides a string, must tokenize
-            self.response_token_ids = self.tokenizer.encode(
-                self.response_template, add_special_tokens=False
-            )
-        else:
-            # The user already provides the token ids
-            self.response_token_ids = response_template
-
-        if (
-            not self.mlm
-            and self.instruction_template
-            and self.tokenizer.pad_token_id == self.tokenizer.eos_token_id
-        ):
-            logging.warning(
-                "The pad_token_id and eos_token_id values "
-                "of this tokenizer are identical. "
-                "If you are planning for multi-turn training, "
-                "it can result in the model continuously generating "
-                "questions and answers without eos token. "
-                "To avoid this, set the pad_token_id to a different value.",
-            )
-
-        self.ignore_index = ignore_index
-        self.padding_free = padding_free
-
-    def torch_call(
-        self, examples: list[Union[list[int], Any, dict[str, Any]]]
-    ) -> dict[str, Any]:
-        batch = super().torch_call(examples)
-
-        if self.instruction_template is None:
-            for i in range(len(examples)):
-                response_token_ids_start_idx = None
-
-                for idx in np.where(batch["labels"][i] == self.response_token_ids[0])[
-                    0
-                ]:
-                    # `response_token_ids` is
-                    # `'### Response:\n'`, here we are just making sure
-                    # that the token IDs match
-                    if (
-                        self.response_token_ids
-                        == batch["labels"][i][
-                            idx : idx + len(self.response_token_ids)
-                        ].tolist()
-                    ):
-                        response_token_ids_start_idx = idx
-
-                if response_token_ids_start_idx is None:
-                    logging.warning(
-                        "Could not find response key %s in the following instance: "
-                        "%s. This instance will be ignored in loss "
-                        "calculation. Note, if this happens often, "
-                        "consider increasing the `max_length`.",
-                        self.response_template,
-                        self.tokenizer.decode(batch["input_ids"][i]),
-                    )
-                    batch["labels"][i, :] = self.ignore_index
-                else:
-                    response_token_ids_end_idx = response_token_ids_start_idx + len(
-                        self.response_token_ids
-                    )
-
-                    # Make pytorch loss function ignore all
-                    # tokens up through the end of the response key
-                    batch["labels"][i, :response_token_ids_end_idx] = self.ignore_index
-
-        else:
-            for i in range(len(examples)):
-                response_token_ids_idxs = []
-                human_token_ids_idxs = []
-
-                for assistant_idx in np.where(
-                    batch["labels"][i] == self.response_token_ids[0]
-                )[0]:
-                    # find the indexes of the start of a response.
-                    if (
-                        self.response_token_ids
-                        == batch["labels"][i][
-                            assistant_idx : assistant_idx + len(self.response_token_ids)
-                        ].tolist()
-                    ):
-                        response_token_ids_idxs.append(
-                            assistant_idx + len(self.response_token_ids)
-                        )
-
-                if len(response_token_ids_idxs) == 0:
-                    logging.warning(
-                        "Could not find response key %s in the following instance: "
-                        "%s. This instance will be ignored in loss "
-                        "calculation. Note, if this happens often, "
-                        "consider increasing the `max_length`.",
-                        self.response_template,
-                        self.tokenizer.decode(batch["input_ids"][i]),
-                    )
-                    batch["labels"][i, :] = self.ignore_index
-
-                human_token_ids = self.instruction_token_ids
-                for human_idx in np.where(batch["labels"][i] == human_token_ids[0])[0]:
-                    # find the indexes of the start of a human answer.
-                    if (
-                        human_token_ids
-                        == batch["labels"][i][
-                            human_idx : human_idx + len(human_token_ids)
-                        ].tolist()
-                    ):
-                        human_token_ids_idxs.append(human_idx)
-
-                if len(human_token_ids_idxs) == 0:
-                    logging.warning(
-                        "Could not find instruction key `%s` in the following instance: "
-                        "%s. This instance will be ignored in loss "
-                        "calculation. Note, if this happens often, "
-                        "consider increasing the `max_length`.",
-                        self.instruction_template,
-                        self.tokenizer.decode(batch["input_ids"][i]),
-                    )
-                    batch["labels"][i, :] = self.ignore_index
-
-                if (
-                    len(human_token_ids_idxs) > 0
-                    and len(response_token_ids_idxs) > 0
-                    and human_token_ids_idxs[0] > response_token_ids_idxs[0]
-                ):
-                    human_token_ids_idxs = [0] + human_token_ids_idxs
-
-                for idx, (start, end) in enumerate(
-                    zip(human_token_ids_idxs, response_token_ids_idxs)
-                ):
-                    # Make pytorch loss function ignore all non response tokens
-                    if idx != 0:
-                        batch["labels"][i, start:end] = self.ignore_index
-                    else:
-                        batch["labels"][i, :end] = self.ignore_index
-
-                if len(response_token_ids_idxs) < len(human_token_ids_idxs):
-                    batch["labels"][i, human_token_ids_idxs[-1] :] = self.ignore_index
-
-        if self.padding_free:
-            # remove padding, `attention_mask` and add `position_ids`
-            attn_mask = batch.pop("attention_mask")
-            batch["input_ids"] = batch["input_ids"][attn_mask.bool()].unsqueeze(0)
-            batch["position_ids"] = (
-                attn_mask.cumsum(1)[attn_mask.bool()].unsqueeze(0) - 1
-            )
-            batch["labels"] = batch["labels"][attn_mask.bool()].unsqueeze(0)
-            batch["labels"][batch["position_ids"] == 0] = self.ignore_index
-
-            # Calculate cumulative sequence lengths for queries and
-            # keys to prevent graph breaks during further computations.
-            flattened_position_ids = batch["position_ids"].flatten()
-            indices_q = torch.arange(
-                flattened_position_ids.size(0),
-                device=flattened_position_ids.device,
-                dtype=torch.int32,
-            )
-            batch["cu_seq_lens_q"] = torch.cat(
-                (
-                    indices_q[flattened_position_ids == 0],
-                    torch.tensor(
-                        flattened_position_ids.size(),
-                        device=flattened_position_ids.device,
-                        dtype=torch.int32,
-                    ),
-                )
-            ).unsqueeze(0)
-            batch["cu_seq_lens_k"] = batch["cu_seq_lens_q"]
-
-            # Determine maximum sequence lengths to
-            # prevent graph breaks during further computations.
-            batch["max_length_k"] = torch.tensor(
-                [flattened_position_ids.max().item() + 1]
-            )
-            batch["max_length_q"] = batch["max_length_k"]
-
-        return batch
diff --git a/tuning/data/data_handlers.py b/tuning/data/data_handlers.py
index f549f18a2..327fb40ac 100644
--- a/tuning/data/data_handlers.py
+++ b/tuning/data/data_handlers.py
@@ -532,7 +532,7 @@ def tokenize_and_apply_chat_template_with_masking(
             add_generation_prompt=False,
             tools=tools,
             documents=documents,
-        )["input_ids"]
+        )
     )
 
     # clone labels from input ids
@@ -557,7 +557,7 @@ def tokenize_and_apply_chat_template_with_masking(
                         add_generation_prompt=False,
                         tools=tools,
                         documents=documents,
-                    )["input_ids"].shape[1]
+                    ).shape[1]
                 )
             # next, we calculate the end index of this non-assistant message
             if (
@@ -578,7 +578,7 @@ def tokenize_and_apply_chat_template_with_masking(
                         add_generation_prompt=True,
                         tools=tools,
                         documents=documents,
-                    )["input_ids"].shape[1]
+                    ).shape[1]
                 )
             else:
                 # for the last message or the message that doesn't follow with
@@ -594,7 +594,7 @@ def tokenize_and_apply_chat_template_with_masking(
                         add_generation_prompt=False,
                         tools=tools,
                         documents=documents,
-                    )["input_ids"].shape[1]
+                    ).shape[1]
                 )
             # set the label to -100 for the non-assistant part
             labels[:, message_start_idx:message_end_idx] = -100
diff --git a/tuning/data/data_preprocessing_utils.py b/tuning/data/data_preprocessing_utils.py
index b43081864..04d0d7a2d 100644
--- a/tuning/data/data_preprocessing_utils.py
+++ b/tuning/data/data_preprocessing_utils.py
@@ -22,10 +22,11 @@
     DataCollatorForSeq2Seq,
     LlavaProcessor,
 )
+from trl import DataCollatorForCompletionOnlyLM
 
 # Local
 from tuning.config import configs
-from tuning.data.collators import DataCollatorForCompletionOnlyLM, VisionDataCollator
+from tuning.data.collators import VisionDataCollator
 
 logger = logging.getLogger(__name__)
 
diff --git a/tuning/data/tokenizer_utils.py b/tuning/data/tokenizer_utils.py
index 13faa47d8..622da0e3c 100644
--- a/tuning/data/tokenizer_utils.py
+++ b/tuning/data/tokenizer_utils.py
@@ -44,37 +44,21 @@ def get_special_tokens_dict(
 
     special_tokens_dict = {}
     if not tokenizer_name_or_path:
-        llama_classes = tuple(
-            cls
-            for cls in [
-                getattr(transformers, "LlamaTokenizer", None),
-                getattr(transformers, "LlamaTokenizerFast", None),
-            ]
-            if cls is not None
-        )
-        is_llama_tokenizer = (
-            bool(llama_classes) and isinstance(tokenizer, llama_classes)
-        ) or "llama" in (getattr(tokenizer, "name_or_path", "") or "").lower()
-
-        gpt_neox_classes = tuple(
-            cls
-            for cls in [
-                getattr(transformers, "GPTNeoXTokenizerFast", None),
-                getattr(transformers, "GPTNeoXTokenizer", None),
-            ]
-            if cls is not None
-        )
-
-        if is_llama_tokenizer:
+        # TODO: confirm whether these must be hardcoded here or the model defaults suffice
+        if isinstance(
+            tokenizer, (transformers.LlamaTokenizer, transformers.LlamaTokenizerFast)
+        ):
             special_tokens_dict["bos_token"] = "<s>"
             special_tokens_dict["eos_token"] = "</s>"
             special_tokens_dict["unk_token"] = "<unk>"
             special_tokens_dict["pad_token"] = "<pad>"
-        elif isinstance(tokenizer, (transformers.GPT2Tokenizer, *gpt_neox_classes)):
+        elif isinstance(
+            tokenizer, (transformers.GPT2Tokenizer, transformers.GPTNeoXTokenizerFast)
+        ):
             special_tokens_dict["pad_token"] = "<pad>"
 
         # Add special tokens only when a custom tokenizer is not passed
-        if tokenizer.pad_token is None or "pad_token" in special_tokens_dict:
+        if tokenizer.pad_token is None:
             logger.warning("PAD token set to default, missing in tokenizer")
             special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
         if tokenizer.eos_token is None:
@@ -118,8 +102,7 @@ def tokenizer_and_embedding_resize(
         dict: Metadata on number of added tokens.
     """
     num_new_tokens = tokenizer.add_special_tokens(
-        special_tokens_dict=special_tokens_dict,
-        # replace_additional_special_tokens=False
+        special_tokens_dict=special_tokens_dict, replace_additional_special_tokens=False
     )
     embedding_size = int(multiple_of * math.ceil(len(tokenizer) / multiple_of))
     num_new_tokens = num_new_tokens + embedding_size - len(tokenizer)
@@ -136,9 +119,8 @@ def tokenizer_and_embedding_resize(
         model.set_input_embeddings(resized_input_embeddings)
 
         # Resize vocab size when embeddings updated for Mllama models
-        if model.model.vocab_size != embedding_size:
-            model.model.vocab_size = embedding_size
-
+        if model.language_model.vocab_size != embedding_size:
+            model.language_model.vocab_size = embedding_size
     else:
         model.resize_token_embeddings(embedding_size)
 
diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py
index 51d23fe16..1fa524e89 100644
--- a/tuning/sft_trainer.py
+++ b/tuning/sft_trainer.py
@@ -28,11 +28,9 @@
 from peft import LoraConfig
 from peft.utils.other import fsdp_auto_wrap_policy
 from torch.cuda import OutOfMemoryError
-from transformers import (
-    AutoModelForImageTextToText,  # AutoModelForVision2Seq was renamed in transformers v5
-)
 from transformers import (
     AutoModelForCausalLM,
+    AutoModelForVision2Seq,
     AutoProcessor,
     AutoTokenizer,
     TrainerCallback,
@@ -294,8 +292,7 @@ def train(
                 )
             )
             # try to load model as a vision model
-            # in transformers v5, AutoModelForVision2Seq was renamed to AutoModelForImageTextToText
-            model = AutoModelForImageTextToText.from_pretrained(
+            model = AutoModelForVision2Seq.from_pretrained(
                 model_args.model_name_or_path, **model_kwargs
             )
             try: