diff --git a/.pylintrc b/.pylintrc
index 570205ae4..612fa0e8f 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -447,7 +447,10 @@ disable=raw-checker-failed,
         duplicate-code,
         unbalanced-tuple-unpacking,
         unspecified-encoding,
-        too-many-lines
+        too-many-lines,
+        no-name-in-module,
+        unexpected-keyword-arg,
+        unused-argument

 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py
index 43cf8dda0..968753bfd 100644
--- a/build/accelerate_launch.py
+++ b/build/accelerate_launch.py
@@ -110,7 +110,6 @@ def main():
         # message to termination log.
         logging.error(traceback.format_exc())
         # The exit code that sft_trainer.py threw is captured in e.returncode
-        return_code = e.returncode
         if return_code not in [INTERNAL_ERROR_EXIT_CODE, USER_ERROR_EXIT_CODE]:
             return_code = INTERNAL_ERROR_EXIT_CODE
@@ -118,6 +117,12 @@ def main():
         sys.exit(return_code)
     except Exception as e:  # pylint: disable=broad-except
         logging.error(traceback.format_exc())
+        # v5: torch.distributed raises ChildFailedError with per-rank exit codes
+        # Check if the root cause was a user error
+        if hasattr(e, "failures"):
+            root_codes = [f.exitcode for f in e.failures.values()]
+            if any(c == USER_ERROR_EXIT_CODE for c in root_codes):
+                sys.exit(USER_ERROR_EXIT_CODE)
         write_termination_log(f"Unhandled exception during training. {e}")
         sys.exit(INTERNAL_ERROR_EXIT_CODE)
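Note on the new `except Exception` branch above: with torch.distributed's elastic launcher, a failed worker typically surfaces as a `ChildFailedError` whose `failures` attribute maps each failed rank to a record carrying its `exitcode`, which is the shape the `hasattr(e, "failures")` check inspects. A minimal, self-contained sketch of that shape (the fake classes and the literal exit-code value below are illustrative stand-ins, not the real torch or build-script objects):

# Illustrative sketch only: the fakes mimic the attribute shape the new handler checks.
USER_ERROR_EXIT_CODE = 1  # stand-in value; the real constant comes from the build utilities


class FakeFailure:
    def __init__(self, exitcode):
        self.exitcode = exitcode


class FakeChildFailedError(Exception):
    def __init__(self, failures):
        super().__init__("worker failed")
        self.failures = failures  # rank -> failure record, as on ChildFailedError


e = FakeChildFailedError({0: FakeFailure(USER_ERROR_EXIT_CODE)})
if hasattr(e, "failures"):
    root_codes = [f.exitcode for f in e.failures.values()]
    print(any(c == USER_ERROR_EXIT_CODE for c in root_codes))  # True -> exit with the user-error code
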
diff --git a/pyproject.toml b/pyproject.toml
index 334ee79d2..27fdaaa7b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,18 +28,19 @@ classifiers=[
 dependencies = [
 "numpy>=1.26.4,<2.2.0",
 "accelerate>=1.9.0,<2.0.0",
-"transformers>=4.55.0,<=4.55.4",
-"torch>2.7.0,<2.9.0",
-"torchvision<0.24",
+"transformers>=5.2.0,<5.3.0",
+"torch>2.7.0,<=2.9.0",
+"torchvision<=0.24.0",
 "sentencepiece>=0.1.99,<0.3",
-"tokenizers<=0.22",
+"tokenizers<=0.23.0",
 "tqdm>=4.66.2,<5.0",
-"trl>=0.19.1,<0.20.0",
-"peft>=0.18.0,< 0.19.0",
+"trl>=0.27.0,<0.29.0",
+"peft>=0.18.1,<0.19.0",
 "datasets>=4.0.0,<5.0.0",
 "simpleeval>=0.9.13,<2.0",
 "pillow>=12.1.1",
-"kernels<=0.9.0",
+"kernels>=0.12.1,<0.13.0",
+"huggingface_hub>=1.3.0,<1.4.0",
 ]

 [project.optional-dependencies]
diff --git a/tests/build/test_launch_script.py b/tests/build/test_launch_script.py
index 322fe5998..400ecc105 100644
--- a/tests/build/test_launch_script.py
+++ b/tests/build/test_launch_script.py
@@ -51,7 +51,7 @@
     "warmup_ratio": 0.03,
     "lr_scheduler_type": "cosine",
     "logging_steps": 1,
-    "include_tokens_per_second": True,
+    "include_num_input_tokens_seen": True,
     "packing": False,
     "response_template": "\n### Label:",
     "dataset_text_field": "output",
diff --git a/tests/data/test_data_preprocessing.py b/tests/data/test_data_preprocessing.py
index a1072d2ec..0e7153932 100644
--- a/tests/data/test_data_preprocessing.py
+++ b/tests/data/test_data_preprocessing.py
@@ -22,7 +22,6 @@
 from datasets import Dataset, DatasetDict, IterableDataset
 from PIL import Image
 from transformers import AutoProcessor, AutoTokenizer, DataCollatorForSeq2Seq
-from trl import DataCollatorForCompletionOnlyLM
 import datasets
 import numpy as np
 import pyarrow
@@ -69,7 +68,7 @@
 # Local
 from tuning.config import configs
 from tuning.config.acceleration_configs import AttentionAndDistributedPackingConfig
-from tuning.data.collators import VisionDataCollator
+from tuning.data.collators import DataCollatorForCompletionOnlyLM, VisionDataCollator
 from tuning.data.data_config import (
     DataHandlerConfig,
     DataPreProcessorConfig,
diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py
index c5423fd17..df972dfbc 100644
--- a/tests/test_sft_trainer.py
+++ b/tests/test_sft_trainer.py
@@ -124,7 +124,7 @@
     warmup_ratio=0.03,
     lr_scheduler_type="cosine",
     logging_steps=1,
-    include_tokens_per_second=True,
+    include_num_input_tokens_seen=True,
     packing=False,
     max_seq_length=4096,
     save_strategy="epoch",
@@ -140,7 +140,7 @@
     warmup_ratio=0.03,
     lr_scheduler_type="cosine",
     logging_steps=1,
-    include_tokens_per_second=True,
+    include_num_input_tokens_seen=True,
     packing=False,
     max_seq_length=4096,
     save_strategy="epoch",
diff --git a/tests/utils/test_embedding_resize.py b/tests/utils/test_embedding_resize.py
index 5ec7e7ab7..2ef4513b8 100644
--- a/tests/utils/test_embedding_resize.py
+++ b/tests/utils/test_embedding_resize.py
@@ -20,11 +20,9 @@

 # Third Party
 from transformers import (
-    AutoModelForCausalLM,
-    AutoModelForVision2Seq,
-    AutoProcessor,
-    AutoTokenizer,
+    AutoModelForImageTextToText,  # AutoModelForVision2Seq was renamed to this in transformers v5
 )
+from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
 import torch

 # First Party
@@ -128,16 +126,17 @@ def test_special_tokens_before_and_after():
     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
     input_tokenizer_len = len(tokenizer.get_vocab())

-    addn_spl_tokens_before = tokenizer.special_tokens_map.get(
-        "additional_special_tokens"
-    )
+    addn_spl_tokens_before = list(tokenizer.extra_special_tokens)
     assert (
         len(addn_spl_tokens_before) > 0
     ), "this test needs tokenizer special tokens to not be empty before testing"

     special_tokens_dict = {"sep_token": "", "pad_token": ""}
     addn_spl_tokens_added = ["", "", ""]
-    special_tokens_dict["additional_special_tokens"] = addn_spl_tokens_added
+    # for transformers v5: merge existing extra_special_tokens with new ones to prevent replacement
+    special_tokens_dict["additional_special_tokens"] = (
+        list(tokenizer.extra_special_tokens) + addn_spl_tokens_added
+    )

     resize_result = tokenizer_and_embedding_resize(
         special_tokens_dict=special_tokens_dict,
@@ -150,9 +149,7 @@ def test_special_tokens_before_and_after():
     addn_spl_tokens_before.extend(addn_spl_tokens_added)
     expected_addn_special_tokens = addn_spl_tokens_before
     expected_embedding_size = input_tokenizer_len + len(addn_spl_tokens_added) + 2
-    addn_spl_tokens_after = tokenizer.special_tokens_map.get(
-        "additional_special_tokens"
-    )
+    addn_spl_tokens_after = list(tokenizer.extra_special_tokens)

     assert "" in tokenizer.get_vocab()
     assert "" in tokenizer.get_vocab()
@@ -212,7 +209,9 @@ def test_resize_with_multiple_of():


 def test_resize_llama_vision_model():
-    model = AutoModelForVision2Seq.from_pretrained(TINY_LLAMA_VISION_MODEL_NAME)
+    model = AutoModelForImageTextToText.from_pretrained(
+        TINY_LLAMA_VISION_MODEL_NAME
+    )  # AutoModelForVision2Seq was renamed to AutoModelForImageTextToText in transformers v5
     processor = AutoProcessor.from_pretrained(TINY_LLAMA_VISION_MODEL_NAME)
     tokenizer = processor.tokenizer
diff --git a/tuning/data/collators.py b/tuning/data/collators.py
index c7f63a99c..b9947e3ab 100644
--- a/tuning/data/collators.py
+++ b/tuning/data/collators.py
@@ -12,6 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# Standard
+from typing import Any, Optional, Union
+import logging
+
+# Third Party
+from transformers import DataCollatorForLanguageModeling
+import numpy as np
+import torch
+
 # Local
 from tuning.data.utils import try_convert_bytes_dict_to_pil

@@ -91,3 +100,239 @@ def __call__(self, features):
         batch["labels"] = labels

         return batch
+
+
+class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
+    """
+    Data collator used for completion tasks. It ensures that all the tokens of the
+    labels are set to an 'ignore_index' when they do not come from the assistant.
+    This ensures that the loss is only calculated on the completion made by the
+    assistant.
+
+    Args:
+        response_template (`Union[str, list[int]]`): the template form that indicates
+            the start of the response, typically something like '### Response:\n'.
+            It can also be passed as tokenized ids, which can be useful when using a
+            tokenizer that encodes the response differently if it does not have
+            proper context.
+        instruction_template (`Union[str, list[int]]`): the template form that
+            indicates the start of the human instruction, typically something like
+            '### Human:\n'. Useful for assistant-style conversation datasets. It can
+            also be passed as tokenized ids.
+        mlm (`bool`, *optional*, defaults to `False`): Whether to use masked language
+            modeling in the underlying `DataCollatorForLanguageModeling` class. Note
+            that this option currently has no effect but is present for flexibility
+            and backwards-compatibility.
+        ignore_index (`int`, *optional*, defaults to `-100`):
+            The index to use to ignore the initial tokens with.
+    """
+
+    def __init__(
+        self,
+        *args,
+        response_template: Union[str, list[int]],
+        instruction_template: Optional[Union[str, list[int]]] = None,
+        mlm: bool = False,
+        ignore_index: int = -100,
+        padding_free: bool = False,
+        **kwargs,
+    ):
+        super().__init__(*args, mlm=mlm, **kwargs)
+
+        self.instruction_template = instruction_template
+        if isinstance(instruction_template, str):
+            # The user provides a string, must tokenize
+            self.instruction_token_ids = self.tokenizer.encode(
+                self.instruction_template, add_special_tokens=False
+            )
+        else:
+            # The user already provides the token ids
+            self.instruction_token_ids = instruction_template
+
+        self.response_template = response_template
+        if isinstance(response_template, str):
+            # The user provides a string, must tokenize
+            self.response_token_ids = self.tokenizer.encode(
+                self.response_template, add_special_tokens=False
+            )
+        else:
+            # The user already provides the token ids
+            self.response_token_ids = response_template
+
+        if (
+            not self.mlm
+            and self.instruction_template
+            and self.tokenizer.pad_token_id == self.tokenizer.eos_token_id
+        ):
+            logging.warning(
+                "The pad_token_id and eos_token_id values "
+                "of this tokenizer are identical. "
+                "If you are planning for multi-turn training, "
+                "it can result in the model continuously generating "
+                "questions and answers without eos token. "
+                "To avoid this, set the pad_token_id to a different value.",
+            )
+
+        self.ignore_index = ignore_index
+        self.padding_free = padding_free
+
+    def torch_call(
+        self, examples: list[Union[list[int], Any, dict[str, Any]]]
+    ) -> dict[str, Any]:
+        batch = super().torch_call(examples)
+
+        if self.instruction_template is None:
+            for i in range(len(examples)):
+                response_token_ids_start_idx = None
+
+                for idx in np.where(batch["labels"][i] == self.response_token_ids[0])[
+                    0
+                ]:
+                    # `response_token_ids` is `'### Response:\n'`, here we are just
+                    # making sure that the token IDs match
+                    if (
+                        self.response_token_ids
+                        == batch["labels"][i][
+                            idx : idx + len(self.response_token_ids)
+                        ].tolist()
+                    ):
+                        response_token_ids_start_idx = idx
+
+                if response_token_ids_start_idx is None:
+                    logging.warning(
+                        "Could not find response key %s in the following instance: "
+                        "%s. This instance will be ignored in loss "
+                        "calculation. Note, if this happens often, "
+                        "consider increasing the `max_length`.",
+                        self.response_template,
+                        self.tokenizer.decode(batch["input_ids"][i]),
+                    )
+                    batch["labels"][i, :] = self.ignore_index
+                else:
+                    response_token_ids_end_idx = response_token_ids_start_idx + len(
+                        self.response_token_ids
+                    )
+
+                    # Make pytorch loss function ignore all
+                    # tokens up through the end of the response key
+                    batch["labels"][i, :response_token_ids_end_idx] = self.ignore_index
+
+        else:
+            for i in range(len(examples)):
+                response_token_ids_idxs = []
+                human_token_ids_idxs = []
+
+                for assistant_idx in np.where(
+                    batch["labels"][i] == self.response_token_ids[0]
+                )[0]:
+                    # find the indexes of the start of a response.
+                    if (
+                        self.response_token_ids
+                        == batch["labels"][i][
+                            assistant_idx : assistant_idx + len(self.response_token_ids)
+                        ].tolist()
+                    ):
+                        response_token_ids_idxs.append(
+                            assistant_idx + len(self.response_token_ids)
+                        )
+
+                if len(response_token_ids_idxs) == 0:
+                    logging.warning(
+                        "Could not find response key %s in the following instance: "
+                        "%s. This instance will be ignored in loss "
+                        "calculation. Note, if this happens often, "
+                        "consider increasing the `max_length`.",
+                        self.response_template,
+                        self.tokenizer.decode(batch["input_ids"][i]),
+                    )
+                    batch["labels"][i, :] = self.ignore_index
+
+                human_token_ids = self.instruction_token_ids
+                for human_idx in np.where(batch["labels"][i] == human_token_ids[0])[0]:
+                    # find the indexes of the start of a human answer.
+                    if (
+                        human_token_ids
+                        == batch["labels"][i][
+                            human_idx : human_idx + len(human_token_ids)
+                        ].tolist()
+                    ):
+                        human_token_ids_idxs.append(human_idx)
+
+                if len(human_token_ids_idxs) == 0:
+                    logging.warning(
+                        "Could not find instruction key `%s` in the following instance: "
+                        "%s. This instance will be ignored in loss "
+                        "calculation. Note, if this happens often, "
+                        "consider increasing the `max_length`.",
+                        self.instruction_template,
+                        self.tokenizer.decode(batch["input_ids"][i]),
+                    )
+                    batch["labels"][i, :] = self.ignore_index
+
+                if (
+                    len(human_token_ids_idxs) > 0
+                    and len(response_token_ids_idxs) > 0
+                    and human_token_ids_idxs[0] > response_token_ids_idxs[0]
+                ):
+                    human_token_ids_idxs = [0] + human_token_ids_idxs
+
+                for idx, (start, end) in enumerate(
+                    zip(human_token_ids_idxs, response_token_ids_idxs)
+                ):
+                    # Make pytorch loss function ignore all non response tokens
+                    if idx != 0:
+                        batch["labels"][i, start:end] = self.ignore_index
+                    else:
+                        batch["labels"][i, :end] = self.ignore_index
+
+                if len(response_token_ids_idxs) < len(human_token_ids_idxs):
+                    batch["labels"][i, human_token_ids_idxs[-1] :] = self.ignore_index
+
+        if self.padding_free:
+            # remove padding, `attention_mask` and add `position_ids`
+            attn_mask = batch.pop("attention_mask")
+            batch["input_ids"] = batch["input_ids"][attn_mask.bool()].unsqueeze(0)
+            batch["position_ids"] = (
+                attn_mask.cumsum(1)[attn_mask.bool()].unsqueeze(0) - 1
+            )
+            batch["labels"] = batch["labels"][attn_mask.bool()].unsqueeze(0)
+            batch["labels"][batch["position_ids"] == 0] = self.ignore_index
+
+            # Calculate cumulative sequence lengths for queries and
+            # keys to prevent graph breaks during further computations.
+            flattened_position_ids = batch["position_ids"].flatten()
+            indices_q = torch.arange(
+                flattened_position_ids.size(0),
+                device=flattened_position_ids.device,
+                dtype=torch.int32,
+            )
+            batch["cu_seq_lens_q"] = torch.cat(
+                (
+                    indices_q[flattened_position_ids == 0],
+                    torch.tensor(
+                        flattened_position_ids.size(),
+                        device=flattened_position_ids.device,
+                        dtype=torch.int32,
+                    ),
+                )
+            ).unsqueeze(0)
+            batch["cu_seq_lens_k"] = batch["cu_seq_lens_q"]
+
+            # Determine maximum sequence lengths to
+            # prevent graph breaks during further computations.
+            batch["max_length_k"] = torch.tensor(
+                [flattened_position_ids.max().item() + 1]
+            )
+            batch["max_length_q"] = batch["max_length_k"]
+
+        return batch
diff --git a/tuning/data/data_handlers.py b/tuning/data/data_handlers.py
index 327fb40ac..f549f18a2 100644
--- a/tuning/data/data_handlers.py
+++ b/tuning/data/data_handlers.py
@@ -532,7 +532,7 @@ def tokenize_and_apply_chat_template_with_masking(
             add_generation_prompt=False,
             tools=tools,
             documents=documents,
-        )
+        )["input_ids"]
     )

     # clone labels from input ids
@@ -557,7 +557,7 @@ def tokenize_and_apply_chat_template_with_masking(
                 add_generation_prompt=False,
                 tools=tools,
                 documents=documents,
-            ).shape[1]
+            )["input_ids"].shape[1]
         )
         # next, we calculate the end index of this non-assistant message
         if (
@@ -578,7 +578,7 @@
                     add_generation_prompt=True,
                     tools=tools,
                     documents=documents,
-                ).shape[1]
+                )["input_ids"].shape[1]
             )
         else:
             # for the last message or the message that doesn't follow with
@@ -594,7 +594,7 @@
                     add_generation_prompt=False,
                     tools=tools,
                     documents=documents,
-                ).shape[1]
+                )["input_ids"].shape[1]
             )
         # set the label to -100 for the non-assistant part
         labels[:, message_start_idx:message_end_idx] = -100
diff --git a/tuning/data/data_preprocessing_utils.py b/tuning/data/data_preprocessing_utils.py
index 04d0d7a2d..b43081864 100644
--- a/tuning/data/data_preprocessing_utils.py
+++ b/tuning/data/data_preprocessing_utils.py
@@ -22,11 +22,10 @@
     DataCollatorForSeq2Seq,
     LlavaProcessor,
 )
-from trl import DataCollatorForCompletionOnlyLM

 # Local
 from tuning.config import configs
-from tuning.data.collators import VisionDataCollator
+from tuning.data.collators import DataCollatorForCompletionOnlyLM, VisionDataCollator

 logger = logging.getLogger(__name__)
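Note on the data_handlers.py hunks above: each one makes the same adjustment, indexing the tokenized chat-template result with ["input_ids"] before reading .shape[1], i.e. the handler now expects a mapping rather than a bare tensor. A tiny sketch of the pattern with a faked return value (the dict below is an illustrative stand-in for whatever the tokenizer call returns, not the real API):

# Stand-in object mimicking the mapping-with-"input_ids" shape the updated handler expects.
import torch

tokenized = {"input_ids": torch.tensor([[101, 7592, 2088, 102]])}

# before: message_end_idx = tokenizer.apply_chat_template(...).shape[1]
# after:  message_end_idx = tokenizer.apply_chat_template(...)["input_ids"].shape[1]
message_end_idx = tokenized["input_ids"].shape[1]
print(message_end_idx)  # 4
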
diff --git a/tuning/data/tokenizer_utils.py b/tuning/data/tokenizer_utils.py
index 622da0e3c..13faa47d8 100644
--- a/tuning/data/tokenizer_utils.py
+++ b/tuning/data/tokenizer_utils.py
@@ -44,21 +44,37 @@ def get_special_tokens_dict(
     special_tokens_dict = {}
     if not tokenizer_name_or_path:
-        # TODO: understand if we need to hardcode these here or just use defaults in model
-        if isinstance(
-            tokenizer, (transformers.LlamaTokenizer, transformers.LlamaTokenizerFast)
-        ):
+        llama_classes = tuple(
+            cls
+            for cls in [
+                getattr(transformers, "LlamaTokenizer", None),
+                getattr(transformers, "LlamaTokenizerFast", None),
+            ]
+            if cls is not None
+        )
+        is_llama_tokenizer = (
+            bool(llama_classes) and isinstance(tokenizer, llama_classes)
+        ) or "llama" in (getattr(tokenizer, "name_or_path", "") or "").lower()
+
+        gpt_neox_classes = tuple(
+            cls
+            for cls in [
+                getattr(transformers, "GPTNeoXTokenizerFast", None),
+                getattr(transformers, "GPTNeoXTokenizer", None),
+            ]
+            if cls is not None
+        )
+
+        if is_llama_tokenizer:
             special_tokens_dict["bos_token"] = ""
             special_tokens_dict["eos_token"] = ""
             special_tokens_dict["unk_token"] = ""
             special_tokens_dict["pad_token"] = ""
-        elif isinstance(
-            tokenizer, (transformers.GPT2Tokenizer, transformers.GPTNeoXTokenizerFast)
-        ):
+        elif isinstance(tokenizer, (transformers.GPT2Tokenizer, *gpt_neox_classes)):
             special_tokens_dict["pad_token"] = ""
     # Add special tokens only when a custom tokenizer is not passed
-    if tokenizer.pad_token is None:
+    if tokenizer.pad_token is None or "pad_token" in special_tokens_dict:
         logger.warning("PAD token set to default, missing in tokenizer")
         special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
     if tokenizer.eos_token is None:
@@ -102,7 +118,8 @@
         dict: Metadata on number of added tokens.
     """
     num_new_tokens = tokenizer.add_special_tokens(
-        special_tokens_dict=special_tokens_dict, replace_additional_special_tokens=False
+        special_tokens_dict=special_tokens_dict,
+        # replace_additional_special_tokens=False
     )
     embedding_size = int(multiple_of * math.ceil(len(tokenizer) / multiple_of))
     num_new_tokens = num_new_tokens + embedding_size - len(tokenizer)
@@ -119,8 +136,9 @@
         model.set_input_embeddings(resized_input_embeddings)

         # Resize vocab size when embeddings updated for Mllama models
-        if model.language_model.vocab_size != embedding_size:
-            model.language_model.vocab_size = embedding_size
+        if model.model.vocab_size != embedding_size:
+            model.model.vocab_size = embedding_size
+
     else:
         model.resize_token_embeddings(embedding_size)
diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py
index 1fa524e89..51d23fe16 100644
--- a/tuning/sft_trainer.py
+++ b/tuning/sft_trainer.py
@@ -28,9 +28,11 @@
 from peft import LoraConfig
 from peft.utils.other import fsdp_auto_wrap_policy
 from torch.cuda import OutOfMemoryError
+from transformers import (
+    AutoModelForImageTextToText,  # AutoModelForVision2Seq was renamed in transformers v5
+)
 from transformers import (
     AutoModelForCausalLM,
-    AutoModelForVision2Seq,
     AutoProcessor,
     AutoTokenizer,
     TrainerCallback,
@@ -292,7 +294,8 @@ def train(
             )
         )
         # try to load model as a vision model
-        model = AutoModelForVision2Seq.from_pretrained(
+        # in transformers v5, AutoModelForVision2Seq was renamed to AutoModelForImageTextToText
+        model = AutoModelForImageTextToText.from_pretrained(
             model_args.model_name_or_path, **model_kwargs
         )
         try:
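Closing note: since `DataCollatorForCompletionOnlyLM` is now vendored in `tuning.data.collators` rather than imported from `trl`, callers construct it exactly as before. A rough usage sketch, with assumptions marked (the `gpt2` checkpoint is only a small, convenient tokenizer for illustration, and the response template is borrowed from the test configs above):

# Usage sketch only; the checkpoint choice is an illustrative assumption.
from transformers import AutoTokenizer

from tuning.data.collators import DataCollatorForCompletionOnlyLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

collator = DataCollatorForCompletionOnlyLM(
    response_template="\n### Label:",  # same template the tests above use
    tokenizer=tokenizer,
    ignore_index=-100,
)

examples = [tokenizer("### Text: hello world\n### Label: greeting")]
batch = collator(examples)
# Wherever the response template's token ids are found in a sequence, every label
# up to and including the template is set to ignore_index, so only the completion
# contributes to the loss.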