diff --git a/.pylintrc b/.pylintrc
index 570205ae4..612fa0e8f 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -447,7 +447,10 @@ disable=raw-checker-failed,
         duplicate-code,
         unbalanced-tuple-unpacking,
         unspecified-encoding,
-        too-many-lines
+        too-many-lines,
+        no-name-in-module,
+        unexpected-keyword-arg,
+        unused-argument

 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py
index 43cf8dda0..968753bfd 100644
--- a/build/accelerate_launch.py
+++ b/build/accelerate_launch.py
@@ -110,7 +110,6 @@ def main():
         # message to termination log.
         logging.error(traceback.format_exc())
         # The exit code that sft_trainer.py threw is captured in e.returncode
-        return_code = e.returncode
         if return_code not in [INTERNAL_ERROR_EXIT_CODE, USER_ERROR_EXIT_CODE]:
             return_code = INTERNAL_ERROR_EXIT_CODE
@@ -118,6 +117,12 @@ def main():
         sys.exit(return_code)
     except Exception as e:  # pylint: disable=broad-except
         logging.error(traceback.format_exc())
+        # v5: torch.distributed raises ChildFailedError with per-rank exit codes
+        # Check if the root cause was a user error
+        if hasattr(e, "failures"):
+            root_codes = [f.exitcode for f in e.failures.values()]
+            if any(c == USER_ERROR_EXIT_CODE for c in root_codes):
+                sys.exit(USER_ERROR_EXIT_CODE)
         write_termination_log(f"Unhandled exception during training. {e}")
         sys.exit(INTERNAL_ERROR_EXIT_CODE)
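Note on the new `except Exception` branch above: with torch.distributed's elastic launcher, a failed worker typically surfaces as a `ChildFailedError` whose `failures` attribute maps each failed rank to a record carrying its `exitcode`, which is the shape the `hasattr(e, "failures")` check inspects. A minimal, self-contained sketch of that shape (the fake classes and the literal exit-code value below are illustrative stand-ins, not the real torch or build-script objects):

# Illustrative sketch only: the fakes mimic the attribute shape the new handler checks.
USER_ERROR_EXIT_CODE = 1  # stand-in value; the real constant comes from the build utilities


class FakeFailure:
    def __init__(self, exitcode):
        self.exitcode = exitcode


class FakeChildFailedError(Exception):
    def __init__(self, failures):
        super().__init__("worker failed")
        self.failures = failures  # rank -> failure record, as on ChildFailedError


e = FakeChildFailedError({0: FakeFailure(USER_ERROR_EXIT_CODE)})
if hasattr(e, "failures"):
    root_codes = [f.exitcode for f in e.failures.values()]
    print(any(c == USER_ERROR_EXIT_CODE for c in root_codes))  # True -> exit with the user-error code
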
diff --git a/pyproject.toml b/pyproject.toml
index 334ee79d2..27fdaaa7b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,18 +28,19 @@ classifiers=[
 dependencies = [
 "numpy>=1.26.4,<2.2.0",
 "accelerate>=1.9.0,<2.0.0",
-"transformers>=4.55.0,<=4.55.4",
-"torch>2.7.0,<2.9.0",
-"torchvision<0.24",
+"transformers>=5.2.0,<5.3.0",
+"torch>2.7.0,<=2.9.0",
+"torchvision<=0.24.0",
 "sentencepiece>=0.1.99,<0.3",
-"tokenizers<=0.22",
+"tokenizers<=0.23.0",
 "tqdm>=4.66.2,<5.0",
-"trl>=0.19.1,<0.20.0",
-"peft>=0.18.0,< 0.19.0",
+"trl>=0.27.0,<0.29.0",
+"peft>=0.18.1,<0.19.0",
 "datasets>=4.0.0,<5.0.0",
 "simpleeval>=0.9.13,<2.0",
 "pillow>=12.1.1",
-"kernels<=0.9.0",
+"kernels>=0.12.1,<0.13.0",
+"huggingface_hub>=1.3.0,<1.4.0",
 ]

 [project.optional-dependencies]
diff --git a/tests/build/test_launch_script.py b/tests/build/test_launch_script.py
index 322fe5998..400ecc105 100644
--- a/tests/build/test_launch_script.py
+++ b/tests/build/test_launch_script.py
@@ -51,7 +51,7 @@
     "warmup_ratio": 0.03,
     "lr_scheduler_type": "cosine",
     "logging_steps": 1,
-    "include_tokens_per_second": True,
+    "include_num_input_tokens_seen": True,
     "packing": False,
     "response_template": "\n### Label:",
     "dataset_text_field": "output",
diff --git a/tests/data/test_data_preprocessing.py b/tests/data/test_data_preprocessing.py
index a1072d2ec..0e7153932 100644
--- a/tests/data/test_data_preprocessing.py
+++ b/tests/data/test_data_preprocessing.py
@@ -22,7 +22,6 @@
 from datasets import Dataset, DatasetDict, IterableDataset
 from PIL import Image
 from transformers import AutoProcessor, AutoTokenizer, DataCollatorForSeq2Seq
-from trl import DataCollatorForCompletionOnlyLM
 import datasets
 import numpy as np
 import pyarrow
@@ -69,7 +68,7 @@
 # Local
 from tuning.config import configs
 from tuning.config.acceleration_configs import AttentionAndDistributedPackingConfig
-from tuning.data.collators import VisionDataCollator
+from tuning.data.collators import DataCollatorForCompletionOnlyLM, VisionDataCollator
 from tuning.data.data_config import (
     DataHandlerConfig,
     DataPreProcessorConfig,
diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py
index c5423fd17..df972dfbc 100644
--- a/tests/test_sft_trainer.py
+++ b/tests/test_sft_trainer.py
@@ -124,7 +124,7 @@
     warmup_ratio=0.03,
     lr_scheduler_type="cosine",
     logging_steps=1,
-    include_tokens_per_second=True,
+    include_num_input_tokens_seen=True,
     packing=False,
     max_seq_length=4096,
     save_strategy="epoch",
@@ -140,7 +140,7 @@
     warmup_ratio=0.03,
     lr_scheduler_type="cosine",
     logging_steps=1,
-    include_tokens_per_second=True,
+    include_num_input_tokens_seen=True,
     packing=False,
     max_seq_length=4096,
     save_strategy="epoch",
diff --git a/tests/utils/test_embedding_resize.py b/tests/utils/test_embedding_resize.py
index 5ec7e7ab7..2ef4513b8 100644
--- a/tests/utils/test_embedding_resize.py
+++ b/tests/utils/test_embedding_resize.py
@@ -20,11 +20,9 @@

 # Third Party
 from transformers import (
-    AutoModelForCausalLM,
-    AutoModelForVision2Seq,
-    AutoProcessor,
-    AutoTokenizer,
+    AutoModelForImageTextToText,  # AutoModelForVision2Seq was renamed to this in transformers v5
 )
+from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
 import torch

 # First Party
@@ -128,16 +126,17 @@ def test_special_tokens_before_and_after():
     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
     input_tokenizer_len = len(tokenizer.get_vocab())

-    addn_spl_tokens_before = tokenizer.special_tokens_map.get(
-        "additional_special_tokens"
-    )
+    addn_spl_tokens_before = list(tokenizer.extra_special_tokens)
     assert (
         len(addn_spl_tokens_before) > 0
     ), "this test needs tokenizer special tokens to not be empty before testing"

     special_tokens_dict = {"sep_token": "", "pad_token": ""}
     addn_spl_tokens_added = ["", "", ""]
-    special_tokens_dict["additional_special_tokens"] = addn_spl_tokens_added
+    # for transformers v5: merge existing extra_special_tokens with new ones to prevent replacement
+    special_tokens_dict["additional_special_tokens"] = (
+        list(tokenizer.extra_special_tokens) + addn_spl_tokens_added
+    )

     resize_result = tokenizer_and_embedding_resize(
         special_tokens_dict=special_tokens_dict,
@@ -150,9 +149,7 @@ def test_special_tokens_before_and_after():
     addn_spl_tokens_before.extend(addn_spl_tokens_added)
     expected_addn_special_tokens = addn_spl_tokens_before
     expected_embedding_size = input_tokenizer_len + len(addn_spl_tokens_added) + 2
-    addn_spl_tokens_after = tokenizer.special_tokens_map.get(
-        "additional_special_tokens"
-    )
+    addn_spl_tokens_after = list(tokenizer.extra_special_tokens)

     assert "" in tokenizer.get_vocab()
     assert "" in tokenizer.get_vocab()
@@ -212,7 +209,9 @@ def test_resize_with_multiple_of():


 def test_resize_llama_vision_model():
-    model = AutoModelForVision2Seq.from_pretrained(TINY_LLAMA_VISION_MODEL_NAME)
+    model = AutoModelForImageTextToText.from_pretrained(
+        TINY_LLAMA_VISION_MODEL_NAME
+    )  # AutoModelForVision2Seq was renamed to AutoModelForImageTextToText in transformers v5
     processor = AutoProcessor.from_pretrained(TINY_LLAMA_VISION_MODEL_NAME)
     tokenizer = processor.tokenizer
diff --git a/tuning/data/collators.py b/tuning/data/collators.py
index c7f63a99c..b9947e3ab 100644
--- a/tuning/data/collators.py
+++ b/tuning/data/collators.py
@@ -12,6 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# Standard
+from typing import Any, Optional, Union
+import logging
+
+# Third Party
+from transformers import DataCollatorForLanguageModeling
+import numpy as np
+import torch
+
 # Local
 from tuning.data.utils import try_convert_bytes_dict_to_pil

@@ -91,3 +100,239 @@ def __call__(self, features):
         batch["labels"] = labels

         return batch
+
+
+class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
+    """
+    Data collator used for completion tasks. It ensures that all the tokens of the
+    labels are set to an 'ignore_index' when they do not come from the assistant.
+    This ensures that the loss is only calculated on the completion made by the
+    assistant.
+
+    Args:
+        response_template (`Union[str, list[int]]`): the template form that indicates
+            the start of the response, typically something like '### Response:\n'.
+            It can also be passed as tokenized ids, which can be useful when using a
+            tokenizer that encodes the response differently if it does not have
+            proper context.
+        instruction_template (`Union[str, list[int]]`): the template form that
+            indicates the start of the human instruction, typically something like
+            '### Human:\n'. Useful for assistant-style conversation datasets. It can
+            also be passed as tokenized ids.
+        mlm (`bool`, *optional*, defaults to `False`): Whether to use masked language
+            modeling in the underlying `DataCollatorForLanguageModeling` class. Note
+            that this option currently has no effect but is present for flexibility
+            and backwards-compatibility.
+        ignore_index (`int`, *optional*, defaults to `-100`):
+            The index to use to ignore the initial tokens with.
+    """
+
+    def __init__(
+        self,
+        *args,
+        response_template: Union[str, list[int]],
+        instruction_template: Optional[Union[str, list[int]]] = None,
+        mlm: bool = False,
+        ignore_index: int = -100,
+        padding_free: bool = False,
+        **kwargs,
+    ):
+        super().__init__(*args, mlm=mlm, **kwargs)
+
+        self.instruction_template = instruction_template
+        if isinstance(instruction_template, str):
+            # The user provides a string, must tokenize
+            self.instruction_token_ids = self.tokenizer.encode(
+                self.instruction_template, add_special_tokens=False
+            )
+        else:
+            # The user already provides the token ids
+            self.instruction_token_ids = instruction_template
+
+        self.response_template = response_template
+        if isinstance(response_template, str):
+            # The user provides a string, must tokenize
+            self.response_token_ids = self.tokenizer.encode(
+                self.response_template, add_special_tokens=False
+            )
+        else:
+            # The user already provides the token ids
+            self.response_token_ids = response_template
+
+        if (
+            not self.mlm
+            and self.instruction_template
+            and self.tokenizer.pad_token_id == self.tokenizer.eos_token_id
+        ):
+            logging.warning(
+                "The pad_token_id and eos_token_id values "
+                "of this tokenizer are identical. "
+                "If you are planning for multi-turn training, "
+                "it can result in the model continuously generating "
+                "questions and answers without eos token. "
+                "To avoid this, set the pad_token_id to a different value.",
+            )
+
+        self.ignore_index = ignore_index
+        self.padding_free = padding_free
+
+    def torch_call(
+        self, examples: list[Union[list[int], Any, dict[str, Any]]]
+    ) -> dict[str, Any]:
+        batch = super().torch_call(examples)
+
+        if self.instruction_template is None:
+            for i in range(len(examples)):
+                response_token_ids_start_idx = None
+
+                for idx in np.where(batch["labels"][i] == self.response_token_ids[0])[
+                    0
+                ]:
+                    # `response_token_ids` is `'### Response:\n'`, here we are just
+                    # making sure that the token IDs match
+                    if (
+                        self.response_token_ids
+                        == batch["labels"][i][
+                            idx : idx + len(self.response_token_ids)
+                        ].tolist()
+                    ):
+                        response_token_ids_start_idx = idx
+
+                if response_token_ids_start_idx is None:
+                    logging.warning(
+                        "Could not find response key %s in the following instance: "
+                        "%s. This instance will be ignored in loss "
+                        "calculation. Note, if this happens often, "
+                        "consider increasing the `max_length`.",
+                        self.response_template,
+                        self.tokenizer.decode(batch["input_ids"][i]),
+                    )
+                    batch["labels"][i, :] = self.ignore_index
+                else:
+                    response_token_ids_end_idx = response_token_ids_start_idx + len(
+                        self.response_token_ids
+                    )
+
+                    # Make pytorch loss function ignore all
+                    # tokens up through the end of the response key
+                    batch["labels"][i, :response_token_ids_end_idx] = self.ignore_index
+
+        else:
+            for i in range(len(examples)):
+                response_token_ids_idxs = []
+                human_token_ids_idxs = []
+
+                for assistant_idx in np.where(
+                    batch["labels"][i] == self.response_token_ids[0]
+                )[0]:
+                    # find the indexes of the start of a response.
+                    if (
+                        self.response_token_ids
+                        == batch["labels"][i][
+                            assistant_idx : assistant_idx + len(self.response_token_ids)
+                        ].tolist()
+                    ):
+                        response_token_ids_idxs.append(
+                            assistant_idx + len(self.response_token_ids)
+                        )
+
+                if len(response_token_ids_idxs) == 0:
+                    logging.warning(
+                        "Could not find response key %s in the following instance: "
+                        "%s. This instance will be ignored in loss "
+                        "calculation. Note, if this happens often, "
+                        "consider increasing the `max_length`.",
+                        self.response_template,
+                        self.tokenizer.decode(batch["input_ids"][i]),
+                    )
+                    batch["labels"][i, :] = self.ignore_index
+
+                human_token_ids = self.instruction_token_ids
+                for human_idx in np.where(batch["labels"][i] == human_token_ids[0])[0]:
+                    # find the indexes of the start of a human answer.
+                    if (
+                        human_token_ids
+                        == batch["labels"][i][
+                            human_idx : human_idx + len(human_token_ids)
+                        ].tolist()
+                    ):
+                        human_token_ids_idxs.append(human_idx)
+
+                if len(human_token_ids_idxs) == 0:
+                    logging.warning(
+                        "Could not find instruction key `%s` in the following instance: "
+                        "%s. This instance will be ignored in loss "
+                        "calculation. Note, if this happens often, "
+                        "consider increasing the `max_length`.",
+                        self.instruction_template,
+                        self.tokenizer.decode(batch["input_ids"][i]),
+                    )
+                    batch["labels"][i, :] = self.ignore_index
+
+                if (
+                    len(human_token_ids_idxs) > 0
+                    and len(response_token_ids_idxs) > 0
+                    and human_token_ids_idxs[0] > response_token_ids_idxs[0]
+                ):
+                    human_token_ids_idxs = [0] + human_token_ids_idxs
+
+                for idx, (start, end) in enumerate(
+                    zip(human_token_ids_idxs, response_token_ids_idxs)
+                ):
+                    # Make pytorch loss function ignore all non response tokens
+                    if idx != 0:
+                        batch["labels"][i, start:end] = self.ignore_index
+                    else:
+                        batch["labels"][i, :end] = self.ignore_index
+
+                if len(response_token_ids_idxs) < len(human_token_ids_idxs):
+                    batch["labels"][i, human_token_ids_idxs[-1] :] = self.ignore_index
+
+        if self.padding_free:
+            # remove padding, `attention_mask` and add `position_ids`
+            attn_mask = batch.pop("attention_mask")
+            batch["input_ids"] = batch["input_ids"][attn_mask.bool()].unsqueeze(0)
+            batch["position_ids"] = (
+                attn_mask.cumsum(1)[attn_mask.bool()].unsqueeze(0) - 1
+            )
+            batch["labels"] = batch["labels"][attn_mask.bool()].unsqueeze(0)
+            batch["labels"][batch["position_ids"] == 0] = self.ignore_index
+
+            # Calculate cumulative sequence lengths for queries and
+            # keys to prevent graph breaks during further computations.
+            flattened_position_ids = batch["position_ids"].flatten()
+            indices_q = torch.arange(
+                flattened_position_ids.size(0),
+                device=flattened_position_ids.device,
+                dtype=torch.int32,
+            )
+            batch["cu_seq_lens_q"] = torch.cat(
+                (
+                    indices_q[flattened_position_ids == 0],
+                    torch.tensor(
+                        flattened_position_ids.size(),
+                        device=flattened_position_ids.device,
+                        dtype=torch.int32,
+                    ),
+                )
+            ).unsqueeze(0)
+            batch["cu_seq_lens_k"] = batch["cu_seq_lens_q"]
+
+            # Determine maximum sequence lengths to
+            # prevent graph breaks during further computations.
+            batch["max_length_k"] = torch.tensor(
+                [flattened_position_ids.max().item() + 1]
+            )
+            batch["max_length_q"] = batch["max_length_k"]
+
+        return batch
diff --git a/tuning/data/data_handlers.py b/tuning/data/data_handlers.py
index 327fb40ac..f549f18a2 100644
--- a/tuning/data/data_handlers.py
+++ b/tuning/data/data_handlers.py
@@ -532,7 +532,7 @@ def tokenize_and_apply_chat_template_with_masking(
             add_generation_prompt=False,
             tools=tools,
             documents=documents,
-        )
+        )["input_ids"]
     )

     # clone labels from input ids
@@ -557,7 +557,7 @@ def tokenize_and_apply_chat_template_with_masking(
                 add_generation_prompt=False,
                 tools=tools,
                 documents=documents,
-            ).shape[1]
+            )["input_ids"].shape[1]
         )
         # next, we calculate the end index of this non-assistant message
         if (
@@ -578,7 +578,7 @@
                     add_generation_prompt=True,
                     tools=tools,
                     documents=documents,
-                ).shape[1]
+                )["input_ids"].shape[1]
             )
         else:
             # for the last message or the message that doesn't follow with
@@ -594,7 +594,7 @@
                     add_generation_prompt=False,
                     tools=tools,
                     documents=documents,
-                ).shape[1]
+                )["input_ids"].shape[1]
             )
         # set the label to -100 for the non-assistant part
         labels[:, message_start_idx:message_end_idx] = -100
diff --git a/tuning/data/data_preprocessing_utils.py b/tuning/data/data_preprocessing_utils.py
index 04d0d7a2d..b43081864 100644
--- a/tuning/data/data_preprocessing_utils.py
+++ b/tuning/data/data_preprocessing_utils.py
@@ -22,11 +22,10 @@
     DataCollatorForSeq2Seq,
     LlavaProcessor,
 )
-from trl import DataCollatorForCompletionOnlyLM

 # Local
 from tuning.config import configs
-from tuning.data.collators import VisionDataCollator
+from tuning.data.collators import DataCollatorForCompletionOnlyLM, VisionDataCollator

 logger = logging.getLogger(__name__)
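Note on the data_handlers.py hunks above: each one makes the same adjustment, indexing the tokenized chat-template result with ["input_ids"] before reading .shape[1], i.e. the handler now expects a mapping rather than a bare tensor. A tiny sketch of the pattern with a faked return value (the dict below is an illustrative stand-in for whatever the tokenizer call returns, not the real API):

# Stand-in object mimicking the mapping-with-"input_ids" shape the updated handler expects.
import torch

tokenized = {"input_ids": torch.tensor([[101, 7592, 2088, 102]])}

# before: message_end_idx = tokenizer.apply_chat_template(...).shape[1]
# after:  message_end_idx = tokenizer.apply_chat_template(...)["input_ids"].shape[1]
message_end_idx = tokenized["input_ids"].shape[1]
print(message_end_idx)  # 4
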
diff --git a/tuning/data/tokenizer_utils.py b/tuning/data/tokenizer_utils.py
index 622da0e3c..13faa47d8 100644
--- a/tuning/data/tokenizer_utils.py
+++ b/tuning/data/tokenizer_utils.py
@@ -44,21 +44,37 @@ def get_special_tokens_dict(
     special_tokens_dict = {}
     if not tokenizer_name_or_path:
-        # TODO: understand if we need to hardcode these here or just use defaults in model
-        if isinstance(
-            tokenizer, (transformers.LlamaTokenizer, transformers.LlamaTokenizerFast)
-        ):
+        llama_classes = tuple(
+            cls
+            for cls in [
+                getattr(transformers, "LlamaTokenizer", None),
+                getattr(transformers, "LlamaTokenizerFast", None),
+            ]
+            if cls is not None
+        )
+        is_llama_tokenizer = (
+            bool(llama_classes) and isinstance(tokenizer, llama_classes)
+        ) or "llama" in (getattr(tokenizer, "name_or_path", "") or "").lower()
+
+        gpt_neox_classes = tuple(
+            cls
+            for cls in [
+                getattr(transformers, "GPTNeoXTokenizerFast", None),
+                getattr(transformers, "GPTNeoXTokenizer", None),
+            ]
+            if cls is not None
+        )
+
+        if is_llama_tokenizer:
             special_tokens_dict["bos_token"] = ""
             special_tokens_dict["eos_token"] = ""
             special_tokens_dict["unk_token"] = ""
             special_tokens_dict["pad_token"] = ""
-        elif isinstance(
-            tokenizer, (transformers.GPT2Tokenizer, transformers.GPTNeoXTokenizerFast)
-        ):
+        elif isinstance(tokenizer, (transformers.GPT2Tokenizer, *gpt_neox_classes)):
             special_tokens_dict["pad_token"] = ""
     # Add special tokens only when a custom tokenizer is not passed
-    if tokenizer.pad_token is None:
+    if tokenizer.pad_token is None or "pad_token" in special_tokens_dict:
         logger.warning("PAD token set to default, missing in tokenizer")
         special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
     if tokenizer.eos_token is None:
@@ -102,7 +118,8 @@
         dict: Metadata on number of added tokens.
     """
     num_new_tokens = tokenizer.add_special_tokens(
-        special_tokens_dict=special_tokens_dict, replace_additional_special_tokens=False
+        special_tokens_dict=special_tokens_dict,
+        # replace_additional_special_tokens=False
     )
     embedding_size = int(multiple_of * math.ceil(len(tokenizer) / multiple_of))
     num_new_tokens = num_new_tokens + embedding_size - len(tokenizer)
@@ -119,8 +136,9 @@
         model.set_input_embeddings(resized_input_embeddings)

         # Resize vocab size when embeddings updated for Mllama models
-        if model.language_model.vocab_size != embedding_size:
-            model.language_model.vocab_size = embedding_size
+        if model.model.vocab_size != embedding_size:
+            model.model.vocab_size = embedding_size
+
     else:
         model.resize_token_embeddings(embedding_size)
diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py
index 1fa524e89..51d23fe16 100644
--- a/tuning/sft_trainer.py
+++ b/tuning/sft_trainer.py
@@ -28,9 +28,11 @@
 from peft import LoraConfig
 from peft.utils.other import fsdp_auto_wrap_policy
 from torch.cuda import OutOfMemoryError
+from transformers import (
+    AutoModelForImageTextToText,  # AutoModelForVision2Seq was renamed in transformers v5
+)
 from transformers import (
     AutoModelForCausalLM,
-    AutoModelForVision2Seq,
     AutoProcessor,
     AutoTokenizer,
     TrainerCallback,
@@ -292,7 +294,8 @@ def train(
             )
         )
         # try to load model as a vision model
-        model = AutoModelForVision2Seq.from_pretrained(
+        # in transformers v5, AutoModelForVision2Seq was renamed to AutoModelForImageTextToText
+        model = AutoModelForImageTextToText.from_pretrained(
             model_args.model_name_or_path, **model_kwargs
         )
         try:
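Closing note: since `DataCollatorForCompletionOnlyLM` is now vendored in `tuning.data.collators` rather than imported from `trl`, callers construct it exactly as before. A rough usage sketch, with assumptions marked (the `gpt2` checkpoint is only a small, convenient tokenizer for illustration, and the response template is borrowed from the test configs above):

# Usage sketch only; the checkpoint choice is an illustrative assumption.
from transformers import AutoTokenizer

from tuning.data.collators import DataCollatorForCompletionOnlyLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

collator = DataCollatorForCompletionOnlyLM(
    response_template="\n### Label:",  # same template the tests above use
    tokenizer=tokenizer,
    ignore_index=-100,
)

examples = [tokenizer("### Text: hello world\n### Label: greeting")]
batch = collator(examples)
# Wherever the response template's token ids are found in a sequence, every label
# up to and including the template is set to ignore_index, so only the completion
# contributes to the loss.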