diff --git a/.pylintrc b/.pylintrc index 612fa0e8f..570205ae4 100644 --- a/.pylintrc +++ b/.pylintrc @@ -447,10 +447,7 @@ disable=raw-checker-failed, duplicate-code, unbalanced-tuple-unpacking, unspecified-encoding, - too-many-lines, - no-name-in-module, - unexpected-keyword-arg, - unused-argument + too-many-lines # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py index 968753bfd..43cf8dda0 100644 --- a/build/accelerate_launch.py +++ b/build/accelerate_launch.py @@ -110,6 +110,7 @@ def main(): # message to termination log. logging.error(traceback.format_exc()) # The exit code that sft_trainer.py threw is captured in e.returncode + return_code = e.returncode if return_code not in [INTERNAL_ERROR_EXIT_CODE, USER_ERROR_EXIT_CODE]: return_code = INTERNAL_ERROR_EXIT_CODE @@ -117,12 +118,6 @@ def main(): sys.exit(return_code) except Exception as e: # pylint: disable=broad-except logging.error(traceback.format_exc()) - # v5: torch.distributed raises ChildFailedError with per-rank exit codes - # Check if the root cause was a user error - if hasattr(e, "failures"): - root_codes = [f.exitcode for f in e.failures.values()] - if any(c == USER_ERROR_EXIT_CODE for c in root_codes): - sys.exit(USER_ERROR_EXIT_CODE) write_termination_log(f"Unhandled exception during training. {e}") sys.exit(INTERNAL_ERROR_EXIT_CODE) diff --git a/pyproject.toml b/pyproject.toml index 27fdaaa7b..334ee79d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,19 +28,18 @@ classifiers=[ dependencies = [ "numpy>=1.26.4,<2.2.0", "accelerate>=1.9.0,<2.0.0", -"transformers>=5.2.0,<5.3.0", -"torch>2.7.0,<=2.9.0", -"torchvision<=0.24.0", +"transformers>=4.55.0,<=4.55.4", +"torch>2.7.0,<2.9.0", +"torchvision<0.24", "sentencepiece>=0.1.99,<0.3", -"tokenizers<=0.23.0", +"tokenizers<=0.22", "tqdm>=4.66.2,<5.0", -"trl>=0.27.0,<0.29.0", -"peft>=0.18.1,<0.19.0", +"trl>=0.19.1,<0.20.0", +"peft>=0.18.0,< 0.19.0", "datasets>=4.0.0,<5.0.0", "simpleeval>=0.9.13,<2.0", "pillow>=12.1.1", -"kernels>=0.12.1,<0.13.0", -"huggingface_hub>=1.3.0,<1.4.0", +"kernels<=0.9.0", ] [project.optional-dependencies] diff --git a/tests/build/test_launch_script.py b/tests/build/test_launch_script.py index 400ecc105..322fe5998 100644 --- a/tests/build/test_launch_script.py +++ b/tests/build/test_launch_script.py @@ -51,7 +51,7 @@ "warmup_ratio": 0.03, "lr_scheduler_type": "cosine", "logging_steps": 1, - "include_num_input_tokens_seen": True, + "include_tokens_per_second": True, "packing": False, "response_template": "\n### Label:", "dataset_text_field": "output", diff --git a/tests/data/test_data_preprocessing.py b/tests/data/test_data_preprocessing.py index 5732ae06c..7d0dd2eee 100644 --- a/tests/data/test_data_preprocessing.py +++ b/tests/data/test_data_preprocessing.py @@ -22,6 +22,7 @@ from datasets import Dataset, DatasetDict, IterableDataset from PIL import Image from transformers import AutoProcessor, AutoTokenizer, DataCollatorForSeq2Seq +from trl import DataCollatorForCompletionOnlyLM import datasets import numpy as np import pyarrow @@ -68,7 +69,7 @@ # Local from tuning.config import configs from tuning.config.acceleration_configs import AttentionAndDistributedPackingConfig -from tuning.data.collators import DataCollatorForCompletionOnlyLM, VisionDataCollator +from tuning.data.collators import VisionDataCollator from tuning.data.data_config import ( DataHandlerConfig, DataPreProcessorConfig, diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index 029cf63ad..10ed33a09 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -124,7 +124,7 @@ warmup_ratio=0.03, lr_scheduler_type="cosine", logging_steps=1, - include_num_input_tokens_seen=True, + include_tokens_per_second=True, packing=False, max_seq_length=4096, save_strategy="epoch", @@ -140,7 +140,7 @@ warmup_ratio=0.03, lr_scheduler_type="cosine", logging_steps=1, - include_num_input_tokens_seen=True, + include_tokens_per_second=True, packing=False, max_seq_length=4096, save_strategy="epoch", diff --git a/tests/utils/test_embedding_resize.py b/tests/utils/test_embedding_resize.py index 2ef4513b8..5ec7e7ab7 100644 --- a/tests/utils/test_embedding_resize.py +++ b/tests/utils/test_embedding_resize.py @@ -20,9 +20,11 @@ # Third Party from transformers import ( - AutoModelForImageTextToText, # AutoModelForVision2Seq was renamed to this in transformers v5 + AutoModelForCausalLM, + AutoModelForVision2Seq, + AutoProcessor, + AutoTokenizer, ) -from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer import torch # First Party @@ -126,17 +128,16 @@ def test_special_tokens_before_and_after(): model = AutoModelForCausalLM.from_pretrained(MODEL_NAME) input_tokenizer_len = len(tokenizer.get_vocab()) - addn_spl_tokens_before = list(tokenizer.extra_special_tokens) + addn_spl_tokens_before = tokenizer.special_tokens_map.get( + "additional_special_tokens" + ) assert ( len(addn_spl_tokens_before) > 0 ), "this test needs tokenizer special tokens to not be empty before testing" special_tokens_dict = {"sep_token": "", "pad_token": ""} addn_spl_tokens_added = ["", "", ""] - # for transformers v5: merge existing extra_special_tokens with new ones to prevent replacement - special_tokens_dict["additional_special_tokens"] = ( - list(tokenizer.extra_special_tokens) + addn_spl_tokens_added - ) + special_tokens_dict["additional_special_tokens"] = addn_spl_tokens_added resize_result = tokenizer_and_embedding_resize( special_tokens_dict=special_tokens_dict, @@ -149,7 +150,9 @@ def test_special_tokens_before_and_after(): addn_spl_tokens_before.extend(addn_spl_tokens_added) expected_addn_special_tokens = addn_spl_tokens_before expected_embedding_size = input_tokenizer_len + len(addn_spl_tokens_added) + 2 - addn_spl_tokens_after = list(tokenizer.extra_special_tokens) + addn_spl_tokens_after = tokenizer.special_tokens_map.get( + "additional_special_tokens" + ) assert "" in tokenizer.get_vocab() assert "" in tokenizer.get_vocab() @@ -209,9 +212,7 @@ def test_resize_with_multiple_of(): def test_resize_llama_vision_model(): - model = AutoModelForImageTextToText.from_pretrained( - TINY_LLAMA_VISION_MODEL_NAME - ) # AutoModelForVision2Seq was renamed to AutoModelForImageTextToText in transformers v5 + model = AutoModelForVision2Seq.from_pretrained(TINY_LLAMA_VISION_MODEL_NAME) processor = AutoProcessor.from_pretrained(TINY_LLAMA_VISION_MODEL_NAME) tokenizer = processor.tokenizer diff --git a/tuning/data/collators.py b/tuning/data/collators.py index b9947e3ab..c7f63a99c 100644 --- a/tuning/data/collators.py +++ b/tuning/data/collators.py @@ -12,15 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Standard -from typing import Any, Optional, Union -import logging - -# Third Party -from transformers import DataCollatorForLanguageModeling -import numpy as np -import torch - # Local from tuning.data.utils import try_convert_bytes_dict_to_pil @@ -100,239 +91,3 @@ def __call__(self, features): batch["labels"] = labels return batch - - -class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling): - """ - Data collator used for completion tasks. - It ensures that all the tokens of the labels - are set to an 'ignore_index' - when they do not come from the assistant. - This ensure that the loss is only calculated on the completion made by - the assistant. - - Args: - response_template (`Union[str, list[int]]`): - the template form that indicates the - start of the response, typically - something like '### Response:\n'. It - can also be passed as tokenized ids, - which can be useful when using a tokenizer - that encodes the response - differently if it does not have proper context. - instruction_template (`Union[str, list[int]]`): - the template form that indicates the start - of the human instruction, typically - something like '### - Human:\n'. Useful for assistant-style - conversation datasets. It can also be passed - as tokenized ids. - mlm (`bool`, *optional*, defaults to `False`): Whether - to use masked language modeling in the underlying - `DataCollatorForLanguageModeling` class. - Note that this option currently has no effect but is present - for flexibility and backwards-compatibility. - ignore_index (`int`, *optional*, defaults to `-100`): - The index to use to ignore the initial tokens with - """ - - def __init__( - self, - *args, - response_template: Union[str, list[int]], - instruction_template: Optional[Union[str, list[int]]] = None, - mlm: bool = False, - ignore_index: int = -100, - padding_free: bool = False, - **kwargs, - ): - super().__init__(*args, mlm=mlm, **kwargs) - - self.instruction_template = instruction_template - if isinstance(instruction_template, str): - # The user provides a string, must tokenize - self.instruction_token_ids = self.tokenizer.encode( - self.instruction_template, add_special_tokens=False - ) - else: - # The user already provides the token ids - self.instruction_token_ids = instruction_template - - self.response_template = response_template - if isinstance(response_template, str): - # The user provides a string, must tokenize - self.response_token_ids = self.tokenizer.encode( - self.response_template, add_special_tokens=False - ) - else: - # The user already provides the token ids - self.response_token_ids = response_template - - if ( - not self.mlm - and self.instruction_template - and self.tokenizer.pad_token_id == self.tokenizer.eos_token_id - ): - logging.warning( - "The pad_token_id and eos_token_id values " - "of this tokenizer are identical. " - "If you are planning for multi-turn training, " - "it can result in the model continuously generating " - "questions and answers without eos token. " - "To avoid this, set the pad_token_id to a different value.", - ) - - self.ignore_index = ignore_index - self.padding_free = padding_free - - def torch_call( - self, examples: list[Union[list[int], Any, dict[str, Any]]] - ) -> dict[str, Any]: - batch = super().torch_call(examples) - - if self.instruction_template is None: - for i in range(len(examples)): - response_token_ids_start_idx = None - - for idx in np.where(batch["labels"][i] == self.response_token_ids[0])[ - 0 - ]: - # `response_token_ids` is - # `'### Response:\n'`, here we are just making sure - # that the token IDs match - if ( - self.response_token_ids - == batch["labels"][i][ - idx : idx + len(self.response_token_ids) - ].tolist() - ): - response_token_ids_start_idx = idx - - if response_token_ids_start_idx is None: - logging.warning( - "Could not find response key %s in the following instance: " - "%s. This instance will be ignored in loss " - "calculation. Note, if this happens often, " - "consider increasing the `max_length`.", - self.response_template, - self.tokenizer.decode(batch["input_ids"][i]), - ) - batch["labels"][i, :] = self.ignore_index - else: - response_token_ids_end_idx = response_token_ids_start_idx + len( - self.response_token_ids - ) - - # Make pytorch loss function ignore all - # tokens up through the end of the response key - batch["labels"][i, :response_token_ids_end_idx] = self.ignore_index - - else: - for i in range(len(examples)): - response_token_ids_idxs = [] - human_token_ids_idxs = [] - - for assistant_idx in np.where( - batch["labels"][i] == self.response_token_ids[0] - )[0]: - # find the indexes of the start of a response. - if ( - self.response_token_ids - == batch["labels"][i][ - assistant_idx : assistant_idx + len(self.response_token_ids) - ].tolist() - ): - response_token_ids_idxs.append( - assistant_idx + len(self.response_token_ids) - ) - - if len(response_token_ids_idxs) == 0: - logging.warning( - "Could not find response key %s in the following instance: " - "%s. This instance will be ignored in loss " - "calculation. Note, if this happens often, " - "consider increasing the `max_length`.", - self.response_template, - self.tokenizer.decode(batch["input_ids"][i]), - ) - batch["labels"][i, :] = self.ignore_index - - human_token_ids = self.instruction_token_ids - for human_idx in np.where(batch["labels"][i] == human_token_ids[0])[0]: - # find the indexes of the start of a human answer. - if ( - human_token_ids - == batch["labels"][i][ - human_idx : human_idx + len(human_token_ids) - ].tolist() - ): - human_token_ids_idxs.append(human_idx) - - if len(human_token_ids_idxs) == 0: - logging.warning( - "Could not find instruction key `%s` in the following instance: " - "%s. This instance will be ignored in loss " - "calculation. Note, if this happens often, " - "consider increasing the `max_length`.", - self.instruction_template, - self.tokenizer.decode(batch["input_ids"][i]), - ) - batch["labels"][i, :] = self.ignore_index - - if ( - len(human_token_ids_idxs) > 0 - and len(response_token_ids_idxs) > 0 - and human_token_ids_idxs[0] > response_token_ids_idxs[0] - ): - human_token_ids_idxs = [0] + human_token_ids_idxs - - for idx, (start, end) in enumerate( - zip(human_token_ids_idxs, response_token_ids_idxs) - ): - # Make pytorch loss function ignore all non response tokens - if idx != 0: - batch["labels"][i, start:end] = self.ignore_index - else: - batch["labels"][i, :end] = self.ignore_index - - if len(response_token_ids_idxs) < len(human_token_ids_idxs): - batch["labels"][i, human_token_ids_idxs[-1] :] = self.ignore_index - - if self.padding_free: - # remove padding, `attention_mask` and add `position_ids` - attn_mask = batch.pop("attention_mask") - batch["input_ids"] = batch["input_ids"][attn_mask.bool()].unsqueeze(0) - batch["position_ids"] = ( - attn_mask.cumsum(1)[attn_mask.bool()].unsqueeze(0) - 1 - ) - batch["labels"] = batch["labels"][attn_mask.bool()].unsqueeze(0) - batch["labels"][batch["position_ids"] == 0] = self.ignore_index - - # Calculate cumulative sequence lengths for queries and - # keys to prevent graph breaks during further computations. - flattened_position_ids = batch["position_ids"].flatten() - indices_q = torch.arange( - flattened_position_ids.size(0), - device=flattened_position_ids.device, - dtype=torch.int32, - ) - batch["cu_seq_lens_q"] = torch.cat( - ( - indices_q[flattened_position_ids == 0], - torch.tensor( - flattened_position_ids.size(), - device=flattened_position_ids.device, - dtype=torch.int32, - ), - ) - ).unsqueeze(0) - batch["cu_seq_lens_k"] = batch["cu_seq_lens_q"] - - # Determine maximum sequence lengths to - # prevent graph breaks during further computations. - batch["max_length_k"] = torch.tensor( - [flattened_position_ids.max().item() + 1] - ) - batch["max_length_q"] = batch["max_length_k"] - - return batch diff --git a/tuning/data/data_handlers.py b/tuning/data/data_handlers.py index f549f18a2..327fb40ac 100644 --- a/tuning/data/data_handlers.py +++ b/tuning/data/data_handlers.py @@ -532,7 +532,7 @@ def tokenize_and_apply_chat_template_with_masking( add_generation_prompt=False, tools=tools, documents=documents, - )["input_ids"] + ) ) # clone labels from input ids @@ -557,7 +557,7 @@ def tokenize_and_apply_chat_template_with_masking( add_generation_prompt=False, tools=tools, documents=documents, - )["input_ids"].shape[1] + ).shape[1] ) # next, we calculate the end index of this non-assistant message if ( @@ -578,7 +578,7 @@ def tokenize_and_apply_chat_template_with_masking( add_generation_prompt=True, tools=tools, documents=documents, - )["input_ids"].shape[1] + ).shape[1] ) else: # for the last message or the message that doesn't follow with @@ -594,7 +594,7 @@ def tokenize_and_apply_chat_template_with_masking( add_generation_prompt=False, tools=tools, documents=documents, - )["input_ids"].shape[1] + ).shape[1] ) # set the label to -100 for the non-assistant part labels[:, message_start_idx:message_end_idx] = -100 diff --git a/tuning/data/data_preprocessing_utils.py b/tuning/data/data_preprocessing_utils.py index b43081864..04d0d7a2d 100644 --- a/tuning/data/data_preprocessing_utils.py +++ b/tuning/data/data_preprocessing_utils.py @@ -22,10 +22,11 @@ DataCollatorForSeq2Seq, LlavaProcessor, ) +from trl import DataCollatorForCompletionOnlyLM # Local from tuning.config import configs -from tuning.data.collators import DataCollatorForCompletionOnlyLM, VisionDataCollator +from tuning.data.collators import VisionDataCollator logger = logging.getLogger(__name__) diff --git a/tuning/data/tokenizer_utils.py b/tuning/data/tokenizer_utils.py index 13faa47d8..622da0e3c 100644 --- a/tuning/data/tokenizer_utils.py +++ b/tuning/data/tokenizer_utils.py @@ -44,37 +44,21 @@ def get_special_tokens_dict( special_tokens_dict = {} if not tokenizer_name_or_path: - llama_classes = tuple( - cls - for cls in [ - getattr(transformers, "LlamaTokenizer", None), - getattr(transformers, "LlamaTokenizerFast", None), - ] - if cls is not None - ) - is_llama_tokenizer = ( - bool(llama_classes) and isinstance(tokenizer, llama_classes) - ) or "llama" in (getattr(tokenizer, "name_or_path", "") or "").lower() - - gpt_neox_classes = tuple( - cls - for cls in [ - getattr(transformers, "GPTNeoXTokenizerFast", None), - getattr(transformers, "GPTNeoXTokenizer", None), - ] - if cls is not None - ) - - if is_llama_tokenizer: + # TODO: understand if we need to hardcode these here or just use defaults in model + if isinstance( + tokenizer, (transformers.LlamaTokenizer, transformers.LlamaTokenizerFast) + ): special_tokens_dict["bos_token"] = "" special_tokens_dict["eos_token"] = "" special_tokens_dict["unk_token"] = "" special_tokens_dict["pad_token"] = "" - elif isinstance(tokenizer, (transformers.GPT2Tokenizer, *gpt_neox_classes)): + elif isinstance( + tokenizer, (transformers.GPT2Tokenizer, transformers.GPTNeoXTokenizerFast) + ): special_tokens_dict["pad_token"] = "" # Add special tokens only when a custom tokenizer is not passed - if tokenizer.pad_token is None or "pad_token" in special_tokens_dict: + if tokenizer.pad_token is None: logger.warning("PAD token set to default, missing in tokenizer") special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN if tokenizer.eos_token is None: @@ -118,8 +102,7 @@ def tokenizer_and_embedding_resize( dict: Metadata on number of added tokens. """ num_new_tokens = tokenizer.add_special_tokens( - special_tokens_dict=special_tokens_dict, - # replace_additional_special_tokens=False + special_tokens_dict=special_tokens_dict, replace_additional_special_tokens=False ) embedding_size = int(multiple_of * math.ceil(len(tokenizer) / multiple_of)) num_new_tokens = num_new_tokens + embedding_size - len(tokenizer) @@ -136,9 +119,8 @@ def tokenizer_and_embedding_resize( model.set_input_embeddings(resized_input_embeddings) # Resize vocab size when embeddings updated for Mllama models - if model.model.vocab_size != embedding_size: - model.model.vocab_size = embedding_size - + if model.language_model.vocab_size != embedding_size: + model.language_model.vocab_size = embedding_size else: model.resize_token_embeddings(embedding_size) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 51d23fe16..1fa524e89 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -28,11 +28,9 @@ from peft import LoraConfig from peft.utils.other import fsdp_auto_wrap_policy from torch.cuda import OutOfMemoryError -from transformers import ( - AutoModelForImageTextToText, # AutoModelForVision2Seq was renamed in transformers v5 -) from transformers import ( AutoModelForCausalLM, + AutoModelForVision2Seq, AutoProcessor, AutoTokenizer, TrainerCallback, @@ -294,8 +292,7 @@ def train( ) ) # try to load model as a vision model - # in transformers v5, AutoModelForVision2Seq was renamed to AutoModelForImageTextToText - model = AutoModelForImageTextToText.from_pretrained( + model = AutoModelForVision2Seq.from_pretrained( model_args.model_name_or_path, **model_kwargs ) try: