Merge branch 'main' into convert_fp32_to_bf16

YashasviChaurasia · web-flow · commit 7d4b464f56c6 · 2025-09-30T16:31:02.000+05:30
diff --git a/build/nvcr.Dockerfile b/build/nvcr.Dockerfile
@@ -46,7 +46,7 @@ RUN python -m pip install --upgrade pip
 RUN pip install --upgrade --force-reinstall torch torchaudio torchvision --index-url https://download.pytorch.org/whl/cu128
 
 # Install main package + flash attention
-RUN COPY . ${SOURCE_DIR}
+COPY . ${SOURCE_DIR}
 RUN cd ${SOURCE_DIR}
 RUN pip install --no-cache-dir ${SOURCE_DIR} && \
     pip install --no-cache-dir ${SOURCE_DIR}[flash-attn]
diff --git a/docs/advanced-data-preprocessing.md b/docs/advanced-data-preprocessing.md
@@ -509,7 +509,7 @@ We recommend inspecting the data and chat template to decide if you need to pass
 Depending on various scenarios users might need to decide on how to use chat template with their data or which chat template to use for their use case.  
 
 Following are the Guidelines from us in a flow chart :  
-![guidelines for chat template](docs/images/chat_template_guide.jpg)  
+![guidelines for chat template](images/chat_template_guide.jpg)  
 
 Here are some scenarios addressed in the flow chart:  
 1. Depending on the model the tokenizer for the model may or may not have a chat template  
diff --git a/docs/tuning-techniques.md b/docs/tuning-techniques.md
@@ -24,7 +24,7 @@
 
 ## LoRA Tuning Example
 
-Set `peft_method` to `"lora"`. You can additionally pass any arguments from [LoraConfig](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/peft_config.py#L21).
+Set `peft_method` to `"lora"`. You can additionally pass any arguments from [LoraConfig](https://huggingface.co/docs/peft/en/package_reference/lora#peft.LoraConfig).
 ```py
 # Args you can pass
 r: int =8 
@@ -340,7 +340,7 @@ You can see details on a sample configuration of Accelerated GPTQ-LoRA [here](ht
 
 To use GPTQ-LoRA technique, you can set the `quantized_lora_config` defined [here](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/acceleration_configs/quantized_lora_config.py). See the Notes section of FMS Acceleration doc [below](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/README.md#fms-acceleration) for usage. The only kernel we are supporting currently is `triton_v2`.
 
-In addition, LoRA tuning technique is required to be used, set `peft_method` to `"lora"` and pass any arguments from [LoraConfig](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/peft_config.py#L21).
+In addition, LoRA tuning technique is required to be used, set `peft_method` to `"lora"` and pass any arguments from [LoraConfig](https://huggingface.co/docs/peft/en/package_reference/lora#peft.LoraConfig).
 
 Example command to run:
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -48,7 +48,7 @@ flash-attn = ["flash-attn>=2.8.3"]
 aim = ["aim>=3.19.0,<4.0"]
 mlflow = ["mlflow"]
 clearml = ["clearml==2.0.0"]
-fms-accel = ["fms-acceleration>=0.6"]
+fms-accel = ["fms-acceleration>=0.6.2"]
 gptq-dev = ["auto_gptq>0.4.2", "optimum>=1.15.0"]
 mamba = ["mamba_ssm[causal-conv1d]>=2.0.0,<3.0.0"]
 scanner-dev = ["HFResourceScanner>=0.1.0"]
diff --git a/tuning/config/peft_config.py b/tuning/config/peft_config.py
@@ -15,9 +15,10 @@
 # Standard
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import List
+from typing import List, Optional
 
 # Third Party
+from peft import LoraConfig as HFLoraConfig
 from transformers.utils.quantization_config import Mxfp4Config as HfMxfp4Config
 
 
@@ -40,49 +41,125 @@ def to_hf_config(self):
 
 
 @dataclass
-class LoraConfig:
+class LoraConfig(HFLoraConfig):
     """
-    This is the configuration class to store the configuration of a [`LoraModel`].
+    This is the configuration class that extends peft.LoraConfig with a few defaults.
 
     Args:
-        r (`int`):
-            Lora attention dimension (the "rank").
-        target_modules (List[str]]):
-            The names of the modules to apply the adapter to. \
-            If this is specified, only the modules with the specified \
-            names will be replaced. Please specify modules as per model architecture. \
-            If the value is ["all-linear"], \
-            then LORA selects all linear and Conv1D modules as per model architecture, \
-            except for the output layer.
         lora_alpha (`int`):
             The alpha parameter for Lora scaling.
         lora_dropout (`float`):
             The dropout probability for Lora layers.
-        bias (`str`):
-            Bias type for LoRA. Can be 'none', 'all' or 'lora_only'. \
-            If 'all' or 'lora_only', the corresponding biases will be updated during training. \
-            Be aware that this means that, even when disabling the adapters, the model \
-            will not produce the same output as the base model would have without adaptation.
     """
 
-    r: int = 8
     lora_alpha: int = 32
-    target_modules: List[str] = field(
+    lora_dropout: float = 0.05
+
+    # HACK: The fields below are redeclared to narrow their annotations
+    # from union types (e.g. Optional[Union[List[str], str]])
+    # to a single list type (e.g. Optional[List[str]]).
+    # This is done for compatibility with HFArgumentParser.
+    # Please see: https://github.com/huggingface/peft/issues/2798 for further explanation!
+    target_modules: Optional[List[str]] = field(
         default=None,
         metadata={
-            "help": "The names of the modules to apply LORA to. LORA selects modules which either \
-            completely match or "
-            'end with one of the strings. If the value is ["all-linear"], \
-            then LORA selects all linear and Conv1D '
-            "modules except for the output layer."
+            "help": (
+                "List of module names or regex expression of the module names to replace with LoRA."
+                "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. "
+                "This can also be a wildcard 'all-linear' which matches all linear/Conv1D "
+                "(if the model is a PreTrainedModel, the output layer excluded). "
+                "If not specified, modules will be chosen according to the model architecture, "
+                "If the architecture is not known, an error will be raised -- "
+                "in this case, you should specify the target modules manually. "
+                "To avoid targeting any modules (because you want to apply `target_parameters`) "
+                ", set `target_modules=[]`."
+            ),
         },
     )
-    target_parameters: List[str] = field(
+    exclude_modules: Optional[List[str]] = field(
         default=None,
-        metadata={"help": "The names/regex of the parameters to apply LORA to"},
+        metadata={
+            "help": (
+                "List of module names or regex expression of the module names to exclude from Lora."
+            )
+        },
     )
-    bias = "none"
-    lora_dropout: float = 0.05
+    init_lora_weights: bool = field(
+        default=True,
+        metadata={
+            "help": (
+                "How to initialize the weights of the LoRA layers. "
+                "Passing True (default) results in the default initialization from "
+                "the reference implementation from "
+                "Microsoft, with the LoRA B weight being set to 0. "
+                "This means that without further training, "
+                "the LoRA adapter will be a no-op. "
+                "Setting the initialization to False leads to random initialization of "
+                "LoRA A and B, meaning that LoRA is not a no-op before training; "
+                "this setting is intended for debugging purposes."
+            ),
+        },
+    )
+    layers_to_transform: Optional[list[int]] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The layer indexes to transform, is this argument is specified, "
+                "PEFT will transform only the layers indexes that are specified inside this list. "
+                "If a single integer is passed, PEFT will transform only the layer at this index. "
+                "This only works when target_modules is a list of str."
+            )
+        },
+    )
+    layers_pattern: Optional[list[str]] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The layer pattern name, used only if `layers_to_transform` is different to None "
+                "and if the layer pattern is not in the common layers pattern. "
+                "This only works when target_modules is a list of str. "
+                "This should target the `nn.ModuleList` of the "
+                "model, which is often called `'layers'` or `'h'`."
+            )
+        },
+    )
+    trainable_token_indices: Optional[list[int]] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Lets you specify which token indices to selectively fine-tune "
+                "without requiring to re-train the "
+                "whole embedding matrix using the `peft.TrainableTokensModel` method. "
+                "You can specify token indices in two ways. "
+                "Either you specify a list of indices which will then target the model's input "
+                "embedding layer (or, if not found, `embed_tokens`). "
+                "(Not supported yet) Alternatively, you can specify a dictionary "
+                "where the key is the name of the embedding module "
+                "and the values are the list of token indices, e.g. "
+                "`{'embed_tokens': [0, 1, ...]}`. Note that training "
+                "with FSDP requires `use_orig_params=True` to "
+                "avoid issues with non-uniform `requires_grad`."
+            )
+        },
+    )
+    loftq_config: Optional[dict] = field(
+        default_factory=dict,
+        metadata={
+            "help": (
+                "The configuration of LoftQ. If this is passed, "
+                "then LoftQ will be used to quantize the backbone "
+                "weights and initialize Lora layers. Also set `init_lora_weights='loftq'` "
+                "in this case."
+            )
+        },
+    )
+
+    def __post_init__(self):
+        # If target_modules is exactly ["all-linear"], convert it to the plain string "all-linear"
+        if self.target_modules == ["all-linear"]:
+            self.target_modules = "all-linear"
+
+        super().__post_init__()
 
 
 @dataclass
diff --git a/tuning/data/setup_dataprocessor.py b/tuning/data/setup_dataprocessor.py
@@ -159,7 +159,9 @@ def process_dataconfig_file(
 
 
 # Data Format 1: Pretokenized Data
-def _get_pretokenized_dataset_handlers(data_args, is_eval_tokenized):
+def _get_pretokenized_dataset_handlers(
+    data_args: DataArguments, is_eval_present, is_eval_tokenized
+):
 
     # if the provided train dataset is pretokenized
     # however user provides formatting flags, error out
@@ -168,6 +170,7 @@ def _get_pretokenized_dataset_handlers(data_args, is_eval_tokenized):
         or data_args.data_formatter_template
         or data_args.dataset_text_field
         or data_args.instruction_template
+        or data_args.dataset_conversation_field
     ):
         raise ValueError(
             "fields response_template, data_formatter_template,"
@@ -177,7 +180,7 @@ def _get_pretokenized_dataset_handlers(data_args, is_eval_tokenized):
 
     # if the train dataset is pretokenized
     # ensure validation dataset is pretokenized otherwise error out
-    if is_eval_tokenized:
+    if is_eval_present and not is_eval_tokenized:
         raise ValueError(
             "validation data should be pretokenized to be used \
             along with pretokenized train data"
@@ -189,7 +192,9 @@ def _get_pretokenized_dataset_handlers(data_args, is_eval_tokenized):
 
 ### Data format 2
 # pylint: disable=unused-argument
-def _get_dataset_formatting_handlers(data_args, packing, is_padding_free=False):
+def _get_dataset_formatting_handlers(
+    data_args: DataArguments, packing, is_padding_free=False
+):
 
     if data_args.response_template is None:
         if packing is False:
@@ -253,7 +258,7 @@ def _get_chat_dataset_handlers(data_args, tokenizer_kwargs):
     fn_kwargs["formatted_text_column_name"] = data_args.dataset_text_field
     fn_kwargs["tokenizer_kwargs"] = tokenizer_kwargs
     if data_args.dataset_conversation_field is not None:
-        fn_kwargs["conversation_column"] = data_args.dataset_conversation_field
+        fn_kwargs["conversation_column_name"] = data_args.dataset_conversation_field
 
     kwargs = {"fn_kwargs": fn_kwargs, "batched": False, "remove_columns": "all"}
 
@@ -284,14 +289,14 @@ def _get_default_dataset_handlers(data_args, tokenizer_kwargs):
 
 
 ### Vsion Data Format
-def _get_vision_dataset_handlers(data_args, processor_kwargs):
+def _get_vision_dataset_handlers(data_args: DataArguments, processor_kwargs):
 
     handlers = []
 
     # First data handler configuration
     handler_fn_kwargs1 = {
-        "dataset_text_field": data_args.dataset_text_field,
-        "conversation_column": data_args.dataset_text_field,
+        "formatted_text_column_name": data_args.dataset_text_field,
+        "conversation_column_name": data_args.dataset_conversation_field,
     }
     handler_kwargs1 = {
         "fn_kwargs": handler_fn_kwargs1,
@@ -403,7 +408,7 @@ def _process_raw_data_args(
     if is_traindata_tokenized:
         # Data Format 1: Pretokenized Data
         handlers, dataset_text_field = _get_pretokenized_dataset_handlers(
-            data_args, (is_eval_dataset_present and not is_evaldata_tokenized)
+            data_args, is_eval_dataset_present, is_evaldata_tokenized
         )
     elif processor and data_args.dataset_text_field and data_args.dataset_image_field:
 
diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py
diff --git a/tuning/utils/config_utils.py b/tuning/utils/config_utils.py