From 80b866cb23c4e078ae0aec444f60d50acf4d59da Mon Sep 17 00:00:00 2001 From: romit Date: Tue, 16 Sep 2025 10:24:46 +0000 Subject: [PATCH 1/7] Updated LoraConfig to subclass from peft.LoraConfig Signed-off-by: romit --- docs/tuning-techniques.md | 4 ++-- tuning/config/peft_config.py | 43 +++++++++--------------------------- tuning/sft_trainer.py | 9 ++++---- tuning/utils/config_utils.py | 7 +++--- 4 files changed, 20 insertions(+), 43 deletions(-) diff --git a/docs/tuning-techniques.md b/docs/tuning-techniques.md index c7afab1979..8428b3f83b 100644 --- a/docs/tuning-techniques.md +++ b/docs/tuning-techniques.md @@ -24,7 +24,7 @@ ## LoRA Tuning Example -Set `peft_method` to `"lora"`. You can additionally pass any arguments from [LoraConfig](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/peft_config.py#L21). +Set `peft_method` to `"lora"`. You can additionally pass any arguments from [LoraConfig](https://huggingface.co/docs/peft/en/package_reference/lora#peft.LoraConfig). ```py # Args you can pass r: int =8 @@ -340,7 +340,7 @@ You can see details on a sample configuration of Accelerated GPTQ-LoRA [here](ht To use GPTQ-LoRA technique, you can set the `quantized_lora_config` defined [here](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/acceleration_configs/quantized_lora_config.py). See the Notes section of FMS Acceleration doc [below](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/README.md#fms-acceleration) for usage. The only kernel we are supporting currently is `triton_v2`. -In addition, LoRA tuning technique is required to be used, set `peft_method` to `"lora"` and pass any arguments from [LoraConfig](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/peft_config.py#L21). 
+In addition, LoRA tuning technique is required to be used, set `peft_method` to `"lora"` and pass any arguments from [LoraConfig](https://huggingface.co/docs/peft/en/package_reference/lora#peft.LoraConfig). Example command to run: diff --git a/tuning/config/peft_config.py b/tuning/config/peft_config.py index 27af853327..254cdba644 100644 --- a/tuning/config/peft_config.py +++ b/tuning/config/peft_config.py @@ -18,6 +18,7 @@ from typing import List # Third Party +from peft import LoraConfig as _LoraConfig from transformers.utils.quantization_config import Mxfp4Config as HfMxfp4Config @@ -40,50 +41,26 @@ def to_hf_config(self): @dataclass -class LoraConfig: +class LoraConfig(_LoraConfig): """ - This is the configuration class to store the configuration of a [`LoraModel`]. + This is the configuration class that extends peft.LoraConfig with a few defaults. Args: - r (`int`): - Lora attention dimension (the "rank"). - target_modules (List[str]]): - The names of the modules to apply the adapter to. \ - If this is specified, only the modules with the specified \ - names will be replaced. Please specify modules as per model architecture. \ - If the value is ["all-linear"], \ - then LORA selects all linear and Conv1D modules as per model architecture, \ - except for the output layer. lora_alpha (`int`): The alpha parameter for Lora scaling. lora_dropout (`float`): The dropout probability for Lora layers. - bias (`str`): - Bias type for LoRA. Can be 'none', 'all' or 'lora_only'. \ - If 'all' or 'lora_only', the corresponding biases will be updated during training. \ - Be aware that this means that, even when disabling the adapters, the model \ - will not produce the same output as the base model would have without adaptation. """ - - r: int = 8 lora_alpha: int = 32 - target_modules: List[str] = field( - default=None, - metadata={ - "help": "The names of the modules to apply LORA to. LORA selects modules which either \ - completely match or " - 'end with one of the strings. 
If the value is ["all-linear"], \ - then LORA selects all linear and Conv1D ' - "modules except for the output layer." - }, - ) - target_parameters: List[str] = field( - default=None, - metadata={"help": "The names/regex of the parameters to apply LORA to"}, - ) - bias = "none" lora_dropout: float = 0.05 + def __post_init__(self): + # If target_modules is a single-element list, convert it into a plain string + if self.target_modules == ["all-linear"]: + self.target_modules = "all-linear" + + super().__post_init__() + @dataclass class PromptTuningConfig: diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 383dbaf3a7..014b043665 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -71,7 +71,7 @@ def train( data_args: configs.DataArguments, train_args: configs.TrainingArguments, peft_config: Optional[ # pylint: disable=redefined-outer-name - Union[peft_config.LoraConfig, LoraConfig, peft_config.PromptTuningConfig] + Union[LoraConfig, peft_config.PromptTuningConfig] ] = None, quantization_config: Optional[peft_config.Mxfp4Config] = None, trainer_controller_args: TrainerControllerCallback = None, @@ -92,8 +92,7 @@ def train( model_args: tuning.config.configs.ModelArguments data_args: tuning.config.configs.DataArguments train_args: tuning.config.configs.TrainingArguments - peft_config: peft_config.LoraConfig for Lora tuning | \ - LoraConfig (peft.LoraConfig): for activated Lora (aLoRA) tuning | \ + peft_config: LoraConfig (peft.LoraConfig): for activated Lora (aLoRA) tuning | \ peft_config.PromptTuningConfig for prompt tuning | \ None for full fine tuning The peft configuration to pass to trainer @@ -110,7 +109,7 @@ def train( tracker with automatically be added. exp_metadata: Dict of key value pairs passed to train to be recoreded by the tracker. 
quantized_lora_config: tuning.config.acceleration_configs.QuantizedLoraConfig \ - Should be used in combination with peft_config.LoraConfig for Lora tuning \ + Should be used in combination with LoraConfig for Lora tuning \ fusedops_kernels_config: tuning.config.acceleration_configs.FusedOpsAndKernelsConfig \ Should be used in combination with quantized_lora_config. Also currently fused_lora and fast_kernels must used together (may change in future). \ @@ -855,7 +854,7 @@ def main(): sys.exit(INTERNAL_ERROR_EXIT_CODE) if isinstance( - tune_config, (peft_config.LoraConfig, LoraConfig) + tune_config, LoraConfig ): # aLoraConfig subclasses LoraConfig try: if training_args.save_model_dir: diff --git a/tuning/utils/config_utils.py b/tuning/utils/config_utils.py index 78c4d32ab2..4d47ca991f 100644 --- a/tuning/utils/config_utils.py +++ b/tuning/utils/config_utils.py @@ -113,9 +113,10 @@ def get_hf_peft_config(task_type, tuning_config, tokenizer_name_or_path): hf_peft_config = alora_config elif isinstance(tuning_config, peft_config.LoraConfig): lora_config = asdict(tuning_config) - if lora_config["target_modules"] == ["all-linear"]: - lora_config["target_modules"] = "all-linear" - hf_peft_config = HFLoraConfig(task_type=task_type, **lora_config) + + if not hasattr(lora_config, "task_type"): + lora_config["task_type"]=task_type + hf_peft_config = HFLoraConfig(**lora_config) elif isinstance(tuning_config, peft_config.PromptTuningConfig): hf_peft_config = HFPromptTuningConfig( task_type=task_type, From de6baa71bcddce03111f3d0856c3529f83e3d525 Mon Sep 17 00:00:00 2001 From: romit Date: Tue, 16 Sep 2025 16:28:02 +0000 Subject: [PATCH 2/7] Added some fields under custom dataclass to let it pass through HfArgumentParser Signed-off-by: romit --- tuning/config/peft_config.py | 80 +++++++++++++++++++++++++++++++++++- tuning/sft_trainer.py | 4 +- tuning/utils/config_utils.py | 2 +- 3 files changed, 81 insertions(+), 5 deletions(-) diff --git a/tuning/config/peft_config.py 
b/tuning/config/peft_config.py index 254cdba644..f1220bf991 100644 --- a/tuning/config/peft_config.py +++ b/tuning/config/peft_config.py @@ -15,7 +15,7 @@ # Standard from dataclasses import dataclass, field from enum import Enum -from typing import List +from typing import List, Optional # Third Party from peft import LoraConfig as _LoraConfig @@ -51,9 +51,87 @@ class LoraConfig(_LoraConfig): lora_dropout (`float`): The dropout probability for Lora layers. """ + lora_alpha: int = 32 lora_dropout: float = 0.05 + # HACK: The following list of arguments are listed here + # as a temperorary fix which reduces the field annotation + # from Optional[List[str], str] to Optional[List[str], str] + # Please see: https://github.com/huggingface/transformers/issues/40915 for further explanation! + target_modules: Optional[List[str]] = field( + default=None, + metadata={ + "help": ( + "List of module names or regex expression of the module names to replace with LoRA. " + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. " + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D " + "(if the model is a PreTrainedModel, the output layer excluded). " + "If not specified, modules will be chosen according to the model architecture, If the architecture is " + "not known, an error will be raised -- in this case, you should specify the target modules manually. " + "To avoid targeting any modules (because you want to apply `target_parameters`), set " + "`target_modules=[]`." + ), + }, + ) + exclude_modules: List[str] | None = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to exclude from Lora." + }, + ) + init_lora_weights: (bool) = field( + default=True, + metadata={ + "help": ( + "How to initialize the weights of the LoRA layers. 
" + "Passing True (default) results in the default initialization from the reference implementation from " + "Microsoft, with the LoRA B weight being set to 0. This means that without further training, the LoRA " + "adapter will be a no-op. " + "Setting the initialization to False leads to random initialization of LoRA A and B, meaning that LoRA " + "is not a no-op before training; this setting is intended for debugging purposes. " + ), + }, + ) + layers_to_transform: Optional[list[int]] = field( + default=None, + metadata={ + "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index. " + "This only works when target_modules is a list of str." + }, + ) + layers_pattern: Optional[list[str]] = field( + default=None, + metadata={ + "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." + "This only works when target_modules is a list of str. This should target the `nn.ModuleList` of the " + "model, which is often called `'layers'` or `'h'`." + }, + ) + trainable_token_indices: Optional[list[int]] = field( + default=None, + metadata={ + "help": ( + "Lets you specify which token indices to selectively fine-tune without requiring to re-train the " + "whole embedding matrix using the `peft.TrainableTokensModel` method. You can specify token indices " + "in two ways. Either you specify a list of indices which will then target the model's input embedding " + "layer (or, if not found, `embed_tokens`). (Not supported yet) Alternatively, you can specify a dictionary where the key " + "is the name of the embedding module and the values are the list of token indices, e.g. " + "`{'embed_tokens': [0, 1, ...]}`. 
Note that training with FSDP requires `use_orig_params=True` to " + "avoid issues with non-uniform `requires_grad`." + ) + }, + ) + loftq_config: dict = field( + default_factory=dict, + metadata={ + "help": ( + "The configuration of LoftQ. If this is passed, then LoftQ will be used to quantize the backbone " + "weights and initialize Lora layers. Also set `init_lora_weights='loftq'` in this case." + ) + }, + ) + def __post_init__(self): # If target_modules is a single-element list, convert it into a plain string if self.target_modules == ["all-linear"]: diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 014b043665..efc8d00340 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -853,9 +853,7 @@ def main(): ) sys.exit(INTERNAL_ERROR_EXIT_CODE) - if isinstance( - tune_config, LoraConfig - ): # aLoraConfig subclasses LoraConfig + if isinstance(tune_config, LoraConfig): # aLoraConfig subclasses LoraConfig try: if training_args.save_model_dir: # Write number of added tokens to artifacts diff --git a/tuning/utils/config_utils.py b/tuning/utils/config_utils.py index 4d47ca991f..a80dc53988 100644 --- a/tuning/utils/config_utils.py +++ b/tuning/utils/config_utils.py @@ -115,7 +115,7 @@ def get_hf_peft_config(task_type, tuning_config, tokenizer_name_or_path): lora_config = asdict(tuning_config) if not hasattr(lora_config, "task_type"): - lora_config["task_type"]=task_type + lora_config["task_type"] = task_type hf_peft_config = HFLoraConfig(**lora_config) elif isinstance(tuning_config, peft_config.PromptTuningConfig): hf_peft_config = HFPromptTuningConfig( From 5acad56a28184e2d2b8a60e04183075748a96912 Mon Sep 17 00:00:00 2001 From: romit Date: Wed, 17 Sep 2025 05:22:12 +0000 Subject: [PATCH 3/7] Lint and fmt fixes Signed-off-by: romit --- tuning/config/peft_config.py | 79 +++++++++++++++++++++++------------- tuning/utils/config_utils.py | 8 ++-- 2 files changed, 53 insertions(+), 34 deletions(-) diff --git a/tuning/config/peft_config.py 
b/tuning/config/peft_config.py index f1220bf991..e510c841fe 100644 --- a/tuning/config/peft_config.py +++ b/tuning/config/peft_config.py @@ -18,7 +18,7 @@ from typing import List, Optional # Third Party -from peft import LoraConfig as _LoraConfig +from peft import LoraConfig as HFLoraConfig from transformers.utils.quantization_config import Mxfp4Config as HfMxfp4Config @@ -41,7 +41,7 @@ def to_hf_config(self): @dataclass -class LoraConfig(_LoraConfig): +class LoraConfig(HFLoraConfig): """ This is the configuration class that extends peft.LoraConfig with a few defaults. @@ -63,71 +63,92 @@ class LoraConfig(_LoraConfig): default=None, metadata={ "help": ( - "List of module names or regex expression of the module names to replace with LoRA. " + "List of module names or regex expression of the module names to replace with LoRA." "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. " "This can also be a wildcard 'all-linear' which matches all linear/Conv1D " "(if the model is a PreTrainedModel, the output layer excluded). " - "If not specified, modules will be chosen according to the model architecture, If the architecture is " - "not known, an error will be raised -- in this case, you should specify the target modules manually. " - "To avoid targeting any modules (because you want to apply `target_parameters`), set " - "`target_modules=[]`." + "If not specified, modules will be chosen according to the model architecture, " + "If the architecture is not known, an error will be raised -- " + "in this case, you should specify the target modules manually. " + "To avoid targeting any modules (because you want to apply `target_parameters`) " + ", set `target_modules=[]`." ), }, ) - exclude_modules: List[str] | None = field( + exclude_modules: Optional[List[str]] = field( default=None, metadata={ - "help": "List of module names or regex expression of the module names to exclude from Lora." 
+ "help": ( + "List of module names or regex expression of the module names to exclude from Lora." + ) }, ) - init_lora_weights: (bool) = field( + init_lora_weights: bool = field( default=True, metadata={ "help": ( "How to initialize the weights of the LoRA layers. " - "Passing True (default) results in the default initialization from the reference implementation from " - "Microsoft, with the LoRA B weight being set to 0. This means that without further training, the LoRA " - "adapter will be a no-op. " - "Setting the initialization to False leads to random initialization of LoRA A and B, meaning that LoRA " - "is not a no-op before training; this setting is intended for debugging purposes. " + "Passing True (default) results in the default initialization from " + "the reference implementation from " + "Microsoft, with the LoRA B weight being set to 0. " + "This means that without further training, " + "the LoRA adapter will be a no-op. " + "Setting the initialization to False leads to random initialization of " + "LoRA A and B, meaning that LoRA is not a no-op before training; " + "this setting is intended for debugging purposes." ), }, ) layers_to_transform: Optional[list[int]] = field( default=None, metadata={ - "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index. " - "This only works when target_modules is a list of str." + "help": ( + "The layer indexes to transform, is this argument is specified, " + "PEFT will transform only the layers indexes that are specified inside this list. " + "If a single integer is passed, PEFT will transform only the layer at this index. " + "This only works when target_modules is a list of str." 
+ ) }, ) layers_pattern: Optional[list[str]] = field( default=None, metadata={ - "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." - "This only works when target_modules is a list of str. This should target the `nn.ModuleList` of the " - "model, which is often called `'layers'` or `'h'`." + "help": ( + "The layer pattern name, used only if `layers_to_transform` is different to None " + "and if the layer pattern is not in the common layers pattern. " + "This only works when target_modules is a list of str. " + "This should target the `nn.ModuleList` of the " + "model, which is often called `'layers'` or `'h'`." + ) }, ) trainable_token_indices: Optional[list[int]] = field( default=None, metadata={ "help": ( - "Lets you specify which token indices to selectively fine-tune without requiring to re-train the " - "whole embedding matrix using the `peft.TrainableTokensModel` method. You can specify token indices " - "in two ways. Either you specify a list of indices which will then target the model's input embedding " - "layer (or, if not found, `embed_tokens`). (Not supported yet) Alternatively, you can specify a dictionary where the key " - "is the name of the embedding module and the values are the list of token indices, e.g. " - "`{'embed_tokens': [0, 1, ...]}`. Note that training with FSDP requires `use_orig_params=True` to " + "Lets you specify which token indices to selectively fine-tune " + "without requiring to re-train the " + "whole embedding matrix using the `peft.TrainableTokensModel` method. " + "You can specify token indices in two ways. " + "Either you specify a list of indices which will then target the model's input " + "embedding layer (or, if not found, `embed_tokens`). " + "(Not supported yet) Alternatively, you can specify a dictionary " + "where the key is the name of the embedding module " + "and the values are the list of token indices, e.g. 
" + "`{'embed_tokens': [0, 1, ...]}`. Note that training " + "with FSDP requires `use_orig_params=True` to " "avoid issues with non-uniform `requires_grad`." ) }, ) - loftq_config: dict = field( + loftq_config: Optional[dict] = field( default_factory=dict, metadata={ "help": ( - "The configuration of LoftQ. If this is passed, then LoftQ will be used to quantize the backbone " - "weights and initialize Lora layers. Also set `init_lora_weights='loftq'` in this case." + "The configuration of LoftQ. If this is passed, " + "then LoftQ will be used to quantize the backbone " + "weights and initialize Lora layers. Also set `init_lora_weights='loftq'` " + "in this case." ) }, ) diff --git a/tuning/utils/config_utils.py b/tuning/utils/config_utils.py index a80dc53988..67e383ea40 100644 --- a/tuning/utils/config_utils.py +++ b/tuning/utils/config_utils.py @@ -20,7 +20,6 @@ import pickle # Third Party -from peft import LoraConfig as HFLoraConfig from peft import PromptTuningConfig as HFPromptTuningConfig # Local @@ -112,11 +111,10 @@ def get_hf_peft_config(task_type, tuning_config, tokenizer_name_or_path): alora_config.task_type = task_type hf_peft_config = alora_config elif isinstance(tuning_config, peft_config.LoraConfig): - lora_config = asdict(tuning_config) + if getattr(tuning_config, "task_type") is None: + setattr(tuning_config, "task_type", task_type) - if not hasattr(lora_config, "task_type"): - lora_config["task_type"] = task_type - hf_peft_config = HFLoraConfig(**lora_config) + hf_peft_config = tuning_config elif isinstance(tuning_config, peft_config.PromptTuningConfig): hf_peft_config = HFPromptTuningConfig( task_type=task_type, From 3c5711ec2462bca1e575d27a57fed1e2869f585f Mon Sep 17 00:00:00 2001 From: romit Date: Wed, 17 Sep 2025 08:32:14 +0000 Subject: [PATCH 4/7] Updated config utils Signed-off-by: romit --- tuning/utils/config_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tuning/utils/config_utils.py b/tuning/utils/config_utils.py index 
67e383ea40..45fda027fe 100644 --- a/tuning/utils/config_utils.py +++ b/tuning/utils/config_utils.py @@ -111,6 +111,9 @@ def get_hf_peft_config(task_type, tuning_config, tokenizer_name_or_path): alora_config.task_type = task_type hf_peft_config = alora_config elif isinstance(tuning_config, peft_config.LoraConfig): + if getattr(tuning_config, "target_modules") == ["all-linear"]: + setattr(tuning_config, "target_modules", "all-linear") + if getattr(tuning_config, "task_type") is None: setattr(tuning_config, "task_type", task_type) From 8437bcf061b7a1fd45c41bf9633b5b8cc18732ae Mon Sep 17 00:00:00 2001 From: r0 <11757603+romitjain@users.noreply.github.com> Date: Thu, 25 Sep 2025 10:22:04 +0530 Subject: [PATCH 5/7] Update comment in LoraConfig Signed-off-by: r0 <11757603+romitjain@users.noreply.github.com> --- tuning/config/peft_config.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tuning/config/peft_config.py b/tuning/config/peft_config.py index e510c841fe..7bd6261e69 100644 --- a/tuning/config/peft_config.py +++ b/tuning/config/peft_config.py @@ -55,10 +55,11 @@ class LoraConfig(HFLoraConfig): lora_alpha: int = 32 lora_dropout: float = 0.05 - # HACK: The following list of arguments are listed here - # as a temperorary fix which reduces the field annotation - # from Optional[List[str], str] to Optional[List[str], str] - # Please see: https://github.com/huggingface/transformers/issues/40915 for further explanation! + # HACK: The following list of arguments listed below + # is a fix which reduces the field annotation from + # Optional[List[str], str] type to Optional[List[str]] type + # This is done for compatibility with HFArgumentParser + # Please see: https://github.com/huggingface/peft/issues/2798 for further explanation! 
target_modules: Optional[List[str]] = field( default=None, metadata={ From dbc8b657c90d35df408eb822439f02ad0326d7e5 Mon Sep 17 00:00:00 2001 From: romit Date: Thu, 25 Sep 2025 05:40:46 +0000 Subject: [PATCH 6/7] Lint changes Signed-off-by: romit --- tuning/config/peft_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuning/config/peft_config.py b/tuning/config/peft_config.py index 7bd6261e69..48f594393d 100644 --- a/tuning/config/peft_config.py +++ b/tuning/config/peft_config.py @@ -56,7 +56,7 @@ class LoraConfig(HFLoraConfig): lora_dropout: float = 0.05 # HACK: The following list of arguments listed below - # is a fix which reduces the field annotation from + # is a fix which reduces the field annotation from # Optional[List[str], str] type to Optional[List[str]] type # This is done for compatibility with HFArgumentParser # Please see: https://github.com/huggingface/peft/issues/2798 for further explanation! From 8475748e6d676bb917e877239bd9d0e8a47d3590 Mon Sep 17 00:00:00 2001 From: romit Date: Fri, 26 Sep 2025 04:54:34 +0000 Subject: [PATCH 7/7] Updated comment Signed-off-by: romit --- tuning/sft_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index cdd93c0a43..051db7f814 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -110,6 +110,7 @@ def train( exp_metadata: Dict of key value pairs passed to train to be recoreded by the tracker. quantized_lora_config: tuning.config.acceleration_configs.QuantizedLoraConfig \ Should be used in combination with LoraConfig for Lora tuning \ + https://huggingface.co/docs/peft/en/package_reference/lora#peft.LoraConfig \ fusedops_kernels_config: tuning.config.acceleration_configs.FusedOpsAndKernelsConfig \ Should be used in combination with quantized_lora_config. Also currently fused_lora and fast_kernels must used together (may change in future). \