Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/tuning-techniques.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

## LoRA Tuning Example

Set `peft_method` to `"lora"`. You can additionally pass any arguments from [LoraConfig](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/peft_config.py#L21).
Set `peft_method` to `"lora"`. You can additionally pass any arguments from [LoraConfig](https://huggingface.co/docs/peft/en/package_reference/lora#peft.LoraConfig).
```py
# Args you can pass
r: int =8
Expand Down Expand Up @@ -340,7 +340,7 @@ You can see details on a sample configuration of Accelerated GPTQ-LoRA [here](ht

To use GPTQ-LoRA technique, you can set the `quantized_lora_config` defined [here](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/acceleration_configs/quantized_lora_config.py). See the Notes section of FMS Acceleration doc [below](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/README.md#fms-acceleration) for usage. The only kernel we are supporting currently is `triton_v2`.

In addition, LoRA tuning technique is required to be used, set `peft_method` to `"lora"` and pass any arguments from [LoraConfig](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/peft_config.py#L21).
In addition, the LoRA tuning technique must be used: set `peft_method` to `"lora"` and pass any arguments from [LoraConfig](https://huggingface.co/docs/peft/en/package_reference/lora#peft.LoraConfig).

Example command to run:

Expand Down
133 changes: 105 additions & 28 deletions tuning/config/peft_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
# Standard
from dataclasses import dataclass, field
from enum import Enum
from typing import List
from typing import List, Optional

# Third Party
from peft import LoraConfig as HFLoraConfig
from transformers.utils.quantization_config import Mxfp4Config as HfMxfp4Config


Expand All @@ -40,49 +41,125 @@ def to_hf_config(self):


@dataclass
class LoraConfig:
class LoraConfig(HFLoraConfig):
"""
This is the configuration class to store the configuration of a [`LoraModel`].
This is the configuration class that extends peft.LoraConfig with a few defaults.

Args:
r (`int`):
Lora attention dimension (the "rank").
target_modules (List[str]]):
The names of the modules to apply the adapter to. \
If this is specified, only the modules with the specified \
names will be replaced. Please specify modules as per model architecture. \
If the value is ["all-linear"], \
then LORA selects all linear and Conv1D modules as per model architecture, \
except for the output layer.
lora_alpha (`int`):
The alpha parameter for Lora scaling.
lora_dropout (`float`):
The dropout probability for Lora layers.
bias (`str`):
Bias type for LoRA. Can be 'none', 'all' or 'lora_only'. \
If 'all' or 'lora_only', the corresponding biases will be updated during training. \
Be aware that this means that, even when disabling the adapters, the model \
will not produce the same output as the base model would have without adaptation.
"""

r: int = 8
lora_alpha: int = 32
target_modules: List[str] = field(
lora_dropout: float = 0.05

# HACK: The fields redefined below narrow their annotations from the
# heterogeneous types used by peft (e.g. Optional[Union[List[str], str]])
# to homogeneous ones (e.g. Optional[List[str]]).
# This is done for compatibility with HFArgumentParser.
# Please see https://github.com/huggingface/peft/issues/2798 for further explanation.
target_modules: Optional[List[str]] = field(
default=None,
metadata={
"help": "The names of the modules to apply LORA to. LORA selects modules which either \
completely match or "
'end with one of the strings. If the value is ["all-linear"], \
then LORA selects all linear and Conv1D '
"modules except for the output layer."
"help": (
"List of module names or regex expression of the module names to replace with LoRA."
"For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. "
"This can also be a wildcard 'all-linear' which matches all linear/Conv1D "
"(if the model is a PreTrainedModel, the output layer excluded). "
"If not specified, modules will be chosen according to the model architecture, "
"If the architecture is not known, an error will be raised -- "
"in this case, you should specify the target modules manually. "
"To avoid targeting any modules (because you want to apply `target_parameters`) "
", set `target_modules=[]`."
),
},
)
target_parameters: List[str] = field(
exclude_modules: Optional[List[str]] = field(
default=None,
metadata={"help": "The names/regex of the parameters to apply LORA to"},
metadata={
"help": (
"List of module names or regex expression of the module names to exclude from Lora."
)
},
)
bias = "none"
lora_dropout: float = 0.05
init_lora_weights: bool = field(
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@romitjain, per your comment on line 58 above, the other arguments are incompatible with the arg parser, but do we need to have arguments like this one too?

Copy link
Copy Markdown
Collaborator Author

@romitjain romitjain Sep 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, any field that accepts heterogeneous types will need to be defined again.

For init_lora_weights, the original type is:

bool | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq", "orthogonal"]

default=True,
metadata={
"help": (
"How to initialize the weights of the LoRA layers. "
"Passing True (default) results in the default initialization from "
"the reference implementation from "
"Microsoft, with the LoRA B weight being set to 0. "
"This means that without further training, "
"the LoRA adapter will be a no-op. "
"Setting the initialization to False leads to random initialization of "
"LoRA A and B, meaning that LoRA is not a no-op before training; "
"this setting is intended for debugging purposes."
),
},
)
layers_to_transform: Optional[list[int]] = field(
default=None,
metadata={
"help": (
"The layer indexes to transform, is this argument is specified, "
"PEFT will transform only the layers indexes that are specified inside this list. "
"If a single integer is passed, PEFT will transform only the layer at this index. "
"This only works when target_modules is a list of str."
)
},
)
layers_pattern: Optional[list[str]] = field(
default=None,
metadata={
"help": (
"The layer pattern name, used only if `layers_to_transform` is different to None "
"and if the layer pattern is not in the common layers pattern. "
"This only works when target_modules is a list of str. "
"This should target the `nn.ModuleList` of the "
"model, which is often called `'layers'` or `'h'`."
)
},
)
trainable_token_indices: Optional[list[int]] = field(
default=None,
metadata={
"help": (
"Lets you specify which token indices to selectively fine-tune "
"without requiring to re-train the "
"whole embedding matrix using the `peft.TrainableTokensModel` method. "
"You can specify token indices in two ways. "
"Either you specify a list of indices which will then target the model's input "
"embedding layer (or, if not found, `embed_tokens`). "
"(Not supported yet) Alternatively, you can specify a dictionary "
"where the key is the name of the embedding module "
"and the values are the list of token indices, e.g. "
"`{'embed_tokens': [0, 1, ...]}`. Note that training "
"with FSDP requires `use_orig_params=True` to "
"avoid issues with non-uniform `requires_grad`."
)
},
)
loftq_config: Optional[dict] = field(
default_factory=dict,
metadata={
"help": (
"The configuration of LoftQ. If this is passed, "
"then LoftQ will be used to quantize the backbone "
"weights and initialize Lora layers. Also set `init_lora_weights='loftq'` "
"in this case."
)
},
)

def __post_init__(self):
# If target_modules is exactly ["all-linear"], convert it into the plain string "all-linear"
if self.target_modules == ["all-linear"]:
self.target_modules = "all-linear"

super().__post_init__()


@dataclass
Expand Down
12 changes: 5 additions & 7 deletions tuning/sft_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def train(
data_args: configs.DataArguments,
train_args: configs.TrainingArguments,
peft_config: Optional[ # pylint: disable=redefined-outer-name
Union[peft_config.LoraConfig, LoraConfig, peft_config.PromptTuningConfig]
Union[LoraConfig, peft_config.PromptTuningConfig]
] = None,
quantization_config: Optional[peft_config.Mxfp4Config] = None,
trainer_controller_args: TrainerControllerCallback = None,
Expand All @@ -92,8 +92,7 @@ def train(
model_args: tuning.config.configs.ModelArguments
data_args: tuning.config.configs.DataArguments
train_args: tuning.config.configs.TrainingArguments
peft_config: peft_config.LoraConfig for Lora tuning | \
LoraConfig (peft.LoraConfig): for activated Lora (aLoRA) tuning | \
peft_config: LoraConfig (peft.LoraConfig): for Lora tuning and activated Lora (aLoRA) tuning | \
peft_config.PromptTuningConfig for prompt tuning | \
None for full fine tuning
The peft configuration to pass to trainer
Expand All @@ -110,7 +109,8 @@ def train(
tracker will automatically be added.
exp_metadata: Dict of key value pairs passed to train to be recorded by the tracker.
quantized_lora_config: tuning.config.acceleration_configs.QuantizedLoraConfig \
Should be used in combination with peft_config.LoraConfig for Lora tuning \
Should be used in combination with LoraConfig for Lora tuning \
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add a reference to the HuggingFace loraconfig arguments here?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

https://huggingface.co/docs/peft/en/package_reference/lora#peft.LoraConfig \
fusedops_kernels_config: tuning.config.acceleration_configs.FusedOpsAndKernelsConfig \
Should be used in combination with quantized_lora_config. Also currently
fused_lora and fast_kernels must be used together (may change in future). \
Expand Down Expand Up @@ -845,9 +845,7 @@ def main():
)
sys.exit(INTERNAL_ERROR_EXIT_CODE)

if isinstance(
tune_config, (peft_config.LoraConfig, LoraConfig)
): # aLoraConfig subclasses LoraConfig
if isinstance(tune_config, LoraConfig): # aLoraConfig subclasses LoraConfig
try:
if training_args.save_model_dir:
# Write number of added tokens to artifacts
Expand Down
12 changes: 7 additions & 5 deletions tuning/utils/config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import pickle

# Third Party
from peft import LoraConfig as HFLoraConfig
from peft import PromptTuningConfig as HFPromptTuningConfig

# Local
Expand Down Expand Up @@ -112,10 +111,13 @@ def get_hf_peft_config(task_type, tuning_config, tokenizer_name_or_path):
alora_config.task_type = task_type
hf_peft_config = alora_config
elif isinstance(tuning_config, peft_config.LoraConfig):
lora_config = asdict(tuning_config)
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have removed this since tuning_config will now already be of type peft.LoraConfig.

Flattening the parameters using asdict recursively converts all nested dataclass parameters into dicts. This means that any parameter like a.b.c is now converted to a["b"]["c"]. When passed using **a, it does not preserve the dataclass structure of b, which can cause issues if the underlying library accesses b.c instead of b["c"].

if lora_config["target_modules"] == ["all-linear"]:
lora_config["target_modules"] = "all-linear"
hf_peft_config = HFLoraConfig(task_type=task_type, **lora_config)
if getattr(tuning_config, "target_modules") == ["all-linear"]:
setattr(tuning_config, "target_modules", "all-linear")

if getattr(tuning_config, "task_type") is None:
setattr(tuning_config, "task_type", task_type)

hf_peft_config = tuning_config
elif isinstance(tuning_config, peft_config.PromptTuningConfig):
hf_peft_config = HFPromptTuningConfig(
task_type=task_type,
Expand Down