Skip to content

Commit c59da48

Browse files
committed
feat: support padding-free + pretraining
Signed-off-by: Mehant Kammakomati <mehant.kammakomati2@ibm.com>
1 parent 63db876 commit c59da48

2 files changed

Lines changed: 10 additions & 0 deletions

File tree

tuning/sft_trainer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,7 @@ def train(
341341
max_seq_length,
342342
data_args.tokens_field,
343343
data_args.instruction_template,
344+
attention_and_distributed_packing_config.padding_free,
344345
)
345346

346347
if framework is not None and framework.requires_agumentation:

tuning/utils/preprocessing_utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ def get_data_collator(
167167
max_seq_length: int,
168168
tokens_field: str = True,
169169
instruction_template: Optional[str] = None,
170+
padding_free: str = None,
170171
) -> Callable:
171172
"""Create and return the appropriate collator type based on the configuration for packing,
172173
response_template, and dataset_text_field.
@@ -186,6 +187,8 @@ def get_data_collator(
186187
feature having tokens
187188
instruction_template: Optional[str]
188189
start of user answer.
190+
padding_free: str
191+
padding-free method; when set and no earlier collator case applies, batches are collated with padding=False
189192
190193
Returns:
191194
Callable
@@ -240,6 +243,12 @@ def get_data_collator(
240243
return DataCollatorForSeq2Seq(
241244
tokenizer=tokenizer, padding=True, max_length=max_seq_length
242245
)
246+
if padding_free:
247+
# When packing is false, padding_free is set, and no response template is used,
248+
# it's a pretraining scenario.
249+
return DataCollatorForSeq2Seq(
250+
tokenizer=tokenizer, padding=False, max_length=max_seq_length
251+
)
243252
raise ValueError(
244253
"Could not pick a data collator. Please refer to supported data formats"
245254
)

0 commit comments

Comments
 (0)