-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathtrain.py
More file actions
114 lines (97 loc) · 2.94 KB
/
train.py
File metadata and controls
114 lines (97 loc) · 2.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
import pandas as pd
import json
import torch
import shutil
from os import system
from datetime import datetime
from datasets import load_dataset
from peft import LoraConfig
from peft.utils.other import prepare_model_for_kbit_training
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
HfArgumentParser,
AutoTokenizer,
TrainingArguments
)
from trl import SFTTrainer
print("Loading the dataset...")

# medquad.csv: only the first two columns are used (question, answer).
df = pd.read_csv("medquad.csv")
df = df.iloc[:, :2]
df.columns = ["text", "label"]

# Wrap the records in a top-level {"json": [...]} object — the layout that
# `load_dataset(..., field='json')` expects further down the script.
#
# FIX: the original built this wrapper by character-level surgery on the
# serialized JSON string (replacing the first char with '{"json":[' and
# appending '}'), which silently breaks if pandas' output shape ever changes.
# Round-tripping through DataFrame.to_json keeps pandas' NaN -> null handling
# and string escaping, then the wrapper dict is built explicitly.
records = json.loads(df.to_json(orient="records"))
result = {"json": records}

with open("data.json", "w") as json_file:
    json.dump(result, json_file)
# Base checkpoint shared by the model and its tokenizer.
base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Slow (non-Rust) tokenizer; the base model ships without a pad token, so the
# EOS token doubles as padding.
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

# LoRA adapter configuration for causal-LM fine-tuning.
peft_config = LoraConfig(
    r=32,
    lora_alpha=16,
    bias="none",
    lora_dropout=0.05,  # conventional dropout for LoRA layers
    task_type="CAUSAL_LM",
)

# Load the base model, prep it for k-bit training, and attach the adapter.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    trust_remote_code=True,
    token=False,
)
model = prepare_model_for_kbit_training(model)
model.add_adapter(peft_config)
# pretraining_tp = 1 -> standard (non-sharded) linear forward pass.
model.config.pretraining_tp = 1
# Start each run from a clean transient checkpoint directory.
if os.path.isdir("./temp"):
    shutil.rmtree("./temp")

training_arguments = TrainingArguments(
    output_dir="./temp",  # transient checkpoint dir, removed after training
    num_train_epochs=4,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_strategy="epoch",
    logging_steps=100,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,  # bucket similar-length samples to reduce padding
    disable_tqdm=False,
    # FIX: `report_to=None` does NOT disable reporting — transformers treats it
    # as "report to all installed integrations". The documented value that
    # actually turns experiment-tracker reporting off is the string "none".
    report_to="none",
)

# Disable the generation-time KV cache while fine-tuning (incompatible with
# the gradient-checkpointed training path — NOTE(review): checkpointing is
# presumably enabled by prepare_model_for_kbit_training above; confirm).
model.config.use_cache = False
def formatting_func(example):
    """Render one record as a single SFT prompt string.

    Joins the question (``text``) and answer (``label``) fields using the
    "### Question: ... ### Answer: ..." layout the trainer learns from.
    """
    template = "### Question: {}\n ### Answer: {}"
    return template.format(example["text"], example["label"])
def generate_and_tokenize_prompt(prompt):
    """Format a record via ``formatting_func`` and tokenize the result.

    Truncates to 2048 tokens, matching the trainer's max sequence length.
    """
    rendered = formatting_func(prompt)
    return tokenizer(rendered, truncation=True, max_length=2048)
# Read back the records written to data.json (under the top-level "json" key)
# and tokenize every example.
raw_dataset = load_dataset("json", data_files="data.json", field="json", split="train")
dataset = raw_dataset.map(generate_and_tokenize_prompt)
def _remove_tree(path):
    # Best-effort reset: drop *path* and everything under it when present.
    if os.path.isdir(path):
        shutil.rmtree(path)


# Supervised fine-tuning over the tokenized question/answer dataset.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=2048,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)
trainer.train()

# Discard the intermediate checkpoints, then publish the final weights into a
# fresh "trained" directory.
_remove_tree("./temp")

save_dir = "trained"
_remove_tree(save_dir)
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Model saved '{save_dir}'.")