Skip to content

Commit e84f209

Browse files
authored
Merge branch 'main' into compile-cache
2 parents 5df0eab + 8e70c7c commit e84f209

5 files changed

Lines changed: 240 additions & 91 deletions

File tree

scripts/offline_data_processing.py

Lines changed: 5 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,15 @@
55
import traceback
66

77
# Third Party
8-
from transformers import (
9-
AutoTokenizer,
10-
GPT2Tokenizer,
11-
GPTNeoXTokenizerFast,
12-
LlamaTokenizer,
13-
LlamaTokenizerFast,
14-
)
8+
from transformers import AutoTokenizer
159

1610
# Local
1711
from tuning.config import configs
1812
from tuning.data.setup_dataprocessor import process_dataargs
1913
from tuning.sft_trainer import get_parser
2014
from tuning.utils.error_logging import USER_ERROR_EXIT_CODE, write_termination_log
2115
from tuning.utils.logging import set_log_level
16+
from tuning.utils.tokenizer_data_utils import get_special_tokens_dict
2217

2318

2419
def save_dataset_shards(
@@ -92,36 +87,9 @@ def get_processed_dataset(
9287
tokenizer.chat_template = data_args.chat_template
9388

9489
# Prepare special tokens dictionary
95-
special_tokens_dict = {}
96-
if not model_args.tokenizer_name_or_path:
97-
if isinstance(tokenizer, (LlamaTokenizer, LlamaTokenizerFast)):
98-
special_tokens_dict["bos_token"] = "<s>"
99-
special_tokens_dict["eos_token"] = "</s>"
100-
special_tokens_dict["unk_token"] = "<unk>"
101-
special_tokens_dict["pad_token"] = "<pad>"
102-
elif isinstance(tokenizer, (GPT2Tokenizer, GPTNeoXTokenizerFast)):
103-
special_tokens_dict["pad_token"] = "<pad>"
104-
105-
if tokenizer.pad_token is None:
106-
logger.warning(
107-
"PAD token not found in tokenizer; setting PAD token to default."
108-
)
109-
special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
110-
if tokenizer.eos_token is None:
111-
logger.warning(
112-
"EOS token not found in tokenizer; setting EOS token to default."
113-
)
114-
special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
115-
if tokenizer.pad_token == tokenizer.eos_token:
116-
logger.warning(
117-
"PAD token and EOS token are the same. Overriding accordingly."
118-
)
119-
if tokenizer.eos_token != configs.DEFAULT_PAD_TOKEN:
120-
tokenizer.pad_token = configs.DEFAULT_PAD_TOKEN
121-
special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
122-
else:
123-
tokenizer.eos_token = configs.DEFAULT_EOS_TOKEN
124-
special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
90+
special_tokens_dict = get_special_tokens_dict(
91+
tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer
92+
)
12593

12694
# adds user specified special tokens to vocab
12795
if data_args.add_special_tokens:
Lines changed: 150 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,164 @@
1-
# Third party
21
# Third Party
32
from transformers import AutoModelForCausalLM, AutoTokenizer
43

54
# First Party
65
from tests.artifacts.testdata import MODEL_NAME
76

87
# Local
9-
# First party
10-
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
8+
from tuning.config import configs
9+
from tuning.utils.tokenizer_data_utils import (
10+
get_special_tokens_dict,
11+
tokenizer_and_embedding_resize,
12+
)
1113

1214

13-
def test_tokenizer_and_embedding_resize_return_values():
14-
"""Test to ensure number of added tokens are returned correctly"""
15+
def test_setting_special_tokens_with_LlamaTokenizerFast():
16+
"""
17+
Unit test using a LlamaTokenizerFast tokenizer. This tokenizer is only missing a PAD token,
18+
however because it is a LlamaTokenizer, the function code automatically adds the BOS, EOS,
19+
UNK and PAD tokens to the special tokens dict. Then, the <pad> token is replaced with
20+
a <PAD> token, because the Llama tokenizer does not have a pad token specified.
21+
"""
22+
tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True)
23+
model_args = configs.ModelArguments()
24+
special_tokens_dict = get_special_tokens_dict(
25+
tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer
26+
)
27+
assert special_tokens_dict == {
28+
"bos_token": "<s>",
29+
"eos_token": "</s>",
30+
"unk_token": "<unk>",
31+
"pad_token": "<PAD>",
32+
}
33+
34+
35+
def test_setting_special_tokens_with_GPT2TokenizerFast():
36+
"""
37+
Unit test using a GPT2TokenizerFast tokenizer. This tokenizer is the case where the
38+
EOS token = PAD token, both of them are <|endoftext|>. So, the pad token in the tokenizer is set
39+
to <PAD> and the "pad_token": "<PAD>" is also added to the special tokens dict.
40+
"""
41+
tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-3.1-8b-base")
42+
model_args = configs.ModelArguments()
43+
special_tokens_dict = get_special_tokens_dict(
44+
tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer
45+
)
46+
assert special_tokens_dict == {
47+
"pad_token": "<PAD>",
48+
}
49+
50+
51+
def test_setting_special_tokens_with_GPTNeoXTokenizerFast():
52+
"""
53+
Unit test using a GPTNeoXTokenizerFast tokenizer. This tokenizer is another one that is
54+
hardcoded into the function to automatically add just a pad token to the special tokens dict.
55+
However, the tokenizer itself is also missing a pad token, so the function then replaces
56+
the <pad> token with the default <PAD> token.
57+
"""
58+
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
59+
model_args = configs.ModelArguments()
60+
special_tokens_dict = get_special_tokens_dict(
61+
tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer
62+
)
63+
assert special_tokens_dict == {
64+
"pad_token": "<PAD>",
65+
}
66+
67+
68+
def test_setting_special_tokens_when_missing_all_special_tokens():
69+
"""
70+
Unit test using the GPT2TokenizerFast tokenizer. All the special tokens have been
71+
removed from the tokenizer, so we expect all of them to appear in the special tokens dict.
72+
"""
73+
tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-3.1-8b-base")
74+
75+
# Set all special tokens to None
76+
tokenizer.bos_token = None
77+
tokenizer.eos_token = None
78+
tokenizer.unk_token = None
79+
tokenizer.pad_token = None
80+
81+
model_args = configs.ModelArguments()
82+
special_tokens_dict = get_special_tokens_dict(
83+
tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer
84+
)
85+
assert special_tokens_dict == {
86+
"pad_token": "<PAD>",
87+
"eos_token": "</s>",
88+
"bos_token": "<s>",
89+
"unk_token": "<unk>",
90+
}
91+
92+
93+
def test_setting_special_tokens_when_path_is_not_none():
94+
"""
95+
A simple unit test that sets the `tokenizer_name_or_path` argument in
96+
`model_args` to a non None value. Since the argument is not None, almost
97+
the entire `get_special_tokens_dict` function is skipped and the
98+
special tokens dict is expected to be empty.
99+
"""
100+
tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True)
101+
model_args = configs.ModelArguments(tokenizer_name_or_path="test_path")
102+
special_tokens_dict = get_special_tokens_dict(
103+
tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer
104+
)
105+
# Assert special_tokens_dict is empty
106+
assert not special_tokens_dict
107+
108+
109+
def test_tokenizer_and_embedding_resize_return_values_missing_one_token():
110+
"""
111+
Tests the resizing function when the special tokens dict contains a PAD token,
112+
which means the tokenizer is missing one special token.
113+
114+
`multiple_of` is set to 1.
115+
"""
15116
special_tokens_dict = {"pad_token": "<pad>"}
16117
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
17118
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
18119
metadata = tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model)
19120
assert metadata["num_new_tokens"] == 1
20-
assert "new_embedding_size" in metadata
121+
assert metadata["new_embedding_size"] == len(tokenizer)
122+
123+
124+
def test_tokenizer_and_embedding_resize_return_values_missing_four_tokens():
125+
"""
126+
Tests the resizing when the special tokens dict contains a PAD, EOS, BOS and UNK token,
127+
which means the tokenizer is missing four special tokens.
128+
129+
`multiple_of` is set to 1.
130+
"""
131+
special_tokens_dict = {
132+
"pad_token": "<PAD>",
133+
"eos_token": "</s>",
134+
"bos_token": "<s>",
135+
"unk_token": "<unk>",
136+
}
137+
tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True)
138+
model = AutoModelForCausalLM.from_pretrained("Maykeye/TinyLLama-v0")
139+
metadata = tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model)
140+
assert metadata["num_new_tokens"] == 4
141+
assert metadata["new_embedding_size"] == len(tokenizer)
142+
143+
144+
def test_tokenizer_and_embedding_resize_return_values_mutliple_of_two():
145+
"""
146+
Tests the resizing when the special tokens dict contains a PAD, EOS, BOS and UNK token,
147+
which means the tokenizer is missing four special tokens.
148+
149+
`multiple_of` is set to 2; this adds one to the count of num_new_tokens and adds
150+
one to the count of new_embedding_size.
151+
"""
152+
special_tokens_dict = {
153+
"pad_token": "<PAD>",
154+
"eos_token": "</s>",
155+
"bos_token": "<s>",
156+
"unk_token": "<unk>",
157+
}
158+
tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True)
159+
model = AutoModelForCausalLM.from_pretrained("Maykeye/TinyLLama-v0")
160+
metadata = tokenizer_and_embedding_resize(
161+
special_tokens_dict, tokenizer, model, multiple_of=2
162+
)
163+
assert metadata["num_new_tokens"] == 5
164+
assert metadata["new_embedding_size"] == len(tokenizer) + 1

tuning/config/configs.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,17 @@ class DataArguments:
131131
},
132132
)
133133

134+
def __post_init__(self):
135+
def unescape(s):
136+
if s is not None and isinstance(s, str):
137+
return s.encode("utf-8").decode("unicode_escape")
138+
return s
139+
140+
self.chat_template = unescape(self.chat_template)
141+
self.data_formatter_template = unescape(self.data_formatter_template)
142+
self.response_template = unescape(self.response_template)
143+
self.instruction_template = unescape(self.instruction_template)
144+
134145

135146
@dataclass
136147
class TrainingArguments(transformers.TrainingArguments):

tuning/sft_trainer.py

Lines changed: 8 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,7 @@
2626
from huggingface_hub.utils._validators import HFValidationError
2727
from peft.utils.other import fsdp_auto_wrap_policy
2828
from torch.cuda import OutOfMemoryError
29-
from transformers import (
30-
AutoModelForCausalLM,
31-
AutoTokenizer,
32-
GPT2Tokenizer,
33-
GPTNeoXTokenizerFast,
34-
LlamaTokenizer,
35-
LlamaTokenizerFast,
36-
TrainerCallback,
37-
)
29+
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainerCallback
3830
from transformers.trainer_utils import get_last_checkpoint
3931
from transformers.utils import is_accelerate_available
4032
from trl import SFTConfig, SFTTrainer
@@ -69,7 +61,10 @@
6961
write_termination_log,
7062
)
7163
from tuning.utils.logging import set_log_level
72-
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
64+
from tuning.utils.tokenizer_data_utils import (
65+
get_special_tokens_dict,
66+
tokenizer_and_embedding_resize,
67+
)
7368

7469

7570
def train(
@@ -268,42 +263,9 @@ def train(
268263
tokenizer.chat_template = data_args.chat_template
269264

270265
# Add special tokens only when a custom tokenizer is not passed
271-
special_tokens_dict = {}
272-
if not model_args.tokenizer_name_or_path:
273-
# TODO: understand if we need to hardcode these here or just use defaults in model
274-
if isinstance(tokenizer, (LlamaTokenizer, LlamaTokenizerFast)):
275-
special_tokens_dict["bos_token"] = "<s>"
276-
special_tokens_dict["eos_token"] = "</s>"
277-
special_tokens_dict["unk_token"] = "<unk>"
278-
special_tokens_dict["pad_token"] = "<pad>"
279-
elif isinstance(tokenizer, (GPT2Tokenizer, GPTNeoXTokenizerFast)):
280-
special_tokens_dict["pad_token"] = "<pad>"
281-
282-
# add special tokens only when a custom tokenizer is not passed
283-
if not model_args.tokenizer_name_or_path:
284-
# TODO: we need to change this, perhaps follow what open instruct does?
285-
if tokenizer.pad_token is None:
286-
logger.warning("PAD token set to default, missing in tokenizer")
287-
special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
288-
if tokenizer.eos_token is None:
289-
logger.warning("EOS token set to default, missing in tokenizer")
290-
special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
291-
if tokenizer.bos_token is None:
292-
logger.warning("BOS token set to default, missing in tokenizer")
293-
special_tokens_dict["bos_token"] = configs.DEFAULT_BOS_TOKEN
294-
if tokenizer.unk_token is None:
295-
logger.warning("UNK token set to default, missing in tokenizer")
296-
special_tokens_dict["unk_token"] = configs.DEFAULT_UNK_TOKEN
297-
if tokenizer.pad_token == tokenizer.eos_token:
298-
logger.warning(
299-
"PAD token set to default, to make it different from eos token"
300-
)
301-
if tokenizer.eos_token != configs.DEFAULT_PAD_TOKEN:
302-
tokenizer.pad_token = configs.DEFAULT_PAD_TOKEN
303-
special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
304-
else:
305-
tokenizer.eos_token = configs.DEFAULT_EOS_TOKEN
306-
special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
266+
special_tokens_dict = get_special_tokens_dict(
267+
tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer
268+
)
307269

308270
# adds user specified special tokens to vocab
309271
if data_args.add_special_tokens:

0 commit comments

Comments
 (0)