@@ -159,7 +159,9 @@ def process_dataconfig_file(
159159
160160
161161# Data Format 1: Pretokenized Data
162- def _get_pretokenized_dataset_handlers (data_args , is_eval_tokenized ):
162+ def _get_pretokenized_dataset_handlers (
163+ data_args : DataArguments , is_eval_present , is_eval_tokenized
164+ ):
163165
164166 # if the provided train dataset is pretokenized
165167 # however user provides formatting flags, error out
@@ -168,6 +170,7 @@ def _get_pretokenized_dataset_handlers(data_args, is_eval_tokenized):
168170 or data_args .data_formatter_template
169171 or data_args .dataset_text_field
170172 or data_args .instruction_template
173+ or data_args .dataset_conversation_field
171174 ):
172175 raise ValueError (
173176 "fields response_template, data_formatter_template,"
@@ -177,7 +180,7 @@ def _get_pretokenized_dataset_handlers(data_args, is_eval_tokenized):
177180
178181 # if the train dataset is pretokenized
179182 # ensure validation dataset is pretokenized otherwise error out
180- if is_eval_tokenized :
183+ if is_eval_present and not is_eval_tokenized :
181184 raise ValueError (
182185 "validation data should be pretokenized to be used \
183186 along with pretokenized train data"
@@ -189,7 +192,9 @@ def _get_pretokenized_dataset_handlers(data_args, is_eval_tokenized):
189192
190193### Data format 2
191194# pylint: disable=unused-argument
192- def _get_dataset_formatting_handlers (data_args , packing , is_padding_free = False ):
195+ def _get_dataset_formatting_handlers (
196+ data_args : DataArguments , packing , is_padding_free = False
197+ ):
193198
194199 if data_args .response_template is None :
195200 if packing is False :
@@ -253,7 +258,7 @@ def _get_chat_dataset_handlers(data_args, tokenizer_kwargs):
253258 fn_kwargs ["formatted_text_column_name" ] = data_args .dataset_text_field
254259 fn_kwargs ["tokenizer_kwargs" ] = tokenizer_kwargs
255260 if data_args .dataset_conversation_field is not None :
256- fn_kwargs ["conversation_column " ] = data_args .dataset_conversation_field
261+ fn_kwargs ["conversation_column_name " ] = data_args .dataset_conversation_field
257262
258263 kwargs = {"fn_kwargs" : fn_kwargs , "batched" : False , "remove_columns" : "all" }
259264
@@ -284,14 +289,14 @@ def _get_default_dataset_handlers(data_args, tokenizer_kwargs):
284289
285290
286291### Vision Data Format
287- def _get_vision_dataset_handlers (data_args , processor_kwargs ):
292+ def _get_vision_dataset_handlers (data_args : DataArguments , processor_kwargs ):
288293
289294 handlers = []
290295
291296 # First data handler configuration
292297 handler_fn_kwargs1 = {
293- "dataset_text_field " : data_args .dataset_text_field ,
294- "conversation_column " : data_args .dataset_text_field ,
298+ "formatted_text_column_name " : data_args .dataset_text_field ,
299+ "conversation_column_name " : data_args .dataset_conversation_field ,
295300 }
296301 handler_kwargs1 = {
297302 "fn_kwargs" : handler_fn_kwargs1 ,
@@ -403,7 +408,7 @@ def _process_raw_data_args(
403408 if is_traindata_tokenized :
404409 # Data Format 1: Pretokenized Data
405410 handlers , dataset_text_field = _get_pretokenized_dataset_handlers (
406- data_args , ( is_eval_dataset_present and not is_evaldata_tokenized )
411+ data_args , is_eval_dataset_present , is_evaldata_tokenized
407412 )
408413 elif processor and data_args .dataset_text_field and data_args .dataset_image_field :
409414
0 commit comments