@@ -49,10 +49,10 @@ def is_pretokenized_dataset(data: Union[str, Dataset, IterableDataset]):
4949
5050 if isinstance (data , str ):
5151 # Create a data processor with default processor config
52- processor = get_datapreprocessor (
52+ data_processor = get_datapreprocessor (
5353 processor_config = DataPreProcessorConfig (), tokenizer = None
5454 )
55- data = processor .load_dataset (
55+ data = data_processor .load_dataset (
5656 None ,
5757 streaming = False ,
5858 splitName = "train[:1]" ,
@@ -73,23 +73,23 @@ def _process_dataconfig_file(
7373 is_multipack : bool = False ,
7474):
7575 data_config = load_and_validate_data_config (data_args .data_config_path )
76- processor = get_datapreprocessor (
76+ data_processor = get_datapreprocessor (
7777 processor_config = data_config .dataprocessor ,
7878 tokenizer = tokenizer ,
79- image_processor = processor ,
79+ processor = processor ,
8080 additional_data_handlers = additional_data_handlers ,
8181 )
8282
83- if processor .processor_config .chat_template is not None :
83+ if data_processor .processor_config .chat_template is not None :
8484 if tokenizer .chat_template :
8585 logger .warning (
8686 "replacing existing chat_template %s with data config's chat_template %s" ,
8787 tokenizer .chat_template ,
88- processor .processor_config .chat_template ,
88+ data_processor .processor_config .chat_template ,
8989 )
90- tokenizer .chat_template = processor .processor_config .chat_template
90+ tokenizer .chat_template = data_processor .processor_config .chat_template
9191
92- if processor .processor_config .streaming :
92+ if data_processor .processor_config .streaming :
9393 if train_args .max_steps < 1 :
9494 logging .error (
9595 "ValueError: `--max_steps` must be set when streaming is set in data \
@@ -108,7 +108,7 @@ def _process_dataconfig_file(
108108 "Multipack is not compatible with streaming=true please set streaming=false "
109109 "or disable multipack sampler"
110110 )
111- train_dataset = processor .process_dataset_configs (data_config .datasets )
111+ train_dataset = data_processor .process_dataset_configs (data_config .datasets )
112112
113113 return (train_dataset , None , data_args .dataset_text_field )
114114
@@ -239,17 +239,16 @@ def _get_vision_dataset_handlers(data_args, processor_kwargs):
239239 handlers = []
240240
241241 # First data handler configuration
242- fn_kwargs1 = {
242+ handler_fn_kwargs1 = {
243243 "dataset_text_field" : data_args .dataset_text_field ,
244244 "conversation_column" : data_args .dataset_text_field ,
245245 }
246- kwargs1 = {
247- "fn_kwargs" : fn_kwargs1 ,
248- "batched" : False ,
246+ handler_kwargs1 = {
247+ "fn_kwargs" : handler_fn_kwargs1 ,
249248 "remove_columns" : None ,
250249 }
251250 handlers .append (
252- DataHandlerConfig ("apply_tokenizer_chat_template" , arguments = kwargs1 )
251+ DataHandlerConfig ("apply_tokenizer_chat_template" , arguments = handler_kwargs1 )
253252 )
254253
255254 # Second data handler configuration
@@ -262,8 +261,6 @@ def _get_vision_dataset_handlers(data_args, processor_kwargs):
262261 }
263262 kwargs2 = {
264263 "fn_kwargs" : fn_kwargs2 ,
265- "batched" : False ,
266- "num_proc" : None ,
267264 }
268265 handlers .append (
269266 DataHandlerConfig ("prepare_multimodal_data_processor" , arguments = kwargs2 )
@@ -297,11 +294,10 @@ def _process_raw_data_args(
297294
298295 # Create a data processor with default processor config
299296 default_processor_config = DataPreProcessorConfig ()
300- default_processor_config .streaming = data_args .use_streaming_dataset
301297 data_processor = get_datapreprocessor (
302298 processor_config = default_processor_config ,
303299 tokenizer = tokenizer ,
304- image_processor = processor ,
300+ processor = processor ,
305301 additional_data_handlers = additional_data_handlers ,
306302 )
307303 assert isinstance (
@@ -488,6 +484,7 @@ def process_dataargs(
488484 )
489485
490486 dataset_kwargs = {}
487+ # For vision model tuning prepare_dataset is skipped.
491488 if processor is not None :
492489 dataset_kwargs ["skip_prepare_dataset" ] = True
493490
0 commit comments