@@ -609,11 +609,23 @@ def tokenize_and_apply_chat_template_with_masking(
609609
610610
611611AVAILABLE_DATA_HANDLERS = {
612- "tokenize_and_apply_input_masking" : DataHandler (
613- op = tokenize_and_apply_input_masking ,
612+ "remove_columns" : DataHandler (
613+ # Native function
614+ handler_type = DataHandlerType .REMOVE ,
615+ ),
616+ "select_columns" : DataHandler (
617+ # Native function
618+ handler_type = DataHandlerType .SELECT ,
619+ ),
620+ "rename_columns" : DataHandler (
621+ # Native function
622+ handler_type = DataHandlerType .RENAME ,
623+ ),
624+ "tokenize" : DataHandler (
625+ op = tokenize ,
614626 handler_type = DataHandlerType .MAP ,
615- allows_batching = False ,
616- desc = "Combining and tokenizing instruction and response, masking instructions " ,
627+ allows_batching = True ,
628+ desc = "Tokenizing the dataset " ,
617629 ),
618630 "add_tokenizer_eos_token" : DataHandler (
619631 op = add_tokenizer_eos_token ,
@@ -625,51 +637,42 @@ def tokenize_and_apply_chat_template_with_masking(
625637 op = apply_custom_jinja_template ,
626638 handler_type = DataHandlerType .MAP ,
627639 allows_batching = False ,
628- desc = "Formatting dataset with given jinja template" ,
640+ desc = "Formatting dataset with given formatting template" ,
641+ ),
642+ "tokenize_and_apply_input_masking" : DataHandler (
643+ op = tokenize_and_apply_input_masking ,
644+ handler_type = DataHandlerType .MAP ,
645+ allows_batching = False ,
646+ desc = "Combining and tokenizing instruction and response, masking instructions" ,
629647 ),
630648 "apply_tokenizer_chat_template" : DataHandler (
631649 op = apply_tokenizer_chat_template ,
632650 handler_type = DataHandlerType .MAP ,
633651 allows_batching = False ,
634- desc = "Applying tokenizers chat template to dataset" ,
652+ desc = "Applying chat template to dataset" ,
635653 ),
636654 "tokenize_and_apply_chat_template_with_masking" : DataHandler (
637655 op = tokenize_and_apply_chat_template_with_masking ,
638656 handler_type = DataHandlerType .MAP ,
639657 allows_batching = False ,
640- desc = "Applying chat template to dataset with tokenization " ,
658+ desc = "Applying chat template to dataset and tokenizing " ,
641659 ),
642660 "duplicate_columns" : DataHandler (
643661 op = duplicate_columns ,
644662 handler_type = DataHandlerType .MAP ,
645663 allows_batching = True ,
646664 desc = "Duplicating columns" ,
647665 ),
648- "prepare_multimodal_data_processor" : DataHandler (
649- op = prepare_multimodal_data_processor ,
650- handler_type = DataHandlerType .MAP ,
651- allows_batching = False ,
652- desc = "Processing text+image data" ,
653- ),
654- "tokenize" : DataHandler (
655- op = tokenize ,
656- handler_type = DataHandlerType .MAP ,
657- allows_batching = True ,
658- desc = "Tokenizing the dataset" ,
659- ),
660666 "skip_samples_with_large_columns" : DataHandler (
661667 op = skip_samples_with_large_columns ,
662668 handler_type = DataHandlerType .FILTER ,
663669 allows_batching = False ,
664670 desc = "Skipping large samples" ,
665671 ),
666- "remove_columns" : DataHandler (
667- handler_type = DataHandlerType .REMOVE ,
668- ),
669- "select_columns" : DataHandler (
670- handler_type = DataHandlerType .SELECT ,
671- ),
672- "rename_columns" : DataHandler (
673- handler_type = DataHandlerType .RENAME ,
672+ "prepare_multimodal_data_processor" : DataHandler (
673+ op = prepare_multimodal_data_processor ,
674+ handler_type = DataHandlerType .MAP ,
675+ allows_batching = False ,
676+ desc = "Processing multimodal data" ,
674677 ),
675678}
0 commit comments