Skip to content

Commit 03c7058

Browse files
committed
minor changes
Signed-off-by: Dushyant Behl <dushyantbehl@in.ibm.com>
1 parent 0f67235 commit 03c7058

2 files changed

Lines changed: 42 additions & 30 deletions

File tree

scripts/offline_data_processing.py

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -29,11 +29,17 @@ def save_dataset_shards(
2929
dataset_name (str): Name of the dataset (used for logging).
3030
"""
3131
os.makedirs(output_dir, exist_ok=True)
32+
logging.info(
33+
"Dumping processesd dataaset %s at %s in %d shards",
34+
dataset_name,
35+
output_dir,
36+
num_shards,
37+
)
3238
for shard_idx in range(num_shards):
3339
shard = dataset.shard(index=shard_idx, num_shards=num_shards)
3440
shard_path = os.path.join(output_dir, f"ds_{shard_idx:05d}.parquet")
3541
shard.to_parquet(shard_path)
36-
logging.info("Dumped %d shards of %s at %s", num_shards, dataset_name, output_dir)
42+
logging.info("Dumped %d shards", num_shards)
3743

3844

3945
def process_datasets_offline(
@@ -53,10 +59,13 @@ def process_datasets_offline(
5359
tuple: A tuple containing the formatted training dataset and validation dataset.
5460
"""
5561
# Set log level for this function
56-
train_args, logger = set_log_level(train_args, "get_processed_dataset")
62+
train_args, logger = set_log_level(train_args, "process_datasets_offline")
5763

5864
logger.info(
59-
"Starting dataset processing with model_args: %s, data_args: %s, training_args: %s",
65+
"Starting offline dataset processing with \n\
66+
model_args: %s, \n\
67+
data_args: %s, \n\
68+
training_args: %s",
6069
model_args,
6170
data_args,
6271
train_args,

tuning/data/data_handlers.py

Lines changed: 30 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -609,11 +609,23 @@ def tokenize_and_apply_chat_template_with_masking(
609609

610610

611611
AVAILABLE_DATA_HANDLERS = {
612-
"tokenize_and_apply_input_masking": DataHandler(
613-
op=tokenize_and_apply_input_masking,
612+
"remove_columns": DataHandler(
613+
# Native function
614+
handler_type=DataHandlerType.REMOVE,
615+
),
616+
"select_columns": DataHandler(
617+
# Native function
618+
handler_type=DataHandlerType.SELECT,
619+
),
620+
"rename_columns": DataHandler(
621+
# Native function
622+
handler_type=DataHandlerType.RENAME,
623+
),
624+
"tokenize": DataHandler(
625+
op=tokenize,
614626
handler_type=DataHandlerType.MAP,
615-
allows_batching=False,
616-
desc="Combining and tokenizing instruction and response, masking instructions",
627+
allows_batching=True,
628+
desc="Tokenizing the dataset",
617629
),
618630
"add_tokenizer_eos_token": DataHandler(
619631
op=add_tokenizer_eos_token,
@@ -625,51 +637,42 @@ def tokenize_and_apply_chat_template_with_masking(
625637
op=apply_custom_jinja_template,
626638
handler_type=DataHandlerType.MAP,
627639
allows_batching=False,
628-
desc="Formatting dataset with given jinja template",
640+
desc="Formatting dataset with given formatting template",
641+
),
642+
"tokenize_and_apply_input_masking": DataHandler(
643+
op=tokenize_and_apply_input_masking,
644+
handler_type=DataHandlerType.MAP,
645+
allows_batching=False,
646+
desc="Combining and tokenizing instruction and response, masking instructions",
629647
),
630648
"apply_tokenizer_chat_template": DataHandler(
631649
op=apply_tokenizer_chat_template,
632650
handler_type=DataHandlerType.MAP,
633651
allows_batching=False,
634-
desc="Applying tokenizers chat template to dataset",
652+
desc="Applying chat template to dataset",
635653
),
636654
"tokenize_and_apply_chat_template_with_masking": DataHandler(
637655
op=tokenize_and_apply_chat_template_with_masking,
638656
handler_type=DataHandlerType.MAP,
639657
allows_batching=False,
640-
desc="Applying chat template to dataset with tokenization",
658+
desc="Applying chat template to dataset and tokenizing",
641659
),
642660
"duplicate_columns": DataHandler(
643661
op=duplicate_columns,
644662
handler_type=DataHandlerType.MAP,
645663
allows_batching=True,
646664
desc="Duplicating columns",
647665
),
648-
"prepare_multimodal_data_processor": DataHandler(
649-
op=prepare_multimodal_data_processor,
650-
handler_type=DataHandlerType.MAP,
651-
allows_batching=False,
652-
desc="Processing text+image data",
653-
),
654-
"tokenize": DataHandler(
655-
op=tokenize,
656-
handler_type=DataHandlerType.MAP,
657-
allows_batching=True,
658-
desc="Tokenizing the dataset",
659-
),
660666
"skip_samples_with_large_columns": DataHandler(
661667
op=skip_samples_with_large_columns,
662668
handler_type=DataHandlerType.FILTER,
663669
allows_batching=False,
664670
desc="Skipping large samples",
665671
),
666-
"remove_columns": DataHandler(
667-
handler_type=DataHandlerType.REMOVE,
668-
),
669-
"select_columns": DataHandler(
670-
handler_type=DataHandlerType.SELECT,
671-
),
672-
"rename_columns": DataHandler(
673-
handler_type=DataHandlerType.RENAME,
672+
"prepare_multimodal_data_processor": DataHandler(
673+
op=prepare_multimodal_data_processor,
674+
handler_type=DataHandlerType.MAP,
675+
allows_batching=False,
676+
desc="Processing multimodal data",
674677
),
675678
}

0 commit comments

Comments
 (0)