remove extra data formatting handler

dushyantbehl · dushyantbehl · commit 0f67235a02eb · 2025-05-12T17:51:56.000+05:30
Signed-off-by: Dushyant Behl <dushyantbehl@in.ibm.com>
diff --git a/tests/artifacts/predefined_data_configs/__init__.py b/tests/artifacts/predefined_data_configs/__init__.py
@@ -20,9 +20,6 @@
 ### Constants used for data
 PREDEFINED_DATA_CONFIGS = os.path.join(os.path.dirname(__file__))
 DATA_CONFIG_APPLY_CUSTOM_TEMPLATE_YAML = os.path.join(
-    PREDEFINED_DATA_CONFIGS, "apply_custom_template.yaml"
-)
-DATA_CONFIG_APPLY_CUSTOM_JINJA_TEMPLATE_YAML = os.path.join(
     PREDEFINED_DATA_CONFIGS, "apply_custom_jinja_template.yaml"
 )
 DATA_CONFIG_PRETOKENIZE_JSON_DATA_YAML = os.path.join(
diff --git a/tests/artifacts/predefined_data_configs/apply_custom_jinja_template_streaming.yaml b/tests/artifacts/predefined_data_configs/apply_custom_jinja_template_streaming.yaml
@@ -2,11 +2,11 @@ dataprocessor:
     type: default
     streaming: true
 datasets:
-  - name: apply_custom_data_template
+  - name: apply_custom_jinja_template
     data_paths:
       - "FILE_PATH"
     data_handlers:
-      - name: apply_custom_data_formatting_template
+      - name: apply_custom_jinja_template
         arguments:
           remove_columns: all
           batched: false
diff --git a/tests/artifacts/predefined_data_configs/apply_custom_template.yaml b/tests/artifacts/predefined_data_configs/apply_custom_template.yaml
diff --git a/tests/data/test_data_handlers.py b/tests/data/test_data_handlers.py
@@ -31,7 +31,6 @@
 
 # Local
 from tuning.data.data_handlers import (
-    apply_custom_data_formatting_template,
     apply_custom_jinja_template,
     combine_sequence,
     duplicate_columns,
@@ -40,34 +39,6 @@
 )
 
 
-def test_apply_custom_formatting_template():
-    """Tests custom formatting data handler returns correct formatted response"""
-    json_dataset = datasets.load_dataset(
-        "json", data_files=TWITTER_COMPLAINTS_DATA_JSONL
-    )
-    template = "### Input: {{Tweet text}} \n\n ### Response: {{text_label}}"
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    formatted_dataset_field = "formatted_data_field"
-    formatted_dataset = json_dataset.map(
-        apply_custom_data_formatting_template,
-        fn_kwargs={
-            "tokenizer": tokenizer,
-            "formatted_text_column_name": formatted_dataset_field,
-            "template": template,
-        },
-    )
-    # First response from the data file that is read.
-    expected_response = (
-        "### Input: @HMRCcustomers No this is my first job"
-        + " \n\n ### Response: no complaint"
-        + tokenizer.eos_token
-    )
-
-    # a new column is created in Dataset
-    assert formatted_dataset_field in formatted_dataset["train"][0]
-    assert formatted_dataset["train"][0][formatted_dataset_field] == expected_response
-
-
 def test_apply_custom_formatting_jinja_template():
     """Tests custom formatting data handler with jinja template dataset returns correct formatted response"""
     json_dataset = datasets.load_dataset(
@@ -95,7 +66,7 @@ def test_apply_custom_formatting_jinja_template():
     assert formatted_dataset["train"][0][formatted_dataset_field] == expected_response
 
 
-def test_apply_custom_formatting_template_iterable():
+def test_apply_custom_formatting_jinja_template_iterable():
     """Tests custom formatting data handler with iterable dataset returns correct formatted response"""
     json_dataset = datasets.load_dataset(
         "json", data_files=TWITTER_COMPLAINTS_DATA_JSONL, streaming=True
@@ -104,7 +75,7 @@ def test_apply_custom_formatting_template_iterable():
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     formatted_dataset_field = "formatted_data_field"
     formatted_dataset = json_dataset.map(
-        apply_custom_data_formatting_template,
+        apply_custom_jinja_template,
         fn_kwargs={
             "tokenizer": tokenizer,
             "formatted_text_column_name": formatted_dataset_field,
@@ -127,25 +98,6 @@ def test_apply_custom_formatting_template_iterable():
     assert first_sample[formatted_dataset_field] == expected_response
 
 
-def test_apply_custom_formatting_template_gives_error_with_wrong_keys():
-    """Tests that the formatting function will throw error if wrong keys are passed to template"""
-    json_dataset = datasets.load_dataset(
-        "json", data_files=TWITTER_COMPLAINTS_DATA_JSONL
-    )
-    template = "### Input: {{not found}} \n\n ### Response: {{text_label}}"
-    formatted_dataset_field = "formatted_data_field"
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    with pytest.raises(KeyError):
-        json_dataset.map(
-            apply_custom_data_formatting_template,
-            fn_kwargs={
-                "tokenizer": tokenizer,
-                "formatted_text_column_name": formatted_dataset_field,
-                "template": template,
-            },
-        )
-
-
 @pytest.mark.parametrize(
     "template",
     [
diff --git a/tests/data/test_data_preprocessing.py b/tests/data/test_data_preprocessing.py
@@ -35,7 +35,6 @@
     save_dataset_shards,
 )
 from tests.artifacts.predefined_data_configs import (
-    DATA_CONFIG_APPLY_CUSTOM_JINJA_TEMPLATE_YAML,
     DATA_CONFIG_APPLY_CUSTOM_TEMPLATE_YAML,
     DATA_CONFIG_MULTIPLE_DATASETS_SAMPLING_YAML,
     DATA_CONFIG_MULTITURN_DATA_YAML,
@@ -887,10 +886,6 @@ def test_process_dataconfig_file_with_streaming_and_multipack_throws_error(
         (DATA_CONFIG_APPLY_CUSTOM_TEMPLATE_YAML, TWITTER_COMPLAINTS_DATA_JSONL),
         (DATA_CONFIG_APPLY_CUSTOM_TEMPLATE_YAML, TWITTER_COMPLAINTS_DATA_PARQUET),
         (DATA_CONFIG_APPLY_CUSTOM_TEMPLATE_YAML, TWITTER_COMPLAINTS_DATA_ARROW),
-        (DATA_CONFIG_APPLY_CUSTOM_JINJA_TEMPLATE_YAML, TWITTER_COMPLAINTS_DATA_JSON),
-        (DATA_CONFIG_APPLY_CUSTOM_JINJA_TEMPLATE_YAML, TWITTER_COMPLAINTS_DATA_JSONL),
-        (DATA_CONFIG_APPLY_CUSTOM_JINJA_TEMPLATE_YAML, TWITTER_COMPLAINTS_DATA_PARQUET),
-        (DATA_CONFIG_APPLY_CUSTOM_JINJA_TEMPLATE_YAML, TWITTER_COMPLAINTS_DATA_ARROW),
         (DATA_CONFIG_PRETOKENIZE_JSON_DATA_YAML, TWITTER_COMPLAINTS_TOKENIZED_JSON),
         (DATA_CONFIG_PRETOKENIZE_JSON_DATA_YAML, TWITTER_COMPLAINTS_TOKENIZED_JSONL),
         (DATA_CONFIG_PRETOKENIZE_JSON_DATA_YAML, TWITTER_COMPLAINTS_TOKENIZED_PARQUET),
@@ -972,15 +967,10 @@ def test_process_dataconfig_file(data_config_path, data_path):
         (DATA_CONFIG_APPLY_CUSTOM_TEMPLATE_YAML, TWITTER_COMPLAINTS_DATA_JSON, True),
         (DATA_CONFIG_APPLY_CUSTOM_TEMPLATE_YAML, TWITTER_COMPLAINTS_DATA_JSON, False),
         (
-            DATA_CONFIG_APPLY_CUSTOM_JINJA_TEMPLATE_YAML,
+            DATA_CONFIG_APPLY_CUSTOM_TEMPLATE_YAML,
             TWITTER_COMPLAINTS_DATA_JSON,
             True,
         ),
-        (
-            DATA_CONFIG_APPLY_CUSTOM_JINJA_TEMPLATE_YAML,
-            TWITTER_COMPLAINTS_DATA_JSON,
-            False,
-        ),
         (
             DATA_CONFIG_TOKENIZE_AND_APPLY_INPUT_MASKING_YAML,
             TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSON,
diff --git a/tuning/data/data_handlers.py b/tuning/data/data_handlers.py
@@ -20,7 +20,6 @@
 
 # import copy
 import logging
-import re
 
 # Third Party
 from jinja2 import StrictUndefined, TemplateSyntaxError, UndefinedError
@@ -203,54 +202,6 @@ def add_tokenizer_eos_token(
     return {f"{text_column_name}": element[f"{text_column_name}"] + tokenizer.eos_token}
 
 
-def apply_custom_data_formatting_template(
-    element: Dict[str, str],
-    tokenizer: AutoTokenizer,
-    formatted_text_column_name: str,
-    template: str,
-    add_eos_token: bool = True,
-    **kwargs,
-):
-    """Function to format datasets with Alpaca style / other templates.
-       Expects to be run as a HF Map API function.
-    Args:
-        element: the HF Dataset element.
-        tokenizer: Tokenizer to be used for the EOS token, which will be appended
-            when formatting the data into a single sequence. Defaults to empty.
-        formatted_text_column_name: Name of the dataset column where formatted
-                                    text is to be saved. If doesn't exist a new
-                                    column will be created.
-        template: Template to format data with. Features of Dataset
-            should be referred to by {{key}}
-        add_eos_token: should add tokenizer.eos_token to text or not, defaults to True
-    Returns:
-        Formatted Dataset element by formatting dataset with template+tokenizer.EOS_TOKEN
-        Saves the result to formatted_text_column_name argument.
-    """
-
-    if add_eos_token:
-        template += tokenizer.eos_token
-
-    def replace_text(match_obj):
-        captured_groups = match_obj.groups()
-        if len(captured_groups) != 1:
-            raise ValueError(
-                "Unexpectedly captured multiple groups in template formatting"
-            )
-
-        index_object = captured_groups[0]
-        if index_object not in element:
-            raise KeyError("Requested template string is not a valid key in dict")
-
-        return str(element[index_object])
-
-    return {
-        f"{formatted_text_column_name}": re.sub(
-            r"{{([\s0-9a-zA-Z_\-\.]+)}}", replace_text, template
-        )
-    }
-
-
 def apply_custom_jinja_template(
     element: Dict[str, str],
     tokenizer: AutoTokenizer,
@@ -259,7 +210,7 @@ def apply_custom_jinja_template(
     add_eos_token: bool = True,
     **kwargs,
 ):
-    """Function to format datasets with jinja templates.
+    """Function to format datasets with Alpaca style / any other jinja templates.
        Expects to be run as a HF Map API function.
     Args:
         element: the HF Dataset element
@@ -670,12 +621,6 @@ def tokenize_and_apply_chat_template_with_masking(
         allows_batching=False,
         desc="Adding EOS token to text dataset",
     ),
-    "apply_custom_data_formatting_template": DataHandler(
-        op=apply_custom_data_formatting_template,
-        handler_type=DataHandlerType.MAP,
-        allows_batching=False,
-        desc="Formatting dataset with given formatter template",
-    ),
     "apply_custom_jinja_template": DataHandler(
         op=apply_custom_jinja_template,
         handler_type=DataHandlerType.MAP,
diff --git a/tuning/data/setup_dataprocessor.py b/tuning/data/setup_dataprocessor.py
@@ -189,7 +189,7 @@ def _get_dataset_formatting_handlers(data_args, packing, is_padding_free=False):
         fn_kwargs["formatted_text_column_name"] = data_args.dataset_text_field
         fn_kwargs["template"] = data_args.data_formatter_template
         handler = DataHandlerConfig(
-            "apply_custom_data_formatting_template",
+            "apply_custom_jinja_template",
             arguments={"fn_kwargs": fn_kwargs, "batched": False},
         )
     return [handler], data_args.dataset_text_field

Original file line number	Diff line number	Diff line change
`@@ -189,7 +189,7 @@ def _get_dataset_formatting_handlers(data_args, packing, is_padding_free=False):`
`189`	`189`	`fn_kwargs["formatted_text_column_name"] = data_args.dataset_text_field`
`190`	`190`	`fn_kwargs["template"] = data_args.data_formatter_template`
`191`	`191`	`handler = DataHandlerConfig(`
`192`		`- "apply_custom_data_formatting_template",`
	`192`	`+ "apply_custom_jinja_template",`
`193`	`193`	`arguments={"fn_kwargs": fn_kwargs, "batched": False},`
`194`	`194`	`)`
`195`	`195`	`return [handler], data_args.dataset_text_field`