Merge branch 'main' into compile-cache

dushyantbehl · web-flow · commit faa04d90b59e · 2025-04-08T14:06:29.000+05:30
diff --git a/tests/artifacts/predefined_data_configs/__init__.py b/tests/artifacts/predefined_data_configs/__init__.py
@@ -40,6 +40,9 @@
 DATA_CONFIG_MULTITURN_GRANITE_3_1B_DATA_YAML = os.path.join(
     PREDEFINED_DATA_CONFIGS, "multi_turn_data_with_chat_template_granite_3_1B.yaml"
 )
+DATA_CONFIG_MULTITURN_CHAT_TOKENIZE_AND_MASKING_DATA_HANDLER = os.path.join(
+    PREDEFINED_DATA_CONFIGS, "mt_data_granite_3_1B_tokenize_and_mask_handler.yaml"
+)
 DATA_CONFIG_YAML_STREAMING_INPUT_OUTPUT = os.path.join(
     PREDEFINED_DATA_CONFIGS, "tokenize_and_apply_input_masking_streaming.yaml"
 )
diff --git a/tests/artifacts/predefined_data_configs/mt_data_granite_3_1B_tokenize_and_mask_handler.yaml b/tests/artifacts/predefined_data_configs/mt_data_granite_3_1B_tokenize_and_mask_handler.yaml
@@ -0,0 +1,83 @@
+dataprocessor:
+    type: default
+    chat_template: |
+      {%- if messages[0]['role'] == 'system' %}
+          {%- set system_message = messages[0]['content'] %}
+          {%- set loop_messages = messages[1:] %}
+      {%- else %}
+          {%- set system_message = "Knowledge Cutoff Date: April 2024.\nToday's Date: " + strftime_now('%B %d, %Y') + ".\nYou are Granite, developed by IBM." %}
+          {%- if tools and documents %}
+              {%- set system_message = system_message + " You are a helpful AI assistant with access to the following tools. When a tool is required to answer the user's query, respond with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.\n\nWrite the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+          {%- elif tools %}
+              {%- set system_message = system_message + " You are a helpful AI assistant with access to the following tools. When a tool is required to answer the user's query, respond with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request." %}
+          {%- elif documents %}
+              {%- set system_message = system_message + " Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
+          {%- else %}
+              {%- set system_message = system_message + " You are a helpful AI assistant." %}    
+          {%- endif %}
+          {%- if 'citations' in controls and documents %}
+              {%- set system_message = system_message + '\n\nIn your response, use the symbols <co> and </co> to indicate when a fact comes from a document in the search result, e.g <co>0</co> for a fact from document 0. Afterwards, list all the citations with their corresponding documents in an ordered list.' %}
+          {%- endif %}
+          {%- if 'hallucinations' in controls and documents %}
+              {%- set system_message = system_message + '\n\nFinally, after the response is written, include a numbered list of sentences from the response that are potentially hallucinated and not based in the documents.' %}
+          {%- endif %}
+          {%- set loop_messages = messages %}
+      {%- endif %}
+      {{- '<|start_of_role|>system<|end_of_role|>' + system_message + '<|end_of_text|>\n' }}
+      {%- if tools %}
+          {{- '<|start_of_role|>tools<|end_of_role|>' }}
+          {{- tools | tojson(indent=4) }}
+          {{- '<|end_of_text|>\n' }}
+      {%- endif %}
+      {%- if documents %}
+          {{- '<|start_of_role|>documents<|end_of_role|>' }}
+          {%- for document in documents %}
+              {{- 'Document ' + loop.index0 | string + '\n' }}
+              {{- document['text'] }}
+              {%- if not loop.last %}
+                  {{- '\n\n'}}
+              {%- endif%}
+          {%- endfor %}
+          {{- '<|end_of_text|>\n' }}
+      {%- endif %}
+      {%- for message in loop_messages %}
+          {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}
+          {%- if loop.last and add_generation_prompt %}
+              {{- '<|start_of_role|>assistant' }}
+                  {%- if controls %}
+                      {{- ' ' + controls | tojson()}}
+                  {%- endif %}
+              {{- '<|end_of_role|>' }}
+          {%- endif %}
+      {%- endfor %}
+datasets:
+  - name: dataset_1
+    data_paths:
+      - "FILE_PATH"
+    data_handlers:
+      - name: tokenize_and_apply_chat_template_with_masking
+        arguments:
+          remove_columns: all
+          fn_kwargs:
+            max_seq_length: 1024
+            conversation_column: "messages"
+  - name: dataset_2
+    data_paths:
+      - "FILE_PATH"
+    data_handlers:
+      - name: tokenize_and_apply_chat_template_with_masking
+        arguments:
+          remove_columns: all
+          fn_kwargs:
+            max_seq_length: 1024
+            conversation_column: "messages"
+  - name: dataset_3
+    data_paths:
+      - "FILE_PATH"
+    data_handlers:
+      - name: tokenize_and_apply_chat_template_with_masking
+        arguments:
+          remove_columns: all
+          fn_kwargs:
+            max_seq_length: 1024
+            conversation_column: "messages"
diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py
@@ -39,6 +39,7 @@
 from tests.artifacts.predefined_data_configs import (
     DATA_CONFIG_DUPLICATE_COLUMNS,
     DATA_CONFIG_MULTIPLE_DATASETS_SAMPLING_YAML,
+    DATA_CONFIG_MULTITURN_CHAT_TOKENIZE_AND_MASKING_DATA_HANDLER,
     DATA_CONFIG_MULTITURN_DATA_YAML,
     DATA_CONFIG_MULTITURN_GRANITE_3_1B_DATA_YAML,
     DATA_CONFIG_RENAME_RETAIN_COLUMNS,
@@ -1258,6 +1259,14 @@ def test_run_chat_style_ft_using_dataconfig(datafiles, dataconfigfile):
             ],
             DATA_CONFIG_MULTITURN_GRANITE_3_1B_DATA_YAML,
         ),
+        (
+            [
+                CHAT_DATA_MULTI_TURN_GRANITE_3_1B,
+                CHAT_DATA_MULTI_TURN_GRANITE_3_1B,
+                CHAT_DATA_MULTI_TURN_GRANITE_3_1B,
+            ],
+            DATA_CONFIG_MULTITURN_CHAT_TOKENIZE_AND_MASKING_DATA_HANDLER,
+        ),
     ],
 )
 def test_run_chat_style_ft_using_dataconfig_for_chat_template(
@@ -1768,7 +1777,7 @@ def test_pretokenized_dataset_bad_args(dataset_text_field, response_template):
         data_args = copy.deepcopy(DATA_ARGS)
         data_args.dataset_text_field = dataset_text_field
         data_args.response_template = response_template
-        data_args.training_data_path = TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL
+        data_args.training_data_path = TWITTER_COMPLAINTS_TOKENIZED_JSON
         # We should raise an error since we should not have a dataset text
         # field or a response template if we have pretokenized data
         with pytest.raises(ValueError):
diff --git a/tuning/data/data_handlers.py b/tuning/data/data_handlers.py
@@ -24,6 +24,7 @@
 from jinja2 import StrictUndefined, TemplateSyntaxError, UndefinedError
 from jinja2.sandbox import SandboxedEnvironment, SecurityError
 from transformers import AutoTokenizer
+import torch
 
 # Local
 from tuning.utils.config_utils import process_jinja_placeholders
@@ -381,6 +382,128 @@ def skip_large_text(element: Dict[str, str], column_name: str, max_length: int):
     return len(element[column_name]) < max_length
 
 
+def tokenize_and_apply_chat_template_with_masking(
+    element: Dict[str, str],
+    tokenizer: AutoTokenizer,
+    max_seq_length: int = None,
+    conversation_column: str = "messages",
+    **kwargs,
+):
+    """Function to apply chat template to the dataset elements and
+       perform masking to ensure model is trained only on completions.
+       Assumes the dataset is modelled according to the ChatML style format,
+       for example:
+       {"messages": [{"role": "user", "content": "blah"}, ...]}
+
+       Tokenizes the dataset and returns a tokenized element.
+       Requires that max_seq_length is passed so that overly long samples
+       are truncated. If such samples should be skipped instead of being
+       truncated, please apply a filter data handler before this one to
+       drop them.
+
+       Expects to be run as a HF Map function.
+       Ensures that element contains `input_ids`, `labels` and
+       `attention_mask`
+       If used with `remove_columns=all` the dataset can be used
+       directly to train.
+    Args:
+        element: the HF Dataset samples
+        tokenizer: Tokenizer to be used.
+        max_seq_length: Max seq length of the tokens allowed.
+                        Required argument.
+        conversation_column: Name of the column which contains conversations
+                        Typically `messages`
+        kwargs: Unused by this function.
+    Returns:
+        Tokenized element which contains `input_ids` `labels` and `attention_mask`
+        with labels properly masked to train only on completions.
+    """
+
+    # This function is taken from OpenInstruct
+    # https://github.com/allenai/open-instruct/blob/\
+    #   d208aa371976a09152f61991951e981573e7582f/open_instruct/\
+    #   dataset_transformation.py#L632
+
+    messages = element[conversation_column]
+
+    if len(messages) == 0:
+        raise ValueError(
+            f"Contents of the column {conversation_column} must not be empty."
+        )
+
+    # Tokenize the whole sample
+    input_ids = tokenizer.apply_chat_template(
+        conversation=messages,
+        tokenize=True,
+        padding=False,
+        return_tensors="pt",
+        truncation=True,
+        max_length=max_seq_length,
+        add_generation_prompt=False,
+    )
+
+    # clone labels from input ids
+    labels = input_ids.clone()
+
+    # mask the non-assistant part for avoiding loss
+    for message_idx, message in enumerate(messages):
+        if message["role"] != "assistant":
+            # we calculate the start index of this non-assistant message
+            if message_idx == 0:
+                message_start_idx = 0
+            else:
+                message_start_idx = tokenizer.apply_chat_template(
+                    conversation=messages[
+                        :message_idx
+                    ],  # here marks the end of the previous messages
+                    tokenize=True,
+                    padding=False,
+                    return_tensors="pt",
+                    truncation=True,
+                    max_length=max_seq_length,
+                    add_generation_prompt=False,
+                ).shape[1]
+            # next, we calculate the end index of this non-assistant message
+            if (
+                message_idx < len(messages) - 1
+                and messages[message_idx + 1]["role"] == "assistant"
+            ):
+                # for intermediate messages that are followed by an assistant message,
+                # we need to set `add_generation_prompt=True` to avoid the assistant
+                # generation prefix being included in the loss (e.g., `<|assistant|>`)
+                message_end_idx = tokenizer.apply_chat_template(
+                    conversation=messages[: message_idx + 1],
+                    tokenize=True,
+                    return_tensors="pt",
+                    padding=False,
+                    truncation=True,
+                    max_length=max_seq_length,
+                    add_generation_prompt=True,
+                ).shape[1]
+            else:
+                # for the last message or a message that is not followed by
+                # an assistant message, we don't need to add the assistant generation prefix
+                message_end_idx = tokenizer.apply_chat_template(
+                    conversation=messages[: message_idx + 1],
+                    tokenize=True,
+                    return_tensors="pt",
+                    padding=False,
+                    truncation=True,
+                    max_length=max_seq_length,
+                    add_generation_prompt=False,
+                ).shape[1]
+            # set the label to -100 for the non-assistant part
+            labels[:, message_start_idx:message_end_idx] = -100
+            if max_seq_length and message_end_idx >= max_seq_length:
+                break
+    attention_mask = torch.ones_like(input_ids)
+    return {
+        "input_ids": input_ids.flatten(),
+        "labels": labels.flatten(),
+        "attention_mask": attention_mask.flatten(),
+    }
+
+
 AVAILABLE_DATA_HANDLERS = {
     "tokenize_and_apply_input_masking": DataHandler(
         op=tokenize_and_apply_input_masking,
@@ -407,6 +530,11 @@ def skip_large_text(element: Dict[str, str], column_name: str, max_length: int):
         handler_type=DataHandlerType.MAP,
         allows_batching=False,
     ),
+    "tokenize_and_apply_chat_template_with_masking": DataHandler(
+        op=tokenize_and_apply_chat_template_with_masking,
+        handler_type=DataHandlerType.MAP,
+        allows_batching=False,
+    ),
     "duplicate_columns": DataHandler(
         op=duplicate_columns,
         handler_type=DataHandlerType.MAP,
diff --git a/tuning/data/data_preprocessing_utils.py b/tuning/data/data_preprocessing_utils.py
@@ -65,6 +65,22 @@ def get_data_collator(
         # packing for non tokenized dataset doesn't require a collator with SFTrainer.
         return None
 
+    if is_padding_free:
+        # when packing is false but padding_free is used and
+        # no response template is used then it's a pretrained scenario.
+        # Current plugin in fms-acceleration is compatible with
+        # `DataCollatorForSeq2Seq` collator hence we use this.
+        return DataCollatorForSeq2Seq(
+            tokenizer=tokenizer, padding=False, max_length=max_seq_length
+        )
+
+    if is_traindata_tokenized:
+        # Note that this automatically pads labels with -100
+        # TODO check if this is sufficient for preprocessed
+        return DataCollatorForSeq2Seq(
+            tokenizer=tokenizer, padding=True, max_length=max_seq_length
+        )
+
     # TODO: near term - how response template ids are parsed out needs to be cleaned.
     # The [2:] here applies if response template has \n prefix, it is needed to strip \n,
     # otherwise template is not found. We will create issue to clean this out after we discuss
@@ -88,22 +104,6 @@ def get_data_collator(
             ignore_index=configs.IGNORE_INDEX,
         )
 
-    if is_padding_free:
-        # when packing is false but padding_free is used and
-        # no response template is used then its a pretrained scenario.
-        # Current plugin in fms-acceleration is compatible with
-        # `DataCollatorForSeq2Seq` collator hence we use this.
-        return DataCollatorForSeq2Seq(
-            tokenizer=tokenizer, padding=False, max_length=max_seq_length
-        )
-
-    if is_traindata_tokenized:
-        # Note that this automatically pads labels with -100
-        # TODO check if this is sufficient for preprocessed
-        return DataCollatorForSeq2Seq(
-            tokenizer=tokenizer, padding=True, max_length=max_seq_length
-        )
-
     raise ValueError(
         "Could not pick a data collator. Please refer to supported data formats"
     )

Original file line number	Diff line number	Diff line change
`@@ -40,6 +40,9 @@`
`40`	`40`	`DATA_CONFIG_MULTITURN_GRANITE_3_1B_DATA_YAML = os.path.join(`
`41`	`41`	`PREDEFINED_DATA_CONFIGS, "multi_turn_data_with_chat_template_granite_3_1B.yaml"`
`42`	`42`	`)`
	`43`	`+DATA_CONFIG_MULTITURN_CHAT_TOKENIZE_AND_MASKING_DATA_HANDLER = os.path.join(`
	`44`	`+ PREDEFINED_DATA_CONFIGS, "mt_data_granite_3_1B_tokenize_and_mask_handler.yaml"`
	`45`	`+)`
`43`	`46`	`DATA_CONFIG_YAML_STREAMING_INPUT_OUTPUT = os.path.join(`
`44`	`47`	`PREDEFINED_DATA_CONFIGS, "tokenize_and_apply_input_masking_streaming.yaml"`
`45`	`48`	`)`