Merge pull request #3895 from AI-Hypercomputer:igorts/dpo-input-processing

Google-ML-Automation · Google-ML-Automation · commit 60bc7f9b7e1a · 2026-05-20T11:10:49.000-07:00
PiperOrigin-RevId: 918547107
diff --git a/src/maxtext/configs/base.yml b/src/maxtext/configs/base.yml
@@ -86,7 +86,7 @@ checkpoint_conversion_fn: none
 # optional checkpoint context to use for loading. options: "orbax", "safetensors"
 source_checkpoint_layout: "orbax"
 
-# Only applicable to Single Controller/Pathways on Cloud. Experimental feature, under testing 
+# Only applicable to Single Controller/Pathways on Cloud. Experimental feature, under testing
 colocated_python_checkpointing: False
 
 # enables autocheckpoint, which saves a checkpoint at the preemption step.
@@ -451,7 +451,7 @@ hardware: 'tpu' # Supported hardware types are 'tpu', 'gpu', 'gpu_multiprocess'
 # internal_compile allows bypassing open-source topology name mappings when using internal topologies directly via get_topology_desc.
 internal_compile: False
 internal_compile_num_devices: -1 # You must specify the number of devices when using internal_compile.
-compile_xla_flags: "" # Compiler options e.g. compile_xla_flags="--xla_tpu_num_sparse_cores_for_gather_offloading=1 --xla_tpu_scoped_vmem_limit_kib=65536" 
+compile_xla_flags: "" # Compiler options e.g. compile_xla_flags="--xla_tpu_num_sparse_cores_for_gather_offloading=1 --xla_tpu_scoped_vmem_limit_kib=65536"
 
 # Parallelism
 shard_mode: "auto" # can be either auto or explicit
@@ -564,8 +564,8 @@ logical_axis_rules: [
                       # ==========================================
                       # Deprecated / Scheduled for Removal
                       # ==========================================
-                      ['mlp_no_fsdp', ['tensor', 'tensor_sequence', 'autoregressive']], 
-                      ['embed_tensor_transpose', ['tensor_transpose']],                 
+                      ['mlp_no_fsdp', ['tensor', 'tensor_sequence', 'autoregressive']],
+                      ['embed_tensor_transpose', ['tensor_transpose']],
                       ['exp_with_fsdp', 'fsdp'],
                   ]
 # Axes used for DCN must be earlier in this list than ICI, see (b/339009148) for details
@@ -680,8 +680,6 @@ global_rampup_samples: 500
 
 # direct preference optimization (DPO)
 use_dpo: False
-dpo_label_smoothing: 0.0
-dpo_beta: 0.1
 
 # Supervised Fine-Tuning (SFT)
 use_sft: False
@@ -1206,7 +1204,7 @@ use_jax_splash: false
 # Path to the HuggingFace-style config directory for the adapter (e.g. src/maxtext/integration/vllm/maxtext_vllm_adapter)
 vllm_hf_config_path: ""
 # A JSON string of overrides to apply to the HuggingFace-style config for the vLLM adapter.
-# This can be used to override specific settings without modifying the original config file. 
+# This can be used to override specific settings without modifying the original config file.
 vllm_hf_overrides: {}
 # JSON string containing additional configuration for the vLLM model (e.g. '{"maxtext_config": {...}}')
 vllm_additional_config: {}
@@ -1221,7 +1219,7 @@ sinkhorn_iterations: 20
 
 ################################## DeepSeek Engram ##################################
 # Indices of transformer layers where Engram are integrated; leave empty [] to disable.
-# Example: [1, 4] attaches to the 2nd and 5th layer. 
+# Example: [1, 4] attaches to the 2nd and 5th layer.
 engram_layers: []
 # The max 'n' in N-gram. Example: n=3 means it covers both 2-grams and 3-grams.
 engram_max_ngram_size: 3
diff --git a/src/maxtext/configs/post_train/dpo.yml b/src/maxtext/configs/post_train/dpo.yml
@@ -1,6 +1,12 @@
 base_config: "base.yml"
 
 use_dpo: true
+dpo:
+  algo: 'dpo'
+  orpo_lambda: 0.1
+  dpo_label_smoothing: 0.0
+  dpo_beta: 0.1
+  max_prompt_length: null
 packing: false
 train_data_columns: ['chosen', 'rejected']
 eval_data_columns: ['chosen', 'rejected']
@@ -24,8 +30,6 @@ hf_eval_split: 'test'
 
 gradient_clipping_threshold: 10.0
 learning_rate: 5.0e-7
-dpo_label_smoothing: 0.0
-dpo_beta: 0.1
 
 enable_goodput_recording: false
 monitor_goodput: false
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -1206,12 +1206,24 @@ class OlmoGrainDataset(BaseModel):
   olmo_apply_ngram_filter: bool = Field(True, description="Mask repetitive instances per OLMo-core's repetition filter.")
 
 
+class DPO(BaseModel):
+  """Settings for preference optimization (DPO or ORPO); `algo` selects which algorithm is used."""
+
+  algo: Literal["dpo", "orpo"] = Field("dpo", description="Alignment algorithm to use.")
+  dpo_beta: float = Field(0.1, description="Beta parameter for DPO.")
+  orpo_lambda: float = Field(0.1, description="Weight for preference loss in ORPO.")
+  dpo_label_smoothing: float = Field(0.0, ge=0.0, le=1.0, description="Label smoothing for DPO.")
+  max_prompt_length: int | None = Field(
+      None,
+      gt=0,
+      description="Maximum length for prompt. If None, defaults to half of max_target_length.",
+  )
+
+
 class FineTuning(BaseModel):
   """Configuration for fine-tuning methods like DPO, SFT, and GRPO."""
 
   use_dpo: bool = Field(False, description="If True, enables Direct Preference Optimization training.")
-  dpo_label_smoothing: float = Field(0.0, ge=0.0, le=1.0, description="Label smoothing for DPO.")
-  dpo_beta: float = Field(0.1, description="Beta parameter for DPO.")
   use_sft: bool = Field(False, description="If True, enables Supervised Fine-Tuning.")
   sft_train_on_completion_only: bool = Field(
       False, description="If True, trains only on the completion part of the text."
@@ -2303,6 +2315,10 @@ class MaxTextConfig(
   """
 
   debug: Debug = Field(default_factory=Debug, description="Configuration for debugging options.")
+  dpo: DPO = Field(
+      default_factory=DPO,
+      description="Configuration for DPO and ORPO alignment algorithms.",
+  )
   rl: RL = Field(
       default_factory=RL,
       description="Configuration for RL algorithms like Group Relative Policy Optimization (GRPO).",
@@ -2889,6 +2905,16 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
           raise ValueError("For multimodal SFT, `sft_train_on_completion_only` must be True.")
         if self.packing:
           raise ValueError("For multimodal SFT, `packing` is not yet supported.")
+    if self.use_dpo:
+      if self.packing:
+        raise ValueError("For DPO/ORPO, `packing` is not supported.")
+      if self.dpo.max_prompt_length is not None and self.dpo.max_prompt_length >= self.max_target_length:
+        raise ValueError(
+            f"dpo.max_prompt_length ({self.dpo.max_prompt_length}) must be less than max_target_length"
+            f" ({self.max_target_length})."
+        )
+    if self.use_sft and self.use_dpo:
+      raise ValueError("Only one of `use_sft` or `use_dpo` can be True.")
     if self.shard_mode == ShardMode.EXPLICIT:
       supported_decoders = {"simple", "simple_mlp", "llama2", "deepseek"}
       if self.decoder_block.value not in supported_decoders:
diff --git a/src/maxtext/input_pipeline/dpo_utils.py b/src/maxtext/input_pipeline/dpo_utils.py
@@ -0,0 +1,108 @@
+# Copyright 2023–2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""DPO specific input pipeline utilities."""
+
+import dataclasses
+import grain.python as grain
+import numpy as np
+
+
+@dataclasses.dataclass
+class DPODataFormatting(grain.MapTransform):
+  """Prepares DPO data.
+  Renames input columns, extracts common prefix if needed, generates masks, and performs
+  DPO-aware padding (left-padded prompts, right-padded responses).
+  """
+
+  pad_id: int
+  max_target_length: int
+  data_column_names: tuple[str, ...]
+  max_prompt_length: int | None = None
+
+  def map(self, element):
+    "Apply the dataset transformations for DPO."
+    # 1. Reformat/Extract Columns
+    try:
+      if len(self.data_column_names) == 3:
+        input_ids = element[self.data_column_names[0]]
+        chosen_ids = element[self.data_column_names[1]]
+        rejected_ids = element[self.data_column_names[2]]
+      elif len(self.data_column_names) == 2:
+        # Support for datasets like Anthropic/hh-rlhf where prompt is a common prefix
+        full_chosen = element[self.data_column_names[0]]
+        full_rejected = element[self.data_column_names[1]]
+
+        # Find common prefix length
+        prefix_len = 0
+        for c, r in zip(full_chosen, full_rejected):
+          if c != r:
+            break
+          prefix_len += 1
+        input_ids = full_chosen[:prefix_len]
+        chosen_ids = full_chosen[prefix_len:]
+        rejected_ids = full_rejected[prefix_len:]
+      else:
+        raise ValueError(f"DPODataFormatting expects 2 or 3 columns, got {len(self.data_column_names)}")
+    except KeyError as e:
+      raise KeyError(
+          f"Column '{e.args[0]}' not found in the dataset. "
+          f"Expected columns: {self.data_column_names}. "
+          f"Available columns: {list(element.keys())}. "
+          "Please verify that 'train_data_columns' and 'eval_data_columns' match your dataset."
+      ) from e
+
+    # 2. Padding and Masking
+    max_prompt_length = self.max_prompt_length or (self.max_target_length // 2)
+    max_response_length = self.max_target_length - max_prompt_length
+
+    assert max_prompt_length > 0, (
+        "max_prompt_length must be positive. " "Check the configs for 'max_prompt_length' and 'max_target_length'."
+    )
+    assert max_response_length > 0, (
+        "max_response_length must be positive. " "Check the configs for 'max_prompt_length' and 'max_target_length'."
+    )
+
+    prompt_ids = self._pad(input_ids, max_prompt_length, left=True)
+    chosen_ids = self._pad(chosen_ids, max_response_length, left=False)
+    rejected_ids = self._pad(rejected_ids, max_response_length, left=False)
+
+    # Remove old columns if they exist
+    for key in self.data_column_names:
+      if key in element:
+        del element[key]
+
+    element["prompt_ids"] = prompt_ids
+    element["chosen_ids"] = chosen_ids
+    element["rejected_ids"] = rejected_ids
+    element["prompt_mask"] = (prompt_ids != self.pad_id).astype(np.int32)
+    element["chosen_mask"] = (chosen_ids != self.pad_id).astype(np.int32)
+    element["rejected_mask"] = (rejected_ids != self.pad_id).astype(np.int32)
+    return element
+
+  def _pad(self, x, length, left=False):
+    """Pads or trims an array to a specific length.
+
+    When left=True (for prompts), trims from the left to keep the suffix (closest context).
+    When left=False (for responses), trims from the right to keep the prefix.
+    """
+    x = np.asarray(x)
+    pad_amount = max(length - x.shape[0], 0)
+    if left:
+      pad_width = ((pad_amount, 0),)
+      x_trimmed = x[-length:]
+    else:
+      pad_width = ((0, pad_amount),)
+      x_trimmed = x[:length]
+    return np.pad(x_trimmed, pad_width, constant_values=self.pad_id).astype(np.int32)
diff --git a/src/maxtext/input_pipeline/hf_data_processing.py b/src/maxtext/input_pipeline/hf_data_processing.py
@@ -1,4 +1,4 @@
-# Copyright 2023–2025 Google LLC
+# Copyright 2023–2026 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,9 +24,8 @@
 
 import grain.python as grain
 
-import numpy as np
-
 from maxtext.input_pipeline import data_processing_utils
+from maxtext.input_pipeline import dpo_utils
 from maxtext.input_pipeline import input_pipeline_utils
 from maxtext.input_pipeline import instruction_data_processing
 from maxtext.input_pipeline import multihost_dataloading
@@ -214,7 +213,7 @@ def preprocessing_pipeline(
     num_threads=1,
     drop_remainder=True,
     generate_padding_batch=False,
-    use_dpo=None,
+    use_dpo=False,
     use_sft=None,
     use_tunix_gradient_accumulation=False,
     num_microbatches=1,
@@ -330,19 +329,12 @@ def preprocessing_pipeline(
         )
     )
     data_column_names = ("inputs", "targets")
-  elif use_dpo:
-
-    def lists2array(x):
-      """Convert lists/tuples to array"""
-      return jax.tree.map(np.asarray, x, is_leaf=lambda y: isinstance(y, (list, tuple)))
-
-    operations.append(grain.MapOperation(lists2array))
-  else:
+  elif not use_dpo:
     assert len(data_column_names) == 1
     operations.append(input_pipeline_utils.HFNormalizeFeatures(data_column_names[0]))
     data_column_names = ("inputs", "targets")
 
-  if packing and not use_dpo:
+  if packing:
     length_struct = {col: max_target_length for col in data_column_names}
     max_segments = max_segments_per_seq
     if max_segments is not None and max_segments <= 0:
@@ -356,7 +348,12 @@ def lists2array(x):
     )
     operations.append(input_pipeline_utils.ReformatPacking(data_column_names))
   else:
-    operations.append(input_pipeline_utils.PadOrTrimToMaxLength(max_target_length, pad_id))
+    if use_dpo:
+      # Renames arbitrary DPO columns and performs DPO-aware padding.
+      max_prompt_length = config.dpo.max_prompt_length
+      operations.append(dpo_utils.DPODataFormatting(pad_id, max_target_length, data_column_names, max_prompt_length))
+    else:
+      operations.append(input_pipeline_utils.PadOrTrimToMaxLength(max_target_length, pad_id))
     operations.append(grain.Batch(batch_size=batch_size, drop_remainder=drop_remainder))
 
   if shift and not use_dpo:
diff --git a/tests/post_training/unit/dpo_data_processing_test.py b/tests/post_training/unit/dpo_data_processing_test.py
diff --git a/tests/post_training/unit/dpo_hooks_test.py b/tests/post_training/unit/dpo_hooks_test.py