Enable Tunix-based DPO input processing for Grain

igorts-git · igorts-git · commit 86f715faa127 · 2026-06-02T09:38:10.000-07:00
diff --git a/src/maxtext/configs/post_train/dpo.yml b/src/maxtext/configs/post_train/dpo.yml
@@ -8,24 +8,21 @@ dpo:
   dpo_beta: 0.1
   max_prompt_length: null
 packing: false
-train_data_columns: ['chosen', 'rejected']
-eval_data_columns: ['chosen', 'rejected']
-base_output_directory: 'gs://maxtext-external/logs'
 
 per_device_batch_size: 2.0
 steps: 10
 max_target_length: 512
 eval_interval: 5  # test eval once, in the middle of 10 training steps
 eval_steps: 2
 
-# TFDS Pipeline ----------------------
-dataset_type: tfds
-dataset_path: 'gs://maxtext-dataset/dpo/anthropic_rlhf'
-dataset_name: 'tfds:1.0.0'
-eval_dataset_name: 'tfds:1.0.0'
-eval_split: 'test'
-
-# HF Pipeline -------------------------
+# Some reasonable defaults for running DPO without extra config params.
+model_name: "qwen3-0.6b"
+tokenizer_path: "src/maxtext/assets/tokenizers/qwen3-tokenizer"
+tokenizer_type: "huggingface"
+dataset_type: hf
+hf_path: 'Anthropic/hh-rlhf'
+train_data_columns: ['chosen', 'rejected']
+eval_data_columns: ['chosen', 'rejected']
 hf_eval_split: 'test'
 
 gradient_clipping_threshold: 10.0
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -3043,6 +3043,12 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
       logger.warning(
           "tfds pipeline is deprecated. Use dataset_type=grain, grain_file_type=tfrecord, and provide grain_train_files."
       )
+      if self.use_dpo:
+        raise ValueError(
+            "TFDS dataset_type=tfds is not supported for DPO training"
+            " (config.use_dpo=True). Please use dataset_type=grain or"
+            " dataset_type=hf instead."
+        )
       if not self.dataset_name:
         raise ValueError("dataset_name can't be empty when dataset_type=tfds")
       if self.eval_interval > 0 and not self.eval_split:
diff --git a/src/maxtext/input_pipeline/grain_data_processing.py b/src/maxtext/input_pipeline/grain_data_processing.py
@@ -29,6 +29,7 @@
 from maxtext.input_pipeline import data_processing_utils
 from maxtext.input_pipeline import input_pipeline_utils
 from maxtext.input_pipeline import grain_tokenizer
+from maxtext.input_pipeline import dpo_utils
 from maxtext.input_pipeline import multihost_dataloading
 from maxtext.utils import gcs_utils
 from maxtext.utils import max_logging
@@ -263,6 +264,46 @@ def pretrain_preprocessing_pipeline(
   return dataset
 
 
+def dpo_preprocessing_pipeline(
+    dataset,
+    config,
+    data_columns,
+    tokenize,
+    grain_worker_count,
+    grain_per_worker_buffer_size,
+):
+  """Pre-process a Grain dataset for DPO fine-tuning and return the processed dataset."""
+  dataset = data_processing_utils.parse_and_keep_features(dataset, config, data_columns, tokenize)
+  tokenizer_model, pad_id = data_processing_utils.get_tokenizer_and_pad_id(config)
+
+  if tokenize:
+    dataset = dataset.map(grain_tokenizer.TokenizeAndTrim(data_columns, config.max_target_length, tokenizer_model))
+
+  # Renames arbitrary DPO columns and performs DPO-aware padding.
+  max_prompt_length = config.dpo.max_prompt_length  # null (None) in dpo.yml unless overridden
+  dataset = dataset.map(
+      dpo_utils.DPODataFormatting(
+          pad_id=pad_id,
+          max_target_length=config.max_target_length,
+          data_column_names=data_columns,
+          max_prompt_length=max_prompt_length,
+      )
+  )
+
+  batch_size = data_processing_utils.get_local_batch_size(config)
+  if config.grain_use_elastic_iterator:
+    # ElasticIterator batches internally, so return the pre-batch dataset.
+    pass
+  else:
+    batch_fn = functools.partial(grain.experimental.batch_and_pad, batch_size=batch_size, pad_value=pad_id)
+    dataset = dataset.batch(batch_size, batch_fn=batch_fn)
+
+  dataset = data_processing_utils.apply_multiprocessing_and_prefetch(
+      dataset, config, grain_worker_count, grain_per_worker_buffer_size
+  )
+  return dataset
+
+
 def _format_chat_template_grain(element, data_columns, tokenizer_model):
   """Grain-compatible mapping function to format raw columns into conversational messages."""
   # Convert raw columns to conversational messages
@@ -350,6 +391,8 @@ def sft_preprocessing_pipeline(
 
 def _get_pipeline_fn(config):
   """Returns the appropriate preprocessing pipeline function based on config."""
+  if config.use_dpo:
+    return dpo_preprocessing_pipeline
   if config.use_sft:
     return sft_preprocessing_pipeline
   return pretrain_preprocessing_pipeline
diff --git a/tests/post_training/unit/dpo_data_processing_test.py b/tests/post_training/unit/dpo_data_processing_test.py
@@ -23,9 +23,11 @@
 import pytest
 import transformers
 
+import grain.python as grain
 from maxtext.configs import pyconfig
 from maxtext.input_pipeline import dpo_utils
 from maxtext.input_pipeline import hf_data_processing
+from maxtext.input_pipeline import grain_data_processing
 from maxtext.input_pipeline import input_pipeline_interface
 from maxtext.utils.globals import MAXTEXT_ASSETS_ROOT, MAXTEXT_CONFIGS_DIR, MAXTEXT_PKG_DIR
 
@@ -389,5 +391,154 @@ def test_dpo_non_positive_max_prompt_length(self):
       )
 
 
+@pytest.mark.external_training
+class TestGrainDPOPipelineProcessing(unittest.TestCase):
+  """End-to-end Grain DPO pipeline processing tests."""
+
+  def setUp(self):
+    super().setUp()
+    self.config = pyconfig.initialize_pydantic(
+        [
+            os.path.join(MAXTEXT_PKG_DIR, "dpo_trainer"),
+            os.path.join(MAXTEXT_CONFIGS_DIR, "post_train", "dpo.yml"),
+        ],
+        per_device_batch_size=2,
+        run_name="test",
+        mesh_axes=["data"],
+        logical_axis_rules=[["batch", "data"]],
+        data_sharding=["data"],
+        base_output_directory="gs://max-experiments/",
+        tokenizer_path=os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizers", "qwen3-tokenizer"),
+        train_split="train",
+        enable_checkpointing=False,
+        use_dpo=True,
+        enable_data_shuffling=False,
+        max_target_length=64,
+        grain_file_type="parquet",  # to trigger KeepFeatures in parse_and_keep_features
+        tokenizer_type="huggingface",
+        dataset_type="grain",
+        grain_train_files="dummy",
+        eval_interval=0,
+    )
+    self.mesh_shape_1d = (len(jax.devices()),)
+    self.mesh = Mesh(mesh_utils.create_device_mesh(self.mesh_shape_1d), self.config.mesh_axes)
+    self.process_indices = input_pipeline_interface.get_process_loading_real_data(
+        self.config.data_sharding,
+        self.config.global_batch_size_to_load,
+        self.config.global_batch_size_to_train_on,
+        self.config.max_target_length,
+        self.mesh,
+    )
+    self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+        self.config.tokenizer_path,
+        add_bos_token=False,
+        add_eos_token=False,
+        legacy=False,
+    )
+    self.pad_id = hf_data_processing._get_pad_id(self.tokenizer)  # pylint: disable=protected-access
+
+  def get_data_iterator(self, list_of_dicts, data_columns):
+    """Run the Grain DPO pipeline over in-memory rows and return an iterator over its output."""
+    dataset = grain.MapDataset.source(list_of_dicts)
+    dataset = dataset[self.process_indices.index(jax.process_index()) :: len(self.process_indices)]
+    dataset = dataset.to_iter_dataset()
+
+    iter_ds = grain_data_processing.dpo_preprocessing_pipeline(
+        dataset=dataset,
+        config=self.config,
+        data_columns=data_columns,
+        tokenize=self.config.tokenize_train_data,
+        grain_worker_count=0,
+        grain_per_worker_buffer_size=1,
+    )
+    return iter(iter_ds)
+
+  def test_dpo_format_3_columns(self):
+    """Verify that the 3-column explicit DPO dataset is processed correctly."""
+    prompt_str = "Question: What is 2+2?"
+    chosen_str = "Answer: 4"
+    rejected_str = "Answer: 5"
+
+    list_of_dicts = [
+        {
+            "input": prompt_str,
+            "chosen": chosen_str,
+            "rejected": rejected_str,
+        }
+        for _ in range(10)
+    ]
+
+    data_iter = self.get_data_iterator(list_of_dicts, ["input", "chosen", "rejected"])
+    batch = next(data_iter)
+
+    # Verify expected keys
+    for key in (
+        "prompt_ids",
+        "chosen_ids",
+        "rejected_ids",
+        "prompt_mask",
+        "chosen_mask",
+        "rejected_mask",
+    ):
+      self.assertIn(key, batch)
+
+    # Verify shapes: rows equal global_batch_size_to_load; max_target_length is halved into prompt/response
+    max_prompt_len = self.config.max_target_length // 2
+    max_response_len = self.config.max_target_length - max_prompt_len
+    self.assertEqual(
+        batch["prompt_ids"].shape,
+        (self.config.global_batch_size_to_load, max_prompt_len),
+    )
+    self.assertEqual(
+        batch["chosen_ids"].shape,
+        (self.config.global_batch_size_to_load, max_response_len),
+    )
+    self.assertEqual(
+        batch["rejected_ids"].shape,
+        (self.config.global_batch_size_to_load, max_response_len),
+    )
+
+    # Verify decoded content directly
+    decoded_prompt = self.tokenizer.decode(batch["prompt_ids"][0], skip_special_tokens=True)
+    decoded_chosen = self.tokenizer.decode(batch["chosen_ids"][0], skip_special_tokens=True)
+    decoded_rejected = self.tokenizer.decode(batch["rejected_ids"][0], skip_special_tokens=True)
+
+    self.assertEqual(decoded_prompt, prompt_str)
+    self.assertEqual(decoded_chosen, chosen_str)
+    self.assertEqual(decoded_rejected, rejected_str)
+
+    # Verify mask structure (left padding for prompt -> 1s at the end; right padding for responses -> 1s at start)
+    self.assertEqual(batch["prompt_mask"][0][-1], 1)
+    self.assertEqual(batch["chosen_mask"][0][0], 1)
+    self.assertEqual(batch["rejected_mask"][0][0], 1)
+
+  def test_dpo_format_2_columns(self):
+    """Verify that 2-column DPO datasets correctly extract common prefixes."""
+    # We use a clear common prefix and different suffixes
+    prefix = "Common prompt context for DPO:"
+    chosen_suffix = " the chosen completion"
+    rejected_suffix = " the rejected completion"
+
+    list_of_dicts = [
+        {
+            "chosen": prefix + chosen_suffix,
+            "rejected": prefix + rejected_suffix,
+        }
+        for _ in range(10)
+    ]
+
+    data_iter = self.get_data_iterator(list_of_dicts, ["chosen", "rejected"])
+    batch = next(data_iter)
+
+    # Verify decoded extracted prefix and completions robustly against BPE token boundary quirks
+    decoded_prompt = self.tokenizer.decode(batch["prompt_ids"][0], skip_special_tokens=True)
+    decoded_chosen = self.tokenizer.decode(batch["chosen_ids"][0], skip_special_tokens=True)
+    decoded_rejected = self.tokenizer.decode(batch["rejected_ids"][0], skip_special_tokens=True)
+
+    self.assertIn("Common prompt context", decoded_prompt)
+    self.assertIn("chosen", decoded_chosen)
+    self.assertIn("rejected", decoded_rejected)
+
+
 if __name__ == "__main__":
   unittest.main()