Merge pull request #4031 from AI-Hypercomputer:pr/dataset-processor-path

Google-ML-Automation · Google-ML-Automation · commit f93627fc6708 · 2026-06-08T14:47:06.000-07:00
PiperOrigin-RevId: 928781201
diff --git a/src/maxtext/configs/post_train/rl.yml b/src/maxtext/configs/post_train/rl.yml
@@ -227,6 +227,11 @@ skip_jax_distributed_system: true
 #      Loads separate dataset for training and evaluation (e.g., train on OpenMathInstruct-2, eval on GSM8K).
 dataset_name: 'openai/gsm8k'
 eval_dataset_name: 'openai/gsm8k'
+# Optional: path to a user-provided Python file with a custom `process_data`
+# function. Signature: process_data(dataset_name, model_tokenizer, template_config,
+# tmvp_config, x) -> dict with keys {prompts, question, answer}. When empty
+# (default), the built-in utils_rl.process_data is used.
+dataset_processor_path: ''
 train_split: 'train'
 eval_split: 'test'
 hf_name: 'main' # subset of Hugging Face dataset
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -2024,6 +2024,13 @@ class RLDataset(BaseModel):
   train_fraction: float = Field(1.0, description="Fraction of the dataset to be used for training.")
   train_micro_batch_size: int = Field(-1, description="Micro batch size for training.")
   rollout_micro_batch_size: int = Field(-1, description="Micro batch size for rollout.")
+  dataset_processor_path: str = Field(
+      "",
+      description=(
+          "Optional path to a user-provided Python file with a `process_data` function. "
+          "When set, replaces the built-in dataset processor for custom datasets."
+      ),
+  )
 
 
 class RLEvaluation(BaseModel):
diff --git a/src/maxtext/trainers/post_train/rl/train_rl.py b/src/maxtext/trainers/post_train/rl/train_rl.py
@@ -254,6 +254,17 @@ def prepare_datasets(
         f"Chat template is required for processing dataset but failed to load from {trainer_config.chat_template_path}"
     )
 
+  # Optional user-provided `process_data(dataset_name, model_tokenizer, template_config, tmvp_config, x) -> dict`.
+  # When `dataset_processor_path` is set in config, load that file's `process_data`
+  # and use it instead of the built-in utils_rl.process_data. Lets users adapt
+  # custom datasets (with non-standard answer columns / cleaning) without editing maxtext.
+  _custom_processor_path = getattr(trainer_config, "dataset_processor_path", "") or ""
+  if _custom_processor_path:
+    _process_data = utils_rl.load_custom_callable(_custom_processor_path, "process_data")
+    max_logging.log(f"prepare_datasets: using custom process_data from {_custom_processor_path}")
+  else:
+    _process_data = utils_rl.process_data
+
   # Prepare train and test data from training data for certain datasets
   eval_dataset_name = getattr(trainer_config, "eval_dataset_name", None)
   test_dataset = None
@@ -272,22 +283,14 @@ def prepare_datasets(
     train_dataset = (
         grain.MapDataset.source(splits["train"])
         .shuffle(seed=trainer_config.data_shuffle_seed)
-        .map(
-            lambda x: utils_rl.process_data(
-                trainer_config.dataset_name, model_tokenizer, template_config, trainer_config, x
-            )
-        )
+        .map(lambda x: _process_data(trainer_config.dataset_name, model_tokenizer, template_config, trainer_config, x))
     )
 
     if trainer_config.num_test_batches > 0:
       test_dataset = (
           grain.MapDataset.source(splits["validation"])
           .shuffle(seed=trainer_config.data_shuffle_seed)
-          .map(
-              lambda x: utils_rl.process_data(
-                  trainer_config.dataset_name, model_tokenizer, template_config, trainer_config, x
-              )
-          )
+          .map(lambda x: _process_data(trainer_config.dataset_name, model_tokenizer, template_config, trainer_config, x))
       )
   else:
     if not eval_dataset_name:
@@ -302,11 +305,7 @@ def prepare_datasets(
     train_dataset = (
         grain.MapDataset.source(train_dataset)
         .shuffle(seed=trainer_config.data_shuffle_seed)
-        .map(
-            lambda x: utils_rl.process_data(
-                trainer_config.dataset_name, model_tokenizer, template_config, trainer_config, x
-            )
-        )
+        .map(lambda x: _process_data(trainer_config.dataset_name, model_tokenizer, template_config, trainer_config, x))
     )
 
     if trainer_config.num_test_batches > 0:
@@ -319,7 +318,7 @@ def prepare_datasets(
       test_dataset = (
           grain.MapDataset.source(test_dataset)
           .shuffle(seed=trainer_config.data_shuffle_seed)
-          .map(lambda x: utils_rl.process_data(eval_dataset_name, model_tokenizer, template_config, trainer_config, x))
+          .map(lambda x: _process_data(eval_dataset_name, model_tokenizer, template_config, trainer_config, x))
       )
 
   def _filter_long_prompts(x):
diff --git a/src/maxtext/trainers/post_train/rl/utils_rl.py b/src/maxtext/trainers/post_train/rl/utils_rl.py
@@ -14,8 +14,10 @@
 
 # pylint: disable=bare-except, consider-using-generator, chained-comparison, broad-exception-caught
 """RL Utils Module."""
+import importlib.util
 import itertools
 import json
+import os
 import re
 import uuid
 from typing import Any, Callable, Optional
@@ -813,3 +815,24 @@ def install_training_hooks(
       )
   except Exception as e:  # pylint: disable=broad-exception-caught
     max_logging.warning(f"[intermediate-eval] install failed: {e!r}")
+
+
+def load_custom_callable(module_path: str, function_name: str) -> Callable:
+  """Load a callable from a user-provided Python file via importlib.
+
+  `module_path` is an absolute or relative filesystem path to a `.py` file. The
+  file is executed as a fresh module (sys.path is untouched) and the attribute
+  `function_name` is returned; this lets users plug in e.g. a custom `process_data`.
+  Raises ValueError if the file is missing, has no usable loader, or lacks that callable.
+  """
+  if not os.path.isfile(module_path):
+    raise ValueError(f"Cannot import {module_path!r}: file does not exist")
+  spec = importlib.util.spec_from_file_location(f"_user_module_{function_name}", module_path)
+  if spec is None or spec.loader is None:
+    raise ValueError(f"Cannot import {module_path!r}: not a valid python file")
+  module = importlib.util.module_from_spec(spec)
+  spec.loader.exec_module(module)
+  fn = getattr(module, function_name, None)
+  if not callable(fn):  # rejects a missing attribute (None) as well as non-callable values
+    raise ValueError(f"{module_path!r} does not define a function named {function_name!r}")
+  return fn
diff --git a/tests/post_training/unit/load_custom_callable_test.py b/tests/post_training/unit/load_custom_callable_test.py
@@ -0,0 +1,130 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for load_custom_callable (used by dataset_processor_path knob)."""
+
+import os
+import sys
+import tempfile
+import textwrap
+import unittest
+
+import pytest
+
+from maxtext.trainers.post_train.rl.utils_rl import load_custom_callable
+
+
+pytestmark = [pytest.mark.post_training]
+
+
+_USER_PROCESS_DATA_SOURCE = textwrap.dedent(
+    """
+    # Simulated user-provided dataset processor file.
+    def process_data(dataset_name, model_tokenizer, template_config, tmvp_config, x):
+      # Minimal stand-in for utils_rl.process_data: returns a dict shaped like
+      # what the RL data pipeline expects, with a marker so the test can verify
+      # that THIS function (not the built-in) was actually invoked.
+      return {
+          "prompts": f"USER_PROCESSOR<{x.get('question', '')}>",
+          "question": x.get("question", ""),
+          "answer": x.get("answer", ""),
+          "_marker": "loaded_from_user_file",
+      }
+
+
+    def another_helper(x):
+      return x * 2
+    """
+).strip()
+
+
+def _write_user_file(tmpdir):
+  """Create user_processor.py under tmpdir and return the path to the new file."""
+  target = os.path.join(tmpdir, "user_processor.py")
+  with open(target, mode="w", encoding="utf-8") as handle:
+    handle.write(_USER_PROCESS_DATA_SOURCE)
+  return target
+
+
+class LoadCustomCallableTest(unittest.TestCase):
+  """Verify load_custom_callable loads a function from a user .py file."""
+
+  @pytest.mark.cpu_only
+  def test_loads_function_from_user_file(self):
+    """Returns a callable that behaves like the function in the user file."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+      user_file = _write_user_file(tmpdir)
+      fn = load_custom_callable(user_file, "process_data")
+
+      self.assertTrue(callable(fn))
+      # pylint: disable-next=not-callable
+      result = fn(
+          "dataset_name",
+          model_tokenizer=None,
+          template_config=None,
+          tmvp_config=None,
+          x={"question": "2+2?", "answer": "4"},
+      )
+      self.assertEqual(result["_marker"], "loaded_from_user_file")
+      self.assertEqual(result["prompts"], "USER_PROCESSOR<2+2?>")
+      self.assertEqual(result["question"], "2+2?")
+      self.assertEqual(result["answer"], "4")
+
+  @pytest.mark.cpu_only
+  def test_loads_any_named_function(self):
+    """function_name argument selects which symbol to return."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+      user_file = _write_user_file(tmpdir)
+      fn = load_custom_callable(user_file, "another_helper")
+      self.assertEqual(fn(5), 10)  # pylint: disable=not-callable
+
+  @pytest.mark.cpu_only
+  def test_raises_when_file_does_not_exist(self):
+    """Nonexistent path -> ValueError, raised before any import is attempted."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+      bogus = os.path.join(tmpdir, "does_not_exist.py")
+      with self.assertRaises(ValueError):
+        load_custom_callable(bogus, "process_data")
+
+  @pytest.mark.cpu_only
+  def test_raises_when_function_not_defined(self):
+    """File exists but doesn't define the named function -> ValueError."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+      user_file = _write_user_file(tmpdir)
+      with self.assertRaises(ValueError):
+        load_custom_callable(user_file, "no_such_function")
+
+  @pytest.mark.cpu_only
+  def test_does_not_pollute_sys_path(self):
+    """Loading the file must not append its directory to sys.path."""
+    sys_path_before = list(sys.path)
+    with tempfile.TemporaryDirectory() as tmpdir:
+      user_file = _write_user_file(tmpdir)
+      load_custom_callable(user_file, "process_data")
+    self.assertEqual(sys.path, sys_path_before)
+
+  @pytest.mark.cpu_only
+  def test_does_not_pollute_sys_modules_globally(self):
+    """Loading must not register the module in sys.modules, neither under its
+    synthetic '_user_module_<function_name>' name nor under the file's basename."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+      user_file = _write_user_file(tmpdir)
+      load_custom_callable(user_file, "process_data")
+      # Check both names a loader could plausibly have registered the module under.
+      self.assertNotIn("_user_module_process_data", sys.modules)
+      self.assertNotIn("user_processor", sys.modules)
+
+
+if __name__ == "__main__":
+  unittest.main()