Merge pull request #3747 from AI-Hypercomputer:perf_input_pipeline

Google-ML-Automation · Google-ML-Automation · commit 76a7ecb24bd4 · 2026-04-27T13:07:49.000-07:00
PiperOrigin-RevId: 906506791
diff --git a/src/maxtext/input_pipeline/input_pipeline_utils.py b/src/maxtext/input_pipeline/input_pipeline_utils.py
@@ -43,6 +43,8 @@
 
 
 def normalize_features(x, column_name):
+  if column_name not in x and "ids" in x:
+    column_name = "ids"
   return {"inputs": x[column_name], "targets": x[column_name]}
 
 
@@ -85,7 +87,8 @@ def _process_string(string_tensor):
     return [modified_string]
 
   for k in data_keys:
-    features[k] = tf.py_function(_process_string, [features[k]], Tout=[tf.int32])[0]
+    if features[k].dtype == tf.string:
+      features[k] = tf.py_function(_process_string, [features[k]], Tout=[tf.int32])[0]
   return features
 
 
diff --git a/src/maxtext/input_pipeline/tfds_data_processing_c4_mlperf.py b/src/maxtext/input_pipeline/tfds_data_processing_c4_mlperf.py
@@ -249,6 +249,7 @@ def preprocess_train_dataset(
     max_target_length: int,
     shuffle_buffer_size: int,
     data_shuffle_seed: int,
+    is_tokenized_dataset: bool,
 ) -> tf.data.Dataset:
   """Preprocess the training dataset."""
   if sp_tokenizer.pad_id is not None:
@@ -257,10 +258,13 @@ def preprocess_train_dataset(
     pad_id = sp_tokenizer.unk_id
   else:
     pad_id = -1
-  train_ds = train_ds.map(
-      lambda x: TokenizeOp(tokenizer_model=sp_tokenizer, features=x, data_keys=("targets",)),
-      num_parallel_calls=AUTOTUNE,
-  )
+
+  if not is_tokenized_dataset:
+    train_ds = train_ds.map(
+        lambda x: TokenizeOp(tokenizer_model=sp_tokenizer, features=x, data_keys=("targets",)),
+        num_parallel_calls=AUTOTUNE,
+    )
+
   train_ds = reduce_concat_tokens(train_ds, feature_key="targets", batch_size=4096)
   train_ds = split_tokens_to_targets_length(train_ds, max_target_length)
   train_ds = train_ds.shuffle(shuffle_buffer_size, seed=data_shuffle_seed)
@@ -321,15 +325,32 @@ def make_c4_mlperf_train_iterator(
     process_indices,
 ):
   """Make train iterator of customized C4 dataset for mlperf gpt3 training."""
+  train_split = "train"
+  if config.dataset_name == "c4/en:3.0.1":
+    # gs://max-datasets-rogue/c4/en/3.0.1
+    is_tokenized_dataset = False
+  elif config.dataset_name == "c4/en:3.0.5":
+    # gs://mlperf-6-submission/tfds/c4/en/3.0.5
+    is_tokenized_dataset = True
+  elif config.dataset_name == "c4/en:3.0.7":
+    # gs://max-datasets-rogue/c4/en/3.0.7
+    is_tokenized_dataset = False
+    train_split = "train2"
+  else:
+    raise ValueError(f"{config.dataset_name=} should be one of " "('c4/en:3.0.1', 'c4/en:3.0.5', 'c4/en:3.0.7')")
   train_ds = get_dataset(
       dataset_name=config.dataset_name,
-      split="train2",
+      split=train_split,
       dataloading_host_index=process_indices.index(jax.process_index()),
       dataloading_host_count=len(process_indices),
       enable_data_shuffling=config.enable_data_shuffling,
       data_shuffle_seed=config.data_shuffle_seed,
   )
-  train_ds = rekey(train_ds, {"inputs": None, "targets": "text"})
+
+  if is_tokenized_dataset:
+    train_ds = rekey(train_ds, {"inputs": None, "targets": "ids"})
+  else:
+    train_ds = rekey(train_ds, {"inputs": None, "targets": "text"})
 
   sp_tokenizer = get_tokenizer(
       config.tokenizer_path, config.tokenizer_type, config.add_bos, config.add_eos, config.hf_access_token
@@ -341,6 +362,7 @@ def make_c4_mlperf_train_iterator(
       max_target_length=config.max_target_length,
       shuffle_buffer_size=128,
       data_shuffle_seed=config.data_shuffle_seed,
+      is_tokenized_dataset=is_tokenized_dataset,
   )
   train_multihost_gen = multihost_dataloading.MultiHostDataLoadIterator(train_ds, global_mesh)
   return train_multihost_gen
@@ -352,17 +374,21 @@ def make_c4_mlperf_eval_iterator(
     process_indices,
 ):
   """Make eval iterator of customized C4 dataset for mlperf gpt3 training."""
-  eval_slit = "None"
+  eval_split = "None"
   if config.eval_dataset_name == "c4/en:3.0.5":
     is_tokenized_dataset = True
   elif config.eval_dataset_name == "c4/en:3.0.4":
     is_tokenized_dataset = False
-    eval_slit = "validation_24567exp"
+    eval_split = "validation_24567exp"
   elif config.eval_dataset_name in ["c4/en:3.0.1", "c4/en:3.0.8", "c4/en:3.0.9"]:
     is_tokenized_dataset = False
-    eval_slit = "validation"
+    eval_split = "validation"
   else:
-    raise ValueError(f"{config.eval_dataset_name=} should be one of ('c4/en:3.0.1', 'c4/en:3.0.4', 'c4/en:3.0.5')")
+    raise ValueError(
+        f"{config.eval_dataset_name=} should be one of "
+        "('c4/en:3.0.1', 'c4/en:3.0.4', 'c4/en:3.0.5', "
+        "'c4/en:3.0.8', 'c4/en:3.0.9')"
+    )
 
   if is_tokenized_dataset:
     eval_ds = get_dataset(
@@ -378,7 +404,7 @@ def make_c4_mlperf_eval_iterator(
   else:
     eval_ds = get_dataset(
         dataset_name=config.eval_dataset_name,
-        split=eval_slit,
+        split=eval_split,
         dataloading_host_index=process_indices.index(jax.process_index()),
         dataloading_host_count=len(process_indices),
         enable_data_shuffling=False,