Improve error message when tokenize_train/eval_data doesn't match the dataset

aireenmei · aireenmei · commit c81e688347eb · 2026-05-08T21:29:57.000Z
diff --git a/src/maxtext/input_pipeline/data_processing_utils.py b/src/maxtext/input_pipeline/data_processing_utils.py
@@ -29,9 +29,7 @@ def parse_and_keep_features(dataset, config, data_columns, tokenize):
   """Parse arrayrecord features or keep specified columns for other formats."""
   if config.grain_file_type in ("arrayrecord", "tfrecord"):
     dataset = dataset.map(input_pipeline_utils.ParseFeatures(data_columns, tokenize))
-    dataset = dataset.map(input_pipeline_utils.NormalizeFeatures(data_columns, tokenize))
-  else:
-    dataset = dataset.map(input_pipeline_utils.KeepFeatures(feature_names=data_columns))
+  dataset = dataset.map(input_pipeline_utils.NormalizeFeatures(data_columns, tokenize))
   return dataset
 
 
diff --git a/src/maxtext/input_pipeline/input_pipeline_utils.py b/src/maxtext/input_pipeline/input_pipeline_utils.py
@@ -39,6 +39,8 @@
 Features = dict[str, Any]
 INPUT_TOKENS_KEY = "input_ids"
 
+# pylint: disable=protected-access
+
 ########## Functions used by TFDS pipeline
 
 
@@ -89,6 +91,34 @@ def _process_string(string_tensor):
   return features
 
 
+def validate_tfds_data_types(dataset, data_keys, tokenize):
+  """Validate that TFDS dataset types match the tokenization configuration."""
+  import tensorflow as tf  # pylint: disable=import-outside-toplevel
+
+  spec = dataset.element_spec
+  for k in data_keys:
+    if k not in spec:
+      continue
+
+    # For TFDS, strings/bytes are usually tf.string
+    is_string = spec[k].dtype == tf.string
+
+    if tokenize and not is_string:
+      raise ValueError(
+          f"TFDS dataset column '{k}' has type {spec[k].dtype}, but tokenization is enabled. "
+          "This often happens if the dataset is already tokenized (contains integers) "
+          "but 'tokenize_train_data=True' (or 'tokenize_eval_data=True') is set in the configuration. "
+          "If your data is already tokenized, please set it to False."
+      )
+    if not tokenize and is_string:
+      raise ValueError(
+          f"TFDS dataset column '{k}' has type {spec[k].dtype}, but tokenization is disabled. "
+          "This often happens if the dataset is NOT tokenized (contains strings) "
+          "but 'tokenize_train_data=False' (or 'tokenize_eval_data=False') is set in the configuration. "
+          "If your data is not tokenized, please set it to True."
+      )
+
+
 ########## Functions used by HF pipeline
 
 
@@ -544,7 +574,7 @@ def make_tfrecord_iter_dataset(path: str):
 
 @dataclasses.dataclass
 class ParseFeatures(grain.MapTransform):
-  """Parse serialized example"""
+  """Parse serialized example proto into a dictionary of arrays."""
 
   def __init__(self, data_columns, tokenize):
     self.data_columns = data_columns
@@ -575,45 +605,60 @@ def map(self, element):
 
 @dataclasses.dataclass
 class NormalizeFeatures(grain.MapTransform):
-  """Normalize text feature keys."""
+  """Universal feature normalizer and validator. Acts as selector and validator."""
 
   def __init__(self, column_names, tokenize):
     self.column_names = column_names
     self.tokenize = tokenize
 
   def map(self, element):
-    if self.tokenize:
-      return {col: element[col][0].decode() for col in self.column_names}
-    else:
-      return {col: element[col] for col in self.column_names}
-
-
-@dataclasses.dataclass
-class KeepFeatures(grain.MapTransform):
-  """Keep only specified features in the dataset element.
-
-  This transform filters the input dictionary, retaining only the keys
-  that are present in `feature_names`.
-  """
-
-  def __init__(self, feature_names: list[str]):
-    """Initializes the KeepFeatures transform.
-
-    Args:
-      feature_names: A list of strings, where each string is the name of a
-        feature to be kept in the dataset element.
-    """
-    self.feature_names = feature_names
-
-  def map(self, element: dict[str, Any]) -> dict[str, Any]:
-    """Applies the feature filtering to the input element."""
-    missing = [n for n in self.feature_names if n not in element]
-    if missing:
-      raise ValueError(
-          f"Column {missing} not found in dataset. Available columns: {sorted(element.keys())}. "
-          "Please set train_data_columns or eval_data_columns accordingly."
-      )
-    return {k: v for k, v in element.items() if k in self.feature_names}
+    res = {}
+    for col in self.column_names:
+      val = element[col]
+      if self.tokenize:
+        # ArrayRecord/TFRecord: ParseFeatures wraps bytes_list.value in np.ndarray(dtype=object).
+        # An empty array means the proto stored data in int64_list (already tokenized) — user config error.
+        if isinstance(val, (list, np.ndarray)):
+          if len(val) == 0:
+            raise ValueError(
+                f"Expected non-empty string/bytes list for column '{col}' because tokenization is enabled, "
+                "but got an empty list. This often happens if the dataset is already tokenized (contains integers) "
+                "but tokenization is enabled in the configuration. "
+                "If your data is already tokenized, please set 'tokenize_train_data=False' "
+                "(or 'tokenize_eval_data=False')."
+            )
+          val = val[0]  # unwrap the single-element array from ParseFeatures
+
+        # ArrayRecord/TFRecord: proto bytes_list values are Python bytes after unwrapping above.
+        if isinstance(val, bytes):
+          val = val.decode("utf-8")
+
+        # Parquet: string columns arrive as scalar str (no unwrapping needed).
+        # Any other type indicates a misconfiguration (e.g. already-tokenized integers).
+        if not isinstance(val, str):
+          raise ValueError(
+              f"Expected string or bytes for column '{col}' but got {type(val)} with value {val}. "
+              "If your data is already tokenized, please set 'tokenize_train_data=False' "
+              "(or 'tokenize_eval_data=False') in the configuration."
+          )
+        res[col] = val
+      else:
+        # Parquet: text column arrives as scalar str/bytes — user forgot to pre-tokenize.
+        if isinstance(val, (str, bytes)):
+          raise ValueError(
+              f"Expected tokenized integers for column '{col}' because tokenization is disabled, "
+              f"but got strings. If your data is NOT tokenized, please set 'tokenize_train_data=True' "
+              "(or 'tokenize_eval_data=True') in the configuration."
+          )
+        # ArrayRecord/TFRecord: ParseFeatures reads from int64_list; an empty array means the data
+        # was stored in bytes_list (not tokenized) — user forgot to enable tokenization.
+        if isinstance(val, (list, np.ndarray)) and len(val) == 0:
+          raise ValueError(
+              f"Column '{col}' is empty. This often happens if the dataset contains strings "
+              "but tokenization is disabled (looking for integers)."
+          )
+        res[col] = val
+    return res
 
 
 @dataclasses.dataclass
diff --git a/src/maxtext/input_pipeline/tfds_data_processing.py b/src/maxtext/input_pipeline/tfds_data_processing.py
@@ -102,6 +102,8 @@ def preprocessing_pipeline(
         "Please set train_data_columns or eval_data_columns accordingly."
     )
 
+  input_pipeline_utils.validate_tfds_data_types(dataset, data_column_names, tokenize)
+
   if not use_dpo:
     assert len(data_column_names) == 1
     dataset = dataset.map(

Original file line number	Diff line number	Diff line change
`@@ -102,6 +102,8 @@ def preprocessing_pipeline(`
`102`	`102`	`"Please set train_data_columns or eval_data_columns accordingly."`
`103`	`103`	`)`
`104`	`104`
	`105`	`+ input_pipeline_utils.validate_tfds_data_types(dataset, data_column_names, tokenize)`
	`106`	`+`
`105`	`107`	`if not use_dpo:`
`106`	`108`	`assert len(data_column_names) == 1`
`107`	`109`	`dataset = dataset.map(`