Skip to content

Commit 81aeefa

Browse files
author
The TensorFlow Datasets Authors
committed
Internal change
PiperOrigin-RevId: 861923747
1 parent 78e38e6 commit 81aeefa

File tree

10 files changed

+431
-347
lines changed

10 files changed

+431
-347
lines changed

tensorflow_datasets/datasets/robonet/robonet_dataset_builder.py

Lines changed: 24 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -165,28 +165,29 @@ def _build_pcollection(self, pipeline, filedir):
165165
"""Generate examples as dicts."""
166166
beam = tfds.core.lazy_imports.apache_beam
167167

168-
def _process_example(filename):
169-
"""Converts one video from hdf5 format."""
170-
h5py = tfds.core.lazy_imports.h5py
171-
with h5py.File(filename) as hf:
172-
video_bytes = hf['env']['cam0_video']['frames'][:].tobytes()
173-
states = hf['env']['state'][:].astype(np.float32)
174-
states = np.pad(
175-
states, ((0, 0), (0, STATES_DIM - states.shape[1])), 'constant'
176-
)
177-
actions = hf['policy']['actions'][:].astype(np.float32)
178-
actions = np.pad(
179-
actions, ((0, 0), (0, ACTIONS_DIM - actions.shape[1])), 'constant'
180-
)
181-
182-
basename = os.path.basename(filename)
183-
features = {
184-
'video': video_bytes,
185-
'actions': actions,
186-
'states': states,
187-
'filename': basename,
188-
}
189-
return basename, features
190-
191168
filenames = tf.io.gfile.glob(os.path.join(filedir, '*.hdf5'))
192169
return pipeline | beam.Create(filenames) | beam.Map(_process_example)
170+
171+
172+
def _process_example(filename):
  """Converts one video from hdf5 format."""
  h5py = tfds.core.lazy_imports.h5py
  with h5py.File(filename) as hf:
    video_bytes = hf['env']['cam0_video']['frames'][:].tobytes()
    # Pad state/action vectors with trailing zeros up to the fixed widths
    # expected by the dataset features.
    raw_states = hf['env']['state'][:].astype(np.float32)
    states = np.pad(
        raw_states,
        ((0, 0), (0, STATES_DIM - raw_states.shape[1])),
        'constant',
    )
    raw_actions = hf['policy']['actions'][:].astype(np.float32)
    actions = np.pad(
        raw_actions,
        ((0, 0), (0, ACTIONS_DIM - raw_actions.shape[1])),
        'constant',
    )

  # The file basename doubles as the example key and the `filename` feature.
  basename = os.path.basename(filename)
  return basename, {
      'video': video_bytes,
      'actions': actions,
      'states': states,
      'filename': basename,
  }

tensorflow_datasets/rl_unplugged/rlu_rwrl/rlu_rwrl.py

Lines changed: 30 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,31 @@ def tf_feature_to_tfds_feature(
245245
raise ValueError(f'Unsupported type {type(nested)}')
246246

247247

248+
def _generate_examples_one_file_fn(
    path,
    feature_description,
    tf_example_to_step_ds_fn,
) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
  """Yields examples from one file."""
  key_prefix = os.path.basename(path)
  # Dataset of tf.Examples containing full episodes.
  example_ds = tf.data.TFRecordDataset(filenames=str(path))
  # Dataset of episodes, each represented as a dataset of steps.
  episode_ds = example_ds.map(
      functools.partial(
          tf_example_to_step_ds_fn,
          feature_description=feature_description,
      ),
      num_parallel_calls=tf.data.experimental.AUTOTUNE,
  )
  # Keys are '<file basename>/<episode index within the file>'.
  for episode_id, episode in enumerate(tfds.as_numpy(episode_ds)):
    yield f'{key_prefix}/{episode_id}', episode
271+
272+
248273
class RluRwrl(rlu_common.RLUBuilder):
249274
"""DatasetBuilder for rlu_rwrl dataset."""
250275

@@ -368,26 +393,8 @@ def _generate_examples(self, paths):
368393

369394
feature_description = tf_example_to_feature_description(example_item)
370395

371-
def _generate_examples_one_file(
372-
path,
373-
) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
374-
"""Yields examples from one file."""
375-
counter = 0
376-
key_prefix = os.path.basename(path)
377-
# Dataset of tf.Examples containing full episodes.
378-
example_ds = tf.data.TFRecordDataset(filenames=str(path))
379-
# Dataset of episodes, each represented as a dataset of steps.
380-
episode_ds = example_ds.map(
381-
functools.partial(
382-
self.tf_example_to_step_ds,
383-
feature_description=feature_description,
384-
),
385-
num_parallel_calls=tf.data.experimental.AUTOTUNE,
386-
)
387-
episode_ds = tfds.as_numpy(episode_ds)
388-
for e in episode_ds:
389-
episode_id = counter
390-
yield f'{key_prefix}/{episode_id}', e
391-
counter += 1
392-
393-
return beam.Create(file_paths) | beam.FlatMap(_generate_examples_one_file)
396+
return beam.Create(file_paths) | beam.FlatMap(
397+
_generate_examples_one_file_fn,
398+
feature_description=feature_description,
399+
tf_example_to_step_ds_fn=self.tf_example_to_step_ds,
400+
)

tensorflow_datasets/robotics/dataset_importer_builder.py

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from __future__ import annotations
1919

2020
import abc
21+
import functools
2122
import os
2223
from typing import Any
2324

@@ -32,6 +33,24 @@
3233

3334

3435

36+
def _dataset_importer_converter_fn(example, decode_fn, keys_to_strip):
  """Beam converter function for DatasetImporterBuilder."""
  # Decode the RLDS Episode and transform it to numpy.
  example_out = dict(example)
  decoded_steps = tf.data.Dataset.from_tensor_slices(
      example_out['steps']
  ).map(decode_fn)
  # Materialize the (finite) steps dataset as an in-memory list.
  example_out['steps'] = list(iter(decoded_steps.take(-1)))
  example_out = dataset_utils.as_numpy(example_out)
  # The tfds_id becomes the example key and is removed from the payload,
  # along with any other keys the importer is configured to strip.
  example_id = example_out.pop('tfds_id').decode('utf-8')
  for key in keys_to_strip:
    example_out.pop(key, None)
  yield example_id, example_out
52+
53+
3554
class DatasetImporterBuilder(
3655
tfds.core.GeneratorBasedBuilder, skip_registration=True
3756
):
@@ -118,24 +137,11 @@ def _generate_examples(
118137

119138
decode_fn = builder.info.features['steps'].feature.decode_example
120139

121-
def converter_fn(example):
122-
# Decode the RLDS Episode and transform it to numpy.
123-
example_out = dict(example)
124-
example_out['steps'] = tf.data.Dataset.from_tensor_slices(
125-
example_out['steps']
126-
).map(decode_fn)
127-
steps = list(iter(example_out['steps'].take(-1)))
128-
example_out['steps'] = steps
129-
130-
example_out = dataset_utils.as_numpy(example_out)
131-
132-
example_id = example_out['tfds_id'].decode('utf-8')
133-
del example_out['tfds_id']
134-
for key in self.KEYS_TO_STRIP:
135-
if key in example_out:
136-
del example_out[key]
137-
138-
yield example_id, example_out
140+
converter_fn = functools.partial(
141+
_dataset_importer_converter_fn,
142+
decode_fn=decode_fn,
143+
keys_to_strip=self.KEYS_TO_STRIP,
144+
)
139145

140146
return f'read_tfds_dataset@{split}' >> beam_utils.ReadFromTFDS(
141147
builder=builder,

tensorflow_datasets/structured/covid19/covid19.py

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
response, weather, and more.
2121
"""
2222

23+
import functools
2324
import numpy as np
2425
from tensorflow_datasets.core.utils.lazy_imports_utils import tensorflow as tf
2526
import tensorflow_datasets.public_api as tfds
@@ -48,6 +49,29 @@
4849
_BATCH_SIZE = 10000
4950

5051

52+
def _cast_according_to_column(feature_type, v):
  """Casts `v` to match its column's feature type.

  Numeric values destined for a `tf.string` column are converted to their
  string representation; every other value passes through unchanged.
  """
  needs_str = feature_type == tf.string and isinstance(v, (float, int))
  return str(v) if needs_str else v
56+
57+
58+
def _load_shard(index: int, dl_manager, archive_path, columns, features):
  """Load a shard of the dataset."""
  pd = tfds.core.lazy_imports.pandas
  # There is only one file so by using the for we guarantee that the file
  # will be closed.
  for _, file in dl_manager.iter_archive(archive_path):
    # Read only this shard's window of rows from the CSV.
    df = pd.read_csv(file, skiprows=index, nrows=_BATCH_SIZE)
    examples = []
    for offset, row in df.iterrows():
      example = {
          column: _cast_according_to_column(features[column].dtype, value)
          for column, value in zip(columns, row.values)
      }
      # Keys are the absolute row index within the full CSV.
      examples.append((index + offset, example))
    return examples
73+
74+
5175
class Covid19(tfds.core.GeneratorBasedBuilder):
5276
"""DatasetBuilder for covid19 dataset."""
5377

@@ -787,31 +811,18 @@ def _generate_examples(
787811
pd = tfds.core.lazy_imports.pandas
788812
beam = tfds.core.lazy_imports.apache_beam
789813

790-
def cast_according_to_column(feature_type, v):
791-
if feature_type == tf.string and isinstance(v, (float, int)):
792-
return str(v)
793-
return v
794-
795814
file_handles = dl_manager.iter_archive(archive_path)
796815
_, file = next(file_handles)
797816

798817
columns = pd.read_csv(file, nrows=1).columns
799-
800-
def load_shard(index: int):
801-
# There is only one file so by using the for we guarantee that the file
802-
# will be closed.
803-
for _, file in dl_manager.iter_archive(archive_path):
804-
df = pd.read_csv(file, skiprows=index, nrows=_BATCH_SIZE)
805-
features = self.info.features
806-
result = []
807-
for i, row in df.iterrows():
808-
example = {
809-
k: cast_according_to_column(features[k].dtype, v)
810-
for k, v in zip(columns, row.values)
811-
}
812-
result.append((index + i, example))
813-
return result
818+
features = self.info.features
814819

815820
return beam.Create(list(range(0, _N_RECORDS, _BATCH_SIZE))) | beam.FlatMap(
816-
load_shard
821+
functools.partial(
822+
_load_shard,
823+
dl_manager=dl_manager,
824+
archive_path=archive_path,
825+
columns=columns,
826+
features=features,
827+
)
817828
)

tensorflow_datasets/structured/web_graph/web_graph.py

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,24 @@
8282
"""
8383

8484

85+
def _get_int_feature(example: tf.train.Example, feature_name: str) -> List[int]:
  """Returns the int64 list stored under `feature_name` in `example`."""
  feature = example.features.feature[feature_name]
  return feature.int64_list.value
87+
88+
89+
def _process_example(example: bytes, is_test=False):
  """Process a single example."""
  parsed = tf.train.Example.FromString(example)
  row_tag = _get_int_feature(parsed, 'row_tag')[0]
  col_tag = np.array(_get_int_feature(parsed, 'col_tag'), dtype=np.int64)
  # Ground-truth tags only exist in the test split; other splits get an
  # empty array so the feature shape stays consistent.
  gt_values = _get_int_feature(parsed, 'gt_tag') if is_test else []
  gt_tag = np.array(gt_values, dtype=np.int64)
  return row_tag, {'row_tag': row_tag, 'col_tag': col_tag, 'gt_tag': gt_tag}
101+
102+
85103
@dataclasses.dataclass
86104
class WebGraphConfig(tfds.core.BuilderConfig):
87105
"""Palmer Penguins dataset builder config."""
@@ -225,23 +243,6 @@ def _generate_examples(self, pipeline, files, split: str):
225243
"""Yields examples."""
226244
beam = tfds.core.lazy_imports.apache_beam
227245

228-
def _get_int_feature(
229-
example: tf.train.Example, feature_name: str
230-
) -> List[int]:
231-
return example.features.feature[feature_name].int64_list.value
232-
233-
def _process_example(example: bytes, is_test=False):
234-
example = tf.train.Example.FromString(example)
235-
row_tag = _get_int_feature(example, 'row_tag')[0]
236-
col_tag = np.array(_get_int_feature(example, 'col_tag'), dtype=np.int64)
237-
if is_test:
238-
gt_tag = _get_int_feature(example, 'gt_tag')
239-
else:
240-
gt_tag = []
241-
gt_tag = np.array(gt_tag, dtype=np.int64)
242-
return_dict = {'row_tag': row_tag, 'col_tag': col_tag, 'gt_tag': gt_tag}
243-
return row_tag, return_dict
244-
245246
return (
246247
pipeline
247248
| f'{split}_create' >> beam.Create(files)

tensorflow_datasets/text/c4.py

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,28 @@
349349
]
350350

351351

352+
def _download_wet_file(path, dl_dir):
  """Download WET file if it doesn't already exist."""
  url = f"{_DOWNLOAD_HOST}/{path}"
  out_path = epath.Path(dl_dir) / path
  if out_path.exists():
    c4_utils.get_counter_inc_fn("download_wet_url")("exists")
    return out_path
  # Download into a uniquely named scratch directory so a failed or
  # concurrent download never leaves a partial file at `out_path`.
  tmp_dir = epath.Path(f"{os.fspath(out_path)}.incomplete{uuid.uuid4().hex}")
  try:
    tmp_dir.mkdir(parents=True, exist_ok=True)
    downloader = tfds.download.download_manager.get_downloader()
    with downloader.tqdm():
      # TODO(slebedev): Investigate why pytype infers Promise[Future[...]].
      dl_path = downloader.download(url, tmp_dir).get().path  # type: ignore
    epath.Path(dl_path).rename(out_path)
  finally:
    # Always remove the scratch directory, even when the download fails.
    tmp_dir.rmtree(missing_ok=True)
  c4_utils.get_counter_inc_fn("download_wet_url")("downloaded")
  return out_path
372+
373+
352374
class C4Config(tfds.core.BuilderConfig):
353375
"""BuilderConfig for C4 dataset."""
354376

@@ -605,30 +627,6 @@ def _get_pages_pcollection(self, pipeline, file_paths, dl_manager):
605627
"""Build PCollection of un-split page content."""
606628
beam = tfds.core.lazy_imports.apache_beam
607629

608-
def download_wet_file(path, dl_dir):
609-
url = f"{_DOWNLOAD_HOST}/{path}"
610-
out_path = epath.Path(dl_dir) / path
611-
612-
if out_path.exists():
613-
c4_utils.get_counter_inc_fn("download_wet_url")("exists")
614-
return out_path
615-
616-
tmp_dir = epath.Path(
617-
f"{os.fspath(out_path)}.incomplete{uuid.uuid4().hex}"
618-
)
619-
try:
620-
tmp_dir.mkdir(parents=True, exist_ok=True)
621-
downloader = tfds.download.download_manager.get_downloader()
622-
with downloader.tqdm():
623-
# TODO(slebedev): Investigate why pytype infers Promise[Future[...]].
624-
dl_path = downloader.download(url, tmp_dir).get().path # type: ignore
625-
dl_path = epath.Path(dl_path)
626-
dl_path.rename(out_path)
627-
finally:
628-
tmp_dir.rmtree(missing_ok=True)
629-
c4_utils.get_counter_inc_fn("download_wet_url")("downloaded")
630-
return out_path
631-
632630
wet_file_paths = (
633631
pipeline
634632
| "create_wet_path_urls" >> beam.Create(file_paths["wet_path_urls"])
@@ -640,7 +638,7 @@ def download_wet_file(path, dl_dir):
640638
| "filter_corrupt_wet_files"
641639
>> beam.Filter(lambda p: p not in _KNOWN_CORRUPT_WET_FILES)
642640
| beam.Map(
643-
download_wet_file,
641+
_download_wet_file,
644642
dl_dir=os.path.join(dl_manager.download_dir, "c4_wet_files"),
645643
)
646644
)

0 commit comments

Comments (0)