Skip to content

Commit bc798e1

Browse files
tomvdw and The TensorFlow Datasets Authors
authored and committed
Fix feature handling in Radon and NaN checks in LAION400M.
In radon_dataset_builder.py, feature extraction now iterates over the defined features and raises a descriptive error when a value fails to convert. In laion400m.py, missing (NaN) values for similarity and license are now detected with pd.isna() for more robust handling.

PiperOrigin-RevId: 911744580
1 parent 1dac9e9 commit bc798e1

2 files changed

Lines changed: 21 additions & 11 deletions

File tree

tensorflow_datasets/datasets/radon/radon_dataset_builder.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -133,9 +133,15 @@ def _generate_examples(self, file_path_srrs2, file_path_cty):
133133

134134
for i, (_, row) in enumerate(df.iterrows()):
135135
radon_val = row.pop('activity')
136+
features_dict = {}
137+
for name, (_, convert_fn) in features().items():
138+
try:
139+
features_dict[name] = convert_fn(row[name])
140+
except ValueError as e:
141+
raise ValueError(
142+
f'Failed to convert {name} with value {row[name]!r}'
143+
) from e
136144
yield i, {
137145
'activity': float(radon_val),
138-
'features': {
139-
name: features()[name][1](value) for name, value in row.items()
140-
},
146+
'features': features_dict,
141147
}

tensorflow_datasets/vision_language/laion400m/laion400m.py

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -14,12 +14,12 @@
1414
# limitations under the License.
1515

1616
"""LAION-400M image dataset."""
17+
1718
import functools
1819
from typing import Dict, Tuple
1920

2021
from etils import epath
2122
import numpy as np
22-
2323
from tensorflow_datasets.core.utils.lazy_imports_utils import tensorflow as tf
2424
import tensorflow_datasets.public_api as tfds
2525

@@ -78,15 +78,21 @@
7878

7979
def _get_example_metadata(metadata_df_row):
8080
"""Returns example metadata."""
81+
pd = tfds.core.lazy_imports.pandas
8182
nsfw_tag = metadata_df_row['NSFW']
8283
if nsfw_tag not in _NSFW_TAGS:
8384
nsfw_tag = _NSFW_MISSING_TAG
8485

86+
similarity = metadata_df_row['similarity']
87+
license_ = metadata_df_row['LICENSE']
88+
8589
return {
8690
'caption': metadata_df_row['caption'],
8791
'nsfw': nsfw_tag,
88-
'similarity': metadata_df_row['similarity'] or _MISSING_SIMILARITY_VALUE,
89-
'license': metadata_df_row['LICENSE'] or '',
92+
'similarity': (
93+
_MISSING_SIMILARITY_VALUE if pd.isna(similarity) else similarity
94+
),
95+
'license': '' if pd.isna(license_) else license_,
9096
'url': metadata_df_row['url'],
9197
'original_width': metadata_df_row['original_width'],
9298
'original_height': metadata_df_row['original_height'],
@@ -174,11 +180,9 @@ def _info(self) -> tfds.core.DatasetInfo:
174180
}
175181

176182
if self.builder_config.name == LAION400M_IMAGES_CONFIG.name:
177-
features.update(
178-
{
179-
'image': tfds.features.Image(doc='image'),
180-
}
181-
)
183+
features.update({
184+
'image': tfds.features.Image(doc='image'),
185+
})
182186
else:
183187
features.update({
184188
'image_embedding': tfds.features.Tensor(

0 commit comments

Comments
 (0)