Skip to content

Commit cc6f899

Browse files
yeyu-nvidia and claude committed
fix: use data_dir for directory paths in ShardedDataset
The datasets library's resolve_pattern only matches entries with type=="file", so passing a bare directory path as data_files results in a FileNotFoundError even when the directory exists on disk. Detect directory paths and pass them via data_dir instead.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 97d1531 commit cc6f899

1 file changed

Lines changed: 11 additions & 1 deletion

File tree

modelopt/torch/utils/plugins/transformers_dataset.py

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -73,10 +73,20 @@ def __getitem__(self, index):
7373
return self._raw_samples[index]
7474

7575
def _load_dataset(self):
76+
# datasets' resolve_pattern only matches entries with type=="file", so passing
77+
# a bare directory path as data_files results in FileNotFoundError.
78+
# Use data_dir for directory paths instead.
79+
data_dir = None
80+
data_files = self.data_files
81+
if data_files and os.path.isdir(data_files):
82+
data_dir = data_files
83+
data_files = None
84+
7685
dataset = load_dataset(
7786
self.name,
7887
self.subset,
79-
data_files=self.data_files,
88+
data_files=data_files,
89+
data_dir=data_dir,
8090
split=self.split,
8191
# num_proc=4, # TODO: Make this configurable
8292
streaming=self.num_streaming_samples is not None,

0 commit comments

Comments
 (0)