Skip to content

Commit 11864d0

Browse files
Apply suggestions from code review
Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Signed-off-by: Chenhan D. Yu <5185878+ChenhanYu@users.noreply.github.com>
1 parent fc956b8 commit 11864d0

2 files changed

Lines changed: 6 additions & 6 deletions

File tree

examples/dataset/make_nemotron_ptv2_dataset.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -110,7 +110,7 @@ def parse_args() -> argparse.Namespace:
110110
)
111111
parser.add_argument(
112112
"--output-dir",
113-
default="/tmp/specdec_data",
113+
default="/tmp/ptv2_gen",
114114
help="Directory where output JSONL files will be written.",
115115
)
116116
parser.add_argument(
@@ -231,7 +231,7 @@ def main() -> None:
231231

232232
full_path = output_dir / "default.jsonl"
233233
logger.info("Writing %d rows to %s", len(combined), full_path)
234-
combined.to_json(str(full_path))
234+
combined.to_json(str(full_path), num_proc=args.num_proc)
235235

236236
if not args.no_subsets:
237237
for n, name in [(1_000, "1K"), (10_000, "10K"), (100_000, "100K")]:
@@ -240,7 +240,7 @@ def main() -> None:
240240
continue
241241
subset_path = output_dir / f"sample-{name}.jsonl"
242242
logger.info("Writing %s subset to %s", name, subset_path)
243-
combined.select(range(n)).to_json(str(subset_path))
243+
combined.select(range(n)).to_json(str(subset_path), num_proc=args.num_proc)
244244

245245
logger.info("Done. Output files are in %s", output_dir)
246246

examples/dataset/make_nemotron_ptv3_dataset.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -177,7 +177,7 @@ def parse_args() -> argparse.Namespace:
177177
)
178178
parser.add_argument(
179179
"--output-dir",
180-
default="/tmp/specdec_v3",
180+
default="/tmp/ptv3_train",
181181
help="Directory where output JSONL files will be written.",
182182
)
183183
parser.add_argument(
@@ -284,7 +284,7 @@ def main() -> None:
284284

285285
full_path = output_dir / "default.jsonl"
286286
logger.info("Writing %d rows to %s", len(combined), full_path)
287-
combined.to_json(str(full_path))
287+
combined.to_json(str(full_path), num_proc=args.num_proc)
288288

289289
if not args.no_subsets:
290290
for n, name in [(1_000, "1K"), (10_000, "10K"), (100_000, "100K")]:
@@ -293,7 +293,7 @@ def main() -> None:
293293
continue
294294
subset_path = output_dir / f"sample-{name}.jsonl"
295295
logger.info("Writing %s subset to %s", name, subset_path)
296-
combined.select(range(n)).to_json(str(subset_path))
296+
combined.select(range(n)).to_json(str(subset_path), num_proc=args.num_proc)
297297

298298
logger.info("Done. Output files are in %s", output_dir)
299299

0 commit comments

Comments (0)