|
4 | 4 | from pathlib import Path |
5 | 5 | from typing import Dict, Tuple |
6 | 6 |
|
7 | | -from datasets import load_dataset |
| 7 | +from datasets import concatenate_datasets, load_dataset |
8 | 8 | from tqdm import tqdm |
9 | 9 |
|
10 | 10 | """ |
@@ -69,6 +69,18 @@ def parse_args(): |
69 | 69 | action="store_true", |
70 | 70 | help="Whether to split the dataset into train and eval sets, default is False", |
71 | 71 | ) |
| 72 | + parser.add_argument( |
| 73 | + "--opc-subset", |
| 74 | + type=str, |
| 75 | + default="largescale_diverse_instruct", |
| 76 | + choices=[ |
| 77 | + "largescale_diverse_instruct", |
| 78 | + "filtered_infinity_instruct", |
| 79 | + "realuser_instruct", |
| 80 | + "all", |
| 81 | + ], |
| 82 | + help="The subset of OpenCoder opc-sft-stage1 dataset to use, or 'all' to use all subsets (default: largescale_diverse_instruct)", |
| 83 | + ) |
72 | 84 | return parser.parse_args() |
73 | 85 |
|
74 | 86 |
|
@@ -253,9 +265,20 @@ def main(): |
253 | 265 | ] |
254 | 266 | proc_fn = process_sharegpt4v_row |
255 | 267 | elif args.dataset == "opc": |
256 | | - ds = load_dataset( |
257 | | - "OpenCoder-LLM/opc-sft-stage1", "largescale_diverse_instruct" |
258 | | - )["train"] |
| 268 | + if args.opc_subset == "all": |
| 269 | + # Load all subsets and concatenate them |
| 270 | + subsets = [ |
| 271 | + "largescale_diverse_instruct", |
| 272 | + "filtered_infinity_instruct", |
| 273 | + "realuser_instruct", |
| 274 | + ] |
| 275 | + datasets_list = [ |
| 276 | + load_dataset("OpenCoder-LLM/opc-sft-stage1", subset)["train"] |
| 277 | + for subset in subsets |
| 278 | + ] |
| 279 | + ds = concatenate_datasets(datasets_list) |
| 280 | + else: |
| 281 | + ds = load_dataset("OpenCoder-LLM/opc-sft-stage1", args.opc_subset)["train"] |
259 | 282 | proc_fn = process_opc_sft_stage1 |
260 | 283 | else: |
261 | 284 | raise ValueError( |
|
0 commit comments