Skip to content

Commit 34b5883

Browse files
authored
Add subset options for opc (#312)
* Add subset options for opc
* lint & cat datasets
1 parent d582d7d commit 34b5883

1 file changed

Lines changed: 27 additions & 4 deletions

File tree

scripts/prepare_data.py

Lines changed: 27 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
from pathlib import Path
55
from typing import Dict, Tuple
66

7-
from datasets import load_dataset
7+
from datasets import concatenate_datasets, load_dataset
88
from tqdm import tqdm
99

1010
"""
@@ -69,6 +69,18 @@ def parse_args():
6969
action="store_true",
7070
help="Whether to split the dataset into train and eval sets, default is False",
7171
)
72+
parser.add_argument(
73+
"--opc-subset",
74+
type=str,
75+
default="largescale_diverse_instruct",
76+
choices=[
77+
"largescale_diverse_instruct",
78+
"filtered_infinity_instruct",
79+
"realuser_instruct",
80+
"all",
81+
],
82+
help="The subset of OpenCoder opc-sft-stage1 dataset to use, or 'all' to use all subsets (default: largescale_diverse_instruct)",
83+
)
7284
return parser.parse_args()
7385

7486

@@ -253,9 +265,20 @@ def main():
253265
]
254266
proc_fn = process_sharegpt4v_row
255267
elif args.dataset == "opc":
256-
ds = load_dataset(
257-
"OpenCoder-LLM/opc-sft-stage1", "largescale_diverse_instruct"
258-
)["train"]
268+
if args.opc_subset == "all":
269+
# Load all subsets and concatenate them
270+
subsets = [
271+
"largescale_diverse_instruct",
272+
"filtered_infinity_instruct",
273+
"realuser_instruct",
274+
]
275+
datasets_list = [
276+
load_dataset("OpenCoder-LLM/opc-sft-stage1", subset)["train"]
277+
for subset in subsets
278+
]
279+
ds = concatenate_datasets(datasets_list)
280+
else:
281+
ds = load_dataset("OpenCoder-LLM/opc-sft-stage1", args.opc_subset)["train"]
259282
proc_fn = process_opc_sft_stage1
260283
else:
261284
raise ValueError(

0 commit comments

Comments
 (0)