Skip to content

Commit dcfa1e2

Browse files
Alfonso Castaño and The TensorFlow Datasets Authors
authored and committed
Add builder for LBPP
PiperOrigin-RevId: 746047815
1 parent 92cbcff commit dcfa1e2

4 files changed

Lines changed: 197 additions & 0 deletions

File tree

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
@inproceedings{matton-etal-2024-leakage,
2+
title = "On Leakage of Code Generation Evaluation Datasets",
3+
author = "Matton, Alexandre and
4+
Sherborne, Tom and
5+
Aumiller, Dennis and
6+
Tommasone, Elena and
7+
Alizadeh, Milad and
8+
He, Jingyi and
9+
Ma, Raymond and
10+
Voisin, Maxime and
11+
Gilsenan-McMahon, Ellen and
12+
Gall{\'e}, Matthias",
13+
editor = "Al-Onaizan, Yaser and
14+
Bansal, Mohit and
15+
Chen, Yun-Nung",
16+
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
17+
month = nov,
18+
year = "2024",
19+
address = "Miami, Florida, USA",
20+
publisher = "Association for Computational Linguistics",
21+
url = "https://aclanthology.org/2024.findings-emnlp.772/",
22+
doi = "10.18653/v1/2024.findings-emnlp.772",
23+
pages = "13215--13223",
24+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
*Less Basic Python Programming* is a collection of 161 programming problems with accompanying unit tests.
2+
They were created with the aim of being fresh (not leaked at the time of creation) and more difficult than similar datasets (e.g., HumanEval and MBPP).
3+
It can serve as a drop-in replacement for, or an enrichment of, those datasets, as it is structured in an equivalent way.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# coding=utf-8
2+
# Copyright 2024 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
# coding=utf-8
2+
# Copyright 2024 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Cohere Less Basic Python Problems."""
17+
18+
import base64
19+
import json
20+
import pickle
21+
import zlib
22+
from tensorflow_datasets.core.utils.lazy_imports_utils import pandas as pd
23+
import tensorflow_datasets.public_api as tfds
24+
25+
26+
# Page of the paper that introduced the dataset; reported as the homepage.
_HOMEPAGE = "https://aclanthology.org/2024.findings-emnlp.772/"

# Single version shared by every builder config and by the builder itself.
_VERSION = tfds.core.Version("2.0.0")

# Column names handed to each builder config as its `features` attribute.
_COLUMNS = [
    "task_id",
    "language",
    "title",
    "instruction",
    "completion",
    "test_file",
    "test_list",
    "signature",
    "categories",
    "test_setup",
]

# Per-language sub-directories that are fetched for the multilingual configs.
_LANGUAGES = [
    "python",
    "cpp",
    "go",
    "java",
    "js",
    "rust",
]

# Config names that mean "load every language in _LANGUAGES".
_ALL_LANGUAGE_ALIASES = [
    "all",
    "multilingual",
]

# Config names that are alternative spellings of a single language.
_LANGUAGE_ALIAS_MAP = dict(
    default="python",
    javascript="js",
)
49+
50+
51+
def decode_str(str_to_decode: str):
  """Decodes one encoded dataset field back into its JSON value.

  The layers are undone in this order: UTF-8 encode the text, base64-decode,
  zlib-decompress, unpickle, and finally parse the unpickled payload as JSON.

  Args:
    str_to_decode: Base64 text wrapping a zlib-compressed pickle of a JSON
      document.

  Returns:
    Whatever `json.loads` produces for the unpickled payload.
  """
  compressed = base64.b64decode(str_to_decode.encode("utf-8"))
  pickled = zlib.decompress(compressed)
  # NOTE(review): `pickle.loads` can execute arbitrary code when fed a
  # malicious payload. This is only safe while the downloaded files are
  # trusted -- confirm that assumption holds for the data source.
  json_text = pickle.loads(pickled)
  return json.loads(json_text)
57+
58+
59+
class LBPPConfig(tfds.core.BuilderConfig):
  """Builder configuration for one LBPP variant.

  Every config is pinned to the module-level `_VERSION`; callers only choose
  the name, the description and the feature list.
  """

  def __init__(self, name, description, features, **kwargs):
    """Creates the config.

    Args:
      name: Config name (a language, a language alias, or an "all" alias).
      description: Human-readable description of the config.
      features: Names of the columns exposed by the config.
      **kwargs: Forwarded unchanged to `tfds.core.BuilderConfig`.
    """
    super().__init__(name=name, version=_VERSION, **kwargs)
    # Set explicitly after the base initializer has run, so these values are
    # the ones that end up on the instance.
    self.name = name
    self.description = description
    self.features = features
67+
68+
69+
class Builder(tfds.core.GeneratorBasedBuilder):
  """Dataset builder for LBPP.

  One config exists per language, plus aliases ("default", "javascript") and
  two configs ("all", "multilingual") that load every language. Only a test
  split is produced.
  """

  VERSION = _VERSION
  BUILDER_CONFIGS = [
      LBPPConfig(
          name="all", description="Multilingual LBPP", features=_COLUMNS
      ),
      LBPPConfig(
          name="multilingual",
          description="Multilingual LBPP",
          features=_COLUMNS,
      ),
      LBPPConfig(name="default", description="Python LBPP", features=_COLUMNS),
      LBPPConfig(name="python", description="Python LBPP", features=_COLUMNS),
      LBPPConfig(name="cpp", description="C++ LBPP", features=_COLUMNS),
      LBPPConfig(name="go", description="Go LBPP", features=_COLUMNS),
      LBPPConfig(name="java", description="Java LBPP", features=_COLUMNS),
      LBPPConfig(name="js", description="JavaScript LBPP", features=_COLUMNS),
      LBPPConfig(
          name="javascript", description="JavaScript LBPP", features=_COLUMNS
      ),
      # Previously mislabeled as "JavaScript LBPP" (copy-paste slip).
      LBPPConfig(name="rust", description="Rust LBPP", features=_COLUMNS),
  ]
  DEFAULT_CONFIG_NAME = "python"

  def _info(self):
    """Returns the dataset metadata: feature schema and homepage."""
    # NOTE(review): `dataset_info_from_configs` presumably fills in the
    # description and citation from metadata files next to this module --
    # confirm against the TFDS documentation.
    return self.dataset_info_from_configs(
        features=tfds.features.FeaturesDict({
            "task_id": tfds.features.Text(),
            "language": tfds.features.Text(),
            "title": tfds.features.Text(),
            "instruction": tfds.features.Text(),
            "completion": tfds.features.Text(),
            "test_file": tfds.features.Text(),
            "test_list": tfds.features.Sequence(tfds.features.Text()),
            "signature": tfds.features.Text(),
            "categories": tfds.features.Sequence(tfds.features.Text()),
            "test_setup": tfds.features.Text(),
        }),
        homepage=_HOMEPAGE,
        supervised_keys=None,
    )

  def _split_generators(self, dl_manager):
    """Downloads the parquet file(s) for the selected config.

    Args:
      dl_manager: Download manager used to fetch the remote files.

    Returns:
      A list holding a single TEST split generator whose `filepaths` keyword
      argument carries the downloaded files.
    """
    # Map alias to actual language; names without an alias map to themselves.
    data_loading_name = _LANGUAGE_ALIAS_MAP.get(
        self.builder_config.name, self.builder_config.name
    )
    hf_url_prefix = (
        "https://huggingface.co/datasets/CohereForAI/lbpp/resolve/main/"
    )
    if data_loading_name in _ALL_LANGUAGE_ALIASES:
      # Download all languages
      download_targets = [
          f"{hf_url_prefix}{_lang}/test.parquet" for _lang in _LANGUAGES
      ]
    else:
      download_targets = [f"{hf_url_prefix}{data_loading_name}/test.parquet"]

    downloaded_files = dl_manager.download(download_targets)

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            gen_kwargs={
                "filepaths": downloaded_files,
            },
        )
    ]

  def _generate_examples(self, filepaths: list[str]):
    """Yields `(key, example)` pairs from the downloaded parquet files.

    Args:
      filepaths: Paths of the parquet files to read, one per language.

    Yields:
      A running integer key and the example dict. The key keeps counting
      across files so it stays unique when several languages are loaded.
    """
    key = 0
    for filepath in filepaths:
      df = pd.read_parquet(filepath)
      for line in df.to_dict(orient="records"):
        yield key, {
            "task_id": line["task_id"],
            "language": line["language"],
            "title": line["title"],
            "instruction": line["instruction"],
            # These four columns are stored encoded; see `decode_str`.
            "completion": decode_str(line["completion"]),
            "test_file": decode_str(line["test_file"]),
            "test_list": decode_str(line["test_list"]),
            # A falsy signature (e.g. None) is replaced by an empty string.
            "signature": line["signature"] or "",
            "categories": line["categories"],
            "test_setup": decode_str(line["test_setup"]),
        }
        key += 1

0 commit comments

Comments
 (0)