Skip to content

Commit e63faa0

Browse files
committed
Datasets
1 parent b87a539 commit e63faa0

7 files changed

Lines changed: 506 additions & 0 deletions

File tree

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,12 @@ client.delete_env(envId=env.environmentId)
149149
The SDK provides **code execution proxies** — tools for AI agents. Add them to your toolbox in the Vercel AI SDK, LangChain, or OpenAI Agents so that the LLM writes Python or Bash code to talk to the Slack or Linear API. Requests are automatically intercepted and routed to isolated test environments, which lets agents interact with service replicas without any code changes. See more in: **[Python SDK](sdk/agent-diff-python/README.md)**
150150

151151

152+
## Benchmark & Training
153+
154+
- **HuggingFace Dataset**: [hubertmarek/agent-diff-bench](https://huggingface.co/datasets/hubertmarek/agent-diff-bench) — 224 tasks across all 4 services (80/20 train/test split, stratified by service)
155+
- **Prime Intellect Environment**: [agent-diff-bench on Prime Lab](https://app.primeintellect.ai/dashboard/environments/hubert-marek/agent-diff-bench) — run evaluations or RL training via Hosted Training
156+
- **Paper**: [AgentDiff: Agentic API Evaluation via State Differencing (KDD 2026 pre-print)](https://drive.google.com/file/d/1BlmJTSMX7ohwvD1aYBByg7_Y815fgsxp/view?usp=sharing)
157+
152158
## Evaluations & Test Suites
153159

154160
Collections of test cases with assertions that you can run against agent runs using evaluations.

backend/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ RUN echo '#!/bin/sh\n\
2929
python utils/seed_slack_template.py\n\
3030
python utils/seed_linear_template.py\n\
3131
python utils/seed_box_template.py\n\
32+
python utils/seed_calendar_template.py\n\
3233
python utils/seed_tests.py\n\
3334
else\n\
3435
echo "=== Skipping seed (set SEED=true to enable) ==="\n\

datasets/agent-diff-bench/test.jsonl

Lines changed: 45 additions & 0 deletions
Large diffs are not rendered by default.
36.5 KB
Binary file not shown.

datasets/agent-diff-bench/train.jsonl

Lines changed: 179 additions & 0 deletions
Large diffs are not rendered by default.
84.6 KB
Binary file not shown.

utils/generate_hf_dataset.py

Lines changed: 275 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,275 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Generate a HuggingFace-compatible dataset from the agent-diff test suites.
4+
5+
Combines all 4 service benchmarks (Linear, Slack, Box, Calendar) into a single
6+
dataset with the schema expected by the prime-environments verifiers framework.
7+
8+
Output: train/test splits as Parquet and JSONL files (and optionally pushes the splits to HuggingFace Hub).
9+
10+
Usage:
11+
python utils/generate_hf_dataset.py # save locally
12+
python utils/generate_hf_dataset.py --push hubertmarek/agent-diff-bench # push to HF
13+
"""
14+
15+
import argparse
16+
import json
17+
from pathlib import Path
18+
from typing import Any
19+
20+
21+
# Repository root: two directory levels above this script file.
REPO_ROOT = Path(__file__).resolve().parent.parent

# (service name, test-suite JSON path) pairs. generate_rows() walks them in
# this order and skips any suite whose file does not exist.
BENCHMARKS: list[tuple[str, Path]] = [
    ("linear", REPO_ROOT / "examples/linear/testsuites/linear_bench.json"),
    ("slack", REPO_ROOT / "examples/slack/testsuites/slack_bench_v2.json"),
    ("box", REPO_ROOT / "examples/box/testsuites/box_bench.json"),
    ("calendar", REPO_ROOT / "examples/calendar/testsuites/calendar_bench.json"),
]

# Metadata keys to promote to top-level columns (must exist in test.metadata).
# NOTE(review): not referenced elsewhere in this file; generate_rows() spells
# these keys out explicitly and falls back to defaults when one is absent.
PROMOTED_METADATA_KEYS = [
    "task_horizon",
    "operation_type",
    "entity_scope",
    "information_availability",
    "prompt_ambiguity",
]

# Keys to include in the info column (runtime metadata for the environment).
# NOTE(review): not referenced elsewhere in this file; build_info() also emits
# a "service" key that is not listed here.
INFO_KEYS = [
    "seed_template",
    "impersonate_user_id",
    "eval_type",
    "tools_required",
]
46+
47+
48+
def build_answer(test: dict[str, Any], ignore_fields: dict[str, Any]) -> str:
    """Serialize a test's expected-output spec as a compact JSON string.

    The spec always carries the test's ``assertions``. ``ignore_fields`` is
    added at the top level only when it is non-empty, because the assertion
    engine reads it from the spec root to decide which fields to exclude
    when computing diffs.

    Args:
        test: A test-suite entry; must contain an ``"assertions"`` key.
        ignore_fields: Suite-level ignore_fields mapping (may be empty).

    Returns:
        The JSON-encoded spec, using ``","`` / ``":"`` separators.

    Raises:
        KeyError: If ``test`` has no ``"assertions"`` key.
    """
    optional_part = {"ignore_fields": ignore_fields} if ignore_fields else {}
    payload: dict[str, Any] = {"assertions": test["assertions"], **optional_part}
    return json.dumps(payload, separators=(",", ":"))
60+
61+
62+
def build_info(test: dict[str, Any], service: str = "") -> dict[str, Any]:
    """Collect the runtime metadata the environment needs for one test.

    Args:
        test: A test-suite entry. Missing keys fall back to defaults.
        service: Name of the service the test belongs to.

    Returns:
        A dict with the keys ``service``, ``seed_template``,
        ``impersonate_user_id``, ``eval_type`` (taken from the test's
        ``"type"``, default ``"actionEval"``) and ``tools_required`` (taken
        from the test's metadata, default ``[]``).
    """
    info: dict[str, Any] = {"service": service}
    info["seed_template"] = test.get("seed_template", "")
    info["impersonate_user_id"] = test.get("impersonate_user_id", "")
    info["eval_type"] = test.get("type", "actionEval")
    # tools_required lives under the test's metadata, not at its top level.
    info["tools_required"] = test.get("metadata", {}).get("tools_required", [])
    return info
72+
73+
74+
def load_suite(path: Path) -> dict[str, Any]:
    """Read a test-suite JSON file and check that it has a ``tests`` key.

    Args:
        path: Location of the suite file (read as UTF-8).

    Returns:
        The parsed JSON document.

    Raises:
        ValueError: If the document has no ``"tests"`` key.
    """
    with open(path, "r", encoding="utf-8") as handle:
        suite = json.load(handle)
    if "tests" in suite:
        return suite
    raise ValueError(f"No 'tests' key in {path}")
81+
82+
83+
def generate_rows() -> list[dict[str, Any]]:
    """Build one dataset row per test across every suite in BENCHMARKS.

    Suites whose file does not exist are skipped with a warning. Each row's
    ``test_id`` is ``"<service>_<n>"``, where ``n`` is a counter that runs
    across all services (it is not reset per service).

    Returns:
        The rows, in BENCHMARKS order and then suite order.
    """
    # Taxonomy columns whose fallback is the empty string; "task_horizon"
    # is handled separately because its fallback is 0.
    text_taxonomy_keys = (
        "operation_type",
        "entity_scope",
        "information_availability",
        "prompt_ambiguity",
    )

    collected: list[dict[str, Any]] = []

    for service, suite_path in BENCHMARKS:
        if not suite_path.exists():
            print(f"WARNING: {suite_path} not found, skipping {service}")
            continue

        suite = load_suite(suite_path)
        shared_ignore = suite.get("ignore_fields", {})

        for test in suite["tests"]:
            meta = test.get("metadata", {})
            collected.append(
                {
                    # Core columns (required by verifiers)
                    "question": test["prompt"],
                    "answer": build_answer(test, shared_ignore),
                    # Identity: the number of rows gathered so far doubles
                    # as the running global index.
                    "test_id": f"{service}_{len(collected)}",
                    "test_name": test.get("name", ""),
                    "service": service,
                    # Promoted taxonomy metadata
                    "task_horizon": meta.get("task_horizon", 0),
                    **{key: meta.get(key, "") for key in text_taxonomy_keys},
                    # Runtime metadata (JSON blob)
                    "info": build_info(test, service=service),
                }
            )

    return collected
123+
124+
125+
def print_summary(rows: list[dict[str, Any]]) -> None:
    """Print per-service counts, the test_id range and the column names.

    An empty ``rows`` list is reported and then returns early instead of
    failing on ``rows[0]``.

    Args:
        rows: Dataset rows; each must have ``"service"`` and ``"test_id"``.

    Raises:
        AssertionError: If two rows share a ``test_id``. Raised explicitly
            (not via ``assert``) so the check still runs under ``python -O``.
    """
    from collections import Counter

    services = Counter(r["service"] for r in rows)
    print(f"\nGenerated {len(rows)} rows:")
    for svc, count in sorted(services.items()):
        print(f" {svc}: {count} tests")

    if not rows:
        # Nothing to index into below; happens when every suite file is missing.
        print("No rows generated; nothing further to summarize.")
        return

    # ID range check
    ids = [r["test_id"] for r in rows]
    print(f"\nTest IDs: {ids[0]} ... {ids[-1]}")
    if len(ids) != len(set(ids)):
        raise AssertionError("Duplicate test_id detected!")
    print("All test_ids are unique.")

    # Column overview
    print(f"\nColumns: {list(rows[0].keys())}")
142+
143+
144+
def split_rows(
    rows: list[dict[str, Any]], test_fraction: float = 0.2, seed: int = 42
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Split rows into train and test sets, stratified by service.

    Rows are grouped by their ``"service"`` value. Each group is shuffled
    with one shared ``random.Random(seed)`` (groups visited in sorted service
    order), and the first ``max(1, round(len(group) * test_fraction))`` rows
    of the shuffled group go to the test set. A per-service breakdown is
    printed.

    Args:
        rows: Dataset rows; each must have a ``"service"`` key.
        test_fraction: Share of each service's rows sent to the test set.
        seed: Seed for the shuffling RNG.

    Returns:
        ``(train, test)`` lists of the original row objects.
    """
    import random

    shuffler = random.Random(seed)

    groups: dict[str, list[dict[str, Any]]] = {}
    for row in rows:
        key = row["service"]
        if key not in groups:
            groups[key] = []
        groups[key].append(row)

    train: list[dict[str, Any]] = []
    test: list[dict[str, Any]] = []
    breakdown: list[tuple[str, int, int]] = []
    for name in sorted(groups):
        pool = groups[name][:]  # shuffle a copy, not the grouping itself
        shuffler.shuffle(pool)
        cut = max(1, round(len(pool) * test_fraction))
        held_out, kept = pool[:cut], pool[cut:]
        test += held_out
        train += kept
        breakdown.append((name, len(kept), len(held_out)))

    print(f"\nSplit: {len(train)} train, {len(test)} test")
    for name, n_train, n_test in breakdown:
        print(f" {name}: {n_train} train, {n_test} test")

    return train, test
170+
171+
172+
def _prepare_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Serialize each row's dict-valued ``info`` to a compact JSON string.

    The rows are modified in place and the same list object is returned.
    Rows whose ``info`` is not a dict (for example, already serialized) are
    left untouched, so calling this twice on the same rows is harmless.
    """
    for record in rows:
        payload = record["info"]
        if not isinstance(payload, dict):
            continue
        record["info"] = json.dumps(payload, separators=(",", ":"))
    return rows
178+
179+
180+
def save_dataset(
    train_rows: list[dict[str, Any]],
    test_rows: list[dict[str, Any]],
    output_dir: Path,
) -> None:
    """Write the train and test splits as Parquet and JSONL files.

    For each split, ``<split>.parquet`` and ``<split>.jsonl`` are written
    into ``output_dir`` (created if missing). In both files the ``info``
    column is stored as a JSON string.

    The caller's row dicts are not modified: serialization is applied to
    shallow copies, so ``info`` stays a dict on the rows that were passed in.

    Args:
        train_rows: Rows of the train split.
        test_rows: Rows of the test split.
        output_dir: Directory to write the files into.
    """
    from datasets import Dataset

    output_dir.mkdir(parents=True, exist_ok=True)

    for split_name, rows in (("train", train_rows), ("test", test_rows)):
        # Copy each row before serializing "info" so the caller's dicts are
        # left as they were.
        prepared = _prepare_rows([dict(row) for row in rows])

        ds = Dataset.from_list(prepared)
        parquet_path = output_dir / f"{split_name}.parquet"
        ds.to_parquet(str(parquet_path))
        print(f"Saved {split_name} ({len(prepared)} rows) to {parquet_path}")

        jsonl_path = output_dir / f"{split_name}.jsonl"
        with open(jsonl_path, "w", encoding="utf-8") as f:
            f.writelines(
                json.dumps(row, ensure_ascii=False) + "\n" for row in prepared
            )
200+
201+
202+
def push_to_hub(
    train_rows: list[dict[str, Any]],
    test_rows: list[dict[str, Any]],
    repo_id: str,
) -> None:
    """Push the train and test splits to a HuggingFace Hub dataset repo.

    The ``info`` column is uploaded as a JSON string. The caller's row dicts
    are not modified: serialization is applied to shallow copies of each
    row (copying only the list would still share, and mutate, the dicts).

    Args:
        train_rows: Rows of the train split.
        test_rows: Rows of the test split.
        repo_id: Target dataset repository, e.g. ``"user/name"``.
    """
    from datasets import Dataset

    for split_name, rows in (("train", train_rows), ("test", test_rows)):
        prepared = _prepare_rows([dict(row) for row in rows])
        ds = Dataset.from_list(prepared)
        ds.push_to_hub(repo_id, split=split_name)
        print(f"Pushed {split_name} ({len(prepared)} rows) to {repo_id}")

    print(f"\nDataset: https://huggingface.co/datasets/{repo_id}")
216+
217+
218+
def main() -> None:
    """Command-line entry point: generate, summarize, split, save and push.

    Exits with an error message when no rows could be generated (for
    example, when every benchmark suite file is missing) rather than
    failing later on an empty list. With ``--dry-run`` a truncated sample
    row is printed and nothing is written.
    """
    parser = argparse.ArgumentParser(
        description="Generate HuggingFace dataset from agent-diff test suites"
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=REPO_ROOT / "datasets" / "agent-diff-bench",
        help="Output directory for local files (default: datasets/agent-diff-bench/)",
    )
    parser.add_argument(
        "--push",
        type=str,
        default=None,
        metavar="REPO_ID",
        help="Push to HuggingFace Hub (e.g. hubertmarek/agent-diff-bench)",
    )
    parser.add_argument(
        "--test-fraction",
        type=float,
        default=0.2,
        help="Fraction of data for test split (default: 0.2)",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for split (default: 42)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Generate and summarize without saving",
    )
    args = parser.parse_args()

    rows = generate_rows()
    if not rows:
        raise SystemExit("ERROR: no rows generated; check that the benchmark suites exist")

    print_summary(rows)
    train_rows, test_rows = split_rows(rows, args.test_fraction, args.seed)

    if args.dry_run:
        # The train split can be empty (e.g. one row per service), so fall
        # back to the test split for the sample.
        sample = dict((train_rows or test_rows)[0])
        sample["question"] = sample["question"][:100] + "..."
        sample["answer"] = sample["answer"][:100] + "..."
        print(f"\nSample row:\n{json.dumps(sample, indent=2)}")
        return

    save_dataset(train_rows, test_rows, args.output_dir)

    if args.push:
        # The same rows can be reused: _prepare_rows only serializes "info"
        # values that are still dicts, so rows already serialized by
        # save_dataset pass through unchanged.
        push_to_hub(train_rows, test_rows, args.push)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)