|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Generate a HuggingFace-compatible dataset from the agent-diff test suites. |
| 4 | +
|
| 5 | +Combines all 4 service benchmarks (Linear, Slack, Box, Calendar) into a single |
| 6 | +dataset with the schema expected by the prime-environments verifiers framework. |
| 7 | +
|
| 8 | +Output: a Parquet file (and optionally pushes to HuggingFace Hub). |
| 9 | +
|
| 10 | +Usage: |
| 11 | + python utils/generate_hf_dataset.py # save locally |
| 12 | + python utils/generate_hf_dataset.py --push hubertmarek/agent-diff-bench # push to HF |
| 13 | +""" |
| 14 | + |
| 15 | +import argparse |
| 16 | +import json |
| 17 | +from pathlib import Path |
| 18 | +from typing import Any |
| 19 | + |
| 20 | + |
# Directory two levels above this script's resolved path; every suite path
# below is built relative to it.
REPO_ROOT = Path(__file__).resolve().parent.parent

# (service name, suite JSON path) pairs, in the order generate_rows() reads them.
BENCHMARKS: list[tuple[str, Path]] = [
    ("linear", REPO_ROOT / "examples/linear/testsuites/linear_bench.json"),
    ("slack", REPO_ROOT / "examples/slack/testsuites/slack_bench_v2.json"),
    ("box", REPO_ROOT / "examples/box/testsuites/box_bench.json"),
    ("calendar", REPO_ROOT / "examples/calendar/testsuites/calendar_bench.json"),
]

# Names of the test.metadata entries that become top-level dataset columns.
# generate_rows() reads each one with a default, so a missing entry does not
# fail: it yields 0 for task_horizon and "" for the others.
PROMOTED_METADATA_KEYS = [
    "task_horizon",
    "operation_type",
    "entity_scope",
    "information_availability",
    "prompt_ambiguity",
]

# Names of the runtime-metadata entries stored in the info column.
# build_info() additionally writes a "service" entry that is not listed here.
INFO_KEYS = [
    "seed_template",
    "impersonate_user_id",
    "eval_type",
    "tools_required",
]
| 46 | + |
| 47 | + |
def build_answer(test: dict[str, Any], ignore_fields: dict[str, Any]) -> str:
    """Serialize a test's expected-output spec to a compact JSON string.

    The spec always carries the test's ``assertions``. ``ignore_fields`` is
    added next to it, at the top level of the spec, only when it is non-empty.

    Args:
        test: A test entry; must contain an ``"assertions"`` key.
        ignore_fields: Suite-level ignore_fields mapping (may be empty).

    Returns:
        The spec encoded as JSON with no whitespace between tokens.

    Raises:
        KeyError: If ``test`` has no ``"assertions"`` key.
    """
    assertions = test["assertions"]
    if not ignore_fields:
        payload: dict[str, Any] = {"assertions": assertions}
    else:
        payload = {"assertions": assertions, "ignore_fields": ignore_fields}
    return json.dumps(payload, separators=(",", ":"))
| 60 | + |
| 61 | + |
def build_info(test: dict[str, Any], service: str = "") -> dict[str, Any]:
    """Collect the runtime metadata stored in a row's ``info`` column.

    Args:
        test: A test entry from a suite file.
        service: Name of the service the test belongs to.

    Returns:
        A dict with the keys ``service``, ``seed_template``,
        ``impersonate_user_id``, ``eval_type`` and ``tools_required``.
        Missing inputs fall back to ``""``, ``"actionEval"`` (for
        ``eval_type``, read from the test's ``type``) and ``[]``.
    """
    info: dict[str, Any] = {"service": service}
    info["seed_template"] = test.get("seed_template", "")
    info["impersonate_user_id"] = test.get("impersonate_user_id", "")
    info["eval_type"] = test.get("type", "actionEval")
    # tools_required is nested under the test's metadata, not at its top level.
    info["tools_required"] = test.get("metadata", {}).get("tools_required", [])
    return info
| 72 | + |
| 73 | + |
def load_suite(path: Path) -> dict[str, Any]:
    """Load a test suite JSON file and check its basic shape.

    Args:
        path: Location of the suite JSON file (read as UTF-8).

    Returns:
        The parsed suite: a dict whose ``"tests"`` entry is a list.

    Raises:
        ValueError: If the top-level JSON value is not an object, if it has
            no ``"tests"`` key, or if ``"tests"`` is not a list.
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # A bare `"tests" in data` check also succeeds for a JSON array or string
    # that merely contains "tests", so the container type is checked first.
    if not isinstance(data, dict):
        raise ValueError(
            f"Expected a JSON object in {path}, got {type(data).__name__}"
        )
    if "tests" not in data:
        raise ValueError(f"No 'tests' key in {path}")
    if not isinstance(data["tests"], list):
        raise ValueError(
            f"'tests' in {path} must be a list, got {type(data['tests']).__name__}"
        )
    return data
| 81 | + |
| 82 | + |
def generate_rows() -> list[dict[str, Any]]:
    """Build one dataset row per test across every suite in BENCHMARKS.

    Suites whose file does not exist are skipped with a printed warning.
    Each row's ``test_id`` is ``"<service>_<n>"`` where ``n`` is a counter
    that runs across all services and is never reset between suites.

    Returns:
        The rows in suite order, then test order. ``info`` is still a dict at
        this point.

    Raises:
        KeyError: If a test lacks ``"prompt"`` or ``"assertions"``.
        ValueError: Propagated from load_suite() for a malformed suite file.
    """
    rows: list[dict[str, Any]] = []

    for service, suite_path in BENCHMARKS:
        if not suite_path.exists():
            print(f"WARNING: {suite_path} not found, skipping {service}")
            continue

        suite = load_suite(suite_path)
        suite_ignore_fields = suite.get("ignore_fields", {})

        for test in suite["tests"]:
            meta = test.get("metadata", {})
            rows.append(
                {
                    # Core columns (required by verifiers)
                    "question": test["prompt"],
                    "answer": build_answer(test, suite_ignore_fields),
                    # Identity: len(rows) is the number of rows emitted so far,
                    # i.e. the running global index of this test.
                    "test_id": f"{service}_{len(rows)}",
                    "test_name": test.get("name", ""),
                    "service": service,
                    # Promoted taxonomy metadata
                    "task_horizon": meta.get("task_horizon", 0),
                    "operation_type": meta.get("operation_type", ""),
                    "entity_scope": meta.get("entity_scope", ""),
                    "information_availability": meta.get(
                        "information_availability", ""
                    ),
                    "prompt_ambiguity": meta.get("prompt_ambiguity", ""),
                    # Runtime metadata (serialized to JSON later, before storage)
                    "info": build_info(test, service=service),
                }
            )

    return rows
| 123 | + |
| 124 | + |
def print_summary(rows: list[dict[str, Any]]) -> None:
    """Print row counts per service, the test_id range and the column names.

    Also verifies that every ``test_id`` is unique.

    Args:
        rows: Dataset rows; each must have ``"service"`` and ``"test_id"``.

    Raises:
        AssertionError: If two rows share a ``test_id``.
    """
    from collections import Counter

    services = Counter(r["service"] for r in rows)
    print(f"\nGenerated {len(rows)} rows:")
    for svc, count in sorted(services.items()):
        print(f" {svc}: {count} tests")

    # With no rows there is no first/last id and no column list to show;
    # indexing rows[0] / ids[0] below would raise IndexError.
    if not rows:
        print("\nNo rows to summarize.")
        return

    # ID range check
    ids = [r["test_id"] for r in rows]
    print(f"\nTest IDs: {ids[0]} ... {ids[-1]}")
    # Raised explicitly (same exception type as before) so the check still
    # runs when Python is started with -O, which strips assert statements.
    if len(ids) != len(set(ids)):
        raise AssertionError("Duplicate test_id detected!")
    print("All test_ids are unique.")

    # Column overview
    print(f"\nColumns: {list(rows[0].keys())}")
| 142 | + |
| 143 | + |
def split_rows(
    rows: list[dict[str, Any]], test_fraction: float = 0.2, seed: int = 42
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Split rows into train and test sets, stratified by service.

    Services are processed in sorted order. Each service's rows are shuffled
    with one shared seeded RNG, then the first
    ``max(1, round(count * test_fraction))`` of them go to the test split and
    the rest to train. Every service therefore contributes at least one test
    row. A summary of the split sizes is printed.

    Args:
        rows: Dataset rows; each must have a ``"service"`` key.
        test_fraction: Share of each service's rows assigned to test.
        seed: Seed for the shuffling RNG.

    Returns:
        ``(train, test)`` lists of the original row objects.
    """
    import random

    shuffler = random.Random(seed)

    # Group rows per service, preserving their input order within a group.
    grouped: dict[str, list[dict[str, Any]]] = {}
    for record in rows:
        key = record["service"]
        if key not in grouped:
            grouped[key] = []
        grouped[key].append(record)

    train: list[dict[str, Any]] = []
    test: list[dict[str, Any]] = []
    per_service: list[tuple[str, int, int]] = []
    for name in sorted(grouped):
        bucket = grouped[name][:]
        shuffler.shuffle(bucket)
        cut = max(1, round(len(bucket) * test_fraction))
        held_out, kept = bucket[:cut], bucket[cut:]
        test += held_out
        train += kept
        per_service.append((name, len(kept), len(held_out)))

    print(f"\nSplit: {len(train)} train, {len(test)} test")
    for name, kept_count, held_out_count in per_service:
        print(f" {name}: {kept_count} train, {held_out_count} test")

    return train, test
| 170 | + |
| 171 | + |
def _prepare_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Replace each row's ``info`` dict with its compact JSON encoding.

    The rows are modified in place and the same list object is returned.
    Rows whose ``info`` is not a dict (for example, already a string) are
    left untouched, so calling this twice on the same rows is harmless.

    Args:
        rows: Dataset rows; each must have an ``"info"`` key.

    Returns:
        The ``rows`` list that was passed in.
    """
    for record in rows:
        payload = record["info"]
        if not isinstance(payload, dict):
            continue
        record["info"] = json.dumps(payload, separators=(",", ":"))
    return rows
| 178 | + |
| 179 | + |
def save_dataset(
    train_rows: list[dict[str, Any]],
    test_rows: list[dict[str, Any]],
    output_dir: Path,
) -> None:
    """Write the train and test splits to ``output_dir`` as Parquet and JSONL.

    For each split this writes ``<split>.parquet`` and ``<split>.jsonl``. In
    both files the ``info`` column holds a JSON string. The caller's row
    dicts are not modified.

    Args:
        train_rows: Rows of the train split.
        test_rows: Rows of the test split.
        output_dir: Destination directory; created (with parents) if missing.
    """
    from datasets import Dataset

    output_dir.mkdir(parents=True, exist_ok=True)

    for split_name, rows in (("train", train_rows), ("test", test_rows)):
        # _prepare_rows rewrites row["info"] in place, so hand it shallow
        # copies; otherwise the caller's rows would silently change type.
        prepared = _prepare_rows([dict(row) for row in rows])

        ds = Dataset.from_list(prepared)
        parquet_path = output_dir / f"{split_name}.parquet"
        ds.to_parquet(str(parquet_path))
        print(f"Saved {split_name} ({len(rows)} rows) to {parquet_path}")

        # JSONL is written from the serialized copies so that it matches the
        # Parquet file (info stored as a JSON string).
        jsonl_path = output_dir / f"{split_name}.jsonl"
        with open(jsonl_path, "w", encoding="utf-8") as f:
            f.writelines(
                json.dumps(row, ensure_ascii=False) + "\n" for row in prepared
            )
| 200 | + |
| 201 | + |
def push_to_hub(
    train_rows: list[dict[str, Any]],
    test_rows: list[dict[str, Any]],
    repo_id: str,
) -> None:
    """Push the train and test splits to a HuggingFace Hub dataset repo.

    Each split is uploaded under its own split name with ``info`` stored as a
    JSON string. The caller's row dicts are not modified.

    Args:
        train_rows: Rows of the train split.
        test_rows: Rows of the test split.
        repo_id: Target dataset repository, e.g. ``"user/dataset-name"``.
    """
    from datasets import Dataset

    for split_name, rows in (("train", train_rows), ("test", test_rows)):
        # Copy each row dict, not just the list: _prepare_rows assigns to
        # row["info"], which would otherwise alter the caller's rows.
        prepared = _prepare_rows([dict(row) for row in rows])
        ds = Dataset.from_list(prepared)
        ds.push_to_hub(repo_id, split=split_name)
        print(f"Pushed {split_name} ({len(rows)} rows) to {repo_id}")

    print(f"\nDataset: https://huggingface.co/datasets/{repo_id}")
| 216 | + |
| 217 | + |
def main() -> None:
    """Command-line entry point: generate, summarize, split, save and push.

    Flow: build all rows, print a summary, split into train/test, then either
    print one truncated sample row (``--dry-run``) or save the splits locally
    and, when ``--push REPO_ID`` is given, upload them to the Hub.

    Raises:
        SystemExit: If no rows could be generated (e.g. no suite file found).
    """
    parser = argparse.ArgumentParser(
        description="Generate HuggingFace dataset from agent-diff test suites"
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=REPO_ROOT / "datasets" / "agent-diff-bench",
        help="Output directory for local files (default: datasets/agent-diff-bench/)",
    )
    parser.add_argument(
        "--push",
        type=str,
        default=None,
        metavar="REPO_ID",
        help="Push to HuggingFace Hub (e.g. hubertmarek/agent-diff-bench)",
    )
    parser.add_argument(
        "--test-fraction",
        type=float,
        default=0.2,
        help="Fraction of data for test split (default: 0.2)",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for split (default: 42)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Generate and summarize without saving",
    )
    args = parser.parse_args()

    rows = generate_rows()
    if not rows:
        # generate_rows() skips missing suite files, so it can return nothing;
        # stop here with a clear message instead of failing on rows[0] later.
        raise SystemExit("ERROR: no rows generated; no benchmark suites were found")

    print_summary(rows)
    train_rows, test_rows = split_rows(rows, args.test_fraction, args.seed)

    if args.dry_run:
        # split_rows gives every service at least one test row, so train can
        # be empty for tiny suites; fall back to a test row for the sample.
        sample = dict(train_rows[0] if train_rows else test_rows[0])
        sample["question"] = sample["question"][:100] + "..."
        sample["answer"] = sample["answer"][:100] + "..."
        print(f"\nSample row:\n{json.dumps(sample, indent=2)}")
        return

    save_dataset(train_rows, test_rows, args.output_dir)

    if args.push:
        # No need to re-read the suites: _prepare_rows only encodes info
        # values that are still dicts, so these rows produce the same upload
        # whether or not save_dataset already serialized their info.
        push_to_hub(train_rows, test_rows, args.push)
| 272 | + |
| 273 | + |
| 274 | +if __name__ == "__main__": |
| 275 | + main() |
0 commit comments