"""Configuration constants for Spider 2.0-DBT benchmark evaluation."""

from __future__ import annotations

import os
import re
from pathlib import Path

# ── Paths ──────────────────────────────────────────────────────────────────────

# All paths are anchored at the directory that contains this module.
BASE_DIR = Path(__file__).resolve().parent

# Spider2 repository directory and its "spider2-dbt" sub-directory.
SPIDER2_REPO_DIR = BASE_DIR / "spider2_repo"
SPIDER2_DBT_DIR = SPIDER2_REPO_DIR / "spider2-dbt"

# Files below are derived from their parent-directory constants so that each
# path segment is spelled exactly once.
EXAMPLES_DIR = SPIDER2_DBT_DIR / "examples"
TASK_JSONL = EXAMPLES_DIR / "spider2-dbt.jsonl"
EVAL_UTILS_DIR = SPIDER2_DBT_DIR / "evaluation_suite"
GOLD_EVAL_JSONL = EVAL_UTILS_DIR / "gold" / "spider2_eval.jsonl"

# Directories that live next to this module rather than inside the repository.
WORKSPACE_DIR = BASE_DIR / "workspace"
RESULTS_DIR = BASE_DIR / "results"
INCREMENTAL_DIR = RESULTS_DIR / "incremental"
REPORTS_DIR = BASE_DIR / "reports"

# ── Spider2 Repository ─────────────────────────────────────────────────────────

SPIDER2_REPO_URL = "https://github.com/xlang-ai/Spider2.git"
# Git ref of the Spider2 repository to use.
# NOTE(review): "main" is a moving branch, not a pinned commit. Replace it with
# a known-good commit SHA if runs must be reproducible.
SPIDER2_COMMIT = "main"

# Google Drive file IDs for DuckDB database zips (from Spider2 README)
# Format: (gdrive_id, expected_filename)
DUCKDB_ZIP_DOWNLOADS: list[tuple[str, str]] = [
    ("1N3f7BSWC4foj-V-1C9n8M2XmgV7FOcqL", "DBT_start_db.zip"),
    ("1s0USV_iQLo4oe05QqAMnhGGp5jeejCzp", "dbt_gold.zip"),
]

# ── Execution ──────────────────────────────────────────────────────────────────

# Taken from the ALTIMATE_CODE_BIN environment variable when it is set;
# otherwise falls back to "altimate-code".
ALTIMATE_CODE_BIN = os.environ.get("ALTIMATE_CODE_BIN", "altimate-code")
DEFAULT_TIMEOUT = 600  # seconds per task (slowest legit tasks take ~593s)
MAX_RETRIES = 2  # auto-retry only for fast exits (API/init failures)
FAST_EXIT_THRESHOLD_S = 10  # tasks completing under this are likely failures
DEFAULT_PARALLEL = 2  # concurrent tasks (4 caused too much resource contention)
DEFAULT_MODEL = "anthropic/claude-sonnet-4-6"
DEFAULT_AGENT = "coder"

# ── Leaderboard Data (Spider 2.0-DBT, as of 2025) ─────────────────────────────
# Source: https://spider2-dbt.github.io/
# Format: (agent_name, pass_rate)
# Entries are listed from highest to lowest pass rate.

LEADERBOARD: list[tuple[str, float]] = [
    ("Databao Agent", 44.11),
    ("MLE-Bench Agent", 38.24),
    ("Claude 3.5 Sonnet (CoT)", 36.76),
    ("GPT-4o (CoT)", 33.82),
    ("CodeS Agent", 32.35),
    ("OpenHands Agent", 30.88),
    ("SWE-Agent", 27.94),
    ("Gemini 1.5 Pro (CoT)", 26.47),
    ("Llama 3.1 405B (CoT)", 22.06),
    ("GPT-4o mini (CoT)", 19.12),
    ("Claude 3 Haiku (CoT)", 16.18),
]

# ── Task Categories (domain grouping for report) ──────────────────────────────
# Extract domain from instance_id by stripping the trailing task number

# Numeric task suffix: one to three digits at the very end of the id
# (e.g. the "002" in "shopify002").
_TASK_NUMBER_SUFFIX = re.compile(r"\d{1,3}$")


def get_task_domain(instance_id: str) -> str:
    """Return the domain part of *instance_id* by removing its numeric suffix.

    At most the last three digits are removed, so a domain name that itself
    ends in a digit is preserved (stripping every trailing digit would turn
    'f1003' into 'f' instead of 'f1').  An id with no trailing digits is
    returned unchanged.

    NOTE(review): assumes task numbers are at most three digits long, as in
    the examples below -- confirm against the task list.

    e.g. 'shopify002' -> 'shopify', 'f1003' -> 'f1', 'tpch001' -> 'tpch'
    """
    return _TASK_NUMBER_SUFFIX.sub("", instance_id)