diff --git a/.gitignore b/.gitignore index e04949b..94cc105 100644 --- a/.gitignore +++ b/.gitignore @@ -53,7 +53,6 @@ fastagent.secrets.yaml outputs/ output*/ results/ -experiments/ fastagent.jsonl test_script_*.py .claude/ @@ -64,3 +63,7 @@ site/ # Appworld data data/ + +/utils/ + +util_tools/ \ No newline at end of file diff --git a/experiments/parth/README.md b/experiments/parth/README.md new file mode 100644 index 0000000..8055583 --- /dev/null +++ b/experiments/parth/README.md @@ -0,0 +1,82 @@ +# Parth's Experiment Work + +Experiments run using the WAGS framework during Spring 2026. + +## Directory structure + +``` +experiments/parth/ +├── gepa/ # GEPA prompt optimization on BFCL +│ ├── run.py # Orchestrator: runs GEPA optimization loop +│ ├── agent.py # BFCLAgent wrapper for DSPy +│ ├── metrics.py # BFCL metric with feedback for GEPA scoring +│ ├── scoring_utils.py # Score parsing and aggregation helpers +│ ├── data_utils.py # BFCL test case loading and filtering +│ ├── env_utils.py # Model/env validation +│ ├── logging_utils.py # JSONL logging, TeeIO, git info capture +│ ├── gepa_minimal.py # Minimal standalone GEPA example +│ └── gepa_overview.txt # Design notes on the optimization loop +│ +└── feedback_ablation/ # Runtime feedback ablation study + ├── run_experiment.py # Runs one (subset x condition) cell via pytest + ├── analyze_results.py # Builds results dataframe + summary tables + ├── cases.yaml # Per-case trigger configs and feedback messages + ├── baseline_stability.yaml # Baseline pass/fail/flaky labels per case + └── configs/ # 28 JSON config files (A-G x 4 conditions) + ├── A_specific.json + ├── A_vague.json + ├── ... + └── G_verbose.json +``` + +## Feedback ablation + +Studies how different styles of runtime feedback (specific, vague, verbose, null) +affect agent recovery on BFCL multi-turn test cases across 7 failure-pattern +subsets (A-G). Uses the `ExternalFeedbackMiddleware` in `src/wags/middleware/`. 
+ +### Running an experiment cell + +```bash +python experiments/parth/feedback_ablation/run_experiment.py \ + --subset A --condition specific + +# Dry-run to inspect the resolved config and pytest command: +python experiments/parth/feedback_ablation/run_experiment.py \ + --subset A --condition specific --dry-run +``` + +### Analyzing results + +```bash +python experiments/parth/feedback_ablation/analyze_results.py +``` + +Produces `results_dataframe.csv` (in the feedback_ablation directory) and prints +overall accuracy, recovery rate, disruption rate, and behavioral response tables. + +## GEPA + +GEPA (Genetic-Pareto), a reflective prompt optimizer, applied to BFCL instruction optimization. +Uses DSPy's `GEPA` teleprompt to iteratively refine agent instructions. + +```bash +python -m experiments.parth.gepa.run \ + --instruction-file path/to/instruction.txt \ + --output-dir outputs/gepa_on_bfcl +``` + +## Experiment outputs + +Raw outputs are gitignored and stored locally under `outputs/` at the repo root: + +- `outputs/feedback/` — feedback ablation results (A-G x conditions + baselines) +- `outputs/gepa-expert/` — GEPA optimization artifacts +- `outputs/D_trigger/` — precondition-gated trigger experiment +- `outputs/brake_feedback/` — brake-only feedback pilot + +Canonical copies are on Google Drive: *(link TBD)* + +## Quarter summary + +*(link TBD)* diff --git a/experiments/parth/feedback_ablation/analyze_results.py b/experiments/parth/feedback_ablation/analyze_results.py new file mode 100644 index 0000000..4f13616 --- /dev/null +++ b/experiments/parth/feedback_ablation/analyze_results.py @@ -0,0 +1,494 @@ +#!/usr/bin/env python3 +"""Analysis pipeline for feedback experiment results. + +Reads external_feedback.jsonl, *_evaluation.json, and *_complete.json across +all cells and produces: + 1. A results dataframe (one row per subset × condition × test_case_id) + 2. Recovery rate table by (subset, condition) + 3. Disruption rate table by (subset, condition) over consistent_pass cases + 4. 
def load_baseline_outcomes() -> dict[tuple[str, str], str]:
    """Load baseline stability labels keyed by ``(subset, case_id)``.

    Reads ``BASELINE_STABILITY_YAML``, a YAML list of mappings that each carry
    ``subset``, ``case_id`` and ``baseline_stability`` keys.

    Returns:
        ``{(subset, case_id): baseline_stability}``; empty when the YAML
        document has no entries.

    Raises:
        KeyError: If an entry lacks one of the three required keys.
    """
    with open(BASELINE_STABILITY_YAML, encoding="utf-8") as f:
        # An empty YAML document parses to None; treat it as "no entries".
        entries = yaml.safe_load(f) or []
    return {
        (e["subset"], e["case_id"]): e["baseline_stability"]
        for e in entries
    }


def load_expected_cases() -> dict[str, list[str]]:
    """Load the expected case ids per subset from ``CASES_YAML``.

    Returns:
        ``{subset: [case_id, ...]}`` with case ids in file order; empty when
        the YAML document has no entries.

    Raises:
        KeyError: If an entry lacks ``subset`` or ``case_id``.
    """
    with open(CASES_YAML, encoding="utf-8") as f:
        # An empty YAML document parses to None; treat it as "no entries".
        cases = yaml.safe_load(f) or []
    by_subset: dict[str, list[str]] = {}
    for c in cases:
        by_subset.setdefault(c["subset"], []).append(c["case_id"])
    return by_subset


def load_feedback_log(path: Path) -> dict[str, list[dict]]:
    """Parse an ``external_feedback.jsonl`` file into per-case record lists.

    Blank lines are ignored. A line that is not valid JSON, or a record
    without a ``test_case_id``, is skipped with a warning on stderr instead
    of aborting the whole analysis (a run that died mid-write can leave a
    truncated final line).

    Args:
        path: Location of the JSONL log; it does not have to exist.

    Returns:
        ``{test_case_id: [records]}`` in file order (a ``defaultdict``);
        empty when *path* does not exist.
    """
    by_case: dict[str, list[dict]] = defaultdict(list)
    if not path.exists():
        return by_case
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                record = json.loads(line)
                case_id = record["test_case_id"]
            except (json.JSONDecodeError, KeyError, TypeError) as exc:
                print(
                    f"warning: skipping malformed record at {path}:{lineno}: {exc!r}",
                    file=sys.stderr,
                )
                continue
            by_case[case_id].append(record)
    return by_case
Returns eval dict or None.""" + try: + from tests.benchmarks.bfcl.test_bfcl import _validate_from_complete_json + return _validate_from_complete_json(test_id, complete_path) + except Exception: + return None + + +def find_eval_result(cell_dir: Path, case_id: str) -> str | None: + """Find evaluation result for a case. Returns 'pass', 'fail', or None. + + If no evaluation file exists but a complete.json does, retroactively + evaluates from the complete.json and caches the result. + """ + # Check raw/ directory (feedback runs) + candidates = [ + cell_dir / "raw" / f"{case_id}_evaluation.json", + # Baseline format: per-case subdirectory + cell_dir / case_id / "evaluation.json", + ] + for p in candidates: + if p.exists(): + with open(p) as f: + ev = json.load(f) + valid = ev.get("validation", {}).get("valid", False) + return "pass" if valid else "fail" + + # No eval file — try retroactive evaluation from complete.json + complete_path = find_complete_json(cell_dir, case_id) + if complete_path is not None and complete_path.exists(): + ev = _run_eval_from_complete(case_id, complete_path) + if ev is not None: + # Cache the result + eval_out = complete_path.parent / f"{case_id}_evaluation.json" + eval_out.write_text(json.dumps(ev, indent=2, default=str)) + valid = ev.get("validation", {}).get("valid", False) + return "pass" if valid else "fail" + + return None + + +def find_complete_json(cell_dir: Path, case_id: str) -> Path | None: + """Find complete.json for a case.""" + candidates = [ + cell_dir / "raw" / f"{case_id}_complete.json", + cell_dir / case_id / "raw" / f"{case_id}_complete.json", + ] + for p in candidates: + if p.exists(): + return p + return None + + +def classify_behavior( + complete_path: Path | None, + feedback_records: list[dict], +) -> str: + """Classify agent behavior after feedback. 
+ + Returns one of: + not_triggered, identical_retry, different_args, different_tool, + no_retry, reasoning_shown, crash + """ + triggered = [r for r in feedback_records if r.get("triggered")] + if not triggered: + return "not_triggered" + + if complete_path is None or not complete_path.exists(): + return "crash" + + with open(complete_path) as f: + data = json.load(f) + msgs = data.get("messages", []) + + # Collect all feedback messages for matching + feedback_texts = [r["feedback_message"] for r in triggered if r.get("feedback_message")] + + behaviors = [] + + for i, msg in enumerate(msgs): + tr = msg.get("tool_results") or {} + for tid, result in tr.items(): + texts = [c.get("text", "") for c in result.get("content", []) if isinstance(c, dict)] + full_text = " ".join(texts) + + is_feedback = any(ft and ft[:40] in full_text for ft in feedback_texts) + if not is_feedback: + continue + + # Find original call + orig_call = None + if i > 0: + prev = msgs[i - 1] + for ptid, pcall in (prev.get("tool_calls") or {}).items(): + if ptid == tid: + orig_call = pcall + break + + # Check next assistant message + if i + 1 >= len(msgs): + behaviors.append("no_retry") + continue + + nxt = msgs[i + 1] + if nxt.get("role") != "assistant": + behaviors.append("no_retry") + continue + + has_reasoning = any( + (isinstance(c, dict) and c.get("text", "").strip()) + or (isinstance(c, str) and c.strip()) + for c in (nxt.get("content") or []) + ) + + next_calls = nxt.get("tool_calls") or {} + + if not next_calls: + behaviors.append("reasoning_shown" if has_reasoning else "no_retry") + continue + + if orig_call: + orig_name = orig_call["name"] + orig_args = orig_call.get("arguments") + retry_same = [c for c in next_calls.values() if c["name"] == orig_name] + if retry_same: + if retry_same[0].get("arguments") == orig_args: + behaviors.append("reasoning_shown" if has_reasoning else "identical_retry") + else: + behaviors.append("reasoning_shown" if has_reasoning else "different_args") + else: + 
def build_results() -> list[dict]:
    """Assemble one result row per (subset, condition, case_id).

    Cells whose output directory under ``FEEDBACK_DIR`` does not exist are
    skipped entirely. Each row records the baseline stability label, whether
    and how often feedback triggered, the behavioral response label, and the
    evaluator outcome (``pass``/``fail``, ``crash`` when a transcript exists
    without an evaluation, ``missing`` when there is neither).
    """
    stability_by_case = load_baseline_outcomes()
    cases_by_subset = load_expected_cases()
    table = []

    for subset, case_ids in sorted(cases_by_subset.items()):
        ordered_ids = sorted(case_ids)
        for condition in CONDITIONS:
            cell_dir = FEEDBACK_DIR / subset / condition
            if not cell_dir.exists():
                continue

            log_by_case = load_feedback_log(cell_dir / "raw" / "external_feedback.jsonl")

            for case_id in ordered_ids:
                case_records = log_by_case.get(case_id, [])
                n_triggered = len([rec for rec in case_records if rec.get("triggered")])

                verdict = find_eval_result(cell_dir, case_id)
                transcript_path = find_complete_json(cell_dir, case_id)
                response = classify_behavior(transcript_path, case_records)

                if verdict is not None:
                    outcome = verdict
                elif transcript_path and transcript_path.exists():
                    # Ran but produced no evaluation: most likely a crash.
                    outcome = "crash"
                else:
                    outcome = "missing"

                table.append(
                    {
                        "subset": subset,
                        "condition": condition,
                        "case_id": case_id,
                        "baseline_outcome": stability_by_case.get((subset, case_id), "unknown"),
                        "trigger_fired": n_triggered > 0,
                        "trigger_count": n_triggered,
                        "behavioral_response": response,
                        "evaluator_outcome": outcome,
                    }
                )

    return table
--------------------------------------------------------------------------- +# Summary tables +# --------------------------------------------------------------------------- + +def print_table(headers: list[str], rows: list[list], title: str) -> None: + """Print a formatted table.""" + print(f"\n{'=' * 80}") + print(f" {title}") + print(f"{'=' * 80}") + + col_widths = [len(h) for h in headers] + for row in rows: + for i, val in enumerate(row): + col_widths[i] = max(col_widths[i], len(str(val))) + + fmt = " ".join(f"{{:<{w}}}" for w in col_widths) + print(fmt.format(*headers)) + print(fmt.format(*["-" * w for w in col_widths])) + for row in rows: + print(fmt.format(*[str(v) for v in row])) + + +def compute_baseline_eval(subset: str, case_id: str) -> dict[str, str | None]: + """Get pass/fail from each baseline run.""" + results = {} + for bl in BASELINES: + cell_dir = FEEDBACK_DIR / subset / bl + results[bl] = find_eval_result(cell_dir, case_id) + return results + + +def table_accuracy(results: list[dict]) -> None: + """Overall accuracy by (subset, condition) vs. 
def table_accuracy(results: list[dict]) -> None:
    """Print overall accuracy by (subset, condition) next to baseline accuracy.

    Baseline accuracy for a subset pools every baseline run (see
    ``BASELINES``) that produced a pass/fail outcome for the subset's expected
    cases. After the table, one line gives the pooled baseline accuracy and
    one line gives the per-condition accuracy pooled across subsets.

    Args:
        results: Rows as produced by ``build_results``.
    """
    expected_cases = load_expected_cases()
    headers = ["subset", "condition", "n_cases", "n_pass", "n_fail", "n_other", "accuracy", "baseline_accuracy"]
    table_rows = []

    bl_total_pass = 0
    bl_total_runs = 0

    for subset in sorted({r["subset"] for r in results}):
        # Baseline runs without an evaluation (None) are excluded from the
        # denominator rather than counted as failures.
        bl_outcomes = [
            outcome
            for case_id in expected_cases.get(subset, [])
            for outcome in compute_baseline_eval(subset, case_id).values()
            if outcome is not None
        ]
        bl_runs = len(bl_outcomes)
        bl_passes = bl_outcomes.count("pass")
        bl_acc = f"{bl_passes}/{bl_runs} ({100 * bl_passes / bl_runs:.0f}%)" if bl_runs > 0 else "—"
        bl_total_pass += bl_passes
        bl_total_runs += bl_runs

        for condition in CONDITIONS:
            cell = [r for r in results if r["subset"] == subset and r["condition"] == condition]
            if not cell:
                continue
            n = len(cell)
            n_pass = sum(1 for r in cell if r["evaluator_outcome"] == "pass")
            n_fail = sum(1 for r in cell if r["evaluator_outcome"] == "fail")
            # Everything that is neither pass nor fail (crash, missing).
            n_other = n - n_pass - n_fail
            acc = f"{n_pass}/{n} ({100 * n_pass / n:.0f}%)"
            table_rows.append([subset, condition, n, n_pass, n_fail, n_other, acc, bl_acc])

    print_table(headers, table_rows, "Table 0: Overall Accuracy by (Subset, Condition)")

    bl_pct = 100 * bl_total_pass / bl_total_runs if bl_total_runs else 0
    by_cond: dict[str, tuple[int, int]] = {}
    for condition in CONDITIONS:
        cond_rows = [r for r in results if r["condition"] == condition]
        cp = sum(1 for r in cond_rows if r["evaluator_outcome"] == "pass")
        by_cond[condition] = (cp, len(cond_rows))
    # Conditions without any rows are left out of the summary line.
    parts = " | ".join(
        f"{c}: {p}/{n} ({100 * p / n:.1f}%)" for c, (p, n) in by_cond.items() if n
    )
    print(f"\nBaseline accuracy: {bl_total_pass}/{bl_total_runs} ({bl_pct:.1f}%)")
    print(f"Feedback accuracy — {parts}")
def table_recovery(results: list[dict]) -> None:
    """Print recovery rate by (subset, condition), split by baseline category.

    Only cases labelled ``consistent_fail`` or ``flaky`` at baseline are
    considered. The recovery rate is the fraction of those cases that pass
    under feedback. ``baseline_pass_rate`` pools the outcomes of all baseline
    runs for the same cases, ignoring runs without an evaluation.

    Args:
        results: Rows as produced by ``build_results``.
    """
    headers = [
        "subset", "condition", "baseline_cat", "n_cases", "n_pass", "n_fail",
        "n_crash", "recovery_rate", "baseline_pass_rate",
    ]
    table_rows = []
    subsets = sorted({r["subset"] for r in results})

    for baseline_cat in ["consistent_fail", "flaky"]:
        for subset in subsets:
            for condition in CONDITIONS:
                cell = [
                    r for r in results
                    if r["subset"] == subset
                    and r["condition"] == condition
                    and r["baseline_outcome"] == baseline_cat
                ]
                if not cell:
                    continue
                n = len(cell)
                n_pass = sum(1 for r in cell if r["evaluator_outcome"] == "pass")
                n_fail = sum(1 for r in cell if r["evaluator_outcome"] == "fail")
                n_crash = sum(1 for r in cell if r["evaluator_outcome"] in ("crash", "missing"))
                rate = f"{n_pass}/{n} ({100 * n_pass / n:.0f}%)"

                # Aggregate pass rate of the same cases across the baseline runs.
                bl_outcomes = [
                    outcome
                    for r in cell
                    for outcome in compute_baseline_eval(subset, r["case_id"]).values()
                    if outcome is not None
                ]
                bl_total = len(bl_outcomes)
                bl_passes = bl_outcomes.count("pass")
                bl_rate = f"{bl_passes}/{bl_total} ({100 * bl_passes / bl_total:.0f}%)" if bl_total > 0 else "—"

                table_rows.append(
                    [subset, condition, baseline_cat, n, n_pass, n_fail, n_crash, rate, bl_rate]
                )

    print_table(headers, table_rows, "Table 1: Recovery Rate (consistent_fail & flaky cases)")
def table_disruption(results: list[dict]) -> None:
    """Print the disruption rate by (subset, condition) for consistent_pass cases.

    The rate is computed over all cases labelled ``consistent_pass`` at
    baseline, whether or not feedback fired; ``n_triggered`` is reported as a
    separate column. A case counts as disrupted when its evaluator outcome is
    ``fail``, ``crash`` or ``missing``.

    Args:
        results: Rows as produced by ``build_results``.
    """
    headers = ["subset", "condition", "n_cases", "n_triggered", "n_pass", "n_fail", "n_crash", "disruption_rate"]
    table_rows = []

    def count(rows: list[dict], predicate) -> int:
        return sum(1 for row in rows if predicate(row))

    for subset in sorted(set(r["subset"] for r in results)):
        for condition in CONDITIONS:
            stable = [
                r for r in results
                if r["baseline_outcome"] == "consistent_pass"
                and r["subset"] == subset
                and r["condition"] == condition
            ]
            if not stable:
                continue
            total = len(stable)
            fired = count(stable, lambda r: r["trigger_fired"])
            passed = count(stable, lambda r: r["evaluator_outcome"] == "pass")
            failed = count(stable, lambda r: r["evaluator_outcome"] == "fail")
            crashed = count(stable, lambda r: r["evaluator_outcome"] in ("crash", "missing"))
            disrupted = failed + crashed
            if total > 0:
                rate = f"{disrupted}/{total} ({100 * disrupted / total:.0f}%)"
            else:
                rate = "—"
            table_rows.append([subset, condition, total, fired, passed, failed, crashed, rate])

    print_table(headers, table_rows, "Table 2: Disruption Rate (consistent_pass cases)")
def table_behavior(results: list[dict]) -> None:
    """Print how often each behavioral response occurred.

    Two tables are printed: one pooled across subsets with a row for every
    condition (even when it has no rows), and one per (subset, condition)
    that omits empty cells.

    Args:
        results: Rows as produced by ``build_results``.
    """
    behavior_types = [
        "not_triggered", "identical_retry", "different_args",
        "different_tool", "no_retry", "reasoning_shown", "crash",
    ]

    def tally(rows: list[dict]) -> list[int]:
        # One count per behavior type, in column order.
        return [
            sum(1 for row in rows if row["behavioral_response"] == kind)
            for kind in behavior_types
        ]

    pooled_rows = []
    for condition in CONDITIONS:
        matching = [r for r in results if r["condition"] == condition]
        pooled_rows.append([condition, len(matching)] + tally(matching))

    print_table(
        ["condition", "n_total"] + behavior_types,
        pooled_rows,
        "Table 3: Behavioral Response Distribution by Condition (pooled)",
    )

    # Same breakdown per (subset, condition).
    detailed_rows = []
    for subset in sorted(set(r["subset"] for r in results)):
        for condition in CONDITIONS:
            matching = [
                r for r in results
                if r["subset"] == subset and r["condition"] == condition
            ]
            if matching:
                detailed_rows.append([subset, condition, len(matching)] + tally(matching))

    print_table(
        ["subset", "condition", "n_total"] + behavior_types,
        detailed_rows,
        "Table 3b: Behavioral Response Distribution by (Subset, Condition)",
    )


def main() -> None:
    """Build the results rows, write them to CSV, and print every summary table."""
    results = build_results()

    # Overview of what was collected.
    subsets_seen = sorted(set(r["subset"] for r in results))
    conditions_seen = sorted(set(r["condition"] for r in results))
    print(f"Results dataframe: {len(results)} rows")
    print(f"Subsets: {subsets_seen}")
    print(f"Conditions: {conditions_seen}")

    # Dump the rows next to this script.
    csv_path = _THIS_DIR / "results_dataframe.csv"
    fieldnames = [
        "subset", "condition", "case_id", "baseline_outcome",
        "trigger_fired", "trigger_count", "behavioral_response", "evaluator_outcome",
    ]
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)
    print(f"\nDataframe written to {csv_path}")

    # Summary tables, in numbered order.
    for render in (table_accuracy, table_recovery, table_disruption, table_behavior):
        render(results)
multi_turn_base_55 + subset: A + baseline_stability: flaky +- case_id: multi_turn_base_73 + subset: A + baseline_stability: consistent_fail +- case_id: multi_turn_base_84 + subset: A + baseline_stability: consistent_fail +- case_id: multi_turn_base_87 + subset: A + baseline_stability: consistent_fail +- case_id: multi_turn_base_89 + subset: A + baseline_stability: consistent_fail +- case_id: multi_turn_base_92 + subset: A + baseline_stability: consistent_fail +- case_id: multi_turn_base_97 + subset: A + baseline_stability: consistent_fail +- case_id: multi_turn_base_98 + subset: A + baseline_stability: consistent_fail +- case_id: multi_turn_base_102 + subset: B + baseline_stability: consistent_pass +- case_id: multi_turn_base_103 + subset: B + baseline_stability: consistent_fail +- case_id: multi_turn_base_129 + subset: B + baseline_stability: consistent_fail +- case_id: multi_turn_base_148 + subset: B + baseline_stability: consistent_pass +- case_id: multi_turn_base_190 + subset: B + baseline_stability: consistent_fail +- case_id: multi_turn_base_27 + subset: B + baseline_stability: flaky +- case_id: multi_turn_base_59 + subset: C + baseline_stability: flaky +- case_id: multi_turn_base_66 + subset: C + baseline_stability: consistent_fail +- case_id: multi_turn_base_67 + subset: C + baseline_stability: consistent_pass +- case_id: multi_turn_base_76 + subset: C + baseline_stability: consistent_pass +- case_id: multi_turn_base_79 + subset: C + baseline_stability: flaky +- case_id: multi_turn_base_81 + subset: C + baseline_stability: consistent_fail +- case_id: multi_turn_base_0 + subset: D + baseline_stability: consistent_fail +- case_id: multi_turn_base_10 + subset: D + baseline_stability: flaky +- case_id: multi_turn_base_18 + subset: D + baseline_stability: consistent_fail +- case_id: multi_turn_base_4 + subset: D + baseline_stability: consistent_fail +- case_id: multi_turn_base_40 + subset: D + baseline_stability: consistent_fail +- case_id: multi_turn_base_42 + 
subset: D + baseline_stability: consistent_fail +- case_id: multi_turn_base_44 + subset: D + baseline_stability: consistent_fail +- case_id: multi_turn_base_155 + subset: E + baseline_stability: flaky +- case_id: multi_turn_base_185 + subset: E + baseline_stability: consistent_fail +- case_id: multi_turn_base_198 + subset: E + baseline_stability: consistent_fail +- case_id: multi_turn_base_173 + subset: F + baseline_stability: consistent_fail +- case_id: multi_turn_base_179 + subset: F + baseline_stability: consistent_fail +- case_id: multi_turn_base_180 + subset: F + baseline_stability: consistent_fail +- case_id: multi_turn_base_184 + subset: F + baseline_stability: flaky +- case_id: multi_turn_base_48 + subset: F + baseline_stability: flaky +- case_id: multi_turn_base_161 + subset: G + baseline_stability: consistent_fail +- case_id: multi_turn_base_172 + subset: G + baseline_stability: consistent_fail +- case_id: multi_turn_base_188 + subset: G + baseline_stability: consistent_fail +- case_id: multi_turn_base_193 + subset: G + baseline_stability: consistent_fail diff --git a/experiments/parth/feedback_ablation/cases.yaml b/experiments/parth/feedback_ablation/cases.yaml new file mode 100644 index 0000000..caf12ef --- /dev/null +++ b/experiments/parth/feedback_ablation/cases.yaml @@ -0,0 +1,859 @@ +- case_id: multi_turn_base_97 + subset: A + pattern: Unrequested Vehicle Actuation + trigger: + tool_name: releaseBrakePedal + trigger_type: tool_only + occurrence: first + messages: + specific: Releasing the brake pedal was not requested by the user and is not required to complete the current task. Do + not call releaseBrakePedal unless explicitly stated. Continue with any remaining steps the user actually asked for, + or end your turn if the task is complete. + vague: This action does not appear to be necessary for the user's request. Reconsider whether it should be performed. + verbose: The releaseBrakePedal action was not part of the user's request. 
In vehicle control workflows, the brake pedal + is a critical safety component, and releasing it changes the vehicle's physical state in ways the user has not authorized. + Brake pedal manipulation is typically reserved for specific scenarios such as initiating motion after a complete stop, + transitioning out of park, or emergency disengagement procedures. Modern vehicle control systems often have interlocks + and state dependencies that make brake pedal state changes consequential for downstream operations including engine + state, transmission engagement, and parking brake coordination. Before performing any vehicle actuation, you should + verify that it directly serves a user-requested goal. In this case, the user did not request that the brake pedal be + released, and doing so is not required to complete the task they asked for. Do not call releaseBrakePedal when it was + not explicitly required. Instead, continue with any remaining steps that were actually requested, or end your turn if + the user's task has been fully addressed. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_98 + subset: A + pattern: Unrequested Vehicle Actuation + trigger: + tool_name: releaseBrakePedal + trigger_type: tool_only + occurrence: first + messages: + specific: Releasing the brake pedal was not requested by the user and is not required to complete the current task. Do + not call releaseBrakePedal unless explicitly stated. Continue with any remaining steps the user actually asked for, + or end your turn if the task is complete. + vague: This action does not appear to be necessary for the user's request. Reconsider whether it should be performed. + verbose: The releaseBrakePedal action was not part of the user's request. In vehicle control workflows, the brake pedal + is a critical safety component, and releasing it changes the vehicle's physical state in ways the user has not authorized. 
+ Brake pedal manipulation is typically reserved for specific scenarios such as initiating motion after a complete stop, + transitioning out of park, or emergency disengagement procedures. Modern vehicle control systems often have interlocks + and state dependencies that make brake pedal state changes consequential for downstream operations including engine + state, transmission engagement, and parking brake coordination. Before performing any vehicle actuation, you should + verify that it directly serves a user-requested goal. In this case, the user did not request that the brake pedal be + released, and doing so is not required to complete the task they asked for. Do not call releaseBrakePedal when it was + not explicitly required. Instead, continue with any remaining steps that were actually requested, or end your turn if + the user's task has been fully addressed. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_52 + subset: A + pattern: Unrequested Vehicle Actuation + trigger: + tool_name: releaseBrakePedal + trigger_type: tool_only + occurrence: first + messages: + specific: Releasing the brake pedal was not requested by the user and is not required to complete the current task. Do + not call releaseBrakePedal unless explicitly stated. Continue with any remaining steps the user actually asked for, + or end your turn if the task is complete. + vague: This action does not appear to be necessary for the user's request. Reconsider whether it should be performed. + verbose: The releaseBrakePedal action was not part of the user's request. In vehicle control workflows, the brake pedal + is a critical safety component, and releasing it changes the vehicle's physical state in ways the user has not authorized. + Brake pedal manipulation is typically reserved for specific scenarios such as initiating motion after a complete stop, + transitioning out of park, or emergency disengagement procedures. 
Modern vehicle control systems often have interlocks + and state dependencies that make brake pedal state changes consequential for downstream operations including engine + state, transmission engagement, and parking brake coordination. Before performing any vehicle actuation, you should + verify that it directly serves a user-requested goal. In this case, the user did not request that the brake pedal be + released, and doing so is not required to complete the task they asked for. Do not call releaseBrakePedal when it was + not explicitly required. Instead, continue with any remaining steps that were actually requested, or end your turn if + the user's task has been fully addressed. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_53 + subset: A + pattern: Unrequested Vehicle Actuation + trigger: null + messages: null +- case_id: multi_turn_base_54 + subset: A + pattern: Unrequested Vehicle Actuation + trigger: + tool_name: releaseBrakePedal + trigger_type: tool_only + occurrence: first + messages: + specific: Releasing the brake pedal was not requested by the user and is not required to complete the current task. Do + not call releaseBrakePedal unless explicitly stated. Continue with any remaining steps the user actually asked for, + or end your turn if the task is complete. + vague: This action does not appear to be necessary for the user's request. Reconsider whether it should be performed. + verbose: The releaseBrakePedal action was not part of the user's request. In vehicle control workflows, the brake pedal + is a critical safety component, and releasing it changes the vehicle's physical state in ways the user has not authorized. + Brake pedal manipulation is typically reserved for specific scenarios such as initiating motion after a complete stop, + transitioning out of park, or emergency disengagement procedures. 
Modern vehicle control systems often have interlocks + and state dependencies that make brake pedal state changes consequential for downstream operations including engine + state, transmission engagement, and parking brake coordination. Before performing any vehicle actuation, you should + verify that it directly serves a user-requested goal. In this case, the user did not request that the brake pedal be + released, and doing so is not required to complete the task they asked for. Do not call releaseBrakePedal when it was + not explicitly required. Instead, continue with any remaining steps that were actually requested, or end your turn if + the user's task has been fully addressed. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_55 + subset: A + pattern: Unrequested Vehicle Actuation + trigger: + tool_name: releaseBrakePedal + trigger_type: tool_only + occurrence: first + messages: + specific: Releasing the brake pedal was not requested by the user and is not required to complete the current task. Do + not call releaseBrakePedal unless explicitly stated. Continue with any remaining steps the user actually asked for, + or end your turn if the task is complete. + vague: This action does not appear to be necessary for the user's request. Reconsider whether it should be performed. + verbose: The releaseBrakePedal action was not part of the user's request. In vehicle control workflows, the brake pedal + is a critical safety component, and releasing it changes the vehicle's physical state in ways the user has not authorized. + Brake pedal manipulation is typically reserved for specific scenarios such as initiating motion after a complete stop, + transitioning out of park, or emergency disengagement procedures. 
Modern vehicle control systems often have interlocks + and state dependencies that make brake pedal state changes consequential for downstream operations including engine + state, transmission engagement, and parking brake coordination. Before performing any vehicle actuation, you should + verify that it directly serves a user-requested goal. In this case, the user did not request that the brake pedal be + released, and doing so is not required to complete the task they asked for. Do not call releaseBrakePedal when it was + not explicitly required. Instead, continue with any remaining steps that were actually requested, or end your turn if + the user's task has been fully addressed. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_73 + subset: A + pattern: Unrequested Vehicle Actuation + trigger: + tool_name: releaseBrakePedal + trigger_type: tool_only + occurrence: first + messages: + specific: Releasing the brake pedal was not requested by the user and is not required to complete the current task. Do + not call releaseBrakePedal unless explicitly stated. Continue with any remaining steps the user actually asked for, + or end your turn if the task is complete. + vague: This action does not appear to be necessary for the user's request. Reconsider whether it should be performed. + verbose: The releaseBrakePedal action was not part of the user's request. In vehicle control workflows, the brake pedal + is a critical safety component, and releasing it changes the vehicle's physical state in ways the user has not authorized. + Brake pedal manipulation is typically reserved for specific scenarios such as initiating motion after a complete stop, + transitioning out of park, or emergency disengagement procedures. 
Modern vehicle control systems often have interlocks + and state dependencies that make brake pedal state changes consequential for downstream operations including engine + state, transmission engagement, and parking brake coordination. Before performing any vehicle actuation, you should + verify that it directly serves a user-requested goal. In this case, the user did not request that the brake pedal be + released, and doing so is not required to complete the task they asked for. Do not call releaseBrakePedal when it was + not explicitly required. Instead, continue with any remaining steps that were actually requested, or end your turn if + the user's task has been fully addressed. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_84 + subset: A + pattern: Unrequested Vehicle Actuation + trigger: + tool_name: releaseBrakePedal + trigger_type: tool_only + occurrence: first + messages: + specific: Releasing the brake pedal was not requested by the user and is not required to complete the current task. Do + not call releaseBrakePedal unless explicitly stated. Continue with any remaining steps the user actually asked for, + or end your turn if the task is complete. + vague: This action does not appear to be necessary for the user's request. Reconsider whether it should be performed. + verbose: The releaseBrakePedal action was not part of the user's request. In vehicle control workflows, the brake pedal + is a critical safety component, and releasing it changes the vehicle's physical state in ways the user has not authorized. + Brake pedal manipulation is typically reserved for specific scenarios such as initiating motion after a complete stop, + transitioning out of park, or emergency disengagement procedures. 
Modern vehicle control systems often have interlocks + and state dependencies that make brake pedal state changes consequential for downstream operations including engine + state, transmission engagement, and parking brake coordination. Before performing any vehicle actuation, you should + verify that it directly serves a user-requested goal. In this case, the user did not request that the brake pedal be + released, and doing so is not required to complete the task they asked for. Do not call releaseBrakePedal when it was + not explicitly required. Instead, continue with any remaining steps that were actually requested, or end your turn if + the user's task has been fully addressed. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_87 + subset: A + pattern: Unrequested Vehicle Actuation + trigger: + tool_name: releaseBrakePedal + trigger_type: tool_only + occurrence: first + messages: + specific: Releasing the brake pedal was not requested by the user and is not required to complete the current task. Do + not call releaseBrakePedal unless explicitly stated. Continue with any remaining steps the user actually asked for, + or end your turn if the task is complete. + vague: This action does not appear to be necessary for the user's request. Reconsider whether it should be performed. + verbose: The releaseBrakePedal action was not part of the user's request. In vehicle control workflows, the brake pedal + is a critical safety component, and releasing it changes the vehicle's physical state in ways the user has not authorized. + Brake pedal manipulation is typically reserved for specific scenarios such as initiating motion after a complete stop, + transitioning out of park, or emergency disengagement procedures. 
Modern vehicle control systems often have interlocks + and state dependencies that make brake pedal state changes consequential for downstream operations including engine + state, transmission engagement, and parking brake coordination. Before performing any vehicle actuation, you should + verify that it directly serves a user-requested goal. In this case, the user did not request that the brake pedal be + released, and doing so is not required to complete the task they asked for. Do not call releaseBrakePedal when it was + not explicitly required. Instead, continue with any remaining steps that were actually requested, or end your turn if + the user's task has been fully addressed. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_89 + subset: A + pattern: Unrequested Vehicle Actuation + trigger: + tool_name: releaseBrakePedal + trigger_type: tool_only + occurrence: first + messages: + specific: Releasing the brake pedal was not requested by the user and is not required to complete the current task. Do + not call releaseBrakePedal unless explicitly stated. Continue with any remaining steps the user actually asked for, + or end your turn if the task is complete. + vague: This action does not appear to be necessary for the user's request. Reconsider whether it should be performed. + verbose: The releaseBrakePedal action was not part of the user's request. In vehicle control workflows, the brake pedal + is a critical safety component, and releasing it changes the vehicle's physical state in ways the user has not authorized. + Brake pedal manipulation is typically reserved for specific scenarios such as initiating motion after a complete stop, + transitioning out of park, or emergency disengagement procedures. 
Modern vehicle control systems often have interlocks + and state dependencies that make brake pedal state changes consequential for downstream operations including engine + state, transmission engagement, and parking brake coordination. Before performing any vehicle actuation, you should + verify that it directly serves a user-requested goal. In this case, the user did not request that the brake pedal be + released, and doing so is not required to complete the task they asked for. Do not call releaseBrakePedal when it was + not explicitly required. Instead, continue with any remaining steps that were actually requested, or end your turn if + the user's task has been fully addressed. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_92 + subset: A + pattern: Unrequested Vehicle Actuation + trigger: + tool_name: releaseBrakePedal + trigger_type: tool_only + occurrence: first + messages: + specific: Releasing the brake pedal was not requested by the user and is not required to complete the current task. Do + not call releaseBrakePedal unless explicitly stated. Continue with any remaining steps the user actually asked for, + or end your turn if the task is complete. + vague: This action does not appear to be necessary for the user's request. Reconsider whether it should be performed. + verbose: The releaseBrakePedal action was not part of the user's request. In vehicle control workflows, the brake pedal + is a critical safety component, and releasing it changes the vehicle's physical state in ways the user has not authorized. + Brake pedal manipulation is typically reserved for specific scenarios such as initiating motion after a complete stop, + transitioning out of park, or emergency disengagement procedures. 
Modern vehicle control systems often have interlocks + and state dependencies that make brake pedal state changes consequential for downstream operations including engine + state, transmission engagement, and parking brake coordination. Before performing any vehicle actuation, you should + verify that it directly serves a user-requested goal. In this case, the user did not request that the brake pedal be + released, and doing so is not required to complete the task they asked for. Do not call releaseBrakePedal when it was + not explicitly required. Instead, continue with any remaining steps that were actually requested, or end your turn if + the user's task has been fully addressed. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_103 + subset: B + pattern: Ticket Over-Specification + trigger: + tool_name: send_message + trigger_type: tool_only + occurrence: first + messages: + specific: Before sending, take a moment to reconsider the message content. Is it limited to what the user explicitly asked + you to communicate? Free-text fields can easily accumulate context, identifiers, or details that weren't part of the + original request. Verify that the message reflects only what the user wanted conveyed, then proceed if appropriate. + vague: Before proceeding with this call, take a moment to reconsider the content of the arguments. Verify they reflect + what the user actually asked for, then continue if appropriate. + verbose: 'Messaging tools in multi-user systems serve a wide range of communication purposes, from brief notifications + to detailed coordination across stakeholders. Effective messages typically prioritize clarity and respect the recipient''s + context — they convey what the recipient needs to know without requiring them to parse through extraneous identifiers, + metadata, or background that the system already tracks elsewhere. 
Many messaging workflows benefit from minimalism: + the user generally has a specific intent for the message, and additional context the agent has access to (order IDs, + symbols, prices, statuses, timestamps) is often more appropriately handled by the platform''s structured data rather + than embedded in conversational text. Before sending, take a moment to reconsider whether the message content is limited + to what the user explicitly asked you to communicate. Verify that it reflects only what the user wanted conveyed, then + proceed if appropriate.' + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_129 + subset: B + pattern: Ticket Over-Specification + trigger: + tool_name: resolve_ticket + trigger_type: tool_only + occurrence: first + messages: + specific: Before resolving this ticket, take a moment to reconsider. Does the resolution accurately describe only the + actions that have actually been performed in this session? Resolutions can drift toward summarizing intended or assumed + steps rather than completed ones. Verify that the ticket's underlying issue is genuinely addressed and that the resolution + text reflects what actually occurred, then proceed if appropriate. + vague: Before proceeding with this call, take a moment to reconsider the content of the arguments. Verify they reflect + what the user actually asked for, then continue if appropriate. + verbose: Ticket resolution is a consequential action in support workflows because it changes the ticket's state and signals + to other systems and users that the underlying issue has been addressed. Resolution text serves as the durable record + of what was done, and downstream auditing, customer follow-ups, and reporting often rely on its accuracy. 
A common failure + mode in agent-driven resolution is for the resolution text to describe what was intended or summarized from prior context, + rather than what was actually executed in the current session — this can result in tickets being closed prematurely + or with misleading documentation. Before resolving, take a moment to reconsider whether the resolution describes only + the actions that have actually been performed, and whether the ticket's underlying issue is genuinely addressed. Verify + accordingly, then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_148 + subset: B + pattern: Ticket Over-Specification + trigger: null + messages: null +- case_id: multi_turn_base_27 + subset: B + pattern: Ticket Over-Specification + trigger: + tool_name: create_ticket + trigger_type: tool_only + occurrence: first + messages: + specific: Before creating this ticket, take a moment to reconsider the title and description. Are they limited to the + information the user actually provided or requested? Description fields can easily accumulate adjacent context, prior + conversation details, or assumed background that wasn't part of the ticket's scope. Verify that the ticket content reflects + only what the user asked to record, then proceed if appropriate. + vague: Before proceeding with this call, take a moment to reconsider the content of the arguments. Verify they reflect + what the user actually asked for, then continue if appropriate. + verbose: Ticket creation is the entry point for many support and tracking workflows, and the quality of a ticket's initial + title and description has downstream effects on triage, routing, and resolution. Effective tickets are typically scoped + to a single, clearly described issue with only the context necessary for whoever picks it up to understand and act on + it. 
Description fields can easily accumulate adjacent information — prior conversation, related but separate issues, + system metadata, or context the user didn't ask to be recorded — which can dilute the ticket's purpose and make it harder + to resolve. Before creating, take a moment to reconsider whether the title and description are limited to what the user + actually asked to record. Verify, then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_102 + subset: B + pattern: Ticket Over-Specification + trigger: null + messages: null +- case_id: multi_turn_base_190 + subset: B + pattern: Ticket Over-Specification + trigger: + tool_name: contact_customer_support + trigger_type: tool_only + occurrence: first + messages: + specific: Before sending this message, take a moment to reconsider its content. Is it scoped to the specific issue the + user wants raised, or has it accumulated identifiers, transaction details, and context beyond what's necessary? Free-text + fields in support requests often work better when concise and focused. Verify that the message communicates only what + the user asked you to convey, then proceed if appropriate. + vague: Before proceeding with this call, take a moment to reconsider the content of the arguments. Verify they reflect + what the user actually asked for, then continue if appropriate. + verbose: Customer support communications operate within established workflows where intake messages are typically reviewed + by humans or routed through triage systems. Effective support requests tend to be concise and focused on a single issue, + providing only the context necessary for the support team to understand and respond — extensive identifiers, transaction + histories, and adjacent details are often more efficiently retrieved by support staff through their own systems than + parsed from a long inbound message. 
A common pattern in agent-driven support contact is for messages to over-specify, + which can slow triage and obscure the actual question. Before sending, take a moment to reconsider whether the message + is scoped to the specific issue the user wants raised. Verify, then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_59 + subset: C + pattern: Missing Vehicle Preconditions + trigger: + tool_name: startEngine + trigger_type: tool_only + occurrence: first + messages: + specific: Before starting the engine, take a moment to verify the vehicle is in the appropriate state. Engine start typically + depends on prior conditions being satisfied, such as doors being secured and the brake being engaged. Confirm these + conditions hold, then proceed if appropriate. + vague: Before proceeding with this action, take a moment to verify that the vehicle is in the appropriate state for it. + Confirm, then continue if appropriate. + verbose: 'Engine start is a state-changing operation that depends on the vehicle being in a configuration suitable for + ignition. Modern vehicle control systems implement a range of interlocks intended to prevent unsafe or unintended starts: + doors are typically expected to be secured, the brake pedal is typically expected to be engaged, the transmission is + expected to be in an appropriate position, and prior tool calls in the session may have left the vehicle in a state + that needs verification before ignition can safely proceed. A common failure mode in vehicle control workflows is for + an agent to issue an engine start without confirming these prerequisites, which can lead to the call being rejected, + partial state changes, or unintended downstream consequences. Before starting the engine, take a moment to verify that + the vehicle is in the appropriate state — that the relevant precondition steps such as securing doors and engaging the + brake have been completed in this session. 
Confirm these conditions hold, then proceed if appropriate.' + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_66 + subset: C + pattern: Missing Vehicle Preconditions + trigger: + tool_name: activateParkingBrake + trigger_type: tool_only + occurrence: first + messages: + specific: Engaging the parking brake was not explicitly requested by the user. Take a moment to reconsider whether this + action is necessary to complete the current task. Continue with any remaining requested steps, or end your turn if the + task is complete. + vague: Before proceeding with this action, take a moment to verify that the vehicle is in the appropriate state for it. + Confirm, then continue if appropriate. + verbose: Vehicle actuation tools change the physical state of the vehicle in ways that have downstream consequences for + both the user and subsequent operations. The parking brake in particular is a safety-relevant component whose engagement + should generally be tied to an explicit need — the vehicle being secured at rest, a parking transition, or a user request + to engage it. In agent-driven workflows, a common failure mode is for actuation calls to be issued as part of an assumed + sequence rather than in response to actual user intent, leaving the vehicle in a state the user did not authorize. Before + engaging the parking brake, take a moment to reconsider whether this action was actually requested or required to complete + the user's task. Continue with any remaining requested steps, or end your turn if the task is complete. + 'null': Unable to process this call at this time. 
+- case_id: multi_turn_base_67 + subset: C + pattern: Missing Vehicle Preconditions + trigger: null + messages: null +- case_id: multi_turn_base_76 + subset: C + pattern: Missing Vehicle Preconditions + trigger: null + messages: null +- case_id: multi_turn_base_79 + subset: C + pattern: Missing Vehicle Preconditions + trigger: + tool_name: startEngine + trigger_type: tool_only + occurrence: first + messages: + specific: Before starting the engine, take a moment to verify the vehicle is in the appropriate state. Engine start typically + depends on prior conditions being satisfied, such as doors being secured and the brake being engaged. Confirm these + conditions hold, then proceed if appropriate. + vague: Before proceeding with this action, take a moment to verify that the vehicle is in the appropriate state for it. + Confirm, then continue if appropriate. + verbose: 'Engine start is a state-changing operation that depends on the vehicle being in a configuration suitable for + ignition. Modern vehicle control systems implement a range of interlocks intended to prevent unsafe or unintended starts: + doors are typically expected to be secured, the brake pedal is typically expected to be engaged, the transmission is + expected to be in an appropriate position, and prior tool calls in the session may have left the vehicle in a state + that needs verification before ignition can safely proceed. A common failure mode in vehicle control workflows is for + an agent to issue an engine start without confirming these prerequisites, which can lead to the call being rejected, + partial state changes, or unintended downstream consequences. Before starting the engine, take a moment to verify that + the vehicle is in the appropriate state — that the relevant precondition steps such as securing doors and engaging the + brake have been completed in this session. Confirm these conditions hold, then proceed if appropriate.' + 'null': Unable to process this call at this time. 
+- case_id: multi_turn_base_81 + subset: C + pattern: Missing Vehicle Preconditions + trigger: + tool_name: startEngine + trigger_type: tool_only + occurrence: first + messages: + specific: Before starting the engine, take a moment to verify the vehicle is in the appropriate state. Engine start typically + depends on prior conditions being satisfied, such as doors being secured and the brake being engaged. Confirm these + conditions hold, then proceed if appropriate. + vague: Before proceeding with this action, take a moment to verify that the vehicle is in the appropriate state for it. + Confirm, then continue if appropriate. + verbose: 'Engine start is a state-changing operation that depends on the vehicle being in a configuration suitable for + ignition. Modern vehicle control systems implement a range of interlocks intended to prevent unsafe or unintended starts: + doors are typically expected to be secured, the brake pedal is typically expected to be engaged, the transmission is + expected to be in an appropriate position, and prior tool calls in the session may have left the vehicle in a state + that needs verification before ignition can safely proceed. A common failure mode in vehicle control workflows is for + an agent to issue an engine start without confirming these prerequisites, which can lead to the call being rejected, + partial state changes, or unintended downstream consequences. Before starting the engine, take a moment to verify that + the vehicle is in the appropriate state — that the relevant precondition steps such as securing doors and engaging the + brake have been completed in this session. Confirm these conditions hold, then proceed if appropriate.' + 'null': Unable to process this call at this time. 
+- case_id: multi_turn_base_0 + subset: D + pattern: File Ops Without Verification + trigger: + tool_name: grep + trigger_type: precondition_check + condition: arg_not_recent_file + occurrence: first + messages: + specific: Before searching, take a moment to verify the file you're searching is the correct one for this task. File names + referenced in earlier turns or in similar tasks can be easy to confuse. Confirm the target file matches the user's current + request, then proceed if appropriate. + vague: Before proceeding with this file operation, take a moment to verify the relevant state — your current location, + the files involved, and whether the action is needed. Confirm, then continue if appropriate. + verbose: File search operations like grep depend on the search being directed at the correct file, and a common failure + mode in multi-turn file workflows is for the agent to pattern-match on a filename from earlier context rather than the + file currently in scope. Filenames in agent workflows often follow predictable patterns (drafts, finals, dated versions), + and similar names can be easy to substitute for one another, especially when prior turns referenced different files. + Before searching, take a moment to verify that the file argument matches the file the user is asking about in this specific + task — not a similarly named file from earlier context or a default the agent might assume. Confirm the target file + matches the user's current request, then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_18 + subset: D + pattern: File Ops Without Verification + trigger: + tool_name: cd + trigger_type: precondition_check + condition: cd_to_current_dir + occurrence: first + messages: + specific: Before changing directories, take a moment to verify your current working directory. You may already be in the + target directory, or the path you're navigating to may not be relative to where you currently are. 
Confirm with pwd + if uncertain, then proceed if appropriate. + vague: Before proceeding with this file operation, take a moment to verify the relevant state — your current location, + the files involved, and whether the action is needed. Confirm, then continue if appropriate. + verbose: Working directory state is implicit in shell-like environments and can drift between turns in ways that aren't + always obvious from the conversation history. A common failure mode is for an agent to issue a cd to a target directory + without first checking the current working directory, which can lead to errors when the target is reached as a relative + path that doesn't resolve correctly, or to no-op calls when the agent is already in the target directory. The pwd command + is a low-cost way to ground subsequent navigation in actual rather than assumed state. Before changing directories, + take a moment to verify your current working directory and confirm the navigation is necessary and correctly specified. + Then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_10 + subset: D + pattern: File Ops Without Verification + trigger: + tool_name: mv + trigger_type: precondition_check + condition: mv_dest_missing_directory + occurrence: first + messages: + specific: Before moving the file, take a moment to verify the source and destination are what you intend. The destination + argument can serve either as a new filename or as a target directory depending on context — confirm which is appropriate + here, and verify any prior steps the move depends on have been completed. Then proceed if appropriate. + vague: Before proceeding with this file operation, take a moment to verify the relevant state — your current location, + the files involved, and whether the action is needed. Confirm, then continue if appropriate. 
+ verbose: 'The mv command is overloaded: depending on whether the destination is an existing directory, an existing file, + or a non-existent path, it can rename, overwrite, or move the source. In multi-step file workflows, this overloading + is a common source of failures — an agent may intend to move a file into a folder but, if the folder doesn''t exist + or the path is misinterpreted, end up renaming the file in place instead. Move operations also frequently depend on + prior steps such as creating a destination directory or navigating to the correct working directory. Before moving the + file, take a moment to verify the source and destination are what you intend, that the destination behaves as you expect + (rename vs. directory move), and that any prior dependent steps have been completed. Then proceed if appropriate.' + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_4 + subset: D + pattern: File Ops Without Verification + trigger: + tool_name: cd + trigger_type: precondition_check + condition: cd_to_current_dir + occurrence: first + messages: + specific: Before changing directories, take a moment to verify your current working directory. You may already be in the + target directory, or the path you're navigating to may not be relative to where you currently are. Confirm with pwd + if uncertain, then proceed if appropriate. + vague: Before proceeding with this file operation, take a moment to verify the relevant state — your current location, + the files involved, and whether the action is needed. Confirm, then continue if appropriate. + verbose: Working directory state is implicit in shell-like environments and can drift between turns in ways that aren't + always obvious from the conversation history. 
A common failure mode is for an agent to issue a cd to a target directory + without first checking the current working directory, which can lead to errors when the target is reached as a relative + path that doesn't resolve correctly, or to no-op calls when the agent is already in the target directory. The pwd command + is a low-cost way to ground subsequent navigation in actual rather than assumed state. Before changing directories, + take a moment to verify your current working directory and confirm the navigation is necessary and correctly specified. + Then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_40 + subset: D + pattern: File Ops Without Verification + trigger: + tool_name: ls + trigger_type: precondition_check + condition: duplicate_ls_no_state_change + occurrence: first + messages: + specific: Before listing directory contents, take a moment to verify whether this information is already available from + a recent call. Repeating a listing without state changes between calls typically returns the same result. Confirm whether + a fresh listing is needed, then proceed if appropriate. + vague: Before proceeding with this file operation, take a moment to verify the relevant state — your current location, + the files involved, and whether the action is needed. Confirm, then continue if appropriate. + verbose: Directory listing is a read-only operation, but redundant ls calls are a common pattern in agent workflows that + can indicate the agent has lost track of state from a prior call or is filling space rather than acting on information + already available. Each tool call consumes context and adds latency, and repeating a listing without intervening state + changes typically yields no new information. Before listing, take a moment to verify whether you already have the directory + contents from a recent call in this session. 
Confirm whether a fresh listing is genuinely needed, then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_42 + subset: D + pattern: File Ops Without Verification + trigger: + tool_name: mkdir + trigger_type: precondition_check + condition: mkdir_already_exists + occurrence: first + messages: + specific: Before creating the directory, take a moment to verify it doesn't already exist and that your current working + directory is the intended parent. Confirm with pwd and ls if uncertain, then proceed if appropriate. + vague: Before proceeding with this file operation, take a moment to verify the relevant state — your current location, + the files involved, and whether the action is needed. Confirm, then continue if appropriate. + verbose: Directory creation is a state-changing operation whose effect depends on both the current working directory and + the existing filesystem state. A common failure mode is for an agent to attempt mkdir on a directory that already exists, + or to create the directory in an unintended parent because the working directory wasn't verified first. Many shell environments + will return an error when creating an existing directory, but the more subtle failure is silently creating a nested + or duplicate directory in the wrong location. Before creating, take a moment to verify the directory doesn't already + exist and that your current working directory is the intended parent. Confirm with pwd and ls if uncertain, then proceed + if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_44 + subset: D + pattern: File Ops Without Verification + trigger: + tool_name: echo + trigger_type: precondition_check + condition: echo_content_extra_quotes + occurrence: first + messages: + specific: Before writing the content, take a moment to verify the content string is formatted as intended. 
String literals + can pick up extra quote characters or escape sequences that change what actually gets written to the file. Confirm the + content matches what the user requested, then proceed if appropriate. + vague: Before proceeding with this file operation, take a moment to verify the relevant state — your current location, + the files involved, and whether the action is needed. Confirm, then continue if appropriate. + verbose: Writing content to files via echo depends on the content string being formatted as intended, and string literals + in tool calls can be a source of subtle errors. Quote characters intended as delimiters can end up embedded in the written + content if escaping is handled incorrectly, and conversely, content meant to include literal quotes can have them stripped. + Multi-step workflows where the content is paraphrased or reconstructed from earlier conversation are especially prone + to introducing extra layers of quoting or escape sequences. Before writing, take a moment to verify that the content + string matches what the user requested, character for character, with no added or removed quote layers. Confirm, then + proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_155 + subset: E + pattern: Budget Constraint Violations + trigger: + tool_name: book_flight + trigger_type: tool_only + occurrence: first + messages: + specific: Before booking, take a moment to verify the argument values you've selected, particularly the payment card. + The card_id should correspond to one of the user's available cards — confirm it matches an entry from the user's actual + card list rather than a similarly formatted or assumed value. Verify the other booking arguments (dates, route, class) + are also correct, then proceed if appropriate. + vague: Before proceeding with this call, take a moment to verify the argument values match what the user actually requested. + Confirm, then continue if appropriate. 
+ verbose: Flight booking is a financially consequential action whose arguments determine which payment method is charged, + which itinerary is reserved, and what travel class is purchased. A common failure mode in agent-driven booking workflows + is for the agent to populate the card_id argument with a value that resembles a payment card identifier — a string with + the right format, a partial number from earlier context, or an assumed default — without verifying that the value corresponds + to one of the user's actually available cards. Payment card lists are typically retrievable through a dedicated tool, + and grounding the card_id selection in the actual list rather than in inferred or pattern-matched values is the most + reliable way to avoid charging an unintended card or having the booking fail. Before booking, take a moment to verify + the card_id matches an entry from the user's actual card list, and that the other arguments (dates, route, class) align + with what the user requested. Then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_198 + subset: E + pattern: Budget Constraint Violations + trigger: + tool_name: book_flight + trigger_type: tool_only + occurrence: first + messages: + specific: Before booking, take a moment to verify the argument values you've selected, particularly the payment card. + The card_id should correspond to one of the user's available cards — confirm it matches an entry from the user's actual + card list rather than a similarly formatted or assumed value. Verify the other booking arguments (dates, route, class) + are also correct, then proceed if appropriate. + vague: Before proceeding with this call, take a moment to verify the argument values match what the user actually requested. + Confirm, then continue if appropriate. 
+ verbose: Flight booking is a financially consequential action whose arguments determine which payment method is charged, + which itinerary is reserved, and what travel class is purchased. A common failure mode in agent-driven booking workflows + is for the agent to populate the card_id argument with a value that resembles a payment card identifier — a string with + the right format, a partial number from earlier context, or an assumed default — without verifying that the value corresponds + to one of the user's actually available cards. Payment card lists are typically retrievable through a dedicated tool, + and grounding the card_id selection in the actual list rather than in inferred or pattern-matched values is the most + reliable way to avoid charging an unintended card or having the booking fail. Before booking, take a moment to verify + the card_id matches an entry from the user's actual card list, and that the other arguments (dates, route, class) align + with what the user requested. Then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_185 + subset: E + pattern: Budget Constraint Violations + trigger: + tool_name: purchase_insurance + trigger_type: tool_only + occurrence: first + messages: + specific: Before purchasing, take a moment to verify the argument values you've selected, particularly the insurance type. + Insurance offerings typically come in distinct tiers (e.g., basic, travel, comprehensive), and the user's request may + specify or imply a particular tier. Confirm the insurance_type matches what the user actually requested, and verify + the other arguments (booking ID, cost, payment card) are correct. Then proceed if appropriate. + vague: Before proceeding with this call, take a moment to verify the argument values match what the user actually requested. + Confirm, then continue if appropriate. 
+ verbose: Insurance purchases involve selecting from distinct product tiers that differ in coverage, cost, and applicability + to the booking being insured. A common failure mode in agent-driven insurance workflows is for the insurance_type argument + to be populated with a plausible-sounding value (e.g., "travel," "standard," "basic") that doesn't actually match the + tier the user requested or the tier appropriate for their stated needs. Insurance tier names can be easy to confuse + — "travel insurance" may sound like a default for travel-related bookings even when the user explicitly asked for "comprehensive" + coverage, and the cost argument may need to align with the selected tier. Before purchasing, take a moment to verify + that insurance_type matches what the user actually requested, and that the other arguments (booking ID, cost, payment + card) are consistent with the user's request. Then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_180 + subset: F + pattern: Wrong Turn Execution + trigger: + tool_name: set_budget_limit + trigger_type: tool_only + occurrence: first + messages: + specific: Before setting the budget limit, take a moment to reconsider whether this is the right next step in the user's + task. If you've already set or attempted to set a budget limit recently, repeating the call won't change the outcome + — review what's been done so far and whether a different action is needed to move the task forward. Then proceed if + appropriate. + vague: Before proceeding with this call, take a moment to reconsider whether it's the right next step in the user's task. + Confirm, then continue if appropriate. + verbose: Budget limit operations are configuration calls whose effect depends on the limit value being correct and on + the call being made at the right point in the workflow. 
A common failure mode in agent-driven financial workflows is + for an agent to repeat the same configuration call multiple times in a row — either because the prior call's result + wasn't fully processed, because the agent is uncertain whether it succeeded, or because the agent has lost track of + what's already been done in the session. Repeating a configuration call without intervening state changes typically + produces no progress and consumes context that could be spent on subsequent steps. Before setting the budget limit, + take a moment to reconsider whether you've already set or attempted to set this limit, and whether the next move in + the user's task is actually a different action. Then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_184 + subset: F + pattern: Wrong Turn Execution + trigger: + tool_name: book_flight + trigger_type: tool_only + occurrence: first + messages: + specific: Before booking, take a moment to reconsider whether this is the right next step in the user's task and whether + the arguments you've selected are correct. Verify the action fits the current point in the workflow and that values + like the payment card match what the user actually has available. Then proceed if appropriate. + vague: Before proceeding with this call, take a moment to reconsider whether it's the right next step in the user's task. + Confirm, then continue if appropriate. + verbose: Flight booking sits at a specific point in a multi-step travel workflow that typically involves authentication, + account verification, card selection, and other prerequisites. A common failure mode is for an agent to issue a book_flight + call before all the upstream context has been gathered — for example, picking a card_id based on inference rather than + on the user's actual card list, or booking before confirming the trip parameters. 
Booking is also a financially consequential + action that is hard to undo cleanly, so the cost of an incorrect call is higher than for read-only operations. Before + booking, take a moment to reconsider whether this is the right next step at this point in the workflow and whether the + arguments — payment card, dates, route, class — accurately reflect what the user requested. Then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_179 + subset: F + pattern: Wrong Turn Execution + trigger: + tool_name: purchase_insurance + trigger_type: tool_only + occurrence: first + messages: + specific: Before purchasing, take a moment to reconsider whether this is the right next step in the user's task and whether + the arguments are correct. Verify that the insurance type, booking ID, and payment card match what the user requested. + Then proceed if appropriate. + vague: Before proceeding with this call, take a moment to reconsider whether it's the right next step in the user's task. + Confirm, then continue if appropriate. + verbose: Insurance purchase calls involve both selecting the right product tier and tying the purchase to the correct + booking and payment method. A common failure mode in agent-driven workflows is for the insurance_type to be populated + with a default-looking value rather than the tier the user actually requested, or for the call to be issued before the + booking it's meant to insure has been confirmed. Insurance is also distinct from booking in that the tier names can + be ambiguous — "travel," "standard," "basic," "comprehensive" — and these distinctions matter for whether the coverage + matches the user's intent. Before purchasing, take a moment to reconsider whether this is the right next step and whether + the insurance type, booking ID, cost, and payment card are all consistent with what the user asked for. Then proceed + if appropriate. + 'null': Unable to process this call at this time. 
+- case_id: multi_turn_base_173 + subset: F + pattern: Wrong Turn Execution + trigger: + tool_name: cancel_booking + trigger_type: tool_only + occurrence: first + messages: + specific: Before cancelling the booking, take a moment to reconsider whether this is the action the user actually requested. + Cancellation is a state-changing operation that's easy to confuse with other actions like closing a ticket, resolving + an issue, or undoing a different recent step. Verify that cancelling the booking is what the user asked for, then proceed + if appropriate. + vague: Before proceeding with this call, take a moment to reconsider whether it's the right next step in the user's task. + Confirm, then continue if appropriate. + verbose: Booking cancellation is a state-changing operation with downstream effects on refunds, related reservations, + and the user's broader travel plans. A common failure mode in agent-driven workflows is for cancellation tools to be + confused with other completion-style actions — closing a ticket, resolving an issue, marking a task done — particularly + when the user's phrasing is ambiguous about whether they want the booking itself undone or just a related record finalized. + Cancellation is typically harder to reverse than other actions, so verifying intent before issuing the call is especially + important. Before cancelling, take a moment to reconsider whether the user actually asked for the booking to be cancelled, + or whether a different action (closing a ticket, resolving a support request, finalizing a different record) better + matches their request. Then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_48 + subset: F + pattern: Wrong Turn Execution + trigger: + tool_name: get_ticket + trigger_type: tool_only + occurrence: first + messages: + specific: Before retrieving the ticket, take a moment to reconsider whether you already have the information this call + would return. 
If you've recently called get_ticket and the ticket state hasn't changed, repeating the call won't yield + new information — review what you already know and consider whether a different action is needed to move the task forward. + Then proceed if appropriate. + vague: Before proceeding with this call, take a moment to reconsider whether it's the right next step in the user's task. + Confirm, then continue if appropriate. + verbose: Ticket retrieval is a read-only operation whose value depends on whether the information it returns is new or + has changed since the last retrieval. A common failure mode in agent-driven workflows is for an agent to call get_ticket + repeatedly across turns without taking action on the information already returned — either because the agent is uncertain + how to proceed, because it's looking for a state change that hasn't happened, or because it's lost track of what it + already knows. Repeated read-only calls don't progress the task and can crowd out the actions that would. Before retrieving, + take a moment to reconsider whether you already have the ticket information from a recent call and whether the next + step in the user's task is actually a different action. Then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_161 + subset: G + pattern: Authentication Violations + trigger: + tool_name: authenticate_travel + trigger_type: tool_only + occurrence: first + messages: + specific: Before authenticating, take a moment to verify whether you've already authenticated successfully in this session. + Re-authenticating when a valid session already exists can reset session state and cause downstream calls to fail. Confirm + whether authentication is actually needed, then proceed if appropriate. + vague: Before proceeding with this call, take a moment to verify whether it's needed and whether the arguments are correct. + Confirm, then continue if appropriate. 
+ verbose: Authentication tools establish session state that subsequent tool calls depend on, and the relationship between + authentication calls and session state can be subtle. A common failure mode in agent-driven workflows is for an agent + to issue an authentication call when a valid session already exists — either because the agent has lost track of prior + session state, because it's defensively re-authenticating before a sensitive operation, or because it's misinterpreting + an unrelated error as an auth issue. Re-authenticating against an already-authenticated session can reset tokens, invalidate + session-bound resources, or trigger rate limits, and can cause subsequent calls in the workflow to fail in ways that + are hard to diagnose. Before authenticating, take a moment to verify whether a valid authenticated session already exists + from earlier in this session, and whether the next step in the user's task actually requires a fresh authentication. + Then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_172 + subset: G + pattern: Authentication Violations + trigger: + tool_name: authenticate_travel + trigger_type: tool_only + occurrence: first + messages: + specific: Before authenticating, take a moment to verify whether you've already authenticated successfully in this session. + Re-authenticating when a valid session already exists can reset session state and cause downstream calls to fail. Confirm + whether authentication is actually needed, then proceed if appropriate. + vague: Before proceeding with this call, take a moment to verify whether it's needed and whether the arguments are correct. + Confirm, then continue if appropriate. + verbose: Authentication tools establish session state that subsequent tool calls depend on, and the relationship between + authentication calls and session state can be subtle. 
A common failure mode in agent-driven workflows is for an agent + to issue an authentication call when a valid session already exists — either because the agent has lost track of prior + session state, because it's defensively re-authenticating before a sensitive operation, or because it's misinterpreting + an unrelated error as an auth issue. Re-authenticating against an already-authenticated session can reset tokens, invalidate + session-bound resources, or trigger rate limits, and can cause subsequent calls in the workflow to fail in ways that + are hard to diagnose. Before authenticating, take a moment to verify whether a valid authenticated session already exists + from earlier in this session, and whether the next step in the user's task actually requires a fresh authentication. + Then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_193 + subset: G + pattern: Authentication Violations + trigger: + tool_name: authenticate_travel + trigger_type: tool_only + occurrence: first + messages: + specific: Before authenticating, take a moment to verify whether you've already authenticated successfully in this session. + Re-authenticating when a valid session already exists can reset session state and cause downstream calls to fail. Confirm + whether authentication is actually needed, then proceed if appropriate. + vague: Before proceeding with this call, take a moment to verify whether it's needed and whether the arguments are correct. + Confirm, then continue if appropriate. + verbose: Authentication tools establish session state that subsequent tool calls depend on, and the relationship between + authentication calls and session state can be subtle. 
A common failure mode in agent-driven workflows is for an agent + to issue an authentication call when a valid session already exists — either because the agent has lost track of prior + session state, because it's defensively re-authenticating before a sensitive operation, or because it's misinterpreting + an unrelated error as an auth issue. Re-authenticating against an already-authenticated session can reset tokens, invalidate + session-bound resources, or trigger rate limits, and can cause subsequent calls in the workflow to fail in ways that + are hard to diagnose. Before authenticating, take a moment to verify whether a valid authenticated session already exists + from earlier in this session, and whether the next step in the user's task actually requires a fresh authentication. + Then proceed if appropriate. + 'null': Unable to process this call at this time. +- case_id: multi_turn_base_188 + subset: G + pattern: Authentication Violations + trigger: + tool_name: purchase_insurance + trigger_type: tool_only + occurrence: first + messages: + specific: Before purchasing, take a moment to verify the argument values you've selected, particularly the insurance type. + Insurance offerings typically come in distinct tiers, and the user's request may specify or imply a particular one. + Confirm the insurance_type matches what the user actually requested, and verify the other arguments are correct. Then + proceed if appropriate. + vague: Before proceeding with this call, take a moment to verify whether it's needed and whether the arguments are correct. + Confirm, then continue if appropriate. + verbose: Insurance purchases involve selecting from distinct product tiers that differ in coverage, cost, and applicability + to the booking being insured. 
A common failure mode in agent-driven insurance workflows is for the insurance_type argument + to be populated with a plausible-sounding value that doesn't actually match the tier the user requested or the tier + appropriate for their stated needs. Insurance tier names can be easy to confuse — "travel insurance" may sound like + a default for travel-related bookings even when the user explicitly asked for "comprehensive" coverage. Before purchasing, + take a moment to verify that insurance_type matches what the user actually requested, and that the other arguments (booking + ID, cost, payment card) are consistent with the user's request. Then proceed if appropriate. + 'null': Unable to process this call at this time. diff --git a/experiments/parth/feedback_ablation/configs/A_null.json b/experiments/parth/feedback_ablation/configs/A_null.json new file mode 100644 index 0000000..be75a35 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/A_null.json @@ -0,0 +1,26 @@ +{ + "subset": "A", + "condition": "null", + "test_ids": [ + "multi_turn_base_52", + "multi_turn_base_53", + "multi_turn_base_54", + "multi_turn_base_55", + "multi_turn_base_73", + "multi_turn_base_84", + "multi_turn_base_87", + "multi_turn_base_89", + "multi_turn_base_92", + "multi_turn_base_97", + "multi_turn_base_98" + ], + "triggers": [ + { + "tool_name": "releaseBrakePedal", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/A_specific.json b/experiments/parth/feedback_ablation/configs/A_specific.json new file mode 100644 index 0000000..e6223e2 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/A_specific.json @@ -0,0 +1,26 @@ +{ + "subset": "A", + "condition": "specific", + "test_ids": [ + "multi_turn_base_52", + "multi_turn_base_53", + "multi_turn_base_54", + "multi_turn_base_55", + "multi_turn_base_73", + 
"multi_turn_base_84", + "multi_turn_base_87", + "multi_turn_base_89", + "multi_turn_base_92", + "multi_turn_base_97", + "multi_turn_base_98" + ], + "triggers": [ + { + "tool_name": "releaseBrakePedal", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Releasing the brake pedal was not requested by the user and is not required to complete the current task. Do not call releaseBrakePedal unless explicitly stated. Continue with any remaining steps the user actually asked for, or end your turn if the task is complete.", + "condition_label": "specific" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/A_vague.json b/experiments/parth/feedback_ablation/configs/A_vague.json new file mode 100644 index 0000000..0ddf09f --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/A_vague.json @@ -0,0 +1,26 @@ +{ + "subset": "A", + "condition": "vague", + "test_ids": [ + "multi_turn_base_52", + "multi_turn_base_53", + "multi_turn_base_54", + "multi_turn_base_55", + "multi_turn_base_73", + "multi_turn_base_84", + "multi_turn_base_87", + "multi_turn_base_89", + "multi_turn_base_92", + "multi_turn_base_97", + "multi_turn_base_98" + ], + "triggers": [ + { + "tool_name": "releaseBrakePedal", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "This action does not appear to be necessary for the user's request. 
Reconsider whether it should be performed.", + "condition_label": "vague" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/A_verbose.json b/experiments/parth/feedback_ablation/configs/A_verbose.json new file mode 100644 index 0000000..e4968e3 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/A_verbose.json @@ -0,0 +1,26 @@ +{ + "subset": "A", + "condition": "verbose", + "test_ids": [ + "multi_turn_base_52", + "multi_turn_base_53", + "multi_turn_base_54", + "multi_turn_base_55", + "multi_turn_base_73", + "multi_turn_base_84", + "multi_turn_base_87", + "multi_turn_base_89", + "multi_turn_base_92", + "multi_turn_base_97", + "multi_turn_base_98" + ], + "triggers": [ + { + "tool_name": "releaseBrakePedal", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "The releaseBrakePedal action was not part of the user's request. In vehicle control workflows, the brake pedal is a critical safety component, and releasing it changes the vehicle's physical state in ways the user has not authorized. Brake pedal manipulation is typically reserved for specific scenarios such as initiating motion after a complete stop, transitioning out of park, or emergency disengagement procedures. Modern vehicle control systems often have interlocks and state dependencies that make brake pedal state changes consequential for downstream operations including engine state, transmission engagement, and parking brake coordination. Before performing any vehicle actuation, you should verify that it directly serves a user-requested goal. In this case, the user did not request that the brake pedal be released, and doing so is not required to complete the task they asked for. Do not call releaseBrakePedal when it was not explicitly required. 
Instead, continue with any remaining steps that were actually requested, or end your turn if the user's task has been fully addressed.", + "condition_label": "verbose" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/B_null.json b/experiments/parth/feedback_ablation/configs/B_null.json new file mode 100644 index 0000000..580404d --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/B_null.json @@ -0,0 +1,42 @@ +{ + "subset": "B", + "condition": "null", + "test_ids": [ + "multi_turn_base_102", + "multi_turn_base_103", + "multi_turn_base_129", + "multi_turn_base_148", + "multi_turn_base_190", + "multi_turn_base_27" + ], + "triggers": [ + { + "tool_name": "send_message", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + }, + { + "tool_name": "resolve_ticket", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + }, + { + "tool_name": "create_ticket", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + }, + { + "tool_name": "contact_customer_support", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/B_specific.json b/experiments/parth/feedback_ablation/configs/B_specific.json new file mode 100644 index 0000000..e952907 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/B_specific.json @@ -0,0 +1,42 @@ +{ + "subset": "B", + "condition": "specific", + "test_ids": [ + "multi_turn_base_102", + "multi_turn_base_103", + "multi_turn_base_129", + "multi_turn_base_148", + "multi_turn_base_190", + "multi_turn_base_27" + ], + "triggers": [ + { + "tool_name": "send_message", + "trigger_type": 
"tool_only", + "occurrence": 1, + "feedback_message": "Before sending, take a moment to reconsider the message content. Is it limited to what the user explicitly asked you to communicate? Free-text fields can easily accumulate context, identifiers, or details that weren't part of the original request. Verify that the message reflects only what the user wanted conveyed, then proceed if appropriate.", + "condition_label": "specific" + }, + { + "tool_name": "resolve_ticket", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before resolving this ticket, take a moment to reconsider. Does the resolution accurately describe only the actions that have actually been performed in this session? Resolutions can drift toward summarizing intended or assumed steps rather than completed ones. Verify that the ticket's underlying issue is genuinely addressed and that the resolution text reflects what actually occurred, then proceed if appropriate.", + "condition_label": "specific" + }, + { + "tool_name": "create_ticket", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before creating this ticket, take a moment to reconsider the title and description. Are they limited to the information the user actually provided or requested? Description fields can easily accumulate adjacent context, prior conversation details, or assumed background that wasn't part of the ticket's scope. Verify that the ticket content reflects only what the user asked to record, then proceed if appropriate.", + "condition_label": "specific" + }, + { + "tool_name": "contact_customer_support", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before sending this message, take a moment to reconsider its content. Is it scoped to the specific issue the user wants raised, or has it accumulated identifiers, transaction details, and context beyond what's necessary? Free-text fields in support requests often work better when concise and focused. 
Verify that the message communicates only what the user asked you to convey, then proceed if appropriate.", + "condition_label": "specific" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/B_vague.json b/experiments/parth/feedback_ablation/configs/B_vague.json new file mode 100644 index 0000000..f348552 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/B_vague.json @@ -0,0 +1,42 @@ +{ + "subset": "B", + "condition": "vague", + "test_ids": [ + "multi_turn_base_102", + "multi_turn_base_103", + "multi_turn_base_129", + "multi_turn_base_148", + "multi_turn_base_190", + "multi_turn_base_27" + ], + "triggers": [ + { + "tool_name": "send_message", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before proceeding with this call, take a moment to reconsider the content of the arguments. Verify they reflect what the user actually asked for, then continue if appropriate.", + "condition_label": "vague" + }, + { + "tool_name": "resolve_ticket", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before proceeding with this call, take a moment to reconsider the content of the arguments. Verify they reflect what the user actually asked for, then continue if appropriate.", + "condition_label": "vague" + }, + { + "tool_name": "create_ticket", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before proceeding with this call, take a moment to reconsider the content of the arguments. Verify they reflect what the user actually asked for, then continue if appropriate.", + "condition_label": "vague" + }, + { + "tool_name": "contact_customer_support", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before proceeding with this call, take a moment to reconsider the content of the arguments. 
Verify they reflect what the user actually asked for, then continue if appropriate.", + "condition_label": "vague" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/B_verbose.json b/experiments/parth/feedback_ablation/configs/B_verbose.json new file mode 100644 index 0000000..6d21ac0 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/B_verbose.json @@ -0,0 +1,42 @@ +{ + "subset": "B", + "condition": "verbose", + "test_ids": [ + "multi_turn_base_102", + "multi_turn_base_103", + "multi_turn_base_129", + "multi_turn_base_148", + "multi_turn_base_190", + "multi_turn_base_27" + ], + "triggers": [ + { + "tool_name": "send_message", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Messaging tools in multi-user systems serve a wide range of communication purposes, from brief notifications to detailed coordination across stakeholders. Effective messages typically prioritize clarity and respect the recipient's context — they convey what the recipient needs to know without requiring them to parse through extraneous identifiers, metadata, or background that the system already tracks elsewhere. Many messaging workflows benefit from minimalism: the user generally has a specific intent for the message, and additional context the agent has access to (order IDs, symbols, prices, statuses, timestamps) is often more appropriately handled by the platform's structured data rather than embedded in conversational text. Before sending, take a moment to reconsider whether the message content is limited to what the user explicitly asked you to communicate. 
Verify that it reflects only what the user wanted conveyed, then proceed if appropriate.", + "condition_label": "verbose" + }, + { + "tool_name": "resolve_ticket", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Ticket resolution is a consequential action in support workflows because it changes the ticket's state and signals to other systems and users that the underlying issue has been addressed. Resolution text serves as the durable record of what was done, and downstream auditing, customer follow-ups, and reporting often rely on its accuracy. A common failure mode in agent-driven resolution is for the resolution text to describe what was intended or summarized from prior context, rather than what was actually executed in the current session — this can result in tickets being closed prematurely or with misleading documentation. Before resolving, take a moment to reconsider whether the resolution describes only the actions that have actually been performed, and whether the ticket's underlying issue is genuinely addressed. Verify accordingly, then proceed if appropriate.", + "condition_label": "verbose" + }, + { + "tool_name": "create_ticket", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Ticket creation is the entry point for many support and tracking workflows, and the quality of a ticket's initial title and description has downstream effects on triage, routing, and resolution. Effective tickets are typically scoped to a single, clearly described issue with only the context necessary for whoever picks it up to understand and act on it. Description fields can easily accumulate adjacent information — prior conversation, related but separate issues, system metadata, or context the user didn't ask to be recorded — which can dilute the ticket's purpose and make it harder to resolve. Before creating, take a moment to reconsider whether the title and description are limited to what the user actually asked to record. 
Verify, then proceed if appropriate.", + "condition_label": "verbose" + }, + { + "tool_name": "contact_customer_support", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Customer support communications operate within established workflows where intake messages are typically reviewed by humans or routed through triage systems. Effective support requests tend to be concise and focused on a single issue, providing only the context necessary for the support team to understand and respond — extensive identifiers, transaction histories, and adjacent details are often more efficiently retrieved by support staff through their own systems than parsed from a long inbound message. A common pattern in agent-driven support contact is for messages to over-specify, which can slow triage and obscure the actual question. Before sending, take a moment to reconsider whether the message is scoped to the specific issue the user wants raised. Verify, then proceed if appropriate.", + "condition_label": "verbose" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/C_null.json b/experiments/parth/feedback_ablation/configs/C_null.json new file mode 100644 index 0000000..609a4a5 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/C_null.json @@ -0,0 +1,28 @@ +{ + "subset": "C", + "condition": "null", + "test_ids": [ + "multi_turn_base_59", + "multi_turn_base_66", + "multi_turn_base_67", + "multi_turn_base_76", + "multi_turn_base_79", + "multi_turn_base_81" + ], + "triggers": [ + { + "tool_name": "startEngine", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + }, + { + "tool_name": "activateParkingBrake", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/C_specific.json 
b/experiments/parth/feedback_ablation/configs/C_specific.json new file mode 100644 index 0000000..4bfcef2 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/C_specific.json @@ -0,0 +1,28 @@ +{ + "subset": "C", + "condition": "specific", + "test_ids": [ + "multi_turn_base_59", + "multi_turn_base_66", + "multi_turn_base_67", + "multi_turn_base_76", + "multi_turn_base_79", + "multi_turn_base_81" + ], + "triggers": [ + { + "tool_name": "startEngine", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before starting the engine, take a moment to verify the vehicle is in the appropriate state. Engine start typically depends on prior conditions being satisfied, such as doors being secured and the brake being engaged. Confirm these conditions hold, then proceed if appropriate.", + "condition_label": "specific" + }, + { + "tool_name": "activateParkingBrake", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Engaging the parking brake was not explicitly requested by the user. Take a moment to reconsider whether this action is necessary to complete the current task. Continue with any remaining requested steps, or end your turn if the task is complete.", + "condition_label": "specific" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/C_vague.json b/experiments/parth/feedback_ablation/configs/C_vague.json new file mode 100644 index 0000000..f05ee50 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/C_vague.json @@ -0,0 +1,28 @@ +{ + "subset": "C", + "condition": "vague", + "test_ids": [ + "multi_turn_base_59", + "multi_turn_base_66", + "multi_turn_base_67", + "multi_turn_base_76", + "multi_turn_base_79", + "multi_turn_base_81" + ], + "triggers": [ + { + "tool_name": "startEngine", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before proceeding with this action, take a moment to verify that the vehicle is in the appropriate state for it. 
Confirm, then continue if appropriate.", + "condition_label": "vague" + }, + { + "tool_name": "activateParkingBrake", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before proceeding with this action, take a moment to verify that the vehicle is in the appropriate state for it. Confirm, then continue if appropriate.", + "condition_label": "vague" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/C_verbose.json b/experiments/parth/feedback_ablation/configs/C_verbose.json new file mode 100644 index 0000000..b1bc333 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/C_verbose.json @@ -0,0 +1,28 @@ +{ + "subset": "C", + "condition": "verbose", + "test_ids": [ + "multi_turn_base_59", + "multi_turn_base_66", + "multi_turn_base_67", + "multi_turn_base_76", + "multi_turn_base_79", + "multi_turn_base_81" + ], + "triggers": [ + { + "tool_name": "startEngine", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Engine start is a state-changing operation that depends on the vehicle being in a configuration suitable for ignition. Modern vehicle control systems implement a range of interlocks intended to prevent unsafe or unintended starts: doors are typically expected to be secured, the brake pedal is typically expected to be engaged, the transmission is expected to be in an appropriate position, and prior tool calls in the session may have left the vehicle in a state that needs verification before ignition can safely proceed. A common failure mode in vehicle control workflows is for an agent to issue an engine start without confirming these prerequisites, which can lead to the call being rejected, partial state changes, or unintended downstream consequences. Before starting the engine, take a moment to verify that the vehicle is in the appropriate state — that the relevant precondition steps such as securing doors and engaging the brake have been completed in this session. 
Confirm these conditions hold, then proceed if appropriate.", + "condition_label": "verbose" + }, + { + "tool_name": "activateParkingBrake", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Vehicle actuation tools change the physical state of the vehicle in ways that have downstream consequences for both the user and subsequent operations. The parking brake in particular is a safety-relevant component whose engagement should generally be tied to an explicit need — the vehicle being secured at rest, a parking transition, or a user request to engage it. In agent-driven workflows, a common failure mode is for actuation calls to be issued as part of an assumed sequence rather than in response to actual user intent, leaving the vehicle in a state the user did not authorize. Before engaging the parking brake, take a moment to reconsider whether this action was actually requested or required to complete the user's task. Continue with any remaining requested steps, or end your turn if the task is complete.", + "condition_label": "verbose" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/D_null.json b/experiments/parth/feedback_ablation/configs/D_null.json new file mode 100644 index 0000000..00aa7da --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/D_null.json @@ -0,0 +1,63 @@ +{ + "subset": "D", + "condition": "null", + "test_ids": [ + "multi_turn_base_0", + "multi_turn_base_10", + "multi_turn_base_18", + "multi_turn_base_4", + "multi_turn_base_40", + "multi_turn_base_42", + "multi_turn_base_44" + ], + "triggers": [ + { + "tool_name": "grep", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null", + "condition": "arg_not_recent_file" + }, + { + "tool_name": "cd", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null", + 
"condition": "cd_to_current_dir" + }, + { + "tool_name": "mv", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null", + "condition": "mv_dest_missing_directory" + }, + { + "tool_name": "ls", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null", + "condition": "duplicate_ls_no_state_change" + }, + { + "tool_name": "mkdir", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null", + "condition": "mkdir_already_exists" + }, + { + "tool_name": "echo", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null", + "condition": "echo_content_extra_quotes" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/D_specific.json b/experiments/parth/feedback_ablation/configs/D_specific.json new file mode 100644 index 0000000..ea20d56 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/D_specific.json @@ -0,0 +1,63 @@ +{ + "subset": "D", + "condition": "specific", + "test_ids": [ + "multi_turn_base_0", + "multi_turn_base_10", + "multi_turn_base_18", + "multi_turn_base_4", + "multi_turn_base_40", + "multi_turn_base_42", + "multi_turn_base_44" + ], + "triggers": [ + { + "tool_name": "grep", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Before searching, take a moment to verify the file you're searching is the correct one for this task. File names referenced in earlier turns or in similar tasks can be easy to confuse. 
Confirm the target file matches the user's current request, then proceed if appropriate.", + "condition_label": "specific", + "condition": "arg_not_recent_file" + }, + { + "tool_name": "cd", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Before changing directories, take a moment to verify your current working directory. You may already be in the target directory, or the path you're navigating to may not be relative to where you currently are. Confirm with pwd if uncertain, then proceed if appropriate.", + "condition_label": "specific", + "condition": "cd_to_current_dir" + }, + { + "tool_name": "mv", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Before moving the file, take a moment to verify the source and destination are what you intend. The destination argument can serve either as a new filename or as a target directory depending on context — confirm which is appropriate here, and verify any prior steps the move depends on have been completed. Then proceed if appropriate.", + "condition_label": "specific", + "condition": "mv_dest_missing_directory" + }, + { + "tool_name": "ls", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Before listing directory contents, take a moment to verify whether this information is already available from a recent call. Repeating a listing without state changes between calls typically returns the same result. Confirm whether a fresh listing is needed, then proceed if appropriate.", + "condition_label": "specific", + "condition": "duplicate_ls_no_state_change" + }, + { + "tool_name": "mkdir", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Before creating the directory, take a moment to verify it doesn't already exist and that your current working directory is the intended parent. 
Confirm with pwd and ls if uncertain, then proceed if appropriate.", + "condition_label": "specific", + "condition": "mkdir_already_exists" + }, + { + "tool_name": "echo", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Before writing the content, take a moment to verify the content string is formatted as intended. String literals can pick up extra quote characters or escape sequences that change what actually gets written to the file. Confirm the content matches what the user requested, then proceed if appropriate.", + "condition_label": "specific", + "condition": "echo_content_extra_quotes" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/D_vague.json b/experiments/parth/feedback_ablation/configs/D_vague.json new file mode 100644 index 0000000..a68d1ca --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/D_vague.json @@ -0,0 +1,63 @@ +{ + "subset": "D", + "condition": "vague", + "test_ids": [ + "multi_turn_base_0", + "multi_turn_base_10", + "multi_turn_base_18", + "multi_turn_base_4", + "multi_turn_base_40", + "multi_turn_base_42", + "multi_turn_base_44" + ], + "triggers": [ + { + "tool_name": "grep", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Before proceeding with this file operation, take a moment to verify the relevant state — your current location, the files involved, and whether the action is needed. Confirm, then continue if appropriate.", + "condition_label": "vague", + "condition": "arg_not_recent_file" + }, + { + "tool_name": "cd", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Before proceeding with this file operation, take a moment to verify the relevant state — your current location, the files involved, and whether the action is needed. 
Confirm, then continue if appropriate.", + "condition_label": "vague", + "condition": "cd_to_current_dir" + }, + { + "tool_name": "mv", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Before proceeding with this file operation, take a moment to verify the relevant state — your current location, the files involved, and whether the action is needed. Confirm, then continue if appropriate.", + "condition_label": "vague", + "condition": "mv_dest_missing_directory" + }, + { + "tool_name": "ls", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Before proceeding with this file operation, take a moment to verify the relevant state — your current location, the files involved, and whether the action is needed. Confirm, then continue if appropriate.", + "condition_label": "vague", + "condition": "duplicate_ls_no_state_change" + }, + { + "tool_name": "mkdir", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Before proceeding with this file operation, take a moment to verify the relevant state — your current location, the files involved, and whether the action is needed. Confirm, then continue if appropriate.", + "condition_label": "vague", + "condition": "mkdir_already_exists" + }, + { + "tool_name": "echo", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Before proceeding with this file operation, take a moment to verify the relevant state — your current location, the files involved, and whether the action is needed. 
Confirm, then continue if appropriate.", + "condition_label": "vague", + "condition": "echo_content_extra_quotes" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/D_verbose.json b/experiments/parth/feedback_ablation/configs/D_verbose.json new file mode 100644 index 0000000..863e00a --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/D_verbose.json @@ -0,0 +1,63 @@ +{ + "subset": "D", + "condition": "verbose", + "test_ids": [ + "multi_turn_base_0", + "multi_turn_base_10", + "multi_turn_base_18", + "multi_turn_base_4", + "multi_turn_base_40", + "multi_turn_base_42", + "multi_turn_base_44" + ], + "triggers": [ + { + "tool_name": "grep", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "File search operations like grep depend on the search being directed at the correct file, and a common failure mode in multi-turn file workflows is for the agent to pattern-match on a filename from earlier context rather than the file currently in scope. Filenames in agent workflows often follow predictable patterns (drafts, finals, dated versions), and similar names can be easy to substitute for one another, especially when prior turns referenced different files. Before searching, take a moment to verify that the file argument matches the file the user is asking about in this specific task — not a similarly named file from earlier context or a default the agent might assume. Confirm the target file matches the user's current request, then proceed if appropriate.", + "condition_label": "verbose", + "condition": "arg_not_recent_file" + }, + { + "tool_name": "cd", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Working directory state is implicit in shell-like environments and can drift between turns in ways that aren't always obvious from the conversation history. 
A common failure mode is for an agent to issue a cd to a target directory without first checking the current working directory, which can lead to errors when the target is reached as a relative path that doesn't resolve correctly, or to no-op calls when the agent is already in the target directory. The pwd command is a low-cost way to ground subsequent navigation in actual rather than assumed state. Before changing directories, take a moment to verify your current working directory and confirm the navigation is necessary and correctly specified. Then proceed if appropriate.", + "condition_label": "verbose", + "condition": "cd_to_current_dir" + }, + { + "tool_name": "mv", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "The mv command is overloaded: depending on whether the destination is an existing directory, an existing file, or a non-existent path, it can rename, overwrite, or move the source. In multi-step file workflows, this overloading is a common source of failures — an agent may intend to move a file into a folder but, if the folder doesn't exist or the path is misinterpreted, end up renaming the file in place instead. Move operations also frequently depend on prior steps such as creating a destination directory or navigating to the correct working directory. Before moving the file, take a moment to verify the source and destination are what you intend, that the destination behaves as you expect (rename vs. directory move), and that any prior dependent steps have been completed. 
Then proceed if appropriate.", + "condition_label": "verbose", + "condition": "mv_dest_missing_directory" + }, + { + "tool_name": "ls", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Directory listing is a read-only operation, but redundant ls calls are a common pattern in agent workflows that can indicate the agent has lost track of state from a prior call or is filling space rather than acting on information already available. Each tool call consumes context and adds latency, and repeating a listing without intervening state changes typically yields no new information. Before listing, take a moment to verify whether you already have the directory contents from a recent call in this session. Confirm whether a fresh listing is genuinely needed, then proceed if appropriate.", + "condition_label": "verbose", + "condition": "duplicate_ls_no_state_change" + }, + { + "tool_name": "mkdir", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Directory creation is a state-changing operation whose effect depends on both the current working directory and the existing filesystem state. A common failure mode is for an agent to attempt mkdir on a directory that already exists, or to create the directory in an unintended parent because the working directory wasn't verified first. Many shell environments will return an error when creating an existing directory, but the more subtle failure is silently creating a nested or duplicate directory in the wrong location. Before creating, take a moment to verify the directory doesn't already exist and that your current working directory is the intended parent. 
Confirm with pwd and ls if uncertain, then proceed if appropriate.", + "condition_label": "verbose", + "condition": "mkdir_already_exists" + }, + { + "tool_name": "echo", + "trigger_type": "precondition_check", + "occurrence": 1, + "feedback_message": "Writing content to files via echo depends on the content string being formatted as intended, and string literals in tool calls can be a source of subtle errors. Quote characters intended as delimiters can end up embedded in the written content if escaping is handled incorrectly, and conversely, content meant to include literal quotes can have them stripped. Multi-step workflows where the content is paraphrased or reconstructed from earlier conversation are especially prone to introducing extra layers of quoting or escape sequences. Before writing, take a moment to verify that the content string matches what the user requested, character for character, with no added or removed quote layers. Confirm, then proceed if appropriate.", + "condition_label": "verbose", + "condition": "echo_content_extra_quotes" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/E_null.json b/experiments/parth/feedback_ablation/configs/E_null.json new file mode 100644 index 0000000..77117ff --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/E_null.json @@ -0,0 +1,25 @@ +{ + "subset": "E", + "condition": "null", + "test_ids": [ + "multi_turn_base_155", + "multi_turn_base_185", + "multi_turn_base_198" + ], + "triggers": [ + { + "tool_name": "book_flight", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + }, + { + "tool_name": "purchase_insurance", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/E_specific.json 
b/experiments/parth/feedback_ablation/configs/E_specific.json new file mode 100644 index 0000000..fde81f8 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/E_specific.json @@ -0,0 +1,25 @@ +{ + "subset": "E", + "condition": "specific", + "test_ids": [ + "multi_turn_base_155", + "multi_turn_base_185", + "multi_turn_base_198" + ], + "triggers": [ + { + "tool_name": "book_flight", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before booking, take a moment to verify the argument values you've selected, particularly the payment card. The card_id should correspond to one of the user's available cards — confirm it matches an entry from the user's actual card list rather than a similarly formatted or assumed value. Verify the other booking arguments (dates, route, class) are also correct, then proceed if appropriate.", + "condition_label": "specific" + }, + { + "tool_name": "purchase_insurance", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before purchasing, take a moment to verify the argument values you've selected, particularly the insurance type. Insurance offerings typically come in distinct tiers (e.g., basic, travel, comprehensive), and the user's request may specify or imply a particular tier. Confirm the insurance_type matches what the user actually requested, and verify the other arguments (booking ID, cost, payment card) are correct. 
Then proceed if appropriate.", + "condition_label": "specific" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/E_vague.json b/experiments/parth/feedback_ablation/configs/E_vague.json new file mode 100644 index 0000000..ddc35cc --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/E_vague.json @@ -0,0 +1,25 @@ +{ + "subset": "E", + "condition": "vague", + "test_ids": [ + "multi_turn_base_155", + "multi_turn_base_185", + "multi_turn_base_198" + ], + "triggers": [ + { + "tool_name": "book_flight", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before proceeding with this call, take a moment to verify the argument values match what the user actually requested. Confirm, then continue if appropriate.", + "condition_label": "vague" + }, + { + "tool_name": "purchase_insurance", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before proceeding with this call, take a moment to verify the argument values match what the user actually requested. Confirm, then continue if appropriate.", + "condition_label": "vague" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/E_verbose.json b/experiments/parth/feedback_ablation/configs/E_verbose.json new file mode 100644 index 0000000..ec1beef --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/E_verbose.json @@ -0,0 +1,25 @@ +{ + "subset": "E", + "condition": "verbose", + "test_ids": [ + "multi_turn_base_155", + "multi_turn_base_185", + "multi_turn_base_198" + ], + "triggers": [ + { + "tool_name": "book_flight", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Flight booking is a financially consequential action whose arguments determine which payment method is charged, which itinerary is reserved, and what travel class is purchased. 
A common failure mode in agent-driven booking workflows is for the agent to populate the card_id argument with a value that resembles a payment card identifier — a string with the right format, a partial number from earlier context, or an assumed default — without verifying that the value corresponds to one of the user's actually available cards. Payment card lists are typically retrievable through a dedicated tool, and grounding the card_id selection in the actual list rather than in inferred or pattern-matched values is the most reliable way to avoid charging an unintended card or having the booking fail. Before booking, take a moment to verify the card_id matches an entry from the user's actual card list, and that the other arguments (dates, route, class) align with what the user requested. Then proceed if appropriate.", + "condition_label": "verbose" + }, + { + "tool_name": "purchase_insurance", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Insurance purchases involve selecting from distinct product tiers that differ in coverage, cost, and applicability to the booking being insured. A common failure mode in agent-driven insurance workflows is for the insurance_type argument to be populated with a plausible-sounding value (e.g., \"travel,\" \"standard,\" \"basic\") that doesn't actually match the tier the user requested or the tier appropriate for their stated needs. Insurance tier names can be easy to confuse — \"travel insurance\" may sound like a default for travel-related bookings even when the user explicitly asked for \"comprehensive\" coverage, and the cost argument may need to align with the selected tier. Before purchasing, take a moment to verify that insurance_type matches what the user actually requested, and that the other arguments (booking ID, cost, payment card) are consistent with the user's request. 
Then proceed if appropriate.", + "condition_label": "verbose" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/F_null.json b/experiments/parth/feedback_ablation/configs/F_null.json new file mode 100644 index 0000000..f6b646c --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/F_null.json @@ -0,0 +1,48 @@ +{ + "subset": "F", + "condition": "null", + "test_ids": [ + "multi_turn_base_173", + "multi_turn_base_179", + "multi_turn_base_180", + "multi_turn_base_184", + "multi_turn_base_48" + ], + "triggers": [ + { + "tool_name": "set_budget_limit", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + }, + { + "tool_name": "book_flight", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + }, + { + "tool_name": "purchase_insurance", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + }, + { + "tool_name": "cancel_booking", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + }, + { + "tool_name": "get_ticket", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/F_specific.json b/experiments/parth/feedback_ablation/configs/F_specific.json new file mode 100644 index 0000000..554be24 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/F_specific.json @@ -0,0 +1,48 @@ +{ + "subset": "F", + "condition": "specific", + "test_ids": [ + "multi_turn_base_173", + "multi_turn_base_179", + "multi_turn_base_180", + "multi_turn_base_184", + "multi_turn_base_48" + ], + "triggers": [ + { + "tool_name": 
"set_budget_limit", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before setting the budget limit, take a moment to reconsider whether this is the right next step in the user's task. If you've already set or attempted to set a budget limit recently, repeating the call won't change the outcome — review what's been done so far and whether a different action is needed to move the task forward. Then proceed if appropriate.", + "condition_label": "specific" + }, + { + "tool_name": "book_flight", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before booking, take a moment to reconsider whether this is the right next step in the user's task and whether the arguments you've selected are correct. Verify the action fits the current point in the workflow and that values like the payment card match what the user actually has available. Then proceed if appropriate.", + "condition_label": "specific" + }, + { + "tool_name": "purchase_insurance", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before purchasing, take a moment to reconsider whether this is the right next step in the user's task and whether the arguments are correct. Verify that the insurance type, booking ID, and payment card match what the user requested. Then proceed if appropriate.", + "condition_label": "specific" + }, + { + "tool_name": "cancel_booking", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before cancelling the booking, take a moment to reconsider whether this is the action the user actually requested. Cancellation is a state-changing operation that's easy to confuse with other actions like closing a ticket, resolving an issue, or undoing a different recent step. 
Verify that cancelling the booking is what the user asked for, then proceed if appropriate.", + "condition_label": "specific" + }, + { + "tool_name": "get_ticket", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before retrieving the ticket, take a moment to reconsider whether you already have the information this call would return. If you've recently called get_ticket and the ticket state hasn't changed, repeating the call won't yield new information — review what you already know and consider whether a different action is needed to move the task forward. Then proceed if appropriate.", + "condition_label": "specific" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/F_vague.json b/experiments/parth/feedback_ablation/configs/F_vague.json new file mode 100644 index 0000000..09663ce --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/F_vague.json @@ -0,0 +1,48 @@ +{ + "subset": "F", + "condition": "vague", + "test_ids": [ + "multi_turn_base_173", + "multi_turn_base_179", + "multi_turn_base_180", + "multi_turn_base_184", + "multi_turn_base_48" + ], + "triggers": [ + { + "tool_name": "set_budget_limit", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before proceeding with this call, take a moment to reconsider whether it's the right next step in the user's task. Confirm, then continue if appropriate.", + "condition_label": "vague" + }, + { + "tool_name": "book_flight", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before proceeding with this call, take a moment to reconsider whether it's the right next step in the user's task. Confirm, then continue if appropriate.", + "condition_label": "vague" + }, + { + "tool_name": "purchase_insurance", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before proceeding with this call, take a moment to reconsider whether it's the right next step in the user's task. 
Confirm, then continue if appropriate.", + "condition_label": "vague" + }, + { + "tool_name": "cancel_booking", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before proceeding with this call, take a moment to reconsider whether it's the right next step in the user's task. Confirm, then continue if appropriate.", + "condition_label": "vague" + }, + { + "tool_name": "get_ticket", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before proceeding with this call, take a moment to reconsider whether it's the right next step in the user's task. Confirm, then continue if appropriate.", + "condition_label": "vague" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/F_verbose.json b/experiments/parth/feedback_ablation/configs/F_verbose.json new file mode 100644 index 0000000..e5e0d52 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/F_verbose.json @@ -0,0 +1,48 @@ +{ + "subset": "F", + "condition": "verbose", + "test_ids": [ + "multi_turn_base_173", + "multi_turn_base_179", + "multi_turn_base_180", + "multi_turn_base_184", + "multi_turn_base_48" + ], + "triggers": [ + { + "tool_name": "set_budget_limit", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Budget limit operations are configuration calls whose effect depends on the limit value being correct and on the call being made at the right point in the workflow. A common failure mode in agent-driven financial workflows is for an agent to repeat the same configuration call multiple times in a row — either because the prior call's result wasn't fully processed, because the agent is uncertain whether it succeeded, or because the agent has lost track of what's already been done in the session. Repeating a configuration call without intervening state changes typically produces no progress and consumes context that could be spent on subsequent steps. 
Before setting the budget limit, take a moment to reconsider whether you've already set or attempted to set this limit, and whether the next move in the user's task is actually a different action. Then proceed if appropriate.", + "condition_label": "verbose" + }, + { + "tool_name": "book_flight", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Flight booking sits at a specific point in a multi-step travel workflow that typically involves authentication, account verification, card selection, and other prerequisites. A common failure mode is for an agent to issue a book_flight call before all the upstream context has been gathered — for example, picking a card_id based on inference rather than on the user's actual card list, or booking before confirming the trip parameters. Booking is also a financially consequential action that is hard to undo cleanly, so the cost of an incorrect call is higher than for read-only operations. Before booking, take a moment to reconsider whether this is the right next step at this point in the workflow and whether the arguments — payment card, dates, route, class — accurately reflect what the user requested. Then proceed if appropriate.", + "condition_label": "verbose" + }, + { + "tool_name": "purchase_insurance", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Insurance purchase calls involve both selecting the right product tier and tying the purchase to the correct booking and payment method. A common failure mode in agent-driven workflows is for the insurance_type to be populated with a default-looking value rather than the tier the user actually requested, or for the call to be issued before the booking it's meant to insure has been confirmed. Insurance is also distinct from booking in that the tier names can be ambiguous — \"travel,\" \"standard,\" \"basic,\" \"comprehensive\" — and these distinctions matter for whether the coverage matches the user's intent. 
Before purchasing, take a moment to reconsider whether this is the right next step and whether the insurance type, booking ID, cost, and payment card are all consistent with what the user asked for. Then proceed if appropriate.", + "condition_label": "verbose" + }, + { + "tool_name": "cancel_booking", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Booking cancellation is a state-changing operation with downstream effects on refunds, related reservations, and the user's broader travel plans. A common failure mode in agent-driven workflows is for cancellation tools to be confused with other completion-style actions — closing a ticket, resolving an issue, marking a task done — particularly when the user's phrasing is ambiguous about whether they want the booking itself undone or just a related record finalized. Cancellation is typically harder to reverse than other actions, so verifying intent before issuing the call is especially important. Before cancelling, take a moment to reconsider whether the user actually asked for the booking to be cancelled, or whether a different action (closing a ticket, resolving a support request, finalizing a different record) better matches their request. Then proceed if appropriate.", + "condition_label": "verbose" + }, + { + "tool_name": "get_ticket", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Ticket retrieval is a read-only operation whose value depends on whether the information it returns is new or has changed since the last retrieval. A common failure mode in agent-driven workflows is for an agent to call get_ticket repeatedly across turns without taking action on the information already returned — either because the agent is uncertain how to proceed, because it's looking for a state change that hasn't happened, or because it's lost track of what it already knows. Repeated read-only calls don't progress the task and can crowd out the actions that would. 
Before retrieving, take a moment to reconsider whether you already have the ticket information from a recent call and whether the next step in the user's task is actually a different action. Then proceed if appropriate.", + "condition_label": "verbose" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/G_null.json b/experiments/parth/feedback_ablation/configs/G_null.json new file mode 100644 index 0000000..1ce1648 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/G_null.json @@ -0,0 +1,26 @@ +{ + "subset": "G", + "condition": "null", + "test_ids": [ + "multi_turn_base_161", + "multi_turn_base_172", + "multi_turn_base_188", + "multi_turn_base_193" + ], + "triggers": [ + { + "tool_name": "authenticate_travel", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + }, + { + "tool_name": "purchase_insurance", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Unable to process this call at this time.", + "condition_label": "null" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/G_specific.json b/experiments/parth/feedback_ablation/configs/G_specific.json new file mode 100644 index 0000000..b332b21 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/G_specific.json @@ -0,0 +1,26 @@ +{ + "subset": "G", + "condition": "specific", + "test_ids": [ + "multi_turn_base_161", + "multi_turn_base_172", + "multi_turn_base_188", + "multi_turn_base_193" + ], + "triggers": [ + { + "tool_name": "authenticate_travel", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before authenticating, take a moment to verify whether you've already authenticated successfully in this session. Re-authenticating when a valid session already exists can reset session state and cause downstream calls to fail. 
Confirm whether authentication is actually needed, then proceed if appropriate.", + "condition_label": "specific" + }, + { + "tool_name": "purchase_insurance", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before purchasing, take a moment to verify the argument values you've selected, particularly the insurance type. Insurance offerings typically come in distinct tiers, and the user's request may specify or imply a particular one. Confirm the insurance_type matches what the user actually requested, and verify the other arguments are correct. Then proceed if appropriate.", + "condition_label": "specific" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/G_vague.json b/experiments/parth/feedback_ablation/configs/G_vague.json new file mode 100644 index 0000000..cfb30c7 --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/G_vague.json @@ -0,0 +1,26 @@ +{ + "subset": "G", + "condition": "vague", + "test_ids": [ + "multi_turn_base_161", + "multi_turn_base_172", + "multi_turn_base_188", + "multi_turn_base_193" + ], + "triggers": [ + { + "tool_name": "authenticate_travel", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before proceeding with this call, take a moment to verify whether it's needed and whether the arguments are correct. Confirm, then continue if appropriate.", + "condition_label": "vague" + }, + { + "tool_name": "purchase_insurance", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Before proceeding with this call, take a moment to verify whether it's needed and whether the arguments are correct. 
Confirm, then continue if appropriate.", + "condition_label": "vague" + } + ] +} diff --git a/experiments/parth/feedback_ablation/configs/G_verbose.json b/experiments/parth/feedback_ablation/configs/G_verbose.json new file mode 100644 index 0000000..d94420d --- /dev/null +++ b/experiments/parth/feedback_ablation/configs/G_verbose.json @@ -0,0 +1,26 @@ +{ + "subset": "G", + "condition": "verbose", + "test_ids": [ + "multi_turn_base_161", + "multi_turn_base_172", + "multi_turn_base_188", + "multi_turn_base_193" + ], + "triggers": [ + { + "tool_name": "authenticate_travel", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Authentication tools establish session state that subsequent tool calls depend on, and the relationship between authentication calls and session state can be subtle. A common failure mode in agent-driven workflows is for an agent to issue an authentication call when a valid session already exists — either because the agent has lost track of prior session state, because it's defensively re-authenticating before a sensitive operation, or because it's misinterpreting an unrelated error as an auth issue. Re-authenticating against an already-authenticated session can reset tokens, invalidate session-bound resources, or trigger rate limits, and can cause subsequent calls in the workflow to fail in ways that are hard to diagnose. Before authenticating, take a moment to verify whether a valid authenticated session already exists from earlier in this session, and whether the next step in the user's task actually requires a fresh authentication. Then proceed if appropriate.", + "condition_label": "verbose" + }, + { + "tool_name": "purchase_insurance", + "trigger_type": "tool_only", + "occurrence": 1, + "feedback_message": "Insurance purchases involve selecting from distinct product tiers that differ in coverage, cost, and applicability to the booking being insured. 
A common failure mode in agent-driven insurance workflows is for the insurance_type argument to be populated with a plausible-sounding value that doesn't actually match the tier the user requested or the tier appropriate for their stated needs. Insurance tier names can be easy to confuse — \"travel insurance\" may sound like a default for travel-related bookings even when the user explicitly asked for \"comprehensive\" coverage. Before purchasing, take a moment to verify that insurance_type matches what the user actually requested, and that the other arguments (booking ID, cost, payment card) are consistent with the user's request. Then proceed if appropriate.", + "condition_label": "verbose" + } + ] +} diff --git a/experiments/parth/feedback_ablation/run_experiment.py b/experiments/parth/feedback_ablation/run_experiment.py new file mode 100755 index 0000000..b2b3e07 --- /dev/null +++ b/experiments/parth/feedback_ablation/run_experiment.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python3 +"""Experiment runner for BFCL runtime feedback + +Runs the BFCL evaluation suite for a single (subset, condition) cell. +Resolves the right config file, sets all required environment variables, +and delegates to pytest with appropriate flags. 
+ +Usage +----- + # Run subset A with specific-label feedback + python experiments/parth/feedback_ablation/run_experiment.py --subset A --condition specific + + # Override the test-case list (comma-separated IDs) + python experiments/parth/feedback_ablation/run_experiment.py --subset E --condition vague \\ + --test-ids multi_turn_base_62,multi_turn_base_70 + + # Dry-run: print the pytest command without executing it + python experiments/parth/feedback_ablation/run_experiment.py --subset D --condition specific --dry-run + + # Extra pytest flags are forwarded verbatim after -- + python experiments/parth/feedback_ablation/run_experiment.py --subset A --condition specific -- -x -v + +Output +------ +Results are written to ``{output-dir}/{subset}/{condition}/``, where +``{output-dir}`` defaults to ``outputs/feedback`` at the repo root: + + outputs/feedback/A/specific/raw/{test_id}_complete.json + outputs/feedback/A/specific/raw/{test_id}_structured.jsonl + outputs/feedback/A/specific/raw/{test_id}_evaluation.json + outputs/feedback/A/specific/raw/external_feedback.jsonl ← structured trigger log +""" + +from __future__ import annotations + +import argparse +import json +import os +import subprocess +import sys +from pathlib import Path + +# --------------------------------------------------------------------------- +# Paths relative to repo root +# --------------------------------------------------------------------------- + +_THIS_DIR = Path(__file__).resolve().parent +REPO_ROOT = _THIS_DIR.parent.parent.parent +BFCL_DIR = REPO_ROOT / "tests" / "benchmarks" / "bfcl" +CONFIGS_DIR = _THIS_DIR / "configs" +RESULTS_DIR = REPO_ROOT / "outputs" / "feedback" + +VALID_SUBSETS = {"A", "B", "C", "D", "E", "F", "G"} +VALID_CONDITIONS = {"specific", "vague", "verbose", "null"} + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def resolve_config(subset: str, condition: str) -> Path: + """Return the path to the JSON config file for this cell. + + Raises FileNotFoundError with a helpful message if not found.
def load_test_ids_from_config(config_path: Path) -> list[str]:
    """Read the ``test_ids`` list from a cell config file.

    Args:
        config_path: Path to a ``{subset}_{condition}.json`` config file.

    Returns:
        The entries of the top-level ``test_ids`` array, as strings. An
        empty list is returned when the key is absent or explicitly
        ``null`` so that callers can always iterate / ``len()`` the result.

    Raises:
        ValueError: If ``test_ids`` is present but is not a JSON array.
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # The configs carry non-ASCII punctuation in their feedback messages, so
    # decode as UTF-8 explicitly instead of relying on the locale default.
    with open(config_path, encoding="utf-8") as fh:
        config = json.load(fh)

    test_ids = config.get("test_ids")
    if test_ids is None:
        return []
    if not isinstance(test_ids, list):
        # A bare string here would otherwise be iterated character by
        # character when the pytest filter is built.
        raise ValueError(
            f"'test_ids' in {config_path} must be a list, "
            f"got {type(test_ids).__name__}"
        )
    return [str(tid) for tid in test_ids]
def build_pytest_command(
    output_dir: Path,
    model: str,
    temperature: float,
    k_filter: str,
    extra_args: list[str],
) -> list[str]:
    """Return the argv list used to launch pytest for one experiment cell.

    The command runs ``test_bfcl.py`` from ``BFCL_DIR`` through the current
    interpreter (``sys.executable -m pytest``) with the external-feedback
    flag, the resolved output directory, the model and the temperature.
    A ``-k`` selector is added only when ``k_filter`` is non-empty, and
    ``extra_args`` are appended last, unchanged.
    """
    test_module = BFCL_DIR / "test_bfcl.py"
    # An empty filter means "no selection": omit -k entirely.
    selection = ["-k", k_filter] if k_filter else []
    return [
        sys.executable,
        "-m",
        "pytest",
        str(test_module),
        # Enable the external-feedback flag so test_bfcl.py passes it through.
        "--external-feedback",
        "--output-dir",
        str(output_dir.resolve()),
        "--model",
        model,
        "--temperature",
        str(temperature),
        *selection,
        *extra_args,
    ]
def main(argv: list[str] | None = None) -> int:
    """Run one (subset, condition) experiment cell.

    Resolves the cell's config file and test IDs, builds the pytest command
    and its environment, then either prints them (``--dry-run``) or executes
    pytest from the repo root.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``1`` if the config file is missing, ``0`` after a dry run,
        otherwise the pytest subprocess's return code.
    """
    # Local import: only needed to render the dry-run command line.
    import shlex

    args, extra_pytest_args = parse_args(argv)

    # Strip a leading "--" separator used to pass extra pytest args.
    if extra_pytest_args and extra_pytest_args[0] == "--":
        extra_pytest_args = extra_pytest_args[1:]

    # ---- Resolve config ----
    configs_dir = Path(args.config_dir) if args.config_dir else CONFIGS_DIR
    config_path = configs_dir / f"{args.subset}_{args.condition}.json"
    if not config_path.exists():
        available = sorted(configs_dir.glob("*.json"))
        hint = "\n ".join(p.name for p in available) or "(none found)"
        print(
            f"ERROR: Config file not found: {config_path}\n"
            f"Available configs in {configs_dir}:\n {hint}",
            file=sys.stderr,
        )
        return 1

    # ---- Resolve test IDs ----
    if args.test_ids:
        test_ids = [t.strip() for t in args.test_ids.split(",") if t.strip()]
    else:
        # Normalise a missing/null list to [] so len() below is always safe.
        test_ids = load_test_ids_from_config(config_path) or []

    if not test_ids:
        print(
            "WARNING: No test IDs specified and none found in config file. "
            "pytest will run ALL multi-turn BFCL tests.",
            file=sys.stderr,
        )

    # ---- Output directory (created only for real runs, see below) ----
    results_root = Path(args.output_dir) if args.output_dir else RESULTS_DIR
    output_dir = results_root / args.subset / args.condition

    # ---- Build command and environment ----
    k_filter = build_pytest_filter(test_ids)
    env = build_env(config_path, output_dir, args.model, args.temperature)
    cmd = build_pytest_command(
        output_dir, args.model, args.temperature, k_filter, extra_pytest_args
    )

    # ---- Dry run: just print, touch nothing on disk ----
    if args.dry_run:
        print("=== Config file ===")
        print(f" {config_path}")
        print(f"\n=== Test IDs ({len(test_ids)}) ===")
        for tid in test_ids:
            print(f" {tid}")
        print("\n=== Output directory ===")
        print(f" {output_dir}")
        print("\n=== Environment overrides ===")
        feedback_keys = [
            k
            for k in env
            if k.startswith("BFCL_") or k in {"DEFAULT_MODEL", "TEMPERATURE"}
        ]
        for k in sorted(feedback_keys):
            print(f" {k}={env[k]}")
        print("\n=== pytest command ===")
        # shlex.join quotes the -k expression (spaces, brackets) so the
        # printed command can be pasted into a shell as-is.
        print(" " + shlex.join(cmd))
        return 0

    # ---- Run ----
    # Creating "raw" with parents=True also creates output_dir itself.
    (output_dir / "raw").mkdir(parents=True, exist_ok=True)

    print(
        f"[run_experiment] subset={args.subset} condition={args.condition} "
        f"model={args.model} tests={len(test_ids) or 'all'}",
        flush=True,
    )
    print(f"[run_experiment] Output: {output_dir}", flush=True)
    print(f"[run_experiment] Config: {config_path}", flush=True)

    result = subprocess.run(cmd, env=env, cwd=str(REPO_ROOT))
    return result.returncode
class BFCLExample(dspy.Example):
    """
    ``dspy.Example`` subclass holding one BFCL test case.

    Built either from explicit ``test_id`` / ``question`` fields or from an
    existing example passed as ``base``; when ``base`` is given, ``test_id``
    and ``question`` are not forwarded to the parent constructor.
    """

    def __init__(
        self,
        test_id: str | None = None,
        question: str | None = None,
        *,
        base: dspy.Example | None = None,
        **kwargs: Any
    ):
        if base is not None:
            # Copy-style construction: only the base and extra fields go up.
            super().__init__(base=base, **kwargs)
            return
        super().__init__(test_id=test_id, question=question, **kwargs)
RUN_CTX.dev_ids: + phase = "gepa_dev" + else: + phase = "baseline" + + test_number = None + try: + test_number = int(test_id.rsplit("_", 1)[-1]) + except Exception: + pass + + + # Initialize timing + t0 = time.perf_counter() + timing: dict[str, float] = {} + + # dspy trace anchor + try: + t_trace = time.perf_counter() + with dspy.context(lm=self.execution_lm): + _ = self.prompt_predictor(prompt_input=question) + timing["dspy_trace_anchor_s"] = time.perf_counter() - t_trace + except Exception as e: + timing["dspy_trace_anchor_s"] = 0.0 + # print(f"[TRACE_ANCHOR_ERROR] {type(e).__name__}: {e}") + + # Write current instruction + instruction_text = self.get_instruction_text() + instruction_hash = sha256_text(instruction_text) + + t_write = time.perf_counter() + self._instruction_path.write_text(instruction_text, encoding="utf-8") + timing["write_instruction_s"] = time.perf_counter() - t_write + + # Create a unique directory for each individual run + run_uid = uuid.uuid4().hex[:12] + run_dir = self.base_dir / "runs" / f"{test_id}__{run_uid}" + run_dir.mkdir(parents=True, exist_ok=True) + + # Construct the pytest command + cmd = [ + self.pytest_binary, + f"tests/benchmarks/bfcl/test_bfcl.py::test_bfcl[{test_id}]", + "--model", + self.model, + "--instruction-file", + str(self._instruction_path), + "--output-dir", + str(run_dir), + "-q", + "-x" + ] + if self.enable_scoring_mode: + cmd.append("--gepa-scoring-mode") + + # Run the pytest command + t_pytest = time.perf_counter() + result = subprocess.run( + cmd, + capture_output=True, + text=True + ) + timing["pytest_run_s"] = time.perf_counter() - t_pytest + + # Parse outputs and evaluate + complete_path = run_dir / "raw" / f"{test_id}_complete.json" + + tool_calls_by_turn: List[List[dict[str, Any]]] = [] + executable_responses: List[List[str]] = [] + evaluation: dict[str, Any] | None = None + eval_error: str | None = None + failure_summary: str | None = None + + t_eval = time.perf_counter() + if complete_path.exists(): + 
try: + complete_data = json.loads(complete_path.read_text()) + tool_calls_by_turn = MessageSerializer.extract_tool_calls_by_turn(complete_data) + + for turn in tool_calls_by_turn: + for call in turn: + if "function" in call and call["function"]: + call["function"] = self.strip_tool_prefix(call["function"]) + + t_fmt = time.perf_counter() + executable_responses = MessageSerializer.format_to_executable(tool_calls_by_turn) + executable_responses = [ + [self.strip_tool_prefix(call) for call in turn] + for turn in executable_responses + ] + timing["format_to_executable_s"] = time.perf_counter() - t_fmt + + t_chk = time.perf_counter() + evaluation = bfcl_evaluator._run_evaluation( + test_id, + tool_calls_by_turn, + executable_responses, + ) + + if evaluation is not None: + eval_path = run_dir / "evaluation.json" + eval_path.write_text( + json.dumps(safe_json(evaluation), indent=2), + encoding="utf-8", + ) + + if "validation" in evaluation: + failure_summary = self.summarize_validation_failure(evaluation["validation"]) + + timing["bfcl_checker_s"] = time.perf_counter() - t_chk + except Exception as e: + eval_error = f"{type(e).__name__}: {e}" + + else: + eval_error = "Complete JSON not found (agent may have crashed)" + + timing["parse_and_eval_s"] = time.perf_counter() - t_eval + + tools_used = [call.get("function") for turn in tool_calls_by_turn for call in turn if call.get("function")] + behavior_summary = self.summarize_behavior_from_calls(tool_calls_by_turn) + + timing["total_forward_s"] = time.perf_counter() - t0 + + if RUN_CTX is not None: + record = { + "ts": utc_now_iso(), + "run_id": RUN_CTX.run_id, + + "phase": phase, + "test_id": test_id, + "test_number": test_number, + + "instruction": { + "hash": instruction_hash, + }, + + "evaluation": { + "valid": bool( + evaluation.get("validation", {}).get("valid", False) + ) if evaluation else False, + "eval_error": eval_error, + "path": str(run_dir / "evaluation.json") if evaluation else None, + }, + + 
def summarize_behavior_from_calls(self, tool_calls: List[List[dict[str, Any]]]) -> str:
    """
    Render the tool-call sequence as a two-line text summary.

    ``tool_calls`` is a list of turns, each a list of call dicts. Every
    call's ``"function"`` value is collected in order; calls whose value is
    missing or falsy are skipped. The result lists the names joined by
    ``" -> "`` (or ``NONE`` when there are none) followed by their count.
    """
    names = [
        call.get("function")
        for turn in tool_calls
        for call in turn
    ]
    used = [name for name in names if name]
    chain = " -> ".join(used) or "NONE"

    return (
        f"TOOLS: {chain}\n"
        f"NUM_TOOLS: {len(used)}"
    )
a/experiments/parth/gepa/data_utils.py b/experiments/parth/gepa/data_utils.py new file mode 100644 index 0000000..4cadbc8 --- /dev/null +++ b/experiments/parth/gepa/data_utils.py @@ -0,0 +1,81 @@ +""" +data.py + +Dataset loading utilities for GEPA on BFCL tests +""" + +from __future__ import annotations +from typing import List, Any +from tests.benchmarks.bfcl import loader as bfcl_loader +from .agent import BFCLExample + + +def stringify_question(question: Any) -> str: + + if isinstance(question, list) and question: + first = question[0] + + if isinstance(first, str): + return first + + if isinstance(first, dict): + return str(first.get("content", "")) + + if isinstance(first, list) and first: + msg0 = first[0] + if isinstance(msg0, dict): + return str(msg0.get("content", "")) + + if isinstance(question, dict): + return str(question.get("content", "")) + + if isinstance(question, str): + return question + + return "" + + +def load_test_cases(subset: str, limit: int | None = None) -> List[BFCLExample]: + """ + Load BFCL test cases from a given subset and return as BFCLExample objects + """ + test_ids = bfcl_loader.find_tests_in_category(subset, limit=limit) + examples: List[BFCLExample] = [] + for test_id in test_ids[:limit]: + entry = bfcl_loader.load_test_entry(test_id) + question = stringify_question(entry.get("question", "")) + ex = BFCLExample(test_id=test_id, question=question) + examples.append(ex.with_inputs("test_id", "question")) + + return examples + + +def extract_test_number(test_id: str) -> int | None: + try: + return int(test_id.rsplit("_", 1)[-1]) + except ValueError: + return None + + +def parse_test_number_spec(spec: str) -> set[int]: + numbers: set[int] = set() + + for part in spec.split(","): + part = part.strip() + if not part: + continue + + if "-" in part: + start_s, end_s = part.split("-", 1) + start, end = int(start_s), int(end_s) + + if start > end: + raise ValueError( + f"Invalid test number range: {start}-{end}" + ) + + 
# Maps a model-name prefix to the environment variables that provider needs.
MODEL_PROVIDER_ENV_VARS = {
    # OpenAI
    "gpt-": ["OPENAI_API_KEY"],

    # Anthropic
    "claude-": ["ANTHROPIC_API_KEY"],

    # Qwen
    "qwen-": ["QWEN_API_KEY"],

    # Kimi
    "kimi-": ["KIMI_API_KEY"],
}


def validate_model_environment(models: List[str]) -> None:
    """
    Validate that required environment variables are set
    for the requested models. Exit early if misconfigured.

    Models whose name matches no known prefix are not checked. On any
    missing or placeholder value a report is printed and the process exits
    with status 2.
    """
    missing: dict[str, List[str]] = {}

    # dict.fromkeys de-duplicates while keeping order: the same model is often
    # passed twice (agent + reflection) and must not be reported twice.
    for model in dict.fromkeys(models):
        required = dict.fromkeys(
            env
            for prefix, env_vars in MODEL_PROVIDER_ENV_VARS.items()
            if model.startswith(prefix)
            for env in env_vars
        )
        unset = [env for env in required if is_invalid_key(os.getenv(env) or "")]
        if unset:
            missing[model] = unset

    if not missing:
        return

    print("\n[CONFIG ERROR] Missing required environment variables:\n")
    for model, envs in missing.items():
        print(f" Model '{model}' requires:")
        for env in envs:
            print(f" - {env}")
    print(
        "\nSet the missing variables and re-run. "
        "No artifacts were produced for this run.\n"
    )
    sys.exit(2)


def is_invalid_key(value: str) -> bool:
    """
    Return True for an empty/whitespace-only value or a "your_..." placeholder.

    Surrounding whitespace is ignored before the placeholder check.
    """
    normalized = value.strip()
    return normalized == "" or normalized.lower().startswith("your_")
def exact_match_metric(
    gold,
    pred,
    trace=None,
    pred_name=None,
    pred_trace=None,
):
    """
    Score 1.0 when the gold and predicted answers are equal after trimming
    whitespace and lowercasing, else 0.0.

    ``trace``, ``pred_name`` and ``pred_trace`` are accepted but unused.
    """
    expected = gold.answer.strip().lower()
    produced = pred.answer.strip().lower()
    return float(expected == produced)
+ + model = SimpleQAModel(seed_instruction) + + # Baseline evaluation + evaluator = Evaluate( + devset=examples, + metric=exact_match_metric, + display_progress=True, + num_threads=1, + ) + + print("\n=== BASELINE ===") + baseline = evaluator(model) + (output_dir / "baseline.txt").write_text(f"Baseline score: {baseline.score}") + + # 5. Run GEPA + gepa = GEPA( + metric=exact_match_metric, + max_full_evals=20, + reflection_lm=lm, + track_stats=True, + seed=42, + ) + + train_size = int(0.7 * len(examples)) + trainset, devset = examples[:train_size], examples[train_size:] + + print("\n=== RUNNING GEPA ===") + optimized_model = gepa.compile( + model, + trainset=trainset, + valset=devset, + ) + + print("\n=== OPTIMIZED ===") + final_score = evaluator(optimized_model) + (output_dir / "optimized.txt").write_text(f"Optimized accuracy: {final_score.score}") + + # Correct way to access results (from real DSPy usage) + results = optimized_model.detailed_results + + # Save candidates with proper instruction extraction + print("\n=== CANDIDATES SAVED ===") + candidates = [] + for i, cand in enumerate(results.candidates): + instr = cand.get_instruction_text() # This works! 
+ candidates.append({ + "candidate_id": i, + "instruction_text": instr, + "val_score": results.val_aggregate_scores[i], + }) + (output_dir / "candidates.json").write_text(json.dumps(candidates, indent=2)) + + # Save instruction evolution + print("\n=== INSTRUCTIONS SAVED ===") + instructions_text = ( + f"Original:\n{seed_instruction}\n\n" + f"Optimized:\n{optimized_model.get_instruction_text()}" + ) + (output_dir / "instructions.txt").write_text(instructions_text) + + # Metadata + print("\n=== METADATA SAVED ===") + meta = { + "baseline_score": float(baseline.score), + "final_score": float(final_score), + "total_metric_calls": results.total_metric_calls, + "num_full_val_evals": results.num_full_val_evals, + "seed": results.seed, + } + (output_dir / "metadata.json").write_text(json.dumps(meta, indent=2)) + + print(f"\nAll outputs saved to {output_dir}/") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/experiments/parth/gepa/gepa_overview.txt b/experiments/parth/gepa/gepa_overview.txt new file mode 100644 index 0000000..255d57d --- /dev/null +++ b/experiments/parth/gepa/gepa_overview.txt @@ -0,0 +1,12 @@ +for step in optimization: + select candidate(s) + run agent on train examples + compute metric → (score, feedback) + build reflection prompt containing: + - current instruction + - feedback summaries + - scores + - possibly history + ask reflection LM: + "Propose an improved instruction" + parse LM output into a new instruction candidate \ No newline at end of file diff --git a/experiments/parth/gepa/logging_utils.py b/experiments/parth/gepa/logging_utils.py new file mode 100644 index 0000000..8b4fe32 --- /dev/null +++ b/experiments/parth/gepa/logging_utils.py @@ -0,0 +1,145 @@ +"""" +logging_utils.py + +Utility functions and objects for logging and saving outputs +""" + +from __future__ import annotations +import json +import hashlib +import subprocess +from dataclasses import dataclass +from datetime import datetime, timezone 
def utc_now_iso() -> str:
    """
    Return the current UTC time as an ISO-8601 string with second
    precision and a trailing "Z" (e.g. 2026-01-31T12:00:00Z).
    """
    return (
        datetime.now(timezone.utc)
        .replace(microsecond=0)
        .isoformat()
        .replace("+00:00", "Z")
    )


def sha256_text(text: str) -> str:
    """
    Return "sha256:<hexdigest>" for the UTF-8 encoding of ``text``.

    Used to identify instruction prompts across runs instead of storing
    large strings everywhere.
    """
    hexdigest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    return f"sha256:{hexdigest}"


def safe_json(obj: Any) -> Any:
    """
    Convert ``obj`` into a JSON-serializable structure.

    Already-serializable objects are returned unchanged. Otherwise dicts
    (keys stringified), lists/tuples and objects with ``__dict__`` are
    converted recursively; anything else becomes its ``repr``.
    """
    try:
        json.dumps(obj)
        return obj

    # Deliberately broad: any serialization failure takes the fallback path.
    except Exception:
        if isinstance(obj, dict):
            return {str(k): safe_json(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [safe_json(x) for x in obj]
        if hasattr(obj, "__dict__"):
            return safe_json(obj.__dict__)
        return repr(obj)


def append_jsonl(path: Path, record: dict[str, Any]) -> None:
    """
    Append ``record`` as one JSON line to the file at ``path``.

    Missing parent directories and the file itself are created.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")


class TeeIO:
    """
    Similar to a file, this object processes writes to both a
    stream (stdout, stderr) and a log file
    """
    def __init__(self, real_stream, log_file):
        self.real_stream = real_stream
        self.log_file = log_file

    def write(self, s: str) -> int:
        """Write ``s`` to both targets and return the number of characters, like a text stream."""
        self.real_stream.write(s)
        self.log_file.write(s)
        return len(s)

    def flush(self) -> None:
        """Flush both targets."""
        self.real_stream.flush()
        self.log_file.flush()

    def isatty(self) -> bool:
        """Always report a non-terminal."""
        return False


@dataclass
class RunContext:
    """
    Stores metadata used by metric functions and loggers

    Meant to be read only after initialization
    """
    run_id: str
    output_dir: Path
    metric_calls_path: Path
    candidate_snapshots_path: Path
    run_index_path: Path
    train_ids: set[str]
    dev_ids: set[str]
    score_definition: dict[str, Any]


# Module-level run context; stays None until a runner assigns it.
RUN_CTX: RunContext | None = None


def try_git_info() -> dict[str, Any]:
    """
    Best-effort lookup of the current git commit and dirty state.

    Returns {"git_commit": str | None, "git_dirty": bool | None}. A value is
    None when the corresponding git command fails or git cannot be run, so
    "unknown" is never reported as a clean tree.
    """
    info: dict[str, Any] = {"git_commit": None, "git_dirty": None}
    try:
        head = subprocess.run(
            args=["git", "rev-parse", "HEAD"],
            capture_output=True,
            text=True,
            check=False,
        )
        if head.returncode == 0:
            info["git_commit"] = head.stdout.strip()

        status = subprocess.run(
            args=["git", "status", "--porcelain"],
            capture_output=True,
            text=True,
            check=False,
        )
        # Empty output from a FAILED status call means "unknown", not "clean".
        if status.returncode == 0:
            info["git_dirty"] = bool(status.stdout.strip())

    except (OSError, subprocess.SubprocessError):
        # git missing or not runnable: report both fields as unknown.
        info["git_commit"] = None
        info["git_dirty"] = None

    return info


def log_run_index(record: dict[str, Any]) -> None:
    """
    Append a single BFCL execution record to run_index.jsonl

    Does nothing while RUN_CTX is unset.
    """
    if RUN_CTX is None:
        return

    append_jsonl(RUN_CTX.run_index_path, safe_json(record))
class MetricFeedback(dspy.Prediction):
    """
    Prediction returned to GEPA containing a scalar score and
    human-readable feedback
    """

    def __init__(self, score: float, feedback: str):
        super().__init__(score=score, feedback=feedback)


def build_score_definition() -> dict[str, Any]:
    """Return the static description of how scores are computed (stored in run artifacts)."""
    return {
        "hard_valid": "BFCL evaluator validation.valid (boolean) from multi_turn_checker",
        "final": "1.0 if hard_valid else 0.0",
        "note": (
            "Optimization and candidate scores use only hard validity. "
            "No soft or shaping score is applied."
        )
    }


def bfcl_metric_with_feedback(
    gold: dspy.Example,
    pred: dspy.Prediction,
    trace: Optional[Any] = None,
    pred_name: Optional[str] = None,
    pred_trace: Optional[Any] = None
) -> MetricFeedback:
    """
    Computes the GEPA metric for a single BFCL evaluation.
    Returns MetricFeedback(score, feedback)

    The score is 1.0 when the evaluator's ``validation.valid`` is truthy,
    else 0.0. The feedback text lists the result, split, evaluator output,
    a ground-truth diff and the behaviour summary. When a test id is known,
    one record is appended to the metric-calls log and one to the
    candidate-snapshots log.

    Raises:
        RuntimeError: if ``logging_utils.RUN_CTX`` has not been initialised.
    """
    ctx = logging_utils.RUN_CTX
    if ctx is None:
        raise RuntimeError(
            "RUN_CTX is None inside bfcl_metric_with_feedback. "
            "This means run.py did not initialize logging_utils.RUN_CTX correctly."
        )

    test_id = getattr(pred, "test_id", None) or getattr(gold, "test_id", None)
    feedback_parts: List[str] = []

    # Ground truth is only needed for the diff section below.
    # NOTE: the excluded-function / involved-class feedback was disabled in the
    # original; its supporting computation was dead code that could still raise
    # on malformed call strings, so it is no longer performed.
    gt: list[list[str]] = []
    if test_id:
        try:
            gt = bfcl_loader.load_ground_truth(test_id)
        except Exception as e:
            feedback_parts.append(
                f"WARNING: could not load BFCL ground truth/entry: {type(e).__name__}: {e}"
            )

    # Pull prediction info
    pred_exec: list[list[str]] = getattr(pred, "executable_responses", []) or []
    evaluation: dict[str, Any] | None = getattr(pred, "evaluation", None)
    eval_error: str | None = getattr(pred, "eval_error", None)

    has_evaluation = bool(evaluation) and isinstance(evaluation, dict)
    validation = evaluation.get("validation", {}) if has_evaluation else {}
    irrelevance = evaluation.get("irrelevance_check", {}) if has_evaluation else {}

    # Hard-valid (pass/fail); a non-dict validation payload counts as a failure
    # instead of crashing on ``.get``.
    hard_valid = isinstance(validation, dict) and bool(validation.get("valid", False))
    final_score = 1.0 if hard_valid else 0.0

    # Train/dev split
    split = None
    if test_id:
        if ctx.train_ids and test_id in ctx.train_ids:
            split = "train"
        elif ctx.dev_ids and test_id in ctx.dev_ids:
            split = "dev"
        else:
            split = "unknown"

    feedback_parts.append(f"RESULT: {'PASS' if hard_valid else 'FAIL'}")
    feedback_parts.append(
        f"SCORE: {'1.0' if hard_valid else '0.0'} (hard_valid)"
    )
    if split:
        feedback_parts.append(f"SPLIT: {split}")

    if has_evaluation:
        feedback_parts.append("EVALUATOR_VALIDATION:")
        if isinstance(validation, dict):
            for k in ["valid", "reason", "error_type", "error_message"]:
                if k in validation:
                    feedback_parts.append(f" {k}: {validation.get(k)}")
        else:
            feedback_parts.append(f" validation: {validation}")

        if isinstance(irrelevance, dict) and irrelevance:
            feedback_parts.append("EVALUATOR_IRRELEVANCE_CHECK:")
            # "irrelevant" is the key the agent reads from this dict;
            # "is_irrelevant" is kept in case the evaluator emits it — TODO confirm.
            for k in ["irrelevant", "is_irrelevant", "reason"]:
                if k in irrelevance:
                    feedback_parts.append(f" {k}: {irrelevance.get(k)}")

    if eval_error:
        feedback_parts.append(f"EVAL_ERROR: {eval_error}")

    if gt:
        feedback_parts.append("EXECUTABLE_DIFF:")
        feedback_parts.append(diff_summary(gt, pred_exec))

    if hasattr(pred, "behavior"):
        feedback_parts.append("BEHAVIOR_SUMMARY:")
        feedback_parts.append(str(pred.behavior))

    run_dir = getattr(pred, "run_dir", None)
    if run_dir:
        feedback_parts.append(f"RUN_DIR: {run_dir}")

    # Log the record
    if test_id:
        record = {
            "ts": utc_now_iso(),
            "run_id": ctx.run_id,
            "test_id": test_id,
            "split": split,
            "instruction_hash": getattr(pred, "instruction_hash", None),
            "hard_valid": hard_valid,
            "final": final_score,
            "timing": getattr(pred, "timing", None),
            "run_dir": run_dir,
            "eval_error": eval_error,
            "evaluator_validation": (
                safe_json(evaluation.get("validation"))
                if isinstance(evaluation, dict)
                else None
            ),
            "evaluator_irrelevance": (
                safe_json(evaluation.get("irrelevance_check"))
                if isinstance(evaluation, dict)
                else None
            ),
        }
        append_jsonl(ctx.metric_calls_path, record)

        # Candidate snapshot
        snap = {
            "ts": utc_now_iso(),
            "run_id": ctx.run_id,
            "instruction_hash": getattr(pred, "instruction_hash", None),
            "instruction_text": getattr(pred, "instruction_text", None),
            "latest_eval": {
                "test_id": test_id,
                "split": split,
                "hard_valid": hard_valid,
                "final": final_score,
            },
        }
        append_jsonl(ctx.candidate_snapshots_path, snap)

    return MetricFeedback(score=final_score, feedback="\n".join(feedback_parts))
import logging_utils + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Run GEPA instruction optimization on BFCL" + ) + + parser.add_argument("--test-subset", default="multi_turn_base") + parser.add_argument("--shuffle", action="store_true") + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--num-tests", type=int, default=None) + parser.add_argument("--test-numbers", type=str, default=None) + + parser.add_argument("--model", default="gpt-5-mini") + parser.add_argument("--reflection-model", default="gpt-5") + + parser.add_argument("--max-evaluations", type=int, default=20) + parser.add_argument("--auto", choices=["light", "medium", "heavy"], default=None) + + parser.add_argument("--instruction-file", type=Path, required=True) + parser.add_argument("--output-dir", type=Path, default=Path("outputs/gepa_on_bfcl")) + + parser.add_argument("--pytest-binary", default="pytest") + parser.add_argument("--gepa-scoring-mode", action="store_true") + + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + validate_model_environment([args.model, args.reflection_model]) + + args.output_dir.mkdir(parents=True, exist_ok=True) + + # Console mirroring + console_log_path = args.output_dir / "console.log" + console_log_f = console_log_path.open("w", encoding="utf-8") + real_out, real_err = sys.stdout, sys.stderr + sys.stdout = TeeIO(real_out, console_log_f) + sys.stderr = TeeIO(real_err, console_log_f) + + # Metadata initialization + overall_t0 = time.perf_counter() + timings: dict[str, float] = {} + run_id = f"{time.strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}" + + # ---- Persist exact rerun command ---- + python_executable = sys.executable + script_path = Path(__file__).resolve() + + argv = [python_executable, str(script_path)] + sys.argv[1:] + command_str = shlex.join(argv) + + command_path = args.output_dir / "command.sh" + command_path.write_text( + "#!/usr/bin/env bash\n\n" + 
command_str + "\n", + encoding="utf-8", + ) + + # Make it executable for convenience + command_path.chmod(0o755) + + + metric_calls_path = args.output_dir / "metric_calls.jsonl" + candidate_snapshots_path = args.output_dir / "candidate_snapshots.jsonl" + reflection_calls_path = args.output_dir / "reflection_calls.jsonl" + run_index_path = args.output_dir / "run_index.jsonl" + + score_definition = build_score_definition() + + try: + print(f"[{utc_now_iso()}] RUN_ID={run_id}") + print(f"[{utc_now_iso()}] output_dir={args.output_dir}") + + selected_test_numbers: set[int] | None = None + if args.test_numbers: + selected_test_numbers = parse_test_number_spec(args.test_numbers) + + # Load dataset + t_load = time.perf_counter() + all_examples = load_test_cases(args.test_subset, limit=None) + + examples = list(all_examples) + + # Explicit numeric test selection + if selected_test_numbers is not None: + before = len(examples) + + matched = [] + matched_numbers = set() + + for e in examples: + num = extract_test_number(e.test_id) + if num in selected_test_numbers: + matched.append(e) + matched_numbers.add(num) + + examples = matched + after = len(examples) + + print( + f"[{utc_now_iso()}] Selected tests by number: " + f"{sorted(matched_numbers)} ({after}/{len(selected_test_numbers)} found)" + ) + + # Shuffle & slice + rng = random.Random(args.seed) + + if args.shuffle: + rng.shuffle(examples) + + if args.num_tests is not None: + if selected_test_numbers is not None: + print( + f"[{utc_now_iso()}] --test-numbers provided; ignoring --num-tests" + ) + else: + examples = examples[: args.num_tests] + + + + train_size = int(0.7 * len(examples)) + trainset = examples[:train_size] + devset = examples[train_size:] + timings["load_dataset_s"] = time.perf_counter() - t_load + + # Split dataset + train_ids = {e.test_id for e in trainset} + dev_ids = {e.test_id for e in devset} + + (args.output_dir / "dataset_split.json").write_text( + json.dumps( + { + "run_id": run_id, + "test_subset": 
args.test_subset, + "shuffle": args.shuffle, + "seed": args.seed, + "num_tests": args.num_tests, + "examples_used_ordered": [e.test_id for e in examples], + "train_ids": sorted(train_ids), + "dev_ids": sorted(dev_ids), + "test_number_selection": ( + sorted(selected_test_numbers) if selected_test_numbers is not None else None + ), + "selection_mode": ( + "explicit_numbers" if selected_test_numbers is not None + else "first_n" if args.num_tests is not None + else "all" + ), + }, + indent=2, + ), + encoding="utf-8", + ) + + logging_utils.RUN_CTX = logging_utils.RunContext( + run_id=run_id, + output_dir=args.output_dir, + metric_calls_path=metric_calls_path, + candidate_snapshots_path=candidate_snapshots_path, + run_index_path=run_index_path, + train_ids=train_ids, + dev_ids=dev_ids, + score_definition=score_definition, + ) + + # Load initial instructions + instruction_text = args.instruction_file.read_text(encoding="utf-8") + instruction_hash = sha256_text(instruction_text) + + # Write the run manifest + manifest = { + "run_id": run_id, + "created_at": utc_now_iso(), + "argv": sys.argv, + "args": safe_json(vars(args)), + "instruction_file": str(args.instruction_file), + "instruction_hash": instruction_hash, + "score_definition": score_definition, + "test_selection": { + "mode": ( + "explicit_numbers" if selected_test_numbers is not None + else "first_n" if args.num_tests is not None + else "all" + ), + "test_numbers": ( + sorted(selected_test_numbers) if selected_test_numbers is not None else None + ), + "num_tests": args.num_tests, + "shuffle": args.shuffle, + "seed": args.seed, + }, + "models": { + "agent_model": args.model, + "reflection_model": args.reflection_model + }, + "dataset_split": { + "train_ids": sorted(train_ids), + "dev_ids": sorted(dev_ids), + }, + "environment": { + "python": sys.version, + "platform": platform.platform(), + "cwd": os.getcwd(), + }, + **try_git_info(), + } + (args.output_dir / "run_manifest.json").write_text( + json.dumps(manifest, 
indent=2), + encoding="utf-8", + ) + + # Create LMs + reflection_lm = dspy.LM(args.reflection_model) + execution_lm = dspy.LM(args.model) + + # Always configure a global LM (reflection-only by policy) + dspy.configure(lm=reflection_lm) + + + # Create agent + agent = BFCLAgent( + instruction_text=instruction_text, + model=args.model, + execution_lm=execution_lm, + base_dir=args.output_dir, + pytest_binary=args.pytest_binary, + enable_scoring_mode=args.gepa_scoring_mode, + ) + + # Run and evaluate baseline - no GEPA! + t_base = time.perf_counter() + baseline_valid = 0 + baseline_details: list[dict[str, Any]] = [] + + for ex in examples: + pred = agent(test_id=ex.test_id, question=ex.question) + + valid = False + if pred.evaluation: + valid = bool( + pred.evaluation.get("validation", {}).get("valid", False) + ) + + baseline_valid += int(valid) + baseline_details.append( + { + "test_id": ex.test_id, + "valid": valid, + "run_dir": pred.run_dir, + "eval_error": pred.eval_error, + } + ) + + timings["baseline_s"] = time.perf_counter() - t_base + + # Persist baseline + baseline_valid_rate = baseline_valid / max(len(examples), 1) + + (args.output_dir / "baseline.json").write_text( + json.dumps( + { + "run_id": run_id, + "instruction_hash": instruction_hash, + "bfcl_valid_rate": baseline_valid_rate, + "valid": baseline_valid, + "total": len(examples), + "runs": baseline_details, + }, + indent=2, + ), + encoding="utf-8", + ) + + print( + f"[{utc_now_iso()}] Baseline BFCL valid rate: " + f"{baseline_valid_rate:.3f} ({baseline_valid}/{len(examples)})" + ) + + # Finalize GEPA parameters + t_gepa = time.perf_counter() + gepa_kwargs: dict[str, Any] = { + "metric": bfcl_metric_with_feedback, + "reflection_lm": reflection_lm, + "track_stats": True, + "log_dir": str(args.output_dir / "gepa_logs"), + "seed": 42, + } + + if args.auto is not None: + gepa_kwargs["auto"] = args.auto + else: + gepa_kwargs["max_full_evals"] = args.max_evaluations + + (args.output_dir / 
"gepa_config.json").write_text( + json.dumps(safe_json(gepa_kwargs), indent=2), + encoding="utf-8", + ) + + # Create and run GEPA optimizer + gepa = GEPA(**gepa_kwargs) + + reflection_lm.history.clear() + optimized_agent = gepa.compile( + agent, + trainset=trainset, + valset=devset, + ) + + for i, entry in enumerate(reflection_lm.history): + record = { + "ts": entry.get("timestamp"), + "run_id": run_id, + "call_index": i, + "model": entry.get("model") or args.reflection_model, + "model_type": entry.get("model_type"), + + # Prompting + "prompt": entry.get("prompt"), + "messages": entry.get("messages"), + + # Outputs + "raw_response": entry.get("response"), + "outputs": entry.get("outputs"), + + # Generation config + "kwargs": entry.get("kwargs"), + + # Usage & cost + "usage": entry.get("usage"), + "cost": entry.get("cost"), + + # Traceability + "uuid": entry.get("uuid"), + } + + append_jsonl(reflection_calls_path, safe_json(record)) + + results = optimized_agent.detailed_results + timings["gepa_compile_s"] = time.perf_counter() - t_gepa + + # Final candidates summary (still useful) + candidates = [] + for i, cand in enumerate(results.candidates): + instr = cand.get_instruction_text() + candidates.append( + { + "candidate_id": i, + "instruction_hash": sha256_text(instr), + "instruction_text": instr, + "val_score": results.val_aggregate_scores[i], + "discovered_at_metric_call": results.discovery_eval_counts[i], + "parents": results.parents[i], + } + ) + (args.output_dir / "gepa_candidates.json").write_text(json.dumps(candidates, indent=2), encoding="utf-8") + + # Pareto + best_ids = set().union(*results.per_val_instance_best_candidates) + with open(args.output_dir / "gepa_pareto.txt", "w", encoding="utf-8") as f: + f.write("GEPA Pareto Frontier\n====================\n\n") + for i in sorted(best_ids, key=lambda i: results.val_aggregate_scores[i], reverse=True): + f.write(f"Candidate {i} | score={results.val_aggregate_scores[i]:.3f}\n") + f.write("-" * 40 + "\n") + 
f.write(results.candidates[i].get_instruction_text() + "\n\n") + + final_instr = optimized_agent.get_instruction_text() + (args.output_dir / "optimized_instructions.txt").write_text(final_instr, encoding="utf-8") + + # Scores file (explicit: which examples and how computed) + scores_payload = { + "run_id": run_id, + "score_definition": score_definition, + "dataset_split": { + "train_ids": sorted(train_ids), + "dev_ids": sorted(dev_ids), + }, + "baseline": { + "bfcl_valid_rate_over_all_examples": baseline_valid_rate, + "examples_used": [e.test_id for e in examples], + "valid_count": baseline_valid, + "total_count": len(examples), + }, + "gepa": { + "objective": "binary hard_valid (1.0 pass / 0.0 fail) aggregated over dev set by GEPA", + "val_aggregate_scores": safe_json(results.val_aggregate_scores), + "candidate_count": len(results.candidates), + }, + "note": "For per-evaluation, per-test, per-step details see metric_calls.jsonl (append-only).", + } + (args.output_dir / "scores.json").write_text(json.dumps(scores_payload, indent=2), encoding="utf-8") + + # Metadata + timings + timings["total_wall_s"] = time.perf_counter() - overall_t0 + (args.output_dir / "timings.json").write_text(json.dumps({"run_id": run_id, **timings}, indent=2), encoding="utf-8") + + meta = { + "run_id": run_id, + "baseline_bfcl_valid_rate": baseline_valid_rate, + "final_score": max(results.val_aggregate_scores) if results.val_aggregate_scores else None, + "total_metric_calls": results.total_metric_calls, + "num_full_val_evals": results.num_full_val_evals, + "seed": results.seed, + } + (args.output_dir / "optimization_metadata.json").write_text(json.dumps(meta, indent=2), encoding="utf-8") + + print(f"[{utc_now_iso()}] Done. 
def fn_name(executable_call: str) -> str:
    """
    Extract the function name from a tool call string

    Ex: read(file='log.txt') -> 'read'

    A string without "(" is returned unchanged; a falsy input yields "".
    """
    if not executable_call:
        return ""

    # partition never raises: with no "(" the whole string is the name.
    return executable_call.partition("(")[0]


def soft_turn_score(gt_turn: List[str], pred_turn: List[str]) -> float:
    """
    Returns a score in [0, 1] for a single turn by comparing function
    overlap between ground truth and agent prediction

    Identical turns score 1.0. Otherwise the score is the F1 of the sets of
    function names (arguments and call order are ignored).
    """
    # Perfectly aligned
    if gt_turn == pred_turn:
        return 1.0

    gt_fns = [fn_name(x) for x in gt_turn]
    pr_fns = [fn_name(x) for x in pred_turn]

    # No functions expected AND no functions called
    if not gt_fns and not pr_fns:
        return 1.0

    # Either calls were made when none were expected, or expected calls
    # were not made at all.
    if not gt_fns or not pr_fns:
        return 0.0

    gt_set = set(gt_fns)
    pr_set = set(pr_fns)
    intersection = len(gt_set.intersection(pr_set))

    # No tool intersection -> 0.0
    if intersection == 0:
        return 0.0

    # Of all the tools the agent called, how many were in the ground truth
    precision = intersection / max(len(pr_set), 1)
    # Of all the tools in the ground truth, how many did the agent call
    recall = intersection / max(len(gt_set), 1)

    # F1 = harmonic mean of precision and recall
    return (2 * precision * recall) / (precision + recall)


def soft_sequence_score(gt: List[List[str]], pred: List[List[str]]) -> float:
    """
    Returns a score in [0, 1] for a given multi-turn sequence, which is the
    arithmetic average of soft turn scores

    Turns missing on either side are compared against an empty turn.
    """
    # No functions expected AND no functions called
    if not gt and not pred:
        return 1.0

    n = max(len(gt), len(pred), 1)
    total = 0.0

    for i in range(n):
        gt_turn = gt[i] if i < len(gt) else []
        pred_turn = pred[i] if i < len(pred) else []
        total += soft_turn_score(gt_turn, pred_turn)

    return total / n


def diff_summary(gt: List[List[str]], pred: List[List[str]],
                 *, max_turns: int = 8, max_calls_per_turn: int = 8
                 ) -> str:
    """
    Produce a readable string representation of the diff between
    GT and predicted tool call sequences

    At most ``max_turns`` turns and ``max_calls_per_turn`` calls per turn are
    listed. A final line reports a turn-count mismatch.

    Intended for logging
    """
    lines: List[str] = []
    n = min(max(len(gt), len(pred)), max_turns)

    for i in range(n):
        gt_turn = gt[i] if i < len(gt) else []
        pr_turn = pred[i] if i < len(pred) else []

        if gt_turn == pr_turn:
            lines.append(f"TURN {i + 1}: OK (exact match)")
            continue

        lines.append(f"TURN {i + 1}: MISMATCH")
        lines.append(" EXPECTED:")
        if gt_turn:
            for s in gt_turn[:max_calls_per_turn]:
                lines.append(f" - {s}")
            if len(gt_turn) > max_calls_per_turn:
                lines.append(f" ... (+{len(gt_turn) - max_calls_per_turn} more)")
        else:
            lines.append(" - (no calls expected)")

        lines.append(" GOT:")
        if pr_turn:
            for s in pr_turn[:max_calls_per_turn]:
                lines.append(f" - {s}")
            if len(pr_turn) > max_calls_per_turn:
                lines.append(f" ... (+{len(pr_turn) - max_calls_per_turn} more)")
        else:
            lines.append(" - (no calls produced)")

    if len(gt) != len(pred):
        lines.append(
            f"TURN COUNT: expected {len(gt)} turns, got {len(pred)} turns"
        )

    return "\n".join(lines)
+ +Config file schema +------------------ +See ``experiments/parth/feedback_ablation/configs/`` for examples. Top-level fields:: + + subset – "A" … "G" + condition – "specific" | "vague" | "verbose" | "null" + description – optional human note (ignored at runtime) + test_ids – list of BFCL test IDs this config applies to + triggers – list of trigger-rule objects (see TriggerRule) + +Each trigger-rule object:: + + tool_name – (str, required) target tool, name-normalised on match + trigger_type – "tool_only" | "argument_present" + | "argument_value" | "precondition_missing" + | "precondition_check" + argument_conditions – (object, optional) shape varies by trigger_type + condition – (str, optional) for precondition_check only + occurrence – (int) which matching call fires the trigger (1 = first) + feedback_message – (str) injected warning; empty string → bare rejection + condition_label – "specific" | "vague" | "verbose" | "null" + +argument_conditions shapes +-------------------------- +tool_only + omit entirely + +argument_present + {"forbidden_args": ["param1", "param2"]} + Fires when *any* listed key appears in the agent's arguments. + +argument_value + {"checks": [{"key": "speed", "op": "gt", "value": 120}], "match": "any"} + Supported ops: eq, neq, gt, gte, lt, lte, in, not_in, contains. + "match" is "any" (default) or "all". + +precondition_missing + {"required_prior_calls": ["startEngine"]} + Fires when *any* listed tool has not yet been called in this session. + Only tool calls that pass through (are not blocked) count as "called". + +precondition_check + Evaluates a named condition against tracked execution state. + Requires ``condition`` field on the trigger rule. Supported conditions: + + cd_to_current_dir + Fires if the cd target matches the current working directory. + duplicate_ls_no_state_change + Fires if ls was already called with the same arguments and no + state-mutating tool has run since. 
@dataclass
class TriggerRule:
    """A single trigger rule parsed from a config file or constructed directly.

    The first eight fields describe the rule and are supplied by the caller;
    ``match_count`` and ``fired`` are bookkeeping that the middleware mutates
    while tool calls are being evaluated.
    """

    # Target tool name exactly as written in the config / legacy argument.
    tool_name: str
    # ``tool_name`` after the middleware's ``_normalize`` (lower-cased, only
    # a-z / 0-9 kept); this is the value incoming call names are compared to.
    tool_name_normalized: str
    trigger_type: str  # tool_only | argument_present | argument_value | precondition_missing | precondition_check
    # Extra matching data whose shape depends on ``trigger_type``; None when
    # the rule carries no argument conditions.
    argument_conditions: dict[str, Any] | None
    condition: str | None  # precondition_check condition name (e.g. cd_to_current_dir)
    # The rule fires on the call where ``match_count`` reaches this value
    # (1 = first matching call).
    occurrence: int
    # Text raised to the agent as the ToolError message when the rule fires.
    feedback_message: str
    # Experiment condition tag; copied into log records and stderr output.
    condition_label: str

    # Runtime counters — mutated during execution, not sourced from config.
    # Number of calls so far whose name and conditions both matched this rule.
    match_count: int = field(default=0, init=False)
    # Set to True once the rule has fired; fired rules are skipped afterwards.
    fired: bool = field(default=False, init=False)
+STATE_MUTATING_TOOLS: frozenset[str] = frozenset( + {"mv", "mkdir", "echo", "cd", "cp", "rm", "touch", "rmdir", "cat"} +) + + +# --------------------------------------------------------------------------- +# Middleware +# --------------------------------------------------------------------------- + +class ExternalFeedbackMiddleware(Middleware): + """Config-driven (or legacy env-var) external-feedback injection middleware. + + Parameters + ---------- + config: + Pre-parsed config dict (mutually exclusive with *config_path*). + config_path: + Path to a JSON config file. Loaded on ``__init__``. + target_tool_name: + Legacy mode: the single tool to watch. + warning_message: + Legacy mode: message to inject. + trigger_on_nth_call: + Legacy mode: which occurrence to fire on (default 1). + """ + + def __init__( + self, + *, + # Config-file mode + config: dict[str, Any] | None = None, + config_path: str | Path | None = None, + # Legacy / direct-constructor mode + target_tool_name: str | None = None, + warning_message: str | None = None, + trigger_on_nth_call: int = 1, + ) -> None: + super().__init__() + + # --- Logging setup --- + self.log_file: str | None = os.getenv("BFCL_EXTERNAL_FEEDBACK_LOG_FILE") + if self.log_file: + Path(self.log_file).parent.mkdir(parents=True, exist_ok=True) + + self.test_case_id: str = os.getenv("TEST_ID", "unknown") + self.global_call_index: int = 0 + + # Set of *normalised* tool names that have successfully passed through + # (not blocked). Used for precondition_missing matching. 
+ self.called_tools: set[str] = set() + + # --- Tracked execution state (for precondition_check triggers) --- + initial_cwd = os.getenv("BFCL_INITIAL_CWD") + self.cwd: str | None = initial_cwd + self.last_ls_result: str | None = None + self.last_ls_args: dict[str, Any] | None = None + self.state_changed_since_last_ls: bool = False + self.recent_files: set[str] = set() + if initial_cwd: + self._stderr( + f"[ExternalFeedbackMiddleware] Initial cwd seeded: {initial_cwd}" + ) + + # --- Build trigger rules --- + if config is not None or config_path is not None: + self.triggers = self._load_config_triggers(config, config_path) + self._stderr( + f"[ExternalFeedbackMiddleware] Config mode: " + f"{len(self.triggers)} trigger(s) loaded" + ) + for i, rule in enumerate(self.triggers, 1): + self._stderr( + f"[ExternalFeedbackMiddleware] Rule {i}: " + f"{rule.trigger_type} on '{rule.tool_name}' " + f"(occurrence={rule.occurrence}, label={rule.condition_label})" + ) + else: + self.triggers = self._build_legacy_triggers( + target_tool_name, warning_message, trigger_on_nth_call + ) + + # ------------------------------------------------------------------ + # Config / legacy builders + # ------------------------------------------------------------------ + + def _load_config_triggers( + self, + config: dict[str, Any] | None, + config_path: str | Path | None, + ) -> list[TriggerRule]: + if config is None: + assert config_path is not None + with open(config_path) as fh: + config = json.load(fh) + rules: list[TriggerRule] = [] + for raw in config.get("triggers", []): + tool_name: str = raw["tool_name"] + rules.append( + TriggerRule( + tool_name=tool_name, + tool_name_normalized=self._normalize(tool_name), + trigger_type=raw["trigger_type"], + argument_conditions=raw.get("argument_conditions"), + condition=raw.get("condition"), + occurrence=int(raw["occurrence"]), + feedback_message=raw.get("feedback_message", ""), + condition_label=raw["condition_label"], + ) + ) + return rules + + def 
_build_legacy_triggers( + self, + target_tool_name: str | None, + warning_message: str | None, + trigger_on_nth_call: int, + ) -> list[TriggerRule]: + """Construct a single tool_only rule from env vars / constructor args.""" + tool = target_tool_name or os.getenv("BFCL_EXTERNAL_FEEDBACK_TOOL", "") + msg = warning_message or os.getenv("BFCL_EXTERNAL_FEEDBACK_MESSAGE", "") + n = int(os.getenv("BFCL_EXTERNAL_FEEDBACK_N", str(trigger_on_nth_call))) + self._stderr( + f"[ExternalFeedbackMiddleware] Legacy mode: " + f"watching '{tool}', trigger on call #{n}" + ) + return [ + TriggerRule( + tool_name=tool, + tool_name_normalized=self._normalize(tool), + trigger_type="tool_only", + argument_conditions=None, + condition=None, + occurrence=n, + feedback_message=msg, + condition_label="specific", + ) + ] + + # ------------------------------------------------------------------ + # FastMCP hook + # ------------------------------------------------------------------ + + async def on_call_tool( + self, + context: MiddlewareContext[CallToolRequestParams], + call_next: CallNext[CallToolRequestParams, ToolResult], + ) -> ToolResult: + msg = context.message + self.global_call_index += 1 + args: dict[str, Any] = dict(msg.arguments or {}) + + # Evaluate every unfired rule in order. + for rule in self.triggers: + if rule.fired: + continue + if not self._name_matches(msg.name, rule.tool_name_normalized): + continue + + matched, precondition_state = self._conditions_match(rule, args) + if not matched: + # Log the non-match for precondition_check rules (useful for debugging). 
+ if rule.trigger_type == "precondition_check" and precondition_state: + self._write_log_record( + tool_name=msg.name, + arguments=args, + triggered=False, + rule=rule, + precondition_state=precondition_state, + ) + continue + + rule.match_count += 1 + + if rule.match_count == rule.occurrence: + # ---- Trigger fires ---- + rule.fired = True + self._write_log_record( + tool_name=msg.name, + arguments=args, + triggered=True, + rule=rule, + precondition_state=precondition_state, + ) + self._stderr( + f"[ExternalFeedbackMiddleware] ✓ TRIGGERED '{rule.trigger_type}' " + f"on '{msg.name}' " + f"(match #{rule.match_count}, label={rule.condition_label})" + ) + + raise ToolError(rule.feedback_message or "") + + # ---- No rule fired — pass through ---- + self._write_log_record( + tool_name=msg.name, + arguments=args, + triggered=False, + rule=None, + ) + result = await call_next(context) + # Only record as "called" after a successful pass-through. + self.called_tools.add(self._normalize(msg.name)) + # Update tracked execution state from the result. + self._update_tracked_state(msg.name, args, result) + return result + + # ------------------------------------------------------------------ + # Matching helpers + # ------------------------------------------------------------------ + + def _name_matches(self, call_name: str, rule_normalized: str) -> bool: + normalised = self._normalize(call_name) + return normalised == rule_normalized or normalised.endswith(rule_normalized) + + def _conditions_match( + self, rule: TriggerRule, args: dict[str, Any], + ) -> tuple[bool, dict[str, Any] | None]: + """Return (matched, precondition_state) for *rule*. + + ``precondition_state`` is non-None only for ``precondition_check`` + rules and contains the values that were compared (for logging). 
+ """ + t = rule.trigger_type + cond: dict[str, Any] = rule.argument_conditions or {} + + if t == "tool_only": + return True, None + + if t == "argument_present": + forbidden: list[str] = cond.get("forbidden_args", []) + return any(k in args for k in forbidden), None + + if t == "argument_value": + checks: list[dict[str, Any]] = cond.get("checks", []) + match_mode: str = cond.get("match", "any") + results = [self._evaluate_check(args, c) for c in checks] + matched = all(results) if match_mode == "all" else any(results) + return matched, None + + if t == "precondition_missing": + required: list[str] = cond.get("required_prior_calls", []) + # Fires when at least one required prior tool has NOT yet passed through. + return any(self._normalize(r) not in self.called_tools for r in required), None + + if t == "precondition_check": + return self._evaluate_precondition(rule, args) + + self._stderr( + f"[ExternalFeedbackMiddleware] Unknown trigger_type '{t}', skipping" + ) + return False, None + + # ------------------------------------------------------------------ + # Precondition-check evaluation + # ------------------------------------------------------------------ + + def _evaluate_precondition( + self, rule: TriggerRule, args: dict[str, Any], + ) -> tuple[bool, dict[str, Any]]: + """Evaluate a precondition_check rule against tracked state.""" + condition = rule.condition + + if condition == "cd_to_current_dir": + target = args.get("folder", args.get("path", "")) + matched = self._is_same_dir(target) + state = { + "condition": condition, + "condition_met": matched, + "cwd": self.cwd, + "cd_target": target, + } + return matched, state + + if condition == "duplicate_ls_no_state_change": + same_args = self.last_ls_args is not None and args == self.last_ls_args + matched = ( + self.last_ls_result is not None + and not self.state_changed_since_last_ls + and same_args + ) + state = { + "condition": condition, + "condition_met": matched, + "last_ls_result_exists": 
self.last_ls_result is not None, + "state_changed_since_last_ls": self.state_changed_since_last_ls, + "same_args": same_args, + "last_ls_args": self.last_ls_args, + "current_args": args, + } + return matched, state + + if condition == "arg_not_recent_file": + file_name = args.get("file_name", "") + matched = bool(file_name and file_name not in self.recent_files) + state = { + "condition": condition, + "condition_met": matched, + "file_name": file_name, + "recent_files": sorted(self.recent_files), + } + return matched, state + + if condition == "mv_dest_missing_directory": + dest = args.get("destination", "") + has_slash = "/" in dest + ls_dirs = self._parse_ls_directories() + in_ls = dest in ls_dirs if ls_dirs is not None else False + matched = bool(dest and not has_slash and not in_ls) + state = { + "condition": condition, + "condition_met": matched, + "destination": dest, + "has_slash": has_slash, + "ls_directories": sorted(ls_dirs) if ls_dirs is not None else None, + "dest_in_ls": in_ls, + } + return matched, state + + if condition == "mkdir_already_exists": + dir_name = args.get("dir_name", "") + cwd_name = self.cwd.rstrip("/").rsplit("/", 1)[-1] if self.cwd else None + ls_dirs = self._parse_ls_directories() + in_ls = dir_name in ls_dirs if ls_dirs is not None else False + is_cwd = bool(cwd_name and dir_name == cwd_name) + matched = in_ls or is_cwd + state = { + "condition": condition, + "condition_met": matched, + "dir_name": dir_name, + "cwd_name": cwd_name, + "in_ls_dirs": in_ls, + "matches_cwd": is_cwd, + } + return matched, state + + if condition == "echo_content_extra_quotes": + content = args.get("content", "") + matched = ( + len(content) >= 2 + and ( + (content[0] == "'" and content[-1] == "'") + or (content[0] == '"' and content[-1] == '"') + ) + ) + state = { + "condition": condition, + "condition_met": matched, + "content_preview": content[:60] + ("…" if len(content) > 60 else ""), + "starts_with": content[0] if content else None, + "ends_with": 
content[-1] if content else None, + } + return matched, state + + self._stderr( + f"[ExternalFeedbackMiddleware] Unknown precondition_check " + f"condition '{condition}', skipping" + ) + return False, {"condition": condition, "condition_met": False, "error": "unknown"} + + def _is_same_dir(self, target: str) -> bool: + """Check whether *target* resolves to the current working directory.""" + if self.cwd is None: + return False + if not target or target == ".": + return True + cwd_name = self.cwd.rstrip("/").rsplit("/", 1)[-1] + target_clean = target.rstrip("/") + if target_clean == cwd_name: + return True + if target_clean.startswith("/"): + return target_clean.rstrip("/") == self.cwd.rstrip("/") + import posixpath + resolved = posixpath.normpath(posixpath.join(self.cwd, target_clean)) + return resolved == posixpath.normpath(self.cwd) + + # ------------------------------------------------------------------ + # Tracked state updates + # ------------------------------------------------------------------ + + def _update_tracked_state( + self, tool_name: str, args: dict[str, Any], result: ToolResult, + ) -> None: + """Update cwd / ls / mutation / file tracking from a successful tool result.""" + norm = self._normalize(tool_name) + text = self._extract_result_text(result) + + if norm in {"cd", "chdir"}: + cwd = self._parse_json_field(text, "current_working_directory") + if cwd: + self.cwd = cwd + self.state_changed_since_last_ls = True + + elif norm == "pwd": + cwd = self._parse_json_field(text, "current_working_directory") + if cwd: + self.cwd = cwd + + elif norm == "ls": + self.last_ls_result = text + self.last_ls_args = dict(args) + self.state_changed_since_last_ls = False + + if norm in STATE_MUTATING_TOOLS: + self.state_changed_since_last_ls = True + + # Track files involved in file-manipulation tools. 
+ if norm in {"mv", "cp"}: + for key in ("source", "destination"): + val = args.get(key, "") + if val: + self.recent_files.add(val.rsplit("/", 1)[-1]) + elif norm in {"cat", "grep"}: + val = args.get("file_name", "") + if val: + self.recent_files.add(val.rsplit("/", 1)[-1]) + + @staticmethod + def _extract_result_text(result: ToolResult) -> str: + """Pull the concatenated text from a ToolResult.""" + parts: list[str] = [] + for block in result.content: + if hasattr(block, "text"): + parts.append(block.text) + return "\n".join(parts) + + @staticmethod + def _parse_json_field(text: str, field: str) -> str | None: + """Try to extract a top-level JSON field from *text*.""" + try: + data = json.loads(text) + if isinstance(data, dict): + return data.get(field) + except (json.JSONDecodeError, TypeError): + pass + return None + + def _parse_ls_directories(self) -> set[str] | None: + """Extract directory names from the last ls result. + + The GorillaFileSystem ls returns JSON with a + ``current_directory_content`` list of names. We can't + distinguish files from directories by name alone, so we return + all entries — the caller treats them as potential directory names. 
+ """ + if self.last_ls_result is None: + return None + items = self._parse_json_field(self.last_ls_result, "current_directory_content") + if isinstance(items, list): + return {str(i) for i in items} + return None + + # ------------------------------------------------------------------ + # Argument-value checks + # ------------------------------------------------------------------ + + def _evaluate_check(self, args: dict[str, Any], check: dict[str, Any]) -> bool: + """Evaluate a single argument_value check dict against *args*.""" + key: str = check["key"] + op: str = check["op"] + expected: Any = check["value"] + + if key not in args: + return False + + actual: Any = args[key] + + try: + match op: + case "eq": + return bool(actual == expected) + case "neq": + return bool(actual != expected) + case "gt": + return bool(actual > expected) + case "gte": + return bool(actual >= expected) + case "lt": + return bool(actual < expected) + case "lte": + return bool(actual <= expected) + case "in": + return bool(actual in expected) + case "not_in": + return bool(actual not in expected) + case "contains": + if isinstance(actual, str): + return str(expected) in actual + if isinstance(actual, (list, tuple)): + return expected in actual + return False + case _: + self._stderr( + f"[ExternalFeedbackMiddleware] Unknown op '{op}', " + "check skipped (returns False)" + ) + return False + except (TypeError, ValueError): + return False + + # ------------------------------------------------------------------ + # Structured logging + # ------------------------------------------------------------------ + + def _write_log_record( + self, + tool_name: str, + arguments: dict[str, Any], + triggered: bool, + rule: TriggerRule | None, + precondition_state: dict[str, Any] | None = None, + ) -> None: + """Append one JSON line to the log file (if configured).""" + record: dict[str, Any] = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "test_case_id": self.test_case_id, + 
"global_call_index": self.global_call_index, + "tool_name": tool_name, + "arguments": arguments, + "triggered": triggered, + "trigger_type": rule.trigger_type if rule else None, + "condition_label": rule.condition_label if rule else None, + "feedback_message": rule.feedback_message if rule else None, + "occurrence": rule.match_count if rule else None, + } + if precondition_state is not None: + record["precondition_state"] = precondition_state + + status = "TRIGGERED" if triggered else "pass" + self._stderr( + f"[ExternalFeedbackMiddleware] " + f"call #{self.global_call_index} '{tool_name}' -> {status}" + ) + + if self.log_file: + try: + with open(self.log_file, "a") as fh: + fh.write(json.dumps(record) + "\n") + fh.flush() + except Exception as exc: + self._stderr( + f"[ExternalFeedbackMiddleware] Log write error: {exc}" + ) + + # ------------------------------------------------------------------ + # Utilities + # ------------------------------------------------------------------ + + @staticmethod + def _normalize(name: str) -> str: + return re.sub(r"[^a-z0-9]", "", name.lower()) + + @staticmethod + def _stderr(message: str) -> None: + print(message, file=sys.stderr, flush=True) diff --git a/src/wags/proxy.py b/src/wags/proxy.py index 03df525..3598668 100644 --- a/src/wags/proxy.py +++ b/src/wags/proxy.py @@ -1,7 +1,6 @@ """MCP proxy server with middleware support.""" from collections.abc import Awaitable, Callable -from functools import partial from typing import Any, overload import mcp.types @@ -51,9 +50,17 @@ async def _apply_middleware( call_next: Callable[[MiddlewareContext[Any]], Awaitable[Any]], ) -> Any: """Apply middleware chain.""" + # Only apply middleware to tool call requests + # Middleware is designed for CallToolRequestParams which have a 'name' attribute + if not hasattr(context.message, 'name'): + return await call_next(context) + chain = call_next for mw in reversed(self.middleware): - chain = partial(mw, call_next=chain) + prev_chain = chain + 
async def middleware_wrapper(ctx: MiddlewareContext[Any], mw: Any = mw, prev_chain: Any = prev_chain) -> Any: + return await mw.on_call_tool(ctx, prev_chain) + chain = middleware_wrapper return await chain(context) diff --git a/tests/benchmarks/bfcl/fastagent.config.yaml b/tests/benchmarks/bfcl/fastagent.config.yaml index 668e893..8b5da27 100644 --- a/tests/benchmarks/bfcl/fastagent.config.yaml +++ b/tests/benchmarks/bfcl/fastagent.config.yaml @@ -1,5 +1,15 @@ # Static configuration with all available BFCL API servers # This single config is used for all tests - the script selects which servers to use +# +# The _wags_env anchor forwards external-feedback env vars to every MCP server +# subprocess. When unset (baseline runs) the values default to empty strings and +# mcp_server.py stays in baseline mode. + +_wags_env: &wags_env + BFCL_EXTERNAL_FEEDBACK_ENABLED: ${BFCL_EXTERNAL_FEEDBACK_ENABLED:} + BFCL_EXTERNAL_FEEDBACK_CONFIG: ${BFCL_EXTERNAL_FEEDBACK_CONFIG:} + BFCL_EXTERNAL_FEEDBACK_LOG_FILE: ${BFCL_EXTERNAL_FEEDBACK_LOG_FILE:} + TEST_ID: ${TEST_ID:} default_model: ${DEFAULT_MODEL:gpt-4o} @@ -19,6 +29,7 @@ mcp: - GorillaFileSystem - ${TEST_DATA_PATH} - ${TEST_ID} + env: *wags_env # MathAPI - Mathematical operations mathapi: @@ -30,6 +41,7 @@ mcp: - MathAPI - ${TEST_DATA_PATH} - ${TEST_ID} + env: *wags_env # MessageAPI - Messaging operations messageapi: @@ -41,6 +53,7 @@ mcp: - MessageAPI - ${TEST_DATA_PATH} - ${TEST_ID} + env: *wags_env # TwitterAPI - Twitter operations twitterapi: @@ -52,6 +65,7 @@ mcp: - TwitterAPI - ${TEST_DATA_PATH} - ${TEST_ID} + env: *wags_env # TicketAPI - Ticket management ticketapi: @@ -63,6 +77,7 @@ mcp: - TicketAPI - ${TEST_DATA_PATH} - ${TEST_ID} + env: *wags_env # TradingBot - Trading operations tradingbot: @@ -74,6 +89,7 @@ mcp: - TradingBot - ${TEST_DATA_PATH} - ${TEST_ID} + env: *wags_env # TravelAPI - Travel operations travelapi: @@ -85,6 +101,7 @@ mcp: - TravelAPI - ${TEST_DATA_PATH} - ${TEST_ID} + env: *wags_env # 
VehicleControlAPI - Vehicle control operations vehiclecontrolapi: @@ -96,6 +113,7 @@ mcp: - VehicleControlAPI - ${TEST_DATA_PATH} - ${TEST_ID} + env: *wags_env # WebSearchAPI - Web search operations websearchapi: @@ -107,6 +125,7 @@ mcp: - WebSearchAPI - ${TEST_DATA_PATH} - ${TEST_ID} + env: *wags_env # MemoryAPI_kv - Key-value memory operations memoryapi_kv: @@ -118,6 +137,7 @@ mcp: - MemoryAPI_kv - ${TEST_DATA_PATH} - ${TEST_ID} + env: *wags_env # MemoryAPI_vector - Vector memory operations memoryapi_vector: @@ -129,6 +149,7 @@ mcp: - MemoryAPI_vector - ${TEST_DATA_PATH} - ${TEST_ID} + env: *wags_env # MemoryAPI_rec_sum - Recursive summarization memory operations memoryapi_rec_sum: @@ -140,6 +161,7 @@ mcp: - MemoryAPI_rec_sum - ${TEST_DATA_PATH} - ${TEST_ID} + env: *wags_env logger: level: error @@ -147,4 +169,4 @@ logger: show_chat: false show_tools: false truncate_tools: false - progress_display: false \ No newline at end of file + progress_display: false diff --git a/tests/benchmarks/bfcl/instruction.txt b/tests/benchmarks/bfcl/instruction.txt index 8bf4645..b2d9568 100644 --- a/tests/benchmarks/bfcl/instruction.txt +++ b/tests/benchmarks/bfcl/instruction.txt @@ -8,5 +8,3 @@ You should only return the function calls in your response. You SHOULD NOT inclu At each turn, you should try your best to complete the tasks requested by the user within the current turn. Continue to output functions to call until you have fulfilled the user's request to the best of your ability. Once you have no more functions to call, the system will consider the current turn complete and proceed to the next turn or task. - -{{serverInstructions}} diff --git a/tests/benchmarks/bfcl/instruction_old.txt b/tests/benchmarks/bfcl/instruction_old.txt new file mode 100644 index 0000000..0b61c05 --- /dev/null +++ b/tests/benchmarks/bfcl/instruction_old.txt @@ -0,0 +1,12 @@ +You are an expert in composing functions. You are given a question and a set of possible functions. 
+Based on the question, you will need to make one or more function/tool calls to achieve the purpose. +If none of the functions can be used, point it out. +If the given question lacks the parameters required by the function, also point it out. + +You should only return the function calls in your response. You SHOULD NOT include any other text in the response. + +At each turn, you should try your best to complete the tasks requested by the user within the current turn. +Continue to output functions to call until you have fulfilled the user's request to the best of your ability. +Once you have no more functions to call, the system will consider the current turn complete and proceed to the next turn or task. + +{{serverInstructions}} \ No newline at end of file diff --git a/tests/benchmarks/bfcl/mcp_server.py b/tests/benchmarks/bfcl/mcp_server.py index 26c9a12..9342f46 100644 --- a/tests/benchmarks/bfcl/mcp_server.py +++ b/tests/benchmarks/bfcl/mcp_server.py @@ -2,13 +2,20 @@ """ MCP Server wrapper for BFCL API classes. Exposes API methods as MCP tools with automatic introspection. + +Experimental WAGS integration: +- If BFCL_EXTERNAL_FEEDBACK_ENABLED is set, we wrap the FastMCP server with a WAGS proxy + and attach ExternalFeedbackMiddleware. +- Otherwise, behavior is identical to the original BFCL server. 
""" import argparse import asyncio +import functools import importlib import inspect import json +import os import sys from typing import Any @@ -21,6 +28,13 @@ from mcp.server.fastmcp import FastMCP +def _env_flag_enabled(name: str, default: bool = False) -> bool: + raw = os.getenv(name) + if raw is None: + return default + return raw.strip().lower() in {"1", "true", "yes", "on"} + + def load_api_class(class_name: str) -> Any: """Load and instantiate the specified API class.""" module = importlib.import_module(CLASS_FILE_PATH_MAPPING[class_name]) @@ -65,6 +79,28 @@ def load_scenario_from_test(test_file: str, test_id: str, class_name: str) -> di return {} +def _strip_return_annotation(method: Any) -> Any: + """Remove the return annotation so FastMCP skips Pydantic output validation. + + Several BFCL API methods have return annotations like Dict[str, Union[str, bool]] + that don't match their actual return values (e.g. booking_history is a nested dict, + not str|bool). FastMCP builds a Pydantic output model from the annotation; when + validation fails the tool response is dropped entirely, leaving an unanswered + tool_call_id that causes OpenAI to reject the next request with a 400 error. + + IMPORTANT: we must set __signature__ explicitly rather than just overriding + __annotations__. functools.wraps sets __wrapped__, and inspect.signature follows + __wrapped__ back to the original function — bypassing __annotations__ entirely. + Setting __signature__ directly takes precedence over __wrapped__ in inspect.signature. + """ + @functools.wraps(method) + def wrapper(*args: Any, **kwargs: Any) -> Any: + return method(*args, **kwargs) + orig_sig = inspect.signature(method) + wrapper.__signature__ = orig_sig.replace(return_annotation=inspect.Parameter.empty) + return wrapper + + def patch_tool_with_func_doc(server: FastMCP, func_docs: dict[str, dict[str, Any]]) -> None: """Patch registered tools with BFCL's richer function documentation. 
@@ -97,6 +133,14 @@ async def main() -> None: args = parser.parse_args() class_name = args.class_name + + # Normalize class name to handle lowercase inputs like "vehiclecontrolapi" + if class_name.lower() in {k.lower(): k for k in CLASS_FILE_PATH_MAPPING}.keys(): + for k in CLASS_FILE_PATH_MAPPING: + if k.lower() == class_name.lower(): + class_name = k + break + if class_name not in CLASS_FILE_PATH_MAPPING: print("Usage: python mcp_server.py [test_file.json test_id]", file=sys.stderr) @@ -105,7 +149,7 @@ async def main() -> None: # Load the API class api = load_api_class(class_name) - print(f"Loaded {class_name}", file=sys.stderr) + print(f"[mcp_server] Loaded {class_name}", file=sys.stderr, flush=True) # Initialize scenario state if needed if hasattr(api, "_load_scenario") and class_name not in STATELESS_CLASSES: @@ -120,11 +164,80 @@ async def main() -> None: for method_name, method in inspect.getmembers(api, predicate=inspect.ismethod): if not method_name.startswith("_"): - server.add_tool(method, name=method_name) + server.add_tool(_strip_return_annotation(method), name=method_name) # Patch tools with BFCL's richer descriptions patch_tool_with_func_doc(server, func_docs) + # --- Seed initial cwd for precondition_check triggers --- + if hasattr(api, "_current_dir") and hasattr(api._current_dir, "name"): + os.environ["BFCL_INITIAL_CWD"] = "/" + api._current_dir.name + print( + f"[mcp_server] BFCL_INITIAL_CWD=/{api._current_dir.name}", + file=sys.stderr, flush=True, + ) + + # --- WAGS / external feedback experiment wiring --- + feedback_enabled = _env_flag_enabled("BFCL_EXTERNAL_FEEDBACK_ENABLED", default=False) + if feedback_enabled: + print("[mcp_server] BFCL_EXTERNAL_FEEDBACK_ENABLED=ON -> starting WAGS proxy mode", file=sys.stderr, flush=True) + + # Import here so baseline BFCL runs don't depend on WAGS. 
+ from wags import create_proxy + from wags.middleware.external_feedback import ExternalFeedbackMiddleware + + proxy = create_proxy(server, server_name=f"wags-{class_name.lower()}-proxy") + + # Config-file mode: BFCL_EXTERNAL_FEEDBACK_CONFIG points to a JSON rules + # file. Legacy mode: individual _TOOL / _MESSAGE / _N env vars are used. + config_path = os.getenv("BFCL_EXTERNAL_FEEDBACK_CONFIG") + if config_path: + print( + f"[mcp_server] Config-file mode: loading triggers from {config_path}", + file=sys.stderr, + flush=True, + ) + middleware = ExternalFeedbackMiddleware(config_path=config_path) + else: + print( + "[mcp_server] Legacy env-var mode: " + f"tool={os.getenv('BFCL_EXTERNAL_FEEDBACK_TOOL', 'startEngine')} " + f"n={os.getenv('BFCL_EXTERNAL_FEEDBACK_N', '1')}", + file=sys.stderr, + flush=True, + ) + middleware = ExternalFeedbackMiddleware( + target_tool_name=os.getenv("BFCL_EXTERNAL_FEEDBACK_TOOL", "startEngine"), + warning_message=os.getenv( + "BFCL_EXTERNAL_FEEDBACK_MESSAGE", + "🚨 TEST WARNING: This tool call is intentionally blocked. Choose a different action and continue. 🚨", + ), + trigger_on_nth_call=int(os.getenv("BFCL_EXTERNAL_FEEDBACK_N", "1")), + ) + + # Attach middleware to the PROXY (not the underlying server). + proxy.add_middleware(middleware) + + # Loud startup logging so we know wiring is correct. 
+ try: + tool_count = len(server._tool_manager._tools) + tool_names_preview = list(server._tool_manager._tools.keys())[:10] + except Exception: + tool_count = -1 + tool_names_preview = [] + + print(f"[mcp_server] Base server tool count: {tool_count}", file=sys.stderr, flush=True) + if tool_names_preview: + print(f"[mcp_server] Base server tools (preview): {tool_names_preview}", file=sys.stderr, flush=True) + + print(f"[mcp_server] Proxy middleware count: {len(proxy.middleware)}", file=sys.stderr, flush=True) + print(f"[mcp_server] Proxy middleware: {[type(m).__name__ for m in proxy.middleware]}", file=sys.stderr, flush=True) + + await proxy.run_stdio_async() + return + + # --- Baseline behavior --- + print("[mcp_server] BFCL_EXTERNAL_FEEDBACK_ENABLED=OFF -> starting baseline server mode", file=sys.stderr, flush=True) await server.run_stdio_async() diff --git a/tests/benchmarks/bfcl/test_bfcl.py b/tests/benchmarks/bfcl/test_bfcl.py index 0083977..d4011af 100644 --- a/tests/benchmarks/bfcl/test_bfcl.py +++ b/tests/benchmarks/bfcl/test_bfcl.py @@ -4,16 +4,80 @@ import json import os from pathlib import Path +import sys from typing import Any, cast import pytest +from mcp.types import CallToolResult, TextContent + +from fast_agent.types import PromptMessageExtended from tests.benchmarks.bfcl import evaluator, loader from tests.benchmarks.bfcl.elicitation import create_elicitation_handler +from tests.conftest import instruction_file from tests.utils.fastagent_helpers import MessageSerializer from tests.utils.logger import StructuredEventLogger +def _repair_unanswered_tool_calls( + agent_app: Any, user_text: str +) -> Any: + """Return a message payload that repairs any unanswered tool_call_ids. + + If the previous turn left an assistant tool_call without a matching + tool_result (e.g. fast-agent hit max_iterations or a tool-loop error break), + the next OpenAI request will 400 with "tool_call_ids did not have response + messages". 
We synthesize error tool_results for each dangling id and bundle + them with the outgoing user text into a single PromptMessageExtended so the + OpenAI converter emits tool messages before the user turn. + """ + agent = agent_app._agent(None) + history = agent.message_history + + responded: set[str] = set() + called: list[str] = [] + for m in history: + tcs = getattr(m, "tool_calls", None) + if tcs: + called.extend(tcs.keys()) + trs = getattr(m, "tool_results", None) + if trs: + responded.update(trs.keys()) + + unanswered = [cid for cid in called if cid not in responded] + if not unanswered: + return user_text + + print( + f"[REPAIR] Synthesizing tool_results for {len(unanswered)} unanswered " + f"tool_call_id(s): {unanswered}", + flush=True, + file=sys.stderr, + ) + + synthetic_results = { + cid: CallToolResult( + content=[ + TextContent( + type="text", + text=( + "[WAGS] No result available: the tool call did not " + "complete in the previous turn." + ), + ) + ], + isError=True, + ) + for cid in unanswered + } + + return PromptMessageExtended( + role="user", + content=[TextContent(type="text", text=user_text)], + tool_results=synthetic_results, + ) + + def _parse_question(question: Any) -> str: """Parse question from various formats into a string.""" if isinstance(question, list) and question: @@ -25,14 +89,43 @@ def _parse_question(question: Any) -> str: return "" -async def _run_bfcl_test(test_id: str, model: str, temperature: float, output_dir: Path) -> Path: +def _validate_openai_api_key_env() -> None: + """Fail fast for missing/placeholder OpenAI API key in BFCL runs.""" + api_key = os.getenv("OPENAI_API_KEY") + if api_key is None or not api_key.strip(): + raise RuntimeError( + "OPENAI_API_KEY is not set. Export a valid key before running BFCL tests." + ) + + candidate = api_key.strip() + if candidate.startswith("${") and candidate.endswith("}"): + raise RuntimeError( + "OPENAI_API_KEY is set to a placeholder value. Export the real key instead." 
+ ) + + +async def _run_bfcl_test( + test_id: str, + model: str, + temperature: float, + output_dir: Path, + instruction_file: Path | None, + external_feedback_enabled: bool, +) -> Path: """Run BFCL test and return path to complete.json.""" from fast_agent import FastAgent + _validate_openai_api_key_env() + test_case = loader.load_test_entry(test_id) ground_truth = loader.load_ground_truth(test_id) - instruction_path = Path(__file__).parent / "instruction.txt" + default_instruction = Path(__file__).parent / "instruction.txt" + instruction_path = instruction_file if instruction_file is not None else default_instruction + print(f"Using INSTRUCTION file: {instruction_path}", flush=True, file=sys.stderr) + if not instruction_path.exists(): + raise FileNotFoundError(f"Instruction file not found: {instruction_path}") + structured_log_path = output_dir / "raw" / f"{test_id}_structured.jsonl" structured_log_path.parent.mkdir(parents=True, exist_ok=True) @@ -42,17 +135,32 @@ async def _run_bfcl_test(test_id: str, model: str, temperature: float, output_di test_data_path = output_dir / f"{test_id}_test.json" test_data_path.write_text(json.dumps(test_case)) - # Set environment variables BEFORE creating FastAgent + # Set environment variables BEFORE creating FastAgent. + # Save and restore to avoid state leaking into subsequent test cases + # when multiple tests run in the same pytest session. 
test_dir = Path(__file__).parent - os.environ.update( - { - "DEFAULT_MODEL": model, - "TEMPERATURE": str(temperature), - "TEST_DATA_PATH": str(test_data_path.absolute()), - "TEST_ID": test_id, - "SERVER_SCRIPT_PATH": str(test_dir / "mcp_server.py"), - } + _BFCL_ENV_KEYS = ( + "DEFAULT_MODEL", + "TEMPERATURE", + "TEST_DATA_PATH", + "TEST_ID", + "SERVER_SCRIPT_PATH", + "BFCL_EXTERNAL_FEEDBACK_LOG_FILE", ) + _saved_env = {k: os.environ.get(k) for k in _BFCL_ENV_KEYS} + env_updates = { + "DEFAULT_MODEL": model, + "TEMPERATURE": str(temperature), + "TEST_DATA_PATH": str(test_data_path.absolute()), + "TEST_ID": test_id, + "SERVER_SCRIPT_PATH": str(test_dir / "mcp_server.py"), + } + # Only set a default log path if run_experiment.py didn't already provide one. + if not os.environ.get("BFCL_EXTERNAL_FEEDBACK_LOG_FILE"): + env_updates["BFCL_EXTERNAL_FEEDBACK_LOG_FILE"] = str( + output_dir / "raw" / "external_feedback.jsonl" + ) + os.environ.update(env_updates) # Create FastAgent after environment variables are set config_path = test_dir / "fastagent.config.yaml" @@ -77,7 +185,19 @@ async def run_test() -> Path: continue structured_logger.log_turn(turn_idx, "start", msg) - await agent_app.send(msg) + send_payload = _repair_unanswered_tool_calls(agent_app, msg) + await agent_app.send(send_payload) + + # Check for feedback/errors in the latest turn + current_messages = agent_app._agent(None).message_history + for m in current_messages[-10:]: # Check recent messages + if hasattr(m, "tool_results") and m.tool_results: + for tr in m.tool_results: + is_err = getattr(tr, "is_error", False) + if is_err: + content = getattr(tr, "content", str(tr)) + print(f"\n[FEEDBACK] Tool Error detected: {str(content)[:200]}...", flush=True) + structured_logger.log_turn(turn_idx, "end") await asyncio.sleep(0) @@ -90,7 +210,14 @@ async def run_test() -> Path: return complete_path - return await run_test() + try: + return await run_test() + finally: + for k, v in _saved_env.items(): + if v is None: 
+ os.environ.pop(k, None) + else: + os.environ[k] = v def _validate_from_complete_json(test_id: str, complete_path: Path) -> dict[str, Any]: @@ -117,8 +244,8 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: log_dir = output_dir / "raw" if log_dir.exists(): - log_files = list(log_dir.glob("**/*_fastagent.jsonl")) - test_ids = [f.stem.replace("_fastagent", "") for f in log_files] + log_files = list(log_dir.glob("**/*_structured.jsonl")) + test_ids = [f.stem.replace("_structured", "") for f in log_files] else: test_ids = [] else: @@ -134,13 +261,29 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: @pytest.mark.asyncio async def test_bfcl( - test_id: str, model: str, temperature: float, output_dir: Path, request: pytest.FixtureRequest + test_id: str, + model: str, + temperature: float, + output_dir: Path, + instruction_file: Path | None, + request: pytest.FixtureRequest, ) -> None: """Run or validate a BFCL test based on mode.""" - if not request.config.getoption("--validate-only"): - await _run_bfcl_test(test_id, model, temperature, output_dir) - - log_dir = output_dir / "raw" + if request.config.getoption("--validate-only"): + log_dir = Path(request.config.getoption("--log-dir")) + else: + external_feedback_enabled = bool(request.config.getoption("--external-feedback")) + await _run_bfcl_test( + test_id, + model, + temperature, + output_dir, + instruction_file, + external_feedback_enabled, + ) + log_dir = output_dir / "raw" complete_path = log_dir / f"{test_id}_complete.json" evaluation = _validate_from_complete_json(test_id, complete_path) assert evaluation["validation"]["valid"], f"Validation failed for {test_id}" + eval_path = log_dir / f"{test_id}_evaluation.json" + eval_path.write_text(json.dumps(evaluation, indent=2, default=str)) diff --git a/tests/conftest.py b/tests/conftest.py index 5592bd9..833fa83 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -37,6 +37,37 @@ def output_dir(request: pytest.FixtureRequest) -> Path: 
@pytest.fixture +def instruction_file(request: pytest.FixtureRequest) -> Path | None: + """Optional path to replacement instruction file.""" + value = request.config.getoption("--instruction-file") + return Path(value) if value else None + + +@pytest.fixture +def instruction_override(request: pytest.FixtureRequest) -> str | None: + """Inline instructions overriding file-based prompts.""" + value = request.config.getoption("--instruction-override") + return value if value else None + + +@pytest.fixture +def gepa_dir(request: pytest.FixtureRequest) -> Path | None: + """Directory for GEPA experiment artifacts.""" + value = request.config.getoption("--gepa-dir") + return Path(value) if value else None + + +@pytest.fixture +def gepa_log_dir(request: pytest.FixtureRequest) -> Path | None: + """Directory for GEPA-specific logs.""" + value = request.config.getoption("--gepa-log-dir") + return Path(value) if value else None + + +@pytest.fixture +def gepa_scoring_mode(request: pytest.FixtureRequest) -> bool: + """Flag controlling GEPA scoring-only mode.""" + return bool(request.config.getoption("--gepa-scoring-mode")) def toolset(request: pytest.FixtureRequest) -> str: """Toolset from CLI: 'full' (all tools) or 'minimal' (essential tools only).""" return cast(str, request.config.getoption("--toolset")) @@ -48,12 +79,25 @@ def pytest_addoption(parser: pytest.Parser) -> None: parser.addoption("--temperature", default=0.001, type=float, help="Temperature for LLM (default: 0.001)") parser.addoption("--output-dir", default="outputs", help="Output directory for results") parser.addoption("--validate-only", action="store_true", help="Only validate existing logs") + parser.addoption("--log-dir", default="outputs/raw", help="Directory with logs (for validate mode)") + parser.addoption("--instruction-file", default=None, help="Path to replacement instruction file") + parser.addoption("--instruction-override", default=None, help="Literal replacement instructions") + 
parser.addoption("--gepa-dir", default=None, help="Directory for GEPA experiment data") + parser.addoption("--gepa-log-dir", default=None, help="Directory for GEPA logs") + parser.addoption("--gepa-scoring-mode", action="store_true", help="Enable GEPA scoring-only mode") parser.addoption( "--toolset", default="full", choices=["full", "minimal"], help="Tool availability: 'full' (all tools) or 'minimal' (19 essential tools)", ) + parser.addoption( + "-F", + "--external-feedback", + action="store_true", + default=False, + help="BFCL only: enable ExternalFeedbackMiddleware", + ) def pytest_configure(config: pytest.Config) -> None: diff --git a/tests/unit/middleware/test_external_feedback.py b/tests/unit/middleware/test_external_feedback.py new file mode 100644 index 0000000..6cbefb2 --- /dev/null +++ b/tests/unit/middleware/test_external_feedback.py @@ -0,0 +1,76 @@ +"""Unit tests for ExternalFeedbackMiddleware.""" + +import pytest +from fastmcp.server.middleware.middleware import MiddlewareContext +from fastmcp.tools.tool import ToolResult +from mcp.types import CallToolRequestParams + +from wags.middleware.external_feedback import ExternalFeedbackMiddleware + + +def _tool_result_text(result: ToolResult) -> str: + blocks = result.to_mcp_result() + if isinstance(blocks, tuple): + blocks = blocks[0] + return " ".join(getattr(block, "text", str(block)) for block in blocks) + + +@pytest.mark.asyncio +async def test_blocks_first_target_call_and_prints(capsys: pytest.CaptureFixture[str]) -> None: + middleware = ExternalFeedbackMiddleware( + target_tool_name="startEngine", + warning_message="blocked", + trigger_on_nth_call=1, + ) + + context = MiddlewareContext(message=CallToolRequestParams(name="startEngine", arguments={}), method="tools/call") + + async def call_next(_: MiddlewareContext[CallToolRequestParams]) -> ToolResult: + return ToolResult(content="tool-executed") + + result = await middleware.on_call_tool(context, call_next) + output = capsys.readouterr().err + + 
assert "EXTERNAL FEEDBACK TRIGGERED" in _tool_result_text(result) + assert "Tool call #1: startEngine" in output + assert "TRIGGERED WARNING - BLOCKING TOOL CALL" in output + + +@pytest.mark.asyncio +async def test_allows_second_target_call() -> None: + middleware = ExternalFeedbackMiddleware( + target_tool_name="startEngine", + warning_message="blocked", + trigger_on_nth_call=1, + ) + + context = MiddlewareContext(message=CallToolRequestParams(name="startEngine", arguments={}), method="tools/call") + + async def call_next(_: MiddlewareContext[CallToolRequestParams]) -> ToolResult: + return ToolResult(content="tool-executed") + + first_result = await middleware.on_call_tool(context, call_next) + second_result = await middleware.on_call_tool(context, call_next) + + assert "EXTERNAL FEEDBACK TRIGGERED" in _tool_result_text(first_result) + assert _tool_result_text(second_result) == "tool-executed" + + +@pytest.mark.asyncio +async def test_matches_namespaced_and_snake_case_variants() -> None: + middleware = ExternalFeedbackMiddleware( + target_tool_name="start_engine", + warning_message="blocked", + trigger_on_nth_call=1, + ) + + context = MiddlewareContext( + message=CallToolRequestParams(name="vehiclecontrolapi__startEngine", arguments={}), + method="tools/call", + ) + + async def call_next(_: MiddlewareContext[CallToolRequestParams]) -> ToolResult: + return ToolResult(content="tool-executed") + + result = await middleware.on_call_tool(context, call_next) + assert "EXTERNAL FEEDBACK TRIGGERED" in _tool_result_text(result) diff --git a/tests/utils/fastagent_helpers.py b/tests/utils/fastagent_helpers.py index 3785ac3..afca2d2 100644 --- a/tests/utils/fastagent_helpers.py +++ b/tests/utils/fastagent_helpers.py @@ -112,9 +112,11 @@ def strip_server_prefix(tool_name: str) -> str: tool_name: Tool name potentially with server prefix Returns: - Tool name without prefix (e.g., 'github-list_issues' -> 'list_issues') + Tool name without prefix (e.g., 
'vehiclecontrolapi__list_issues' -> 'list_issues') """ - if "-" in tool_name: + if "__" in tool_name: + return tool_name.split("__", 1)[1] + elif "-" in tool_name: return tool_name.split("-", 1)[1] return tool_name diff --git a/tests/utils/logger.py b/tests/utils/logger.py index b8ef8bc..004a0c8 100644 --- a/tests/utils/logger.py +++ b/tests/utils/logger.py @@ -93,6 +93,8 @@ def log_tool_result(self, turn_id: int, tool_id: str, result: Any, is_error: boo "result": str(result) if not isinstance(result, (dict, list)) else result, "is_error": is_error, } + if is_error: + print(f"\n[FEEDBACK] Tool Error for {tool_id}: {result}", flush=True) self._write_event(event) def log_assistant_response(self, turn_id: int, text: str) -> None: