From f94ab4b26ed6cbdbcb280bb4fb903c2aabe048e3 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Thu, 5 Mar 2026 13:53:00 -0500 Subject: [PATCH 1/3] feat: instrument evals usage events via openadapt-telemetry --- openadapt_evals/benchmarks/runner.py | 33 ++++++++ openadapt_evals/cli/main.py | 37 ++++++++- openadapt_evals/telemetry.py | 115 +++++++++++++++++++++++++++ pyproject.toml | 1 + scripts/record_waa_demos.py | 33 ++++++++ 5 files changed, 217 insertions(+), 2 deletions(-) create mode 100644 openadapt_evals/telemetry.py diff --git a/openadapt_evals/benchmarks/runner.py b/openadapt_evals/benchmarks/runner.py index d295929..b95ea15 100644 --- a/openadapt_evals/benchmarks/runner.py +++ b/openadapt_evals/benchmarks/runner.py @@ -28,6 +28,11 @@ BenchmarkResult, BenchmarkTask, ) +from openadapt_evals.telemetry import ( + track_action_executed, + track_agent_run, + track_agent_run_completed, +) if TYPE_CHECKING: from openadapt_evals.benchmarks.data_collection import ExecutionTraceCollector @@ -101,6 +106,15 @@ def evaluate_agent_on_benchmark( if config.verbose: logger.info(f"Evaluating {len(tasks)} tasks on {adapter.name}") + track_agent_run( + phase="start", + adapter=adapter.name, + agent_class=type(agent).__name__, + num_tasks=len(tasks), + max_steps=config.max_steps, + parallel=config.parallel, + run_name=config.run_name or "unspecified", + ) # Initialize execution trace collector if enabled trace_collector: ExecutionTraceCollector | None = None @@ -151,6 +165,18 @@ def evaluate_agent_on_benchmark( f"Evaluation complete: {success_count}/{len(results)} " f"({success_rate:.1%}) success, {avg_steps:.1f} avg steps" ) + else: + success_count = sum(1 for r in results if r.success) + avg_steps = sum(r.num_steps for r in results) / len(results) if results else 0 + + track_agent_run_completed( + adapter=adapter.name, + agent_class=type(agent).__name__, + num_tasks=len(results), + success_count=success_count, + avg_steps=round(avg_steps, 2), + run_name=config.run_name or "unspecified", + ) return results @@ -352,6 +378,13 @@ def _run_single_task( exec_start = time.perf_counter() obs, done, info = adapter.step(action) exec_end = time.perf_counter() + track_action_executed( + task_id=task.task_id, + step_index=steps, + action_type=action.type, + adapter=adapter.name, + agent_class=type(agent).__name__, + ) if agent_logs: agent_logs["env_execute_ms"] = round((exec_end - exec_start) * 1000) if done: diff --git a/openadapt_evals/cli/main.py b/openadapt_evals/cli/main.py index b331b81..d9a15bf 100644 --- a/openadapt_evals/cli/main.py +++ b/openadapt_evals/cli/main.py @@ -18,6 +18,9 @@ import argparse import sys +import time + +from openadapt_evals.telemetry import track_agent_run, track_agent_run_completed def main(argv: list[str] | None = None) -> int: @@ -249,13 +252,43 @@ def _cmd_mock(args: argparse.Namespace) -> int: """Run mock evaluation.""" # Delegate to existing CLI implementation from openadapt_evals.benchmarks.cli import cmd_mock - return cmd_mock(args) + start = time.perf_counter() + track_agent_run( + phase="start", + entrypoint="oa evals mock", + mode="mock", + agent_class=getattr(args, "agent", "mock"), + ) + rc = cmd_mock(args) + track_agent_run_completed( + entrypoint="oa evals mock", + mode="mock", + agent_class=getattr(args, "agent", "mock"), + return_code=rc, + duration_seconds=round(time.perf_counter() - start, 3), + ) + return rc def _cmd_run(args: argparse.Namespace) -> int: """Run live evaluation.""" from openadapt_evals.benchmarks.cli import cmd_live - return cmd_live(args) + start = time.perf_counter() + track_agent_run( + phase="start", + entrypoint="oa evals run", + mode="live", + agent_class=getattr(args, "agent", "unknown"), + ) + rc = cmd_live(args) + track_agent_run_completed( + entrypoint="oa evals run", + mode="live", + agent_class=getattr(args, "agent", "unknown"), + return_code=rc, + duration_seconds=round(time.perf_counter() - start, 3), + ) + return rc def _cmd_probe(args: argparse.Namespace) -> int: diff --git a/openadapt_evals/telemetry.py b/openadapt_evals/telemetry.py new file mode 100644 index 0000000..53e4fa4 --- /dev/null +++ b/openadapt_evals/telemetry.py @@ -0,0 +1,115 @@ +"""openadapt-evals telemetry wrapper. + +Thin adapter over openadapt-telemetry PostHog usage events. +""" + +from __future__ import annotations + +from typing import Any + +from openadapt_telemetry.posthog import capture_usage_event + +_PACKAGE_NAME = "openadapt-evals" + + +def _compact(properties: dict[str, Any]) -> dict[str, Any]: + return {key: value for key, value in properties.items() if value is not None} + + +def capture_event(event: str, properties: dict[str, Any] | None = None) -> bool: + """Capture a raw usage event.""" + return capture_usage_event( + event=event, + properties=_compact(properties or {}), + package_name=_PACKAGE_NAME, + ) + + +def track_agent_run( + *, + phase: str, + adapter: str | None = None, + agent_class: str | None = None, + entrypoint: str | None = None, + mode: str | None = None, + num_tasks: int | None = None, + max_steps: int | None = None, + parallel: int | None = None, + run_name: str | None = None, +) -> bool: + properties = { + "phase": phase, + "adapter": adapter, + "agent_class": agent_class, + "entrypoint": entrypoint, + "mode": mode, + "num_tasks": num_tasks, + "max_steps": max_steps, + "parallel": parallel, + "run_name": run_name, + } + return capture_event("agent_run", properties) + + +def track_agent_run_completed( + *, + adapter: str | None = None, + agent_class: str | None = None, + entrypoint: str | None = None, + mode: str | None = None, + num_tasks: int | None = None, + success_count: int | None = None, + avg_steps: float | None = None, + return_code: int | None = None, + duration_seconds: float | None = None, + run_name: str | None = None, +) -> bool: + properties = { + "adapter": adapter, + "agent_class": agent_class, + "entrypoint": entrypoint, + "mode": mode, + "num_tasks": num_tasks, + "success_count": success_count, + "avg_steps": avg_steps, + "return_code": return_code, + "duration_seconds": duration_seconds, + "run_name": run_name, + } + return capture_event("agent_run_completed", properties) + + +def track_action_executed( + *, + task_id: str | None = None, + step_index: int | None = None, + action_type: str | None = None, + adapter: str | None = None, + agent_class: str | None = None, +) -> bool: + properties = { + "task_id": task_id, + "step_index": step_index, + "action_type": action_type, + "adapter": adapter, + "agent_class": agent_class, + } + return capture_event("action_executed", properties) + + +def track_demo_recorded( + *, + task_id: str | None = None, + mode: str | None = None, + steps: int | None = None, + output_dir: str | None = None, + phase: str | None = None, +) -> bool: + properties = { + "task_id": task_id, + "mode": mode, + "steps": steps, + "output_dir": output_dir, + "phase": phase, + } + return capture_event("demo_recorded", properties) diff --git a/pyproject.toml b/pyproject.toml index 1b598d9..a3a3cec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ dependencies = [ "openai>=1.0.0", "anthropic>=0.76.0", "openadapt-consilium>=0.3.2", + "openadapt-telemetry>=0.2.0", "openadapt-ml>=0.11.0", ] diff --git a/scripts/record_waa_demos.py b/scripts/record_waa_demos.py index b7becc8..26a9610 100644 --- a/scripts/record_waa_demos.py +++ b/scripts/record_waa_demos.py @@ -94,6 +94,11 @@ ] from openadapt_evals.constants import HARDER_TASK_IDS +from openadapt_evals.telemetry import ( + track_agent_run, + track_agent_run_completed, + track_demo_recorded, +) # File names for the docx setup task DOCX_FILES = ["report.docx", "meeting_notes.docx", "proposal.docx"] @@ -2180,6 +2185,13 @@ def _hard_reset_task_env() -> bytes: recorded.append(task_id) print(f"\n Saved {len(steps_meta)} step(s) to {task_dir}") + track_demo_recorded( + task_id=task_id, + mode="record-waa", + steps=len(steps_meta), + output_dir=str(output_dir), + phase="recorded", + ) # Summary print_header("Recording Summary") @@ -2376,6 +2388,13 @@ def cmd_annotate_waa( print(f" -> {json_path}") print(f" -> {txt_path}\n") + track_demo_recorded( + task_id=task_id, + mode="annotate", + steps=len(annotated_steps), + output_dir=str(output_dir), + phase="annotated", + ) print_header("Annotation Summary") print(f" Annotated: {len(task_dirs)} recording(s)") @@ -2431,8 +2450,22 @@ def cmd_eval_dc( print(f"Running eval-suite with demo-conditioned demos from {demo_dir}") print(f"Command: {' '.join(cmd)}\n") + track_agent_run( + phase="start", + entrypoint="record_waa_demos.py eval", + mode="demo-conditioned", + num_tasks=len([t.strip() for t in tasks.split(",") if t.strip()]), + max_steps=max_steps, + run_name=suite_name, + ) result = subprocess.run(cmd) + track_agent_run_completed( + entrypoint="record_waa_demos.py eval", + mode="demo-conditioned", + return_code=result.returncode, + run_name=suite_name, + ) if result.returncode != 0: sys.exit(result.returncode) From 9937a9f2105ede24f66647f60a99b687df7ebdc6 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Thu, 5 Mar 2026 14:12:20 -0500 Subject: [PATCH 2/3] fix: avoid duplicate agent_run telemetry events --- openadapt_evals/cli/main.py | 37 ++----------------------------------- scripts/record_waa_demos.py | 16 ---------------- 2 files changed, 2 insertions(+), 51 deletions(-) diff --git a/openadapt_evals/cli/main.py b/openadapt_evals/cli/main.py index d9a15bf..b331b81 100644 --- a/openadapt_evals/cli/main.py +++ b/openadapt_evals/cli/main.py @@ -18,9 +18,6 @@ import argparse import sys -import time - -from openadapt_evals.telemetry import track_agent_run, track_agent_run_completed def main(argv: list[str] | None = None) -> int: @@ -252,43 +249,13 @@ def _cmd_mock(args: argparse.Namespace) -> int: """Run mock evaluation.""" # Delegate to existing CLI implementation from openadapt_evals.benchmarks.cli import cmd_mock - start = time.perf_counter() - track_agent_run( - phase="start", - entrypoint="oa evals mock", - mode="mock", - agent_class=getattr(args, "agent", "mock"), - ) - rc = cmd_mock(args) - track_agent_run_completed( - entrypoint="oa evals mock", - mode="mock", - agent_class=getattr(args, "agent", "mock"), - return_code=rc, - duration_seconds=round(time.perf_counter() - start, 3), - ) - return rc + return cmd_mock(args) def _cmd_run(args: argparse.Namespace) -> int: """Run live evaluation.""" from openadapt_evals.benchmarks.cli import cmd_live - start = time.perf_counter() - track_agent_run( - phase="start", - entrypoint="oa evals run", - mode="live", - agent_class=getattr(args, "agent", "unknown"), - ) - rc = cmd_live(args) - track_agent_run_completed( - entrypoint="oa evals run", - mode="live", - agent_class=getattr(args, "agent", "unknown"), - return_code=rc, - duration_seconds=round(time.perf_counter() - start, 3), - ) - return rc + return cmd_live(args) def _cmd_probe(args: argparse.Namespace) -> int: diff --git a/scripts/record_waa_demos.py b/scripts/record_waa_demos.py index 26a9610..aa6f6e3 100644 --- a/scripts/record_waa_demos.py +++ b/scripts/record_waa_demos.py @@ -95,8 +95,6 @@ from openadapt_evals.constants import HARDER_TASK_IDS from openadapt_evals.telemetry import ( - track_agent_run, - track_agent_run_completed, track_demo_recorded, ) @@ -2450,22 +2448,8 @@ def cmd_eval_dc( print(f"Running eval-suite with demo-conditioned demos from {demo_dir}") print(f"Command: {' '.join(cmd)}\n") - track_agent_run( - phase="start", - entrypoint="record_waa_demos.py eval", - mode="demo-conditioned", - num_tasks=len([t.strip() for t in tasks.split(",") if t.strip()]), - max_steps=max_steps, - run_name=suite_name, - ) result = subprocess.run(cmd) - track_agent_run_completed( - entrypoint="record_waa_demos.py eval", - mode="demo-conditioned", - return_code=result.returncode, - run_name=suite_name, - ) if result.returncode != 0: sys.exit(result.returncode) From ba8171851da36a2b78d64ef830f066ee45b3f42d Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Thu, 5 Mar 2026 14:59:14 -0500 Subject: [PATCH 3/3] fix: align evals telemetry dependency with published release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a3a3cec..c79915a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ dependencies = [ "openai>=1.0.0", "anthropic>=0.76.0", "openadapt-consilium>=0.3.2", - "openadapt-telemetry>=0.2.0", + "openadapt-telemetry>=0.1.0", "openadapt-ml>=0.11.0", ]