diff --git a/openadapt_evals/benchmarks/runner.py b/openadapt_evals/benchmarks/runner.py
index d295929..b95ea15 100644
--- a/openadapt_evals/benchmarks/runner.py
+++ b/openadapt_evals/benchmarks/runner.py
@@ -28,6 +28,11 @@
     BenchmarkResult,
     BenchmarkTask,
 )
+from openadapt_evals.telemetry import (
+    track_action_executed,
+    track_agent_run,
+    track_agent_run_completed,
+)
 
 if TYPE_CHECKING:
     from openadapt_evals.benchmarks.data_collection import ExecutionTraceCollector
@@ -101,6 +106,15 @@ def evaluate_agent_on_benchmark(
 
     if config.verbose:
         logger.info(f"Evaluating {len(tasks)} tasks on {adapter.name}")
+    track_agent_run(
+        phase="start",
+        adapter=adapter.name,
+        agent_class=type(agent).__name__,
+        num_tasks=len(tasks),
+        max_steps=config.max_steps,
+        parallel=config.parallel,
+        run_name=config.run_name or "unspecified",
+    )
 
     # Initialize execution trace collector if enabled
     trace_collector: ExecutionTraceCollector | None = None
@@ -151,6 +165,18 @@ def evaluate_agent_on_benchmark(
             f"Evaluation complete: {success_count}/{len(results)} "
             f"({success_rate:.1%}) success, {avg_steps:.1f} avg steps"
         )
+    else:
+        success_count = sum(1 for r in results if r.success)
+        avg_steps = sum(r.num_steps for r in results) / len(results) if results else 0
+
+    track_agent_run_completed(
+        adapter=adapter.name,
+        agent_class=type(agent).__name__,
+        num_tasks=len(results),
+        success_count=success_count,
+        avg_steps=round(avg_steps, 2),
+        run_name=config.run_name or "unspecified",
+    )
 
     return results
 
@@ -352,6 +378,13 @@ def _run_single_task(
             exec_start = time.perf_counter()
             obs, done, info = adapter.step(action)
             exec_end = time.perf_counter()
+            track_action_executed(
+                task_id=task.task_id,
+                step_index=steps,
+                action_type=action.type,
+                adapter=adapter.name,
+                agent_class=type(agent).__name__,
+            )
             if agent_logs:
                 agent_logs["env_execute_ms"] = round((exec_end - exec_start) * 1000)
             if done:
diff --git a/openadapt_evals/telemetry.py b/openadapt_evals/telemetry.py
new file mode 100644
index 0000000..53e4fa4
--- /dev/null
+++ b/openadapt_evals/telemetry.py
@@ -0,0 +1,148 @@
+"""openadapt-evals telemetry wrapper.
+
+Thin adapter over openadapt-telemetry PostHog usage events.
+
+Telemetry is best-effort: a missing or failing backend must never interrupt
+an evaluation run, so the helpers here return ``False`` instead of raising.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+try:
+    from openadapt_telemetry.posthog import capture_usage_event
+except ImportError:  # Backend not importable: every event becomes a no-op.
+    capture_usage_event = None
+
+logger = logging.getLogger(__name__)
+
+_PACKAGE_NAME = "openadapt-evals"
+
+
+def _compact(properties: dict[str, Any]) -> dict[str, Any]:
+    """Return a copy of ``properties`` without entries whose value is ``None``."""
+    return {key: value for key, value in properties.items() if value is not None}
+
+
+def capture_event(event: str, properties: dict[str, Any] | None = None) -> bool:
+    """Capture a raw usage event without ever raising.
+
+    Args:
+        event: Event name passed to ``capture_usage_event``.
+        properties: Event properties; entries set to ``None`` are dropped.
+
+    Returns:
+        The value returned by ``capture_usage_event``, or ``False`` when the
+        backend could not be imported or raised an exception.
+    """
+    if capture_usage_event is None:
+        return False
+    try:
+        return capture_usage_event(
+            event=event,
+            properties=_compact(properties or {}),
+            package_name=_PACKAGE_NAME,
+        )
+    except Exception:  # Boundary guard: telemetry must not break the caller.
+        logger.debug("Telemetry capture failed for %r", event, exc_info=True)
+        return False
+
+
+def track_agent_run(
+    *,
+    phase: str,
+    adapter: str | None = None,
+    agent_class: str | None = None,
+    entrypoint: str | None = None,
+    mode: str | None = None,
+    num_tasks: int | None = None,
+    max_steps: int | None = None,
+    parallel: int | None = None,
+    run_name: str | None = None,
+) -> bool:
+    """Emit an ``agent_run`` event; arguments left as ``None`` are omitted."""
+    properties = {
+        "phase": phase,
+        "adapter": adapter,
+        "agent_class": agent_class,
+        "entrypoint": entrypoint,
+        "mode": mode,
+        "num_tasks": num_tasks,
+        "max_steps": max_steps,
+        "parallel": parallel,
+        "run_name": run_name,
+    }
+    return capture_event("agent_run", properties)
+
+
+def track_agent_run_completed(
+    *,
+    adapter: str | None = None,
+    agent_class: str | None = None,
+    entrypoint: str | None = None,
+    mode: str | None = None,
+    num_tasks: int | None = None,
+    success_count: int | None = None,
+    avg_steps: float | None = None,
+    return_code: int | None = None,
+    duration_seconds: float | None = None,
+    run_name: str | None = None,
+) -> bool:
+    """Emit an ``agent_run_completed`` event; arguments left as ``None`` are omitted."""
+    properties = {
+        "adapter": adapter,
+        "agent_class": agent_class,
+        "entrypoint": entrypoint,
+        "mode": mode,
+        "num_tasks": num_tasks,
+        "success_count": success_count,
+        "avg_steps": avg_steps,
+        "return_code": return_code,
+        "duration_seconds": duration_seconds,
+        "run_name": run_name,
+    }
+    return capture_event("agent_run_completed", properties)
+
+
+def track_action_executed(
+    *,
+    task_id: str | None = None,
+    step_index: int | None = None,
+    action_type: str | None = None,
+    adapter: str | None = None,
+    agent_class: str | None = None,
+) -> bool:
+    """Emit an ``action_executed`` event; arguments left as ``None`` are omitted."""
+    properties = {
+        "task_id": task_id,
+        "step_index": step_index,
+        "action_type": action_type,
+        "adapter": adapter,
+        "agent_class": agent_class,
+    }
+    return capture_event("action_executed", properties)
+
+
+def track_demo_recorded(
+    *,
+    task_id: str | None = None,
+    mode: str | None = None,
+    steps: int | None = None,
+    output_dir: str | None = None,
+    phase: str | None = None,
+) -> bool:
+    """Emit a ``demo_recorded`` event; arguments left as ``None`` are omitted.
+
+    NOTE(review): ``output_dir`` is forwarded verbatim. Callers should confirm
+    that reporting a local filesystem path is acceptable.
+    """
+    properties = {
+        "task_id": task_id,
+        "mode": mode,
+        "steps": steps,
+        "output_dir": output_dir,
+        "phase": phase,
+    }
+    return capture_event("demo_recorded", properties)
diff --git a/pyproject.toml b/pyproject.toml
index 1b598d9..c79915a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,7 @@ dependencies = [
     "openai>=1.0.0",
     "anthropic>=0.76.0",
     "openadapt-consilium>=0.3.2",
+    "openadapt-telemetry>=0.1.0",
     "openadapt-ml>=0.11.0",
 ]
 
diff --git a/scripts/record_waa_demos.py b/scripts/record_waa_demos.py
index b7becc8..aa6f6e3 100644
--- a/scripts/record_waa_demos.py
+++ b/scripts/record_waa_demos.py
@@ -94,6 +94,9 @@
 ]
 
 from openadapt_evals.constants import HARDER_TASK_IDS
+from openadapt_evals.telemetry import (
+    track_demo_recorded,
+)
 
 # File names for the docx setup task
 DOCX_FILES = ["report.docx", "meeting_notes.docx", "proposal.docx"]
@@ -2180,6 +2183,13 @@ def _hard_reset_task_env() -> bytes:
 
         recorded.append(task_id)
         print(f"\n Saved {len(steps_meta)} step(s) to {task_dir}")
+        track_demo_recorded(
+            task_id=task_id,
+            mode="record-waa",
+            steps=len(steps_meta),
+            output_dir=str(output_dir),
+            phase="recorded",
+        )
 
     # Summary
     print_header("Recording Summary")
@@ -2376,6 +2386,13 @@ def cmd_annotate_waa(
 
         print(f" -> {json_path}")
         print(f" -> {txt_path}\n")
+        track_demo_recorded(
+            task_id=task_id,
+            mode="annotate",
+            steps=len(annotated_steps),
+            output_dir=str(output_dir),
+            phase="annotated",
+        )
 
     print_header("Annotation Summary")
     print(f" Annotated: {len(task_dirs)} recording(s)")