Skip to content

Commit bc0fe75

Browse files
authored
Merge pull request #108 from OpenAdaptAI/feat/posthog-telemetry-instrumentation
feat: instrument evals usage events via openadapt-telemetry
2 parents 36be864 + ba81718 commit bc0fe75

4 files changed

Lines changed: 166 additions & 0 deletions

File tree

openadapt_evals/benchmarks/runner.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@
2828
BenchmarkResult,
2929
BenchmarkTask,
3030
)
31+
from openadapt_evals.telemetry import (
32+
track_action_executed,
33+
track_agent_run,
34+
track_agent_run_completed,
35+
)
3136

3237
if TYPE_CHECKING:
3338
from openadapt_evals.benchmarks.data_collection import ExecutionTraceCollector
@@ -101,6 +106,15 @@ def evaluate_agent_on_benchmark(
101106

102107
if config.verbose:
103108
logger.info(f"Evaluating {len(tasks)} tasks on {adapter.name}")
109+
track_agent_run(
110+
phase="start",
111+
adapter=adapter.name,
112+
agent_class=type(agent).__name__,
113+
num_tasks=len(tasks),
114+
max_steps=config.max_steps,
115+
parallel=config.parallel,
116+
run_name=config.run_name or "unspecified",
117+
)
104118

105119
# Initialize execution trace collector if enabled
106120
trace_collector: ExecutionTraceCollector | None = None
@@ -151,6 +165,18 @@ def evaluate_agent_on_benchmark(
151165
f"Evaluation complete: {success_count}/{len(results)} "
152166
f"({success_rate:.1%}) success, {avg_steps:.1f} avg steps"
153167
)
168+
else:
169+
success_count = sum(1 for r in results if r.success)
170+
avg_steps = sum(r.num_steps for r in results) / len(results) if results else 0
171+
172+
track_agent_run_completed(
173+
adapter=adapter.name,
174+
agent_class=type(agent).__name__,
175+
num_tasks=len(results),
176+
success_count=success_count,
177+
avg_steps=round(avg_steps, 2),
178+
run_name=config.run_name or "unspecified",
179+
)
154180

155181
return results
156182

@@ -352,6 +378,13 @@ def _run_single_task(
352378
exec_start = time.perf_counter()
353379
obs, done, info = adapter.step(action)
354380
exec_end = time.perf_counter()
381+
track_action_executed(
382+
task_id=task.task_id,
383+
step_index=steps,
384+
action_type=action.type,
385+
adapter=adapter.name,
386+
agent_class=type(agent).__name__,
387+
)
355388
if agent_logs:
356389
agent_logs["env_execute_ms"] = round((exec_end - exec_start) * 1000)
357390
if done:

openadapt_evals/telemetry.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
"""openadapt-evals telemetry wrapper.
2+
3+
Thin adapter over openadapt-telemetry PostHog usage events.
4+
"""
5+
6+
from __future__ import annotations
7+
8+
from typing import Any
9+
10+
from openadapt_telemetry.posthog import capture_usage_event
11+
12+
_PACKAGE_NAME = "openadapt-evals"
13+
14+
15+
def _compact(properties: dict[str, Any]) -> dict[str, Any]:
16+
return {key: value for key, value in properties.items() if value is not None}
17+
18+
19+
def capture_event(event: str, properties: dict[str, Any] | None = None) -> bool:
    """Capture a raw usage event.

    Args:
        event: Name of the event to record.
        properties: Optional event properties. Entries whose value is
            ``None`` are removed before the event is forwarded; ``None`` or
            an empty dict results in an empty property set.

    Returns:
        The result of ``capture_usage_event`` (annotated as ``bool``), or
        ``False`` when the telemetry backend raised an exception.
    """
    # Local import keeps this block self-contained without touching the
    # module's import section.
    import logging

    try:
        return capture_usage_event(
            event=event,
            properties=_compact(properties or {}),
            package_name=_PACKAGE_NAME,
        )
    except Exception:
        # Telemetry is best-effort: this wrapper is called from the
        # evaluation step loop, so a failing backend must never abort a run.
        logging.getLogger(__name__).debug(
            "Failed to capture usage event %r", event, exc_info=True
        )
        return False
26+
27+
28+
def track_agent_run(
    *,
    phase: str,
    adapter: str | None = None,
    agent_class: str | None = None,
    entrypoint: str | None = None,
    mode: str | None = None,
    num_tasks: int | None = None,
    max_steps: int | None = None,
    parallel: int | None = None,
    run_name: str | None = None,
) -> bool:
    """Record an ``agent_run`` usage event.

    All arguments are keyword-only and are forwarded as event properties
    under their own names. ``capture_event`` drops any property left as
    ``None`` before sending.

    Returns:
        The result of ``capture_event``.
    """
    return capture_event(
        "agent_run",
        {
            "phase": phase,
            "adapter": adapter,
            "agent_class": agent_class,
            "entrypoint": entrypoint,
            "mode": mode,
            "num_tasks": num_tasks,
            "max_steps": max_steps,
            "parallel": parallel,
            "run_name": run_name,
        },
    )
52+
53+
54+
def track_agent_run_completed(
    *,
    adapter: str | None = None,
    agent_class: str | None = None,
    entrypoint: str | None = None,
    mode: str | None = None,
    num_tasks: int | None = None,
    success_count: int | None = None,
    avg_steps: float | None = None,
    return_code: int | None = None,
    duration_seconds: float | None = None,
    run_name: str | None = None,
) -> bool:
    """Record an ``agent_run_completed`` usage event.

    All arguments are keyword-only and optional; each is forwarded as an
    event property under its own name. ``capture_event`` drops any property
    left as ``None`` before sending.

    Returns:
        The result of ``capture_event``.
    """
    return capture_event(
        "agent_run_completed",
        {
            "adapter": adapter,
            "agent_class": agent_class,
            "entrypoint": entrypoint,
            "mode": mode,
            "num_tasks": num_tasks,
            "success_count": success_count,
            "avg_steps": avg_steps,
            "return_code": return_code,
            "duration_seconds": duration_seconds,
            "run_name": run_name,
        },
    )
80+
81+
82+
def track_action_executed(
    *,
    task_id: str | None = None,
    step_index: int | None = None,
    action_type: str | None = None,
    adapter: str | None = None,
    agent_class: str | None = None,
) -> bool:
    """Record an ``action_executed`` usage event.

    All arguments are keyword-only and optional; each is forwarded as an
    event property under its own name. ``capture_event`` drops any property
    left as ``None`` before sending.

    Returns:
        The result of ``capture_event``.
    """
    return capture_event(
        "action_executed",
        {
            "task_id": task_id,
            "step_index": step_index,
            "action_type": action_type,
            "adapter": adapter,
            "agent_class": agent_class,
        },
    )
98+
99+
100+
def track_demo_recorded(
    *,
    task_id: str | None = None,
    mode: str | None = None,
    steps: int | None = None,
    output_dir: str | None = None,
    phase: str | None = None,
) -> bool:
    """Record a ``demo_recorded`` usage event.

    All arguments are keyword-only and optional; each is forwarded as an
    event property under its own name. ``capture_event`` drops any property
    left as ``None`` before sending.

    Returns:
        The result of ``capture_event``.
    """
    # NOTE(review): output_dir is forwarded verbatim. If callers pass an
    # absolute local path it may contain user-identifying segments — confirm
    # this is acceptable for telemetry.
    return capture_event(
        "demo_recorded",
        {
            "task_id": task_id,
            "mode": mode,
            "steps": steps,
            "output_dir": output_dir,
            "phase": phase,
        },
    )

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ dependencies = [
4141
"openai>=1.0.0",
4242
"anthropic>=0.76.0",
4343
"openadapt-consilium>=0.3.2",
44+
"openadapt-telemetry>=0.1.0",
4445
"openadapt-ml>=0.11.0",
4546
]
4647

scripts/record_waa_demos.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,9 @@
9494
]
9595

9696
from openadapt_evals.constants import HARDER_TASK_IDS
97+
from openadapt_evals.telemetry import (
98+
track_demo_recorded,
99+
)
97100

98101
# File names for the docx setup task
99102
DOCX_FILES = ["report.docx", "meeting_notes.docx", "proposal.docx"]
@@ -2180,6 +2183,13 @@ def _hard_reset_task_env() -> bytes:
21802183

21812184
recorded.append(task_id)
21822185
print(f"\n Saved {len(steps_meta)} step(s) to {task_dir}")
2186+
track_demo_recorded(
2187+
task_id=task_id,
2188+
mode="record-waa",
2189+
steps=len(steps_meta),
2190+
output_dir=str(output_dir),
2191+
phase="recorded",
2192+
)
21832193

21842194
# Summary
21852195
print_header("Recording Summary")
@@ -2376,6 +2386,13 @@ def cmd_annotate_waa(
23762386

23772387
print(f" -> {json_path}")
23782388
print(f" -> {txt_path}\n")
2389+
track_demo_recorded(
2390+
task_id=task_id,
2391+
mode="annotate",
2392+
steps=len(annotated_steps),
2393+
output_dir=str(output_dir),
2394+
phase="annotated",
2395+
)
23792396

23802397
print_header("Annotation Summary")
23812398
print(f" Annotated: {len(task_dirs)} recording(s)")

0 commit comments

Comments
 (0)