-
Notifications
You must be signed in to change notification settings - Fork 3.4k
feat(metrics): Add Prometheus metrics integration for agent monitoring #2855
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,194 @@ | ||
| """Example: Prometheus metrics endpoint for agent monitoring. | ||
|
|
||
| This example shows how to set up a FastAPI server with a /metrics endpoint | ||
| that exposes Prometheus metrics for your agents. | ||
|
|
||
| To run: | ||
| pip install 'openai-agents[prometheus]' fastapi uvicorn | ||
| uv run python examples/metrics/prometheus_endpoint.py | ||
|
|
||
| Then open http://localhost:8000/metrics in your browser or configure | ||
| Prometheus to scrape http://localhost:8000/metrics | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import asyncio | ||
| import time | ||
| import random | ||
| from contextlib import asynccontextmanager | ||
|
|
||
| from fastapi import FastAPI | ||
| from prometheus_client import make_asgi_app | ||
|
|
||
| from agents import Agent, Runner | ||
| from agents.metrics import PrometheusMetrics, MetricsHooks, enable_metrics | ||
|
|
||
# Create the Prometheus metrics registry and install it as the process-wide
# default, so MetricsHooks instances created without an explicit metrics
# argument pick it up automatically.
metrics = PrometheusMetrics()
enable_metrics(metrics)

# ASGI sub-application that serves the Prometheus text exposition format;
# it is mounted at /metrics on the FastAPI app below.
metrics_app = make_asgi_app()

# The example agent whose runs will be reflected in the exported metrics.
agent = Agent(
    name="math_assistant",
    instructions="You are a helpful math assistant. Solve simple math problems.",
)
|
|
||
|
|
||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Announce startup, hand control to the server, then announce shutdown."""
    startup_notices = (
        "Starting server with metrics enabled...",
        "Visit http://localhost:8000/metrics for Prometheus metrics",
    )
    for notice in startup_notices:
        print(notice)
    yield
    print("Shutting down...")
|
|
||
|
|
||
# FastAPI application; `lifespan` prints startup/shutdown notices.
app = FastAPI(title="Agent Metrics Example", lifespan=lifespan)

# Expose the Prometheus scrape endpoint at /metrics.
app.mount("/metrics", metrics_app)
|
|
||
|
|
||
@app.get("/")
async def root():
    """Describe the endpoints this example service exposes."""
    endpoint_help = {
        "/": "This help message",
        "/metrics": "Prometheus metrics endpoint",
        "/solve/{problem}": "Solve a math problem (generates metrics)",
        "/chat/{message}": "Chat with the agent (generates metrics)",
    }
    return {"message": "Agent Metrics Example", "endpoints": endpoint_help}
|
|
||
|
|
||
@app.get("/solve/{problem}")
async def solve(problem: str):
    """Solve a math problem with the agent and record metrics.

    Returns the agent's answer plus wall-clock duration; on failure the
    error message is returned instead of raising, so the endpoint always
    answers with JSON.
    """
    hooks = MetricsHooks()

    start_time = time.monotonic()

    try:
        # Runner.run expects a single RunHooks instance, not a list;
        # the original `hooks=[hooks]` made every request fail.
        result = await Runner.run(
            agent,
            f"Solve this math problem: {problem}",
            hooks=hooks,
        )

        duration = time.monotonic() - start_time

        return {
            "problem": problem,
            "solution": result.final_output,
            "duration_seconds": round(duration, 3),
        }
    except Exception as e:
        # Best-effort endpoint: report the failure and elapsed time as JSON.
        duration = time.monotonic() - start_time
        return {
            "problem": problem,
            "error": str(e),
            "duration_seconds": round(duration, 3),
        }
|
|
||
|
|
||
@app.get("/chat/{message}")
async def chat(message: str):
    """Chat with the agent and record metrics.

    Returns the agent's response plus token usage; on failure the error
    message is returned instead of raising, so the endpoint always answers
    with JSON.
    """
    hooks = MetricsHooks()

    try:
        # Runner.run expects a single RunHooks instance, not a list;
        # the original `hooks=[hooks]` made every request fail.
        result = await Runner.run(
            agent,
            message,
            hooks=hooks,
        )

        return {
            "message": message,
            "response": result.final_output,
            "usage": {
                # usage may be absent on the result; report zeros then.
                "input_tokens": result.usage.input_tokens if result.usage else 0,
                "output_tokens": result.usage.output_tokens if result.usage else 0,
                "total_tokens": result.usage.total_tokens if result.usage else 0,
            },
        }
    except Exception as e:
        return {
            "message": message,
            "error": str(e),
        }
|
|
||
|
|
||
@app.post("/generate-load")
async def generate_load(count: int = 10):
    """Generate load for testing metrics (simulated)."""
    results = []

    for _ in range(count):
        # Fabricate a plausible LLM call and feed it straight to the metrics.
        operation = random.choice(["add", "multiply", "divide", "subtract"])
        a, b = random.randint(1, 100), random.randint(1, 100)

        latency = random.uniform(0.1, 2.0)
        tokens_in = random.randint(50, 500)
        tokens_out = random.randint(20, 200)

        metrics.record_llm_call(
            latency=latency,
            tokens_in=tokens_in,
            tokens_out=tokens_out,
            model="gpt-4",
        )

        # Roughly one in ten simulated calls is recorded as an error.
        if random.random() < 0.1:
            error_type = random.choice(["RateLimitError", "TimeoutError", "APIError"])
            metrics.record_error(error_type, agent.name or "unknown")
            entry = {
                "operation": operation,
                "error": error_type,
            }
        else:
            entry = {
                "operation": operation,
                "a": a,
                "b": b,
                "latency": round(latency, 3),
            }
        results.append(entry)

        await asyncio.sleep(0.01)

    return {
        "generated": count,
        "results": results,
    }
|
|
||
|
|
||
if __name__ == "__main__":
    import uvicorn

    # Startup banner listing endpoints and exported metric names.
    # NOTE(review): this literal was reconstructed from a garbled diff —
    # verify spacing against the original before relying on exact output.
    print("""
Endpoints:
  • http://localhost:8000/ - API documentation
  • http://localhost:8000/metrics - Prometheus metrics
  • http://localhost:8000/solve/{x} - Solve math problem
  • http://localhost:8000/chat/{msg} - Chat with agent
  • POST /generate-load?count=10 - Generate test load

Metrics available:
  • agents_llm_latency_seconds - LLM call latency
  • agents_tokens_total - Token usage
  • agents_errors_total - Error counts
  • agents_runs_total - Run counts
  • agents_run_duration_seconds - Run duration
  • agents_turns_total - LLM turns
  • agents_tool_executions_total - Tool executions
  • agents_tool_latency_seconds - Tool latency
""")

    # Bind on all interfaces so Prometheus on another host can scrape.
    uvicorn.run(app, host="0.0.0.0", port=8000)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from typing import TYPE_CHECKING | ||
|
|
||
| if TYPE_CHECKING: | ||
| from .prometheus import PrometheusMetrics | ||
| from .hooks import MetricsHooks | ||
|
|
||
| __all__ = [ | ||
| "PrometheusMetrics", | ||
| "MetricsHooks", | ||
| "enable_metrics", | ||
| "get_metrics", | ||
| "disable_metrics", | ||
| ] | ||
|
|
||
|
|
||
def __getattr__(name: str):
    """Resolve the public metrics API lazily (PEP 562 module __getattr__).

    Submodule imports are deferred until the corresponding attribute is
    first accessed.
    """
    if name == "PrometheusMetrics":
        from .prometheus import PrometheusMetrics
        return PrometheusMetrics
    if name in ("MetricsHooks", "enable_metrics", "get_metrics", "disable_metrics"):
        from . import hooks
        return getattr(hooks, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,143 @@ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import time | ||
| from typing import Any | ||
|
|
||
| from ..agent import Agent | ||
| from ..lifecycle import RunHooks | ||
| from ..logger import logger | ||
| from ..result import RunResult | ||
| from ..run_context import RunContextWrapper | ||
|
|
||
# PrometheusMetrics is optional — presumably because the underlying
# `prometheus_client` dependency may not be installed (TODO confirm);
# degrade to None rather than failing at import time.
try:
    from .prometheus import PrometheusMetrics
except ImportError:
    PrometheusMetrics = None

# Process-wide default metrics sink used by MetricsHooks when no explicit
# instance is supplied; managed via enable_metrics()/disable_metrics().
_global_metrics: PrometheusMetrics | None = None
|
|
||
|
|
||
def enable_metrics(metrics: PrometheusMetrics) -> None:
    """Install *metrics* as the process-wide default used by MetricsHooks."""
    global _global_metrics
    _global_metrics = metrics
|
|
||
|
|
||
def get_metrics() -> PrometheusMetrics | None:
    """Return the currently installed global metrics instance, or None."""
    return _global_metrics
|
|
||
|
|
||
def disable_metrics() -> None:
    """Clear the global metrics instance.

    MetricsHooks created afterwards without an explicit instance will
    record nothing; hooks that already captured the instance keep it.
    """
    global _global_metrics
    _global_metrics = None
|
|
||
|
|
||
class MetricsHooks(RunHooks):
    """RunHooks implementation that records agent lifecycle events as metrics.

    Events are forwarded to the PrometheusMetrics instance passed to the
    constructor, falling back to the global instance installed via
    enable_metrics(). With neither available, every handler is a no-op.

    NOTE(review): confirm these handler names/signatures match the RunHooks
    base class exactly — an override whose signature diverges from the base
    would simply never be invoked by the runner.
    """

    def __init__(self, metrics: PrometheusMetrics | None = None) -> None:
        # `or` (not an `is None` check) means a falsy metrics object also
        # falls back to the global instance — presumably intentional.
        self._metrics = metrics or _global_metrics
        # Run start times keyed by context id (for run duration).
        self._run_start_times: dict[str, float] = {}
        # Tool start times keyed by "<context_id>:<tool_name>" (for latency).
        self._tool_start_times: dict[str, float] = {}

    async def on_start(
        self,
        context: RunContextWrapper[Any],
        agent: Agent[Any],
    ) -> None:
        """Record a run start and remember when it began."""
        if self._metrics is None:
            return

        agent_name = agent.name or "unknown"
        # assumes RunContextWrapper exposes a stable `context_id` — TODO confirm
        self._run_start_times[context.context_id] = time.monotonic()
        self._metrics.record_run_start(agent_name)

    async def on_end(
        self,
        context: RunContextWrapper[Any],
        agent: Agent[Any],
        result: RunResult,
    ) -> None:
        """Record a successful run end; duration is None if start was missed."""
        if self._metrics is None:
            return

        agent_name = agent.name or "unknown"
        start_time = self._run_start_times.pop(context.context_id, None)
        duration = None
        if start_time is not None:
            duration = time.monotonic() - start_time

        self._metrics.record_run_end(agent_name, duration, status="success")

    async def on_error(
        self,
        context: RunContextWrapper[Any],
        agent: Agent[Any],
        error: Exception,
    ) -> None:
        """Record a failed run: one error sample plus a run end with error status."""
        if self._metrics is None:
            return

        agent_name = agent.name or "unknown"
        start_time = self._run_start_times.pop(context.context_id, None)
        duration = None
        if start_time is not None:
            duration = time.monotonic() - start_time

        error_type = type(error).__name__
        self._metrics.record_error(error_type, agent_name)
        self._metrics.record_run_end(agent_name, duration, status="error")

    async def on_tool_start(
        self,
        context: RunContextWrapper[Any],
        agent: Agent[Any],
        tool_name: str,
        input_data: dict[str, Any],
    ) -> None:
        """Remember when a tool invocation began so on_tool_end can compute latency."""
        if self._metrics is None:
            return

        # NOTE(review): concurrent invocations of the same tool within one
        # run share this key, so start/end pairs can be mismatched.
        key = f"{context.context_id}:{tool_name}"
        self._tool_start_times[key] = time.monotonic()

    async def on_tool_end(
        self,
        context: RunContextWrapper[Any],
        agent: Agent[Any],
        tool_name: str,
        result: Any,
    ) -> None:
        """Record a completed tool execution with its measured latency."""
        if self._metrics is None:
            return

        key = f"{context.context_id}:{tool_name}"
        start_time = self._tool_start_times.pop(key, None)
        if start_time is not None:
            latency = time.monotonic() - start_time
            agent_name = agent.name or "unknown"
            self._metrics.record_tool_execution(tool_name, latency, agent_name)

    async def on_tool_error(
        self,
        context: RunContextWrapper[Any],
        agent: Agent[Any],
        tool_name: str,
        error: Exception,
    ) -> None:
        """Record a failed tool execution: latency (if known) plus an error sample."""
        if self._metrics is None:
            return

        key = f"{context.context_id}:{tool_name}"
        start_time = self._tool_start_times.pop(key, None)
        if start_time is not None:
            latency = time.monotonic() - start_time
            agent_name = agent.name or "unknown"
            self._metrics.record_tool_execution(tool_name, latency, agent_name)

        # Prefix distinguishes tool failures from run-level errors in the
        # error counter's label values.
        error_type = f"tool_error:{type(error).__name__}"
        agent_name = agent.name or "unknown"
        self._metrics.record_error(error_type, agent_name)
|
|
||
|
|
||
def create_metrics_hooks(metrics: PrometheusMetrics | None = None) -> MetricsHooks:
    """Convenience factory equivalent to ``MetricsHooks(metrics)``."""
    return MetricsHooks(metrics)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Runner.runexpectshooksto be a singleRunHooksobject, not a list, so passinghooks=[hooks]will fail when the runner tries to call hook methods on the list. In this example,/solveand/chatrequests will return errors instead of running the agent.Useful? React with 👍 / 👎.