scale-agentex-python/src/agentex/lib/core/observability/llm_metrics.py at da85d7b7eb84136f92909e154af69bc6f1c496f0 · scaleapi/scale-agentex-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""OTel metrics for LLM calls.

Single source of truth for LLM-call instrumentation across all agentex code
paths — temporal+openai_agents streaming today, sync ACP and the Claude SDK
plugin in future PRs. Centralizing the instrument definitions here means
those follow-ups don't need to redefine the metric names, units, or
description strings; they import ``get_llm_metrics()`` and record values.

The meter is no-op when the application hasn't configured a ``MeterProvider``,
so importing this module is safe for runtimes that don't use OTel. Instruments
are created lazily on first ``get_llm_metrics()`` call so a ``MeterProvider``
configured *after* this module is imported still binds correctly.

Cardinality is bounded:
- All metrics carry only ``model`` (the LLM model name).
- ``requests`` additionally carries ``status``, drawn from a small fixed set
  (see ``classify_status``).

Resource attributes (``service.name``, ``k8s.*``, etc.) come from the
application's OTel resource configuration and are added to every series
automatically.
"""

from __future__ import annotations

from typing import Optional

from opentelemetry import metrics


class LLMMetrics:
    """Bundle of OTel instruments used to record LLM call telemetry.

    Constructing an instance obtains the ``agentex.llm`` meter and creates one
    instrument per attribute:

    - ``requests``: counter of LLM calls, tagged by outcome status.
    - ``ttft_ms``: histogram of time-to-first-token, in milliseconds.
    - ``tps``: histogram of output tokens per second.
    - ``input_tokens`` / ``output_tokens`` / ``cached_input_tokens`` /
      ``reasoning_tokens``: token counters.
    """

    def __init__(self) -> None:
        llm_meter = metrics.get_meter("agentex.llm")
        new_counter = llm_meter.create_counter
        new_histogram = llm_meter.create_histogram

        # --- Call outcomes -------------------------------------------------
        requests_description = (
            "LLM call count tagged with status (success / rate_limit / "
            "server_error / client_error / timeout / network_error / "
            "other_error). Use to alert on 429s, 5xxs, etc."
        )
        self.requests = new_counter(
            name="agentex.llm.requests",
            unit="1",
            description=requests_description,
        )

        # --- Latency and throughput ----------------------------------------
        self.ttft_ms = new_histogram(
            name="agentex.llm.ttft",
            unit="ms",
            description="Time from request submission to first content token (ms)",
        )
        # The TPS denominator is the model-generation window
        # (last_token_time - first_token_time) rather than total stream wall
        # time, which keeps event-loop / tool-call latency out of the raw
        # model throughput figure.
        self.tps = new_histogram(
            name="agentex.llm.tps",
            unit="tokens/s",
            description="Output tokens per second over the generation window",
        )

        # --- Token accounting ----------------------------------------------
        self.input_tokens = new_counter(
            name="agentex.llm.input_tokens",
            unit="tokens",
            description="Total input tokens sent to the LLM",
        )
        self.output_tokens = new_counter(
            name="agentex.llm.output_tokens",
            unit="tokens",
            description="Total output tokens returned by the LLM",
        )
        self.cached_input_tokens = new_counter(
            name="agentex.llm.cached_input_tokens",
            unit="tokens",
            description="Subset of input tokens served from prompt cache",
        )
        self.reasoning_tokens = new_counter(
            name="agentex.llm.reasoning_tokens",
            unit="tokens",
            description="Output tokens spent on reasoning (subset of output_tokens)",
        )


# Module-level cache for the shared ``LLMMetrics`` instance; stays ``None``
# until ``get_llm_metrics()`` is called for the first time.
_llm_metrics: Optional[LLMMetrics] = None


def get_llm_metrics() -> LLMMetrics:
    """Return the shared ``LLMMetrics`` instance, building it on the first call.

    Later calls return the same cached object, so the instruments are only
    created once per process.
    """
    global _llm_metrics
    cached = _llm_metrics
    if cached is not None:
        return cached
    cached = LLMMetrics()
    _llm_metrics = cached
    return cached


# Ordered (status label, class-name markers) pairs. Order matters: the first
# label with a marker present in a class name wins, so a name containing both
# "Timeout" and "Connection" resolves to "timeout".
_STATUS_NAME_MARKERS = (
    ("rate_limit", ("RateLimit",)),
    ("timeout", ("Timeout",)),
    ("server_error", ("ServerError", "InternalServer", "ServiceUnavailable", "BadGateway")),
    ("network_error", ("Connection",)),
    (
        "client_error",
        ("BadRequest", "Authentication", "Permission", "NotFound", "Conflict", "UnprocessableEntity"),
    ),
)


def _status_for_class_name(name: str) -> Optional[str]:
    """Return the first status label whose marker occurs in *name*, else ``None``."""
    for label, markers in _STATUS_NAME_MARKERS:
        if any(marker in name for marker in markers):
            return label
    return None


def classify_status(exc: Optional[BaseException]) -> str:
    """Categorize an LLM call's outcome into a small fixed set of status labels.

    A successful call (``exc is None``) returns ``"success"``. Exceptions are
    mapped by class *name* rather than by ``isinstance`` checks, so we don't
    depend on a specific provider SDK's exception classes: OpenAI, Anthropic,
    and other providers all use names like ``RateLimitError``,
    ``APITimeoutError``, ``InternalServerError``, etc.

    The exception's own class name is examined first, then the names of its
    base classes in method-resolution order. This lets a subclass whose own
    name carries no marker (for example the builtin ``BrokenPipeError``, a
    subclass of ``ConnectionError``) inherit the classification of its parent
    instead of falling through to ``"other_error"``.

    Args:
        exc: The exception raised by the call, or ``None`` if it succeeded.

    Returns:
        One of ``"success"``, ``"rate_limit"``, ``"timeout"``,
        ``"server_error"``, ``"network_error"``, ``"client_error"`` or
        ``"other_error"``.
    """
    if exc is None:
        return "success"
    # Most-derived class first, so a recognisable concrete name always takes
    # precedence over anything inherited from a base class.
    for cls in type(exc).__mro__:
        label = _status_for_class_name(cls.__name__)
        if label is not None:
            return label
    return "other_error"