scale-agentex-python/src/agentex/lib/core/observability/llm_metrics.py at 65d2e81c4f625f734ea6bcff374da5ce69fa8f32 · scaleapi/scale-agentex-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""OTel metrics for LLM calls.

Single source of truth for LLM-call instrumentation across all agentex code
paths — temporal+openai_agents streaming today, sync ACP and the Claude SDK
plugin in future PRs. Centralizing the instrument definitions here means
those follow-ups don't need to redefine the metric names, units, or
description strings; they import ``get_llm_metrics()`` and record values.

The meter is a no-op when the application hasn't configured a ``MeterProvider``,
so importing this module is safe for runtimes that don't use OTel. Instruments
are created lazily on first ``get_llm_metrics()`` call so a ``MeterProvider``
configured *after* this module is imported still binds correctly.

Cardinality is bounded:
- All metrics carry only ``model`` (the LLM model name).
- ``requests`` additionally carries ``status``, drawn from a small fixed set
  (see ``classify_status``).

Resource attributes (``service.name``, ``k8s.*``, etc.) come from the
application's OTel resource configuration and are added to every series
automatically.
"""

from __future__ import annotations

from typing import Optional

from opentelemetry import metrics


class LLMMetrics:
    """Bundle of OTel instruments used to record LLM call telemetry.

    Instantiating the class obtains the ``agentex.llm`` meter and registers
    one request counter, three histograms (ttft, ttat, tps) and four token
    counters on it. Each instrument is exposed as an instance attribute.
    """

    def __init__(self) -> None:
        llm_meter = metrics.get_meter("agentex.llm")
        new_counter = llm_meter.create_counter
        new_histogram = llm_meter.create_histogram

        # --- Call outcome -------------------------------------------------
        requests_help = (
            "LLM call count tagged with status "
            "(success / rate_limit / server_error / client_error / "
            "timeout / network_error / other_error). "
            "Use to alert on 429s, 5xxs, etc."
        )
        self.requests = new_counter(
            name="agentex.llm.requests",
            unit="1",
            description=requests_help,
        )

        # --- Latency ------------------------------------------------------
        self.ttft_ms = new_histogram(
            name="agentex.llm.ttft",
            unit="ms",
            description="Time from request submission to first content token (ms)",
        )
        # For reasoning models ttft and ttat diverge: ttft is triggered by the
        # first reasoning chunk (which shows up quickly), whereas ttat
        # (time-to-first-answering-token) waits for the first user-visible
        # answer token, i.e. text or a tool call. Non-reasoning models report
        # the same value for both.
        self.ttat_ms = new_histogram(
            name="agentex.llm.ttat",
            unit="ms",
            description="Time from request submission to first answering token (text or tool-call delta) — excludes reasoning chunks",
        )

        # --- Throughput ---------------------------------------------------
        # The TPS denominator is the model-generation window
        # (last_token_time - first_token_time) rather than the total stream
        # wall time, which keeps event-loop / tool-call latency out of the
        # raw model throughput figure.
        self.tps = new_histogram(
            name="agentex.llm.tps",
            unit="tokens/s",
            description="Output tokens per second over the generation window",
        )

        # --- Token accounting ---------------------------------------------
        self.input_tokens = new_counter(
            name="agentex.llm.input_tokens",
            unit="tokens",
            description="Total input tokens sent to the LLM",
        )
        self.output_tokens = new_counter(
            name="agentex.llm.output_tokens",
            unit="tokens",
            description="Total output tokens returned by the LLM",
        )
        self.cached_input_tokens = new_counter(
            name="agentex.llm.cached_input_tokens",
            unit="tokens",
            description="Subset of input tokens served from prompt cache",
        )
        self.reasoning_tokens = new_counter(
            name="agentex.llm.reasoning_tokens",
            unit="tokens",
            description="Output tokens spent on reasoning (subset of output_tokens)",
        )


# Module-level cache for the shared instance; stays ``None`` until the first
# ``get_llm_metrics()`` call builds it.
_llm_metrics: Optional[LLMMetrics] = None


def get_llm_metrics() -> LLMMetrics:
    """Fetch the shared ``LLMMetrics`` instance, building it on the first call.

    Returns:
        The module-wide ``LLMMetrics`` object; every call returns the same one.
    """
    global _llm_metrics
    cached = _llm_metrics
    if cached is None:
        # First use: create the instruments now and remember them.
        cached = LLMMetrics()
        _llm_metrics = cached
    return cached


# Ordered (status, class-name fragments) rules. Earlier entries win when one
# class name matches several rules, e.g. a name containing both "Connection"
# and "Timeout" is reported as a timeout.
_STATUS_NAME_RULES = (
    ("rate_limit", ("RateLimit",)),
    ("timeout", ("Timeout",)),
    ("server_error", ("ServerError", "InternalServer", "ServiceUnavailable", "BadGateway")),
    ("network_error", ("Connection",)),
    (
        "client_error",
        ("BadRequest", "Authentication", "Permission", "NotFound", "Conflict", "UnprocessableEntity"),
    ),
)


def classify_status(exc: Optional[BaseException]) -> str:
    """Categorize an LLM call's outcome into a small fixed set of status labels.

    Exceptions are mapped by class *name* rather than by ``isinstance`` checks,
    so no specific provider SDK has to be imported: OpenAI, Anthropic, and
    other providers all use names like ``RateLimitError``, ``APITimeoutError``,
    ``InternalServerError``, etc.

    The exception's own class name is tried first. If it matches no rule, the
    names of its base classes are tried in method-resolution order, so a
    specialised subclass (for example a ``ContextWindowExceededError`` derived
    from ``BadRequestError``) is still reported under its parent's category
    instead of falling through to ``"other_error"``.

    Args:
        exc: The exception raised by the call, or ``None`` if it succeeded.

    Returns:
        One of ``"success"``, ``"rate_limit"``, ``"timeout"``,
        ``"server_error"``, ``"network_error"``, ``"client_error"`` or
        ``"other_error"``.
    """
    if exc is None:
        return "success"
    # Most-derived class first: the exception's own name always takes
    # precedence over anything inherited from a base class.
    for klass in type(exc).__mro__:
        class_name = klass.__name__
        for status, fragments in _STATUS_NAME_RULES:
            if any(fragment in class_name for fragment in fragments):
                return status
    return "other_error"