Skip to content

Commit 44ed765

Browse files
committed
feat(openai): add gen_ai.server.time_to_first_token metric for streaming
Implement the `gen_ai.server.time_to_first_token` histogram metric as defined in OpenTelemetry Semantic Conventions v1.38.0. This metric records the time from request start to first output token received during streaming chat completions.

Changes:
- Add `time_to_first_token_s` field to the `LLMInvocation` dataclass
- Add TTFT histogram creation to util-genai instruments and metrics
- Wire TTFT detection into `BaseStreamWrapper.process_chunk()`
- Record TTFT in both the legacy and the new handler instrumentation paths
- Add tests for sync/async streaming, non-streaming, and tool calls

Resolves #3932
1 parent 7f107df commit 44ed765

6 files changed

Lines changed: 304 additions & 3 deletions

File tree

instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/instruments.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,25 @@
3535
67108864,
3636
]
3737

38+
_GEN_AI_SERVER_TIME_TO_FIRST_TOKEN_BUCKETS = [
39+
0.001,
40+
0.005,
41+
0.01,
42+
0.02,
43+
0.04,
44+
0.06,
45+
0.08,
46+
0.1,
47+
0.25,
48+
0.5,
49+
0.75,
50+
1.0,
51+
2.5,
52+
5.0,
53+
7.5,
54+
10.0,
55+
]
56+
3857

3958
class Instruments:
4059
def __init__(self, meter: Meter):
@@ -50,3 +69,9 @@ def __init__(self, meter: Meter):
5069
unit="{token}",
5170
explicit_bucket_boundaries_advisory=_GEN_AI_CLIENT_TOKEN_USAGE_BUCKETS,
5271
)
72+
self.ttft_histogram: Histogram = meter.create_histogram(
73+
name=gen_ai_metrics.GEN_AI_SERVER_TIME_TO_FIRST_TOKEN,
74+
description="Time to generate first token for successful responses",
75+
unit="s",
76+
explicit_bucket_boundaries_advisory=_GEN_AI_SERVER_TIME_TO_FIRST_TOKEN_BUCKETS,
77+
)

instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/patch.py

Lines changed: 76 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,10 @@ def traced_method(wrapped, instance, args, kwargs):
9090
parsed_result = result
9191
if is_streaming(kwargs):
9292
return LegacyChatStreamWrapper(
93-
parsed_result, span, logger, capture_content
93+
parsed_result, span, logger, capture_content,
94+
instruments=instruments,
95+
start_time=start,
96+
request_attributes=span_attributes,
9497
)
9598

9699
if span.is_recording():
@@ -195,7 +198,10 @@ async def traced_method(wrapped, instance, args, kwargs):
195198
parsed_result = result
196199
if is_streaming(kwargs):
197200
return LegacyChatStreamWrapper(
198-
parsed_result, span, logger, capture_content
201+
parsed_result, span, logger, capture_content,
202+
instruments=instruments,
203+
start_time=start,
204+
request_attributes=span_attributes,
199205
)
200206

201207
if span.is_recording():
@@ -631,6 +637,8 @@ def __init__(
631637
self.choice_buffers = []
632638
self._started = False
633639
self.capture_content = capture_content
640+
self._first_token_received = False
641+
self._first_token_time: Optional[float] = None
634642
self._setup()
635643

636644
def _setup(self):
@@ -752,8 +760,25 @@ def process_chunk(self, chunk):
752760
self.set_response_model(chunk)
753761
self.set_response_service_tier(chunk)
754762
self.build_streaming_response(chunk)
763+
self._detect_first_token(chunk)
755764
self.set_usage(chunk)
756765

766+
def _detect_first_token(self, chunk):
767+
if self._first_token_received:
768+
return
769+
if getattr(chunk, "choices", None) is None:
770+
return
771+
for choice in chunk.choices:
772+
if not choice.delta:
773+
continue
774+
if (
775+
choice.delta.content is not None
776+
or choice.delta.tool_calls is not None
777+
):
778+
self._first_token_received = True
779+
self._first_token_time = default_timer()
780+
return
781+
757782
def __getattr__(self, name):
758783
return getattr(self.stream, name)
759784

@@ -777,10 +802,16 @@ def __init__(
777802
span: Span,
778803
logger: Logger,
779804
capture_content: bool,
805+
instruments: Optional[Instruments] = None,
806+
start_time: Optional[float] = None,
807+
request_attributes: Optional[dict] = None,
780808
):
781809
super().__init__(stream, capture_content=capture_content)
782810
self.span = span
783811
self.logger = logger
812+
self._instruments = instruments
813+
self._start_time = start_time
814+
self._request_attributes = request_attributes or {}
784815

785816
def cleanup(self, error: Optional[BaseException] = None):
786817
if not self._started:
@@ -863,9 +894,43 @@ def cleanup(self, error: Optional[BaseException] = None):
863894
if error:
864895
handle_span_exception(self.span, error)
865896
else:
897+
self._record_ttft()
866898
self.span.end()
867899
self._started = False
868900

901+
def _record_ttft(self):
902+
if (
903+
self._instruments is None
904+
or self._start_time is None
905+
or self._first_token_time is None
906+
):
907+
return
908+
ttft = max(self._first_token_time - self._start_time, 0.0)
909+
common_attributes = {
910+
GenAIAttributes.GEN_AI_OPERATION_NAME: GenAIAttributes.GenAiOperationNameValues.CHAT.value,
911+
GenAIAttributes.GEN_AI_SYSTEM: GenAIAttributes.GenAiSystemValues.OPENAI.value,
912+
}
913+
if GenAIAttributes.GEN_AI_REQUEST_MODEL in self._request_attributes:
914+
common_attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL] = (
915+
self._request_attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL]
916+
)
917+
if self.response_model:
918+
common_attributes[GenAIAttributes.GEN_AI_RESPONSE_MODEL] = (
919+
self.response_model
920+
)
921+
if ServerAttributes.SERVER_ADDRESS in self._request_attributes:
922+
common_attributes[ServerAttributes.SERVER_ADDRESS] = (
923+
self._request_attributes[ServerAttributes.SERVER_ADDRESS]
924+
)
925+
if ServerAttributes.SERVER_PORT in self._request_attributes:
926+
common_attributes[ServerAttributes.SERVER_PORT] = (
927+
self._request_attributes[ServerAttributes.SERVER_PORT]
928+
)
929+
self._instruments.ttft_histogram.record(
930+
ttft,
931+
attributes=common_attributes,
932+
)
933+
869934

870935
class ChatStreamWrapper(BaseStreamWrapper):
871936
handler: TelemetryHandler
@@ -941,6 +1006,15 @@ def cleanup(self, error: Optional[BaseException] = None):
9411006
},
9421007
)
9431008

1009+
if (
1010+
self._first_token_time is not None
1011+
and self.invocation.monotonic_start_s is not None
1012+
):
1013+
self.invocation.time_to_first_token_s = max(
1014+
self._first_token_time - self.invocation.monotonic_start_s,
1015+
0.0,
1016+
)
1017+
9441018
self._set_output_messages()
9451019

9461020
if error:
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
import pytest
2+
from tests.test_utils import DEFAULT_MODEL, USER_ONLY_PROMPT
3+
4+
from opentelemetry.semconv._incubating.attributes import (
5+
gen_ai_attributes as GenAIAttributes,
6+
)
7+
from opentelemetry.semconv._incubating.attributes import (
8+
server_attributes as ServerAttributes,
9+
)
10+
from opentelemetry.semconv._incubating.metrics import gen_ai_metrics
11+
from opentelemetry.util.genai.utils import is_experimental_mode
12+
13+
_TTFT_BUCKETS = (
14+
0.001,
15+
0.005,
16+
0.01,
17+
0.02,
18+
0.04,
19+
0.06,
20+
0.08,
21+
0.1,
22+
0.25,
23+
0.5,
24+
0.75,
25+
1.0,
26+
2.5,
27+
5.0,
28+
7.5,
29+
10.0,
30+
)
31+
32+
33+
def _get_ttft_metric(metric_reader):
34+
metrics = metric_reader.get_metrics_data().resource_metrics
35+
if not metrics:
36+
return None
37+
for scope_metrics in metrics[0].scope_metrics:
38+
for m in scope_metrics.metrics:
39+
if m.name == gen_ai_metrics.GEN_AI_SERVER_TIME_TO_FIRST_TOKEN:
40+
return m
41+
return None
42+
43+
44+
def test_streaming_chat_records_ttft_metric(
45+
metric_reader, openai_client, instrument_with_content, vcr
46+
):
47+
"""TTFT metric is recorded for streaming chat completions."""
48+
with vcr.use_cassette("test_chat_completion_streaming.yaml"):
49+
response = openai_client.chat.completions.create(
50+
model=DEFAULT_MODEL,
51+
messages=USER_ONLY_PROMPT,
52+
stream=True,
53+
stream_options={"include_usage": True},
54+
)
55+
for _ in response:
56+
pass
57+
58+
ttft_metric = _get_ttft_metric(metric_reader)
59+
assert ttft_metric is not None, (
60+
"gen_ai.server.time_to_first_token metric should be recorded for streaming"
61+
)
62+
63+
data_point = ttft_metric.data.data_points[0]
64+
assert data_point.sum >= 0
65+
assert data_point.count == 1
66+
assert data_point.explicit_bounds == _TTFT_BUCKETS
67+
68+
latest_experimental_enabled = is_experimental_mode()
69+
assert GenAIAttributes.GEN_AI_OPERATION_NAME in data_point.attributes
70+
assert (
71+
data_point.attributes[GenAIAttributes.GEN_AI_OPERATION_NAME]
72+
== GenAIAttributes.GenAiOperationNameValues.CHAT.value
73+
)
74+
assert GenAIAttributes.GEN_AI_REQUEST_MODEL in data_point.attributes
75+
assert (
76+
data_point.attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL]
77+
== "gpt-4o-mini"
78+
)
79+
assert ServerAttributes.SERVER_ADDRESS in data_point.attributes
80+
81+
82+
@pytest.mark.asyncio()
83+
async def test_async_streaming_chat_records_ttft_metric(
84+
metric_reader, async_openai_client, instrument_with_content, vcr
85+
):
86+
"""TTFT metric is recorded for async streaming chat completions."""
87+
with vcr.use_cassette("test_async_chat_completion_streaming.yaml"):
88+
response = await async_openai_client.chat.completions.create(
89+
model=DEFAULT_MODEL,
90+
messages=USER_ONLY_PROMPT,
91+
stream=True,
92+
stream_options={"include_usage": True},
93+
)
94+
async for _ in response:
95+
pass
96+
97+
ttft_metric = _get_ttft_metric(metric_reader)
98+
assert ttft_metric is not None, (
99+
"gen_ai.server.time_to_first_token metric should be recorded for async streaming"
100+
)
101+
102+
data_point = ttft_metric.data.data_points[0]
103+
assert data_point.sum >= 0
104+
assert data_point.count == 1
105+
assert data_point.explicit_bounds == _TTFT_BUCKETS
106+
107+
108+
def test_non_streaming_chat_does_not_record_ttft_metric(
109+
metric_reader, openai_client, instrument_with_content, vcr
110+
):
111+
"""TTFT metric should NOT be recorded for non-streaming requests."""
112+
with vcr.use_cassette("test_chat_completion_metrics.yaml"):
113+
openai_client.chat.completions.create(
114+
messages=USER_ONLY_PROMPT, model=DEFAULT_MODEL, stream=False
115+
)
116+
117+
ttft_metric = _get_ttft_metric(metric_reader)
118+
assert ttft_metric is None, (
119+
"gen_ai.server.time_to_first_token metric should not be recorded for non-streaming"
120+
)
121+
122+
123+
def test_streaming_tool_calls_records_ttft_metric(
124+
metric_reader, openai_client, instrument_with_content, vcr
125+
):
126+
"""TTFT metric is recorded for streaming responses with tool calls."""
127+
with vcr.use_cassette(
128+
"test_chat_completion_multiple_tools_streaming_with_content.yaml"
129+
):
130+
response = openai_client.chat.completions.create(
131+
model=DEFAULT_MODEL,
132+
messages=[{"role": "user", "content": "What's the weather?"}],
133+
stream=True,
134+
stream_options={"include_usage": True},
135+
tools=[
136+
{
137+
"type": "function",
138+
"function": {
139+
"name": "get_weather",
140+
"parameters": {
141+
"type": "object",
142+
"properties": {
143+
"location": {"type": "string"},
144+
},
145+
},
146+
},
147+
}
148+
],
149+
)
150+
for _ in response:
151+
pass
152+
153+
ttft_metric = _get_ttft_metric(metric_reader)
154+
assert ttft_metric is not None, (
155+
"gen_ai.server.time_to_first_token metric should be recorded for streaming tool calls"
156+
)
157+
158+
data_point = ttft_metric.data.data_points[0]
159+
assert data_point.sum >= 0
160+
assert data_point.count == 1

util/opentelemetry-util-genai/src/opentelemetry/util/genai/instruments.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,25 @@
3535
67108864,
3636
]
3737

38+
_GEN_AI_SERVER_TIME_TO_FIRST_TOKEN_BUCKETS = [
39+
0.001,
40+
0.005,
41+
0.01,
42+
0.02,
43+
0.04,
44+
0.06,
45+
0.08,
46+
0.1,
47+
0.25,
48+
0.5,
49+
0.75,
50+
1.0,
51+
2.5,
52+
5.0,
53+
7.5,
54+
10.0,
55+
]
56+
3857

3958
def create_duration_histogram(meter: Meter) -> Histogram:
4059
return meter.create_histogram(
@@ -52,3 +71,12 @@ def create_token_histogram(meter: Meter) -> Histogram:
5271
unit="{token}",
5372
explicit_bucket_boundaries_advisory=_GEN_AI_CLIENT_TOKEN_USAGE_BUCKETS,
5473
)
74+
75+
76+
def create_ttft_histogram(meter: Meter) -> Histogram:
77+
return meter.create_histogram(
78+
name=gen_ai_metrics.GEN_AI_SERVER_TIME_TO_FIRST_TOKEN,
79+
description="Time to generate first token for successful responses",
80+
unit="s",
81+
explicit_bucket_boundaries_advisory=_GEN_AI_SERVER_TIME_TO_FIRST_TOKEN_BUCKETS,
82+
)

0 commit comments

Comments (0)