Skip to content

Commit b44c921

Browse files
fix(adk): set OTLP exporter timeout in milliseconds (#1639)
The OpenTelemetry specification states that OTLP timeout env vars are in **milliseconds** (see https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter/#timeout-configuration ). The Python SDK's exporter currently expects the value in seconds, so this change adds a utility function that parses the env var and converts it to seconds. Go is not affected because the OTEL Go SDK already interprets the value as milliseconds. Without changing the configured value to milliseconds, the Go ADK was running with a 15ms timeout, which resulted in intermittent errors like `traces export: exporter export timeout: rpc error: code = DeadlineExceeded desc = context deadline exceeded` --------- Signed-off-by: Jet Chiang <pokyuen.jetchiang-ext@solo.io>
1 parent e17cd6e commit b44c921

3 files changed

Lines changed: 67 additions & 6 deletions

File tree

helm/kagent/values.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -497,12 +497,12 @@ otel:
497497
otlp:
498498
endpoint: ""
499499
protocol: "grpc"
500-
timeout: 15
500+
timeout: 15000 # milliseconds
501501
insecure: true
502502
logging:
503503
enabled: false
504504
exporter:
505505
otlp:
506506
endpoint: ""
507-
timeout: 15
507+
timeout: 15000 # milliseconds
508508
insecure: true

python/packages/kagent-core/src/kagent/core/tracing/_utils.py

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,39 @@
1818
from ._span_processor import KagentAttributesSpanProcessor
1919

2020

21+
def _resolve_otlp_timeout_seconds(signal: str) -> float:
    """Resolve the OTLP exporter timeout for *signal*, in seconds.

    The OTEL spec defines the timeout env vars in milliseconds, whereas the
    Python OTLP exporter takes its timeout in seconds, so the configured
    value is converted here.

    The per-signal variable ``OTEL_EXPORTER_OTLP_<signal>_TIMEOUT`` takes
    precedence over the general ``OTEL_EXPORTER_OTLP_TIMEOUT``. A variable
    that is unset or empty is skipped.

    Args:
        signal: Signal name as it appears in the env var, e.g. ``"TRACES"``
            or ``"LOGS"``.

    Returns:
        The timeout in seconds. Falls back to 10.0 (the OTEL spec default of
        10000ms) when no value is configured, or when the configured value
        is not a number, is not finite, or is negative.
    """
    # Function-scope import so this block does not depend on the module's
    # top-level import list.
    import math

    default_seconds = 10.0  # OTEL spec default is 10000ms
    candidate_envs = (
        f"OTEL_EXPORTER_OTLP_{signal}_TIMEOUT",
        "OTEL_EXPORTER_OTLP_TIMEOUT",
    )

    # Remember which variable supplied the value so warnings name the real
    # source rather than always blaming the per-signal variable.
    for source_env in candidate_envs:
        raw_timeout = os.getenv(source_env)
        if raw_timeout:
            break
    else:
        return default_seconds

    try:
        timeout_millis = float(raw_timeout)
    except ValueError:
        logging.warning(
            "Invalid OTEL timeout value %r from %s; falling back to 10000ms",
            raw_timeout,
            source_env,
        )
        return default_seconds

    # float() accepts "nan" and "inf"; neither is a usable timeout, and nan
    # would also slip past the negative check below.
    if not math.isfinite(timeout_millis):
        logging.warning(
            "Non-finite OTEL timeout value %r from %s; falling back to 10000ms",
            raw_timeout,
            source_env,
        )
        return default_seconds

    if timeout_millis < 0:
        logging.warning(
            "Negative OTEL timeout value %r from %s; falling back to 10000ms",
            raw_timeout,
            source_env,
        )
        return default_seconds

    return timeout_millis / 1000.0
52+
53+
2154
def _instrument_anthropic(event_logger_provider=None):
2255
"""Instrument Anthropic SDK if available."""
2356
try:
@@ -69,11 +102,12 @@ def configure(name: str = "kagent", namespace: str = "kagent", fastapi_app: Fast
69102
or os.getenv("OTEL_TRACING_EXPORTER_OTLP_ENDPOINT") # Backward compatibility
70103
or os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
71104
)
105+
trace_timeout_seconds = _resolve_otlp_timeout_seconds("TRACES")
72106
logging.info("Trace endpoint: %s", trace_endpoint or "<default>")
73107
if trace_endpoint:
74-
processor = BatchSpanProcessor(OTLPSpanExporter(endpoint=trace_endpoint))
108+
processor = BatchSpanProcessor(OTLPSpanExporter(endpoint=trace_endpoint, timeout=trace_timeout_seconds))
75109
else:
76-
processor = BatchSpanProcessor(OTLPSpanExporter())
110+
processor = BatchSpanProcessor(OTLPSpanExporter(timeout=trace_timeout_seconds))
77111

78112
# Check if a TracerProvider already exists (e.g., set by CrewAI)
79113
current_provider = trace.get_tracer_provider()
@@ -107,13 +141,14 @@ def configure(name: str = "kagent", namespace: str = "kagent", fastapi_app: Fast
107141
or os.getenv("OTEL_LOGGING_EXPORTER_OTLP_ENDPOINT") # Backward compatibility
108142
or os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
109143
)
144+
log_timeout_seconds = _resolve_otlp_timeout_seconds("LOGS")
110145
logging.info("Log endpoint: %s", log_endpoint or "<default>")
111146

112147
# Add OTLP exporter
113148
if log_endpoint:
114-
log_processor = BatchLogRecordProcessor(OTLPLogExporter(endpoint=log_endpoint))
149+
log_processor = BatchLogRecordProcessor(OTLPLogExporter(endpoint=log_endpoint, timeout=log_timeout_seconds))
115150
else:
116-
log_processor = BatchLogRecordProcessor(OTLPLogExporter())
151+
log_processor = BatchLogRecordProcessor(OTLPLogExporter(timeout=log_timeout_seconds))
117152
logger_provider.add_log_record_processor(log_processor)
118153

119154
_logs.set_logger_provider(logger_provider)

python/packages/kagent-core/tests/test_tracing_configure.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from types import SimpleNamespace
22

3+
import pytest
34
from opentelemetry.propagate import get_global_textmap
45
from opentelemetry.trace import get_current_span
56

@@ -94,3 +95,28 @@ def test_otel_sdk_default_propagator_includes_w3c_tracecontext():
9495

9596
ctx = get_global_textmap().extract(carrier)
9697
assert get_current_span(ctx).get_span_context().trace_id == trace_id
98+
99+
100+
@pytest.mark.parametrize(
    ("signal", "env", "expected"),
    [
        # Nothing configured: spec default of 10000ms.
        ("TRACES", {}, 10.0),
        # General variable only.
        ("TRACES", {"OTEL_EXPORTER_OTLP_TIMEOUT": "500"}, 0.5),
        # Per-signal variable only.
        ("TRACES", {"OTEL_EXPORTER_OTLP_TRACES_TIMEOUT": "250"}, 0.25),
        # Per-signal variable wins over the general one.
        (
            "LOGS",
            {
                "OTEL_EXPORTER_OTLP_TIMEOUT": "500",
                "OTEL_EXPORTER_OTLP_LOGS_TIMEOUT": "750",
            },
            0.75,
        ),
        # An empty per-signal variable is skipped in favour of the general one.
        (
            "TRACES",
            {
                "OTEL_EXPORTER_OTLP_TIMEOUT": "500",
                "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT": "",
            },
            0.5,
        ),
        # Fallback branches: unparsable, negative and empty values.
        ("TRACES", {"OTEL_EXPORTER_OTLP_TIMEOUT": "abc"}, 10.0),
        ("LOGS", {"OTEL_EXPORTER_OTLP_LOGS_TIMEOUT": "-5"}, 10.0),
        ("TRACES", {"OTEL_EXPORTER_OTLP_TIMEOUT": ""}, 10.0),
    ],
)
def test_resolve_otlp_timeout_seconds_uses_milliseconds(monkeypatch, signal, env, expected):
    """Timeout env vars are read as milliseconds and returned as seconds."""
    # Start from a clean slate so ambient OTEL settings cannot leak into the case.
    for key in ("OTEL_EXPORTER_OTLP_TIMEOUT", "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT", "OTEL_EXPORTER_OTLP_LOGS_TIMEOUT"):
        monkeypatch.delenv(key, raising=False)
    for key, value in env.items():
        monkeypatch.setenv(key, value)

    assert _utils._resolve_otlp_timeout_seconds(signal) == expected

0 commit comments

Comments
 (0)