Skip to content

Commit 62d7ee0

Browse files
google-genai-bot authored and copybara-github committed
refactor: move exception handling from metric emission into instrumentation handlers
Metric instrumentation code should never disrupt the regular execution flow, so this change catches any errors one level higher, in the instrumentation handlers, which makes instrumentation more robust (especially against tests that use incomplete mocks of ADK components).

PiperOrigin-RevId: 904412190
1 parent f95ac48 commit 62d7ee0

2 files changed

Lines changed: 80 additions & 95 deletions

File tree

src/google/adk/telemetry/_instrumentation.py

Lines changed: 28 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -83,23 +83,28 @@ async def record_agent_invocation(
8383
with tracing.tracer.start_as_current_span(span_name) as s:
8484
span = s
8585
tracing.trace_agent_invocation(span, agent, ctx)
86-
_metrics.record_agent_request_size(agent.name, ctx.user_content)
8786
tel_ctx = TelemetryContext(otel_context=context_api.get_current())
8887
yield tel_ctx
8988
except Exception as e:
9089
caught_error = e
9190
raise
9291
finally:
9392
elapsed_ms = _get_elapsed_ms(span, start_time)
94-
_metrics.record_agent_invocation_duration(
95-
agent.name,
96-
elapsed_ms,
97-
ctx.user_content,
98-
ctx.session.events,
99-
caught_error,
100-
)
101-
_metrics.record_agent_response_size(agent.name, ctx.session.events)
102-
_metrics.record_agent_workflow_steps(agent.name, len(ctx.session.events))
93+
try:
94+
_metrics.record_agent_invocation_duration(
95+
agent.name,
96+
elapsed_ms,
97+
ctx.user_content,
98+
ctx.session.events,
99+
caught_error,
100+
)
101+
_metrics.record_agent_request_size(agent.name, ctx.user_content)
102+
_metrics.record_agent_response_size(agent.name, ctx.session.events)
103+
_metrics.record_agent_workflow_steps(agent.name, len(ctx.session.events))
104+
except Exception: # pylint: disable=broad-exception-caught
105+
logger.exception(
106+
"Failed to record agent metrics for agent %s", agent.name
107+
)
103108

104109

105110
@contextlib.asynccontextmanager
@@ -144,11 +149,16 @@ async def record_tool_execution(
144149
if isinstance(result_event, event_lib.Event)
145150
else None
146151
)
147-
_metrics.record_tool_execution_duration(
148-
tool_name=tool.name,
149-
agent_name=agent.name,
150-
elapsed_ms=elapsed_ms,
151-
input_content=invocation_context.user_content,
152-
output_content=output_content,
153-
error=caught_error,
154-
)
152+
try:
153+
_metrics.record_tool_execution_duration(
154+
tool_name=tool.name,
155+
agent_name=agent.name,
156+
elapsed_ms=elapsed_ms,
157+
input_content=invocation_context.user_content,
158+
output_content=output_content,
159+
error=caught_error,
160+
)
161+
except Exception: # pylint: disable=broad-exception-caught
162+
logger.exception(
163+
"Failed to record tool execution duration for tool %s", tool.name
164+
)

src/google/adk/telemetry/_metrics.py

Lines changed: 52 additions & 77 deletions
Original file line number | Diff line number | Diff line change
@@ -69,17 +69,12 @@ def record_agent_request_size(
6969
agent_name: str, user_content: types.Content | None
7070
):
7171
"""Records the size of the agent request."""
72-
try:
73-
size = _get_content_size(user_content)
74-
attrs = {
75-
gen_ai_attributes.GEN_AI_AGENT_NAME: agent_name,
76-
GEN_AI_INPUT_TYPE: _get_modality_from_content(user_content),
77-
}
78-
_agent_request_size.record(size, attributes=attrs)
79-
except Exception: # pylint: disable=broad-exception-caught
80-
logger.exception(
81-
"Failed to record agent request size for agent %s", agent_name
82-
)
72+
size = _get_content_size(user_content)
73+
attrs = {
74+
gen_ai_attributes.GEN_AI_AGENT_NAME: agent_name,
75+
GEN_AI_INPUT_TYPE: _get_modality_from_content(user_content),
76+
}
77+
_agent_request_size.record(size, attributes=attrs)
8378

8479

8580
def record_agent_invocation_duration(
@@ -90,64 +85,49 @@ def record_agent_invocation_duration(
9085
error: Exception | None = None,
9186
):
9287
"""Records the duration of the agent invocation."""
93-
try:
94-
response_content: types.Content | None = None
95-
for event in reversed(events):
96-
if event.author == agent_name and event.content:
97-
response_content = event.content
98-
break
99-
100-
attrs = {
101-
gen_ai_attributes.GEN_AI_AGENT_NAME: agent_name,
102-
GEN_AI_INPUT_TYPE: _get_modality_from_content(user_content),
103-
gen_ai_attributes.GEN_AI_OUTPUT_TYPE: _get_modality_from_content(
104-
response_content
105-
),
106-
}
107-
if error is not None:
108-
attrs[error_attributes.ERROR_TYPE] = type(error).__name__
109-
_agent_invocation_duration.record(elapsed_ms, attributes=attrs)
110-
except Exception: # pylint: disable=broad-exception-caught
111-
logger.exception(
112-
"Failed to record agent invocation duration for agent %s", agent_name
113-
)
88+
response_content: types.Content | None = None
89+
for event in reversed(events):
90+
if event.author == agent_name and event.content:
91+
response_content = event.content
92+
break
93+
94+
attrs = {
95+
gen_ai_attributes.GEN_AI_AGENT_NAME: agent_name,
96+
GEN_AI_INPUT_TYPE: _get_modality_from_content(user_content),
97+
gen_ai_attributes.GEN_AI_OUTPUT_TYPE: _get_modality_from_content(
98+
response_content
99+
),
100+
}
101+
if error is not None:
102+
attrs[error_attributes.ERROR_TYPE] = type(error).__name__
103+
_agent_invocation_duration.record(elapsed_ms, attributes=attrs)
114104

115105

116106
def record_agent_response_size(agent_name: str, events: list[Event]):
117107
"""Records the size of the agent response by extracting content from events."""
118-
try:
119-
response_content: types.Content | None = None
120-
for event in reversed(events):
121-
# Need to look for author matching agent_name and having content
122-
if event.author == agent_name and event.content:
123-
response_content = event.content
124-
break
125-
126-
size = _get_content_size(response_content)
127-
attrs = {
128-
gen_ai_attributes.GEN_AI_AGENT_NAME: agent_name,
129-
gen_ai_attributes.GEN_AI_OUTPUT_TYPE: _get_modality_from_content(
130-
response_content
131-
),
132-
}
133-
_agent_response_size.record(size, attributes=attrs)
134-
except Exception: # pylint: disable=broad-exception-caught
135-
logger.exception(
136-
"Failed to record agent response size for agent %s", agent_name
137-
)
108+
response_content: types.Content | None = None
109+
for event in reversed(events):
110+
# Need to look for author matching agent_name and having content
111+
if event.author == agent_name and event.content:
112+
response_content = event.content
113+
break
114+
115+
size = _get_content_size(response_content)
116+
attrs = {
117+
gen_ai_attributes.GEN_AI_AGENT_NAME: agent_name,
118+
gen_ai_attributes.GEN_AI_OUTPUT_TYPE: _get_modality_from_content(
119+
response_content
120+
),
121+
}
122+
_agent_response_size.record(size, attributes=attrs)
138123

139124

140125
def record_agent_workflow_steps(agent_name: str, steps_count: int):
141126
"""Records the number of steps in the agent workflow."""
142-
try:
143-
attrs = {
144-
gen_ai_attributes.GEN_AI_AGENT_NAME: agent_name,
145-
}
146-
_agent_workflow_steps.record(steps_count, attributes=attrs)
147-
except Exception: # pylint: disable=broad-exception-caught
148-
logger.exception(
149-
"Failed to record agent workflow steps for agent %s", agent_name
150-
)
127+
attrs = {
128+
gen_ai_attributes.GEN_AI_AGENT_NAME: agent_name,
129+
}
130+
_agent_workflow_steps.record(steps_count, attributes=attrs)
151131

152132

153133
def record_tool_execution_duration(
@@ -159,23 +139,18 @@ def record_tool_execution_duration(
159139
error: Exception | None = None,
160140
):
161141
"""Records the duration of the tool execution."""
162-
try:
163-
attrs = {
164-
gen_ai_attributes.GEN_AI_AGENT_NAME: agent_name,
165-
gen_ai_attributes.GEN_AI_TOOL_NAME: tool_name,
166-
GEN_AI_INPUT_TYPE: _get_modality_from_content(input_content),
167-
}
168-
if error is not None:
169-
attrs[error_attributes.ERROR_TYPE] = type(error).__name__
170-
else:
171-
attrs[gen_ai_attributes.GEN_AI_OUTPUT_TYPE] = _get_modality_from_content(
172-
output_content
173-
)
174-
_tool_execution_duration.record(elapsed_ms, attributes=attrs)
175-
except Exception: # pylint: disable=broad-exception-caught
176-
logger.exception(
177-
"Failed to record tool execution duration for tool %s", tool_name
142+
attrs = {
143+
gen_ai_attributes.GEN_AI_AGENT_NAME: agent_name,
144+
gen_ai_attributes.GEN_AI_TOOL_NAME: tool_name,
145+
GEN_AI_INPUT_TYPE: _get_modality_from_content(input_content),
146+
}
147+
if error is not None:
148+
attrs[error_attributes.ERROR_TYPE] = type(error).__name__
149+
else:
150+
attrs[gen_ai_attributes.GEN_AI_OUTPUT_TYPE] = _get_modality_from_content(
151+
output_content
178152
)
153+
_tool_execution_duration.record(elapsed_ms, attributes=attrs)
179154

180155

181156
# Helper functions copied from metrics_plugin.py

0 commit comments

Comments
 (0)