Fix Python tool error handling

stephentoub · stephentoub · commit 3e8fa48d4177 · 2026-03-02T22:31:23.000-05:00
diff --git a/python/copilot/tools.py b/python/copilot/tools.py
@@ -115,35 +115,26 @@ def decorator(fn: Callable[..., Any]) -> Tool:
             schema = ptype.model_json_schema()
 
         async def wrapped_handler(invocation: ToolInvocation) -> ToolResult:
-            try:
-                # Build args based on detected signature
-                call_args = []
-                if takes_params:
-                    args = invocation["arguments"] or {}
-                    if ptype is not None and _is_pydantic_model(ptype):
-                        call_args.append(ptype.model_validate(args))
-                    else:
-                        call_args.append(args)
-                if takes_invocation:
-                    call_args.append(invocation)
-
-                result = fn(*call_args)
-
-                if inspect.isawaitable(result):
-                    result = await result
-
-                return _normalize_result(result)
-
-            except Exception as exc:
-                # Don't expose detailed error information to the LLM for security reasons.
-                # The actual error is stored in the 'error' field for debugging.
-                return ToolResult(
-                    textResultForLlm="Invoking this tool produced an error. "
-                    "Detailed information is not available.",
-                    resultType="failure",
-                    error=str(exc),
-                    toolTelemetry={},
-                )
+            # Build args based on detected signature.
+            # Exceptions are NOT caught here — they propagate to the SDK's
+            # _execute_tool_call, which records errors on the execute_tool
+            # span and builds a safe ToolResult for the LLM.
+            call_args = []
+            if takes_params:
+                args = invocation["arguments"] or {}
+                if ptype is not None and _is_pydantic_model(ptype):
+                    call_args.append(ptype.model_validate(args))
+                else:
+                    call_args.append(args)
+            if takes_invocation:
+                call_args.append(invocation)
+
+            result = fn(*call_args)
+
+            if inspect.isawaitable(result):
+                result = await result
+
+            return _normalize_result(result)
 
         return Tool(
             name=tool_name,
diff --git a/python/e2e/test_tools_unit.py b/python/e2e/test_tools_unit.py
@@ -169,7 +169,9 @@ def test_tool(params: Params) -> str:
         assert received_params is not None
         assert received_params.value == "hello"
 
-    async def test_handler_error_is_hidden_from_llm(self):
+    async def test_handler_error_propagates(self):
+        """Exceptions from tool handlers propagate (caught by _execute_tool_call in client.py)."""
+
         class Params(BaseModel):
             pass
 
@@ -184,13 +186,11 @@ def failing_tool(params: Params, invocation: ToolInvocation) -> str:
             "arguments": {},
         }
 
-        result = await failing_tool.handler(invocation)
-
-        assert result["resultType"] == "failure"
-        assert "secret error message" not in result["textResultForLlm"]
-        assert "error" in result["textResultForLlm"].lower()
-        # But the actual error is stored internally
-        assert result["error"] == "secret error message"
+        # Exceptions propagate from define_tool handlers — the SDK's
+        # _execute_tool_call catches them, records telemetry, and builds
+        # a safe ToolResult that hides error details from the LLM.
+        with pytest.raises(ValueError, match="secret error message"):
+            await failing_tool.handler(invocation)
 
     async def test_function_style_api(self):
         class Params(BaseModel):
diff --git a/python/test_opentelemetry.py b/python/test_opentelemetry.py
@@ -495,6 +495,69 @@ def test_records_error_on_span(self, _reset_otel_globals):
         assert s.attributes[ATTR_ERROR_TYPE] == "ValueError"
         assert s.status.status_code == trace.StatusCode.ERROR
 
+    def test_execute_tool_error_from_define_tool_handler(self, _reset_otel_globals):
+        """Verify that errors from @define_tool handlers propagate and get recorded on spans.
+
+        This validates the fix where @define_tool no longer catches exceptions internally,
+        allowing _execute_tool_call to record error.type and ERROR status on the
+        execute_tool span — consistent with Node.js, .NET, and Go SDKs.
+        """
+        from copilot import ToolInvocation, define_tool
+
+        exporter, reader, tp, mp = _get_exporter_and_reader(_reset_otel_globals)
+        telemetry = _make_telemetry(tracer_provider=tp, meter_provider=mp)
+
+        # Zero-param handler avoids the Pydantic / `from __future__ import annotations` issue
+        @define_tool(description="A tool that always fails")
+        def failing_tool() -> str:
+            raise RuntimeError("deliberate failure")
+
+        # Start an execute_tool span (as _execute_tool_call would)
+        span = telemetry.start_execute_tool_span(
+            tool_name="failing_tool",
+            tool_call_id="tc-fail",
+            description="A tool that always fails",
+            arguments={},
+        )
+
+        # Simulate _execute_tool_call: invoke the handler, catch the error, record it
+        invocation: ToolInvocation = {
+            "session_id": "s1",
+            "tool_call_id": "tc-fail",
+            "tool_name": "failing_tool",
+            "arguments": {},
+        }
+        operation_error = None
+        try:
+            import asyncio
+
+            # asyncio.run() creates a fresh event loop and always closes it,
+            # even when the handler raises (which it does here).
+            asyncio.run(failing_tool.handler(invocation))
+        except Exception as exc:
+            operation_error = exc
+            telemetry.record_error(span, exc)
+
+        span.end()
+
+        # The exception MUST have propagated (not swallowed by @define_tool)
+        assert operation_error is not None, "@define_tool must not catch handler exceptions"
+        assert isinstance(operation_error, RuntimeError)
+
+        # The span MUST have ERROR status and error.type
+        s = exporter.get_finished_spans()[0]
+        assert s.status.status_code == trace.StatusCode.ERROR
+        assert s.attributes[ATTR_ERROR_TYPE] == "RuntimeError"
+
+        # Operation duration metric should include error.type
+        telemetry.record_operation_duration(
+            0.1, None, None, "github", None, None, operation_error, OP_EXECUTE_TOOL
+        )
+        dps = _get_metric_data_points(reader, METRIC_OPERATION_DURATION)
+        assert len(dps) > 0
+        error_dp = [dp for dp in dps if dp.attributes.get(ATTR_ERROR_TYPE) == "RuntimeError"]
+        assert len(error_dp) > 0, "duration metric includes error.type for failed tool"
+
 
 # ---------------------------------------------------------------------------
 # Tests: Tool result recording