Skip to content

Commit a728bd0

Browse files
test(langchain): Add tool execution test
1 parent f4a8602 commit a728bd0

File tree

3 files changed

+326
-129
lines changed

3 files changed

+326
-129
lines changed

tests/conftest.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1099,6 +1099,65 @@ def nonstreaming_responses_model_response():
10991099
)
11001100

11011101

1102+
@pytest.fixture
def responses_tool_call_model_responses():
    """Factory fixture for a two-step Responses API tool-call exchange.

    The returned callable is a generator function. Its first yielded
    ``Response`` contains a single function tool call; the second contains
    the final assistant message. IDs and usage objects are drawn lazily
    from the supplied iterators, one per yielded response.
    """

    def inner(
        tool_name: str,
        arguments: str,
        response_model: str,
        response_text: str,
        response_ids: "Iterator[str]",
        usages: "Iterator[openai.types.responses.ResponseUsage]",
    ):
        def shared_fields():
            # Fields common to both responses; evaluated at yield time so the
            # id/usage iterators are consumed lazily, one entry per response.
            return {
                "id": next(response_ids),
                "parallel_tool_calls": False,
                "tool_choice": "none",
                "tools": [],
                "created_at": 10000000,
                "model": response_model,
                "object": "response",
                "usage": next(usages),
            }

        # Step 1: the model requests a tool invocation.
        tool_call = openai.types.responses.ResponseFunctionToolCall(
            id="call_123",
            call_id="call_123",
            name=tool_name,
            type="function_call",
            arguments=arguments,
        )
        yield openai.types.responses.Response(
            output=[tool_call],
            **shared_fields(),
        )

        # Step 2: the model emits the final text answer.
        answer_text = openai.types.responses.ResponseOutputText(
            text=response_text,
            type="output_text",
            annotations=[],
        )
        answer_message = openai.types.responses.ResponseOutputMessage(
            id="msg_final",
            type="message",
            status="completed",
            content=[answer_text],
            role="assistant",
        )
        yield openai.types.responses.Response(
            output=[answer_message],
            **shared_fields(),
        )

    return inner
1159+
1160+
11021161
class MockServerRequestHandler(BaseHTTPRequestHandler):
11031162
def do_GET(self): # noqa: N802
11041163
# Process an HTTP GET request and return a response.

tests/integrations/langchain/test_langchain.py

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,14 @@
5454
CompletionUsage,
5555
)
5656

57+
from openai.types.responses import (
58+
ResponseUsage,
59+
)
60+
from openai.types.responses.response_usage import (
61+
InputTokensDetails,
62+
OutputTokensDetails,
63+
)
64+
5765
LANGCHAIN_VERSION = package_version("langchain")
5866

5967

@@ -209,6 +217,185 @@ def test_langchain_create_agent(
209217
assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {})
210218

211219

220+
@pytest.mark.skipif(
    LANGCHAIN_VERSION < (1,),
    reason="LangChain 1.0+ required (ONE AGENT refactor)",
)
@pytest.mark.parametrize(
    "send_default_pii, include_prompts",
    [
        (True, True),
        (True, False),
        (False, True),
        (False, False),
    ],
)
def test_tool_execution_span(
    sentry_init,
    capture_events,
    send_default_pii,
    include_prompts,
    get_model_response,
    responses_tool_call_model_responses,
):
    """Run an agent through a tool-call round trip and verify span contents.

    Two mocked Responses API replies are fed to the agent: one requesting a
    ``get_word_length`` tool call and one carrying the final answer. Asserts
    that chat and tool-execution spans are emitted with the langchain origin,
    that token usage is recorded per round trip, and that prompt/tool payloads
    are only attached when both ``send_default_pii`` and ``include_prompts``
    are enabled.
    """
    sentry_init(
        integrations=[
            LangchainIntegration(
                include_prompts=include_prompts,
            )
        ],
        traces_sample_rate=1.0,
        send_default_pii=send_default_pii,
    )
    events = capture_events()

    responses = responses_tool_call_model_responses(
        tool_name="get_word_length",
        arguments='{"word": "eudca"}',
        response_model="gpt-4-0613",
        response_text="The word eudca has 5 letters.",
        response_ids=iter(["resp_1", "resp_2"]),
        usages=iter(
            [
                ResponseUsage(
                    input_tokens=142,
                    input_tokens_details=InputTokensDetails(
                        cached_tokens=0,
                    ),
                    output_tokens=50,
                    output_tokens_details=OutputTokensDetails(
                        reasoning_tokens=0,
                    ),
                    total_tokens=192,
                ),
                ResponseUsage(
                    input_tokens=89,
                    input_tokens_details=InputTokensDetails(
                        cached_tokens=0,
                    ),
                    output_tokens=28,
                    output_tokens_details=OutputTokensDetails(
                        reasoning_tokens=0,
                    ),
                    total_tokens=117,
                ),
            ]
        ),
    )
    tool_response = get_model_response(
        next(responses),
        serialize_pydantic=True,
        request_headers={
            "X-Stainless-Raw-Response": "True",
        },
    )
    final_response = get_model_response(
        next(responses),
        serialize_pydantic=True,
        request_headers={
            "X-Stainless-Raw-Response": "True",
        },
    )

    llm = ChatOpenAI(
        model_name="gpt-4",
        temperature=0,
        openai_api_key="badkey",
        use_responses_api=True,
    )
    agent = create_agent(
        model=llm,
        tools=[get_word_length],
        name="word_length_agent",
    )

    # Patch the underlying HTTP transport so no real API call is made; the
    # two canned responses are returned in order.
    with patch.object(
        llm.client._client._client,
        "send",
        side_effect=[tool_response, final_response],
    ):
        with start_transaction():
            agent.invoke(
                {
                    "messages": [
                        HumanMessage(content="How many letters in the word eudca"),
                    ],
                },
            )

    tx = events[0]
    assert tx["type"] == "transaction"
    assert tx["contexts"]["trace"]["origin"] == "manual"

    chat_spans = [x for x in tx["spans"] if x["op"] == "gen_ai.chat"]
    tool_exec_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool")

    # One chat span per model round trip: the tool call and the final answer.
    assert len(chat_spans) == 2

    assert chat_spans[0]["origin"] == "auto.ai.langchain"
    assert chat_spans[1]["origin"] == "auto.ai.langchain"
    assert tool_exec_span["origin"] == "auto.ai.langchain"

    # Token usage is only available in newer versions of langchain (v0.2+)
    # where usage_metadata is supported on AIMessageChunk
    if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]:
        assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142
        assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50
        assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192

    if "gen_ai.usage.input_tokens" in chat_spans[1]["data"]:
        assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89
        assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28
        assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117

    if send_default_pii and include_prompts:
        assert "word" in tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_INPUT]

        assert "5" in chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]

        # Verify tool calls are recorded when PII is enabled
        assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get("data", {}), (
            "Tool calls should be recorded when send_default_pii=True and include_prompts=True"
        )
        tool_calls_data = chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS]
        assert isinstance(tool_calls_data, (list, str))  # Could be serialized
        # Whether serialized or a structured list, the tool name must appear;
        # checking via str() also makes an empty list fail instead of passing
        # silently.
        assert "get_word_length" in str(tool_calls_data)
    else:
        assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {})
        assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {})
        assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[1].get("data", {})
        assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get("data", {})
        assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get("data", {})
        assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get("data", {})

        # Verify tool calls are NOT recorded when PII is disabled
        assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[0].get(
            "data", {}
        ), (
            f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} "
            f"and include_prompts={include_prompts}"
        )
        assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[1].get(
            "data", {}
        ), (
            f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} "
            f"and include_prompts={include_prompts}"
        )

    # Verify that available tools are always recorded regardless of PII settings
    for chat_span in chat_spans:
        tools_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS]
        assert "get_word_length" in tools_data
397+
398+
212399
@pytest.mark.parametrize(
213400
"send_default_pii, include_prompts",
214401
[

0 commit comments

Comments
 (0)