Skip to content

Commit 871cd79

Browse files
committed
feat: add conversation history to ToolContext
1 parent be9ce37 commit 871cd79

File tree

12 files changed

+346
-138
lines changed

.agents/skills/runtime-behavior-probe/templates/python_probe.py

Lines changed: 3 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,15 @@
1212

1313
from __future__ import annotations
1414

15-
from collections import Counter, defaultdict
16-
from importlib import metadata
1715
import json
1816
import os
19-
from pathlib import Path
2017
import platform
2118
import shutil
22-
import statistics
2319
import subprocess
2420
import sys
2521
import time
26-
import uuid
22+
from importlib import metadata
23+
from pathlib import Path
2724

2825
SCENARIO = "replace-me"
2926
RUN_LABEL = "replace-me"
@@ -79,9 +76,7 @@ def emit(kind: str, **payload: object) -> None:
7976

8077

8178
def runtime_context() -> dict[str, object]:
82-
approved = {
83-
name: ("set" if os.getenv(name) else "unset") for name in APPROVED_ENV_VARS
84-
}
79+
approved = {name: ("set" if os.getenv(name) else "unset") for name in APPROVED_ENV_VARS}
8580
package_versions = {
8681
name: version
8782
for name in ("openai", "agents")
@@ -103,133 +98,3 @@ def runtime_context() -> dict[str, object]:
10398
"approved_env_vars": approved,
10499
"output_dir": str(_output_dir()) if _output_dir() else None,
105100
}
106-
107-
108-
def start_case(case_id: str, *, mode: str = MODE, note: str | None = None) -> None:
109-
emit("case_start", case_id=case_id, mode=mode, note=note)
110-
111-
112-
def record_case_result(
113-
case_id: str,
114-
observation_summary: str,
115-
result_flag: str,
116-
*,
117-
mode: str = MODE,
118-
is_warmup: bool = False,
119-
total_latency_s: float | None = None,
120-
first_token_latency_s: float | None = None,
121-
metrics: dict[str, object] | None = None,
122-
error: str | None = None,
123-
) -> None:
124-
payload: dict[str, object] = {
125-
"case_id": case_id,
126-
"mode": mode,
127-
"is_warmup": is_warmup,
128-
"observation_summary": observation_summary,
129-
"result_flag": result_flag,
130-
"metrics": metrics or {},
131-
"error": error,
132-
}
133-
if total_latency_s is not None:
134-
payload["total_latency_s"] = total_latency_s
135-
if first_token_latency_s is not None:
136-
payload["first_token_latency_s"] = first_token_latency_s
137-
RESULTS.append(payload)
138-
emit("case_result", **payload)
139-
140-
141-
def summarize_results() -> dict[str, object]:
142-
by_case: defaultdict[str, list[dict[str, object]]] = defaultdict(list)
143-
for result in RESULTS:
144-
by_case[str(result["case_id"])].append(result)
145-
146-
summary_cases: dict[str, object] = {}
147-
for case_id, items in by_case.items():
148-
measured = [item for item in items if not bool(item.get("is_warmup"))]
149-
latencies = [
150-
float(item["total_latency_s"])
151-
for item in measured
152-
if item.get("total_latency_s") is not None
153-
]
154-
first_token_latencies = [
155-
float(item["first_token_latency_s"])
156-
for item in measured
157-
if item.get("first_token_latency_s") is not None
158-
]
159-
result_flags = Counter(str(item["result_flag"]) for item in measured or items)
160-
observations = [
161-
str(item["observation_summary"]) for item in (measured or items)[:3]
162-
]
163-
summary_cases[case_id] = {
164-
"mode": str(items[-1]["mode"]),
165-
"runs": len(measured),
166-
"warmups": len(items) - len(measured),
167-
"result_flags": dict(result_flags),
168-
"median_total_latency_s": (
169-
statistics.median(latencies) if latencies else None
170-
),
171-
"mean_total_latency_s": statistics.mean(latencies) if latencies else None,
172-
"median_first_token_latency_s": (
173-
statistics.median(first_token_latencies)
174-
if first_token_latencies
175-
else None
176-
),
177-
"observations": observations,
178-
}
179-
180-
return {
181-
"scenario": SCENARIO,
182-
"run_label": RUN_LABEL,
183-
"mode": MODE,
184-
"result_count": len(RESULTS),
185-
"cases": summary_cases,
186-
"result_flags": dict(Counter(str(item["result_flag"]) for item in RESULTS)),
187-
}
188-
189-
190-
def finalize(exit_code: int) -> None:
191-
metadata_payload = {
192-
"exit_code": exit_code,
193-
"runtime_context": runtime_context(),
194-
}
195-
summary_payload = summarize_results()
196-
emit("summary", metadata=metadata_payload, summary=summary_payload)
197-
198-
output_dir = _output_dir()
199-
if not output_dir:
200-
return
201-
202-
metadata_path = output_dir / "metadata.json"
203-
results_path = output_dir / "results.json"
204-
summary_path = output_dir / "summary.json"
205-
_write_json(metadata_path, metadata_payload)
206-
_write_json(results_path, RESULTS)
207-
_write_json(summary_path, summary_payload)
208-
emit(
209-
"artifact_paths",
210-
metadata_path=str(metadata_path),
211-
results_path=str(results_path),
212-
summary_path=str(summary_path),
213-
)
214-
215-
216-
def main() -> int:
217-
case_id = os.getenv("PROBE_CASE_ID", f"case-{uuid.uuid4().hex[:8]}")
218-
emit("banner", context=runtime_context())
219-
start_case(case_id)
220-
221-
# Replace this block with the narrow runtime question you want to test.
222-
observation = "replace-me"
223-
result_flag = "expected"
224-
225-
record_case_result(
226-
case_id=case_id,
227-
observation_summary=observation,
228-
result_flag=result_flag,
229-
)
230-
finalize(exit_code=0)
231-
return 0
232-
233-
234-
if __name__ == "__main__":
235-
raise SystemExit(main())

docs/context.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ plus additional fields specific to the current tool call:
126126
- `tool_arguments` – the raw argument string passed to the tool
127127
- `tool_namespace` – the Responses namespace for the tool call, when the tool was loaded through `tool_namespace()` or another namespaced surface
128128
- `qualified_tool_name` – the tool name qualified with the namespace when one is available
129+
- `conversation_history` – a visible history snapshot available to the tool at invocation time. For local function tools in non-streaming runs, this includes the current input plus prior visible run items that can be represented as model input.
129130

130131
Use `ToolContext` when you need tool-level metadata during execution.
131132
For general context sharing between agents and tools, `RunContextWrapper` remains sufficient. Because `ToolContext` extends `RunContextWrapper`, it can also expose `.tool_input` when a nested `Agent.as_tool()` run supplied structured input.

src/agents/agent.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,7 @@ async def _run_agent_impl(context: ToolContext, input_json: str) -> Any:
602602
tool_namespace=context.tool_namespace,
603603
agent=context.agent,
604604
run_config=resolved_run_config,
605+
conversation_history=context.conversation_history,
605606
)
606607
set_agent_tool_state_scope(nested_context, tool_state_scope_id)
607608
if should_capture_tool_input:

src/agents/run_internal/run_loop.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1587,6 +1587,7 @@ async def get_new_response(
15871587
)
15881588
if isinstance(filtered.input, list):
15891589
filtered.input = deduplicate_input_items_preferring_latest(filtered.input)
1590+
context_wrapper.turn_input = list(filtered.input)
15901591

15911592
model = get_model(agent, run_config)
15921593
model_settings = agent.model_settings.resolve(run_config.model_settings)

src/agents/run_internal/tool_execution.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
RunItemBase,
5656
ToolApprovalItem,
5757
ToolCallOutputItem,
58+
TResponseInputItem,
5859
)
5960
from ..logger import logger
6061
from ..model_settings import ModelSettings
@@ -1284,13 +1285,17 @@ def __init__(
12841285
hooks: RunHooks[Any],
12851286
context_wrapper: RunContextWrapper[Any],
12861287
config: RunConfig,
1288+
conversation_history: list[TResponseInputItem] | None,
12871289
isolate_parallel_failures: bool | None,
12881290
) -> None:
12891291
self.agent = agent
12901292
self.tool_runs = tool_runs
12911293
self.hooks = hooks
12921294
self.context_wrapper = context_wrapper
12931295
self.config = config
1296+
self.conversation_history = (
1297+
list(conversation_history) if conversation_history is not None else None
1298+
)
12941299
self.isolate_parallel_failures = (
12951300
len(tool_runs) > 1 if isolate_parallel_failures is None else isolate_parallel_failures
12961301
)
@@ -1465,6 +1470,7 @@ async def _run_single_tool(
14651470
tool_namespace=tool_context_namespace,
14661471
agent=self.agent,
14671472
run_config=self.config,
1473+
conversation_history=self.conversation_history,
14681474
)
14691475
agent_hooks = self.agent.hooks
14701476
if self.config.trace_include_sensitive_data:
@@ -1797,6 +1803,7 @@ async def execute_function_tool_calls(
17971803
hooks: RunHooks[Any],
17981804
context_wrapper: RunContextWrapper[Any],
17991805
config: RunConfig,
1806+
conversation_history: list[TResponseInputItem] | None = None,
18001807
isolate_parallel_failures: bool | None = None,
18011808
) -> tuple[
18021809
list[FunctionToolResult], list[ToolInputGuardrailResult], list[ToolOutputGuardrailResult]
@@ -1808,6 +1815,7 @@ async def execute_function_tool_calls(
18081815
hooks=hooks,
18091816
context_wrapper=context_wrapper,
18101817
config=config,
1818+
conversation_history=conversation_history,
18111819
isolate_parallel_failures=isolate_parallel_failures,
18121820
).execute()
18131821

src/agents/run_internal/tool_planning.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
ToolApprovalItem,
2121
ToolCallItem,
2222
ToolCallOutputItem,
23+
TResponseInputItem,
2324
)
2425
from ..run_context import RunContextWrapper
2526
from ..tool import FunctionTool, MCPToolApprovalRequest
@@ -522,6 +523,7 @@ async def _execute_tool_plan(
522523
hooks,
523524
context_wrapper: RunContextWrapper[Any],
524525
run_config,
526+
conversation_history: list[TResponseInputItem] | None = None,
525527
parallel: bool = True,
526528
) -> tuple[
527529
list[Any],
@@ -556,6 +558,7 @@ async def _execute_tool_plan(
556558
hooks=hooks,
557559
context_wrapper=context_wrapper,
558560
config=run_config,
561+
conversation_history=conversation_history,
559562
isolate_parallel_failures=isolate_function_tool_failures,
560563
),
561564
execute_computer_actions(
@@ -598,6 +601,7 @@ async def _execute_tool_plan(
598601
hooks=hooks,
599602
context_wrapper=context_wrapper,
600603
config=run_config,
604+
conversation_history=conversation_history,
601605
isolate_parallel_failures=isolate_function_tool_failures,
602606
)
603607
computer_results = await execute_computer_actions(

src/agents/run_internal/turn_resolution.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,10 @@
8686
from ..util._approvals import evaluate_needs_approval_setting
8787
from .items import (
8888
REJECTION_MESSAGE,
89+
ReasoningItemIdPolicy,
8990
apply_patch_rejection_item,
9091
function_rejection_item,
92+
run_items_to_input_items,
9193
shell_rejection_item,
9294
)
9395
from .run_steps import (
@@ -153,6 +155,21 @@
153155
]
154156

155157

158+
def _build_function_tool_conversation_history(
159+
turn_input: Sequence[TResponseInputItem],
160+
pre_step_items: Sequence[RunItem],
161+
reasoning_item_id_policy: ReasoningItemIdPolicy | None,
162+
) -> list[TResponseInputItem]:
163+
"""Build the visible history snapshot for a local function tool invocation.
164+
165+
The snapshot is based on the actual turn input sent to the model plus prior generated
166+
items converted through the same reasoning-ID policy used by the normal model-input path.
167+
"""
168+
history = list(turn_input)
169+
history.extend(run_items_to_input_items(pre_step_items, reasoning_item_id_policy))
170+
return history
171+
172+
156173
async def _maybe_finalize_from_tool_results(
157174
*,
158175
agent: Agent[TContext],
@@ -528,6 +545,12 @@ async def execute_tools_and_side_effects(
528545
new_items=processed_response.new_items,
529546
)
530547

548+
conversation_history = _build_function_tool_conversation_history(
549+
context_wrapper.turn_input,
550+
pre_step_items,
551+
run_config.reasoning_item_id_policy,
552+
)
553+
531554
(
532555
function_results,
533556
tool_input_guardrail_results,
@@ -542,6 +565,7 @@ async def execute_tools_and_side_effects(
542565
hooks=hooks,
543566
context_wrapper=context_wrapper,
544567
run_config=run_config,
568+
conversation_history=conversation_history,
545569
)
546570
new_step_items.extend(
547571
_build_tool_result_items(
@@ -1103,6 +1127,17 @@ def _add_unmatched_pending(approval: ToolApprovalItem) -> None:
11031127
apply_patch_calls=approved_apply_patch_calls,
11041128
)
11051129

1130+
resolved_reasoning_item_id_policy = (
1131+
run_config.reasoning_item_id_policy
1132+
if run_config.reasoning_item_id_policy is not None
1133+
else (run_state._reasoning_item_id_policy if run_state is not None else None)
1134+
)
1135+
conversation_history = _build_function_tool_conversation_history(
1136+
context_wrapper.turn_input,
1137+
original_pre_step_items,
1138+
resolved_reasoning_item_id_policy,
1139+
)
1140+
11061141
(
11071142
function_results,
11081143
tool_input_guardrail_results,
@@ -1117,6 +1152,7 @@ def _add_unmatched_pending(approval: ToolApprovalItem) -> None:
11171152
hooks=hooks,
11181153
context_wrapper=context_wrapper,
11191154
run_config=run_config,
1155+
conversation_history=conversation_history,
11201156
)
11211157

11221158
for interruption in _collect_tool_interruptions(

0 commit comments

Comments (0)