Skip to content

Commit 871cd79

Browse files
committed
feat: add conversation history to ToolContext
1 parent be9ce37 commit 871cd79

File tree

12 files changed

+346
-138
lines changed

.agents/skills/runtime-behavior-probe/templates/python_probe.py

Lines changed: 3 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,15 @@
1212

1313
from __future__ import annotations
1414

15-
from collections import Counter, defaultdict
16-
from importlib import metadata
1715
import json
1816
import os
19-
from pathlib import Path
2017
import platform
2118
import shutil
22-
import statistics
2319
import subprocess
2420
import sys
2521
import time
26-
import uuid
22+
from importlib import metadata
23+
from pathlib import Path
2724

2825
SCENARIO = "replace-me"
2926
RUN_LABEL = "replace-me"
@@ -79,9 +76,7 @@ def emit(kind: str, **payload: object) -> None:
7976

8077

8178
def runtime_context() -> dict[str, object]:
82-
approved = {
83-
name: ("set" if os.getenv(name) else "unset") for name in APPROVED_ENV_VARS
84-
}
79+
approved = {name: ("set" if os.getenv(name) else "unset") for name in APPROVED_ENV_VARS}
8580
package_versions = {
8681
name: version
8782
for name in ("openai", "agents")
@@ -103,133 +98,3 @@ def runtime_context() -> dict[str, object]:
10398
"approved_env_vars": approved,
10499
"output_dir": str(_output_dir()) if _output_dir() else None,
105100
}
106-
107-
108-
def start_case(case_id: str, *, mode: str = MODE, note: str | None = None) -> None:
109-
emit("case_start", case_id=case_id, mode=mode, note=note)
110-
111-
112-
def record_case_result(
113-
case_id: str,
114-
observation_summary: str,
115-
result_flag: str,
116-
*,
117-
mode: str = MODE,
118-
is_warmup: bool = False,
119-
total_latency_s: float | None = None,
120-
first_token_latency_s: float | None = None,
121-
metrics: dict[str, object] | None = None,
122-
error: str | None = None,
123-
) -> None:
124-
payload: dict[str, object] = {
125-
"case_id": case_id,
126-
"mode": mode,
127-
"is_warmup": is_warmup,
128-
"observation_summary": observation_summary,
129-
"result_flag": result_flag,
130-
"metrics": metrics or {},
131-
"error": error,
132-
}
133-
if total_latency_s is not None:
134-
payload["total_latency_s"] = total_latency_s
135-
if first_token_latency_s is not None:
136-
payload["first_token_latency_s"] = first_token_latency_s
137-
RESULTS.append(payload)
138-
emit("case_result", **payload)
139-
140-
141-
def summarize_results() -> dict[str, object]:
142-
by_case: defaultdict[str, list[dict[str, object]]] = defaultdict(list)
143-
for result in RESULTS:
144-
by_case[str(result["case_id"])].append(result)
145-
146-
summary_cases: dict[str, object] = {}
147-
for case_id, items in by_case.items():
148-
measured = [item for item in items if not bool(item.get("is_warmup"))]
149-
latencies = [
150-
float(item["total_latency_s"])
151-
for item in measured
152-
if item.get("total_latency_s") is not None
153-
]
154-
first_token_latencies = [
155-
float(item["first_token_latency_s"])
156-
for item in measured
157-
if item.get("first_token_latency_s") is not None
158-
]
159-
result_flags = Counter(str(item["result_flag"]) for item in measured or items)
160-
observations = [
161-
str(item["observation_summary"]) for item in (measured or items)[:3]
162-
]
163-
summary_cases[case_id] = {
164-
"mode": str(items[-1]["mode"]),
165-
"runs": len(measured),
166-
"warmups": len(items) - len(measured),
167-
"result_flags": dict(result_flags),
168-
"median_total_latency_s": (
169-
statistics.median(latencies) if latencies else None
170-
),
171-
"mean_total_latency_s": statistics.mean(latencies) if latencies else None,
172-
"median_first_token_latency_s": (
173-
statistics.median(first_token_latencies)
174-
if first_token_latencies
175-
else None
176-
),
177-
"observations": observations,
178-
}
179-
180-
return {
181-
"scenario": SCENARIO,
182-
"run_label": RUN_LABEL,
183-
"mode": MODE,
184-
"result_count": len(RESULTS),
185-
"cases": summary_cases,
186-
"result_flags": dict(Counter(str(item["result_flag"]) for item in RESULTS)),
187-
}
188-
189-
190-
def finalize(exit_code: int) -> None:
191-
metadata_payload = {
192-
"exit_code": exit_code,
193-
"runtime_context": runtime_context(),
194-
}
195-
summary_payload = summarize_results()
196-
emit("summary", metadata=metadata_payload, summary=summary_payload)
197-
198-
output_dir = _output_dir()
199-
if not output_dir:
200-
return
201-
202-
metadata_path = output_dir / "metadata.json"
203-
results_path = output_dir / "results.json"
204-
summary_path = output_dir / "summary.json"
205-
_write_json(metadata_path, metadata_payload)
206-
_write_json(results_path, RESULTS)
207-
_write_json(summary_path, summary_payload)
208-
emit(
209-
"artifact_paths",
210-
metadata_path=str(metadata_path),
211-
results_path=str(results_path),
212-
summary_path=str(summary_path),
213-
)
214-
215-
216-
def main() -> int:
217-
case_id = os.getenv("PROBE_CASE_ID", f"case-{uuid.uuid4().hex[:8]}")
218-
emit("banner", context=runtime_context())
219-
start_case(case_id)
220-
221-
# Replace this block with the narrow runtime question you want to test.
222-
observation = "replace-me"
223-
result_flag = "expected"
224-
225-
record_case_result(
226-
case_id=case_id,
227-
observation_summary=observation,
228-
result_flag=result_flag,
229-
)
230-
finalize(exit_code=0)
231-
return 0
232-
233-
234-
if __name__ == "__main__":
235-
raise SystemExit(main())

docs/context.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ plus additional fields specific to the current tool call:
126126
- `tool_arguments` – the raw argument string passed to the tool
127127
- `tool_namespace` – the Responses namespace for the tool call, when the tool was loaded through `tool_namespace()` or another namespaced surface
128128
- `qualified_tool_name` – the tool name qualified with the namespace when one is available
129+
- `conversation_history` – a visible history snapshot available to the tool at invocation time. For local function tools in non-streaming runs, this includes the current input plus prior visible run items that can be represented as model input.
129130

130131
Use `ToolContext` when you need tool-level metadata during execution.
131132
For general context sharing between agents and tools, `RunContextWrapper` remains sufficient. Because `ToolContext` extends `RunContextWrapper`, it can also expose `.tool_input` when a nested `Agent.as_tool()` run supplied structured input.

src/agents/agent.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,7 @@ async def _run_agent_impl(context: ToolContext, input_json: str) -> Any:
602602
tool_namespace=context.tool_namespace,
603603
agent=context.agent,
604604
run_config=resolved_run_config,
605+
conversation_history=context.conversation_history,
605606
)
606607
set_agent_tool_state_scope(nested_context, tool_state_scope_id)
607608
if should_capture_tool_input:

src/agents/run_internal/run_loop.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1587,6 +1587,7 @@ async def get_new_response(
15871587
)
15881588
if isinstance(filtered.input, list):
15891589
filtered.input = deduplicate_input_items_preferring_latest(filtered.input)
1590+
context_wrapper.turn_input = list(filtered.input)
15901591

15911592
model = get_model(agent, run_config)
15921593
model_settings = agent.model_settings.resolve(run_config.model_settings)

src/agents/run_internal/tool_execution.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
RunItemBase,
5656
ToolApprovalItem,
5757
ToolCallOutputItem,
58+
TResponseInputItem,
5859
)
5960
from ..logger import logger
6061
from ..model_settings import ModelSettings
@@ -1284,13 +1285,17 @@ def __init__(
12841285
hooks: RunHooks[Any],
12851286
context_wrapper: RunContextWrapper[Any],
12861287
config: RunConfig,
1288+
conversation_history: list[TResponseInputItem] | None,
12871289
isolate_parallel_failures: bool | None,
12881290
) -> None:
12891291
self.agent = agent
12901292
self.tool_runs = tool_runs
12911293
self.hooks = hooks
12921294
self.context_wrapper = context_wrapper
12931295
self.config = config
1296+
self.conversation_history = (
1297+
list(conversation_history) if conversation_history is not None else None
1298+
)
12941299
self.isolate_parallel_failures = (
12951300
len(tool_runs) > 1 if isolate_parallel_failures is None else isolate_parallel_failures
12961301
)
@@ -1465,6 +1470,7 @@ async def _run_single_tool(
14651470
tool_namespace=tool_context_namespace,
14661471
agent=self.agent,
14671472
run_config=self.config,
1473+
conversation_history=self.conversation_history,
14681474
)
14691475
agent_hooks = self.agent.hooks
14701476
if self.config.trace_include_sensitive_data:
@@ -1797,6 +1803,7 @@ async def execute_function_tool_calls(
17971803
hooks: RunHooks[Any],
17981804
context_wrapper: RunContextWrapper[Any],
17991805
config: RunConfig,
1806+
conversation_history: list[TResponseInputItem] | None = None,
18001807
isolate_parallel_failures: bool | None = None,
18011808
) -> tuple[
18021809
list[FunctionToolResult], list[ToolInputGuardrailResult], list[ToolOutputGuardrailResult]
@@ -1808,6 +1815,7 @@ async def execute_function_tool_calls(
18081815
hooks=hooks,
18091816
context_wrapper=context_wrapper,
18101817
config=config,
1818+
conversation_history=conversation_history,
18111819
isolate_parallel_failures=isolate_parallel_failures,
18121820
).execute()
18131821

src/agents/run_internal/tool_planning.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
ToolApprovalItem,
2121
ToolCallItem,
2222
ToolCallOutputItem,
23+
TResponseInputItem,
2324
)
2425
from ..run_context import RunContextWrapper
2526
from ..tool import FunctionTool, MCPToolApprovalRequest
@@ -522,6 +523,7 @@ async def _execute_tool_plan(
522523
hooks,
523524
context_wrapper: RunContextWrapper[Any],
524525
run_config,
526+
conversation_history: list[TResponseInputItem] | None = None,
525527
parallel: bool = True,
526528
) -> tuple[
527529
list[Any],
@@ -556,6 +558,7 @@ async def _execute_tool_plan(
556558
hooks=hooks,
557559
context_wrapper=context_wrapper,
558560
config=run_config,
561+
conversation_history=conversation_history,
559562
isolate_parallel_failures=isolate_function_tool_failures,
560563
),
561564
execute_computer_actions(
@@ -598,6 +601,7 @@ async def _execute_tool_plan(
598601
hooks=hooks,
599602
context_wrapper=context_wrapper,
600603
config=run_config,
604+
conversation_history=conversation_history,
601605
isolate_parallel_failures=isolate_function_tool_failures,
602606
)
603607
computer_results = await execute_computer_actions(

src/agents/run_internal/turn_resolution.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,10 @@
8686
from ..util._approvals import evaluate_needs_approval_setting
8787
from .items import (
8888
REJECTION_MESSAGE,
89+
ReasoningItemIdPolicy,
8990
apply_patch_rejection_item,
9091
function_rejection_item,
92+
run_items_to_input_items,
9193
shell_rejection_item,
9294
)
9395
from .run_steps import (
@@ -153,6 +155,21 @@
153155
]
154156

155157

158+
def _build_function_tool_conversation_history(
159+
turn_input: Sequence[TResponseInputItem],
160+
pre_step_items: Sequence[RunItem],
161+
reasoning_item_id_policy: ReasoningItemIdPolicy | None,
162+
) -> list[TResponseInputItem]:
163+
"""Build the visible history snapshot for a local function tool invocation.
164+
165+
The snapshot is based on the actual turn input sent to the model plus prior generated
166+
items converted through the same reasoning-ID policy used by the normal model-input path.
167+
"""
168+
history = list(turn_input)
169+
history.extend(run_items_to_input_items(pre_step_items, reasoning_item_id_policy))
170+
return history
171+
172+
156173
async def _maybe_finalize_from_tool_results(
157174
*,
158175
agent: Agent[TContext],
@@ -528,6 +545,12 @@ async def execute_tools_and_side_effects(
528545
new_items=processed_response.new_items,
529546
)
530547

548+
conversation_history = _build_function_tool_conversation_history(
549+
context_wrapper.turn_input,
550+
pre_step_items,
551+
run_config.reasoning_item_id_policy,
552+
)
553+
531554
(
532555
function_results,
533556
tool_input_guardrail_results,
@@ -542,6 +565,7 @@ async def execute_tools_and_side_effects(
542565
hooks=hooks,
543566
context_wrapper=context_wrapper,
544567
run_config=run_config,
568+
conversation_history=conversation_history,
545569
)
546570
new_step_items.extend(
547571
_build_tool_result_items(
@@ -1103,6 +1127,17 @@ def _add_unmatched_pending(approval: ToolApprovalItem) -> None:
11031127
apply_patch_calls=approved_apply_patch_calls,
11041128
)
11051129

1130+
resolved_reasoning_item_id_policy = (
1131+
run_config.reasoning_item_id_policy
1132+
if run_config.reasoning_item_id_policy is not None
1133+
else (run_state._reasoning_item_id_policy if run_state is not None else None)
1134+
)
1135+
conversation_history = _build_function_tool_conversation_history(
1136+
context_wrapper.turn_input,
1137+
original_pre_step_items,
1138+
resolved_reasoning_item_id_policy,
1139+
)
1140+
11061141
(
11071142
function_results,
11081143
tool_input_guardrail_results,
@@ -1117,6 +1152,7 @@ def _add_unmatched_pending(approval: ToolApprovalItem) -> None:
11171152
hooks=hooks,
11181153
context_wrapper=context_wrapper,
11191154
run_config=run_config,
1155+
conversation_history=conversation_history,
11201156
)
11211157

11221158
for interruption in _collect_tool_interruptions(

0 commit comments

Comments (0)