From 15ef7a86081d177b9bcc4d26498ad6260dd90ee1 Mon Sep 17 00:00:00 2001 From: Yuki Huang Date: Wed, 22 Apr 2026 03:04:59 -0700 Subject: [PATCH 1/5] add log of reset_count and correct num_tool_calls Signed-off-by: Yuki Huang --- resources_servers/browsecomp_advanced_harness/app.py | 2 -- responses_api_agents/browsecomp_agent/app.py | 6 ++++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/resources_servers/browsecomp_advanced_harness/app.py b/resources_servers/browsecomp_advanced_harness/app.py index 14b97a725..643b34c33 100644 --- a/resources_servers/browsecomp_advanced_harness/app.py +++ b/resources_servers/browsecomp_advanced_harness/app.py @@ -159,7 +159,6 @@ class TavilySearchMetrics(BaseModel): class TavilySearchVerifyResponse(TavilySearchVerifyRequest, JudgeEvaluation): - num_tool_calls: int metrics: TavilySearchMetrics @@ -400,7 +399,6 @@ async def verify(self, request: Request, body: TavilySearchVerifyRequest) -> Tav return TavilySearchVerifyResponse( **body.model_dump(), **judge_evaluation.model_dump(), - num_tool_calls=sum(o.type == "function_call" for o in body.response.output), metrics=self._session_id_to_metrics[request.session[SESSION_ID_KEY]], ) diff --git a/responses_api_agents/browsecomp_agent/app.py b/responses_api_agents/browsecomp_agent/app.py index 0b1956de0..e526ed9f9 100644 --- a/responses_api_agents/browsecomp_agent/app.py +++ b/responses_api_agents/browsecomp_agent/app.py @@ -84,10 +84,12 @@ async def responses( new_outputs = [] usage = None step = 0 + num_tool_calls = 0 model_server_cookies = None # update the cookies on every model response resources_server_cookies = request.cookies # update the cookies on every resources server response reset_threshold = 0 + reset_count = 0 if self.config.max_context_tokens and self.config.context_reset_pct: reset_threshold = int(self.config.max_context_tokens * self.config.context_reset_pct) @@ -119,6 +121,7 @@ async def responses( # --- Check context reset threshold --- prompt_tokens = model_response.usage.input_tokens if model_response.usage else 0 if reset_threshold and prompt_tokens > reset_threshold: + reset_count += 1 if self.config.context_reset_keep_rounds > 0: new_outputs = self._extract_last_rounds(new_outputs) else: @@ -154,6 +157,7 @@ async def responses( # --- Execute tool calls --- for output_function_call in all_fn_calls: + num_tool_calls += 1 api_response = await self.server_client.post( server_name=self.config.resources_server.name, url_path=f"/{output_function_call.name}", @@ -222,6 +226,8 @@ async def responses( model_response.output = new_outputs model_response.usage = usage + model_response.reset_count = reset_count + model_response.num_tool_calls = num_tool_calls return model_response async def run(self, request: Request, body: BrowsecompAgentRunRequest) -> BrowsecompAgentVerifyResponse: From 340b53e314c2add790c0022b4ab93668b1d0d878 Mon Sep 17 00:00:00 2001 From: Yuki Huang Date: Wed, 22 Apr 2026 03:16:28 -0700 Subject: [PATCH 2/5] add max_reset_count Signed-off-by: Yuki Huang --- responses_api_agents/browsecomp_agent/app.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/responses_api_agents/browsecomp_agent/app.py b/responses_api_agents/browsecomp_agent/app.py index e526ed9f9..228bbb1c4 100644 --- a/responses_api_agents/browsecomp_agent/app.py +++ b/responses_api_agents/browsecomp_agent/app.py @@ -14,7 +14,7 @@ # limitations under the License. import json import re -from typing import List +from typing import List, Optional from fastapi import Request, Response from pydantic import ConfigDict, ValidationError @@ -52,6 +52,7 @@ class BrowsecompAgentConfig(BaseResponsesAPIAgentConfig): max_context_tokens: int = 196608 context_reset_pct: float = 0.3 context_reset_keep_rounds: int = 3 + max_reset_count: Optional[int] = None max_run_retries: int = 1 @@ -90,6 +91,7 @@ async def responses( reset_threshold = 0 reset_count = 0 + max_reset_count = self.config.max_reset_count if self.config.max_context_tokens and self.config.context_reset_pct: reset_threshold = int(self.config.max_context_tokens * self.config.context_reset_pct) @@ -120,7 +122,11 @@ async def responses( # --- Check context reset threshold --- prompt_tokens = model_response.usage.input_tokens if model_response.usage else 0 - if reset_threshold and prompt_tokens > reset_threshold: + if ( + reset_threshold + and prompt_tokens > reset_threshold + and (max_reset_count is None or reset_count < max_reset_count) + ): reset_count += 1 if self.config.context_reset_keep_rounds > 0: new_outputs = self._extract_last_rounds(new_outputs) From 07aa70281dc56ef6f294ae3ce36fe6089ba5a1b8 Mon Sep 17 00:00:00 2001 From: Yuki Huang Date: Wed, 22 Apr 2026 06:21:44 -0700 Subject: [PATCH 3/5] support record traj Signed-off-by: Yuki Huang --- responses_api_agents/browsecomp_agent/app.py | 50 ++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/responses_api_agents/browsecomp_agent/app.py b/responses_api_agents/browsecomp_agent/app.py index 228bbb1c4..79685a2c7 100644 --- a/responses_api_agents/browsecomp_agent/app.py +++ b/responses_api_agents/browsecomp_agent/app.py @@ -14,6 +14,7 @@ # limitations under the License. import json import re +from pathlib import Path from typing import List, Optional from fastapi import Request, Response @@ -54,6 +55,7 @@ class BrowsecompAgentConfig(BaseResponsesAPIAgentConfig): context_reset_keep_rounds: int = 3 max_reset_count: Optional[int] = None max_run_retries: int = 1 + snap_dir: Optional[str] = None class BrowsecompAgentRunRequest(BaseRunRequest): @@ -82,6 +84,12 @@ async def responses( if isinstance(body.input, str): body.input = [NeMoGymEasyInputMessage(role="user", content=body.input)] + task_index, attempt = None, None + if self.config.snap_dir: + task_index = body.metadata.pop("task_index") + attempt = body.metadata.pop("attempt") + body.metadata = body.metadata or {} + new_outputs = [] usage = None step = 0 @@ -102,6 +110,8 @@ async def responses( new_outputs = self._compact_old_tool_messages(new_outputs) new_body = body.model_copy(update={"input": body.input + new_outputs}) + if not body.metadata: + new_body = new_body.model_dump(exclude={"metadata"}, exclude_none=True) model_response = await self.server_client.post( server_name=self.config.model_server.name, @@ -128,6 +138,16 @@ async def responses( and (max_reset_count is None or reset_count < max_reset_count) ): reset_count += 1 + # record current context + if self.config.snap_dir: + self._save_snapshot( + messages=body.input + new_outputs, + task_index=task_index, + attempt=attempt, + reset_count=reset_count, + is_final=False, + ) + # reset context if self.config.context_reset_keep_rounds > 0: new_outputs = self._extract_last_rounds(new_outputs) else: @@ -226,6 +246,16 @@ async def responses( if self.config.max_steps and step >= self.config.max_steps: break + # record final context + if self.config.snap_dir: + self._save_snapshot( + messages=body.input + new_outputs, + task_index=task_index, + attempt=attempt, + reset_count=None, + is_final=True, + ) + # Propogate any extra cookies necessary for downstream verification for k, v in (*resources_server_cookies.items(), *model_server_cookies.items()): response.set_cookie(k, v) @@ -250,6 +280,12 @@ async def run(self, request: Request, body: BrowsecompAgentRunRequest) -> Browse last_verify_response = None for attempt in range(self.config.max_run_retries): + # prepare for recording + if self.config.snap_dir: + body.responses_create_params.metadata = dict(body.responses_create_params.metadata or {}) + body.responses_create_params.metadata["task_index"] = str(body._ng_task_index) + body.responses_create_params.metadata["attempt"] = str(attempt) + response = await self.server_client.post( server_name=self.config.name, url_path="/v1/responses", @@ -349,6 +385,20 @@ def _extract_last_rounds(self, new_outputs): result.extend(tool_outputs) return result + def _save_snapshot(self, messages, task_index, attempt, reset_count, is_final): + sample_dir = Path(f"{self.config.snap_dir}/sample_{task_index}") + if not sample_dir.exists(): + sample_dir.mkdir(parents=True) + + if is_final: + sample_path = f"{sample_dir}/attempt_{attempt}_final.jsonl" + else: + sample_path = f"{sample_dir}/attempt_{attempt}_reset_{reset_count}.jsonl" + + with open(sample_path, "w", encoding="utf-8") as f: + for msg in messages: + f.write(msg.model_dump_json() + "\n") + if __name__ == "__main__": BrowsecompAgent.run_webserver() From 3fce82385d58fad7e474eb279641cd3da2340e03 Mon Sep 17 00:00:00 2001 From: Yuki Huang Date: Wed, 22 Apr 2026 06:22:57 -0700 Subject: [PATCH 4/5] add param to config Signed-off-by: Yuki Huang --- benchmarks/browsecomp/config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/browsecomp/config.yaml b/benchmarks/browsecomp/config.yaml index c494f5c37..635766846 100644 --- a/benchmarks/browsecomp/config.yaml +++ b/benchmarks/browsecomp/config.yaml @@ -32,7 +32,9 @@ browsecomp_benchmark_agent: max_context_tokens: 196608 context_reset_pct: 0.3 context_reset_keep_rounds: 3 + max_reset_count: null max_run_retries: 3 + snap_dir: null resources_server: type: resources_servers name: browsecomp_benchmark_resources_server From 2297da0570cfc968e2fcd9c7e28b8b2d5fbb67e7 Mon Sep 17 00:00:00 2001 From: Yuki Huang Date: Thu, 23 Apr 2026 07:50:33 -0700 Subject: [PATCH 5/5] empty push Signed-off-by: Yuki Huang