From 15ef7a86081d177b9bcc4d26498ad6260dd90ee1 Mon Sep 17 00:00:00 2001
From: Yuki Huang <yukih@nvidia.com>
Date: Wed, 22 Apr 2026 03:04:59 -0700
Subject: [PATCH 1/5] add log of reset_count and correct num_tool_calls

Signed-off-by: Yuki Huang <yukih@nvidia.com>
---
 resources_servers/browsecomp_advanced_harness/app.py | 2 --
 responses_api_agents/browsecomp_agent/app.py         | 6 ++++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/resources_servers/browsecomp_advanced_harness/app.py b/resources_servers/browsecomp_advanced_harness/app.py
index 14b97a725..643b34c33 100644
--- a/resources_servers/browsecomp_advanced_harness/app.py
+++ b/resources_servers/browsecomp_advanced_harness/app.py
@@ -159,7 +159,6 @@ class TavilySearchMetrics(BaseModel):
 
 
 class TavilySearchVerifyResponse(TavilySearchVerifyRequest, JudgeEvaluation):
-    num_tool_calls: int
     metrics: TavilySearchMetrics
 
 
@@ -400,7 +399,6 @@ async def verify(self, request: Request, body: TavilySearchVerifyRequest) -> Tav
         return TavilySearchVerifyResponse(
             **body.model_dump(),
             **judge_evaluation.model_dump(),
-            num_tool_calls=sum(o.type == "function_call" for o in body.response.output),
             metrics=self._session_id_to_metrics[request.session[SESSION_ID_KEY]],
         )
 
diff --git a/responses_api_agents/browsecomp_agent/app.py b/responses_api_agents/browsecomp_agent/app.py
index 0b1956de0..e526ed9f9 100644
--- a/responses_api_agents/browsecomp_agent/app.py
+++ b/responses_api_agents/browsecomp_agent/app.py
@@ -84,10 +84,12 @@ async def responses(
         new_outputs = []
         usage = None
         step = 0
+        num_tool_calls = 0
         model_server_cookies = None  # update the cookies on every model response
         resources_server_cookies = request.cookies  # update the cookies on every resources server response
 
         reset_threshold = 0
+        reset_count = 0
         if self.config.max_context_tokens and self.config.context_reset_pct:
             reset_threshold = int(self.config.max_context_tokens * self.config.context_reset_pct)
 
@@ -119,6 +121,7 @@ async def responses(
             # --- Check context reset threshold ---
             prompt_tokens = model_response.usage.input_tokens if model_response.usage else 0
             if reset_threshold and prompt_tokens > reset_threshold:
+                reset_count += 1
                 if self.config.context_reset_keep_rounds > 0:
                     new_outputs = self._extract_last_rounds(new_outputs)
                 else:
@@ -154,6 +157,7 @@ async def responses(
 
             # --- Execute tool calls ---
             for output_function_call in all_fn_calls:
+                num_tool_calls += 1
                 api_response = await self.server_client.post(
                     server_name=self.config.resources_server.name,
                     url_path=f"/{output_function_call.name}",
@@ -222,6 +226,8 @@ async def responses(
 
         model_response.output = new_outputs
         model_response.usage = usage
+        model_response.reset_count = reset_count
+        model_response.num_tool_calls = num_tool_calls
         return model_response
 
     async def run(self, request: Request, body: BrowsecompAgentRunRequest) -> BrowsecompAgentVerifyResponse:

From 340b53e314c2add790c0022b4ab93668b1d0d878 Mon Sep 17 00:00:00 2001
From: Yuki Huang <yukih@nvidia.com>
Date: Wed, 22 Apr 2026 03:16:28 -0700
Subject: [PATCH 2/5] add max_reset_count

Signed-off-by: Yuki Huang <yukih@nvidia.com>
---
 responses_api_agents/browsecomp_agent/app.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/responses_api_agents/browsecomp_agent/app.py b/responses_api_agents/browsecomp_agent/app.py
index e526ed9f9..228bbb1c4 100644
--- a/responses_api_agents/browsecomp_agent/app.py
+++ b/responses_api_agents/browsecomp_agent/app.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 import json
 import re
-from typing import List
+from typing import List, Optional
 
 from fastapi import Request, Response
 from pydantic import ConfigDict, ValidationError
@@ -52,6 +52,7 @@ class BrowsecompAgentConfig(BaseResponsesAPIAgentConfig):
     max_context_tokens: int = 196608
     context_reset_pct: float = 0.3
     context_reset_keep_rounds: int = 3
+    max_reset_count: Optional[int] = None
     max_run_retries: int = 1
 
 
@@ -90,6 +91,7 @@ async def responses(
 
         reset_threshold = 0
         reset_count = 0
+        max_reset_count = self.config.max_reset_count
         if self.config.max_context_tokens and self.config.context_reset_pct:
             reset_threshold = int(self.config.max_context_tokens * self.config.context_reset_pct)
 
@@ -120,7 +122,11 @@ async def responses(
 
             # --- Check context reset threshold ---
             prompt_tokens = model_response.usage.input_tokens if model_response.usage else 0
-            if reset_threshold and prompt_tokens > reset_threshold:
+            if (
+                reset_threshold
+                and prompt_tokens > reset_threshold
+                and (max_reset_count is None or reset_count < max_reset_count)
+            ):
                 reset_count += 1
                 if self.config.context_reset_keep_rounds > 0:
                     new_outputs = self._extract_last_rounds(new_outputs)

From 07aa70281dc56ef6f294ae3ce36fe6089ba5a1b8 Mon Sep 17 00:00:00 2001
From: Yuki Huang <yukih@nvidia.com>
Date: Wed, 22 Apr 2026 06:21:44 -0700
Subject: [PATCH 3/5] support recording trajectory snapshots

Signed-off-by: Yuki Huang <yukih@nvidia.com>
---
 responses_api_agents/browsecomp_agent/app.py | 50 ++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/responses_api_agents/browsecomp_agent/app.py b/responses_api_agents/browsecomp_agent/app.py
index 228bbb1c4..79685a2c7 100644
--- a/responses_api_agents/browsecomp_agent/app.py
+++ b/responses_api_agents/browsecomp_agent/app.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 import json
 import re
+from pathlib import Path
 from typing import List, Optional
 
 from fastapi import Request, Response
@@ -54,6 +55,7 @@ class BrowsecompAgentConfig(BaseResponsesAPIAgentConfig):
     context_reset_keep_rounds: int = 3
     max_reset_count: Optional[int] = None
     max_run_retries: int = 1
+    snap_dir: Optional[str] = None
 
 
 class BrowsecompAgentRunRequest(BaseRunRequest):
@@ -82,6 +84,12 @@ async def responses(
         if isinstance(body.input, str):
             body.input = [NeMoGymEasyInputMessage(role="user", content=body.input)]
 
+        task_index, attempt = None, None
+        if self.config.snap_dir:
+            task_index = body.metadata.pop("task_index")
+            attempt = body.metadata.pop("attempt")
+            body.metadata = body.metadata or {}
+
         new_outputs = []
         usage = None
         step = 0
@@ -102,6 +110,8 @@ async def responses(
                 new_outputs = self._compact_old_tool_messages(new_outputs)
 
             new_body = body.model_copy(update={"input": body.input + new_outputs})
+            if not body.metadata:
+                new_body = new_body.model_dump(exclude={"metadata"}, exclude_none=True)
 
             model_response = await self.server_client.post(
                 server_name=self.config.model_server.name,
@@ -128,6 +138,16 @@ async def responses(
                 and (max_reset_count is None or reset_count < max_reset_count)
             ):
                 reset_count += 1
+                # record current context
+                if self.config.snap_dir:
+                    self._save_snapshot(
+                        messages=body.input + new_outputs,
+                        task_index=task_index,
+                        attempt=attempt,
+                        reset_count=reset_count,
+                        is_final=False,
+                    )
+                # reset context
                 if self.config.context_reset_keep_rounds > 0:
                     new_outputs = self._extract_last_rounds(new_outputs)
                 else:
@@ -226,6 +246,16 @@ async def responses(
             if self.config.max_steps and step >= self.config.max_steps:
                 break
 
+        # record final context
+        if self.config.snap_dir:
+            self._save_snapshot(
+                messages=body.input + new_outputs,
+                task_index=task_index,
+                attempt=attempt,
+                reset_count=None,
+                is_final=True,
+            )
+
         # Propogate any extra cookies necessary for downstream verification
         for k, v in (*resources_server_cookies.items(), *model_server_cookies.items()):
             response.set_cookie(k, v)
@@ -250,6 +280,12 @@ async def run(self, request: Request, body: BrowsecompAgentRunRequest) -> Browse
 
         last_verify_response = None
         for attempt in range(self.config.max_run_retries):
+            # prepare for recording
+            if self.config.snap_dir:
+                body.responses_create_params.metadata = dict(body.responses_create_params.metadata or {})
+                body.responses_create_params.metadata["task_index"] = str(body._ng_task_index)
+                body.responses_create_params.metadata["attempt"] = str(attempt)
+
             response = await self.server_client.post(
                 server_name=self.config.name,
                 url_path="/v1/responses",
@@ -349,6 +385,20 @@ def _extract_last_rounds(self, new_outputs):
             result.extend(tool_outputs)
         return result
 
+    def _save_snapshot(self, messages, task_index, attempt, reset_count, is_final):
+        """Dump `messages` as JSONL under `<snap_dir>/sample_<task_index>/`, one message per line."""
+        sample_dir = Path(self.config.snap_dir) / f"sample_{task_index}"
+        # exist_ok=True replaces the exists()-then-mkdir check, which fails if the directory appears in between.
+        sample_dir.mkdir(parents=True, exist_ok=True)
+
+        # Final and per-reset snapshots get distinct file names, so neither overwrites the other.
+        suffix = "final" if is_final else f"reset_{reset_count}"
+        sample_path = sample_dir / f"attempt_{attempt}_{suffix}.jsonl"
+
+        with open(sample_path, "w", encoding="utf-8") as f:
+            for msg in messages:
+                f.write(msg.model_dump_json() + "\n")
+
 
 if __name__ == "__main__":
     BrowsecompAgent.run_webserver()

From 3fce82385d58fad7e474eb279641cd3da2340e03 Mon Sep 17 00:00:00 2001
From: Yuki Huang <yukih@nvidia.com>
Date: Wed, 22 Apr 2026 06:22:57 -0700
Subject: [PATCH 4/5] add param to config

Signed-off-by: Yuki Huang <yukih@nvidia.com>
---
 benchmarks/browsecomp/config.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/benchmarks/browsecomp/config.yaml b/benchmarks/browsecomp/config.yaml
index c494f5c37..635766846 100644
--- a/benchmarks/browsecomp/config.yaml
+++ b/benchmarks/browsecomp/config.yaml
@@ -32,7 +32,9 @@ browsecomp_benchmark_agent:
       max_context_tokens: 196608
       context_reset_pct: 0.3
       context_reset_keep_rounds: 3
+      max_reset_count: null
       max_run_retries: 3
+      snap_dir: null
       resources_server:
         type: resources_servers
         name: browsecomp_benchmark_resources_server

From 2297da0570cfc968e2fcd9c7e28b8b2d5fbb67e7 Mon Sep 17 00:00:00 2001
From: Yuki Huang <yukih@nvidia.com>
Date: Thu, 23 Apr 2026 07:50:33 -0700
Subject: [PATCH 5/5] empty push

Signed-off-by: Yuki Huang <yukih@nvidia.com>