feat: add support for sending prior assistant reasoning content in OpenHands requests

potatoQi · potatoQi · commit 0bf4d47ccd51 · 2026-06-01T15:12:50.000+08:00
diff --git a/config_example.toml b/config_example.toml
@@ -56,6 +56,8 @@ LLM_BASE_URL = ""
 OPENHANDS_VERSION = "0.62.0"
 # Optional: Reasoning effort for OpenAI o-series models
 LLM_REASONING_EFFORT = ""
+# Optional: Include prior assistant reasoning_content in subsequent OpenHands LLM requests
+LLM_SEND_REASONING_CONTENT = false
 # Azure API Version (required for Azure models only)
 LLM_API_VERSION = ""
 # Optional: OpenHands agent max iterations (step limit). Upstream default is 500.
diff --git a/docs/config.md b/docs/config.md
@@ -75,6 +75,7 @@ SAVE_COMPLETIONS = false      # Optional: whether to save LLM completions (true/
 INFER_LOG_RENDER_MODE = "compact" # Optional: compact|full for infer.log rendering
 
 LLM_REASONING_EFFORT = ""     # Optional: Reasoning effort for OpenAI o-series models
+LLM_SEND_REASONING_CONTENT = false # Optional: true to send prior assistant reasoning_content in history
 OPENHANDS_MAX_ITERATIONS = "" # Optional: OpenHands agent max iterations (step limit). Upstream default is 500.
 
 ```
diff --git a/docs/infer_cli_arg.md b/docs/infer_cli_arg.md
@@ -131,6 +131,11 @@ flags can override metadata (see the argument list below).
   Force native tool calling (`LLM_NATIVE_TOOL_CALLING=true`).  
   Resume mode: ignored (uses metadata).
 
+- `--send-reasoning-content`  
+  Send prior assistant `reasoning_content` back to the model in subsequent OpenHands requests.  
+  Useful for thinking models whose chat template supports reasoning history.  
+  Resume mode: ignored (uses metadata).
+
 - `--max-iters`  
   Maximum iterations for OpenHands (`OPENHANDS_MAX_ITERATIONS`).  
   Default: no override (OpenHands default applies).  
diff --git a/featurebench/infer/agents/openhands.py b/featurebench/infer/agents/openhands.py
@@ -39,6 +39,77 @@ def _env(name: str) -> str | None:
     return str(value).strip()
 
 
+def _install_send_reasoning_content_override(model: str) -> None:
+    """Force OpenHands SDK to preserve/send reasoning content for this model (best-effort; errors only warn)."""
+    try:
+        from openhands.sdk.llm.utils import model_features
+        # Register both the full model name and, if it contains "/", the part after the first "/".
+        tokens = [model]
+        if "/" in model:
+            tokens.append(model.split("/", 1)[-1])
+        for token in tokens:
+            if token and token not in model_features.SEND_REASONING_CONTENT_MODELS:
+                model_features.SEND_REASONING_CONTENT_MODELS.append(token)
+        # Clear get_features' cache when it exposes one; presumably it is memoized per model -- TODO confirm.
+        cache_clear = getattr(model_features.get_features, "cache_clear", None)
+        if cache_clear:
+            cache_clear()
+    except Exception as exc:
+        print(
+            f"Warning: failed to enable reasoning-content send override: {exc}",
+            file=sys.stderr,
+        )
+    # Independent second step: expose alternate reasoning fields as reasoning_content on incoming messages.
+    try:
+        from openhands.sdk.llm.message import Message
+
+        if getattr(Message, "_featurebench_reasoning_alias_patch", False):
+            return  # marker already set: the wrapper was installed earlier in this process
+        # __func__ unwraps the bound classmethod so the original can be re-invoked with an explicit cls.
+        original = Message.from_llm_chat_message.__func__
+        # Read-through wrapper: supplies reasoning_content itself, delegates every other attribute lookup.
+        class _ReasoningContentProxy:
+            def __init__(self, wrapped: Any, reasoning_content: str):
+                self._wrapped = wrapped
+                self.reasoning_content = reasoning_content
+
+            def __getattr__(self, name: str) -> Any:
+                return getattr(self._wrapped, name)
+        # Returns the first truthy value among: .reasoning_content, .reasoning, provider_specific_fields; else None.
+        def _extract_reasoning_content(message: Any) -> str | None:
+            reasoning = getattr(message, "reasoning_content", None)
+            if reasoning:
+                return str(reasoning)
+
+            reasoning = getattr(message, "reasoning", None)
+            if reasoning:
+                return str(reasoning)
+
+            provider_fields = getattr(message, "provider_specific_fields", None)
+            if isinstance(provider_fields, dict):
+                for key in ("reasoning_content", "reasoning"):
+                    reasoning = provider_fields.get(key)
+                    if reasoning:
+                        return str(reasoning)
+
+            return None
+        # Wrap only when reasoning_content is None; any existing (non-None) value is passed through untouched.
+        def patched(cls: type[Message], message: Any) -> Message:
+            if getattr(message, "reasoning_content", None) is None:
+                reasoning_content = _extract_reasoning_content(message)
+                if reasoning_content:
+                    message = _ReasoningContentProxy(message, reasoning_content)
+            return original(cls, message)
+        # Install the wrapper, then set the marker so repeated calls do not stack wrappers.
+        Message.from_llm_chat_message = classmethod(patched)
+        setattr(Message, "_featurebench_reasoning_alias_patch", True)
+    except Exception as exc:
+        print(
+            f"Warning: failed to install reasoning field alias patch: {exc}",
+            file=sys.stderr,
+        )
+
+
 def _event_data(event: Any) -> dict[str, Any]:
     try:
         return event.model_dump(mode="json", exclude_none=True)
@@ -128,6 +199,9 @@ def _build_llm() -> Any:
     if not model:
         raise RuntimeError("LLM_MODEL is required for OpenHands SDK runner.")
 
+    if _truthy(_env("LLM_SEND_REASONING_CONTENT")):
+        _install_send_reasoning_content_override(model)
+
     kwargs: dict[str, Any] = {"model": model}
 
     api_key = _env("LLM_API_KEY")
@@ -287,18 +361,8 @@ def install_script(self) -> str:
 
 export PIP_CACHE_DIR="$CACHE_ROOT/pip"
 export UV_CACHE_DIR="$CACHE_ROOT/uv"
-
-# If a local uv Python mirror exists and is non-empty, use it. Otherwise, let uv download Python from the default upstream sources.
 UV_PYTHON_MIRROR_DIR="$CACHE_ROOT/uv/python-mirror"
-if [ -z "${{UV_PYTHON_INSTALL_MIRROR:-}}" ]; then
-    if [ -d "$UV_PYTHON_MIRROR_DIR" ] && [ "$(ls -A "$UV_PYTHON_MIRROR_DIR" 2>/dev/null)" ]; then
-        export UV_PYTHON_INSTALL_MIRROR="file://$UV_PYTHON_MIRROR_DIR"
-        echo "Using local uv python mirror: $UV_PYTHON_INSTALL_MIRROR"
-    else
-        unset UV_PYTHON_INSTALL_MIRROR
-        echo "Local uv python mirror is empty; using upstream python downloads"
-    fi
-fi
+PYTHON_INSTALL_MIRROR="${{UV_PYTHON_INSTALL_MIRROR:-https://ghfast.top/https://github.com/astral-sh/python-build-standalone/releases/download}}"
 
 UV_DIR="/opt/featurebench/uv"
 UV_BIN_PRIMARY="$UV_DIR/bin/uv"
@@ -338,8 +402,8 @@ def install_script(self) -> str:
 
 # Configure uv index mirror (TUNA)
 mkdir -p ~/.config/uv
-cat > ~/.config/uv/uv.toml <<'EOF'
-python-install-mirror = "https://ghfast.top/https://github.com/astral-sh/python-build-standalone/releases/download"
+cat > ~/.config/uv/uv.toml <<EOF
+python-install-mirror = "$PYTHON_INSTALL_MIRROR"
 [[index]]
 url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/"
 default = true
@@ -356,7 +420,54 @@ def install_script(self) -> str:
     "$UV_BIN" pip install --index-url "$FALLBACK_INDEX_URL" "$@"
 }}
 
-# Install Python via uv (downloads cached via UV_CACHE_DIR)
+cache_uv_python_download() {{
+    if [[ "$PYTHON_INSTALL_MIRROR" == file://* ]]; then
+        export UV_PYTHON_INSTALL_MIRROR="$PYTHON_INSTALL_MIRROR"
+        echo "Using configured uv python mirror: $UV_PYTHON_INSTALL_MIRROR"
+        return 0
+    fi
+
+    local download_url rel_path target_path part_path
+    download_url="$("$UV_BIN" python list "$PY_VERSION" --only-downloads --show-urls | awk 'NR == 1 {{print $2}}')"
+    if [ -z "$download_url" ] || [[ "$download_url" == "<"* ]]; then
+        echo "Unable to resolve uv Python download URL for $PY_VERSION" >&2
+        return 1
+    fi
+    if [[ "$download_url" != "$PYTHON_INSTALL_MIRROR/"* ]]; then
+        echo "Resolved uv Python URL does not use configured mirror: $download_url" >&2
+        return 1
+    fi
+
+    rel_path="${{download_url#"$PYTHON_INSTALL_MIRROR"/}}"
+    rel_path="${{rel_path//%2B/+}}"
+    rel_path="${{rel_path//%2b/+}}"
+    target_path="$UV_PYTHON_MIRROR_DIR/$rel_path"
+    part_path="$target_path.part"
+
+    mkdir -p "$(dirname "$target_path")"
+    if [ -s "$target_path" ] && tar -tzf "$target_path" >/dev/null 2>&1; then
+        echo "Using cached uv Python archive: $target_path"
+    else
+        if [ -s "$target_path" ]; then
+            echo "Cached uv Python archive is invalid; redownloading: $target_path" >&2
+            rm -f "$target_path"
+        fi
+        echo "Caching uv Python archive: $download_url -> $target_path"
+        curl -fL --retry 3 --retry-delay 2 --connect-timeout 20 -C - -o "$part_path" "$download_url"
+        if ! tar -tzf "$part_path" >/dev/null 2>&1; then
+            echo "Downloaded uv Python archive is invalid: $part_path" >&2
+            rm -f "$part_path"
+            return 1
+        fi
+        mv "$part_path" "$target_path"
+    fi
+
+    export UV_PYTHON_INSTALL_MIRROR="file://$UV_PYTHON_MIRROR_DIR"
+    echo "Using local uv python mirror: $UV_PYTHON_INSTALL_MIRROR"
+}}
+
+# Install Python via uv. The archive is cached in a local mirror first so future containers reuse it.
+cache_uv_python_download
 $UV_BIN python install $PY_VERSION
 
 # Create venv (container-local)
@@ -402,7 +513,13 @@ def get_run_command(self, instruction: str) -> str:
             "if [ ! -x /opt/openhands-venv/bin/python ]; then "
             "echo '/opt/openhands-venv/bin/python not found' >&2; exit 127; "
             "fi; "
-            "if /opt/openhands-venv/bin/python -c "
+            "if [[ \"${LLM_SEND_REASONING_CONTENT,,}\" =~ ^(1|true|yes|on)$ ]] && "
+            "/opt/openhands-venv/bin/python -c "
+            "\"import importlib.util, sys; "
+            "sys.exit(0 if importlib.util.find_spec('openhands.sdk') else 1)\"; "
+            "then "
+            f"/opt/openhands-venv/bin/python /agent-logs/openhands-sdk-runner.py --task-file {task_file}; "
+            "elif /opt/openhands-venv/bin/python -c "
             "\"import importlib.util, sys; "
             "sys.exit(0 if importlib.util.find_spec('openhands.core.main') else 1)\"; "
             "then "
@@ -445,6 +562,8 @@ def get_env_setup_script(self) -> str:
             "LLM_REASONING_EFFORT": self.env_vars.get("LLM_REASONING_EFFORT"),
             # Force native tool calling (OpenHands LLMConfig.native_tool_calling via LLM_ env mapping)
             "LLM_NATIVE_TOOL_CALLING": self.env_vars.get("LLM_NATIVE_TOOL_CALLING"),
+            # Force OpenHands SDK to send prior assistant reasoning_content in history.
+            "LLM_SEND_REASONING_CONTENT": self.env_vars.get("LLM_SEND_REASONING_CONTENT"),
             # Disable features not needed for FeatureBench
             "AGENT_ENABLE_PROMPT_EXTENSIONS": "false",
             "AGENT_ENABLE_BROWSING": "false",
diff --git a/featurebench/infer/models.py b/featurebench/infer/models.py
@@ -170,6 +170,8 @@ class InferConfig:
     white_box: bool = False
     # If True, force OpenHands to use native tool calling (LLM_NATIVE_TOOL_CALLING=true).
     force_native_tool_calling: bool = False
+    # If True, include prior assistant reasoning content in subsequent OpenHands LLM requests.
+    send_reasoning_content: bool = False
     # Optional task IDs to force rerun even if completed.
     force_rerun_ids: Optional[List[str]] = None
     # If True, treat prior TIMEOUT attempts as completed when resuming (skip reruns).
@@ -199,6 +201,7 @@ def to_dict(self) -> Dict[str, Any]:
             "without_interface_descriptions": self.without_interface_descriptions,
             "white_box": self.white_box,
             "force_native_tool_calling": self.force_native_tool_calling,
+            "send_reasoning_content": self.send_reasoning_content,
             "force_rerun_ids": self.force_rerun_ids,
             "force_timeout": self.force_timeout,
             "api_key": self.api_key,
@@ -260,6 +263,7 @@ class RunMetadata:
     without_interface_descriptions: bool = False
     white_box: bool = False
     force_native_tool_calling: bool = False
+    send_reasoning_content: bool = False
     force_timeout: bool = False
     api_key: Optional[str] = None
     base_url: Optional[str] = None
@@ -289,6 +293,7 @@ def to_dict(self) -> Dict[str, Any]:
             "without_interface_descriptions": self.without_interface_descriptions,
             "white_box": self.white_box,
             "force_native_tool_calling": self.force_native_tool_calling,
+            "send_reasoning_content": self.send_reasoning_content,
             "force_timeout": self.force_timeout,
             "api_key": self.api_key,
             "base_url": self.base_url,
diff --git a/featurebench/infer/run_infer.py b/featurebench/infer/run_infer.py
@@ -338,6 +338,9 @@ def _apply_override(flag_name: str, value: Optional[str], key_map: Dict[str, str
                 # Avoid inheriting config/env values when not forcing.
                 self.agent_env_vars.pop("LLM_NATIVE_TOOL_CALLING", None)
 
+            if getattr(config, "send_reasoning_content", False):
+                self.agent_env_vars["LLM_SEND_REASONING_CONTENT"] = "true"
+
         # Surface force-timeout behavior to all agents via env.
         if getattr(config, "force_timeout", False):
             self.agent_env_vars["FB_FORCE_TIMEOUT"] = "true"
@@ -365,6 +368,12 @@ def _apply_override(flag_name: str, value: Optional[str], key_map: Dict[str, str
                 else:
                     self.agent_env_vars.pop("LLM_NATIVE_TOOL_CALLING", None)
 
+                send_reasoning = bool(metadata.get("send_reasoning_content"))
+                if send_reasoning:
+                    self.agent_env_vars["LLM_SEND_REASONING_CONTENT"] = "true"
+                else:
+                    self.agent_env_vars.pop("LLM_SEND_REASONING_CONTENT", None)
+
                 recorded = metadata.get("openhands_reasoning_effort")
                 if recorded is not None and str(recorded).strip():
                     self.agent_env_vars["LLM_REASONING_EFFORT"] = str(recorded).strip()
@@ -778,10 +787,14 @@ def _save_run_metadata(self, task_ids: List[str]) -> None:
         # Persist the *effective* reasoning effort used by the agent (if any).
         openhands_reasoning_effort: Optional[str] = None
         codex_reasoning_effort: Optional[str] = None
+        send_reasoning_content = False
         if self.config.agent == "openhands":
             raw = self.agent_env_vars.get("LLM_REASONING_EFFORT")
             if raw is not None and str(raw).strip():
                 openhands_reasoning_effort = str(raw).strip()
+            send_reasoning_content = str(
+                self.agent_env_vars.get("LLM_SEND_REASONING_CONTENT", "")
+            ).strip().lower() in {"1", "true", "yes", "on"}
         elif self.config.agent == "codex":
             raw = self.agent_env_vars.get("CODEX_REASONING_EFFORT")
             if raw is not None and str(raw).strip():
@@ -808,6 +821,7 @@ def _save_run_metadata(self, task_ids: List[str]) -> None:
             without_interface_descriptions=self.config.without_interface_descriptions,
             white_box=getattr(self.config, "white_box", False),
             force_native_tool_calling=getattr(self.config, "force_native_tool_calling", False),
+            send_reasoning_content=send_reasoning_content,
             force_timeout=getattr(self.config, "force_timeout", False),
             api_key=self.config.api_key,
             base_url=self.config.base_url,
@@ -849,6 +863,11 @@ def run(self) -> None:
         if self.config.agent == "openhands":
             if getattr(self.config, "force_native_tool_calling", False):
                 self.console.print("[white]Tool calling:[/] [yellow]forced native[/]")
+            send_reasoning_content = str(
+                self.agent_env_vars.get("LLM_SEND_REASONING_CONTENT", "")
+            ).strip().lower() in {"1", "true", "yes", "on"}
+            if send_reasoning_content:
+                self.console.print("[white]Reasoning content:[/] [yellow]send in history[/]")
             effective = self.agent_env_vars.get("OPENHANDS_MAX_ITERATIONS")
             if effective is not None and str(effective).strip():
                 self.console.print(f"[white]Max iters:[/] [green]{effective}[/]")
@@ -1253,6 +1272,16 @@ def parse_args() -> argparse.Namespace:
         ),
     )
 
+    parser.add_argument(
+        "--send-reasoning-content",
+        action="store_true",
+        help=(
+            "OpenHands only: send prior assistant reasoning_content back to the model in subsequent requests. "
+            "Useful for thinking models whose chat template supports reasoning history. "
+            "In --resume mode, this flag is ignored and the value from run_metadata.json is used."
+        ),
+    )
+
     parser.add_argument(
         "--max-iters",
         type=int,
@@ -1364,6 +1393,10 @@ def load_resume_config(resume_dir: Path, args: argparse.Namespace) -> Tuple[Infe
         warnings.append(
             "--native-tool-calling (using 'force_native_tool_calling' from metadata)"
         )
+    if getattr(args, "send_reasoning_content", False):
+        warnings.append(
+            "--send-reasoning-content (using 'send_reasoning_content' from metadata)"
+        )
     
     if warnings:
         console.print("[bold yellow]Warning: The following arguments are ignored in resume mode:[/]")
@@ -1430,6 +1463,9 @@ def load_resume_config(resume_dir: Path, args: argparse.Namespace) -> Tuple[Infe
     # Determine force_native_tool_calling: always use metadata in resume mode.
     force_native_tool_calling = bool(metadata.get("force_native_tool_calling"))
 
+    # Determine send_reasoning_content: always use metadata in resume mode.
+    send_reasoning_content = bool(metadata.get("send_reasoning_content"))
+
     # Determine api_key/base_url/version: CLI overrides; otherwise use metadata.
     metadata_api_key = metadata.get("api_key")
     api_key = args.api_key if args.api_key is not None else metadata_api_key
@@ -1457,6 +1493,7 @@ def load_resume_config(resume_dir: Path, args: argparse.Namespace) -> Tuple[Infe
         without_interface_descriptions=without_interface_descriptions,
         white_box=white_box,
         force_native_tool_calling=force_native_tool_calling,
+        send_reasoning_content=send_reasoning_content,
         force_timeout=force_timeout,
         force_rerun_ids=_load_force_rerun_ids(getattr(args, "force_rerun", None)),
         api_key=api_key,
@@ -1522,6 +1559,7 @@ def main():
             without_interface_descriptions=bool(getattr(args, "without", False)),
             white_box=bool(getattr(args, "white", False)),
             force_native_tool_calling=bool(getattr(args, "native_tool_calling", False)),
+            send_reasoning_content=bool(getattr(args, "send_reasoning_content", False)),
             force_timeout=bool(getattr(args, "force_timeout", False)),
             force_rerun_ids=_load_force_rerun_ids(getattr(args, "force_rerun", None)),
             api_key=args.api_key,