Skip to content

Commit 0bf4d47

Browse files
committed
feat: add support for sending prior assistant reasoning content in OpenHands requests
1 parent 017f817 commit 0bf4d47

6 files changed

Lines changed: 185 additions & 15 deletions

File tree

config_example.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ LLM_BASE_URL = ""
5656
OPENHANDS_VERSION = "0.62.0"
5757
# Optional: Reasoning effort for OpenAI o-series models
5858
LLM_REASONING_EFFORT = ""
59+
# Optional: Send prior assistant reasoning_content back to the model in subsequent OpenHands requests
60+
LLM_SEND_REASONING_CONTENT = false
5961
# Azure API Version (required for Azure models only)
6062
LLM_API_VERSION = ""
6163
# Optional: OpenHands agent max iterations (step limit). Upstream default is 500.

docs/config.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ SAVE_COMPLETIONS = false # Optional: whether to save LLM completions (true/
7575
INFER_LOG_RENDER_MODE = "compact" # Optional: compact|full for infer.log rendering
7676

7777
LLM_REASONING_EFFORT = "" # Optional: Reasoning effort for OpenAI o-series models
78+
LLM_SEND_REASONING_CONTENT = false # Optional: set to true to send prior assistant reasoning_content back to the model as part of the request history
7879
OPENHANDS_MAX_ITERATIONS = "" # Optional: OpenHands agent max iterations (step limit). Upstream default is 500.
7980

8081
```

docs/infer_cli_arg.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,11 @@ flags can override metadata (see the argument list below).
131131
Force native tool calling (`LLM_NATIVE_TOOL_CALLING=true`).
132132
Resume mode: ignored (uses metadata).
133133

134+
- `--send-reasoning-content`
135+
Send prior assistant `reasoning_content` back to the model in subsequent OpenHands requests.
136+
Useful for thinking models whose chat template supports reasoning history.
137+
Resume mode: ignored (uses metadata).
138+
134139
- `--max-iters`
135140
Maximum iterations for OpenHands (`OPENHANDS_MAX_ITERATIONS`).
136141
Default: no override (OpenHands default applies).

featurebench/infer/agents/openhands.py

Lines changed: 134 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,77 @@ def _env(name: str) -> str | None:
3939
return str(value).strip()
4040
4141
42+
def _install_send_reasoning_content_override(model: str) -> None:
43+
"""Force OpenHands SDK to preserve/send reasoning content for this model."""
44+
try:
45+
from openhands.sdk.llm.utils import model_features
46+
47+
tokens = [model]
48+
if "/" in model:
49+
tokens.append(model.split("/", 1)[-1])
50+
for token in tokens:
51+
if token and token not in model_features.SEND_REASONING_CONTENT_MODELS:
52+
model_features.SEND_REASONING_CONTENT_MODELS.append(token)
53+
54+
cache_clear = getattr(model_features.get_features, "cache_clear", None)
55+
if cache_clear:
56+
cache_clear()
57+
except Exception as exc:
58+
print(
59+
f"Warning: failed to enable reasoning-content send override: {exc}",
60+
file=sys.stderr,
61+
)
62+
63+
try:
64+
from openhands.sdk.llm.message import Message
65+
66+
if getattr(Message, "_featurebench_reasoning_alias_patch", False):
67+
return
68+
69+
original = Message.from_llm_chat_message.__func__
70+
71+
class _ReasoningContentProxy:
72+
def __init__(self, wrapped: Any, reasoning_content: str):
73+
self._wrapped = wrapped
74+
self.reasoning_content = reasoning_content
75+
76+
def __getattr__(self, name: str) -> Any:
77+
return getattr(self._wrapped, name)
78+
79+
def _extract_reasoning_content(message: Any) -> str | None:
80+
reasoning = getattr(message, "reasoning_content", None)
81+
if reasoning:
82+
return str(reasoning)
83+
84+
reasoning = getattr(message, "reasoning", None)
85+
if reasoning:
86+
return str(reasoning)
87+
88+
provider_fields = getattr(message, "provider_specific_fields", None)
89+
if isinstance(provider_fields, dict):
90+
for key in ("reasoning_content", "reasoning"):
91+
reasoning = provider_fields.get(key)
92+
if reasoning:
93+
return str(reasoning)
94+
95+
return None
96+
97+
def patched(cls: type[Message], message: Any) -> Message:
98+
if getattr(message, "reasoning_content", None) is None:
99+
reasoning_content = _extract_reasoning_content(message)
100+
if reasoning_content:
101+
message = _ReasoningContentProxy(message, reasoning_content)
102+
return original(cls, message)
103+
104+
Message.from_llm_chat_message = classmethod(patched)
105+
setattr(Message, "_featurebench_reasoning_alias_patch", True)
106+
except Exception as exc:
107+
print(
108+
f"Warning: failed to install reasoning field alias patch: {exc}",
109+
file=sys.stderr,
110+
)
111+
112+
42113
def _event_data(event: Any) -> dict[str, Any]:
43114
try:
44115
return event.model_dump(mode="json", exclude_none=True)
@@ -128,6 +199,9 @@ def _build_llm() -> Any:
128199
if not model:
129200
raise RuntimeError("LLM_MODEL is required for OpenHands SDK runner.")
130201
202+
if _truthy(_env("LLM_SEND_REASONING_CONTENT")):
203+
_install_send_reasoning_content_override(model)
204+
131205
kwargs: dict[str, Any] = {"model": model}
132206
133207
api_key = _env("LLM_API_KEY")
@@ -287,18 +361,8 @@ def install_script(self) -> str:
287361
288362
export PIP_CACHE_DIR="$CACHE_ROOT/pip"
289363
export UV_CACHE_DIR="$CACHE_ROOT/uv"
290-
291-
# If a local uv Python mirror exists and is non-empty, use it. Otherwise, let uv download Python from the default upstream sources.
292364
UV_PYTHON_MIRROR_DIR="$CACHE_ROOT/uv/python-mirror"
293-
if [ -z "${{UV_PYTHON_INSTALL_MIRROR:-}}" ]; then
294-
if [ -d "$UV_PYTHON_MIRROR_DIR" ] && [ "$(ls -A "$UV_PYTHON_MIRROR_DIR" 2>/dev/null)" ]; then
295-
export UV_PYTHON_INSTALL_MIRROR="file://$UV_PYTHON_MIRROR_DIR"
296-
echo "Using local uv python mirror: $UV_PYTHON_INSTALL_MIRROR"
297-
else
298-
unset UV_PYTHON_INSTALL_MIRROR
299-
echo "Local uv python mirror is empty; using upstream python downloads"
300-
fi
301-
fi
365+
PYTHON_INSTALL_MIRROR="${{UV_PYTHON_INSTALL_MIRROR:-https://ghfast.top/https://github.com/astral-sh/python-build-standalone/releases/download}}"
302366
303367
UV_DIR="/opt/featurebench/uv"
304368
UV_BIN_PRIMARY="$UV_DIR/bin/uv"
@@ -338,8 +402,8 @@ def install_script(self) -> str:
338402
339403
# Configure uv index mirror (TUNA)
340404
mkdir -p ~/.config/uv
341-
cat > ~/.config/uv/uv.toml <<'EOF'
342-
python-install-mirror = "https://ghfast.top/https://github.com/astral-sh/python-build-standalone/releases/download"
405+
cat > ~/.config/uv/uv.toml <<EOF
406+
python-install-mirror = "$PYTHON_INSTALL_MIRROR"
343407
[[index]]
344408
url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/"
345409
default = true
@@ -356,7 +420,54 @@ def install_script(self) -> str:
356420
"$UV_BIN" pip install --index-url "$FALLBACK_INDEX_URL" "$@"
357421
}}
358422
359-
# Install Python via uv (downloads cached via UV_CACHE_DIR)
423+
cache_uv_python_download() {{
424+
if [[ "$PYTHON_INSTALL_MIRROR" == file://* ]]; then
425+
export UV_PYTHON_INSTALL_MIRROR="$PYTHON_INSTALL_MIRROR"
426+
echo "Using configured uv python mirror: $UV_PYTHON_INSTALL_MIRROR"
427+
return 0
428+
fi
429+
430+
local download_url rel_path target_path part_path
431+
download_url="$("$UV_BIN" python list "$PY_VERSION" --only-downloads --show-urls | awk 'NR == 1 {{print $2}}')"
432+
if [ -z "$download_url" ] || [[ "$download_url" == "<"* ]]; then
433+
echo "Unable to resolve uv Python download URL for $PY_VERSION" >&2
434+
return 1
435+
fi
436+
if [[ "$download_url" != "$PYTHON_INSTALL_MIRROR/"* ]]; then
437+
echo "Resolved uv Python URL does not use configured mirror: $download_url" >&2
438+
return 1
439+
fi
440+
441+
rel_path="${{download_url#"$PYTHON_INSTALL_MIRROR"/}}"
442+
rel_path="${{rel_path//%2B/+}}"
443+
rel_path="${{rel_path//%2b/+}}"
444+
target_path="$UV_PYTHON_MIRROR_DIR/$rel_path"
445+
part_path="$target_path.part"
446+
447+
mkdir -p "$(dirname "$target_path")"
448+
if [ -s "$target_path" ] && tar -tzf "$target_path" >/dev/null 2>&1; then
449+
echo "Using cached uv Python archive: $target_path"
450+
else
451+
if [ -s "$target_path" ]; then
452+
echo "Cached uv Python archive is invalid; redownloading: $target_path" >&2
453+
rm -f "$target_path"
454+
fi
455+
echo "Caching uv Python archive: $download_url -> $target_path"
456+
curl -fL --retry 3 --retry-delay 2 --connect-timeout 20 -C - -o "$part_path" "$download_url"
457+
if ! tar -tzf "$part_path" >/dev/null 2>&1; then
458+
echo "Downloaded uv Python archive is invalid: $part_path" >&2
459+
rm -f "$part_path"
460+
return 1
461+
fi
462+
mv "$part_path" "$target_path"
463+
fi
464+
465+
export UV_PYTHON_INSTALL_MIRROR="file://$UV_PYTHON_MIRROR_DIR"
466+
echo "Using local uv python mirror: $UV_PYTHON_INSTALL_MIRROR"
467+
}}
468+
469+
# Install Python via uv. The archive is cached in a local mirror first so future containers reuse it.
470+
cache_uv_python_download
360471
$UV_BIN python install $PY_VERSION
361472
362473
# Create venv (container-local)
@@ -402,7 +513,13 @@ def get_run_command(self, instruction: str) -> str:
402513
"if [ ! -x /opt/openhands-venv/bin/python ]; then "
403514
"echo '/opt/openhands-venv/bin/python not found' >&2; exit 127; "
404515
"fi; "
405-
"if /opt/openhands-venv/bin/python -c "
516+
"if [[ \"${LLM_SEND_REASONING_CONTENT,,}\" =~ ^(1|true|yes|on)$ ]] && "
517+
"/opt/openhands-venv/bin/python -c "
518+
"\"import importlib.util, sys; "
519+
"sys.exit(0 if importlib.util.find_spec('openhands.sdk') else 1)\"; "
520+
"then "
521+
f"/opt/openhands-venv/bin/python /agent-logs/openhands-sdk-runner.py --task-file {task_file}; "
522+
"elif /opt/openhands-venv/bin/python -c "
406523
"\"import importlib.util, sys; "
407524
"sys.exit(0 if importlib.util.find_spec('openhands.core.main') else 1)\"; "
408525
"then "
@@ -445,6 +562,8 @@ def get_env_setup_script(self) -> str:
445562
"LLM_REASONING_EFFORT": self.env_vars.get("LLM_REASONING_EFFORT"),
446563
# Force native tool calling (OpenHands LLMConfig.native_tool_calling via LLM_ env mapping)
447564
"LLM_NATIVE_TOOL_CALLING": self.env_vars.get("LLM_NATIVE_TOOL_CALLING"),
565+
# Force OpenHands SDK to send prior assistant reasoning_content in history.
566+
"LLM_SEND_REASONING_CONTENT": self.env_vars.get("LLM_SEND_REASONING_CONTENT"),
448567
# Disable features not needed for FeatureBench
449568
"AGENT_ENABLE_PROMPT_EXTENSIONS": "false",
450569
"AGENT_ENABLE_BROWSING": "false",

featurebench/infer/models.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,8 @@ class InferConfig:
170170
white_box: bool = False
171171
# If True, force OpenHands to use native tool calling (LLM_NATIVE_TOOL_CALLING=true).
172172
force_native_tool_calling: bool = False
173+
# If True, send prior assistant reasoning content back to OpenHands LLM requests.
174+
send_reasoning_content: bool = False
173175
# Optional task IDs to force rerun even if completed.
174176
force_rerun_ids: Optional[List[str]] = None
175177
# If True, treat prior TIMEOUT attempts as completed when resuming (skip reruns).
@@ -199,6 +201,7 @@ def to_dict(self) -> Dict[str, Any]:
199201
"without_interface_descriptions": self.without_interface_descriptions,
200202
"white_box": self.white_box,
201203
"force_native_tool_calling": self.force_native_tool_calling,
204+
"send_reasoning_content": self.send_reasoning_content,
202205
"force_rerun_ids": self.force_rerun_ids,
203206
"force_timeout": self.force_timeout,
204207
"api_key": self.api_key,
@@ -260,6 +263,7 @@ class RunMetadata:
260263
without_interface_descriptions: bool = False
261264
white_box: bool = False
262265
force_native_tool_calling: bool = False
266+
send_reasoning_content: bool = False
263267
force_timeout: bool = False
264268
api_key: Optional[str] = None
265269
base_url: Optional[str] = None
@@ -289,6 +293,7 @@ def to_dict(self) -> Dict[str, Any]:
289293
"without_interface_descriptions": self.without_interface_descriptions,
290294
"white_box": self.white_box,
291295
"force_native_tool_calling": self.force_native_tool_calling,
296+
"send_reasoning_content": self.send_reasoning_content,
292297
"force_timeout": self.force_timeout,
293298
"api_key": self.api_key,
294299
"base_url": self.base_url,

featurebench/infer/run_infer.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,9 @@ def _apply_override(flag_name: str, value: Optional[str], key_map: Dict[str, str
338338
# Avoid inheriting config/env values when not forcing.
339339
self.agent_env_vars.pop("LLM_NATIVE_TOOL_CALLING", None)
340340

341+
if getattr(config, "send_reasoning_content", False):
342+
self.agent_env_vars["LLM_SEND_REASONING_CONTENT"] = "true"
343+
341344
# Surface force-timeout behavior to all agents via env.
342345
if getattr(config, "force_timeout", False):
343346
self.agent_env_vars["FB_FORCE_TIMEOUT"] = "true"
@@ -365,6 +368,12 @@ def _apply_override(flag_name: str, value: Optional[str], key_map: Dict[str, str
365368
else:
366369
self.agent_env_vars.pop("LLM_NATIVE_TOOL_CALLING", None)
367370

371+
send_reasoning = bool(metadata.get("send_reasoning_content"))
372+
if send_reasoning:
373+
self.agent_env_vars["LLM_SEND_REASONING_CONTENT"] = "true"
374+
else:
375+
self.agent_env_vars.pop("LLM_SEND_REASONING_CONTENT", None)
376+
368377
recorded = metadata.get("openhands_reasoning_effort")
369378
if recorded is not None and str(recorded).strip():
370379
self.agent_env_vars["LLM_REASONING_EFFORT"] = str(recorded).strip()
@@ -778,10 +787,14 @@ def _save_run_metadata(self, task_ids: List[str]) -> None:
778787
# Persist the *effective* reasoning effort used by the agent (if any).
779788
openhands_reasoning_effort: Optional[str] = None
780789
codex_reasoning_effort: Optional[str] = None
790+
send_reasoning_content = False
781791
if self.config.agent == "openhands":
782792
raw = self.agent_env_vars.get("LLM_REASONING_EFFORT")
783793
if raw is not None and str(raw).strip():
784794
openhands_reasoning_effort = str(raw).strip()
795+
send_reasoning_content = str(
796+
self.agent_env_vars.get("LLM_SEND_REASONING_CONTENT", "")
797+
).strip().lower() in {"1", "true", "yes", "on"}
785798
elif self.config.agent == "codex":
786799
raw = self.agent_env_vars.get("CODEX_REASONING_EFFORT")
787800
if raw is not None and str(raw).strip():
@@ -808,6 +821,7 @@ def _save_run_metadata(self, task_ids: List[str]) -> None:
808821
without_interface_descriptions=self.config.without_interface_descriptions,
809822
white_box=getattr(self.config, "white_box", False),
810823
force_native_tool_calling=getattr(self.config, "force_native_tool_calling", False),
824+
send_reasoning_content=send_reasoning_content,
811825
force_timeout=getattr(self.config, "force_timeout", False),
812826
api_key=self.config.api_key,
813827
base_url=self.config.base_url,
@@ -849,6 +863,11 @@ def run(self) -> None:
849863
if self.config.agent == "openhands":
850864
if getattr(self.config, "force_native_tool_calling", False):
851865
self.console.print("[white]Tool calling:[/] [yellow]forced native[/]")
866+
send_reasoning_content = str(
867+
self.agent_env_vars.get("LLM_SEND_REASONING_CONTENT", "")
868+
).strip().lower() in {"1", "true", "yes", "on"}
869+
if send_reasoning_content:
870+
self.console.print("[white]Reasoning content:[/] [yellow]send in history[/]")
852871
effective = self.agent_env_vars.get("OPENHANDS_MAX_ITERATIONS")
853872
if effective is not None and str(effective).strip():
854873
self.console.print(f"[white]Max iters:[/] [green]{effective}[/]")
@@ -1253,6 +1272,16 @@ def parse_args() -> argparse.Namespace:
12531272
),
12541273
)
12551274

1275+
parser.add_argument(
1276+
"--send-reasoning-content",
1277+
action="store_true",
1278+
help=(
1279+
"OpenHands only: send prior assistant reasoning_content back to the model in subsequent requests. "
1280+
"Useful for thinking models whose chat template supports reasoning history. "
1281+
"In --resume mode, this flag is ignored and the value from run_metadata.json is used."
1282+
),
1283+
)
1284+
12561285
parser.add_argument(
12571286
"--max-iters",
12581287
type=int,
@@ -1364,6 +1393,10 @@ def load_resume_config(resume_dir: Path, args: argparse.Namespace) -> Tuple[Infe
13641393
warnings.append(
13651394
"--native-tool-calling (using 'force_native_tool_calling' from metadata)"
13661395
)
1396+
if getattr(args, "send_reasoning_content", False):
1397+
warnings.append(
1398+
"--send-reasoning-content (using 'send_reasoning_content' from metadata)"
1399+
)
13671400

13681401
if warnings:
13691402
console.print("[bold yellow]Warning: The following arguments are ignored in resume mode:[/]")
@@ -1430,6 +1463,9 @@ def load_resume_config(resume_dir: Path, args: argparse.Namespace) -> Tuple[Infe
14301463
# Determine force_native_tool_calling: always use metadata in resume mode.
14311464
force_native_tool_calling = bool(metadata.get("force_native_tool_calling"))
14321465

1466+
# Determine send_reasoning_content: always use metadata in resume mode.
1467+
send_reasoning_content = bool(metadata.get("send_reasoning_content"))
1468+
14331469
# Determine api_key/base_url/version: CLI overrides; otherwise use metadata.
14341470
metadata_api_key = metadata.get("api_key")
14351471
api_key = args.api_key if args.api_key is not None else metadata_api_key
@@ -1457,6 +1493,7 @@ def load_resume_config(resume_dir: Path, args: argparse.Namespace) -> Tuple[Infe
14571493
without_interface_descriptions=without_interface_descriptions,
14581494
white_box=white_box,
14591495
force_native_tool_calling=force_native_tool_calling,
1496+
send_reasoning_content=send_reasoning_content,
14601497
force_timeout=force_timeout,
14611498
force_rerun_ids=_load_force_rerun_ids(getattr(args, "force_rerun", None)),
14621499
api_key=api_key,
@@ -1522,6 +1559,7 @@ def main():
15221559
without_interface_descriptions=bool(getattr(args, "without", False)),
15231560
white_box=bool(getattr(args, "white", False)),
15241561
force_native_tool_calling=bool(getattr(args, "native_tool_calling", False)),
1562+
send_reasoning_content=bool(getattr(args, "send_reasoning_content", False)),
15251563
force_timeout=bool(getattr(args, "force_timeout", False)),
15261564
force_rerun_ids=_load_force_rerun_ids(getattr(args, "force_rerun", None)),
15271565
api_key=args.api_key,

0 commit comments

Comments
 (0)