Skip to content

Commit c825790

Browse files
committed
feat: update OpenHands configuration options for native tool calling and reasoning content
1 parent 0bf4d47 commit c825790

6 files changed

Lines changed: 74 additions & 33 deletions

File tree

config_example.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,10 @@ LLM_BASE_URL = ""
5656
OPENHANDS_VERSION = "0.62.0"
5757
# Optional: Reasoning effort for OpenAI o-series models
5858
LLM_REASONING_EFFORT = ""
59+
# Optional: Force OpenHands native tool calling on or off ("true" or "false"; empty uses OpenHands default)
60+
LLM_NATIVE_TOOL_CALLING = ""
5961
# Optional: Set to "true" to send prior assistant reasoning_content back in OpenHands requests
60-
LLM_SEND_REASONING_CONTENT = false
62+
LLM_SEND_REASONING_CONTENT = ""
6163
# Azure API Version (required for Azure models only)
6264
LLM_API_VERSION = ""
6365
# Optional: OpenHands agent max iterations (step limit). Upstream default is 500.
@@ -105,4 +107,4 @@ MSWEA_BASE_URL = ""
105107
# Optional: Cost tracking mode (default behavior in FB adapter is ignore_errors)
106108
MSWEA_COST_TRACKING = ""
107109
# Optional: Lock mini_swe_agent version (leave empty to use latest)
108-
MINI_SWE_AGENT_VERSION = ""
110+
MINI_SWE_AGENT_VERSION = ""

docs/config.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ SAVE_COMPLETIONS = false # Optional: whether to save LLM completions (true/
7575
INFER_LOG_RENDER_MODE = "compact" # Optional: compact|full for infer.log rendering
7676

7777
LLM_REASONING_EFFORT = "" # Optional: Reasoning effort for OpenAI o-series models
78-
LLM_SEND_REASONING_CONTENT = false # Optional: true to send prior assistant reasoning_content in history
78+
LLM_NATIVE_TOOL_CALLING = "" # Optional: true/false to force OpenHands native tool calling; empty uses OpenHands default
79+
LLM_SEND_REASONING_CONTENT = "" # Optional: true to send prior assistant reasoning_content in history
7980
OPENHANDS_MAX_ITERATIONS = "" # Optional: OpenHands agent max iterations (step limit). Upstream default is 500.
8081

8182
```
@@ -206,4 +207,4 @@ Example (local vLLM):
206207
[llm.local]
207208
backend = "vllm"
208209
base_url = "http://localhost:8080/v1"
209-
```
210+
```

docs/infer_cli_arg.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,11 @@ flags can override metadata (see the argument list below).
128128
### OpenHands Only
129129

130130
- `--native-tool-calling`
131-
Force native tool calling (`LLM_NATIVE_TOOL_CALLING=true`).
131+
Force native tool calling on (`LLM_NATIVE_TOOL_CALLING=true`).
132+
Resume mode: ignored (uses metadata).
133+
134+
- `--no-native-tool-calling`
135+
Force native tool calling off (`LLM_NATIVE_TOOL_CALLING=false`).
132136
Resume mode: ignored (uses metadata).
133137

134138
- `--send-reasoning-content`
@@ -154,4 +158,4 @@ runs/{timestamp}/
154158
├── infer.log # Agent execution log
155159
├── run.log # Runtime log
156160
└── patch.diff # Generated patch
157-
```
161+
```

featurebench/infer/agents/openhands.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -560,7 +560,7 @@ def get_env_setup_script(self) -> str:
560560
"LLM_BASE_URL": self.env_vars.get("LLM_BASE_URL"),
561561
"LLM_API_VERSION": self.env_vars.get("LLM_API_VERSION"),
562562
"LLM_REASONING_EFFORT": self.env_vars.get("LLM_REASONING_EFFORT"),
563-
# Force native tool calling (OpenHands LLMConfig.native_tool_calling via LLM_ env mapping)
563+
# Configure native tool calling (OpenHands LLMConfig.native_tool_calling via LLM_ env mapping)
564564
"LLM_NATIVE_TOOL_CALLING": self.env_vars.get("LLM_NATIVE_TOOL_CALLING"),
565565
# Force OpenHands SDK to send prior assistant reasoning_content in history.
566566
"LLM_SEND_REASONING_CONTENT": self.env_vars.get("LLM_SEND_REASONING_CONTENT"),
@@ -1199,4 +1199,4 @@ def post_run_hook(self, container, log_file) -> bool:
11991199
return False
12001200
except Exception as e:
12011201
self.logger.error(f"Error reading trajectory.json: {e}")
1202-
return False
1202+
return False

featurebench/infer/models.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -168,8 +168,8 @@ class InferConfig:
168168
without_interface_descriptions: bool = False
169169
# If True, enable white-box mode: agent can see FAIL_TO_PASS test file(s).
170170
white_box: bool = False
171-
# If True, force OpenHands to use native tool calling (LLM_NATIVE_TOOL_CALLING=true).
172-
force_native_tool_calling: bool = False
171+
# OpenHands only: True/False forces LLM_NATIVE_TOOL_CALLING, None leaves OpenHands default.
172+
native_tool_calling: Optional[bool] = None
173173
# If True, send prior assistant reasoning content back to OpenHands LLM requests.
174174
send_reasoning_content: bool = False
175175
# Optional task IDs to force rerun even if completed.
@@ -200,7 +200,7 @@ def to_dict(self) -> Dict[str, Any]:
200200
"split": self.split,
201201
"without_interface_descriptions": self.without_interface_descriptions,
202202
"white_box": self.white_box,
203-
"force_native_tool_calling": self.force_native_tool_calling,
203+
"native_tool_calling": self.native_tool_calling,
204204
"send_reasoning_content": self.send_reasoning_content,
205205
"force_rerun_ids": self.force_rerun_ids,
206206
"force_timeout": self.force_timeout,
@@ -262,7 +262,7 @@ class RunMetadata:
262262
level: Optional[List[int]] = None # Level filter (1, 2)
263263
without_interface_descriptions: bool = False
264264
white_box: bool = False
265-
force_native_tool_calling: bool = False
265+
native_tool_calling: Optional[bool] = None
266266
send_reasoning_content: bool = False
267267
force_timeout: bool = False
268268
api_key: Optional[str] = None
@@ -292,7 +292,7 @@ def to_dict(self) -> Dict[str, Any]:
292292
"level": self.level,
293293
"without_interface_descriptions": self.without_interface_descriptions,
294294
"white_box": self.white_box,
295-
"force_native_tool_calling": self.force_native_tool_calling,
295+
"native_tool_calling": self.native_tool_calling,
296296
"send_reasoning_content": self.send_reasoning_content,
297297
"force_timeout": self.force_timeout,
298298
"api_key": self.api_key,
@@ -348,4 +348,4 @@ def patch_path(self) -> Path:
348348

349349
def ensure_dirs(self) -> None:
350350
"""Create all necessary directories."""
351-
self.attempt_dir.mkdir(parents=True, exist_ok=True)
351+
self.attempt_dir.mkdir(parents=True, exist_ok=True)

featurebench/infer/run_infer.py

Lines changed: 53 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -330,13 +330,14 @@ def _apply_override(flag_name: str, value: Optional[str], key_map: Dict[str, str
330330
_apply_override("--base-url", config.base_url, base_url_map)
331331
_apply_override("--version", config.version, version_map)
332332

333-
# Force native tool calling for OpenHands when requested.
333+
# Configure native tool calling for OpenHands. CLI overrides config.toml;
334+
# when CLI is unset, keep any non-empty LLM_NATIVE_TOOL_CALLING from config.
334335
if config.agent == "openhands":
335-
if getattr(config, "force_native_tool_calling", False):
336+
native_tool_calling = getattr(config, "native_tool_calling", None)
337+
if native_tool_calling is True:
336338
self.agent_env_vars["LLM_NATIVE_TOOL_CALLING"] = "true"
337-
else:
338-
# Avoid inheriting config/env values when not forcing.
339-
self.agent_env_vars.pop("LLM_NATIVE_TOOL_CALLING", None)
339+
elif native_tool_calling is False:
340+
self.agent_env_vars["LLM_NATIVE_TOOL_CALLING"] = "false"
340341

341342
if getattr(config, "send_reasoning_content", False):
342343
self.agent_env_vars["LLM_SEND_REASONING_CONTENT"] = "true"
@@ -361,10 +362,12 @@ def _apply_override(flag_name: str, value: Optional[str], key_map: Dict[str, str
361362
metadata = {}
362363

363364
if config.agent == "openhands":
364-
# Force native tool calling in resume mode strictly follows metadata.
365-
force_native = bool(metadata.get("force_native_tool_calling"))
366-
if force_native:
365+
# Native tool calling in resume mode strictly follows metadata.
366+
native_tool_calling = metadata.get("native_tool_calling")
367+
if native_tool_calling is True:
367368
self.agent_env_vars["LLM_NATIVE_TOOL_CALLING"] = "true"
369+
elif native_tool_calling is False:
370+
self.agent_env_vars["LLM_NATIVE_TOOL_CALLING"] = "false"
368371
else:
369372
self.agent_env_vars.pop("LLM_NATIVE_TOOL_CALLING", None)
370373

@@ -787,11 +790,15 @@ def _save_run_metadata(self, task_ids: List[str]) -> None:
787790
# Persist the *effective* reasoning effort used by the agent (if any).
788791
openhands_reasoning_effort: Optional[str] = None
789792
codex_reasoning_effort: Optional[str] = None
793+
native_tool_calling: Optional[bool] = None
790794
send_reasoning_content = False
791795
if self.config.agent == "openhands":
792796
raw = self.agent_env_vars.get("LLM_REASONING_EFFORT")
793797
if raw is not None and str(raw).strip():
794798
openhands_reasoning_effort = str(raw).strip()
799+
raw = self.agent_env_vars.get("LLM_NATIVE_TOOL_CALLING")
800+
if raw is not None and str(raw).strip():
801+
native_tool_calling = str(raw).strip().lower() in {"1", "true", "yes", "on"}
795802
send_reasoning_content = str(
796803
self.agent_env_vars.get("LLM_SEND_REASONING_CONTENT", "")
797804
).strip().lower() in {"1", "true", "yes", "on"}
@@ -820,7 +827,7 @@ def _save_run_metadata(self, task_ids: List[str]) -> None:
820827
level=self.config.level,
821828
without_interface_descriptions=self.config.without_interface_descriptions,
822829
white_box=getattr(self.config, "white_box", False),
823-
force_native_tool_calling=getattr(self.config, "force_native_tool_calling", False),
830+
native_tool_calling=native_tool_calling,
824831
send_reasoning_content=send_reasoning_content,
825832
force_timeout=getattr(self.config, "force_timeout", False),
826833
api_key=self.config.api_key,
@@ -861,8 +868,16 @@ def run(self) -> None:
861868
if getattr(self.config, "white_box", False):
862869
self.console.print("[white]Prompt:[/] [yellow]white-box (tests visible)[/]")
863870
if self.config.agent == "openhands":
864-
if getattr(self.config, "force_native_tool_calling", False):
865-
self.console.print("[white]Tool calling:[/] [yellow]forced native[/]")
871+
native_tool_calling = self.agent_env_vars.get("LLM_NATIVE_TOOL_CALLING")
872+
native_tool_calling = (
873+
None
874+
if native_tool_calling is None or not str(native_tool_calling).strip()
875+
else str(native_tool_calling).strip().lower() in {"1", "true", "yes", "on"}
876+
)
877+
if native_tool_calling is True:
878+
self.console.print("[white]Tool calling:[/] [yellow]native forced on[/]")
879+
elif native_tool_calling is False:
880+
self.console.print("[white]Tool calling:[/] [yellow]native forced off[/]")
866881
send_reasoning_content = str(
867882
self.agent_env_vars.get("LLM_SEND_REASONING_CONTENT", "")
868883
).strip().lower() in {"1", "true", "yes", "on"}
@@ -1263,11 +1278,20 @@ def parse_args() -> argparse.Namespace:
12631278
),
12641279
)
12651280

1266-
parser.add_argument(
1281+
native_tool_group = parser.add_mutually_exclusive_group()
1282+
native_tool_group.add_argument(
12671283
"--native-tool-calling",
12681284
action="store_true",
12691285
help=(
1270-
"OpenHands only: force native tool calling (sets LLM_NATIVE_TOOL_CALLING=true inside the container). "
1286+
"OpenHands only: force native tool calling on (sets LLM_NATIVE_TOOL_CALLING=true inside the container). "
1287+
"In --resume mode, this flag is ignored and the value from run_metadata.json is used."
1288+
),
1289+
)
1290+
native_tool_group.add_argument(
1291+
"--no-native-tool-calling",
1292+
action="store_true",
1293+
help=(
1294+
"OpenHands only: force native tool calling off (sets LLM_NATIVE_TOOL_CALLING=false inside the container). "
12711295
"In --resume mode, this flag is ignored and the value from run_metadata.json is used."
12721296
),
12731297
)
@@ -1391,7 +1415,11 @@ def load_resume_config(resume_dir: Path, args: argparse.Namespace) -> Tuple[Infe
13911415
)
13921416
if getattr(args, "native_tool_calling", False):
13931417
warnings.append(
1394-
"--native-tool-calling (using 'force_native_tool_calling' from metadata)"
1418+
"--native-tool-calling (using 'native_tool_calling' from metadata)"
1419+
)
1420+
if getattr(args, "no_native_tool_calling", False):
1421+
warnings.append(
1422+
"--no-native-tool-calling (using 'native_tool_calling' from metadata)"
13951423
)
13961424
if getattr(args, "send_reasoning_content", False):
13971425
warnings.append(
@@ -1460,8 +1488,8 @@ def load_resume_config(resume_dir: Path, args: argparse.Namespace) -> Tuple[Infe
14601488
# Determine white_box: always use metadata in resume mode.
14611489
white_box = bool(metadata.get("white_box"))
14621490

1463-
# Determine force_native_tool_calling: always use metadata in resume mode.
1464-
force_native_tool_calling = bool(metadata.get("force_native_tool_calling"))
1491+
# Determine native_tool_calling: always use metadata in resume mode.
1492+
native_tool_calling = metadata.get("native_tool_calling")
14651493

14661494
# Determine send_reasoning_content: always use metadata in resume mode.
14671495
send_reasoning_content = bool(metadata.get("send_reasoning_content"))
@@ -1492,7 +1520,7 @@ def load_resume_config(resume_dir: Path, args: argparse.Namespace) -> Tuple[Infe
14921520
split=metadata.get('split'), # Use split from metadata
14931521
without_interface_descriptions=without_interface_descriptions,
14941522
white_box=white_box,
1495-
force_native_tool_calling=force_native_tool_calling,
1523+
native_tool_calling=native_tool_calling,
14961524
send_reasoning_content=send_reasoning_content,
14971525
force_timeout=force_timeout,
14981526
force_rerun_ids=_load_force_rerun_ids(getattr(args, "force_rerun", None)),
@@ -1558,7 +1586,13 @@ def main():
15581586
split=args.split if args.split is not None else "full",
15591587
without_interface_descriptions=bool(getattr(args, "without", False)),
15601588
white_box=bool(getattr(args, "white", False)),
1561-
force_native_tool_calling=bool(getattr(args, "native_tool_calling", False)),
1589+
native_tool_calling=(
1590+
True
1591+
if getattr(args, "native_tool_calling", False)
1592+
else False
1593+
if getattr(args, "no_native_tool_calling", False)
1594+
else None
1595+
),
15621596
send_reasoning_content=bool(getattr(args, "send_reasoning_content", False)),
15631597
force_timeout=bool(getattr(args, "force_timeout", False)),
15641598
force_rerun_ids=_load_force_rerun_ids(getattr(args, "force_rerun", None)),
@@ -1573,4 +1607,4 @@ def main():
15731607

15741608

15751609
if __name__ == "__main__":
1576-
main()
1610+
main()

0 commit comments

Comments
 (0)