feat: update OpenHands configuration options for native tool calling and reasoning content

potatoQi · potatoQi · commit c82579063d5d · 2026-06-01T22:10:45.000+08:00
diff --git a/config_example.toml b/config_example.toml
@@ -56,8 +56,10 @@ LLM_BASE_URL = ""
 OPENHANDS_VERSION = "0.62.0"
 # Optional: Reasoning effort for OpenAI o-series models
 LLM_REASONING_EFFORT = ""
+# Optional: Force OpenHands native tool calling ("true" or "false"; empty uses OpenHands default)
+LLM_NATIVE_TOOL_CALLING = ""
 # Optional: Send prior assistant reasoning_content back to OpenHands requests
-LLM_SEND_REASONING_CONTENT = false
+LLM_SEND_REASONING_CONTENT = ""
 # Azure API Version (required for Azure models only)
 LLM_API_VERSION = ""
 # Optional: OpenHands agent max iterations (step limit). Upstream default is 500.
@@ -105,4 +107,4 @@ MSWEA_BASE_URL = ""
 # Optional: Cost tracking mode (default behavior in FB adapter is ignore_errors)
 MSWEA_COST_TRACKING = ""
 # Optional: Lock mini_swe_agent version (leave empty to use latest)
-MINI_SWE_AGENT_VERSION = ""
+MINI_SWE_AGENT_VERSION = ""
diff --git a/docs/config.md b/docs/config.md
@@ -75,7 +75,8 @@ SAVE_COMPLETIONS = false      # Optional: whether to save LLM completions (true/
 INFER_LOG_RENDER_MODE = "compact" # Optional: compact|full for infer.log rendering
 
 LLM_REASONING_EFFORT = ""     # Optional: Reasoning effort for OpenAI o-series models
-LLM_SEND_REASONING_CONTENT = false # Optional: true to send prior assistant reasoning_content in history
+LLM_NATIVE_TOOL_CALLING = ""  # Optional: "true"/"false" to force OpenHands native tool calling on/off; empty uses OpenHands default
+LLM_SEND_REASONING_CONTENT = "" # Optional: "true" to send prior assistant reasoning_content in history; empty leaves it unset
 OPENHANDS_MAX_ITERATIONS = "" # Optional: OpenHands agent max iterations (step limit). Upstream default is 500.
 
 ```
@@ -206,4 +207,4 @@ Example (local vLLM):
 [llm.local]
 backend = "vllm"
 base_url = "http://localhost:8080/v1"
-```
+```
diff --git a/docs/infer_cli_arg.md b/docs/infer_cli_arg.md
@@ -128,7 +128,11 @@ flags can override metadata (see the argument list below).
 ### OpenHands Only
 
 - `--native-tool-calling`  
-  Force native tool calling (`LLM_NATIVE_TOOL_CALLING=true`).  
+  Force native tool calling on (`LLM_NATIVE_TOOL_CALLING=true`). Cannot be combined with `--no-native-tool-calling`.  
+  Resume mode: ignored (uses metadata).
+
+- `--no-native-tool-calling`  
+  Force native tool calling off (`LLM_NATIVE_TOOL_CALLING=false`). Cannot be combined with `--native-tool-calling`.  
   Resume mode: ignored (uses metadata).
 
 - `--send-reasoning-content`  
@@ -154,4 +158,4 @@ runs/{timestamp}/
             ├── infer.log         # Agent execution log
             ├── run.log           # Runtime log
             └── patch.diff        # Generated patch
-```
+```
diff --git a/featurebench/infer/agents/openhands.py b/featurebench/infer/agents/openhands.py
@@ -560,7 +560,7 @@ def get_env_setup_script(self) -> str:
             "LLM_BASE_URL": self.env_vars.get("LLM_BASE_URL"),
             "LLM_API_VERSION": self.env_vars.get("LLM_API_VERSION"),
             "LLM_REASONING_EFFORT": self.env_vars.get("LLM_REASONING_EFFORT"),
-            # Force native tool calling (OpenHands LLMConfig.native_tool_calling via LLM_ env mapping)
+            # Configure native tool calling (OpenHands LLMConfig.native_tool_calling via LLM_ env mapping)
             "LLM_NATIVE_TOOL_CALLING": self.env_vars.get("LLM_NATIVE_TOOL_CALLING"),
             # Force OpenHands SDK to send prior assistant reasoning_content in history.
             "LLM_SEND_REASONING_CONTENT": self.env_vars.get("LLM_SEND_REASONING_CONTENT"),
@@ -1199,4 +1199,4 @@ def post_run_hook(self, container, log_file) -> bool:
             return False
         except Exception as e:
             self.logger.error(f"Error reading trajectory.json: {e}")
-            return False
+            return False
diff --git a/featurebench/infer/models.py b/featurebench/infer/models.py
@@ -168,8 +168,8 @@ class InferConfig:
     without_interface_descriptions: bool = False
     # If True, enable white-box mode: agent can see FAIL_TO_PASS test file(s).
     white_box: bool = False
-    # If True, force OpenHands to use native tool calling (LLM_NATIVE_TOOL_CALLING=true).
-    force_native_tool_calling: bool = False
+    # OpenHands only: True/False forces LLM_NATIVE_TOOL_CALLING; None means no CLI override (fresh runs keep any non-empty config value).
+    native_tool_calling: Optional[bool] = None
     # If True, send prior assistant reasoning content back to OpenHands LLM requests.
     send_reasoning_content: bool = False
     # Optional task IDs to force rerun even if completed.
@@ -200,7 +200,7 @@ def to_dict(self) -> Dict[str, Any]:
             "split": self.split,
             "without_interface_descriptions": self.without_interface_descriptions,
             "white_box": self.white_box,
-            "force_native_tool_calling": self.force_native_tool_calling,
+            "native_tool_calling": self.native_tool_calling,
             "send_reasoning_content": self.send_reasoning_content,
             "force_rerun_ids": self.force_rerun_ids,
             "force_timeout": self.force_timeout,
@@ -262,7 +262,7 @@ class RunMetadata:
     level: Optional[List[int]] = None  # Level filter (1, 2)
     without_interface_descriptions: bool = False
     white_box: bool = False
-    force_native_tool_calling: bool = False
+    native_tool_calling: Optional[bool] = None
     send_reasoning_content: bool = False
     force_timeout: bool = False
     api_key: Optional[str] = None
@@ -292,7 +292,7 @@ def to_dict(self) -> Dict[str, Any]:
             "level": self.level,
             "without_interface_descriptions": self.without_interface_descriptions,
             "white_box": self.white_box,
-            "force_native_tool_calling": self.force_native_tool_calling,
+            "native_tool_calling": self.native_tool_calling,
             "send_reasoning_content": self.send_reasoning_content,
             "force_timeout": self.force_timeout,
             "api_key": self.api_key,
@@ -348,4 +348,4 @@ def patch_path(self) -> Path:
     
     def ensure_dirs(self) -> None:
         """Create all necessary directories."""
-        self.attempt_dir.mkdir(parents=True, exist_ok=True)
+        self.attempt_dir.mkdir(parents=True, exist_ok=True)
diff --git a/featurebench/infer/run_infer.py b/featurebench/infer/run_infer.py
@@ -330,13 +330,14 @@ def _apply_override(flag_name: str, value: Optional[str], key_map: Dict[str, str
         _apply_override("--base-url", config.base_url, base_url_map)
         _apply_override("--version", config.version, version_map)
 
-        # Force native tool calling for OpenHands when requested.
+        # Configure native tool calling for OpenHands. CLI overrides config.toml;
+        # when CLI is unset, keep any non-empty LLM_NATIVE_TOOL_CALLING from config.
         if config.agent == "openhands":
-            if getattr(config, "force_native_tool_calling", False):
+            native_tool_calling = getattr(config, "native_tool_calling", None)
+            if native_tool_calling is True:
                 self.agent_env_vars["LLM_NATIVE_TOOL_CALLING"] = "true"
-            else:
-                # Avoid inheriting config/env values when not forcing.
-                self.agent_env_vars.pop("LLM_NATIVE_TOOL_CALLING", None)
+            elif native_tool_calling is False:
+                self.agent_env_vars["LLM_NATIVE_TOOL_CALLING"] = "false"
 
             if getattr(config, "send_reasoning_content", False):
                 self.agent_env_vars["LLM_SEND_REASONING_CONTENT"] = "true"
@@ -361,10 +362,12 @@ def _apply_override(flag_name: str, value: Optional[str], key_map: Dict[str, str
                 metadata = {}
 
             if config.agent == "openhands":
-                # Force native tool calling in resume mode strictly follows metadata.
-                force_native = bool(metadata.get("force_native_tool_calling"))
-                if force_native:
+                # Native tool calling in resume mode strictly follows metadata.
+                native_tool_calling = metadata.get("native_tool_calling")
+                if native_tool_calling is True:
                     self.agent_env_vars["LLM_NATIVE_TOOL_CALLING"] = "true"
+                elif native_tool_calling is False:
+                    self.agent_env_vars["LLM_NATIVE_TOOL_CALLING"] = "false"
                 else:
                     self.agent_env_vars.pop("LLM_NATIVE_TOOL_CALLING", None)
 
@@ -787,11 +790,15 @@ def _save_run_metadata(self, task_ids: List[str]) -> None:
         # Persist the *effective* reasoning effort used by the agent (if any).
         openhands_reasoning_effort: Optional[str] = None
         codex_reasoning_effort: Optional[str] = None
+        native_tool_calling: Optional[bool] = None
         send_reasoning_content = False
         if self.config.agent == "openhands":
             raw = self.agent_env_vars.get("LLM_REASONING_EFFORT")
             if raw is not None and str(raw).strip():
                 openhands_reasoning_effort = str(raw).strip()
+            raw = self.agent_env_vars.get("LLM_NATIVE_TOOL_CALLING")
+            if raw is not None and str(raw).strip():
+                native_tool_calling = str(raw).strip().lower() in {"1", "true", "yes", "on"}
             send_reasoning_content = str(
                 self.agent_env_vars.get("LLM_SEND_REASONING_CONTENT", "")
             ).strip().lower() in {"1", "true", "yes", "on"}
@@ -820,7 +827,7 @@ def _save_run_metadata(self, task_ids: List[str]) -> None:
             level=self.config.level,
             without_interface_descriptions=self.config.without_interface_descriptions,
             white_box=getattr(self.config, "white_box", False),
-            force_native_tool_calling=getattr(self.config, "force_native_tool_calling", False),
+            native_tool_calling=native_tool_calling,
             send_reasoning_content=send_reasoning_content,
             force_timeout=getattr(self.config, "force_timeout", False),
             api_key=self.config.api_key,
@@ -861,8 +868,16 @@ def run(self) -> None:
         if getattr(self.config, "white_box", False):
             self.console.print("[white]Prompt:[/] [yellow]white-box (tests visible)[/]")
         if self.config.agent == "openhands":
-            if getattr(self.config, "force_native_tool_calling", False):
-                self.console.print("[white]Tool calling:[/] [yellow]forced native[/]")
+            native_tool_calling = self.agent_env_vars.get("LLM_NATIVE_TOOL_CALLING")
+            native_tool_calling = (
+                None
+                if native_tool_calling is None or not str(native_tool_calling).strip()
+                else str(native_tool_calling).strip().lower() in {"1", "true", "yes", "on"}
+            )
+            if native_tool_calling is True:
+                self.console.print("[white]Tool calling:[/] [yellow]native forced on[/]")
+            elif native_tool_calling is False:
+                self.console.print("[white]Tool calling:[/] [yellow]native forced off[/]")
             send_reasoning_content = str(
                 self.agent_env_vars.get("LLM_SEND_REASONING_CONTENT", "")
             ).strip().lower() in {"1", "true", "yes", "on"}
@@ -1263,11 +1278,20 @@ def parse_args() -> argparse.Namespace:
         ),
     )
 
-    parser.add_argument(
+    native_tool_group = parser.add_mutually_exclusive_group()
+    native_tool_group.add_argument(
         "--native-tool-calling",
         action="store_true",
         help=(
-            "OpenHands only: force native tool calling (sets LLM_NATIVE_TOOL_CALLING=true inside the container). "
+            "OpenHands only: force native tool calling on (sets LLM_NATIVE_TOOL_CALLING=true inside the container). "
+            "In --resume mode, this flag is ignored and the value from run_metadata.json is used."
+        ),
+    )
+    native_tool_group.add_argument(
+        "--no-native-tool-calling",
+        action="store_true",
+        help=(
+            "OpenHands only: force native tool calling off (sets LLM_NATIVE_TOOL_CALLING=false inside the container). "
             "In --resume mode, this flag is ignored and the value from run_metadata.json is used."
         ),
     )
@@ -1391,7 +1415,11 @@ def load_resume_config(resume_dir: Path, args: argparse.Namespace) -> Tuple[Infe
         )
     if getattr(args, "native_tool_calling", False):
         warnings.append(
-            "--native-tool-calling (using 'force_native_tool_calling' from metadata)"
+            "--native-tool-calling (using 'native_tool_calling' from metadata)"
+        )
+    if getattr(args, "no_native_tool_calling", False):
+        warnings.append(
+            "--no-native-tool-calling (using 'native_tool_calling' from metadata)"
         )
     if getattr(args, "send_reasoning_content", False):
         warnings.append(
@@ -1460,8 +1488,8 @@ def load_resume_config(resume_dir: Path, args: argparse.Namespace) -> Tuple[Infe
     # Determine white_box: always use metadata in resume mode.
     white_box = bool(metadata.get("white_box"))
 
-    # Determine force_native_tool_calling: always use metadata in resume mode.
-    force_native_tool_calling = bool(metadata.get("force_native_tool_calling"))
+    # Determine native_tool_calling: always use metadata in resume mode.
+    native_tool_calling = metadata.get("native_tool_calling")
 
     # Determine send_reasoning_content: always use metadata in resume mode.
     send_reasoning_content = bool(metadata.get("send_reasoning_content"))
@@ -1492,7 +1520,7 @@ def load_resume_config(resume_dir: Path, args: argparse.Namespace) -> Tuple[Infe
         split=metadata.get('split'),  # Use split from metadata
         without_interface_descriptions=without_interface_descriptions,
         white_box=white_box,
-        force_native_tool_calling=force_native_tool_calling,
+        native_tool_calling=native_tool_calling,
         send_reasoning_content=send_reasoning_content,
         force_timeout=force_timeout,
         force_rerun_ids=_load_force_rerun_ids(getattr(args, "force_rerun", None)),
@@ -1558,7 +1586,13 @@ def main():
             split=args.split if args.split is not None else "full",
             without_interface_descriptions=bool(getattr(args, "without", False)),
             white_box=bool(getattr(args, "white", False)),
-            force_native_tool_calling=bool(getattr(args, "native_tool_calling", False)),
+            native_tool_calling=(
+                True
+                if getattr(args, "native_tool_calling", False)
+                else False
+                if getattr(args, "no_native_tool_calling", False)
+                else None
+            ),
             send_reasoning_content=bool(getattr(args, "send_reasoning_content", False)),
             force_timeout=bool(getattr(args, "force_timeout", False)),
             force_rerun_ids=_load_force_rerun_ids(getattr(args, "force_rerun", None)),
@@ -1573,4 +1607,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()