NVIDIA
diff --git a/‎garak/generators/openai.py‎
Lines changed: 167 additions & 0 deletions b/‎garak/generators/openai.py‎
Lines changed: 167 additions & 0 deletions
@@ -431,4 +431,171 @@ class OpenAIReasoningGenerator(OpenAIGenerator):
     }
 
 
+class OpenAIResponsesGenerator(Generator):
+    """Generator using the OpenAI Responses API with server-side tool orchestration.
+
+    Supports MCP servers and function tools via the ``tools`` parameter.
+    Unlike the chat-completions generators, the Responses API runs the full
+    agentic loop (tool calls → execution → follow-up) on the server side,
+    returning only the final text to garak.
+    """
+
+    ENV_VAR = "OPENAI_API_KEY"
+    active = True
+    generator_family_name = "OpenAIResponses"
+    supports_multiple_generations = False
+
+    DEFAULT_PARAMS = Generator.DEFAULT_PARAMS | {
+        "uri": None,
+        "instructions": None,
+        "tools": [],
+        "max_output_tokens": 1024,
+        "extra_params": {},
+        # response.output item types to collect into the final text.
+        # "message" captures standard assistant text; "reasoning" captures
+        # the model's reasoning summary (ResponseReasoningItem).
+        "output_types": ["message"],
+    }
+
+    _unsafe_attributes = ["client"]
+
+    def _load_unsafe(self):
+        kwargs = {"api_key": getattr(self, "api_key", None)}
+        if getattr(self, "uri", None):
+            kwargs["base_url"] = self.uri
+        self.client = openai.OpenAI(**kwargs)
+
+    def __init__(self, name="", config_root=_config):
+        self.name = name
+        self._load_config(config_root)
+        self.fullname = f"{self.generator_family_name} {self.name}"
+        self.key_env_var = self.ENV_VAR
+        self._load_unsafe()
+        super().__init__(self.name, config_root=config_root)
+
+    @staticmethod
+    def _build_input(prompt: Union[Conversation, str]):
+        """Convert a Conversation (or raw string) to the Responses API input format.
+
+        System turns are excluded — callers should promote them to ``instructions``
+        via :meth:`_extract_system_prompt` before calling this method.
+        """
+        if isinstance(prompt, str):
+            return prompt
+        non_system = [t for t in prompt.turns if t.role != "system"]
+        if len(non_system) == 1:
+            return non_system[0].content.text
+        items = []
+        for turn in non_system:
+            content_type = "output_text" if turn.role == "assistant" else "input_text"
+            items.append(
+                {
+                    "type": "message",
+                    "role": turn.role,
+                    "content": [{"type": content_type, "text": turn.content.text}],
+                }
+            )
+        return items
+
+    @staticmethod
+    def _extract_system_prompt(prompt: Union[Conversation, str]) -> Union[str, None]:
+        """Return system turn text from a Conversation as a single string, or None.
+
+        Multiple system turns are joined with a newline so no content is lost.
+        """
+        if not isinstance(prompt, Conversation):
+            return None
+        system_texts = [t.content.text for t in prompt.turns if t.role == "system"]
+        return "\n".join(system_texts) if system_texts else None
+
+    @backoff.on_exception(
+        backoff.fibo,
+        (
+            openai.RateLimitError,
+            openai.InternalServerError,
+            openai.APITimeoutError,
+            openai.APIConnectionError,
+            garak.exception.GeneratorBackoffTrigger,
+        ),
+        max_value=70,
+    )
+    def _call_model(
+        self, prompt: Union[Conversation, str], generations_this_call: int = 1
+    ) -> List[Union[Message, None]]:
+        if self.client is None:
+            self._load_unsafe()
+
+        instructions = self.instructions or self._extract_system_prompt(prompt)
+        create_args = {
+            "model": self.name,
+            "input": self._build_input(prompt),
+            "max_output_tokens": self.max_output_tokens,
+        }
+        if instructions:
+            create_args["instructions"] = instructions
+        if self.tools:
+            create_args["tools"] = self.tools
+        for k, v in self.extra_params.items():
+            create_args[k] = v
+
+        try:
+            response = self.client.responses.create(**create_args)
+        except openai.BadRequestError as e:
+            logging.exception(e)
+            logging.error("Bad request: %s", repr(prompt))
+            return [None]
+        except json.decoder.JSONDecodeError as e:
+            logging.exception(e)
+            raise garak.exception.GeneratorBackoffTrigger from e
+
+        text_parts = []
+        tool_calls = []
+
+        # Manually iterate response.output rather than using response.output_text so
+        # that output_types can include non-message items such as reasoning summaries.
+        _TOOL_CALL_ATTRS = (
+            "call_id", "name", "arguments", "input", "output",
+            "error", "status", "server_label",
+        )
+        for item in response.output:
+            item_type = getattr(item, "type", None)
+            # Any item whose type ends with "_call" is treated as a tool invocation and
+            # captured in tool_calls. This covers all current
+            # (function_call, mcp_call, web_search_call, file_search_call, computer_call)
+            # and future tool types without needing per-type handling.
+            if item_type is not None and item_type.endswith("_call"):
+                entry = {"type": item_type, "id": getattr(item, "id", None)}
+                for attr in _TOOL_CALL_ATTRS:
+                    val = getattr(item, attr, None)
+                    if val is not None:
+                        entry[attr] = val
+                tool_calls.append(entry)
+                continue
+            # Only attributes that are present on the item are included, so the dict shape varies by type.
+            if item_type not in self.output_types:
+                continue
+            if item_type == "message":
+                for part in item.content:
+                    if getattr(part, "type", None) == "output_text":
+                        text_parts.append(part.text)
+            elif item_type == "reasoning":
+                for summary in getattr(item, "summary", []):
+                    if getattr(summary, "type", None) == "summary_text":
+                        text_parts.append(summary.text)
+            else:
+                text = getattr(item, "text", None)
+                if text is not None:
+                    text_parts.append(text)
+                else:
+                    logging.warning("No text extraction defined for output item type %r", item_type)
+
+        text = "\n".join(text_parts) if text_parts else None
+        notes = {"tool_calls": tool_calls} if tool_calls else {}
+        # Responses API doesn't support multiple choices, we'll always return a list of one Message.
+        # Return a Message whenever there is text or tool call metadata; None only when both are absent.
+        if text is not None or tool_calls:
+            return [Message(text, notes=notes)]
+        return [None]
+
+
 DEFAULT_CLASS = "OpenAIGenerator"