Skip to content

Commit 7992427

Browse files
Alex Reibman (with Cursor Agent)
authored
Create openai responses api test (#1156)
* Add o3 Responses API integration tests and documentation
* Remove o3 integration tests, update README, add o3 responses example
* responses fix
* yaml
* ruff
* Fix Ruff CI failures: remove unused variables and apply formatting
* auto start false
* safer tool args get

Co-authored-by: Cursor Agent <cursoragent@cursor.com>
Co-authored-by: alex <alex@agentops.ai>
Co-authored-by: Alex Reibman <reibs@Alexs-MBP.attlocal.net>
Co-authored-by: Alex Reibman <reibs@macbookpro.lan>
1 parent 83d333e commit 7992427

File tree

11 files changed

+965
-121
lines changed

11 files changed

+965
-121
lines changed

.github/workflows/examples-integration-test.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ jobs:
3636
- { path: 'examples/openai/openai_example_async.py', name: 'OpenAI Async' }
3737
- { path: 'examples/openai/multi_tool_orchestration.py', name: 'OpenAI Multi-Tool' }
3838
- { path: 'examples/openai/web_search.py', name: 'OpenAI Web Search' }
39+
- { path: 'examples/openai/o3_responses_example.py', name: 'OpenAI o3 Responses' }
3940

4041
# Anthropic examples
4142
- { path: 'examples/anthropic/anthropic-example-sync.py', name: 'Anthropic Sync' }

agentops/instrumentation/providers/openai/stream_wrapper.py

Lines changed: 306 additions & 112 deletions
Large diffs are not rendered by default.

agentops/instrumentation/providers/openai/wrappers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
handle_run_stream_attributes,
1515
handle_messages_attributes,
1616
)
17+
from agentops.instrumentation.providers.openai.wrappers.responses import handle_responses_attributes
1718

1819
__all__ = [
1920
"handle_chat_attributes",
@@ -25,4 +26,5 @@
2526
"handle_run_retrieve_attributes",
2627
"handle_run_stream_attributes",
2728
"handle_messages_attributes",
29+
"handle_responses_attributes",
2830
]
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
"""Responses API wrapper for OpenAI instrumentation.
2+
3+
This module provides attribute extraction for OpenAI Responses API endpoints.
4+
"""
5+
6+
import json
7+
import logging
8+
from typing import Any, Dict, Optional, Tuple
9+
10+
from agentops.instrumentation.providers.openai.utils import is_openai_v1
11+
from agentops.instrumentation.providers.openai.wrappers.shared import (
12+
model_as_dict,
13+
should_send_prompts,
14+
)
15+
from agentops.instrumentation.common.attributes import AttributeMap
16+
from agentops.semconv import SpanAttributes, LLMRequestTypeValues
17+
18+
logger = logging.getLogger(__name__)
19+
20+
21+
def handle_responses_attributes(
    args: Optional[Tuple] = None,
    kwargs: Optional[Dict] = None,
    return_value: Optional[Any] = None,
) -> AttributeMap:
    """Extract span attributes from OpenAI Responses API calls.

    Args:
        args: Positional arguments of the wrapped call (currently unused).
        kwargs: Request keyword arguments (model, input, tools, stream, ...).
        return_value: The API response -- a pydantic model or a plain dict.

    Returns:
        A map of semantic-convention span attributes describing the call.
    """
    attributes: AttributeMap = {
        SpanAttributes.LLM_SYSTEM: "OpenAI",
        SpanAttributes.LLM_REQUEST_TYPE: LLMRequestTypeValues.CHAT.value,
    }

    if kwargs:
        _extract_request_attributes(kwargs, attributes)

    if return_value:
        _extract_response_attributes(return_value, attributes)

    return attributes


def _extract_request_attributes(kwargs: Dict, attributes: AttributeMap) -> None:
    """Populate *attributes* from the request keyword arguments."""
    if "model" in kwargs:
        attributes[SpanAttributes.LLM_REQUEST_MODEL] = kwargs["model"]

    # Scalar request parameters are copied verbatim when present.
    for key, attr in (
        ("max_tokens", SpanAttributes.LLM_REQUEST_MAX_TOKENS),
        ("temperature", SpanAttributes.LLM_REQUEST_TEMPERATURE),
        ("top_p", SpanAttributes.LLM_REQUEST_TOP_P),
        ("frequency_penalty", SpanAttributes.LLM_REQUEST_FREQUENCY_PENALTY),
        ("presence_penalty", SpanAttributes.LLM_REQUEST_PRESENCE_PENALTY),
        ("user", SpanAttributes.LLM_USER),
    ):
        if key in kwargs:
            attributes[attr] = kwargs[key]

    attributes[SpanAttributes.LLM_REQUEST_STREAMING] = kwargs.get("stream", False)

    # Input messages (only captured when prompt recording is enabled).
    if should_send_prompts() and "input" in kwargs:
        messages = kwargs["input"]
        if isinstance(messages, str):
            # The Responses API also accepts a bare string as input; record it
            # as a single user prompt instead of silently dropping it (the
            # previous code iterated the string char-by-char and captured
            # nothing).
            attributes[f"{SpanAttributes.LLM_PROMPTS}.0.role"] = "user"
            attributes[f"{SpanAttributes.LLM_PROMPTS}.0.content"] = messages
        else:
            for i, msg in enumerate(messages):
                prefix = f"{SpanAttributes.LLM_PROMPTS}.{i}"
                if isinstance(msg, dict):
                    if "role" in msg:
                        attributes[f"{prefix}.role"] = msg["role"]
                    if "content" in msg:
                        content = msg["content"]
                        if isinstance(content, list):
                            # Multi-part content blocks are serialized to JSON.
                            content = json.dumps(content)
                        attributes[f"{prefix}.content"] = content

    # Tool / function definitions supplied with the request.
    for i, tool in enumerate(kwargs.get("tools") or ()):
        if isinstance(tool, dict) and "function" in tool:
            function = tool["function"]
            prefix = f"{SpanAttributes.LLM_REQUEST_FUNCTIONS}.{i}"
            if "name" in function:
                attributes[f"{prefix}.name"] = function["name"]
            if "description" in function:
                attributes[f"{prefix}.description"] = function["description"]
            if "parameters" in function:
                attributes[f"{prefix}.parameters"] = json.dumps(function["parameters"])


def _extract_response_attributes(return_value: Any, attributes: AttributeMap) -> None:
    """Populate *attributes* from the API response object."""
    # Normalize the response to a plain dict regardless of SDK version /
    # return shape. The __iter__ check keeps dict-likes out of model_as_dict.
    response_dict: Dict = {}
    if hasattr(return_value, "__dict__") and not hasattr(return_value, "__iter__"):
        response_dict = model_as_dict(return_value)
    elif isinstance(return_value, dict):
        response_dict = return_value
    elif hasattr(return_value, "model_dump"):
        response_dict = return_value.model_dump()

    if "id" in response_dict:
        attributes[SpanAttributes.LLM_RESPONSE_ID] = response_dict["id"]
    if "model" in response_dict:
        attributes[SpanAttributes.LLM_RESPONSE_MODEL] = response_dict["model"]

    usage = response_dict.get("usage", {})
    if usage:
        if is_openai_v1() and hasattr(usage, "__dict__"):
            usage = usage.__dict__
        if "total_tokens" in usage:
            attributes[SpanAttributes.LLM_USAGE_TOTAL_TOKENS] = usage["total_tokens"]
        # The Responses API reports input_tokens/output_tokens rather than the
        # Chat Completions prompt_tokens/completion_tokens names.
        if "input_tokens" in usage:
            attributes[SpanAttributes.LLM_USAGE_PROMPT_TOKENS] = usage["input_tokens"]
        if "output_tokens" in usage:
            attributes[SpanAttributes.LLM_USAGE_COMPLETION_TOKENS] = usage["output_tokens"]

        # Reasoning tokens (reported by reasoning models such as o3).
        output_details = usage.get("output_tokens_details", {})
        if isinstance(output_details, dict) and "reasoning_tokens" in output_details:
            attributes[SpanAttributes.LLM_USAGE_REASONING_TOKENS] = output_details["reasoning_tokens"]

    if should_send_prompts() and "output" in response_dict:
        _extract_output_attributes(response_dict["output"], attributes)


def _extract_output_attributes(output_items: Any, attributes: AttributeMap) -> None:
    """Record completion attributes for each item in the response ``output`` list."""
    completion_idx = 0
    for i, output_item in enumerate(output_items):
        # Normalize each item: dicts pass through, pydantic models are read
        # for their type and then converted to a dict view.
        if isinstance(output_item, dict):
            item_type = output_item.get("type")
        elif hasattr(output_item, "type"):
            item_type = output_item.type
            output_item_dict = model_as_dict(output_item)
            if output_item_dict and isinstance(output_item_dict, dict):
                output_item = output_item_dict
            else:
                continue
        else:
            continue

        if item_type == "message":
            if isinstance(output_item, dict):
                content = output_item.get("content", [])
                if isinstance(content, list):
                    # Aggregate all text parts into a single completion entry.
                    text_parts = [
                        part.get("text", "")
                        for part in content
                        if isinstance(part, dict) and part.get("type") == "text" and part.get("text")
                    ]
                    if text_parts:
                        attributes[f"{SpanAttributes.LLM_COMPLETIONS}.{completion_idx}.content"] = "".join(text_parts)
                        attributes[f"{SpanAttributes.LLM_COMPLETIONS}.{completion_idx}.role"] = "assistant"
                        completion_idx += 1
                elif isinstance(content, str):
                    # Simple string content.
                    attributes[f"{SpanAttributes.LLM_COMPLETIONS}.{completion_idx}.content"] = content
                    attributes[f"{SpanAttributes.LLM_COMPLETIONS}.{completion_idx}.role"] = "assistant"
                    completion_idx += 1

        elif item_type == "function_call" and isinstance(output_item, dict):
            # For function calls the arguments carry the model's output.
            args_str = output_item.get("arguments", "")
            if args_str:
                try:
                    parsed = json.loads(args_str)
                except json.JSONDecodeError:
                    parsed = None
                # isinstance guard fixes an uncaught AttributeError in the
                # original: arguments JSON that decodes to a non-dict (list,
                # string, number) has no .get(). Also avoids shadowing the
                # function's `args` parameter.
                if isinstance(parsed, dict):
                    # Extract reasoning if present (common in o3 models).
                    reasoning = parsed.get("reasoning", "")
                    if reasoning:
                        attributes[f"{SpanAttributes.LLM_COMPLETIONS}.{completion_idx}.content"] = reasoning
                        attributes[f"{SpanAttributes.LLM_COMPLETIONS}.{completion_idx}.role"] = "assistant"
                        completion_idx += 1

            # Tool-call details keep the positional index `i` to match the
            # output list ordering (as in the original implementation).
            attributes[f"{SpanAttributes.LLM_COMPLETIONS}.{i}.tool_calls.0.id"] = output_item.get("id", "")
            attributes[f"{SpanAttributes.LLM_COMPLETIONS}.{i}.tool_calls.0.name"] = output_item.get("name", "")
            attributes[f"{SpanAttributes.LLM_COMPLETIONS}.{i}.tool_calls.0.arguments"] = args_str

        elif item_type == "reasoning" and isinstance(output_item, dict):
            # Reasoning items emitted by o3-style models.
            # NOTE(review): the API may return ``summary`` as a list of summary
            # parts rather than a string -- confirm; it is recorded as-is here.
            summary = output_item.get("summary", "")
            if summary:
                attributes[f"{SpanAttributes.LLM_COMPLETIONS}.{completion_idx}.content"] = summary
                attributes[f"{SpanAttributes.LLM_COMPLETIONS}.{completion_idx}.role"] = "assistant"
                attributes[f"{SpanAttributes.LLM_COMPLETIONS}.{completion_idx}.type"] = "reasoning"
                completion_idx += 1

examples/agno/agno_async_operations.ipynb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@
4646
"\n",
4747
"import agentops\n",
4848
"from agno.agent import Agent\n",
49-
"from agno.team import Team\n",
5049
"from agno.models.openai import OpenAIChat"
5150
]
5251
},

examples/langgraph/langgraph_example.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
"from langgraph.graph import StateGraph, END\n",
4646
"from langgraph.graph.message import add_messages\n",
4747
"from langchain_openai import ChatOpenAI\n",
48-
"from langchain_core.messages import HumanMessage, AIMessage, ToolMessage\n",
48+
"from langchain_core.messages import HumanMessage, ToolMessage\n",
4949
"from langchain_core.tools import tool\n",
5050
"import agentops\n",
5151
"from dotenv import load_dotenv\n",

examples/mem0/mem0_memory_example.ipynb

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,6 @@
5555
"from mem0 import Memory, AsyncMemory\n",
5656
"import os\n",
5757
"import asyncio\n",
58-
"import logging\n",
59-
"from dotenv import load_dotenv\n",
6058
"import agentops"
6159
]
6260
},
@@ -189,7 +187,7 @@
189187
" print(f\"Delete all result: {delete_all_result}\")\n",
190188
"\n",
191189
" agentops.end_trace(end_state=\"success\")\n",
192-
" except Exception as e:\n",
190+
" except Exception:\n",
193191
" agentops.end_trace(end_state=\"error\")"
194192
]
195193
},
@@ -263,7 +261,7 @@
263261
"\n",
264262
" agentops.end_trace(end_state=\"success\")\n",
265263
"\n",
266-
" except Exception as e:\n",
264+
" except Exception:\n",
267265
" agentops.end_trace(end_state=\"error\")"
268266
]
269267
},

examples/mem0/mem0_memoryclient_example.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@
199199
" delete_all_result = client.delete_all(user_id=user_id)\n",
200200
" print(f\"Delete all result: {delete_all_result}\")\n",
201201
" agentops.end_trace(end_state=\"success\")\n",
202-
" except Exception as e:\n",
202+
" except Exception:\n",
203203
" agentops.end_trace(end_state=\"error\")"
204204
]
205205
},
@@ -279,7 +279,7 @@
279279
"\n",
280280
" agentops.end_trace(end_state=\"success\")\n",
281281
"\n",
282-
" except Exception as e:\n",
282+
" except Exception:\n",
283283
" agentops.end_trace(end_state=\"error\")"
284284
]
285285
},

examples/openai/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,16 @@ Example: `web_search`
3939
This example demonstrates:
4040
- Web search functionality
4141

42+
### 5. o3 Responses API
43+
44+
Example: `o3_responses_example`
45+
46+
This example demonstrates:
47+
- OpenAI's o3 reasoning model with the Responses API
48+
- Tool calls and structured reasoning
49+
- Complex decision-making scenarios
50+
- AgentOps integration with reasoning models
51+
4252
## AgentOps Integration
4353

4454
These examples show how to use AgentOps to monitor and analyze your AI applications. AgentOps automatically instruments your OpenAI calls to provide insights into performance, usage patterns, and model behavior.

0 commit comments

Comments
 (0)