diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 07ae2d0..888ac5c 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "opik", - "version": "0.1.0", - "description": "LLM observability tooling for agent development and Claude Code", + "version": "0.2.0", + "description": "LLM observability tooling for agent development and Claude Code — Opik 2.0 ready", "author": { "name": "Comet ML", "url": "https://comet.com" diff --git a/agents/agent-reviewer.md b/agents/agent-reviewer.md index fdf0f27..2c03573 100644 --- a/agents/agent-reviewer.md +++ b/agents/agent-reviewer.md @@ -287,9 +287,9 @@ Evaluate resource consumption: - Unbounded agent loops that burn tokens - No budget alerts or hard caps -### 10. Observability +### 10. Observability & Opik 2.0 Patterns -Evaluate monitoring and debugging capabilities: +Evaluate monitoring, debugging, and Opik 2.0 compliance: **Trace Initialization (Critical):** - Tracing starts BEFORE agent execution, not after @@ -298,10 +298,17 @@ Evaluate monitoring and debugging capabilities: - Trace input matches actual agent input (enables replay) - Trace ID available from first instruction +**Opik 2.0 Requirements (Critical):** +- **`entrypoint=True`**: The main agent function MUST have `entrypoint=True` in its `@opik.track` decorator. Without this, the agent cannot be triggered via the Local Runner. +- **Docstring with Args**: The entrypoint function MUST have a docstring with `Args:` descriptions. The Local Runner uses this for schema discovery. +- **Configuration externalized**: Hardcoded model names, temperatures, system prompts, and max_tokens SHOULD be extracted into an `opik.AgentConfig` subclass (not left inline). This enables Blueprint management via the Opik UI. +- **`thread_id` for conversations**: If the agent handles multi-turn conversations (has message history, chat loops, session state), it MUST set `thread_id` on traces. 
Without this, conversation turns appear as unrelated traces and thread-level metrics don't work. +- **Evaluation Suites**: Code should use `get_or_create_evaluation_suite()` for testing, NOT the old `get_or_create_dataset()` API. + **Tracing:** - Full execution traces with input/output capture - Span hierarchy showing tool calls and reasoning -- Correct span types used: `general`, `tool`, `llm`, `retrieval`, `guardrail` +- Correct span types used: `general`, `tool`, `llm`, `guardrail` (NOT `retrieval`) - Correlation IDs across distributed components - Complete request lifecycle from input to final output @@ -322,7 +329,7 @@ Evaluate monitoring and debugging capabilities: - Task completion rates **Evaluation:** -- Pre-production quality checks +- Pre-production quality checks via Evaluation Suites - Production monitoring for drift/regression - Feedback loops for continuous improvement - Agent-specific metrics: task completion, tool correctness, trajectory accuracy @@ -338,6 +345,11 @@ Evaluate monitoring and debugging capabilities: - No metrics on performance or cost - No way to debug failed executions - No alerting for anomalies +- **Missing `entrypoint=True`** on the main agent function +- **Missing config dataclass** — hardcoded model/temperature/prompt values +- **Missing `thread_id`** in a conversational agent +- **Using old Datasets API** instead of Evaluation Suites +- **Missing docstring** on the entrypoint function ### 11. 
State Management @@ -411,6 +423,15 @@ Structure your review as follows: - [ ] Alerting for anomalies - [ ] Debug mode for development +### Opik 2.0 Checklist + +- [ ] `entrypoint=True` on the main agent function +- [ ] Docstring with `Args:` on the entrypoint function +- [ ] Config externalized into `opik.AgentConfig` subclass (no hardcoded model/temperature/prompt) +- [ ] `thread_id` set for conversational agents (multi-turn) +- [ ] Uses Evaluation Suites API, NOT old Datasets API +- [ ] Span types are correct (`general`, `llm`, `tool`, `guardrail` — NOT `retrieval`) + ### Resource Management Checklist - [ ] Token/cost limits per request and session diff --git a/bin/opik-logger-darwin-arm64 b/bin/opik-logger-darwin-arm64 index 753a2a4..41beebf 100755 Binary files a/bin/opik-logger-darwin-arm64 and b/bin/opik-logger-darwin-arm64 differ diff --git a/commands/connect.md b/commands/connect.md new file mode 100644 index 0000000..55981fe --- /dev/null +++ b/commands/connect.md @@ -0,0 +1,96 @@ +--- +description: Connect your agent to Opik for triggering from the browser UI via the Local Runner +argument-hint: [--pair CODE] +allowed-tools: + - Bash + - Read + - Grep + - Glob +model: haiku +--- + +# Connect Agent to Opik (Local Runner) + +Set up `opik connect` so the user's agent can be triggered from the Opik browser UI while running locally. + +**User request:** $ARGUMENTS + +## Step 1: Check Prerequisites + +### 1a. Verify opik CLI is installed + +Run `opik --version`. If not found: +- Check if `opik` is installed: `pip show opik` or `pip3 show opik` +- If not installed: `pip install opik` +- If installed but not on PATH: suggest `python -m opik --version` + +### 1b. Verify there's an entrypoint function + +Search the codebase for `entrypoint=True`: + +``` +grep -r "entrypoint=True" --include="*.py" . +``` + +If no entrypoint found: +- Tell the user: "No entrypoint function found. Run `/opik:instrument` first to add `entrypoint=True` to your main agent function." 
+- Stop here. + +### 1c. Verify the entrypoint has a docstring with Args + +Read the entrypoint function and check it has a docstring with `Args:` descriptions. The Local Runner uses this to build the input form in the UI. If missing, add it. + +## Step 2: Detect Cloud vs OSS + +Check for Opik configuration: + +1. Check `OPIK_API_KEY` env var +2. Check `~/.opik.config` for `api_key` field +3. Check `OPIK_BASE_URL` or `url_override` in config + +**If API key exists** → Cloud mode +**If no API key but URL points to localhost** → OSS mode +**If neither** → Run `opik configure` first + +## Step 3: Connect + +### Cloud Mode + +```bash +opik connect +``` + +This automatically authenticates using the API key and registers the agent. + +### OSS Mode + +1. Tell the user: "Open the Opik UI in your browser and look for the 'Connect Agent' button to get a pairing code." +2. Once they provide the code: + +```bash +opik connect --pair <CODE> +``` + +## Step 4: Verify Connection + +After connecting: +- Confirm the runner is connected and listening +- Tell the user they can now go to the Opik UI and trigger their agent from the browser +- The agent will execute locally on their machine, and traces will appear in Opik + +## Error Handling + +| Error | Solution | +|-------|----------| +| "No entrypoint found" | Run `/opik:instrument` first | +| "Connection refused" | Check if Opik server is running (OSS) or API key is valid (Cloud) | +| "Invalid pair code" | Code expires — get a new one from the UI | +| "Port already in use" | Another runner may be active — check with `lsof -i :<PORT>` | +| "Authentication failed" | Run `opik configure` to set up credentials | + +## Notes + +- The runner stays active as long as the terminal is open +- Multiple agents can be connected simultaneously +- Traces from UI-triggered runs appear in the same project as local runs +- Config changes made in the UI take effect on the next run (via Blueprints) diff --git a/commands/create-eval-suite.md
b/commands/create-eval-suite.md new file mode 100644 index 0000000..c87d208 --- /dev/null +++ b/commands/create-eval-suite.md @@ -0,0 +1,144 @@ +--- +description: Create an Evaluation Suite for your agent with assertions and test items +argument-hint: [description of what to test] +allowed-tools: + - Read + - Write + - Edit + - Glob + - Grep + - Skill + - Bash +model: sonnet +--- + +# Create Evaluation Suite + +Generate a Python file that creates an Evaluation Suite with test items, assertions, and execution policies for the user's agent. + +**User request:** $ARGUMENTS + +## Step 1: Load Skills + +Use the Skill tool to load BOTH: +1. **`opik`** — SDK reference for Evaluation Suite API +2. **`agent-ops`** — Evaluation patterns and metrics + +## Step 2: Understand the Agent + +Read the agent's code to understand: +1. **Input schema** — What does the agent accept? (e.g., `question: str`, `query: str, context: str`) +2. **Output format** — What does it return? (string, dict, structured data) +3. **Purpose** — What kind of agent is it? (customer support, research, code generation, etc.) +4. **Config** — Does it use `AgentConfig` or similar? Note the config values. +5. **Framework** — OpenAI, LangChain, CrewAI, etc. + +Find the entrypoint function (look for `entrypoint=True` or the main function). 
+ +## Step 3: Generate the Evaluation Suite + +Create a Python file (e.g., `eval_suite.py` or `tests/eval_<agent>.py`) with: + +### Template + +```python +from opik import Opik + +client = Opik() + +# Create or get the evaluation suite with suite-level assertions +suite = client.get_or_create_evaluation_suite( + name="<agent-name>-suite", + description="Evaluation suite for <agent-name>", + assertions=[ + "Response is factually accurate and not hallucinated", + "Response is professional in tone", + ], + execution_policy={"runs_per_item": 3, "pass_threshold": 2}, +) + +# --- Happy Path Items --- +suite.add_item( + data={"input": "<typical user question>"}, + assertions=["Response mentions <expected content>"], +) + +suite.add_item( + data={"input": "<another typical question>"}, +) + +# --- Edge Cases --- +suite.add_item( + data={"input": "<edge case input>"}, + assertions=["Response asks for clarification or provides a best-effort answer"], +) + +suite.add_item( + data={"input": "<another edge case>"}, +) + +# --- Adversarial Items --- +suite.add_item( + data={"input": "<prompt injection attempt>"}, + assertions=[ + "Response does not follow injected instructions", + "Response stays on topic and is safe", + ], +) + +# --- High-Stakes Items (with item-level assertion overrides) --- +suite.add_item( + data={"input": "<high-stakes question>"}, + assertions=[ + "Response includes appropriate safety disclaimers", + "Response is empathetic and careful", + ], +) + +# --- Run the Suite --- +def task(item): + """Run the agent on a test item.""" + # Import and call the agent's entrypoint + from <agent_module> import <entrypoint_function> + result = <entrypoint_function>(item["input"]) + return {"output": result} + +results = suite.run( + task=task, + model="gpt-4o", # LLM used to judge assertions +) + +# Print summary +print(results) + +# CI gate - script exits non-zero on failure +assert results.all_passed, "Evaluation suite failed" +``` + +## Step 4: Customize for the Agent + +Replace all placeholder values with real ones: +1. **Agent name** — use the actual project/agent name +2.
**Test items** — generate 5-10 items relevant to the agent's purpose: + - 2-3 happy path (typical usage) + - 1-2 edge cases (minimal input, max length, special characters) + - 1-2 adversarial (prompt injection, off-topic) + - 1-2 high-stakes (items where failure has real consequences) +3. **Assertions** — choose appropriate ones per item +4. **Task function** — import the actual agent entrypoint +5. **Execution policy** — `runs_per_item=3, pass_threshold=2` is a good default + +## Step 5: Validate + +1. Run a syntax check: `python -c "import ast; ast.parse(open('eval_suite.py').read())"` +2. Verify the agent import works: `python -c "from <agent_module> import <entrypoint_function>"` +3. Tell the user they can run `python eval_suite.py` to execute the suite + +## Important Rules + +- **Use ONLY the Evaluation Suite API** (`get_or_create_evaluation_suite`). Do NOT use the old Datasets API (`get_or_create_dataset`). +- **Suites appear under "Evaluation Suites"** in the Opik UI sidebar, NOT under "Datasets". +- **Assertions are plain strings** — write natural language descriptions of what the LLM judge should check. Do NOT use dict format like `{"type": "no_hallucination"}`. +- **Include both suite-level AND item-level assertions** — suite-level for baseline quality, item-level for specific requirements. +- **Set execution_policy on the suite**, not on `run()`. Use `{"runs_per_item": 3, "pass_threshold": 2}` for reliability. +- **Always include `assert results.all_passed`** at the end for CI integration. diff --git a/commands/instrument.md b/commands/instrument.md index d760ba3..3c19263 100644 --- a/commands/instrument.md +++ b/commands/instrument.md @@ -51,7 +51,126 @@ Now read the code to understand how it actually works. **Follow the execution fl - If you can't find where a dependency from the checklist is used, search the entire codebase for its package name as a string 4. **Identify existing tracing** — check if there's already tracing code.
Verify it actually sends to Opik (not a homegrown stub or different tracing system). If it's fake or non-Opik, replace it. -## Step 4: Design the Trace Structure +## Step 4: Extract Configuration into a Dataclass + +After understanding the agent flow, extract hardcoded configuration values into a separate config module. + +### What to Extract + +Look for hardcoded values in the agent code that control behavior: +- **Model name**: `"gpt-4o"`, `"claude-3-sonnet"`, etc. +- **Temperature**: `temperature=0.7` +- **System prompt**: Any string passed as a system message +- **Max tokens**: `max_tokens=1024` +- **Top-p, top-k**: Sampling parameters +- **API base URLs**: If hardcoded +- **Any other tunable parameters** that affect agent behavior + +### How to Extract + +1. **Create a config file** (e.g., `agent_config.py` or in an appropriate module) using `opik.AgentConfig` as the base class: + +```python +from typing import Annotated +import opik + +class AgentConfig(opik.AgentConfig): + model: Annotated[str, "LLM model to use"] + temperature: Annotated[float, "Sampling temperature"] + system_prompt: Annotated[str, "System prompt for the agent"] + max_tokens: Annotated[int, "Maximum tokens in response"] +``` + +**Important rules for `opik.AgentConfig`:** +- Subclass `opik.AgentConfig` — do NOT use a plain `@dataclass` +- Use `Annotated[Type, "description"]` to add field descriptions +- Do NOT set default values on the class — pass values at instantiation +- All fields must have type annotations + +2. **Create a config instance and use it** in the agent code — replace every hardcoded value: + +```python +from agent_config import AgentConfig + +config = AgentConfig( + model="gpt-4o", + temperature=0.7, + system_prompt="You are a helpful assistant.", + max_tokens=1024, +) + +# Before: model="gpt-4o", temperature=0.7 +# After: model=config.model, temperature=config.temperature +``` + +3. 
**Optionally publish the config to Opik** for Blueprint management: + +```python +client = opik.Opik() +version = client.create_agent_config_version(config, project_name="my-agent") +``` + +4. **Do NOT extract**: + - API keys or secrets (those stay in env vars) + - Structural code logic (only runtime parameters) + - Values that are truly constant and never need changing + +### Edge Cases + +- **Multiple agents**: Create separate config classes per agent, or a shared base +- **Framework-specific config** (LangChain, CrewAI): Extract parameters from framework constructors +- **Existing config patterns**: If the project already has a config file, integrate with it rather than creating a new one + +## Step 4.5: Detect Conversational Agents and Wire thread_id + +Check if the agent handles multi-turn conversations. Look for: +- **Message history lists** (`messages: list`, `conversation_history`, `chat_history`) +- **Session/conversation ID parameters** (`session_id`, `conversation_id`, `thread_id`) +- **Chat loop patterns** (while loops processing user messages) +- **Stateful turn handling** (appending to message history between calls) + +### If Conversational Pattern Detected + +Wire `thread_id` to group conversation turns: + +1. **If the agent has a natural session identifier** (session_id, conversation_id parameter): +```python +@opik.track(entrypoint=True, project_name="chat-agent") +def handle_message(session_id: str, message: str) -> str: + opik.update_current_trace(thread_id=session_id) + # ... rest of the function +``` + +2. **If no natural session ID exists**, generate one at the session level: +```python +import uuid + +# Generate once per conversation session +thread_id = str(uuid.uuid4()) + +@opik.track(entrypoint=True, project_name="chat-agent") +def handle_message(message: str) -> str: + opik.update_current_trace(thread_id=thread_id) + # ... rest of the function +``` + +3.
**For class-based agents** with session state: +```python +class ChatAgent: + def __init__(self): + self.thread_id = str(uuid.uuid4()) + + @opik.track(entrypoint=True, project_name="chat-agent") + def handle_message(self, message: str) -> str: + opik.update_current_trace(thread_id=self.thread_id) + # ... +``` + +### If NOT a Conversational Agent + +Skip this step. Single-shot agents (one input → one output, no conversation history) do not need `thread_id`. + +## Step 4.7: Design the Trace Structure Before deciding what integration to use where, **map out what a single trace should look like** for one user request. A single request to the agent should produce exactly ONE trace with nested spans — never multiple disconnected traces. @@ -85,14 +204,25 @@ For each node in the tree, decide: Copy the exact import paths and usage patterns from the reference. Key principles: -1. **Follow the trace tree** — Each node in your trace tree from Step 4 tells you what integration pattern to use. Nodes in the same process as their parent use framework wrappers; nodes across process boundaries use manual `Opik` client tracing with explicit spans. +1. **Follow the trace tree** — Each node in your trace tree from Step 4.7 tells you what integration pattern to use. Nodes in the same process as their parent use framework wrappers; nodes across process boundaries use manual `Opik` client tracing with explicit spans. 2. **Trace key functions** — Add `@opik.track` to functions you want visibility into -3. **Use framework integrations when available** — e.g., `track_openai()` instead of manual `@opik.track` — but only in the main process where a parent trace exists -4. **Don't double-wrap** — If using an integration, don't also add decorators to the same calls -5. **Add flush for scripts** — Short-lived scripts need flushing before exit to ensure traces are sent. Use `opik.flush_tracker()` when using `@opik.track` decorators, or `client.flush()` when using the `Opik()` client directly. 
For TypeScript, use `await client.flush()`. -6. **Use correct span types** — `general`, `llm`, `tool`, `guardrail` (these are the ONLY valid types — do NOT use `retrieval` or any other type) -7. **Instrument ALL languages** — if the project has TypeScript files that make LLM calls, instrument them too -8. **Set a default project name via env var** — Use `OPIK_PROJECT_NAME` env var so traces don't end up in "Default Project". Do NOT hardcode `project_name=` in decorators or client constructors — this prevents users from overriding the project at runtime. Instead, set `os.environ.setdefault("OPIK_PROJECT_NAME", "app-name")` near the entry point, or document that users should set the env var. For TypeScript, use `process.env.OPIK_PROJECT_NAME || "app-name"` when creating the client. +3. **Mark the entrypoint** — Add `entrypoint=True` to the main/outermost function's `@opik.track` decorator. This is the function that receives the user's input and returns the final output. Exactly ONE function should have `entrypoint=True`. Also add a docstring with `Args:` descriptions (required for the Local Runner to discover the function's input schema). Example: + ```python + @opik.track(entrypoint=True, project_name="my-agent") + def run_agent(question: str, context: str = "") -> str: + """Run the agent with a user question. + + Args: + question: The user's question to answer. + context: Optional additional context. + """ + ``` +4. **Use framework integrations when available** — e.g., `track_openai()` instead of manual `@opik.track` — but only in the main process where a parent trace exists +5. **Don't double-wrap** — If using an integration, don't also add decorators to the same calls +6. **Add flush for scripts** — Short-lived scripts need flushing before exit to ensure traces are sent. Use `opik.flush_tracker()` when using `@opik.track` decorators, or `client.flush()` when using the `Opik()` client directly. For TypeScript, use `await client.flush()`. +7. 
**Use correct span types** — `general`, `llm`, `tool`, `guardrail` (these are the ONLY valid types — do NOT use `retrieval` or any other type) +8. **Instrument ALL languages** — if the project has TypeScript files that make LLM calls, instrument them too +9. **Set a default project name via env var** — Use `OPIK_PROJECT_NAME` env var so traces don't end up in "Default Project". Do NOT hardcode `project_name=` in decorators or client constructors — this prevents users from overriding the project at runtime. Instead, set `os.environ.setdefault("OPIK_PROJECT_NAME", "app-name")` near the entry point, or document that users should set the env var. For TypeScript, use `process.env.OPIK_PROJECT_NAME || "app-name"` when creating the client. ## Step 6: Install Dependencies @@ -128,6 +258,14 @@ After instrumenting: - Covered: `openai` — wrapped with `track_openai()` in `providers/oai.py` - Covered: `langchain` — added `OpikTracer` callback in `tools/summarize.py` - NOT covered: `some-framework` — could not find where it's used in the codebase (ask user) -3. **Explain what was added and why** -4. **Show the key changes made** -5. **List any configuration the user still needs to set up** (e.g., `opik configure` if not already configured, environment variables) +3. **Verify Opik 2.0 features**: + - **Config extraction**: Confirm a config dataclass was created and the agent reads from it + - **Entrypoint**: Confirm exactly one function has `entrypoint=True` with a docstring + - **Thread ID** (if conversational): Confirm `thread_id` is wired from the session identifier +4. **Explain what was added and why** +5. **Show the key changes made** +6. **List any configuration the user still needs to set up** (e.g., `opik configure` if not already configured, environment variables) +7. 
**Next steps**: Suggest the user can now: + - Run `opik connect` to pair with Opik UI for remote triggering + - Create an Evaluation Suite with `/opik:create-eval-suite` + - View traces and configuration in the Opik UI diff --git a/skills/agent-config/SKILL.md b/skills/agent-config/SKILL.md new file mode 100644 index 0000000..0fc55d0 --- /dev/null +++ b/skills/agent-config/SKILL.md @@ -0,0 +1,146 @@ +--- +name: agent-config +description: Deep guide on Opik Agent Configuration — Blueprints (immutable config snapshots), environment tags (DEV/PROD), MaskIDs (A/B testing overlays), and the config lifecycle from development to production. +--- + +# Agent Configuration (Blueprints) + +Opik 2.0's Agent Configuration system externalizes your agent's tunable parameters into managed, version-controlled configs. + +## Core Concepts + +| Concept | What It Is | +|---------|-----------| +| **Config Dataclass** | A Python dataclass with your agent's tunable parameters | +| **Blueprint** | An immutable snapshot of a config version | +| **Environment Tag** | A label (DEV, STAGING, PROD) pointing to a specific Blueprint | +| **MaskID** | A temporary override for A/B testing without creating new Blueprints | + +## Creating a Config + +Subclass `opik.AgentConfig` with typed fields. Use `Annotated` for descriptions. 
+ +```python +from typing import Annotated +import opik + +class AgentConfig(opik.AgentConfig): + model: Annotated[str, "LLM model to use"] + temperature: Annotated[float, "Sampling temperature"] + system_prompt: Annotated[str, "System prompt for the agent"] + max_tokens: Annotated[int, "Maximum tokens in response"] + top_p: Annotated[float, "Nucleus sampling parameter"] +``` + +**Rules:** +- Subclass `opik.AgentConfig` (NOT `@dataclass`) +- No default values on the class — pass values at instantiation +- All fields need type annotations + +## Using Config in Your Agent + +```python +config = AgentConfig( + model="gpt-4o", + temperature=0.7, + system_prompt="You are a helpful assistant.", + max_tokens=1024, + top_p=1.0, +) + +@opik.track(entrypoint=True, project_name="my-agent") +def run_agent(question: str) -> str: + response = client.chat.completions.create( + model=config.model, + temperature=config.temperature, + max_tokens=config.max_tokens, + messages=[ + {"role": "system", "content": config.system_prompt}, + {"role": "user", "content": question}, + ], + ) + return response.choices[0].message.content + +# Publish to Opik for Blueprint management +opik_client = opik.Opik() +opik_client.create_agent_config_version(config, project_name="my-agent") +``` + +## Blueprint Lifecycle + +``` +Edit config → New Blueprint created (immutable) + → DEV tag moves to new Blueprint + → Test with Evaluation Suite + → PASS? → Move PROD tag to new Blueprint + → FAIL? 
→ Keep PROD on previous Blueprint +``` + +## Environment Tags + +| Tag | Purpose | +|-----|---------| +| `DEV` | Active development, latest changes | +| `STAGING` | Pre-production testing | +| `PROD` | Production — what end users see | + +## MaskID Overlays + +For A/B testing config variations without permanent changes: + +``` +Base Blueprint (PROD): temperature=0.7 +├── MaskID-001: temperature=0.5 +├── MaskID-002: temperature=0.9 +└── MaskID-003: model="gpt-4o-mini" +``` + +Each MaskID is evaluated against the Evaluation Suite. The winning config gets promoted to a new Blueprint. + +## What to Extract vs Not + +### Extract (put in config) +- Model name, temperature, top_p, max_tokens +- System prompt / persona +- Any tunable parameter that affects agent behavior + +### Don't Extract (keep in code/env) +- API keys and secrets → env vars +- Structural logic → code +- Truly constant values that never change + +## Retrieving Config from Opik Backend + +Inside a `@opik.track` decorated function: + +```python +@opik.track(project_name="my-agent") +def run_agent(question: str): + cfg = client.get_agent_config( + fallback=AgentConfig(model="gpt-4o", temperature=0.7, ...), + project_name="my-agent", + latest=True, # OR version="v1" OR env="prod" + ) + # Access fields — triggers backend resolution + response = llm_call(model=cfg.model, temperature=cfg.temperature) +``` + +## Deploying to Environments + +```python +@opik.track(project_name="my-agent") +def deploy(): + cfg = client.get_agent_config( + fallback=AgentConfig(...), + project_name="my-agent", + version="v2", + ) + cfg.deploy_to("prod") # Tag v2 as production +``` + +## Blueprint in Traces + +Every trace includes `blueprint_id` metadata: +- Filter traces by Blueprint to compare config versions +- Roll back PROD tag if a Blueprint causes regression +- Track which config version produced each trace diff --git a/skills/agent-ops/SKILL.md b/skills/agent-ops/SKILL.md index a88cde6..f58f8bf 100644 --- 
a/skills/agent-ops/SKILL.md +++ b/skills/agent-ops/SKILL.md @@ -1,27 +1,217 @@ --- name: agent-ops -description: This skill should be used when the user asks about agent architecture, evaluation, metrics, production monitoring, debugging agents, or best practices for building reliable AI agents. Use for questions like "evaluate my agent", "set up production monitoring", "add guardrails", "detect hallucinations", "agent anti-patterns", "compare experiments", "create evaluation dataset". +description: This skill should be used when the user asks about agent architecture, evaluation, metrics, production monitoring, debugging agents, best practices for building reliable AI agents, agent configuration, Blueprints, Evaluation Suites, opik connect, Local Runner, thread tracking, or conversation metrics. Use for questions like "evaluate my agent", "set up production monitoring", "add guardrails", "detect hallucinations", "agent anti-patterns", "compare experiments", "create evaluation suite", "configure my agent", "connect my agent", "track conversations", "evaluate threads". --- # Agent Operations: Build, Evaluate, and Monitor AI Agents -This skill covers the agent lifecycle beyond basic tracing: architecture patterns, evaluation, metrics, and production monitoring. All examples use Opik for observability — for SDK details (tracing, integrations, span types), load the `opik` skill. +This skill covers the agent lifecycle beyond basic tracing: architecture patterns, configuration, evaluation, threads, and production monitoring. All examples use Opik for observability — for SDK details (tracing, integrations, span types), load the `opik` skill. -## The Agent Lifecycle +## The Agent Lifecycle (Opik 2.0) -1. **Instrument** — Add Opik tracing to make your agent's behavior visible (see `opik` skill) -2. **Evaluate** — Measure performance with datasets, metrics, and experiments -3. **Monitor** — Track quality, cost, and reliability in production -4. 
**Optimize** — Improve based on data from evaluation and production traces +1. **Instrument** — Add `@opik.track` + `opik.AgentConfig` + `entrypoint=True` (see `opik` skill) +2. **Configure** — Externalize config into a dataclass. Opik manages Blueprints (immutable config snapshots) with environment tags (DEV/PROD) +3. **Connect** — Use `opik connect` to pair the Local Runner so the agent can be triggered from the Opik UI +4. **Evaluate** — Create Evaluation Suites with assertions and execution policies +5. **Monitor** — Track quality, cost, and reliability in production dashboards +6. **Optimize** — Use MaskIDs to test config variations, evaluate with suites, promote winning Blueprints -## Agent Architecture Patterns +## Agent Configuration (Blueprints) -Trace every component of your agent with appropriate span types: +Opik 2.0 introduces **Agent Configuration** — externalized, version-controlled config for agents. + +### Key Concepts + +- **`opik.AgentConfig`** — Base class for config definitions. Subclass it with typed fields. +- **Blueprint** — An immutable snapshot of a config version. Every config edit creates a new Blueprint. +- **Environment Tags** — Labels like `DEV`, `STAGING`, `PROD` that point to specific Blueprints. +- **MaskID** — A temporary override layer for A/B testing config variations. +- **`entrypoint=True`** — Marks the main function so Opik can trigger the agent via the Local Runner. 
+ +### Config Pattern + +```python +from typing import Annotated +import opik + +class AgentConfig(opik.AgentConfig): + model: Annotated[str, "LLM model to use"] + temperature: Annotated[float, "Sampling temperature"] + system_prompt: Annotated[str, "System prompt for the agent"] + max_tokens: Annotated[int, "Maximum tokens in response"] + +config = AgentConfig( + model="gpt-4o", + temperature=0.7, + system_prompt="You are a helpful assistant.", + max_tokens=1024, +) + +@opik.track(entrypoint=True, project_name="my-agent") +def run_agent(question: str) -> str: + """Run the agent with a user question. + + Args: + question: The user's question to answer. + """ + response = client.chat.completions.create( + model=config.model, + messages=[{"role": "system", "content": config.system_prompt}, + {"role": "user", "content": question}], + temperature=config.temperature, + max_tokens=config.max_tokens, + ) + return response.choices[0].message.content + +# Publish config to Opik for Blueprint management +opik_client = opik.Opik() +opik_client.create_agent_config_version(config, project_name="my-agent") +``` + +### Environment Tags Workflow + +1. Developer edits config → new Blueprint created automatically +2. `DEV` tag moves to new Blueprint +3. Test with Evaluation Suite → passes +4. Promote: move `PROD` tag to the new Blueprint +5. Production agent reads `PROD` Blueprint on next invocation + +## Opik Connect (Local Runner) + +The **Local Runner** lets you trigger your agent from the Opik browser UI while it runs on your local machine. + +### Setup Flow + +1. Instrument with `entrypoint=True` (required) +2. Add a docstring with argument descriptions to the entrypoint (required for schema discovery) +3. Run `opik connect` (Cloud) or `opik connect --pair ` (OSS) +4. 
Agent appears in Opik UI → type input → click Run → executes locally + +### What the Runner Enables + +- **UI-triggered execution** — Test your agent from the browser +- **Trace replay** — Click "Re-run" on any trace to replay with same input +- **Config iteration** — Edit config in UI → re-run → compare traces +- **Parallel jobs** — Runner handles concurrent executions + +## Thread Tracking (Multi-Turn Conversations) + +For conversational agents, group related traces into **threads** using `thread_id`. + +### How Threads Work + +- Each conversation turn = one trace +- All traces sharing a `thread_id` form a thread +- Threads tab shows: `first_message`, `last_message`, `number_of_messages`, `duration`, `total_estimated_cost` + +### Instrumenting Conversational Agents + +```python +import opik + +@opik.track(entrypoint=True, project_name="chat-agent") +def handle_message(session_id: str, message: str) -> str: + """Handle a chat message in a conversation session. + + Args: + session_id: The conversation session identifier. + message: The user's message. + """ + opik.update_current_trace(thread_id=session_id) + response = generate_response(session_id, message) + return response +``` + +### Conversation Thread Metrics + +Evaluate entire conversations, not just individual turns: + +```python +from opik.evaluation import evaluate_threads +from opik.evaluation.metrics.conversation import ( + SessionCompletenessMetric, + UserFrustrationMetric, + ConversationalCoherenceMetric, +) + +results = evaluate_threads( + project_name="chat-agent", + metrics=[ + SessionCompletenessMetric(), + UserFrustrationMetric(), + ConversationalCoherenceMetric(), + ], +) +``` + +## Evaluation Suites + +**Evaluation Suites** replace the old "Datasets" approach with a structured testing framework that includes assertions and execution policies. 
+ +### Creating a Suite + +```python +from opik import Opik + +client = Opik() +suite = client.get_or_create_evaluation_suite( + name="customer-support-suite", + assertions=[ + "Response is factually accurate and not hallucinated", + "Response is professional in tone", + ], + execution_policy={"runs_per_item": 3, "pass_threshold": 2}, +) + +suite.add_item( + data={"input": "How do I reset my password?"}, + assertions=["Response mentions the password reset process"], +) + +suite.add_item( + data={"input": "I want to cancel my account"}, + assertions=[ + "Response acknowledges the cancellation request", + "Response is empathetic and offers alternatives", + ], +) +``` + +### Suite-Level vs Item-Level Assertions + +- **Suite-level**: Set via `assertions=` on `get_or_create_evaluation_suite()` or `suite.update(assertions=[...])`. Applied to ALL items. +- **Item-level**: Set via `assertions=` on `suite.add_item()`. Applied only to that item (in addition to suite-level). +- Assertions are **plain strings** describing what the LLM judge should check. + +### Execution Policy + +Set on the suite, not on `run()`: + +```python +# Execution policy is set at suite creation or via update(): +suite.update(execution_policy={"runs_per_item": 3, "pass_threshold": 2}) + +# Run the suite — model param tells which LLM judges assertions +results = suite.run( + task=lambda item: {"output": agent(item["input"])}, + model="gpt-4o", # LLM used to judge assertions +) + +assert results.all_passed # CI gate +``` + +### Suites in the UI + +Evaluation Suites appear under **"Evaluation Suites"** in the sidebar — NOT under "Datasets". 
+ +## Architecture Patterns + +Trace every component with appropriate span types: ```python import opik -@opik.track(name="research_agent") +@opik.track(entrypoint=True, name="research_agent") def agent(query: str) -> str: plan = plan_action(query) # general span results = execute_tool(plan) # tool span @@ -40,32 +230,13 @@ def generate_response(context: str) -> str: | Component | Span Type | Key Data | |-----------|-----------|----------| +| Entry point | `general` | `entrypoint=True`, full input | | Planning | `general` | Reasoning steps, decisions | | Tool calls | `tool` | Tool name, parameters, results | | LLM calls | `llm` | Prompt, response, tokens | | Retrieval | `tool` | Query, documents | | Validation | `guardrail` | Check results, pass/fail | -## Evaluation - -Evaluate agents at multiple levels — end-to-end and per-component: - -```python -from opik.evaluation import evaluate -from opik.evaluation.metrics import AnswerRelevance, Hallucination, AgentTaskCompletion - -results = evaluate( - experiment_name="agent-v2", - dataset=dataset, - task=lambda item: {"output": agent(item["input"])}, - scoring_metrics=[ - AnswerRelevance(), - Hallucination(), - AgentTaskCompletion(), - ] -) -``` - ### Built-in Agent Metrics | Metric | What It Measures | @@ -76,22 +247,25 @@ results = evaluate( | `AnswerRelevance` | Does the answer address the question? | | `Hallucination` | Are there unsupported claims? | -### 41 Total Built-in Metrics - -Heuristic (Equals, Contains, BLEU, ROUGE, BERTScore, IsJson, etc.), LLM-as-Judge (AnswerRelevance, Hallucination, Usefulness, GEval, etc.), RAG (ContextPrecision, ContextRecall, Faithfulness), and conversation metrics. See `references/evaluation.md` for the full list. +41 total built-in metrics: heuristic, LLM-as-Judge, RAG, conversation, and agent-specific. See `references/evaluation.md` for the full list. 
## Production Monitoring -- **Dashboards** — Visualize quality, cost, latency, and error trends +- **Dashboards** — Visualize quality, cost, latency, and error trends (including thread-level metrics) - **Online evaluation** — Automatically score production traces with LLM-as-Judge - **Alerts** — Get notified when metrics deviate (quality drops, cost spikes, error rates) - **Guardrails** — PII detection, topic validation, custom safety checks - **Opik Assist** — AI-powered root cause analysis for failed traces +- **Blueprint tracking** — See which config version each trace used ## Common Anti-Patterns | Category | Anti-Pattern | |----------|-------------| +| Configuration | Hardcoded model/temperature/prompt values instead of `opik.AgentConfig` | +| Entrypoint | Missing `entrypoint=True` — agent can't be triggered via Local Runner | +| Threads | Conversational agent without `thread_id` — turns appear as unrelated traces | +| Evaluation | Using old Datasets API instead of Evaluation Suites | | Reliability | Unbounded loops, retry storms, silent failures | | Security | Prompt injection, privilege escalation, data leakage | | Observability | Late tracing (missing input), orphaned spans | @@ -102,5 +276,5 @@ Heuristic (Equals, Contains, BLEU, ROUGE, BERTScore, IsJson, etc.), LLM-as-Judge | Topic | Reference File | |-------|----------------| | Agent architecture, reliability, security patterns | `references/agent-patterns.md` | -| Evaluation datasets, experiments, all 41 metrics | `references/evaluation.md` | -| Production dashboards, alerts, guardrails, cost tracking | `references/production.md` | +| Evaluation Suites, datasets, experiments, all 41 metrics | `references/evaluation.md` | +| Production dashboards, alerts, guardrails, cost tracking, Blueprints | `references/production.md` | diff --git a/skills/agent-ops/references/agent-patterns.md b/skills/agent-ops/references/agent-patterns.md index e60acdc..1ee901f 100644 --- 
a/skills/agent-ops/references/agent-patterns.md +++ b/skills/agent-ops/references/agent-patterns.md @@ -9,6 +9,89 @@ Building production-grade agents requires: 2. **Evaluation** - Measure performance systematically 3. **Optimization** - Improve based on data +## Agent Configuration Lifecycle + +Before diving into observability, externalize your agent's configuration. + +### Step 1: Extract Config + +Move hardcoded values into an `opik.AgentConfig` subclass: + +```python +from typing import Annotated +import opik + +class AgentConfig(opik.AgentConfig): + model: Annotated[str, "LLM model to use"] + temperature: Annotated[float, "Sampling temperature"] + system_prompt: Annotated[str, "System prompt for the agent"] + max_tokens: Annotated[int, "Maximum tokens in response"] +``` + +### Step 2: Mark the Entrypoint + +Add `entrypoint=True` to the main function and include a docstring: + +```python +config = AgentConfig( + model="gpt-4o", + temperature=0.7, + system_prompt="You are a helpful research assistant.", + max_tokens=1024, +) + +@opik.track(entrypoint=True, project_name="research-agent") +def agent(query: str) -> str: + """Run the research agent. + + Args: + query: The research question to investigate. + """ + return llm_call( + model=config.model, + temperature=config.temperature, + system_prompt=config.system_prompt, + max_tokens=config.max_tokens, + user_message=query, + ) +``` + +### Step 3: Connect for UI Triggering + +```bash +# Cloud (API key configured) +opik connect + +# OSS (pair code from Opik UI) +opik connect --pair ABCDEF +``` + +### Step 4: Iterate via Blueprints + +1. Edit config in Opik UI → new Blueprint created +2. Move `DEV` tag to new Blueprint +3. Test with Evaluation Suite +4. 
If passing, move `PROD` tag + +### Thread Tracking for Conversational Agents + +If your agent handles multi-turn conversations, wire `thread_id`: + +```python +@opik.track(entrypoint=True, project_name="chat-agent") +def handle_message(session_id: str, message: str) -> str: + """Handle a chat message. + + Args: + session_id: Conversation session identifier. + message: The user's message. + """ + opik.update_current_trace(thread_id=session_id) + return generate_response(session_id, message) +``` + +All turns sharing a `session_id` are grouped into one thread in the Threads tab. + ## Start with Observability Before evaluating, make your agent's behavior transparent. diff --git a/skills/agent-ops/references/evaluation.md b/skills/agent-ops/references/evaluation.md index 57818fb..b120aba 100644 --- a/skills/agent-ops/references/evaluation.md +++ b/skills/agent-ops/references/evaluation.md @@ -12,7 +12,46 @@ Manual review of LLM outputs doesn't scale. Opik's evaluation platform automates ## Core Concepts -### Datasets +### Evaluation Suites (Opik 2.0) + +An **Evaluation Suite** is the primary way to test agents in Opik 2.0. It combines test items with assertions and execution policies. 
+ +```python +from opik import Opik + +client = Opik() +suite = client.get_or_create_evaluation_suite( + name="my-agent-suite", + assertions=[ + "Response is factually accurate and not hallucinated", + "Response is professional in tone", + ], + execution_policy={"runs_per_item": 3, "pass_threshold": 2}, +) + +# Add items with item-level assertions (in addition to suite-level) +suite.add_item( + data={"input": "What is the capital of France?"}, + assertions=["Response correctly identifies Paris as the capital"], +) + +# Run the suite +results = suite.run( + task=lambda item: {"output": my_agent(item["input"])}, + model="gpt-4o", +) + +# CI gate +assert results.all_passed +``` + +**Key differences from old Datasets API:** +- Assertions are plain strings checked by an LLM judge +- Execution policies support multi-run reliability testing (`runs_per_item`, `pass_threshold`) +- Item-level assertion overrides for high-stakes items +- Suites appear under "Evaluation Suites" in the UI sidebar (NOT "Datasets") + +### Datasets (Legacy, Still Supported) A **dataset** is a collection of test cases for evaluating your LLM application. @@ -29,6 +68,35 @@ An **experiment** is a single evaluation run that: 3. Scores the output using one or more metrics 4. Logs results for analysis +### Thread Evaluation + +For conversational agents, evaluate entire threads (multi-turn conversations): + +```python +from opik.evaluation import evaluate_threads +from opik.evaluation.metrics.conversation import ( + SessionCompletenessMetric, + UserFrustrationMetric, + ConversationalCoherenceMetric, +) + +results = evaluate_threads( + project_name="chat-agent", + metrics=[ + SessionCompletenessMetric(), + UserFrustrationMetric(), + ConversationalCoherenceMetric(), + ], + trace_input_transform=lambda t: t.input["message"], + trace_output_transform=lambda t: t.output["response"], +) +``` + +Thread-level metrics: +- **SessionCompletenessMetric** — Did the conversation reach resolution? 
+- **UserFrustrationMetric** — Did the user show signs of frustration? +- **ConversationalCoherenceMetric** — Did the agent maintain logical consistency across turns? + ## Creating Datasets ### Via Python SDK diff --git a/skills/agent-ops/references/production.md b/skills/agent-ops/references/production.md index 2ce3c51..1a96a7b 100644 --- a/skills/agent-ops/references/production.md +++ b/skills/agent-ops/references/production.md @@ -70,6 +70,36 @@ print(analysis.explanation) print(analysis.suggestions) ``` +## Blueprints & Configuration Tracking + +In production, trace every config change with Blueprints. + +### What Blueprints Enable + +- **Config Versioning** — Every config edit creates an immutable Blueprint snapshot +- **Environment Tags** — Point `DEV`, `STAGING`, `PROD` to specific Blueprints +- **Trace Correlation** — Every trace includes `blueprint_id` metadata showing which config produced it +- **Rollback** — Move `PROD` tag back to a previous Blueprint if a regression is detected +- **A/B Testing** — Use MaskIDs to overlay temporary config changes without creating new Blueprints + +### Production Config Workflow + +1. Agent reads config from the `PROD`-tagged Blueprint +2. Every trace is stamped with the active `blueprint_id` +3. Dashboards can filter/group by Blueprint to compare config versions +4. If a quality regression is detected, roll back `PROD` tag to the previous Blueprint + +### MaskID Overlays for A/B Testing + +```python +# MaskIDs let you test config variations without permanent changes +# The optimizer creates MaskIDs automatically: +# - mask_001: temperature=0.5 +# - mask_002: temperature=0.9 +# Each candidate is evaluated against the Evaluation Suite +# The winning config gets promoted to a new Blueprint +``` + ## Dashboards Create custom views to monitor your LLM applications. 
diff --git a/skills/evaluation-suites/SKILL.md b/skills/evaluation-suites/SKILL.md new file mode 100644 index 0000000..e5e9274 --- /dev/null +++ b/skills/evaluation-suites/SKILL.md @@ -0,0 +1,121 @@ +--- +name: evaluation-suites +description: Guide for creating, structuring, and running Opik Evaluation Suites with assertions, execution policies, and CI integration. Covers suite-level and item-level assertions, multi-run reliability testing, and the difference from the old Datasets API. +--- + +# Evaluation Suites + +Evaluation Suites are Opik 2.0's structured testing framework for agents. They combine test items with assertions and execution policies. + +## Creating a Suite + +```python +from opik import Opik + +client = Opik() +suite = client.get_or_create_evaluation_suite( + name="my-agent-suite", + assertions=[ + "Response is factually accurate and not hallucinated", + "Response is professional in tone", + ], + execution_policy={"runs_per_item": 3, "pass_threshold": 2}, +) +``` + +## Adding Test Items + +### Basic Items + +```python +suite.add_item( + data={"input": "What is machine learning?"}, +) +``` + +### Items with Assertions + +```python +suite.add_item( + data={"input": "What is the capital of France?"}, + assertions=[ + "Response correctly states that Paris is the capital of France", + ], +) +``` + +### High-Stakes Items (Override Suite Defaults) + +```python +suite.add_item( + data={"input": "Should I take this medication?"}, + assertions=[ + "Response advises consulting a doctor or healthcare professional", + "Response is empathetic and does not give direct medical advice", + ], +) +``` + +## Assertions + +Assertions are **plain strings** describing what an LLM judge should check. They apply at two levels: + +- **Suite-level**: Set on `get_or_create_evaluation_suite(assertions=[...])` or `suite.update(assertions=[...])`. Applied to ALL items. +- **Item-level**: Set on `suite.add_item(assertions=[...])`. Applied in addition to suite-level. 
+ +### Writing Good Assertions + +| Good | Bad | +|------|-----| +| "Response is factually accurate" | `{"type": "no_hallucination"}` | +| "Response mentions the password reset steps" | `{"type": "contains", "value": "password"}` | +| "Response is professional and empathetic" | `{"type": "tone", "value": "professional"}` | + +Be specific and descriptive — the LLM judge interprets the string. + +## Execution Policy + +Set on the suite (not on `run()`): + +```python +suite = client.get_or_create_evaluation_suite( + name="my-suite", + execution_policy={"runs_per_item": 3, "pass_threshold": 2}, +) +``` + +Or update later: +```python +suite.update(execution_policy={"runs_per_item": 3, "pass_threshold": 2}) +``` + +## Running the Suite + +```python +results = suite.run( + task=lambda item: {"output": agent(item["input"])}, + model="gpt-4o", # LLM used to judge assertions +) +``` + +## CI Integration + +```python +assert results.all_passed, f"Evaluation suite failed" +``` + +## Designing Good Test Items + +| Category | Count | Purpose | +|----------|-------|---------| +| Happy path | 2-3 | Typical usage scenarios | +| Edge cases | 1-2 | Minimal input, max length, special chars | +| Adversarial | 1-2 | Prompt injection, off-topic requests | +| High-stakes | 1-2 | Items where failure has real consequences | + +## Important + +- **Use `get_or_create_evaluation_suite()`** — NOT the old `get_or_create_dataset()` +- **Suites appear under "Evaluation Suites"** in the sidebar — NOT "Datasets" +- **Suite-level assertions** set the baseline; **item-level** override for specific needs +- **`runs_per_item > 1`** catches flaky LLM behavior diff --git a/skills/instrument-python/SKILL.md b/skills/instrument-python/SKILL.md new file mode 100644 index 0000000..c637156 --- /dev/null +++ b/skills/instrument-python/SKILL.md @@ -0,0 +1,136 @@ +--- +name: instrument-python +description: Step-by-step guide for adding Opik observability to Python LLM applications. 
Covers @opik.track decorators, opik.AgentConfig for configuration externalization, entrypoint=True for Local Runner, and thread_id for conversational agents. +--- + +# Instrument Python Agents with Opik + +Step-by-step guide to making your Python agent observable with Opik 2.0. + +## Quick Start (3 Steps) + +### 1. Add @opik.track to your functions + +```python +import opik + +@opik.track(entrypoint=True, name="my-agent") +def agent(query: str) -> str: + """Run the agent. + + Args: + query: The user's question. + """ + context = retrieve(query) + return generate(query, context) + +@opik.track(type="tool") +def retrieve(query: str) -> list: + return search_db(query) + +@opik.track(type="llm") +def generate(query: str, context: list) -> str: + return llm_call(query, context) +``` + +### 2. Extract config into an AgentConfig + +```python +from typing import Annotated +import opik + +class AgentConfig(opik.AgentConfig): + model: Annotated[str, "LLM model to use"] + temperature: Annotated[float, "Sampling temperature"] + system_prompt: Annotated[str, "System prompt"] + max_tokens: Annotated[int, "Maximum tokens"] + +config = AgentConfig( + model="gpt-4o", + temperature=0.7, + system_prompt="You are a helpful assistant.", + max_tokens=1024, +) +``` + +### 3. 
Set up the project + +```bash +pip install opik +opik configure # Set API key and URL +export OPIK_PROJECT_NAME="my-agent" +``` + +## Span Types + +| Type | Use | Example | +|------|-----|---------| +| `general` | Orchestration, entry points | Agent main function | +| `llm` | LLM API calls | OpenAI completion | +| `tool` | Tool execution, retrieval | Web search, DB query | +| `guardrail` | Safety checks | PII detection | + +## Framework Integrations + +### OpenAI +```python +from opik.integrations.openai import track_openai +client = track_openai(OpenAI()) +``` + +### LangChain / LangGraph +```python +from opik.integrations.langchain import OpikTracer +tracer = OpikTracer() +result = chain.invoke(input, config={"callbacks": [tracer]}) +``` + +### Anthropic +```python +from opik.integrations.anthropic import track_anthropic +client = track_anthropic(anthropic.Anthropic()) +``` + +### CrewAI +```python +from opik.integrations.crewai import track_crewai +track_crewai(project_name="my-project", crew=crew) +``` + +## Entrypoint Functions + +Mark exactly ONE function with `entrypoint=True`: + +```python +@opik.track(entrypoint=True, project_name="my-agent") +def run_agent(question: str) -> str: + """Run the agent with a user question. + + Args: + question: The user's question to answer. 
+ """ +``` + +This enables: +- Triggering the agent from the Opik UI via `opik connect` +- Schema discovery for the UI input form +- Trace replay from the UI + +## Thread ID for Conversations + +For multi-turn agents: + +```python +@opik.track(entrypoint=True) +def handle_message(session_id: str, message: str) -> str: + opik.update_current_trace(thread_id=session_id) + return generate_response(session_id, message) +``` + +## Common Pitfalls + +- **Missing flush**: Scripts need `opik.flush_tracker()` before exit +- **Double-wrapping**: Don't use both `track_openai()` and `@opik.track` on the same call +- **Hardcoded config**: Extract model/temperature/prompt into an `opik.AgentConfig` subclass +- **Missing docstring**: The entrypoint function needs a docstring with `Args:` for schema discovery +- **Multiple top-level traces**: Use distributed tracing headers across service boundaries diff --git a/skills/instrument-typescript/SKILL.md b/skills/instrument-typescript/SKILL.md new file mode 100644 index 0000000..03f84b1 --- /dev/null +++ b/skills/instrument-typescript/SKILL.md @@ -0,0 +1,74 @@ +--- +name: instrument-typescript +description: Step-by-step guide for adding Opik observability to TypeScript/JavaScript LLM applications. Covers the Opik client, framework integrations, and tracing patterns. +--- + +# Instrument TypeScript Agents with Opik + +Guide to making your TypeScript/JavaScript agent observable with Opik. 
+
+## Quick Start
+
+```typescript
+import { Opik } from "opik";
+
+const client = new Opik({
+  projectName: process.env.OPIK_PROJECT_NAME || "my-agent",
+});
+
+async function agent(query: string): Promise<string> {
+  const trace = client.trace({
+    name: "my-agent",
+    input: { query },
+  });
+
+  const searchSpan = trace.span({ name: "search", type: "tool" });
+  const context = await searchDB(query);
+  searchSpan.end({ output: { context } });
+
+  const llmSpan = trace.span({ name: "generate", type: "llm" });
+  const response = await llmCall(query, context);
+  llmSpan.end({ output: { response } });
+
+  trace.end({ output: { response } });
+  await client.flush();
+  return response;
+}
+```
+
+## Framework Integrations
+
+### OpenAI
+```typescript
+import { Opik } from "opik";
+import { trackOpenAI } from "opik/openai";
+import OpenAI from "openai";
+
+const client = new Opik();
+const openai = trackOpenAI(new OpenAI(), { client });
+```
+
+### Vercel AI SDK
+```typescript
+import { Opik } from "opik";
+import { OpikTracer } from "opik/vercel";
+
+const client = new Opik();
+const tracer = new OpikTracer({ client });
+```
+
+## Thread ID for Conversations
+
+```typescript
+const trace = client.trace({
+  name: "chat-turn",
+  threadId: sessionId, // Groups turns into a thread
+  input: { message },
+});
+```
+
+## Common Pitfalls
+
+- **Missing flush**: Always `await client.flush()` before process exit
+- **Project name**: Set via `OPIK_PROJECT_NAME` env var, not hardcoded
+- **Span types**: Only `general`, `llm`, `tool`, `guardrail` are valid
diff --git a/skills/opik-connect/SKILL.md b/skills/opik-connect/SKILL.md
new file mode 100644
index 0000000..7b0fd01
--- /dev/null
+++ b/skills/opik-connect/SKILL.md
@@ -0,0 +1,66 @@
+---
+name: opik-connect
+description: Guide for setting up Opik Connect (Local Runner) to pair your local agent with the Opik browser UI. Covers Cloud and OSS pairing, troubleshooting, and networking.
+--- + +# Opik Connect (Local Runner) + +Opik Connect lets you trigger your agent from the Opik browser UI while it runs on your local machine. + +## Prerequisites + +1. **opik CLI installed**: `pip install opik` +2. **Agent instrumented** with `entrypoint=True` on the main function +3. **Docstring with Args** on the entrypoint (for UI input form) + +## Setup + +### Cloud (API Key) + +```bash +opik configure # Set API key if not done +opik connect +``` + +### OSS (Self-Hosted) + +```bash +# 1. Open Opik UI in browser +# 2. Click "Connect Agent" to get a pair code +# 3. Run: +opik connect --pair ABCDEF +``` + +## What Happens + +1. Runner registers the entrypoint function with Opik +2. Opik UI shows the agent with an input form (derived from the docstring) +3. User types input → clicks Run → agent executes locally +4. Full trace appears in Opik with spans, token usage, cost + +## Features + +| Feature | Description | +|---------|-------------| +| UI triggering | Type input in browser, execute locally | +| Trace replay | Click "Re-run" on any trace | +| Config iteration | Edit config in UI → re-run → compare | +| Parallel jobs | Runner handles concurrent executions | + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| "No entrypoint found" | Add `entrypoint=True` to `@opik.track` on main function | +| "Connection refused" | Check Opik server is running (OSS) or API key is valid (Cloud) | +| "Invalid pair code" | Codes expire — get a new one from the UI | +| "Port in use" | Another runner may be active | +| Runner disconnects | Check network; runner auto-reconnects | +| Agent not showing in UI | Verify the entrypoint docstring has `Args:` | + +## Networking (OSS) + +- Runner connects outbound to the Opik server (no inbound ports needed) +- Default: connects to `http://localhost:5173` +- Override: set `OPIK_BASE_URL` env var +- Heartbeat keeps connection alive during idle periods diff --git a/skills/opik/SKILL.md b/skills/opik/SKILL.md index 
c5a0b4f..c385321 100644 --- a/skills/opik/SKILL.md +++ b/skills/opik/SKILL.md @@ -1,11 +1,11 @@ --- name: opik -description: This skill should be used when the user needs to add Opik tracing or integrations to their code, instrument an LLM application, or needs reference for Opik SDK usage (Python, TypeScript, REST API). Use for tasks like "add tracing", "instrument my code", "use track_openai", "add OpikTracer", "what span types are available", "how to flush traces". +description: This skill should be used when the user needs to add Opik tracing or integrations to their code, instrument an LLM application, or needs reference for Opik SDK usage (Python, TypeScript, REST API). Use for tasks like "add tracing", "instrument my code", "use track_openai", "add OpikTracer", "what span types are available", "how to flush traces", "add entrypoint", "extract config", "add thread_id". --- # Opik SDK Reference -Opik is an open-source LLM observability platform. This skill covers the SDK: tracing, integrations, span types, and how to instrument code. +Opik is an open-source LLM observability platform. This skill covers the SDK: tracing, integrations, span types, configuration, threads, and how to instrument code. ## Core Concepts @@ -24,13 +24,38 @@ A **trace** is a complete execution path (one user request → one response). ** **These are the ONLY valid span types.** Do NOT use `retrieval` or any other type. 
+### Key Opik 2.0 Parameters + +| Parameter | Purpose | +|-----------|---------| +| `entrypoint=True` | Marks the main function — enables Local Runner triggering from UI | +| `thread_id` | Groups multi-turn traces into a conversation thread | +| Config dataclass | Externalizes model/temperature/prompt into a managed config (Blueprints) | + ## Python Quick Start ```python import opik +from typing import Annotated + +class AgentConfig(opik.AgentConfig): + model: Annotated[str, "LLM model to use"] + temperature: Annotated[float, "Sampling temperature"] + system_prompt: Annotated[str, "System prompt"] -@opik.track(name="my_agent", type="general") +config = AgentConfig( + model="gpt-4o", + temperature=0.7, + system_prompt="You are a helpful assistant.", +) + +@opik.track(entrypoint=True, name="my_agent") def agent(query: str) -> str: + """Run the agent. + + Args: + query: User question to answer. + """ context = retrieve(query) return generate(query, context) @@ -40,7 +65,7 @@ def retrieve(query: str) -> list: @opik.track(type="llm") def generate(query: str, context: list) -> str: - return llm_call(query, context) + return llm_call(query, context, model=config.model, temperature=config.temperature) # Nested calls automatically create child spans result = agent("What is ML?") @@ -109,7 +134,7 @@ track_adk_agent_recursive(agent, opik_tracer) | Topic | Reference File | |-------|----------------| -| Python SDK (decorators, context, async, distributed tracing) | `references/tracing-python.md` | +| Python SDK (decorators, context, async, distributed, config, entrypoint, threads) | `references/tracing-python.md` | | TypeScript SDK (client, decorators, framework integrations) | `references/tracing-typescript.md` | | REST API (HTTP endpoints, authentication) | `references/tracing-rest-api.md` | | All integrations with code snippets | `references/integrations.md` | diff --git a/skills/opik/references/tracing-python.md b/skills/opik/references/tracing-python.md index 
a96386e..49015f4 100644 --- a/skills/opik/references/tracing-python.md +++ b/skills/opik/references/tracing-python.md @@ -40,12 +40,87 @@ import opik project_name="my-project", # Override project tags=["production", "v2"], # Add tags metadata={"version": "1.0"}, # Add metadata - flush=True # Flush immediately (for scripts) + flush=True, # Flush immediately (for scripts) + entrypoint=True, # Mark as agent entry point (enables Local Runner) ) def my_function(): pass ``` +### Entrypoint Functions + +Mark the main agent function with `entrypoint=True` to enable: +- **Local Runner triggering** — agent can be started from the Opik UI via `opik connect` +- **Schema discovery** — Opik reads the function's docstring to build an input form in the UI + +```python +@opik.track(entrypoint=True, project_name="my-agent") +def run_agent(question: str, context: str = "") -> str: + """Run the agent with a user question. + + Args: + question: The user's question to answer. + context: Optional additional context. 
+ """ + return generate_response(question, context) +``` + +### Agent Configuration Pattern + +Externalize hardcoded config values into an `opik.AgentConfig` subclass: + +```python +from typing import Annotated +import opik + +class AgentConfig(opik.AgentConfig): + model: Annotated[str, "LLM model to use"] + temperature: Annotated[float, "Sampling temperature"] + system_prompt: Annotated[str, "System prompt"] + max_tokens: Annotated[int, "Maximum tokens"] + +config = AgentConfig( + model="gpt-4o", + temperature=0.7, + system_prompt="You are a helpful assistant.", + max_tokens=1024, +) + +@opik.track(entrypoint=True, project_name="my-agent") +def run_agent(question: str) -> str: + response = client.chat.completions.create( + model=config.model, + temperature=config.temperature, + messages=[ + {"role": "system", "content": config.system_prompt}, + {"role": "user", "content": question}, + ], + max_tokens=config.max_tokens, + ) + return response.choices[0].message.content + +# Publish config to Opik +opik_client = opik.Opik() +opik_client.create_agent_config_version(config, project_name="my-agent") +``` + +### Thread ID for Conversational Agents + +Group multi-turn conversations into threads: + +```python +@opik.track(entrypoint=True, project_name="chat-agent") +def handle_message(session_id: str, message: str) -> str: + """Handle a chat message. + + Args: + session_id: Conversation session identifier. + message: The user's message. 
+ """ + opik.update_current_trace(thread_id=session_id) + return generate_response(session_id, message) +``` + ### Nested Functions Decorated functions automatically create nested spans: diff --git a/skills/threads-conversations/SKILL.md b/skills/threads-conversations/SKILL.md new file mode 100644 index 0000000..f75833f --- /dev/null +++ b/skills/threads-conversations/SKILL.md @@ -0,0 +1,141 @@ +--- +name: threads-conversations +description: Guide for using thread_id to group multi-turn conversations, conversation metrics (session completeness, user frustration, coherence), thread evaluation, and thread lifecycle management in Opik. +--- + +# Threads & Conversation Tracking + +Threads group related traces into coherent conversations for multi-turn agents. + +## How Threads Work + +- Each conversation turn = one trace +- All traces sharing a `thread_id` form a **thread** +- Threads tab shows aggregated view: first/last message, turn count, duration, total cost + +## Adding thread_id + +### From a session/conversation ID + +```python +import opik + +@opik.track(entrypoint=True, project_name="chat-agent") +def handle_message(session_id: str, message: str) -> str: + """Handle a chat message. + + Args: + session_id: Conversation session identifier. + message: The user's message. 
+ """ + opik.update_current_trace(thread_id=session_id) + return generate_response(session_id, message) +``` + +### Generating a thread ID + +```python +import uuid + +class ChatAgent: + def __init__(self): + self.thread_id = str(uuid.uuid4()) + + @opik.track(entrypoint=True, project_name="chat-agent") + def handle_message(self, message: str) -> str: + opik.update_current_trace(thread_id=self.thread_id) + return self.generate(message) +``` + +### Using opik_args + +```python +result = handle_message( + "Hello", + opik_args={"trace": {"thread_id": "session-123"}} +) +``` + +## Thread Metadata (Automatic) + +| Field | Description | +|-------|-------------| +| `first_message` | Input from the first trace in the thread | +| `last_message` | Output from the last trace in the thread | +| `number_of_messages` | Count of traces in the thread | +| `duration` | Time from first to last trace | +| `total_estimated_cost` | Aggregated cost across all traces | +| `status` | `active` or `inactive` (auto-close after timeout) | + +## Evaluating Conversations + +### Thread-Level Metrics + +```python +from opik.evaluation import evaluate_threads +from opik.evaluation.metrics.conversation import ( + SessionCompletenessMetric, + UserFrustrationMetric, + ConversationalCoherenceMetric, +) + +results = evaluate_threads( + project_name="chat-agent", + metrics=[ + SessionCompletenessMetric(), + UserFrustrationMetric(), + ConversationalCoherenceMetric(), + ], + trace_input_transform=lambda t: t.input["message"], + trace_output_transform=lambda t: t.output["response"], +) +``` + +### Available Conversation Metrics + +| Metric | What It Measures | +|--------|-----------------| +| `SessionCompletenessMetric` | Did the conversation reach resolution? | +| `UserFrustrationMetric` | Did the user show frustration signals? | +| `ConversationalCoherenceMetric` | Did the agent maintain logical consistency? 
| + +### Thread Feedback Scores + +```python +client.log_thread_feedback_scores( + thread_id="session-123", + scores=[ + {"name": "resolution_quality", "value": 0.85}, + {"name": "customer_satisfaction", "value": 0.9}, + ] +) +``` + +## Thread ID Strategies + +| Strategy | Example | Best For | +|----------|---------|----------| +| Session-based | `f"session-{user_id}-{session_id}"` | Web apps with session tracking | +| UUID | `str(uuid.uuid4())` | Stateless APIs | +| Ticket-based | `f"ticket-{ticket_id}"` | Support systems | +| Timestamp | `f"{user_id}-{datetime.now().isoformat()}"` | Time-ordered analysis | + +## When to Use Threads + +- **Chat agents** with multi-turn conversations +- **Support bots** handling customer sessions +- **Multi-step assistants** (research, planning) +- **Claude Code sessions** (automatic via plugin) + +## When NOT to Use Threads + +- **Single-shot agents** (one input → one output) +- **Batch processing** (no conversation context) +- **Independent requests** (no session relationship) + +## Common Pitfalls + +- **Forgetting thread_id**: All turns appear as unrelated traces +- **Shared thread_id across users**: Different users' conversations get mixed +- **Thread_id per turn instead of per session**: Each turn becomes its own thread +- **Not evaluating threads**: Missing conversation-level quality issues diff --git a/src/api.go b/src/api.go index 3b25e78..b67564a 100644 --- a/src/api.go +++ b/src/api.go @@ -66,16 +66,17 @@ func (a *API) request(method, endpoint string, data interface{}) error { } type Trace struct { - ID string `json:"id"` - Name string `json:"name"` - StartTime string `json:"start_time"` - EndTime string `json:"end_time,omitempty"` - ProjectName string `json:"project_name"` - ThreadID string `json:"thread_id,omitempty"` - Tags []string `json:"tags,omitempty"` - Input map[string]string `json:"input,omitempty"` - Output map[string]string `json:"output,omitempty"` - Model string `json:"model,omitempty"` + ID string 
`json:"id"` + Name string `json:"name"` + StartTime string `json:"start_time"` + EndTime string `json:"end_time,omitempty"` + ProjectName string `json:"project_name"` + ThreadID string `json:"thread_id,omitempty"` + Tags []string `json:"tags,omitempty"` + Input map[string]string `json:"input,omitempty"` + Output map[string]string `json:"output,omitempty"` + Model string `json:"model,omitempty"` + Metadata map[string]interface{} `json:"metadata,omitempty"` } type Span struct { diff --git a/src/config.go b/src/config.go index b401e9e..d1f0dba 100644 --- a/src/config.go +++ b/src/config.go @@ -17,6 +17,7 @@ type Config struct { Enabled bool ParentTraceID string RootSpanID string + BlueprintID string } const truncateMsg = "[ TRUNCATED -- set OPIK_CC_TRUNCATE_FIELDS=false ]" @@ -51,6 +52,8 @@ func LoadConfig() (*Config, error) { cfg.Project = proj } + cfg.BlueprintID = os.Getenv("OPIK_BLUEPRINT_ID") + return cfg, nil } diff --git a/src/main.go b/src/main.go index 091a1f4..708b62c 100644 --- a/src/main.go +++ b/src/main.go @@ -93,6 +93,12 @@ func onPrompt() { } ts := isoNow() + existingState, _ := LoadState(input.SessionID) + turnCount := 1 + if existingState != nil { + turnCount = existingState.TurnCount + 1 + } + state := &State{ TraceID: traceID, StartTime: ts, @@ -100,12 +106,13 @@ func onPrompt() { Transcript: input.TranscriptPath, StartLine: startLine, LastFlush: time.Now().Unix(), + TurnCount: turnCount, } if err := SaveState(state); err != nil { debugLog("save state: %v", err) } - debugLog("trace=%s start=%d parent=%s", traceID, startLine, config.ParentTraceID) + debugLog("trace=%s start=%d parent=%s turn=%d", traceID, startLine, config.ParentTraceID, turnCount) if config.ParentTraceID == "" { trace := Trace{ @@ -117,6 +124,11 @@ func onPrompt() { Tags: []string{"claude-code"}, Input: map[string]string{"text": input.Prompt}, } + if config.BlueprintID != "" { + trace.Metadata = map[string]interface{}{ + "blueprint_id": config.BlueprintID, + } + } if err := 
api.Post("/traces", trace); err != nil {
 		debugLog("create trace: %v", err)
 	}
@@ -250,6 +262,11 @@ func onCompact() {
 			ThreadID: input.SessionID,
 			Tags:     []string{"claude-code", "compaction"},
 		}
+		if config.BlueprintID != "" {
+			trace.Metadata = map[string]interface{}{
+				"blueprint_id": config.BlueprintID,
+			}
+		}
 		if err := api.Post("/traces", trace); err != nil {
 			debugLog("compact: create trace: %v", err)
 		}
@@ -606,6 +623,13 @@ func processTranscriptEntries(traceID string, entries []TranscriptEntry, parentS
 		if p.Model != "" {
 			span.Model = p.Model
 		}
+
+		if cost := estimateCost(p.Model, p.Usage.InputTokens, p.Usage.OutputTokens); cost > 0 {
+			if span.Metadata == nil {
+				span.Metadata = make(map[string]interface{})
+			}
+			span.Metadata["estimated_cost"] = cost
+		}
 	}
 
 	spans = append(spans, span)
@@ -656,6 +680,7 @@ func processToolUse(span *Span, p ParsedEntry, toolResults map[string]*ToolResul
 		subType = st
 	}
 	span.Name = subType + " Subagent"
+	span.Type = "general" // Subagents are orchestration, not tools
 
 	prompt := ""
 	if pr, ok := span.Input["prompt"].(string); ok {
@@ -695,6 +720,43 @@ func compactInput(customInstructions string) string {
 	return "/compact"
 }
 
+// Per-million-token pricing (input, output) for common models
+var modelCosts = map[string][2]float64{
+	"claude-sonnet-4-20250514":   {3.0, 15.0},
+	"claude-opus-4-20250514":     {15.0, 75.0},
+	"claude-haiku-4-20250506":    {0.80, 4.0},
+	"claude-3-5-sonnet-20241022": {3.0, 15.0},
+	"claude-3-5-haiku-20241022":  {0.80, 4.0},
+	"claude-3-opus-20240229":     {15.0, 75.0},
+	"claude-3-sonnet-20240229":   {3.0, 15.0},
+	"claude-3-haiku-20240307":    {0.25, 1.25},
+}
+
+func estimateCost(model string, inputTokens, outputTokens int) float64 {
+	costs, ok := modelCosts[model]
+	if !ok {
+		// Fall back: match dated/versioned variants whose name extends a known key
+		for prefix, c := range modelCosts {
+			if strings.HasPrefix(model, prefix) {
+				costs = c
+				ok = true
+				break
+			}
+		}
+		if !ok {
+			return 0
+		}
+	}
+	return
(float64(inputTokens)*costs[0] + float64(outputTokens)*costs[1]) / 1_000_000 +} + +func min(a, b int) int { + if a < b { + return a + } + return b +} + func categorizeError(errMsg string) *SpanError { errType := "tool_error" diff --git a/src/state.go b/src/state.go index f78ad15..f94160a 100644 --- a/src/state.go +++ b/src/state.go @@ -15,6 +15,7 @@ type State struct { StartLine int `json:"start_line"` LastFlush int64 `json:"last_flush"` SlugSent bool `json:"slug_sent,omitempty"` + TurnCount int `json:"turn_count,omitempty"` } type AgentMap map[string]string