diff --git a/backend/apps/monitoring_app.py b/backend/apps/monitoring_app.py index 310365293..d2e12243d 100644 --- a/backend/apps/monitoring_app.py +++ b/backend/apps/monitoring_app.py @@ -7,11 +7,19 @@ import logging from http import HTTPStatus -from typing import Annotated, Optional +from typing import Annotated, Any from fastapi import APIRouter, Header, HTTPException, Query from sqlalchemy import text +from consts.const import ( + ENABLE_TELEMETRY, + GRAFANA_PORT, + LANGFUSE_PORT, + MONITORING_PROVIDER, + PHOENIX_PORT, + SKYWALKING_UI_PORT, +) from consts.model import ConversationResponse from database.client import get_monitoring_db_session from utils.auth_utils import get_current_user_id @@ -21,6 +29,44 @@ router = APIRouter(prefix="/monitoring") +def _normalize_monitoring_provider(value: str | None) -> str: + return str(value or "otlp").strip().lower() + + +def _build_monitoring_ui( + provider: str, +) -> tuple[str | None, str | None, str | None]: + """Map MONITORING_PROVIDER to a monitoring UI port and path.""" + if provider == "grafana": + path = "/d/nexent-llm-agent/nexent-agent-trace-monitoring?orgId=1" + return GRAFANA_PORT, path, "Grafana" + if provider == "phoenix": + return PHOENIX_PORT, "/", "Phoenix" + if provider == "langfuse": + return LANGFUSE_PORT, "/project/nexent", "Langfuse" + if provider == "langsmith": + return None, None, "LangSmith" + if provider == "skywalking": + return SKYWALKING_UI_PORT, "/", "SkyWalking" + return None, None, None + + +def get_monitoring_status() -> dict[str, Any]: + """Return telemetry state and the monitoring UI entrypoint for frontend use.""" + telemetry_enabled = ENABLE_TELEMETRY + provider = _normalize_monitoring_provider(MONITORING_PROVIDER) + dashboard_port, dashboard_path, provider_name = _build_monitoring_ui(provider) + + return { + "telemetry_enabled": telemetry_enabled, + "provider": provider, + "provider_name": provider_name, + "ui_enabled": telemetry_enabled and bool(dashboard_port), + "dashboard_port": 
dashboard_port, + "dashboard_path": dashboard_path, + } + + def _compute_time_range_filter(time_range: str) -> str: """Convert time_range parameter to SQL timestamp condition.""" hours = {"24h": 24, "7d": 168, "30d": 720}.get(time_range, 24) @@ -28,12 +74,12 @@ def _compute_time_range_filter(time_range: str) -> str: def _query_model_metrics_from_db( - time_range: str, tenant_id: Optional[str] = None -) -> list[dict]: + time_range: str, tenant_id: str | None = None +) -> list[dict[str, Any]]: time_filter = _compute_time_range_filter(time_range) tenant_filter = "" - params = {} + params: dict[str, str] = {} if tenant_id: tenant_filter = "AND m.tenant_id = :tenant_id" params["tenant_id"] = tenant_id @@ -96,7 +142,7 @@ async def list_models_endpoint( page: Annotated[int, Query(ge=1, description="Page number")] = 1, page_size: Annotated[int, Query( ge=1, le=100, description="Items per page")] = 20, - authorization: Annotated[Optional[str], Header()] = None, + authorization: Annotated[str | None, Header()] = None, ): """List all models with aggregated monitoring metrics from database.""" try: @@ -113,3 +159,13 @@ async def list_models_endpoint( logger.error(f"Failed to list monitoring models: {str(e)}") raise HTTPException( status_code=HTTPStatus.INTERNAL_SERVER_ERROR, detail=str(e)) + + +@router.get("/status", response_model=ConversationResponse) +async def get_monitoring_status_endpoint(): + """Return whether monitoring UI should be shown in the frontend.""" + return ConversationResponse( + code=0, + message="success", + data=get_monitoring_status(), + ) diff --git a/backend/consts/const.py b/backend/consts/const.py index db1e69184..11246b74f 100644 --- a/backend/consts/const.py +++ b/backend/consts/const.py @@ -316,19 +316,68 @@ class VectorDatabaseType(str, Enum): THINK_END_PATTERN = "" -# Telemetry and Monitoring Configuration -ENABLE_TELEMETRY = os.getenv("ENABLE_TELEMETRY", "false").lower() == "true" -SERVICE_NAME = os.getenv("SERVICE_NAME", "nexent-backend") 
-JAEGER_ENDPOINT = os.getenv( - "JAEGER_ENDPOINT", "http://localhost:14268/api/traces") -PROMETHEUS_PORT = int(os.getenv("PROMETHEUS_PORT", "8000")) -TELEMETRY_SAMPLE_RATE = float(os.getenv("TELEMETRY_SAMPLE_RATE", "1.0")) - -# Performance monitoring thresholds -LLM_SLOW_REQUEST_THRESHOLD_SECONDS = float( - os.getenv("LLM_SLOW_REQUEST_THRESHOLD_SECONDS", "5.0")) -LLM_SLOW_TOKEN_RATE_THRESHOLD = float( - os.getenv("LLM_SLOW_TOKEN_RATE_THRESHOLD", "10.0")) # tokens per second +# Telemetry and Monitoring Configuration (OTLP Protocol) +MONITORING_PROVIDER = os.getenv("MONITORING_PROVIDER", "") +ENABLE_TELEMETRY_RAW = os.getenv("ENABLE_TELEMETRY") +ENABLE_TELEMETRY = (ENABLE_TELEMETRY_RAW or "false").lower() == "true" +OTEL_SERVICE_NAME_RAW = os.getenv("OTEL_SERVICE_NAME") +OTEL_SERVICE_NAME = OTEL_SERVICE_NAME_RAW or "nexent-backend" +OTEL_EXPORTER_OTLP_ENDPOINT_RAW = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT") +OTEL_EXPORTER_OTLP_ENDPOINT = OTEL_EXPORTER_OTLP_ENDPOINT_RAW or "http://localhost:4318" +OTEL_EXPORTER_OTLP_TRACES_ENDPOINT = os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", "") +OTEL_EXPORTER_OTLP_METRICS_ENDPOINT = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", "") +OTEL_EXPORTER_OTLP_PROTOCOL_RAW = os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL") +OTEL_EXPORTER_OTLP_PROTOCOL = OTEL_EXPORTER_OTLP_PROTOCOL_RAW or "http" +OTEL_EXPORTER_OTLP_HEADERS_RAW = os.getenv("OTEL_EXPORTER_OTLP_HEADERS") +OTEL_EXPORTER_OTLP_HEADERS = OTEL_EXPORTER_OTLP_HEADERS_RAW or "" +OTEL_EXPORTER_OTLP_AUTHORIZATION = os.getenv("OTEL_EXPORTER_OTLP_AUTHORIZATION", "") +OTEL_EXPORTER_OTLP_X_API_KEY = os.getenv("OTEL_EXPORTER_OTLP_X_API_KEY", "") +OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION = os.getenv( + "OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION", "") +LANGSMITH_API_KEY = os.getenv("LANGSMITH_API_KEY", "") +LANGSMITH_PROJECT = os.getenv("LANGSMITH_PROJECT", "") +OTEL_EXPORTER_OTLP_METRICS_ENABLED_RAW = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENABLED") +OTEL_EXPORTER_OTLP_METRICS_ENABLED = 
def _parse_otlp_headers(headers_str: str) -> dict[str, str]:
    """Parse an OTLP headers string into a dict.

    Format: 'key1=value1,key2=value2'

    Args:
        headers_str: Comma-separated ``key=value`` pairs. May be empty.

    Returns:
        A mapping of header name to header value, both stripped of
        surrounding whitespace. Segments without ``=`` and segments with an
        empty header name are ignored. Only the first ``=`` in a segment
        separates the name from the value, so values may contain ``=``.
    """
    if not headers_str:
        return {}
    headers: dict[str, str] = {}
    for pair in headers_str.split(","):
        key, separator, value = pair.partition("=")
        key = key.strip()
        # Skip malformed segments: no "=" at all, or an empty header name
        # (e.g. "=value"), which would otherwise create a "" header entry.
        if not separator or not key:
            continue
        headers[key] = value.strip()
    return headers
span_kind=OPENINFERENCE_SPAN_KIND_CHAIN, + ) # Step 2: Save user message (if needed) if not agent_request.is_debug and not skip_user_save: @@ -1912,6 +1926,20 @@ async def run_agent_stream( memory_duration = time.time() - memory_start_time memory_enabled = memory_ctx_preview.user_config.memory_switch + monitoring_manager.set_openinference_agent_context( + agent_id=agent_request.agent_id, + conversation_id=agent_request.conversation_id, + user_id=resolved_user_id, + tenant_id=resolved_tenant_id, + query=agent_request.query, + is_debug=agent_request.is_debug, + memory_enabled=memory_enabled, + extra_metadata={ + "language": language, + "agent_share_option": getattr(memory_ctx_preview.user_config, "agent_share_option", "unknown"), + }, + span_kind=OPENINFERENCE_SPAN_KIND_CHAIN, + ) monitoring_manager.add_span_event("memory_context_build.completed", { "duration": memory_duration, "memory_enabled": memory_enabled, diff --git a/backend/utils/monitoring.py b/backend/utils/monitoring.py index eb20d88ec..cdfc7f6cd 100644 --- a/backend/utils/monitoring.py +++ b/backend/utils/monitoring.py @@ -2,12 +2,12 @@ Global Monitoring Manager for Backend This module initializes and configures the global monitoring manager instance -with backend environment variables. All other backend modules should import -`monitoring_manager` directly from this module. +with backend environment variables using OTLP protocol. All other backend modules +should import `monitoring_manager` directly from this module. 
Usage: from utils.monitoring import monitoring_manager - + @monitoring_manager.monitor_endpoint("my_service.my_function") async def my_function(): return {"status": "ok"} @@ -17,67 +17,79 @@ async def my_function(): MonitoringConfig, get_monitoring_manager ) -# Import configuration from backend (support both relative and absolute imports) try: - # Try relative import first (when running from backend directory) from consts.const import ( ENABLE_TELEMETRY, - SERVICE_NAME, - JAEGER_ENDPOINT, - PROMETHEUS_PORT, - TELEMETRY_SAMPLE_RATE, - LLM_SLOW_REQUEST_THRESHOLD_SECONDS, - LLM_SLOW_TOKEN_RATE_THRESHOLD + MONITORING_PROVIDER, + MONITORING_PROJECT_NAME, + OTEL_SERVICE_NAME, + OTEL_EXPORTER_OTLP_ENDPOINT, + OTEL_EXPORTER_OTLP_TRACES_ENDPOINT, + OTEL_EXPORTER_OTLP_METRICS_ENDPOINT, + OTEL_EXPORTER_OTLP_PROTOCOL, + OTEL_EXPORTER_OTLP_METRICS_ENABLED, + MONITORING_INSTRUMENT_FASTAPI, + MONITORING_INSTRUMENT_REQUESTS, + MONITORING_FASTAPI_EXCLUDED_URLS, + MONITORING_FASTAPI_EXCLUDE_SPANS, + OTLP_HEADERS, + TELEMETRY_SAMPLE_RATE ) except ImportError: - # Fallback to absolute import (when running from project root) from backend.consts.const import ( ENABLE_TELEMETRY, - SERVICE_NAME, - JAEGER_ENDPOINT, - PROMETHEUS_PORT, - TELEMETRY_SAMPLE_RATE, - LLM_SLOW_REQUEST_THRESHOLD_SECONDS, - LLM_SLOW_TOKEN_RATE_THRESHOLD + MONITORING_PROVIDER, + MONITORING_PROJECT_NAME, + OTEL_SERVICE_NAME, + OTEL_EXPORTER_OTLP_ENDPOINT, + OTEL_EXPORTER_OTLP_TRACES_ENDPOINT, + OTEL_EXPORTER_OTLP_METRICS_ENDPOINT, + OTEL_EXPORTER_OTLP_PROTOCOL, + OTEL_EXPORTER_OTLP_METRICS_ENABLED, + MONITORING_INSTRUMENT_FASTAPI, + MONITORING_INSTRUMENT_REQUESTS, + MONITORING_FASTAPI_EXCLUDED_URLS, + MONITORING_FASTAPI_EXCLUDE_SPANS, + OTLP_HEADERS, + TELEMETRY_SAMPLE_RATE ) import logging logger = logging.getLogger(__name__) -# ============================================================================ -# Global Monitoring Manager Instance -# 
def _initialize_monitoring():
    """Initialize monitoring configuration with OTLP settings.

    Builds a ``MonitoringConfig`` from the backend constants, applies it to
    the global ``monitoring_manager`` and logs the effective settings.
    """
    # Canonicalize the provider (trim, lower-case, default "otlp") so the value
    # handed to the SDK matches what the monitoring status API reports for the
    # same MONITORING_PROVIDER setting.
    provider = (MONITORING_PROVIDER or "").strip().lower() or "otlp"

    config = MonitoringConfig(
        enable_telemetry=ENABLE_TELEMETRY,
        service_name=OTEL_SERVICE_NAME,
        provider=provider,
        otlp_endpoint=OTEL_EXPORTER_OTLP_ENDPOINT,
        # Empty strings mean "not set"; pass None for the signal-specific
        # endpoints in that case.
        otlp_traces_endpoint=OTEL_EXPORTER_OTLP_TRACES_ENDPOINT or None,
        otlp_metrics_endpoint=OTEL_EXPORTER_OTLP_METRICS_ENDPOINT or None,
        otlp_protocol=OTEL_EXPORTER_OTLP_PROTOCOL,
        otlp_headers=OTLP_HEADERS,
        export_metrics=OTEL_EXPORTER_OTLP_METRICS_ENABLED,
        instrument_fastapi=MONITORING_INSTRUMENT_FASTAPI,
        instrument_requests=MONITORING_INSTRUMENT_REQUESTS,
        fastapi_excluded_urls=MONITORING_FASTAPI_EXCLUDED_URLS,
        fastapi_exclude_spans=MONITORING_FASTAPI_EXCLUDE_SPANS,
        project_name=MONITORING_PROJECT_NAME or None,
        telemetry_sample_rate=TELEMETRY_SAMPLE_RATE
    )

    monitoring_manager.configure(config)

    # Headers are intentionally not logged: they may carry API keys.
    logger.info(
        f"OTLP monitoring initialized: service_name={OTEL_SERVICE_NAME}, "
        f"enable_telemetry={config.enable_telemetry}, provider={config.provider}, "
        f"endpoint={config.otlp_endpoint}, trace_endpoint={config.get_trace_endpoint()}, "
        f"protocol={OTEL_EXPORTER_OTLP_PROTOCOL}"
    )
module is imported _initialize_monitoring() - -# Export the global monitoring manager instance -__all__ = [ - 'monitoring_manager' -] +__all__ = ['monitoring_manager'] diff --git a/doc/docs/.vitepress/config.mts b/doc/docs/.vitepress/config.mts index 6ee76ff5d..87e79a831 100644 --- a/doc/docs/.vitepress/config.mts +++ b/doc/docs/.vitepress/config.mts @@ -385,6 +385,7 @@ export default defineConfig({ ], }, { text: "性能监控", link: "/zh/sdk/monitoring" }, + { text: "OpenTelemetry 设计", link: "/zh/sdk/opentelemetry-design" }, { text: "向量数据库", link: "/zh/sdk/vector-database" }, { text: "数据处理", link: "/zh/sdk/data-process" }, ], diff --git a/doc/docs/en/getting-started/software-architecture.md b/doc/docs/en/getting-started/software-architecture.md index dde7f8525..99e38a5f9 100644 --- a/doc/docs/en/getting-started/software-architecture.md +++ b/doc/docs/en/getting-started/software-architecture.md @@ -274,7 +274,7 @@ Real-time Input → Streaming Endpoint → Async Processing - **High Availability**: Multi-service redundancy, health checks, auto-restart - **High Performance**: Async processing, Redis caching, vector search optimization - **High Concurrency**: Distributed architecture, load balancing -- **Monitoring Friendly**: Prometheus metrics, Jaeger tracing, structured logging +- **Monitoring Friendly**: OpenTelemetry observability, Grafana Tempo tracing, structured logging ### 🔧 Developer Friendly - **Modular Development**: Clean layered architecture (App → Service → Database) diff --git a/doc/docs/en/sdk/monitoring.md b/doc/docs/en/sdk/monitoring.md index 4aa625132..6dd3bede1 100644 --- a/doc/docs/en/sdk/monitoring.md +++ b/doc/docs/en/sdk/monitoring.md @@ -1,289 +1,275 @@ -# 🚀 Nexent LLM Monitoring System +# Nexent Agent Observability (OTLP) -Enterprise-grade monitoring solution specifically designed for monitoring LLM token generation speed and performance. +Enterprise-grade observability for AI agents using OpenTelemetry OTLP protocol. 
Supports integration with observability platforms like Arize Phoenix, Langfuse, LangSmith, Grafana Tempo, Apache SkyWalking, and more. -## 📊 System Architecture +## Architecture ``` -┌─────────────────────────────────────────────────────────┐ -│ Nexent LLM Monitoring System │ -├─────────────────────────────────────────────────────────┤ -│ │ -│ Nexent API ──► OpenTelemetry ──► Jaeger (Tracing) │ -│ │ │ │ -│ │ └──────► Prometheus (Metrics) │ -│ │ │ │ -│ └─► OpenAI LLM └──► Grafana (Visualization) │ -│ (Token Monitoring) │ -└─────────────────────────────────────────────────────────┘ +NexentAgent ──► OpenTelemetry SDK ──► OTLP Collector ──► Arize Phoenix / Langfuse / LangSmith / Grafana Tempo / SkyWalking / OTLP Backend + │ │ + │ OpenInference Semantics │ + │ (llm.*, agent.* attributes) │ + └────────────────────────────────────────┘ ``` -## ⚡ Quick Start (5 minutes) +## Quick Start ```bash -# 1. Start monitoring services -./docker/start-monitoring.sh +cd docker +cp .env.example .env -# 2. Install performance monitoring dependencies -uv sync --extra performance - -# 3. Enable monitoring -export ENABLE_TELEMETRY=true +vim .env +ENABLE_TELEMETRY=true +MONITORING_PROVIDER=otlp +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http -# 4. Start backend service -python backend/config_service.py -python backend/runtime_service.py +docker-compose -f docker-compose-monitoring.yml up -d ``` -## 📊 Access Monitoring Interfaces +## AI Observability Platforms -| Interface | URL | Purpose | -|-----------|-----|---------| -| **Grafana Dashboard** | http://localhost:3005 | LLM Performance Monitoring | -| **Jaeger Tracing** | http://localhost:16686 | Request Trace Analysis | -| **Prometheus Metrics** | http://localhost:9090 | Raw Monitoring Data | +### Arize Phoenix -### 🔐 Grafana Login Information +Arize Phoenix provides AI-specific observability with OpenInference semantic support. 
-When first accessing Grafana (http://localhost:3005), you need to login: +**Configuration:** -``` -Username: admin -Password: admin +```bash +MONITORING_PROVIDER=phoenix +OTEL_EXPORTER_OTLP_ENDPOINT=https://app.phoenix.arize.com/s/YOUR_SPACE +OTEL_EXPORTER_OTLP_AUTHORIZATION="Bearer YOUR_PHOENIX_API_KEY" +OTEL_EXPORTER_OTLP_PROTOCOL=http +OTEL_EXPORTER_OTLP_METRICS_ENABLED=false ``` -**After first login, you'll be prompted to change password:** -- Set a new password (recommended) -- Click "Skip" to skip (development environment) +**Features:** +- LLM trace visualization with prompt/completion +- Token-level performance metrics +- Agent step tracing +- Cost analysis -**After login, you can see:** -- 📊 **LLM Performance Dashboard** - Pre-configured performance dashboard -- 📈 **Data Source Configuration** - Auto-connected to Prometheus and Jaeger -- 🎯 **Real-time Monitoring Panel** - Key metrics like token generation speed, latency +### Langfuse -## 🎯 Core Features +Langfuse offers prompt management and LLM observability with OTLP support. 
-### ⚡ LLM-Specific Monitoring -- **Token Generation Speed**: Real-time monitoring of tokens generated per second -- **TTFT (Time to First Token)**: First token return latency -- **Streaming Response Analysis**: Generation timestamp for each token -- **Model Performance Comparison**: Performance benchmarks across different models +**Configuration:** -### 🔍 Distributed Tracing -- **Complete Request Chain**: End-to-end tracing from HTTP to LLM -- **Performance Bottleneck Detection**: Automatically identify slow queries and anomalies -- **Error Root Cause Analysis**: Quickly locate problem sources - -### 🛠️ Developer-Friendly Design -- **One-Line Integration**: Quick monitoring with decorators -- **Zero-Dependency Degradation**: Auto-skip when monitoring dependencies are missing -- **Zero-Touch Usage**: No need to manually check monitoring status, handled automatically -- **Flexible Configuration**: Environment variable controlled behavior - -## 🛠️ Adding Monitoring to Code +```bash +MONITORING_PROVIDER=langfuse +OTEL_EXPORTER_OTLP_ENDPOINT=https://cloud.langfuse.com/api/public/otel -### 🎯 Recommended Approach: Singleton Pattern (v2.1+) +LANGFUSE_PUBLIC_KEY=pk-xxx +LANGFUSE_SECRET_KEY=sk-xxx -```python -# Backend service usage - directly use globally configured monitoring_manager -from utils.monitoring import monitoring_manager +OTEL_EXPORTER_OTLP_AUTHORIZATION=Basic BASE64_ENCODED_KEY +OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION=4 +``` -# API endpoint monitoring -@monitoring_manager.monitor_endpoint("my_service.my_function") -async def my_api_function(): - return {"status": "ok"} +Generate the encoded key: -# LLM call monitoring -@monitoring_manager.monitor_llm_call("gpt-4", "chat_completion") -def call_llm(messages): - # Automatically get token-level monitoring - return llm_response - -# Manual monitoring events -monitoring_manager.add_span_event("custom_event", {"key": "value"}) -monitoring_manager.set_span_attributes(user_id="123", action="process") +```bash 
+echo -n "$LANGFUSE_PUBLIC_KEY:$LANGFUSE_SECRET_KEY" | base64 ``` -### 📦 Direct SDK Usage +**Features:** +- Prompt versioning and management +- Session-based trace grouping +- User feedback collection +- Model cost tracking -```python -from nexent.monitor import get_monitoring_manager - -# Get global monitoring manager - already configured in backend -monitor = get_monitoring_manager() - -# Use decorators -@monitor.monitor_llm_call("claude-3", "completion") -def my_llm_function(): - return "response" - -# Or use directly in business logic -with monitor.trace_llm_request("custom_operation", "my_model") as span: - # Execute business logic - result = process_data() - monitor.add_span_event("processing_completed") - return result -``` +### LangSmith -### ✨ Global Configuration Automation +LangSmith supports online OTLP trace ingestion through the OpenTelemetry endpoint. Nexent can send traces to a local Collector first, and the Collector forwards them to LangSmith. -Monitoring configuration is auto-initialized in `backend/utils/monitoring.py`: +**Collector forwarding:** -```python -# No manual configuration needed - auto-completed at system startup -# monitoring_manager already configured with environment variables -from utils.monitoring import monitoring_manager +```bash +cd docker +vim monitoring/monitoring.env -# Direct usage without checking if enabled -@monitoring_manager.monitor_endpoint("my_function") -def my_function(): - pass +MONITORING_PROVIDER=langsmith +LANGSMITH_API_KEY=lsv2_xxx +LANGSMITH_PROJECT=nexent +LANGSMITH_OTLP_TRACES_ENDPOINT=https://api.smith.langchain.com/otel/v1/traces -# FastAPI application initialization -monitoring_manager.setup_fastapi_app(app) +./start-monitoring.sh --stack langsmith ``` -### 🔒 Auto Start/Stop Design - -- **Smart Monitoring**: Auto start/stop based on `ENABLE_TELEMETRY` environment variable -- **Zero-Touch Usage**: External code doesn't need to check monitoring status, use all features directly -- **Graceful 
Degradation**: Silent no-effect when disabled, normal operation when enabled -- **Default Off**: Auto-disabled when not configured +Nexent backend configuration when it sends OTLP to the Collector: ```bash -# Enable monitoring -export ENABLE_TELEMETRY=true - -# Disable monitoring -export ENABLE_TELEMETRY=false +ENABLE_TELEMETRY=true +MONITORING_PROVIDER=langsmith +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http +OTEL_EXPORTER_OTLP_METRICS_ENABLED=false ``` -## 📊 Core Monitoring Metrics +For direct backend-to-LangSmith export, set `OTEL_EXPORTER_OTLP_ENDPOINT=https://api.smith.langchain.com/otel`, `LANGSMITH_API_KEY`, and optionally `LANGSMITH_PROJECT`. -| Metric | Description | Importance | -|--------|-------------|------------| -| `llm_token_generation_rate` | Token generation speed (tokens/s) | ⭐⭐⭐ | -| `llm_time_to_first_token_seconds` | First token latency | ⭐⭐⭐ | -| `llm_request_duration_seconds` | Complete request duration | ⭐⭐⭐ | -| `llm_total_tokens` | Input/output token count | ⭐⭐ | -| `llm_error_count` | LLM call error count | ⭐⭐⭐ | +### Apache SkyWalking -## 🔧 Environment Configuration +SkyWalking provides general APM, service topology, endpoint analysis, and trace query capabilities. For local deployment, Nexent sends OTLP to the Collector, and the Collector forwards traces to SkyWalking OAP over OTLP gRPC. 
```bash -# Add to .env file -cat >> .env << EOF -ENABLE_TELEMETRY=true -SERVICE_NAME=nexent-backend -JAEGER_ENDPOINT=http://localhost:14268/api/traces -LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 -LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 -TELEMETRY_SAMPLE_RATE=1.0 # Development environment, production recommended 0.1 -EOF +MONITORING_PROVIDER=skywalking +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http ``` -## 🛠️ System Verification +For direct OAP access: ```bash -# Check metrics endpoint -curl http://localhost:8000/metrics - -# Verify dependency installation -python -c "from backend.utils.monitoring import MONITORING_AVAILABLE; print(f'Monitoring Available: {MONITORING_AVAILABLE}')" +MONITORING_PROVIDER=skywalking +OTEL_EXPORTER_OTLP_ENDPOINT=http://skywalking-oap:11800 +OTEL_EXPORTER_OTLP_PROTOCOL=grpc +OTEL_EXPORTER_OTLP_METRICS_ENABLED=false ``` -## 🆘 Troubleshooting - -### No monitoring data? -```bash -# Check service status -docker-compose -f docker/docker-compose-monitoring.yml ps +## Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `ENABLE_TELEMETRY` | `false` | Enable/disable monitoring | +| `MONITORING_PROVIDER` | `otlp` | Provider profile: `otlp`, `phoenix`, `langfuse`, `langsmith`, `grafana`, `skywalking` | +| `MONITORING_PROJECT_NAME` | `nexent` | Observability platform project name | +| `OTEL_SERVICE_NAME` | `nexent-backend` | Service identifier | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | `http://localhost:4318` | OTLP base endpoint; SDK derives `/v1/traces` and `/v1/metrics` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | (empty) | Optional trace-specific endpoint | +| `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` | (empty) | Optional metric-specific endpoint | +| `OTEL_EXPORTER_OTLP_PROTOCOL` | `http` | Protocol: `http` or `grpc` | +| `OTEL_EXPORTER_OTLP_HEADERS` | (empty) | Generic auth headers (comma-separated) | +| `OTEL_EXPORTER_OTLP_AUTHORIZATION` | (empty) | `Authorization` 
header, commonly used by Phoenix bearer auth and Langfuse | +| `OTEL_EXPORTER_OTLP_X_API_KEY` | (empty) | `x-api-key` header for platforms that require it | +| `OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION` | (empty) | Langfuse ingestion version, for example `4` | +| `OTEL_EXPORTER_OTLP_METRICS_ENABLED` | `true` | Whether to export OTLP metrics | +| `LANGSMITH_API_KEY` | (empty) | LangSmith API key; mapped to the `x-api-key` OTLP header | +| `LANGSMITH_PROJECT` | (empty) | Optional LangSmith project header | +| `LANGSMITH_OTLP_TRACES_ENDPOINT` | `https://api.smith.langchain.com/otel/v1/traces` | Collector trace endpoint for online LangSmith | + +## Code Integration + +### Endpoint Monitoring -# Check dependency installation -python -c "import opentelemetry; print('✅ Monitoring dependencies installed')" -``` +```python +from utils.monitoring import monitoring_manager -### Port conflicts? -```bash -# Check port usage -lsof -i :3005 -i :9090 -i :16686 +@monitoring_manager.monitor_endpoint("my_service.my_function") +async def my_api_function(): + return {"status": "ok"} ``` -### Dependency installation issues? -```bash -# Reinstall performance dependencies -uv sync --extra performance +### LLM Call Monitoring -# Check performance configuration in pyproject.toml -cat backend/pyproject.toml | grep -A 20 "performance" +```python +@monitoring_manager.monitor_llm_call("gpt-4", "chat_completion") +def call_llm(messages): + return llm_response ``` -### Service name shows as unknown_service? 
-```bash -# Check environment variable configuration -echo "SERVICE_NAME: $SERVICE_NAME" +### Agent Step Tracing -# Restart monitoring service to apply new configuration -./docker/start-monitoring.sh +```python +with monitoring_manager.trace_agent_step("web_search", "research_agent", "tool_call") as span: + result = execute_tool() + monitoring_manager.set_tool_output(result) ``` -## 🧹 Data Management +### Tool Call Tracing -### Clean Jaeger Trace Data -```bash -# Method 1: Restart Jaeger container (simplest) -docker-compose -f docker/docker-compose-monitoring.yml restart nexent-jaeger - -# Method 2: Completely rebuild Jaeger container and data -docker-compose -f docker/docker-compose-monitoring.yml stop nexent-jaeger -docker-compose -f docker/docker-compose-monitoring.yml rm -f nexent-jaeger -docker-compose -f docker/docker-compose-monitoring.yml up -d nexent-jaeger +```python +with monitoring_manager.trace_tool_call("web_search", "agent_name", {"query": "test"}) as span: + results = search_web("test") + monitoring_manager.set_tool_output({"results": results}) +``` -# Method 3: Clean all monitoring data (rebuild all containers) -docker-compose -f docker/docker-compose-monitoring.yml down -docker-compose -f docker/docker-compose-monitoring.yml up -d +## OpenInference Semantic Attributes + +The system uses OpenInference semantic conventions for AI-specific observability: + +### LLM Attributes + +| Attribute | Description | +|-----------|-------------| +| `llm.model_name` | Model identifier (e.g., `gpt-4`) | +| `llm.operation.name` | Operation type (e.g., `chat_completion`) | +| `llm.token_count.prompt` | Input token count | +| `llm.token_count.completion` | Output token count | +| `llm.invocation_parameters` | Model parameters (JSON) | +| `llm.time_to_first_token` | TTFT in seconds | + +### Agent Attributes + +| Attribute | Description | +|-----------|-------------| +| `agent.name` | Agent identifier | +| `agent.step.name` | Step name (e.g., `web_search`) | +| 
`agent.step.type` | Step type: `tool_call`, `reasoning`, `action_selection` | +| `agent.tool.name` | Tool name | +| `agent.tool.input` | Tool input (JSON) | +| `agent.tool.output` | Tool output (JSON) | + +## Metrics + +| Metric | Description | +|--------|-------------| +| `llm.request.duration` | Request latency | +| `llm.token.generation_rate` | Tokens per second | +| `llm.time_to_first_token` | TTFT | +| `llm.token_count.prompt` | Input tokens | +| `llm.token_count.completion` | Output tokens | +| `agent.step.count` | Agent step count | +| `agent.execution.duration` | Agent execution time | +| `agent.error.count` | Agent errors | + +## Collector Configuration + +By default, the OpenTelemetry Collector only logs data through the debug exporter. This avoids forwarding data back into itself when no external backend is configured. To forward through the Collector, add a platform exporter: + +```yaml +exporters: + otlphttp/langsmith: + traces_endpoint: https://api.smith.langchain.com/otel/v1/traces + headers: + x-api-key: YOUR_LANGSMITH_API_KEY + Langsmith-Project: nexent + +service: + pipelines: + traces: + exporters: [otlphttp/langsmith, debug] ``` -### Clean Prometheus Metrics Data -```bash -# Restart Prometheus container -docker-compose -f docker/docker-compose-monitoring.yml restart nexent-prometheus +See `docker/monitoring/otel-collector-config.yml` for full configuration with platform examples. 
-# Completely clean Prometheus data -docker-compose -f docker/docker-compose-monitoring.yml stop nexent-prometheus -docker volume rm docker_prometheus_data 2>/dev/null || true -docker-compose -f docker/docker-compose-monitoring.yml up -d nexent-prometheus -``` +## Graceful Degradation -### Clean Grafana Configuration -```bash -# Reset Grafana configuration and dashboards -docker-compose -f docker/docker-compose-monitoring.yml stop nexent-grafana -docker volume rm docker_grafana_data 2>/dev/null || true -docker-compose -f docker/docker-compose-monitoring.yml up -d nexent-grafana +When OpenTelemetry dependencies are not installed, monitoring gracefully disables: + +```bash +pip install nexent # Basic package - no monitoring +pip install nexent[performance] # With OTLP support ``` -## 📈 Typical Problem Analysis +All monitoring methods work without errors when disabled - decorators pass through, context managers yield None. -### Slow token generation (< 5 tokens/s) -1. **Analysis**: Grafana → Token Generation Rate panel -2. **Solution**: Check model service load, optimize input prompt length +## Troubleshooting -### Slow request response (> 10s) -1. **Analysis**: Jaeger → View complete trace chain -2. **Solution**: Locate bottleneck (database/LLM/network) +### No data appearing -### Error rate spike (> 10%) -1. **Analysis**: Prometheus → llm_error_count metric -2. **Solution**: Check model service availability, verify API keys +1. Check `ENABLE_TELEMETRY=true` in `.env` +2. Verify OTLP endpoint is reachable +3. Check authentication headers are correct -## 🎉 Getting Started +### Connection errors -After setup completion, you can: +1. Test endpoint: `curl -v $OTEL_EXPORTER_OTLP_ENDPOINT/v1/traces` +2. Verify protocol matches endpoint (`http` vs `grpc`) +3. Check Collector logs: `docker logs nexent-otel-collector` -1. 📊 View **LLM Performance Dashboard** in Grafana -2. 🔍 Trace complete request chains in Jaeger -3. 
📈 Analyze token generation speed and performance bottlenecks -4. 🚨 Set performance alerts and thresholds +### Wrong attributes -Enjoy efficient LLM performance monitoring! 🚀 +1. Verify OpenInference attributes in platform UI +2. Check span attribute naming: `llm.model_name` not `model_name` +3. Review platform-specific attribute requirements diff --git a/doc/docs/zh/getting-started/software-architecture.md b/doc/docs/zh/getting-started/software-architecture.md index 24c83152d..8676992a4 100644 --- a/doc/docs/zh/getting-started/software-architecture.md +++ b/doc/docs/zh/getting-started/software-architecture.md @@ -274,7 +274,7 @@ Docker Compose 编排: - **高可用性**:多服务冗余、健康检查、自动重启 - **高性能**:异步处理、Redis 缓存、向量搜索优化 - **高并发**:分布式架构、负载均衡 -- **监控友好**:Prometheus 指标、Jaeger 追踪、结构化日志 +- **监控友好**:OpenTelemetry 可观测性、Grafana Tempo 追踪、结构化日志 ### 🔧 开发友好 - **模块化开发**:清晰的分层架构(App → Service → Database) diff --git a/doc/docs/zh/sdk/monitoring.md b/doc/docs/zh/sdk/monitoring.md index c592df267..8759d592f 100644 --- a/doc/docs/zh/sdk/monitoring.md +++ b/doc/docs/zh/sdk/monitoring.md @@ -1,289 +1,438 @@ -# 🚀 Nexent LLM 监控系统 +# Nexent Agent 可观测性(OTLP) -专门监控大模型 Token 生成速度和性能的企业级监控解决方案。 +基于 OpenTelemetry OTLP 协议的 AI Agent 企业级可观测性方案。支持对接 Arize Phoenix、Langfuse、LangSmith、Grafana Tempo、Apache SkyWalking 等可观测性平台。 -## 📊 系统架构 +## 系统架构 ``` -┌─────────────────────────────────────────────────────────┐ -│ Nexent LLM 监控系统 │ -├─────────────────────────────────────────────────────────┤ -│ │ -│ Nexent API ──► OpenTelemetry ──► Jaeger (链路追踪) │ -│ │ │ │ -│ │ └──────► Prometheus (指标收集) │ -│ │ │ │ -│ └─► OpenAI LLM └──► Grafana (可视化) │ -│ (Token 监控) │ -└─────────────────────────────────────────────────────────┘ +NexentAgent ──► OpenTelemetry SDK ──► OTLP Collector ──► Arize Phoenix / Langfuse / LangSmith / Grafana Tempo / SkyWalking / OTLP Backend + │ │ + │ OpenInference 语义约定 │ + │ (llm.*, agent.* 属性) │ + └────────────────────────────────────────┘ ``` -## ⚡ 快速启动(5分钟) +## 快速启动 ```bash -# 1. 
启动监控服务 -./docker/start-monitoring.sh +cd docker +cp monitoring/monitoring.env.example monitoring/monitoring.env -# 2. 安装性能监控依赖 -uv sync --extra performance - -# 3. 启用监控 -export ENABLE_TELEMETRY=true +vim monitoring/monitoring.env +ENABLE_TELEMETRY=true +MONITORING_PROVIDER=otlp +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http -# 4. 启动后端服务 -python backend/config_service.py -python backend/runtime_service.py +./start-monitoring.sh --stack collector ``` -## 📊 访问监控界面 +## 本地化部署形态 -| 界面 | 地址 | 用途 | -|------|------|------| -| **Grafana 仪表板** | http://localhost:3005 | LLM 性能监控 | -| **Jaeger 链路追踪** | http://localhost:16686 | 请求链路分析 | -| **Prometheus 指标** | http://localhost:9090 | 原始监控数据 | +`docker/start-monitoring.sh` 支持多种形态,均以 OpenTelemetry Collector 作为统一入口。业务服务只需要把 OTLP 发到 Collector,不需要感知后端平台差异。 -### 🔐 Grafana 登录信息 +| 形态 | 命令 | 包含服务 | 适用场景 | +|------|------|----------|----------| +| `collector` | `./start-monitoring.sh --stack collector` | OpenTelemetry Collector | 只验证埋点、或转发到外部云端平台 | +| `phoenix` | `./start-monitoring.sh --stack phoenix` | Collector + Phoenix | 本地 trace 调试、OpenInference 属性查看、实验分析 | +| `langfuse` | `./start-monitoring.sh --stack langfuse` | Collector + Langfuse Web/Worker + Postgres + ClickHouse + MinIO + Redis | 本地完整 LLMOps 体验、会话/用户/反馈/成本分析 | +| `langsmith` | `./start-monitoring.sh --stack langsmith` | OpenTelemetry Collector | 转发 traces 到在线 LangSmith 平台 | +| `grafana` | `./start-monitoring.sh --stack grafana` | Collector + Grafana + Tempo | 本地 Tempo trace 查询 | +| `skywalking` | `./start-monitoring.sh --stack skywalking` | Collector + SkyWalking OAP + SkyWalking UI + BanyanDB | 本地 APM、服务拓扑、OTLP trace 查询 | -首次访问 Grafana (http://localhost:3005) 时需要登录: +也可以在 `docker/monitoring/monitoring.env` 中设置默认形态: -``` -用户名: admin -密码: admin +```bash +MONITORING_PROVIDER=phoenix ``` -**首次登录后会要求修改密码,可以:** -- 设置新密码(推荐) -- 点击 "Skip" 跳过(开发环境) +### 本地 Phoenix -**登录后可以看到:** -- 📊 **LLM Performance Dashboard** - 预配置的性能仪表板 -- 📈 **数据源配置** - 
自动连接到 Prometheus 和 Jaeger -- 🎯 **实时监控面板** - Token 生成速度、延迟等关键指标 +Phoenix 本地部署使用 `arizephoenix/phoenix` 镜像,默认 UI 端口为 `6006`,gRPC OTLP 端口映射为 `4319`,数据持久化到 Docker volume `phoenix-data`。 -## 🎯 核心功能特性 +```bash +cd docker +./start-monitoring.sh --stack phoenix +``` -### ⚡ LLM 专用监控 -- **Token 生成速度**: 实时监控每秒生成的 token 数量 -- **TTFT (Time to First Token)**: 首个 token 返回延迟 -- **流式响应分析**: 每个 token 的生成时间戳 -- **模型性能对比**: 不同模型的性能基准 +访问地址: -### 🔍 分布式链路追踪 -- **完整请求链路**: 从 HTTP 到 LLM 的端到端追踪 -- **性能瓶颈识别**: 自动定位慢查询和异常 -- **错误根因分析**: 快速定位问题根源 +- Phoenix UI:`http://localhost:6006` +- Collector OTLP HTTP:`http://localhost:4318` +- Collector OTLP gRPC:`localhost:4317` -### 🛠️ 开发友好设计 -- **一行代码接入**: 使用装饰器快速添加监控 -- **零依赖降级**: 未安装监控依赖时自动跳过 -- **零感知使用**: 无需手动检查监控状态,自动处理 -- **灵活配置**: 环境变量控制监控行为 +Nexent 后端在 Docker 网络内运行时: -## 🛠️ 添加监控到代码 +```bash +ENABLE_TELEMETRY=true +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http +OTEL_EXPORTER_OTLP_METRICS_ENABLED=false +``` -### 🎯 推荐方式:单例模式 (v2.1+) +后端直接在宿主机运行时,把 endpoint 改为 `http://localhost:4318`。 -```python -# 后端服务中使用 - 直接使用全局配置好的 monitoring_manager -from utils.monitoring import monitoring_manager +### 本地 Langfuse -# API 端点监控 -@monitoring_manager.monitor_endpoint("my_service.my_function") -async def my_api_function(): - return {"status": "ok"} - -# LLM 调用监控 -@monitoring_manager.monitor_llm_call("gpt-4", "chat_completion") -def call_llm(messages): - # 自动获得 Token 级别监控 - return llm_response +Langfuse 本地部署使用 v3 架构:Web、Worker、Postgres、ClickHouse、MinIO、Redis。默认 UI 端口为 `3001`,初始化项目和 API Key 来自 `monitoring.env`。 -# 手动添加监控事件 -monitoring_manager.add_span_event("custom_event", {"key": "value"}) -monitoring_manager.set_span_attributes(user_id="123", action="process") +```bash +cd docker +./start-monitoring.sh --stack langfuse ``` -### 📦 SDK中直接使用 +访问地址: -```python -from nexent.monitor import get_monitoring_manager +- Langfuse UI:`http://localhost:3001` +- 默认管理员:`admin@nexent.local` / `nexent-langfuse-admin` +- 默认项目 
Key:`pk-lf-nexent-local` / `sk-lf-nexent-local` -# 获取全局监控管理器 - 在backend已自动配置 -monitor = get_monitoring_manager() +启动脚本会在 `LANGFUSE_OTLP_AUTH_HEADER` 为空时自动生成 `Basic base64(public_key:secret_key)`,并让 Collector 将 trace 转发到 `http://langfuse-web:3000/api/public/otel`。本地默认密钥只适合开发验证,生产部署必须替换 `LANGFUSE_NEXTAUTH_SECRET`、`LANGFUSE_SALT`、`LANGFUSE_ENCRYPTION_KEY`、数据库密码和对象存储密钥。 -# 使用装饰器 -@monitor.monitor_llm_call("claude-3", "completion") -def my_llm_function(): - return "response" +### 在线 LangSmith -# 或者在业务逻辑中直接使用 -with monitor.trace_llm_request("custom_operation", "my_model") as span: - # 执行业务逻辑 - result = process_data() - monitor.add_span_event("processing_completed") - return result -``` +LangSmith 支持通过在线 OTLP endpoint 摄取 traces。Nexent 可以先把 OTLP 发到本地 Collector,再由 Collector 转发到 LangSmith,业务服务无需直接保存 LangSmith API Key。 -### ✨ 全局配置自动化 +```bash +cd docker +vim monitoring/monitoring.env -监控配置已在 `backend/utils/monitoring.py` 中自动初始化: +MONITORING_PROVIDER=langsmith +LANGSMITH_API_KEY=lsv2_xxx +LANGSMITH_PROJECT=nexent +LANGSMITH_OTLP_TRACES_ENDPOINT=https://api.smith.langchain.com/otel/v1/traces -```python -# 无需手动配置 - 系统启动时自动完成 -# monitoring_manager 已经使用环境变量配置完成 -from utils.monitoring import monitoring_manager +./start-monitoring.sh --stack langsmith +``` -# 直接使用即可,无需检查是否开启 -@monitoring_manager.monitor_endpoint("my_function") -def my_function(): - pass +后端在 Docker 网络内运行时: -# FastAPI应用初始化 -monitoring_manager.setup_fastapi_app(app) +```bash +ENABLE_TELEMETRY=true +MONITORING_PROVIDER=langsmith +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http +OTEL_EXPORTER_OTLP_METRICS_ENABLED=false ``` -### 🔒 自动启停设计 +LangSmith 当前配置只转发 traces,OTLP metrics 会留在 Collector debug pipeline。若需要后端直接写入 LangSmith,可设置 `OTEL_EXPORTER_OTLP_ENDPOINT=https://api.smith.langchain.com/otel`、`LANGSMITH_API_KEY` 和可选的 `LANGSMITH_PROJECT`。 + +### 本地 Grafana + Tempo -- **智能监控**: 根据 `ENABLE_TELEMETRY` 环境变量自动启停 -- **零感知使用**: 外部代码无需检查监控状态,直接使用所有功能 -- **优雅降级**: 未开启时静默无效果,开启时正常工作 -- 
**默认关闭**: 未配置时自动视为关闭状态 +Grafana 本地部署使用 Grafana Tempo 存储 traces,并启用 Tempo `metrics-generator` 的 `local-blocks` processor 支持 Grafana trace breakdown 中的 TraceQL metrics 查询。Collector 接收 Nexent 后端的 OTLP traces/metrics,其中 traces 通过 OTLP gRPC 转发到 Tempo;OTLP metrics 只进入 Collector debug pipeline,不提供独立指标存储或指标 dashboard。 ```bash -# 开启监控 -export ENABLE_TELEMETRY=true +cd docker +./start-monitoring.sh --stack grafana +``` + +后端 `.env` 使用已有的 `MONITORING_PROVIDER` 控制前端顶栏监控入口: -# 关闭监控 -export ENABLE_TELEMETRY=false +```bash +ENABLE_TELEMETRY=true +MONITORING_PROVIDER=grafana +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 ``` -## 📊 核心监控指标 +访问地址: + +- Grafana UI:`http://localhost:3002` +- 默认管理员:`admin` / `nexent-grafana-admin` +- Tempo API:`http://localhost:3200` -| 指标 | 描述 | 重要性 | -|------|------|-------| -| `llm_token_generation_rate` | Token 生成速度 (tokens/s) | ⭐⭐⭐ | -| `llm_time_to_first_token_seconds` | 首 Token 延迟 | ⭐⭐⭐ | -| `llm_request_duration_seconds` | 完整请求耗时 | ⭐⭐⭐ | -| `llm_total_tokens` | 输入/输出 Token 数量 | ⭐⭐ | -| `llm_error_count` | LLM 调用错误数 | ⭐⭐⭐ | +Grafana 会自动预置 Tempo datasource,并加载 `Nexent Agent Trace Monitoring` dashboard。Trace 查询入口在 Grafana Explore 中选择 `Tempo` datasource,示例 TraceQL 为 `{ resource.service.name = "nexent-backend" }`。 -## 🔧 环境配置 +### 本地 SkyWalking + +SkyWalking 本地部署使用 SkyWalking OAP、SkyWalking UI 和 BanyanDB。Collector 接收 Nexent 后端的 OTLP traces/metrics,其中 traces 通过 OTLP gRPC 转发到 SkyWalking OAP;OTLP metrics 当前只进入 Collector debug pipeline。SkyWalking 的 OTel metrics 接入需要 MAL rule 映射,不建议在没有规则的情况下直接假设 LLM metrics 会自动出现在 UI 中。 ```bash -# 添加到 .env 文件 -cat >> .env << EOF -ENABLE_TELEMETRY=true -SERVICE_NAME=nexent-backend -JAEGER_ENDPOINT=http://localhost:14268/api/traces -LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 -LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 -TELEMETRY_SAMPLE_RATE=1.0 # 开发环境,生产环境推荐 0.1 -EOF +cd docker +./start-monitoring.sh --stack skywalking ``` -## 🛠️ 验证系统 +后端 `.env`: ```bash -# 检查指标端点 -curl http://localhost:8000/metrics +ENABLE_TELEMETRY=true 
+MONITORING_PROVIDER=skywalking +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http +``` + +访问地址: + +- SkyWalking UI:`http://localhost:8080` +- SkyWalking OAP HTTP API:`http://localhost:12800` +- SkyWalking OAP gRPC API:`localhost:11800` + +SkyWalking OAP 侧启用 `SW_OTEL_RECEIVER=default`、`SW_OTEL_RECEIVER_ENABLED_HANDLERS=otlp-traces`、`SW_RECEIVER_ZIPKIN=default` 和 `SW_QUERY_ZIPKIN=default`。这是因为 SkyWalking 的 OTLP trace handler 会按其 OTLP trace 文档进入 Zipkin trace 查询链路。 -# 验证依赖安装 -python -c "from backend.utils.monitoring import MONITORING_AVAILABLE; print(f'监控可用: {MONITORING_AVAILABLE}')" +## AI 可观测性平台对接 + +### Arize Phoenix + +Arize Phoenix 提供针对 AI 的专业可观测性,原生支持 OpenInference 语义。 + +**配置:** + +```bash +MONITORING_PROVIDER=phoenix +OTEL_EXPORTER_OTLP_ENDPOINT=https://app.phoenix.arize.com/s/YOUR_SPACE +OTEL_EXPORTER_OTLP_AUTHORIZATION="Bearer YOUR_PHOENIX_API_KEY" +OTEL_EXPORTER_OTLP_PROTOCOL=http +OTEL_EXPORTER_OTLP_METRICS_ENABLED=false ``` -## 🆘 故障排除 +**功能特性:** +- LLM 调用链可视化(Prompt/Completion) +- Token 级性能指标 +- Agent 步骤追踪 +- 成本分析 + +### Langfuse + +Langfuse 提供 Prompt 管理和 LLM 可观测性,支持 OTLP 协议。 + +**配置:** -### 监控数据为空? ```bash -# 检查服务状态 -docker-compose -f docker/docker-compose-monitoring.yml ps +MONITORING_PROVIDER=langfuse +OTEL_EXPORTER_OTLP_ENDPOINT=https://cloud.langfuse.com/api/public/otel -# 检查依赖安装 -python -c "import opentelemetry; print('✅ 监控依赖已安装')" +LANGFUSE_PUBLIC_KEY=pk-xxx +LANGFUSE_SECRET_KEY=sk-xxx + +OTEL_EXPORTER_OTLP_AUTHORIZATION=Basic BASE64_ENCODED_KEY +OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION=4 ``` -### 端口冲突? +生成认证 Key: + ```bash -# 检查端口占用 -lsof -i :3005 -i :9090 -i :16686 +echo -n "$LANGFUSE_PUBLIC_KEY:$LANGFUSE_SECRET_KEY" | base64 ``` -### 依赖安装问题? 
-```bash -# 重新安装性能依赖 -uv sync --extra performance +**功能特性:** +- Prompt 版本管理 +- 会话级 Trace 分组 +- 用户反馈收集 +- 模型成本追踪 + +## 环境变量 + +| 变量 | 默认值 | 说明 | +|------|--------|------| +| `ENABLE_TELEMETRY` | `false` | 启用/禁用监控 | +| `MONITORING_PROVIDER` | `otlp` | 平台配置和本地部署形态:`otlp`、`phoenix`、`langfuse`、`langsmith`、`grafana`、`skywalking` | +| `MONITORING_PROJECT_NAME` | `nexent` | 监控平台项目名 | +| `OTEL_SERVICE_NAME` | `nexent-backend` | 服务标识 | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | `http://localhost:4318` | OTLP base endpoint,SDK 会派生 `/v1/traces` 和 `/v1/metrics` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | (空) | 可选 trace 专用 endpoint | +| `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` | (空) | 可选 metric 专用 endpoint | +| `OTEL_EXPORTER_OTLP_PROTOCOL` | `http` | 协议:`http` 或 `grpc` | +| `OTEL_EXPORTER_OTLP_HEADERS` | (空) | 通用认证头(逗号分隔) | +| `OTEL_EXPORTER_OTLP_AUTHORIZATION` | (空) | `Authorization` header,常用于 Phoenix bearer auth 和 Langfuse | +| `OTEL_EXPORTER_OTLP_X_API_KEY` | (空) | `x-api-key` header,用于兼容需要该 header 的平台 | +| `OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION` | (空) | Langfuse 实时摄取版本,例如 `4` | +| `OTEL_EXPORTER_OTLP_METRICS_ENABLED` | `true` | 是否导出 OTLP metrics | +| `LANGSMITH_API_KEY` | (空) | LangSmith API Key,会映射为 OTLP `x-api-key` header | +| `LANGSMITH_PROJECT` | (空) | 可选 LangSmith project header | +| `LANGSMITH_OTLP_TRACES_ENDPOINT` | `https://api.smith.langchain.com/otel/v1/traces` | Collector 转发到在线 LangSmith 的 trace endpoint | +| `MONITORING_INSTRUMENT_FASTAPI` | `true` | 是否启用 FastAPI 自动 HTTP server span | +| `MONITORING_INSTRUMENT_REQUESTS` | `false` | 是否启用 requests 自动 HTTP client span;默认关闭,避免 AI trace 被普通 HTTP 请求刷屏 | +| `MONITORING_FASTAPI_EXCLUDED_URLS` | (空) | FastAPI 自动埋点排除 URL,逗号分隔正则;例如只看 agent 业务 span 时可设为 `/agent/run` | +| `MONITORING_FASTAPI_EXCLUDE_SPANS` | `receive,send` | 排除 ASGI 内部 `receive/send` span;流式接口建议保持默认值 | +| `OTEL_COLLECTOR_VERSION` | `0.150.0` | 本地 OpenTelemetry Collector Contrib 镜像版本 | +| `PHOENIX_VERSION` | `15` | 本地 Phoenix 镜像版本 | +| `LANGFUSE_VERSION` | `3` 
| 本地 Langfuse Web/Worker 镜像版本 | +| `LANGFUSE_POSTGRES_VERSION` | `15-alpine` | 本地 Langfuse Postgres 镜像版本 | +| `LANGFUSE_CLICKHOUSE_VERSION` | `26.3-alpine` | 本地 Langfuse ClickHouse 镜像版本 | +| `LANGFUSE_MINIO_VERSION` | `RELEASE.2023-12-20T01-00-02Z` | 本地 Langfuse MinIO 镜像版本 | +| `LANGFUSE_REDIS_VERSION` | `alpine` | 本地 Langfuse Redis 镜像版本 | +| `GRAFANA_VERSION` | `12.4` | 本地 Grafana 镜像版本 | +| `GRAFANA_PORT` | `3002` | 本地 Grafana UI 端口 | +| `GRAFANA_ADMIN_USER` | `admin` | 本地 Grafana 管理员用户名 | +| `GRAFANA_ADMIN_PASSWORD` | `nexent-grafana-admin` | 本地 Grafana 管理员密码 | +| `GRAFANA_DEFAULT_LANGUAGE` | `zh-Hans` | 本地 Grafana 默认界面语言 | +| `TEMPO_VERSION` | `2.10.5` | 本地 Tempo 镜像版本,避免浮动 tag 带来的配置兼容性漂移 | +| `TEMPO_PORT` | `3200` | 本地 Tempo HTTP API 端口 | +| `SKYWALKING_VERSION` | `10.4.0` | 本地 SkyWalking OAP/UI 镜像版本 | +| `SKYWALKING_BANYANDB_VERSION` | `0.9.0` | 本地 SkyWalking BanyanDB 镜像版本 | +| `SKYWALKING_UI_PORT` | `8080` | 本地 SkyWalking UI 端口 | +| `SKYWALKING_OAP_GRPC_PORT` | `11800` | 本地 SkyWalking OAP gRPC API 端口 | +| `SKYWALKING_OAP_HTTP_PORT` | `12800` | 本地 SkyWalking OAP HTTP API 端口 | + +## 代码集成 + +### 端点监控 -# 检查 pyproject.toml 中的 performance 配置 -cat backend/pyproject.toml | grep -A 20 "performance" +```python +from utils.monitoring import monitoring_manager + +@monitoring_manager.monitor_endpoint("my_service.my_function") +async def my_api_function(): + return {"status": "ok"} ``` -### 服务名显示为 unknown_service? 
-```bash -# 检查环境变量配置 -echo "SERVICE_NAME: $SERVICE_NAME" +### LLM 调用监控 -# 重启监控服务以应用新配置 -./docker/start-monitoring.sh +```python +@monitoring_manager.monitor_llm_call("gpt-4", "chat_completion") +def call_llm(messages): + return llm_response ``` -## 🧹 数据管理 +### Agent 步骤追踪 -### 清理 Jaeger 追踪数据 -```bash -# 方法1: 重启 Jaeger 容器(最简单) -docker-compose -f docker/docker-compose-monitoring.yml restart nexent-jaeger +```python +with monitoring_manager.trace_agent_step("web_search", "research_agent", "tool_call") as span: + result = execute_tool() + monitoring_manager.set_tool_output(result) +``` -# 方法2: 完全重建 Jaeger 容器和数据 -docker-compose -f docker/docker-compose-monitoring.yml stop nexent-jaeger -docker-compose -f docker/docker-compose-monitoring.yml rm -f nexent-jaeger -docker-compose -f docker/docker-compose-monitoring.yml up -d nexent-jaeger +### 工具调用追踪 -# 方法3: 清理所有监控数据(重建所有容器) -docker-compose -f docker/docker-compose-monitoring.yml down -docker-compose -f docker/docker-compose-monitoring.yml up -d +```python +with monitoring_manager.trace_tool_call("web_search", "agent_name", {"query": "test"}) as span: + results = search_web("test") + monitoring_manager.set_tool_output({"results": results}) ``` -### 清理 Prometheus 指标数据 -```bash -# 重启 Prometheus 容器 -docker-compose -f docker/docker-compose-monitoring.yml restart nexent-prometheus +### Phoenix 自定义层级埋点 + +如果希望 Phoenix 展示 `agent -> chain -> llm/tool` 的层级结构,使用 OpenInference span kind 封装方法: + +```python +from nexent.monitor import get_monitoring_manager -# 完全清理 Prometheus 数据 -docker-compose -f docker/docker-compose-monitoring.yml stop nexent-prometheus -docker volume rm docker_prometheus_data 2>/dev/null || true -docker-compose -f docker/docker-compose-monitoring.yml up -d nexent-prometheus +monitoring_manager = get_monitoring_manager() + +with monitoring_manager.trace_agent( + "TestAgent.run", + input_value={"query": "你好"}, + metadata={"agent_id": 1, "tenant_id": "tenant_id"}, + tags=["nexent", "agent", "agent_id:1"], + 
session_id=1001, + user_id="user_id", +): + with monitoring_manager.trace_chain("Step 0"): + with monitoring_manager.trace_chain("Step 1"): + with monitoring_manager.trace_llm_request("OpenAIModel.generate", "gpt-4"): + result = call_llm() + + with monitoring_manager.trace_tool_call("FinalAnswerTool", "TestAgent", {"query": "你好"}): + monitoring_manager.set_tool_output({"answer": result}) + + monitoring_manager.set_openinference_output({"answer": result}) ``` -### 清理 Grafana 配置 -```bash -# 重置 Grafana 配置和仪表板 -docker-compose -f docker/docker-compose-monitoring.yml stop nexent-grafana -docker volume rm docker_grafana_data 2>/dev/null || true -docker-compose -f docker/docker-compose-monitoring.yml up -d nexent-grafana +Phoenix 左侧的 `agent`、`chain`、`llm`、`tool` 标签来自 `openinference.span.kind`。span 必须通过嵌套 `with` 创建,Phoenix 才会显示成树形结构。 + +同一套方法也会写入 Langfuse 识别的 OTel 属性: + +| Nexent 方法 | Phoenix 属性 | Langfuse observation type | +|-------------|--------------|---------------------------| +| `trace_agent` | `openinference.span.kind=AGENT` | `langfuse.observation.type=agent` | +| `trace_chain` | `openinference.span.kind=CHAIN` | `langfuse.observation.type=chain` | +| `trace_llm_request` | `openinference.span.kind=LLM` | `langfuse.observation.type=generation` | +| `trace_tool_call` | `openinference.span.kind=TOOL` | `langfuse.observation.type=tool` | +| `trace_retriever` | `openinference.span.kind=RETRIEVER` | `langfuse.observation.type=retriever` | + +`session_id`、`user_id`、`tags` 和 `metadata` 会同步写入 `langfuse.session.id`、`langfuse.user.id`、`langfuse.trace.tags`、`langfuse.trace.metadata.*`,可在 Langfuse 中按会话、用户和业务字段过滤。`input_value`、`output_value` 会同步写入 `langfuse.observation.input` 和 `langfuse.observation.output`。 + +## OpenInference 语义属性 + +系统使用 OpenInference 语义约定,专为 AI 可观测性设计: + +### LLM 属性 + +| 属性 | 说明 | +|------|------| +| `llm.model_name` | 模型标识(如 `gpt-4`) | +| `llm.operation.name` | 操作类型(如 `chat_completion`) | +| `llm.token_count.prompt` | 输入 Token 数 | +| 
`llm.token_count.completion` | 输出 Token 数 | +| `llm.invocation_parameters` | 模型参数(JSON) | +| `llm.time_to_first_token` | TTFT(秒) | + +### Agent 属性 + +| 属性 | 说明 | +|------|------| +| `agent.name` | Agent 标识 | +| `agent.step.name` | 步骤名称(如 `web_search`) | +| `agent.step.type` | 步骤类型:`tool_call`、`reasoning`、`action_selection` | +| `agent.tool.name` | 工具名称 | +| `agent.tool.input` | 工具输入(JSON) | +| `agent.tool.output` | 工具输出(JSON) | + +## 指标 + +| 指标 | 说明 | +|------|------| +| `llm.request.duration` | 请求延迟 | +| `llm.token.generation_rate` | Token 生成速率 | +| `llm.time_to_first_token` | TTFT | +| `llm.token_count.prompt` | 输入 Token | +| `llm.token_count.completion` | 输出 Token | +| `agent.step.count` | Agent 步骤数 | +| `agent.execution.duration` | Agent 执行时间 | +| `agent.error.count` | Agent 错误数 | + +## Collector 配置 + +OpenTelemetry Collector 默认只通过 debug exporter 打印数据,避免没有外部后端时把数据转发回自身。需要通过 Collector 转发到平台时,增加对应 exporter: + +```yaml +exporters: + otlphttp/langsmith: + traces_endpoint: https://api.smith.langchain.com/otel/v1/traces + headers: + x-api-key: YOUR_LANGSMITH_API_KEY + Langsmith-Project: nexent + +service: + pipelines: + traces: + exporters: [otlphttp/langsmith, debug] +``` + +Phoenix、Langfuse 和 LangSmith 形态分别使用独立 Collector 配置: + +- `docker/monitoring/otel-collector-phoenix-config.yml` +- `docker/monitoring/otel-collector-langfuse-config.yml` +- `docker/monitoring/otel-collector-langsmith-config.yml` + +基础 debug 配置见 `docker/monitoring/otel-collector-config.yml`。 + +## 优雅降级 + +未安装 OpenTelemetry 依赖时,监控自动禁用: + +```bash +pip install nexent # 基础包 - 无监控 +pip install nexent[performance] # 包含 OTLP 支持 ``` -## 📈 典型问题分析 +禁用时所有监控方法均正常工作 - 装饰器透传,上下文管理器返回 None。 -### Token 生成速度慢 (< 5 tokens/s) -1. **分析**: Grafana → Token Generation Rate 面板 -2. **解决**: 检查模型服务负载、优化输入 prompt 长度 +## 故障排除 -### 请求响应慢 (> 10s) -1. **分析**: Jaeger → 查看完整链路追踪 -2. **解决**: 定位瓶颈环节(数据库/LLM/网络) +### 数据未显示 -### 错误率突增 (> 10%) -1. **分析**: Prometheus → llm_error_count 指标 -2. **解决**: 检查模型服务可用性、验证 API 密钥 +1. 
检查 `.env` 中 `ENABLE_TELEMETRY=true` +2. 验证 OTLP 端点可访问 +3. 检查认证头配置正确 -## 🎉 开始使用 +### 连接错误 -设置完成后你可以: +1. 测试端点:`curl -v $OTEL_EXPORTER_OTLP_ENDPOINT/v1/traces` +2. 确认协议匹配端点(`http` vs `grpc`) +3. 查看 Collector 日志:`docker logs nexent-otel-collector` -1. 📊 在 Grafana 中查看 **LLM Performance Dashboard** -2. 🔍 在 Jaeger 中追踪每个请求的完整链路 -3. 📈 分析 Token 生成速度和性能瓶颈 -4. 🚨 设置性能告警和阈值 +### 属性错误 -享受高效的 LLM 性能监控! 🚀 +1. 在平台 UI 中验证 OpenInference 属性 +2. 检查 Span 属性命名:使用 `llm.model_name` 而非 `model_name` +3. 查看平台特定属性要求 diff --git a/doc/docs/zh/sdk/opentelemetry-design.md b/doc/docs/zh/sdk/opentelemetry-design.md new file mode 100644 index 000000000..4285dce18 --- /dev/null +++ b/doc/docs/zh/sdk/opentelemetry-design.md @@ -0,0 +1,643 @@ +# Nexent OpenTelemetry 可观测性设计 + +生成日期:2026-05-06 +基准分支:当前 OpenTelemetry 功能分支 + +## 可观测性基础 + +可观测性关注的是系统在运行过程中是否能够被理解和定位问题。相比只回答“系统是否还活着”的传统监控,可观测性更强调从运行时信号反推出系统内部状态,帮助研发和运维回答以下问题: + +- 当前请求为什么慢? +- Agent 在哪一步失败? +- 大模型调用耗时、首 token 时间和 token 速率是否异常? +- 某个用户、会话或 Agent 的完整执行链路是什么? +- 问题发生时有哪些输入、输出、工具调用和错误上下文? 
+ +业界通常把可观测性拆成三大支柱:Metrics、Logs、Traces。三者解决的问题不同,需要组合使用。 + +| 支柱 | 核心问题 | 典型数据 | 适合场景 | 在 Nexent 中的作用 | +|------|----------|----------|----------|--------------------| +| Metrics | “整体是否异常?” | 计数器、直方图、速率、分位数 | 看趋势、告警、容量评估、SLO/SLA | 统计 LLM 请求耗时、TTFT、token 速率、错误数、Agent step/tool 调用数 | +| Logs | “当时发生了什么?” | 按时间顺序输出的文本或结构化事件 | 查看异常上下文、排查单点错误、审计关键行为 | 保留运行日志,并通过 span event/attribute 记录关键 Agent、LLM、Tool 事件 | +| Traces | “一次请求经历了哪些步骤?” | trace、span、span event、上下游关系 | 分布式调用链、流式 Agent 执行链路、跨服务耗时定位 | 串联 HTTP 接口、Agent run、LLM generate、Tool call 和最终答案 | + +三大支柱之间不是替代关系。Metrics 适合发现问题,例如某段时间 LLM 错误数上升;Traces 适合定位问题,例如找到某次 `agent.run` 卡在某个 tool;Logs 适合补充细节,例如错误堆栈、原始提示词摘要或工具返回内容。对于 LLM Agent 场景,单纯的 HTTP 接口指标不足以解释 Agent 行为,因此必须把 Agent、LLM、Tool 等业务语义写入 trace 层级中。 + +## 为什么使用 OpenTelemetry + +OpenTelemetry 是当前主流的可观测性开放标准,提供统一的 API、SDK、语义约定和 OTLP 传输协议。Nexent 选择 OpenTelemetry 作为监控主干,主要基于以下原因: + +- 标准化:用统一的 span、event、metric 表达 HTTP、Agent、LLM、Tool 等运行时信号,减少平台私有模型对业务代码的侵入。 +- 可移植:同一套埋点可以通过 OTLP 上报到 Phoenix、Langfuse、LangSmith、Grafana Tempo、SkyWalking 或其他兼容后端,切换平台主要调整配置和 Collector pipeline。 +- 可扩展:OpenTelemetry Collector 可以在不改业务代码的情况下完成转发、过滤、批处理、认证 header 注入和多后端分发。 +- 生态成熟:FastAPI、requests 等基础组件已有自动埋点能力,Nexent 只需要补充 Agent/LLM/Tool 的业务 span。 +- 避免锁定:监控平台 SDK 可以作为增强层,但核心链路不依赖某一家平台 SDK,避免平台迁移或本地化部署时重写埋点。 +- 适合 Agent 场景:trace 的父子 span 结构天然适合表达 `agent.run -> chain step -> LLM generate/tool call -> final answer` 这类多步骤执行过程。 + +因此,Nexent 的实现原则是:业务代码只产生 OpenTelemetry 标准信号和少量平台兼容属性,平台差异收敛在配置、Collector 和展示层。 + +## OTel 规范概要 + +本文中的 OTel 规范通常指 OpenTelemetry Specification 及其配套规范。它不是某个 SDK,也不是某个监控平台,而是一套兼容性契约:规定可观测性数据应该如何生成、命名、传播、处理和导出。各语言 SDK、Collector、后端平台和自动埋点库按这套契约实现,才能保证跨语言、跨框架、跨后端互通。 + +一句话概括:OTel 规范是 OpenTelemetry 为 traces、metrics、logs 等可观测性数据制定的一套标准,保证不同语言、框架、Collector 和后端之间能够互通。 + +OpenTelemetry 规范按 signal 维度独立演进。Tracing、Metrics、Logs、Baggage 是当前主要 signal;Profiles 正在发展中,Events 通常作为 Logs 的特定事件形态讨论。每个成熟 signal 通常由 API、SDK、OTLP、Collector 和 instrumentation/contrib 
生态共同组成,语义约定用于保证不同语言和组件在观测同类操作时输出一致的数据。 + +从实现视角看,OTel 规范可以拆成七个常用层面: + +| 规范领域 | 核心概念 | 作用 | +|----------|----------|------| +| Signals | Traces、Metrics、Logs、Baggage、Profiles | 定义可观测性数据类型。Nexent 当前重点使用 Traces 和 Metrics,Logs 通过应用日志与 span event 补充上下文;Profiles 暂不接入 | +| API | Tracer、Meter、Logger、Context、Propagator | 面向业务代码和 instrumentation 的稳定接口,业务埋点只依赖 API,不直接绑定具体 exporter | +| SDK | TracerProvider、MeterProvider、SpanProcessor、MetricReader、Sampler、Resource | 提供采样、批处理、资源描述、导出等运行时能力 | +| Data Model | Span、Metric、LogRecord、Resource、Instrumentation Scope | 定义 telemetry 数据结构,确保不同语言和平台对数据有一致理解 | +| Context Propagation | Context、SpanContext、Baggage、Propagator | 在服务、线程、异步任务和下游请求之间传递 trace 上下文,保证调用链可以串起来 | +| OTLP | OTLP HTTP、OTLP gRPC、protobuf payload | OpenTelemetry 原生传输协议,负责把 traces、metrics、logs 从应用或 Collector 发到后端 | +| Semantic Conventions | 标准属性名、span name、metric name、单位和枚举值 | 统一 HTTP、数据库、RPC、Messaging 等通用语义;AI 场景中 Nexent 额外兼容 OpenInference 和 Langfuse 属性 | + +### Signals + +OTel 把可观测性数据抽象为多个 signal。每个 signal 有独立 API 和数据模型,但共享 Resource、Context 和传播机制。 + +- Traces:由一组具有父子关系的 span 构成,用于描述一次逻辑操作的完整路径。Nexent 用 trace 表达 `agent.run` 到 LLM、Tool、Final Answer 的执行链路。 +- Metrics:由 counter、histogram、gauge 等 instrument 产生,用于描述聚合后的趋势和分布。Nexent 用 metrics 统计 LLM 延迟、TTFT、token 速率和错误数。 +- Logs:以 LogRecord 或传统日志集成的方式表达离散事件。Nexent 当前不把 Logs signal 作为主链路 exporter,但会通过应用日志和 span event 补充错误上下文。 +- Baggage:跨进程传播的键值上下文,适合传递租户、用户、实验分组等需要参与过滤和关联的业务标签。使用时需要控制基数和敏感信息。 +- Profiles:用于记录代码级资源消耗画像,当前在 OpenTelemetry 体系中仍处于发展阶段。Nexent 暂不采集 profiles,避免引入额外运行时开销。 + +Nexent 的当前落地策略是:Traces 优先,因为 Agent 运行链路需要父子 span 表达;Metrics 保留,用于趋势、告警和 dashboard;Logs 暂以应用日志和 span event 形态承载,后续如需统一日志采集,可以通过 Collector 增加 Logs pipeline。 + +### API 与 SDK + +OTel 区分 API 和 SDK: + +- API 是埋点代码依赖的稳定接口,例如 `trace.get_tracer()`、`start_as_current_span()`、`meter.create_counter()`。 +- SDK 是运行时实现,负责创建 provider、处理 span/metric、采样、批量导出和错误处理。 + +这种分层让库代码可以只依赖 API,而应用在启动时统一配置 SDK。Nexent 的 SDK 埋点遵循这个模型:业务函数只创建 span、event、metric;是否启用、导出到哪里、使用 HTTP 
还是 gRPC,全部由 `MonitoringConfig` 和环境变量决定。 + +这种分层也决定了 Nexent 的边界: + +- 业务代码不直接创建 exporter,也不直接引用 Phoenix、Langfuse、Tempo 等平台客户端。 +- 初始化层负责创建 SDK provider、resource、processor、reader 和 exporter。 +- 平台差异通过 provider profile、OTLP endpoint、header 和 Collector pipeline 表达。 + +### Resource 与 Instrumentation Scope + +Resource 描述 telemetry 来源实体,例如服务名、版本、实例、部署环境、项目名。Nexent 当前写入: + +- `service.name`:默认 `nexent-backend` +- `service.version`:当前固定为 `1.0.0` +- `service.instance.id`:当前固定为 `nexent-instance-1` +- `telemetry.provider`:当前 provider profile,例如 `otlp`、`phoenix`、`langfuse`、`grafana`、`skywalking` +- `project.name`:当配置 `MONITORING_PROJECT_NAME` 时写入 + +Instrumentation Scope 描述产生 telemetry 的 instrumentation 库或模块。后续如果需要区分 Nexent SDK、FastAPI 自动埋点、第三方库埋点,可以在 scope 层面辅助过滤。 + +### Context Propagation + +Trace 的核心是上下文传播。一个请求从 HTTP 入口进入后,后续 Agent step、LLM 调用、Tool 调用必须处在同一个 trace 上下文中,监控页面才能显示正确的父子层级。 + +OTel 的 Context 是执行范围内的不可变上下文容器,用于承载当前 span、baggage 等跨切面数据。Propagator 负责把这些上下文编码到请求边界,例如 HTTP header,再由下游服务还原。对 Nexent 来说,同进程内的 async、generator、线程和工具调用上下文保持比跨服务 header 传播更关键。 + +Nexent 的关键处理包括: + +- 在 `monitor_endpoint` 中覆盖 async coroutine 和 async generator,保证流式响应真正被消费时 span 仍然处于活动状态。 +- 通过 context variable 保存 tenant、user、agent、conversation 等请求级元数据,避免把监控参数侵入业务函数签名。 +- 在 Agent、LLM、Tool span 上写入 OpenInference、Langfuse 和 Nexent 自定义属性,保证不同平台都能基于同一 trace 做展示和过滤。 + +### Semantic Conventions + +Semantic Conventions 规定常见遥测字段的命名和含义,例如 HTTP 方法、URL、状态码、错误类型、metric 单位等。使用语义约定的价值是让不同服务、语言和平台对同一类数据有一致理解。 + +Nexent 采用三层语义: + +- OTel 通用语义:用于 service、resource、HTTP 自动埋点、metric instrument 等基础字段。 +- OpenInference 语义:用于 AI span 类型,例如 `openinference.span.kind=AGENT|CHAIN|LLM|TOOL|RETRIEVER`,适配 Phoenix 等 AI observability 平台。 +- Langfuse OTel 语义:用于 `langfuse.observation.type`、`langfuse.session.id`、`langfuse.user.id`、`langfuse.observation.input/output` 等展示和过滤字段。 + +当三者存在差异时,Nexent 不把业务 span 绑定到某个平台,而是在同一个 span 上补充多套兼容属性。 + +### OTLP 与 Collector Pipeline + +OTLP 是 OpenTelemetry 原生传输协议,支持 HTTP 和 gRPC。Nexent 
后端只需要把数据发到 OTLP endpoint,后端平台差异交给 Collector 处理。 + +Collector pipeline 通常由三部分组成: + +- Receiver:接收应用上报的 OTLP traces/metrics/logs。 +- Processor:执行批处理、内存限制、资源属性补充、过滤、采样等处理。 +- Exporter:把数据转发到 Phoenix、Langfuse、Tempo 或其他 OTLP 兼容后端。 + +OTLP 是 request/response 风格协议,客户端发送 export 请求,服务端返回成功、部分成功或失败响应。Nexent 当前支持: + +- OTLP HTTP:默认协议,便于通过网关、云平台和本地 Collector 接入。 +- OTLP gRPC:适合内部网络或偏高吞吐场景。 +- base endpoint 与 signal endpoint:支持配置 base endpoint,再由 SDK 推导 `/v1/traces` 和 `/v1/metrics`,也支持直接配置 signal-specific endpoint,避免路径重复拼接。 + +这种架构的好处是:应用侧配置保持稳定,平台迁移和本地化部署主要改 Collector 配置。例如 `grafana` 形态下 traces 转发到 Tempo;`phoenix` 形态下 traces 转发到 Phoenix;`otlp` 形态下先通过 debug exporter 验证数据是否产生。 + +## 设计目标 + +Nexent 的监控能力以 OpenTelemetry 为主干,SDK 和后端只负责生成标准 span、event、metric,并通过 OTLP 导出。Phoenix、Langfuse、LangSmith、Grafana Tempo、Apache SkyWalking 和标准 OTLP 后端作为可配置 exporter 接入,业务代码不绑定单一平台。 + +核心目标: + +- Agent 流式运行期间保持 trace 上下文,覆盖 API、服务准备、Agent 异步 generator、Agent 线程、LLM 流式输出、Python 解释器执行、真实工具调用和最终答案。 +- 通过 OpenInference 属性适配 Phoenix,通过 `langfuse.*` 属性适配 Langfuse,同一套业务埋点可同时服务多个监控平台。 +- 支持 `otlp`、`phoenix`、`langfuse`、`grafana`、`skywalking` provider profile。 +- 通过环境变量统一控制后端导出配置和本地部署形态,`MONITORING_PROVIDER` 是唯一 provider 入口。 +- 支持 base endpoint 和 signal-specific endpoint,避免 `/v1/traces`、`/v1/metrics` 路径重复拼接。 +- FastAPI/requests 自动埋点可配置,默认压制流式接口中的 ASGI `receive/send` 噪声。 + +## 技术栈 + +| 分类 | 实现 | +|------|------| +| 标准框架 | OpenTelemetry API/SDK | +| 导出协议 | OTLP HTTP、OTLP gRPC | +| Trace exporter | `opentelemetry-exporter-otlp` HTTP/gRPC trace exporter | +| Metric exporter | `opentelemetry-exporter-otlp` HTTP/gRPC metric exporter | +| 自动埋点 | FastAPI instrumentation、requests instrumentation;requests 默认关闭 | +| AI 语义 | OpenInference 属性、Langfuse OTel 属性、Nexent 自定义业务属性 | +| Agent 框架 | SmolAgents `CodeAgent` 扩展、Nexent `CoreAgent`、`NexentAgent` | +| 配置 | 环境变量 | +| Collector | `otel/opentelemetry-collector-contrib`,支持 debug、Phoenix、Langfuse、LangSmith、Grafana/Tempo、SkyWalking 部署形态 | + +## 总体架构 + +```mermaid +flowchart 
LR + Backend[Nexent Backend / SDK] --> OTel[OpenTelemetry TracerProvider / MeterProvider] + OTel --> Exporter[OTLP Trace / Metric Exporter] + Exporter --> Collector[OpenTelemetry Collector] + Collector --> Phoenix[Arize Phoenix] + Collector --> Langfuse[Langfuse] + Collector --> Tempo[Grafana Tempo] + Collector --> SkyWalking[Apache SkyWalking] + Collector --> Other[OTLP Backend] + + Backend --> FastAPI[FastAPI Auto Instrumentation] + Backend --> Manual[Manual AI Spans] + Manual --> OI[OpenInference Attributes] + Manual --> LF[Langfuse Attributes] +``` + +## 配置模型 + +### 环境变量 + +| 变量 | 默认值 | 说明 | +|------|--------|------| +| `ENABLE_TELEMETRY` | `false` | 监控总开关 | +| `MONITORING_PROVIDER` | `otlp` | 监控 provider 和部署形态:`otlp`、`phoenix`、`langfuse`、`langsmith`、`grafana`、`skywalking` | +| `MONITORING_PROJECT_NAME` | `nexent` | 平台项目名 | +| `OTEL_SERVICE_NAME` | `nexent-backend` | OpenTelemetry service name | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | `http://localhost:4318` | OTLP base endpoint | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | 空 | 可选 trace 专用 endpoint | +| `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` | 空 | 可选 metric 专用 endpoint | +| `OTEL_EXPORTER_OTLP_PROTOCOL` | `http` | `http` 或 `grpc` | +| `OTEL_EXPORTER_OTLP_HEADERS` | 空 | 通用 `key=value,key2=value2` header | +| `OTEL_EXPORTER_OTLP_AUTHORIZATION` | 空 | `Authorization` header,常用于 Phoenix bearer auth 和 Langfuse Basic Auth | +| `OTEL_EXPORTER_OTLP_X_API_KEY` | 空 | `x-api-key` header,用于兼容需要该 header 的平台 | +| `OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION` | 空 | Langfuse 摄取版本,例如 `4` | +| `LANGSMITH_API_KEY` | 空 | LangSmith API Key,后端直连时映射为 `x-api-key`,Collector 转发时注入 exporter header | +| `LANGSMITH_PROJECT` | 空 | 可选 LangSmith project header | +| `LANGSMITH_OTLP_TRACES_ENDPOINT` | `https://api.smith.langchain.com/otel/v1/traces` | Collector 转发到在线 LangSmith 的 trace endpoint | +| `OTEL_EXPORTER_OTLP_METRICS_ENABLED` | `true` | 是否导出 metric | +| `MONITORING_INSTRUMENT_FASTAPI` | `true` | 是否启用 FastAPI 自动 HTTP server span | +| 
`MONITORING_INSTRUMENT_REQUESTS` | `false` | 是否启用 requests 自动 HTTP client span | +| `MONITORING_FASTAPI_EXCLUDED_URLS` | 空 | FastAPI 自动埋点排除 URL,逗号分隔正则 | +| `MONITORING_FASTAPI_EXCLUDE_SPANS` | `receive,send` | 排除 ASGI 内部 `receive/send` span,流式接口建议保持默认 | +| `OTEL_COLLECTOR_VERSION` | `0.150.0` | 本地 OpenTelemetry Collector Contrib 镜像版本 | +| `PHOENIX_VERSION` | `15` | 本地 Phoenix 镜像版本 | +| `LANGFUSE_VERSION` | `3` | 本地 Langfuse Web/Worker 镜像版本 | +| `LANGFUSE_POSTGRES_VERSION` | `15-alpine` | 本地 Langfuse Postgres 镜像版本 | +| `LANGFUSE_CLICKHOUSE_VERSION` | `26.3-alpine` | 本地 Langfuse ClickHouse 镜像版本 | +| `LANGFUSE_MINIO_VERSION` | `RELEASE.2023-12-20T01-00-02Z` | 本地 Langfuse MinIO 镜像版本 | +| `LANGFUSE_REDIS_VERSION` | `alpine` | 本地 Langfuse Redis 镜像版本 | +| `GRAFANA_VERSION` | `12.4` | 本地 Grafana 镜像版本 | +| `GRAFANA_PORT` | `3002` | 本地 Grafana UI 端口 | +| `GRAFANA_DEFAULT_LANGUAGE` | `zh-Hans` | 本地 Grafana 默认界面语言 | +| `TEMPO_VERSION` | `2.10.5` | 本地 Tempo 镜像版本,避免浮动 tag 带来的配置兼容性漂移 | +| `TEMPO_PORT` | `3200` | 本地 Tempo HTTP API 端口 | +| `SKYWALKING_VERSION` | `10.4.0` | 本地 SkyWalking OAP/UI 镜像版本 | +| `SKYWALKING_BANYANDB_VERSION` | `0.9.0` | 本地 SkyWalking BanyanDB 镜像版本 | +| `SKYWALKING_UI_PORT` | `8080` | 本地 SkyWalking UI 端口 | +| `SKYWALKING_OAP_GRPC_PORT` | `11800` | 本地 SkyWalking OAP gRPC API 端口 | +| `SKYWALKING_OAP_HTTP_PORT` | `12800` | 本地 SkyWalking OAP HTTP API 端口 | + +## Endpoint 规则 + +HTTP exporter 支持两种输入: + +- base endpoint:`https://cloud.langfuse.com/api/public/otel` +- signal endpoint:`https://cloud.langfuse.com/api/public/otel/v1/traces` + +SDK 会按 signal 派生最终地址: + +| 输入 | Trace endpoint | Metric endpoint | +|------|----------------|-----------------| +| `https://host/api/public/otel` | `https://host/api/public/otel/v1/traces` | `https://host/api/public/otel/v1/metrics` | +| `https://host/api/public/otel/v1/traces` | 原值 | `https://host/api/public/otel/v1/metrics` | +| `https://host/api/public/otel/v1/metrics` | `https://host/api/public/otel/v1/traces` | 原值 | + +## 
平台接入 + +### 纯 OTLP / 自建 Collector + +```bash +MONITORING_PROVIDER=otlp +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http +``` + +前端顶栏监控入口只根据后端 `MONITORING_PROVIDER` 映射 UI 端口和路径,最终跳转地址由前端使用当前页面 URL 的 hostname 组装,避免固定写死 `localhost`: + +- `phoenix` -> `${currentHostname}:${PHOENIX_PORT:-6006}/` +- `langfuse` -> `${currentHostname}:${LANGFUSE_PORT:-3001}/project/nexent` +- `grafana` -> `${currentHostname}:${GRAFANA_PORT:-3002}/d/nexent-llm-agent/nexent-agent-trace-monitoring?orgId=1` +- `skywalking` -> `${currentHostname}:${SKYWALKING_UI_PORT:-8080}/` +- `otlp` 默认不显示顶栏监控入口 + +因此本地 Grafana 形态需要在后端 `.env` 中设置: + +```bash +MONITORING_PROVIDER=grafana +``` + +### Phoenix + +Phoenix 通过 OpenInference 属性识别 AI span 类型,核心字段是 `openinference.span.kind`。 + +```bash +MONITORING_PROVIDER=phoenix +OTEL_EXPORTER_OTLP_ENDPOINT=https://app.phoenix.arize.com/s/YOUR_SPACE +OTEL_EXPORTER_OTLP_AUTHORIZATION="Bearer YOUR_PHOENIX_API_KEY" +OTEL_EXPORTER_OTLP_METRICS_ENABLED=false +MONITORING_PROJECT_NAME=nexent-production +``` + +### Langfuse + +Langfuse 的 OTLP HTTP base endpoint 是 `/api/public/otel`,使用 Basic Auth。实时摄取建议带 `x-langfuse-ingestion-version=4`。 + +```bash +MONITORING_PROVIDER=langfuse +OTEL_EXPORTER_OTLP_ENDPOINT=https://cloud.langfuse.com/api/public/otel +OTEL_EXPORTER_OTLP_AUTHORIZATION="Basic BASE64_PUBLIC_SECRET" +OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION=4 +OTEL_EXPORTER_OTLP_METRICS_ENABLED=false +``` + +当前实现会同时写入 `langfuse.observation.type`、`langfuse.session.id`、`langfuse.user.id`、`langfuse.trace.tags`、`langfuse.trace.metadata.*`、`langfuse.observation.input`、`langfuse.observation.output` 等属性,以便 Langfuse 正确展示 generation/tool/agent 并支持过滤聚合。 + +### LangSmith + +LangSmith 的在线 OTLP trace endpoint 为 `https://api.smith.langchain.com/otel/v1/traces`,使用 `x-api-key` header 认证,可通过 `Langsmith-Project` header 指定项目。推荐仍让 Nexent 后端上报到本地 Collector,由 Collector 注入 LangSmith API Key 并转发 traces: + +```bash +MONITORING_PROVIDER=langsmith 
+OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http +OTEL_EXPORTER_OTLP_METRICS_ENABLED=false +``` + +Collector 侧配置 `LANGSMITH_API_KEY`、`LANGSMITH_PROJECT` 和 `LANGSMITH_OTLP_TRACES_ENDPOINT`。LangSmith 当前形态只转发 traces,metrics 进入 Collector debug pipeline。 + +### SkyWalking + +SkyWalking 通过 OAP 的 OpenTelemetry receiver 接收 OTLP traces。推荐 Nexent 后端仍然只上报到本地 Collector,由 Collector 通过 OTLP gRPC 转发到 SkyWalking OAP: + +```bash +MONITORING_PROVIDER=skywalking +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http +``` + +如果直接对接 SkyWalking OAP,可使用 gRPC endpoint: + +```bash +MONITORING_PROVIDER=skywalking +OTEL_EXPORTER_OTLP_ENDPOINT=http://skywalking-oap:11800 +OTEL_EXPORTER_OTLP_PROTOCOL=grpc +OTEL_EXPORTER_OTLP_METRICS_ENABLED=false +``` + +SkyWalking 当前本地形态只把 traces 转发到 OAP;metrics 进入 Collector debug pipeline。原因是 SkyWalking 的 OTel metrics 需要按 MAL rule 建模和映射,Nexent 的 LLM/Agent 自定义 metrics 不应在没有规则的情况下直接假设可以被 UI 正确聚合展示。 + +## 本地化部署设计 + +本地化部署通过 `docker/start-monitoring.sh` 选择形态。所有形态都保留 OpenTelemetry Collector 作为入口,Nexent 后端统一上报到 `http://otel-collector:4318` 或宿主机的 `http://localhost:4318`,平台差异只体现在 Collector exporter 和本地服务组合上。 + +| 形态 | Collector 配置 | 本地服务 | 数据去向 | 说明 | +|------|----------------|----------|----------|------| +| `otlp` | `otel-collector-config.yml` | Collector | debug exporter | 最小形态,用于验证 span/metric 是否产生,或手动改配置转发到云端平台;`collector` 仅作为启动脚本兼容别名 | +| `phoenix` | `otel-collector-phoenix-config.yml` | Collector + Phoenix | `http://phoenix:6006/v1/traces` | Phoenix 容器同时提供 UI 和 OTLP HTTP/gRPC trace collector,适合本地 trace debug | +| `langfuse` | `otel-collector-langfuse-config.yml` | Collector + Langfuse Web/Worker + Postgres + ClickHouse + MinIO + Redis | `http://langfuse-web:3000/api/public/otel/v1/traces` | Langfuse v3 依赖多组件,适合完整 LLMOps 能力验证 | +| `langsmith` | `otel-collector-langsmith-config.yml` | Collector | `https://api.smith.langchain.com/otel/v1/traces` | 在线 LangSmith trace 分析;API Key 只配置在 
Collector 环境 | +| `grafana` | `otel-collector-grafana-config.yml` | Collector + Grafana + Tempo | traces 转发到 `tempo:4317`,metrics 只进入 Collector debug pipeline | Grafana + Tempo trace 查询 | +| `skywalking` | `otel-collector-skywalking-config.yml` | Collector + SkyWalking OAP + SkyWalking UI + BanyanDB | traces 转发到 `skywalking-oap:11800`,metrics 只进入 Collector debug pipeline | SkyWalking APM、服务拓扑和 OTLP trace 查询 | + +启动命令: + +```bash +cd docker +./start-monitoring.sh --stack otlp +./start-monitoring.sh --stack phoenix +./start-monitoring.sh --stack langfuse +./start-monitoring.sh --stack langsmith +./start-monitoring.sh --stack grafana +./start-monitoring.sh --stack skywalking +``` + +部署脚本职责: + +- 创建或复用 `nexent-network`。 +- 首次启动时从 `monitoring.env.example` 生成 `monitoring.env`。 +- 根据 `MONITORING_PROVIDER` 或 `--stack` 选择 Docker Compose profile。 +- 根据部署形态设置 `OTEL_COLLECTOR_CONFIG_FILE`。 +- Langfuse 本地形态下,如果 `LANGFUSE_OTLP_AUTH_HEADER` 未显式配置,则使用初始化项目的 public/secret key 生成 Basic Auth header。 +- LangSmith 在线形态要求 `LANGSMITH_API_KEY`,启动时会校验该变量,避免 Collector 静默丢弃鉴权失败的 trace。 + +### Phoenix 本地形态 + +Phoenix 使用 `arizephoenix/phoenix` 镜像,默认暴露: + +| 端口 | 用途 | +|------|------| +| `6006` | Phoenix UI 和 OTLP HTTP `/v1/traces` | +| `4319` | 映射到容器内 gRPC OTLP `4317`,避免与 Collector gRPC 端口冲突 | + +Compose 中设置 `PHOENIX_WORKING_DIR=/mnt/data` 并挂载 `phoenix-data` volume,确保本地重启后 trace 数据不丢失。Collector 使用 `otlphttp/phoenix` exporter 的 base endpoint `http://phoenix:6006`,由 Collector 按 OTLP HTTP 规则追加 `/v1/traces`。 + +### Langfuse 本地形态 + +Langfuse v3 本地形态按自托管架构拆分为应用容器和存储组件: + +| 组件 | 用途 | +|------|------| +| `langfuse-web` | UI、API、OTLP HTTP ingestion | +| `langfuse-worker` | 异步消费和处理 trace 事件 | +| `langfuse-postgres` | 事务型元数据 | +| `langfuse-clickhouse` | trace/observation/score 分析数据 | +| `langfuse-minio` | S3 兼容对象存储,保存事件和大对象 | +| `langfuse-redis` | 队列和缓存 | + +初始化参数通过 `LANGFUSE_INIT_*` 配置,默认创建 `nexent-local` 项目和本地 API Key。Collector 使用 `otlphttp/langfuse` exporter,endpoint 为 
`http://langfuse-web:3000/api/public/otel`,并携带: + +```yaml +headers: + Authorization: ${env:LANGFUSE_OTLP_AUTH_HEADER} + x-langfuse-ingestion-version: "4" +``` + +默认密钥仅用于本地验证。生产或共享环境必须替换认证密钥、数据库密码、对象存储密钥和 `LANGFUSE_ENCRYPTION_KEY`,并补充备份、高可用和升级策略。 + +### Grafana 本地形态 + +Grafana 本地形态面向 trace 调试: + +| 组件 | 用途 | +|------|------| +| `grafana` | 展示 Nexent Agent trace dashboard,并预置 Tempo datasource | +| `tempo` | 接收 Collector 转发的 OTLP traces,并提供 Grafana Explore 查询后端 | + +Collector trace pipeline 使用 `otlp/tempo` exporter 转发到 `tempo:4317`。Tempo 启用 `metrics-generator` 的 `local-blocks` processor,用于支持 Grafana trace breakdown 中的 TraceQL metrics 查询。Collector metrics pipeline 保留为 debug exporter,用于兼容后端仍开启 OTLP metrics 的场景,但本地 Grafana 形态不提供独立指标存储和指标 dashboard。 + +### SkyWalking 本地形态 + +SkyWalking 本地形态面向通用 APM 和 trace 查询: + +| 组件 | 用途 | +|------|------| +| `skywalking-ui` | 展示服务、实例、端点、拓扑和 trace 查询 | +| `skywalking-oap` | SkyWalking 后端分析服务,接收 Collector 转发的 OTLP traces | +| `skywalking-banyandb` | SkyWalking 推荐的本地存储组件 | + +Collector trace pipeline 使用 `otlp/skywalking` exporter 转发到 `skywalking-oap:11800`。OAP 启用 `SW_OTEL_RECEIVER=default`、`SW_OTEL_RECEIVER_ENABLED_HANDLERS=otlp-traces`、`SW_RECEIVER_ZIPKIN=default`、`SW_QUERY_ZIPKIN=default`。这些配置用于开启 OTel trace handler,并让 SkyWalking 通过 Zipkin trace 查询链路展示 OTLP trace。 + +默认访问地址: + +- Grafana:`http://localhost:3002` +- Tempo API:`http://localhost:3200` +- Phoenix:`http://localhost:6006` +- Langfuse:`http://localhost:3001` +- SkyWalking UI:`http://localhost:8080` + +## Span 语义映射 + +| Nexent 场景 | Phoenix / OpenInference | Langfuse | +|-------------|-------------------------|----------| +| Agent 入口 | `openinference.span.kind=AGENT` | `langfuse.observation.type=agent` | +| 服务准备、流式生成、线程执行、普通步骤 | `openinference.span.kind=CHAIN` | `langfuse.observation.type=chain` | +| LLM 调用 | `openinference.span.kind=LLM` | `langfuse.observation.type=generation` | +| 工具调用 | `openinference.span.kind=TOOL` | `langfuse.observation.type=tool` | +| 检索类调用 | `openinference.span.kind=RETRIEVER` | `langfuse.observation.type=retriever` | + +上下文属性: + +| 属性 | 说明 | +|------|------| +|
`input.value` / `output.value` | OpenInference 输入输出 | +| `metadata` | OpenInference JSON metadata | +| `session.id` / `user.id` | OpenInference 会话和用户 | +| `tag.tags` | OpenInference tags | +| `langfuse.observation.input` / `langfuse.observation.output` | Langfuse observation 输入输出 | +| `langfuse.session.id` / `langfuse.user.id` | Langfuse 会话和用户 | +| `langfuse.trace.tags` | Langfuse trace tags | +| `langfuse.trace.metadata.*` / `langfuse.observation.metadata.*` | Langfuse 可过滤业务 metadata | + +## 埋点信息 + +| 埋点 | 位置 | 类型 | 内容 | 目的 | +|------|------|------|------|------| +| FastAPI 自动 span | `MonitoringManager.setup_fastapi_app` | HTTP server | route、method、status、duration | API 入口耗时和错误定位 | +| FastAPI `receive/send` 排除 | `fastapi_exclude_spans` | 降噪配置 | 默认 `receive,send` | 避免 SSE 流式接口生成大量 `unknown POST /agent/run http ...` | +| requests 自动 span | `MonitoringConfig.instrument_requests` | HTTP client | 外部请求 URL、method、status | 默认关闭;需要分析外部 HTTP 依赖时开启 | +| `agent.run` | `backend/apps/agent_app.py` | AGENT | `/agent/run` 请求入口 | 作为一次 Agent 运行的顶层业务 trace | +| `agent_service.run_agent_stream` | `backend/services/agent_service.py` | CHAIN | `agent_id`、`conversation_id`、debug、文件数、记忆开关、策略、准备耗时 | 分析 SSE 创建前的准备阶段 | +| `set_openinference_agent_context` | `run_agent_stream` | 当前 span 上下文 | session、user、tenant、agent、metadata、tags | 给 Phoenix/Langfuse 建立 Agent、用户、会话维度 | +| `user_resolution.*` | `run_agent_stream` | event | 用户、租户、语言和耗时 | 鉴权与租户解析定位 | +| `user_message_save.*` | `run_agent_stream` | event | 保存或跳过原因、耗时 | 判断会话写入是否正常 | +| `memory_context_build.*` | `run_agent_stream` | event | 记忆开关、共享策略、耗时 | 定位记忆上下文瓶颈 | +| `streaming_strategy.*` | `run_agent_stream` | event | `with_memory` 或 `no_memory` | 判断实际执行分支 | +| `generate_stream_with_memory` | `backend/services/agent_service.py` | CHAIN | memory token、预处理任务、fallback 分支 | 追踪带记忆路径的流式执行 | +| `generate_stream_no_memory` | `backend/services/agent_service.py` | CHAIN | 准备与流式输出事件 | 追踪无记忆流式执行 | +| `agent_run` | 
`sdk/nexent/core/agents/run_agent.py` | CHAIN | 线程启动、缓存读取、消息 yield | 追踪 Agent 异步 generator 消费过程 | +| `agent_run_thread` | `sdk/nexent/core/agents/run_agent.py` | CHAIN | Agent 创建、MCP 工具装载、执行错误 | 追踪实际 Agent 执行线程 | +| `{display_name or model_id}.generate` | `sdk/nexent/core/models/openai_llm.py` | LLM / generation | 模型、温度、top_p、消息、输入输出、token、TTFT、chunk 数 | LLM 性能、成本、输出和异常分析 | +| `python_interpreter` | `sdk/nexent/core/agents/core_agent.py` | TOOL | 生成代码、step number、执行输出、日志、是否最终答案 | 观测 CodeAgent 解释器执行 | +| 真实工具名 | `sdk/nexent/core/agents/nexent_agent.py` | TOOL | local/MCP/langchain/builtin 工具输入输出 | 观测真实工具可用性、延迟、错误和输入输出 | +| `FinalAnswerTool` | `sdk/nexent/core/agents/core_agent.py` | TOOL | 最终答案输出 | 让 Phoenix/Langfuse 中能明确看到最终答案节点 | +| `trace_agent` / `trace_chain` / `trace_retriever` | SDK 公共 API | AGENT / CHAIN / RETRIEVER | 自定义输入输出、metadata、tags、session、user | SDK 用户自定义层级埋点 | +| `trace_tool_call` | SDK 公共 API | TOOL | 工具名、输入、输出、耗时、错误 | SDK 用户自定义工具埋点 | + +### 事件清单 + +| Span / 位置 | Event | 主要属性 | 目的 | +|-------------|-------|----------|------| +| `monitor_endpoint` 通用装饰器 | `.started` / `.completed` / `.error` | `param.*`、`duration`、`error.*` | 统一记录接口和服务函数的开始、结束、异常 | +| `agent_service.run_agent_stream` | `user_resolution.started` / `user_resolution.completed` | `duration`、`user_id`、`tenant_id`、`language` | 定位用户、租户、语言解析耗时和结果 | +| `agent_service.run_agent_stream` | `user_message_save.started` / `user_message_save.completed` / `user_message_save.skipped` | `duration`、`reason` | 判断用户消息是否写入,以及跳过原因 | +| `agent_service.run_agent_stream` | `memory_context_build.started` / `memory_context_build.completed` | `duration`、`memory_enabled`、`agent_share_option`、`debug_mode` | 观测记忆上下文构建耗时和开关状态 | +| `agent_service.run_agent_stream` | `streaming_strategy.selected` / `streaming_strategy.completed` | `strategy`、`selected_strategy`、`duration` | 识别实际流式分支与选择耗时 | +| `agent_service.run_agent_stream` | `stream_generator.memory_stream.creating` / `stream_generator.no_memory_stream.creating` | 
无 | 标记 generator 创建分支 | +| `agent_service.run_agent_stream` | `streaming_response.creating` / `streaming_response.created` / `run_agent_stream.preparation_completed` | `duration`、`media_type`、`total_preparation_time` | 观测 SSE 响应创建和整体准备耗时 | +| `generate_stream_no_memory` | `generate_stream_no_memory.started` / `generate_stream_no_memory.completed` / `generate_stream_no_memory.streaming.started` / `generate_stream_no_memory.streaming.completed` | 无 | 观测无记忆路径的准备和流式消费边界 | +| `agent_run` | `agent_run.started` / `agent_run.thread_started` / `agent_run.get_cached_message` / `agent_run.get_cached_message_completed` / `agent_run.yield_message` | 无 | 观测 Agent 线程启动、缓存轮询和消息 yield | +| LLM span | `completion_started` / `first_token_received` / `token_generated` / `completion_finished` / `model_stopped` / `error_occurred` | `model_id`、`temperature`、`top_p`、`message_count`、`total_duration`、`output_length`、`chunk_count`、`error.*` | 分析模型参数、流式输出耗时、停止和异常 | +| Tool span | span 属性 `agent.tool.input` / `agent.tool.output` | JSON 字符串、`agent.tool.duration_ms`、`error.*` | 分析工具输入输出、耗时和异常 | + +## 指标 + +| 指标 | 类型 | 维度 | 用途 | +|------|------|------|------| +| `llm.request.duration` | histogram | model、operation | LLM 请求延迟 | +| `llm.token.generation_rate` | histogram | model | token/s | +| `llm.time_to_first_token` | histogram | model | 首 token 延迟 | +| `llm.token_count.prompt` | counter | model | 输入 token 成本 | +| `llm.token_count.completion` | counter | model | 输出 token 成本 | +| `llm.error.count` | counter | model、operation | LLM 错误率 | +| `agent.step.count` | counter | agent、step type、tool | Agent 步骤和工具调用量 | +| `agent.execution.duration` | histogram | agent、status | Agent 总耗时 | +| `agent.error.count` | counter | agent、error type | Agent 异常统计 | + +## Agent 运行数据流 + +```mermaid +flowchart TD + U[用户] --> FE[前端 Chat] + FE --> API[POST /agent/run] + API --> HTTP[FastAPI HTTP span: 可配置隐藏] + HTTP --> A0[agent.run span: AGENT] + A0 --> S1[agent_service.run_agent_stream: CHAIN] + S1 --> R[user_resolution 
events] + S1 --> Save[user_message_save events] + S1 --> Mem[memory_context_build events] + Mem --> Strategy{streaming_strategy} + Strategy -->|with_memory| G1[generate_stream_with_memory: CHAIN] + Strategy -->|no_memory| G2[generate_stream_no_memory: CHAIN] + G1 --> AR[agent_run async generator: CHAIN] + G2 --> AR + AR --> Thread[agent_run_thread: CHAIN] + Thread --> NX[NexentAgent / CoreAgent] + NX --> Step[Agent step / code action] + Step --> LLM[Model.generate: LLM / generation] + Step --> PY[python_interpreter: TOOL] + PY --> Tool[Real local / MCP / langchain / builtin tool: TOOL] + PY --> Final[FinalAnswerTool: TOOL] + LLM --> Attr1[OpenInference + Langfuse attrs] + Tool --> Attr1 + Final --> Attr1 + Attr1 --> OTel[OpenTelemetry Tracer/Meter Provider] + OTel --> Collector[OTLP Collector] + Collector --> Phoenix[Phoenix] + Collector --> Langfuse[Langfuse] + Collector --> Tempo[Grafana Tempo] + Collector --> SkyWalking[SkyWalking] + Collector --> Other[OTLP Backend] +``` + +预期平台树形结构: + +```text +agent.run agent +└─ agent_service.run_agent_stream chain + └─ agent_service.generate_* chain + └─ agent_run chain + └─ agent_run_thread chain + ├─ Model.generate llm / generation + ├─ python_interpreter tool + │ └─ RealTool tool + └─ FinalAnswerTool tool +``` + +FastAPI HTTP span 可以保留在最上层用于接口视角,也可以通过 `MONITORING_FASTAPI_EXCLUDED_URLS=/agent/run` 在 AI trace 视图中隐藏。 + +## 监控页面结构 + +```mermaid +flowchart TB + Page[Agent 监控页] --> Filters[筛选区: 时间 / 租户 / 用户 / Agent / 会话 / 模型 / 状态] + Page --> KPIs[指标区: 成功率 / P95 / TTFT / tokens/s / token 成本 / 工具错误数] + Page --> TraceList[Trace 列表: Agent / 会话 / 用户 / 状态 / 耗时 / Token / 模型 / 最后错误] + Page --> Detail[Trace 详情] + Detail --> Waterfall[Span 瀑布图: agent / chain / llm / tool] + Detail --> Timeline[Agent 时间线: 准备 / 记忆 / LLM / 工具 / 最终答案] + Detail --> LLMPanel[LLM 面板: prompt / output / token / TTFT / generation rate] + Detail --> ToolPanel[工具面板: 工具名 / 输入 / 输出 / 耗时 / 错误] + Detail --> Session[会话和用户上下文] + Detail --> Raw[原始 OTel 属性和 events] + 
Detail --> Eval[反馈、评分和评估] +``` + +与 Phoenix、Langfuse、LangSmith、Grafana Tempo、SkyWalking 对比: + +| 方案 | 优点 | 不足 | Nexent 当前适配 | +|------|------|------|----------------| +| Phoenix | OpenInference 生态匹配好,适合 trace debug、实验、评估;`phoenix.otel` 可降低接入成本 | Nexent 的租户、权限、Agent 配置需要通过属性映射;HTTP 自动 span 容易产生 `unknown` 噪声 | 写入 `openinference.span.kind`、`input.value`、`output.value`、`metadata`、`session.id`、`user.id`,并支持 FastAPI 降噪 | +| Langfuse | Trace、session、user、prompt、evaluation、dashboard 能力完整,适合 LLMOps 闭环 | 需要 `langfuse.*` 属性才能获得更好的 observation 类型、用户、会话和 metadata 聚合 | 写入 `langfuse.observation.type`、`langfuse.session.id`、`langfuse.user.id`、`langfuse.trace.metadata.*`、`langfuse.observation.input/output` | +| LangSmith | LangChain 生态集成好,在线平台适合追踪、调试和评估 Agent 运行 | 当前仅配置 trace 转发;项目和鉴权通过 header 注入 | Collector 使用 `x-api-key` 和 `Langsmith-Project` 转发到在线 OTLP traces endpoint | +| Grafana Tempo | TraceQL 查询灵活,Grafana 生态适合和 dashboard 聚合 | 本地形态不提供独立 metrics 存储;LLM/Agent 语义展示需要自建 dashboard | traces 转发到 Tempo,Grafana 预置 Tempo datasource 和 Nexent trace dashboard | +| SkyWalking | 通用 APM、服务拓扑、端点分析和 trace 查询能力成熟,适合和传统微服务观测统一 | AI/LLM 专用语义展示弱于 Phoenix/Langfuse;OTel metrics 需要 MAL rule 映射 | traces 通过 OTLP gRPC 转发到 OAP;metrics 先进入 Collector debug pipeline | +| Nexent 自建页 | 可直接关联租户、会话、Agent 配置、权限、版本和业务动作,适合产品内闭环 | 需要自建 trace 存储、查询、聚合、瀑布图、权限隔离和成本统计 | 当前先通过 OTLP 对接外部平台,后续可基于同一批属性构建自有页面 | + +推荐路径: + +1. 短期使用 OTLP 对接 Phoenix/Langfuse/LangSmith,满足调试和分析。 +2. 中期在 Nexent 增加 trace 跳转、轻量指标概览和异常聚合。 +3. 
长期按租户、会话、Agent 版本建立自有监控页,同时保留 OTLP 双写能力。 + +## 已修复的设计风险 + +| 风险 | 修复 | +|------|------| +| async generator span 提前结束 | `monitor_endpoint` 使用 `inspect.isasyncgenfunction`,在 `async for` 消费期间保持 span 打开 | +| `/v1/traces` 路径重复拼接 | SDK 支持 base endpoint 和 signal endpoint 自动归一化 | +| Collector header 无法兼容平台 | Collector 默认只 debug;平台转发配置拆分 `Authorization`、`x-api-key`、`x-langfuse-ingestion-version` | +| Phoenix 只看到接口看不到 Agent | 顶层 `agent.run` 标记为 AGENT,内部服务、线程、generator 标记为 CHAIN | +| Phoenix/Langfuse 中出现大量 `unknown POST /agent/run http ...` | 默认排除 FastAPI ASGI `receive/send` span;requests 自动埋点默认关闭;可配置隐藏 `/agent/run` HTTP span | +| Langfuse 无法识别 observation 类型 | 增加 `langfuse.observation.type` 和 trace/session/user/metadata/input/output 属性 | +| LLM span 不明显或缺输出 | LLM span 命名为 `{display_name or model_id}.generate`,并写入 `output.value` 和 `langfuse.observation.output` | +| 工具 span 缺失 | 在 `NexentAgent.create_single_agent` 统一包装 local/MCP/langchain/builtin 工具,并在 `CoreAgent` 增加 `python_interpreter` 和 `FinalAnswerTool` span | +| 单测漏掉流式函数 | 增加 async generator 装饰器测试和 OpenInference/Langfuse 属性测试 | + +## 使用建议 + +只看 Agent 业务链路时: + +```bash +MONITORING_INSTRUMENT_FASTAPI=true +MONITORING_FASTAPI_EXCLUDE_SPANS=receive,send +MONITORING_FASTAPI_EXCLUDED_URLS=/agent/run +MONITORING_INSTRUMENT_REQUESTS=false +``` + +同时看接口入口和 Agent 业务链路时: + +```bash +MONITORING_INSTRUMENT_FASTAPI=true +MONITORING_FASTAPI_EXCLUDE_SPANS=receive,send +MONITORING_FASTAPI_EXCLUDED_URLS= +MONITORING_INSTRUMENT_REQUESTS=false +``` + +需要排查外部 HTTP 依赖时: + +```bash +MONITORING_INSTRUMENT_REQUESTS=true +``` + +## 参考 + +- Phoenix Setup Tracing: https://arize.com/docs/phoenix/tracing/how-to-tracing/setup-tracing +- Phoenix Setup OTEL: https://arize.com/docs/phoenix/tracing/how-to-tracing/setup-tracing/setup-using-phoenix-otel +- Phoenix Authentication: https://arize.com/docs/phoenix/deployment/authentication +- Phoenix Self-Hosting: https://arize.com/docs/phoenix/self-hosting +- Phoenix Docker Deployment: 
https://arize.com/docs/phoenix/self-hosting/deployment-options/docker +- Langfuse OpenTelemetry: https://langfuse.com/integrations/native/opentelemetry +- Langfuse Self-Hosting: https://langfuse.com/self-hosting +- Langfuse Docker Compose: https://langfuse.com/self-hosting/local +- Langfuse Overview: https://langfuse.com/docs +- LangSmith OpenTelemetry: https://docs.langchain.com/langsmith/otel-gateway-trace-redaction +- SkyWalking OpenTelemetry Receiver: https://skywalking.apache.org/docs/main/latest/en/setup/backend/opentelemetry-receiver/ +- SkyWalking Docker Compose: https://github.com/apache/skywalking-showcase/tree/main/deploy/platform/docker +- SkyWalking BanyanDB: https://skywalking.apache.org/docs/skywalking-banyandb/latest/readme/ diff --git a/docker/.env.bak b/docker/.env.bak deleted file mode 100644 index 24b53751b..000000000 --- a/docker/.env.bak +++ /dev/null @@ -1,168 +0,0 @@ -# ===== Necessary Configs (Necessary till now, will be migrated to frontend page) ===== - -# Voice Service Config -APPID=app_id -TOKEN=token - -# ===== Non-essential Configs (Modify if you know what you are doing) ===== - -CLUSTER=volcano_tts -VOICE_TYPE=zh_male_jieshuonansheng_mars_bigtts -SPEED_RATIO=1.3 - -# ===== Proxy Configuration (Optional) ===== - -# HTTP_PROXY=http://proxy-server:port -# HTTPS_PROXY=http://proxy-server:port -# NO_PROXY=localhost,127.0.0.1 - -# ===== Backend Configuration (No need to modify at all) ===== - -# Model Path Config -CLIP_MODEL_PATH=/opt/models/clip-vit-base-patch32 -NLTK_DATA=/opt/models/nltk_data - -# Elasticsearch Service -ELASTICSEARCH_HOST=http://nexent-elasticsearch:9200 -ELASTIC_PASSWORD=nexent@2025 - -# Elasticsearch Memory Configuration -ES_JAVA_OPTS="-Xms2g -Xmx2g" - -# Elasticsearch Disk Watermark Configuration -ES_DISK_WATERMARK_LOW=85% -ES_DISK_WATERMARK_HIGH=90% -ES_DISK_WATERMARK_FLOOD_STAGE=95% - -# Main Services -# Config service (port 5010) - Main API service for config operations 
-CONFIG_SERVICE_URL=http://nexent-config:5010 -ELASTICSEARCH_SERVICE=http://nexent-config:5010/api - -# Runtime service (port 5014) - Runtime execution service for agent operations -RUNTIME_SERVICE_URL=http://nexent-runtime:5014 - -# MCP service (port 5011) - MCP protocol service -NEXENT_MCP_SERVER=http://nexent-mcp:5011 -MCP_MANAGEMENT_API=http://nexent-mcp:5015 - -# Data process service (port 5012) - Data processing service -DATA_PROCESS_SERVICE=http://nexent-data-process:5012/api - -# Northbound service (port 5013) - Northbound API service -NORTHBOUND_API_SERVER=http://nexent-northbound:5013/api - -# Postgres Config -POSTGRES_HOST=nexent-postgresql -POSTGRES_USER=root -NEXENT_POSTGRES_PASSWORD=nexent@4321 -POSTGRES_DB=nexent -POSTGRES_PORT=5432 - -# Minio Config -MINIO_ENDPOINT=http://nexent-minio:9000 -MINIO_ROOT_USER=nexent -MINIO_ROOT_PASSWORD=nexent@4321 -MINIO_REGION=cn-north-1 -MINIO_DEFAULT_BUCKET=nexent - -# Redis Config -REDIS_URL=redis://redis:6379/0 -REDIS_BACKEND_URL=redis://redis:6379/1 - -# Model Engine Config -MODEL_ENGINE_ENABLED=false - -# Supabase Config -DASHBOARD_USERNAME=supabase -DASHBOARD_PASSWORD=Huawei123 - -# Supabase db Config -SUPABASE_POSTGRES_PASSWORD=Huawei123 -SUPABASE_POSTGRES_HOST=db -SUPABASE_POSTGRES_DB=supabase -SUPABASE_POSTGRES_PORT=5436 - -# Supabase Auth Config -SITE_URL=http://localhost:3011 -SUPABASE_URL=http://supabase-kong-mini:8000 -API_EXTERNAL_URL=http://supabase-kong-mini:8000 -DISABLE_SIGNUP=false -JWT_EXPIRY=3600 -DEBUG_JWT_EXPIRE_SECONDS=0 - -# Supabase Configuration -ENABLE_EMAIL_SIGNUP=true -ENABLE_EMAIL_AUTOCONFIRM=true -ENABLE_ANONYMOUS_USERS=false - -# Supabase Phone Config -ENABLE_PHONE_SIGNUP=false -ENABLE_PHONE_AUTOCONFIRM=false - -MAILER_URLPATHS_CONFIRMATION="/auth/v1/verify" -MAILER_URLPATHS_INVITE="/auth/v1/verify" -MAILER_URLPATHS_RECOVERY="/auth/v1/verify" -MAILER_URLPATHS_EMAIL_CHANGE="/auth/v1/verify" - -INVITE_CODE=nexent2025 - -# Terminal Tool SSH Key Path 
-SSH_PRIVATE_KEY_PATH=/path/to/openssh-server/ssh-keys/openssh_server_key - -# ===== Data Processing Service Configuration ===== - -# Redis Port -REDIS_PORT=6379 - -# Flower Monitoring -FLOWER_PORT=5555 - -# Ray Configuration -RAY_ACTOR_NUM_CPUS=2 -RAY_DASHBOARD_PORT=8265 -RAY_DASHBOARD_HOST=0.0.0.0 -RAY_NUM_CPUS=4 -RAY_OBJECT_STORE_MEMORY_GB=0.25 -RAY_TEMP_DIR=/tmp/ray -RAY_LOG_LEVEL=INFO - -# Service Control Flags -DISABLE_RAY_DASHBOARD=true -DISABLE_CELERY_FLOWER=true -DOCKER_ENVIRONMENT=false -ENABLE_UPLOAD_IMAGE=false - -# Celery Configuration -CELERY_WORKER_PREFETCH_MULTIPLIER=1 -CELERY_TASK_TIME_LIMIT=3600 -ELASTICSEARCH_REQUEST_TIMEOUT=30 - -# Worker Configuration -QUEUES=process_q,forward_q -WORKER_NAME= -WORKER_CONCURRENCY=4 - -# Skills Configuration -SKILLS_PATH=/mnt/nexent/skills - -# Telemetry and Monitoring Configuration -ENABLE_TELEMETRY=false -SERVICE_NAME=nexent-backend -JAEGER_ENDPOINT=http://localhost:14268/api/traces -PROMETHEUS_PORT=8000 -TELEMETRY_SAMPLE_RATE=1.0 -LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 -LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 - -# Market Backend Address -MARKET_BACKEND=http://60.204.251.153:8010 -DEPLOYMENT_VERSION="speed" -# Root dir -ROOT_DIR="/c/Users/18270/nexent-data" -TERMINAL_MOUNT_DIR="/opt/terminal" -SSH_USERNAME="root" -SSH_PASSWORD="731215" -NEXENT_MCP_DOCKER_IMAGE="ccr.ccs.tencentyun.com/nexent-hub/nexent-mcp:v2.0.1" -MINIO_ACCESS_KEY="72c31cb5b521511cea652723" -MINIO_SECRET_KEY="m5gcSuKzZnp84CqmG7z5VKnd2C+H5U3PSr7eoJeygmI=" diff --git a/docker/.env.example b/docker/.env.example index e55bba45a..7bbeab0f2 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -153,14 +153,30 @@ WORKER_CONCURRENCY=4 # Skills Configuration SKILLS_PATH=/mnt/nexent/skills -# Telemetry and Monitoring Configuration +# Telemetry and Monitoring Configuration (OTLP Protocol) +# Enable OpenTelemetry monitoring for agent observability ENABLE_TELEMETRY=false -SERVICE_NAME=nexent-backend -JAEGER_ENDPOINT=http://localhost:14268/api/traces 
-PROMETHEUS_PORT=8000 +# Provider profile: otlp, phoenix, langfuse, langsmith, grafana, skywalking +MONITORING_PROVIDER=otlp +MONITORING_PROJECT_NAME=nexent +# Service name for identifying traces in observability platforms +OTEL_SERVICE_NAME=nexent-backend +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +# Optional signal-specific endpoints. Leave empty unless the backend requires them. +OTEL_EXPORTER_OTLP_TRACES_ENDPOINT= +OTEL_EXPORTER_OTLP_METRICS_ENDPOINT= +# Protocol: "http" or "grpc" +OTEL_EXPORTER_OTLP_PROTOCOL=http + +# Authentication headers (format: key1=value1,key2=value2) +# Prefer platform-specific variables when using the Collector. +OTEL_EXPORTER_OTLP_HEADERS= +OTEL_EXPORTER_OTLP_AUTHORIZATION= +OTEL_EXPORTER_OTLP_X_API_KEY= +OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION= +OTEL_EXPORTER_OTLP_METRICS_ENABLED=true + TELEMETRY_SAMPLE_RATE=1.0 -LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 -LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 # Market Backend Address MARKET_BACKEND=http://60.204.251.153:8010 diff --git a/docker/deploy.sh b/docker/deploy.sh index 7fb78aa90..0ee749bb4 100755 --- a/docker/deploy.sh +++ b/docker/deploy.sh @@ -1284,7 +1284,7 @@ main_deploy() { echo "--------------------------------" echo "" - APP_VERSION="$(get_app_version)" + APP_VERSION="latest" if [ -z "$APP_VERSION" ]; then echo "❌ Failed to get app version, please check the backend/consts/const.py file" exit 1 diff --git a/docker/docker-compose-monitoring.yml b/docker/docker-compose-monitoring.yml index fb4aa5eaf..d764fcede 100644 --- a/docker/docker-compose-monitoring.yml +++ b/docker/docker-compose-monitoring.yml @@ -1,88 +1,267 @@ +name: monitor + services: - # Jaeger - Distributed Tracing - jaeger: - image: jaegertracing/all-in-one:1.52 - container_name: nexent-jaeger - ports: - - "16686:16686" # Jaeger UI - - "14268:14268" # Jaeger collector HTTP - - "14250:14250" # Jaeger collector gRPC - - "6831:6831/udp" # Agent UDP - - "6832:6832/udp" # Agent UDP + otel-collector: + image:
otel/opentelemetry-collector-contrib:${OTEL_COLLECTOR_VERSION:-0.151.0} + container_name: nexent-otel-collector + command: ["--config=/etc/otel-collector-config.yml"] environment: - - COLLECTOR_OTLP_ENABLED=true - - COLLECTOR_ZIPKIN_HOST_PORT=:9411 + LANGFUSE_OTLP_AUTH_HEADER: ${LANGFUSE_OTLP_AUTH_HEADER:-} + LANGSMITH_API_KEY: ${LANGSMITH_API_KEY:-} + LANGSMITH_PROJECT: ${LANGSMITH_PROJECT:-nexent} + LANGSMITH_OTLP_TRACES_ENDPOINT: ${LANGSMITH_OTLP_TRACES_ENDPOINT:-https://api.smith.langchain.com/otel/v1/traces} + volumes: + - ${OTEL_COLLECTOR_CONFIG_FILE:-./monitoring/otel-collector-config.yml}:/etc/otel-collector-config.yml + ports: + - "${OTEL_COLLECTOR_GRPC_PORT:-4317}:4317" + - "${OTEL_COLLECTOR_HTTP_PORT:-4318}:4318" networks: - - nexent-network + - nexent_nexent restart: unless-stopped - volumes: - - jaeger-data:/tmp - # Prometheus - Metrics Collection - prometheus: - image: prom/prometheus:v2.48.0 - container_name: nexent-prometheus + phoenix: + image: arizephoenix/phoenix:${PHOENIX_VERSION:-15} + container_name: nexent-phoenix + profiles: ["phoenix"] + environment: + PHOENIX_WORKING_DIR: /mnt/data + volumes: + - phoenix-data:/mnt/data ports: - - "9090:9090" - command: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - - '--web.console.libraries=/etc/prometheus/console_libraries' - - '--web.console.templates=/etc/prometheus/consoles' - - '--storage.tsdb.retention.time=15d' - - '--web.enable-lifecycle' - - '--web.enable-admin-api' + - "${PHOENIX_PORT:-6006}:6006" + - "${PHOENIX_GRPC_HOST_PORT:-4319}:4317" + networks: + - nexent_nexent + restart: unless-stopped + + tempo: + image: grafana/tempo:${TEMPO_VERSION:-2.10.5} + container_name: nexent-tempo + profiles: ["grafana"] + command: ["--config.file=/etc/tempo.yml"] volumes: - - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml - - prometheus-data:/prometheus + - ./monitoring/tempo.yml:/etc/tempo.yml:ro + - tempo-data:/var/tempo + ports: + - 
"${TEMPO_PORT:-3200}:3200" networks: - - nexent-network + - nexent_nexent restart: unless-stopped - # Grafana - Metrics Visualization grafana: - image: grafana/grafana:10.2.0 + image: grafana/grafana:${GRAFANA_VERSION:-12.4} container_name: nexent-grafana - ports: - - "3005:3000" + profiles: ["grafana"] environment: - - GF_SECURITY_ADMIN_PASSWORD=admin - - GF_USERS_ALLOW_SIGN_UP=false - - GF_INSTALL_PLUGINS=grafana-piechart-panel + GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin} + GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-nexent-grafana-admin} + GF_USERS_ALLOW_SIGN_UP: "false" + GF_USERS_DEFAULT_LANGUAGE: ${GRAFANA_DEFAULT_LANGUAGE:-zh-Hans} + GF_PLUGINS_PREINSTALL_AUTO_UPDATE: "false" volumes: - grafana-data:/var/lib/grafana - - ./monitoring/grafana/provisioning:/etc/grafana/provisioning - - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards + - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro + - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro + ports: + - "${GRAFANA_PORT:-3002}:3000" + depends_on: + - tempo networks: - - nexent-network + - nexent_nexent restart: unless-stopped - depends_on: - - prometheus - # OpenTelemetry Collector (Optional - for advanced setups) - otel-collector: - image: otel/opentelemetry-collector-contrib:0.89.0 - container_name: nexent-otel-collector - command: ["--config=/etc/otel-collector-config.yml"] + zipkin: + image: openzipkin/zipkin:${ZIPKIN_VERSION:-latest} + container_name: nexent-zipkin + profiles: ["zipkin"] + ports: + - "${ZIPKIN_PORT:-9411}:9411" + networks: + - nexent_nexent + restart: unless-stopped + + langfuse-worker: + image: docker.io/langfuse/langfuse-worker:${LANGFUSE_VERSION:-3} + container_name: nexent-langfuse-worker + profiles: ["langfuse"] + restart: unless-stopped + depends_on: &langfuse-depends-on + langfuse-postgres: + condition: service_healthy + langfuse-minio: + condition: service_healthy + langfuse-redis: + condition: service_healthy + 
langfuse-clickhouse: + condition: service_healthy + environment: &langfuse-env + NEXTAUTH_URL: ${LANGFUSE_NEXTAUTH_URL:-http://localhost:3001} + NEXTAUTH_SECRET: ${LANGFUSE_NEXTAUTH_SECRET:-nexent-langfuse-secret} + DATABASE_URL: postgresql://${LANGFUSE_POSTGRES_USER:-postgres}:${LANGFUSE_POSTGRES_PASSWORD:-postgres}@langfuse-postgres:5432/${LANGFUSE_POSTGRES_DB:-postgres} + SALT: ${LANGFUSE_SALT:-nexent-langfuse-salt} + ENCRYPTION_KEY: ${LANGFUSE_ENCRYPTION_KEY:-0000000000000000000000000000000000000000000000000000000000000000} + TELEMETRY_ENABLED: ${LANGFUSE_TELEMETRY_ENABLED:-false} + LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES: ${LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES:-false} + CLICKHOUSE_MIGRATION_URL: clickhouse://langfuse-clickhouse:9000 + CLICKHOUSE_URL: http://langfuse-clickhouse:8123 + CLICKHOUSE_USER: ${LANGFUSE_CLICKHOUSE_USER:-clickhouse} + CLICKHOUSE_PASSWORD: ${LANGFUSE_CLICKHOUSE_PASSWORD:-clickhouse} + CLICKHOUSE_CLUSTER_ENABLED: ${LANGFUSE_CLICKHOUSE_CLUSTER_ENABLED:-false} + REDIS_HOST: langfuse-redis + REDIS_PORT: 6379 + REDIS_AUTH: ${LANGFUSE_REDIS_AUTH:-myredissecret} + REDIS_TLS_ENABLED: "false" + LANGFUSE_USE_AZURE_BLOB: "false" + LANGFUSE_USE_OCI_NATIVE_OBJECT_STORAGE: "false" + LANGFUSE_S3_EVENT_UPLOAD_BUCKET: ${LANGFUSE_S3_BUCKET:-langfuse} + LANGFUSE_S3_EVENT_UPLOAD_REGION: auto + LANGFUSE_S3_EVENT_UPLOAD_ACCESS_KEY_ID: ${LANGFUSE_MINIO_ROOT_USER:-minio} + LANGFUSE_S3_EVENT_UPLOAD_SECRET_ACCESS_KEY: ${LANGFUSE_MINIO_ROOT_PASSWORD:-miniosecret} + LANGFUSE_S3_EVENT_UPLOAD_ENDPOINT: http://langfuse-minio:9000 + LANGFUSE_S3_EVENT_UPLOAD_FORCE_PATH_STYLE: "true" + LANGFUSE_S3_EVENT_UPLOAD_PREFIX: events/ + LANGFUSE_S3_MEDIA_UPLOAD_BUCKET: ${LANGFUSE_S3_BUCKET:-langfuse} + LANGFUSE_S3_MEDIA_UPLOAD_REGION: auto + LANGFUSE_S3_MEDIA_UPLOAD_ACCESS_KEY_ID: ${LANGFUSE_MINIO_ROOT_USER:-minio} + LANGFUSE_S3_MEDIA_UPLOAD_SECRET_ACCESS_KEY: ${LANGFUSE_MINIO_ROOT_PASSWORD:-miniosecret} + LANGFUSE_S3_MEDIA_UPLOAD_ENDPOINT: 
http://localhost:${LANGFUSE_MINIO_API_PORT:-9092} + LANGFUSE_S3_MEDIA_UPLOAD_FORCE_PATH_STYLE: "true" + LANGFUSE_S3_MEDIA_UPLOAD_PREFIX: media/ + LANGFUSE_S3_BATCH_EXPORT_ENABLED: "false" + LANGFUSE_S3_BATCH_EXPORT_BUCKET: ${LANGFUSE_S3_BUCKET:-langfuse} + LANGFUSE_S3_BATCH_EXPORT_REGION: auto + LANGFUSE_S3_BATCH_EXPORT_ENDPOINT: http://langfuse-minio:9000 + LANGFUSE_S3_BATCH_EXPORT_EXTERNAL_ENDPOINT: http://localhost:${LANGFUSE_MINIO_API_PORT:-9092} + LANGFUSE_S3_BATCH_EXPORT_ACCESS_KEY_ID: ${LANGFUSE_MINIO_ROOT_USER:-minio} + LANGFUSE_S3_BATCH_EXPORT_SECRET_ACCESS_KEY: ${LANGFUSE_MINIO_ROOT_PASSWORD:-miniosecret} + LANGFUSE_S3_BATCH_EXPORT_FORCE_PATH_STYLE: "true" + networks: + - nexent_nexent + + langfuse-web: + image: docker.io/langfuse/langfuse:${LANGFUSE_VERSION:-3} + container_name: nexent-langfuse-web + profiles: ["langfuse"] + restart: unless-stopped + depends_on: *langfuse-depends-on + environment: + <<: *langfuse-env + LANGFUSE_INIT_ORG_ID: ${LANGFUSE_INIT_ORG_ID:-nexent} + LANGFUSE_INIT_ORG_NAME: ${LANGFUSE_INIT_ORG_NAME:-Nexent} + LANGFUSE_INIT_PROJECT_ID: ${LANGFUSE_INIT_PROJECT_ID:-nexent-local} + LANGFUSE_INIT_PROJECT_NAME: ${LANGFUSE_INIT_PROJECT_NAME:-Nexent Local} + LANGFUSE_INIT_PROJECT_PUBLIC_KEY: ${LANGFUSE_INIT_PROJECT_PUBLIC_KEY:-pk-lf-nexent-local} + LANGFUSE_INIT_PROJECT_SECRET_KEY: ${LANGFUSE_INIT_PROJECT_SECRET_KEY:-sk-lf-nexent-local} + LANGFUSE_INIT_USER_EMAIL: ${LANGFUSE_INIT_USER_EMAIL:-admin@nexent.local} + LANGFUSE_INIT_USER_NAME: ${LANGFUSE_INIT_USER_NAME:-Nexent Admin} + LANGFUSE_INIT_USER_PASSWORD: ${LANGFUSE_INIT_USER_PASSWORD:-nexent-langfuse-admin} + ports: + - "${LANGFUSE_PORT:-3001}:3000" + networks: + - nexent_nexent + + langfuse-clickhouse: + image: docker.io/clickhouse/clickhouse-server:${LANGFUSE_CLICKHOUSE_VERSION:-26.3-alpine} + container_name: nexent-langfuse-clickhouse + profiles: ["langfuse"] + restart: unless-stopped + user: "101:101" + environment: + CLICKHOUSE_DB: default + CLICKHOUSE_USER: 
${LANGFUSE_CLICKHOUSE_USER:-clickhouse} + CLICKHOUSE_PASSWORD: ${LANGFUSE_CLICKHOUSE_PASSWORD:-clickhouse} volumes: - - ./monitoring/otel-collector-config.yml:/etc/otel-collector-config.yml + - langfuse-clickhouse-data:/var/lib/clickhouse + - langfuse-clickhouse-logs:/var/log/clickhouse-server ports: - - "4317:4317" # OTLP gRPC receiver - - "4318:4318" # OTLP HTTP receiver - - "8888:8888" # Prometheus metrics exposed by the collector - - "8889:8889" # Prometheus exporter metrics - depends_on: - - jaeger - - prometheus + - "127.0.0.1:${LANGFUSE_CLICKHOUSE_HTTP_PORT:-8124}:8123" + - "127.0.0.1:${LANGFUSE_CLICKHOUSE_NATIVE_PORT:-9002}:9000" + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://127.0.0.1:8123/ping || exit 1"] + interval: 5s + timeout: 5s + retries: 10 + start_period: 1s networks: - - nexent-network + - nexent_nexent + + langfuse-minio: + image: docker.io/minio/minio:${LANGFUSE_MINIO_VERSION:-RELEASE.2023-12-20T01-00-02Z} + container_name: nexent-langfuse-minio + profiles: ["langfuse"] restart: unless-stopped + entrypoint: sh + command: -c 'mkdir -p /data/${LANGFUSE_S3_BUCKET:-langfuse} && minio server --address ":9000" --console-address ":9001" /data' + environment: + MINIO_ROOT_USER: ${LANGFUSE_MINIO_ROOT_USER:-minio} + MINIO_ROOT_PASSWORD: ${LANGFUSE_MINIO_ROOT_PASSWORD:-miniosecret} + ports: + - "${LANGFUSE_MINIO_API_PORT:-9092}:9000" + - "127.0.0.1:${LANGFUSE_MINIO_CONSOLE_PORT:-9093}:9001" + volumes: + - langfuse-minio-data:/data + healthcheck: + test: ["CMD", "mc", "ready", "local"] + interval: 1s + timeout: 5s + retries: 5 + start_period: 1s + networks: + - nexent_nexent -volumes: - jaeger-data: - prometheus-data: - grafana-data: + langfuse-redis: + image: docker.io/redis:${LANGFUSE_REDIS_VERSION:-alpine} + container_name: nexent-langfuse-redis + profiles: ["langfuse"] + restart: unless-stopped + command: > + --requirepass ${LANGFUSE_REDIS_AUTH:-myredissecret} + --maxmemory-policy noeviction + ports: + - 
"127.0.0.1:${LANGFUSE_REDIS_PORT:-6380}:6379" + volumes: + - langfuse-redis-data:/data + healthcheck: + test: ["CMD-SHELL", "redis-cli -a ${LANGFUSE_REDIS_AUTH:-myredissecret} ping | grep PONG"] + interval: 3s + timeout: 10s + retries: 10 + networks: + - nexent_nexent + + langfuse-postgres: + image: docker.io/postgres:${LANGFUSE_POSTGRES_VERSION:-15-alpine} + container_name: nexent-langfuse-postgres + profiles: ["langfuse"] + restart: unless-stopped + environment: + POSTGRES_USER: ${LANGFUSE_POSTGRES_USER:-postgres} + POSTGRES_PASSWORD: ${LANGFUSE_POSTGRES_PASSWORD:-postgres} + POSTGRES_DB: ${LANGFUSE_POSTGRES_DB:-postgres} + TZ: UTC + PGTZ: UTC + ports: + - "127.0.0.1:${LANGFUSE_POSTGRES_PORT:-5440}:5432" + volumes: + - langfuse-postgres-data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${LANGFUSE_POSTGRES_USER:-postgres}"] + interval: 3s + timeout: 3s + retries: 10 + networks: + - nexent_nexent networks: - nexent-network: + nexent_nexent: external: true + +volumes: + phoenix-data: + langfuse-postgres-data: + langfuse-clickhouse-data: + langfuse-clickhouse-logs: + langfuse-minio-data: + langfuse-redis-data: + grafana-data: + tempo-data: diff --git a/docker/monitoring/grafana/dashboards/nexent-llm-agent.json b/docker/monitoring/grafana/dashboards/nexent-llm-agent.json new file mode 100644 index 000000000..d4e2c321b --- /dev/null +++ b/docker/monitoring/grafana/dashboards/nexent-llm-agent.json @@ -0,0 +1,150 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Nexent Agent traces backed by Grafana Tempo.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [ + { + "asDropdown": false, + "icon": "external link", + "includeVars": false, + "keepTime": true, + 
"tags": [], + "targetBlank": false, + "title": "Open Tempo Explore", + "tooltip": "Open Grafana Explore with the Tempo datasource", + "type": "link", + "url": "/explore?left=%7B%22datasource%22:%22Tempo%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22query%22:%22%7B%20resource.service.name%20%3D%20%5C%22nexent-backend%5C%22%20%7D%22,%22queryType%22:%22traceql%22%7D%5D%7D" + } + ], + "panels": [ + { + "datasource": { + "type": "tempo", + "uid": "Tempo" + }, + "description": "Recent traces for Nexent backend. Open a trace row to inspect the agent, chain, LLM, and tool span waterfall.", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 16, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "datasource": { + "type": "tempo", + "uid": "Tempo" + }, + "limit": 100, + "query": "{ resource.service.name = \"nexent-backend\" }", + "queryType": "traceql", + "refId": "A", + "tableType": "traces" + } + ], + "title": "Recent Agent Traces", + "type": "table" + }, + { + "description": "TraceQL shortcuts for common Nexent views.", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 2, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "Service traces:\n{ resource.service.name = \"nexent-backend\" }\n\nAgent spans:\n{ resource.service.name = \"nexent-backend\" && span.openinference.span.kind = \"AGENT\" }\n\nLLM spans:\n{ resource.service.name = \"nexent-backend\" && span.openinference.span.kind = 
\"LLM\" }\n\nTool spans:\n{ resource.service.name = \"nexent-backend\" && span.openinference.span.kind = \"TOOL\" }\n\nError traces:\n{ resource.service.name = \"nexent-backend\" && status = error }", + "mode": "markdown" + }, + "pluginVersion": "11.0.0", + "title": "TraceQL Examples", + "type": "text" + } + ], + "preload": false, + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "nexent", + "agent", + "tempo" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Nexent Agent Trace Monitoring", + "uid": "nexent-llm-agent", + "version": 1, + "weekStart": "" +} diff --git a/docker/monitoring/grafana/dashboards/nexent-llm-performance.json b/docker/monitoring/grafana/dashboards/nexent-llm-performance.json deleted file mode 100644 index ec8d0434a..000000000 --- a/docker/monitoring/grafana/dashboards/nexent-llm-performance.json +++ /dev/null @@ -1,544 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": null, - "links": [], - "liveNow": false, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", 
- "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 1, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "histogram_quantile(0.95, rate(llm_request_duration_seconds_bucket[5m]))", - "interval": "", - "legendFormat": "95th percentile", - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "histogram_quantile(0.50, rate(llm_request_duration_seconds_bucket[5m]))", - "interval": "", - "legendFormat": "50th percentile (median)", - "refId": "B" - } - ], - "title": "LLM Request Duration", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "tokens/s" - }, - "overrides": [] - 
}, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 2, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "histogram_quantile(0.95, rate(llm_token_generation_rate_bucket[5m]))", - "interval": "", - "legendFormat": "95th percentile", - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "histogram_quantile(0.50, rate(llm_token_generation_rate_bucket[5m]))", - "interval": "", - "legendFormat": "50th percentile (median)", - "refId": "B" - } - ], - "title": "Token Generation Rate", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 8 - }, - "id": 3, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - 
"datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "histogram_quantile(0.95, rate(llm_time_to_first_token_seconds_bucket[5m]))", - "interval": "", - "legendFormat": "95th percentile TTFT", - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "histogram_quantile(0.50, rate(llm_time_to_first_token_seconds_bucket[5m]))", - "interval": "", - "legendFormat": "50th percentile TTFT", - "refId": "B" - } - ], - "title": "Time to First Token (TTFT)", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "tokens" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 8 - }, - "id": 4, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "rate(llm_total_tokens_total{type=\"input\"}[5m])", - "interval": "", - "legendFormat": "Input tokens/sec", - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": 
"prometheus" - }, - "expr": "rate(llm_total_tokens_total{type=\"output\"}[5m])", - "interval": "", - "legendFormat": "Output tokens/sec", - "refId": "B" - } - ], - "title": "Token Throughput", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "errors/sec" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 16 - }, - "id": 5, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "rate(llm_error_count_total[5m])", - "interval": "", - "legendFormat": "Error rate by model: {{model}}", - "refId": "A" - } - ], - "title": "LLM Error Rate", - "type": "timeseries" - } - ], - "refresh": "5s", - "schemaVersion": 37, - "style": "dark", - "tags": ["nexent", "llm", "performance"], - "templating": { - "list": [] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Nexent LLM Performance Dashboard", - "uid": "nexent-llm-perf", - "version": 
1, - "weekStart": "" -} - diff --git a/docker/monitoring/grafana/provisioning/dashboards/dashboards.yml b/docker/monitoring/grafana/provisioning/dashboards/dashboards.yml index b89a1fa81..b863e9d16 100644 --- a/docker/monitoring/grafana/provisioning/dashboards/dashboards.yml +++ b/docker/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -1,13 +1,12 @@ apiVersion: 1 providers: - - name: 'Nexent LLM Monitoring' + - name: Nexent Monitoring orgId: 1 - folder: 'Nexent' + folder: Nexent type: file disableDeletion: false - updateIntervalSeconds: 10 + updateIntervalSeconds: 30 allowUiUpdates: true options: path: /var/lib/grafana/dashboards - diff --git a/docker/monitoring/grafana/provisioning/datasources/datasources.yml b/docker/monitoring/grafana/provisioning/datasources/datasources.yml index 9bdc40d61..d23e4cba9 100644 --- a/docker/monitoring/grafana/provisioning/datasources/datasources.yml +++ b/docker/monitoring/grafana/provisioning/datasources/datasources.yml @@ -1,16 +1,23 @@ apiVersion: 1 datasources: - - name: Prometheus - type: prometheus + - name: Tempo + uid: Tempo + type: tempo access: proxy - url: http://prometheus:9090 + url: http://nexent-tempo:3200 isDefault: true editable: true - - - name: Jaeger - type: jaeger - access: proxy - url: http://jaeger:16686 - editable: true - + basicAuth: false + jsonData: + nodeGraph: + enabled: true + search: + hide: false + traceQuery: + timeShiftEnabled: true + spanStartTimeShift: "-1h" + spanEndTimeShift: "1h" + streamingEnabled: + search: false + metrics: false diff --git a/docker/monitoring/monitoring.env b/docker/monitoring/monitoring.env index 2506c03a6..80ce18fed 100644 --- a/docker/monitoring/monitoring.env +++ b/docker/monitoring/monitoring.env @@ -1,21 +1,96 @@ -# Telemetry and Monitoring Configuration ENABLE_TELEMETRY=true -SERVICE_NAME=nexent-backend -JAEGER_ENDPOINT=http://localhost:14268/api/traces -PROMETHEUS_PORT=8000 -TELEMETRY_SAMPLE_RATE=1.0 - -# Performance monitoring thresholds 
-LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 -LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 +OTEL_SERVICE_NAME=nexent-backend +MONITORING_PROVIDER=otlp +MONITORING_PROJECT_NAME=nexent -# Grafana Configuration -GF_SECURITY_ADMIN_PASSWORD=admin -GF_USERS_ALLOW_SIGN_UP=false +# Use a base OTLP HTTP endpoint. SDK code derives /v1/traces and /v1/metrics. +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_TRACES_ENDPOINT= +OTEL_EXPORTER_OTLP_METRICS_ENDPOINT= +OTEL_EXPORTER_OTLP_PROTOCOL=http +OTEL_EXPORTER_OTLP_HEADERS= +OTEL_EXPORTER_OTLP_AUTHORIZATION= +OTEL_EXPORTER_OTLP_X_API_KEY= +OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION= +OTEL_EXPORTER_OTLP_METRICS_ENABLED=true +MONITORING_INSTRUMENT_FASTAPI=true +MONITORING_INSTRUMENT_REQUESTS=false +MONITORING_FASTAPI_EXCLUDED_URLS= +MONITORING_FASTAPI_EXCLUDE_SPANS=receive,send +TELEMETRY_SAMPLE_RATE=1.0 -# Service ports -JAEGER_UI_PORT=16686 -PROMETHEUS_UI_PORT=9090 -GRAFANA_UI_PORT=3000 OTEL_COLLECTOR_GRPC_PORT=4317 OTEL_COLLECTOR_HTTP_PORT=4318 +OTEL_COLLECTOR_CONFIG_FILE= +OTEL_COLLECTOR_VERSION=0.151.0 + +# Local Phoenix stack. Used by: ./start-monitoring.sh --stack phoenix +PHOENIX_VERSION=15 +PHOENIX_PORT=6006 +PHOENIX_GRPC_HOST_PORT=4319 + +# Local Langfuse stack. Used by: ./start-monitoring.sh --stack langfuse +# Defaults are for local development only. Replace secrets before production use. 
+LANGFUSE_VERSION=3 +LANGFUSE_PORT=3001 +LANGFUSE_NEXTAUTH_URL=http://localhost:3001 +LANGFUSE_NEXTAUTH_SECRET=nexent-langfuse-secret +LANGFUSE_SALT=nexent-langfuse-salt +LANGFUSE_ENCRYPTION_KEY=0000000000000000000000000000000000000000000000000000000000000000 +LANGFUSE_TELEMETRY_ENABLED=false +LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES=false +LANGFUSE_INIT_ORG_ID=nexent +LANGFUSE_INIT_ORG_NAME=Nexent +LANGFUSE_INIT_PROJECT_ID=nexent +LANGFUSE_INIT_PROJECT_NAME=Nexent +LANGFUSE_INIT_PROJECT_PUBLIC_KEY=pk-lf-nexent-local +LANGFUSE_INIT_PROJECT_SECRET_KEY=sk-lf-nexent-local +LANGFUSE_INIT_USER_EMAIL=admin@nexent.com +LANGFUSE_INIT_USER_NAME=admin +LANGFUSE_INIT_USER_PASSWORD=nexent@4321 +LANGFUSE_OTLP_AUTH_HEADER= +LANGFUSE_POSTGRES_VERSION=15-alpine +LANGFUSE_POSTGRES_USER=postgres +LANGFUSE_POSTGRES_PASSWORD=nexent@4321 +LANGFUSE_POSTGRES_DB=postgres +LANGFUSE_POSTGRES_PORT=5440 +LANGFUSE_CLICKHOUSE_VERSION=26.3-alpine +LANGFUSE_CLICKHOUSE_USER=clickhouse +LANGFUSE_CLICKHOUSE_PASSWORD=clickhouse +LANGFUSE_CLICKHOUSE_HTTP_PORT=8124 +LANGFUSE_CLICKHOUSE_NATIVE_PORT=9002 +LANGFUSE_MINIO_VERSION=RELEASE.2023-12-20T01-00-02Z +LANGFUSE_MINIO_ROOT_USER=minio +LANGFUSE_MINIO_ROOT_PASSWORD=miniosecret +LANGFUSE_MINIO_API_PORT=9092 +LANGFUSE_MINIO_CONSOLE_PORT=9093 +LANGFUSE_S3_BUCKET=langfuse +LANGFUSE_REDIS_AUTH=myredissecret +LANGFUSE_REDIS_VERSION=alpine +LANGFUSE_REDIS_PORT=6380 + +# Online LangSmith forwarding. Used by: ./start-monitoring.sh --stack langsmith +# LangSmith currently ingests OTLP traces. Metrics remain in the Collector debug pipeline. +LANGSMITH_API_KEY= +LANGSMITH_PROJECT=nexent +LANGSMITH_OTLP_TRACES_ENDPOINT=https://api.smith.langchain.com/otel/v1/traces + +# Local Grafana stack. Used by: ./start-monitoring.sh --stack grafana +GRAFANA_VERSION=12.4 +GRAFANA_PORT=3002 +GRAFANA_ADMIN_USER=admin +GRAFANA_ADMIN_PASSWORD=nexent@4321 +GRAFANA_DEFAULT_LANGUAGE=zh-Hans +TEMPO_VERSION=2.10.5 +TEMPO_PORT=3200 + +# Local SkyWalking stack. 
Used by: ./start-monitoring.sh --stack skywalking +SKYWALKING_VERSION=10.4.0 +SKYWALKING_BANYANDB_VERSION=0.10.1 +SKYWALKING_UI_PORT=8080 +SKYWALKING_OAP_GRPC_PORT=11800 +SKYWALKING_OAP_HTTP_PORT=12800 +SKYWALKING_ZIPKIN_RECEIVER_PORT=9411 +SKYWALKING_ZIPKIN_QUERY_PORT=9412 +SKYWALKING_BANYANDB_GRPC_PORT=17912 +SKYWALKING_BANYANDB_HTTP_PORT=17913 diff --git a/docker/monitoring/monitoring.env.example b/docker/monitoring/monitoring.env.example index 26ab041c8..be81f9e49 100644 --- a/docker/monitoring/monitoring.env.example +++ b/docker/monitoring/monitoring.env.example @@ -1,22 +1,89 @@ -# Telemetry and Monitoring Configuration ENABLE_TELEMETRY=true -SERVICE_NAME=nexent-backend -JAEGER_ENDPOINT=http://localhost:14268/api/traces -PROMETHEUS_PORT=8000 -TELEMETRY_SAMPLE_RATE=1.0 - -# Performance monitoring thresholds -LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 -LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 +OTEL_SERVICE_NAME=nexent-backend +MONITORING_PROVIDER=otlp +MONITORING_PROJECT_NAME=nexent -# Grafana Configuration -GF_SECURITY_ADMIN_PASSWORD=admin -GF_USERS_ALLOW_SIGN_UP=false +# Use a base OTLP HTTP endpoint. SDK code derives /v1/traces and /v1/metrics. +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_TRACES_ENDPOINT= +OTEL_EXPORTER_OTLP_METRICS_ENDPOINT= +OTEL_EXPORTER_OTLP_PROTOCOL=http +OTEL_EXPORTER_OTLP_HEADERS= +OTEL_EXPORTER_OTLP_AUTHORIZATION= +OTEL_EXPORTER_OTLP_X_API_KEY= +OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION= +OTEL_EXPORTER_OTLP_METRICS_ENABLED=true +MONITORING_INSTRUMENT_FASTAPI=true +MONITORING_INSTRUMENT_REQUESTS=false +MONITORING_FASTAPI_EXCLUDED_URLS= +MONITORING_FASTAPI_EXCLUDE_SPANS=receive,send +TELEMETRY_SAMPLE_RATE=1.0 -# Service ports -JAEGER_UI_PORT=16686 -PROMETHEUS_UI_PORT=9090 -GRAFANA_UI_PORT=3000 OTEL_COLLECTOR_GRPC_PORT=4317 OTEL_COLLECTOR_HTTP_PORT=4318 +OTEL_COLLECTOR_CONFIG_FILE= +OTEL_COLLECTOR_VERSION=0.151.0 + +# Local Phoenix stack. 
Used by: ./start-monitoring.sh --stack phoenix +PHOENIX_VERSION=15 +PHOENIX_PORT=6006 +PHOENIX_GRPC_HOST_PORT=4319 + +# Local Langfuse stack. Used by: ./start-monitoring.sh --stack langfuse +# Defaults are for local development only. Replace secrets before production use. +LANGFUSE_VERSION=3 +LANGFUSE_PORT=3001 +LANGFUSE_NEXTAUTH_URL=http://localhost:3001 +LANGFUSE_NEXTAUTH_SECRET=nexent-langfuse-secret +LANGFUSE_SALT=nexent-langfuse-salt +LANGFUSE_ENCRYPTION_KEY=0000000000000000000000000000000000000000000000000000000000000000 +LANGFUSE_TELEMETRY_ENABLED=false +LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES=false +LANGFUSE_INIT_ORG_ID=nexent +LANGFUSE_INIT_ORG_NAME=Nexent +LANGFUSE_INIT_PROJECT_ID=nexent +LANGFUSE_INIT_PROJECT_NAME=Nexent +LANGFUSE_INIT_PROJECT_PUBLIC_KEY=pk-lf-nexent-local +LANGFUSE_INIT_PROJECT_SECRET_KEY=sk-lf-nexent-local +LANGFUSE_INIT_USER_EMAIL=admin@nexent.com +LANGFUSE_INIT_USER_NAME=admin +LANGFUSE_INIT_USER_PASSWORD=nexent@4321 +LANGFUSE_OTLP_AUTH_HEADER= +LANGFUSE_POSTGRES_VERSION=15-alpine +LANGFUSE_POSTGRES_USER=postgres +LANGFUSE_POSTGRES_PASSWORD=nexent@4321 +LANGFUSE_POSTGRES_DB=postgres +LANGFUSE_POSTGRES_PORT=5440 +LANGFUSE_CLICKHOUSE_VERSION=26.3-alpine +LANGFUSE_CLICKHOUSE_USER=clickhouse +LANGFUSE_CLICKHOUSE_PASSWORD=clickhouse +LANGFUSE_CLICKHOUSE_HTTP_PORT=8124 +LANGFUSE_CLICKHOUSE_NATIVE_PORT=9002 +LANGFUSE_MINIO_VERSION=RELEASE.2023-12-20T01-00-02Z +LANGFUSE_MINIO_ROOT_USER=minio +LANGFUSE_MINIO_ROOT_PASSWORD=miniosecret +LANGFUSE_MINIO_API_PORT=9092 +LANGFUSE_MINIO_CONSOLE_PORT=9093 +LANGFUSE_S3_BUCKET=langfuse +LANGFUSE_REDIS_AUTH=myredissecret +LANGFUSE_REDIS_VERSION=alpine +LANGFUSE_REDIS_PORT=6380 + +# Online LangSmith forwarding. Used by: ./start-monitoring.sh --stack langsmith +# LangSmith currently ingests OTLP traces. Metrics remain in the Collector debug pipeline. +LANGSMITH_API_KEY= +LANGSMITH_PROJECT=nexent +LANGSMITH_OTLP_TRACES_ENDPOINT=https://api.smith.langchain.com/otel/v1/traces + +# Local Grafana stack. 
Used by: ./start-monitoring.sh --stack grafana +GRAFANA_VERSION=12.4 +GRAFANA_PORT=3002 +GRAFANA_ADMIN_USER=admin +GRAFANA_ADMIN_PASSWORD=nexent@4321 +GRAFANA_DEFAULT_LANGUAGE=zh-Hans +TEMPO_VERSION=2.10.5 +TEMPO_PORT=3200 +# Local Zipkin stack. Used by: ./start-monitoring.sh --stack zipkin +ZIPKIN_VERSION=latest +ZIPKIN_PORT=9411 diff --git a/docker/monitoring/otel-collector-config.yml b/docker/monitoring/otel-collector-config.yml index f14f427b5..8db36ba67 100644 --- a/docker/monitoring/otel-collector-config.yml +++ b/docker/monitoring/otel-collector-config.yml @@ -5,22 +5,16 @@ receivers: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 - - # Prometheus receiver to collect metrics from instrumented apps - prometheus: - config: - scrape_configs: - - job_name: 'nexent-backend-otel' - static_configs: - - targets: ['host.docker.internal:8000'] - scrape_interval: 5s processors: batch: timeout: 1s send_batch_size: 512 - - # Resource processor to add common attributes + + memory_limiter: + limit_mib: 256 + check_interval: 1s + resource: attributes: - key: service.name @@ -30,51 +24,72 @@ processors: from_attribute: version action: insert - # Memory limiter to prevent OOM - memory_limiter: - limit_mib: 256 - check_interval: 1s - - # Add attributes specifically for LLM monitoring - attributes: - actions: - - key: llm.system - value: openai - action: insert - - key: deployment.environment - value: development - action: insert - exporters: - # Export traces to Jaeger via OTLP - otlp/jaeger: - endpoint: jaeger:14250 - tls: - insecure: true - - # Export metrics to Prometheus - prometheus: - endpoint: "0.0.0.0:8889" - resource_to_telemetry_conversion: - enabled: true - - # Logging exporter for debugging - logging: + debug: verbosity: normal service: - extensions: [] pipelines: traces: receivers: [otlp] processors: [memory_limiter, resource, batch] - exporters: [otlp/jaeger, logging] - + exporters: [debug] + metrics: - receivers: [otlp, prometheus] - processors: 
[memory_limiter, resource, attributes, batch] - exporters: [prometheus, logging] - + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [debug] + telemetry: logs: level: "info" + +# Example configurations for AI observability platforms: +# +# === Arize Phoenix === +# Set environment variables: +# OTEL_EXPORTER_OTLP_ENDPOINT=https://app.phoenix.arize.com/s/YOUR_SPACE +# OTEL_EXPORTER_OTLP_AUTHORIZATION=Bearer YOUR_PHOENIX_API_KEY +# OTEL_EXPORTER_OTLP_METRICS_ENABLED=false +# +# Or configure directly in exporters section: +# otlphttp/arize: +# endpoint: https://app.phoenix.arize.com/s/YOUR_SPACE +# headers: +# Authorization: Bearer YOUR_PHOENIX_API_KEY +# Then add otlphttp/arize to the traces pipeline exporters. +# +# === Langfuse === +# Set environment variables: +# OTEL_EXPORTER_OTLP_ENDPOINT=https://cloud.langfuse.com/api/public/otel +# OTEL_EXPORTER_OTLP_AUTHORIZATION=Basic BASE64_ENCODED_KEY +# OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION=4 +# +# Where BASE64_ENCODED_KEY = base64(public_key:secret_key) +# +# Or configure directly: +# otlphttp/langfuse: +# endpoint: https://cloud.langfuse.com/api/public/otel +# headers: +# Authorization: Basic BASE64_ENCODED_KEY +# x-langfuse-ingestion-version: "4" +# Then add otlphttp/langfuse to the traces pipeline exporters. +# +# === LangSmith === +# Set environment variables: +# MONITORING_PROVIDER=langsmith +# LANGSMITH_API_KEY=lsv2_... +# LANGSMITH_PROJECT=nexent +# +# Or configure directly: +# otlphttp/langsmith: +# traces_endpoint: https://api.smith.langchain.com/otel/v1/traces +# headers: +# x-api-key: YOUR_LANGSMITH_API_KEY +# Langsmith-Project: nexent +# Then add otlphttp/langsmith to the traces pipeline exporters. 
+# +# === Multiple Exporters === +# To export to multiple backends simultaneously, create multiple exporters +# and add them to the pipelines: +# exporters: [otlphttp/arize, otlphttp/langfuse, otlphttp/langsmith, debug] diff --git a/docker/monitoring/otel-collector-grafana-config.yml b/docker/monitoring/otel-collector-grafana-config.yml new file mode 100644 index 000000000..d69e69811 --- /dev/null +++ b/docker/monitoring/otel-collector-grafana-config.yml @@ -0,0 +1,50 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 1s + send_batch_size: 512 + + memory_limiter: + limit_mib: 256 + check_interval: 1s + + resource: + attributes: + - key: service.name + value: nexent-backend + action: upsert + - key: service.version + from_attribute: version + action: insert + +exporters: + debug: + verbosity: normal + + otlp/tempo: + endpoint: tempo:4317 + tls: + insecure: true + +service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [otlp/tempo, debug] + + metrics: + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [debug] + + telemetry: + logs: + level: "info" diff --git a/docker/monitoring/otel-collector-langfuse-config.yml b/docker/monitoring/otel-collector-langfuse-config.yml new file mode 100644 index 000000000..9304d93e9 --- /dev/null +++ b/docker/monitoring/otel-collector-langfuse-config.yml @@ -0,0 +1,69 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 1s + send_batch_size: 512 + + memory_limiter: + limit_mib: 256 + check_interval: 1s + + resource: + attributes: + - key: service.name + value: nexent-backend + action: upsert + - key: service.version + from_attribute: version + action: insert + +exporters: + debug: + verbosity: normal + + otlphttp/langfuse: + endpoint: http://langfuse-web:3000/api/public/otel + 
headers: + Authorization: ${env:LANGFUSE_OTLP_AUTH_HEADER} + x-langfuse-ingestion-version: "4" + # 1. Timeout + # Keep the Collector from waiting too long, which would cause goroutines to pile up + timeout: 5s + + # 2. Sending queue + # Buffer data in Collector memory when the backend slows down + sending_queue: + enabled: true + num_consumers: 10 # Number of concurrent sender workers (raises send throughput) + queue_size: 5000 # Maximum number of batches the queue can hold; new data is dropped once it is full! + + # 3. Retry on failure + # Retry with exponential backoff on transient errors such as network jitter or a 503 from the backend + retry_on_failure: + enabled: true + initial_interval: 1s # First retry after 1s + max_interval: 30s # Retry interval is capped at 30s + max_elapsed_time: 300s # Retry an item for at most 5 minutes, then give up and drop it + +service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [otlphttp/langfuse, debug] + + metrics: + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [debug] + + telemetry: + logs: + level: "info" diff --git a/docker/monitoring/otel-collector-langsmith-config.yml b/docker/monitoring/otel-collector-langsmith-config.yml new file mode 100644 index 000000000..28222c1cf --- /dev/null +++ b/docker/monitoring/otel-collector-langsmith-config.yml @@ -0,0 +1,63 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 1s + send_batch_size: 512 + + memory_limiter: + limit_mib: 256 + check_interval: 1s + + resource: + attributes: + - key: service.name + value: nexent-backend + action: upsert + - key: service.version + from_attribute: version + action: insert + +exporters: + debug: + verbosity: normal + + otlphttp/langsmith: + traces_endpoint: ${env:LANGSMITH_OTLP_TRACES_ENDPOINT} + headers: + x-api-key: ${env:LANGSMITH_API_KEY} + Langsmith-Project: ${env:LANGSMITH_PROJECT} + timeout: 10s + + sending_queue: + enabled: true + num_consumers: 10 + queue_size: 5000 + + retry_on_failure: + enabled: true + initial_interval: 1s + max_interval: 30s + max_elapsed_time: 300s + +service: + pipelines: + traces: + receivers: [otlp] + processors: 
[memory_limiter, resource, batch] + exporters: [otlphttp/langsmith, debug] + + metrics: + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [debug] + + telemetry: + logs: + level: "info" diff --git a/docker/monitoring/otel-collector-phoenix-config.yml b/docker/monitoring/otel-collector-phoenix-config.yml new file mode 100644 index 000000000..0682a6e4d --- /dev/null +++ b/docker/monitoring/otel-collector-phoenix-config.yml @@ -0,0 +1,66 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 1s + send_batch_size: 512 + + memory_limiter: + limit_mib: 256 + check_interval: 1s + + resource: + attributes: + - key: service.name + value: nexent-backend + action: upsert + - key: service.version + from_attribute: version + action: insert + +exporters: + debug: + verbosity: normal + + otlphttp/phoenix: + endpoint: http://phoenix:6006 + # 1. Timeout + # Keep the Collector from waiting too long, which would cause goroutines to pile up + timeout: 5s + + # 2. Sending queue + # Buffer data in Collector memory when the backend slows down + sending_queue: + enabled: true + num_consumers: 10 # Number of concurrent sender workers (raises send throughput) + queue_size: 5000 # Maximum number of batches the queue can hold; new data is dropped once it is full! + + # 3. 
Retry on failure + # Retry with exponential backoff on transient errors such as network jitter or a 503 from the backend + retry_on_failure: + enabled: true + initial_interval: 1s # First retry after 1s + max_interval: 30s # Retry interval is capped at 30s + max_elapsed_time: 300s # Retry an item for at most 5 minutes, then give up and drop it + +service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [otlphttp/phoenix, debug] + + metrics: + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [debug] + + telemetry: + logs: + level: "info" diff --git a/docker/monitoring/otel-collector-zipkin-config.yml b/docker/monitoring/otel-collector-zipkin-config.yml new file mode 100644 index 000000000..ab26a84a9 --- /dev/null +++ b/docker/monitoring/otel-collector-zipkin-config.yml @@ -0,0 +1,49 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 1s + send_batch_size: 512 + + memory_limiter: + limit_mib: 256 + check_interval: 1s + + resource: + attributes: + - key: service.name + value: nexent-backend + action: upsert + - key: service.version + from_attribute: version + action: insert + +exporters: + debug: + verbosity: normal + + zipkin: + endpoint: http://zipkin:9411/api/v2/spans + format: proto + +service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [zipkin, debug] + + metrics: + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [debug] + + telemetry: + logs: + level: "info" \ No newline at end of file diff --git a/docker/monitoring/prometheus.yml b/docker/monitoring/prometheus.yml deleted file mode 100644 index 49258c097..000000000 --- a/docker/monitoring/prometheus.yml +++ /dev/null @@ -1,39 +0,0 @@ -global: - scrape_interval: 15s - evaluation_interval: 15s - -rule_files: - # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. 
- - "nexent_alerts.yml" - -scrape_configs: - # Nexent Backend - LLM Metrics - - job_name: 'nexent-backend' - static_configs: - - targets: ['host.docker.internal:8000'] # Adjust based on your backend service - scrape_interval: 15s - metrics_path: /metrics - scrape_timeout: 10s - - # OpenTelemetry Collector - - job_name: 'otel-collector' - static_configs: - - targets: ['otel-collector:8888'] - scrape_interval: 10s - - # Prometheus self-monitoring - - job_name: 'prometheus' - static_configs: - - targets: ['localhost:9090'] - - # Jaeger Metrics - - job_name: 'jaeger' - static_configs: - - targets: ['jaeger:14269'] - -# Alertmanager configuration (optional) -# alerting: -# alertmanagers: -# - static_configs: -# - targets: -# - alertmanager:9093 diff --git a/docker/monitoring/tempo.yml b/docker/monitoring/tempo.yml new file mode 100644 index 000000000..414ea42b9 --- /dev/null +++ b/docker/monitoring/tempo.yml @@ -0,0 +1,43 @@ +target: all +multitenancy_enabled: false +stream_over_http_enabled: true + +server: + http_listen_port: 3200 + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +metrics_generator: + ring: + kvstore: + store: inmemory + storage: + path: /var/tempo/generator/wal + remote_write: [] + traces_storage: + path: /var/tempo/generator/traces + processor: + local_blocks: + filter_server_spans: false + flush_to_storage: true + +storage: + trace: + backend: local + wal: + path: /var/tempo/wal + local: + path: /var/tempo/blocks + +overrides: + defaults: + metrics_generator: + processors: + - local-blocks diff --git a/docker/start-monitoring.sh b/docker/start-monitoring.sh index 8cd8561f0..8791e5c2f 100755 --- a/docker/start-monitoring.sh +++ b/docker/start-monitoring.sh @@ -1,12 +1,61 @@ #!/bin/bash # Nexent LLM Performance Monitoring Setup Script -# This script sets up OpenTelemetry + Jaeger + Prometheus + Grafana for monitoring +# This script starts the OpenTelemetry Collector alone, or with a 
local +# Phoenix/Langfuse/Grafana/Zipkin observability backend, or forward to +# online LangSmith. set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" MONITORING_DIR="$SCRIPT_DIR/monitoring" +COMPOSE_FILE="$SCRIPT_DIR/docker-compose-monitoring.yml" + +usage() { + cat <<EOF +Usage: ./start-monitoring.sh [--stack] <stack> + +Stacks: + otlp Start OpenTelemetry Collector only. This is the default. + collector Alias for otlp. + phoenix Start Collector and local Arize Phoenix. + langfuse Start Collector and local Langfuse self-host stack. + langsmith Start Collector and forward traces to online LangSmith. + grafana Start Collector, Grafana, and Tempo. + zipkin Start Collector and local Zipkin. + +Set MONITORING_PROVIDER in monitoring/monitoring.env to change the default. +EOF +} + +STACK_ARG="" +while [ $# -gt 0 ]; do + case "$1" in + --stack) + if [ $# -lt 2 ]; then + echo "❌ Error: --stack requires a value." + usage + exit 1 + fi + STACK_ARG="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + otlp|collector|phoenix|langfuse|langsmith|grafana|zipkin) + STACK_ARG="$1" + shift + ;; + *) + echo "❌ Error: unknown argument '$1'." + usage + exit 1 + ;; + esac +done echo "🚀 Starting Nexent LLM Performance Monitoring Setup..." @@ -17,11 +66,11 @@ if ! docker info > /dev/null 2>&1; then fi # Create external network if it doesn't exist -if ! docker network ls | grep -q nexent-network; then - echo "🔗 Creating nexent-network..." - docker network create nexent-network +if ! docker network ls | grep -q nexent_nexent; then + echo "🔗 Creating nexent_nexent..." + docker network create nexent_nexent else - echo "✅ nexent-network already exists" + echo "✅ nexent_nexent already exists" fi # Copy environment file if it doesn't exist @@ -31,9 +80,84 @@ if [ ! -f "$MONITORING_DIR/monitoring.env" ]; then echo "⚠️ Please review and update $MONITORING_DIR/monitoring.env as needed" fi +# Load deployment options. Keep values shell-compatible in monitoring.env. +set -a +# shellcheck disable=SC1091 +. 
"$MONITORING_DIR/monitoring.env" +set +a + +MONITORING_PROVIDER="${STACK_ARG:-${MONITORING_PROVIDER:-otlp}}" +case "$MONITORING_PROVIDER" in + collector|otlp) + LOCAL_STACK="collector" + BACKEND_MONITORING_PROVIDER="otlp" + OTEL_COLLECTOR_CONFIG_FILE="${OTEL_COLLECTOR_CONFIG_FILE:-./monitoring/otel-collector-config.yml}" + COMPOSE_PROFILES=() + ;; + phoenix) + LOCAL_STACK="phoenix" + BACKEND_MONITORING_PROVIDER="phoenix" + OTEL_COLLECTOR_CONFIG_FILE="${OTEL_COLLECTOR_CONFIG_FILE:-./monitoring/otel-collector-phoenix-config.yml}" + COMPOSE_PROFILES=(--profile phoenix) + ;; + langfuse) + LOCAL_STACK="langfuse" + BACKEND_MONITORING_PROVIDER="langfuse" + OTEL_COLLECTOR_CONFIG_FILE="${OTEL_COLLECTOR_CONFIG_FILE:-./monitoring/otel-collector-langfuse-config.yml}" + COMPOSE_PROFILES=(--profile langfuse) + LANGFUSE_INIT_PROJECT_PUBLIC_KEY="${LANGFUSE_INIT_PROJECT_PUBLIC_KEY:-pk-lf-nexent-local}" + LANGFUSE_INIT_PROJECT_SECRET_KEY="${LANGFUSE_INIT_PROJECT_SECRET_KEY:-sk-lf-nexent-local}" + if [ -z "${LANGFUSE_OTLP_AUTH_HEADER:-}" ]; then + LANGFUSE_OTLP_AUTH_HEADER="Basic $(printf "%s:%s" "$LANGFUSE_INIT_PROJECT_PUBLIC_KEY" "$LANGFUSE_INIT_PROJECT_SECRET_KEY" | base64 | tr -d '\n')" + fi + export LANGFUSE_OTLP_AUTH_HEADER + ;; + langsmith) + LOCAL_STACK="langsmith" + BACKEND_MONITORING_PROVIDER="langsmith" + OTEL_COLLECTOR_CONFIG_FILE="${OTEL_COLLECTOR_CONFIG_FILE:-./monitoring/otel-collector-langsmith-config.yml}" + COMPOSE_PROFILES=() + LANGSMITH_OTLP_TRACES_ENDPOINT="${LANGSMITH_OTLP_TRACES_ENDPOINT:-https://api.smith.langchain.com/otel/v1/traces}" + LANGSMITH_PROJECT="${LANGSMITH_PROJECT:-${MONITORING_PROJECT_NAME:-nexent}}" + if [ -z "${LANGSMITH_API_KEY:-}" ]; then + echo "❌ Error: LANGSMITH_API_KEY is required for the langsmith stack." + echo " Set it in $MONITORING_DIR/monitoring.env or export it before running this script." 
+ exit 1 + fi + export LANGSMITH_API_KEY LANGSMITH_PROJECT LANGSMITH_OTLP_TRACES_ENDPOINT + ;; + grafana) + LOCAL_STACK="grafana" + BACKEND_MONITORING_PROVIDER="grafana" + OTEL_COLLECTOR_CONFIG_FILE="${OTEL_COLLECTOR_CONFIG_FILE:-./monitoring/otel-collector-grafana-config.yml}" + COMPOSE_PROFILES=(--profile grafana) + ;; + zipkin) + LOCAL_STACK="zipkin" + BACKEND_MONITORING_PROVIDER="zipkin" + OTEL_COLLECTOR_CONFIG_FILE="${OTEL_COLLECTOR_CONFIG_FILE:-./monitoring/otel-collector-zipkin-config.yml}" + COMPOSE_PROFILES=(--profile zipkin) + ;; + *) + echo "❌ Error: unsupported MONITORING_PROVIDER '$MONITORING_PROVIDER'." + usage + exit 1 + ;; +esac +export OTEL_COLLECTOR_CONFIG_FILE + +if docker compose version > /dev/null 2>&1; then + COMPOSE_CMD=(docker compose) +elif command -v docker-compose > /dev/null 2>&1; then + COMPOSE_CMD=(docker-compose) +else + echo "❌ Error: Docker Compose is not installed." + exit 1 +fi + # Start monitoring services -echo "🐳 Starting monitoring services..." -docker-compose -f "$SCRIPT_DIR/docker-compose-monitoring.yml" --env-file "$MONITORING_DIR/monitoring.env" up -d +echo "🐳 Starting monitoring services with provider: $MONITORING_PROVIDER" +"${COMPOSE_CMD[@]}" -f "$COMPOSE_FILE" --env-file "$MONITORING_DIR/monitoring.env" "${COMPOSE_PROFILES[@]}" up -d --remove-orphans # Wait for services to be ready echo "⏳ Waiting for services to start..." 
@@ -47,7 +171,7 @@ check_service() { local name=$1 local url=$2 local port=$3 - + if curl -s --max-time 5 --connect-timeout 3 "$url" > /dev/null 2>&1; then echo "✅ $name is running at http://localhost:$port" return 0 @@ -57,33 +181,76 @@ check_service() { fi } -# Check Jaeger -check_service "Jaeger" "http://localhost:16686/api/services" "16686" || true - -# Check Prometheus -check_service "Prometheus" "http://localhost:9090/-/healthy" "9090" || true +# Check OpenTelemetry Collector HTTP receiver +check_service "OpenTelemetry Collector HTTP receiver" "http://localhost:${OTEL_COLLECTOR_HTTP_PORT:-4318}" "${OTEL_COLLECTOR_HTTP_PORT:-4318}" || true -# Check Grafana -check_service "Grafana" "http://localhost:3005/api/health" "3005" || true +case "$LOCAL_STACK" in + phoenix) + check_service "Phoenix UI" "http://localhost:${PHOENIX_PORT:-6006}" "${PHOENIX_PORT:-6006}" || true + ;; + langfuse) + check_service "Langfuse UI" "http://localhost:${LANGFUSE_PORT:-3001}" "${LANGFUSE_PORT:-3001}" || true + ;; + langsmith) + echo "✅ LangSmith forwarding is configured for project: ${LANGSMITH_PROJECT:-nexent}" + ;; + grafana) + check_service "Grafana" "http://localhost:${GRAFANA_PORT:-3002}/api/health" "${GRAFANA_PORT:-3002}" || true + check_service "Tempo API" "http://localhost:${TEMPO_PORT:-3200}/ready" "${TEMPO_PORT:-3200}" || true + ;; + skywalking) + check_service "SkyWalking UI" "http://localhost:${SKYWALKING_UI_PORT:-8080}" "${SKYWALKING_UI_PORT:-8080}" || true + check_service "SkyWalking OAP HTTP API" "http://localhost:${SKYWALKING_OAP_HTTP_PORT:-12800}" "${SKYWALKING_OAP_HTTP_PORT:-12800}" || true + ;; +esac echo "" echo "🎉 Monitoring setup complete!" 
echo "" echo "📊 Access your monitoring tools:" -echo " • Jaeger UI: http://localhost:16686" -echo " • Prometheus: http://localhost:9090" -echo " • Grafana: http://localhost:3005 (admin/admin)" +echo " • OTLP HTTP receiver: http://localhost:${OTEL_COLLECTOR_HTTP_PORT:-4318}" +echo " • OTLP gRPC receiver: localhost:${OTEL_COLLECTOR_GRPC_PORT:-4317}" +case "$LOCAL_STACK" in + phoenix) + echo " • Phoenix UI: http://localhost:${PHOENIX_PORT:-6006}" + ;; + langfuse) + echo " • Langfuse UI: http://localhost:${LANGFUSE_PORT:-3001}" + echo " • Langfuse admin: ${LANGFUSE_INIT_USER_EMAIL:-admin@nexent.local} / ${LANGFUSE_INIT_USER_PASSWORD:-nexent-langfuse-admin}" + ;; + langsmith) + echo " • LangSmith project: ${LANGSMITH_PROJECT:-nexent}" + echo " • LangSmith OTLP traces endpoint: ${LANGSMITH_OTLP_TRACES_ENDPOINT:-https://api.smith.langchain.com/otel/v1/traces}" + ;; + grafana) + echo " • Grafana UI: http://localhost:${GRAFANA_PORT:-3002}" + echo " • Grafana admin: ${GRAFANA_ADMIN_USER:-admin} / ${GRAFANA_ADMIN_PASSWORD:-nexent-grafana-admin}" + echo " • Tempo API: http://localhost:${TEMPO_PORT:-3200}" + ;; + skywalking) + echo " • SkyWalking UI: http://localhost:${SKYWALKING_UI_PORT:-8080}" + echo " • SkyWalking OAP HTTP API: http://localhost:${SKYWALKING_OAP_HTTP_PORT:-12800}" + echo " • SkyWalking OAP gRPC API: localhost:${SKYWALKING_OAP_GRPC_PORT:-11800}" + ;; + collector) + echo " • Configure Phoenix, Langfuse, LangSmith, Tempo, or another OTLP backend in monitoring.env" + ;; +esac echo "" echo "🔧 To enable monitoring in your Nexent backend:" echo " 1. Set ENABLE_TELEMETRY=true in your .env file" -echo " 2. Install performance dependencies:" +echo " 2. Set MONITORING_PROVIDER=$BACKEND_MONITORING_PROVIDER in your .env file" +echo " 3. Set OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 for Docker services" +echo " or http://localhost:${OTEL_COLLECTOR_HTTP_PORT:-4318} for a backend running on the host" +echo " 4. 
Install performance dependencies:" echo " uv sync --extra performance" -echo " 3. Restart your Nexent backend service" +echo " 5. Restart your Nexent backend service" echo "" -echo "📈 Key Metrics to Monitor:" -echo " • Token Generation Rate (tokens/second)" -echo " • Time to First Token (TTFT)" -echo " • Request Duration" -echo " • Error Rates" +echo "🔎 Key Trace Data to Inspect:" +echo " • Agent span hierarchy" +echo " • LLM generation spans" +echo " • Tool call spans" +echo " • Error events" echo "" -echo "🛑 To stop monitoring services: docker-compose -f docker-compose-monitoring.yml down" +echo "🛑 To stop monitoring services:" +echo " ${COMPOSE_CMD[*]} -f $COMPOSE_FILE --env-file $MONITORING_DIR/monitoring.env --profile phoenix --profile langfuse --profile grafana --profile skywalking down --remove-orphans" diff --git a/frontend/components/navigation/TopNavbar.tsx b/frontend/components/navigation/TopNavbar.tsx index 2fbeee744..b58e7549c 100644 --- a/frontend/components/navigation/TopNavbar.tsx +++ b/frontend/components/navigation/TopNavbar.tsx @@ -1,26 +1,91 @@ "use client"; -import { Button } from "antd"; +import { Button, Tooltip } from "antd"; import { AvatarDropdown } from "@/components/auth/avatarDropdown"; import { useTranslation } from "react-i18next"; -import { ChevronDown, Globe } from "lucide-react"; +import { Activity, ChevronDown, Globe } from "lucide-react"; import { Dropdown } from "antd"; import Link from "next/link"; import { HEADER_CONFIG, SIDER_CONFIG } from "@/const/layoutConstants"; import { languageOptions } from "@/const/constants"; import { useLanguageSwitch } from "@/lib/language"; -import React from "react"; +import React, { useEffect, useState } from "react"; import { Flex, Layout } from "antd"; import { ChatTopNavContent } from "./ChatTopNavContent"; import { useAuthorizationContext } from "../providers/AuthorizationProvider"; import { useDeployment } from "../providers/deploymentProvider"; +import { monitoringService } from 
"@/services/monitoringService"; +import type { MonitoringStatus } from "@/types/monitoring"; + const { Header } = Layout; +const MONITORING_PROVIDER_UI: Record<string, { port: string; path: string }> = { + phoenix: { port: "6006", path: "/" }, + langfuse: { port: "3001", path: "/project/nexent" }, + grafana: { + port: "3002", + path: "/d/nexent-llm-agent/nexent-agent-trace-monitoring?orgId=1", + }, + skywalking: { port: "8080", path: "/" }, +}; + +function buildMonitoringUrl(status: MonitoringStatus | null): string | null { + if (!status?.telemetry_enabled || typeof window === "undefined") return null; + + const providerConfig = status.provider + ? MONITORING_PROVIDER_UI[status.provider.toLowerCase()] + : null; + const dashboardPort = status.dashboard_port || providerConfig?.port; + + if (dashboardPort) { + const path = status.dashboard_path || providerConfig?.path || "/"; + const normalizedPath = path.startsWith("/") ? path : `/${path}`; + return `${window.location.protocol}//${window.location.hostname}:${dashboardPort}${normalizedPath}`; + } + + if (status.dashboard_url) { + try { + const url = new URL(status.dashboard_url); + if (["localhost", "127.0.0.1", "0.0.0.0"].includes(url.hostname)) { + url.hostname = window.location.hostname; + } + return url.toString(); + } catch { + return status.dashboard_url; + } + } + + return null; +} + export function TopNavbar({ isChatPage }: { isChatPage: boolean }) { const { t } = useTranslation("common"); const { user, isLoading } = useAuthorizationContext(); - const { isSpeedMode } = useDeployment() + const { isSpeedMode } = useDeployment(); const { currentLanguage, handleLanguageChange } = useLanguageSwitch(); + const [monitoringStatus, setMonitoringStatus] = + useState<MonitoringStatus | null>(null); + + useEffect(() => { + let mounted = true; + + monitoringService.fetchStatus().then((status) => { + if (mounted) { + setMonitoringStatus(status); + } + }); + + return () => { + mounted = false; + }; + }, []); + + const monitoringUrl = buildMonitoringUrl(monitoringStatus); + + const 
openMonitoringDashboard = () => { + if (!monitoringUrl) return; + window.open(monitoringUrl, "_blank", "noopener,noreferrer"); + }; // Left content - Logo + optional additional title (aligned with sidebar width) const leftContent = ( @@ -61,6 +126,18 @@ export function TopNavbar({ isChatPage }: { isChatPage: boolean }) { // Right content - Additional content + default navigation items const rightContent = ( + {monitoringUrl && ( + +