apecloud
diff --git a/AGENTS.md b/AGENTS.md
Lines changed: 16 additions & 0 deletions
diff --git a/Makefile b/Makefile
Lines changed: 3 additions & 11 deletions
diff --git a/README-zh.md b/README-zh.md
Lines changed: 1 addition & 0 deletions
diff --git a/aperag/app.py b/aperag/app.py
Lines changed: 34 additions & 14 deletions
diff --git a/aperag/config.py b/aperag/config.py
Lines changed: 12 additions & 4 deletions
diff --git a/aperag/domains/retrieval/pipeline.py b/aperag/domains/retrieval/pipeline.py
Lines changed: 88 additions & 76 deletions
@@ -0,0 +1,16 @@
+# Agent Guide
+
+## Observability
+
+ApeRAG's observability entrypoint is `aperag.observability`.
+
+- Default mode is `APERAG_OBSERVABILITY_MODE=local`: no extra observability service is required.
+- Logs should stay structured JSON and include trace/span correlation fields.
+- When a deployment needs a backend or collector, export telemetry only through OTLP (configured via `OTEL_EXPORTER_OTLP_ENDPOINT`).
+- Do not add backend-specific exporters or deployment profiles for tracing systems.
+- Do not log prompts, document bodies, API keys, cookies, authorization headers, database passwords, or raw LLM responses.
+- New business instrumentation should use stable, low-cardinality names and attributes.
+
+Read the full design before changing observability behavior:
+
+- `docs/zh-CN/deployment/observability.md`
@@ -123,11 +123,8 @@ db-check:
 #   make stack-up                                # Full application
 #   make stack-up WITH_NEO4J=1                   # Full application + Neo4j
 #   make stack-up WITH_NEBULA=1                  # Full application + Nebula Graph
-#   make stack-up WITH_JAEGER=1                  # Full application + Jaeger
-#   make stack-up WITH_JAEGER=1 WITH_NEO4J=1     # Full application + Jaeger + Neo4j
 #   make infra-up                                # Infrastructure only (databases)
 #   make infra-up WITH_NEO4J=1                   # Infrastructure + Neo4j
-#   make infra-up WITH_JAEGER=1                  # Infrastructure + Jaeger
 #   make stack-down                              # Stop all services
 #   make stack-down REMOVE_VOLUMES=1             # Stop and remove volumes
 _PROFILES_TO_ACTIVATE :=
@@ -143,10 +140,6 @@ ifeq ($(WITH_NEBULA),1)
     _PROFILES_TO_ACTIVATE += --profile nebula
 endif
 
-ifeq ($(WITH_JAEGER),1)
-    _PROFILES_TO_ACTIVATE += --profile jaeger
-endif
-
 # Determine flags for 'compose-down'
 ifeq ($(REMOVE_VOLUMES),1)
     _COMPOSE_DOWN_FLAGS += -v
@@ -158,18 +151,17 @@ stack-up:
 	$(_EXTRA_ENVS) docker-compose $(_PROFILES_TO_ACTIVATE) -f docker-compose.yml up -d
 
 # Infrastructure only (databases + supporting services)
-# Optional services like Neo4j, Nebula, and Jaeger will ONLY start if explicitly enabled:
+# Optional services like Neo4j and Nebula will ONLY start if explicitly enabled:
 #   make infra-up WITH_NEO4J=1    # adds Neo4j
 #   make infra-up WITH_NEBULA=1   # adds Nebula Graph
-#   make infra-up WITH_JAEGER=1   # adds Jaeger
 infra-up:
 	docker-compose $(_PROFILES_TO_ACTIVATE) -f docker-compose.yml up -d \
-		postgres redis qdrant es jaeger \
+		postgres redis qdrant es \
 		$(if $(filter 1,$(WITH_NEO4J)),neo4j,) \
 		$(if $(filter 1,$(WITH_NEBULA)),nebula-metad nebula-storaged nebula-graphd nebula-storage-activator,)
 
 stack-down:
-	docker-compose --profile neo4j --profile nebula --profile jaeger -f docker-compose.yml down $(_COMPOSE_DOWN_FLAGS)
+	docker-compose --profile neo4j --profile nebula -f docker-compose.yml down $(_COMPOSE_DOWN_FLAGS)
 
 stack-logs:
 	docker-compose -f docker-compose.yml logs -f
 
@@ -15,6 +15,7 @@ ApeRAG 是你构建自己的知识图谱、进行上下文工程以及部署能
 - [Kubernetes 部署（推荐生产环境）](#kubernetes-部署推荐生产环境)
 - [开发指南](./docs/zh-CN/development-guide.md)
 - [构建 Docker 镜像](./docs/zh-CN/build-docker-image.md)
+- [可观测性设计](./docs/zh-CN/deployment/observability.md)
 - [致谢](#致谢)
 - [许可证](#许可证)
 
 
@@ -1,3 +1,4 @@
+# ruff: noqa: E402
 # Copyright 2025 ApeCloud, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,23 +14,22 @@
 # limitations under the License.
 
 from aperag.config import settings
+from aperag.observability import (
+    bind_observability_context,
+    build_observability_config,
+    configure_fastapi,
+    configure_logging,
+    configure_process_observability,
+    reset_observability_context,
+)
+from aperag.observability.tracing import inject_carrier
 
-# Initialize OpenTelemetry FIRST - before any other imports
-from aperag.trace import init_tracing
-
-# Initialize tracing with configuration
-if settings.otel_enabled:
-    init_tracing(
-        service_name=settings.otel_service_name,
-        service_version=settings.otel_service_version,
-        jaeger_endpoint=settings.jaeger_endpoint if settings.jaeger_enabled else None,
-        enable_console=settings.otel_console_enabled,
-        enable_fastapi=settings.otel_fastapi_enabled,
-        enable_sqlalchemy=settings.otel_sqlalchemy_enabled,
-        enable_mcp=settings.otel_mcp_enabled,
-    )
+observability_config = build_observability_config(settings)
+configure_logging(observability_config)
+configure_process_observability(observability_config)
 
 from fastapi import FastAPI  # noqa: E402
+from starlette.middleware.base import BaseHTTPMiddleware  # noqa: E402
 
 from aperag.domains.agent_runtime.api.routes import router as agent_runtime_router
 from aperag.domains.agent_runtime.runtime import set_prompt_template_ops as _ar_set_prompt_template_ops
@@ -219,6 +219,26 @@ async def combined_lifespan(app: FastAPI):
     generate_unique_id_function=custom_generate_unique_id,
 )
 
+
+class ObservabilityContextMiddleware(BaseHTTPMiddleware):
+    async def dispatch(self, request, call_next):
+        request_id = request.headers.get("x-request-id") or request.headers.get("x-correlation-id")
+        tokens = bind_observability_context(request_id=request_id)
+        try:
+            response = await call_next(request)
+            if request_id:
+                response.headers["x-request-id"] = request_id
+            trace_headers = inject_carrier({})
+            if "traceparent" in trace_headers:
+                response.headers["traceparent"] = trace_headers["traceparent"]
+            return response
+        finally:
+            reset_observability_context(tokens)
+
+
+app.add_middleware(ObservabilityContextMiddleware)
+configure_fastapi(app, observability_config)
+
 # Register global exception handlers
 register_exception_handlers(app)
 
 
@@ -196,12 +196,20 @@ class Config(BaseSettings):
     cache_enabled: bool = Field(True, alias="CACHE_ENABLED")
     cache_ttl: int = Field(86400, alias="CACHE_TTL")
 
-    # OpenTelemetry/Jaeger Tracing
-    otel_enabled: bool = Field(True, alias="OTEL_ENABLED")
+    # Observability
+    aperag_observability_mode: str = Field("local", alias="APERAG_OBSERVABILITY_MODE")
+    aperag_observability_log_format: str = Field("json", alias="APERAG_OBSERVABILITY_LOG_FORMAT")
+    aperag_observability_capture_content: bool = Field(False, alias="APERAG_OBSERVABILITY_CAPTURE_CONTENT")
+    aperag_observability_sample_ratio: float = Field(1.0, alias="APERAG_OBSERVABILITY_SAMPLE_RATIO")
+    otel_enabled: Optional[str] = Field(None, alias="OTEL_ENABLED")
     otel_service_name: str = Field("aperag", alias="OTEL_SERVICE_NAME")
     otel_service_version: str = Field("1.0.0", alias="OTEL_SERVICE_VERSION")
-    jaeger_enabled: bool = Field(False, alias="JAEGER_ENABLED")
-    jaeger_endpoint: Optional[str] = Field(None, alias="JAEGER_ENDPOINT")
+    otel_environment: str = Field("development", alias="OTEL_ENVIRONMENT")
+    otel_resource_attributes: Optional[str] = Field(None, alias="OTEL_RESOURCE_ATTRIBUTES")
+    otel_exporter_otlp_endpoint: Optional[str] = Field(None, alias="OTEL_EXPORTER_OTLP_ENDPOINT")
+    otel_exporter_otlp_headers: Optional[str] = Field(None, alias="OTEL_EXPORTER_OTLP_HEADERS")
+    otel_exporter_otlp_protocol: str = Field("http/protobuf", alias="OTEL_EXPORTER_OTLP_PROTOCOL")
+
     otel_console_enabled: bool = Field(False, alias="OTEL_CONSOLE_ENABLED")
     otel_fastapi_enabled: bool = Field(True, alias="OTEL_FASTAPI_ENABLED")
     otel_sqlalchemy_enabled: bool = Field(True, alias="OTEL_SQLALCHEMY_ENABLED")
 
@@ -50,6 +50,7 @@
     ProviderNotFoundError,
     RerankError,
 )
+from aperag.observability import start_span
 from aperag.platform.query.query import DocumentWithScore
 from aperag.schema.utils import parseCollectionConfig
 from aperag.utils.utils import generate_fulltext_index_name, generate_vector_db_collection_name
@@ -142,92 +143,103 @@ async def execute_search(
         search_user_id: str,
         chat_id: Optional[str] = None,
     ) -> Tuple[List[SearchResultItem], str]:
-        query = (data.query or "").strip()
-        if not query:
-            raise ValidationException("query is required")
-
-        recall_tasks = []
-        collection = await async_db_ops.query_collection(search_user_id, collection_id)
-        if not collection:
-            raise ValidationException(f"collection not found: {collection_id}")
-
-        if data.vector_search:
-            recall_tasks.append(
-                self._vector_search(
-                    collection=collection,
-                    query=query,
-                    top_k=data.vector_search.topk,
-                    similarity_threshold=data.vector_search.similarity,
-                    chat_id=chat_id,
+        with start_span(
+            "retrieval.search",
+            tracer_name=__name__,
+            **{
+                "aperag.domain": "retrieval",
+                "aperag.operation": "retrieval.search",
+                "aperag.collection.id": collection_id,
+                "aperag.user.id": search_user_id,
+                "aperag.chat.id": chat_id,
+            },
+        ):
+            query = (data.query or "").strip()
+            if not query:
+                raise ValidationException("query is required")
+
+            recall_tasks = []
+            collection = await async_db_ops.query_collection(search_user_id, collection_id)
+            if not collection:
+                raise ValidationException(f"collection not found: {collection_id}")
+
+            if data.vector_search:
+                recall_tasks.append(
+                    self._vector_search(
+                        collection=collection,
+                        query=query,
+                        top_k=data.vector_search.topk,
+                        similarity_threshold=data.vector_search.similarity,
+                        chat_id=chat_id,
+                    )
                 )
-            )
-        if data.fulltext_search:
-            recall_tasks.append(
-                self._fulltext_search(
-                    collection=collection,
-                    query=query,
-                    top_k=data.fulltext_search.topk,
-                    keywords=data.fulltext_search.keywords,
-                    user_id=search_user_id,
-                    chat_id=chat_id,
+            if data.fulltext_search:
+                recall_tasks.append(
+                    self._fulltext_search(
+                        collection=collection,
+                        query=query,
+                        top_k=data.fulltext_search.topk,
+                        keywords=data.fulltext_search.keywords,
+                        user_id=search_user_id,
+                        chat_id=chat_id,
+                    )
                 )
-            )
-        if data.graph_search:
-            recall_tasks.append(
-                self._graph_search(
-                    collection=collection,
-                    query=query,
-                    top_k=data.graph_search.topk,
+            if data.graph_search:
+                recall_tasks.append(
+                    self._graph_search(
+                        collection=collection,
+                        query=query,
+                        top_k=data.graph_search.topk,
+                    )
                 )
-            )
-        if data.summary_search:
-            recall_tasks.append(
-                self._summary_search(
-                    collection=collection,
-                    query=query,
-                    top_k=data.summary_search.topk,
-                    similarity_threshold=data.summary_search.similarity,
+            if data.summary_search:
+                recall_tasks.append(
+                    self._summary_search(
+                        collection=collection,
+                        query=query,
+                        top_k=data.summary_search.topk,
+                        similarity_threshold=data.summary_search.similarity,
+                    )
                 )
-            )
-        if data.vision_search:
-            recall_tasks.append(
-                self._vision_search(
-                    collection=collection,
-                    query=query,
-                    top_k=data.vision_search.topk,
-                    similarity_threshold=data.vision_search.similarity,
+            if data.vision_search:
+                recall_tasks.append(
+                    self._vision_search(
+                        collection=collection,
+                        query=query,
+                        top_k=data.vision_search.topk,
+                        similarity_threshold=data.vision_search.similarity,
+                    )
                 )
-            )
 
-        if not recall_tasks:
-            raise ValidationException("At least one search strategy must be enabled")
+            if not recall_tasks:
+                raise ValidationException("At least one search strategy must be enabled")
 
-        recall_results = await asyncio.gather(*recall_tasks)
-        merged_docs = self._merge_results(recall_results)
-        reranked_docs = await self._rerank(
-            query=query,
-            docs=merged_docs,
-            user_id=search_user_id,
-            use_rerank=bool(data.rerank),
-        )
+            recall_results = await asyncio.gather(*recall_tasks)
+            merged_docs = self._merge_results(recall_results)
+            reranked_docs = await self._rerank(
+                query=query,
+                docs=merged_docs,
+                user_id=search_user_id,
+                use_rerank=bool(data.rerank),
+            )
 
-        items = []
-        for idx, doc in enumerate(reranked_docs):
-            metadata = doc.metadata or {}
-            public_metadata = SearchResultMetadata.from_raw(metadata)
-            source = public_metadata.source if public_metadata and public_metadata.source else ""
-            items.append(
-                SearchResultItem(
-                    rank=idx + 1,
-                    score=doc.score,
-                    content=doc.text,
-                    source=source,
-                    recall_type=metadata.get("recall_type", ""),
-                    metadata=public_metadata,
+            items = []
+            for idx, doc in enumerate(reranked_docs):
+                metadata = doc.metadata or {}
+                public_metadata = SearchResultMetadata.from_raw(metadata)
+                source = public_metadata.source if public_metadata and public_metadata.source else ""
+                items.append(
+                    SearchResultItem(
+                        rank=idx + 1,
+                        score=doc.score,
+                        content=doc.text,
+                        source=source,
+                        recall_type=metadata.get("recall_type", ""),
+                        metadata=public_metadata,
+                    )
                 )
-            )
 
-        return items, "rerank"
+            return items, "rerank"
 
     async def _vector_search(
         self,