feat: enhance database connection management and document handling (#1091)

iziang · web-flow · commit 2b4f2bf0edc6 · 2025-07-11T14:31:10.000+08:00
- Added database connection pool settings to `config.py` for better resource management.
- Updated `async_engine` and `sync_engine` creation to utilize new pool settings.
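- Reworked session handling in `AuditService` and `DocumentService`: each operation now stops after the first session from `get_async_session`, audit-log post-processing is moved out of the session loop, and `DocumentService` queries are routed through `db_ops._execute_query`.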
- Refactored document retrieval to fetch documents together with their indexes in a single outer-join query, replacing the previous per-document index query and reducing database round trips.
- Added relationship mapping for `Document` and `DocumentIndex` models to streamline data access.
diff --git a/aperag/config.py b/aperag/config.py
@@ -51,6 +51,13 @@ class Config(BaseSettings):
     # Database
     database_url: str = Field(f"sqlite:///{BASE_DIR}/db.sqlite3", alias="DATABASE_URL")
 
+    # Database connection pool settings
+    db_pool_size: int = Field(20, alias="DB_POOL_SIZE")
+    db_max_overflow: int = Field(40, alias="DB_MAX_OVERFLOW")
+    db_pool_timeout: int = Field(60, alias="DB_POOL_TIMEOUT")
+    db_pool_recycle: int = Field(3600, alias="DB_POOL_RECYCLE")
+    db_pool_pre_ping: bool = Field(True, alias="DB_POOL_PRE_PING")
+
     # Auth
     auth_type: str = Field("none", alias="AUTH_TYPE")
     auth0_domain: str = Field("aperag-dev.auting.cn", alias="AUTH0_DOMAIN")
@@ -174,8 +181,25 @@ def get_async_database_url(url: str):
 
 settings = Config()
 
-async_engine = create_async_engine(get_async_database_url(settings.database_url), echo=settings.debug)
-sync_engine = create_engine(get_sync_database_url(settings.database_url), echo=settings.debug)
+# Database connection pool settings from configuration
+async_engine = create_async_engine(
+    get_async_database_url(settings.database_url),
+    echo=settings.debug,
+    pool_size=settings.db_pool_size,
+    max_overflow=settings.db_max_overflow,
+    pool_timeout=settings.db_pool_timeout,
+    pool_recycle=settings.db_pool_recycle,
+    pool_pre_ping=settings.db_pool_pre_ping,
+)
+sync_engine = create_engine(
+    get_sync_database_url(settings.database_url),
+    echo=settings.debug,
+    pool_size=settings.db_pool_size,
+    max_overflow=settings.db_max_overflow,
+    pool_timeout=settings.db_pool_timeout,
+    pool_recycle=settings.db_pool_recycle,
+    pool_pre_ping=settings.db_pool_pre_ping,
+)
 
 
 async def get_async_session() -> AsyncGenerator[AsyncSession, None]:
diff --git a/aperag/service/audit_service.py b/aperag/service/audit_service.py
@@ -159,10 +159,16 @@ async def log_audit(
                 request_id=request_id or str(uuid.uuid4()),
             )
 
-            # Save to database asynchronously
-            async for session in get_async_session():
+            # Save to database with proper session management
+            async def _save_audit_log(session):
                 session.add(audit_log)
                 await session.commit()
+                return audit_log
+
+            # Use get_async_session with proper session management
+            async for session in get_async_session():
+                await _save_audit_log(session)
+                break  # Only process one session
 
         except Exception as e:
             logger.error(f"Failed to log audit: {e}")
@@ -179,7 +185,9 @@ async def list_audit_logs(
         limit: int = 1000,
     ) -> List[AuditLog]:
         """List audit logs with filtering"""
-        async for session in get_async_session():
+
+        # Use proper session management
+        async def _list_audit_logs(session):
             # Build query
             stmt = select(AuditLog)
 
@@ -206,33 +214,39 @@ async def list_audit_logs(
             # Order by creation time (newest first) and limit
             stmt = stmt.order_by(desc(AuditLog.gmt_created)).limit(limit)
 
-            # Execute query
+            # Execute query and return results immediately
             result = await session.execute(stmt)
-            audit_logs = result.scalars().all()
-
-            # Extract resource_id for each log during query time
-            for log in audit_logs:
-                if log.resource_type and log.path:
-                    # Convert string to enum if needed
-                    resource_type_enum = log.resource_type
-                    if isinstance(log.resource_type, str):
-                        try:
-                            resource_type_enum = AuditResource(log.resource_type)
-                        except ValueError:
-                            resource_type_enum = None
-
-                    if resource_type_enum:
-                        log.resource_id = self.extract_resource_id_from_path(log.path, resource_type_enum)
-                    else:
-                        log.resource_id = None
-
-                # Calculate duration if both times are available
-                if log.start_time and log.end_time:
-                    log.duration_ms = log.end_time - log.start_time
+            return result.scalars().all()
+
+        # Execute query with proper session management
+        audit_logs = None
+        async for session in get_async_session():
+            audit_logs = await _list_audit_logs(session)
+            break  # Only process one session
+
+        # Post-process audit logs outside of session to avoid long session occupation
+        for log in audit_logs:
+            if log.resource_type and log.path:
+                # Convert string to enum if needed
+                resource_type_enum = log.resource_type
+                if isinstance(log.resource_type, str):
+                    try:
+                        resource_type_enum = AuditResource(log.resource_type)
+                    except ValueError:
+                        resource_type_enum = None
+
+                if resource_type_enum:
+                    log.resource_id = self.extract_resource_id_from_path(log.path, resource_type_enum)
                 else:
-                    log.duration_ms = None
+                    log.resource_id = None
+
+            # Calculate duration if both times are available
+            if log.start_time and log.end_time:
+                log.duration_ms = log.end_time - log.start_time
+            else:
+                log.duration_ms = None
 
-            return audit_logs
+        return audit_logs
 
 
 # Global audit service instance
diff --git a/aperag/service/document_service.py b/aperag/service/document_service.py
@@ -23,7 +23,7 @@
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 
-from aperag.config import get_async_session, settings
+from aperag.config import settings
 from aperag.db import models as db_models
 from aperag.db.ops import AsyncDatabaseOps, async_db_ops
 from aperag.docparser.doc_parser import DocParser
@@ -79,52 +79,95 @@ def __init__(self, session: AsyncSession = None):
         else:
             self.db_ops = AsyncDatabaseOps(session)  # Create custom instance for transaction control
 
-    async def build_document_response(
-        self, document: db_models.Document, session: AsyncSession
-    ) -> view_models.Document:
-        """Build Document response object for API return using new status model."""
-        from sqlalchemy import select
-
-        from aperag.db.models import DocumentIndex
+    async def _query_documents_with_indexes(
+        self, user: str, collection_id: str, document_id: str = None
+    ) -> List[db_models.Document]:
+        """
+        Common function to query documents with their indexes using JOIN.
+        If document_id is provided, query single document, otherwise query all documents.
+        """
 
-        # Get all document indexes for status calculation
-        document_indexes = await session.execute(
-            select(DocumentIndex).where(
-                DocumentIndex.document_id == document.id,
-                DocumentIndex.status != db_models.DocumentIndexStatus.DELETING,
-                DocumentIndex.status != db_models.DocumentIndexStatus.DELETION_IN_PROGRESS,
+        async def _execute_query(session):
+            from sqlalchemy import and_, outerjoin, select
+
+            # Create JOIN query between Document and DocumentIndex tables
+            # Use outerjoin to get all documents even if they don't have indexes
+            query = (
+                select(
+                    db_models.Document,
+                    db_models.DocumentIndex.index_type,
+                    db_models.DocumentIndex.status.label("index_status"),
+                    db_models.DocumentIndex.gmt_created.label("index_created_at"),
+                    db_models.DocumentIndex.gmt_updated.label("index_updated_at"),
+                    db_models.DocumentIndex.error_message.label("index_error_message"),
+                )
+                .select_from(
+                    outerjoin(
+                        db_models.Document,
+                        db_models.DocumentIndex,
+                        db_models.Document.id == db_models.DocumentIndex.document_id,
+                    )
+                )
+                .where(
+                    and_(
+                        db_models.Document.user == user,
+                        db_models.Document.collection_id == collection_id,
+                        db_models.Document.status != db_models.DocumentStatus.DELETED,
+                    )
+                )
+                .order_by(db_models.Document.gmt_created.desc())
             )
-        )
-        indexes = document_indexes.scalars().all()
-
-        # Map index states to API response format
-        index_status = {}
-        index_updated = {}
-
-        # Initialize all types as SKIPPED (when no record exists)
-        all_types = [
-            db_models.DocumentIndexType.VECTOR,
-            db_models.DocumentIndexType.FULLTEXT,
-            db_models.DocumentIndexType.GRAPH,
-        ]
-        for index_type in all_types:
-            index_status[index_type] = "SKIPPED"
-
-        # Update with actual states from database
-        for index in indexes:
-            index_status[index.index_type] = index.status
-            index_updated[index.index_type] = index.gmt_updated
+
+            # Add document_id filter if provided (for single document query)
+            if document_id:
+                query = query.where(db_models.Document.id == document_id)
+
+            result = await session.execute(query)
+            rows = result.fetchall()
+
+            # Group results by document and attach all index information
+            documents_dict = {}
+            for row in rows:
+                doc = row.Document
+                if doc.id not in documents_dict:
+                    documents_dict[doc.id] = doc
+                    # Initialize index information for all types
+                    doc.indexes = {"VECTOR": None, "FULLTEXT": None, "GRAPH": None}
+
+                # Add index information if exists
+                if row.index_type:
+                    doc.indexes[row.index_type] = {
+                        "index_type": row.index_type,
+                        "status": row.index_status,
+                        "created_at": row.index_created_at,
+                        "updated_at": row.index_updated_at,
+                        "error_message": row.index_error_message,
+                    }
+
+            return list(documents_dict.values())
+
+        return await self.db_ops._execute_query(_execute_query)
+
+    async def _build_document_response(self, document: db_models.Document) -> view_models.Document:
+        """
+        Build document response object with all index types information.
+        """
+        # Get all index information if available
+        indexes = getattr(document, "indexes", {"VECTOR": None, "FULLTEXT": None, "GRAPH": None})
 
         return view_models.Document(
             id=document.id,
             name=document.name,
             status=document.status,
-            vector_index_status=index_status.get(db_models.DocumentIndexType.VECTOR, "SKIPPED"),
-            fulltext_index_status=index_status.get(db_models.DocumentIndexType.FULLTEXT, "SKIPPED"),
-            graph_index_status=index_status.get(db_models.DocumentIndexType.GRAPH, "SKIPPED"),
-            vector_index_updated=index_updated.get(db_models.DocumentIndexType.VECTOR, None),
-            fulltext_index_updated=index_updated.get(db_models.DocumentIndexType.FULLTEXT, None),
-            graph_index_updated=index_updated.get(db_models.DocumentIndexType.GRAPH, None),
+            # Vector index information
+            vector_index_status=indexes["VECTOR"]["status"] if indexes["VECTOR"] else "SKIPPED",
+            vector_index_updated=indexes["VECTOR"]["updated_at"] if indexes["VECTOR"] else None,
+            # Fulltext index information
+            fulltext_index_status=indexes["FULLTEXT"]["status"] if indexes["FULLTEXT"] else "SKIPPED",
+            fulltext_index_updated=indexes["FULLTEXT"]["updated_at"] if indexes["FULLTEXT"] else None,
+            # Graph index information
+            graph_index_status=indexes["GRAPH"]["status"] if indexes["GRAPH"] else "SKIPPED",
+            graph_index_updated=indexes["GRAPH"]["updated_at"] if indexes["GRAPH"] else None,
             size=document.size,
             created=document.gmt_created,
             updated=document.gmt_updated,
@@ -241,19 +284,24 @@ async def _create_documents_atomically(session):
         return DocumentList(items=response)
 
     async def list_documents(self, user: str, collection_id: str) -> view_models.DocumentList:
-        documents = await self.db_ops.query_documents([user], collection_id)
+        """List all documents for a user in a collection."""
+        documents = await self._query_documents_with_indexes(user, collection_id)
+
         response = []
-        async for session in get_async_session():
-            for document in documents:
-                response.append(await self.build_document_response(document, session))
-        return DocumentList(items=response)
+        for document in documents:
+            response.append(await self._build_document_response(document))
+
+        return view_models.DocumentList(items=response)
 
     async def get_document(self, user: str, collection_id: str, document_id: str) -> view_models.Document:
-        document = await self.db_ops.query_document(user, collection_id, document_id)
-        if document is None:
-            raise DocumentNotFoundException(document_id)
-        async for session in get_async_session():
-            return await self.build_document_response(document, session)
+        """Get a specific document by ID."""
+        documents = await self._query_documents_with_indexes(user, collection_id, document_id)
+
+        if not documents:
+            raise DocumentNotFoundException(f"Document not found: {document_id}")
+
+        document = documents[0]
+        return await self._build_document_response(document)
 
     async def _delete_document(self, session: AsyncSession, user: str, collection_id: str, document_id: str):
         """
@@ -390,7 +438,9 @@ async def get_document_chunks(self, user_id: str, collection_id: str, document_i
         """
         Get all chunks of a document.
         """
-        async for session in get_async_session():
+
+        # Use database operations with proper session management
+        async def _get_document_chunks(session):
             # 1. Get the document to verify ownership and get collection_id
             stmt = select(db_models.Document).filter(
                 db_models.Document.id == document_id,
@@ -471,11 +521,16 @@ async def get_document_chunks(self, user_id: str, collection_id: str, document_i
                 )
                 raise HTTPException(status_code=500, detail="Failed to retrieve chunks from vector store")
 
+        # Execute query with proper session management
+        return await self.db_ops._execute_query(_get_document_chunks)
+
     async def get_document_preview(self, user_id: str, collection_id: str, document_id: str) -> DocumentPreview:
         """
         Get all preview-related information for a document.
         """
-        async for session in get_async_session():
+
+        # Use database operations with proper session management
+        async def _get_document_preview(session):
             # 1. Get document and vector index in one go
             doc_stmt = select(db_models.Document).filter(
                 db_models.Document.id == document_id,
@@ -539,11 +594,16 @@ async def get_document_preview(self, user_id: str, collection_id: str, document_
                 chunks=chunks,
             )
 
+        # Execute query with proper session management
+        return await self.db_ops._execute_query(_get_document_preview)
+
     async def get_document_object(self, user_id: str, collection_id: str, document_id: str, path: str):
         """
         Get a file object associated with a document from the object store.
         """
-        async for session in get_async_session():
+
+        # Use database operations with proper session management
+        async def _get_document_object(session):
             # 1. Verify user has access to the document
             stmt = select(db_models.Document).filter(
                 db_models.Document.id == document_id,
@@ -580,6 +640,9 @@ async def get_document_object(self, user_id: str, collection_id: str, document_i
                 logger.error(f"Failed to get object for document {document_id} at path {full_path}: {e}", exc_info=True)
                 raise HTTPException(status_code=500, detail="Failed to get object from store")
 
+        # Execute query with proper session management
+        return await self.db_ops._execute_query(_get_document_object)
+
 
 # Create a global service instance for easy access
 # This uses the global db_ops instance and doesn't require session management in views
diff --git a/envs/docker.env.overrides b/envs/docker.env.overrides
@@ -6,6 +6,13 @@ VECTOR_DB_CONTEXT={"url":"http://aperag-qdrant", "port":6333, "distance":"Cosine
 ES_HOST=http://aperag-es:9200
 MEMORY_REDIS_URL=redis://default:password@aperag-redis:6379
 
+# Database Connection Pool Settings for Docker deployment
+DB_POOL_SIZE=25
+DB_MAX_OVERFLOW=50
+DB_POOL_TIMEOUT=60
+DB_POOL_RECYCLE=3600
+DB_POOL_PRE_PING=True
+
 # Override for path
 TIKTOKEN_CACHE_DIR=/root/.cache/tiktoken
 
diff --git a/envs/env.template b/envs/env.template