From a9f61b0116dd907f78a13fbd486b5127a3399570 Mon Sep 17 00:00:00 2001 From: Guo Ziang Date: Tue, 24 Jun 2025 15:05:11 +0800 Subject: [PATCH 1/6] feat: update document index status model and related components - Refactored document index status to use a single status model with new states: CREATING, ACTIVE, DELETING, and DELETION_IN_PROGRESS. - Updated related database models, API components, and service logic to accommodate the new status model. - Adjusted index management and reconciliation processes to align with the new status definitions. - Modified tests to reflect changes in index status handling and ensure compatibility with the updated API responses. --- aperag/api/components/schemas/document.yaml | 19 +- aperag/db/models.py | 68 ++-- aperag/index/manager.py | 122 +++--- aperag/index/reconciler.py | 375 +++++++++--------- ...478f.py => 20250624132425-850b2c5dc08f.py} | 17 +- ...62eb.py => 20250624132601-66b96592c84a.py} | 14 +- aperag/schema/view_models.py | 50 ++- aperag/service/document_service.py | 88 ++-- aperag/tasks/scheduler.py | 166 +------- config/celery_tasks.py | 147 ++++++- tests/e2e_test/conftest.py | 2 +- tests/e2e_test/test_document.py | 20 +- 12 files changed, 495 insertions(+), 593 deletions(-) rename aperag/migration/versions/{20250624093005-eb8aa708478f.py => 20250624132425-850b2c5dc08f.py} (97%) rename aperag/migration/versions/{20250624093016-dc0829e062eb.py => 20250624132601-66b96592c84a.py} (76%) diff --git a/aperag/api/components/schemas/document.yaml b/aperag/api/components/schemas/document.yaml index e46c8d41e..c84ecdb3c 100644 --- a/aperag/api/components/schemas/document.yaml +++ b/aperag/api/components/schemas/document.yaml @@ -14,29 +14,34 @@ document: - FAILED - DELETING - DELETED - - WARNING vector_index_status: type: string enum: - PENDING - - RUNNING - - COMPLETE + - CREATING + - ACTIVE + - DELETING + - DELETING_IN_PROGRESS - FAILED - SKIPPED fulltext_index_status: type: string enum: - PENDING - - RUNNING - - COMPLETE + - 
CREATING + - ACTIVE + - DELETING + - DELETING_IN_PROGRESS - FAILED - SKIPPED graph_index_status: type: string enum: - PENDING - - RUNNING - - COMPLETE + - CREATING + - ACTIVE + - DELETING + - DELETING_IN_PROGRESS - FAILED - SKIPPED vector_index_updated: diff --git a/aperag/db/models.py b/aperag/db/models.py index 87ae61a62..383a56ec5 100644 --- a/aperag/db/models.py +++ b/aperag/db/models.py @@ -83,6 +83,17 @@ class DocumentIndexType(str, Enum): GRAPH = "GRAPH" +class DocumentIndexStatus(str, Enum): + """Document index lifecycle status""" + + PENDING = "PENDING" # Awaiting processing (create/update) + CREATING = "CREATING" # Task claimed, creation/update in progress + ACTIVE = "ACTIVE" # Index is up-to-date and ready for use + DELETING = "DELETING" # Deletion has been requested + DELETION_IN_PROGRESS = "DELETION_IN_PROGRESS" # Task claimed, deletion in progress + FAILED = "FAILED" # The last operation failed + + class BotStatus(str, Enum): ACTIVE = "ACTIVE" DELETED = "DELETED" @@ -159,24 +170,6 @@ class LightRAGDocStatus(str, Enum): FAILED = "failed" -# Add new enums for K8s-inspired design -class IndexDesiredState(str, Enum): - """Desired state for index - what we want""" - - PRESENT = "present" - ABSENT = "absent" - - -class IndexActualState(str, Enum): - """Actual state for index - what currently exists""" - - ABSENT = "absent" - CREATING = "creating" - PRESENT = "present" - DELETING = "deleting" - FAILED = "failed" - - # Models class Collection(Base): __tablename__ = "collection" @@ -243,13 +236,13 @@ def get_overall_index_status(self, session) -> "DocumentStatus": if not document_indexes: return DocumentStatus.PENDING - states = [idx.actual_state for idx in document_indexes] + statuses = [idx.status for idx in document_indexes] - if any(state == IndexActualState.FAILED for state in states): + if any(status == DocumentIndexStatus.FAILED for status in statuses): return DocumentStatus.FAILED - elif any(state == IndexActualState.CREATING for state in states): + 
elif any(status in [DocumentIndexStatus.CREATING, DocumentIndexStatus.DELETION_IN_PROGRESS] for status in statuses): return DocumentStatus.RUNNING - elif all(state == IndexActualState.PRESENT for state in states): + elif all(status == DocumentIndexStatus.ACTIVE for status in statuses): return DocumentStatus.COMPLETE else: return DocumentStatus.PENDING @@ -700,7 +693,7 @@ class LightRAGLLMCacheModel(Base): class DocumentIndex(Base): - """Document index - combines spec and status into single table""" + """Document index - single status model""" __tablename__ = "document_index" __table_args__ = (UniqueConstraint("document_id", "index_type", name="uq_document_index"),) @@ -709,14 +702,12 @@ class DocumentIndex(Base): document_id = Column(String(24), nullable=False, index=True) index_type = Column(EnumColumn(DocumentIndexType), nullable=False, index=True) - # Desired state (spec) fields - desired_state = Column(EnumColumn(IndexDesiredState), nullable=False, default=IndexDesiredState.PRESENT, index=True) + status = Column(EnumColumn(DocumentIndexStatus), nullable=False, default=DocumentIndexStatus.PENDING, index=True) version = Column(Integer, nullable=False, default=1) # Incremented on each spec change + observed_version = Column(Integer, nullable=False, default=0) # Last processed spec version created_by = Column(String(256), nullable=False) # User who created this spec - # Actual state (status) fields - actual_state = Column(EnumColumn(IndexActualState), nullable=False, default=IndexActualState.ABSENT, index=True) - observed_version = Column(Integer, nullable=False, default=0) # Last processed spec version + # Index data and task tracking index_data = Column(Text, nullable=True) # JSON string for index-specific data error_message = Column(Text, nullable=True) @@ -726,23 +717,14 @@ class DocumentIndex(Base): gmt_last_reconciled = Column(DateTime(timezone=True), nullable=True) # Last reconciliation attempt def __repr__(self): - return f"" - - def is_in_sync(self) -> 
bool: - """Check if desired and actual states are in sync""" - if self.observed_version < self.version: - return False + return f"" - if self.desired_state == IndexDesiredState.PRESENT: - return self.actual_state == IndexActualState.PRESENT - elif self.desired_state == IndexDesiredState.ABSENT: - return self.actual_state == IndexActualState.ABSENT - return False + def is_out_of_sync(self) -> bool: + """Check if this index needs reconciliation""" + return self.observed_version < self.version - def update_spec(self, desired_state: IndexDesiredState = None, created_by: str = None): - """Update the spec (desired state) part""" - if desired_state is not None: - self.desired_state = desired_state + def update_version(self, created_by: str = None): + """Update the version to trigger reconciliation""" if created_by is not None: self.created_by = created_by self.version += 1 diff --git a/aperag/index/manager.py b/aperag/index/manager.py index 0e231f610..897f32075 100644 --- a/aperag/index/manager.py +++ b/aperag/index/manager.py @@ -12,31 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import json import logging from typing import List, Optional from sqlalchemy import and_, select from sqlalchemy.ext.asyncio import AsyncSession -from aperag.db.models import DocumentIndex, DocumentIndexType, IndexActualState, IndexDesiredState, utc_now +from aperag.db.models import DocumentIndex, DocumentIndexType, DocumentIndexStatus, utc_now logger = logging.getLogger(__name__) -class FrontendIndexManager: - """Simple manager for document index specs (frontend chain)""" +class DocumentIndexManager: + """Manager for document index lifecycle using single status model""" async def create_document_indexes( self, session: AsyncSession, document_id: str, user: str, index_types: Optional[List[DocumentIndexType]] = None ): """ - Create index specs for a document (called when document is created) + Create index records for a document (called when document is created) Args: session: Database session document_id: Document ID user: User creating the document - index_types: List of index types to create (defaults to all) + index_types: List of index types to create (defaults to vector, fulltext and graph) """ if index_types is None: index_types = [DocumentIndexType.VECTOR, DocumentIndexType.FULLTEXT, DocumentIndexType.GRAPH] @@ -50,38 +51,44 @@ async def create_document_indexes( existing_index = result.scalar_one_or_none() if existing_index: - # Update existing index - existing_index.update_spec(IndexDesiredState.PRESENT, user) - logger.debug(f"Updated index for {document_id}:{index_type} to version {existing_index.version}") + # Update existing index to pending and increment version + existing_index.status = DocumentIndexStatus.PENDING + existing_index.update_version(user) + logger.debug(f"Updated index for {document_id}:{index_type.value} to version {existing_index.version}") else: # Create new index doc_index = DocumentIndex( document_id=document_id, index_type=index_type, - desired_state=IndexDesiredState.PRESENT, + status=DocumentIndexStatus.PENDING, version=1, +
observed_version=0, created_by=user, ) session.add(doc_index) + logger.debug(f"Created new index for {document_id}:{index_type.value}") - async def update_document_indexes(self, session: AsyncSession, document_id: str): + async def update_document_indexes(self, session: AsyncSession, document_id: str, user: str = None): """ Update document indexes (called when document content is updated) - This increments the version of all indexes to trigger reconciliation. + This increments the version of all active or failed indexes to trigger reconciliation. Args: session: Database session document_id: Document ID + user: User triggering the update (optional) """ stmt = select(DocumentIndex).where(DocumentIndex.document_id == document_id) result = await session.execute(stmt) indexes = result.scalars().all() for index in indexes: - if index.desired_state == IndexDesiredState.PRESENT: - index.version += 1 # Increment version to trigger re-indexing - index.gmt_updated = utc_now() + # Only re-queue indexes that are ACTIVE or FAILED; indexes in any other status are left untouched + if index.status in [DocumentIndexStatus.ACTIVE, DocumentIndexStatus.FAILED]: + index.status = DocumentIndexStatus.PENDING + index.update_version(user) + logger.debug(f"Updated index {document_id}:{index.index_type} to version {index.version}") async def delete_document_indexes( self, session: AsyncSession, document_id: str, index_types: Optional[List[DocumentIndexType]] = None @@ -105,10 +112,13 @@ async def delete_document_indexes( doc_index = result.scalar_one_or_none() if doc_index: - doc_index.update_spec(IndexDesiredState.ABSENT) + # Mark for deletion + doc_index.status = DocumentIndexStatus.DELETING + doc_index.gmt_updated = utc_now() + logger.debug(f"Marked index {document_id}:{index_type.value} for deletion") async def rebuild_document_indexes( - self, session: AsyncSession, document_id: str, index_types: List[DocumentIndexType] + self, session: AsyncSession, document_id: str, index_types: List[DocumentIndexType], user: str = None ):
""" Rebuild specified document indexes (called when user requests index rebuild) @@ -119,9 +129,10 @@ async def rebuild_document_indexes( session: Database session document_id: Document ID index_types: List of index types to rebuild + user: User triggering the rebuild (optional) """ if len(set(index_types)) != len(index_types): - raise Exception("Duplicate index types are not allowed") + raise ValueError("Duplicate index types are not allowed") for index_type in index_types: stmt = select(DocumentIndex).where( @@ -131,66 +142,23 @@ async def rebuild_document_indexes( doc_index = result.scalar_one_or_none() if doc_index: - # Only rebuild if the index is present or failed - if doc_index.desired_state == IndexDesiredState.PRESENT: - doc_index.version += 1 # Increment version to trigger re-indexing - doc_index.gmt_updated = utc_now() - logger.info(f"Triggered rebuild for {index_type.value} index of document {document_id}") - else: - logger.warning( - f"Cannot rebuild {index_type.value} index for document {document_id}: index not present" - ) + # Reset to pending and increment version to trigger rebuild + doc_index.status = DocumentIndexStatus.PENDING + doc_index.update_version(user) + doc_index.error_message = None # Clear any previous error + logger.info(f"Triggered rebuild for {index_type.value} index of document {document_id} (v{doc_index.version})") else: - logger.warning(f"No {index_type.value} index found for document {document_id}") - - async def get_document_index_status(self, session: AsyncSession, document_id: str) -> dict: - """ - Get current index status for a document - - Args: - session: Database session - document_id: Document ID - - Returns: - Dictionary with index status information - """ - # Get all indexes for the document - stmt = select(DocumentIndex).where(DocumentIndex.document_id == document_id) - result = await session.execute(stmt) - indexes = result.scalars().all() - - # Build result - result = {"document_id": document_id, "indexes": {}, 
"overall_status": "complete"} - - has_creating = False - has_failed = False - - for index in indexes: - index_info = { - "type": index.index_type, - "desired_state": index.desired_state, - "actual_state": index.actual_state, - "in_sync": index.is_in_sync(), - } - - if index.actual_state == IndexActualState.CREATING: - has_creating = True - elif index.actual_state == IndexActualState.FAILED: - has_failed = True - index_info["error"] = index.error_message - - result["indexes"][index.index_type] = index_info - - # Determine overall status - if has_failed: - result["overall_status"] = "failed" - elif has_creating: - result["overall_status"] = "running" - else: - result["overall_status"] = "complete" - - return result - + # Create new index if it doesn't exist + doc_index = DocumentIndex( + document_id=document_id, + index_type=index_type, + status=DocumentIndexStatus.PENDING, + version=1, + observed_version=0, + created_by=user or "system", + ) + session.add(doc_index) + logger.info(f"Created new {index_type.value} index for document {document_id}") # Global instance -document_index_manager = FrontendIndexManager() +document_index_manager = DocumentIndexManager() diff --git a/aperag/index/reconciler.py b/aperag/index/reconciler.py index 9a27438b7..6573c5fe3 100644 --- a/aperag/index/reconciler.py +++ b/aperag/index/reconciler.py @@ -24,9 +24,8 @@ Document, DocumentIndex, DocumentIndexType, + DocumentIndexStatus, DocumentStatus, - IndexActualState, - IndexDesiredState, ) from aperag.tasks.scheduler import TaskScheduler, create_task_scheduler from aperag.utils.utils import utc_now @@ -34,98 +33,29 @@ logger = logging.getLogger(__name__) -class BackendIndexReconciler: - """Simple reconciler for document indexes (backend chain)""" +class DocumentIndexReconciler: + """Reconciler for document indexes using single status model""" def __init__(self, task_scheduler: Optional[TaskScheduler] = None, scheduler_type: str = "celery"): self.task_scheduler = task_scheduler or 
create_task_scheduler(scheduler_type) - @staticmethod - def _get_reconciliation_conditions(operation_type: str, document_ids: List[str] = None): - """ - Get all conditions for indexes that need reconciliation based on operation type. - This is the authoritative source for determining which indexes can be processed. - - Args: - operation_type: 'create', 'update', or 'delete' - document_ids: Optional list of document IDs to filter by - - Returns: - List of SQLAlchemy conditions - """ - if operation_type in ["create", "update"]: - conditions = [ - DocumentIndex.desired_state == IndexDesiredState.PRESENT, - # Need reconciliation: either version mismatch or state mismatch - or_( - DocumentIndex.observed_version < DocumentIndex.version, - DocumentIndex.actual_state.in_([IndexActualState.ABSENT, IndexActualState.FAILED]), - ), - # For create/update operations, exclude both CREATING and DELETING states - DocumentIndex.actual_state.notin_([IndexActualState.CREATING, IndexActualState.DELETING]), - ] - elif operation_type == "delete": - conditions = [ - DocumentIndex.desired_state == IndexDesiredState.ABSENT, - # Only delete indexes that actually exist or are being created - DocumentIndex.actual_state.in_([IndexActualState.CREATING, IndexActualState.PRESENT]), - # For delete operations, allow claiming indexes in CREATING state to enable deletion override - # Only exclude DELETING state to prevent concurrent deletions - DocumentIndex.actual_state != IndexActualState.DELETING, - ] - else: - raise ValueError(f"Unknown operation_type: {operation_type}") - - if document_ids: - conditions.append(DocumentIndex.document_id.in_(document_ids)) - - return conditions - - def reconcile_all(self, document_ids: List[str] = None): + def reconcile_all(self): """ - Main reconciliation loop - scan specs and reconcile differences - Groups operations by document to enable batch processing - - Args: - document_ids: Optional list of specific document IDs to reconcile. If None, reconcile all. 
+ Main reconciliation loop - scan indexes and reconcile differences + Groups operations by document and index type for atomic processing """ - # Get all indexes that need reconciliation first - all_indexes_needing_reconciliation = [] + # Get all indexes that need reconciliation for session in get_sync_session(): - all_indexes_needing_reconciliation = self._get_indexes_needing_reconciliation(session, document_ids) - break # Only need to query once - - if not all_indexes_needing_reconciliation: - logger.debug("No indexes need reconciliation") - return - - # Group by document ID and operation type for batch processing - from collections import defaultdict - - doc_operations = defaultdict(lambda: {"create": [], "update": [], "delete": []}) + operations = self._get_indexes_needing_reconciliation(session) - for doc_index in all_indexes_needing_reconciliation: - # Group operations by document and type - if doc_index.desired_state == IndexDesiredState.PRESENT: - # Check if this is an update (version mismatch with existing index data) or creation - if doc_index.index_data and doc_index.observed_version > 0: - # Index has data and was observed before - this is an update - operation_type = "update" - else: - # No existing data or never observed - this is a creation - operation_type = "create" - doc_operations[doc_index.document_id][operation_type].append(doc_index) - elif doc_index.desired_state == IndexDesiredState.ABSENT: - doc_operations[doc_index.document_id]["delete"].append(doc_index) - - logger.info(f"Found {len(doc_operations)} documents need to be reconciled") + logger.info(f"Found {len(operations)} documents need to be reconciled") # Process each document with its own transaction successful_docs = 0 failed_docs = 0 - for document_id, operations in doc_operations.items(): + for document_id, doc_operations in operations.items(): try: - self._reconcile_single_document(document_id, operations) + self._reconcile_single_document(document_id, doc_operations) 
successful_docs += 1 except Exception as e: failed_docs += 1 @@ -134,143 +64,199 @@ def reconcile_all(self, document_ids: List[str] = None): logger.info(f"Reconciliation completed: {successful_docs} successful, {failed_docs} failed") - def _get_indexes_needing_reconciliation( - self, session: Session, document_ids: List[str] = None - ) -> List[DocumentIndex]: + def _get_indexes_needing_reconciliation(self, session: Session) -> List[DocumentIndex]: """ Get all indexes that need reconciliation without modifying their state. State modifications will happen in individual document transactions. """ - # Use shared reconciliation conditions - create_conditions = self._get_reconciliation_conditions("create", document_ids) - delete_conditions = self._get_reconciliation_conditions("delete", document_ids) - - # Query for indexes that need creating - create_stmt = select(DocumentIndex).where(and_(*create_conditions)) - create_result = session.execute(create_stmt) - create_indexes = create_result.scalars().all() - - # Query for indexes that need deleting - delete_stmt = select(DocumentIndex).where(and_(*delete_conditions)) - delete_result = session.execute(delete_stmt) - delete_indexes = delete_result.scalars().all() + from collections import defaultdict - all_indexes = list(create_indexes) + list(delete_indexes) - logger.debug(f"Found {len(all_indexes)} indexes needing reconciliation") - return all_indexes + operations = defaultdict(lambda: {"create": [], "update": [], "delete": []}) + + conditions = { + "create": and_( + DocumentIndex.status == DocumentIndexStatus.PENDING, + DocumentIndex.observed_version < DocumentIndex.version, + DocumentIndex.version == 1, + ), + "update": and_( + DocumentIndex.status == DocumentIndexStatus.PENDING, + DocumentIndex.observed_version < DocumentIndex.version, + DocumentIndex.version > 1, + ), + "delete": and_( + DocumentIndex.status == DocumentIndexStatus.DELETING, + ), + } + + for operation_type, condition in conditions.items(): + stmt = 
select(DocumentIndex).where(condition) + result = session.execute(stmt) + indexes = result.scalars().all() + for index in indexes: + operations[index.document_id][operation_type].append(index) + + return operations def _reconcile_single_document(self, document_id: str, operations: dict): """ Reconcile operations for a single document within its own transaction """ for session in get_sync_session(): - # Get the specific indexes for this document and claim them atomically + # Collect indexes for this document that need claiming indexes_to_claim = [] - # Collect indexes for this document that need claiming for operation_type, doc_indexes in operations.items(): for doc_index in doc_indexes: - indexes_to_claim.append((doc_index.id, operation_type)) + indexes_to_claim.append((doc_index.id, doc_index.index_type, operation_type)) # Atomically claim the indexes for this document - claimed_successfully = self._claim_document_indexes(session, document_id, indexes_to_claim) + claimed_indexes = self._claim_document_indexes(session, document_id, indexes_to_claim) - if claimed_successfully: + if claimed_indexes: # Schedule tasks for successfully claimed indexes - self._reconcile_document_operations(document_id, operations) + self._reconcile_document_operations(document_id, claimed_indexes) session.commit() else: # Some indexes couldn't be claimed (likely already being processed), skip this document logger.debug(f"Skipping document {document_id} - indexes already being processed") - def _claim_document_indexes(self, session: Session, document_id: str, indexes_to_claim: List[tuple]) -> bool: + def _claim_document_indexes(self, session: Session, document_id: str, indexes_to_claim: List[tuple]) -> List[dict]: """ Atomically claim indexes for a document by updating their state. - Returns True if all indexes were successfully claimed, False otherwise. + Returns list of successfully claimed indexes with their details. 
""" + claimed_indexes = [] + try: - for index_id, operation_type in indexes_to_claim: + for index_id, index_type, operation_type in indexes_to_claim: if operation_type in ["create", "update"]: - target_state = IndexActualState.CREATING + target_state = DocumentIndexStatus.CREATING elif operation_type == "delete": - target_state = IndexActualState.DELETING + target_state = DocumentIndexStatus.DELETION_IN_PROGRESS else: continue - # Try to claim this specific index - # Use all reconciliation conditions plus specific index/document filters - # This ensures claiming conditions are a superset of reconciliation conditions - base_conditions = self._get_reconciliation_conditions(operation_type) - where_conditions = [ - DocumentIndex.id == index_id, - DocumentIndex.document_id == document_id, - ] + base_conditions + # Get the current index record to extract version info + stmt = select(DocumentIndex).where(DocumentIndex.id == index_id) + result = session.execute(stmt) + current_index = result.scalar_one_or_none() + + if not current_index: + continue + + # Build appropriate claiming conditions based on operation type + if operation_type == "create": + claiming_conditions = [ + DocumentIndex.id == index_id, + DocumentIndex.status == DocumentIndexStatus.PENDING, + DocumentIndex.observed_version < DocumentIndex.version, + DocumentIndex.version == 1, + ] + elif operation_type == "update": + claiming_conditions = [ + DocumentIndex.id == index_id, + DocumentIndex.status == DocumentIndexStatus.PENDING, + DocumentIndex.observed_version < DocumentIndex.version, + DocumentIndex.version > 1, + ] + elif operation_type == "delete": + claiming_conditions = [ + DocumentIndex.id == index_id, + DocumentIndex.status == DocumentIndexStatus.DELETING, + ] + # Try to claim this specific index update_stmt = ( update(DocumentIndex) - .where(and_(*where_conditions)) - .values(actual_state=target_state, gmt_updated=utc_now(), gmt_last_reconciled=utc_now()) + .where(and_(*claiming_conditions)) + 
.values(status=target_state, gmt_updated=utc_now(), gmt_last_reconciled=utc_now()) ) result = session.execute(update_stmt) - if result.rowcount == 0: - # This index couldn't be claimed (already being processed) + if result.rowcount > 0: + # Successfully claimed this index + claimed_indexes.append({ + 'index_id': index_id, + 'document_id': document_id, + 'index_type': index_type, + 'operation_type': operation_type, + 'target_version': current_index.version if operation_type in ["create", "update"] else None, + }) + logger.debug(f"Claimed index {index_id} for document {document_id} ({operation_type})") + else: logger.debug(f"Could not claim index {index_id} for document {document_id}") - return False session.flush() # Ensure changes are visible - return True + return claimed_indexes except Exception as e: logger.error(f"Failed to claim indexes for document {document_id}: {e}") - return False + return [] - def _reconcile_document_operations(self, document_id: str, operations: dict): + def _reconcile_document_operations(self, document_id: str, claimed_indexes: List[dict]): """ - Reconcile operations for a single document, using batch processing when possible - States are already updated to CREATING/DELETING before calling this method + Reconcile operations for a single document, batching same operation types together """ - - create_index_types = [] - for doc_index in operations["create"]: - create_index_types.append(doc_index.index_type) - if create_index_types: - # Add document_id to task for better idempotency checking - task_id = f"create_index_{document_id}_{int(time.time())}" - self.task_scheduler.schedule_create_index( - index_types=create_index_types, document_id=document_id, task_id=task_id - ) - logger.info( - f"Scheduled create index task {task_id} for document {document_id} with types {create_index_types}" - ) - - update_index_types = [] - for doc_index in operations["update"]: - update_index_types.append(doc_index.index_type) - if update_index_types: - 
task_id = f"update_index_{document_id}_{int(time.time())}" - self.task_scheduler.schedule_update_index( - index_types=update_index_types, document_id=document_id, task_id=task_id - ) - logger.info( - f"Scheduled update index task {task_id} for document {document_id} with types {update_index_types}" + from collections import defaultdict + + # Group by operation type to batch operations + operations_by_type = defaultdict(list) + for claimed_index in claimed_indexes: + operation_type = claimed_index['operation_type'] + operations_by_type[operation_type].append(claimed_index) + + # Process create operations as a batch + if "create" in operations_by_type: + create_indexes = operations_by_type["create"] + create_types = [claimed_index['index_type'] for claimed_index in create_indexes] + context = {} + + for claimed_index in create_indexes: + index_type = claimed_index['index_type'] + target_version = claimed_index.get('target_version') + + # Store version info in context + if target_version is not None: + context[f"{index_type}_version"] = target_version + + task_id = self.task_scheduler.schedule_create_index( + document_id=document_id, + index_types=create_types, + context=context ) - - delete_index_types = [] - for doc_index in operations["delete"]: - delete_index_types.append(doc_index.index_type) - if delete_index_types: - # Use the last index_data for the delete operation - index_data = operations["delete"][-1].index_data if operations["delete"] else None - task_id = f"delete_index_{document_id}_{int(time.time())}" - self.task_scheduler.schedule_delete_index( - index_types=delete_index_types, + logger.info(f"Scheduled create task for document {document_id}, types: {create_types}") + + # Process update operations as a batch + if "update" in operations_by_type: + update_indexes = operations_by_type["update"] + update_types = [claimed_index['index_type'] for claimed_index in update_indexes] + context = {} + + for claimed_index in update_indexes: + index_type = 
claimed_index['index_type'] + target_version = claimed_index.get('target_version') + + # Store version info in context + if target_version is not None: + context[f"{index_type}_version"] = target_version + + task_id = self.task_scheduler.schedule_update_index( document_id=document_id, - index_data=index_data, - task_id=task_id, + index_types=update_types, + context=context ) - logger.info( - f"Scheduled delete index task {task_id} for document {document_id} with types {delete_index_types}" + logger.info(f"Scheduled update task for document {document_id}, types: {update_types}") + + # Process delete operations as a batch + if "delete" in operations_by_type: + delete_indexes = operations_by_type["delete"] + delete_types = [claimed_index['index_type'] for claimed_index in delete_indexes] + + task_id = self.task_scheduler.schedule_delete_index( + document_id=document_id, + index_types=delete_types ) + logger.info(f"Scheduled delete task for document {document_id}, types: {delete_types}") # Index task completion callbacks @@ -288,22 +274,23 @@ def _update_document_status(document_id: str, session: Session): session.add(document) @staticmethod - def on_index_created(document_id: str, index_type: str, index_data: str = None): - """Called when index creation succeeds""" + def on_index_created(document_id: str, index_type: str, target_version: int, index_data: str = None): + """Called when index creation/update succeeds""" for session in get_sync_session(): - # Use atomic update with state validation + # Use atomic update with version validation update_stmt = ( update(DocumentIndex) .where( and_( DocumentIndex.document_id == document_id, DocumentIndex.index_type == DocumentIndexType(index_type), - DocumentIndex.actual_state == IndexActualState.CREATING, # Only allow transition from CREATING + DocumentIndex.status == DocumentIndexStatus.CREATING, + DocumentIndex.version == target_version, # Critical: validate version ) ) .values( - actual_state=IndexActualState.PRESENT, - 
observed_version=DocumentIndex.version, # Mark as processed + status=DocumentIndexStatus.ACTIVE, + observed_version=target_version, # Mark this version as processed index_data=index_data, error_message=None, gmt_updated=utc_now(), @@ -314,11 +301,11 @@ def on_index_created(document_id: str, index_type: str, index_data: str = None): result = session.execute(update_stmt) if result.rowcount > 0: IndexTaskCallbacks._update_document_status(document_id, session) - logger.info(f"{index_type} index creation completed for document {document_id}") + logger.info(f"{index_type} index creation completed for document {document_id} (v{target_version})") session.commit() else: logger.warning( - f"Index creation callback ignored for document {document_id} type {index_type} - not in CREATING state" + f"Index creation callback ignored for document {document_id} type {index_type} v{target_version} - not in expected state" ) session.rollback() @@ -333,12 +320,12 @@ def on_index_failed(document_id: str, index_type: str, error_message: str): and_( DocumentIndex.document_id == document_id, DocumentIndex.index_type == DocumentIndexType(index_type), - # Only allow transition from CREATING or DELETING states - DocumentIndex.actual_state.in_([IndexActualState.CREATING, IndexActualState.DELETING]), + # Allow transition from any in-progress state + DocumentIndex.status.in_([DocumentIndexStatus.CREATING, DocumentIndexStatus.DELETION_IN_PROGRESS]), ) ) .values( - actual_state=IndexActualState.FAILED, + status=DocumentIndexStatus.FAILED, error_message=error_message, gmt_updated=utc_now(), gmt_last_reconciled=utc_now(), @@ -352,46 +339,40 @@ def on_index_failed(document_id: str, index_type: str, error_message: str): session.commit() else: logger.warning( - f"Index failure callback ignored for document {document_id} type {index_type} - not in CREATING or DELETING state" + f"Index failure callback ignored for document {document_id} type {index_type} - not in expected state" ) session.rollback() 
@staticmethod def on_index_deleted(document_id: str, index_type: str): - """Called when index deletion succeeds""" + """Called when index deletion succeeds - hard delete the record""" for session in get_sync_session(): - # Use atomic update with state validation - update_stmt = ( - update(DocumentIndex) + # Delete the record entirely + from sqlalchemy import delete + + delete_stmt = ( + delete(DocumentIndex) .where( and_( DocumentIndex.document_id == document_id, DocumentIndex.index_type == DocumentIndexType(index_type), - DocumentIndex.actual_state == IndexActualState.DELETING, # Only allow transition from DELETING + DocumentIndex.status == DocumentIndexStatus.DELETION_IN_PROGRESS, ) ) - .values( - actual_state=IndexActualState.ABSENT, - observed_version=DocumentIndex.version, # Mark as processed - index_data=None, - error_message=None, - gmt_updated=utc_now(), - gmt_last_reconciled=utc_now(), - ) ) - result = session.execute(update_stmt) + result = session.execute(delete_stmt) if result.rowcount > 0: IndexTaskCallbacks._update_document_status(document_id, session) - logger.info(f"{index_type} index deletion completed for document {document_id}") + logger.info(f"{index_type} index deleted for document {document_id}") session.commit() else: logger.warning( - f"Index deletion callback ignored for document {document_id} type {index_type} - not in DELETING state" + f"Index deletion callback ignored for document {document_id} type {index_type} - not in expected state" ) session.rollback() -# Global instance -index_reconciler = BackendIndexReconciler() +# Global instances +index_reconciler = DocumentIndexReconciler() index_task_callbacks = IndexTaskCallbacks() diff --git a/aperag/migration/versions/20250624093005-eb8aa708478f.py b/aperag/migration/versions/20250624132425-850b2c5dc08f.py similarity index 97% rename from aperag/migration/versions/20250624093005-eb8aa708478f.py rename to aperag/migration/versions/20250624132425-850b2c5dc08f.py index f69d932bf..04e6e36fd 
100644 --- a/aperag/migration/versions/20250624093005-eb8aa708478f.py +++ b/aperag/migration/versions/20250624132425-850b2c5dc08f.py @@ -1,8 +1,8 @@ """empty message -Revision ID: eb8aa708478f +Revision ID: 850b2c5dc08f Revises: -Create Date: 2025-06-24 09:30:05.267898 +Create Date: 2025-06-24 13:24:25.714734 """ from typing import Sequence, Union @@ -12,7 +12,7 @@ from pgvector.sqlalchemy import Vector # revision identifiers, used by Alembic. -revision: str = 'eb8aa708478f' +revision: str = '850b2c5dc08f' down_revision: Union[str, None] = None branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -156,11 +156,10 @@ def upgrade() -> None: sa.Column('id', sa.Integer(), nullable=False), sa.Column('document_id', sa.String(length=24), nullable=False), sa.Column('index_type', sa.Enum('VECTOR', 'FULLTEXT', 'GRAPH', name='documentindextype'), nullable=False), - sa.Column('desired_state', sa.Enum('present', 'absent', name='indexdesiredstate'), nullable=False), + sa.Column('status', sa.Enum('PENDING', 'CREATING', 'ACTIVE', 'DELETING', 'DELETION_IN_PROGRESS', 'FAILED', name='documentindexstatus'), nullable=False), sa.Column('version', sa.Integer(), nullable=False), - sa.Column('created_by', sa.String(length=256), nullable=False), - sa.Column('actual_state', sa.Enum('absent', 'creating', 'present', 'deleting', 'failed', name='indexactualstate'), nullable=False), sa.Column('observed_version', sa.Integer(), nullable=False), + sa.Column('created_by', sa.String(length=256), nullable=False), sa.Column('index_data', sa.Text(), nullable=True), sa.Column('error_message', sa.Text(), nullable=True), sa.Column('gmt_created', sa.DateTime(timezone=True), nullable=False), @@ -169,11 +168,10 @@ def upgrade() -> None: sa.PrimaryKeyConstraint('id'), sa.UniqueConstraint('document_id', 'index_type', name='uq_document_index') ) - op.create_index(op.f('ix_document_index_actual_state'), 'document_index', ['actual_state'], unique=False) - 
op.create_index(op.f('ix_document_index_desired_state'), 'document_index', ['desired_state'], unique=False) op.create_index(op.f('ix_document_index_document_id'), 'document_index', ['document_id'], unique=False) op.create_index(op.f('ix_document_index_id'), 'document_index', ['id'], unique=False) op.create_index(op.f('ix_document_index_index_type'), 'document_index', ['index_type'], unique=False) + op.create_index(op.f('ix_document_index_status'), 'document_index', ['status'], unique=False) op.create_table('invitation', sa.Column('id', sa.String(length=24), nullable=False), sa.Column('email', sa.String(length=254), nullable=False), @@ -397,11 +395,10 @@ def downgrade() -> None: op.drop_table('lightrag_doc_full') op.drop_table('lightrag_doc_chunks') op.drop_table('invitation') + op.drop_index(op.f('ix_document_index_status'), table_name='document_index') op.drop_index(op.f('ix_document_index_index_type'), table_name='document_index') op.drop_index(op.f('ix_document_index_id'), table_name='document_index') op.drop_index(op.f('ix_document_index_document_id'), table_name='document_index') - op.drop_index(op.f('ix_document_index_desired_state'), table_name='document_index') - op.drop_index(op.f('ix_document_index_actual_state'), table_name='document_index') op.drop_table('document_index') op.drop_index(op.f('ix_document_user'), table_name='document') op.drop_index(op.f('ix_document_status'), table_name='document') diff --git a/aperag/migration/versions/20250624093016-dc0829e062eb.py b/aperag/migration/versions/20250624132601-66b96592c84a.py similarity index 76% rename from aperag/migration/versions/20250624093016-dc0829e062eb.py rename to aperag/migration/versions/20250624132601-66b96592c84a.py index 3effcb939..0c6c1c551 100644 --- a/aperag/migration/versions/20250624093016-dc0829e062eb.py +++ b/aperag/migration/versions/20250624132601-66b96592c84a.py @@ -1,8 +1,8 @@ -"""Initialize model configurations data +"""empty message -Revision ID: dc0829e062eb -Revises: 
eb8aa708478f -Create Date: 2025-06-24 09:30:16.549135 +Revision ID: 66b96592c84a +Revises: 850b2c5dc08f +Create Date: 2025-06-24 13:26:01.031627 """ from typing import Sequence, Union @@ -12,13 +12,13 @@ from aperag.migration.utils import execute_sql_file + # revision identifiers, used by Alembic. -revision: str = 'dc0829e062eb' -down_revision: Union[str, None] = 'eb8aa708478f' +revision: str = '66b96592c84a' +down_revision: Union[str, None] = '850b2c5dc08f' branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None - def upgrade() -> None: """Initialize model configurations data.""" # Execute model configurations initialization SQL diff --git a/aperag/schema/view_models.py b/aperag/schema/view_models.py index 44bb889e6..f802f1ee1 100644 --- a/aperag/schema/view_models.py +++ b/aperag/schema/view_models.py @@ -1,20 +1,6 @@ -# Copyright 2025 ApeCloud, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- # generated by datamodel-codegen: # filename: openapi.merged.yaml -# timestamp: 2025-06-24T01:35:54+00:00 +# timestamp: 2025-06-24T06:45:59+00:00 from __future__ import annotations @@ -548,18 +534,40 @@ class Document(BaseModel): id: Optional[str] = None name: Optional[str] = None status: Optional[ - Literal[ - 'PENDING', 'RUNNING', 'COMPLETE', 'FAILED', 'DELETING', 'DELETED', 'WARNING' - ] + Literal['PENDING', 'RUNNING', 'COMPLETE', 'FAILED', 'DELETING', 'DELETED'] ] = None vector_index_status: Optional[ - Literal['PENDING', 'RUNNING', 'COMPLETE', 'FAILED', 'SKIPPED'] + Literal[ + 'PENDING', + 'CREATING', + 'ACTIVE', + 'DELETING', + 'DELETING_IN_PROGRESS', + 'FAILED', + 'SKIPPED', + ] ] = None fulltext_index_status: Optional[ - Literal['PENDING', 'RUNNING', 'COMPLETE', 'FAILED', 'SKIPPED'] + Literal[ + 'PENDING', + 'CREATING', + 'ACTIVE', + 'DELETING', + 'DELETING_IN_PROGRESS', + 'FAILED', + 'SKIPPED', + ] ] = None graph_index_status: Optional[ - Literal['PENDING', 'RUNNING', 'COMPLETE', 'FAILED', 'SKIPPED'] + Literal[ + 'PENDING', + 'CREATING', + 'ACTIVE', + 'DELETING', + 'DELETING_IN_PROGRESS', + 'FAILED', + 'SKIPPED', + ] ] = None vector_index_updated: Optional[datetime] = Field( None, description='Vector index last updated time' diff --git a/aperag/service/document_service.py b/aperag/service/document_service.py index 0f03edd7a..a7370c902 100644 --- a/aperag/service/document_service.py +++ b/aperag/service/document_service.py @@ -80,64 +80,44 @@ def __init__(self, session: AsyncSession = None): async def build_document_response( self, document: db_models.Document, session: AsyncSession ) -> view_models.Document: - """Build Document response object for API return.""" - # Get index status from new tables - index_status_info = await document_index_manager.get_document_index_status(session, document.id) - - # Convert new format to old API format for backward compatibility - indexes = index_status_info.get("indexes", {}) - - # Map new states to old enum values 
for API compatibility - def map_state_to_old_enum(actual_state: str): - if actual_state == "absent": - return "SKIPPED" - elif actual_state == "creating": - return "RUNNING" - elif actual_state == "present": - return "COMPLETE" - elif actual_state == "failed": - return "FAILED" - else: - return "PENDING" - - # Get individual index update times from DocumentIndex table + """Build Document response object for API return using new status model.""" from sqlalchemy import select from aperag.db.models import DocumentIndex, DocumentIndexType - - vector_updated = None - fulltext_updated = None - graph_updated = None - - # Query for each index type's update time - for index_type, var_name in [ - (DocumentIndexType.VECTOR, "vector_updated"), - (DocumentIndexType.FULLTEXT, "fulltext_updated"), - (DocumentIndexType.GRAPH, "graph_updated"), - ]: - stmt = select(DocumentIndex).where( - DocumentIndex.document_id == document.id, DocumentIndex.index_type == index_type + # Get all document indexes for status calculation + document_indexes = await session.execute( + select(DocumentIndex).where( + DocumentIndex.document_id == document.id, + DocumentIndex.status != db_models.DocumentIndexStatus.DELETING, + DocumentIndex.status != db_models.DocumentIndexStatus.DELETION_IN_PROGRESS, ) - result = await session.execute(stmt) - index_record = result.scalar_one_or_none() - if index_record: - if var_name == "vector_updated": - vector_updated = index_record.gmt_updated - elif var_name == "fulltext_updated": - fulltext_updated = index_record.gmt_updated - elif var_name == "graph_updated": - graph_updated = index_record.gmt_updated - - return Document( + ) + indexes = document_indexes.scalars().all() + + # Map index states to API response format + index_status = {} + index_updated = {} + + # Initialize all types as SKIPPED (when no record exists) + all_types = [db_models.DocumentIndexType.VECTOR, db_models.DocumentIndexType.FULLTEXT, db_models.DocumentIndexType.GRAPH] + for index_type in 
all_types: + index_status[index_type] = "SKIPPED" + + # Update with actual states from database + for index in indexes: + index_status[index.index_type] = index.status + index_updated[index.index_type] = index.gmt_updated + + return view_models.Document( id=document.id, name=document.name, status=document.status, - vector_index_status=map_state_to_old_enum(indexes.get("VECTOR", {}).get("actual_state", "absent")), - fulltext_index_status=map_state_to_old_enum(indexes.get("FULLTEXT", {}).get("actual_state", "absent")), - graph_index_status=map_state_to_old_enum(indexes.get("GRAPH", {}).get("actual_state", "absent")), - vector_index_updated=vector_updated, - fulltext_index_updated=fulltext_updated, - graph_index_updated=graph_updated, + vector_index_status=index_status.get(db_models.DocumentIndexType.VECTOR, "SKIPPED"), + fulltext_index_status=index_status.get(db_models.DocumentIndexType.FULLTEXT, "SKIPPED"), + graph_index_status=index_status.get(db_models.DocumentIndexType.GRAPH, "SKIPPED"), + vector_index_updated=index_updated.get(db_models.DocumentIndexType.VECTOR, None), + fulltext_index_updated=index_updated.get(db_models.DocumentIndexType.FULLTEXT, None), + graph_index_updated=index_updated.get(db_models.DocumentIndexType.GRAPH, None), size=document.size, created=document.gmt_created, updated=document.gmt_updated, @@ -226,8 +206,12 @@ async def _create_documents_atomically(session): if collection_config.get("enable_knowledge_graph", False): index_types.append(db_models.DocumentIndexType.GRAPH) + # Use index manager to create indexes with new status model await document_index_manager.create_document_indexes( - session, document_instance.id, user, index_types + document_id=document_instance.id, + index_types=index_types, + user=user, + session=session ) # Build response object diff --git a/aperag/tasks/scheduler.py b/aperag/tasks/scheduler.py index a2272a5ea..247b35424 100644 --- a/aperag/tasks/scheduler.py +++ b/aperag/tasks/scheduler.py @@ -35,13 +35,14 @@ class 
TaskScheduler(ABC): """Abstract base class for task schedulers""" @abstractmethod - def schedule_create_index(self, document_id: str, index_types: List[str], **kwargs) -> str: + def schedule_create_index(self, document_id: str, index_types: List[str], context: dict = None, **kwargs) -> str: """ - Schedule single index creation task (legacy support) + Schedule single index creation task Args: document_id: Document ID to process index_types: List of index types (vector, fulltext, graph) + context: Task context including version info **kwargs: Additional arguments Returns: @@ -50,13 +51,14 @@ def schedule_create_index(self, document_id: str, index_types: List[str], **kwar pass @abstractmethod - def schedule_update_index(self, document_id: str, index_types: List[str], **kwargs) -> str: + def schedule_update_index(self, document_id: str, index_types: List[str], context: dict = None, **kwargs) -> str: """ - Schedule single index update task (legacy support) + Schedule single index update task Args: document_id: Document ID to process index_types: List of index types (vector, fulltext, graph) + context: Task context including version info **kwargs: Additional arguments Returns: @@ -65,13 +67,14 @@ def schedule_update_index(self, document_id: str, index_types: List[str], **kwar pass @abstractmethod - def schedule_delete_index(self, document_id: str, index_types: List[str], **kwargs) -> str: + def schedule_delete_index(self, document_id: str, index_types: List[str], context: dict = None, **kwargs) -> str: """ - Schedule single index deletion task (legacy support) + Schedule single index deletion task Args: document_id: Document ID to process index_types: List of index types (vector, fulltext, graph) + context: Task context including version info **kwargs: Additional arguments Returns: @@ -94,9 +97,7 @@ def get_task_status(self, task_id: str) -> Optional[TaskResult]: def create_task_scheduler(scheduler_type: str): - if scheduler_type == "local": - return LocalTaskScheduler() 
- elif scheduler_type == "celery": + if scheduler_type == "celery": return CeleryTaskScheduler() elif scheduler_type == "prefect": return PrefectTaskScheduler() @@ -104,145 +105,16 @@ def create_task_scheduler(scheduler_type: str): raise Exception("unknown task scheduler type: %s" % scheduler_type) -class LocalTaskScheduler(TaskScheduler): - """Local synchronous implementation for testing or single-machine deployments""" - - def __init__(self): - self._task_counter = 0 - self._results = {} - - def _execute_task(self, task_func, *args, **kwargs) -> str: - """Execute task synchronously and store result""" - self._task_counter += 1 - task_id = f"local_task_{self._task_counter}" - - try: - result = task_func(*args, **kwargs) - self._results[task_id] = TaskResult(task_id, success=True, data=result) - except Exception as e: - self._results[task_id] = TaskResult(task_id, success=False, error=str(e)) - - return task_id - - def schedule_create_index(self, document_id: str, index_types: List[str], **kwargs) -> str: - """Schedule index creation task""" - from aperag.index.fulltext_index import fulltext_indexer - from aperag.index.graph_index import graph_indexer - from aperag.index.vector_index import vector_indexer - from aperag.tasks.utils import get_document_and_collection - - def batch_process(): - # Parse document once - document, collection = get_document_and_collection(document_id) - content, doc_parts, local_doc = parse_document_content(document, collection) - file_path = local_doc.path - - results = {} - - try: - # Process each requested index type - for index_type in index_types: - if index_type == "VECTOR": - try: - result = vector_indexer.create_index( - document_id=document_id, - content=content, - doc_parts=doc_parts, - collection=collection, - file_path=file_path, - ) - results["VECTOR"] = {"success": result.success, "data": result.data} - except Exception as e: - results["VECTOR"] = {"success": False, "error": str(e)} - - elif index_type == "FULLTEXT": - try: 
- result = fulltext_indexer.create_index( - document_id=document_id, - content=content, - doc_parts=doc_parts, - collection=collection, - file_path=file_path, - ) - results["FULLTEXT"] = {"success": result.success, "data": result.data} - except Exception as e: - results["FULLTEXT"] = {"success": False, "error": str(e)} - - elif index_type == "GRAPH": - if graph_indexer.is_enabled(collection): - try: - from aperag.graph.lightrag_manager import process_document_for_celery - - result = process_document_for_celery( - collection=collection, content=content, doc_id=document_id, file_path=file_path - ) - results["GRAPH"] = {"success": True, "data": result} - except Exception as e: - results["GRAPH"] = {"success": False, "error": str(e)} - else: - results["GRAPH"] = {"success": True, "data": None, "message": "Graph indexing disabled"} - - finally: - # Cleanup local document - cleanup_local_document(local_doc, collection) - - return results - - return self._execute_task(batch_process) - - def schedule_update_index(self, document_id: str, index_types: List[str], **kwargs) -> str: - """Schedule index update task""" - # For local scheduler, treat update same as create - return self.schedule_create_index(document_id, index_types, **kwargs) - - def schedule_delete_index(self, document_id: str, index_types: List[str], **kwargs) -> str: - """Schedule index deletion task""" - from aperag.index.fulltext_index import fulltext_indexer - from aperag.index.graph_index import graph_indexer - from aperag.index.vector_index import vector_indexer - from aperag.tasks.utils import get_document_and_collection - - def delete_single_index(): - document, collection = get_document_and_collection(document_id) - - for index_type in index_types: - if index_type == "VECTOR": - result = vector_indexer.delete_index(document_id, collection) - if not result.success: - raise Exception(result.error) - elif index_type == "FULLTEXT": - result = fulltext_indexer.delete_index(document_id, collection) - if not 
result.success: - raise Exception(result.error) - elif index_type == "GRAPH": - if graph_indexer.is_enabled(collection): - from aperag.graph.lightrag_manager import delete_document_for_celery - - result = delete_document_for_celery(collection=collection, doc_id=document_id) - if result.get("status") != "success": - raise Exception(result.get("message", "Unknown error")) - else: - raise ValueError(f"Unknown index type: {index_type}") - - return f"Deleted {index_type} index for document {document_id}" - - return self._execute_task(delete_single_index) - - def get_task_status(self, task_id: str) -> Optional[TaskResult]: - """Get local task status""" - return self._results.get(task_id) - - class CeleryTaskScheduler(TaskScheduler): """Celery implementation of TaskScheduler - Direct workflow execution""" - def schedule_create_index(self, document_id: str, index_types: List[str], **kwargs) -> str: + def schedule_create_index(self, document_id: str, index_types: List[str], context: dict = None, **kwargs) -> str: """Schedule index creation workflow""" from config.celery_tasks import create_document_indexes_workflow try: # Execute workflow and return AsyncResult ID (not calling .get()) - workflow_result = create_document_indexes_workflow(document_id, index_types) + workflow_result = create_document_indexes_workflow(document_id, index_types, context) workflow_id = workflow_result.id # Use .id instead of .get('workflow_id') logger.debug( f"Scheduled create indexes workflow {workflow_id} for document {document_id} with types {index_types}" @@ -252,13 +124,13 @@ def schedule_create_index(self, document_id: str, index_types: List[str], **kwar logger.error(f"Failed to schedule create indexes workflow for document {document_id}: {str(e)}") raise - def schedule_update_index(self, document_id: str, index_types: List[str], **kwargs) -> str: + def schedule_update_index(self, document_id: str, index_types: List[str], context: dict = None, **kwargs) -> str: """Schedule index update 
workflow""" from config.celery_tasks import update_document_indexes_workflow try: # Execute workflow and return AsyncResult ID (not calling .get()) - workflow_result = update_document_indexes_workflow(document_id, index_types) + workflow_result = update_document_indexes_workflow(document_id, index_types, context) workflow_id = workflow_result.id # Use .id instead of .get('workflow_id') logger.debug( f"Scheduled update indexes workflow {workflow_id} for document {document_id} with types {index_types}" @@ -273,9 +145,9 @@ def schedule_delete_index(self, document_id: str, index_types: List[str], **kwar from config.celery_tasks import delete_document_indexes_workflow try: - # Execute workflow and return AsyncResult ID (not calling .get()) + # Execute workflow and return AsyncResult ID workflow_result = delete_document_indexes_workflow(document_id, index_types) - workflow_id = workflow_result.id # Use .id instead of .get('workflow_id') + workflow_id = workflow_result.id logger.debug( f"Scheduled delete indexes workflow {workflow_id} for document {document_id} with types {index_types}" ) @@ -314,15 +186,15 @@ def get_task_status(self, task_id: str) -> Optional[TaskResult]: class PrefectTaskScheduler(TaskScheduler): """Prefect implementation of TaskScheduler - Direct workflow execution""" - def schedule_create_index(self, document_id: str, index_types: List[str], **kwargs) -> str: + def schedule_create_index(self, document_id: str, index_types: List[str], context: dict = None, **kwargs) -> str: """Schedule index creation workflow""" raise NotImplementedError("Prefect task scheduler is not implemented") - def schedule_update_index(self, document_id: str, index_types: List[str], **kwargs) -> str: + def schedule_update_index(self, document_id: str, index_types: List[str], context: dict = None, **kwargs) -> str: """Schedule index update workflow""" raise NotImplementedError("Prefect task scheduler is not implemented") - def schedule_delete_index(self, document_id: str, 
index_types: List[str], **kwargs) -> str: + def schedule_delete_index(self, document_id: str, index_types: List[str], context: dict = None, **kwargs) -> str: """Schedule index deletion workflow""" raise NotImplementedError("Prefect task scheduler is not implemented") diff --git a/config/celery_tasks.py b/config/celery_tasks.py index d99f752e5..ed039298d 100644 --- a/config/celery_tasks.py +++ b/config/celery_tasks.py @@ -128,11 +128,20 @@ def _handle_index_success(self, document_id: str, index_type: str, index_data: d try: from aperag.index.reconciler import index_task_callbacks index_data_json = json.dumps(index_data) if index_data else None - index_task_callbacks.on_index_created(document_id, index_type, index_data_json) + index_task_callbacks.on_index_created(document_id, index_type, 1, index_data_json) # Default version 1 for backward compatibility logger.info(f"Index success callback executed for {index_type} index of document {document_id}") except Exception as e: logger.warning(f"Failed to execute index success callback for {index_type} of {document_id}: {e}", exc_info=True) + def _handle_index_success_with_version(self, document_id: str, index_type: str, target_version: int, index_data: dict = None): + try: + from aperag.index.reconciler import index_task_callbacks + index_data_json = json.dumps(index_data) if index_data else None + index_task_callbacks.on_index_created(document_id, index_type, target_version, index_data_json) + logger.info(f"Index success callback executed for {index_type} index of document {document_id} (v{target_version})") + except Exception as e: + logger.warning(f"Failed to execute index success callback for {index_type} of {document_id} v{target_version}: {e}", exc_info=True) + def _handle_index_deletion_success(self, document_id: str, index_type: str): try: from aperag.index.reconciler import index_task_callbacks @@ -176,20 +185,55 @@ def parse_document_task(self, document_id: str) -> dict: @current_app.task(bind=True, 
base=BaseIndexTask, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60}) -def create_index_task(self, document_id: str, index_type: str, parsed_data_dict: dict) -> dict: +def create_index_task(self, document_id: str, index_type: str, parsed_data_dict: dict, context: dict = None) -> dict: """ - Create a single index for a document + Create a single index for a document with distributed locking Args: document_id: Document ID to process index_type: Type of index to create ('vector', 'fulltext', 'graph') parsed_data_dict: Serialized ParsedDocumentData from parse_document_task + context: Task context including index version Returns: Serialized IndexTaskResult """ + from aperag.db.models import DocumentIndex, DocumentIndexType, DocumentIndexStatus + from aperag.config import get_sync_session + from sqlalchemy import select, and_ + + # Extract target version from context + context = context or {} + target_version = context.get(f'{index_type}_version') + try: - logger.info(f"Starting to create {index_type} index for document {document_id}") + logger.info(f"Starting to create {index_type} index for document {document_id} (v{target_version})") + + # Double-check: verify task is still valid + for session in get_sync_session(): + stmt = select(DocumentIndex).where( + and_( + DocumentIndex.document_id == document_id, + DocumentIndex.index_type == DocumentIndexType(index_type) + ) + ) + result = session.execute(stmt) + db_index = result.scalar_one_or_none() + + # Validate task is still relevant + if not db_index: + logger.info(f"Index record not found for {document_id}:{index_type}, skipping task") + return {"status": "skipped", "reason": "index_record_not_found"} + + if db_index.status != DocumentIndexStatus.CREATING: + logger.info(f"Index status changed for {document_id}:{index_type}, current: {db_index.status}, skipping task") + return {"status": "skipped", "reason": f"status_changed_to_{db_index.status}"} + + if target_version and db_index.version != 
target_version: + logger.info(f"Version mismatch for {document_id}:{index_type}, expected: {target_version}, current: {db_index.version}, skipping task") + return {"status": "skipped", "reason": f"version_mismatch_expected_{target_version}_current_{db_index.version}"} + + break # Convert dict back to structured data parsed_data = ParsedDocumentData.from_dict(parsed_data_dict) @@ -203,9 +247,9 @@ def create_index_task(self, document_id: str, index_type: str, parsed_data_dict: logger.error(error_msg) raise Exception(error_msg) - # Handle success callback - logger.info(f"Successfully created {index_type} index for document {document_id}") - self._handle_index_success(document_id, index_type, result.data) + # Handle success callback with version validation + logger.info(f"Successfully created {index_type} index for document {document_id} (v{target_version})") + self._handle_index_success_with_version(document_id, index_type, target_version, result.data) return result.to_dict() @@ -232,9 +276,35 @@ def delete_index_task(self, document_id: str, index_type: str) -> dict: Returns: Serialized IndexTaskResult """ + from aperag.db.models import DocumentIndex, DocumentIndexType, DocumentIndexStatus + from aperag.config import get_sync_session + from sqlalchemy import select, and_ + try: logger.info(f"Starting to delete {index_type} index for document {document_id}") + # Double-check: verify task is still valid + for session in get_sync_session(): + stmt = select(DocumentIndex).where( + and_( + DocumentIndex.document_id == document_id, + DocumentIndex.index_type == DocumentIndexType(index_type) + ) + ) + result = session.execute(stmt) + db_index = result.scalar_one_or_none() + + # Validate task is still relevant + if not db_index: + logger.info(f"Index record not found for {document_id}:{index_type}, already deleted") + return {"status": "skipped", "reason": "index_record_not_found"} + + if db_index.status != DocumentIndexStatus.DELETION_IN_PROGRESS: + logger.info(f"Index 
status changed for {document_id}:{index_type}, current: {db_index.status}, skipping task") + return {"status": "skipped", "reason": f"status_changed_to_{db_index.status}"} + + break + # Execute index deletion result = document_index_task.delete_index(document_id, index_type) @@ -262,20 +332,55 @@ def delete_index_task(self, document_id: str, index_type: str) -> dict: @current_app.task(bind=True, base=BaseIndexTask, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60}) -def update_index_task(self, document_id: str, index_type: str, parsed_data_dict: dict) -> dict: +def update_index_task(self, document_id: str, index_type: str, parsed_data_dict: dict, context: dict = None) -> dict: """ - Update a single index for a document + Update a single index for a document with distributed locking Args: document_id: Document ID to process index_type: Type of index to update ('vector', 'fulltext', 'graph') parsed_data_dict: Serialized ParsedDocumentData from parse_document_task + context: Task context including index version Returns: Serialized IndexTaskResult """ + from aperag.db.models import DocumentIndex, DocumentIndexType, DocumentIndexStatus + from aperag.config import get_sync_session + from sqlalchemy import select, and_ + + # Extract target version from context + context = context or {} + target_version = context.get(f'{index_type}_version') + try: - logger.info(f"Starting to update {index_type} index for document {document_id}") + logger.info(f"Starting to update {index_type} index for document {document_id} (v{target_version})") + + # Double-check: verify task is still valid + for session in get_sync_session(): + stmt = select(DocumentIndex).where( + and_( + DocumentIndex.document_id == document_id, + DocumentIndex.index_type == DocumentIndexType(index_type) + ) + ) + result = session.execute(stmt) + db_index = result.scalar_one_or_none() + + # Validate task is still relevant + if not db_index: + logger.info(f"Index record not found for 
{document_id}:{index_type}, skipping task") + return {"status": "skipped", "reason": "index_record_not_found"} + + if db_index.status != DocumentIndexStatus.CREATING: + logger.info(f"Index status changed for {document_id}:{index_type}, current: {db_index.status}, skipping task") + return {"status": "skipped", "reason": f"status_changed_to_{db_index.status}"} + + if target_version and db_index.version != target_version: + logger.info(f"Version mismatch for {document_id}:{index_type}, expected: {target_version}, current: {db_index.version}, skipping task") + return {"status": "skipped", "reason": f"version_mismatch_expected_{target_version}_current_{db_index.version}"} + + break # Convert dict back to structured data parsed_data = ParsedDocumentData.from_dict(parsed_data_dict) @@ -289,9 +394,9 @@ def update_index_task(self, document_id: str, index_type: str, parsed_data_dict: logger.error(error_msg) raise Exception(error_msg) - # Handle success callback - logger.info(f"Successfully updated {index_type} index for document {document_id}") - self._handle_index_success(document_id, index_type, result.data) + # Handle success callback with version validation + logger.info(f"Successfully updated {index_type} index for document {document_id} (v{target_version})") + self._handle_index_success_with_version(document_id, index_type, target_version, result.data) return result.to_dict() @@ -309,7 +414,7 @@ def update_index_task(self, document_id: str, index_type: str, parsed_data_dict: # ========== Dynamic Workflow Orchestration Tasks ========== @current_app.task(bind=True) -def trigger_create_indexes_workflow(self, parsed_data_dict: dict, document_id: str, index_types: List[str]) -> Any: +def trigger_create_indexes_workflow(self, parsed_data_dict: dict, document_id: str, index_types: List[str], context: dict = None) -> Any: """ Dynamic orchestration task for index creation workflow. 
@@ -329,7 +434,7 @@ def trigger_create_indexes_workflow(self, parsed_data_dict: dict, document_id: s # Dynamically create parallel index creation tasks parallel_index_tasks = group([ - create_index_task.s(document_id, index_type, parsed_data_dict) + create_index_task.s(document_id, index_type, parsed_data_dict, context) for index_type in index_types ]) @@ -387,7 +492,7 @@ def trigger_delete_indexes_workflow(self, document_id: str, index_types: List[st @current_app.task(bind=True) -def trigger_update_indexes_workflow(self, parsed_data_dict: dict, document_id: str, index_types: List[str]) -> Any: +def trigger_update_indexes_workflow(self, parsed_data_dict: dict, document_id: str, index_types: List[str], context: dict = None) -> Any: """ Dynamic orchestration task for index update workflow. @@ -404,7 +509,7 @@ def trigger_update_indexes_workflow(self, parsed_data_dict: dict, document_id: s # Create parallel index update tasks parallel_update_tasks = group([ - update_index_task.s(document_id, index_type, parsed_data_dict) + update_index_task.s(document_id, index_type, parsed_data_dict, context) for index_type in index_types ]) @@ -510,7 +615,7 @@ def notify_workflow_complete(self, index_results: List[dict], document_id: str, # ========== Workflow Entry Point Functions ========== -def create_document_indexes_workflow(document_id: str, index_types: List[str]): +def create_document_indexes_workflow(document_id: str, index_types: List[str], context: dict = None): """ Create indexes for a document using dynamic workflow orchestration. 
@@ -530,7 +635,7 @@ def create_document_indexes_workflow(document_id: str, index_types: List[str]): # Create the workflow chain: parse -> dynamic trigger workflow_chain = chain( parse_document_task.s(document_id), - trigger_create_indexes_workflow.s(document_id, index_types) + trigger_create_indexes_workflow.s(document_id, index_types, context) ) # Submit the workflow @@ -560,7 +665,7 @@ def delete_document_indexes_workflow(document_id: str, index_types: List[str]): return workflow_result -def update_document_indexes_workflow(document_id: str, index_types: List[str]): +def update_document_indexes_workflow(document_id: str, index_types: List[str], context: dict = None): """ Update indexes for a document using dynamic workflow orchestration. @@ -581,7 +686,7 @@ def update_document_indexes_workflow(document_id: str, index_types: List[str]): # Create the workflow chain: parse -> dynamic trigger workflow_chain = chain( parse_document_task.s(document_id), - trigger_update_indexes_workflow.s(document_id, index_types) + trigger_update_indexes_workflow.s(document_id, index_types, context) ) # Submit the workflow diff --git a/tests/e2e_test/conftest.py b/tests/e2e_test/conftest.py index bb3e465f1..d623a0a36 100644 --- a/tests/e2e_test/conftest.py +++ b/tests/e2e_test/conftest.py @@ -122,7 +122,7 @@ def document(client, collection): get_resp = client.get(f"/api/v1/collections/{collection['id']}/documents/{doc_id}") assert get_resp.status_code == HTTPStatus.OK, get_resp.text data = get_resp.json() - if data.get("vector_index_status") == "COMPLETE" and data.get("fulltext_index_status") == "COMPLETE": + if data.get("vector_index_status") == "ACTIVE" and data.get("fulltext_index_status") == "ACTIVE": break time.sleep(interval) else: diff --git a/tests/e2e_test/test_document.py b/tests/e2e_test/test_document.py index d5921f4bc..32e8bdcfe 100644 --- a/tests/e2e_test/test_document.py +++ b/tests/e2e_test/test_document.py @@ -138,14 +138,14 @@ def 
test_rebuild_single_index_type(client, document, collection): collection_id = collection["id"] # Test rebuilding vector index - rebuild_request = {"index_types": ["vector"]} + rebuild_request = {"index_types": ["VECTOR"]} response = client.post( f"/api/v1/collections/{collection_id}/documents/{doc_id}/rebuild_indexes", json=rebuild_request ) assert response.status_code == HTTPStatus.OK, response.text data = response.json() assert data["code"] == "200" - assert "vector" in data["message"] + assert "VECTOR" in data["message"] def test_rebuild_all_index_types(client, document, collection): @@ -154,16 +154,16 @@ def test_rebuild_all_index_types(client, document, collection): collection_id = collection["id"] # Test rebuilding all index types - rebuild_request = {"index_types": ["vector", "fulltext", "graph"]} + rebuild_request = {"index_types": ["VECTOR", "FULLTEXT", "GRAPH"]} response = client.post( f"/api/v1/collections/{collection_id}/documents/{doc_id}/rebuild_indexes", json=rebuild_request ) assert response.status_code == HTTPStatus.OK, response.text data = response.json() assert data["code"] == "200" - assert "vector" in data["message"] - assert "fulltext" in data["message"] - assert "graph" in data["message"] + assert "VECTOR" in data["message"] + assert "FULLTEXT" in data["message"] + assert "GRAPH" in data["message"] def test_rebuild_index_invalid_index_type(client, document, collection): @@ -192,7 +192,7 @@ def test_rebuild_index_empty_index_types(client, document, collection): assert response.status_code == HTTPStatus.UNPROCESSABLE_ENTITY, response.text -@pytest.mark.parametrize("index_type", ["vector", "fulltext", "graph"]) +@pytest.mark.parametrize("index_type", ["VECTOR", "FULLTEXT", "GRAPH"]) def test_rebuild_individual_index_types(client, document, collection, index_type): """Test rebuilding each individual index type""" doc_id = document["id"] @@ -205,7 +205,7 @@ def test_rebuild_individual_index_types(client, document, collection, index_type assert 
response.status_code == HTTPStatus.OK, response.text data = response.json() assert data["code"] == "200" - assert index_type in data["message"] + assert index_type.upper() in data["message"] def test_rebuild_index_duplicate_types(client, document, collection): @@ -214,7 +214,7 @@ def test_rebuild_index_duplicate_types(client, document, collection): collection_id = collection["id"] # Test with duplicate index types - rebuild_request = {"index_types": ["vector", "vector", "fulltext"]} + rebuild_request = {"index_types": ["VECTOR", "VECTOR", "FULLTEXT"]} response = client.post( f"/api/v1/collections/{collection_id}/documents/{doc_id}/rebuild_indexes", json=rebuild_request ) @@ -228,7 +228,7 @@ def test_rebuild_index_case_sensitivity(client, document, collection): collection_id = collection["id"] # Test with uppercase index type (should fail) - rebuild_request = {"index_types": ["VECTOR"]} + rebuild_request = {"index_types": ["vector"]} response = client.post( f"/api/v1/collections/{collection_id}/documents/{doc_id}/rebuild_indexes", json=rebuild_request ) From 59e181a97cd2870491b6a1f25a26bb8eeaa38e48 Mon Sep 17 00:00:00 2001 From: Guo Ziang Date: Tue, 24 Jun 2025 15:46:31 +0800 Subject: [PATCH 2/6] chore: tidy up --- aperag/api/components/schemas/document.yaml | 10 -- aperag/api/paths/collections.yaml | 41 ------ aperag/db/models.py | 9 +- aperag/index/manager.py | 78 +---------- .../versions/20250624132425-850b2c5dc08f.py | 1 - aperag/schema/view_models.py | 8 +- aperag/service/document_service.py | 64 +-------- aperag/views/main.py | 12 -- config/celery_tasks.py | 15 +-- frontend/src/api/apis/default-api.ts | 125 ------------------ frontend/src/api/models/document-update.ts | 42 ------ frontend/src/api/models/document.ts | 21 +-- frontend/src/api/models/index.ts | 1 - frontend/src/api/openapi.merged.yaml | 69 ++-------- 14 files changed, 38 insertions(+), 458 deletions(-) delete mode 100644 frontend/src/api/models/document-update.ts diff --git 
a/aperag/api/components/schemas/document.yaml b/aperag/api/components/schemas/document.yaml index c84ecdb3c..9b7ee41f0 100644 --- a/aperag/api/components/schemas/document.yaml +++ b/aperag/api/components/schemas/document.yaml @@ -90,16 +90,6 @@ documentCreate: collection_id: type: string -documentUpdate: - type: object - properties: - title: - type: string - description: - type: string - source: - type: string - rebuildIndexesRequest: type: object properties: diff --git a/aperag/api/paths/collections.yaml b/aperag/api/paths/collections.yaml index 58e738af0..ee331f095 100644 --- a/aperag/api/paths/collections.yaml +++ b/aperag/api/paths/collections.yaml @@ -254,47 +254,6 @@ document: application/json: schema: $ref: '../components/schemas/common.yaml#/failResponse' - put: - summary: Update a document - description: Update a document - security: - - BearerAuth: [] - parameters: - - name: collection_id - in: path - required: true - schema: - type: string - - name: document_id - in: path - required: true - schema: - type: string - requestBody: - required: true - content: - application/json: - schema: - $ref: '../components/schemas/document.yaml#/documentUpdate' - responses: - '200': - description: Document updated successfully - content: - application/json: - schema: - $ref: '../components/schemas/document.yaml#/document' - '401': - description: Unauthorized - content: - application/json: - schema: - $ref: '../components/schemas/common.yaml#/failResponse' - '404': - description: Document not found - content: - application/json: - schema: - $ref: '../components/schemas/common.yaml#/failResponse' rebuild_indexes: post: diff --git a/aperag/db/models.py b/aperag/db/models.py index 383a56ec5..93d9fa73b 100644 --- a/aperag/db/models.py +++ b/aperag/db/models.py @@ -705,7 +705,6 @@ class DocumentIndex(Base): status = Column(EnumColumn(DocumentIndexStatus), nullable=False, default=DocumentIndexStatus.PENDING, index=True) version = Column(Integer, nullable=False, default=1) # 
Incremented on each spec change observed_version = Column(Integer, nullable=False, default=0) # Last processed spec version - created_by = Column(String(256), nullable=False) # User who created this spec # Index data and task tracking index_data = Column(Text, nullable=True) # JSON string for index-specific data @@ -719,14 +718,8 @@ class DocumentIndex(Base): def __repr__(self): return f"" - def is_out_of_sync(self) -> bool: - """Check if this index needs reconciliation""" - return self.observed_version < self.version - - def update_version(self, created_by: str = None): + def update_version(self): """Update the version to trigger reconciliation""" - if created_by is not None: - self.created_by = created_by self.version += 1 self.gmt_updated = utc_now() diff --git a/aperag/index/manager.py b/aperag/index/manager.py index 897f32075..c559e158c 100644 --- a/aperag/index/manager.py +++ b/aperag/index/manager.py @@ -25,19 +25,18 @@ class DocumentIndexManager: - """Manager for document index lifecycle using single status model""" + """Simple manager for document index specs (frontend chain)""" - async def create_document_indexes( - self, session: AsyncSession, document_id: str, user: str, index_types: Optional[List[DocumentIndexType]] = None + async def create_or_update_document_indexes( + self, session: AsyncSession, document_id: str, index_types: Optional[List[DocumentIndexType]] = None ): """ - Create index records for a document (called when document is created) + Create or update index records for a document (called when document is created or index isupdated) Args: session: Database session document_id: Document ID - user: User creating the document - index_types: List of index types to create (defaults to vector and fulltext) + index_types: List of index types to create (defaults to all) """ if index_types is None: index_types = [DocumentIndexType.VECTOR, DocumentIndexType.FULLTEXT, DocumentIndexType.GRAPH] @@ -53,7 +52,7 @@ async def create_document_indexes( if 
existing_index: # Update existing index to pending and increment version existing_index.status = DocumentIndexStatus.PENDING - existing_index.update_version(user) + existing_index.update_version() logger.debug(f"Updated index for {document_id}:{index_type.value} to version {existing_index.version}") else: # Create new index @@ -63,33 +62,10 @@ async def create_document_indexes( status=DocumentIndexStatus.PENDING, version=1, observed_version=0, - created_by=user, ) session.add(doc_index) logger.debug(f"Created new index for {document_id}:{index_type.value}") - async def update_document_indexes(self, session: AsyncSession, document_id: str, user: str = None): - """ - Update document indexes (called when document content is updated) - - This increments the version of all active indexes to trigger reconciliation. - - Args: - session: Database session - document_id: Document ID - user: User triggering the update (optional) - """ - stmt = select(DocumentIndex).where(DocumentIndex.document_id == document_id) - result = await session.execute(stmt) - indexes = result.scalars().all() - - for index in indexes: - # Only update active indexes, failed indexes can be manually rebuilt - if index.status in [DocumentIndexStatus.ACTIVE, DocumentIndexStatus.FAILED]: - index.status = DocumentIndexStatus.PENDING - index.update_version(user) - logger.debug(f"Updated index {document_id}:{index.index_type} to version {index.version}") - async def delete_document_indexes( self, session: AsyncSession, document_id: str, index_types: Optional[List[DocumentIndexType]] = None ): @@ -117,48 +93,6 @@ async def delete_document_indexes( doc_index.gmt_updated = utc_now() logger.debug(f"Marked index {document_id}:{index_type.value} for deletion") - async def rebuild_document_indexes( - self, session: AsyncSession, document_id: str, index_types: List[DocumentIndexType], user: str = None - ): - """ - Rebuild specified document indexes (called when user requests index rebuild) - - This increments the 
version of specified indexes to trigger reconciliation. - - Args: - session: Database session - document_id: Document ID - index_types: List of index types to rebuild - user: User triggering the rebuild (optional) - """ - if len(set(index_types)) != len(index_types): - raise ValueError("Duplicate index types are not allowed") - - for index_type in index_types: - stmt = select(DocumentIndex).where( - and_(DocumentIndex.document_id == document_id, DocumentIndex.index_type == index_type) - ) - result = await session.execute(stmt) - doc_index = result.scalar_one_or_none() - - if doc_index: - # Reset to pending and increment version to trigger rebuild - doc_index.status = DocumentIndexStatus.PENDING - doc_index.update_version(user) - doc_index.error_message = None # Clear any previous error - logger.info(f"Triggered rebuild for {index_type.value} index of document {document_id} (v{doc_index.version})") - else: - # Create new index if it doesn't exist - doc_index = DocumentIndex( - document_id=document_id, - index_type=index_type, - status=DocumentIndexStatus.PENDING, - version=1, - observed_version=0, - created_by=user or "system", - ) - session.add(doc_index) - logger.info(f"Created new {index_type.value} index for document {document_id}") # Global instance document_index_manager = DocumentIndexManager() diff --git a/aperag/migration/versions/20250624132425-850b2c5dc08f.py b/aperag/migration/versions/20250624132425-850b2c5dc08f.py index 04e6e36fd..29904415b 100644 --- a/aperag/migration/versions/20250624132425-850b2c5dc08f.py +++ b/aperag/migration/versions/20250624132425-850b2c5dc08f.py @@ -159,7 +159,6 @@ def upgrade() -> None: sa.Column('status', sa.Enum('PENDING', 'CREATING', 'ACTIVE', 'DELETING', 'DELETION_IN_PROGRESS', 'FAILED', name='documentindexstatus'), nullable=False), sa.Column('version', sa.Integer(), nullable=False), sa.Column('observed_version', sa.Integer(), nullable=False), - sa.Column('created_by', sa.String(length=256), nullable=False), 
sa.Column('index_data', sa.Text(), nullable=True), sa.Column('error_message', sa.Text(), nullable=True), sa.Column('gmt_created', sa.DateTime(timezone=True), nullable=False), diff --git a/aperag/schema/view_models.py b/aperag/schema/view_models.py index f802f1ee1..a1216e0a9 100644 --- a/aperag/schema/view_models.py +++ b/aperag/schema/view_models.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: openapi.merged.yaml -# timestamp: 2025-06-24T06:45:59+00:00 +# timestamp: 2025-06-24T07:36:45+00:00 from __future__ import annotations @@ -600,12 +600,6 @@ class DocumentCreate(BaseModel): collection_id: Optional[str] = None -class DocumentUpdate(BaseModel): - title: Optional[str] = None - description: Optional[str] = None - source: Optional[str] = None - - class RebuildIndexesRequest(BaseModel): index_types: list[Literal['VECTOR', 'FULLTEXT', 'GRAPH']] = Field( ..., description='Types of indexes to rebuild', min_items=1 diff --git a/aperag/service/document_service.py b/aperag/service/document_service.py index a7370c902..50bd2bd12 100644 --- a/aperag/service/document_service.py +++ b/aperag/service/document_service.py @@ -207,10 +207,9 @@ async def _create_documents_atomically(session): index_types.append(db_models.DocumentIndexType.GRAPH) # Use index manager to create indexes with new status model - await document_index_manager.create_document_indexes( + await document_index_manager.create_or_update_document_indexes( document_id=document_instance.id, index_types=index_types, - user=user, session=session ) @@ -251,65 +250,6 @@ async def get_document(self, user: str, collection_id: str, document_id: str) -> async for session in get_async_session(): return await self.build_document_response(document, session) - async def update_document( - self, user: str, collection_id: str, document_id: str, document_in: view_models.DocumentUpdate - ) -> view_models.Document: - instance = await self.db_ops.query_document(user, collection_id, document_id) - if instance is 
None: - raise DocumentNotFoundException(document_id) - - if document_in.config: - try: - config = json.loads(document_in.config) - metadata = json.loads(instance.doc_metadata or "{}") - metadata["labels"] = config["labels"] - updated_metadata = json.dumps(metadata) - - # Update document and indexes atomically in a single transaction - async def _update_document_atomically(session): - from sqlalchemy import select - - from aperag.db.models import Document, DocumentStatus - - # Update document metadata - stmt = select(Document).where( - Document.id == document_id, - Document.collection_id == collection_id, - Document.user == user, - Document.status != DocumentStatus.DELETED, - ) - result = await session.execute(stmt) - document = result.scalars().first() - - if not document: - raise DocumentNotFoundException(document_id) - - document.doc_metadata = updated_metadata - session.add(document) - await session.flush() - await session.refresh(document) - - # Update index specs to trigger re-indexing - await document_index_manager.update_document_indexes(session, document.id) - - # Build response object - return await self.build_document_response(document, session) - - result = await self.db_ops.execute_with_transaction(_update_document_atomically) - except json.JSONDecodeError: - raise invalid_param("config", "invalid document config") - else: - - async def _get_doc_response(session): - return await self.build_document_response(instance, session) - - result = await self.db_ops._execute_query(_get_doc_response) - - # Trigger index reconciliation after successful document update - _trigger_index_reconciliation() - - return result - async def delete_document(self, user: str, collection_id: str, document_id: str) -> Optional[view_models.Document]: """Delete document by ID (idempotent operation) @@ -467,7 +407,7 @@ async def _rebuild_document_indexes_atomically(session): raise ResourceNotFoundException(f"Collection {collection_id} not found or access denied") # Trigger index 
rebuild by incrementing version for selected index types - await document_index_manager.rebuild_document_indexes(session, document_id, index_type_enums) + await document_index_manager.create_or_update_document_indexes(session, document_id, index_type_enums) logger.info(f"Successfully triggered rebuild for document {document_id} indexes: {index_types}") diff --git a/aperag/views/main.py b/aperag/views/main.py index f74238af0..0f6566841 100644 --- a/aperag/views/main.py +++ b/aperag/views/main.py @@ -124,18 +124,6 @@ async def get_document_view( return await document_service.get_document(str(user.id), collection_id, document_id) -@router.put("/collections/{collection_id}/documents/{document_id}") -@audit(resource_type="document", api_name="UpdateDocument") -async def update_document_view( - request: Request, - collection_id: str, - document_id: str, - document: view_models.DocumentUpdate, - user: User = Depends(current_user), -) -> view_models.Document: - return await document_service.update_document(str(user.id), collection_id, document_id, document) - - @router.delete("/collections/{collection_id}/documents/{document_id}") @audit(resource_type="document", api_name="DeleteDocument") async def delete_document_view( diff --git a/config/celery_tasks.py b/config/celery_tasks.py index ed039298d..4fd158c7c 100644 --- a/config/celery_tasks.py +++ b/config/celery_tasks.py @@ -124,16 +124,7 @@ class BaseIndexTask(Task): abstract = True - def _handle_index_success(self, document_id: str, index_type: str, index_data: dict = None): - try: - from aperag.index.reconciler import index_task_callbacks - index_data_json = json.dumps(index_data) if index_data else None - index_task_callbacks.on_index_created(document_id, index_type, 1, index_data_json) # Default version 1 for backward compatibility - logger.info(f"Index success callback executed for {index_type} index of document {document_id}") - except Exception as e: - logger.warning(f"Failed to execute index success callback for 
{index_type} of {document_id}: {e}", exc_info=True) - - def _handle_index_success_with_version(self, document_id: str, index_type: str, target_version: int, index_data: dict = None): + def _handle_index_success(self, document_id: str, index_type: str, target_version: int, index_data: dict = None): try: from aperag.index.reconciler import index_task_callbacks index_data_json = json.dumps(index_data) if index_data else None @@ -249,7 +240,7 @@ def create_index_task(self, document_id: str, index_type: str, parsed_data_dict: # Handle success callback with version validation logger.info(f"Successfully created {index_type} index for document {document_id} (v{target_version})") - self._handle_index_success_with_version(document_id, index_type, target_version, result.data) + self._handle_index_success(document_id, index_type, target_version, result.data) return result.to_dict() @@ -396,7 +387,7 @@ def update_index_task(self, document_id: str, index_type: str, parsed_data_dict: # Handle success callback with version validation logger.info(f"Successfully updated {index_type} index for document {document_id} (v{target_version})") - self._handle_index_success_with_version(document_id, index_type, target_version, result.data) + self._handle_index_success(document_id, index_type, target_version, result.data) return result.to_dict() diff --git a/frontend/src/api/apis/default-api.ts b/frontend/src/api/apis/default-api.ts index d569bc7eb..c06ec3fa9 100644 --- a/frontend/src/api/apis/default-api.ts +++ b/frontend/src/api/apis/default-api.ts @@ -68,8 +68,6 @@ import type { DocumentCreate } from '../models'; // @ts-ignore import type { DocumentList } from '../models'; // @ts-ignore -import type { DocumentUpdate } from '../models'; -// @ts-ignore import type { FailResponse } from '../models'; // @ts-ignore import type { Feedback } from '../models'; @@ -1063,54 +1061,6 @@ export const DefaultApiAxiosParamCreator = function (configuration?: Configurati options: localVarRequestOptions, }; 
}, - /** - * Update a document - * @summary Update a document - * @param {string} collectionId - * @param {string} documentId - * @param {DocumentUpdate} documentUpdate - * @param {*} [options] Override http request option. - * @throws {RequiredError} - */ - collectionsCollectionIdDocumentsDocumentIdPut: async (collectionId: string, documentId: string, documentUpdate: DocumentUpdate, options: RawAxiosRequestConfig = {}): Promise => { - // verify required parameter 'collectionId' is not null or undefined - assertParamExists('collectionsCollectionIdDocumentsDocumentIdPut', 'collectionId', collectionId) - // verify required parameter 'documentId' is not null or undefined - assertParamExists('collectionsCollectionIdDocumentsDocumentIdPut', 'documentId', documentId) - // verify required parameter 'documentUpdate' is not null or undefined - assertParamExists('collectionsCollectionIdDocumentsDocumentIdPut', 'documentUpdate', documentUpdate) - const localVarPath = `/collections/{collection_id}/documents/{document_id}` - .replace(`{${"collection_id"}}`, encodeURIComponent(String(collectionId))) - .replace(`{${"document_id"}}`, encodeURIComponent(String(documentId))); - // use dummy base URL string because the URL constructor only accepts absolute URLs. - const localVarUrlObj = new URL(localVarPath, DUMMY_BASE_URL); - let baseOptions; - if (configuration) { - baseOptions = configuration.baseOptions; - } - - const localVarRequestOptions = { method: 'PUT', ...baseOptions, ...options}; - const localVarHeaderParameter = {} as any; - const localVarQueryParameter = {} as any; - - // authentication BearerAuth required - // http bearer authentication required - await setBearerAuthToObject(localVarHeaderParameter, configuration) - - - - localVarHeaderParameter['Content-Type'] = 'application/json'; - - setSearchParams(localVarUrlObj, localVarQueryParameter); - let headersFromBaseOptions = baseOptions && baseOptions.headers ? 
baseOptions.headers : {}; - localVarRequestOptions.headers = {...localVarHeaderParameter, ...headersFromBaseOptions, ...options.headers}; - localVarRequestOptions.data = serializeDataIfNeeded(documentUpdate, localVarRequestOptions, configuration) - - return { - url: toPathString(localVarUrlObj), - options: localVarRequestOptions, - }; - }, /** * Rebuild specified types of indexes for a document * @summary Rebuild document indexes @@ -2565,21 +2515,6 @@ export const DefaultApiFp = function(configuration?: Configuration) { const localVarOperationServerBasePath = operationServerMap['DefaultApi.collectionsCollectionIdDocumentsDocumentIdGet']?.[localVarOperationServerIndex]?.url; return (axios, basePath) => createRequestFunction(localVarAxiosArgs, globalAxios, BASE_PATH, configuration)(axios, localVarOperationServerBasePath || basePath); }, - /** - * Update a document - * @summary Update a document - * @param {string} collectionId - * @param {string} documentId - * @param {DocumentUpdate} documentUpdate - * @param {*} [options] Override http request option. - * @throws {RequiredError} - */ - async collectionsCollectionIdDocumentsDocumentIdPut(collectionId: string, documentId: string, documentUpdate: DocumentUpdate, options?: RawAxiosRequestConfig): Promise<(axios?: AxiosInstance, basePath?: string) => AxiosPromise> { - const localVarAxiosArgs = await localVarAxiosParamCreator.collectionsCollectionIdDocumentsDocumentIdPut(collectionId, documentId, documentUpdate, options); - const localVarOperationServerIndex = configuration?.serverIndex ?? 
0; - const localVarOperationServerBasePath = operationServerMap['DefaultApi.collectionsCollectionIdDocumentsDocumentIdPut']?.[localVarOperationServerIndex]?.url; - return (axios, basePath) => createRequestFunction(localVarAxiosArgs, globalAxios, BASE_PATH, configuration)(axios, localVarOperationServerBasePath || basePath); - }, /** * Rebuild specified types of indexes for a document * @summary Rebuild document indexes @@ -3218,16 +3153,6 @@ export const DefaultApiFactory = function (configuration?: Configuration, basePa collectionsCollectionIdDocumentsDocumentIdGet(requestParameters: DefaultApiCollectionsCollectionIdDocumentsDocumentIdGetRequest, options?: RawAxiosRequestConfig): AxiosPromise { return localVarFp.collectionsCollectionIdDocumentsDocumentIdGet(requestParameters.collectionId, requestParameters.documentId, options).then((request) => request(axios, basePath)); }, - /** - * Update a document - * @summary Update a document - * @param {DefaultApiCollectionsCollectionIdDocumentsDocumentIdPutRequest} requestParameters Request parameters. - * @param {*} [options] Override http request option. 
- * @throws {RequiredError} - */ - collectionsCollectionIdDocumentsDocumentIdPut(requestParameters: DefaultApiCollectionsCollectionIdDocumentsDocumentIdPutRequest, options?: RawAxiosRequestConfig): AxiosPromise { - return localVarFp.collectionsCollectionIdDocumentsDocumentIdPut(requestParameters.collectionId, requestParameters.documentId, requestParameters.documentUpdate, options).then((request) => request(axios, basePath)); - }, /** * Rebuild specified types of indexes for a document * @summary Rebuild document indexes @@ -3759,16 +3684,6 @@ export interface DefaultApiInterface { */ collectionsCollectionIdDocumentsDocumentIdGet(requestParameters: DefaultApiCollectionsCollectionIdDocumentsDocumentIdGetRequest, options?: RawAxiosRequestConfig): AxiosPromise; - /** - * Update a document - * @summary Update a document - * @param {DefaultApiCollectionsCollectionIdDocumentsDocumentIdPutRequest} requestParameters Request parameters. - * @param {*} [options] Override http request option. - * @throws {RequiredError} - * @memberof DefaultApiInterface - */ - collectionsCollectionIdDocumentsDocumentIdPut(requestParameters: DefaultApiCollectionsCollectionIdDocumentsDocumentIdPutRequest, options?: RawAxiosRequestConfig): AxiosPromise; - /** * Rebuild specified types of indexes for a document * @summary Rebuild document indexes @@ -4491,34 +4406,6 @@ export interface DefaultApiCollectionsCollectionIdDocumentsDocumentIdGetRequest readonly documentId: string } -/** - * Request parameters for collectionsCollectionIdDocumentsDocumentIdPut operation in DefaultApi. 
- * @export - * @interface DefaultApiCollectionsCollectionIdDocumentsDocumentIdPutRequest - */ -export interface DefaultApiCollectionsCollectionIdDocumentsDocumentIdPutRequest { - /** - * - * @type {string} - * @memberof DefaultApiCollectionsCollectionIdDocumentsDocumentIdPut - */ - readonly collectionId: string - - /** - * - * @type {string} - * @memberof DefaultApiCollectionsCollectionIdDocumentsDocumentIdPut - */ - readonly documentId: string - - /** - * - * @type {DocumentUpdate} - * @memberof DefaultApiCollectionsCollectionIdDocumentsDocumentIdPut - */ - readonly documentUpdate: DocumentUpdate -} - /** * Request parameters for collectionsCollectionIdDocumentsDocumentIdRebuildIndexesPost operation in DefaultApi. * @export @@ -5235,18 +5122,6 @@ export class DefaultApi extends BaseAPI implements DefaultApiInterface { return DefaultApiFp(this.configuration).collectionsCollectionIdDocumentsDocumentIdGet(requestParameters.collectionId, requestParameters.documentId, options).then((request) => request(this.axios, this.basePath)); } - /** - * Update a document - * @summary Update a document - * @param {DefaultApiCollectionsCollectionIdDocumentsDocumentIdPutRequest} requestParameters Request parameters. - * @param {*} [options] Override http request option. 
- * @throws {RequiredError} - * @memberof DefaultApi - */ - public collectionsCollectionIdDocumentsDocumentIdPut(requestParameters: DefaultApiCollectionsCollectionIdDocumentsDocumentIdPutRequest, options?: RawAxiosRequestConfig) { - return DefaultApiFp(this.configuration).collectionsCollectionIdDocumentsDocumentIdPut(requestParameters.collectionId, requestParameters.documentId, requestParameters.documentUpdate, options).then((request) => request(this.axios, this.basePath)); - } - /** * Rebuild specified types of indexes for a document * @summary Rebuild document indexes diff --git a/frontend/src/api/models/document-update.ts b/frontend/src/api/models/document-update.ts deleted file mode 100644 index aa52b3908..000000000 --- a/frontend/src/api/models/document-update.ts +++ /dev/null @@ -1,42 +0,0 @@ -/* tslint:disable */ -/* eslint-disable */ -/** - * ApeRAG API - * ApeRAG API Documentation - * - * The version of the OpenAPI document: 1.0.0 - * - * - * NOTE: This class is auto generated by OpenAPI Generator (https://openapi-generator.tech). - * https://openapi-generator.tech - * Do not edit the class manually. 
- */ - - - -/** - * - * @export - * @interface DocumentUpdate - */ -export interface DocumentUpdate { - /** - * - * @type {string} - * @memberof DocumentUpdate - */ - 'title'?: string; - /** - * - * @type {string} - * @memberof DocumentUpdate - */ - 'description'?: string; - /** - * - * @type {string} - * @memberof DocumentUpdate - */ - 'source'?: string; -} - diff --git a/frontend/src/api/models/document.ts b/frontend/src/api/models/document.ts index 24cb7774e..6da2c0aed 100644 --- a/frontend/src/api/models/document.ts +++ b/frontend/src/api/models/document.ts @@ -106,15 +106,16 @@ export const DocumentStatusEnum = { COMPLETE: 'COMPLETE', FAILED: 'FAILED', DELETING: 'DELETING', - DELETED: 'DELETED', - WARNING: 'WARNING' + DELETED: 'DELETED' } as const; export type DocumentStatusEnum = typeof DocumentStatusEnum[keyof typeof DocumentStatusEnum]; export const DocumentVectorIndexStatusEnum = { PENDING: 'PENDING', - RUNNING: 'RUNNING', - COMPLETE: 'COMPLETE', + CREATING: 'CREATING', + ACTIVE: 'ACTIVE', + DELETING: 'DELETING', + DELETING_IN_PROGRESS: 'DELETING_IN_PROGRESS', FAILED: 'FAILED', SKIPPED: 'SKIPPED' } as const; @@ -122,8 +123,10 @@ export const DocumentVectorIndexStatusEnum = { export type DocumentVectorIndexStatusEnum = typeof DocumentVectorIndexStatusEnum[keyof typeof DocumentVectorIndexStatusEnum]; export const DocumentFulltextIndexStatusEnum = { PENDING: 'PENDING', - RUNNING: 'RUNNING', - COMPLETE: 'COMPLETE', + CREATING: 'CREATING', + ACTIVE: 'ACTIVE', + DELETING: 'DELETING', + DELETING_IN_PROGRESS: 'DELETING_IN_PROGRESS', FAILED: 'FAILED', SKIPPED: 'SKIPPED' } as const; @@ -131,8 +134,10 @@ export const DocumentFulltextIndexStatusEnum = { export type DocumentFulltextIndexStatusEnum = typeof DocumentFulltextIndexStatusEnum[keyof typeof DocumentFulltextIndexStatusEnum]; export const DocumentGraphIndexStatusEnum = { PENDING: 'PENDING', - RUNNING: 'RUNNING', - COMPLETE: 'COMPLETE', + CREATING: 'CREATING', + ACTIVE: 'ACTIVE', + DELETING: 'DELETING', + 
DELETING_IN_PROGRESS: 'DELETING_IN_PROGRESS', FAILED: 'FAILED', SKIPPED: 'SKIPPED' } as const; diff --git a/frontend/src/api/models/index.ts b/frontend/src/api/models/index.ts index ea46d69a5..47c080b65 100644 --- a/frontend/src/api/models/index.ts +++ b/frontend/src/api/models/index.ts @@ -37,7 +37,6 @@ export * from './debug-flow-request'; export * from './document'; export * from './document-create'; export * from './document-list'; -export * from './document-update'; export * from './edge'; export * from './embedding-data'; export * from './embedding-request'; diff --git a/frontend/src/api/openapi.merged.yaml b/frontend/src/api/openapi.merged.yaml index 454028b20..a34ac3bd3 100644 --- a/frontend/src/api/openapi.merged.yaml +++ b/frontend/src/api/openapi.merged.yaml @@ -645,47 +645,6 @@ paths: application/json: schema: $ref: '#/components/schemas/failResponse' - put: - summary: Update a document - description: Update a document - security: - - BearerAuth: [] - parameters: - - name: collection_id - in: path - required: true - schema: - type: string - - name: document_id - in: path - required: true - schema: - type: string - requestBody: - required: true - content: - application/json: - schema: - $ref: '#/components/schemas/documentUpdate' - responses: - '200': - description: Document updated successfully - content: - application/json: - schema: - $ref: '#/components/schemas/document' - '401': - description: Unauthorized - content: - application/json: - schema: - $ref: '#/components/schemas/failResponse' - '404': - description: Document not found - content: - application/json: - schema: - $ref: '#/components/schemas/failResponse' /collections/{collection_id}/documents/{document_id}/rebuild_indexes: post: summary: Rebuild document indexes @@ -2768,29 +2727,34 @@ components: - FAILED - DELETING - DELETED - - WARNING vector_index_status: type: string enum: - PENDING - - RUNNING - - COMPLETE + - CREATING + - ACTIVE + - DELETING + - DELETING_IN_PROGRESS - FAILED - 
SKIPPED fulltext_index_status: type: string enum: - PENDING - - RUNNING - - COMPLETE + - CREATING + - ACTIVE + - DELETING + - DELETING_IN_PROGRESS - FAILED - SKIPPED graph_index_status: type: string enum: - PENDING - - RUNNING - - COMPLETE + - CREATING + - ACTIVE + - DELETING + - DELETING_IN_PROGRESS - FAILED - SKIPPED vector_index_updated: @@ -2836,15 +2800,6 @@ components: type: string collection_id: type: string - documentUpdate: - type: object - properties: - title: - type: string - description: - type: string - source: - type: string rebuildIndexesRequest: type: object properties: From 68dd5e8ab950bb6345644a20de0644540fb8c19c Mon Sep 17 00:00:00 2001 From: Guo Ziang Date: Tue, 24 Jun 2025 16:44:22 +0800 Subject: [PATCH 3/6] feat: refactor document index reconciliation and deletion logic - Introduced a new IndexAction class to standardize index operation types (CREATE, UPDATE, DELETE). - Updated DocumentIndexReconciler to utilize the new IndexAction constants for improved clarity and maintainability. - Refactored document deletion logic in DocumentService to streamline the deletion process and ensure proper index reconciliation. - Enhanced error handling and logging during document deletion and index management operations. - Updated documentation to reflect changes in the indexing architecture and operational flow. 
--- aperag/index/reconciler.py | 51 ++--- aperag/service/document_service.py | 132 +++++-------- aperag/utils/constant.py | 6 + config/celery_tasks.py | 112 ++++++----- docs/indexing_architecture.md | 282 ++++++++++++++++----------- docs/indexing_architecture_zh.md | 296 +++++++++++++++++------------ 6 files changed, 477 insertions(+), 402 deletions(-) diff --git a/aperag/index/reconciler.py b/aperag/index/reconciler.py index 6573c5fe3..f6f2f872f 100644 --- a/aperag/index/reconciler.py +++ b/aperag/index/reconciler.py @@ -29,6 +29,7 @@ ) from aperag.tasks.scheduler import TaskScheduler, create_task_scheduler from aperag.utils.utils import utc_now +from aperag.utils.constant import IndexAction logger = logging.getLogger(__name__) @@ -71,30 +72,30 @@ def _get_indexes_needing_reconciliation(self, session: Session) -> List[Document """ from collections import defaultdict - operations = defaultdict(lambda: {"create": [], "update": [], "delete": []}) + operations = defaultdict(lambda: {IndexAction.CREATE: [], IndexAction.UPDATE: [], IndexAction.DELETE: []}) conditions = { - "create": and_( + IndexAction.CREATE: and_( DocumentIndex.status == DocumentIndexStatus.PENDING, DocumentIndex.observed_version < DocumentIndex.version, DocumentIndex.version == 1, ), - "update": and_( + IndexAction.UPDATE: and_( DocumentIndex.status == DocumentIndexStatus.PENDING, DocumentIndex.observed_version < DocumentIndex.version, DocumentIndex.version > 1, ), - "delete": and_( + IndexAction.DELETE: and_( DocumentIndex.status == DocumentIndexStatus.DELETING, ), } - for operation_type, condition in conditions.items(): + for action, condition in conditions.items(): stmt = select(DocumentIndex).where(condition) result = session.execute(stmt) indexes = result.scalars().all() for index in indexes: - operations[index.document_id][operation_type].append(index) + operations[index.document_id][action].append(index) return operations @@ -106,9 +107,9 @@ def _reconcile_single_document(self, document_id: 
str, operations: dict): # Collect indexes for this document that need claiming indexes_to_claim = [] - for operation_type, doc_indexes in operations.items(): + for action, doc_indexes in operations.items(): for doc_index in doc_indexes: - indexes_to_claim.append((doc_index.id, doc_index.index_type, operation_type)) + indexes_to_claim.append((doc_index.id, doc_index.index_type, action)) # Atomically claim the indexes for this document claimed_indexes = self._claim_document_indexes(session, document_id, indexes_to_claim) @@ -129,10 +130,10 @@ def _claim_document_indexes(self, session: Session, document_id: str, indexes_to claimed_indexes = [] try: - for index_id, index_type, operation_type in indexes_to_claim: - if operation_type in ["create", "update"]: + for index_id, index_type, action in indexes_to_claim: + if action in [IndexAction.CREATE, IndexAction.UPDATE]: target_state = DocumentIndexStatus.CREATING - elif operation_type == "delete": + elif action == IndexAction.DELETE: target_state = DocumentIndexStatus.DELETION_IN_PROGRESS else: continue @@ -146,21 +147,21 @@ def _claim_document_indexes(self, session: Session, document_id: str, indexes_to continue # Build appropriate claiming conditions based on operation type - if operation_type == "create": + if action == IndexAction.CREATE: claiming_conditions = [ DocumentIndex.id == index_id, DocumentIndex.status == DocumentIndexStatus.PENDING, DocumentIndex.observed_version < DocumentIndex.version, DocumentIndex.version == 1, ] - elif operation_type == "update": + elif action == IndexAction.UPDATE: claiming_conditions = [ DocumentIndex.id == index_id, DocumentIndex.status == DocumentIndexStatus.PENDING, DocumentIndex.observed_version < DocumentIndex.version, DocumentIndex.version > 1, ] - elif operation_type == "delete": + elif action == IndexAction.DELETE: claiming_conditions = [ DocumentIndex.id == index_id, DocumentIndex.status == DocumentIndexStatus.DELETING, @@ -180,10 +181,10 @@ def _claim_document_indexes(self, 
session: Session, document_id: str, indexes_to 'index_id': index_id, 'document_id': document_id, 'index_type': index_type, - 'operation_type': operation_type, - 'target_version': current_index.version if operation_type in ["create", "update"] else None, + 'action': action, + 'target_version': current_index.version if action in [IndexAction.CREATE, IndexAction.UPDATE] else None, }) - logger.debug(f"Claimed index {index_id} for document {document_id} ({operation_type})") + logger.debug(f"Claimed index {index_id} for document {document_id} ({action})") else: logger.debug(f"Could not claim index {index_id} for document {document_id}") @@ -202,12 +203,12 @@ def _reconcile_document_operations(self, document_id: str, claimed_indexes: List # Group by operation type to batch operations operations_by_type = defaultdict(list) for claimed_index in claimed_indexes: - operation_type = claimed_index['operation_type'] - operations_by_type[operation_type].append(claimed_index) + action = claimed_index['action'] + operations_by_type[action].append(claimed_index) # Process create operations as a batch - if "create" in operations_by_type: - create_indexes = operations_by_type["create"] + if IndexAction.CREATE in operations_by_type: + create_indexes = operations_by_type[IndexAction.CREATE] create_types = [claimed_index['index_type'] for claimed_index in create_indexes] context = {} @@ -227,8 +228,8 @@ def _reconcile_document_operations(self, document_id: str, claimed_indexes: List logger.info(f"Scheduled create task for document {document_id}, types: {create_types}") # Process update operations as a batch - if "update" in operations_by_type: - update_indexes = operations_by_type["update"] + if IndexAction.UPDATE in operations_by_type: + update_indexes = operations_by_type[IndexAction.UPDATE] update_types = [claimed_index['index_type'] for claimed_index in update_indexes] context = {} @@ -248,8 +249,8 @@ def _reconcile_document_operations(self, document_id: str, claimed_indexes: List 
logger.info(f"Scheduled update task for document {document_id}, types: {update_types}") # Process delete operations as a batch - if "delete" in operations_by_type: - delete_indexes = operations_by_type["delete"] + if IndexAction.DELETE in operations_by_type: + delete_indexes = operations_by_type[IndexAction.DELETE] delete_types = [claimed_index['index_type'] for claimed_index in delete_indexes] task_id = self.task_scheduler.schedule_delete_index( diff --git a/aperag/service/document_service.py b/aperag/service/document_service.py index 50bd2bd12..52054d6e6 100644 --- a/aperag/service/document_service.py +++ b/aperag/service/document_service.py @@ -250,111 +250,67 @@ async def get_document(self, user: str, collection_id: str, document_id: str) -> async for session in get_async_session(): return await self.build_document_response(document, session) - async def delete_document(self, user: str, collection_id: str, document_id: str) -> Optional[view_models.Document]: - """Delete document by ID (idempotent operation) - - Returns the deleted document or None if already deleted/not found + async def _delete_document(self, session: AsyncSession, user: str, collection_id: str, document_id: str): + """ + Core logic to delete a single document and its associated resources. + This method is designed to be called within a transaction. 
""" + # Validate document existence and ownership document = await self.db_ops.query_document(user, collection_id, document_id) if document is None: - # Document already deleted or never existed - idempotent operation - return None - - # Delete document and indexes atomically in a single transaction - async def _delete_document_atomically(session): - from sqlalchemy import select - - from aperag.db.models import Document, DocumentStatus, utc_now - - # Get and delete document - stmt = select(Document).where( - Document.id == document_id, - Document.collection_id == collection_id, - Document.user == user, - Document.status != DocumentStatus.DELETED, - ) - result = await session.execute(stmt) - doc_to_delete = result.scalars().first() + # Silently ignore if document not found, as it might have been deleted by another process + logger.warning(f"Document {document_id} not found for deletion, skipping.") + return - if not doc_to_delete: - return None - - # Soft delete document - doc_to_delete.status = DocumentStatus.DELETED - doc_to_delete.gmt_deleted = utc_now() - session.add(doc_to_delete) - await session.flush() - await session.refresh(doc_to_delete) - - # Mark index specs for deletion - await document_index_manager.delete_document_indexes(session, document_id) - - # Build response object - return await self.build_document_response(doc_to_delete, session) - - result = await self.db_ops.execute_with_transaction(_delete_document_atomically) + # Use index manager to mark all related indexes for deletion + await document_index_manager.delete_document_indexes( + document_id=document.id, index_types=None, session=session + ) - if result: - # Delete object storage files after successful database transaction - obj_store = get_object_store() + # Delete from object store + obj_store = get_object_store() + metadata = json.loads(document.doc_metadata) if document.doc_metadata else {} + if object_path := metadata.get("object_path"): try: - await 
sync_to_async(obj_store.delete_objects_by_prefix)(f"{document.object_store_base_path()}/") + # Use delete_objects_by_prefix to remove all related files (original, chunks, etc.) + await sync_to_async(obj_store.delete_objects_by_prefix)(document.object_store_base_path()) + logger.info(f"Deleted objects from object store with prefix: {document.object_store_base_path()}") except Exception as e: - logger.warning(f"Failed to delete object storage files for document {document_id}: {e}") - - # Trigger index reconciliation after successful document deletion - _trigger_index_reconciliation() - - return result + logger.warning(f"Failed to delete objects for document {document.id} from object store: {e}") - return None - - async def delete_documents(self, user: str, collection_id: str, document_ids: List[str]) -> dict: - # Delete documents and indexes atomically in a single transaction - async def _delete_documents_atomically(session): - from sqlalchemy import select - - from aperag.db.models import Document, DocumentStatus, utc_now - - # Get documents to delete - stmt = select(Document).where( - Document.id.in_(document_ids), - Document.collection_id == collection_id, - Document.user == user, - Document.status != DocumentStatus.DELETED, - ) - result = await session.execute(stmt) - documents_to_delete = result.scalars().all() + # Delete the document record from the database + await session.delete(document) + await session.flush() + logger.info(f"Successfully marked document {document.id} and its indexes for deletion.") - if not documents_to_delete: - return [], list(document_ids) + return document - # Soft delete documents - success_ids = [] - for doc in documents_to_delete: - doc.status = DocumentStatus.DELETED - doc.gmt_deleted = utc_now() - session.add(doc) - success_ids.append(doc.id) + async def delete_document(self, user: str, collection_id: str, document_id: str) -> dict: + """Delete a single document and trigger index reconciliation.""" - await session.flush() + async 
def _delete_document_atomically(session: AsyncSession): + return await self._delete_document(session, user, collection_id, document_id) - # Delete indexes for all successful deletions - for doc_id in success_ids: - await document_index_manager.delete_document_indexes(session, doc_id) + result = await self.db_ops.execute_with_transaction(_delete_document_atomically) - # Calculate failed IDs - failed_ids = list(set(document_ids) - set(success_ids)) - return success_ids, failed_ids + # Trigger reconciliation to process the deletion + _trigger_index_reconciliation() + return result - success_ids, failed_ids = await self.db_ops.execute_with_transaction(_delete_documents_atomically) + async def delete_documents(self, user: str, collection_id: str, document_ids: List[str]) -> dict: + """Delete multiple documents and trigger index reconciliation.""" - result = {"success": success_ids, "failed": failed_ids} + async def _delete_documents_atomically(session: AsyncSession): + deleted_ids = [] + for doc_id in document_ids: + await self._delete_document(session, user, collection_id, doc_id) + deleted_ids.append(doc_id) + return {"deleted_ids": deleted_ids, "status": "success"} - # Trigger index reconciliation after successful batch document deletion - if result.get("success"): # Only trigger if at least one document was deleted successfully - _trigger_index_reconciliation() + result = await self.db_ops.execute_with_transaction(_delete_documents_atomically) + # Trigger reconciliation to process deletions + _trigger_index_reconciliation() return result async def rebuild_document_indexes( diff --git a/aperag/utils/constant.py b/aperag/utils/constant.py index 1b0018a2e..bcbcc3191 100644 --- a/aperag/utils/constant.py +++ b/aperag/utils/constant.py @@ -39,3 +39,9 @@ class QuotaType: MAX_COLLECTION_COUNT = "max_collection_count" MAX_DOCUMENT_COUNT = "max_document_count" MAX_CONVERSATION_COUNT = "max_conversation_count" + + +class IndexAction: + CREATE = "create" + UPDATE = "update" + 
DELETE = "delete" diff --git a/config/celery_tasks.py b/config/celery_tasks.py index 4fd158c7c..9955ed7ba 100644 --- a/config/celery_tasks.py +++ b/config/celery_tasks.py @@ -110,13 +110,49 @@ ParsedDocumentData, IndexTaskResult, WorkflowResult, - TaskStatus + TaskStatus, ) +from aperag.utils.constant import IndexAction from config.celery import app logger = logging.getLogger() +def _validate_task_relevance(document_id: str, index_type: str, target_version: int, expected_status: "DocumentIndexStatus"): + """ + Double-check the database to ensure the task is still valid. + + Returns a dictionary with a 'skipped' status if the task is no longer relevant, + otherwise returns None. + """ + from aperag.db.models import DocumentIndex, DocumentIndexType + from aperag.config import get_sync_session + from sqlalchemy import select, and_ + + for session in get_sync_session(): + stmt = select(DocumentIndex).where( + and_( + DocumentIndex.document_id == document_id, + DocumentIndex.index_type == DocumentIndexType(index_type) + ) + ) + result = session.execute(stmt) + db_index = result.scalar_one_or_none() + + if not db_index: + logger.info(f"Index record not found for {document_id}:{index_type}, skipping task.") + return {"status": "skipped", "reason": "index_record_not_found"} + + if db_index.status != expected_status: + logger.info(f"Index status for {document_id}:{index_type} changed to {db_index.status} (expected {expected_status}), skipping task.") + return {"status": "skipped", "reason": f"status_changed_to_{db_index.status}"} + + if target_version and db_index.version != target_version: + logger.info(f"Version mismatch for {document_id}:{index_type}, expected: {target_version}, current: {db_index.version}, skipping task.") + return {"status": "skipped", "reason": f"version_mismatch_expected_{target_version}_current_{db_index.version}"} + + return None # Task is still relevant + class BaseIndexTask(Task): """ Base class for all index tasks @@ -201,30 +237,9 @@ def 
create_index_task(self, document_id: str, index_type: str, parsed_data_dict: logger.info(f"Starting to create {index_type} index for document {document_id} (v{target_version})") # Double-check: verify task is still valid - for session in get_sync_session(): - stmt = select(DocumentIndex).where( - and_( - DocumentIndex.document_id == document_id, - DocumentIndex.index_type == DocumentIndexType(index_type) - ) - ) - result = session.execute(stmt) - db_index = result.scalar_one_or_none() - - # Validate task is still relevant - if not db_index: - logger.info(f"Index record not found for {document_id}:{index_type}, skipping task") - return {"status": "skipped", "reason": "index_record_not_found"} - - if db_index.status != DocumentIndexStatus.CREATING: - logger.info(f"Index status changed for {document_id}:{index_type}, current: {db_index.status}, skipping task") - return {"status": "skipped", "reason": f"status_changed_to_{db_index.status}"} - - if target_version and db_index.version != target_version: - logger.info(f"Version mismatch for {document_id}:{index_type}, expected: {target_version}, current: {db_index.version}, skipping task") - return {"status": "skipped", "reason": f"version_mismatch_expected_{target_version}_current_{db_index.version}"} - - break + skip_reason = _validate_task_relevance(document_id, index_type, target_version, DocumentIndexStatus.CREATING) + if skip_reason: + return skip_reason # Convert dict back to structured data parsed_data = ParsedDocumentData.from_dict(parsed_data_dict) @@ -348,30 +363,9 @@ def update_index_task(self, document_id: str, index_type: str, parsed_data_dict: logger.info(f"Starting to update {index_type} index for document {document_id} (v{target_version})") # Double-check: verify task is still valid - for session in get_sync_session(): - stmt = select(DocumentIndex).where( - and_( - DocumentIndex.document_id == document_id, - DocumentIndex.index_type == DocumentIndexType(index_type) - ) - ) - result = 
session.execute(stmt) - db_index = result.scalar_one_or_none() - - # Validate task is still relevant - if not db_index: - logger.info(f"Index record not found for {document_id}:{index_type}, skipping task") - return {"status": "skipped", "reason": "index_record_not_found"} - - if db_index.status != DocumentIndexStatus.CREATING: - logger.info(f"Index status changed for {document_id}:{index_type}, current: {db_index.status}, skipping task") - return {"status": "skipped", "reason": f"status_changed_to_{db_index.status}"} - - if target_version and db_index.version != target_version: - logger.info(f"Version mismatch for {document_id}:{index_type}, expected: {target_version}, current: {db_index.version}, skipping task") - return {"status": "skipped", "reason": f"version_mismatch_expected_{target_version}_current_{db_index.version}"} - - break + skip_reason = _validate_task_relevance(document_id, index_type, target_version, DocumentIndexStatus.CREATING) + if skip_reason: + return skip_reason # Convert dict back to structured data parsed_data = ParsedDocumentData.from_dict(parsed_data_dict) @@ -429,15 +423,16 @@ def trigger_create_indexes_workflow(self, parsed_data_dict: dict, document_id: s for index_type in index_types ]) - # Create chord: parallel tasks + completion notification + # Create a chord that executes the completion notification after all create tasks are done workflow_chord = chord( parallel_index_tasks, - notify_workflow_complete.s(document_id, "create", index_types) + notify_workflow_complete.s(document_id, IndexAction.CREATE, index_types) ) - chord_async_result = workflow_chord.apply_async() + # Execute the chord + workflow_chord.apply_async() - return chord_async_result + return workflow_chord except Exception as e: error_msg = f"Failed to trigger create indexes workflow: {str(e)}" @@ -466,15 +461,16 @@ def trigger_delete_indexes_workflow(self, document_id: str, index_types: List[st for index_type in index_types ]) - # Create chord: parallel tasks + 
completion notification + # Create a chord that executes the completion notification after all delete tasks are done workflow_chord = chord( parallel_delete_tasks, - notify_workflow_complete.s(document_id, "delete", index_types) + notify_workflow_complete.s(document_id, IndexAction.DELETE, index_types) ) - chord_async_result = workflow_chord.apply_async() + # Execute the chord + workflow_chord.apply_async() - return chord_async_result + return workflow_chord except Exception as e: error_msg = f"Failed to trigger delete indexes workflow: {str(e)}" @@ -507,7 +503,7 @@ def trigger_update_indexes_workflow(self, parsed_data_dict: dict, document_id: s # Create chord: parallel tasks + completion notification workflow_chord = chord( parallel_update_tasks, - notify_workflow_complete.s(document_id, "update", index_types) + notify_workflow_complete.s(document_id, IndexAction.UPDATE, index_types) ) chord_async_result = workflow_chord.apply_async() diff --git a/docs/indexing_architecture.md b/docs/indexing_architecture.md index ce4578ba3..fb973beb5 100644 --- a/docs/indexing_architecture.md +++ b/docs/indexing_architecture.md @@ -9,14 +9,14 @@ ApeRAG's indexing pipeline architecture adopts a dual-chain design pattern, sepa ```mermaid graph TB subgraph "Frontend Chain (Synchronous Fast Response)" - A[API Request] --> B[FrontendIndexManager] + A[API Request] --> B[IndexManager] B --> C[Write to DocumentIndex Table] - C --> D[Set desired_state=PRESENT] + C --> D[Set status=PENDING, version++] end subgraph "Backend Chain (Asynchronous Task Processing)" - E[Periodic Task reconcile_indexes_task] --> F[BackendIndexReconciler.reconcile_all] - F --> G[Detect desired_state != actual_state] + E[Periodic Task reconcile_indexes_task] --> F[IndexReconciler.reconcile_all] + F --> G[Detect version mismatch or status change needed] G --> H[TaskScheduler schedules async tasks] end @@ -25,9 +25,9 @@ graph TB I --> J[parse_document_task] J --> K[trigger_create_indexes_workflow] K --> L[group 
parallel execution] - L --> M[create_index_task.vector] - L --> N[create_index_task.fulltext] - L --> O[create_index_task.graph] + L --> M[create_index_task.VECTOR] + L --> N[create_index_task.FULLTEXT] + L --> O[create_index_task.GRAPH] M --> P[chord callback] N --> P O --> P @@ -36,7 +36,7 @@ graph TB subgraph "State Feedback" Q --> R[IndexTaskCallbacks] - R --> S[Update actual_state=PRESENT] + R --> S[Update status=ACTIVE, observed_version] S --> T[Next reconciliation check] end @@ -51,51 +51,64 @@ graph TB **Frontend Chain**: - **Goal**: Fast response to user operations without blocking API requests - **Implementation**: Only operates on database tables, sets desired state, returns immediately -- **Code**: `FrontendIndexManager` in `aperag/index/manager.py` +- **Code**: `IndexManager` in `aperag/index/manager.py` **Backend Chain**: - **Goal**: Asynchronously execute time-consuming indexing operations with retry and error recovery support - **Implementation**: Continuously scans state differences through periodic tasks and schedules async tasks -- **Code**: `BackendIndexReconciler` in `aperag/index/reconciler.py` +- **Code**: `IndexReconciler` in `aperag/index/reconciler.py` -### 2. State-Driven Reconciliation +### 2. 
Single Status State-Driven Reconciliation -Records desired and actual states for each document index through the `DocumentIndex` database table: +Records index state and version for each document index through the `DocumentIndex` database table: ```python class DocumentIndex(BaseModel): document_id: str - index_type: DocumentIndexType # vector/fulltext/graph - desired_state: IndexDesiredState # PRESENT/ABSENT - actual_state: IndexActualState # ABSENT/CREATING/PRESENT/DELETING/FAILED - version: int # Version number, increment to trigger rebuild + index_type: DocumentIndexType # VECTOR/FULLTEXT/GRAPH + status: DocumentIndexStatus # PENDING/CREATING/ACTIVE/DELETING/DELETION_IN_PROGRESS/FAILED + version: int # Version number, increment to trigger rebuild + observed_version: int # Last processed version ``` -The reconciler periodically scans all records and triggers corresponding operations when `desired_state != actual_state`. +Key Status Meanings: +- **PENDING**: Awaiting processing (create/update needed) +- **CREATING**: Task claimed, creation/update in progress +- **ACTIVE**: Index is up-to-date and ready for use +- **DELETING**: Deletion has been requested +- **DELETION_IN_PROGRESS**: Task claimed, deletion in progress +- **FAILED**: The last operation failed + +The reconciler periodically scans all records and triggers corresponding operations based on: +- Version mismatch: `observed_version < version` indicates need for update +- Version = 1 with observed_version = 0: indicates need for initial creation +- Status = DELETING: indicates need for deletion ### 3. 
TaskScheduler Abstraction Layer Design **Design Advantages**: - **Business Logic and Task System Decoupling**: Reconciler only cares about "what operations to execute", not "what system to execute with" -- **Multi-scheduler Support**: Can switch between Celery, local synchronous, Prefect/Airflow and other workflow engines -- **Test-friendly**: Can use LocalTaskScheduler for synchronous execution during testing, facilitating debugging +- **Multi-scheduler Support**: Can switch between Celery, Prefect/Airflow and other workflow engines +- **Test-friendly**: Can use different schedulers for testing environments ```python # Abstract interface class TaskScheduler(ABC): - def schedule_create_index(self, document_id: str, index_types: List[str]) -> str - def schedule_update_index(self, document_id: str, index_types: List[str]) -> str + def schedule_create_index(self, document_id: str, index_types: List[str], context: dict = None) -> str + def schedule_update_index(self, document_id: str, index_types: List[str], context: dict = None) -> str def schedule_delete_index(self, document_id: str, index_types: List[str]) -> str # Reconciler uses abstract interface -class BackendIndexReconciler: +class IndexReconciler: def __init__(self, scheduler_type: str = "celery"): self.task_scheduler = create_task_scheduler(scheduler_type) - def _reconcile_document_operations(self, document_id: str, operations: dict): - if create_index_types: - # Only calls abstract interface, doesn't care about specific implementation - self.task_scheduler.schedule_create_index(document_id, create_index_types) + def _reconcile_document_operations(self, document_id: str, claimed_indexes: List[dict]): + # Only calls abstract interface, doesn't care about specific implementation + if create_types: + self.task_scheduler.schedule_create_index(document_id, create_types, context) + if update_types: + self.task_scheduler.schedule_update_index(document_id, update_types, context) ``` **Celery Task Entry Point and 
Business Code Separation**: @@ -103,6 +116,22 @@ class BackendIndexReconciler: - Business logic (`aperag/tasks/document.py`): Handle specific index creation logic - This separation enables independent testing of business logic and facilitates migration between different task systems +### 4. Create vs Update Operation Distinction + +The system clearly distinguishes between create and update operations: + +**Create Operations** (version = 1, observed_version = 0): +- For new documents or new index types +- Uses `schedule_create_index` and `create_index_task` +- Initial index creation from scratch + +**Update Operations** (version > 1, observed_version < version): +- For existing indexes that need rebuilding +- Uses `schedule_update_index` and `update_index_task` +- Updates existing index with new content + +This distinction allows for different processing strategies and optimizations for each operation type. + ## Asynchronous Task System ### Current Asynchronous Task List @@ -112,7 +141,7 @@ ApeRAG currently defines the following asynchronous tasks, each with clear respo | Task Name | Function | Retry Count | Location | |-----------|----------|-------------|----------| | `parse_document_task` | Parse document content, extract text and metadata | 3 times | config/celery_tasks.py | -| `create_index_task` | Create single type index (vector/fulltext/graph) | 3 times | config/celery_tasks.py | +| `create_index_task` | Create single type index (VECTOR/FULLTEXT/GRAPH) | 3 times | config/celery_tasks.py | | `update_index_task` | Update single type index | 3 times | config/celery_tasks.py | | `delete_index_task` | Delete single type index | 3 times | config/celery_tasks.py | | `trigger_create_indexes_workflow` | Dynamic fan-out for index creation tasks | No retry | config/celery_tasks.py | @@ -123,10 +152,11 @@ ApeRAG currently defines the following asynchronous tasks, each with clear respo ### Task Design Principles -1. 
**Fine-grained Tasks**: Each index type (vector/fulltext/graph) is an independent task, supporting individual retries +1. **Fine-grained Tasks**: Each index type (VECTOR/FULLTEXT/GRAPH) is an independent task, supporting individual retries 2. **Dynamic Orchestration**: Use trigger tasks to decide which index tasks to execute at runtime 3. **Layered Retry**: Business tasks support retry, orchestration tasks don't retry 4. **State Callbacks**: Each task calls back to update database state upon completion +5. **Version Validation**: Tasks validate version numbers to prevent stale operations ### Concurrent Execution Design @@ -135,17 +165,16 @@ ApeRAG currently defines the following asynchronous tasks, each with clear respo Use Celery's `group` for parallel execution and `chord` for result aggregation: ```python -# Group: Execute multiple index tasks in parallel +# Group: Execute multiple index tasks in parallel with context parallel_index_tasks = group([ - create_index_task.s(document_id, "vector", parsed_data_dict), - create_index_task.s(document_id, "fulltext", parsed_data_dict), - create_index_task.s(document_id, "graph", parsed_data_dict) + create_index_task.s(document_id, index_type, parsed_data_dict, context) + for index_type in index_types ]) # Chord: Execute callback after all parallel tasks complete workflow_chord = chord( parallel_index_tasks, - notify_workflow_complete.s(document_id, "create", ["vector", "fulltext", "graph"]) + notify_workflow_complete.s(document_id, "create", index_types) ) ``` @@ -154,32 +183,27 @@ workflow_chord = chord( Use Celery's `chain` for task chaining and `signature` for parameter passing: ```python -# Chained execution: parse -> dynamic fan-out +# Chained execution: parse -> dynamic fan-out with context workflow_chain = chain( - parse_document_task.s(document_id), # First task - trigger_create_indexes_workflow.s(document_id, index_types) # Second task, receives first task's result + parse_document_task.s(document_id), + 
trigger_create_indexes_workflow.s(document_id, index_types, context) ) - -# Signature mechanism for parameter passing -# parse_document_task's return value becomes the first parameter of trigger_create_indexes_workflow ``` -#### Parameter Passing and Data Flow +#### Parameter Passing and Context Flow ```python -# Data flow: -# 1. parse_document_task returns ParsedDocumentData.to_dict() -# 2. trigger_create_indexes_workflow receives parsing result -# 3. Dynamically creates parallel tasks, each task receives complete parsing data -# 4. notify_workflow_complete aggregates all index task results - -def trigger_create_indexes_workflow(self, parsed_data_dict: dict, document_id: str, index_types: List[str]): - # parsed_data_dict is the previous task's return value - parallel_index_tasks = group([ - # Each parallel task can access complete parsing data - create_index_task.s(document_id, index_type, parsed_data_dict) - for index_type in index_types - ]) +# Context includes version information for each index type +context = { + "VECTOR_version": 2, + "FULLTEXT_version": 1, + "GRAPH_version": 3 +} + +# Each index task extracts its specific version from context +def create_index_task(document_id, index_type, parsed_data_dict, context): + target_version = context.get(f'{index_type}_version') + # Validate version before processing ``` ## Specific Execution Flow Examples @@ -190,15 +214,15 @@ Taking user document upload triggering index creation as example: ```python # 1. Frontend Chain (Synchronous, millisecond-level) -API Call -> FrontendIndexManager.create_document_indexes() +API Call -> IndexManager.create_indexes() ↓ Write DocumentIndex table records: { document_id: "doc123", - index_type: "vector", - desired_state: "PRESENT", - actual_state: "ABSENT", - version: 1 + index_type: "VECTOR", + status: "PENDING", + version: 1, + observed_version: 0 } ↓ API returns 200 immediately @@ -206,11 +230,11 @@ API returns 200 immediately # 2. 
Backend Chain (Asynchronous, minute-level) Periodic task reconcile_indexes_task (executes every 30 seconds) ↓ -BackendIndexReconciler.reconcile_all() +IndexReconciler.reconcile_all() ↓ -Detects desired_state=PRESENT, actual_state=ABSENT +Detects version=1, observed_version=0 (create operation needed) ↓ -CeleryTaskScheduler.schedule_create_index(doc123, ["vector", "fulltext", "graph"]) +CeleryTaskScheduler.schedule_create_index(doc123, ["VECTOR", "FULLTEXT", "GRAPH"], context) ↓ create_document_indexes_workflow.delay() @@ -219,27 +243,33 @@ parse_document_task("doc123") ├── Download document file to local temp directory ├── Call docparser to parse document content ├── Return ParsedDocumentData.to_dict() -└── Update actual_state="CREATING" +└── Update status="CREATING" ↓ -trigger_create_indexes_workflow(parsed_data, "doc123", ["vector", "fulltext", "graph"]) -├── Create group parallel tasks +trigger_create_indexes_workflow(parsed_data, "doc123", ["VECTOR", "FULLTEXT", "GRAPH"], context) +├── Create group parallel tasks with version context └── Start chord waiting ↓ Parallel execution: -├── create_index_task("doc123", "vector", parsed_data) +├── create_index_task("doc123", "VECTOR", parsed_data, context) +│ ├── Extract VECTOR_version from context +│ ├── Validate version still matches database │ ├── Call vector_indexer.create_index() │ ├── Generate embeddings and store in vector database -│ └── Callback IndexTaskCallbacks.on_index_created() -├── create_index_task("doc123", "fulltext", parsed_data) +│ └── Callback IndexTaskCallbacks.on_index_created(target_version) +├── create_index_task("doc123", "FULLTEXT", parsed_data, context) +│ ├── Extract FULLTEXT_version from context +│ ├── Validate version still matches database │ ├── Call fulltext_indexer.create_index() │ ├── Build full-text search index -│ └── Callback IndexTaskCallbacks.on_index_created() -└── create_index_task("doc123", "graph", parsed_data) +│ └── Callback 
IndexTaskCallbacks.on_index_created(target_version) +└── create_index_task("doc123", "GRAPH", parsed_data, context) + ├── Extract GRAPH_version from context + ├── Validate version still matches database ├── Call graph_indexer.create_index() ├── Build knowledge graph - └── Callback IndexTaskCallbacks.on_index_created() + └── Callback IndexTaskCallbacks.on_index_created(target_version) ↓ -notify_workflow_complete([result1, result2, result3], "doc123", "create", ["vector", "fulltext", "graph"]) +notify_workflow_complete([result1, result2, result3], "doc123", "create", ["VECTOR", "FULLTEXT", "GRAPH"]) ├── Aggregate all index task results ├── Log workflow completion └── Return WorkflowResult @@ -251,7 +281,7 @@ User modifies document content triggering index update: ```python # 1. Frontend Chain -API Call -> FrontendIndexManager.update_document_indexes() +API Call -> IndexManager.rebuild_indexes() ↓ All existing index records version field +1: version: 1 -> 2 (triggers rebuild) @@ -261,11 +291,11 @@ API returns immediately # 2. Backend Chain reconcile_indexes_task detects version mismatch ↓ -actual_state=PRESENT but version outdated, determined as needing update +version=2, observed_version=1, version > 1 (update operation needed) ↓ schedule_update_index() -> update_document_indexes_workflow() -# 3. Task Execution (similar to creation) +# 3. Task Execution (similar to creation but with update tasks) parse_document_task -> trigger_update_indexes_workflow -> parallel update_index_task ``` @@ -275,14 +305,14 @@ User deletes document triggering index deletion: ```python # 1. Frontend Chain -API Call -> FrontendIndexManager.delete_document_indexes() +API Call -> IndexManager.delete_indexes() ↓ -Set desired_state="ABSENT" +Set status="DELETING" ↓ API returns immediately # 2. 
Backend Chain -Detects desired_state=ABSENT, actual_state=PRESENT +Detects status=DELETING ↓ schedule_delete_index() -> delete_document_indexes_workflow() @@ -301,21 +331,27 @@ Each Celery task is configured with automatic retry: ```python @current_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60}) -def create_index_task(self, document_id: str, index_type: str, parsed_data_dict: dict): +def create_index_task(self, document_id: str, index_type: str, parsed_data_dict: dict, context: dict = None): try: + # Extract and validate version from context + target_version = context.get(f'{index_type}_version') if context else None + + # Double-check version still matches database before processing + # ... version validation logic ... + # Business logic result = document_index_task.create_index(document_id, index_type, parsed_data) if result.success: - self._handle_index_success(document_id, index_type, result.data) + self._handle_index_success(document_id, index_type, target_version, result.data) else: # Business logic failure but don't throw exception to avoid meaningless retry if self.request.retries >= self.max_retries: - self._handle_index_failure(document_id, [index_type], result.error) + self._handle_index_failure(document_id, index_type, result.error) return result.to_dict() except Exception as e: # Only mark as failed after retry attempts are exhausted if self.request.retries >= self.max_retries: - self._handle_index_failure(document_id, [index_type], str(e)) + self._handle_index_failure(document_id, index_type, str(e)) raise # Continue throwing exception to trigger retry ``` @@ -346,33 +382,40 @@ def notify_workflow_complete(self, index_results: List[dict], document_id: str, ### State Management and Error Recovery -Track errors through database state: +Track errors through database state with version validation: ```python class IndexTaskCallbacks: @staticmethod - def on_index_failed(document_id: str, index_type: str, 
error_message: str): - """Task failure callback""" - # Update database state - doc_index.actual_state = IndexActualState.FAILED - doc_index.error_message = error_message - - # Will retry on next reconcile + def on_index_created(document_id: str, index_type: str, target_version: int, index_data: str = None): + """Task success callback with version validation""" + # Use atomic update with version validation + update_stmt = ( + update(DocumentIndex) + .where( + and_( + DocumentIndex.document_id == document_id, + DocumentIndex.index_type == DocumentIndexType(index_type), + DocumentIndex.status == DocumentIndexStatus.CREATING, + DocumentIndex.version == target_version, # Critical: validate version + ) + ) + .values( + status=DocumentIndexStatus.ACTIVE, + observed_version=target_version, # Mark this version as processed + index_data=index_data, + error_message=None, + ) + ) ``` ### Error Recovery Strategies 1. **Automatic Retry**: Task-level 3 automatic retries to handle temporary network or resource issues -2. **State Reset**: Users can manually reset failed state to trigger re-execution -3. **Partial Retry**: Only retry failed index types without affecting successful indexes -4. **Degraded Handling**: Some index failures don't affect document searchability (e.g., graph index failure but vector index success) - -### Monitoring and Alerting - -- **Task Execution Logs**: Each task records detailed execution logs -- **Failure Rate Statistics**: View task failure rates through Celery monitoring tools -- **State Inconsistency Detection**: Periodically check tasks in CREATING state for extended periods -- **Resource Usage Monitoring**: Monitor memory and CPU usage during task execution +2. **Version Validation**: Prevents stale operations through version checking at task execution time +3. **State Reset**: Users can manually reset failed state to trigger re-execution +4. **Partial Retry**: Only retry failed index types without affecting successful indexes +5. 
**Degraded Handling**: Some index failures don't affect document searchability (e.g., graph index failure but vector index success) ## Code Organization Structure @@ -381,7 +424,7 @@ class IndexTaskCallbacks: ``` aperag/ ├── index/ # Index management core module -│ ├── manager.py # Frontend index manager +│ ├── manager.py # Index manager (frontend operations) │ ├── reconciler.py # Backend reconciler │ ├── base.py # Indexer base class definition │ ├── vector_index.py # Vector index implementation @@ -401,23 +444,22 @@ config/ ### Core Interface Design -#### Frontend Management Interface +#### Index Management Interface ```python # aperag/index/manager.py -class FrontendIndexManager: - async def create_document_indexes(self, session, document_id, user, index_types) - async def update_document_indexes(self, session, document_id) - async def delete_document_indexes(self, session, document_id, index_types) - async def get_document_index_status(self, session, document_id) +class IndexManager: + def create_indexes(self, document_id, index_types, created_by, session) + def rebuild_indexes(self, document_id, index_types, created_by, session) + def delete_indexes(self, document_id, index_types, session) ``` #### Reconciler Interface ```python # aperag/index/reconciler.py -class BackendIndexReconciler: +class IndexReconciler: def reconcile_all(self) # Main reconciliation loop def _get_indexes_needing_reconciliation(self, session) # Get indexes needing reconciliation - def _reconcile_grouped(self, indexes_needing_reconciliation) # Batch reconciliation processing + def _reconcile_single_document(self, document_id, operations) # Process single document ``` #### Indexer Interface @@ -467,6 +509,21 @@ class WorkflowResult: index_results: List[IndexTaskResult] ``` +## Current Implementation Status + +### Simplified Architecture Features + +1. 
**Removed Distributed Locking**: Current implementation focuses on correctness through version validation rather than distributed locks for external resource concurrency +2. **Single Status Model**: Simplified from dual-state (desired/actual) to single status with version tracking +3. **Clear Operation Separation**: Explicit distinction between create (v=1) and update (v>1) operations +4. **Version-based Validation**: Prevents stale operations through version checking at task execution time + +### Future Considerations + +1. **Concurrency Control**: While distributed locking has been removed for simplicity, future implementations may need to address concurrent operations on external systems (vector databases, search engines, etc.) +2. **Performance Optimization**: The current architecture prioritizes correctness and simplicity over maximum performance +3. **Monitoring Enhancements**: Additional monitoring and alerting capabilities may be added as the system scales + ## Summary ApeRAG's indexing pipeline architecture achieves efficient document indexing through the following technical design: @@ -475,15 +532,16 @@ ApeRAG's indexing pipeline architecture achieves efficient document indexing thr 1. **Fast Response**: Frontend chain only operates on database, API response time controlled at millisecond level 2. **Strong Processing Capability**: Backend asynchronous processing supports large-scale document indexing, improving throughput through parallel tasks -3. **Good Error Recovery**: Multi-level retry mechanisms and state management, supporting graceful handling of partial failure scenarios +3. **Good Error Recovery**: Multi-level retry mechanisms and version-based state management, supporting graceful handling of partial failure scenarios 4. **Strong System Decoupling**: TaskScheduler abstraction layer decouples business logic from specific task systems -5. 
**Comprehensive Monitoring**: Full-chain state tracking and logging for easy troubleshooting and performance optimization +5. **Version Consistency**: Version validation prevents stale operations and ensures data consistency ### Technical Features -1. **State-driven**: Achieves eventual consistency through detection of differences between desired and actual states +1. **State-driven**: Achieves eventual consistency through detection of version mismatches and status changes 2. **Dynamic Orchestration**: Dynamically creates index tasks at runtime based on document parsing results, avoiding static workflow limitations 3. **Batch Optimization**: Multiple index tasks for the same document share parsing results, reducing redundant computation 4. **Layered Design**: Task scheduling, business logic, and index implementation are decoupled in layers for easy testing and maintenance +5. **Operation Distinction**: Clear separation of create vs update operations allows for optimized processing strategies -This architecture provides good performance and scalability support for high-concurrency document indexing scenarios while ensuring system reliability. \ No newline at end of file +This architecture provides good performance and scalability support for high-concurrency document indexing scenarios while ensuring system reliability and maintainability. 
\ No newline at end of file diff --git a/docs/indexing_architecture_zh.md b/docs/indexing_architecture_zh.md index fbdcfdf46..b112a33db 100644 --- a/docs/indexing_architecture_zh.md +++ b/docs/indexing_architecture_zh.md @@ -2,21 +2,21 @@ ## 概述 -ApeRAG的索引链路架构采用双链路设计模式,将索引管理分为前端链路(Frontend Chain)和后端链路(Backend Chain),通过状态驱动的调谐机制实现文档索引的异步处理。前端链路负责快速响应用户操作并设置索引期望状态,后端链路通过定时调谐器检测状态差异并调度异步任务执行实际的索引操作。 +ApeRAG的索引链路架构采用双链路设计模式,将索引管理分为前端链路(Frontend Chain)和后端链路(Backend Chain),通过状态驱动的调谐机制实现文档索引的异步处理。前端链路负责快速响应用户操作并设置索引状态,后端链路通过定时调谐器检测状态变化并调度异步任务执行实际的索引操作。 ## 架构概览 ```mermaid graph TB subgraph "Frontend Chain (同步快速响应)" - A[API请求] --> B[FrontendIndexManager] + A[API请求] --> B[IndexManager] B --> C[写入DocumentIndex表] - C --> D[设置desired_state=PRESENT] + C --> D[设置status=PENDING, version++] end subgraph "Backend Chain (异步任务处理)" - E[定时任务reconcile_indexes_task] --> F[BackendIndexReconciler.reconcile_all] - F --> G[检测desired_state != actual_state] + E[定时任务reconcile_indexes_task] --> F[IndexReconciler.reconcile_all] + F --> G[检测版本不匹配或状态变更需求] G --> H[TaskScheduler调度异步任务] end @@ -25,9 +25,9 @@ graph TB I --> J[parse_document_task] J --> K[trigger_create_indexes_workflow] K --> L[group并行执行] - L --> M[create_index_task.vector] - L --> N[create_index_task.fulltext] - L --> O[create_index_task.graph] + L --> M[create_index_task.VECTOR] + L --> N[create_index_task.FULLTEXT] + L --> O[create_index_task.GRAPH] M --> P[chord回调] N --> P O --> P @@ -36,7 +36,7 @@ graph TB subgraph "状态反馈" Q --> R[IndexTaskCallbacks] - R --> S[更新actual_state=PRESENT] + R --> S[更新status=ACTIVE, observed_version] S --> T[下次调谐检查] end @@ -49,53 +49,66 @@ graph TB ### 1. 
双链路分离 **前端链路(Frontend Chain)**: -- 目标:快速响应用户操作,不阻塞API请求 -- 实现:只操作数据库表,设置期望状态,立即返回 -- 代码:`aperag/index/manager.py` 中的 `FrontendIndexManager` +- **目标**:快速响应用户操作,不阻塞API请求 +- **实现**:只操作数据库表,设置索引状态并递增版本号,立即返回 +- **代码**:`aperag/index/manager.py` 中的 `IndexManager` **后端链路(Backend Chain)**: -- 目标:异步执行耗时的索引操作,支持重试和错误恢复 -- 实现:通过定时任务持续扫描状态差异,调度异步任务 -- 代码:`aperag/index/reconciler.py` 中的 `BackendIndexReconciler` +- **目标**:异步执行耗时的索引操作,支持重试和错误恢复 +- **实现**:通过定时任务持续扫描状态变化,调度异步任务 +- **代码**:`aperag/index/reconciler.py` 中的 `IndexReconciler` -### 2. 状态驱动调谐 +### 2. 单一状态驱动调谐 -通过数据库表`DocumentIndex`记录每个文档索引的期望状态和实际状态: +通过数据库表`DocumentIndex`记录每个文档索引的状态和版本: ```python class DocumentIndex(BaseModel): document_id: str - index_type: DocumentIndexType # vector/fulltext/graph - desired_state: IndexDesiredState # PRESENT/ABSENT - actual_state: IndexActualState # ABSENT/CREATING/PRESENT/DELETING/FAILED - version: int # 版本号,递增触发重建 + index_type: DocumentIndexType # VECTOR/FULLTEXT/GRAPH + status: DocumentIndexStatus # PENDING/CREATING/ACTIVE/DELETING/DELETION_IN_PROGRESS/FAILED + version: int # 版本号,递增触发重建 + observed_version: int # 上次处理的版本 ``` -调谐器定时扫描所有记录,当`desired_state != actual_state`时触发相应操作。 +关键状态含义: +- **PENDING**:等待处理(需要创建/更新) +- **CREATING**:任务已认领,创建/更新进行中 +- **ACTIVE**:索引已是最新状态,可用于搜索 +- **DELETING**:已请求删除 +- **DELETION_IN_PROGRESS**:任务已认领,删除进行中 +- **FAILED**:上次操作失败 + +调谐器定时扫描所有记录,基于以下条件触发相应操作: +- 版本不匹配:`version > 1` 且 `observed_version < version` 表示需要更新 +- 版本 = 1 且 observed_version = 0:表示需要初始创建 +- 状态 = DELETING:表示需要删除 ### 3. 
TaskScheduler抽象层设计 **设计优势**: - **业务逻辑与任务系统解耦**:Reconciler只关心"需要执行什么操作",不关心"用什么系统执行" -- **多调度器支持**:可以在Celery、本地同步、Prefect/Airflow等工作流引擎之间切换 -- **测试友好**:测试时可以使用LocalTaskScheduler同步执行,便于调试 +- **多调度器支持**:可以在Celery、Prefect/Airflow等工作流引擎之间切换 +- **测试友好**:测试时可以使用不同的调度器,便于调试 ```python # 抽象接口 class TaskScheduler(ABC): - def schedule_create_index(self, document_id: str, index_types: List[str]) -> str - def schedule_update_index(self, document_id: str, index_types: List[str]) -> str + def schedule_create_index(self, document_id: str, index_types: List[str], context: dict = None) -> str + def schedule_update_index(self, document_id: str, index_types: List[str], context: dict = None) -> str def schedule_delete_index(self, document_id: str, index_types: List[str]) -> str # Reconciler使用抽象接口 -class BackendIndexReconciler: +class IndexReconciler: def __init__(self, scheduler_type: str = "celery"): self.task_scheduler = create_task_scheduler(scheduler_type) - def _reconcile_document_operations(self, document_id: str, operations: dict): - if create_index_types: - # 只调用抽象接口,不关心具体实现 - self.task_scheduler.schedule_create_index(document_id, create_index_types) + def _reconcile_document_operations(self, document_id: str, claimed_indexes: List[dict]): + # 只调用抽象接口,不关心具体实现 + if create_types: + self.task_scheduler.schedule_create_index(document_id, create_types, context) + if update_types: + self.task_scheduler.schedule_update_index(document_id, update_types, context) ``` **Celery任务入口与业务代码分离**: @@ -103,6 +116,22 @@ class BackendIndexReconciler: - 业务逻辑(`aperag/tasks/document.py`):负责具体的索引创建逻辑 - 这种分离使得业务逻辑可以独立测试,也便于在不同任务系统间迁移 +### 4. 
创建与更新操作区分 + +系统明确区分创建和更新操作: + +**创建操作** (version = 1, observed_version = 0): +- 针对新文档或新索引类型 +- 使用 `schedule_create_index` 和 `create_index_task` +- 从零开始创建索引 + +**更新操作** (version > 1, observed_version < version): +- 针对需要重建的现有索引 +- 使用 `schedule_update_index` 和 `update_index_task` +- 使用新内容更新现有索引 + +这种区分允许对每种操作类型采用不同的处理策略和优化。 + ## 异步任务体系 ### 当前异步任务列表 @@ -112,7 +141,7 @@ ApeRAG当前定义了以下异步任务,每个任务都有明确的职责分 | 任务名称 | 功能 | 重试次数 | 位置 | |---------|------|---------|------| | `parse_document_task` | 解析文档内容,提取文本和元数据 | 3次 | config/celery_tasks.py | -| `create_index_task` | 创建单个类型的索引(vector/fulltext/graph) | 3次 | config/celery_tasks.py | +| `create_index_task` | 创建单个类型的索引(VECTOR/FULLTEXT/GRAPH) | 3次 | config/celery_tasks.py | | `update_index_task` | 更新单个类型的索引 | 3次 | config/celery_tasks.py | | `delete_index_task` | 删除单个类型的索引 | 3次 | config/celery_tasks.py | | `trigger_create_indexes_workflow` | 动态扇出创建索引任务 | 无重试 | config/celery_tasks.py | @@ -123,10 +152,11 @@ ApeRAG当前定义了以下异步任务,每个任务都有明确的职责分 ### 任务设计原则 -1. **细粒度任务**:每个索引类型(vector/fulltext/graph)都是独立的任务,支持单独重试 +1. **细粒度任务**:每个索引类型(VECTOR/FULLTEXT/GRAPH)都是独立的任务,支持单独重试 2. **动态编排**:通过trigger任务在运行时决定要执行哪些索引任务 3. **分层重试**:业务任务支持重试,编排任务不重试 4. **状态回调**:每个任务完成后都会回调更新数据库状态 +5. 
**版本验证**:任务验证版本号以防止过期操作 ### 并发执行设计 @@ -135,17 +165,16 @@ ApeRAG当前定义了以下异步任务,每个任务都有明确的职责分 使用Celery的`group`实现并行执行,`chord`实现结果聚合: ```python -# Group:并行执行多个索引任务 +# Group:使用context并行执行多个索引任务 parallel_index_tasks = group([ - create_index_task.s(document_id, "vector", parsed_data_dict), - create_index_task.s(document_id, "fulltext", parsed_data_dict), - create_index_task.s(document_id, "graph", parsed_data_dict) + create_index_task.s(document_id, index_type, parsed_data_dict, context) + for index_type in index_types ]) # Chord:等待所有并行任务完成后执行回调 workflow_chord = chord( parallel_index_tasks, - notify_workflow_complete.s(document_id, "create", ["vector", "fulltext", "graph"]) + notify_workflow_complete.s(document_id, "create", index_types) ) ``` @@ -154,32 +183,27 @@ workflow_chord = chord( 通过Celery的`chain`实现任务串联,通过`signature`传递参数: ```python -# 串联执行:解析 -> 动态扇出 +# 串联执行:解析 -> 动态扇出并传递context workflow_chain = chain( - parse_document_task.s(document_id), # 第一个任务 - trigger_create_indexes_workflow.s(document_id, index_types) # 第二个任务,接收第一个任务的结果 + parse_document_task.s(document_id), + trigger_create_indexes_workflow.s(document_id, index_types, context) ) - -# signature机制传递参数 -# parse_document_task的返回值会作为trigger_create_indexes_workflow的第一个参数 ``` -#### 参数传递和数据流 +#### 参数传递和上下文流 ```python -# 数据流: -# 1. parse_document_task 返回 ParsedDocumentData.to_dict() -# 2. trigger_create_indexes_workflow 接收解析结果 -# 3. 动态创建并行任务,每个任务都收到完整的解析数据 -# 4. 
notify_workflow_complete 聚合所有索引任务的结果 - -def trigger_create_indexes_workflow(self, parsed_data_dict: dict, document_id: str, index_types: List[str]): - # parsed_data_dict是上一个任务的返回值 - parallel_index_tasks = group([ - # 每个并行任务都能访问完整的解析数据 - create_index_task.s(document_id, index_type, parsed_data_dict) - for index_type in index_types - ]) +# context包含每个索引类型的版本信息 +context = { + "VECTOR_version": 2, + "FULLTEXT_version": 1, + "GRAPH_version": 3 +} + +# 每个索引任务从context中提取其特定版本 +def create_index_task(document_id, index_type, parsed_data_dict, context): + target_version = context.get(f'{index_type}_version') + # 处理前验证版本 ``` ## 具体执行链路示例 @@ -190,15 +214,15 @@ def trigger_create_indexes_workflow(self, parsed_data_dict: dict, document_id: s ```python # 1. 前端链路(同步,毫秒级) -API调用 -> FrontendIndexManager.create_document_indexes() +API调用 -> IndexManager.create_indexes() ↓ 写入DocumentIndex表记录: { document_id: "doc123", - index_type: "vector", - desired_state: "PRESENT", - actual_state: "ABSENT", - version: 1 + index_type: "VECTOR", + status: "PENDING", + version: 1, + observed_version: 0 } ↓ API立即返回200 @@ -206,11 +230,11 @@ API立即返回200 # 2. 
后端链路(异步,分钟级) 定时任务reconcile_indexes_task(每30秒执行) ↓ -BackendIndexReconciler.reconcile_all() +IndexReconciler.reconcile_all() ↓ -检测到desired_state=PRESENT, actual_state=ABSENT +检测到version=1, observed_version=0(需要创建操作) ↓ -CeleryTaskScheduler.schedule_create_index(doc123, ["vector", "fulltext", "graph"]) +CeleryTaskScheduler.schedule_create_index(doc123, ["VECTOR", "FULLTEXT", "GRAPH"], context) ↓ create_document_indexes_workflow.delay() @@ -219,27 +243,33 @@ parse_document_task("doc123") ├── 下载文档文件到本地临时目录 ├── 调用docparser解析文档内容 ├── 返回ParsedDocumentData.to_dict() -└── 更新actual_state="CREATING" +└── 更新status="CREATING" ↓ -trigger_create_indexes_workflow(parsed_data, "doc123", ["vector", "fulltext", "graph"]) -├── 创建group并行任务 +trigger_create_indexes_workflow(parsed_data, "doc123", ["VECTOR", "FULLTEXT", "GRAPH"], context) +├── 创建group并行任务并传递版本context └── 启动chord等待 ↓ 并行执行: -├── create_index_task("doc123", "vector", parsed_data) +├── create_index_task("doc123", "VECTOR", parsed_data, context) +│ ├── 从context提取VECTOR_version +│ ├── 验证版本仍与数据库匹配 │ ├── 调用vector_indexer.create_index() │ ├── 生成embedding并存入向量数据库 -│ └── 回调IndexTaskCallbacks.on_index_created() -├── create_index_task("doc123", "fulltext", parsed_data) +│ └── 回调IndexTaskCallbacks.on_index_created(target_version) +├── create_index_task("doc123", "FULLTEXT", parsed_data, context) +│ ├── 从context提取FULLTEXT_version +│ ├── 验证版本仍与数据库匹配 │ ├── 调用fulltext_indexer.create_index() │ ├── 建立全文搜索索引 -│ └── 回调IndexTaskCallbacks.on_index_created() -└── create_index_task("doc123", "graph", parsed_data) +│ └── 回调IndexTaskCallbacks.on_index_created(target_version) +└── create_index_task("doc123", "GRAPH", parsed_data, context) + ├── 从context提取GRAPH_version + ├── 验证版本仍与数据库匹配 ├── 调用graph_indexer.create_index() ├── 构建知识图谱 - └── 回调IndexTaskCallbacks.on_index_created() + └── 回调IndexTaskCallbacks.on_index_created(target_version) ↓ -notify_workflow_complete([result1, result2, result3], "doc123", "create", ["vector", "fulltext", "graph"]) 
+notify_workflow_complete([result1, result2, result3], "doc123", "create", ["VECTOR", "FULLTEXT", "GRAPH"]) ├── 聚合所有索引任务结果 ├── 记录工作流完成日志 └── 返回WorkflowResult @@ -251,7 +281,7 @@ notify_workflow_complete([result1, result2, result3], "doc123", "create", ["vect ```python # 1. 前端链路 -API调用 -> FrontendIndexManager.update_document_indexes() +API调用 -> IndexManager.rebuild_indexes() ↓ 所有现有索引记录version字段+1: version: 1 -> 2 (触发重建) @@ -259,13 +289,13 @@ version: 1 -> 2 (触发重建) API立即返回 # 2. 后端链路 -reconcile_indexes_task检测到version不匹配 +reconcile_indexes_task检测到版本不匹配 ↓ -actual_state=PRESENT但version过期,判定为需要更新 +version=2, observed_version=1, version > 1(需要更新操作) ↓ schedule_update_index() -> update_document_indexes_workflow() -# 3. 任务执行(与创建类似) +# 3. 任务执行(与创建类似但使用更新任务) parse_document_task -> trigger_update_indexes_workflow -> 并行update_index_task ``` @@ -275,14 +305,14 @@ parse_document_task -> trigger_update_indexes_workflow -> 并行update_index_tas ```python # 1. 前端链路 -API调用 -> FrontendIndexManager.delete_document_indexes() +API调用 -> IndexManager.delete_indexes() ↓ -设置desired_state="ABSENT" +设置status="DELETING" ↓ API立即返回 # 2. 后端链路 -检测到desired_state=ABSENT, actual_state=PRESENT +检测到status=DELETING ↓ schedule_delete_index() -> delete_document_indexes_workflow() @@ -301,21 +331,27 @@ trigger_delete_indexes_workflow -> 并行delete_index_task ```python @current_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60}) -def create_index_task(self, document_id: str, index_type: str, parsed_data_dict: dict): +def create_index_task(self, document_id: str, index_type: str, parsed_data_dict: dict, context: dict = None): try: + # 从context提取并验证版本 + target_version = context.get(f'{index_type}_version') if context else None + + # 处理前双重检查版本仍与数据库匹配 + # ... 版本验证逻辑 ... 
+ # 业务逻辑 result = document_index_task.create_index(document_id, index_type, parsed_data) if result.success: - self._handle_index_success(document_id, index_type, result.data) + self._handle_index_success(document_id, index_type, target_version, result.data) else: # 业务逻辑失败但不抛异常,避免无意义重试 if self.request.retries >= self.max_retries: - self._handle_index_failure(document_id, [index_type], result.error) + self._handle_index_failure(document_id, index_type, result.error) return result.to_dict() except Exception as e: # 只有在重试次数用完后才标记失败 if self.request.retries >= self.max_retries: - self._handle_index_failure(document_id, [index_type], str(e)) + self._handle_index_failure(document_id, index_type, str(e)) raise # 继续抛出异常触发重试 ``` @@ -346,42 +382,49 @@ def notify_workflow_complete(self, index_results: List[dict], document_id: str, ### 状态管理和错误恢复 -通过数据库状态追踪错误: +通过数据库状态和版本验证追踪错误: ```python class IndexTaskCallbacks: @staticmethod - def on_index_failed(document_id: str, index_type: str, error_message: str): - """任务失败回调""" - # 更新数据库状态 - doc_index.actual_state = IndexActualState.FAILED - doc_index.error_message = error_message - - # 下次reconcile时会重新尝试 + def on_index_created(document_id: str, index_type: str, target_version: int, index_data: str = None): + """带版本验证的任务成功回调""" + # 使用带版本验证的原子更新 + update_stmt = ( + update(DocumentIndex) + .where( + and_( + DocumentIndex.document_id == document_id, + DocumentIndex.index_type == DocumentIndexType(index_type), + DocumentIndex.status == DocumentIndexStatus.CREATING, + DocumentIndex.version == target_version, # 关键:验证版本 + ) + ) + .values( + status=DocumentIndexStatus.ACTIVE, + observed_version=target_version, # 标记此版本已处理 + index_data=index_data, + error_message=None, + ) + ) ``` ### 错误恢复策略 1. **自动重试**:任务级别的3次自动重试,解决临时网络或资源问题 -2. **状态重置**:用户可以手动重置失败状态,触发重新执行 -3. **部分重试**:只重试失败的索引类型,不影响已成功的索引 -4. 
**降级处理**:某些索引失败不影响文档的可搜索性(如graph索引失败但vector索引成功) - -### 监控和告警 - -- **任务执行日志**:每个任务都记录详细的执行日志 -- **失败率统计**:通过Celery监控工具查看任务失败率 -- **状态不一致检测**:定期检查长时间处于CREATING状态的任务 -- **资源使用监控**:监控任务执行时的内存和CPU使用情况 +2. **版本验证**:通过在任务执行时检查版本来防止过期操作 +3. **状态重置**:用户可以手动重置失败状态,触发重新执行 +4. **部分重试**:只重试失败的索引类型,不影响已成功的索引 +5. **降级处理**:某些索引失败不影响文档的可搜索性(如graph索引失败但vector索引成功) ## 代码组织结构 -### 目录结构说明 +### 目录结构 ``` aperag/ ├── index/ # 索引管理核心模块 -│ ├── manager.py # 前端索引管理器 +│ ├── manager.py # 索引管理器(前端操作) │ ├── reconciler.py # 后端调谐器 │ ├── base.py # 索引器基类定义 │ ├── vector_index.py # 向量索引实现 @@ -401,23 +444,22 @@ config/ ### 核心接口设计 -#### 前端管理接口 +#### 索引管理接口 ```python # aperag/index/manager.py -class FrontendIndexManager: - async def create_document_indexes(self, session, document_id, user, index_types) - async def update_document_indexes(self, session, document_id) - async def delete_document_indexes(self, session, document_id, index_types) - async def get_document_index_status(self, session, document_id) +class IndexManager: + def create_indexes(self, document_id, index_types, created_by, session) + def rebuild_indexes(self, document_id, index_types, created_by, session) + def delete_indexes(self, document_id, index_types, session) ``` #### 调谐器接口 ```python # aperag/index/reconciler.py -class BackendIndexReconciler: +class IndexReconciler: def reconcile_all(self) # 主调谐循环 def _get_indexes_needing_reconciliation(self, session) # 获取需要调谐的索引 - def _reconcile_grouped(self, indexes_needing_reconciliation) # 批量处理调谐 + def _reconcile_single_document(self, document_id, operations) # 处理单个文档 ``` #### 索引器接口 @@ -467,6 +509,21 @@ class WorkflowResult: index_results: List[IndexTaskResult] ``` +## 当前实现状态 + +### 简化架构特性 + +1. **移除分布式锁**:当前实现专注于通过版本验证确保正确性,而非分布式锁处理外部资源并发 +2. **单一状态模型**:从双状态(期望/实际)简化为带版本跟踪的单一状态 +3. **明确操作分离**:显式区分创建(v=1)和更新(v>1)操作 +4. **基于版本的验证**:通过在任务执行时检查版本来防止过期操作 + +### 未来考虑 + +1. **并发控制**:虽然为简化而移除了分布式锁,但未来实现可能需要解决外部系统(向量数据库、搜索引擎等)的并发操作问题 +2. **性能优化**:当前架构优先考虑正确性和简洁性而非最大性能 +3. 
**监控增强**:随着系统扩展,可能会添加额外的监控和告警功能 + ## 总结 ApeRAG的索引链路架构通过以下技术设计实现了高效的文档索引处理: @@ -475,15 +532,16 @@ ApeRAG的索引链路架构通过以下技术设计实现了高效的文档索 1. **响应速度快**:前端链路只操作数据库,API响应时间控制在毫秒级 2. **处理能力强**:后端异步处理支持大批量文档索引,通过并行任务提升吞吐量 -3. **错误恢复好**:多层次的重试机制和状态管理,支持部分失败场景的优雅处理 +3. **错误恢复好**:多层次的重试机制和基于版本的状态管理,支持部分失败场景的优雅处理 4. **系统解耦强**:TaskScheduler抽象层使得业务逻辑与具体任务系统解耦 -5. **监控完善**:全链路的状态追踪和日志记录,便于问题排查和性能优化 +5. **版本一致性**:版本验证防止过期操作并确保数据一致性 ### 技术特点 -1. **状态驱动**:通过期望状态和实际状态的差异检测,实现最终一致性 +1. **状态驱动**:通过检测版本不匹配和状态变化实现最终一致性 2. **动态编排**:运行时根据文档解析结果动态创建索引任务,避免静态工作流的局限性 3. **批量优化**:同一文档的多个索引任务共享解析结果,减少重复计算 4. **分层设计**:任务调度、业务逻辑、索引实现分层解耦,便于测试和维护 +5. **操作区分**:明确分离创建与更新操作,允许优化的处理策略 -这个架构在保证系统可靠性的同时,为高并发的文档索引场景提供了良好的性能和扩展性支持。 \ No newline at end of file +这个架构在保证系统可靠性和可维护性的同时,为高并发的文档索引场景提供了良好的性能和扩展性支持。 \ No newline at end of file From b04dc00a21be6c20e5c8be1b8091717d9495fbd3 Mon Sep 17 00:00:00 2001 From: Guo Ziang Date: Tue, 24 Jun 2025 17:08:59 +0800 Subject: [PATCH 4/6] chore: tidy up --- aperag/api/components/schemas/document.yaml | 6 +++--- aperag/service/document_service.py | 3 +++ frontend/src/api/models/document.ts | 6 +++--- frontend/src/api/openapi.merged.yaml | 6 +++--- frontend/src/constants/index.ts | 6 ++++-- frontend/src/locales/en-US.ts | 6 ++++-- frontend/src/locales/zh-CN.ts | 6 ++++-- 7 files changed, 24 insertions(+), 15 deletions(-) diff --git a/aperag/api/components/schemas/document.yaml b/aperag/api/components/schemas/document.yaml index 9b7ee41f0..32e889731 100644 --- a/aperag/api/components/schemas/document.yaml +++ b/aperag/api/components/schemas/document.yaml @@ -21,7 +21,7 @@ document: - CREATING - ACTIVE - DELETING - - DELETING_IN_PROGRESS + - DELETION_IN_PROGRESS - FAILED - SKIPPED fulltext_index_status: @@ -31,7 +31,7 @@ document: - CREATING - ACTIVE - DELETING - - DELETING_IN_PROGRESS + - DELETION_IN_PROGRESS - FAILED - SKIPPED graph_index_status: @@ -41,7 +41,7 @@ document: - CREATING - ACTIVE - DELETING - - DELETING_IN_PROGRESS + - DELETION_IN_PROGRESS - FAILED - SKIPPED 
vector_index_updated: diff --git a/aperag/service/document_service.py b/aperag/service/document_service.py index 52054d6e6..28488f8a0 100644 --- a/aperag/service/document_service.py +++ b/aperag/service/document_service.py @@ -361,6 +361,9 @@ async def _rebuild_document_indexes_atomically(session): collection = await self.db_ops.query_collection(user_id, collection_id) if not collection or collection.user != user_id: raise ResourceNotFoundException(f"Collection {collection_id} not found or access denied") + collection_config = json.loads(collection.config) + if not collection_config.get("enable_knowledge_graph", False): + index_type_enums.remove(db_models.DocumentIndexType.GRAPH) # Trigger index rebuild by incrementing version for selected index types await document_index_manager.create_or_update_document_indexes(session, document_id, index_type_enums) diff --git a/frontend/src/api/models/document.ts b/frontend/src/api/models/document.ts index 6da2c0aed..851777a6d 100644 --- a/frontend/src/api/models/document.ts +++ b/frontend/src/api/models/document.ts @@ -115,7 +115,7 @@ export const DocumentVectorIndexStatusEnum = { CREATING: 'CREATING', ACTIVE: 'ACTIVE', DELETING: 'DELETING', - DELETING_IN_PROGRESS: 'DELETING_IN_PROGRESS', + DELETION_IN_PROGRESS: 'DELETION_IN_PROGRESS', FAILED: 'FAILED', SKIPPED: 'SKIPPED' } as const; @@ -126,7 +126,7 @@ export const DocumentFulltextIndexStatusEnum = { CREATING: 'CREATING', ACTIVE: 'ACTIVE', DELETING: 'DELETING', - DELETING_IN_PROGRESS: 'DELETING_IN_PROGRESS', + DELETION_IN_PROGRESS: 'DELETION_IN_PROGRESS', FAILED: 'FAILED', SKIPPED: 'SKIPPED' } as const; @@ -137,7 +137,7 @@ export const DocumentGraphIndexStatusEnum = { CREATING: 'CREATING', ACTIVE: 'ACTIVE', DELETING: 'DELETING', - DELETING_IN_PROGRESS: 'DELETING_IN_PROGRESS', + DELETION_IN_PROGRESS: 'DELETION_IN_PROGRESS', FAILED: 'FAILED', SKIPPED: 'SKIPPED' } as const; diff --git a/frontend/src/api/openapi.merged.yaml b/frontend/src/api/openapi.merged.yaml index 
a34ac3bd3..bcff3f3be 100644 --- a/frontend/src/api/openapi.merged.yaml +++ b/frontend/src/api/openapi.merged.yaml @@ -2734,7 +2734,7 @@ components: - CREATING - ACTIVE - DELETING - - DELETING_IN_PROGRESS + - DELETION_IN_PROGRESS - FAILED - SKIPPED fulltext_index_status: @@ -2744,7 +2744,7 @@ components: - CREATING - ACTIVE - DELETING - - DELETING_IN_PROGRESS + - DELETION_IN_PROGRESS - FAILED - SKIPPED graph_index_status: @@ -2754,7 +2754,7 @@ components: - CREATING - ACTIVE - DELETING - - DELETING_IN_PROGRESS + - DELETION_IN_PROGRESS - FAILED - SKIPPED vector_index_updated: diff --git a/frontend/src/constants/index.ts b/frontend/src/constants/index.ts index 66dae34f3..650c0c8a4 100644 --- a/frontend/src/constants/index.ts +++ b/frontend/src/constants/index.ts @@ -216,9 +216,11 @@ export const UI_INDEX_STATUS: { | 'warning'; } = { PENDING: 'warning', - RUNNING: 'processing', + CREATING: 'processing', + ACTIVE: 'success', + DELETING: 'warning', + DELETION_IN_PROGRESS: 'processing', FAILED: 'error', - COMPLETE: 'success', SKIPPED: 'default', }; diff --git a/frontend/src/locales/en-US.ts b/frontend/src/locales/en-US.ts index 3eb0089db..1bb4bd9a5 100644 --- a/frontend/src/locales/en-US.ts +++ b/frontend/src/locales/en-US.ts @@ -223,8 +223,10 @@ export default { 'document.status.DELETING': 'Deleting', 'document.index.status': 'Index Status', 'document.index.status.PENDING': 'Pending', - 'document.index.status.RUNNING': 'Running', - 'document.index.status.COMPLETE': 'Completed', + 'document.index.status.CREATING': 'Creating', + 'document.index.status.ACTIVE': 'Active', + 'document.index.status.DELETING': 'Deleting', + 'document.index.status.DELETION_IN_PROGRESS': 'Deleting', 'document.index.status.FAILED': 'Failed', 'document.index.status.SKIPPED': 'Skipped', 'document.index.type.vector': 'Vector Index', diff --git a/frontend/src/locales/zh-CN.ts b/frontend/src/locales/zh-CN.ts index fbe8b8667..324c923b1 100644 --- a/frontend/src/locales/zh-CN.ts +++ 
b/frontend/src/locales/zh-CN.ts @@ -221,8 +221,10 @@ export default { 'document.status.DELETING': '删除中', 'document.index.status': '索引状态', 'document.index.status.PENDING': '待处理', - 'document.index.status.RUNNING': '运行中', - 'document.index.status.COMPLETE': '已完成', + 'document.index.status.CREATING': '创建中', + 'document.index.status.ACTIVE': '已完成', + 'document.index.status.DELETING': '删除中', + 'document.index.status.DELETION_IN_PROGRESS': '删除中', 'document.index.status.FAILED': '失败', 'document.index.status.SKIPPED': '已跳过', 'document.index.type.vector': '向量索引', From ed14ca0fc680f09c92fe54b956ac95e65bf09002 Mon Sep 17 00:00:00 2001 From: Guo Ziang Date: Tue, 24 Jun 2025 17:18:13 +0800 Subject: [PATCH 5/6] chore: tidy up --- aperag/service/document_service.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/aperag/service/document_service.py b/aperag/service/document_service.py index 28488f8a0..4cee3e7b9 100644 --- a/aperag/service/document_service.py +++ b/aperag/service/document_service.py @@ -363,7 +363,9 @@ async def _rebuild_document_indexes_atomically(session): raise ResourceNotFoundException(f"Collection {collection_id} not found or access denied") collection_config = json.loads(collection.config) if not collection_config.get("enable_knowledge_graph", False): - index_type_enums.remove(db_models.DocumentIndexType.GRAPH) + # Only remove GRAPH type if it's actually in the list to avoid ValueError + if db_models.DocumentIndexType.GRAPH in index_type_enums: + index_type_enums.remove(db_models.DocumentIndexType.GRAPH) # Trigger index rebuild by incrementing version for selected index types await document_index_manager.create_or_update_document_indexes(session, document_id, index_type_enums) From 8c580b6a9f6f34931e6935be8bb32d899b83cbd4 Mon Sep 17 00:00:00 2001 From: Guo Ziang Date: Tue, 24 Jun 2025 17:25:04 +0800 Subject: [PATCH 6/6] chore: tidy up --- aperag/db/models.py | 14 +++-- aperag/index/manager.py | 3 +- aperag/index/reconciler.py | 99 
++++++++++++++---------------- aperag/service/document_service.py | 27 ++++---- aperag/tasks/scheduler.py | 2 - 5 files changed, 70 insertions(+), 75 deletions(-) diff --git a/aperag/db/models.py b/aperag/db/models.py index 93d9fa73b..d6f610183 100644 --- a/aperag/db/models.py +++ b/aperag/db/models.py @@ -86,12 +86,12 @@ class DocumentIndexType(str, Enum): class DocumentIndexStatus(str, Enum): """Document index lifecycle status""" - PENDING = "PENDING" # Awaiting processing (create/update) - CREATING = "CREATING" # Task claimed, creation/update in progress - ACTIVE = "ACTIVE" # Index is up-to-date and ready for use - DELETING = "DELETING" # Deletion has been requested + PENDING = "PENDING" # Awaiting processing (create/update) + CREATING = "CREATING" # Task claimed, creation/update in progress + ACTIVE = "ACTIVE" # Index is up-to-date and ready for use + DELETING = "DELETING" # Deletion has been requested DELETION_IN_PROGRESS = "DELETION_IN_PROGRESS" # Task claimed, deletion in progress - FAILED = "FAILED" # The last operation failed + FAILED = "FAILED" # The last operation failed class BotStatus(str, Enum): @@ -240,7 +240,9 @@ def get_overall_index_status(self, session) -> "DocumentStatus": if any(status == DocumentIndexStatus.FAILED for status in statuses): return DocumentStatus.FAILED - elif any(status in [DocumentIndexStatus.CREATING, DocumentIndexStatus.DELETION_IN_PROGRESS] for status in statuses): + elif any( + status in [DocumentIndexStatus.CREATING, DocumentIndexStatus.DELETION_IN_PROGRESS] for status in statuses + ): return DocumentStatus.RUNNING elif all(status == DocumentIndexStatus.ACTIVE for status in statuses): return DocumentStatus.COMPLETE diff --git a/aperag/index/manager.py b/aperag/index/manager.py index c559e158c..843bbe0b2 100644 --- a/aperag/index/manager.py +++ b/aperag/index/manager.py @@ -12,14 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import json import logging from typing import List, Optional from sqlalchemy import and_, select from sqlalchemy.ext.asyncio import AsyncSession -from aperag.db.models import DocumentIndex, DocumentIndexType, DocumentIndexStatus, utc_now +from aperag.db.models import DocumentIndex, DocumentIndexStatus, DocumentIndexType, utc_now logger = logging.getLogger(__name__) diff --git a/aperag/index/reconciler.py b/aperag/index/reconciler.py index f6f2f872f..b1339d70d 100644 --- a/aperag/index/reconciler.py +++ b/aperag/index/reconciler.py @@ -13,23 +13,22 @@ # limitations under the License. import logging -import time from typing import List, Optional -from sqlalchemy import and_, or_, select, update +from sqlalchemy import and_, select, update from sqlalchemy.orm import Session from aperag.config import get_sync_session from aperag.db.models import ( Document, DocumentIndex, - DocumentIndexType, DocumentIndexStatus, + DocumentIndexType, DocumentStatus, ) from aperag.tasks.scheduler import TaskScheduler, create_task_scheduler -from aperag.utils.utils import utc_now from aperag.utils.constant import IndexAction +from aperag.utils.utils import utc_now logger = logging.getLogger(__name__) @@ -89,7 +88,7 @@ def _get_indexes_needing_reconciliation(self, session: Session) -> List[Document DocumentIndex.status == DocumentIndexStatus.DELETING, ), } - + for action, condition in conditions.items(): stmt = select(DocumentIndex).where(condition) result = session.execute(stmt) @@ -128,7 +127,7 @@ def _claim_document_indexes(self, session: Session, document_id: str, indexes_to Returns list of successfully claimed indexes with their details. 
""" claimed_indexes = [] - + try: for index_id, index_type, action in indexes_to_claim: if action in [IndexAction.CREATE, IndexAction.UPDATE]: @@ -142,7 +141,7 @@ def _claim_document_indexes(self, session: Session, document_id: str, indexes_to stmt = select(DocumentIndex).where(DocumentIndex.id == index_id) result = session.execute(stmt) current_index = result.scalar_one_or_none() - + if not current_index: continue @@ -177,13 +176,17 @@ def _claim_document_indexes(self, session: Session, document_id: str, indexes_to result = session.execute(update_stmt) if result.rowcount > 0: # Successfully claimed this index - claimed_indexes.append({ - 'index_id': index_id, - 'document_id': document_id, - 'index_type': index_type, - 'action': action, - 'target_version': current_index.version if action in [IndexAction.CREATE, IndexAction.UPDATE] else None, - }) + claimed_indexes.append( + { + "index_id": index_id, + "document_id": document_id, + "index_type": index_type, + "action": action, + "target_version": current_index.version + if action in [IndexAction.CREATE, IndexAction.UPDATE] + else None, + } + ) logger.debug(f"Claimed index {index_id} for document {document_id} ({action})") else: logger.debug(f"Could not claim index {index_id} for document {document_id}") @@ -199,64 +202,57 @@ def _reconcile_document_operations(self, document_id: str, claimed_indexes: List Reconcile operations for a single document, batching same operation types together """ from collections import defaultdict - + # Group by operation type to batch operations operations_by_type = defaultdict(list) for claimed_index in claimed_indexes: - action = claimed_index['action'] + action = claimed_index["action"] operations_by_type[action].append(claimed_index) # Process create operations as a batch if IndexAction.CREATE in operations_by_type: create_indexes = operations_by_type[IndexAction.CREATE] - create_types = [claimed_index['index_type'] for claimed_index in create_indexes] + create_types = 
[claimed_index["index_type"] for claimed_index in create_indexes] context = {} - + for claimed_index in create_indexes: - index_type = claimed_index['index_type'] - target_version = claimed_index.get('target_version') - + index_type = claimed_index["index_type"] + target_version = claimed_index.get("target_version") + # Store version info in context if target_version is not None: context[f"{index_type}_version"] = target_version - - task_id = self.task_scheduler.schedule_create_index( - document_id=document_id, - index_types=create_types, - context=context + + self.task_scheduler.schedule_create_index( + document_id=document_id, index_types=create_types, context=context ) logger.info(f"Scheduled create task for document {document_id}, types: {create_types}") - + # Process update operations as a batch if IndexAction.UPDATE in operations_by_type: update_indexes = operations_by_type[IndexAction.UPDATE] - update_types = [claimed_index['index_type'] for claimed_index in update_indexes] + update_types = [claimed_index["index_type"] for claimed_index in update_indexes] context = {} for claimed_index in update_indexes: - index_type = claimed_index['index_type'] - target_version = claimed_index.get('target_version') - + index_type = claimed_index["index_type"] + target_version = claimed_index.get("target_version") + # Store version info in context if target_version is not None: context[f"{index_type}_version"] = target_version - - task_id = self.task_scheduler.schedule_update_index( - document_id=document_id, - index_types=update_types, - context=context + + self.task_scheduler.schedule_update_index( + document_id=document_id, index_types=update_types, context=context ) logger.info(f"Scheduled update task for document {document_id}, types: {update_types}") # Process delete operations as a batch if IndexAction.DELETE in operations_by_type: delete_indexes = operations_by_type[IndexAction.DELETE] - delete_types = [claimed_index['index_type'] for claimed_index in 
delete_indexes] - - task_id = self.task_scheduler.schedule_delete_index( - document_id=document_id, - index_types=delete_types - ) + delete_types = [claimed_index["index_type"] for claimed_index in delete_indexes] + + self.task_scheduler.schedule_delete_index(document_id=document_id, index_types=delete_types) logger.info(f"Scheduled delete task for document {document_id}, types: {delete_types}") @@ -322,7 +318,9 @@ def on_index_failed(document_id: str, index_type: str, error_message: str): DocumentIndex.document_id == document_id, DocumentIndex.index_type == DocumentIndexType(index_type), # Allow transition from any in-progress state - DocumentIndex.status.in_([DocumentIndexStatus.CREATING, DocumentIndexStatus.DELETION_IN_PROGRESS]), + DocumentIndex.status.in_( + [DocumentIndexStatus.CREATING, DocumentIndexStatus.DELETION_IN_PROGRESS] + ), ) ) .values( @@ -350,15 +348,12 @@ def on_index_deleted(document_id: str, index_type: str): for session in get_sync_session(): # Delete the record entirely from sqlalchemy import delete - - delete_stmt = ( - delete(DocumentIndex) - .where( - and_( - DocumentIndex.document_id == document_id, - DocumentIndex.index_type == DocumentIndexType(index_type), - DocumentIndex.status == DocumentIndexStatus.DELETION_IN_PROGRESS, - ) + + delete_stmt = delete(DocumentIndex).where( + and_( + DocumentIndex.document_id == document_id, + DocumentIndex.index_type == DocumentIndexType(index_type), + DocumentIndex.status == DocumentIndexStatus.DELETION_IN_PROGRESS, ) ) diff --git a/aperag/service/document_service.py b/aperag/service/document_service.py index 4cee3e7b9..8fbb84c30 100644 --- a/aperag/service/document_service.py +++ b/aperag/service/document_service.py @@ -15,7 +15,7 @@ import json import logging import os -from typing import List, Optional +from typing import List from asgiref.sync import sync_to_async from fastapi import UploadFile @@ -38,7 +38,7 @@ from aperag.index.manager import document_index_manager from aperag.objectstore.base 
import get_object_store from aperag.schema import view_models -from aperag.schema.view_models import Document, DocumentList +from aperag.schema.view_models import DocumentList from aperag.utils.constant import QuotaType from aperag.utils.uncompress import SUPPORTED_COMPRESSED_EXTENSIONS @@ -83,7 +83,8 @@ async def build_document_response( """Build Document response object for API return using new status model.""" from sqlalchemy import select - from aperag.db.models import DocumentIndex, DocumentIndexType + from aperag.db.models import DocumentIndex + # Get all document indexes for status calculation document_indexes = await session.execute( select(DocumentIndex).where( @@ -97,12 +98,16 @@ async def build_document_response( # Map index states to API response format index_status = {} index_updated = {} - + # Initialize all types as SKIPPED (when no record exists) - all_types = [db_models.DocumentIndexType.VECTOR, db_models.DocumentIndexType.FULLTEXT, db_models.DocumentIndexType.GRAPH] + all_types = [ + db_models.DocumentIndexType.VECTOR, + db_models.DocumentIndexType.FULLTEXT, + db_models.DocumentIndexType.GRAPH, + ] for index_type in all_types: index_status[index_type] = "SKIPPED" - + # Update with actual states from database for index in indexes: index_status[index.index_type] = index.status @@ -208,9 +213,7 @@ async def _create_documents_atomically(session): # Use index manager to create indexes with new status model await document_index_manager.create_or_update_document_indexes( - document_id=document_instance.id, - index_types=index_types, - session=session + document_id=document_instance.id, index_types=index_types, session=session ) # Build response object @@ -263,14 +266,12 @@ async def _delete_document(self, session: AsyncSession, user: str, collection_id return # Use index manager to mark all related indexes for deletion - await document_index_manager.delete_document_indexes( - document_id=document.id, index_types=None, session=session - ) + await 
document_index_manager.delete_document_indexes(document_id=document.id, index_types=None, session=session) # Delete from object store obj_store = get_object_store() metadata = json.loads(document.doc_metadata) if document.doc_metadata else {} - if object_path := metadata.get("object_path"): + if metadata.get("object_path"): try: # Use delete_objects_by_prefix to remove all related files (original, chunks, etc.) await sync_to_async(obj_store.delete_objects_by_prefix)(document.object_store_base_path()) diff --git a/aperag/tasks/scheduler.py b/aperag/tasks/scheduler.py index 247b35424..89f302f99 100644 --- a/aperag/tasks/scheduler.py +++ b/aperag/tasks/scheduler.py @@ -16,8 +16,6 @@ from abc import ABC, abstractmethod from typing import Any, List, Optional -from aperag.tasks.utils import cleanup_local_document, parse_document_content - logger = logging.getLogger(__name__)