param20h
diff --git a/backend/app/database.py b/backend/app/database.py
Lines changed: 1 addition & 7 deletions
diff --git a/backend/app/main.py b/backend/app/main.py
Lines changed: 64 additions & 0 deletions
diff --git a/backend/app/models.py b/backend/app/models.py
Lines changed: 29 additions & 1 deletion
diff --git a/backend/app/rag/agent.py b/backend/app/rag/agent.py
Lines changed: 28 additions & 3 deletions
diff --git a/backend/app/rag/bm25.py b/backend/app/rag/bm25.py
Lines changed: 7 additions & 2 deletions
diff --git a/backend/app/rag/graph_retriever.py b/backend/app/rag/graph_retriever.py
Lines changed: 18 additions & 2 deletions
@@ -189,13 +189,7 @@ def _migrate_schema():
         ("documents", "drive_file_id", "ALTER TABLE documents ADD COLUMN drive_file_id VARCHAR(255)"),
         ("documents", "drive_folder_id", "ALTER TABLE documents ADD COLUMN drive_folder_id VARCHAR(255)"),
         ("documents", "drive_synced_at", "ALTER TABLE documents ADD COLUMN drive_synced_at TIMESTAMP"),
-        ("documents", "processing_progress", "ALTER TABLE documents ADD COLUMN processing_progress INTEGER DEFAULT 0"),
-        ("documents", "processing_stage", "ALTER TABLE documents ADD COLUMN processing_stage VARCHAR(20) DEFAULT 'queued'"),
-        ("documents", "retry_count", "ALTER TABLE documents ADD COLUMN retry_count INTEGER DEFAULT 0"),
-        ("documents", "last_error_traceback", "ALTER TABLE documents ADD COLUMN last_error_traceback TEXT"),
-        ("documents", "processing_started_at", "ALTER TABLE documents ADD COLUMN processing_started_at TIMESTAMP"),
-        ("documents", "completed_at", "ALTER TABLE documents ADD COLUMN completed_at TIMESTAMP"),
-        ("documents", "extracted_urls", "ALTER TABLE documents ADD COLUMN extracted_urls TEXT"),
+        ("documents", "workspace_id", "ALTER TABLE documents ADD COLUMN workspace_id VARCHAR(36)"),
     ]
     for table, column, ddl in docs_migrations:
         if column not in existing_docs_columns:
 
@@ -40,6 +40,70 @@
 settings = get_settings()
 
 
+async def document_cleanup_job(retention_days: int = 30, interval_seconds: float = 86400):
+    """Background loop that periodically purges documents nobody has accessed recently.
+
+    A document is expired when ``last_accessed_at`` is older than the cutoff, or
+    when it was never accessed and ``uploaded_at`` is older than the cutoff.
+
+    Each pass is synchronous work (DB session, filesystem, vector and graph
+    stores), so it runs in the default executor instead of blocking the event
+    loop. Documents are committed one at a time, DB row first, so a failure on
+    one document neither rolls back the others nor leaves a row whose file is
+    already gone.
+
+    Args:
+        retention_days: Days of inactivity after which a document is purged.
+        interval_seconds: Pause between passes (default: 24 hours).
+
+    Errors raised by a pass are logged and the loop continues with the next pass.
+    """
+    import asyncio
+    from datetime import datetime, timedelta, timezone
+
+    def _purge_expired_documents() -> int:
+        """Delete each expired document plus its artifacts; return the purge count."""
+        from app.database import SessionLocal
+        from app.models import Document
+        from app.rag.vectorstore import delete_document_chunks
+        from sqlalchemy import or_
+
+        purged = 0
+        db = SessionLocal()
+        try:
+            cutoff = datetime.now(timezone.utc) - timedelta(days=retention_days)
+            expired_docs = db.query(Document).filter(
+                or_(
+                    Document.last_accessed_at < cutoff,
+                    Document.last_accessed_at.is_(None) & (Document.uploaded_at < cutoff),
+                )
+            ).all()
+
+            for doc in expired_docs:
+                # Read everything up front: the instance must not be touched
+                # once its row has been deleted and committed.
+                doc_id, user_id = doc.id, doc.user_id
+                # str(): os.path.join rejects non-string ids (e.g. UUID objects).
+                filepath = os.path.join(settings.UPLOAD_DIR, str(user_id), doc.filename)
+                logger.info(f"Auto-cleanup: Purging document {doc_id} ('{doc.original_name}') due to inactivity since {doc.last_accessed_at or doc.uploaded_at}")
+
+                # Delete database record first, one commit per document.
+                try:
+                    db.delete(doc)
+                    db.commit()
+                except Exception as exc:
+                    # Roll back so the session stays usable for the remaining documents.
+                    db.rollback()
+                    logger.error(f"Auto-cleanup: Failed to delete database record for document {doc_id}: {exc}", exc_info=True)
+                    continue
+                purged += 1
+
+                # Artifact removal below is best-effort: failures are only logged.
+                # Delete physical file
+                if os.path.exists(filepath):
+                    try:
+                        os.remove(filepath)
+                    except Exception as e:
+                        logger.warning(f"Auto-cleanup: Failed to delete physical file {filepath}: {e}")
+
+                # Delete vectors
+                try:
+                    delete_document_chunks(document_id=doc_id, user_id=user_id)
+                except Exception as e:
+                    logger.warning(f"Auto-cleanup: Error deleting vectors for document {doc_id}: {e}")
+
+                # Delete knowledge graph (imported here so an import failure is
+                # logged per document instead of aborting the whole pass).
+                try:
+                    from app.rag.graph_builder import delete_graph
+                    delete_graph(user_id=user_id, document_id=doc_id)
+                except Exception as e:
+                    logger.warning(f"Auto-cleanup: Error deleting graph for document {doc_id}: {e}")
+        finally:
+            db.close()
+        return purged
+
+    logger.info("Starting document cleanup background job loop")
+    while True:
+        try:
+            # Run the blocking pass in a worker thread; the event loop stays free.
+            purged = await asyncio.get_running_loop().run_in_executor(None, _purge_expired_documents)
+            if purged:
+                logger.info(f"Auto-cleanup: Purged {purged} documents.")
+        except Exception as e:
+            logger.error(f"Error in document cleanup background loop: {e}", exc_info=True)
+
+        # Wait for the next pass (24 hours by default).
+        await asyncio.sleep(interval_seconds)
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Application startup/shutdown lifecycle."""
 
@@ -171,7 +171,7 @@ class User(Base):
         cascade="all, delete-orphan",
     )
     workspace_memberships = relationship(
-        "WorkspaceMember",
+        "WorkspaceMembership",
         back_populates="user",
         cascade="all, delete-orphan",
     )
@@ -197,6 +197,32 @@ class ApiKey(Base):
     user = relationship("User", back_populates="api_keys")
 
 
+class Workspace(Base):
+    """A named workspace that groups memberships and documents."""
+
+    __tablename__ = "workspaces"
+
+    id = Column(String(36), default=generate_uuid, primary_key=True)
+    name = Column(String(100), nullable=False)
+    # Defaults to the current UTC time when no value is supplied.
+    created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc))
+
+    # Relationships
+    # Memberships are owned by the workspace: the "all, delete-orphan" cascade
+    # deletes them along with the workspace or when removed from this collection.
+    memberships = relationship(
+        "WorkspaceMembership",
+        cascade="all, delete-orphan",
+        back_populates="workspace",
+    )
+    # No delete cascade is configured for documents.
+    documents = relationship(
+        "Document",
+        back_populates="workspace",
+    )
+
+
+class WorkspaceMembership(Base):
+    """Association row linking a user to a workspace, together with a role."""
+
+    __tablename__ = "workspace_memberships"
+
+    id = Column(String(36), default=generate_uuid, primary_key=True)
+    workspace_id = Column(
+        String(36),
+        ForeignKey("workspaces.id"),
+        index=True,
+        nullable=False,
+    )
+    user_id = Column(
+        GUID,
+        ForeignKey("users.id"),
+        index=True,
+        nullable=False,
+    )
+    # Expected values: "admin" | "member" (the column itself accepts any string).
+    role = Column(String(20), nullable=False, default="member")
+    # Defaults to the current UTC time when no value is supplied.
+    joined_at = Column(DateTime, default=lambda: datetime.now(timezone.utc))
+    # NOTE(review): no unique constraint on (workspace_id, user_id) is declared
+    # in this class — confirm duplicate memberships are prevented elsewhere.
+
+    # Relationships
+    workspace = relationship(
+        "Workspace",
+        back_populates="memberships",
+    )
+    user = relationship(
+        "User",
+        back_populates="workspace_memberships",
+    )
+
+
 class WorkspaceInvitation(Base):
     __tablename__ = "workspace_invitations"
 
@@ -348,6 +374,7 @@ class Document(Base):
     drive_synced_at = Column(DateTime, nullable=True)
     is_deleted = Column(Boolean, default=False, nullable=False, index=True)
     deleted_at = Column(DateTime, nullable=True)
+    workspace_id = Column(String(36), ForeignKey("workspaces.id"), nullable=True, index=True)
     processing_progress = Column(Integer, default=0)
     processing_stage = Column(String(20), default="queued")
     retry_count = Column(Integer, default=0)
@@ -358,6 +385,7 @@ class Document(Base):
 
     # Relationships
     owner = relationship("User", back_populates="documents")
+    workspace = relationship("Workspace", back_populates="documents")
     messages = relationship(
         "ChatMessage",
         back_populates="document",
 
@@ -65,11 +65,18 @@ def get_agent_executor(
     hf_token: Optional[str] = None,
     top_k: Optional[int] = None,
     chat_history: Optional[List[Dict[str, str]]] = None,
+    workspace: Optional[str] = None,
 ):
     """Initialize the LangChain ReAct agent executor."""
 
     # Initialize tools
-    pdf_tool = PDFSearchTool(user_id=user_id, document_id=document_id, document_ids=document_ids, top_k=top_k)
+    pdf_tool = PDFSearchTool(
+        user_id=user_id,
+        document_id=document_id,
+        document_ids=document_ids,
+        workspace=workspace,
+        top_k=top_k,
+    )
     tools = [pdf_tool, MathTool(), WebSearchTool()]
 
     # Initialize LLM
@@ -140,6 +147,7 @@ def generate_answer(
     hf_token: Optional[str] = None,
     top_k: Optional[int] = None,
     chat_history: Optional[List[Dict[str, str]]] = None,
+    workspace: Optional[str] = None,
 ) -> Dict[str, Any]:
     """
     Agentic generation: retrieve via tools → reason → generate answer.
@@ -165,7 +173,15 @@ def generate_answer(
 
     # ── Run Agent ────────────────────────────────────
     try:
-        executor, pdf_tool, formatted_history = get_agent_executor(user_id, document_id, document_ids, hf_token, top_k, chat_history)
+        executor, pdf_tool, formatted_history = get_agent_executor(
+            user_id=user_id,
+            document_id=document_id,
+            document_ids=document_ids,
+            hf_token=hf_token,
+            top_k=top_k,
+            chat_history=chat_history,
+            workspace=workspace,
+        )
         result = executor.invoke({"input": question, "chat_history": formatted_history})
 
         raw_answer = result.get("output", "")
@@ -214,6 +230,7 @@ def generate_answer_stream(
     hf_token: Optional[str] = None,
     top_k: Optional[int] = None,
     chat_history: Optional[List[Dict[str, str]]] = None,
+    workspace: Optional[str] = None,
 ) -> Generator[str, None, None]:
     """
     Streaming Agentic pipeline.
@@ -239,7 +256,15 @@ def generate_answer_stream(
 
     # ── Run Agent ────────────────────────────────────
     try:
-        executor, pdf_tool, formatted_history = get_agent_executor(user_id, document_id, document_ids, hf_token, top_k, chat_history)
+        executor, pdf_tool, formatted_history = get_agent_executor(
+            user_id=user_id,
+            document_id=document_id,
+            document_ids=document_ids,
+            hf_token=hf_token,
+            top_k=top_k,
+            chat_history=chat_history,
+            workspace=workspace,
+        )
 
         sources_sent = False
 
 
@@ -169,8 +169,13 @@ def query_bm25(
 
     user_dir = get_bm25_dir(user_id)
     all_results = []
-
-    for path in glob.glob(os.path.join(user_dir, "*.json")):
+    
+    for path in glob.glob(os.path.join(user_dir, "*.pkl")):
+        # Filter by document_ids if provided
+        if document_ids is not None:
+            doc_id = os.path.basename(path).rsplit(".", 1)[0]
+            if doc_id not in document_ids:
+                continue
         results = _query_single_index(path, tokenized_query, top_k)
         all_results.extend(results)
 
 
@@ -18,10 +18,21 @@
 settings = get_settings()
 
 
-def _candidate_graphs(user_id: str, document_id: Optional[str]) -> Iterable[nx.Graph]:
+def _candidate_graphs(
+    user_id: str,
+    document_id: Optional[str],
+    document_ids: Optional[List[str]] = None,
+) -> Iterable[nx.Graph]:
     if document_id:
         graph = load_graph(user_id, document_id)
         return [graph] if graph is not None else []
+    elif document_ids:
+        graphs = []
+        for doc_id in document_ids:
+            graph = load_graph(user_id, doc_id)
+            if graph is not None:
+                graphs.append(graph)
+        return graphs
 
     graphs = []
     for path in iter_graph_paths(user_id):
@@ -67,12 +78,17 @@ def get_entity_context(
     query: str,
     user_id: str,
     document_id: Optional[str] = None,
+    document_ids: Optional[List[str]] = None,
 ) -> str:
     """Return compact graph relationship context relevant to the query."""
     relationships: Dict[Tuple[str, str], Dict[str, object]] = {}
 
     try:
-        graphs = _candidate_graphs(user_id=user_id, document_id=document_id)
+        graphs = _candidate_graphs(
+            user_id=user_id,
+            document_id=document_id,
+            document_ids=document_ids,
+        )
         for graph in graphs:
             matched_nodes = _match_query_nodes(graph, query)