Skip to content

Commit e4c211a

Browse files
committed
Fixes to RAG retrieval for policy prompts.
1 parent 510a2d3 commit e4c211a

File tree

2 files changed

+64
-39
lines changed

2 files changed

+64
-39
lines changed

main_chat/chat_route.py

Lines changed: 31 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -167,13 +167,7 @@ def _check_if_needs_new_data(
167167
"Return ONLY valid JSON with keys: needs_new_data (boolean) and reason (brief string explaining your decision)."
168168
)
169169

170-
user_prompt = (
171-
"Conversation History:\n" + (history_context if history_context else "(No previous conversation)") + "\n\n"
172-
"Cached Data:\n" + cache_summary + "\n\n"
173-
"Current Question: " + question + "\n\n"
174-
"Analyze if this question can be answered from the conversation history and/or cached data above, or if it needs new data retrieval.\n"
175-
"Return JSON only."
176-
)
170+
user_prompt = "Conversation History:\n" + (history_context if history_context else "(No previous conversation)") + "\n\n" "Cached Data:\n" + cache_summary + "\n\n" "Current Question: " + question + "\n\n" "Analyze if this question can be answered from the conversation history and/or cached data above, or if it needs new data retrieval.\n" "Return JSON only."
177171

178172
default_result = {"needs_new_data": True, "reason": "Error analyzing question, defaulting to new data"}
179173

@@ -266,11 +260,11 @@ def _route_question(question: str) -> Dict[str, Any]:
266260
" - If question mentions a SPECIFIC policy by name (e.g., 'Anti-Displacement Plan', 'Slow Streets', 'Imagine Boston 2030'):\n"
267261
" * Set policy_sources to that specific document (e.g., ['Boston Anti-Displacement Plan Analysis.txt'])\n"
268262
" * ALWAYS also add relevant transcript_tags for community perspective\n"
269-
" - If question is GENERAL about policy/planning but doesn't name a specific document:\n"
270-
" * Examples: 'What are current policy issues?', 'What policies affect housing?', 'What is being planned for the neighborhood?'\n"
263+
" - If question is GENERAL about policy/policies/planning/housing (doesn't name a specific document):\n"
264+
" * Examples: 'What are current policy issues?', 'What policies affect housing?', 'What is being planned for the neighborhood?', 'What does the city say about displacement?'\n"
271265
" * Set policy_sources to null (will search ALL policy documents)\n"
272-
" * Add relevant transcript_tags\n"
273-
" * Set k to at least 10 to get diverse policy coverage\n\n"
266+
" * Set folder_categories to ['policies'] to search the policy folder\n"
267+
" * Set k to at least 15 to get diverse policy coverage\n\n"
274268
"RULE 6: COMBINED DATA + CONTEXT → 'hybrid' mode\n"
275269
" - Questions that explicitly ask for BOTH numbers/data AND context/explanation\n"
276270
" - Examples: 'How many homicides and what concerns come up?', 'Show trends and how policies address them'\n"
@@ -304,7 +298,7 @@ def _route_question(question: str) -> Dict[str, Any]:
304298
"Question:\n" + question + "\n\n"
305299
"Policy sources include: 'Boston Anti-Displacement Plan Analysis.txt', 'Boston Slow Streets Plan Analysis.txt', 'Imagine Boston 2030 Analysis.txt'.\n"
306300
"Transcript tags include: safety, violence, youth, media, community, displacement, government, structural racism.\n"
307-
"Folder categories (for client uploads): newsletters, policy, transcripts.\n"
301+
"Folder categories (for client uploads): newsletters, policies, transcripts.\n"
308302
"Output JSON only."
309303
)
310304

@@ -584,39 +578,54 @@ def _run_rag(question: str, plan: Dict[str, Any], conversation_history: Optional
584578
k = int(plan.get("k", 5))
585579
tags = plan.get("transcript_tags")
586580
sources = plan.get("policy_sources")
581+
folders = plan.get("folder_categories")
587582

588583
combined_chunks: List[str] = []
589584
combined_meta: List[Dict[str, Any]] = []
590585

591586
# Increase k for better source diversity
592-
# Retrieve more chunks to get more diverse sources in the citations
593-
# Use at least 20 chunks to ensure we get multiple unique sources
594-
retrieval_k = max(k * 3, 20) # At least 20 chunks for source diversity
587+
retrieval_k = max(k * 3, 20)
595588

596-
# transcripts
589+
# ========================================================================
590+
# TRANSCRIPTS - retrieve with tags
591+
# ========================================================================
597592
try:
598593
t_res = rag_retrieval.retrieve_transcripts(question, tags=tags, k=retrieval_k)
599594
t_chunks = t_res.get("chunks", [])
600-
print(f" 📝 Transcripts: {len(t_chunks)} chunks found")
595+
print(f" 📄 Transcripts: {len(t_chunks)} chunks found")
601596
combined_chunks.extend(t_chunks)
602597
combined_meta.extend(t_res.get("metadata", []))
603598
except Exception as e:
604599
print(f" ⚠ Transcript retrieval error: {e}")
605600

606-
# policies
601+
# ========================================================================
602+
# POLICIES - retrieve from CLIENT_UPLOAD with folder_category filter
603+
# ========================================================================
607604
try:
608605
if sources:
609-
print(f" 🔍 Policy sources requested: {sources}")
610-
# When specific policy sources are requested
606+
# Specific policy documents requested by name
607+
print(f" 📚 Policy sources requested: {sources}")
611608
for src in sources:
612-
print(f" 🔍 Querying policy source: {src}")
609+
print(f" 📚 Querying policy source: {src}")
613610
p_res = rag_retrieval.retrieve_policies(question, k=retrieval_k, source=src)
614611
p_chunks = p_res.get("chunks", [])
615612
print(f" Found {len(p_chunks)} chunks from {src}")
616613
combined_chunks.extend(p_chunks)
617614
combined_meta.extend(p_res.get("metadata", []))
615+
elif folders:
616+
# Folder categories specified (e.g., ["policies", "newsletters"])
617+
print(f" 📁 Folder categories requested: {folders}")
618+
folders_list = folders if isinstance(folders, list) else [folders]
619+
620+
for folder in folders_list:
621+
p_res = rag_retrieval.retrieve(question, k=retrieval_k, doc_type="client_upload", folder_category=folder)
622+
p_chunks = p_res.get("chunks", [])
623+
print(f" Found {len(p_chunks)} chunks from folder: {folder}")
624+
combined_chunks.extend(p_chunks)
625+
combined_meta.extend(p_res.get("metadata", []))
618626
else:
619-
print(" 🔍 No specific policy sources, searching all policies")
627+
# No specific sources or folders - search all policies by default
628+
print(" 📚 No specific policy sources, searching all policies")
620629
p_res = rag_retrieval.retrieve_policies(question, k=retrieval_k)
621630
p_chunks = p_res.get("chunks", [])
622631
print(f" Policies: {len(p_chunks)} chunks found")

main_chat/rag_pipeline/rag_retrieval.py

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,10 @@ def load_vectordb():
3535
return vectordb
3636

3737

38-
def retrieve(query, k=5, doc_type=None, tags=None, source=None, min_score=None, vectordb=None):
38+
def retrieve(query, k=5, doc_type=None, tags=None, source=None, folder_category=None, min_score=None, vectordb=None):
3939
"""
4040
Universal retrieval with flexible metadata filtering.
41-
[... rest of function unchanged ...]
41+
Added 'folder_category' parameter to filter by Google Drive subfolder.
4242
"""
4343
# Defensive clamp: Chroma requires k >= 1
4444
try:
@@ -52,8 +52,9 @@ def retrieve(query, k=5, doc_type=None, tags=None, source=None, min_score=None,
5252
vectordb = load_vectordb()
5353

5454
# Build filter dictionary
55-
filter_dict = None
55+
filter_conditions = []
5656

57+
# Doc type filter
5758
doc_filter = None
5859
if isinstance(doc_type, (list, tuple)):
5960
doc_types = [dt for dt in doc_type if dt]
@@ -64,17 +65,29 @@ def retrieve(query, k=5, doc_type=None, tags=None, source=None, min_score=None,
6465
elif doc_type:
6566
doc_filter = {"doc_type": doc_type}
6667

67-
if doc_filter and source:
68-
filter_dict = {
69-
"$and": [
70-
doc_filter,
71-
{"source": source},
72-
]
73-
}
74-
elif doc_filter:
75-
filter_dict = doc_filter
76-
elif source:
77-
filter_dict = {"source": source}
68+
if doc_filter:
69+
filter_conditions.append(doc_filter)
70+
71+
# Source filter
72+
if source:
73+
filter_conditions.append({"source": source})
74+
75+
# Folder category filter (NEW)
76+
if folder_category:
77+
if isinstance(folder_category, (list, tuple)):
78+
if len(folder_category) == 1:
79+
filter_conditions.append({"folder_category": folder_category[0]})
80+
elif len(folder_category) > 1:
81+
filter_conditions.append({"$or": [{"folder_category": f} for f in folder_category]})
82+
else:
83+
filter_conditions.append({"folder_category": folder_category})
84+
85+
# Combine all conditions
86+
filter_dict = None
87+
if len(filter_conditions) == 1:
88+
filter_dict = filter_conditions[0]
89+
elif len(filter_conditions) > 1:
90+
filter_dict = {"$and": filter_conditions}
7891

7992
if min_score is not None:
8093
results_with_scores = vectordb.similarity_search_with_score(query, k=k * 3 if tags else k, filter=filter_dict if filter_dict else None)
@@ -114,12 +127,15 @@ def retrieve(query, k=5, doc_type=None, tags=None, source=None, min_score=None,
114127

115128
def retrieve_transcripts(query, tags=None, k=5):
116129
"""Convenience function for transcript-only search."""
117-
return retrieve(query, k=k, doc_type="transcript", tags=tags)
130+
return retrieve(query, k=k, doc_type="transcripts", tags=tags)
118131

119132

120133
def retrieve_policies(query, k=5, source=None):
121-
"""Convenience function for policy-only search."""
122-
return retrieve(query, k=k, doc_type="policy", source=source)
134+
"""
135+
Convenience function for policy-only search.
136+
Searches CLIENT_UPLOAD documents in the 'policies' folder category.
137+
"""
138+
return retrieve(query, k=k, doc_type="client_upload", folder_category="policies", source=source) # Changed from "policy" to "policies"
123139

124140

125141
def format_results(result_dict):

0 commit comments

Comments
 (0)