FEWS-NET
diff --git a/‎.gitignore‎
Lines changed: 0 additions & 1 deletion b/‎.gitignore‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎notebooks/outcome_analysis/README.md‎
Lines changed: 25 additions & 0 deletions b/‎notebooks/outcome_analysis/README.md‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎notebooks/outcome_analysis/current_experiments/hea_documentation_QA_bot.py‎
Lines changed: 136 additions & 0 deletions b/‎notebooks/outcome_analysis/current_experiments/hea_documentation_QA_bot.py‎
Lines changed: 136 additions & 0 deletions
@@ -54,7 +54,6 @@ reports/
 # Compiled Assets and Media
 assets/
 media/
-notebooks/
 static/dojo/
 
 # any node or bower installation folders
 
@@ -0,0 +1,25 @@
+# Current Experiments
+
+**hea_documentation_QA_bot.py** ; initial proof of concept of an HEA/LIAS "chatbot" agent, designed to answer questions by referencing its knowledge base of HEA documentation. The goal is to integrate this with other tools.
+
+**formula_translator.ipynb** ; more of a "black box" approach. The idea is to leverage an LLM to translate Excel formulas to Python code without worrying about contextual understanding, so that later we can have another agent reason over the raw Python code to infer logical groupings for class methods.
+
+**workbook_context_extraction.ipynb** ; repurposes some of the logic from the ill-fated **narrative_explanation.ipynb**, but with a simpler goal: capture text content found in the workbook alongside relevant metadata.
+
+# TO-DO
+
+Orchestrate the tools described above. A potential workflow I intend to explore is:
+
+- Recurse on a given cell
+- For each leaf node (or collapsed range), refer to the extracted workbook context to identify relevant text (headers, explanatory text, etc.)
+- Use the result of the previous step to create context-aware variable names/named ranges
+- Reconcile these variable/range names with the formulas translated into Python code
+- Synthesize results into meaningful Python class structures   
+
+# Experiment Graveyard
+
+**graph_visualizer** ; a Flask app where you can upload an Excel workbook, enter a sheet name and cell, and it'll build and display the dependency graph. Helpful for initial exploration, but not particularly actionable.
+
+**llm_formula_reccurse.py** ; a tool to aid in building text descriptions for different workbook cells/ranges. For a given cell, it recurses and asks for a text description of any cells not already in its cache. More actionable than the graph visualizer, but still demands a nontrivial amount of manual effort. Much of the logic here was repurposed in the **formula_translator**.
+
+**narrative_explanation.ipynb** ; an attempt at associating context with cells (i.e., tagging a cell with its nearest table header), collapsing large range operations to a descriptive 'motif', and passing the enriched cell information to an LLM to get a natural language explanation of the flow of logic. This got messy pretty fast, and the generated narrative summaries weren't particularly actionable for porting algorithmic logic.
@@ -0,0 +1,136 @@
+import os
+import lancedb
+from lancedb.schema import vector
+from sentence_transformers import SentenceTransformer
+import ssl
+import certifi
+import httpx
+import openai
+from azure.identity import DefaultAzureCredential, get_bearer_token_provider
+
+LANCEDB_PATH = "data/hea_lancedb"
+TABLE_NAME = "hea"
+
+def initialize_azure_client(division="ts", region="eastus2", api_version="2024-10-21"):
+    """Build an Azure OpenAI client for a given division and region.
+
+    The endpoint URL is looked up in a hard-coded table keyed first by
+    division and then by region. Authentication uses bearer tokens obtained
+    through ``DefaultAzureCredential``, and TLS verification uses the CA
+    bundle named by the ``REQUESTS_CA_BUNDLE`` environment variable, falling
+    back to the bundle shipped with ``certifi``.
+
+    Args:
+        division: Key into the endpoint table ('ts' or 'ps').
+        region: Region key within the division
+            ('eastus', 'eastus2' or 'northcentralus').
+        api_version: API version string passed to the client.
+
+    Returns:
+        A configured ``openai.AzureOpenAI`` client.
+
+    Raises:
+        KeyError: If ``division`` or ``region`` is not in the endpoint table.
+    """
+    # NOTE(review): both divisions map 'northcentralus' to the same "poc"
+    # endpoint -- confirm this is intentional.
+    endpoints_by_division = {
+        'ts': {
+            'eastus': 'https://air-ts-eastus.openai.azure.com/',
+            'eastus2': 'https://air-ts-eastus2.openai.azure.com/',
+            'northcentralus': 'https://air-poc-northcentralus.openai.azure.com/',
+        },
+        'ps': {
+            'eastus': 'https://air-ps-eastus.openai.azure.com/',
+            'eastus2': 'https://air-ps-eastus2.openai.azure.com/',
+            'northcentralus': 'https://air-poc-northcentralus.openai.azure.com/',
+        },
+    }
+    # Resolve the endpoint first so a bad division/region fails before any
+    # credential or network objects are created.
+    endpoint_url = endpoints_by_division[division][region]
+
+    credential = DefaultAzureCredential()
+    bearer_tokens = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")
+
+    ca_bundle = os.environ.get('REQUESTS_CA_BUNDLE', certifi.where())
+    tls_context = ssl.create_default_context(cafile=ca_bundle)
+
+    return openai.AzureOpenAI(
+        api_version=api_version,
+        azure_endpoint=endpoint_url,
+        azure_ad_token_provider=bearer_tokens,
+        http_client=httpx.Client(verify=tls_context),
+    )
+
+# --- RAG Components ---
+
+class QnAPipeline:
+    """Interactive retrieval-augmented Q&A over a LanceDB table of document chunks.
+
+    A question is embedded with a SentenceTransformer model, the closest
+    chunks are fetched from the LanceDB table, and those chunks are passed as
+    context to an Azure OpenAI chat completion.
+    """
+
+    def __init__(self):
+        """Load the embedding model, open the LanceDB table and create the LLM client.
+
+        Raises:
+            FileNotFoundError: If connecting to LanceDB or opening the table
+                fails for any reason (the underlying error is chained).
+        """
+        # Initialize the embedding model
+        # NOTE(review): presumably the same model the ingestion script used to
+        # embed the stored chunks -- confirm, otherwise search results are meaningless.
+        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+        # Connect to LanceDB and the document chunks table
+        try:
+            db = lancedb.connect(LANCEDB_PATH)
+            self.table = db.open_table(TABLE_NAME)
+        except Exception as e:
+            # Every failure here is reported as FileNotFoundError; chain the
+            # original exception explicitly so the real cause stays visible.
+            raise FileNotFoundError(f"LanceDB table not found at {LANCEDB_PATH}/{TABLE_NAME}. Please run the ingestion script first. Error: {e}") from e
+
+        self.client = initialize_azure_client()
+
+    def search_knowledge_base(self, query: str, top_k: int = 5):
+        """
+        Embeds a query and searches the LanceDB table for the most relevant chunks.
+
+        Args:
+            query: The user's question text.
+            top_k: Maximum number of chunks to return.
+
+        Returns:
+            The search results as produced by ``.to_list()``.
+        """
+        # Embed the user's query
+        query_vector = self.embedding_model.encode(query).tolist()
+
+        # Search the LanceDB table using the vector
+        # .to_list() retrieves the search results as a Python list of dictionaries
+        search_results = (
+            self.table
+            .search(query_vector)
+            .limit(top_k)
+            .to_list()
+        )
+
+        return search_results
+
+    def generate_response(self, user_question: str, context: list):
+        """
+        Constructs a prompt with retrieved context and generates a response using Azure OpenAI.
+
+        Args:
+            user_question: The question to answer.
+            context: Retrieved chunks; each must provide 'source_uri' and 'text' keys.
+
+        Returns:
+            The message content of the first completion choice.
+
+        Raises:
+            KeyError: If a chunk lacks 'source_uri' or 'text'.
+        """
+        # Format the context for the LLM
+        context_str = "\n".join(f"Source: {c['source_uri']}\nContent: {c['text']}" for c in context)
+
+        # Define the system message to guide the LLM's behavior
+        system_message = (
+            "You are a helpful assistant that answers questions based on the provided context. "
+            "Only use the information from the documents provided. "
+            "If the answer is not in the context, say 'I cannot answer this question based on the provided documents.' "
+            "Please cite the source document(s) for your answer."
+        )
+
+        # Send the prompt to the Azure OpenAI client
+        response = self.client.chat.completions.create(
+            model="gpt-4o",
+            messages=[
+                {"role": "system", "content": system_message},
+                {"role": "user", "content": f"Context: {context_str}\n\nQuestion: {user_question}"}
+            ],
+            temperature=0.7,
+            max_tokens=500
+        )
+
+        return response.choices[0].message.content
+
+    def run_qa_loop(self):
+        """
+        Runs the interactive Q&A loop.
+
+        Reads questions from stdin until the user types 'quit' or closes the
+        input (Ctrl-D / Ctrl-C). Errors raised while answering a question are
+        printed and the loop continues with the next prompt.
+        """
+        print("Welcome to the LIAS Q&A System! Type 'quit' to exit.")
+        while True:
+            try:
+                user_question = input("\nAsk a question: ").strip()
+            except (EOFError, KeyboardInterrupt):
+                # End of input or an interrupt at the prompt is a normal way
+                # to leave the session, not an error to report to the caller.
+                print()
+                break
+
+            if user_question.lower() == 'quit':
+                break
+            if not user_question:
+                # Nothing to embed or ask; prompt again.
+                continue
+
+            try:
+                # 1. Retrieve relevant chunks
+                relevant_chunks = self.search_knowledge_base(user_question)
+
+                if not relevant_chunks:
+                    print("I couldn't find any relevant information for that question.")
+                    continue
+
+                # 2. Generate a response with the retrieved context
+                answer = self.generate_response(user_question, relevant_chunks)
+
+                # 3. Print the final answer
+                print(f"\nAI Answer: {answer}")
+
+            except Exception as e:
+                # Keep the session alive after a failed question.
+                print(f"An error occurred: {e}")
+
+if __name__ == "__main__":
+    # Script entry point: build the pipeline, then run the interactive loop.
+    # Failures are reported on stdout and the process exits with status 1 so
+    # callers (shells, schedulers) can tell the run did not succeed.
+    try:
+        qa_system = QnAPipeline()
+        qa_system.run_qa_loop()
+    except FileNotFoundError as e:
+        print(f"Error: {e}")
+        print("Please run your document ingestion script first to create the LanceDB table.")
+        raise SystemExit(1) from None
+    except Exception as e:
+        print(f"An unexpected error occurred during setup: {e}")
+        raise SystemExit(1) from None