Skip to content

Commit 9fc17f3

Browse files
committed
add notebook experiments for outcome analysis
1 parent f819089 commit 9fc17f3

29 files changed

Lines changed: 42066 additions & 1 deletion

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@ reports/
5454
# Compiled Assets and Media
5555
assets/
5656
media/
57-
notebooks/
5857
static/dojo/
5958

6059
# any node or bower installation folders
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Current Experiments
2+
3+
**hea_documentation_QA_bot.py** ; initial proof of concept of an HEA/LIAS "chatbot" agent, designed to answer questions by referencing its knowledge base of HEA documentation. The goal is to integrate this with other tools.
4+
5+
**formula_translator.ipynb** ; more of a "black box" approach. The idea is to leverage an LLM to translate Excel formulas to Python code without worrying about contextual understanding, so that later we can have another agent reason over the raw Python code to infer logical groupings for class methods.
6+
7+
**workbook_context_extraction.ipynb** ; repurposes some of the logic from the ill-fated **narrative_explanation.ipynb**, but with a simpler goal: capture text content found in the workbook alongside relevant metadata.
8+
9+
# TO-DO
10+
11+
Orchestrate the tools described above. A potential workflow I intend to explore is:
12+
13+
- Recurse on a given cell
14+
- For each leaf node (or collapsed range), refer to the extracted workbook context to identify relevant text (headers, explanatory text, etc.)
15+
- Use the result of the previous step to create context-aware variable names/named ranges
16+
- Reconcile these variable/range names with the formulas translated into Python code
17+
- Synthesize results into meaningful Python class structures
18+
19+
# Experiment Graveyard
20+
21+
**graph_visualizer** ; a Flask app where you can upload an Excel workbook, enter a sheet name and cell, and it'll build and display the dependency graph. Helpful for initial exploration, but not particularly actionable.
22+
23+
**llm_formula_reccurse.py** ; a tool to aid in building text descriptions for different workbook cells/ranges. For a given cell, it recurses and asks for a text description of any cells not already in its cache. More actionable than the graph visualizer, but still demands a nontrivial amount of manual effort. Much of the logic here was repurposed in the **formula_translator**.
24+
25+
**narrative_explanation.ipynb** ; an attempt at associating context with cells (i.e., tagging a cell with its nearest table header), collapsing large range operations to a descriptive 'motif', and passing the enriched cell information to an LLM to get a natural language explanation of the flow of logic. This got messy pretty fast, and the generated narrative summaries weren't particularly actionable for porting algorithmic logic.
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
import os
2+
import lancedb
3+
from lancedb.schema import vector
4+
from sentence_transformers import SentenceTransformer
5+
import ssl
6+
import certifi
7+
import httpx
8+
import openai
9+
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
10+
11+
# Location passed to lancedb.connect() when QnAPipeline opens the database.
LANCEDB_PATH = "data/hea_lancedb"
12+
# Name of the LanceDB table that QnAPipeline opens and runs vector searches against.
TABLE_NAME = "hea"
13+
14+
def initialize_azure_client(division="ts", region="eastus2", api_version="2024-10-21"):
    """Build an Azure OpenAI client authenticated via Azure AD.

    Args:
        division: Key selecting the endpoint group; one of ``"ts"`` or ``"ps"``.
        region: Region key within the division; one of ``"eastus"``,
            ``"eastus2"`` or ``"northcentralus"``.
        api_version: API version string forwarded to ``openai.AzureOpenAI``.

    Returns:
        An ``openai.AzureOpenAI`` client bound to the selected endpoint.

    Raises:
        KeyError: If ``division`` or ``region`` is not a known key. The
            message names the offending value and the accepted ones.
    """
    openai_endpoints = {
        'ts': {
            'eastus': 'https://air-ts-eastus.openai.azure.com/',
            'eastus2': 'https://air-ts-eastus2.openai.azure.com/',
            'northcentralus': 'https://air-poc-northcentralus.openai.azure.com/',
        },
        'ps': {
            'eastus': 'https://air-ps-eastus.openai.azure.com/',
            'eastus2': 'https://air-ps-eastus2.openai.azure.com/',
            # NOTE(review): both divisions point at the same "poc" endpoint
            # for northcentralus -- confirm this is intentional.
            'northcentralus': 'https://air-poc-northcentralus.openai.azure.com/',
        },
    }

    # Resolve the endpoint before doing any credential or network setup so a
    # typo fails fast with a message that lists the valid choices.
    try:
        regional_endpoints = openai_endpoints[division]
    except KeyError:
        raise KeyError(
            f"Unknown division {division!r}; expected one of {sorted(openai_endpoints)}"
        ) from None
    try:
        openai_endpoint = regional_endpoints[region]
    except KeyError:
        raise KeyError(
            f"Unknown region {region!r} for division {division!r}; "
            f"expected one of {sorted(regional_endpoints)}"
        ) from None

    token_provider = get_bearer_token_provider(
        DefaultAzureCredential(),
        "https://cognitiveservices.azure.com/.default",
    )

    # Use the CA bundle named by REQUESTS_CA_BUNDLE when set, otherwise fall
    # back to certifi's bundle.
    ctx = ssl.create_default_context(
        cafile=os.environ.get('REQUESTS_CA_BUNDLE', certifi.where())
    )
    httpx_client = httpx.Client(verify=ctx)

    openai_client = openai.AzureOpenAI(
        api_version=api_version,
        azure_endpoint=openai_endpoint,
        azure_ad_token_provider=token_provider,
        http_client=httpx_client,
    )
    return openai_client
38+
39+
# --- RAG Components ---
40+
41+
class QnAPipeline:
    """Interactive retrieval-augmented Q&A over a LanceDB table.

    A question is embedded with a SentenceTransformer model, the closest rows
    are fetched from the LanceDB table, and those rows are handed to an Azure
    OpenAI chat model as context for the answer.
    """

    def __init__(self):
        """Load the embedding model, open the LanceDB table and build the LLM client.

        Raises:
            FileNotFoundError: If connecting to LanceDB or opening the table
                fails for any reason. The original exception is chained as
                ``__cause__``.
        """
        # Initialize the embedding model
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Connect to LanceDB and the document chunks table
        try:
            db = lancedb.connect(LANCEDB_PATH)
            self.table = db.open_table(TABLE_NAME)
        except Exception as e:
            # Chain the cause explicitly so the underlying failure stays
            # visible in tracebacks.
            raise FileNotFoundError(
                f"LanceDB table not found at {LANCEDB_PATH}/{TABLE_NAME}. Please run the ingestion script first. Error: {e}"
            ) from e

        self.client = initialize_azure_client()

    def search_knowledge_base(self, query: str, top_k: int = 5):
        """Embed ``query`` and return the ``top_k`` nearest rows from the table.

        Args:
            query: Free-text question to embed.
            top_k: Maximum number of rows to return.

        Returns:
            The search results as produced by ``.to_list()``.
        """
        # Embed the user's query
        query_vector = self.embedding_model.encode(query).tolist()

        # Search the LanceDB table using the vector
        search_results = (
            self.table
            .search(query_vector)
            .limit(top_k)
            .to_list()
        )

        return search_results

    def generate_response(self, user_question: str, context: list):
        """Ask the chat model to answer ``user_question`` from ``context``.

        Args:
            user_question: The question to answer.
            context: Retrieved rows; each one is indexed with the keys
                ``'source_uri'`` and ``'text'``.

        Returns:
            The message content of the first completion choice.
        """
        # Format the context for the LLM
        context_str = "\n".join(
            f"Source: {c['source_uri']}\nContent: {c['text']}" for c in context
        )

        # Define the system message to guide the LLM's behavior
        system_message = (
            "You are a helpful assistant that answers questions based on the provided context. "
            "Only use the information from the documents provided. "
            "If the answer is not in the context, say 'I cannot answer this question based on the provided documents.' "
            "Please cite the source document(s) for your answer."
        )

        # Send the prompt to the Azure OpenAI client
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": f"Context: {context_str}\n\nQuestion: {user_question}"},
            ],
            temperature=0.7,
            max_tokens=500,
        )

        return response.choices[0].message.content

    def run_qa_loop(self):
        """Run the interactive prompt until the user quits.

        The loop ends when the user types ``quit`` (any case, surrounding
        whitespace ignored), when stdin reaches end-of-file, or when the user
        presses Ctrl-C at the prompt. Blank input is ignored. Errors raised
        while answering a question are printed and the loop continues.
        """
        print("Welcome to the LIAS Q&A System! Type 'quit' to exit.")
        while True:
            try:
                user_question = input("\nAsk a question: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or Ctrl-C at the prompt: exit quietly instead
                # of surfacing a traceback or a misleading setup error.
                print()
                break

            if not user_question:
                # Nothing to embed or search for; prompt again.
                continue
            if user_question.lower() == 'quit':
                break

            try:
                # 1. Retrieve relevant chunks
                relevant_chunks = self.search_knowledge_base(user_question)

                if not relevant_chunks:
                    print("I couldn't find any relevant information for that question.")
                    continue

                # 2. Generate a response with the retrieved context
                answer = self.generate_response(user_question, relevant_chunks)

                # 3. Print the final answer
                print(f"\nAI Answer: {answer}")

            except Exception as e:
                # Keep the session alive after a failed question.
                print(f"An error occurred: {e}")
127+
128+
def main():
    """Start the interactive Q&A session and report failures on stdout."""
    try:
        pipeline = QnAPipeline()
        pipeline.run_qa_loop()
    except FileNotFoundError as err:
        # Raised by QnAPipeline when the LanceDB table cannot be opened.
        print(f"Error: {err}")
        print("Please run your document ingestion script first to create the LanceDB table.")
    except Exception as err:
        print(f"An unexpected error occurred during setup: {err}")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)