-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
294 lines (264 loc) · 12.4 KB
/
app.py
File metadata and controls
294 lines (264 loc) · 12.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
import streamlit as st
import pandas as pd
import numpy as np
try:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
SKLEARN_AVAILABLE = True
except ImportError as e:
SKLEARN_AVAILABLE = False
IsolationForest = None
StandardScaler = None
try:
from langchain_groq import ChatGroq
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
LANGCHAIN_AVAILABLE = True
except ImportError as e:
LANGCHAIN_AVAILABLE = False
ChatGroq = None
FastEmbedEmbeddings = None
Chroma = None
ChatPromptTemplate = None
RunnablePassthrough = None
StrOutputParser = None
import hashlib
# --- Page configuration ------------------------------------------------------
st.set_page_config(page_title="FinAgent", page_icon="🤖", layout="wide")

# Identifier of the chat model requested from Groq.
GROQ_LLM_MODEL = "moonshotai/kimi-k2-instruct-0905"
# Identifier of the embedding model handed to FastEmbed.
LOCAL_EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"

# --- Session state -----------------------------------------------------------
# Seed each key only when it is absent so values already stored in the
# session are left untouched.
for _state_key, _initial_value in (
    ("rag_chain", None),
    ("df_hash", None),
    ("messages", []),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

# --- Sidebar -----------------------------------------------------------------
st.sidebar.header("🔑 Configuration")
groq_api_key = st.sidebar.text_input("Enter your Groq API Key", type="password")
@st.cache_resource
def _build_llm(api_key):
    """Construct the ChatGroq client for ``api_key``; raises on failure.

    Only this constructor call is cached. Because a raised exception is not
    stored by ``st.cache_resource``, a failed construction is retried on the
    next call instead of being remembered.
    """
    return ChatGroq(model=GROQ_LLM_MODEL, groq_api_key=api_key)


def get_llm(api_key):
    """Return the Groq chat model for ``api_key``.

    Args:
        api_key: Groq API key entered by the user; may be empty.

    Returns:
        ``None`` when ``api_key`` is empty, a ``ChatGroq`` instance on
        success, or an error-message ``str`` when the LangChain/Groq imports
        are unavailable or the client cannot be constructed. Callers detect
        failure with ``isinstance(result, str)``.
    """
    if not api_key:
        return None
    if not LANGCHAIN_AVAILABLE or ChatGroq is None:
        return "Error: Langchain Groq components not available."
    try:
        return _build_llm(api_key)
    except Exception as e:
        # Reported as a string (not raised) to keep the existing contract;
        # it is deliberately produced outside the cached helper.
        return f"Error initializing LLM: {e}"
@st.cache_resource
def _build_embeddings():
    """Load the FastEmbed embedding model; raises on failure.

    Only the successful model object is cached. A raised exception is not
    stored by ``st.cache_resource``, so a failed load is retried on the next
    call instead of being remembered for the process lifetime.
    """
    return FastEmbedEmbeddings(model_name=LOCAL_EMBEDDING_MODEL)


def get_embeddings():
    """Return the local embedding model, or an error string.

    Returns:
        A ``FastEmbedEmbeddings`` instance on success, or an error-message
        ``str`` when the LangChain/FastEmbed imports are unavailable or the
        model cannot be initialized. Callers detect failure with
        ``isinstance(result, str)``.
    """
    if not LANGCHAIN_AVAILABLE or FastEmbedEmbeddings is None:
        return "Error: FastEmbed components not available."
    try:
        return _build_embeddings()
    except Exception as e:
        # Built outside the cached helper so the failure itself is not cached.
        return f"Error initializing Embeddings: {e}"
@st.cache_data
def load_data(uploaded_file):
    """Read an uploaded CSV and derive the numeric feature frame for modeling.

    Steps: locate the transaction-amount column, coerce it to numeric,
    impute invalid entries with the median, add a standardized
    ``Scaled_Amount`` column, then drop the raw amount and time columns and
    every non-numeric column to obtain the model features.

    Args:
        uploaded_file: File-like object accepted by ``pd.read_csv``, or
            ``None``.

    Returns:
        ``(df_processed, df, data_hash)`` where ``df_processed`` is the
        numeric feature frame, ``df`` is the loaded frame with
        ``Scaled_Amount`` added, and ``data_hash`` is the MD5 hex digest of
        ``df_processed`` serialized as CSV. Returns ``(None, None, None)``
        when no file is given, scikit-learn is missing, or loading fails
        (the failure is reported through ``st.error``).
    """
    if uploaded_file is None:
        return None, None, None
    try:
        # StandardScaler is required below; bail out early without it.
        if not SKLEARN_AVAILABLE:
            st.error("scikit-learn is required for data processing but not installed.")
            return None, None, None

        df = pd.read_csv(uploaded_file)

        possible_amount_columns = ['Amount', 'amount', 'transaction_amount', 'txn_amount', 'value']
        amount_column = next(
            (col for col in possible_amount_columns if col in df.columns), None
        )
        if amount_column is None:
            available_columns = list(df.columns)
            raise ValueError(f"No amount column found. Available columns: {available_columns}")

        # Unparseable amounts become NaN; infinities anywhere are treated as missing.
        df[amount_column] = pd.to_numeric(df[amount_column], errors='coerce')
        df = df.replace([np.inf, -np.inf], np.nan)

        missing_count = df[amount_column].isnull().sum()
        if missing_count > 0:
            median_val = df[amount_column].median()
            if pd.isna(median_val):
                # Every value was invalid, so there is nothing to impute from.
                raise ValueError(f"Column '{amount_column}' contains no numeric values.")
            # Assign the result back: fillna(inplace=True) on a column
            # selection acts on a temporary and is not guaranteed to update df.
            df[amount_column] = df[amount_column].fillna(median_val)
            st.warning(f"Imputed {missing_count} missing/invalid values in '{amount_column}' with the median ({median_val:.2f}).")

        scaler = StandardScaler()
        amount_values = df[amount_column].values.reshape(-1, 1)
        df['Scaled_Amount'] = scaler.fit_transform(amount_values).flatten()

        possible_time_columns = ['Time', 'time', 'timestamp', 'date']
        time_column = next(
            (col for col in possible_time_columns if col in df.columns), None
        )

        # The raw amount is replaced by Scaled_Amount; the time column is not
        # used as a model feature.
        columns_to_drop = [col for col in (time_column, amount_column) if col is not None]
        df_processed = (
            df.drop(columns_to_drop, axis=1, errors='ignore')
            .select_dtypes(include=np.number)
            .fillna(0)
        )

        # Content fingerprint used by the caller to detect a changed dataset.
        data_bytes = df_processed.to_csv(index=False).encode('utf-8')
        data_hash = hashlib.md5(data_bytes).hexdigest()
        return df_processed, df, data_hash
    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        st.info("Please ensure your CSV is clean and contains a clear transaction amount column.")
        return None, None, None
@st.cache_resource
def train_risk_model(df):
    """Fit an Isolation Forest on the feature frame.

    A ``Class`` column, when present, is excluded from the training
    features. When it is absent a warning is shown and the whole frame is
    used.

    Args:
        df: Numeric feature frame.

    Returns:
        The fitted ``IsolationForest``, or ``None`` (after ``st.error``) when
        scikit-learn is not installed.
    """
    if not SKLEARN_AVAILABLE:
        st.error("scikit-learn is required for risk modeling but not installed.")
        return None

    has_label_column = 'Class' in df.columns
    if has_label_column:
        training_frame = df.drop(['Class'], axis=1, errors='ignore')
    else:
        training_frame = df.copy()
        st.warning("No 'Class' column found in data. Using unsupervised anomaly detection only (Isolation Forest).")

    forest = IsolationForest(
        n_estimators=100,
        contamination=0.01,
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(training_frame)
    return forest
def run_risk_analysis(model, df):
    """Score every row of ``df`` with the fitted anomaly model.

    Args:
        model: Object exposing ``decision_function`` and ``predict``, or
            ``None`` when no model could be trained.
        df: Feature frame; a ``Class`` column, if present, is excluded from
            the features passed to the model.

    Returns:
        A copy of ``df`` with two added columns: ``anomaly_score`` (the
        model's decision function) and boolean ``is_anomaly`` (true where the
        model predicts ``-1``). With no model, an error is shown and the
        columns are filled with ``0`` and ``False``.
    """
    scored = df.copy()

    if model is None:
        st.error("Risk model is not available due to missing dependencies.")
        # Placeholder columns keep the frame's shape consistent for callers.
        scored['anomaly_score'] = 0
        scored['is_anomaly'] = False
        return scored

    feature_frame = scored.drop(['Class'], axis=1, errors='ignore')
    scored['anomaly_score'] = model.decision_function(feature_frame)
    raw_labels = model.predict(feature_frame)
    scored['is_anomaly'] = raw_labels
    # The model labels anomalous rows with -1; convert to a boolean flag.
    scored['is_anomaly'] = scored['is_anomaly'].eq(-1)
    return scored
def get_anomaly_summary(df, original_df=None):
    """Build the plain-text anomaly report used as context for the chat.

    Args:
        df: Scored frame with ``Scaled_Amount``, ``anomaly_score`` and a
            boolean ``is_anomaly`` column.
        original_df: Optional unprocessed frame used to look up the raw
            transaction amount of each sampled anomaly. When omitted, a
            module-level ``df_original`` is used if one exists.

    Returns:
        The report text. A short fixed message is returned instead when
        ``df`` has no ``is_anomaly`` column or no row is flagged.
    """
    if 'is_anomaly' not in df.columns:
        return "Risk analysis is not available due to missing dependencies."

    anomalies = df[df['is_anomaly']].sort_values(by='anomaly_score', ascending=True)
    if anomalies.empty:
        return "No anomalies detected in this batch based on the Isolation Forest model (contamination=0.01)."

    if original_df is None:
        # Backward-compatible fallback for callers that rely on the global.
        original_df = globals().get('df_original')

    # Resolve the raw amount column against the original frame; the scored
    # frame no longer carries it. Candidates mirror those used when loading.
    amount_col = None
    if original_df is not None:
        amount_col = next(
            (
                col
                for col in ('Amount', 'amount', 'transaction_amount', 'txn_amount', 'value')
                if col in original_df.columns
            ),
            None,
        )
    amount_label = amount_col if amount_col is not None else 'Amount'

    normal_mean = df[~df['is_anomaly']]['Scaled_Amount'].mean()
    lines = [
        "",
        "Anomaly Detection Report (Isolation Forest, Contamination=1%):",
        f"- Total transactions processed: {len(df)}",
        f"- Total anomalies detected: {len(anomalies)}",
        f"- The lowest anomaly score (most anomalous) was: {anomalies['anomaly_score'].min():.4f}",
        "- The Isolation Forest model used scaled features.",
        "Key observations from the detected anomalies:",
        f"- Average Scaled_Amount for anomalies: {anomalies['Scaled_Amount'].mean():.2f}",
        f"- Average Scaled_Amount for normal transactions: {normal_mean:.2f}",
        "Sample Anomalous Transaction Details (Top 3 most isolated events):",
    ]

    for i, row in anomalies.head(3).iterrows():
        # Only index into the original frame when both the column and the
        # row label are known to exist; otherwise report 'N/A'.
        if amount_col is not None and i in original_df.index:
            original_amount_val = original_df.loc[i, amount_col]
        else:
            original_amount_val = 'N/A'
        lines.append(
            f"- Index {i}: Original '{amount_label}': {original_amount_val}, "
            f"Scaled_Amount: {row['Scaled_Amount']:.2f}, "
            f"Anomaly Score (Lower is worse): {row['anomaly_score']:.4f}"
        )

    return "\n".join(lines) + "\n"
def setup_rag_pipeline(summary_text, api_key):
    """Index the anomaly report and build the question-answering chain.

    Args:
        summary_text: Report text to embed and use as retrieval context.
        api_key: Groq API key used to obtain the chat model.

    Returns:
        A runnable chain that takes the user's question string and returns
        the model's answer as a string.

    Raises:
        RuntimeError: If the embeddings or the LLM are unavailable, or the
            vector store cannot be created.
    """
    embeddings = get_embeddings()
    llm = get_llm(api_key)
    # Both getters signal failure by returning an error string.
    if isinstance(embeddings, str):
        raise RuntimeError(embeddings)
    if isinstance(llm, str):
        raise RuntimeError(llm)
    if llm is None:
        # get_llm returns None for an empty key; fail with a clear message
        # rather than an opaque error when the chain is assembled.
        raise RuntimeError("Groq API key is missing; the LLM could not be initialized.")

    # Give each report its own collection and a stable document id, so
    # reports indexed earlier in the same process are not retrieved and
    # re-indexing the same report does not add duplicates.
    report_id = hashlib.md5(summary_text.encode('utf-8')).hexdigest()
    try:
        with st.spinner(f"Embedding text locally with {LOCAL_EMBEDDING_MODEL}..."):
            vectorstore = Chroma.from_texts(
                [summary_text],
                embeddings,
                ids=[report_id],
                collection_name=f"finagent-{report_id}",
            )
            # The collection holds exactly one document.
            retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
    except Exception as e:
        raise RuntimeError(f"Error setting up ChromaDB/FastEmbed: {e}") from e

    template = "\n".join([
        "",
        "You are 'FinAgent', a financial analysis assistant specializing in anomaly detection.",
        "You have analyzed a batch of transactions using an Isolation Forest model.",
        "Use the following CONTEXT (the anomaly report) to answer the user's question concisely.",
        "If the context does not contain the answer, state that clearly (\"I cannot answer that based on the provided report.\").",
        "CONTEXT:",
        "{context}",
        "QUESTION:",
        "{question}",
        "ANSWER:",
        "",
    ])
    prompt = ChatPromptTemplate.from_template(template)

    def _join_documents(documents):
        """Render retrieved documents as plain text for the prompt."""
        return "\n\n".join(doc.page_content for doc in documents)

    rag_chain = (
        {"context": retriever | _join_documents, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain
def _reset_rag_state():
    """Forget the current RAG chain and the hash of the dataset it was built for."""
    st.session_state.rag_chain = None
    st.session_state.df_hash = None


st.title("FinAgent 🤖")

# Nothing below can run without an API key, so halt the script here.
if not groq_api_key:
    st.info("Please enter your Groq API Key in the sidebar to start.")
    st.session_state.rag_chain = None
    st.stop()

st.success(f"Groq API Key loaded. Conversational model: {GROQ_LLM_MODEL}.")

# --- 1. Data ingestion -------------------------------------------------------
st.header("1. Data Ingestion (Data Agent)")
st.write("Upload a CSV dataset (e.g., Credit Card Fraud Detection).")
uploaded_file = st.file_uploader("Upload creditcard.csv", type=["csv"])

if not uploaded_file:
    _reset_rag_state()
    st.info("Awaiting CSV file upload to begin analysis.")
else:
    with st.spinner("Processing Data and Feature Engineering..."):
        df_processed, df_original, data_hash = load_data(uploaded_file)

    if df_processed is None:
        # Loading failed; load_data has already reported why.
        _reset_rag_state()
    else:
        st.success("Data loaded and robustly preprocessed.")

        # --- 2. Risk analysis ------------------------------------------------
        st.header("2. Risk Analysis (Risk Agent)")
        with st.spinner("Training Isolation Forest Model..."):
            risk_model = train_risk_model(df_processed)
        with st.spinner("Running Risk Analysis..."):
            df_with_anomalies = run_risk_analysis(risk_model, df_processed)

        # Attach the scores to the original columns for display.
        df_display = df_original.assign(
            is_anomaly=df_with_anomalies['is_anomaly'],
            anomaly_score=df_with_anomalies['anomaly_score'],
        )
        flagged_rows = df_display.loc[df_display['is_anomaly']]
        anomalies_df = flagged_rows.sort_values(by='anomaly_score')

        st.subheader("Fraud & Anomaly Alerts")
        st.write(f"Found **{len(anomalies_df)}** potential anomalies/fraud events.")
        st.dataframe(anomalies_df.head(10))

        # --- 3. Conversational analysis --------------------------------------
        st.header("3. Conversational Analysis (Insight Agent)")

        # Rebuild the chain when none exists or the dataset has changed; the
        # chat history is cleared at the same time.
        chain_missing = st.session_state.rag_chain is None
        if chain_missing or st.session_state.df_hash != data_hash:
            st.session_state.messages = []
            st.session_state.df_hash = data_hash
            st.session_state.rag_chain = None
            try:
                with st.spinner("Generating Insight Report & Setting up RAG..."):
                    summary = get_anomaly_summary(df_with_anomalies)
                    st.session_state.rag_chain = setup_rag_pipeline(summary, groq_api_key)
                st.success("RAG pipeline successfully set up. Ready for chat.")
            except RuntimeError as e:
                st.error(f"Failed to set up RAG pipeline: {e}")

        if not st.session_state.rag_chain:
            st.warning("RAG Chat is unavailable. Check error messages above and ensure RAG components are installed.")
        else:
            st.subheader("Chat with your Analysis")
            st.write(f"The **{GROQ_LLM_MODEL}** LLM is ready to answer questions about the generated report.")

            # Replay the stored conversation.
            for past_message in st.session_state.messages:
                with st.chat_message(past_message["role"]):
                    st.markdown(past_message["content"])

            user_prompt = st.chat_input("E.g., How many anomalies were found? What is the average scaled amount for them?")
            if user_prompt:
                with st.chat_message("user"):
                    st.markdown(user_prompt)
                st.session_state.messages.append({"role": "user", "content": user_prompt})

                with st.spinner(f"FinAgent ({GROQ_LLM_MODEL}) is thinking..."):
                    try:
                        response = st.session_state.rag_chain.invoke(user_prompt)
                    except Exception as e:
                        response = f"**Error:** An API issue occurred during the Groq LLM call. Please check your Groq API key or rate limits. Details: {e}"
                        st.error(response)

                with st.chat_message("assistant"):
                    st.markdown(response)
                st.session_state.messages.append({"role": "assistant", "content": response})