Skip to content

Commit a381380

Browse files
committed
Update README, enhance SQL formatting, and improve schema handling
- Added description of Azure AD support in README.
- Enhanced SQL formatting in output.py with dialect support.
- Implemented dynamic schema retrieval in data_nodes.py.
- Cleaned up conversation history handling in multiple files.
- Removed deprecated table schema definitions from amex.yaml.
- Updated SQLValidator to improve error handling and formatting.
- Refined response handling in app.py for better clarity.
1 parent bfa756a commit a381380

9 files changed

Lines changed: 61 additions & 255 deletions

File tree

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,15 @@
2020

2121
A natural language to SQL (NL2SQL) platform built on LangGraph and Azure OpenAI. This multi-agent system automatically routes user questions to the appropriate database backend and generates optimized SQL queries and results.
2222

23+
Built on top of LangChain's [`SQLDatabase`](https://docs.langchain.com/oss/python/langchain/sql-agent), extended with Azure AD authentication, Cosmos DB support, and built-in dialect validation.
24+
2325
## Features
2426

2527
- **Multi-Database Support**: PostgreSQL, Azure SQL, Azure Synapse, Azure Cosmos DB, Databricks SQL, and Google BigQuery
2628
- **Intent Detection**: Automatically routes queries to the correct data agent based on question context
2729
- **Multi-Turn Conversations**: Follow-up questions with context awareness (e.g., "What's the average?" after a query)
2830
- **SQL Validation**: Safe query execution with sqlglot-based validation across all dialects
2931
- **Configurable Agents**: YAML-based configuration for adding new data sources
30-
- **Conversation Persistence**: Built-in checkpointer for maintaining conversation state
31-
- **Async Architecture**: Fully asynchronous for high-performance query execution
3232
- **A2A Protocol**: Agent-to-Agent interoperability for integration with other A2A-compliant systems
3333

3434
## Architecture

src/data_agent/agent.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -275,12 +275,11 @@ def intent_detection_node(state: AgentState) -> dict[str, Any]:
275275

276276
system_content = intent_system_prompt.format(agent_descriptions=agent_list)
277277

278-
# Get recent conversation history for multi-turn context
279278
history = get_recent_history(state.get("messages", []), max_messages=4)
280279

281280
messages = [
282281
SystemMessage(content=system_content),
283-
*history, # Previous turns help detect "more of the same" patterns
282+
*history,
284283
HumanMessage(content=question),
285284
]
286285

@@ -356,10 +355,8 @@ def query_rewrite_node(state: AgentState) -> dict[str, Any]:
356355

357356
agent_desc = agent_descriptions[datasource]
358357

359-
# Get recent conversation history for context
360358
history = get_recent_history(state.get("messages", []), max_messages=4)
361359

362-
# Build conversation context summary for the rewriter
363360
conversation_context = ""
364361
if history:
365362
conversation_context = "\n## Conversation History (for context)\n"

src/data_agent/cli/output.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,22 @@
44
from rich.syntax import Syntax
55

66
from data_agent.cli.console import console, err_console
7+
from data_agent.validators import SQLValidator
78

89

9-
def print_sql(sql: str, title: str = "Generated SQL") -> None:
10+
def print_sql(
11+
sql: str, title: str = "Generated SQL", dialect: str | None = None
12+
) -> None:
1013
"""Print SQL with syntax highlighting in a panel.
1114
1215
Args:
1316
sql: The SQL query string to display.
1417
title: The panel title.
18+
dialect: SQL dialect for formatting.
1519
"""
16-
syntax = Syntax(sql.strip(), "sql", theme="monokai", line_numbers=False)
20+
validator = SQLValidator(dialect=dialect or "postgres")
21+
formatted_sql = validator.format_sql(sql)
22+
syntax = Syntax(formatted_sql, "sql", theme="monokai", line_numbers=False)
1723
panel = Panel(syntax, title=f"[sql]{title}[/sql]", border_style="green")
1824
console.print(panel)
1925

src/data_agent/config/amex.yaml

Lines changed: 1 addition & 222 deletions
Original file line numberDiff line numberDiff line change
@@ -99,228 +99,7 @@ data_agents:
9999
- Format dates in readable format (e.g., Dec 16, 2025)
100100
- Limit tables to 20 rows max; if more rows exist, show first 20 and note "... and X more rows"
101101
- After the table, provide a brief summary or insight about the data
102-
table_schemas:
103-
- table_name: customers
104-
table_description: Customer profiles with segmentation and risk scoring
105-
columns:
106-
- column_name: customer_id
107-
data_type: STRING
108-
description: Unique customer identifier (e.g., CUST-001)
109-
- column_name: first_name
110-
data_type: STRING
111-
description: Customer first name
112-
- column_name: last_name
113-
data_type: STRING
114-
description: Customer last name
115-
- column_name: email
116-
data_type: STRING
117-
description: Customer email address
118-
- column_name: phone
119-
data_type: STRING
120-
description: Customer phone number
121-
- column_name: date_of_birth
122-
data_type: DATE
123-
description: Customer date of birth
124-
- column_name: registration_date
125-
data_type: DATE
126-
description: Date customer registered
127-
- column_name: customer_segment
128-
data_type: STRING
129-
description: Customer tier/segment
130-
allowed_values:
131-
Standard: Regular customers
132-
Premium: High-value customers with enhanced benefits
133-
VIP: Top-tier customers with exclusive services
134-
- column_name: risk_score
135-
data_type: FLOAT64
136-
description: Customer risk score (0.0 to 1.0, higher = riskier)
137-
- column_name: is_active
138-
data_type: BOOL
139-
description: Whether the customer account is active
140-
- table_name: accounts
141-
table_description: Customer bank accounts including checking, savings, credit, and investment
142-
columns:
143-
- column_name: account_id
144-
data_type: STRING
145-
description: Unique account identifier (e.g., ACC-001-CHK)
146-
- column_name: customer_id
147-
data_type: STRING
148-
description: Customer who owns the account
149-
- column_name: account_type
150-
data_type: STRING
151-
description: Type of account
152-
allowed_values:
153-
Checking: Standard checking account
154-
Savings: Savings account with interest
155-
Credit: Credit card account
156-
Investment: Investment/brokerage account
157-
- column_name: account_status
158-
data_type: STRING
159-
description: Current status of the account
160-
allowed_values:
161-
Active: Account is open and operational
162-
Frozen: Account is temporarily frozen
163-
Closed: Account has been closed
164-
- column_name: opened_date
165-
data_type: DATE
166-
description: Date the account was opened
167-
- column_name: currency
168-
data_type: STRING
169-
description: Account currency (e.g., USD)
170-
- column_name: current_balance
171-
data_type: NUMERIC
172-
description: Current account balance (negative for credit owed)
173-
- column_name: credit_limit
174-
data_type: NUMERIC
175-
description: Credit limit for credit accounts (NULL for others)
176-
- column_name: interest_rate
177-
data_type: FLOAT64
178-
description: Annual interest rate percentage
179-
- table_name: transactions
180-
table_description: Financial transactions including deposits, withdrawals, transfers, and payments
181-
columns:
182-
- column_name: transaction_id
183-
data_type: STRING
184-
description: Unique transaction identifier
185-
- column_name: account_id
186-
data_type: STRING
187-
description: Account involved in the transaction
188-
- column_name: transaction_timestamp
189-
data_type: TIMESTAMP
190-
description: Exact time of the transaction
191-
- column_name: transaction_date
192-
data_type: DATE
193-
description: Date of the transaction
194-
- column_name: transaction_type
195-
data_type: STRING
196-
description: Type of transaction
197-
allowed_values:
198-
Deposit: Money added to account
199-
Withdrawal: Cash withdrawn from account
200-
Transfer: Money transferred between accounts
201-
Payment: Payment to merchant or bill
202-
Refund: Refund received
203-
- column_name: amount
204-
data_type: NUMERIC
205-
description: Transaction amount (negative for outgoing)
206-
- column_name: currency
207-
data_type: STRING
208-
description: Transaction currency
209-
- column_name: merchant_name
210-
data_type: STRING
211-
description: Merchant or payee name (NULL for deposits/transfers)
212-
- column_name: merchant_category
213-
data_type: STRING
214-
description: Category of merchant
215-
allowed_values:
216-
Grocery: Grocery stores and supermarkets
217-
Gas: Gas stations
218-
Entertainment: Entertainment and streaming
219-
Travel: Airlines, hotels, travel services
220-
Electronics: Electronics and tech stores
221-
Shopping: General retail shopping
222-
Food: Restaurants and food delivery
223-
Transportation: Ride services and public transit
224-
Housing: Rent and housing payments
225-
Other: Other/uncategorized
226-
- column_name: channel
227-
data_type: STRING
228-
description: Transaction channel
229-
allowed_values:
230-
Branch: In-person at bank branch
231-
ATM: ATM transaction
232-
Online: Online/web transaction
233-
Mobile: Mobile app transaction
234-
POS: Point of sale terminal
235-
- column_name: status
236-
data_type: STRING
237-
description: Transaction status
238-
allowed_values:
239-
Completed: Transaction completed successfully
240-
Pending: Transaction is pending
241-
Reversed: Transaction was reversed/cancelled
242-
Failed: Transaction failed
243-
- column_name: reference_id
244-
data_type: STRING
245-
description: External reference identifier
246-
- column_name: description
247-
data_type: STRING
248-
description: Transaction description or notes
249-
- table_name: fraud_alerts
250-
table_description: Fraud detection alerts and investigation status
251-
columns:
252-
- column_name: alert_id
253-
data_type: STRING
254-
description: Unique alert identifier
255-
- column_name: transaction_id
256-
data_type: STRING
257-
description: Transaction that triggered the alert
258-
- column_name: alert_timestamp
259-
data_type: TIMESTAMP
260-
description: When the alert was generated
261-
- column_name: alert_type
262-
data_type: STRING
263-
description: Type of fraud alert
264-
allowed_values:
265-
Suspicious Amount: Unusually large transaction amount
266-
Velocity: Too many transactions in short time
267-
Unusual Location: Transaction from unusual location
268-
Pattern: Unusual spending pattern detected
269-
- column_name: severity
270-
data_type: STRING
271-
description: Alert severity level
272-
allowed_values:
273-
Low: Minor concern, low risk
274-
Medium: Moderate concern, investigation recommended
275-
High: Serious concern, immediate attention needed
276-
Critical: Urgent, possible active fraud
277-
- column_name: status
278-
data_type: STRING
279-
description: Investigation status
280-
allowed_values:
281-
Investigating: Under active investigation
282-
Confirmed Fraud: Confirmed as fraudulent
283-
False Positive: Determined to be legitimate
284-
Resolved: Issue has been resolved
285-
- column_name: risk_score
286-
data_type: FLOAT64
287-
description: Calculated risk score (0.0 to 1.0)
288-
- column_name: notes
289-
data_type: STRING
290-
description: Investigation notes
291-
- table_name: monthly_summaries
292-
table_description: Monthly account summaries with aggregated transaction data
293-
columns:
294-
- column_name: summary_id
295-
data_type: STRING
296-
description: Unique summary identifier
297-
- column_name: account_id
298-
data_type: STRING
299-
description: Account the summary is for
300-
- column_name: year_month
301-
data_type: STRING
302-
description: Summary period in YYYY-MM format
303-
- column_name: opening_balance
304-
data_type: NUMERIC
305-
description: Balance at start of month
306-
- column_name: closing_balance
307-
data_type: NUMERIC
308-
description: Balance at end of month
309-
- column_name: total_deposits
310-
data_type: NUMERIC
311-
description: Total deposits during month
312-
- column_name: total_withdrawals
313-
data_type: NUMERIC
314-
description: Total withdrawals during month
315-
- column_name: transaction_count
316-
data_type: INT64
317-
description: Number of transactions during month
318-
- column_name: avg_transaction_amount
319-
data_type: NUMERIC
320-
description: Average transaction amount
321-
- column_name: largest_transaction
322-
data_type: NUMERIC
323-
description: Largest single transaction amount
102+
# table_schemas omitted - will use dynamic schema discovery from BigQuery
324103
few_shot_examples:
325104
- question: What are the total deposits by customer segment this month?
326105
answer: This query shows total deposits grouped by customer segment (VIP, Premium, Standard) for the current month.

src/data_agent/nodes/data_nodes.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,13 +122,40 @@ def _check_is_cosmos(datasource: Any) -> bool:
122122
# Check by class name to avoid import issues
123123
return type(datasource).__name__ == "CosmosAdapter"
124124

125+
def _get_schema_context(self) -> str:
126+
"""Get schema context from config or dynamically from database.
127+
128+
If table_schemas is defined in config, uses the static schema.
129+
Otherwise, fetches schema dynamically from the database using
130+
SQLDatabase.get_table_info().
131+
132+
Returns:
133+
Schema context string for the LLM prompt.
134+
"""
135+
if self._config.table_schemas:
136+
return SchemaFormatter.format_schema_context(self._config)
137+
138+
if not self._is_cosmos and isinstance(self._datasource, SQLDatabase):
139+
try:
140+
table_info = self._datasource.get_table_info()
141+
if table_info:
142+
logger.debug(
143+
"Using dynamic schema from database. Available tables: %s",
144+
table_info,
145+
)
146+
return f"Available tables and their schemas:\n\n{table_info}"
147+
except Exception as e:
148+
logger.warning("Failed to fetch dynamic schema: %s", e)
149+
150+
return ""
151+
125152
def _build_prompt(self) -> str:
126153
"""Build system prompt, adding Cosmos constraints if needed.
127154
128155
Returns:
129156
Formatted system prompt with schema context and date.
130157
"""
131-
schema_context = SchemaFormatter.format_schema_context(self._config)
158+
schema_context = self._get_schema_context()
132159
few_shot = SchemaFormatter.format_few_shot_examples(self._config)
133160
base_prompt = self._config.system_prompt or DEFAULT_SQL_PROMPT
134161

src/data_agent/nodes/response.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,11 @@ def generate_response(self, state: "AgentState") -> dict[str, Any]:
6868
sql = state.get("generated_sql", "")
6969
result = state.get("result", {})
7070

71-
# Get recent conversation history for conversational continuity
7271
history = get_recent_history(state.get("messages", []), max_messages=4)
7372

7473
messages = [
7574
SystemMessage(content=prompt),
76-
*history, # Previous turns for conversational context
75+
*history,
7776
HumanMessage(
7877
content=(
7978
f"Question: {question}\n\nSQL Query: {sql}\n\nResults: {result}"

0 commit comments

Comments
 (0)