Skip to content

Commit ec9c580

Browse files
authored
Merge pull request #33 from thisisqubika/feature/judges
evaluation pipeline implemented
2 parents a15fb9e + bd5cf15 commit ec9c580

15 files changed

Lines changed: 3055 additions & 39 deletions

poetry.lock

Lines changed: 1751 additions & 29 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,14 @@ databricks-langchain = ">=0.1.0"
2424
sqlglot = ">=20.0.0"
2525
pydantic = ">=2.0.0"
2626
typing_extensions = ">=4.15.0"
27+
mlflow = ">=2.10.0"
28+
29+
[tool.poetry.group.dev.dependencies]
30+
pytest = ">=7.0.0"
31+
pytest-cov = ">=4.0.0"
32+
pytest-asyncio = ">=0.21.0"
33+
notebook = ">=6.5.0"
34+
ipykernel = ">=6.25.0"
2735

2836
[build-system]
2937
requires = ["poetry-core"]

requirements.txt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# Snowflake dependencies (for data extraction)
22
snowflake-connector-python>=3.0.0
33
snowflake-snowpark-python>=1.0.0
4+
requests>=2.31.0
5+
typing-extensions>=4.15.0
46
python-dotenv>=1.0.0
57

68
# Translation graph dependencies
@@ -11,8 +13,10 @@ langgraph>=0.1.0
1113
sqlglot>=20.0.0
1214
pydantic>=2.0.0
1315

14-
# Databricks LLM support
16+
# Databricks LLM support & Evaluation
1517
databricks-langchain>=0.1.0
18+
databricks-sdk>=0.1.0
19+
mlflow>=2.10.0
1620

1721
# Environment variable management
1822
python-dotenv>=1.0.0

run_local_benchmark.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Run evaluation benchmark locally with Databricks connection.
4+
5+
Usage:
6+
python3 run_local_benchmark.py
7+
python3 run_local_benchmark.py --artifact-type tables --batch-size 5
8+
"""
9+
10+
import os
11+
import sys
12+
import argparse
13+
import logging
14+
from pathlib import Path
15+
16+
from dotenv import load_dotenv
17+
18+
# Directory containing this script; the .env lookup and the src/ path entry
# below are both resolved relative to it.
_SCRIPT_DIR = Path(__file__).parent

# Load environment variables from a .env file next to this script, if present.
env_path = _SCRIPT_DIR / ".env"
if env_path.exists():
    load_dotenv(env_path)
    print(f"✅ Loaded .env from: {env_path}")

# Put src/ at the front of the import path.
sys.path.insert(0, str(_SCRIPT_DIR / "src"))
26+
27+
import mlflow
28+
from databricks_langchain import ChatDatabricks
29+
30+
from artifact_translation_package.evaluation import run_benchmark, ModelConfig
31+
from artifact_translation_package.evaluation.model_benchmark import create_default_model_configs
32+
33+
34+
def get_experiment_name(custom_name: "str | None" = None) -> str:
    """Return the MLflow experiment path to use for this run.

    Args:
        custom_name: Explicit experiment name. Returned unchanged when truthy.

    Returns:
        ``custom_name`` when provided; otherwise
        ``/Users/<user_name>/sql-translation-benchmark`` for the user reported
        by the Databricks SDK, or ``/Shared/sql-translation-benchmark`` when
        the user cannot be determined.
    """
    if custom_name:
        return custom_name

    fallback = "/Shared/sql-translation-benchmark"
    try:
        # Imported lazily so a missing SDK falls through to the shared path
        # instead of breaking the script at import time.
        from databricks.sdk import WorkspaceClient

        username = WorkspaceClient().current_user.me().user_name
    except Exception as exc:
        # Best-effort lookup: any failure (SDK missing, no credentials,
        # request error) uses the fallback, but is reported rather than hidden.
        logging.getLogger(__name__).warning(
            "Could not determine Databricks user (%s); using %s", exc, fallback
        )
        return fallback

    if not username:
        # Avoid building a bogus "/Users/None/..." path.
        return fallback
    return f"/Users/{username}/sql-translation-benchmark"
44+
45+
46+
def setup_mlflow(experiment_name: str) -> str:
    """Point MLflow at Databricks tracking and activate ``experiment_name``.

    Args:
        experiment_name: Name/path of the MLflow experiment to activate.

    Returns:
        The same ``experiment_name``, for convenient chaining by the caller.

    Raises:
        Whatever ``mlflow.set_experiment`` raises (for example on an
        authentication failure); the error is propagated unchanged.
    """
    mlflow.set_tracking_uri("databricks")

    # set_experiment creates the experiment when it does not exist yet, so a
    # separate create_experiment fallback is unnecessary and would only hide
    # the original error behind a second failure.
    mlflow.set_experiment(experiment_name)

    print(f"✅ MLflow experiment: {experiment_name}")
    return experiment_name
58+
59+
60+
def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse command line arguments for the benchmark script.

    Args:
        argv: Argument strings to parse. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``, as before.

    Returns:
        Namespace with ``artifact_type``, ``dataset_source``,
        ``experiment_name``, ``batch_size``, ``judge_endpoint`` and ``models``.
    """
    parser = argparse.ArgumentParser(description="SQL translation benchmark")
    parser.add_argument("--artifact-type", default="tables", help="Artifact type")
    parser.add_argument("--dataset-source", help="Dataset JSON path")
    parser.add_argument("--experiment-name", help="MLflow experiment name")
    parser.add_argument("--batch-size", type=int, default=5, help="Batch size")
    parser.add_argument(
        "--judge-endpoint",
        default="databricks-llama-4-maverick",
        help="Judge model endpoint",
    )
    parser.add_argument("--models", nargs="+", help="Model endpoints to test")
    return parser.parse_args(argv)
70+
71+
72+
def main():
    """Parse CLI options, set up MLflow, and run the benchmark.

    Prints the results table on success. If the benchmark run raises, the
    error is printed, the traceback is logged, and the process exits with
    status 1.
    """
    cli = parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    banner = "=" * 70
    print("\n" + banner)
    print("🚀 SQL Translation Benchmark")
    print(banner)

    experiment_name = setup_mlflow(get_experiment_name(cli.experiment_name))

    # Endpoints passed via --models take precedence over the package defaults.
    if not cli.models:
        model_configs = create_default_model_configs()
    else:
        model_configs = [
            ModelConfig(
                name=endpoint,
                endpoint=endpoint,
                temperature=0.1,
                max_tokens=4000,
            )
            for endpoint in cli.models
        ]

    model_names = [config.name for config in model_configs]
    print(f"Models: {model_names}")
    print(f"Artifact: {cli.artifact_type}, Batch: {cli.batch_size}")

    try:
        results_df = run_benchmark(
            artifact_type=cli.artifact_type,
            dataset_source=cli.dataset_source,
            experiment_name=experiment_name,
            model_configs=model_configs,
            batch_size=cli.batch_size,
            judge_endpoint=cli.judge_endpoint,
        )

        print("\n" + banner)
        print("✅ Benchmark Complete!")
        print(banner)
        print(results_df.to_string())
        print(f"\n🔗 View results: {experiment_name}")
    except Exception as err:
        # Top-level boundary: report the failure, keep the traceback in the
        # log, and signal failure through the exit status.
        print(f"\n❌ Failed: {err}")
        logging.error("Benchmark error", exc_info=True)
        sys.exit(1)
116+
117+
118+
if __name__ == "__main__":
119+
main()

src/artifact_translation_package/config/ddl_config.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,12 +177,13 @@ class DDLConfig:
177177
"additional_params": {
178178
"endpoint": LangGraphConfig.DBX_ENDPOINT.value
179179
}
180-
}
180+
}
181181
},
182182
"processing": {
183183
"batch_size": LangGraphConfig.DDL_BATCH_SIZE.value,
184184
"max_concurrent_batches": LangGraphConfig.DDL_MAX_CONCURRENT.value,
185-
"timeout_seconds": LangGraphConfig.DDL_TIMEOUT.value
185+
"timeout_seconds": LangGraphConfig.DDL_TIMEOUT.value,
186+
"evaluation_batch_size": 5 # Number of SQL statements per LLM evaluation call
186187
},
187188
"output": {
188189
"format": LangGraphConfig.DDL_OUTPUT_FORMAT.value,
@@ -197,6 +198,14 @@ class DDLConfig:
197198
"llm_validated_artifacts": ["procedures", "pipes"],
198199
"skip_unsupported_artifacts": ["grants", "procedures", "udfs", "stages", "pipes", "roles"]
199200
},
201+
"benchmark": {
202+
"mlflow_experiment_name": "sql-translation-benchmark",
203+
"default_models": ["databricks-llama-4-maverick", "databricks-gemini-2-5-flash"],
204+
"temperature_variations": [0.1, 0.2],
205+
"artifact_types": ["tables", "views", "procedures"],
206+
"judge_endpoint": "databricks-llama-4-maverick", # LLM endpoint for evaluation/judging
207+
"batch_size": 10
208+
},
200209
"langsmith": {
201210
"tracing": LangGraphConfig.LANGSMITH_TRACING.value,
202211
"project": LangGraphConfig.LANGSMITH_PROJECT.value,
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# SQL Translation Model Benchmark Evaluation
2+
3+
Evaluate and compare SQL translation models (Snowflake → Databricks) using MLflow and LLM-as-a-judge.
4+
5+
## Prerequisites
6+
7+
Before running the benchmark, ensure you have:
8+
9+
1. **Databricks CLI configured** (or `.env` file with credentials):
10+
```bash
11+
# Option A: Databricks CLI
12+
databricks configure --host https://your-workspace.cloud.databricks.com
13+
14+
# Option B: .env file in project root
15+
DATABRICKS_HOST=https://your-workspace.cloud.databricks.com
16+
DATABRICKS_TOKEN=dapi...
17+
```
18+
19+
2. **Dependencies installed**:
20+
```bash
21+
pip install -r requirements.txt
22+
```
23+
24+
3. **Input Data**: The benchmark expects input JSON files (Snowflake DDL metadata). By default, it looks in `src/artifact_translation_package/examples/`.
25+
26+
## Quick Start (Running the Evaluation)
27+
28+
### Option 1: Interactive Notebook
29+
The simplest way to run and visualize results.
30+
31+
1. Open `src/artifact_translation_package/evaluation/benchmark_interactive.ipynb`
32+
2. **Configure models**: Set your endpoints in the Config cell.
33+
```python
34+
TRANSLATION_MODELS = ["databricks-llama-4-maverick", "databricks-gemini-2-5-flash"]
35+
ARTIFACT_TYPE = "tables" # or "views", "procedures"
36+
```
37+
3. **Run all cells**: It will trigger `run_local_benchmark.py` and display comparison charts.
38+
39+
### Option 2: Command Line (Fastest)
40+
41+
Run the benchmark from the project root:
42+
43+
```bash
44+
# 1. Basic: Run benchmark for tables using default models
45+
python3 run_local_benchmark.py --artifact-type tables
46+
47+
# 2. Advanced: Specify custom models to compare
48+
python3 run_local_benchmark.py \
49+
--artifact-type views \
50+
--models databricks-llama-4-maverick databricks-meta-llama-3-1-70b-instruct
51+
52+
# 3. Custom Data: Specify a custom input JSON file
53+
python3 run_local_benchmark.py \
54+
--artifact-type tables \
55+
--dataset-source /path/to/your/metadata.json
56+
57+
# 4. Settings: Control batch size for judge execution
58+
python3 run_local_benchmark.py --batch-size 10
59+
```
60+
61+
## How Evaluation Works
62+
63+
We use a **Strict Deduction-Based Scoring System** (starting at 100) to evaluate two independent dimensions:
64+
65+
### Dimension 1: Compliance Score (0-100)
66+
**Goal**: Functional correctness. Can this code actually run on Databricks?
67+
- **Invalid syntax**: Automatic score of **0**.
68+
- **Point Deductions**:
69+
- Missing `USING DELTA` (-20 pts)
70+
- Using legacy types like `VARCHAR` or `TEXT` instead of `STRING` (-10 pts)
71+
- Missing 3-level naming (`catalog.schema.table`) (-15 pts)
72+
73+
### Dimension 2: Best Practices Score (0-100)
74+
**Goal**: Performance and Documentation. Is this production-grade code?
75+
- **Point Deductions**:
76+
- Missing `CLUSTER BY` (Liquid Clustering) (-30 pts)
77+
- Missing table properties like `autoOptimize` (-20 pts)
78+
- Missing table or column `COMMENT`s (-25 pts each)
79+
80+
## MLflow Features
81+
82+
The benchmark automatically logs rich data to Databricks MLflow:
83+
84+
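- **Experiment Name**: Defaults to `/Users/<your-username>/sql-translation-benchmark`, falling back to `/Shared/sql-translation-benchmark` when your username cannot be detected. Override it with `--experiment-name`.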
85+
- **Searchable Tags**: Every run is tagged with issue categories (e.g., `has_naming_issues: true`).
86+
- **Issues Table**: `issues_table.json` logs every single violation found for queryable analysis.
87+
- **Top Issues Summary**: `top_issues_summary.txt` provides an at-a-glance summary of the most common mistakes across all samples.
88+
89+
## Metrics Reference
90+
91+
| Metric | Threshold | Description |
92+
|--------|-----------|-------------|
93+
| `avg_compliance` | 0-100 | Mean functional correctness score. |
94+
| `avg_best_practices` | 0-100 | Mean optimization/docs score. |
95+
| `compliant_pct` | >= 70 | % of statements that are functional. |
96+
| `syntax_valid_pct` | 100% | % of statements with valid Databricks SQL syntax. |
97+
98+
## Troubleshooting
99+
100+
| Issue | Cause | Fix |
101+
|-------|-------|-----|
102+
| Authentication Error | Missing `DATABRICKS_TOKEN` | Check `.env` or run `databricks configure`. |
103+
| `File Not Found` | Custom JSON path is wrong | Verify `--dataset-source` path. |
104+
| Model not found | Incorrect endpoint name | Verify name in Databricks Model Serving UI. |
105+
| Low scores | Model performance | Check `top_issues_summary.txt` to find systemic errors. |
106+
107+
## Deployment to Databricks
108+
109+
To run this directly within a Databricks Job or Notebook:
110+
1. Ensure the `requirements.txt` libraries are installed on the cluster.
111+
2. The benchmark will automatically detect it is running in Databricks and create MLflow experiments in your User workspace folder.
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""
2+
Evaluation and benchmarking module for SQL translation quality assessment.
3+
4+
This module provides tools for evaluating and benchmarking SQL translation models
5+
using MLflow and LLM-as-a-judge patterns.
6+
"""
7+
8+
from artifact_translation_package.evaluation.evaluation_dataset import (
9+
EvaluationDataset,
10+
load_evaluation_dataset
11+
)
12+
from artifact_translation_package.evaluation.databricks_sql_scorer import (
13+
DatabricksSQLComplianceScorer,
14+
create_compliance_scorer
15+
)
16+
from artifact_translation_package.evaluation.model_benchmark import (
17+
ModelConfig,
18+
ModelBenchmark,
19+
create_default_model_configs
20+
)
21+
from artifact_translation_package.evaluation.run_benchmark import (
22+
BenchmarkRunner,
23+
run_benchmark
24+
)
25+
26+
# Public API of the evaluation package, grouped by source module.
__all__ = [
    # evaluation_dataset
    "EvaluationDataset",
    "load_evaluation_dataset",
    # databricks_sql_scorer
    "DatabricksSQLComplianceScorer",
    "create_compliance_scorer",
    # model_benchmark
    "ModelConfig",
    "ModelBenchmark",
    "create_default_model_configs",
    # run_benchmark
    "BenchmarkRunner",
    "run_benchmark",
]

0 commit comments

Comments
 (0)