Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,780 changes: 1,751 additions & 29 deletions poetry.lock

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ databricks-langchain = ">=0.1.0"
sqlglot = ">=20.0.0"
pydantic = ">=2.0.0"
typing_extensions = ">=4.15.0"
mlflow = ">=2.10.0"

[tool.poetry.group.dev.dependencies]
pytest = ">=7.0.0"
pytest-cov = ">=4.0.0"
pytest-asyncio = ">=0.21.0"
notebook = ">=6.5.0"
ipykernel = ">=6.25.0"

[build-system]
requires = ["poetry-core"]
Expand Down
6 changes: 5 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Snowflake dependencies (for data extraction)
snowflake-connector-python>=3.0.0
snowflake-snowpark-python>=1.0.0
requests>=2.31.0
typing-extensions>=4.15.0
python-dotenv>=1.0.0

# Translation graph dependencies
Expand All @@ -11,8 +13,10 @@ langgraph>=0.1.0
sqlglot>=20.0.0
pydantic>=2.0.0

# Databricks LLM support
# Databricks LLM support & Evaluation
databricks-langchain>=0.1.0
databricks-sdk>=0.1.0
mlflow>=2.10.0

# Environment variable management
python-dotenv>=1.0.0
Expand Down
119 changes: 119 additions & 0 deletions run_local_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#!/usr/bin/env python3
"""
Run evaluation benchmark locally with Databricks connection.

Usage:
python3 run_local_benchmark.py
python3 run_local_benchmark.py --artifact-type tables --batch-size 5
"""

import argparse
import logging
import os
import sys
from pathlib import Path
from typing import Optional, Sequence

from dotenv import load_dotenv

# Load .env from the project root, if one exists. This runs before the
# Databricks-backed imports further down so that any variables the file
# defines are already in the environment when those libraries are used.
env_path = Path(__file__).parent / ".env"
if env_path.exists():
    load_dotenv(env_path)
    print(f"✅ Loaded .env from: {env_path}")

# Make the package under ./src importable without installing it.
sys.path.insert(0, str(Path(__file__).parent / "src"))

import mlflow
from databricks_langchain import ChatDatabricks

from artifact_translation_package.evaluation import run_benchmark, ModelConfig
from artifact_translation_package.evaluation.model_benchmark import create_default_model_configs


def get_experiment_name(custom_name: Optional[str] = None) -> str:
    """Return the MLflow experiment path to use for this run.

    Args:
        custom_name: Explicit experiment name. When truthy it is returned
            unchanged and no workspace lookup is attempted.

    Returns:
        ``custom_name`` if given; otherwise
        ``/Users/<user_name>/sql-translation-benchmark`` for the current
        Databricks user, or ``/Shared/sql-translation-benchmark`` when the
        user cannot be determined.
    """
    if custom_name:
        return custom_name

    fallback = "/Shared/sql-translation-benchmark"
    try:
        # Imported inside the try so a missing SDK falls through to the
        # shared fallback instead of failing the whole script.
        from databricks.sdk import WorkspaceClient

        username = WorkspaceClient().current_user.me().user_name
    except Exception as exc:
        # Best-effort lookup: keep the fallback, but say why it was used
        # rather than swallowing the failure silently.
        logging.warning(
            "Could not detect Databricks user (%s); using %s", exc, fallback
        )
        return fallback

    if not username:
        # Avoid building a bogus "/Users/None/..." path.
        return fallback
    return f"/Users/{username}/sql-translation-benchmark"


def setup_mlflow(experiment_name: str) -> str:
    """Point MLflow at the Databricks tracking server and activate an experiment.

    Args:
        experiment_name: Name (workspace path) of the MLflow experiment to
            activate.

    Returns:
        ``experiment_name`` unchanged, so the call can be used inline.

    Raises:
        Exception: Whatever MLflow raises when the experiment can be neither
            activated nor created (the fallback path is not guarded).
    """
    mlflow.set_tracking_uri("databricks")

    try:
        mlflow.set_experiment(experiment_name)
    except Exception as exc:
        # Fall back to creating the experiment explicitly. Log the first
        # failure so the root cause is not lost if the retry fails as well.
        logging.warning(
            "set_experiment(%r) failed (%s); trying to create it",
            experiment_name,
            exc,
        )
        mlflow.create_experiment(experiment_name)
        mlflow.set_experiment(experiment_name)

    print(f"✅ MLflow experiment: {experiment_name}")
    return experiment_name


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse command line arguments for the benchmark script.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what existing callers
            get; passing a list allows programmatic use.

    Returns:
        Namespace with ``artifact_type``, ``dataset_source``,
        ``experiment_name``, ``batch_size``, ``judge_endpoint`` and
        ``models`` (``None`` when ``--models`` is not given).
    """
    parser = argparse.ArgumentParser(description="SQL translation benchmark")
    parser.add_argument("--artifact-type", default="tables", help="Artifact type")
    parser.add_argument("--dataset-source", help="Dataset JSON path")
    parser.add_argument("--experiment-name", help="MLflow experiment name")
    parser.add_argument("--batch-size", type=int, default=5, help="Batch size")
    parser.add_argument(
        "--judge-endpoint",
        default="databricks-llama-4-maverick",
        help="LLM endpoint for evaluation/judging",
    )
    parser.add_argument("--models", nargs="+", help="Model endpoints to test")
    return parser.parse_args(argv)


def main() -> None:
    """Run the benchmark end to end from the command line.

    Parses the CLI arguments, configures logging and the MLflow experiment,
    builds one ``ModelConfig`` per ``--models`` endpoint (or the package
    defaults when none are given), runs the benchmark and prints the
    resulting table. Any exception raised by the benchmark run is reported
    and the process exits with status 1.
    """
    args = parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    banner = "=" * 70
    print("\n" + banner)
    print("🚀 SQL Translation Benchmark")
    print(banner)

    experiment_name = setup_mlflow(get_experiment_name(args.experiment_name))

    # Configure models: explicit endpoints from the CLI, else package defaults.
    if args.models:
        model_configs = [
            ModelConfig(name=ep, endpoint=ep, temperature=0.1, max_tokens=4000)
            for ep in args.models
        ]
    else:
        model_configs = create_default_model_configs()

    print(f"Models: {[c.name for c in model_configs]}")
    print(f"Artifact: {args.artifact_type}, Batch: {args.batch_size}")

    try:
        results_df = run_benchmark(
            artifact_type=args.artifact_type,
            dataset_source=args.dataset_source,
            experiment_name=experiment_name,
            model_configs=model_configs,
            batch_size=args.batch_size,
            judge_endpoint=args.judge_endpoint,
        )

        print("\n" + banner)
        print("✅ Benchmark Complete!")
        print(banner)
        print(results_df.to_string())
        print(f"\n🔗 View results: {experiment_name}")

    except Exception as e:
        # Top-level boundary: report briefly to the user, log the full
        # traceback, and signal failure through the exit status.
        print(f"\n❌ Failed: {e}")
        logging.exception("Benchmark error")
        sys.exit(1)


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
13 changes: 11 additions & 2 deletions src/artifact_translation_package/config/ddl_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,12 +177,13 @@ class DDLConfig:
"additional_params": {
"endpoint": LangGraphConfig.DBX_ENDPOINT.value
}
}
}
},
"processing": {
"batch_size": LangGraphConfig.DDL_BATCH_SIZE.value,
"max_concurrent_batches": LangGraphConfig.DDL_MAX_CONCURRENT.value,
"timeout_seconds": LangGraphConfig.DDL_TIMEOUT.value
"timeout_seconds": LangGraphConfig.DDL_TIMEOUT.value,
"evaluation_batch_size": 5 # Number of SQL statements per LLM evaluation call
},
"output": {
"format": LangGraphConfig.DDL_OUTPUT_FORMAT.value,
Expand All @@ -197,6 +198,14 @@ class DDLConfig:
"llm_validated_artifacts": ["procedures", "pipes"],
"skip_unsupported_artifacts": ["grants", "procedures", "udfs", "stages", "pipes", "roles"]
},
"benchmark": {
"mlflow_experiment_name": "sql-translation-benchmark",
"default_models": ["databricks-llama-4-maverick", "databricks-gemini-2-5-flash"],
"temperature_variations": [0.1, 0.2],
"artifact_types": ["tables", "views", "procedures"],
"judge_endpoint": "databricks-llama-4-maverick", # LLM endpoint for evaluation/judging
"batch_size": 10
},
"langsmith": {
"tracing": LangGraphConfig.LANGSMITH_TRACING.value,
"project": LangGraphConfig.LANGSMITH_PROJECT.value,
Expand Down
111 changes: 111 additions & 0 deletions src/artifact_translation_package/evaluation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# SQL Translation Model Benchmark Evaluation

Evaluate and compare SQL translation models (Snowflake → Databricks) using MLflow and LLM-as-a-judge.

## Prerequisites

Before running the benchmark, ensure you have:

1. **Databricks CLI configured** (or `.env` file with credentials):
```bash
# Option A: Databricks CLI
databricks configure --host https://your-workspace.cloud.databricks.com

# Option B: .env file in project root
DATABRICKS_HOST=https://your-workspace.cloud.databricks.com
DATABRICKS_TOKEN=dapi...
```

2. **Dependencies installed**:
```bash
pip install -r requirements.txt
```

3. **Input Data**: The benchmark expects input JSON files (Snowflake DDL metadata). By default, it looks in `src/artifact_translation_package/examples/`.

## Quick Start (Running the Evaluation)

### Option 1: Interactive Notebook
The simplest way to run and visualize results.

1. Open `src/artifact_translation_package/evaluation/benchmark_interactive.ipynb`
2. **Configure models**: Set your endpoints in the Config cell.
```python
TRANSLATION_MODELS = ["databricks-llama-4-maverick", "databricks-gemini-2-5-flash"]
ARTIFACT_TYPE = "tables" # or "views", "procedures"
```
3. **Run all cells**: It will trigger `run_local_benchmark.py` and display comparison charts.

### Option 2: Command Line (Fastest)

Run the benchmark from the project root:

```bash
# 1. Basic: Run benchmark for tables using default models
python3 run_local_benchmark.py --artifact-type tables

# 2. Advanced: Specify custom models to compare
python3 run_local_benchmark.py \
--artifact-type views \
--models databricks-llama-4-maverick databricks-meta-llama-3-1-70b-instruct

# 3. Custom Data: Specify a custom input JSON file
python3 run_local_benchmark.py \
--artifact-type tables \
--dataset-source /path/to/your/metadata.json

# 4. Settings: Control batch size for judge execution
python3 run_local_benchmark.py --batch-size 10
```

## How Evaluation Works

We use a **Strict Deduction-Based Scoring System** (starting at 100) to evaluate two independent dimensions:

### Dimension 1: Compliance Score (0-100)
**Goal**: Functional correctness. Can this code actually run on Databricks?
- **Invalid syntax**: Automatic score of **0**.
- **Point Deductions**:
- Missing `USING DELTA` (-20 pts)
- Using legacy types like `VARCHAR` or `TEXT` instead of `STRING` (-10 pts)
- Missing 3-level naming (`catalog.schema.table`) (-15 pts)

### Dimension 2: Best Practices Score (0-100)
**Goal**: Performance and Documentation. Is this production-grade code?
- **Point Deductions**:
- Missing `CLUSTER BY` (Liquid Clustering) (-30 pts)
- Missing table properties like `autoOptimize` (-20 pts)
- Missing table or column `COMMENT`s (-25 pts each)

## MLflow Features

The benchmark automatically logs rich data to Databricks MLflow:

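- **Experiment Name**: Defaults to `/Users/<your-username>/sql-translation-benchmark`; falls back to `/Shared/sql-translation-benchmark` if the current user cannot be detected. Override with `--experiment-name`.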
- **Searchable Tags**: Every run is tagged with issue categories (e.g., `has_naming_issues: true`).
- **Issues Table**: `issues_table.json` logs every single violation found for queryable analysis.
- **Top Issues Summary**: `top_issues_summary.txt` provides an at-a-glance summary of the most common mistakes across all samples.

## Metrics Reference

| Metric | Threshold | Description |
|--------|-----------|-------------|
| `avg_compliance` | 0-100 | Mean functional correctness score. |
| `avg_best_practices` | 0-100 | Mean optimization/docs score. |
| `compliant_pct` | >= 70 | % of statements that are functional. |
| `syntax_valid_pct` | 100% | % of statements with valid Databricks SQL syntax. |

## Troubleshooting

| Issue | Cause | Fix |
|-------|-------|-----|
| Authentication Error | Missing `DATABRICKS_TOKEN` | Check `.env` or run `databricks configure`. |
| `File Not Found` | Custom JSON path is wrong | Verify `--dataset-source` path. |
| Model not found | Incorrect endpoint name | Verify name in Databricks Model Serving UI. |
| Low scores | Model performance | Check `top_issues_summary.txt` to find systemic errors. |

## Deployment to Databricks

To run this directly within a Databricks Job or Notebook:
1. Ensure the `requirements.txt` libraries are installed on the cluster.
2. The benchmark will automatically detect it is running in Databricks and create MLflow experiments in your User workspace folder.
36 changes: 36 additions & 0 deletions src/artifact_translation_package/evaluation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Evaluation and benchmarking module for SQL translation quality assessment.

This module provides tools for evaluating and benchmarking SQL translation models
using MLflow and LLM-as-a-judge patterns.
"""

from artifact_translation_package.evaluation.evaluation_dataset import (
EvaluationDataset,
load_evaluation_dataset
)
from artifact_translation_package.evaluation.databricks_sql_scorer import (
DatabricksSQLComplianceScorer,
create_compliance_scorer
)
from artifact_translation_package.evaluation.model_benchmark import (
ModelConfig,
ModelBenchmark,
create_default_model_configs
)
from artifact_translation_package.evaluation.run_benchmark import (
BenchmarkRunner,
run_benchmark
)

# Names exported by `from artifact_translation_package.evaluation import *`;
# each one is imported from its submodule above.
__all__ = [
    "EvaluationDataset",
    "load_evaluation_dataset",
    "DatabricksSQLComplianceScorer",
    "create_compliance_scorer",
    "ModelConfig",
    "ModelBenchmark",
    "create_default_model_configs",
    "BenchmarkRunner",
    "run_benchmark"
]
Loading