thisisqubika · FacuSentena · Jan 5, 2026 · Jan 2, 2026 · Jan 2, 2026
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -24,6 +24,14 @@ databricks-langchain = ">=0.1.0"
 sqlglot = ">=20.0.0"
 pydantic = ">=2.0.0"
 typing_extensions = ">=4.15.0"
+mlflow = ">=2.10.0"
+
+[tool.poetry.group.dev.dependencies]
+pytest = ">=7.0.0"
+pytest-cov = ">=4.0.0"
+pytest-asyncio = ">=0.21.0"
+notebook = ">=6.5.0"
+ipykernel = ">=6.25.0"
 
 [build-system]
 requires = ["poetry-core"]

diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,8 @@
 # Snowflake dependencies (for data extraction)
 snowflake-connector-python>=3.0.0
 snowflake-snowpark-python>=1.0.0
+requests>=2.31.0
+typing-extensions>=4.15.0
 python-dotenv>=1.0.0
 
 # Translation graph dependencies
@@ -11,8 +13,10 @@ langgraph>=0.1.0
 sqlglot>=20.0.0
 pydantic>=2.0.0
 
-# Databricks LLM support
+# Databricks LLM support & Evaluation
 databricks-langchain>=0.1.0
+databricks-sdk>=0.1.0
+mlflow>=2.10.0
 
 # Environment variable management
 python-dotenv>=1.0.0

diff --git a/run_local_benchmark.py b/run_local_benchmark.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+"""
+Run evaluation benchmark locally with Databricks connection.
+
+Usage:
+    python3 run_local_benchmark.py
+    python3 run_local_benchmark.py --artifact-type tables --batch-size 5
+"""
+
+import os
+import sys
+import argparse
+import logging
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+# Load environment variables from a .env file sitting next to this script,
+# if one exists. This runs at import time, before the imports further down.
+env_path = Path(__file__).parent / ".env"
+if env_path.exists():
+    load_dotenv(env_path)
+    print(f"✅ Loaded .env from: {env_path}")
+
+# Put <script dir>/src at the front of sys.path so the
+# artifact_translation_package imports below resolve without installation.
+sys.path.insert(0, str(Path(__file__).parent / "src"))
+
+import mlflow
+from databricks_langchain import ChatDatabricks
+
+from artifact_translation_package.evaluation import run_benchmark, ModelConfig
+from artifact_translation_package.evaluation.model_benchmark import create_default_model_configs
+
+
+def get_experiment_name(custom_name: str = None) -> str:
+    """Get experiment name, auto-detecting user if not provided."""
+    if custom_name:
+        return custom_name
+    try:
+        from databricks.sdk import WorkspaceClient
+        username = WorkspaceClient().current_user.me().user_name
+        return f"/Users/{username}/sql-translation-benchmark"
+    except Exception:
+        return "/Shared/sql-translation-benchmark"
+
+
+def setup_mlflow(experiment_name: str) -> str:
+    """Configure MLflow with Databricks tracking."""
+    mlflow.set_tracking_uri("databricks")
+
+    try:
+        mlflow.set_experiment(experiment_name)
+    except Exception:
+        mlflow.create_experiment(experiment_name)
+        mlflow.set_experiment(experiment_name)
+
+    print(f"✅ MLflow experiment: {experiment_name}")
+    return experiment_name
+
+
+def parse_args():
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(description="SQL translation benchmark")
+    parser.add_argument("--artifact-type", default="tables", help="Artifact type")
+    parser.add_argument("--dataset-source", help="Dataset JSON path")
+    parser.add_argument("--experiment-name", help="MLflow experiment name")
+    parser.add_argument("--batch-size", type=int, default=5, help="Batch size")
+    parser.add_argument("--judge-endpoint", default="databricks-llama-4-maverick")
+    parser.add_argument("--models", nargs="+", help="Model endpoints to test")
+    return parser.parse_args()
+
+
+def main():
+    """Run benchmark."""
+    args = parse_args()
+
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+
+    print("\n" + "="*70)
+    print("🚀 SQL Translation Benchmark")
+    print("="*70)
+
+    experiment_name = setup_mlflow(get_experiment_name(args.experiment_name))
+
+    # Configure models
+    if args.models:
+        model_configs = [
+            ModelConfig(name=ep, endpoint=ep, temperature=0.1, max_tokens=4000)
+            for ep in args.models
+        ]
+    else:
+        model_configs = create_default_model_configs()
+
+    print(f"Models: {[c.name for c in model_configs]}")
+    print(f"Artifact: {args.artifact_type}, Batch: {args.batch_size}")
+
+    try:
+        results_df = run_benchmark(
+            artifact_type=args.artifact_type,
+            dataset_source=args.dataset_source,
+            experiment_name=experiment_name,
+            model_configs=model_configs,
+            batch_size=args.batch_size,
+            judge_endpoint=args.judge_endpoint
+        )
+
+        print("\n" + "="*70)
+        print("✅ Benchmark Complete!")
+        print("="*70)
+        print(results_df.to_string())
+        print(f"\n🔗 View results: {experiment_name}")
+
+    except Exception as e:
+        print(f"\n❌ Failed: {e}")
+        logging.error("Benchmark error", exc_info=True)
+        sys.exit(1)
+
+
+# Run the benchmark only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/src/artifact_translation_package/config/ddl_config.py b/src/artifact_translation_package/config/ddl_config.py
@@ -177,12 +177,13 @@ class DDLConfig:
                 "additional_params": {
                     "endpoint": LangGraphConfig.DBX_ENDPOINT.value
                 }
-            }
+                        }
         },
         "processing": {
             "batch_size": LangGraphConfig.DDL_BATCH_SIZE.value,
             "max_concurrent_batches": LangGraphConfig.DDL_MAX_CONCURRENT.value,
-            "timeout_seconds": LangGraphConfig.DDL_TIMEOUT.value
+            "timeout_seconds": LangGraphConfig.DDL_TIMEOUT.value,
+            "evaluation_batch_size": 5  # Number of SQL statements per LLM evaluation call
         },
         "output": {
             "format": LangGraphConfig.DDL_OUTPUT_FORMAT.value,
@@ -197,6 +198,14 @@ class DDLConfig:
             "llm_validated_artifacts": ["procedures", "pipes"],
             "skip_unsupported_artifacts": ["grants", "procedures", "udfs", "stages", "pipes", "roles"]
         },
+        "benchmark": {
+            "mlflow_experiment_name": "sql-translation-benchmark",
+            "default_models": ["databricks-llama-4-maverick", "databricks-gemini-2-5-flash"],
+            "temperature_variations": [0.1, 0.2],
+            "artifact_types": ["tables", "views", "procedures"],
+            "judge_endpoint": "databricks-llama-4-maverick",  # LLM endpoint for evaluation/judging
+            "batch_size": 10
+        },
         "langsmith": {
             "tracing": LangGraphConfig.LANGSMITH_TRACING.value,
             "project": LangGraphConfig.LANGSMITH_PROJECT.value,

diff --git a/src/artifact_translation_package/evaluation/README.md b/src/artifact_translation_package/evaluation/README.md
@@ -0,0 +1,111 @@
+# SQL Translation Model Benchmark Evaluation
+
+Evaluate and compare SQL translation models (Snowflake → Databricks) using MLflow and LLM-as-a-judge.
+
+## Prerequisites
+
+Before running the benchmark, ensure you have:
+
+1. **Databricks CLI configured** (or `.env` file with credentials):
+   ```bash
+   # Option A: Databricks CLI
+   databricks configure --host https://your-workspace.cloud.databricks.com
+
+   # Option B: .env file in project root
+   DATABRICKS_HOST=https://your-workspace.cloud.databricks.com
+   DATABRICKS_TOKEN=dapi...
+   ```
+
+2. **Dependencies installed**:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. **Input Data**: The benchmark expects input JSON files (Snowflake DDL metadata). By default, it looks in `src/artifact_translation_package/examples/`.
+
+## Quick Start (Running the Evaluation)
+
+### Option 1: Interactive Notebook
+The simplest way to run and visualize results.
+
+1. Open `src/artifact_translation_package/evaluation/benchmark_interactive.ipynb`
+2. **Configure models**: Set your endpoints in the Config cell.
+   ```python
+   TRANSLATION_MODELS = ["databricks-llama-4-maverick", "databricks-gemini-2-5-flash"]
+   ARTIFACT_TYPE = "tables"  # or "views", "procedures"
+   ```
+3. **Run all cells**: It will trigger `run_local_benchmark.py` and display comparison charts.
+
+### Option 2: Command Line (Fastest)
+
+Run the benchmark from the project root:
+
+```bash
+# 1. Basic: Run benchmark for tables using default models
+python3 run_local_benchmark.py --artifact-type tables
+
+# 2. Advanced: Specify custom models to compare
+python3 run_local_benchmark.py \
+  --artifact-type views \
+  --models databricks-llama-4-maverick databricks-meta-llama-3-1-70b-instruct
+
+# 3. Custom Data: Specify a custom input JSON file
+python3 run_local_benchmark.py \
+  --artifact-type tables \
+  --dataset-source /path/to/your/metadata.json
+
+# 4. Settings: Control batch size for judge execution
+python3 run_local_benchmark.py --batch-size 10  # default is 5
+```
+
+## How Evaluation Works
+
+We use a **Strict Deduction-Based Scoring System** (starting at 100) to evaluate two independent dimensions:
+
+### Dimension 1: Compliance Score (0-100)
+**Goal**: Functional correctness. Can this code actually run on Databricks?
+- **Invalid syntax**: Automatic score of **0**.
+- **Point Deductions**:
+  - Missing `USING DELTA` (-20 pts)
+  - Using legacy types like `VARCHAR` or `TEXT` instead of `STRING` (-10 pts)
+  - Missing 3-level naming (`catalog.schema.table`) (-15 pts)
+
+### Dimension 2: Best Practices Score (0-100)
+**Goal**: Performance and Documentation. Is this production-grade code?
+- **Point Deductions**:
+  - Missing `CLUSTER BY` (Liquid Clustering) (-30 pts)
+  - Missing table properties like `autoOptimize` (-20 pts)
+  - Missing table or column `COMMENT`s (-25 pts each)
+
+## MLflow Features
+
+The benchmark automatically logs rich data to Databricks MLflow:
+
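+- **Experiment Name**: Defaults to `/Users/<your-username>/sql-translation-benchmark`, falling back to `/Shared/sql-translation-benchmark` when the current user cannot be detected. Override it with `--experiment-name`.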
+- **Searchable Tags**: Every run is tagged with issue categories (e.g., `has_naming_issues: true`).
+- **Issues Table**: `issues_table.json` logs every single violation found for queryable analysis.
+- **Top Issues Summary**: `top_issues_summary.txt` provides an at-a-glance summary of the most common mistakes across all samples.
+
+## Metrics Reference
+
+| Metric | Range / Target | Description |
+|--------|-----------|-------------|
+| `avg_compliance` | 0-100 | Mean functional correctness score. |
+| `avg_best_practices` | 0-100 | Mean optimization/docs score. |
+| `compliant_pct` | >= 70 | % of statements that are functional. |
+| `syntax_valid_pct` | 100% | % of statements with valid Databricks SQL syntax. |
+
+## Troubleshooting
+
+| Issue | Cause | Fix |
+|-------|-------|-----|
+| Authentication Error | Missing `DATABRICKS_TOKEN` | Check `.env` or run `databricks configure`. |
+| `File Not Found` | Custom JSON path is wrong | Verify `--dataset-source` path. |
+| Model not found | Incorrect endpoint name | Verify name in Databricks Model Serving UI. |
+| Low scores | Model performance | Check `top_issues_summary.txt` to find systemic errors. |
+
+## Deployment to Databricks
+
+To run this directly within a Databricks Job or Notebook:
+1. Ensure the `requirements.txt` libraries are installed on the cluster.
+2. The benchmark will automatically detect it is running in Databricks and create MLflow experiments in your User workspace folder.
diff --git a/src/artifact_translation_package/evaluation/__init__.py b/src/artifact_translation_package/evaluation/__init__.py
@@ -0,0 +1,36 @@
+"""
+Evaluation and benchmarking module for SQL translation quality assessment.
+
+This module provides tools for evaluating and benchmarking SQL translation models
+using MLflow and LLM-as-a-judge patterns.
+"""
+
+from artifact_translation_package.evaluation.evaluation_dataset import (
+    EvaluationDataset,
+    load_evaluation_dataset
+)
+from artifact_translation_package.evaluation.databricks_sql_scorer import (
+    DatabricksSQLComplianceScorer,
+    create_compliance_scorer
+)
+from artifact_translation_package.evaluation.model_benchmark import (
+    ModelConfig,
+    ModelBenchmark,
+    create_default_model_configs
+)
+from artifact_translation_package.evaluation.run_benchmark import (
+    BenchmarkRunner,
+    run_benchmark
+)
+
+# Public API: the names re-exported from the evaluation submodules imported above.
+__all__ = [
+    "EvaluationDataset",
+    "load_evaluation_dataset",
+    "DatabricksSQLComplianceScorer",
+    "create_compliance_scorer",
+    "ModelConfig",
+    "ModelBenchmark",
+    "create_default_model_configs",
+    "BenchmarkRunner",
+    "run_benchmark"
+]