diff --git a/.github/workflows/databricks-asset-bundle-deploy.yml b/.github/workflows/databricks-asset-bundle-deploy.yml index 6341c1b..a6ea929 100644 --- a/.github/workflows/databricks-asset-bundle-deploy.yml +++ b/.github/workflows/databricks-asset-bundle-deploy.yml @@ -50,11 +50,32 @@ jobs: - name: Validate bundle run: | - databricks bundle validate + databricks bundle validate -t dev \ + --var="cluster_id=${{ secrets.DATABRICKS_CLUSTER_ID }}" \ + --var="catalog_name=${{ secrets.UC_CATALOG }}" \ + --var="devs_group=${{ secrets.DEVS_GROUP }}" - name: Deploy bundle run: | - databricks bundle deploy -t dev - databricks bundle run databricks_job_executor_app -t dev - databricks bundle summary - databricks bundle resources \ No newline at end of file + databricks bundle deploy -t dev \ + --var="cluster_id=${{ secrets.DATABRICKS_CLUSTER_ID }}" \ + --var="catalog_name=${{ secrets.UC_CATALOG }}" \ + --var="devs_group=${{ secrets.DEVS_GROUP }}" + + - name: Run job executor app + run: | + databricks bundle run databricks_job_executor_app -t dev \ + --var="cluster_id=${{ secrets.DATABRICKS_CLUSTER_ID }}" \ + --var="catalog_name=${{ secrets.UC_CATALOG }}" \ + --var="devs_group=${{ secrets.DEVS_GROUP }}" + + - name: Show bundle info + run: | + databricks bundle summary -t dev \ + --var="cluster_id=${{ secrets.DATABRICKS_CLUSTER_ID }}" \ + --var="catalog_name=${{ secrets.UC_CATALOG }}" \ + --var="devs_group=${{ secrets.DEVS_GROUP }}" + databricks bundle resources -t dev \ + --var="cluster_id=${{ secrets.DATABRICKS_CLUSTER_ID }}" \ + --var="catalog_name=${{ secrets.UC_CATALOG }}" \ + --var="devs_group=${{ secrets.DEVS_GROUP }}" \ No newline at end of file diff --git a/README.md b/README.md index 489a748..317a02e 100644 --- a/README.md +++ b/README.md @@ -98,20 +98,65 @@ For detailed instructions on configuring Snowflake credentials, see [CONFIGURATI cp env.example .env ``` -2. Edit `.env` with your Snowflake credentials: +2. Edit `.env` with your credentials: ```env SNOWFLAKE_ACCOUNT=your_account_identifier SNOWFLAKE_USER=your_username SNOWFLAKE_PASSWORD=your_password - SNOWFLAKE_DATABASE=DATA_MIGRATION_DB - SNOWFLAKE_SCHEMA=DATA_MIGRATION_SCHEMA - SNOWFLAKE_WAREHOUSE=COMPUTE_WH # Optional - SNOWFLAKE_ROLE=SYSADMIN # Optional - SNOWFLAKE_REGION=us-east-1 # Optional + SNOWFLAKE_DATABASE=your_database # Required + SNOWFLAKE_SCHEMA=your_schema # Required + SNOWFLAKE_WAREHOUSE=COMPUTE_WH # Optional + SNOWFLAKE_ROLE=SYSADMIN # Optional + + # Unity Catalog - Required + UC_CATALOG=your_catalog_name + UC_SCHEMA=migration_accelerator ``` 3. The `.env` file is already in `.gitignore` to protect your credentials +### Databricks Deployment + +To run on Databricks, configure the following: + +#### Databricks Secrets + +Create a secrets scope and add credentials: + +```bash +databricks secrets create-scope migration-accelerator +databricks secrets put-secret migration-accelerator SNOWFLAKE_ACCOUNT +databricks secrets put-secret migration-accelerator SNOWFLAKE_USER +databricks secrets put-secret migration-accelerator SNOWFLAKE_PASSWORD +databricks secrets put-secret migration-accelerator DATABRICKS_HOST +databricks secrets put-secret migration-accelerator DATABRICKS_CLIENT_ID +databricks secrets put-secret migration-accelerator DATABRICKS_CLIENT_SECRET +``` + +#### Cluster Environment Variables + +Set in **Cluster → Advanced Options → Spark → Environment Variables**: + +```bash +UC_CATALOG=your_catalog_name +UC_SCHEMA=migration_accelerator +SNOWFLAKE_DATABASE=your_database +SNOWFLAKE_SCHEMA=your_schema +``` + +#### GitHub Secrets (for CI/CD) + +| Secret | Description | +|--------|-------------| +| `DATABRICKS_HOST` | Workspace URL | +| `DATABRICKS_CLIENT_ID` | OAuth M2M client ID | +| `DATABRICKS_CLIENT_SECRET` | OAuth M2M client secret | +| `DATABRICKS_CLUSTER_ID` | Cluster ID for jobs | +| `UC_CATALOG` | Unity Catalog name | +| `DEVS_GROUP` | Group name for permissions | + +> **Note:** The `DEVS_GROUP` (e.g., `migration-accelerator-devs`) must exist in Databricks before deployment. Create it in **Admin Console → Groups → Create Group**. + ## Run Locally (translation job) diff --git a/databricks_job_executor/streamlit_app/components/ui/initializers.py b/databricks_job_executor/streamlit_app/components/ui/initializers.py index 4343191..37a6854 100644 --- a/databricks_job_executor/streamlit_app/components/ui/initializers.py +++ b/databricks_job_executor/streamlit_app/components/ui/initializers.py @@ -62,8 +62,7 @@ def initialize_job_state(): def initialize_environment_state(db_env: dict): """Initialize environment-related state.""" - if 'databricks_env' not in st.session_state: - st.session_state.databricks_env = db_env + st.session_state.databricks_env = db_env def initialize_session_state(db_env: dict): diff --git a/docs/combined_documentation.md b/docs/combined_documentation.md new file mode 100644 index 0000000..b359ca8 --- /dev/null +++ b/docs/combined_documentation.md @@ -0,0 +1,1419 @@ +# Migration Accelerator Documentation + +Welcome to the comprehensive documentation for the **Data Migration Accelerator**. This tool enables seamless migration of Snowflake database artifacts to Databricks using LLM-powered translation with built-in validation, observability, and benchmarking. + +_Last updated by the team on January 5, 2026_ + +--- + +## Table of Contents + +- [Overview](#overview) +- [Quick Start Guide](#quick-start-guide) +- [Architecture & Components](#architecture--components) +- [Setup & Installation Guide](#setup--installation-guide) +- [Runbook for Daily Operation](#runbook-for-daily-operation) +- [Evaluation & Benchmarking](#evaluation--benchmarking) +- [Troubleshooting & Support](#troubleshooting--support) +- [Client Onboarding Material](#client-onboarding-material) +- [Current Capabilities & Limitations](#current-capabilities--limitations) +- [Configuration Reference](#configuration-reference) + +--- + +## Overview + +The Migration Accelerator is an end-to-end solution for migrating Snowflake database artifacts to Databricks. It combines: + +- **Automated Metadata Extraction**: Connects to Snowflake via Snowpark API to extract complete metadata for all database objects +- **LLM-Powered Translation**: Uses LangGraph orchestration with Databricks-hosted LLMs to translate Snowflake DDL to Databricks-compatible SQL +- **Built-in Validation**: Validates translations using SQLGlot syntax checking and LLM-based semantic evaluation +- **Observability & Benchmarking**: MLflow integration for tracking translation quality and comparing LLM models +- **UI-Based Execution**: Streamlit application for job execution and monitoring + +### Supported Artifact Types + +The accelerator supports **extraction** for all artifact types and **translation** for most. Some artifacts require manual review. + +| Category | Artifact Types | Status | +|----------|---------------|--------| +| **Core Objects** | Tables, Views, Schemas, Databases | ✅ Full Translation | +| **Programmatic** | Procedures, Functions (UDFs) | ⚠️ Translation (Manual Review) | +| **Programmatic** | Sequences | ✅ Full Translation | +| **Data Pipeline** | Streams | ✅ Full Translation | +| **Data Pipeline** | Stages, Pipes | ⚠️ Translation (Manual Review) | +| **Data Pipeline** | Tasks, File Formats | 📥 Extraction Only | +| **Security** | Roles, Grants | ⚠️ Translation (Manual Review) | +| **Governance** | Tags, Comments, Masking Policies | ✅ Full Translation | +| **Governance** | External Locations | ⚠️ Translation (Manual Review) | + +> [!NOTE] +> **Legend**: ✅ Full Translation = automated with sql_glot validation | ⚠️ Manual Review = translation available but sql_glot validation skipped | 📥 Extraction Only = metadata extracted, no translation + +### Primary Execution Environment + +> [!IMPORTANT] +> This tool is designed to run primarily on **Databricks Jobs** for production use. Local execution is supported for development and testing. + +--- + +## Quick Start Guide + +Get the Migration Accelerator running in 5 steps: + +### Step 1: Clone the Repository +```bash +git clone https://github.com/your-org/data-migration-accelerator.git +cd data-migration-accelerator +``` + +### Step 2: Configure Environment +```bash +cp env.example .env +# Edit .env with your Snowflake and Databricks credentials +``` + +### Step 3: Install Dependencies +```bash +# Install Poetry (if not already installed) +pip install poetry + +# Install project dependencies +poetry install +``` + + +### Step 4: Run Translation +```bash +poetry run make translate +# Output: SQL files in src/artifact_translation_package/out_sql_examples_/ +``` + +For production deployment on Databricks, see [Setup & Installation Guide](#setup--installation-guide). + +--- + +## Architecture & Components + +### High-Level Architecture + +```mermaid +graph TD + subgraph "Source Environment" + A[Snowflake Database] + end + + subgraph "Migration Accelerator" + B[Extraction Module
migration_accelerator_package] + C[JSON Metadata
Unity Catalog Volume] + D[Translation Pipeline
artifact_translation_package] + E[Validation & Evaluation] + F[Job Executor UI
databricks_job_executor] + end + + subgraph "Target Environment" + G[Databricks SQL Files] + H[Unity Catalog] + end + + A --> B + B --> C + C --> D + D --> E + E --> G + G --> H + F -.-> B + F -.-> D +``` + +### Component Overview + +The accelerator consists of four main packages: + +| Package | Purpose | Key Files | +|---------|---------|-----------| +| `migration_accelerator_package` | Snowflake metadata extraction | `snowpark.py`, `artifact_readers.py` | +| `artifact_translation_package` | LLM-based translation pipeline | `graph_builder.py`, `main.py`, `nodes/` | +| `databricks_job_executor` | Streamlit UI for job management | `streamlit_app/app.py` | +| `evaluation` (subpackage) | MLflow benchmarking | `run_benchmark.py`, `model_benchmark.py` | + +--- + +### Component 1: Extraction Module (`migration_accelerator_package`) + +Extracts metadata from Snowflake using the Snowpark API. + +**Features:** +- Connects to Snowflake with password authentication +- Extracts all artifact types (tables, views, procedures, functions, etc.) +- Retrieves DDL definitions and column metadata +- Outputs JSON files organized by artifact type +- Supports sample data extraction (first N rows) + +**Key Classes:** +- `SnowparkObjectReader`: Main extraction class +- `ArtifactReaders`: Factory for reading specific artifact types +- `ArtifactValidators`: Validates extracted metadata + +**Output Files:** +``` +snowflake_artifacts_raw/ +├── tables.json +├── views.json +├── procedures.json +├── functions.json +├── sequences.json +├── stages.json +├── file_formats.json +├── tasks.json +├── streams.json +├── pipelines.json +├── roles.json +├── grants_privileges.json +├── grants_hierarchy.json +├── grants_future.json +└── grants_flattened.json +``` + +--- + +### Component 2: Translation Pipeline (`artifact_translation_package`) + +LangGraph-based orchestration for artifact translation. + +**Architecture:** + +```mermaid +graph LR + subgraph "Translation Graph" + A[Input Batch] --> B[Router Node] + B --> C{Artifact Type} + C -->|tables| D[Tables Translator] + C -->|views| E[Views Translator] + C -->|procedures| F[Procedures Translator] + C -->|...| G[Other Translators] + D --> H[Evaluation Node] + E --> H + F --> H + G --> H + H --> I[Aggregator Node] + I --> J[Output Results] + end +``` + +**Key Components:** + +| Component | Description | +|-----------|-------------| +| **Router** (`nodes/router.py`) | Determines artifact type and routes to appropriate translator | +| **Translation Nodes** (`nodes/*_translation.py`) | Type-specific translation logic using LLM prompts | +| **Evaluation Node** (`nodes/evaluation.py`) | SQLGlot syntax validation | +| **Aggregator** (`nodes/aggregator.py`) | Collects and merges all translation results | +| **Prompts** (`prompts/*.py`) | Specialized prompts for each artifact type | + +**Supported Translators:** +- Databases, Schemas, Tables, Views +- Procedures, UDFs, Stages, Streams, Pipes +- Roles, Grants, Tags, Comments +- Masking Policies, External Locations + +**Processing Flow:** +1. **Batch Creation**: Input JSON split into configurable batch sizes +2. **Routing**: Each batch routed to appropriate translator +3. **Translation**: LLM generates Databricks-compatible DDL +4. **Evaluation**: Syntax validation via SQLGlot +5. **Aggregation**: Results merged into final output + +--- + +### Component 3: Job Executor UI (`databricks_job_executor`) + +Streamlit application for Databricks job execution and monitoring. + +**Features:** +- Execute configured Databricks migration jobs +- Real-time job run monitoring +- View job logs and diagnostics (multi-task support) +- Cancel running jobs +- Connection status display + +**Deployment Options:** +1. **Local**: `streamlit run databricks_job_executor/streamlit_app/app.py` +2. **Databricks App**: Deploy via Databricks Asset Bundles + +--- + +### Component 4: Evaluation & Benchmarking (`evaluation`) + +MLflow-based evaluation for comparing translation quality across models. + +**Features:** +- LLM-as-judge evaluation +- Compliance scoring (functional correctness) +- Best practices scoring (performance & documentation) +- Multi-model comparison +- MLflow experiment tracking + +**Evaluation Dimensions:** + +| Dimension | Goal | Scoring | +|-----------|------|---------| +| **Compliance** | Functional correctness | 0-100 (deduction-based) | +| **Best Practices** | Performance & documentation | 0-100 (deduction-based) | + +--- + +## Setup & Installation Guide + +This guide provides step-by-step instructions to configure the Migration Accelerator in a Databricks workspace. + +### Prerequisites + +- [ ] Databricks workspace with admin access +- [ ] Snowflake account with appropriate permissions (SYSADMIN recommended) +- [ ] Python 3.12+ environment (for local development) +- [ ] Databricks CLI installed and configured +- [ ] Access to Databricks-hosted LLMs (Foundation Model APIs) + +--- + +### Step 1: Configure Unity Catalog Resources + +Create the required Unity Catalog resources for storing artifacts. + +#### Option A: Using Databricks Asset Bundles (Recommended) + +The project includes pre-configured Unity Catalog resources in `resources/unity_catalog.yml`: + +```bash +# From project root +databricks bundle deploy -t dev +``` + +This creates: +- **Catalog**: `qubika_partner_solutions` (configurable) +- **Schema**: `migration_accelerator` +- **Volume**: `snowflake_artifacts_raw` + +#### Option B: Manual Creation + +```sql +-- Create catalog (if not exists) +CREATE CATALOG IF NOT EXISTS qubika_partner_solutions; + +-- Create schema +CREATE SCHEMA IF NOT EXISTS qubika_partner_solutions.migration_accelerator +COMMENT 'Contains Unity Catalog resources for Migration Accelerator'; + +-- Create volume for raw artifacts +CREATE VOLUME IF NOT EXISTS qubika_partner_solutions.migration_accelerator.snowflake_artifacts_raw +COMMENT 'Contains extracted Snowflake artifacts in JSON format'; + +-- Grant permissions to developers +GRANT ALL PRIVILEGES ON SCHEMA qubika_partner_solutions.migration_accelerator +TO `migration-accelerator-devs`; +``` + +--- + +### Step 2: Configure Databricks Cluster + +Create or configure a cluster with the following specifications: + +| Setting | Recommended Value | +|---------|-------------------| +| **Databricks Runtime** | 14.3 LTS or later (Python 3.12+) | +| **Node Type** | `i3.xlarge` or equivalent | +| **Cluster Mode** | Single node (testing) or multi-node (production) | +| **Access Mode** | Single user or Shared | + +#### Required Libraries + +Install via cluster libraries (Compute → Libraries → Install New): + +``` +snowflake-connector-python>=3.0.0 +snowflake-snowpark-python>=1.0.0 +langchain>=0.1.0 +langchain-core>=0.1.0 +langchain-community>=0.1.0 +langgraph>=0.1.0 +sqlglot>=20.0.0 +pydantic>=2.0.0 +python-dotenv>=1.0.0 +databricks-sdk>=0.1.0 +databricks-langchain>=0.1.0 +mlflow>=2.10.0 +``` + +Or use an init script: +```bash +#!/bin/bash +pip install snowflake-connector-python snowflake-snowpark-python langchain langchain-core langchain-community langgraph sqlglot pydantic python-dotenv databricks-sdk databricks-langchain mlflow +``` + +--- + +### Step 3: Set Up Databricks Secrets + +Create a secret scope and add required secrets. + +```bash +# Create secret scope +databricks secrets create-scope migration-accelerator + +# Add Snowflake secrets +databricks secrets put-secret migration-accelerator snowflake-account +databricks secrets put-secret migration-accelerator snowflake-user +databricks secrets put-secret migration-accelerator snowflake-password + +# Add Databricks OAuth M2M secrets (Service Principal) +databricks secrets put-secret migration-accelerator databricks-host +databricks secrets put-secret migration-accelerator databricks-client-id +databricks secrets put-secret migration-accelerator databricks-client-secret + +# Add LLM endpoint (optional - has default) +databricks secrets put-secret migration-accelerator dbx-endpoint + +# Add LangSmith secrets (optional - for tracing) +databricks secrets put-secret migration-accelerator langsmith-endpoint +databricks secrets put-secret migration-accelerator langsmith-api-key +``` + +#### Secrets Reference + +| Secret Key | Description | Required | +|------------|-------------|----------| +| `snowflake-account` | Snowflake account URL (e.g., `xy12345.us-east-1`) | Yes | +| `snowflake-user` | Snowflake username | Yes | +| `snowflake-password` | Snowflake password | Yes | +| `databricks-host` | Databricks workspace URL | Yes | +| `databricks-client-id` | OAuth M2M client ID (Service Principal) | Yes | +| `databricks-client-secret` | OAuth M2M client secret | Yes | +| `dbx-endpoint` | LLM endpoint name | No (default: `databricks-llama-4-maverick`) | +| `langsmith-endpoint` | LangSmith API endpoint | No | +| `langsmith-api-key` | LangSmith API key | No | + +--- + +### Step 4: Configure Environment Variables + +Environment variables can be set at cluster level, job level, or via `.env` file. + +#### For Databricks Jobs + +Set in job configuration under "Environment variables": + +```bash +# Core settings +ENVIRONMENT=production +DBX_ENDPOINT=databricks-llama-4-maverick + +# Processing settings +DDL_BATCH_SIZE=8 +DDL_OUTPUT_FORMAT=sql +DDL_OUTPUT_DIR=/Volumes/qubika_partner_solutions/migration_accelerator/outputs + +# Unity Catalog paths (if overriding defaults) +UC_CATALOG=qubika_partner_solutions +UC_SCHEMA=migration_accelerator +UC_RAW_VOLUME=snowflake_artifacts_raw +``` + +#### For Local Development + +Create a `.env` file from the example: + +```bash +cp env.example .env +``` + +Edit with your credentials: +```env +# Snowflake Connection +SNOWFLAKE_ACCOUNT=your-account.snowflakecomputing.com +SNOWFLAKE_USER=your_username +SNOWFLAKE_PASSWORD=your_password +SNOWFLAKE_DATABASE=your_database +SNOWFLAKE_SCHEMA=your_schema +SNOWFLAKE_ROLE=SYSADMIN +SNOWFLAKE_WAREHOUSE=COMPUTE_WH + +# Databricks Connection (OAuth M2M - Service Principal) +DATABRICKS_HOST=https://your-workspace.cloud.databricks.com +DATABRICKS_CLIENT_ID=your_client_id +DATABRICKS_CLIENT_SECRET=your_client_secret + +# LLM Configuration +DBX_ENDPOINT=databricks-llama-4-maverick +``` + +--- + +### Step 5: Deploy Databricks Jobs + +The project uses Databricks Asset Bundles for deployment. + +#### Job Configuration + +The main ingestion job (`resources/jobs.yml`) includes 4 tasks: + +```mermaid +graph LR + A[snowflake_ingestion_task] --> B[snowflake_ingestion_validation_task] + A --> C[grant_flattening_task] + C --> D[artifact_translation_task] +``` + +| Task | Entry Point | Description | +|------|-------------|-------------| +| `snowflake_ingestion_task` | `snowpark-reader` | Extract Snowflake metadata | +| `snowflake_ingestion_validation_task` | `snowflake-validator` | Validate extracted metadata | +| `grant_flattening_task` | `grant-transformer` | Flatten grant hierarchies | +| `artifact_translation_task` | `translation-module` | Translate artifacts to Databricks SQL | + +#### Deploy the Bundle + +```bash +# Validate configuration +databricks bundle validate + +# Deploy to development target +databricks bundle deploy -t dev + +# Run the job +databricks bundle run snowflake_ingestion_job -t dev +``` + +#### CI/CD Pipeline (Automated Deployment) + +The project includes a GitHub Actions CI/CD pipeline for automated deployment: + +**Workflow File**: `.github/workflows/databricks-asset-bundle-deploy.yml` + +**Triggers**: +- Push to `main` or `develop` branches +- Pull requests to `main` or `develop` branches +- Manual workflow dispatch + +**Pipeline Steps**: +1. Checkout code +2. Set up Python 3.12 and Poetry +3. Install project dependencies +4. Install Databricks CLI +5. Validate bundle configuration +6. Deploy bundle to dev environment +7. Run the Databricks Job Executor app +8. Display bundle summary and resources + +**Required GitHub Secrets**: +- `DATABRICKS_HOST`: Databricks workspace URL +- `DATABRICKS_CLIENT_ID`: OAuth M2M client ID +- `DATABRICKS_CLIENT_SECRET`: OAuth M2M client secret +- `DATABRICKS_CLUSTER_ID`: Cluster ID for job execution (DAB variable) +- `UC_CATALOG`: Unity Catalog name (DAB variable) +- `DEVS_GROUP`: Group name for permissions (DAB variable) + +> [!NOTE] +> The `DEVS_GROUP` (e.g., `migration-accelerator-devs`) must exist in Databricks before deployment. Create it in **Admin Console → Groups → Create Group**. + +**Authentication**: Uses OAuth machine-to-machine (M2M) authentication for secure, token-less deployment. + +**To set up CI/CD**: +1. Configure the required secrets in GitHub repository settings +2. Push to `main` or `develop` branch to trigger deployment +3. Monitor workflow progress in GitHub Actions tab + +--- + +### Step 6: Deploy Job Executor App (Optional) + +For UI-based job management: + +```bash +cd databricks_job_executor + +# Deploy the Streamlit app +databricks bundle deploy -t dev +``` + +Access the app from Databricks workspace → Apps section. + +--- + +### Step 7: Verify Configuration + +Run these verification steps: + +#### Test Snowflake Connection +```python +from snowflake.snowpark import Session + +connection_parameters = { + "account": "your-account", + "user": "your-user", + "password": "your-password", + "database": "your-database", + "schema": "your-schema" +} +session = Session.builder.configs(connection_parameters).create() +print(f"Connected to: {session.get_current_database()}") +``` + +#### Test Databricks Connection +```python +from databricks.sdk import WorkspaceClient + +w = WorkspaceClient() +print(f"Current user: {w.current_user.me().user_name}") +``` + +#### Test LLM Endpoint +```python +from databricks_langchain import ChatDatabricks + +llm = ChatDatabricks(endpoint="databricks-llama-4-maverick") +response = llm.invoke("Say hello") +print(response.content) +``` + +--- + +### Configuration Diagram + +```mermaid +graph TD + subgraph "Secrets Scope: migration-accelerator" + S1[snowflake-account] + S2[snowflake-user] + S3[snowflake-password] + S4[databricks-host] + S5[databricks-client-id] + S6[databricks-client-secret] + end + + subgraph "Environment Variables" + E1[DBX_ENDPOINT] + E2[DDL_BATCH_SIZE] + E3[DDL_OUTPUT_DIR] + end + + subgraph "Unity Catalog" + UC1[Catalog: qubika_partner_solutions] + UC2[Schema: migration_accelerator] + UC3[Volume: snowflake_artifacts_raw] + end + + subgraph "Application" + A1[DDLConfig] + A2[TranslationGraph] + end + + S1 --> A1 + S2 --> A1 + S3 --> A1 + E1 --> A1 + E2 --> A1 + A1 --> A2 + A2 --> UC3 +``` + +--- + +## Runbook for Daily Operation + +This runbook details operational workflows for running the Migration Accelerator. + +### Execution Modes + +| Mode | Use Case | Setup | +|------|----------|-------| +| **Databricks Jobs** | Production, scheduled runs | Deploy via Asset Bundles | +| **Interactive/Local** | Development, testing | Run Python scripts locally | + +--- + +### Workflow 1: Metadata Extraction + +Extract Snowflake metadata to JSON files. + +#### Via Databricks Jobs (Production) + +```bash +# Run the ingestion task +databricks jobs run-now --job-id + +# Or via bundle +databricks bundle run snowflake_ingestion_job -t dev +``` + +**Task**: `snowflake_ingestion_task` +**Entry Point**: `snowpark-reader` +**Output**: JSON files in `snowflake_artifacts_raw` volume + +#### Via Local Execution (Development) + +```bash +# Run extraction with Poetry +poetry run python snowpark.py +``` + +**Output**: `snowflake_objects_snowpark.json` in current directory + +#### Parameters + +| Parameter | Environment Variable | Default | +|-----------|---------------------|---------| +| Database | `SNOWFLAKE_DATABASE` | `LVDMS` | +| Schema | `SNOWFLAKE_SCHEMA` | `LVDMS` | +| Role | `SNOWFLAKE_ROLE` | `SYSADMIN` | +| Warehouse | `SNOWFLAKE_WAREHOUSE` | `COMPUTE_WH` | + +--- + +### Workflow 2: Grant Transformation + +Flatten grant hierarchies for easier translation. + +#### Via Databricks Jobs + +**Task**: `grant_flattening_task` +**Entry Point**: `grant-transformer` +**Depends On**: `snowflake_ingestion_task` + +#### Programmatic Usage + +```python +from migration_accelerator_package.grant_transformer import GrantTransformer + +transformer = GrantTransformer(grants_data) +flattened = transformer.flatten() +``` + +**Output**: `grants_flattened.json` + +--- + +### Workflow 3: Artifact Translation + +Translate Snowflake DDL to Databricks-compatible SQL. + +#### Via Databricks Jobs + +**Task**: `artifact_translation_task` +**Entry Point**: `translation-module` +**Depends On**: `grant_flattening_task` + +#### Via Local Execution + +**Option 1: Using Make (Recommended)** +```bash +make translate +``` + +**Option 2: Using run script** +```bash +./scripts/run_translation.sh "path/to/input/*.json" 4 sql +``` + +**Option 3: Direct Python execution** +```bash +poetry run python -m artifact_translation_package.main \ + path/to/tables.json \ + path/to/views.json \ + --batch-size 5 \ + --output ./output \ + --output-format combined +``` + +#### Parameters + +| Parameter | CLI Flag | Environment Variable | Default | +|-----------|----------|---------------------|---------| +| Batch Size | `--batch-size` | `DDL_BATCH_SIZE` | `8` | +| Output Format | `--output-format` | `DDL_OUTPUT_FORMAT` | `sql` | +| Output Directory | `--output` | `DDL_OUTPUT_DIR` | `/Volumes/.../outputs` | +| LLM Endpoint | - | `DBX_ENDPOINT` | `databricks-llama-4-maverick` | +| Temperature | - | `DDL_TEMPERATURE` | `0.1` | +| Max Tokens | - | `DDL_MAX_TOKENS` | `2000` | + +#### Output Formats + +| Format | Description | Files Created | +|--------|-------------|---------------| +| `sql` | SQL files only | `tables.sql`, `views.sql`, etc. | +| `json` | JSON results | `translation_results.json` | +| `combined` | Both SQL and JSON | All of the above | + +--- + +### Workflow 4: Validation & Evaluation + +Validate translated SQL for syntax and semantic correctness. + +#### Automatic Validation + +Validation runs automatically after translation: +- **SQLGlot**: Syntax validation for all artifacts +- **LLM Evaluation**: Semantic validation for complex artifacts (procedures, pipes) + +#### Manual Evaluation via Benchmark + +```bash +python3 run_local_benchmark.py \ + --artifact-type tables \ + --models databricks-llama-4-maverick databricks-gemini-2-5-flash +``` + +See [Evaluation & Benchmarking](#evaluation--benchmarking) for details. + +--- + +### Output Directory Structure + +``` +output_directory/ +├── databases.sql # Database DDL +├── schemas.sql # Schema DDL +├── tables.sql # Table DDL +├── views.sql # View DDL +├── procedures.sql # Procedure DDL +├── functions.sql # UDF DDL +├── stages.sql # Stage DDL +├── streams.sql # Stream DDL +├── pipes.sql # Pipe DDL +├── roles.sql # Role DDL +├── grants.sql # Grant statements +├── translation_results.json # Full results with metadata +├── results_summary.json # Summary statistics +└── evaluation_results/ + └── evaluation_batch_*.json # Validation results per batch +``` + +--- + +### Common Workflow Paths + +| Workflow | Steps | Use Case | +|----------|-------|----------| +| **Full Migration** | Extract → Validate → Transform → Translate → Validate | Complete migration | +| **Incremental Update** | Extract (subset) → Translate → Validate | Add new tables/views | +| **Translation Only** | Translate (existing JSON) → Validate | Re-run with different LLM | +| **Benchmark** | Translate → Benchmark multiple models | Compare model quality | + +--- + +## Evaluation & Benchmarking + +The evaluation module uses MLflow and LLM-as-judge to assess translation quality. + +### Overview + +The benchmark evaluates translations across two dimensions: + +```mermaid +graph TD + A[Translated SQL] --> B{Evaluation} + B --> C[Compliance Score
0-100] + B --> D[Best Practices Score
0-100] + C --> E[MLflow Tracking] + D --> E + E --> F[Comparison Dashboard] +``` + +### Running Benchmarks + +#### Command Line + +```bash +# Basic: Single artifact type +python3 run_local_benchmark.py --artifact-type tables + +# Compare multiple models +python3 run_local_benchmark.py \ + --artifact-type views \ + --models databricks-llama-4-maverick databricks-meta-llama-3-1-70b-instruct + +# Custom input data +python3 run_local_benchmark.py \ + --artifact-type tables \ + --dataset-source /path/to/your/metadata.json + +# Adjust batch size +python3 run_local_benchmark.py --batch-size 10 +``` + +#### Interactive Notebook + +Open `src/artifact_translation_package/evaluation/benchmark_interactive.ipynb`: + +```python +# Configure models +TRANSLATION_MODELS = ["databricks-llama-4-maverick", "databricks-gemini-2-5-flash"] +ARTIFACT_TYPE = "tables" + +# Run benchmark and visualize results +``` + +### Scoring System + +#### Compliance Score (0-100) + +Measures functional correctness. Starting from 100, points are deducted for issues: + +| Issue | Deduction | Description | +|-------|-----------|-------------| +| Invalid syntax | 100 pts | Score becomes 0 | +| Missing `USING DELTA` | 20 pts | Tables should specify Delta format | +| Legacy types (`VARCHAR`, `TEXT`) | 10 pts | Should use `STRING` | +| Missing 3-level naming | 15 pts | Should use `catalog.schema.table` | + +#### Best Practices Score (0-100) + +Measures optimization and documentation quality: + +| Issue | Deduction | Description | +|-------|-----------|-------------| +| Missing `CLUSTER BY` | 30 pts | Liquid Clustering recommended | +| Missing table properties | 20 pts | `autoOptimize`, etc. | +| Missing table `COMMENT` | 25 pts | Documentation | +| Missing column `COMMENT`s | 25 pts | Documentation | + +### MLflow Integration + +The benchmark automatically logs to MLflow: + +- **Experiment Name**: `sql-translation-benchmark` (configurable) +- **Run Parameters**: Model name, temperature, artifact type +- **Metrics**: `avg_compliance`, `avg_best_practices`, `syntax_valid_pct` +- **Artifacts**: `issues_table.json`, `top_issues_summary.txt` +- **Tags**: Issue categories for searchability + +### Metrics Reference + +| Metric | Range | Description | +|--------|-------|-------------| +| `avg_compliance` | 0-100 | Mean functional correctness score | +| `avg_best_practices` | 0-100 | Mean optimization/documentation score | +| `compliant_pct` | 0-100% | Percentage of statements scoring ≥70 | +| `syntax_valid_pct` | 0-100% | Percentage with valid Databricks SQL syntax | + +### Configuring Judge Model + +The judge model (used for evaluation) can be configured separately from translation models: + +```python +# In ddl_config.py +"benchmark": { + "judge_endpoint": "databricks-llama-4-maverick", + ... +} +``` + +Or via environment: +```bash +export DDL_JUDGE_ENDPOINT=databricks-llama-4-maverick +``` + +--- + +## Troubleshooting & Support + +### Common Errors + +#### Snowflake Connection Errors + +**Error**: `Snowflake connection failed` or `Invalid credentials` + +**Causes & Solutions**: +| Cause | Solution | +|-------|----------| +| Invalid credentials | Verify `SNOWFLAKE_USER` and `SNOWFLAKE_PASSWORD` in secrets | +| Account URL format | Use format `xy12345.us-east-1` (not full URL) | +| Network issues | Check VPC/firewall allows Snowflake access | +| Role permissions | Ensure role has `SYSADMIN` or equivalent access | + +**Verification**: +```python +from snowflake.snowpark import Session +session = Session.builder.configs({ + "account": "your-account", + "user": "your-user", + "password": "your-password" +}).create() +``` + +--- + +#### Databricks Authentication Errors + +**Error**: `Databricks authentication failed` or `401 Unauthorized` + +**Solutions**: +1. Verify OAuth M2M credentials (client_id and client_secret) are correct +2. Verify Service Principal has required permissions: + - Workspace access + - Cluster access + - Jobs access (for job execution) + - Unity Catalog access +3. Check secrets are configured correctly: + ```bash + databricks secrets get-secret migration-accelerator databricks-client-id + databricks secrets get-secret migration-accelerator databricks-client-secret + ``` +4. Ensure the Service Principal is added to the workspace + +--- + +#### LLM Endpoint Errors + +**Error**: `Model not found` or `Endpoint unavailable` + +**Solutions**: +1. Verify endpoint name in Databricks Model Serving UI +2. Check endpoint status (must be "Ready") +3. Verify permissions to access the endpoint +4. Try alternative endpoint: `databricks-meta-llama-3-1-70b-instruct` + +**Common Endpoints**: +- `databricks-llama-4-maverick` +- `databricks-meta-llama-3-1-70b-instruct` +- `databricks-gemini-2-5-flash` + +--- + +#### Unity Catalog Access Errors + +**Error**: `Access denied to catalog/schema/volume` + +**Solutions**: +1. Grant required permissions: +```sql +GRANT ALL PRIVILEGES ON CATALOG qubika_partner_solutions TO `your-user`; +GRANT ALL PRIVILEGES ON SCHEMA migration_accelerator TO `your-user`; +``` +2. Verify catalog exists and is accessible +3. Check volume path format: `/Volumes////` + +--- + +#### Translation Failures + +**Error**: `Translation failed for artifact X` + +**Diagnostic Steps**: +1. Check `translation_results.json` for error details +2. Review `evaluation_results/` for validation failures +3. Check input JSON format matches expected schema +4. Verify artifact type is supported + +**Common Issues**: +| Issue | Solution | +|-------|----------| +| Empty DDL | Check Snowflake extraction result | +| Complex procedure | May need manual adjustment | +| Unsupported syntax | See [Release Notes](#release-notes) for limitations | + +--- + +#### Memory/Timeout Issues + +**Error**: `Out of memory` or `Timeout exceeded` + +**Solutions**: +1. Reduce batch size: `DDL_BATCH_SIZE=4` +2. Increase cluster memory/nodes +3. Increase timeout: `DDL_TIMEOUT=600` +4. Process fewer artifacts per run + +--- + +### Reading Logs + +#### Log Locations + +| Environment | Location | +|-------------|----------| +| Databricks Jobs | Job run details → Logs tab | +| Local execution | Console output + log files | +| Cluster logs | DBFS: `/cluster-logs//` | + +#### Log Levels + +Set via `DDL_LOG_LEVEL` or `LOG_LEVEL`: +- `DEBUG`: Detailed debugging information +- `INFO`: General progress information (default) +- `WARN`: Warning messages +- `ERROR`: Error messages only + +#### Log Format + +```json +{ + "timestamp": "2026-01-05T10:00:00Z", + "level": "INFO", + "run_id": "abc123", + "message": "Processing batch 1/10 for tables", + "artifact_type": "tables", + "batch_size": 8 +} +``` + +--- + +### Reviewing Validation Reports + +**Location**: `/evaluation_results/` + +**Report Structure**: +```json +{ + "batch": { + "artifact_type": "tables", + "evaluated_count": 5 + }, + "validation": { + "total_statements": 5, + "valid_statements": 4, + "invalid_statements": 1, + "validation_method": "syntax_validator" + }, + "results": [ + { + "artifact_name": "my_table", + "is_valid": true, + "syntax_errors": [], + "warnings": [] + } + ], + "timestamp": "20260105_100000" +} +``` + +--- + +### Identifying Problematic Artifacts + +1. **Check summary**: Review `results_summary.json` for failed count +2. **Filter failures**: Look for `"is_valid": false` in evaluation results +3. **Review errors**: Check `syntax_errors` array for specific issues +4. **Check original**: Compare translated SQL against original Snowflake DDL + +--- + +### Escalation Paths + +1. **Self-Service**: Check this documentation and FAQs +2. **Team Support**: Contact engineering team with: + - Run ID + - Error logs + - Input artifacts + - Environment details +3. **Critical Issues**: Response within 24 hours for production blockers + +**Information to Include**: +- Run ID or Job Run ID +- Error message and stack trace +- Artifact type(s) affected +- Environment (Databricks/Local) +- Configuration settings + +--- + +## Client Onboarding Material + + + +### Client Responsibilities vs Accelerator Behavior + +| Responsibility | Client | Accelerator | +|----------------|--------|-------------| +| **Snowflake Access** | Provide credentials | Connect and extract | +| **Databricks Setup** | Provide workspace | Use for processing | +| **Input Accuracy** | Ensure source DDL is valid | Extract as-is | +| **Translation** | Review outputs | Generate Databricks SQL | +| **Validation** | Final review and testing | Syntax and semantic checks | +| **Deployment** | Execute in target environment | Produce ready-to-run SQL | +| **Unsupported Features** | Manual handling | Flag in reports | + +--- + +### Overview Content + +#### What is Migration Accelerator? +- Automated tool for Snowflake → Databricks migration +- Uses LLM-powered translation for complex artifacts +- Provides end-to-end workflow from extraction to deployment-ready SQL + +#### Key Features +- Supports 15+ Snowflake artifact types +- Built-in validation with SQLGlot and LLM evaluation +- Databricks-native integration (Unity Catalog, Jobs, Model Serving) +- Comprehensive reporting and observability + +#### Workflow Summary +``` +Snowflake → Extract Metadata → Transform → Translate with LLM → Validate → Databricks SQL +``` + +--- + +### FAQs + +**Q: What artifacts are supported?** +A: Tables, views, procedures, functions (UDFs), sequences, stages, file formats, tasks, streams, pipes, roles, grants, tags, comments, masking policies, and external locations. + +**Q: How long does migration take?** +A: Depends on volume: +- Small (< 100 objects): Minutes +- Medium (100-1000 objects): 30-60 minutes +- Large (1000+ objects): Hours + +**Q: What if translation fails?** +A: Check logs and validation reports. Common resolutions: +1. Review original DDL for unsupported syntax +2. Reduce complexity (split large objects) +3. Manual adjustment for edge cases + +**Q: Can I customize the LLM prompts?** +A: Yes, prompts are in `src/artifact_translation_package/prompts/`. Modify for specific requirements. + +**Q: Which LLM models are supported?** +A: Any Databricks-hosted model via Model Serving, including: +- `databricks-llama-4-maverick` (default) +- `databricks-meta-llama-3-1-70b-instruct` +- `databricks-gemini-2-5-flash` + +**Q: How do I handle unsupported features?** +A: The accelerator flags unsupported constructs. Manual translation required for: +- Snowflake-specific UDF syntax +- Complex procedural logic +- External integrations (AWS/Azure specific stages) + +**Q: Is there production support?** +A: Support expectations: +- Critical issues: Response within 24 hours +- Standard issues: Response within 48-72 hours +- Provide run ID and logs for faster resolution + +--- + +### Glossary + +| Term | Definition | +|------|------------| +| **Artifact** | A database object (table, view, procedure, etc.) | +| **DDL** | Data Definition Language - SQL statements for creating/modifying schema objects | +| **Delta / Delta Lake** | Databricks' optimized storage format with ACID transactions | +| **LangGraph** | Framework for building stateful, multi-step LLM workflows | +| **LLM** | Large Language Model - AI model for natural language and code generation | +| **MLflow** | Platform for ML lifecycle management, used for tracking experiments | +| **Snowflake** | Cloud data platform - the source system for migration | +| **Snowpark** | Snowflake's Python API for data processing | +| **SQLGlot** | SQL parser/transpiler used for syntax validation | +| **Unity Catalog** | Databricks' unified governance solution for data and AI | +| **Volume** | Unity Catalog managed storage, similar to DBFS | + +--- + +## Current Capabilities & Limitations + +This section describes the current state of the Migration Accelerator, including what is supported, what has limitations, and what features are being planned. + +### Current Features + +- **End-to-End Migration Pipeline** + - Snowflake metadata extraction via Snowpark API + - LLM-powered translation to Databricks SQL + - Automated syntax validation with SQLGlot + - LLM-based semantic evaluation + +- **Comprehensive Artifact Support** + - Core: Tables, Views, Schemas, Databases + - Programmatic: Procedures, Functions, Sequences + - Data Pipeline: Stages, Streams, Pipes, Tasks, File Formats + - Security: Roles, Grants (Privileges, Hierarchy, Future) + - Governance: Tags, Comments, Masking Policies, External Locations + +- **Databricks Integration** + - Unity Catalog native storage + - Databricks Asset Bundles deployment + - Model Serving LLM integration + - MLflow experiment tracking + - GitHub Actions CI/CD pipeline for automated deployment + +- **Observability** + - Structured logging with run IDs + - Validation reports per batch + - Summary statistics + - LangSmith tracing support (optional) + +- **User Interface** + - Streamlit Job Executor app + - Real-time job monitoring + - Log viewing and diagnostics + +- **Benchmarking** + - Multi-model comparison + - Compliance and best practices scoring + - MLflow integration for tracking + +### Artifact Support Matrix + +| Artifact Type | Extraction | Translation | Validation | Notes | +|---------------|------------|-------------|------------|-------| +| Tables | ✅ | ✅ Full | ✅ SQLGlot | Delta format with liquid clustering | +| Views | ✅ | ✅ Full | ✅ SQLGlot | Materialized view conversion | +| Schemas | ✅ | ✅ Full | ✅ SQLGlot | Schema creation | +| Databases | ✅ | ✅ Full | ✅ SQLGlot | Catalog mapping | +| Sequences | ✅ | ✅ Full | ✅ SQLGlot | Identity column conversion | +| Streams | ✅ | ✅ Full | ✅ SQLGlot | CDC pattern translation | +| Tags | ✅ | ✅ Full | ✅ SQLGlot | Tag propagation | +| Comments | ✅ | ✅ Full | ✅ SQLGlot | Comment preservation | +| Masking Policies | ✅ | ✅ Full | ✅ SQLGlot | Row-level security | +| Procedures | ✅ | ⚠️ Available | ❌ Skip | Complex logic - manual review required | +| Functions (UDFs) | ✅ | ⚠️ Available | ❌ Skip | Some syntax limitations - manual review | +| Stages | ✅ | ⚠️ Available | ❌ Skip | Cloud-specific - manual review | +| Pipes | ✅ | ⚠️ Available | ❌ Skip | Auto Loader conversion - manual review | +| Roles | ✅ | ⚠️ Available | ❌ Skip | Unity Catalog groups - manual review | +| Grants | ✅ | ⚠️ Available | ❌ Skip | Permission model differences - manual | +| External Locations | ✅ | ⚠️ Available | ⚠️ Manual | Cloud-specific configuration | +| Tasks | ✅ | ❌ Not Yet | N/A | Extraction only - planned enhancement | +| File Formats | ✅ | ❌ Not Yet | N/A | Extraction only - planned enhancement | + +**Legend**: +- ✅ Full = Fully automated and validated +- ⚠️ Available/Skip = Translation available but validation skipped, requires manual review +- ❌ = Not available or skipped + +### Known Limitations + +1. **Extraction-Only Artifacts**: Tasks and File Formats are extracted but not yet translated - requires manual conversion +2. **Validation-Skipped Artifacts**: Procedures, UDFs, Stages, Pipes, Roles, and Grants have translation available but validation is skipped - requires manual review +3. **Complex Procedural Logic**: Procedures with advanced Snowflake-specific constructs may require manual adjustment +4. **External Stages**: AWS/Azure/GCP-specific stages need cloud configuration review +5. **Real-Time Migration**: This tool handles metadata/DDL only, not data migration +6. **Large Datasets**: Very large schemas (10,000+ objects) may require batching strategies +7. **LLM Variability**: Translation quality depends on model capability and temperature settings + +### Unsupported Features + +- **Tasks**: Snowflake Tasks → Databricks Workflows (extraction only, manual translation required) +- **File Formats**: Snowflake File Formats (extraction only, manual mapping to Databricks) +- Snowflake external functions (require separate integration) +- Time travel queries (need adaptation for Delta time travel) +- Some advanced UDF syntax (Java/Scala UDFs in Snowflake) +- Reader accounts (Snowflake-specific) +- Data shares (require Databricks-specific setup) + + +--- + +## Configuration Reference + +### Environment Variables + +> [!IMPORTANT] +> Variables marked as **Required** have no defaults in the code and must be explicitly set in your cluster configuration or `.env` file. + +#### Core Settings + +| Variable | Description | Default | Required | +|----------|-------------|---------|----------| +| `ENVIRONMENT` | Execution environment | `development` | No | +| `DBX_ENDPOINT` | LLM endpoint name | `databricks-llama-4-maverick` | No | +| `LOG_LEVEL` | Logging level | `INFO` | No | + +#### Snowflake Connection + +| Variable | Description | Default | Required | +|----------|-------------|---------|----------| +| `SNOWFLAKE_ACCOUNT` | Account identifier | - | **Yes** | +| `SNOWFLAKE_USER` | Username | - | **Yes** | +| `SNOWFLAKE_PASSWORD` | Password | - | **Yes** | +| `SNOWFLAKE_DATABASE` | Database name | - | **Yes** | +| `SNOWFLAKE_SCHEMA` | Schema name | - | **Yes** | +| `SNOWFLAKE_ROLE` | Role | `SYSADMIN` | No | +| `SNOWFLAKE_WAREHOUSE` | Warehouse | `COMPUTE_WH` | No | + +#### Databricks Connection + +| Variable | Description | Default | Required | +|----------|-------------|---------|----------| +| `DATABRICKS_HOST` | Workspace URL | - | **Yes** | +| `DATABRICKS_CLIENT_ID` | OAuth M2M client ID | - | **Yes** | +| `DATABRICKS_CLIENT_SECRET` | OAuth M2M client secret | - | **Yes** | +| `DATABRICKS_JOB_ID` | Job ID for executor | - | No | + +#### Unity Catalog (Required) + +| Variable | Description | Default | Required | +|----------|-------------|---------|----------| +| `UC_CATALOG` | Catalog name | - | **Yes** | +| `UC_SCHEMA` | Schema name | - | **Yes** | +| `UC_RAW_VOLUME` | Raw volume name | `snowflake_artifacts_raw` | No | +| `SECRETS_SCOPE` | Databricks secrets scope | `migration-accelerator` | No | + +#### Processing Settings + +| Variable | Description | Default | +|----------|-------------|---------| +| `DDL_BATCH_SIZE` | Artifacts per batch | `8` | +| `DDL_MAX_CONCURRENT` | Concurrent batches | `5` | +| `DDL_TIMEOUT` | Timeout in seconds | `300` | +| `DDL_TEMPERATURE` | LLM temperature | `0.1` | +| `DDL_MAX_TOKENS` | Max LLM tokens | `2000` | + +#### Output Settings + +| Variable | Description | Default | +|----------|-------------|---------| +| `DDL_OUTPUT_DIR` | Output directory | - (generated from UC config) | +| `DDL_OUTPUT_FORMAT` | Format: sql, json, combined | `sql` | +| `DDL_INCLUDE_METADATA` | Include metadata | `true` | +| `DDL_COMPRESS_OUTPUT` | Compress files | `false` | +| `LOCAL_DBFS_MOUNT` | Local DBFS mapping | `./ddl_output` | + +#### Observability + +| Variable | Description | Default | +|----------|-------------|---------| +| `DDL_VERBOSE_LOGGING` | Verbose logs | `true` | +| `DDL_DEBUG` | Debug mode | `false` | +| `LANGSMITH_TRACING` | Enable tracing | `true` | +| `LANGSMITH_PROJECT` | Project name | `databricks-migration-accelerator` | + +--- + +### Databricks Secrets + +| Secret Key | Scope | Description | +|------------|-------|-------------| +| `snowflake-account` | `migration-accelerator` | Snowflake account URL | +| `snowflake-user` | `migration-accelerator` | Snowflake username | +| `snowflake-password` | `migration-accelerator` | Snowflake password | +| `databricks-host` | `migration-accelerator` | Databricks workspace URL | +| `databricks-client-id` | `migration-accelerator` | OAuth M2M client ID | +| `databricks-client-secret` | `migration-accelerator` | OAuth M2M client secret | +| `dbx-endpoint` | `migration-accelerator` | LLM endpoint (optional) | +| `langsmith-endpoint` | `migration-accelerator` | LangSmith URL (optional) | +| `langsmith-api-key` | `migration-accelerator` | LangSmith API key (optional) | + +--- + +### LLM Configuration by Node + +| Node | Temperature | Max Tokens | Purpose | +|------|-------------|------------|---------| +| `smart_router` | 0.1 | 2000 | Route artifacts to translators | +| `database_translator` | 0.1 | 2000 | Translate databases | +| `schemas_translator` | 0.1 | 2000 | Translate schemas | +| `tables_translator` | 0.2 | 4000 | Translate tables | +| `views_translator` | 0.2 | 4000 | Translate views | +| `procedures_translator` | 0.2 | 4000 | Translate procedures | +| `udfs_translator` | 0.2 | 4000 | Translate UDFs | +| `evaluator` | 0.1 | 2000 | Evaluate translations | + +--- + +*This documentation is maintained in the repository at `docs/combined_documentation.md`. For the latest version, refer to the source repository.* \ No newline at end of file diff --git a/env.example b/env.example index 52d5fd5..4f80fa0 100644 --- a/env.example +++ b/env.example @@ -10,9 +10,9 @@ SNOWFLAKE_USER=your_username # Password authentication # SNOWFLAKE_PASSWORD=your_password -# Database and schema context -SNOWFLAKE_DATABASE=DATA_MIGRATION_DB -SNOWFLAKE_SCHEMA=DATA_MIGRATION_SCHEMA +# Database and schema context (REQUIRED - no defaults) +SNOWFLAKE_DATABASE=LVDMS +SNOWFLAKE_SCHEMA=LVDMS # Warehouse (optional - only needed if you want to specify a warehouse) # Leave this commented out if you don't have a warehouse or want to use default @@ -23,3 +23,82 @@ SNOWFLAKE_ROLE=your_role # Region (optional - for Snowpark, if your account requires explicit region) # SNOWFLAKE_REGION=us-east-1 +# ============================================================================== +# DATABRICKS CONNECTION (OAuth M2M - Service Principal) +# ============================================================================== + +# Databricks workspace URL +DATABRICKS_HOST=https://your-workspace.cloud.databricks.com + +# OAuth Machine-to-Machine authentication +DATABRICKS_CLIENT_ID=your_client_id +DATABRICKS_CLIENT_SECRET=your_client_secret + +# ============================================================================== +# UNITY CATALOG CONFIGURATION (REQUIRED - no defaults in code) +# ============================================================================== + +# Unity Catalog name +UC_CATALOG=qubika_partner_solutions + +# Unity Catalog schema +UC_SCHEMA=migration_accelerator + +# Unity Catalog volume for raw artifacts +UC_RAW_VOLUME=snowflake_artifacts_raw + +# ============================================================================== +# SECRETS CONFIGURATION +# ============================================================================== + +# Databricks secrets scope name +SECRETS_SCOPE=migration-accelerator + +# ============================================================================== +# LLM CONFIGURATION +# ============================================================================== + +# Databricks Model Serving endpoint for translation +DBX_ENDPOINT=databricks-llama-4-maverick + +# LLM parameters (optional) +# DDL_TEMPERATURE=0.1 +# DDL_MAX_TOKENS=2000 + +# ============================================================================== +# PROCESSING CONFIGURATION +# ============================================================================== + +# Batch size for artifact processing +DDL_BATCH_SIZE=8 + +# Output format: sql, json, or combined +DDL_OUTPUT_FORMAT=sql + +# Output directory (REQUIRED - set to your Volume path) +DDL_OUTPUT_DIR=/Volumes/qubika_partner_solutions/migration_accelerator/outputs + +# Local DBFS mount for local development +LOCAL_DBFS_MOUNT=./ddl_output + +# ============================================================================== +# OBSERVABILITY (optional) +# ============================================================================== + +# LangSmith tracing +LANGSMITH_TRACING=true +LANGSMITH_PROJECT=databricks-migration-accelerator +# LANGSMITH_ENDPOINT=https://api.smith.langchain.com +# LANGSMITH_API_KEY=your_langsmith_api_key + +# Logging level +LOG_LEVEL=INFO +DDL_VERBOSE_LOGGING=true +DDL_DEBUG=false + +# ============================================================================== +# DATABRICKS JOB EXECUTOR (optional) +# ============================================================================== + +# Job ID to execute via the Job Executor UI +# DATABRICKS_JOB_ID=123456 diff --git a/resources/jobs.yml b/resources/jobs.yml index 3e48a62..8b3700c 100644 --- a/resources/jobs.yml +++ b/resources/jobs.yml @@ -1,3 +1,9 @@ +variables: + cluster_id: + description: "Databricks cluster ID for job execution (required)" + devs_group: + description: "Group name for permissions (required)" + resources: jobs: snowflake_ingestion_job: @@ -5,13 +11,13 @@ resources: permissions: - level: CAN_MANAGE_RUN - group_name: migration-accelerator-devs + group_name: ${var.devs_group} tasks: - task_key: snowflake_ingestion_task - # Run on existing cluster "data-migration" - existing_cluster_id: 1214-215558-eghymads + # Run on existing cluster configured via cluster_id variable + existing_cluster_id: ${var.cluster_id} libraries: - whl: "../dist/*.whl" python_wheel_task: @@ -21,7 +27,7 @@ resources: - task_key: snowflake_ingestion_validation_task depends_on: - task_key: snowflake_ingestion_task - existing_cluster_id: 1214-215558-eghymads + existing_cluster_id: ${var.cluster_id} libraries: - whl: "../dist/*.whl" python_wheel_task: @@ -31,7 +37,7 @@ resources: - task_key: grant_flattening_task depends_on: - task_key: snowflake_ingestion_task - existing_cluster_id: 1214-215558-eghymads + existing_cluster_id: ${var.cluster_id} libraries: - whl: "../dist/*.whl" python_wheel_task: @@ -41,7 +47,7 @@ resources: - task_key: artifact_translation_task depends_on: - task_key: grant_flattening_task - existing_cluster_id: 1214-215558-eghymads + existing_cluster_id: ${var.cluster_id} python_wheel_task: entry_point: translation-module package_name: migration_accelerator_package @@ -49,14 +55,12 @@ resources: - task_key: migration_report_task depends_on: - task_key: artifact_translation_task - existing_cluster_id: 1214-215558-eghymads + existing_cluster_id: ${var.cluster_id} libraries: - whl: "../dist/*.whl" python_wheel_task: entry_point: migration-report package_name: migration_accelerator_package - # Using existing cluster "data-migration" (ID: 1214-215558-eghymads); no job_clusters or spark_version needed. - timeout_seconds: 3600 max_concurrent_runs: 1 \ No newline at end of file diff --git a/resources/unity_catalog.yml b/resources/unity_catalog.yml index ec78cac..4685be8 100644 --- a/resources/unity_catalog.yml +++ b/resources/unity_catalog.yml @@ -1,7 +1,8 @@ variables: catalog_name: - description: "The Unity Catalog in which the project will be contained." - default: "qubika_partner_solutions" + description: "The Unity Catalog name (required - no default)" + devs_group: + description: "Group name for permissions (required - no default)" resources: schemas: @@ -10,7 +11,7 @@ resources: catalog_name: ${var.catalog_name} comment: Contains the Unity Catalog Resources pertaining to the Databricks Governance Migration Agent Accelerator grants: - - principal: migration-accelerator-devs + - principal: ${var.devs_group} privileges: - ALL_PRIVILEGES @@ -20,6 +21,3 @@ resources: schema_name: migration_accelerator catalog_name: ${var.catalog_name} comment: Contains the metadata of the extracted Snowflake artifacts in a raw JSON format. - - - diff --git a/src/artifact_translation_package/config/config_validator.py b/src/artifact_translation_package/config/config_validator.py new file mode 100644 index 0000000..3df6805 --- /dev/null +++ b/src/artifact_translation_package/config/config_validator.py @@ -0,0 +1,202 @@ +""" +Configuration validation utilities. + +Provides functions to validate required environment variables and raise +clear error messages when they are missing. +""" + +import os +import logging +from typing import List, Optional, Dict, Any + +logger = logging.getLogger(__name__) + +# Required environment variables for different components +REQUIRED_SNOWFLAKE_VARS = [ + "SNOWFLAKE_DATABASE", + "SNOWFLAKE_SCHEMA", +] + +REQUIRED_UC_VARS = [ + "UC_CATALOG", + "UC_SCHEMA", +] + +REQUIRED_SECRETS_VARS = [ + "SNOWFLAKE_ACCOUNT", + "SNOWFLAKE_USER", + "SNOWFLAKE_PASSWORD", +] + + +class ConfigurationError(Exception): + """Raised when required configuration is missing.""" + pass + + +def get_required_env(var_name: str, component: str = "application") -> str: + """ + Get a required environment variable, raising a clear error if not set. + + Args: + var_name: Name of the environment variable + component: Component name for error message context + + Returns: + The environment variable value + + Raises: + ConfigurationError: If the variable is not set or empty + """ + value = os.environ.get(var_name, "").strip() + if not value: + error_msg = ( + f"Missing required environment variable: {var_name}\n" + f"Component: {component}\n" + f"Please set this variable in your cluster configuration or .env file.\n" + f"See env.example for reference." + ) + logger.error(error_msg) + raise ConfigurationError(error_msg) + return value + + +def get_env_with_fallback(var_name: str, fallback: str, component: str = "application") -> str: + """ + Get environment variable with fallback, logging a warning if using fallback. + + Args: + var_name: Name of the environment variable + fallback: Fallback value if not set + component: Component name for logging context + + Returns: + The environment variable value or fallback + """ + value = os.environ.get(var_name, "").strip() + if not value: + if fallback: + logger.warning( + f"Environment variable {var_name} not set for {component}, " + f"using fallback: '{fallback}'" + ) + return fallback + else: + logger.warning( + f"Environment variable {var_name} not set for {component}, " + f"no fallback available - this may cause errors" + ) + return "" + return value + + +def validate_snowflake_config() -> Dict[str, str]: + """ + Validate and return Snowflake configuration. + + Returns: + Dict with validated Snowflake config + + Raises: + ConfigurationError: If required variables are missing + """ + missing = [] + config = {} + + for var in REQUIRED_SNOWFLAKE_VARS: + value = os.environ.get(var, "").strip() + if not value: + missing.append(var) + else: + config[var] = value + + if missing: + error_msg = ( + f"Missing required Snowflake configuration:\n" + f" - {chr(10).join(missing)}\n\n" + f"Please set these environment variables in your cluster configuration.\n" + f"See env.example for reference." + ) + logger.error(error_msg) + raise ConfigurationError(error_msg) + + logger.info(f"Snowflake config validated: database={config.get('SNOWFLAKE_DATABASE')}, " + f"schema={config.get('SNOWFLAKE_SCHEMA')}") + return config + + +def validate_unity_catalog_config() -> Dict[str, str]: + """ + Validate and return Unity Catalog configuration. + + Returns: + Dict with validated UC config + + Raises: + ConfigurationError: If required variables are missing + """ + missing = [] + config = {} + + for var in REQUIRED_UC_VARS: + value = os.environ.get(var, "").strip() + if not value: + missing.append(var) + else: + config[var] = value + + # RAW_VOLUME has a default + config["UC_RAW_VOLUME"] = os.environ.get("UC_RAW_VOLUME", "snowflake_artifacts_raw") + + if missing: + error_msg = ( + f"Missing required Unity Catalog configuration:\n" + f" - {chr(10).join(missing)}\n\n" + f"Please set these environment variables in your cluster configuration.\n" + f"See env.example for reference." + ) + logger.error(error_msg) + raise ConfigurationError(error_msg) + + logger.info(f"Unity Catalog config validated: catalog={config.get('UC_CATALOG')}, " + f"schema={config.get('UC_SCHEMA')}") + return config + + +def get_uc_volume_path_validated() -> str: + """ + Get the Unity Catalog volume path with validation. + + Returns: + Validated volume path + + Raises: + ConfigurationError: If required UC variables are missing + """ + config = validate_unity_catalog_config() + return f"/Volumes/{config['UC_CATALOG']}/{config['UC_SCHEMA']}/{config['UC_RAW_VOLUME']}" + + +def log_config_summary(): + """Log a summary of current configuration for debugging.""" + env_vars = [ + "UC_CATALOG", "UC_SCHEMA", "UC_RAW_VOLUME", + "SNOWFLAKE_DATABASE", "SNOWFLAKE_SCHEMA", + "SECRETS_SCOPE", "DBX_ENDPOINT", "DDL_OUTPUT_DIR" + ] + + logger.info("=" * 60) + logger.info("Configuration Summary:") + logger.info("=" * 60) + + for var in env_vars: + value = os.environ.get(var, "") + if value: + # Mask sensitive values + if "SECRET" in var or "PASSWORD" in var: + value = "***" + logger.info(f" {var}: {value}") + else: + logger.warning(f" {var}: NOT SET") + + logger.info("=" * 60) diff --git a/src/artifact_translation_package/config/constants.py b/src/artifact_translation_package/config/constants.py index cf0a0c5..1c6df91 100644 --- a/src/artifact_translation_package/config/constants.py +++ b/src/artifact_translation_package/config/constants.py @@ -1,10 +1,17 @@ from enum import Enum class UnityCatalogConfig(Enum): - """Unity Catalog configuration for translation package""" - CATALOG = "qubika_partner_solutions" - SCHEMA = "migration_accelerator" - RAW_VOLUME = "snowflake_artifacts_raw" + """Unity Catalog configuration for translation package. + + These values are placeholders. Set via environment variables: + - UC_CATALOG: Unity Catalog name (required) + - UC_SCHEMA: Schema name (required) + - UC_RAW_VOLUME: Volume name (optional, has default) + """ + # Placeholders - must be overridden via env vars + CATALOG = "" # Set UC_CATALOG env var + SCHEMA = "" # Set UC_SCHEMA env var + RAW_VOLUME = "snowflake_artifacts_raw" # Can keep default class LangGraphConfig(Enum): @@ -13,13 +20,10 @@ class LangGraphConfig(Enum): ENVIRONMENT = "development" #DDL Settings DDL_BATCH_SIZE = 8 - # Default outputs for Databricks should go to a Volume mounted path - # Use segmented path segments: /Volumes//// - DDL_OUTPUT_DIR = "/Volumes/qubika_partner_solutions/migration_accelerator/outputs" + # DDL_OUTPUT_DIR must be set via env var (no default - depends on catalog) + DDL_OUTPUT_DIR = "" # Set DDL_OUTPUT_DIR env var # Optional: DDL Generation Settings - #DDL_CATALOG_NAME= "demo_catalog" - #DDL_SCHEMA_NAME=" bronze" DDL_TEMPERATURE=0.1 DDL_MAX_TOKENS=2000 DDL_MAX_CONCURRENT = 5 @@ -36,18 +40,17 @@ class LangGraphConfig(Enum): # LangSmith Settings LANGSMITH_TRACING=True - LANGSMITH_PROJECT="databricks-migration-accelerator-local" - #Configured as secrets (under migration-accelerator scope): + LANGSMITH_PROJECT="databricks-migration-accelerator" + #Configured as secrets (under scope defined by SECRETS_SCOPE): #LANGSMITH_ENDPOINT #LANGSMITH_API_KEY #Lakebase Settings LAKEBASE_DATABASE = "databricks_postgres" - #Configured as secrets (under migration-accelerator scope): + #Configured as secrets: #LAKEBASE_HOST #LAKEBASE_USER #LAKEBASE_PASSWORD - #Secrets Settings + #Secrets Settings - default scope name SECRETS_SCOPE = "migration-accelerator" - diff --git a/src/artifact_translation_package/config/secrets.py b/src/artifact_translation_package/config/secrets.py index 4793ca6..7326b0c 100644 --- a/src/artifact_translation_package/config/secrets.py +++ b/src/artifact_translation_package/config/secrets.py @@ -2,11 +2,15 @@ from typing import Optional, Any import os +# Default scope name - can be overridden via SECRETS_SCOPE env var +DEFAULT_SECRETS_SCOPE = "migration-accelerator" + def get_secret(secret_name): """Retrieve secrets from Databricks secret scope""" + scope = os.getenv("SECRETS_SCOPE", DEFAULT_SECRETS_SCOPE) try: - return dbutils.secrets.get("migration-accelerator", secret_name) + return dbutils.secrets.get(scope, secret_name) except Exception as e: # Fallback to environment variables for local development return os.getenv(secret_name, "") diff --git a/src/artifact_translation_package/databricks_job.py b/src/artifact_translation_package/databricks_job.py index e79790c..237aafa 100644 --- a/src/artifact_translation_package/databricks_job.py +++ b/src/artifact_translation_package/databricks_job.py @@ -306,32 +306,67 @@ def databricks_entrypoint(): """ Simplified entry point for Databricks jobs using configuration constants. No CLI arguments required - uses constants directly. - """ - # Use constants directly - no environment variables or complex logic + + Raises: + ValueError: If required environment variables are not set + """ + logger = get_logger("databricks_entrypoint") + + # Validate and get required Unity Catalog config + catalog = os.environ.get("UC_CATALOG", "").strip() or UnityCatalogConfig.CATALOG.value + schema = os.environ.get("UC_SCHEMA", "").strip() or UnityCatalogConfig.SCHEMA.value + raw_volume = os.environ.get("UC_RAW_VOLUME", "").strip() or UnityCatalogConfig.RAW_VOLUME.value or "snowflake_artifacts_raw" + + # Validate required config + if not catalog: + error_msg = ( + "Missing required environment variable: UC_CATALOG\n" + "Please set this in your cluster configuration or .env file.\n" + "See env.example for reference." + ) + logger.error(error_msg) + raise ValueError(error_msg) + + if not schema: + error_msg = ( + "Missing required environment variable: UC_SCHEMA\n" + "Please set this in your cluster configuration or .env file.\n" + "See env.example for reference." + ) + logger.error(error_msg) + raise ValueError(error_msg) + volume_path = ( f"/Volumes/" - f"{UnityCatalogConfig.CATALOG.value}/" - f"{UnityCatalogConfig.SCHEMA.value}/" - f"{UnityCatalogConfig.RAW_VOLUME.value}/" + f"{catalog}/" + f"{schema}/" + f"{raw_volume}/" ) - # Respect env vars with fallbacks to constants so cluster/job-level env can override defaults - batch_size = int(os.environ.get("DDL_BATCH_SIZE") or LangGraphConfig.DDL_BATCH_SIZE.value) - output_format = (os.environ.get("DDL_OUTPUT_FORMAT") or LangGraphConfig.DDL_OUTPUT_FORMAT.value).lower() - # Default output_path comes from the LangGraphConfig constant. - # Allow an environment variable `DDL_OUTPUT_PATH` to override this (useful in Databricks jobs). - output_path = os.environ.get("DDL_OUTPUT_PATH") or LangGraphConfig.DDL_OUTPUT_DIR.value - # Treat empty string as no output (in-memory) - if output_path == "": - output_path = None + + # Process settings with validation + batch_size = int(os.environ.get("DDL_BATCH_SIZE") or LangGraphConfig.DDL_BATCH_SIZE.value or 8) + output_format = (os.environ.get("DDL_OUTPUT_FORMAT") or LangGraphConfig.DDL_OUTPUT_FORMAT.value or "sql").lower() + + # Output path - validate if set + output_path = os.environ.get("DDL_OUTPUT_PATH") or os.environ.get("DDL_OUTPUT_DIR") or LangGraphConfig.DDL_OUTPUT_DIR.value + if not output_path: + logger.warning("DDL_OUTPUT_DIR not set - will generate path from UC config") + output_path = f"/Volumes/{catalog}/{schema}/outputs" + output_path = make_timestamped_output_path(output_path, output_format) + + # Log configuration summary print("=" * 60) print("Databricks Translation Job Starting") print("=" * 60) + print(f"UC Catalog: {catalog}") + print(f"UC Schema: {schema}") print(f"Volume Path: {volume_path}") print(f"Batch Size: {batch_size}") print(f"Output Format: {output_format}") print(f"Output Path: {output_path or 'In-memory (no file output)'}") print("-" * 60) + result = process_from_volume( volume_path=volume_path, batch_size=batch_size, diff --git a/src/migration_accelerator_package/constants.py b/src/migration_accelerator_package/constants.py index a36849a..4ec026d 100644 --- a/src/migration_accelerator_package/constants.py +++ b/src/migration_accelerator_package/constants.py @@ -1,15 +1,29 @@ from enum import Enum class SnowflakeConfig(Enum): + """Snowflake configuration. + + Database and schema must be set via environment variables: + - SNOWFLAKE_DATABASE (required) + - SNOWFLAKE_SCHEMA (required) + """ SNOWFLAKE_ROLE = "SYSADMIN" SNOWFLAKE_WAREHOUSE = "COMPUTE_WH" - SNOWFLAKE_DATABASE = "LVDMS" - SNOWFLAKE_SCHEMA = "LVDMS" + # Placeholders - must be overridden via env vars + SNOWFLAKE_DATABASE = "" # Set SNOWFLAKE_DATABASE env var + SNOWFLAKE_SCHEMA = "" # Set SNOWFLAKE_SCHEMA env var class UnityCatalogConfig(Enum): - CATALOG = "qubika_partner_solutions" - SCHEMA = "migration_accelerator" - RAW_VOLUME = "snowflake_artifacts_raw" + """Unity Catalog configuration. + + Catalog and schema must be set via environment variables: + - UC_CATALOG (required) + - UC_SCHEMA (required) + """ + # Placeholders - must be overridden via env vars + CATALOG = "" # Set UC_CATALOG env var + SCHEMA = "" # Set UC_SCHEMA env var + RAW_VOLUME = "snowflake_artifacts_raw" # Can keep default class ArtifactType(Enum): """Enumeration of Snowflake artifact types.""" diff --git a/src/migration_accelerator_package/ingestion_validation.py b/src/migration_accelerator_package/ingestion_validation.py index c05a806..d885730 100644 --- a/src/migration_accelerator_package/ingestion_validation.py +++ b/src/migration_accelerator_package/ingestion_validation.py @@ -4,6 +4,7 @@ """ import json +import os from snowflake.snowpark import Session from databricks.sdk.runtime import dbutils @@ -26,13 +27,28 @@ def main(): connection_parameters = build_snowflake_connection_params() session = Session.builder.configs(connection_parameters).create() - db = SnowflakeConfig.SNOWFLAKE_DATABASE.value - schema = SnowflakeConfig.SNOWFLAKE_SCHEMA.value + # Get database/schema from env vars with fallback to constants + db = os.environ.get("SNOWFLAKE_DATABASE", SnowflakeConfig.SNOWFLAKE_DATABASE.value) + schema = os.environ.get("SNOWFLAKE_SCHEMA", SnowflakeConfig.SNOWFLAKE_SCHEMA.value) + + # Validate required config + if not db or not schema: + missing = [] + if not db: missing.append("SNOWFLAKE_DATABASE") + if not schema: missing.append("SNOWFLAKE_SCHEMA") + error_msg = ( + f"Missing required configuration: {', '.join(missing)}\n" + f"Please set these in your cluster environment variables or .env file.\n" + f"See env.example for reference." + ) + logger.error(error_msg) + raise ValueError(error_msg) + + logger.info(f"Validating: database={db}, schema={schema}") volume_path = get_uc_volume_path() logger.info(f"UC Volume Path: {volume_path}") - validator = MetadataValidator(session, volume_path) logger.info("Loading extracted metadata") diff --git a/src/migration_accelerator_package/snowpark.py b/src/migration_accelerator_package/snowpark.py index 7cccce4..40eb9b4 100644 --- a/src/migration_accelerator_package/snowpark.py +++ b/src/migration_accelerator_package/snowpark.py @@ -17,10 +17,14 @@ from migration_accelerator_package.constants import ArtifactType, ArtifactFileName from migration_accelerator_package.logging_utils import get_app_logger +# Default scope name - can be overridden via SECRETS_SCOPE env var +DEFAULT_SECRETS_SCOPE = "migration-accelerator" + def get_secret(secret_name): """Retrieve secrets from Databricks secret scope""" + scope = os.getenv("SECRETS_SCOPE", DEFAULT_SECRETS_SCOPE) try: - return dbutils.secrets.get("migration-accelerator", secret_name) + return dbutils.secrets.get(scope, secret_name) except Exception as e: # Fallback to environment variables for local development return os.getenv(secret_name, "") @@ -35,13 +39,35 @@ def get_secret(secret_name): SFLKuser = get_secret('SNOWFLAKE_USER') SFLKpass = get_secret('SNOWFLAKE_PASSWORD') -SFLKrole = constants.SnowflakeConfig.SNOWFLAKE_ROLE.value -SFLKwarehouse = constants.SnowflakeConfig.SNOWFLAKE_WAREHOUSE.value -SFLKdatabase = constants.SnowflakeConfig.SNOWFLAKE_DATABASE.value -SFLKschema = constants.SnowflakeConfig.SNOWFLAKE_SCHEMA.value +SFLKrole = os.getenv('SNOWFLAKE_ROLE', constants.SnowflakeConfig.SNOWFLAKE_ROLE.value) or "SYSADMIN" +SFLKwarehouse = os.getenv('SNOWFLAKE_WAREHOUSE', constants.SnowflakeConfig.SNOWFLAKE_WAREHOUSE.value) or "COMPUTE_WH" +SFLKdatabase = os.getenv('SNOWFLAKE_DATABASE', constants.SnowflakeConfig.SNOWFLAKE_DATABASE.value) +SFLKschema = os.getenv('SNOWFLAKE_SCHEMA', constants.SnowflakeConfig.SNOWFLAKE_SCHEMA.value) SFLKregion = "" +# Validate required parameters +missing_params = [] +if not SFLKuser: + missing_params.append("SNOWFLAKE_USER") +if not SFLKpass: + missing_params.append("SNOWFLAKE_PASSWORD") +if not SFLKdatabase: + missing_params.append("SNOWFLAKE_DATABASE") +if not SFLKschema: + missing_params.append("SNOWFLAKE_SCHEMA") + +if missing_params: + error_msg = ( + f"Missing required configuration: {', '.join(missing_params)}\n" + f"Please set these in your cluster environment variables or .env file.\n" + f"See env.example for reference." + ) + logger.error(error_msg) + raise ValueError(error_msg) + +logger.info(f"Snowflake config: database={SFLKdatabase}, schema={SFLKschema}") + # Build connection parameters connection_parameters = { "account": SFLKaccount, @@ -57,12 +83,6 @@ def get_secret(secret_name): if SFLKregion: connection_parameters["region"] = SFLKregion -# Validate required parameters -if not SFLKuser or not SFLKpass: - raise ValueError( - "Missing required environment variables. Please set SNOWFLAKE_USER and SNOWFLAKE_PASSWORD " - "in your .env file. See env.example for reference." - ) class SnowparkObjectReader: @@ -197,7 +217,25 @@ def save_to_json(self, output_dir: str = None): # Use default volume path if output_dir not specified if output_dir is None: - base_volume_path = f"/Volumes/{constants.UnityCatalogConfig.CATALOG.value}/{constants.UnityCatalogConfig.SCHEMA.value}/{constants.UnityCatalogConfig.RAW_VOLUME.value}" + catalog = os.environ.get("UC_CATALOG", constants.UnityCatalogConfig.CATALOG.value) + schema = os.environ.get("UC_SCHEMA", constants.UnityCatalogConfig.SCHEMA.value) + raw_volume = os.environ.get("UC_RAW_VOLUME", constants.UnityCatalogConfig.RAW_VOLUME.value) or "snowflake_artifacts_raw" + + # Validate required UC config + if not catalog or not schema: + missing = [] + if not catalog: missing.append("UC_CATALOG") + if not schema: missing.append("UC_SCHEMA") + error_msg = ( + f"Missing required Unity Catalog configuration: {', '.join(missing)}\n" + f"Please set these in your cluster environment variables or .env file.\n" + f"See env.example for reference." + ) + logger.error(error_msg) + raise ValueError(error_msg) + + base_volume_path = f"/Volumes/{catalog}/{schema}/{raw_volume}" + logger.info(f"Unity Catalog volume path: {base_volume_path}") else: base_volume_path = output_dir diff --git a/src/migration_accelerator_package/snowpark_utils.py b/src/migration_accelerator_package/snowpark_utils.py index 83130f4..cc700ff 100644 --- a/src/migration_accelerator_package/snowpark_utils.py +++ b/src/migration_accelerator_package/snowpark_utils.py @@ -4,6 +4,7 @@ import json import os +import logging from databricks.sdk.runtime import dbutils from migration_accelerator_package.constants import SnowflakeConfig, UnityCatalogConfig from migration_accelerator_package.logging_utils import get_app_logger @@ -11,36 +12,129 @@ # Module-level logger for utility functions (uses app logger format) _utils_logger = get_app_logger("utils") +# Default scope name - can be overridden via SECRETS_SCOPE env var +DEFAULT_SECRETS_SCOPE = "migration-accelerator" + + +class ConfigurationError(Exception): + """Raised when required configuration is missing.""" + pass + + +def _get_required_env(var_name: str, fallback: str = "") -> str: + """ + Get environment variable with validation and logging. + + Args: + var_name: Name of the environment variable + fallback: Fallback value from constants (may be empty) + + Returns: + The environment variable value + + Raises: + ConfigurationError: If the variable is not set and no fallback + """ + value = os.environ.get(var_name, "").strip() + if value: + return value + + # Try fallback from constants + if fallback: + _utils_logger.warning( + f"Environment variable {var_name} not set, using fallback: '{fallback}'" + ) + return fallback + + # No value and no fallback - this is required + error_msg = ( + f"Missing required environment variable: {var_name}\n" + f"Please set this variable in your cluster configuration or .env file.\n" + f"See env.example for reference." + ) + _utils_logger.error(error_msg) + raise ConfigurationError(error_msg) + def get_secret(secret_name: str): """Retrieve secrets from Databricks secret scope or fallback to env variables.""" + scope = os.getenv("SECRETS_SCOPE", DEFAULT_SECRETS_SCOPE) try: - return dbutils.secrets.get("migration-accelerator", secret_name) - except Exception: - return os.getenv(secret_name, "") + value = dbutils.secrets.get(scope, secret_name) + if value: + return value + except Exception as e: + _utils_logger.debug(f"Could not get secret {secret_name} from scope {scope}: {e}") + + # Fallback to environment variables + value = os.getenv(secret_name, "") + if not value: + _utils_logger.warning( + f"Secret {secret_name} not found in scope '{scope}' or environment. " + f"This may cause authentication failures." + ) + return value def build_snowflake_connection_params(): - """Return Snowflake connection parameters used by all wheel entrypoints.""" + """ + Return Snowflake connection parameters used by all wheel entrypoints. + + Raises: + ConfigurationError: If required Snowflake config is missing + """ + # Get credentials from secrets + account = get_secret("SNOWFLAKE_ACCOUNT") + user = get_secret("SNOWFLAKE_USER") + password = get_secret("SNOWFLAKE_PASSWORD") + + # Validate credentials + if not account or not user or not password: + missing = [] + if not account: missing.append("SNOWFLAKE_ACCOUNT") + if not user: missing.append("SNOWFLAKE_USER") + if not password: missing.append("SNOWFLAKE_PASSWORD") + + error_msg = ( + f"Missing Snowflake credentials: {', '.join(missing)}\n" + f"Please configure these in Databricks secrets scope or environment variables." + ) + _utils_logger.error(error_msg) + raise ConfigurationError(error_msg) + + # Get database/schema with validation + database = _get_required_env("SNOWFLAKE_DATABASE", SnowflakeConfig.SNOWFLAKE_DATABASE.value) + schema = _get_required_env("SNOWFLAKE_SCHEMA", SnowflakeConfig.SNOWFLAKE_SCHEMA.value) + + _utils_logger.info(f"Snowflake connection: database={database}, schema={schema}") + return { - "account": get_secret("SNOWFLAKE_ACCOUNT"), - "user": get_secret("SNOWFLAKE_USER"), - "password": get_secret("SNOWFLAKE_PASSWORD"), - "role": SnowflakeConfig.SNOWFLAKE_ROLE.value, - "warehouse": SnowflakeConfig.SNOWFLAKE_WAREHOUSE.value, - "database": SnowflakeConfig.SNOWFLAKE_DATABASE.value, - "schema": SnowflakeConfig.SNOWFLAKE_SCHEMA.value, + "account": account, + "user": user, + "password": password, + "role": os.getenv("SNOWFLAKE_ROLE", SnowflakeConfig.SNOWFLAKE_ROLE.value), + "warehouse": os.getenv("SNOWFLAKE_WAREHOUSE", SnowflakeConfig.SNOWFLAKE_WAREHOUSE.value), + "database": database, + "schema": schema, } def get_uc_volume_path() -> str: - """Return the base UC volume path where JSON artifacts live.""" - return ( - f"/Volumes/" - f"{UnityCatalogConfig.CATALOG.value}/" - f"{UnityCatalogConfig.SCHEMA.value}/" - f"{UnityCatalogConfig.RAW_VOLUME.value}" - ) + """ + Return the base UC volume path where JSON artifacts live. + + Raises: + ConfigurationError: If required UC config is missing + """ + catalog = _get_required_env("UC_CATALOG", UnityCatalogConfig.CATALOG.value) + schema = _get_required_env("UC_SCHEMA", UnityCatalogConfig.SCHEMA.value) + raw_volume = os.environ.get("UC_RAW_VOLUME", UnityCatalogConfig.RAW_VOLUME.value) or "snowflake_artifacts_raw" + + volume_path = f"/Volumes/{catalog}/{schema}/{raw_volume}" + _utils_logger.info(f"Unity Catalog volume path: {volume_path}") + + return volume_path + def load_json_from_volume(volume_path: str, filename: str) -> dict: """ @@ -61,3 +155,25 @@ def load_json_from_volume(volume_path: str, filename: str) -> dict: except Exception as e: _utils_logger.warning(f"Could not load {filename}: {e}") return {} + + +def log_config_summary(): + """Log a summary of current configuration for debugging.""" + env_vars = [ + "UC_CATALOG", "UC_SCHEMA", "UC_RAW_VOLUME", + "SNOWFLAKE_DATABASE", "SNOWFLAKE_SCHEMA", + "SECRETS_SCOPE", "DBX_ENDPOINT", "DDL_OUTPUT_DIR" + ] + + _utils_logger.info("=" * 60) + _utils_logger.info("Configuration Summary:") + _utils_logger.info("=" * 60) + + for var in env_vars: + value = os.environ.get(var, "") + if value: + _utils_logger.info(f" {var}: {value}") + else: + _utils_logger.warning(f" {var}: NOT SET") + + _utils_logger.info("=" * 60)