thisisqubika
diff --git a/‎.github/workflows/databricks-asset-bundle-deploy.yml‎
Lines changed: 1 addition & 14 deletions b/‎.github/workflows/databricks-asset-bundle-deploy.yml‎
Lines changed: 1 addition & 14 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 3 additions & 1 deletion b/‎pyproject.toml‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎resources/jobs.yml‎
Lines changed: 10 additions & 0 deletions b/‎resources/jobs.yml‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎src/migration_report_package/README.md‎
Lines changed: 131 additions & 0 deletions b/‎src/migration_report_package/README.md‎
Lines changed: 131 additions & 0 deletions
diff --git a/‎src/migration_report_package/__init__.py‎ b/‎src/migration_report_package/__init__.py‎
diff --git a/‎src/migration_report_package/graph_builder.py‎
Lines changed: 162 additions & 0 deletions b/‎src/migration_report_package/graph_builder.py‎
Lines changed: 162 additions & 0 deletions
@@ -57,17 +57,4 @@ jobs:
         databricks bundle deploy -t dev
         databricks bundle run databricks_job_executor_app -t dev 
         databricks bundle summary
-        databricks bundle resources
-    
-    # - name: Sync app source to workspace
-    #   working-directory: databricks_job_executor
-    #   run: |
-    #     WORKSPACE_PATH="/Workspace/Shared/databricks_job_executor"
-    #     databricks workspace mkdirs "$WORKSPACE_PATH" || true
-    #     databricks workspace import-dir . "$WORKSPACE_PATH" --overwrite
-
-    # - name: Deploy app
-    #   working-directory: databricks_job_executor
-    #   run: |
-    #     databricks apps deploy dbx-job-executor-app-2 --source-code-path /Workspace/Shared/databricks_job_executor
-    #     databricks apps update dbx-job-executor-app-2 --default-source-code-path /Workspace/Shared/databricks_job_executor || true
+        databricks bundle resources
@@ -5,7 +5,8 @@ description = "Reusable python logic for Snowflake - Databricks Migration."
 authors = ["Samuel Solarte samuel.solarte@qubika.com"]
 packages = [
     { include = "migration_accelerator_package", from = "src" },
-    { include = "artifact_translation_package", from = "src" }
+    { include = "artifact_translation_package", from = "src" },
+    { include = "migration_report_package", from = "src" }
 ]
 
 
@@ -39,6 +40,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
 migration-accelerator = "migration_accelerator_package.main:main"
+migration-report = "migration_report_package.main:main"
 snowpark-reader = "migration_accelerator_package.snowpark:main"
 snowflake-validator = "migration_accelerator_package.ingestion_validation:main"
 grant-transformer = "migration_accelerator_package.grant_transformation:main"
 
@@ -45,6 +45,16 @@ resources:
           python_wheel_task:
             entry_point: translation-module
             package_name: migration_accelerator_package
+
+        - task_key: migration_report_task
+          depends_on:
+            - task_key: artifact_translation_task
+          existing_cluster_id: 1214-215558-eghymads
+          libraries:
+            - whl: "../dist/*.whl"
+          python_wheel_task:
+            entry_point: migration-report
+            package_name: migration_accelerator_package
 
       # Using existing cluster "data-migration" (ID: 1214-215558-eghymads); no job_clusters or spark_version needed.
 
 
@@ -0,0 +1,131 @@
+# Migration Report Graph
+
+A LangGraph-based system for creating a migration report based on the translation and evaluation results obtained from the DDL Translation Graph. 
+
+## Project Structure
+
+```
+migration_report_package/
+├── graph_builder.py           # LangGraph construction
+├── main.py                    # Main entry point for the migration report
+├── report_llm.py              # LLM configuration for report generation
+```
+
+## Usage
+
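+The report is built from the most recent run folder inside the "output" directory; run folders are named with a timestamp in the `YYYY-MM-DDTHH-MM-SSZ` format. From that folder, files whose name contains `translation_results.json` and, in its subfolders, files whose name contains `evaluation` are loaded and summarized.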
+
+```bash
+# Search for the "output" folder in the default location and save the report there
+python main.py
+
+# Search for the "output" folder in the default location and save the report to a custom location
+python main.py --md_output ./custom_location
+
+# Search for the "output" folder in a custom location and save the report to a custom location
+python main.py --output_dir ./custom_location --md_output ./custom_location
+```
+
+### Programmatic Usage
+
+```python
+from graph_builder import MigrationReportGraph
+
+# Create the report from the "output" folder at input_dir; run() also returns the run folder it used
+graph = MigrationReportGraph()
+md_report, json_report, latest_dir = graph.run(input_dir)
+
+with open(os.path.join(output_dir, "migration_report.md"), "w", encoding="utf-8") as f:
+    f.write(md_report)
+
+print("JSON Report: ",json_report)
+```
+
+## Migration Report Example
+
+```markdown
+# Migration Report
+## Overview
+The migration process has completed with the following summary:
+- Total artifacts migrated: 8
+- Total errors: 0
+- Total warnings: 0
+- Total successes: 8
+- Validation errors: 2
+
+## Detailed Results per Artifact Type
+
+### Schemas
+| Artifact Name | Type | Status |
+| --- | --- | --- |
+| BRONZE_LAYER | schemas | success |
+| SILVER_LAYER | schemas | success |
+| GOLD_LAYER | schemas | success |
+
+### Tables
+| Artifact Name | Type | Status |
+| --- | --- | --- |
+| EXAMPLE_TABLE_1 | tables | error |
+| EXAMPLE_TABLE_2 | tables | error |
+
+### Views
+| Artifact Name | Type | Status |
+| --- | --- | --- |
+| ACTIVE_USERS_VIEW | views | success |
+| SALES_SUMMARY_VIEW | views | success |
+| INVENTORY_STATUS_VIEW | views | success |
+
+## Error and Warning Sections
+
+### Errors
+No errors were reported during the migration.
+
+### Warnings
+No warnings were reported during the migration.
+
+## Objects Requiring Manual Review
+The following objects require manual review due to validation errors or other issues:
+- EXAMPLE_TABLE_1 (tables): Validation failed with syntax error.
+- EXAMPLE_TABLE_2 (tables): Validation failed with syntax error.
+
+## Summary of AI-assisted vs Rule-based Outputs
+The migration utilized a combination of AI-assisted and rule-based approaches. The exact distribution is not available in the provided data.
+
+## Performance Metrics
+- Total duration: 31.17 seconds
+- Stage durations:
+  - translate_tables: 22.95 seconds
+  - translate_views: 5.57 seconds
+  - translate_schemas: 2.44 seconds
+
+## Analysis
+
+### Common Translation Errors
+- The translation of tables resulted in Python code that failed validation due to syntax errors.
+
+### Patterns in Warnings or Inconsistencies
+- No warnings were reported, but the validation errors for tables indicate a potential inconsistency in the translation process.
+
+### Success Rate per Artifact Type
+- Schemas: 100% success (3/3)
+- Tables: 0% success (0/2) due to validation errors
+- Views: 100% success (3/3)
+
+### Unsupported or Partially Supported Features
+- The translation process generated Python code for tables, which is not directly executable in Databricks. This indicates a potential gap in the translation rules for tables.
+
+### Dependencies that Failed or Were Skipped
+- The tables (EXAMPLE_TABLE_1 and EXAMPLE_TABLE_2) failed due to validation errors, indicating potential issues with dependencies or the translation process.
+
+### Recommendations for Improving Translation Rules
+1. Review and adjust the translation rules for tables to directly generate valid Databricks DDL instead of Python code.
+2. Enhance the validation step to catch syntax errors early in the translation process.
+
+### Suggested Workaround for Unsupported Features
+For tables, manually review and convert the generated Python code into valid Databricks DDL statements. Ensure that the data types and constraints are correctly translated.
+```
+
+## Requirements
+
+- Python 3.7+
+- LangChain ecosystem
@@ -0,0 +1,162 @@
+from typing import Any, Dict, List, Optional, Annotated, TypedDict
+from typing_extensions import TypedDict
+from langgraph.graph import StateGraph, END, START
+from langchain_core.runnables import RunnableConfig
+from report_llm import generate_report
+from datetime import datetime
+import os
+
+import json
+
+
+class MigrationState(TypedDict):
+    """State shared by the nodes of the migration report graph."""
+    # Directory that holds one sub-directory per run; read by input_node.
+    input_dir: str
+    # Path of the most recent run directory; set by input_node.
+    latest_dir: str
+    # Parsed JSON grouped under "translation_results" and "evaluation"; set by input_node.
+    raw: Dict[str, Any]
+    # Result of clean_raw(raw): empty values dropped, long strings truncated.
+    cleaned_raw: Dict[str, Any]
+    # Aggregated totals built by count_node.
+    count: Dict[str, Any]
+    # Set by report_node to the same totals stored under "count".
+    json_report: Dict[str, Any]
+    # Output of generate_report; set by report_node.
+    md_report: str
+    
+def input_node(state: MigrationState) -> MigrationState:
+    """Locate the most recent run directory and load its JSON results.
+
+    Sub-directories of ``state["input_dir"]`` whose name matches the
+    ``%Y-%m-%dT%H-%M-%SZ`` timestamp format are treated as runs; the newest one
+    is stored in ``state["latest_dir"]``. Inside it, top-level files whose name
+    contains ``translation_results.json`` and files one level down whose name
+    contains ``evaluation`` are parsed and stored in ``state["raw"]``.
+
+    Args:
+        state: Graph state; only ``input_dir`` is read.
+
+    Returns:
+        The same state object with ``latest_dir`` and ``raw`` filled in.
+
+    Raises:
+        FileNotFoundError: If ``input_dir`` does not exist.
+        ValueError: If ``input_dir`` contains no timestamped run directory.
+    """
+    input_path = state["input_dir"]
+    if not os.path.exists(input_path):
+        raise FileNotFoundError(f"Output folder not found: {input_path}")
+
+    # Collect (timestamp, path) for every run directory.
+    run_dirs = []
+    for name in os.listdir(input_path):
+        candidate = os.path.join(input_path, name)
+        if not os.path.isdir(candidate):
+            continue
+        try:
+            run_dt = datetime.strptime(name, "%Y-%m-%dT%H-%M-%SZ")
+        except ValueError:
+            # Unrelated folders (caches, checkpoints, ...) must not abort the report.
+            continue
+        run_dirs.append((run_dt, candidate))
+
+    if not run_dirs:
+        # max() on an empty list would raise an opaque "empty sequence" ValueError.
+        raise ValueError(f"No timestamped run folder found in: {input_path}")
+
+    _, latest = max(run_dirs, key=lambda item: item[0])
+    state["latest_dir"] = latest
+
+    # Get translation results and evaluation notes.
+    raw = {"translation_results": [], "evaluation": []}
+    # Sorted so the order of the loaded results does not depend on the filesystem.
+    for name in sorted(os.listdir(latest)):
+        entry = os.path.join(latest, name)
+        if os.path.isdir(entry):
+            for child_name in sorted(os.listdir(entry)):
+                child = os.path.join(entry, child_name)
+                # Only regular files can be parsed; a nested folder named
+                # "evaluation..." would otherwise fail on open().
+                if os.path.isfile(child) and "evaluation" in child_name.lower():
+                    with open(child, "r", encoding="utf-8") as f:
+                        raw["evaluation"].append(json.load(f))
+        elif "translation_results.json" in name.lower():
+            with open(entry, "r", encoding="utf-8") as f:
+                raw["translation_results"].append(json.load(f))
+
+    state["raw"] = raw
+    return state
+
+def clean_raw(obj: Any, max_len: int = 150) -> Any:
+    """Recursively drop empty values and truncate long strings.
+
+    Args:
+        obj: Any JSON-like value (dict, list, string, number, bool or None).
+        max_len: Longest string kept as-is; longer strings are cut to this
+            many characters and suffixed with "…".
+
+    Returns:
+        The cleaned value, or None when nothing is left. Dict entries and list
+        items that clean to None, {}, [] or "" are removed; other values such
+        as 0 and False are kept unchanged.
+    """
+    if obj is None:
+        return None
+
+    # Handle dicts
+    if isinstance(obj, dict):
+        cleaned = {}
+        for k, v in obj.items():
+            pruned = clean_raw(v, max_len)
+            if pruned not in (None, {}, [], ""):
+                cleaned[k] = pruned
+        return cleaned or None
+
+    # Handle lists
+    if isinstance(obj, list):
+        cleaned = []
+        for item in obj:
+            pruned = clean_raw(item, max_len)
+            if pruned not in (None, {}, [], ""):
+                cleaned.append(pruned)
+        return cleaned or None
+
+    # Handle strings (strip, drop if empty, truncate to max_len)
+    if isinstance(obj, str):
+        s = obj.strip()
+        if not s:
+            return None
+        return s if len(s) <= max_len else s[:max_len] + "…"
+
+    return obj
+
+
+def clean_raw_node(state: MigrationState) -> MigrationState:
+    """Clean the raw data by removing empty values and truncating long strings.
+
+    Args:
+        state: Graph state; ``raw`` is read.
+
+    Returns:
+        The same state object with ``cleaned_raw`` set.
+    """
+    # clean_raw returns None when nothing survives (e.g. no result files were
+    # found); keep a dict so later nodes can call .get() on it.
+    state["cleaned_raw"] = clean_raw(state["raw"]) or {}
+    return state
+
+def count_node(state: MigrationState) -> MigrationState:
+    """Count translated artifacts, errors, warnings and validation errors for the report.
+
+    Args:
+        state: Graph state; ``cleaned_raw`` is read and may be missing or None.
+
+    Returns:
+        The same state object with ``count`` set to a dict holding
+        ``artifact_type`` (per-type totals), ``migration_errors``,
+        ``migration_warnings``, ``successes`` and ``validation_errors``.
+    """
+    # cleaned_raw can be None when cleaning removed everything.
+    cleaned = state.get("cleaned_raw") or {}
+    count = {"artifact_type": {}, "migration_errors": 0, "migration_warnings": 0, "successes": 0, "validation_errors": 0}
+
+    for trans in cleaned.get("translation_results", []):
+        observability = trans.get("observability", {})
+        for artifact_type, value in observability.get("artifact_counts", {}).items():
+            count["artifact_type"][artifact_type] = count["artifact_type"].get(artifact_type, 0) + value
+            # Every counted artifact is added to "successes"; errors are not subtracted.
+            count["successes"] += value
+        # Totals may be absent from a result file; treat that as zero instead of failing.
+        count["migration_errors"] += observability.get("total_errors", 0)
+        count["migration_warnings"] += observability.get("total_warnings", 0)
+
+    for evaluation in cleaned.get("evaluation", []):
+        for res in evaluation.get("validation", {}).get("results", []):
+            # A result without "syntax_valid" is counted as valid.
+            if not res.get("syntax_valid", True):
+                count["validation_errors"] += 1
+
+    state["count"] = count
+    return state
+
+def report_node(state: MigrationState) -> MigrationState:
+    """Build the markdown report with the LLM and attach both reports to the state.
+
+    Returns a new state dict: ``md_report`` holds the output of
+    ``generate_report`` and ``json_report`` holds the totals from ``count``.
+    """
+    markdown = generate_report(state["cleaned_raw"], state["count"])
+    updated = dict(state)
+    updated["md_report"] = markdown
+    updated["json_report"] = state["count"]
+    return updated
+
+class MigrationReportGraph:
+    """Linear graph that turns a run's output folder into a migration report.
+
+    The nodes run in a fixed order: input -> clean -> count -> report.
+    """
+
+    def __init__(self, run_id: Optional[str] = None):
+        """
+        Initialize migration report graph.
+
+        Args:
+            run_id: Unique identifier for this run
+        """
+        # Stored for callers that want to correlate the graph with a run;
+        # the nodes themselves do not read it.
+        self.run_id = run_id
+
+        # Create the StateGraph
+        self.graph = StateGraph(MigrationState)
+
+        # Add nodes
+        self.graph.add_node("input", input_node)
+        self.graph.add_node("clean", clean_raw_node)
+        self.graph.add_node("count", count_node)
+        self.graph.add_node("report", report_node)
+
+        # Wire the nodes into a single straight path
+        self.graph.add_edge(START, "input")
+        self.graph.add_edge("input", "clean")
+        self.graph.add_edge("clean", "count")
+        self.graph.add_edge("count", "report")
+        self.graph.add_edge("report", END)
+
+        # Compile the graph
+        self.compiled_graph = self.graph.compile()
+
+    def run(self, input_path: str) -> tuple:
+        """Run the graph on ``input_path`` and return the generated reports.
+
+        Args:
+            input_path: Directory that contains the timestamped run folders.
+
+        Returns:
+            A ``(md_report, json_report, latest_dir)`` tuple: the first element
+            of the ``md_report`` state value, the totals dict, and the path of
+            the run folder that was used.
+
+        Raises:
+            KeyError: If the final state carries no markdown report.
+
+        Errors raised by the nodes propagate unchanged.
+        """
+        initial_state: MigrationState = {
+            "input_dir": input_path,
+            "latest_dir": None,
+            "raw": {},
+            "cleaned_raw": None,
+            "count": None,
+            "json_report": None,
+            "md_report": None,
+        }
+
+        final_state = self.compiled_graph.invoke(initial_state)
+
+        report = final_state["md_report"]
+        if not report:
+            # Previously an empty report fell through to `{}[0]` and surfaced
+            # as a bare KeyError(0); name the missing piece instead.
+            raise KeyError("md_report")
+
+        json_report = final_state["json_report"] or {}
+        latest_dir = final_state["latest_dir"] or ""
+        # NOTE(review): only element 0 of md_report is returned. md_report is
+        # annotated as str, so if generate_report returns a plain string this
+        # yields just its first character - confirm generate_report's return shape.
+        return report[0], json_report, latest_dir
+