diff --git a/app.yaml b/app.yaml index 6114bbf..5ef67dd 100644 --- a/app.yaml +++ b/app.yaml @@ -98,6 +98,16 @@ env: - name: GSO_WAREHOUSE_ID valueFrom: sql-warehouse +# --------------------------------------------------------------------------- +# Resources +# --------------------------------------------------------------------------- +resources: + - name: sql-warehouse + description: "SQL Warehouse for queries and catalog discovery" + sql_warehouse: + id: "__WAREHOUSE_ID__" + permission: CAN_USE + # --------------------------------------------------------------------------- # Deployment # --------------------------------------------------------------------------- diff --git a/scripts/deploy-config.sh b/scripts/deploy-config.sh index 323a41f..949164b 100755 --- a/scripts/deploy-config.sh +++ b/scripts/deploy-config.sh @@ -12,7 +12,7 @@ # GENIE_APP_NAME (optional) Databricks App name [default: genie-workbench] # GENIE_DEPLOY_PROFILE (optional) Databricks CLI profile [default: DEFAULT] # GENIE_LLM_MODEL (optional) LLM serving endpoint [default: databricks-claude-sonnet-4-6] -# GENIE_LAKEBASE_INSTANCE (optional) Lakebase instance name [default: ] +# GENIE_LAKEBASE_INSTANCE (optional) Lakebase instance name [default: none] # GENIE_MLFLOW_EXPERIMENT_ID (optional) MLflow experiment ID for agent tracing [default: disabled] # # After sourcing, the following variables are available: @@ -36,7 +36,7 @@ GSO_SCHEMA="genie_space_optimizer" # Fixed default — matches GSO convention WAREHOUSE_ID="${GENIE_WAREHOUSE_ID:-}" PROFILE="${GENIE_DEPLOY_PROFILE:-DEFAULT}" LLM_MODEL="${GENIE_LLM_MODEL:-databricks-claude-sonnet-4-6}" -LAKEBASE_INSTANCE="${GENIE_LAKEBASE_INSTANCE:-$APP_NAME}" +LAKEBASE_INSTANCE="${GENIE_LAKEBASE_INSTANCE:-}" MLFLOW_EXPERIMENT_ID="${GENIE_MLFLOW_EXPERIMENT_ID:-}" # ── Validate required values ───────────────────────────────────────────── diff --git a/scripts/deploy.sh b/scripts/deploy.sh index 7d2cacb..8d59297 100755 --- a/scripts/deploy.sh +++ b/scripts/deploy.sh @@ -442,6 +442,7 @@ echo "▸ Step $STEP/$TOTAL_STEPS: Redeploying app with freshest code..." echo " Patching app.yaml on workspace with GSO config..." PATCHED_APP_YAML="/tmp/app.yaml.patched" cp "$PROJECT_DIR/app.yaml" "$PATCHED_APP_YAML" +sed -i.bak "s|__WAREHOUSE_ID__|$WAREHOUSE_ID|" "$PATCHED_APP_YAML" sed -i.bak "s|__GSO_CATALOG__|$CATALOG|" "$PATCHED_APP_YAML" sed -i.bak "s|__LAKEBASE_INSTANCE__|$LAKEBASE_INSTANCE|" "$PATCHED_APP_YAML" sed -i.bak "s|__LLM_MODEL__|$LLM_MODEL|" "$PATCHED_APP_YAML" @@ -461,7 +462,7 @@ fi databricks workspace import "$WS_PATH/app.yaml" \ --profile "$PROFILE" --file "$PATCHED_APP_YAML" --format AUTO --overwrite 2>/dev/null && \ -echo " ✓ app.yaml patched (GSO_CATALOG=$CATALOG, GSO_JOB_ID=${JOB_ID:-}, LAKEBASE_INSTANCE=$LAKEBASE_INSTANCE, LLM_MODEL=$LLM_MODEL, MLFLOW=${MLFLOW_EXPERIMENT_ID:-})" || \ +echo " ✓ app.yaml patched (WAREHOUSE=$WAREHOUSE_ID, GSO_CATALOG=$CATALOG, GSO_JOB_ID=${JOB_ID:-}, LAKEBASE_INSTANCE=$LAKEBASE_INSTANCE, LLM_MODEL=$LLM_MODEL, MLFLOW=${MLFLOW_EXPERIMENT_ID:-})" || \ echo " ⚠ Could not patch app.yaml — config may not be set" # Sync _metadata.py — required at runtime for the genie_space_optimizer @@ -550,17 +551,20 @@ except Exception: pass fi fi -# ── Set app scopes + resources, then deploy ────────────────────────────── -# Merge existing resources (e.g. manually-added Lakebase) with required ones. 
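A note on the `sed -i.bak` spelling used in the patching above: it is the one in-place form that both GNU and BSD sed accept, which matters because deploys run from both Linux and macOS. Below is a minimal standalone sketch of the same placeholder patching; the sample warehouse ID and the final grep guard are illustrative assumptions, not part of deploy.sh.

```bash
# Sketch only: patch a copy of app.yaml the same way deploy.sh does.
cp app.yaml /tmp/app.yaml.patched
sed -i.bak "s|__WAREHOUSE_ID__|1234567890abcdef|" /tmp/app.yaml.patched

# Illustrative guard (not in deploy.sh): fail loudly if any __PLACEHOLDER__
# survived, e.g. because one of the substitution variables was empty.
if grep -n '__[A-Z_]\{1,\}__' /tmp/app.yaml.patched; then
  echo "unpatched placeholders remain" >&2
  exit 1
fi
```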
+# ── Configure app scopes and resources ─────────────────────────────────── +# The PATCH API is the mechanism that configures both user_api_scopes and +# resources on a Databricks App. app.yaml user_api_scopes are documentation +# only; apps deploy does not apply them. echo " Configuring app scopes and resources..." EXISTING_RESOURCES=$(databricks apps get "$APP_NAME" --profile "$PROFILE" -o json 2>/dev/null \ | python3 -c "import sys,json; print(json.dumps(json.load(sys.stdin).get('resources',[])))" 2>/dev/null || echo "[]") PATCH_PAYLOAD=$(python3 -c " import json + scopes = ['sql', 'dashboards.genie', 'serving.serving-endpoints', - 'catalog.catalogs:read', 'catalog.schemas:read', - 'catalog.tables:read', 'files.files'] + 'catalog.catalogs:read', 'catalog.schemas:read', + 'catalog.tables:read', 'files.files'] # Start with existing resources. The PATCH API replaces all resources, # so we must include everything. Preserve all resources that either have @@ -596,7 +600,7 @@ print(json.dumps({'user_api_scopes': scopes, 'resources': list(by_name.values()) ") databricks api patch "/api/2.0/apps/$APP_NAME" \ --profile "$PROFILE" --json "$PATCH_PAYLOAD" 2>/dev/null && \ - echo " ✓ App scopes and resources configured" || \ + echo " ✓ App scopes and resources configured (sql-warehouse: $WAREHOUSE_ID)" || \ echo " ⚠ Could not configure app scopes/resources" databricks apps deploy "$APP_NAME" --profile "$PROFILE" \ diff --git a/scripts/install.sh b/scripts/install.sh index 4dc238d..c8ca96b 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -60,7 +60,8 @@ _prompt_yn() { local default="${3:-Y}" local result - echo -en " ${prompt_text} [${default}]: " + local yn_hint="[Y/N, Enter=${default}]" + echo -en " ${prompt_text} ${yn_hint}: " read -r result result="${result:-$default}" case "$result" in @@ -69,6 +70,58 @@ _prompt_yn() { esac } +# Usage: _select_from VARNAME "Prompt text" [default_idx] item1 item2 item3 ... +# Optional default_idx (a plain integer) sets a default choice — Enter accepts it. +# Sets VARNAME to the selected item. Caller must handle empty result. +_select_from() { + local varname="$1" + local prompt_text="$2" + shift 2 + + # If first remaining arg is a plain integer, treat it as the default index (1-based) + local default_idx=0 + if [[ "${1:-}" =~ ^[0-9]+$ ]]; then + default_idx="$1" + shift + fi + + local items=("$@") + + if [ ${#items[@]} -eq 0 ]; then + printf -v "$varname" '%s' "" + return + fi + + local i + for i in "${!items[@]}"; do + if [ "$((i+1))" -eq "$default_idx" ]; then + echo " $((i+1))) ${items[$i]} (default)" + else + echo " $((i+1))) ${items[$i]}" + fi + done + echo "" + + local choice result + local range_hint="[1-${#items[@]}]" + [ "$default_idx" -gt 0 ] && range_hint="[1-${#items[@]}, Enter for $default_idx]" + + while true; do + echo -en " ${prompt_text} ${range_hint}: " + read -r choice + if [ -z "$choice" ] && [ "$default_idx" -gt 0 ]; then + result="${items[$((default_idx-1))]}" + break + fi + if [[ "$choice" =~ ^[0-9]+$ ]] && [ "$choice" -ge 1 ] && [ "$choice" -le "${#items[@]}" ]; then + result="${items[$((choice-1))]}" + break + fi + echo " Please enter a number between 1 and ${#items[@]}." 
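To make the contract of `_select_from` concrete (the function body continues right after this aside), here is a hypothetical invocation and the behavior it produces; the `FRUIT` variable and the items are invented for illustration and do not appear in install.sh.

```bash
# Hypothetical usage of _select_from; nothing here exists in install.sh.
_select_from FRUIT "Pick a fruit" 2 apple banana cherry
# Renders:
#   1) apple
#   2) banana (default)
#   3) cherry
# and prompts with "[1-3, Enter for 2]". Enter accepts the default
# (FRUIT=banana); any other input must be a number from 1 to 3.
# Without the leading "2" there is no default, and Enter just re-prompts.
echo "You picked: $FRUIT"
```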
+ done + printf -v "$varname" '%s' "$result" +} + # ══════════════════════════════════════════════════════════════════════════ # Step 0: Banner # ══════════════════════════════════════════════════════════════════════════ @@ -138,18 +191,83 @@ fi # ══════════════════════════════════════════════════════════════════════════ _header "Step 2: Databricks profile" -# Show available profiles -_info "Available profiles:" -databricks auth profiles 2>/dev/null | head -20 || echo " (could not list profiles)" +_info "Discovering configured Databricks profiles..." + +# Parse name + auth status from databricks auth profiles +DEFAULT_VALID="NO" +LOGGEDIN_NAMES=() +NOTLOGGEDIN_NAMES=() +PROFILES_EXIT=0 +PROFILES_OUTPUT=$(databricks auth profiles 2>/dev/null) || PROFILES_EXIT=1 +while IFS= read -r line; do + [ -z "$line" ] && continue + name=$(echo "$line" | awk '{print $1}') + valid=$(echo "$line" | awk '{print $NF}') + [ -z "$name" ] && continue + if [ "$name" = "DEFAULT" ]; then + DEFAULT_VALID="$valid" + elif [ "$valid" = "YES" ]; then + LOGGEDIN_NAMES+=("$name") + else + NOTLOGGEDIN_NAMES+=("$name") + fi +done < <(echo "$PROFILES_OUTPUT" | tail -n +2 | grep -v '^$' || true) + +# Build ordered selection array: DEFAULT first, then logged-in, then not-logged-in +ORDERED_PROFILES=("DEFAULT") +for n in "${LOGGEDIN_NAMES[@]}"; do ORDERED_PROFILES+=("$n"); done +for n in "${NOTLOGGEDIN_NAMES[@]}"; do ORDERED_PROFILES+=("$n"); done + +if [ "${#ORDERED_PROFILES[@]}" -eq 1 ] && [ "$PROFILES_EXIT" -ne 0 ]; then + _warn "Could not list profiles (databricks CLI error). Falling back to DEFAULT." +fi + +# Display with sections echo "" +if [ "$DEFAULT_VALID" = "YES" ]; then + echo " 1) DEFAULT ✓" +else + echo " 1) DEFAULT (not logged in)" +fi + +DISPLAY_IDX=2 +if [ ${#LOGGEDIN_NAMES[@]} -gt 0 ]; then + echo "" + echo -e " ${BOLD}Logged in:${NC}" + for n in "${LOGGEDIN_NAMES[@]}"; do + echo " $DISPLAY_IDX) $n ✓" + DISPLAY_IDX=$((DISPLAY_IDX + 1)) + done +fi + +if [ ${#NOTLOGGEDIN_NAMES[@]} -gt 0 ]; then + echo "" + echo -e " ${BOLD}Not logged in:${NC}" + for n in "${NOTLOGGEDIN_NAMES[@]}"; do + echo " $DISPLAY_IDX) $n" + DISPLAY_IDX=$((DISPLAY_IDX + 1)) + done +fi -_prompt PROFILE "Databricks CLI profile" "DEFAULT" +echo "" +TOTAL_PROFILES=${#ORDERED_PROFILES[@]} +PROFILE="" +while true; do + echo -en " Select a profile [1-$TOTAL_PROFILES]: " + read -r choice + if [[ "$choice" =~ ^[0-9]+$ ]] && [ "$choice" -ge 1 ] && [ "$choice" -le "$TOTAL_PROFILES" ]; then + PROFILE="${ORDERED_PROFILES[$((choice-1))]}" + break + fi + echo " Please enter a number between 1 and $TOTAL_PROFILES." +done +echo "" # Validate the profile if databricks current-user me --profile "$PROFILE" -o json &>/dev/null; then DEPLOYER=$(databricks current-user me --profile "$PROFILE" -o json \ | python3 -c "import sys,json; print(json.load(sys.stdin)['userName'])") - _ok "Authenticated as $DEPLOYER" + _ok "Authenticated as $DEPLOYER (profile: $PROFILE)" else _error "Could not authenticate with profile '$PROFILE'." _info "Run: databricks configure --profile $PROFILE" @@ -157,118 +275,274 @@ else fi # ══════════════════════════════════════════════════════════════════════════ -# Step 3: Catalog +# Step 3: Unity Catalog # ══════════════════════════════════════════════════════════════════════════ _header "Step 3: Unity Catalog" -_info "The optimizer stores state tables in a schema called 'genie_space_optimizer'" -_info "inside the catalog you choose below." 
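A note on the profile discovery in Step 2 above: it assumes `databricks auth profiles` prints a header row followed by one profile per line, with the profile name in the first column and the YES/NO validity flag in the last. The sample output in the sketch below is illustrative, not captured from a real CLI run.

```bash
# Assumed shape of `databricks auth profiles` (illustrative values):
#
#   Name      Host                                        Valid
#   DEFAULT   https://my-workspace.cloud.databricks.com   YES
#   staging   https://staging.cloud.databricks.com        NO
#
# This is why the parser takes the first and last whitespace-separated
# fields of each row. The same extraction, runnable standalone:
printf 'staging https://staging.example.com NO\n' | awk '{print $1, $NF}'
# -> staging NO
```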
+_info "Genie Workbench stores run history, benchmark results, and optimization" +_info "state in a schema called 'genie_space_optimizer' inside the catalog you" +_info "choose here. The deploy script creates this schema automatically." echo "" -_warn "You must have CREATE SCHEMA permission on this catalog." -_info "The deploy script will create the schema automatically if it doesn't exist." +_warn "You must have CREATE SCHEMA permission on the selected catalog." echo "" -_info "Available catalogs:" -databricks catalogs list --profile "$PROFILE" -o json 2>/dev/null \ +_info "Discovering available catalogs..." +CATALOG_NAMES=() +while IFS= read -r name; do + [ -n "$name" ] && CATALOG_NAMES+=("$name") +done < <( + databricks catalogs list --profile "$PROFILE" -o json 2>/dev/null \ | python3 -c " -import sys,json +import sys, json try: data = json.load(sys.stdin) cats = data if isinstance(data, list) else data.get('catalogs', []) - for c in cats[:20]: + for c in cats[:30]: name = c.get('name','') if isinstance(c, dict) else str(c) - print(f' {name}') + if name: + print(name) except: pass -" 2>/dev/null || echo " (could not list catalogs)" -echo "" +" 2>/dev/null +) -_prompt CATALOG "Catalog name" "" +if [ ${#CATALOG_NAMES[@]} -eq 0 ]; then + _warn "Could not list catalogs. Enter a catalog name manually." + _prompt CATALOG "Catalog name" "" +else + _info "Available catalogs:" + _select_from CATALOG "Select the catalog to use" "${CATALOG_NAMES[@]}" +fi if [ -z "$CATALOG" ]; then - _error "Catalog is required." >&2 - echo "" >&2 - echo " The optimizer needs a Unity Catalog to store state tables," >&2 - echo " benchmarks, and prompt artifacts." >&2 - echo "" >&2 - echo " Example:" >&2 - echo " export GENIE_CATALOG=my_catalog" >&2 + _error "Catalog is required." exit 1 fi GSO_SCHEMA="genie_space_optimizer" -_ok "Will use schema: ${CATALOG}.${GSO_SCHEMA}" +_ok "Will create schema: ${CATALOG}.${GSO_SCHEMA}" # ══════════════════════════════════════════════════════════════════════════ # Step 4: SQL Warehouse # ══════════════════════════════════════════════════════════════════════════ _header "Step 4: SQL Warehouse" -_info "Available SQL warehouses:" -databricks warehouses list --profile "$PROFILE" -o json 2>/dev/null \ +_info "The warehouse runs SQL queries for the optimizer and catalog discovery." +echo "" +_info "Discovering available SQL warehouses..." + +# Build parallel arrays: display labels and raw IDs +WH_LABELS=() +WH_IDS=() +while IFS='|' read -r wid wlabel; do + [ -n "$wid" ] && WH_IDS+=("$wid") && WH_LABELS+=("$wlabel") +done < <( + databricks warehouses list --profile "$PROFILE" -o json 2>/dev/null \ | python3 -c " -import sys,json +import sys, json try: data = json.load(sys.stdin) whs = data if isinstance(data, list) else data.get('warehouses', []) - for w in whs[:15]: - wid = w.get('id','') - name = w.get('name','') - state = w.get('state','') - print(f' {wid} {name} ({state})') + for w in whs[:20]: + wid = w.get('id','') + name = w.get('name','Unnamed') + state = w.get('state','UNKNOWN') + if wid: + label = f'{name} ({state}) — ID: {wid}' + print(f'{wid}|{label}') except: pass -" 2>/dev/null || echo " (could not list warehouses)" -echo "" +" 2>/dev/null +) -_prompt WAREHOUSE_ID "SQL Warehouse ID" "" +if [ ${#WH_LABELS[@]} -eq 0 ]; then + _warn "Could not list warehouses. Enter the warehouse ID manually." 
+ _prompt WAREHOUSE_ID "SQL Warehouse ID" "" +else + _info "Available SQL warehouses:" + _select_from WH_LABEL "Select a warehouse" "${WH_LABELS[@]}" + # Reverse-lookup the ID for the selected label + WAREHOUSE_ID="" + for i in "${!WH_LABELS[@]}"; do + if [ "${WH_LABELS[$i]}" = "$WH_LABEL" ]; then + WAREHOUSE_ID="${WH_IDS[$i]}" + break + fi + done +fi if [ -z "$WAREHOUSE_ID" ]; then _error "Warehouse ID is required." exit 1 fi +_ok "Selected warehouse: $WAREHOUSE_ID" + # ══════════════════════════════════════════════════════════════════════════ # Step 5: LLM Model # ══════════════════════════════════════════════════════════════════════════ _header "Step 5: LLM Model" -_prompt LLM_MODEL "LLM serving endpoint" "databricks-claude-sonnet-4-6" +_info "Choose the foundation model Genie Workbench will use to create and" +_info "optimize Genie Spaces, generate SQL instructions, and explain findings." +echo "" + +CURATED_MODELS=( + "Claude Sonnet 4.6 (Recommended — databricks-claude-sonnet-4-6)" + "GPT-5.4 (Databricks — databricks-gpt-5-4)" + "Other (browse all serving endpoints or enter a name manually)" +) + +LLM_MODEL="" +_select_from MODEL_CHOICE "Select a model" 1 "${CURATED_MODELS[@]}" + +case "$MODEL_CHOICE" in + Claude*) + LLM_MODEL="databricks-claude-sonnet-4-6" ;; + GPT*) + LLM_MODEL="databricks-gpt-5-4" ;; + Other*) + echo "" + echo " 1) Browse all serving endpoints in my workspace" + echo " 2) Enter endpoint name manually" + echo "" + OTHER_CHOICE="" + while true; do + echo -en " [1-2]: " + read -r OTHER_CHOICE + case "$OTHER_CHOICE" in + 1|2) break ;; + *) echo " Please enter 1 or 2." ;; + esac + done + if [ "$OTHER_CHOICE" = "1" ]; then + _info "Fetching all serving endpoints..." + ALL_ENDPOINTS=() + while IFS= read -r ep; do + [ -n "$ep" ] && ALL_ENDPOINTS+=("$ep") + done < <( + databricks serving-endpoints list --profile "$PROFILE" -o json 2>/dev/null \ + | python3 -c " +import sys, json +try: + data = json.load(sys.stdin) + eps = data if isinstance(data, list) else data.get('endpoints', []) + for e in eps: + print(e.get('name','')) +except: pass +" 2>/dev/null + ) + if [ ${#ALL_ENDPOINTS[@]} -eq 0 ]; then + _warn "No serving endpoints found. Enter endpoint name manually." + _prompt LLM_MODEL "Endpoint name" "" + else + _select_from LLM_MODEL "Select endpoint" "${ALL_ENDPOINTS[@]}" + fi + else + _prompt LLM_MODEL "Endpoint name" "" + fi + ;; +esac + +if [ -z "$LLM_MODEL" ]; then + _error "No LLM model selected." + exit 1 +fi + +_ok "LLM model: $LLM_MODEL" # ══════════════════════════════════════════════════════════════════════════ # Step 6: MLflow Tracing (optional) # ══════════════════════════════════════════════════════════════════════════ -APP_NAME_DEFAULT="genie-workbench" - _header "Step 6: MLflow Tracing (optional)" -_info "MLflow tracing provides observability for the Create Agent and Fix Agent." -_info "Traces are logged to an MLflow experiment in your workspace." +_info "MLflow tracing records every LLM call the Create Agent and Fix Agent make:" +_info "inputs, outputs, token counts, and latency — viewable in the MLflow" +_info "Experiments UI. Useful for debugging agent behavior; adds minor overhead." +_info "You can enable or disable this later by editing .env.deploy." echo "" MLFLOW_EXPERIMENT_ID="" -_prompt_yn ENABLE_MLFLOW "Enable MLflow tracing for agents?" "N" +_prompt_yn ENABLE_MLFLOW "Enable MLflow tracing for agents?" "Y" if [ "$ENABLE_MLFLOW" = "Y" ]; then _prompt_yn HAS_EXPERIMENT "Do you already have an MLflow experiment?" 
"N" if [ "$HAS_EXPERIMENT" = "Y" ]; then - _prompt MLFLOW_EXPERIMENT_ID "MLflow experiment ID" "" + _info "Discovering MLflow experiments..." + EXP_LABELS=() + EXP_IDS=() + while IFS='|' read -r eid elabel; do + [ -n "$eid" ] && EXP_IDS+=("$eid") && EXP_LABELS+=("$elabel") + done < <( + databricks api post /api/2.0/mlflow/experiments/search \ + --profile "$PROFILE" --json '{"max_results": 50}' -o json 2>/dev/null \ + | python3 -c " +import sys, json +try: + data = json.load(sys.stdin) + for e in data.get('experiments', []): + eid = e.get('experiment_id','') + name = e.get('name','Unnamed') + if eid: + print(f'{eid}|{name} (ID: {eid})') +except: pass +" 2>/dev/null + ) + + if [ ${#EXP_LABELS[@]} -eq 0 ]; then + _warn "No MLflow experiments found. Enter an experiment ID manually." + _prompt MLFLOW_EXPERIMENT_ID "MLflow experiment ID" "" + else + _info "Available MLflow experiments:" + _select_from EXP_LABEL "Select an experiment" "${EXP_LABELS[@]}" + MLFLOW_EXPERIMENT_ID="" + for i in "${!EXP_LABELS[@]}"; do + if [ "${EXP_LABELS[$i]}" = "$EXP_LABEL" ]; then + MLFLOW_EXPERIMENT_ID="${EXP_IDS[$i]}" + break + fi + done + fi + if [ -z "$MLFLOW_EXPERIMENT_ID" ]; then - _warn "No experiment ID provided. MLflow tracing will be disabled." + _warn "No experiment selected. MLflow tracing will be disabled." else _ok "MLflow tracing enabled (experiment: $MLFLOW_EXPERIMENT_ID)" fi else _info "Creating MLflow experiment..." - EXPERIMENT_PATH="/Shared/${APP_NAME_DEFAULT}-agent-tracing" + _info "Press Enter to accept the default path, or type a custom one." + _prompt EXPERIMENT_PATH "Experiment path" "/Shared/genie-workbench-agent-tracing" + # Build JSON safely so paths with quotes/special chars don't break the payload + MLFLOW_CREATE_JSON=$(python3 -c "import json,sys; print(json.dumps({'name': sys.argv[1]}))" "$EXPERIMENT_PATH") + # Try to create the experiment MLFLOW_EXPERIMENT_ID=$( - databricks experiments create-experiment "$EXPERIMENT_PATH" \ - --profile "$PROFILE" -o json 2>/dev/null \ + databricks api post /api/2.0/mlflow/experiments/create \ + --profile "$PROFILE" \ + --json "$MLFLOW_CREATE_JSON" -o json 2>/dev/null \ | python3 -c "import sys,json; print(json.load(sys.stdin).get('experiment_id',''))" 2>/dev/null || true ) + # If creation failed (e.g. already exists), look it up by name + if [ -z "$MLFLOW_EXPERIMENT_ID" ]; then + MLFLOW_EXPERIMENT_ID=$( + databricks api post /api/2.0/mlflow/experiments/search \ + --profile "$PROFILE" \ + --json '{"max_results": 100}' -o json 2>/dev/null \ + | EXPERIMENT_PATH="$EXPERIMENT_PATH" python3 -c " +import sys, json, os +path = os.environ['EXPERIMENT_PATH'] +try: + for e in json.load(sys.stdin).get('experiments', []): + if e.get('name','') == path: + print(e.get('experiment_id','')) + break +except: pass +" 2>/dev/null || true + ) + fi if [ -n "$MLFLOW_EXPERIMENT_ID" ]; then - _ok "Created experiment: $EXPERIMENT_PATH (ID: $MLFLOW_EXPERIMENT_ID)" + _ok "MLflow experiment ready: $EXPERIMENT_PATH (ID: $MLFLOW_EXPERIMENT_ID)" else - _warn "Could not create experiment. MLflow tracing will be disabled." + _warn "Could not create or find MLflow experiment. Tracing will be disabled." _info "You can create one manually and add the ID to .env.deploy later." 
    fi
  fi

@@ -277,26 +551,107 @@ else
 fi
 
 # ══════════════════════════════════════════════════════════════════════════
-# Step 7: Lakebase
+# Step 7: Lakebase (PostgreSQL)
 # ══════════════════════════════════════════════════════════════════════════
 _header "Step 7: Lakebase (PostgreSQL)"
 
-_info "Lakebase provides persistent storage for scan history and starred spaces."
-_info "Without it, the app uses in-memory storage (data lost on restart)."
+_info "Lakebase provides persistent PostgreSQL storage for scan history, starred"
+_info "spaces, and Create Agent sessions. Without it, the app uses in-memory"
+_info "storage and all history is lost every time the app restarts."
+echo ""
+_warn "Genie Workbench requires Lakebase Autoscaling (Serverless). Provisioned"
+_warn "Lakebase instances are not supported."
 echo ""
-_info "Enter a Lakebase Autoscaling project name below."
-_info "The deploy script will create the project, set up a Postgres role for"
-_info "the app's SP, and attach it automatically. Leave blank to skip."
+_info "If you don't have a Lakebase Autoscaling project yet, you can skip this"
+_info "step and create one later via the Databricks UI (SQL → Lakebase)."
 echo ""
 
-_prompt LAKEBASE_INSTANCE "Lakebase project name" "$APP_NAME_DEFAULT"
+_info "Discovering available Lakebase Autoscaling projects..."
+LB_NAMES=()
+while IFS= read -r name; do
+  [ -n "$name" ] && LB_NAMES+=("$name")
+done < <(
+  databricks api get /api/2.0/postgres/projects --profile "$PROFILE" -o json 2>/dev/null \
+    | python3 -c "
+import sys, json
+try:
+    data = json.load(sys.stdin)
+    projects = data if isinstance(data, list) else data.get('projects', [])
+    for proj in projects:
+        if isinstance(proj, dict):
+            # resource name is 'projects/{project_id}' — extract the bare ID
+            resource_name = proj.get('name', '')
+            project_id = resource_name.removeprefix('projects/') if resource_name else proj.get('project_id', '')
+            if project_id:
+                print(project_id)
+        else:
+            val = str(proj)
+            if val:
+                print(val)
+except: pass
+" 2>/dev/null
+)
+
+LB_OPTIONS=()
+if [ ${#LB_NAMES[@]} -gt 0 ]; then
+  LB_OPTIONS+=("${LB_NAMES[@]}")
+fi
+LB_OPTIONS+=("Skip — use in-memory fallback (history lost on restart)")
+
+if [ ${#LB_NAMES[@]} -gt 0 ]; then
+  _info "Available Lakebase Autoscaling projects:"
+else
+  _info "No Lakebase Autoscaling projects found. You can skip for now and create"
+  _info "one in the Databricks UI (SQL → Lakebase), then re-run ./scripts/deploy.sh."
+fi
+_select_from LB_CHOICE "Select a Lakebase Autoscaling project" "${LB_OPTIONS[@]}"
+
+if [[ "$LB_CHOICE" == "Skip — use in-memory fallback (history lost on restart)" ]]; then
+  LAKEBASE_INSTANCE=""
+  _warn "Skipping Lakebase. Scan history and stars will not persist across restarts."
+else
+  LAKEBASE_INSTANCE="$LB_CHOICE"
+  _ok "Lakebase Autoscaling project: $LAKEBASE_INSTANCE"
+fi
 
 # ══════════════════════════════════════════════════════════════════════════
 # Step 8: App name
 # ══════════════════════════════════════════════════════════════════════════
 _header "Step 8: App name"
 
-_prompt APP_NAME "Databricks App name" "$APP_NAME_DEFAULT"
+_info "This is the name of the Databricks App that will be created in your workspace."
+_info "Only lowercase letters, numbers, and hyphens are allowed."
+echo ""
+
+APP_NAME=""
+while true; do
+  echo -en "  Name your Databricks app (e.g. genie-workbench): "
+  read -r APP_NAME_INPUT
+  if [ -z "$APP_NAME_INPUT" ]; then
+    echo "  Please enter an app name."
+ continue + fi + + # Auto-fix: lowercase, replace spaces and underscores with hyphens, strip disallowed chars + APP_NAME_FIXED=$(echo "$APP_NAME_INPUT" | tr '[:upper:]' '[:lower:]' | tr ' _' '-' | tr -cd 'a-z0-9-') + + if [ "$APP_NAME_FIXED" = "$APP_NAME_INPUT" ]; then + # Already valid + APP_NAME="$APP_NAME_INPUT" + break + elif [ -z "$APP_NAME_FIXED" ]; then + _warn "That name contains no valid characters. Please try again." + else + _warn "App names may only contain lowercase letters, numbers, and hyphens. Suggested fix: ${BOLD}${APP_NAME_FIXED}${NC}" + _prompt_yn USE_FIXED "Use '$APP_NAME_FIXED' as your app name?" "Y" + if [ "$USE_FIXED" = "Y" ]; then + APP_NAME="$APP_NAME_FIXED" + break + fi + fi +done + +_ok "App name: $APP_NAME" # ══════════════════════════════════════════════════════════════════════════ # Step 9: Write .env.deploy @@ -308,13 +663,13 @@ cat > "$ENV_FILE" <
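For reference, the auto-fix in Step 8 above is a plain `tr` pipeline: lowercase everything, map spaces and underscores to hyphens, then delete any character outside `[a-z0-9-]`. A standalone run with an invented input:

```bash
echo "My App_Name!" | tr '[:upper:]' '[:lower:]' | tr ' _' '-' | tr -cd 'a-z0-9-'
# -> my-app-name   (the "!" is stripped; tr -cd also deletes the newline,
#    which is harmless here because install.sh captures the result via $(...))
```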