From daa569495da029a30b972cd0326663f3cecfb012 Mon Sep 17 00:00:00 2001 From: Vamshi-Microsoft Date: Tue, 12 May 2026 15:45:57 +0530 Subject: [PATCH 1/3] fix: enhance post-deployment setup to handle transient ServiceUnavailable errors Co-authored-by: Copilot --- .../workflows/job-post-deployment-setup.yml | 35 +----------------- scripts/post_deployment_setup.sh | 37 ++++++++++++++++--- 2 files changed, 33 insertions(+), 39 deletions(-) diff --git a/.github/workflows/job-post-deployment-setup.yml b/.github/workflows/job-post-deployment-setup.yml index c48b5a501..76aacd14d 100644 --- a/.github/workflows/job-post-deployment-setup.yml +++ b/.github/workflows/job-post-deployment-setup.yml @@ -43,40 +43,7 @@ jobs: run: | pip install psycopg2-binary azure-identity - - name: Run Post-Deployment Setup (Attempt 1) - id: setup1 - shell: bash - env: - RESOURCE_GROUP: ${{ inputs.RESOURCE_GROUP_NAME }} - run: | - chmod +x scripts/post_deployment_setup.sh - bash scripts/post_deployment_setup.sh "$RESOURCE_GROUP" - continue-on-error: true - - - name: Wait 20 seconds before retry - if: ${{ steps.setup1.outcome == 'failure' }} - shell: bash - run: sleep 20s - - - name: Run Post-Deployment Setup (Attempt 2) - id: setup2 - if: ${{ steps.setup1.outcome == 'failure' }} - shell: bash - env: - RESOURCE_GROUP: ${{ inputs.RESOURCE_GROUP_NAME }} - run: | - chmod +x scripts/post_deployment_setup.sh - bash scripts/post_deployment_setup.sh "$RESOURCE_GROUP" - continue-on-error: true - - - name: Wait 40 seconds before final retry - if: ${{ steps.setup2.outcome == 'failure' }} - shell: bash - run: sleep 40s - - - name: Run Post-Deployment Setup (Attempt 3) - id: setup3 - if: ${{ steps.setup2.outcome == 'failure' }} + - name: Run Post-Deployment Setup shell: bash env: RESOURCE_GROUP: ${{ inputs.RESOURCE_GROUP_NAME }} diff --git a/scripts/post_deployment_setup.sh b/scripts/post_deployment_setup.sh index f6d34c740..dacee5557 100644 --- a/scripts/post_deployment_setup.sh +++ 
b/scripts/post_deployment_setup.sh @@ -196,14 +196,34 @@ else sleep $RETRY_INTERVAL done - # Set the function key via REST API (with retries — the host runtime may not be ready yet) - echo "✓ Setting function key 'ClientKey' on '${FUNCTION_APP_NAME}'..." + # Probe the host runtime status endpoint so we don't attempt the key write + # before the Functions host has finished initializing. This significantly + # reduces transient "ServiceUnavailable from host runtime" failures, + # especially under service-principal driven deployments where the host + # may still be cold-starting after provisioning. + echo "Waiting for Functions host runtime to be ready..." SUBSCRIPTION_ID=$(az account show --query "id" -o tsv | tr -d '\r') + HOST_STATUS_URI="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${RESOURCE_GROUP}/providers/Microsoft.Web/sites/${FUNCTION_APP_NAME}/host/default/properties/status?api-version=2023-01-01" + HOST_MAX_RETRIES=20 + HOST_RETRY_INTERVAL=30 + for i in $(seq 1 $HOST_MAX_RETRIES); do + HOST_STATE=$(az rest --method get --uri "$HOST_STATUS_URI" --query "properties.state" -o tsv 2>/dev/null || true) + if [ "$HOST_STATE" = "Running" ]; then + echo "Functions host runtime is Running." + break + fi + echo " [${i}/${HOST_MAX_RETRIES}] Host runtime state: '${HOST_STATE:-unknown}'. Retrying in ${HOST_RETRY_INTERVAL}s..." + sleep $HOST_RETRY_INTERVAL + done + + # Set the function key via REST API (with retries — the host runtime may + # still report transient ServiceUnavailable during early initialization). + echo "✓ Setting function key 'ClientKey' on '${FUNCTION_APP_NAME}'..." 
URI="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${RESOURCE_GROUP}/providers/Microsoft.Web/sites/${FUNCTION_APP_NAME}/host/default/functionKeys/clientKey?api-version=2023-01-01" BODY="{\"properties\":{\"name\":\"ClientKey\",\"value\":\"${FUNCTION_KEY}\"}}" KEY_SET=false - KEY_MAX_RETRIES=5 + KEY_MAX_RETRIES=20 KEY_RETRY_INTERVAL=30 for attempt in $(seq 1 $KEY_MAX_RETRIES); do REST_ERR=$(az rest --method put --uri "$URI" --body "$BODY" 2>&1 > /dev/null) || true @@ -211,8 +231,14 @@ else KEY_SET=true break fi - echo " [${attempt}/${KEY_MAX_RETRIES}] Host runtime not ready yet. Retrying in ${KEY_RETRY_INTERVAL}s..." - echo " $REST_ERR" + # Recognize the well-known transient "ServiceUnavailable from host runtime" + # error and continue retrying. Other errors are also retried but logged. + if echo "$REST_ERR" | grep -qi "ServiceUnavailable"; then + echo " [${attempt}/${KEY_MAX_RETRIES}] Host runtime returned ServiceUnavailable. Retrying in ${KEY_RETRY_INTERVAL}s..." + else + echo " [${attempt}/${KEY_MAX_RETRIES}] Key set failed. Retrying in ${KEY_RETRY_INTERVAL}s..." + echo " $REST_ERR" + fi sleep $KEY_RETRY_INTERVAL done @@ -220,6 +246,7 @@ else echo "✓ Function key set successfully." else echo "✗ ERROR: Failed to set function key on '${FUNCTION_APP_NAME}' after ${KEY_MAX_RETRIES} attempts." 
>&2 + echo " Last error: $REST_ERR" >&2 restore_network_access exit 1 fi From 279aba10f68a206ecb5ddd2e4f522d7a53f91c2f Mon Sep 17 00:00:00 2001 From: Vamshi-Microsoft Date: Wed, 13 May 2026 10:42:06 +0530 Subject: [PATCH 2/3] fix: improve handling of transient ServiceUnavailable errors during function app setup --- scripts/post_deployment_setup.sh | 53 +++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/scripts/post_deployment_setup.sh b/scripts/post_deployment_setup.sh index dacee5557..11c29ecf1 100644 --- a/scripts/post_deployment_setup.sh +++ b/scripts/post_deployment_setup.sh @@ -201,11 +201,31 @@ else # reduces transient "ServiceUnavailable from host runtime" failures, # especially under service-principal driven deployments where the host # may still be cold-starting after provisioning. - echo "Waiting for Functions host runtime to be ready..." SUBSCRIPTION_ID=$(az account show --query "id" -o tsv | tr -d '\r') + + # Restart the function app before setting the key. + # After fresh provisioning, the Functions host frequently lands in a state where + # the ARM-proxied keystore endpoints return InternalServerError/ServiceUnavailable. + # This is especially common under service-principal-driven deployments because + # role assignments (e.g. Storage Blob Data Owner for identity-based AzureWebJobsStorage) + # may not have fully propagated by the time the host first booted. A restart forces + # the host to re-initialize against the now-valid configuration. + echo "✓ Restarting function app to ensure host runtime is in a clean state..." 
+ az functionapp restart --name "$FUNCTION_APP_NAME" --resource-group "$RESOURCE_GROUP" >/dev/null 2>&1 || true + sleep 20 + + # Wait for site to report Running again + for i in $(seq 1 20); do + STATE=$(az functionapp show --name "$FUNCTION_APP_NAME" --resource-group "$RESOURCE_GROUP" --query "state" -o tsv 2>/dev/null || true) + [ "$STATE" = "Running" ] && break + echo " [${i}/20] Function app not Running after restart. Retrying in 15s..." + sleep 15 + done + + echo "Waiting for Functions host runtime to be ready..." HOST_STATUS_URI="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${RESOURCE_GROUP}/providers/Microsoft.Web/sites/${FUNCTION_APP_NAME}/host/default/properties/status?api-version=2023-01-01" - HOST_MAX_RETRIES=20 - HOST_RETRY_INTERVAL=30 + HOST_MAX_RETRIES=30 + HOST_RETRY_INTERVAL=20 for i in $(seq 1 $HOST_MAX_RETRIES); do HOST_STATE=$(az rest --method get --uri "$HOST_STATUS_URI" --query "properties.state" -o tsv 2>/dev/null || true) if [ "$HOST_STATE" = "Running" ]; then @@ -216,8 +236,11 @@ else sleep $HOST_RETRY_INTERVAL done + # Warm up the host with an HTTP probe (best-effort, ignores result). + curl -fsS -o /dev/null -m 30 "https://${FUNCTION_APP_NAME}.azurewebsites.net/" >/dev/null 2>&1 || true + # Set the function key via REST API (with retries — the host runtime may - # still report transient ServiceUnavailable during early initialization). + # still report transient errors during early initialization). echo "✓ Setting function key 'ClientKey' on '${FUNCTION_APP_NAME}'..." URI="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${RESOURCE_GROUP}/providers/Microsoft.Web/sites/${FUNCTION_APP_NAME}/host/default/functionKeys/clientKey?api-version=2023-01-01" BODY="{\"properties\":{\"name\":\"ClientKey\",\"value\":\"${FUNCTION_KEY}\"}}" @@ -231,10 +254,16 @@ else KEY_SET=true break fi - # Recognize the well-known transient "ServiceUnavailable from host runtime" - # error and continue retrying. Other errors are also retried but logged. 
- if echo "$REST_ERR" | grep -qi "ServiceUnavailable"; then - echo " [${attempt}/${KEY_MAX_RETRIES}] Host runtime returned ServiceUnavailable. Retrying in ${KEY_RETRY_INTERVAL}s..." + # Recognize the well-known transient "ServiceUnavailable / InternalServerError + # from host runtime" error and continue retrying. Other errors are also retried but logged. + if echo "$REST_ERR" | grep -qiE "ServiceUnavailable|InternalServerError"; then + echo " [${attempt}/${KEY_MAX_RETRIES}] Host runtime transient error. Retrying in ${KEY_RETRY_INTERVAL}s..." + # Every 5 failed attempts, restart the function app again to nudge the host out of a bad state. + if [ $((attempt % 5)) -eq 0 ] && [ $attempt -lt $KEY_MAX_RETRIES ]; then + echo " → Re-restarting function app to clear stuck host state..." + az functionapp restart --name "$FUNCTION_APP_NAME" --resource-group "$RESOURCE_GROUP" >/dev/null 2>&1 || true + sleep 30 + fi else echo " [${attempt}/${KEY_MAX_RETRIES}] Key set failed. Retrying in ${KEY_RETRY_INTERVAL}s..." echo " $REST_ERR" @@ -247,6 +276,14 @@ else else echo "✗ ERROR: Failed to set function key on '${FUNCTION_APP_NAME}' after ${KEY_MAX_RETRIES} attempts." >&2 echo " Last error: $REST_ERR" >&2 + echo "" >&2 + echo " Manual workaround:" >&2 + echo " 1. In the Azure Portal, open Function App '${FUNCTION_APP_NAME}' → Functions → App keys." >&2 + echo " 2. Add a Host key named 'ClientKey' with the value of the 'FUNCTION-KEY' secret" >&2 + echo " in Key Vault '${KEY_VAULT_NAME}'." >&2 + echo " 3. 
Or run:" >&2 + echo " az functionapp keys set --name ${FUNCTION_APP_NAME} --resource-group ${RESOURCE_GROUP} \\" >&2 + echo " --key-type functionKeys --key-name ClientKey --key-value <FUNCTION-KEY-secret-value>" >&2 restore_network_access exit 1 fi From 71d53ed3a5014683d195c5c626d92ef756d77536 Mon Sep 17 00:00:00 2001 From: Vamshi-Microsoft Date: Wed, 13 May 2026 16:09:19 +0530 Subject: [PATCH 3/3] Update comments Co-authored-by: Copilot --- scripts/post_deployment_setup.sh | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/scripts/post_deployment_setup.sh b/scripts/post_deployment_setup.sh index 11c29ecf1..b0b125e1b 100644 --- a/scripts/post_deployment_setup.sh +++ b/scripts/post_deployment_setup.sh @@ -196,20 +196,12 @@ else sleep $RETRY_INTERVAL done - # Probe the host runtime status endpoint so we don't attempt the key write - # before the Functions host has finished initializing. This significantly - # reduces transient "ServiceUnavailable from host runtime" failures, - # especially under service-principal driven deployments where the host - # may still be cold-starting after provisioning. + # Force host re-init: identity-based AzureWebJobsStorage role assignments + # can land after the host's first boot, leaving the ARM-proxied keystore + # endpoint stuck on InternalServerError/ServiceUnavailable (most common + # under service-principal deployments). SUBSCRIPTION_ID=$(az account show --query "id" -o tsv | tr -d '\r') - # Restart the function app before setting the key. - # After fresh provisioning, the Functions host frequently lands in a state where - # the ARM-proxied keystore endpoints return InternalServerError/ServiceUnavailable. - # This is especially common under service-principal-driven deployments because - # role assignments (e.g. Storage Blob Data Owner for identity-based AzureWebJobsStorage) - # may not have fully propagated by the time the host first booted. 
A restart forces - # the host to re-initialize against the now-valid configuration. echo "✓ Restarting function app to ensure host runtime is in a clean state..." az functionapp restart --name "$FUNCTION_APP_NAME" --resource-group "$RESOURCE_GROUP" >/dev/null 2>&1 || true sleep 20 @@ -236,11 +228,10 @@ else sleep $HOST_RETRY_INTERVAL done - # Warm up the host with an HTTP probe (best-effort, ignores result). + # Warm up the host (best-effort). curl -fsS -o /dev/null -m 30 "https://${FUNCTION_APP_NAME}.azurewebsites.net/" >/dev/null 2>&1 || true - # Set the function key via REST API (with retries — the host runtime may - # still report transient errors during early initialization). + # Set the function key via REST API (with retries — the host runtime may not be ready yet) echo "✓ Setting function key 'ClientKey' on '${FUNCTION_APP_NAME}'..." URI="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${RESOURCE_GROUP}/providers/Microsoft.Web/sites/${FUNCTION_APP_NAME}/host/default/functionKeys/clientKey?api-version=2023-01-01" BODY="{\"properties\":{\"name\":\"ClientKey\",\"value\":\"${FUNCTION_KEY}\"}}" @@ -254,11 +245,10 @@ else KEY_SET=true break fi - # Recognize the well-known transient "ServiceUnavailable / InternalServerError - # from host runtime" error and continue retrying. Other errors are also retried but logged. + # Treat ServiceUnavailable / InternalServerError from the host runtime as transient. if echo "$REST_ERR" | grep -qiE "ServiceUnavailable|InternalServerError"; then echo " [${attempt}/${KEY_MAX_RETRIES}] Host runtime transient error. Retrying in ${KEY_RETRY_INTERVAL}s..." - # Every 5 failed attempts, restart the function app again to nudge the host out of a bad state. + # Every 5 attempts, restart to nudge a stuck host. if [ $((attempt % 5)) -eq 0 ] && [ $attempt -lt $KEY_MAX_RETRIES ]; then echo " → Re-restarting function app to clear stuck host state..." 
az functionapp restart --name "$FUNCTION_APP_NAME" --resource-group "$RESOURCE_GROUP" >/dev/null 2>&1 || true