diff --git a/Dockerfile b/Dockerfile index 1485535..d98356a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ -FROM alpine:3.13.5 +FROM alpine:3.17.3 -RUN apk add --no-cache curl jq +RUN apk add --no-cache curl jq bash COPY docker-entrypoint / ENTRYPOINT ["/docker-entrypoint"] @@ -9,6 +9,7 @@ ENV AUTOHEAL_CONTAINER_LABEL=autoheal \ AUTOHEAL_START_PERIOD=0 \ AUTOHEAL_INTERVAL=5 \ AUTOHEAL_DEFAULT_STOP_TIMEOUT=10 \ + AUTOHEAL_RETRIES=5 \ DOCKER_SOCK=/var/run/docker.sock \ CURL_TIMEOUT=30 \ WEBHOOK_URL="" diff --git a/README.md b/README.md index f9bcdcb..d6d1033 100644 --- a/README.md +++ b/README.md @@ -47,25 +47,26 @@ The certificates, and keys need these names: ### Change Timezone If you need the timezone to match the local machine, you can map the `/etc/localtime` into the container. -``` +```bash docker run ... -v /etc/localtime:/etc/localtime:ro ``` ## ENV Defaults -``` +```bash AUTOHEAL_CONTAINER_LABEL=autoheal -AUTOHEAL_INTERVAL=5 # check every 5 seconds -AUTOHEAL_START_PERIOD=0 # wait 0 seconds before first health check -AUTOHEAL_DEFAULT_STOP_TIMEOUT=10 # Docker waits max 10 seconds (the Docker default) for a container to stop before killing during restarts (container overridable via label, see below) -DOCKER_SOCK=/var/run/docker.sock # Unix socket for curl requests to Docker API -CURL_TIMEOUT=30 # --max-time seconds for curl requests to Docker API -WEBHOOK_URL="" # post message to the webhook if a container was restarted (or restart failed) +AUTOHEAL_INTERVAL=5 # check every 5 seconds +AUTOHEAL_START_PERIOD=0 # wait 0 seconds before first health check +AUTOHEAL_DEFAULT_STOP_TIMEOUT=10 # Docker waits max 10 seconds (the Docker default) for a container to stop before killing during restarts (container overridable via label, see below) +AUTOHEAL_RETRIES=5 # Number of retries for restarting an unhealthy container +DOCKER_SOCK=/var/run/docker.sock # Unix socket for curl requests to Docker API +CURL_TIMEOUT=30 # --max-time seconds for curl requests to Docker 
API +WEBHOOK_URL="" # post message to the webhook if a container was restarted (or restart failed) ``` ### Optional Container Labels -``` -autoheal.stop.timeout=20 # Per containers override for stop timeout seconds during restart +```bash +autoheal.stop.timeout=20 # Per containers override for stop timeout seconds during restart ``` ## Testing diff --git a/docker-entrypoint b/docker-entrypoint index eb482bf..4ab4cd6 100755 --- a/docker-entrypoint +++ b/docker-entrypoint @@ -1,4 +1,4 @@ -#!/usr/bin/env sh +#!/usr/bin/env bash set -e # shellcheck disable=2039 @@ -25,6 +25,7 @@ AUTOHEAL_CONTAINER_LABEL=${AUTOHEAL_CONTAINER_LABEL:-autoheal} AUTOHEAL_START_PERIOD=${AUTOHEAL_START_PERIOD:-0} AUTOHEAL_INTERVAL=${AUTOHEAL_INTERVAL:-5} AUTOHEAL_DEFAULT_STOP_TIMEOUT=${AUTOHEAL_DEFAULT_STOP_TIMEOUT:-10} +AUTOHEAL_RETRIES=${AUTOHEAL_RETRIES:-5} docker_curl() { curl --max-time "${CURL_TIMEOUT}" --no-buffer -s \ @@ -73,7 +74,7 @@ generate_webhook_payload() { cat <&2 sleep "$AUTOHEAL_START_PERIOD" fi - + + declare -A SICK_CONTAINERS + while true do STOP_TIMEOUT=".Labels[\"autoheal.stop.timeout\"] // $AUTOHEAL_DEFAULT_STOP_TIMEOUT" - get_container_info | \ + shopt -s lastpipe; set +m; get_container_info | \ jq -r "foreach .[] as \$CONTAINER([];[]; \$CONTAINER | .Id, .Names[0], .State, ${STOP_TIMEOUT})" | \ while read -r CONTAINER_ID && read -r CONTAINER_NAME && read -r CONTAINER_STATE && read -r TIMEOUT do @@ -109,18 +112,48 @@ if [ "$1" = "autoheal" ] && [ -e "$DOCKER_SOCK" ];then echo "$DATE Container name of (${CONTAINER_SHORT_ID}) is null, which implies container does not exist - don't restart" >&2 elif [ "$CONTAINER_STATE" = "restarting" ] then - echo "$DATE Container $CONTAINER_NAME (${CONTAINER_SHORT_ID}) found to be restarting - don't restart" + echo "$DATE Container $CONTAINER_NAME (${CONTAINER_SHORT_ID}) found to be restarting - don't restart" >&2 else - echo "$DATE Container $CONTAINER_NAME (${CONTAINER_SHORT_ID}) found to be unhealthy - Restarting container now with 
${TIMEOUT}s timeout" - if ! restart_container "$CONTAINER_ID" "$TIMEOUT" - then - echo "$DATE Restarting container $CONTAINER_SHORT_ID failed" >&2 - notify_webhook "Container ${CONTAINER_NAME:1} (${CONTAINER_SHORT_ID}) found to be unhealthy. Failed to restart the container!" & - else - notify_webhook "Container ${CONTAINER_NAME:1} (${CONTAINER_SHORT_ID}) found to be unhealthy. Successfully restarted the container!" & + if [ ${SICK_CONTAINERS[$CONTAINER_ID]+_} ]; then + echo "$DATE Container $CONTAINER_NAME (${CONTAINER_SHORT_ID}) already in a queue" >&2 + else + echo "$DATE Container $CONTAINER_NAME (${CONTAINER_SHORT_ID}) found to be unhealthy - adding to queue" >&2 + SICK_CONTAINERS[$CONTAINER_ID]="CONTAINER_ID=\"$CONTAINER_ID\" CONTAINER_SHORT_ID=\"$CONTAINER_SHORT_ID\" CONTAINER_NAME=\"$CONTAINER_NAME\" CONTAINER_STATE=\"$CONTAINER_STATE\" TIMEOUT=\"$TIMEOUT\" RETRY=0" + fi + fi + done + + declare -A SICK_CONTAINERS_NEW + + for SICK_CONTAINER in "${SICK_CONTAINERS[@]}"; do + eval "$SICK_CONTAINER" + + if ! restart_container "$CONTAINER_ID" "$TIMEOUT" + then + echo "$DATE Restarting container $CONTAINER_SHORT_ID failed" >&2 + notify_webhook "Container ${CONTAINER_NAME:1} (${CONTAINER_SHORT_ID}) found to be unhealthy. Failed to restart the container!" & + + RETRY_NEW=$((RETRY+1)) + if [ $RETRY_NEW -lt $AUTOHEAL_RETRIES ]; then + SICK_CONTAINER_NEW="${SICK_CONTAINER/RETRY=$RETRY/RETRY=$RETRY_NEW}" + SICK_CONTAINERS_NEW[$CONTAINER_ID]="$SICK_CONTAINER_NEW" + else + echo "$DATE All attempts to restart the container $CONTAINER_SHORT_ID have failed" >&2 fi + else + echo "$DATE Container $CONTAINER_NAME (${CONTAINER_SHORT_ID}) - successfully restarted the container" >&2 + notify_webhook "Container ${CONTAINER_NAME:1} (${CONTAINER_SHORT_ID}) found to be unhealthy. Successfully restarted the container!" 
& fi done + + unset SICK_CONTAINERS + declare -A SICK_CONTAINERS + + for idx in "${!SICK_CONTAINERS_NEW[@]}"; do + SICK_CONTAINERS[$idx]="${SICK_CONTAINERS_NEW[$idx]}" + done + unset SICK_CONTAINERS_NEW + sleep "$AUTOHEAL_INTERVAL" done