Skip to content

Commit 352d1d6

Browse files
committed
Deduplicate health check alerts
1 parent 3ecbcc3 commit 352d1d6

2 files changed

Lines changed: 95 additions & 55 deletions

File tree

.github/workflows/health-check.yml

Lines changed: 91 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -2,70 +2,110 @@ name: Health Check
22

33
on:
44
schedule:
5-
- cron: '*/5 * * * *'
5+
- cron: "0 * * * *"
66
workflow_dispatch:
77

8+
concurrency:
9+
group: health-check
10+
cancel-in-progress: true
11+
812
jobs:
913
health-check:
1014
runs-on: ubuntu-latest
1115
permissions:
1216
issues: write
17+
contents: read
1318

1419
steps:
15-
- name: Check Production Environment
16-
id: check_prod
20+
- name: Check endpoints
21+
id: check
1722
run: |
18-
set +e # Allow curl to fail without exiting the script
19-
CURL_OUTPUT=$(curl -s -o /dev/null -w "%{http_code}" ${{ secrets.PROD_URL }}/api/health 2>&1)
20-
HTTP_CODE=$(echo $CURL_OUTPUT | tail -n 1)
21-
CURL_ERROR=$(echo $CURL_OUTPUT | head -n -1)
22-
23-
if [ "$HTTP_CODE" -ne 200 ]; then
24-
echo "Production environment is down! HTTP Code: $HTTP_CODE"
25-
echo "PROD_DOWN=true" >> $GITHUB_ENV
26-
echo "PROD_ERROR=$CURL_ERROR" >> $GITHUB_ENV
27-
else
28-
echo "Production environment is up. HTTP Code: $HTTP_CODE"
29-
fi
23+
set +e
24+
failures=""
3025
31-
- name: Check Development Environment
32-
id: check_dev
33-
run: |
34-
set +e # Allow curl to fail without exiting the script
35-
CURL_OUTPUT=$(curl -s -o /dev/null -w "%{http_code}" ${{ secrets.DEV_URL }}/api/health 2>&1)
36-
HTTP_CODE=$(echo $CURL_OUTPUT | tail -n 1)
37-
CURL_ERROR=$(echo $CURL_OUTPUT | head -n -1)
38-
39-
if [ "$HTTP_CODE" -ne 200 ]; then
40-
echo "Development environment is down! HTTP Code: $HTTP_CODE"
41-
echo "DEV_DOWN=true" >> $GITHUB_ENV
42-
echo "DEV_ERROR=$CURL_ERROR" >> $GITHUB_ENV
43-
else
44-
echo "Development environment is up. HTTP Code: $HTTP_CODE"
26+
check_url() {
27+
name="$1"
28+
url="$2"
29+
30+
if [ -z "$url" ]; then
31+
echo "::notice::$name URL is not configured; skipping."
32+
return
33+
fi
34+
35+
http_code=$(curl --silent --show-error --output /dev/null --write-out "%{http_code}" --max-time 10 "$url/api/health")
36+
curl_status=$?
37+
38+
if [ "$curl_status" -ne 0 ] || [ "$http_code" -ne 200 ]; then
39+
failures="${failures}${name}: ${url}/api/health returned ${http_code} (curl exit ${curl_status})%0A"
40+
echo "::error::$name health check failed with HTTP $http_code"
41+
else
42+
echo "$name health check passed with HTTP $http_code"
43+
fi
44+
}
45+
46+
check_url "production" "${{ secrets.PROD_URL }}"
47+
check_url "development" "${{ secrets.DEV_URL }}"
48+
49+
if [ -n "$failures" ]; then
50+
echo "failed=true" >> "$GITHUB_OUTPUT"
51+
echo "failures=$failures" >> "$GITHUB_OUTPUT"
52+
exit 1
4553
fi
4654
47-
- name: Create Issue on Failure
48-
if: env.PROD_DOWN == 'true' || env.DEV_DOWN == 'true'
55+
echo "failed=false" >> "$GITHUB_OUTPUT"
56+
57+
- name: Find existing health-check issue
58+
id: existing_issue
59+
if: failure() && steps.check.outputs.failed == 'true'
4960
uses: actions/github-script@v7
5061
with:
51-
github-token: ${{ secrets.GITHUB_TOKEN }}
5262
script: |
53-
let body = ''
54-
if (process.env.PROD_DOWN === 'true') {
55-
body += 'Production environment is down!\n'
56-
if (process.env.PROD_ERROR) {
57-
body += `cURL Error: ${process.env.PROD_ERROR}\n`
58-
}
59-
}
60-
if (process.env.DEV_DOWN === 'true') {
61-
body += 'Development environment is down!\n'
62-
if (process.env.DEV_ERROR) {
63-
body += `cURL Error: ${process.env.DEV_ERROR}\n`
64-
}
65-
}
66-
github.rest.issues.create({
67-
owner: context.repo.owner,
68-
repo: context.repo.repo,
69-
title: 'Application Health Check Failed',
70-
body: body
71-
});
63+
const { owner, repo } = context.repo;
64+
const { data } = await github.rest.search.issuesAndPullRequests({
65+
q: `repo:${owner}/${repo} is:issue is:open in:title "Application Health Check Failed"`,
66+
per_page: 1,
67+
});
68+
core.setOutput("number", data.items[0]?.number || "");
69+
70+
- name: Create health-check issue
71+
if: failure() && steps.check.outputs.failed == 'true' && steps.existing_issue.outputs.number == ''
72+
uses: actions/github-script@v7
73+
with:
74+
script: |
75+
const body = [
76+
"The scheduled health check failed.",
77+
"",
78+
"${{ steps.check.outputs.failures }}".replaceAll("%0A", "\n"),
79+
"",
80+
`Workflow run: ${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
81+
].join("\n");
82+
83+
await github.rest.issues.create({
84+
...context.repo,
85+
title: "Application Health Check Failed",
86+
body,
87+
});
88+
89+
- name: Update existing health-check issue
90+
if: failure() && steps.check.outputs.failed == 'true' && steps.existing_issue.outputs.number != ''
91+
uses: actions/github-script@v7
92+
with:
93+
script: |
94+
const body = [
95+
"The scheduled health check is still failing.",
96+
"",
97+
"${{ steps.check.outputs.failures }}".replaceAll("%0A", "\n"),
98+
"",
99+
`Latest workflow run: ${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
100+
].join("\n");
101+
102+
await github.rest.issues.createComment({
103+
...context.repo,
104+
issue_number: Number("${{ steps.existing_issue.outputs.number }}"),
105+
body,
106+
});
107+
108+
- name: Fail workflow after reporting
109+
if: failure() && steps.check.outputs.failed == 'true'
110+
run: |
111+
exit 1

README.md

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
# github-actions-ec2-pipeline
22

33
> GitHub Actions pipeline that builds, tests, versions, and deploys a Node.js app
4-
> to AWS EC2 — zero manual steps after initial setup.
4+
> to AWS EC2 with PM2 reload, rollback support, and scheduled health checks.
55
66
[![CI Pipeline](https://github.com/darestack/github-actions-ec2-pipeline/actions/workflows/ci.yml/badge.svg)](https://github.com/darestack/github-actions-ec2-pipeline/actions/workflows/ci.yml)
77

@@ -25,10 +25,10 @@ Push to main / feature branch
2525

2626
| Decision | Implementation | Why |
2727
|---|---|---|
28-
| **Zero-downtime deploy** | `pm2 reload` + atomic symlink swap (`current → release-timestamp`) | App stays up during deployment; rollback is a symlink change |
28+
| **Low-interruption deploy** | `pm2 reload` + atomic symlink swap (`current -> release-timestamp`) | Keeps deploy behavior predictable and rollback-friendly |
2929
| **Auto-rollback** | `deploy.sh` keeps previous release; restores on failure | No manual intervention if deploy breaks the app |
3030
| **Automatic versioning** | `bump-version` job creates `v1.x.x` tags on every merge to main | Release history is automatic; no manual tagging |
31-
| **Health check monitoring** | Scheduled workflow every 5 min; creates a GitHub Issue on failure | On-call alert without a third-party service |
31+
| **Health check monitoring** | Scheduled workflow runs hourly and reuses one open health-check issue while an outage is active | Avoids duplicate alert noise and keeps incident state readable |
3232
| **Separate CI / CD workflows** | `ci.yml` + `release.yml` split by tag trigger | CD only runs on verified, tagged builds — not every push |
3333

3434
---
@@ -50,7 +50,7 @@ Triggers: new tag matching `v*`
5050
2. **`create-release`**: publishes GitHub Release with tag name
5151

5252
### `health-check.yml` — Uptime Monitoring
53-
Runs every 5 minutes. Hits `/api/health`. Creates a GitHub Issue if the check fails.
53+
Runs hourly. Hits `/api/health` for configured environments. If a check fails, the workflow creates one health-check issue or comments on the existing open issue instead of creating duplicates.
5454

5555
---
5656

0 commit comments

Comments
 (0)