Skip to content

Uptime Monitor

Uptime Monitor #5

# Workflow that periodically probes the backend health endpoints and tracks
# outages through a GitHub issue (see the inline script below).
name: Uptime Monitor
on:
schedule:
- cron: '*/5 * * * *' # Run every 5 minutes
workflow_dispatch: # Allow manual trigger
# All runs share one concurrency group; with cancel-in-progress: false a newly
# triggered run does not cancel a run that is already executing.
concurrency:
group: uptime-monitor
cancel-in-progress: false
# The script creates, comments on, and closes issues, hence issues: write.
permissions:
issues: write
jobs:
check-uptime:
name: Check Backend Health
runs-on: ubuntu-latest
steps:
- name: Uptime Check Script
uses: actions/github-script@v7
env:
# Optional chat webhook; the script skips notifications when this is empty.
ALERT_WEBHOOK_URL: ${{ secrets.ALERT_WEBHOOK_URL }}
with:
script: |
// Health endpoints to probe; `timeout` is the per-request limit in milliseconds.
const endpoints = [
  {
    env: "Production",
    url: "https://api.tinyhumans.ai/health",
    timeout: 10_000
  },
  {
    env: "Staging",
    url: "https://staging-api.tinyhumans.ai/health",
    timeout: 15_000
  }
];
// Extra attempts made after the first request fails.
const maxRetries = 3;
// Pause between attempts, in milliseconds.
const retryDelay = 5_000;
// Title used to open the outage issue and to find it again on later runs.
const issueTitle = "CRITICAL: Backend Outage Detected";
/**
 * Request a URL, retrying until it answers with HTTP 200 or attempts run out.
 *
 * Network errors and timeouts are caught and reported through the returned
 * object instead of being thrown.
 *
 * @param {string} url - Endpoint to request.
 * @param {number} timeout - Per-attempt limit in milliseconds; the request is aborted after it.
 * @param {number} retries - Extra attempts after the first; negative or non-integer values count as 0.
 * @param {number} [delayMs=retryDelay] - Pause between attempts, in milliseconds.
 * @returns {Promise<{ok: boolean, status?: number, error?: string}>}
 *   `{ ok: true, status }` for the first 200 response; otherwise the status
 *   or error message of the final attempt with `ok: false`.
 */
async function fetchWithRetry(url, timeout, retries, delayMs = retryDelay) {
  // Always make at least one attempt so callers never receive `undefined`.
  const lastAttempt = Number.isInteger(retries) && retries > 0 ? retries : 0;
  for (let attempt = 0; attempt <= lastAttempt; attempt++) {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), timeout);
    try {
      const response = await fetch(url, { signal: controller.signal });
      if (response.status === 200) return { ok: true, status: response.status };
      if (attempt === lastAttempt) return { ok: false, status: response.status };
    } catch (error) {
      if (attempt === lastAttempt) return { ok: false, error: error.message };
    } finally {
      // Clear the abort timer on every path, including when fetch throws.
      clearTimeout(timer);
    }
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
}
// Probe every endpoint concurrently: the checks are independent, so a slow or
// failing endpoint should not delay the others. fetchWithRetry reports
// failures through its result object, so Promise.all only collects outcomes.
const checks = await Promise.all(
  endpoints.map(async (ep) => {
    const result = await fetchWithRetry(ep.url, ep.timeout, maxRetries);
    // Timestamp is taken when this endpoint's own check completes.
    return { ep, result, checkedAt: new Date().toISOString() };
  })
);
// One markdown bullet per failing endpoint, in the order endpoints are listed.
const failureDetails = checks
  .filter(({ result }) => !result.ok)
  .map(({ ep, result, checkedAt }) =>
    `- **${ep.env}**: ${ep.url} (Status: ${result.status ?? result.error}) at ${checkedAt}`
  );
// Healthy only when no endpoint produced a failure entry.
const allHealthy = failureDetails.length === 0;
// Look for an outage issue that an earlier run opened and left open.
// github.paginate walks every result page; a single listForRepo call returns
// only the first page, so an older outage issue could be missed and duplicated.
const issues = await github.paginate(github.rest.issues.listForRepo, {
  owner: context.repo.owner,
  repo: context.repo.repo,
  state: 'open',
  creator: 'github-actions[bot]',
  per_page: 100
});
// The issues endpoint also returns pull requests; only a real issue with the
// exact outage title counts as the tracking issue.
const openIssue = issues.find((i) => !i.pull_request && i.title === issueTitle);
// Optional chat webhook, read from the step's environment.
const webhookUrl = process.env.ALERT_WEBHOOK_URL;
/**
 * Post a notification to the configured alert webhook, if one is set.
 *
 * Best-effort: delivery problems are logged and never thrown, so a broken
 * webhook does not fail the monitoring run. The webhook URL itself is never
 * logged because it comes from a secret.
 *
 * @param {string} message - Text to deliver.
 * @returns {Promise<void>}
 */
async function sendWebhook(message) {
  if (!webhookUrl) return;
  try {
    // Discord uses { content } and Slack uses { text }; detect by URL
    const payload = webhookUrl.includes('discord')
      ? { content: message }
      : { text: message };
    const response = await fetch(webhookUrl, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
      // Bound the request so a hung webhook cannot stall the job.
      signal: AbortSignal.timeout(10000)
    });
    // fetch resolves on HTTP error statuses, so a rejected delivery has to be
    // detected explicitly.
    if (!response.ok) {
      console.error(`Webhook responded with HTTP ${response.status}`);
    }
  } catch (e) {
    console.error("Failed to send webhook", e);
  }
}
// Builds the parameter object for an issues API call on the current repository.
const repoParams = (extra) => ({
  owner: context.repo.owner,
  repo: context.repo.repo,
  ...extra
});
if (!allHealthy) {
  // Outage: open the tracking issue and alert once, then fail the run.
  const issueBody = `The automated uptime monitor detected an outage in the backend.\n\n### Failing Endpoints:\n${failureDetails.join('\n')}\n\nPlease check the logs and follow the runbook in \`docs/OPERATIONS.md\`.`;
  if (openIssue) {
    // An outage issue is already open; neither the issue nor the webhook is touched again.
    console.log("Issue already exists, skipping duplicate creation.");
    core.setFailed("Backend health check failed (ongoing).");
  } else {
    await github.rest.issues.create(
      repoParams({
        title: issueTitle,
        body: issueBody,
        labels: ['bug', 'critical', 'ops']
      })
    );
    await sendWebhook(`🚨 **${issueTitle}**\n${issueBody}`);
    core.setFailed("Backend health check failed.");
  }
} else if (!openIssue) {
  // Healthy and nothing to resolve.
  console.log("All services healthy.");
} else {
  // Recovery: comment on the open outage issue, close it, then notify.
  const recoveryMsg = `✅ **RESOLVED**: All backend endpoints are now healthy and reachable.\n\nMonitors reported recovery at ${new Date().toISOString()}.`;
  const issueNumber = openIssue.number;
  await github.rest.issues.createComment(
    repoParams({ issue_number: issueNumber, body: recoveryMsg })
  );
  await github.rest.issues.update(
    repoParams({ issue_number: issueNumber, state: 'closed' })
  );
  await sendWebhook(recoveryMsg);
  console.log("Services recovered. Issue closed.");
}