Uptime Monitor #5
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scheduled health probe for the backend. The inline script below checks each
# endpoint, opens a GitHub issue on outage and closes it again on recovery.
name: Uptime Monitor

on:
  schedule:
    - cron: '*/5 * * * *' # Run every 5 minutes
  workflow_dispatch: # Allow manual trigger

# All runs share one concurrency group and in-progress runs are never
# cancelled, so a new run waits for the current one instead of interrupting
# it half-way through creating or closing the outage issue.
concurrency:
  group: uptime-monitor
  cancel-in-progress: false

# The script only needs to list, create, comment on and close issues.
permissions:
  issues: write

jobs:
  check-uptime:
    name: Check Backend Health
    runs-on: ubuntu-latest
    # Bound the job: without this a hung network request could occupy the
    # runner until GitHub's default job timeout (360 minutes) and, because of
    # the shared concurrency group, hold up every later monitor run.
    timeout-minutes: 5
    steps:
      - name: Uptime Check Script
        uses: actions/github-script@v7
        env:
          # Optional Slack/Discord webhook; read by the script via process.env.
          ALERT_WEBHOOK_URL: ${{ secrets.ALERT_WEBHOOK_URL }}
        with:
          script: |
// Health endpoints to probe. Each entry is [environment label, URL, timeout];
// the timeout is the per-request abort deadline in milliseconds.
const endpoints = [
  ["Production", "https://api.tinyhumans.ai/health", 10000],
  ["Staging", "https://staging-api.tinyhumans.ai/health", 15000],
].map(([env, url, timeout]) => ({ env, url, timeout }));

// Number of retries after the first attempt (passed to fetchWithRetry).
const maxRetries = 3;

// Pause between attempts, in milliseconds.
const retryDelay = 5000;

// Title used both to create the outage issue and to find it again later.
const issueTitle = "CRITICAL: Backend Outage Detected";
/**
 * GET `url` until it answers with HTTP 200 or the attempts are used up.
 *
 * One attempt is always made, followed by up to `retries` further attempts,
 * with a pause of `delayMs` between consecutive attempts (no pause after the
 * last one). Each attempt is aborted after `timeout` milliseconds.
 *
 * @param {string} url - Endpoint to probe.
 * @param {number} timeout - Per-attempt abort deadline in milliseconds.
 * @param {number} retries - Extra attempts after the first; negative,
 *   fractional or non-numeric values are floored and clamped to 0 or more.
 * @param {number} [delayMs=retryDelay] - Pause between attempts in ms.
 * @returns {Promise<{ok: true, status: number} | {ok: false, status: number} | {ok: false, error: string}>}
 *   The outcome of the 200 response, or of the final failed attempt. Never
 *   rejects for request failures and never resolves to `undefined`.
 */
async function fetchWithRetry(url, timeout, retries, delayMs = retryDelay) {
  // Normalise the retry budget so the loop always runs at least once and the
  // "last attempt" comparison cannot be skipped by a fractional value.
  const lastAttempt = Math.max(0, Math.floor(Number(retries) || 0));

  for (let attempt = 0; ; attempt++) {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), timeout);
    let outcome;
    try {
      const response = await fetch(url, { signal: controller.signal });
      if (response.status === 200) return { ok: true, status: response.status };
      outcome = { ok: false, status: response.status };
    } catch (error) {
      outcome = { ok: false, error: error?.message ?? String(error) };
    } finally {
      // Always release the abort timer, including when fetch rejects;
      // otherwise it stays armed until `timeout` elapses.
      clearTimeout(timer);
    }

    if (attempt >= lastAttempt) return outcome;
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
}
// Probe every endpoint concurrently so one endpoint's retries do not delay
// the others. Promise.all keeps the results in `endpoints` order, and each
// failure line is timestamped as soon as that endpoint's probe finishes.
const probeReports = await Promise.all(
  endpoints.map(async (ep) => {
    const result = await fetchWithRetry(ep.url, ep.timeout, maxRetries);
    if (result.ok) return null;
    return `- **${ep.env}**: ${ep.url} (Status: ${result.status || result.error}) at ${new Date().toISOString()}`;
  }),
);
const failureDetails = probeReports.filter((line) => line !== null);
const allHealthy = failureDetails.length === 0;

// Look for an outage issue that an earlier run already opened. The API
// returns only 30 items per page by default, so request the maximum page
// size to avoid missing the issue and opening a duplicate.
const { data: issues } = await github.rest.issues.listForRepo({
  owner: context.repo.owner,
  repo: context.repo.repo,
  state: 'open',
  creator: 'github-actions[bot]',
  per_page: 100,
});
const openIssue = issues.find((issue) => issue.title === issueTitle);

// Optional chat webhook; when unset, webhook notifications are skipped.
const webhookUrl = process.env.ALERT_WEBHOOK_URL;
/**
 * Best-effort notification to the chat webhook configured in `webhookUrl`.
 *
 * Does nothing when no webhook URL is configured. Delivery problems (network
 * error, timeout, or a non-2xx response) are logged with console.error and
 * never thrown, so a broken webhook cannot abort the monitor run.
 *
 * @param {string} message - Text to post.
 * @returns {Promise<void>}
 */
async function sendWebhook(message) {
  if (!webhookUrl) return;

  // Discord uses { content } and Slack uses { text }; detect by URL
  const payload = webhookUrl.includes('discord')
    ? { content: message }
    : { text: message };

  try {
    const response = await fetch(webhookUrl, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
      // Bound the request so an unresponsive webhook host cannot hang the run.
      signal: AbortSignal.timeout(10000),
    });
    // fetch resolves on 4xx/5xx, so a rejected delivery must be checked here.
    // The URL is deliberately not logged because it is a secret.
    if (!response.ok) {
      console.error(`Failed to send webhook: HTTP ${response.status}`);
    }
  } catch (e) {
    console.error("Failed to send webhook", e);
  }
}
// Repository coordinates shared by every issue API call below.
const repoRef = { owner: context.repo.owner, repo: context.repo.repo };

// Four cases, keyed on (allHealthy, openIssue):
//   unhealthy + issue open -> outage already reported, just fail the run
//   unhealthy + no issue   -> open the outage issue, alert, fail the run
//   healthy   + issue open -> comment, close the issue, announce recovery
//   healthy   + no issue   -> nothing to do
if (!allHealthy && openIssue) {
  console.log("Issue already exists, skipping duplicate creation.");
  core.setFailed("Backend health check failed (ongoing).");
} else if (!allHealthy) {
  const outageBody = `The automated uptime monitor detected an outage in the backend.\n\n### Failing Endpoints:\n${failureDetails.join('\n')}\n\nPlease check the logs and follow the runbook in \`docs/OPERATIONS.md\`.`;
  await github.rest.issues.create({
    ...repoRef,
    title: issueTitle,
    body: outageBody,
    labels: ['bug', 'critical', 'ops'],
  });
  await sendWebhook(`🚨 **${issueTitle}**\n${outageBody}`);
  core.setFailed("Backend health check failed.");
} else if (openIssue) {
  const resolvedNote = `✅ **RESOLVED**: All backend endpoints are now healthy and reachable.\n\nMonitors reported recovery at ${new Date().toISOString()}.`;
  // Comment first so the issue records the recovery before it is closed.
  await github.rest.issues.createComment({
    ...repoRef,
    issue_number: openIssue.number,
    body: resolvedNote,
  });
  await github.rest.issues.update({
    ...repoRef,
    issue_number: openIssue.number,
    state: 'closed',
  });
  await sendWebhook(resolvedNote);
  console.log("Services recovered. Issue closed.");
} else {
  console.log("All services healthy.");
}