-
Notifications
You must be signed in to change notification settings - Fork 1
279 lines (231 loc) · 9.72 KB
/
monitoring.yml
File metadata and controls
279 lines (231 loc) · 9.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# Production monitoring workflow: runs the deployment, resource and endpoint
# checks defined under `jobs:` on a schedule, on demand, and whenever the
# Kubernetes manifests or this workflow file change on main.
name: Monitoring and Alerts

# NOTE: generic YAML 1.1 parsers read the key `on` as a boolean; GitHub's
# workflow loader treats it as the trigger definition.
on:
  schedule:
    # Run every 5 minutes
    - cron: '*/5 * * * *'
  # Allow manual runs from the Actions tab / API.
  workflow_dispatch:
  push:
    branches:
      - main
    paths:
      - 'k8s/**'
      - '.github/workflows/monitoring.yml'

# Least privilege for GITHUB_TOKEN: the jobs only need to read the repository
# (actions/checkout). Cluster and Slack access use dedicated secrets instead.
permissions:
  contents: read
jobs:
  # Verifies that every expected Deployment exists in the namespace and has all
  # desired replicas ready; sends a Slack alert when any of them does not.
  check-deployment:
    runs-on: ubuntu-latest
    environment: production
    # Step outputs are only visible to dependent jobs when re-exported here;
    # generate-report reads needs.check-deployment.outputs.DEPLOYMENT_STATUS.
    outputs:
      DEPLOYMENT_STATUS: ${{ steps.deployment-check.outputs.DEPLOYMENT_STATUS }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up kubectl
        uses: azure/setup-kubectl@v3
        with:
          version: 'latest'

      - name: Configure kubectl
        # Secrets are handed over as environment variables rather than being
        # interpolated into the script text, so their content is never parsed
        # as shell syntax.
        env:
          KUBECONFIG_B64: ${{ secrets.KUBECONFIG_PRODUCTION }}
          KUBE_CONTEXT: ${{ secrets.KUBERNETES_CONTEXT_PRODUCTION }}
        run: |
          mkdir -p "$HOME/.kube"
          printf '%s' "$KUBECONFIG_B64" | base64 --decode > "$HOME/.kube/config"
          # The file holds cluster credentials: restrict it to the owner.
          chmod 600 "$HOME/.kube/config"
          kubectl config use-context "$KUBE_CONTEXT"

      - name: Check deployment status
        id: deployment-check
        env:
          NAMESPACE: everything-opencode
        run: |
          # Check all deployments
          DEPLOYMENTS=("everything-opencode" "pinescript-debug" "command-runners")
          ALL_HEALTHY=true

          for deployment in "${DEPLOYMENTS[@]}"; do
            echo "Checking $deployment..."

            # Check if deployment exists
            if ! kubectl get deployment "$deployment" --namespace "$NAMESPACE" > /dev/null 2>&1; then
              echo "❌ Deployment $deployment not found"
              ALL_HEALTHY=false
              continue
            fi

            # Check replica status
            DESIRED=$(kubectl get deployment "$deployment" --namespace "$NAMESPACE" -o jsonpath='{.spec.replicas}')
            READY=$(kubectl get deployment "$deployment" --namespace "$NAMESPACE" -o jsonpath='{.status.readyReplicas}')
            # The jsonpath prints nothing when readyReplicas is absent (no
            # replica ready). Default to 0 so a deployment scaled to zero
            # compares equal and the message shows "0/N" instead of "/N".
            READY=${READY:-0}

            if [ "$DESIRED" != "$READY" ]; then
              echo "❌ Deployment $deployment: $READY/$DESIRED replicas ready"
              ALL_HEALTHY=false

              # Diagnostics below are best-effort: a failing kubectl call must
              # not abort the step before DEPLOYMENT_STATUS is written.
              SELECTOR="app=everything-opencode,component=${deployment//-/_}"

              # Get pod status for debugging
              echo "Pod status:"
              kubectl get pods --namespace "$NAMESPACE" -l "$SELECTOR" -o wide || true

              # Get pod logs for failed pods
              FAILED_PODS=$(kubectl get pods --namespace "$NAMESPACE" -l "$SELECTOR" \
                --field-selector=status.phase!=Running \
                -o jsonpath='{.items[*].metadata.name}' || true)
              for pod in $FAILED_PODS; do
                echo "Logs for $pod:"
                kubectl logs "$pod" --namespace "$NAMESPACE" --tail=50 || true
              done
            else
              echo "✅ Deployment $deployment: $READY/$DESIRED replicas ready"
            fi
          done

          if [ "$ALL_HEALTHY" = true ]; then
            echo "DEPLOYMENT_STATUS=healthy" >> "$GITHUB_OUTPUT"
          else
            echo "DEPLOYMENT_STATUS=unhealthy" >> "$GITHUB_OUTPUT"
          fi

      - name: Send alert if unhealthy
        if: steps.deployment-check.outputs.DEPLOYMENT_STATUS == 'unhealthy'
        uses: 8398a7/action-slack@v3
        with:
          channel: '#alerts'
          # NOTE(review): this step only runs while the job is still passing,
          # so job.status is 'success' here; confirm the alert is not styled as
          # a success message and consider a fixed status if it is.
          status: ${{ job.status }}
          text: |
            ❌ Everything OpenCode deployment is unhealthy!
            Check the GitHub Actions run for details:
            ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
          username: 'Deployment Monitor'
          icon_emoji: ':warning:'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
# Job definition (belongs under the top-level `jobs:` mapping).
# Prints pod CPU/memory usage and HPA state, flags pods that use more than a
# threshold percentage of their container limits, and warns on Slack.
check-resources:
  runs-on: ubuntu-latest
  environment: production
  # Step outputs are only visible to dependent jobs when re-exported here;
  # generate-report reads needs.check-resources.outputs.RESOURCE_STATUS.
  outputs:
    RESOURCE_STATUS: ${{ steps.resource-check.outputs.RESOURCE_STATUS }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Set up kubectl
      uses: azure/setup-kubectl@v3
      with:
        version: 'latest'

    - name: Configure kubectl
      # Secrets are handed over as environment variables rather than being
      # interpolated into the script text, so their content is never parsed
      # as shell syntax.
      env:
        KUBECONFIG_B64: ${{ secrets.KUBECONFIG_PRODUCTION }}
        KUBE_CONTEXT: ${{ secrets.KUBERNETES_CONTEXT_PRODUCTION }}
      run: |
        mkdir -p "$HOME/.kube"
        printf '%s' "$KUBECONFIG_B64" | base64 --decode > "$HOME/.kube/config"
        # The file holds cluster credentials: restrict it to the owner.
        chmod 600 "$HOME/.kube/config"
        kubectl config use-context "$KUBE_CONTEXT"

    - name: Check resource usage
      id: resource-check
      env:
        NAMESPACE: everything-opencode
        # A pod is flagged when its usage exceeds this percentage of the
        # summed limits of its containers.
        THRESHOLD_PERCENT: '80'
      run: |
        # Check pod resource usage
        echo "Checking resource usage..."

        # Get top pods by CPU
        echo "Top pods by CPU usage:"
        kubectl top pods --namespace "$NAMESPACE" --sort-by=cpu | head -10

        # Get top pods by memory
        echo "Top pods by memory usage:"
        kubectl top pods --namespace "$NAMESPACE" --sort-by=memory | head -10

        # Check HPA status
        echo "HPA status:"
        kubectl get hpa --namespace "$NAMESPACE"

        # `kubectl top pods --no-headers` prints three columns: pod name, CPU
        # in millicores (e.g. "12m") and memory (e.g. "34Mi"). These are
        # absolute values, so a percentage has to be computed against the
        # limits declared in the pod spec.
        USAGE=$(kubectl top pods --namespace "$NAMESPACE" --no-headers) || USAGE=""
        if [ -z "$USAGE" ]; then
          echo "::warning::No pod metrics available; resource thresholds were not evaluated"
        fi

        # One line per pod: name <TAB> cpu limits <TAB> memory limits, where
        # each list has one comma-terminated entry per container (an empty
        # entry means that container declares no limit).
        LIMITS=$(kubectl get pods --namespace "$NAMESPACE" \
          -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{range .spec.containers[*]}{.resources.limits.cpu}{","}{end}{"\t"}{range .spec.containers[*]}{.resources.limits.memory}{","}{end}{"\n"}{end}') || LIMITS=""

        # Emits "cpu <pod>" / "mem <pod>" for every pod above the threshold.
        # Pods with a container that has no limit are skipped because no
        # percentage can be derived for them.
        OVER_LIMIT=$(awk -v threshold="$THRESHOLD_PERCENT" '
          # CPU quantity ("250m", "1", "0.5") to millicores.
          function cpu_millis(q) {
            if (q ~ /m$/) return substr(q, 1, length(q) - 1) + 0
            return q * 1000
          }
          # Memory quantity ("512Mi", "1G", "129e6", plain bytes) to bytes.
          function mem_bytes(q,    num, unit, power) {
            num = q
            sub(/[A-Za-z]+$/, "", num)
            unit = substr(q, length(num) + 1)
            if (unit == "") return num + 0
            if (unit == "m") return num / 1000
            power = index("KMGTPE", toupper(substr(unit, 1, 1)))
            return num * ((unit ~ /i$/) ? 1024 : 1000) ^ power
          }
          # Sum of a comma-terminated quantity list; 0 when any entry is empty.
          function total(list, kind,    parts, n, i, sum) {
            n = split(list, parts, ",")
            sum = 0
            for (i = 1; i < n; i++) {
              if (parts[i] == "") return 0
              sum += (kind == "cpu") ? cpu_millis(parts[i]) : mem_bytes(parts[i])
            }
            return sum
          }
          pass == "limits" {
            if (split($0, col, "\t") >= 3 && col[1] != "") {
              cpu_limit[col[1]] = total(col[2], "cpu")
              mem_limit[col[1]] = total(col[3], "mem")
            }
            next
          }
          pass == "usage" && NF >= 3 {
            if (cpu_limit[$1] > 0 && cpu_millis($2) * 100 > cpu_limit[$1] * threshold) print "cpu", $1
            if (mem_limit[$1] > 0 && mem_bytes($3) * 100 > mem_limit[$1] * threshold) print "mem", $1
          }
        ' pass=limits <(printf '%s\n' "$LIMITS") pass=usage <(printf '%s\n' "$USAGE"))

        # Check for resource warnings
        WARNINGS=false

        # Check CPU usage > threshold
        HIGH_CPU_PODS=$(printf '%s\n' "$OVER_LIMIT" | awk '$1 == "cpu" {print $2}')
        if [ -n "$HIGH_CPU_PODS" ]; then
          echo "⚠️ High CPU usage detected: $HIGH_CPU_PODS"
          WARNINGS=true
        fi

        # Check memory usage > threshold
        HIGH_MEM_PODS=$(printf '%s\n' "$OVER_LIMIT" | awk '$1 == "mem" {print $2}')
        if [ -n "$HIGH_MEM_PODS" ]; then
          echo "⚠️ High memory usage detected: $HIGH_MEM_PODS"
          WARNINGS=true
        fi

        if [ "$WARNINGS" = true ]; then
          echo "RESOURCE_STATUS=warning" >> "$GITHUB_OUTPUT"
        else
          echo "RESOURCE_STATUS=normal" >> "$GITHUB_OUTPUT"
        fi

    - name: Send resource warning
      if: steps.resource-check.outputs.RESOURCE_STATUS == 'warning'
      uses: 8398a7/action-slack@v3
      with:
        channel: '#alerts'
        # NOTE(review): this step only runs while the job is still passing,
        # so job.status is 'success' here; confirm the warning is not styled
        # as a success message and consider a fixed status if it is.
        status: ${{ job.status }}
        text: |
          ⚠️ Everything OpenCode resource usage is high!
          Check the GitHub Actions run for details:
          ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        username: 'Resource Monitor'
        icon_emoji: ':chart_with_upwards_trend:'
      env:
        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
# Job definition (belongs under the top-level `jobs:` mapping).
# Resolves the public host from the ingress, probes the application's health
# endpoints over HTTPS, and alerts on Slack when any probe fails.
check-application-health:
  runs-on: ubuntu-latest
  environment: production
  # Step outputs are only visible to dependent jobs when re-exported here;
  # generate-report reads
  # needs.check-application-health.outputs.APPLICATION_STATUS.
  outputs:
    APPLICATION_STATUS: ${{ steps.health-check.outputs.APPLICATION_STATUS }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Set up kubectl
      uses: azure/setup-kubectl@v3
      with:
        version: 'latest'

    - name: Configure kubectl
      # Secrets are handed over as environment variables rather than being
      # interpolated into the script text, so their content is never parsed
      # as shell syntax.
      env:
        KUBECONFIG_B64: ${{ secrets.KUBECONFIG_PRODUCTION }}
        KUBE_CONTEXT: ${{ secrets.KUBERNETES_CONTEXT_PRODUCTION }}
      run: |
        mkdir -p "$HOME/.kube"
        printf '%s' "$KUBECONFIG_B64" | base64 --decode > "$HOME/.kube/config"
        # The file holds cluster credentials: restrict it to the owner.
        chmod 600 "$HOME/.kube/config"
        kubectl config use-context "$KUBE_CONTEXT"

    - name: Get ingress URL
      id: ingress
      run: |
        # Host of the first rule of the `everything-opencode` ingress.
        INGRESS_HOST=$(kubectl get ingress everything-opencode --namespace everything-opencode -o jsonpath='{.spec.rules[0].host}')
        echo "INGRESS_HOST=$INGRESS_HOST" >> "$GITHUB_OUTPUT"

    - name: Check application endpoints
      id: health-check
      env:
        # Passed via the environment so the value coming from the cluster is
        # never spliced into the script text.
        HOST: ${{ steps.ingress.outputs.INGRESS_HOST }}
      run: |
        ALL_HEALTHY=true

        # Define endpoints to check
        ENDPOINTS=(
          "/health"
          "/api/status"
          "/debug/health"
          "/api/commands/health"
          "/api/python/health"
        )

        if [ -z "$HOST" ]; then
          # Without a host every URL would be malformed; report it once.
          echo "❌ Ingress host is empty; cannot check endpoints"
          ALL_HEALTHY=false
        else
          for endpoint in "${ENDPOINTS[@]}"; do
            URL="https://$HOST$endpoint"
            echo "Checking $URL..."

            # Timeouts keep a hanging endpoint from stalling the job. curl
            # prints 000 when no HTTP response was received; `|| true` keeps
            # the step alive so the failure is recorded instead of aborting.
            STATUS_CODE=$(curl --silent --output /dev/null --write-out '%{http_code}' \
              --connect-timeout 5 --max-time 15 "$URL" || true)

            case "$STATUS_CODE" in
              200|201|204)
                echo "✅ $endpoint is healthy"
                ;;
              *)
                echo "❌ $endpoint is unhealthy"
                echo "HTTP status: ${STATUS_CODE:-none}"
                ALL_HEALTHY=false
                # Try to get error details
                curl -v --connect-timeout 5 --max-time 15 "$URL" 2>&1 | tail -20 || true
                ;;
            esac
          done
        fi

        if [ "$ALL_HEALTHY" = true ]; then
          echo "APPLICATION_STATUS=healthy" >> "$GITHUB_OUTPUT"
        else
          echo "APPLICATION_STATUS=unhealthy" >> "$GITHUB_OUTPUT"
        fi

    - name: Send application alert
      if: steps.health-check.outputs.APPLICATION_STATUS == 'unhealthy'
      uses: 8398a7/action-slack@v3
      with:
        channel: '#alerts'
        # NOTE(review): this step only runs while the job is still passing,
        # so job.status is 'success' here; confirm the alert is not styled as
        # a success message and consider a fixed status if it is.
        status: ${{ job.status }}
        text: |
          ❌ Everything OpenCode application endpoints are unhealthy!
          Check the GitHub Actions run for details:
          ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        username: 'Application Monitor'
        icon_emoji: ':hospital:'
      env:
        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
# Job definition (belongs under the top-level `jobs:` mapping).
# Collects the status outputs of the three check jobs into a Markdown report
# and uploads it as a build artifact.
generate-report:
  runs-on: ubuntu-latest
  needs:
    - check-deployment
    - check-resources
    - check-application-health
  # A job with `needs` is skipped by default when a dependency fails, which is
  # exactly when the report matters; run unless the workflow was cancelled.
  if: ${{ !cancelled() }}
  environment: production
  steps:
    - name: Generate monitoring report
      # Values are resolved here and read as environment variables so that no
      # expression result is spliced into the script text. A check job that
      # failed or published no output is reported as "unknown".
      env:
        DEPLOYMENT_STATUS: ${{ needs.check-deployment.outputs.DEPLOYMENT_STATUS || 'unknown' }}
        RESOURCE_STATUS: ${{ needs.check-resources.outputs.RESOURCE_STATUS || 'unknown' }}
        APPLICATION_STATUS: ${{ needs.check-application-health.outputs.APPLICATION_STATUS || 'unknown' }}
        RUN_ID: ${{ github.run_id }}
        RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
      run: |
        # Write the whole report with a single redirect.
        {
          echo "# Everything OpenCode Monitoring Report"
          echo "Generated: $(date)"
          echo ""
          echo "## Summary"
          echo "- Deployment Status: $DEPLOYMENT_STATUS"
          echo "- Resource Status: $RESOURCE_STATUS"
          echo "- Application Status: $APPLICATION_STATUS"
          echo ""
          echo "## Details"
          echo "Run ID: $RUN_ID"
          echo "Run URL: $RUN_URL"
        } > monitoring-report.md

    # Upload report as artifact
    - name: Upload monitoring report
      uses: actions/upload-artifact@v4
      with:
        name: monitoring-report
        path: monitoring-report.md
        retention-days: 7