Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .github/pull_request_template.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
## Goal
<!-- What does this PR accomplish? 1 sentence. -->

## Changes
-

## Testing
<!-- How did you verify it? -->

## Checklist
- [ ] Title is a clear sentence (≤ 70 chars)
- [ ] Commits are signed (`git log --show-signature`)
- [ ] `submissions/labN.md` updated
31 changes: 31 additions & 0 deletions app/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# syntax=docker/dockerfile:1

# ---------- builder stage ----------
# Compiles the two static binaries (the service and its health probe).
FROM golang:1.24-alpine AS builder

WORKDIR /src

# Resolve modules before copying the sources so this layer is reused while
# go.mod is unchanged.
COPY go.mod ./
RUN go mod download

COPY . .

# Cross-compile settings shared by both builds below; CGO is disabled so the
# binaries are statically linked.
ENV CGO_ENABLED=0 \
    GOOS=linux \
    GOARCH=amd64

RUN go build -trimpath -ldflags='-s -w' -o /quicknotes . \
 && go build -trimpath -ldflags='-s -w' -o /healthcheck ./cmd/healthcheck

# ---------- runtime stage ----------
# Only the compiled binaries, the seed file, and the data directory are
# carried over from the build.
FROM gcr.io/distroless/static:nonroot

WORKDIR /

# Both binaries land directly in / as /quicknotes and /healthcheck.
COPY --from=builder /quicknotes /healthcheck /
COPY seed.json /seed.json
# /data is handed to the nonroot user that the container runs as.
COPY --chown=nonroot:nonroot data/ /data/

EXPOSE 8080

USER nonroot:nonroot
ENTRYPOINT ["/quicknotes"]
21 changes: 21 additions & 0 deletions app/cmd/healthcheck/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package main

import (
"net/http"
"os"
"time"
)

// main probes the local health endpoint and reports the outcome through the
// process exit code: 0 when GET http://127.0.0.1:8080/health answers 200 OK,
// 1 on any request error (including the 2-second client timeout) or any
// other status code.
func main() {
	client := http.Client{Timeout: 2 * time.Second}

	resp, err := client.Get("http://127.0.0.1:8080/health")
	if err != nil {
		os.Exit(1)
	}

	// Close the body explicitly instead of deferring it: os.Exit terminates
	// the process immediately and does not run deferred calls, so a deferred
	// Close would be skipped on the failure path below.
	status := resp.StatusCode
	resp.Body.Close()

	if status != http.StatusOK {
		os.Exit(1)
	}
}
57 changes: 57 additions & 0 deletions compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Local stack: the QuickNotes service plus Prometheus and Grafana.
services:
  quicknotes:
    build:
      context: ./app
    image: quicknotes:lab8
    ports:
      - "8080:8080"
    environment:
      ADDR: ":8080"
      DATA_PATH: "/data/notes.json"
      SEED_PATH: "/seed.json"
    volumes:
      - quicknotes-data:/data
    restart: unless-stopped
    # Hardening: read-only root filesystem (writes go to the /data volume and
    # the /tmp tmpfs), no Linux capabilities, no privilege escalation.
    read_only: true
    tmpfs:
      - /tmp
    cap_drop:
      - ALL
    security_opt:
      - no-new-privileges:true
    healthcheck:
      # Runs the /healthcheck binary inside the container.
      test: ["CMD", "/healthcheck"]
      interval: 10s
      timeout: 3s
      retries: 3
      start_period: 5s

  prometheus:
    image: prom/prometheus:v3.11.3
    ports:
      - "9090:9090"
    volumes:
      - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./monitoring/prometheus/rules:/etc/prometheus/rules:ro
    depends_on:
      # Start only once the quicknotes healthcheck reports healthy.
      quicknotes:
        condition: service_healthy
    restart: unless-stopped

  grafana:
    image: grafana/grafana:13.0
    ports:
      - "3000:3000"
    environment:
      # Admin credentials are read from the shell environment or a .env file
      # (GRAFANA_ADMIN_USER / GRAFANA_ADMIN_PASSWORD). The fallbacks after
      # ":-" keep the lab defaults working when nothing is set; override them
      # anywhere the stack is reachable by others.
      GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-lab8admin}"
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-lab8-change-me-2026}"
      GF_USERS_ALLOW_SIGN_UP: "false"
    volumes:
      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
    depends_on:
      - prometheus
    restart: unless-stopped

volumes:
  quicknotes-data:
24 changes: 24 additions & 0 deletions docs/runbook/high-error-rate.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Runbook — QuickNotes High Error Rate

## What this alert means

This alert fires when more than 5% of QuickNotes HTTP requests have returned 4xx or 5xx responses for at least 5 minutes.

## Triage steps

1. Open the Prometheus alerts page at `http://localhost:9090/alerts` and confirm that `QuickNotesHighErrorRate` is firing.
2. Open the Grafana dashboard at `http://localhost:3000` and check the error ratio, traffic rate, and notes stored panels.
3. Check whether the errors are mostly caused by bad client requests or server failures by querying this metric in Prometheus: `sum by (code) (rate(quicknotes_http_responses_by_code_total[5m]))`.
4. Check whether QuickNotes is still healthy by running `curl http://localhost:8080/health` and `docker compose ps`.
5. Check recent QuickNotes logs by running `docker compose logs --tail=100 quicknotes`.

## Mitigations

1. If the errors are caused by a bad deploy or bad configuration, roll back to the last known working Compose configuration or image.
2. If the errors are caused by malformed traffic, reduce or stop that traffic source and verify that healthy requests still work.
3. If QuickNotes is unhealthy or stuck, restart only the QuickNotes service with `docker compose restart quicknotes`.
4. If the data file or volume looks corrupted, stop writes temporarily, back up the current volume, and restore from a known good seed or backup.

## Post-incident

After the service is stable, write a blameless postmortem using the Lecture 1 postmortem format. Include the timeline, impact, root cause, what worked, what did not work, and concrete action items to prevent the same incident from happening again.
203 changes: 203 additions & 0 deletions monitoring/grafana/dashboards/golden-signals.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "reqps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(quicknotes_http_requests_total[1m]))",
"legendFormat": "request-rate proxy",
"refId": "A"
}
],
"title": "Latency — request-rate proxy because duration histogram is unavailable",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "reqps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(quicknotes_http_requests_total[1m]))",
"legendFormat": "requests/sec",
"refId": "A"
}
],
"title": "Traffic — request rate",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 3,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "100 * sum(rate(quicknotes_http_responses_by_code_total{code=~\"4..|5..\"}[5m])) / clamp_min(sum(rate(quicknotes_http_requests_total[5m])), 0.001)",
"legendFormat": "error ratio %",
"refId": "A"
}
],
"title": "Errors — 4xx + 5xx ratio",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"id": 4,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "quicknotes_notes_total",
"legendFormat": "notes stored",
"refId": "A"
}
],
"title": "Saturation — notes stored",
"type": "timeseries"
}
],
"refresh": "10s",
"schemaVersion": 41,
"tags": [
"quicknotes",
"sre",
"golden-signals"
],
"templating": {
"list": []
},
"time": {
"from": "now-15m",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "QuickNotes Golden Signals",
"uid": "quicknotes-golden-signals",
"version": 2
}
12 changes: 12 additions & 0 deletions monitoring/grafana/provisioning/dashboards/dashboard.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Grafana dashboard provisioning: one file-based provider that loads
# dashboards from a directory on disk.
apiVersion: 1

providers:
  - name: 'QuickNotes Golden Signals'
    orgId: 1
    folder: 'QuickNotes'
    type: file
    disableDeletion: false
    editable: true
    # Interval, in seconds, at which the dashboard directory is re-read.
    updateIntervalSeconds: 10
    options:
      path: '/var/lib/grafana/dashboards'
10 changes: 10 additions & 0 deletions monitoring/grafana/provisioning/datasources/datasource.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Grafana datasource provisioning: a single default Prometheus datasource.
apiVersion: 1

datasources:
  - name: 'Prometheus'
    # Fixed uid so dashboards can reference this datasource by a stable id.
    uid: 'prometheus'
    type: 'prometheus'
    access: 'proxy'
    url: 'http://prometheus:9090'
    isDefault: true
    editable: false
13 changes: 13 additions & 0 deletions monitoring/prometheus/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Prometheus configuration: scrape and evaluate rules every 15 seconds.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Every .yml file in the rules directory is loaded as a rule file.
rule_files:
  - '/etc/prometheus/rules/*.yml'

scrape_configs:
  # Single scrape job: /metrics on quicknotes:8080.
  - job_name: 'quicknotes'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['quicknotes:8080']
Loading