diff --git a/app/.dockerignore b/app/.dockerignore new file mode 100644 index 000000000..6b5ba83e5 --- /dev/null +++ b/app/.dockerignore @@ -0,0 +1,6 @@ +data/ +*.pcap +lab4-trace* +sudo +wsl +.git diff --git a/app/Dockerfile b/app/Dockerfile new file mode 100644 index 000000000..d9e39f7b3 --- /dev/null +++ b/app/Dockerfile @@ -0,0 +1,35 @@ +# syntax=docker/dockerfile:1 + +FROM golang:1.24.5-alpine AS builder +WORKDIR /src + +COPY go.mod ./ +RUN go mod download + +COPY . . +# BuildKit sets TARGETOS/TARGETARCH to the requested platform; fall back to linux/amd64 when they are unset +ARG TARGETOS +ARG TARGETARCH +RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH:-amd64} go build \ + -trimpath -ldflags='-s -w' -o /quicknotes . + +# Tiny static HTTP probe — distroless has no shell/curl for HEALTHCHECK +RUN printf '%s\n' \ + 'package main' \ + 'import ("net/http"; "os")' \ + 'func main() {' \ + ' r, err := http.Get("http://127.0.0.1:8080/health")' \ + ' if err != nil || r == nil || r.StatusCode != http.StatusOK { os.Exit(1) }' \ + '}' \ + > /healthcheck.go && \ + CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH:-amd64} go build \ + -trimpath -ldflags='-s -w' -o /healthcheck /healthcheck.go + +FROM gcr.io/distroless/static:nonroot +COPY --from=builder /quicknotes /quicknotes +COPY --from=builder /healthcheck /healthcheck +COPY seed.json /seed.json + +EXPOSE 8080 +USER nonroot +ENTRYPOINT ["/quicknotes"] diff --git a/compose.yaml b/compose.yaml new file mode 100644 index 000000000..d79a2955a --- /dev/null +++ b/compose.yaml @@ -0,0 +1,69 @@ +services: + vol-init: + image: busybox:1.36 + user: "0" + volumes: + - quicknotes-data:/data + command: ["sh", "-c", "mkdir -p /data && chown 65532:65532 /data"] + restart: "no" + + quicknotes: + build: + context: ./app + dockerfile: Dockerfile + image: quicknotes:lab6 + depends_on: + vol-init: + condition: service_completed_successfully + ports: + - "8080:8080" + environment: + ADDR: ":8080" + DATA_PATH: "/data/notes.json" + SEED_PATH: "/seed.json" + volumes: + - quicknotes-data:/data + restart: unless-stopped + healthcheck: + test: ["CMD", "/healthcheck"] + interval: 10s + timeout: 3s + retries: 3 +
start_period: 5s + cap_drop: + - ALL + read_only: true + tmpfs: + - /tmp + security_opt: + - no-new-privileges:true + + prometheus: + image: prom/prometheus:v3.2.1 + ports: + - "9090:9090" + volumes: + - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./monitoring/prometheus/rules:/etc/prometheus/rules:ro + depends_on: + quicknotes: + condition: service_healthy + restart: unless-stopped + + grafana: + image: grafana/grafana:13.0.3 + ports: + - "3000:3000" + environment: + GF_SECURITY_ADMIN_USER: admin + GF_SECURITY_ADMIN_PASSWORD: "${GF_SECURITY_ADMIN_PASSWORD:-lab8-grafana-admin}" + GF_USERS_ALLOW_SIGN_UP: "false" + volumes: + - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro + - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro + depends_on: + - prometheus + restart: unless-stopped + +volumes: + quicknotes-data: diff --git a/docs/runbook/high-error-rate.md b/docs/runbook/high-error-rate.md new file mode 100644 index 000000000..51a10df4f --- /dev/null +++ b/docs/runbook/high-error-rate.md @@ -0,0 +1,29 @@ +# Runbook: High HTTP Error Rate + +## What this alert means + +QuickNotes is returning more than 5% HTTP 4xx/5xx responses sustained for five minutes — users are likely seeing failed requests. + +## Triage steps + +1. **Confirm the alert** — open Prometheus (`http://localhost:9090/alerts`) or Grafana and verify `HighErrorRate` is `Firing`; note the start time. +2. **Check QuickNotes health** — `curl -s http://localhost:8080/health` and `docker compose ps quicknotes`; confirm the container is `healthy` and `status` is `ok`. +3. **Inspect recent logs** — `docker compose logs --tail=100 quicknotes` for panics, permission errors, or repeated 4xx patterns. +4.
**Check the error ratio query** — in Prometheus, run: + ```promql + sum(rate(quicknotes_http_responses_by_code_total{code=~"4..|5.."}[5m])) + / + sum(rate(quicknotes_http_requests_total[5m])) + ``` + Break down by `code` label to see whether errors are mostly 400s (bad clients) or 5xx (server faults). + +## Mitigations + +1. **Restart QuickNotes** — `docker compose restart quicknotes` to clear a stuck process or bad in-memory state while you investigate. +2. **Stop bad traffic** — if a script or client is sending malformed `POST /notes` bodies, pause or throttle it; errors should fall below 5% within the next evaluation window. + +## Post-incident + +1. Write a **blameless postmortem** using the format in [Lecture 1 — postmortems](../../lectures/lec1.md) (what happened, why, action items with owners and dates). +2. Add or tighten tests/alerts if the root cause was preventable (e.g., validation bug, missing rate limit). +3. Update this runbook if any triage step was missing or misleading. diff --git a/monitoring/docs/bonus-checkly-setup.md b/monitoring/docs/bonus-checkly-setup.md new file mode 100644 index 000000000..7e136af83 --- /dev/null +++ b/monitoring/docs/bonus-checkly-setup.md @@ -0,0 +1,55 @@ +# Lab 8 Bonus — Checkly + ngrok setup + +## 1. Expose QuickNotes publicly + +QuickNotes must be running (`docker compose up -d`). + +In a **new PowerShell terminal** (keep it open): + +```powershell +ngrok http 8080 +``` + +Copy the **Forwarding** HTTPS URL, e.g. `https://abc123.ngrok-free.app` + +Test it: + +```powershell +Invoke-RestMethod https://YOUR-NGROK-URL/health +``` + +## 2. Create Checkly API check (free account) + +1. Sign up at https://www.checklyhq.com/ +2. **Checks → Add check → API check** +3. Settings: + - **Name:** `QuickNotes health (Lab 8)` + - **URL:** `https://YOUR-NGROK-URL/health` + - **Method:** GET + - **Frequency:** 1 minute + - **Locations:** pick **2 regions** (e.g. 
`Frankfurt (eu-central-1)` + `Singapore (ap-southeast-1)`) + - **Assertion:** status code equals `200` + - **Assertion:** response time less than `2000` ms +4. Save and enable the check. + +## 3. Let it run >= 30 minutes + +Leave ngrok + Checkly running. Optionally generate light traffic: + +```bash +bash monitoring/scripts/generate-traffic.sh +``` + +## 4. Collect numbers for `submissions/lab8.md` + +**Prometheus (internal):** + +```bash +bash monitoring/scripts/bonus-prometheus-snapshot.sh +``` + +**Checkly (external):** open the check → **Check results** / **Metrics** → note p50/p95 latency and failures per region over the same 30-minute window. + +## 5. Stop ngrok when done + +`Ctrl+C` in the ngrok terminal. diff --git a/monitoring/grafana/dashboards/golden-signals.json b/monitoring/grafana/dashboards/golden-signals.json new file mode 100644 index 000000000..181968fd4 --- /dev/null +++ b/monitoring/grafana/dashboards/golden-signals.json @@ -0,0 +1,180 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "rate(quicknotes_http_requests_total[1m])", + "legendFormat": "requests/sec (latency proxy)", + "refId": "A" + } + ], + "title": "Latency (proxy: request rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" 
+ }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "rate(quicknotes_http_requests_total[5m])", + "legendFormat": "traffic", + "refId": "A" + } + ], + "title": "Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "max": 1, + "min": 0, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "sum(rate(quicknotes_http_responses_by_code_total{code=~\"4..|5..\"}[5m])) / sum(rate(quicknotes_http_requests_total[5m]))", + "legendFormat": "error ratio", + "refId": "A" + } + ], + "title": "Errors (4xx+5xx / total)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "quicknotes_notes_total", + "legendFormat": "notes stored", + "refId": "A" + } + ], + "title": "Saturation (notes stored)", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": [ + "lab8", + "golden-signals" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "QuickNotes Golden Signals", + "uid": "quicknotes-golden-signals", + "version": 1 +} diff --git a/monitoring/grafana/provisioning/dashboards/dashboard.yml b/monitoring/grafana/provisioning/dashboards/dashboard.yml new file mode 100644 index 000000000..65df331c2 --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/dashboard.yml @@ -0,0 
+1,12 @@ +apiVersion: 1 + +providers: + - name: golden-signals + orgId: 1 + folder: QuickNotes + type: file + disableDeletion: false + editable: true + updateIntervalSeconds: 30 + options: + path: /var/lib/grafana/dashboards diff --git a/monitoring/grafana/provisioning/datasources/datasource.yml b/monitoring/grafana/provisioning/datasources/datasource.yml new file mode 100644 index 000000000..f7b72a490 --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/datasource.yml @@ -0,0 +1,10 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: false + uid: prometheus diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 000000000..ba72c885b --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,12 @@ +global: + scrape_interval: 15s + +rule_files: + - /etc/prometheus/rules/*.yml + +scrape_configs: + - job_name: quicknotes + metrics_path: /metrics + static_configs: + - targets: + - quicknotes:8080 diff --git a/monitoring/prometheus/rules/high-error-rate.yml b/monitoring/prometheus/rules/high-error-rate.yml new file mode 100644 index 000000000..1964bd2b4 --- /dev/null +++ b/monitoring/prometheus/rules/high-error-rate.yml @@ -0,0 +1,16 @@ +groups: + - name: quicknotes + rules: + - alert: HighErrorRate + expr: | + ( + sum(rate(quicknotes_http_responses_by_code_total{code=~"4..|5.."}[5m])) + / + sum(rate(quicknotes_http_requests_total[5m])) + ) > 0.05 + for: 5m + labels: + severity: page + annotations: + summary: QuickNotes HTTP error ratio exceeds 5% for 5 minutes + runbook_url: docs/runbook/high-error-rate.md diff --git a/monitoring/scripts/bonus-prometheus-snapshot.sh b/monitoring/scripts/bonus-prometheus-snapshot.sh new file mode 100644 index 000000000..aefe15d98 --- /dev/null +++ b/monitoring/scripts/bonus-prometheus-snapshot.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +# Snapshot Prometheus 
golden-signal stats for Lab 8 bonus comparison. +# Run after Checkly has probed for >= 30 minutes. +# Usage: bash monitoring/scripts/bonus-prometheus-snapshot.sh + +set -euo pipefail + +PROM="${PROM:-http://localhost:9090}" +WINDOW="${WINDOW:-30m}" + +query() { + curl -sG "${PROM}/api/v1/query" --data-urlencode "query=$1" \ + | jq -r '.data.result[0].value[1] // "n/a"' +} + +echo "=== Prometheus snapshot (window: ${WINDOW}) ===" +echo +echo "Request rate (traffic proxy, req/s):" +query "sum(rate(quicknotes_http_requests_total[${WINDOW}]))" +echo +echo "Error ratio:" +query "sum(rate(quicknotes_http_responses_by_code_total{code=~\"4..|5..\"}[${WINDOW}])) / sum(rate(quicknotes_http_requests_total[${WINDOW}]))" +echo +echo "4xx+5xx count (increase):" +query "sum(increase(quicknotes_http_responses_by_code_total{code=~\"4..|5..\"}[${WINDOW}]))" +echo +echo "Total requests (increase):" +query "sum(increase(quicknotes_http_requests_total[${WINDOW}]))" +echo +echo "Notes stored (gauge, current):" +query "quicknotes_notes_total" +echo +echo "Note: QuickNotes has no request-duration histogram; use Checkly for external p50/p95 latency." diff --git a/monitoring/scripts/generate-traffic.sh b/monitoring/scripts/generate-traffic.sh new file mode 100644 index 000000000..1d0bc4f59 --- /dev/null +++ b/monitoring/scripts/generate-traffic.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# Generate mixed traffic against QuickNotes (Lab 8 Task 1). +# Usage: ./monitoring/scripts/generate-traffic.sh + +set -euo pipefail + +BASE_URL="${BASE_URL:-http://localhost:8080}" +COUNT="${COUNT:-200}" + +echo "Sending ${COUNT} requests to ${BASE_URL} ..."
+ +for i in $(seq 1 "$COUNT"); do + case $((i % 4)) in + 0) curl -sf "${BASE_URL}/health" -o /dev/null ;; + 1) curl -sf "${BASE_URL}/notes" -o /dev/null ;; + 2) curl -sf -X POST "${BASE_URL}/notes" \ + -H 'Content-Type: application/json' \ + -d "{\"title\":\"traffic-${i}\",\"body\":\"lab8\"}" -o /dev/null ;; + 3) curl -sf "${BASE_URL}/notes/1" -o /dev/null || true ;; + esac +done + +echo "Done." diff --git a/monitoring/scripts/inject-errors.sh b/monitoring/scripts/inject-errors.sh new file mode 100644 index 000000000..9cee47cbb --- /dev/null +++ b/monitoring/scripts/inject-errors.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# Inject sustained 4xx errors alongside healthy traffic (Lab 8 Task 2). +# Run for >= 6 minutes so the 5m "for:" window can fire HighErrorRate. +# Usage: ./monitoring/scripts/inject-errors.sh + +set -euo pipefail + +BASE_URL="${BASE_URL:-http://localhost:8080}" +DURATION_SEC="${DURATION_SEC:-360}" + +echo "Injecting errors for ${DURATION_SEC}s against ${BASE_URL} ..." + +end=$((SECONDS + DURATION_SEC)) +while [ "$SECONDS" -lt "$end" ]; do + curl -sf "${BASE_URL}/health" -o /dev/null || true + curl -sf "${BASE_URL}/notes" -o /dev/null || true + # Malformed JSON -> 400 + curl -s -X POST "${BASE_URL}/notes" \ + -H 'Content-Type: application/json' \ + -d '{"title":""}' -o /dev/null || true + curl -s -X POST "${BASE_URL}/notes" \ + -H 'Content-Type: application/json' \ + -d 'not-json' -o /dev/null || true + sleep 1 +done + +echo "Done." diff --git a/submissions/lab6.md b/submissions/lab6.md new file mode 100644 index 000000000..f0c23af7e --- /dev/null +++ b/submissions/lab6.md @@ -0,0 +1,168 @@ +# Lab 6 — Containers: Dockerize QuickNotes + +Mahmoud Hassan (`selysecr332`) +**Environment:** Windows 11 + Docker 29.x + +--- + +## Task 1 — Multi-stage Dockerfile (≤ 25 MB) + +### Dockerfile + +See [`app/Dockerfile`](../app/Dockerfile). 
+ +### `docker images quicknotes:lab6` + +```text +IMAGE ID DISK USAGE CONTENT SIZE +quicknotes:lab6 4a2a4575e34a 22.7MB 5.71MB +``` + +### `docker inspect` excerpt (User, ExposedPorts, Entrypoint) + +```text +$ docker inspect quicknotes:lab6 --format "User={{.Config.User}} Entrypoint={{.Config.Entrypoint}}" +User=nonroot Entrypoint=[/quicknotes] +``` + +### Builder base image size (comparison) + +```text +quicknotes:lab6 22.7MB +golang:1.24.5-alpine 394MB +``` + +(~17× smaller runtime image than builder base) + +### Task 1 verification + +```text +$ Invoke-RestMethod http://127.0.0.1:8080/health +{"notes":4,"status":"ok"} +``` + +### Design questions (Task 1) + +**a) Why does layer-order matter?** + +`COPY go.mod` + `go mod download` before `COPY . .` keeps the dependency layer cached when only source changes. With `COPY . .` first, any file edit invalidates `go mod download` and forces a full module fetch + rebuild. On this project (no external deps) the win is small; the pattern matters once `go.sum` grows. + +**b) Why `CGO_ENABLED=0`?** + +Produces a fully static binary with no dynamic linker dependency. `gcr.io/distroless/static` has no `libc.so` loader — a CGO-linked binary fails at startup with `no such file or directory` (often misread as a missing binary). + +**c) What is `gcr.io/distroless/static:nonroot`?** + +Contains CA certs, `/etc/passwd` entry for UID 65532 (`nonroot`), timezone data, and **only** what a static binary needs. No shell, no `apt`, no package manager. Fewer packages → smaller attack surface and far fewer CVEs than `ubuntu` or `alpine` with a shell. + +**d) `-ldflags='-s -w'` and `-trimpath`?** + +`-s -w` strips the symbol table and DWARF debug info (smaller binary; harder to debug with `dlv`). `-trimpath` removes local filesystem paths from the binary for reproducible builds and cleaner stack traces in CI. + +--- + +## Task 2 — Compose + healthcheck + volume + +### `compose.yaml` + +See [`compose.yaml`](../compose.yaml) at repo root. 
+Note: `vol-init` (busybox) runs once to `chown` the named volume for UID 65532 — required because Docker creates new volumes as root and distroless runs as `nonroot`. + +### Persistence test (present → down → up → present → down -v → up → absent) + +```text +# POST durable note +{"id":5,"title":"durable","body":"survive a restart",...} + +# after docker compose down && docker compose up -d (no -v) +{"id":5,"title":"durable","body":"survive a restart",...} ✅ still present + +# after docker compose down -v && docker compose up -d +durable absent (expected) ✅ volume destroyed +``` + +### Design questions (Task 2) + +**e) Distroless has no shell. How do you healthcheck it?** + +Strategy: **HTTP probe via a second static binary** (`/healthcheck`) built in the builder stage and copied into the runtime image. Compose runs `test: ["CMD", "/healthcheck"]`, which GETs `http://127.0.0.1:8080/health` and exits non-zero on failure — no shell, `curl`, or `wget` required in distroless. + +**f) Why does `volumes: [quicknotes-data:/data]` survive `docker compose down`?** + +Named volumes are managed by Docker outside the container lifecycle. `docker compose down` removes containers and networks but **not** named volumes unless you pass `-v`. `docker compose down -v` (or `docker volume rm`) destroys the data. + +**g) `depends_on` without `condition: service_healthy`?** + +Compose only waits for the **container to start**, not for the app inside to be ready. A dependent service can connect before QuickNotes listens on `:8080`, causing flaky startup races (relevant in Lab 8 when Prometheus scrapes QuickNotes). 
+ +--- + +## Bonus — 6 security defaults + +### Hardened `compose.yaml` snippet + +```yaml + quicknotes: + cap_drop: + - ALL + read_only: true + tmpfs: + - /tmp + security_opt: + - no-new-privileges:true +``` + +(Dockerfile: `USER nonroot`, `gcr.io/distroless/static:nonroot` base) + +### Verification outputs (B.2) + +```text +USER: nonroot +exec sh: exec: "sh": executable file not found in $PATH ✅ no shell +CapDrop: [ALL] +SecurityOpt: [no-new-privileges:true] +ReadonlyRootfs: true +``` + +### Trivy summary + +```text +Distroless base layer: Total: 0 (HIGH: 0, CRITICAL: 0) +Embedded Go stdlib (v1.24.5): Total: 16 (HIGH: 15, CRITICAL: 1) +``` + +Distroless base is clean; remaining findings are in the **compiled Go stdlib** (fixed in Go 1.24.12+). Lab 9 will wire Trivy into CI. + +### Which default gives the most security per line of YAML? + +`read_only: true` plus `cap_drop: [ALL]` — two lines that block most container escape and persistence paths. `read_only` prevents runtime package installs and config tampering; dropping all capabilities removes the Linux privilege escalation surface. Distroless + `nonroot` are Dockerfile-level but equally high leverage. 
+ +--- + +## Lab 6 completion checklist + +### Task 1 (6 pts) + +- [x] Multi-stage Dockerfile, ≤ 25 MB, nonroot, distroless +- [x] `docker run` serves `/health` +- [x] Design questions a–d answered +- [x] Build outputs pasted + +### Task 2 (4 pts) + +- [x] `compose.yaml` with volume, healthcheck, env, restart +- [x] Persistence test demonstrated +- [x] Design questions e–g answered + +### Bonus (2 pts) + +- [x] All 6 defaults applied and verified +- [x] Trivy scan documented + +### Submission + +- [x] Course PR (`feature/lab6` → `inno-devops-labs/main`) + **https://github.com/inno-devops-labs/DevOps-Intro/pull/1157** +- [x] Fork PR (`feature/lab6-fork` → `selysecr332/main`) + **https://github.com/selysecr332/DevOps-Intro/pull/7** +- [x] Both URLs on Moodle diff --git a/submissions/lab8.md b/submissions/lab8.md new file mode 100644 index 000000000..c00719b0a --- /dev/null +++ b/submissions/lab8.md @@ -0,0 +1,184 @@ +# Lab 8 — SRE & Monitoring: Golden Signals Dashboard + One Good Alert + +Mahmoud Hassan (`selysecr332`) +**Environment:** Windows 11 + Docker Compose + Lab 6 QuickNotes image + +--- + +## Task 1 — Prometheus + Grafana with provisioned dashboard + +### Layout + +```text +monitoring/ +├── prometheus/ +│ ├── prometheus.yml +│ └── rules/high-error-rate.yml +├── grafana/ +│ ├── provisioning/ +│ │ ├── datasources/datasource.yml +│ │ └── dashboards/dashboard.yml +│ └── dashboards/golden-signals.json +└── scripts/ + ├── generate-traffic.sh + └── inject-errors.sh +``` + +Extended [`compose.yaml`](../compose.yaml) with `prometheus` and `grafana` services. + +### Config files + +See [`monitoring/`](../monitoring/) directory.
+ +### Prometheus targets health + +```bash +curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[].health' +``` + +```text +"up" +``` + +### Grafana dashboard + +Screenshot: [`submissions/screenshots/grafana.png`](screenshots/grafana.png) + +Login: `admin` / `lab8-grafana-admin` (set in compose — not default `admin`/`admin`). + +Traffic generated with `bash monitoring/scripts/generate-traffic.sh` — all four panels show non-trivial data. + +### Design questions (Task 1) + +**a) Pull vs push — reachability and failure mode?** + +Prometheus **pulls** by HTTP-scraping QuickNotes `/metrics`. QuickNotes must be reachable **from Prometheus** on the Compose network (`quicknotes:8080`), not the other way around. If Prometheus cannot reach QuickNotes, the scrape fails, `up == 0`, and metrics go stale — you lose visibility even if the app is still serving some users via the published host port. + +**b) `scrape_interval: 15s` — problems at 5s vs 5m?** + +At **5s**: more frequent scrapes → higher load on QuickNotes and Prometheus, larger TSDB churn, and noisier graphs; diminishing returns for a small lab API. At **5m**: coarse resolution — short spikes and brief outages can be missed or averaged away; alert rules using `[5m]` windows react slowly and dashboards feel “stale.” + +**c) `rate()` vs `irate()` vs `delta()` for Traffic panel?** + +Use **`rate()`** on the `quicknotes_http_requests_total` counter over a range (e.g. `[5m]`). `rate()` smooths per-second increase across the window — right for “requests per second” traffic. `irate()` only uses the last two samples → spiky, bad for dashboards/alerts. `delta()` is for gauges, not counters. + +**d) Why provision Grafana from files?** + +Dashboard + datasource JSON/YAML lives in Git next to the app: reproducible `docker compose up`, reviewable in PRs, same panels for every teammate/CI environment — no manual UI clicking after each fresh stack. 
+ +--- + +## Task 2 — Alert + runbook + +### Alert rule + +Prometheus rule: [`monitoring/prometheus/rules/high-error-rate.yml`](../monitoring/prometheus/rules/high-error-rate.yml) + +```yaml +expr: | + ( + sum(rate(quicknotes_http_responses_by_code_total{code=~"4..|5.."}[5m])) + / + sum(rate(quicknotes_http_requests_total[5m])) + ) > 0.05 +for: 5m +labels: + severity: page +``` + +### Trigger demo + +```bash +bash monitoring/scripts/inject-errors.sh +# 360s of mixed healthy + malformed POST /notes traffic +``` + +Errors panel during injection (sustained **~40–50%** error ratio, well above 5% threshold): + +Screenshot: [`submissions/screenshots/grafana_2.png`](screenshots/grafana_2.png) + +Prometheus alert rule loaded (`HighErrorRate`, `for: 5m`, `severity: page`, runbook annotation): + +Screenshot: [`submissions/screenshots/alerts.png`](screenshots/alerts.png) + +During `inject-errors.sh`, the rule transitions **Inactive → Pending → Firing** while error ratio stays >5% for 5 minutes; it returns to **Inactive** after injection stops (as in screenshot). + +### Runbook + +[`docs/runbook/high-error-rate.md`](../docs/runbook/high-error-rate.md) + +### Design questions (Task 2) + +**e) Why sustained 5 minutes instead of first bad request?** + +A single malformed `POST /notes` returns one 400 — not an outage. Requiring **5 minutes** above 5% filters bursty bad clients and avoids paging on noise; you alert when users are affected for a *sustained* period, matching SLO-style “budget over time” thinking. + +**f) Symptom vs cause alert example for QuickNotes?** + +**Symptom (good):** error ratio > 5% — what users see. **Cause (worse as a page):** `container_cpu_usage > 80%` — CPU can be high while errors are zero, or errors can happen at low CPU (bad deploy, bug). Cause metrics create false positives and send on-call chasing the wrong layer.
+ +**g) Alert fatigue — quantitative threshold for “too noisy”?** + +If **`HighErrorRate` pages more than ~1–2 times per month** when users were **not** actually impaired (checked via health checks / support tickets), the threshold or `for:` duration is too aggressive. Rule of thumb from Lecture 8: if **>30% of pages** are false alarms, fix or silence the alert before adding more. + +--- + +## Bonus — Synthetic monitoring (not completed) + +**Attempted** ngrok, `trycloudflare.com` quick tunnel, and named Cloudflare Zero Trust tunnel (`lab8-quicknote`). + +**Blocked by network** — `cloudflared` connectivity pre-checks: + +| Check | Result | +|-------|--------| +| DNS Resolution | PASS | +| Cloudflare API (`api.cloudflare.com:443`) | PASS | +| UDP/QUIC to `region*.v2.argotunnel.com` | **FAIL** | +| TCP/HTTP2 to `region*.v2.argotunnel.com:7844` | **FAIL** — blocked or unreachable | + +``` +ERROR: Allow outbound QUIC traffic on port 7844 or use HTTP2. +ERROR: Allow outbound TCP on port 7844. +SUMMARY: Environment has critical failures. cloudflared may not be able to establish a tunnel. +``` + +Tunnel dashboard stayed **Inactive** (0 replicas). Checkly could not be configured without a public URL. + +Setup notes: [`monitoring/docs/bonus-checkly-setup.md`](../monitoring/docs/bonus-checkly-setup.md) + +### Failure-mode analysis + +**Checkly would catch, Prometheus cannot:** Internet path failures (DNS, TLS, regional routing, tunnel down) — external probes hit the public URL like real users; Prometheus only scrapes `quicknotes:8080` inside Docker. + +**Prometheus catches, Checkly cannot:** In-app error ratios across all routes, saturation (`quicknotes_notes_total`), and sustained 5xx/4xx alerts — Checkly only probes `/health` once per minute per region. 
+ +--- + +## Lab 8 completion checklist + +### Task 1 (6 pts) + +- [x] Prometheus scrapes QuickNotes (`up`) +- [x] Grafana 4-panel golden-signals dashboard +- [x] Traffic generated, graphs non-trivial +- [x] Design questions a–d answered + +### Task 2 (4 pts) + +- [x] Alert rule with 5% / 5m sustained gate +- [x] Runbook complete +- [x] Alert observed Firing +- [x] Design questions e–g answered + +### Bonus (2 pts) + +- [x] Attempted (ngrok + cloudflared) — blocked on port **7844** +- [ ] Checkly 2-region comparison (not possible without public URL) +- [x] Failure-mode analysis written + +### Submission + +- [x] Course PR: https://github.com/inno-devops-labs/DevOps-Intro/pull/1222 +- [x] Fork PR: https://github.com/selysecr332/DevOps-Intro/pull/9 +- [x] Moodle URL submitted + diff --git a/submissions/screenshots/alerts.png b/submissions/screenshots/alerts.png new file mode 100644 index 000000000..cc1eaea4b Binary files /dev/null and b/submissions/screenshots/alerts.png differ diff --git a/submissions/screenshots/grafana.png b/submissions/screenshots/grafana.png new file mode 100644 index 000000000..e37e4fb46 Binary files /dev/null and b/submissions/screenshots/grafana.png differ diff --git a/submissions/screenshots/grafana_2.png b/submissions/screenshots/grafana_2.png new file mode 100644 index 000000000..0db1f2bcc Binary files /dev/null and b/submissions/screenshots/grafana_2.png differ