diff --git a/docs/o11y-internal-vip-reconcile.md b/docs/o11y-internal-vip-reconcile.md new file mode 100644 index 0000000..441b597 --- /dev/null +++ b/docs/o11y-internal-vip-reconcile.md @@ -0,0 +1,257 @@ +# O11y Internal LB VIP Reconciliation + +This guide covers exactly one failure scenario: + +- the `o11y` internal load balancer IP changed +- `staging` and `prod` still point at the old IP in `coredns-custom` +- logs and metrics stop reaching `o11y` + +This guide tells you exactly how to check for that problem and fix it. + +## What you are fixing + +`staging` and `prod` do **not** use a private DNS service. + +Instead, they use the `coredns-custom` ConfigMap to force these names to the current internal `o11y` load balancer IP: + +- `prom.vipyrsec.com` +- `loki.vipyrsec.com` +- `grafana.vipyrsec.com` + +If the `o11y` internal load balancer is recreated and gets a new private IP, those overrides become stale. + +When that happens: + +- `staging` and `prod` still resolve the old IP +- Alloy keeps trying to send to the old IP +- observability ingestion breaks + +## What you need before you start + +You need all of these: + +1. A shell in the `infrastructure` repo. +2. `kubectl` installed. +3. `rg` (ripgrep) installed. +4. Working kube contexts for: + - `do-sfo3-o11y` + - `do-sfo3-staging` + - `do-sfo3-prod` + +If any of those is missing, stop and fix that first. + +## Where the helper script lives + +Script path (replace `/home/rem/github/vipyrsec` with the directory where you cloned the repo): + +`/home/rem/github/vipyrsec/infrastructure/scripts/reconcile-o11y-internal-vip.sh` + +## Step 1: Open the repo + +Run (using your own clone path): + +```bash +cd /home/rem/github/vipyrsec/infrastructure +``` + +## Step 2: Make sure the script exists + +Run: + +```bash +ls -l ./scripts/reconcile-o11y-internal-vip.sh +``` + +You should see the script in the output. + +If you do not see it, stop. 
+ +## Step 3: Make the script executable + +Run: + +```bash +chmod +x ./scripts/reconcile-o11y-internal-vip.sh +``` + +## Step 4: Run the script in dry-run mode first + +Run: + +```bash +./scripts/reconcile-o11y-internal-vip.sh +``` + +Important: + +- Do **not** start with `--apply` +- Dry-run is there so you can see what the script thinks the current `o11y` VIP is + +## Step 5: Read the dry-run output + +The script will print: + +- the current internal `o11y` VIP +- the live `coredns-custom` contents for `staging` +- the live `coredns-custom` contents for `prod` + +You are looking for this: + +- if all three host entries in both `staging` and `prod` already point at the current `o11y` VIP printed by the script, you do **not** need to repair anything +- if any entry in `staging` or `prod` still points at an older IP, that entry is stale and you should continue + +## Step 6: Apply the fix + +Run: + +```bash +./scripts/reconcile-o11y-internal-vip.sh --apply +``` + +What this does: + +1. Reads the live internal `o11y` LB IP from: + - context `do-sfo3-o11y` + - namespace `default` + - service `vipyrsec-ingress-nginx-controller` +2. Rewrites `coredns-custom` in: + - `do-sfo3-staging` + - `do-sfo3-prod` +3. Restarts CoreDNS in both clusters +4. Waits for both CoreDNS rollouts to finish +5. Verifies that all three hostnames resolve to the new VIP in both clusters + +## Step 7: If the script succeeds + +You are not done yet. + +You still need to confirm that ingestion is actually healthy. + +Run these commands. The pod name `prometheus-7454cf6864-fpmvm` changes whenever the Prometheus pod is recreated, so if `exec` cannot find it, replace it with the current name from `kubectl --context do-sfo3-o11y -n prometheus get pods`: + +```bash +kubectl --context do-sfo3-o11y -n prometheus exec prometheus-7454cf6864-fpmvm -- promtool query instant http://localhost:9090 'count(up{cluster="staging",job=~".*postgres.*"})' +``` + +```bash +kubectl --context do-sfo3-o11y -n prometheus exec prometheus-7454cf6864-fpmvm -- promtool query instant http://localhost:9090 'count(up{cluster="prod",job=~".*postgres.*"})' +``` + +Healthy output should show a numeric result instead of an empty vector. 
+ +## Step 8: Check that the endpoints are reachable privately + +Run these commands. + +From `staging`: + +```bash +kubectl --context do-sfo3-staging -n default run prom-ready-staging --rm -i --restart=Never --image=curlimages/curl --command -- sh -lc 'curl -sk -o /dev/null -w "%{http_code}\n" https://prom.vipyrsec.com/-/ready' +``` + +```bash +kubectl --context do-sfo3-staging -n default run loki-ready-staging --rm -i --restart=Never --image=curlimages/curl --command -- sh -lc 'curl -sk -o /dev/null -w "%{http_code}\n" https://loki.vipyrsec.com/ready' +``` + +```bash +kubectl --context do-sfo3-staging -n default run grafana-login-staging --rm -i --restart=Never --image=curlimages/curl --command -- sh -lc 'curl -sk -o /dev/null -w "%{http_code}\n" https://grafana.vipyrsec.com/login' +``` + +From `prod`: + +```bash +kubectl --context do-sfo3-prod -n default run prom-ready-prod --rm -i --restart=Never --image=curlimages/curl --command -- sh -lc 'curl -sk -o /dev/null -w "%{http_code}\n" https://prom.vipyrsec.com/-/ready' +``` + +```bash +kubectl --context do-sfo3-prod -n default run loki-ready-prod --rm -i --restart=Never --image=curlimages/curl --command -- sh -lc 'curl -sk -o /dev/null -w "%{http_code}\n" https://loki.vipyrsec.com/ready' +``` + +```bash +kubectl --context do-sfo3-prod -n default run grafana-login-prod --rm -i --restart=Never --image=curlimages/curl --command -- sh -lc 'curl -sk -o /dev/null -w "%{http_code}\n" https://grafana.vipyrsec.com/login' +``` + +Expected results: + +- Prometheus: `200` +- Loki: `404` on `/ready` is acceptable and still proves reachability +- Grafana: `200` + +## What to do if the script fails + +Do not guess. + +Read the error and follow this order: + +1. Check that the `o11y` Service has a VIP at all. + +Run: + +```bash +kubectl --context do-sfo3-o11y -n default get svc vipyrsec-ingress-nginx-controller -o yaml +``` + +If `.status.loadBalancer.ingress[0].ip` is empty, stop. 
+ +The `o11y` internal LB itself is not ready yet. + +2. Check that CoreDNS restarted successfully. + +Run: + +```bash +kubectl --context do-sfo3-staging -n kube-system rollout status deployment coredns --timeout=180s +``` + +```bash +kubectl --context do-sfo3-prod -n kube-system rollout status deployment coredns --timeout=180s +``` + +If either rollout fails, stop and fix CoreDNS before doing anything else. + +3. Check what `coredns-custom` actually contains. + +Run: + +```bash +kubectl --context do-sfo3-staging -n kube-system get configmap coredns-custom -o yaml +``` + +```bash +kubectl --context do-sfo3-prod -n kube-system get configmap coredns-custom -o yaml +``` + +You should see `prom.server`, `loki.server`, and `grafana.server` all pointing to the same `o11y` internal VIP. + +4. Check DNS resolution from inside the client clusters. + +Run: + +```bash +kubectl --context do-sfo3-staging -n default run dns-check-staging --rm -i --restart=Never --image=busybox:1.36 --command -- sh -lc "nslookup prom.vipyrsec.com && nslookup loki.vipyrsec.com && nslookup grafana.vipyrsec.com" +``` + +```bash +kubectl --context do-sfo3-prod -n default run dns-check-prod --rm -i --restart=Never --image=busybox:1.36 --command -- sh -lc "nslookup prom.vipyrsec.com && nslookup loki.vipyrsec.com && nslookup grafana.vipyrsec.com" +``` + +If those do not return the current `o11y` internal VIP, the repair did not take. + +## What “good” looks like + +The repair is complete only when all of these are true: + +1. `o11y` ingress Service has an internal VIP. +2. `staging` and `prod` CoreDNS both resolve: + - `prom.vipyrsec.com` + - `loki.vipyrsec.com` + - `grafana.vipyrsec.com` + to that VIP. +3. Internal HTTPS checks work from both clusters. +4. Prometheus in `o11y` still shows healthy scrape targets. + +If even one of those is false, the repair is not complete. 
+ +## One sentence summary + +If the `o11y` internal LB IP changes, run the script in dry-run mode first, then run it with `--apply`, then do the verification commands until you prove the client clusters resolve and reach the new VIP correctly. diff --git a/kubernetes/chart/values/o11y/vipyrsec.yaml b/kubernetes/chart/values/o11y/vipyrsec.yaml index 8eac5d7..8830f9f 100644 --- a/kubernetes/chart/values/o11y/vipyrsec.yaml +++ b/kubernetes/chart/values/o11y/vipyrsec.yaml @@ -2,22 +2,20 @@ ingress-nginx: controller: service: - type: NodePort - nodePorts: - http: 30080 - https: 30443 + type: LoadBalancer + annotations: + service.beta.kubernetes.io/do-loadbalancer-network: INTERNAL loki: enabled: true - # annotations: - # cert-manager.io/cluster-issuer: letsencrypt - - # ingress: - # tls: - # - hosts: - # - loki.vipyrsec.com - # secretName: loki-tls + ingress: + annotations: + cert-manager.io/cluster-issuer: letsencrypt + tls: + - hosts: + - loki.vipyrsec.com + secretName: loki-tls alloy: enabled: true diff --git a/kubernetes/chart/values/prod/vipyrsec.yaml b/kubernetes/chart/values/prod/vipyrsec.yaml index e1464cb..93def60 100644 --- a/kubernetes/chart/values/prod/vipyrsec.yaml +++ b/kubernetes/chart/values/prod/vipyrsec.yaml @@ -65,7 +65,7 @@ alloy: loki.write "o11y" { endpoint { - url = "http://loki.vipyrsec.com:30080/loki/api/v1/push" + url = "https://loki.vipyrsec.com/loki/api/v1/push" } } @@ -82,6 +82,15 @@ alloy: prometheus.exporter.postgres "dragonfly" { data_source_names = [remote.kubernetes.secret.postgres.data.dsn] + enabled_collectors = [ + "locks", + "replication", + "replication_slot", + "stat_bgwriter", + "stat_database", + "stat_user_tables", + "statio_user_tables", + ] } prometheus.scrape "self" { @@ -132,6 +141,6 @@ alloy: prometheus.remote_write "o11y" { endpoint { - url = "http://prom.vipyrsec.com:30080/api/v1/write" + url = "https://prom.vipyrsec.com/api/v1/write" } } diff --git a/kubernetes/chart/values/staging/vipyrsec.yaml 
b/kubernetes/chart/values/staging/vipyrsec.yaml index e22c520..c7a553e 100644 --- a/kubernetes/chart/values/staging/vipyrsec.yaml +++ b/kubernetes/chart/values/staging/vipyrsec.yaml @@ -65,7 +65,7 @@ alloy: loki.write "o11y" { endpoint { - url = "http://loki.vipyrsec.com:30080/loki/api/v1/push" + url = "https://loki.vipyrsec.com/loki/api/v1/push" } } @@ -82,6 +82,15 @@ alloy: prometheus.exporter.postgres "dragonfly" { data_source_names = [remote.kubernetes.secret.postgres.data.dsn] + enabled_collectors = [ + "locks", + "replication", + "replication_slot", + "stat_bgwriter", + "stat_database", + "stat_user_tables", + "statio_user_tables", + ] } prometheus.scrape "self" { @@ -132,6 +141,6 @@ alloy: prometheus.remote_write "o11y" { endpoint { - url = "http://prom.vipyrsec.com:30080/api/v1/write" + url = "https://prom.vipyrsec.com/api/v1/write" } } diff --git a/kubernetes/manifests/cert-manager/cluster_issuer.yaml b/kubernetes/manifests/cert-manager/cluster_issuer.yaml index 2106353..a8c2ada 100644 --- a/kubernetes/manifests/cert-manager/cluster_issuer.yaml +++ b/kubernetes/manifests/cert-manager/cluster_issuer.yaml @@ -8,11 +8,14 @@ metadata: spec: acme: - email: bradley.reynolds@darbia.dev + email: siddhesh.mhadnak@outlook.com server: https://acme-v02.api.letsencrypt.org/directory privateKeySecretRef: name: letsencrypt-issuer-account-key solvers: - - http01: - ingress: - ingressClassName: nginx + - dns01: + cloudflare: + apiTokenSecretRef: + name: cloudflare + key: token + email: siddhesh.mhadnak@outlook.com diff --git a/kubernetes/manifests/kube-system/coredns-custom-configmap.yaml b/kubernetes/manifests/kube-system/coredns-custom-configmap.yaml index 240e1f0..121af42 100644 --- a/kubernetes/manifests/kube-system/coredns-custom-configmap.yaml +++ b/kubernetes/manifests/kube-system/coredns-custom-configmap.yaml @@ -7,17 +7,24 @@ metadata: namespace: kube-system data: + prom.server: | + prom.vipyrsec.com:53 { + hosts { + 10.124.0.2 prom.vipyrsec.com + fallthrough + } 
+ } loki.server: | loki.vipyrsec.com:53 { hosts { - 10.124.0.11 loki.vipyrsec.com + 10.124.0.2 loki.vipyrsec.com fallthrough } } - prom.server: |- - prom.vipyrsec.com:53 { + grafana.server: | + grafana.vipyrsec.com:53 { hosts { - 10.124.0.11 prom.vipyrsec.com + 10.124.0.2 grafana.vipyrsec.com fallthrough } } diff --git a/kubernetes/manifests/monitoring/prometheus/ingress.yaml b/kubernetes/manifests/monitoring/prometheus/ingress.yaml index 7d11b70..6129232 100644 --- a/kubernetes/manifests/monitoring/prometheus/ingress.yaml +++ b/kubernetes/manifests/monitoring/prometheus/ingress.yaml @@ -3,18 +3,18 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: - # annotations: - # cert-manager.io/cluster-issuer: letsencrypt + annotations: + cert-manager.io/cluster-issuer: letsencrypt name: prometheus namespace: prometheus spec: ingressClassName: nginx - # tls: - # - hosts: - # - prom.vipyrsec.com - # secretName: prom-tls + tls: + - hosts: + - prom.vipyrsec.com + secretName: prom-tls rules: - host: prom.vipyrsec.com diff --git a/scripts/reconcile-o11y-internal-vip.sh b/scripts/reconcile-o11y-internal-vip.sh new file mode 100644 index 0000000..f6bb25e --- /dev/null +++ b/scripts/reconcile-o11y-internal-vip.sh @@ -0,0 +1,184 @@ +#!/usr/bin/env bash + +set -euo pipefail + +readonly O11Y_CONTEXT="do-sfo3-o11y" +readonly STAGING_CONTEXT="do-sfo3-staging" +readonly PROD_CONTEXT="do-sfo3-prod" +readonly O11Y_NAMESPACE="default" +readonly O11Y_SERVICE="vipyrsec-ingress-nginx-controller" +readonly COREDNS_NAMESPACE="kube-system" +readonly COREDNS_CONFIGMAP="coredns-custom" +readonly COREDNS_DEPLOYMENT="coredns" + +usage() { + cat <<'EOF' +Usage: + reconcile-o11y-internal-vip.sh [--apply] + +What it does: + 1. Reads the current internal o11y load balancer VIP from: + context do-sfo3-o11y, namespace default, service vipyrsec-ingress-nginx-controller + 2. Compares that VIP to the live coredns-custom ConfigMap in: + - do-sfo3-staging + - do-sfo3-prod + 3. 
If --apply is provided, updates prom.vipyrsec.com, loki.vipyrsec.com, and + grafana.vipyrsec.com to the current VIP and restarts CoreDNS in both clusters. + +Default mode: + Dry-run only. Nothing is changed unless you pass --apply. + +Requirements: + - kubectl installed + - kube contexts available for do-sfo3-o11y, do-sfo3-staging, do-sfo3-prod + - permission to update ConfigMaps and restart CoreDNS in staging and prod +EOF +} + +log() { + printf '[reconcile-o11y-internal-vip] %s\n' "$*" +} + +die() { + printf '[reconcile-o11y-internal-vip] ERROR: %s\n' "$*" >&2 + exit 1 +} + +require_tool() { + command -v "$1" >/dev/null 2>&1 || die "required tool not found: $1" +} + +get_o11y_vip() { + kubectl --context "$O11Y_CONTEXT" -n "$O11Y_NAMESPACE" \ + get svc "$O11Y_SERVICE" \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' +} + +render_coredns_configmap() { + local vip="$1" + cat <}" +} + +apply_cluster() { + local context="$1" + local vip="$2" + + log "Applying updated coredns-custom to ${context}" + render_coredns_configmap "$vip" | kubectl --context "$context" apply -f - + + log "Restarting CoreDNS in ${context}" + kubectl --context "$context" -n "$COREDNS_NAMESPACE" rollout restart deployment "$COREDNS_DEPLOYMENT" + + log "Waiting for CoreDNS rollout in ${context}" + kubectl --context "$context" -n "$COREDNS_NAMESPACE" rollout status deployment "$COREDNS_DEPLOYMENT" --timeout=180s +} + +verify_resolution() { + local context="$1" + local host="$2" + local expected_vip="$3" + local output + + output="$( + kubectl --context "$context" -n default run "dns-check-${host//./-}" \ + --rm -i --restart=Never --image=busybox:1.36 \ + --command -- sh -lc "nslookup ${host}" 2>/dev/null || true + )" + + printf '\nResolution check for %s in %s:\n%s\n' "$host" "$context" "$output" + printf '%s' "$output" | rg -q "$expected_vip" || die "${context} does not resolve ${host} to ${expected_vip}" +} + +main() { + local apply_mode="false" + + if [[ "${1:-}" == "--help" || "${1:-}" == 
"-h" ]]; then + usage + exit 0 + fi + + if [[ "${1:-}" == "--apply" ]]; then + apply_mode="true" + elif [[ $# -gt 0 ]]; then + usage + die "unknown argument: $1" + fi + + require_tool kubectl + require_tool rg + + local vip + vip="$(get_o11y_vip)" + [[ -n "$vip" ]] || die "o11y internal load balancer VIP is empty" + + log "Detected o11y internal VIP: ${vip}" + + show_cluster_status "$STAGING_CONTEXT" "$vip" + show_cluster_status "$PROD_CONTEXT" "$vip" + + if [[ "$apply_mode" != "true" ]]; then + log "Dry-run only. Re-run with --apply to update staging and prod." + exit 0 + fi + + apply_cluster "$STAGING_CONTEXT" "$vip" + apply_cluster "$PROD_CONTEXT" "$vip" + + verify_resolution "$STAGING_CONTEXT" "prom.vipyrsec.com" "$vip" + verify_resolution "$STAGING_CONTEXT" "loki.vipyrsec.com" "$vip" + verify_resolution "$STAGING_CONTEXT" "grafana.vipyrsec.com" "$vip" + verify_resolution "$PROD_CONTEXT" "prom.vipyrsec.com" "$vip" + verify_resolution "$PROD_CONTEXT" "loki.vipyrsec.com" "$vip" + verify_resolution "$PROD_CONTEXT" "grafana.vipyrsec.com" "$vip" + + log "Reconciliation complete." +} + +main "$@"