From e3af57262051f080cb4b814f89cd2b456def45ae Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 17 Mar 2026 14:31:43 +0000 Subject: [PATCH 1/2] Initial plan From 39273e1d1ef888fa20d60a99944afdc2dc7f9716 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 17 Mar 2026 14:41:34 +0000 Subject: [PATCH 2/2] feat: add disk usage monitoring and auto-alerting for all GCE instances Co-authored-by: numbers-official <181934381+numbers-official@users.noreply.github.com> --- README.md | 67 ++++++ docs/runbooks/disk-management.md | 342 ++++++++++++++++++++++++++++ monitoring/disk-check.sh | 138 +++++++++++ monitoring/setup-gcp-disk-alerts.sh | 194 ++++++++++++++++ 4 files changed, 741 insertions(+) create mode 100644 docs/runbooks/disk-management.md create mode 100755 monitoring/disk-check.sh create mode 100755 monitoring/setup-gcp-disk-alerts.sh diff --git a/README.md b/README.md index c207e4b..d2bc718 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,10 @@ Numbers Network is dedicated to preserving digital media provenance and related - [Wrapped NUM](#wrapped-num) - [Bridge](#bridge) - [Archieve Node](#archieve-node) +- [Disk Monitoring](#disk-monitoring) + - [Cron-based disk check script](#cron-based-disk-check-script) + - [GCP Cloud Monitoring alerts](#gcp-cloud-monitoring-alerts) + - [Disk Management Runbook](#disk-management-runbook) ## Mainnet: Jade (玉) @@ -951,3 +955,66 @@ Make a Full Node instance to be an Archive Node instance: ``` [Discord discussion](https://discord.com/channels/578992315641626624/905684871731634196/1026850988042244247) + +# Disk Monitoring + +Multiple GCE instances run continuously-growing blockchain databases. Without proactive +monitoring, validators silently auto-shutdown when disk space falls below ~3% free, +causing chain downtime and transaction mempool backlog. 
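The percentage the thresholds below refer to is the `Use%` column reported by `df`. A minimal sketch of reading it for the root filesystem (point it at each instance's data mount in practice):

```sh
# Print the root filesystem's usage percentage — the value compared against
# the WARNING/CRITICAL thresholds.
usage_pct=$(df -P / | awk 'NR==2 {gsub(/%/, "", $5); print $5}')
echo "Root filesystem usage: ${usage_pct}%"
```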
+
+Alert thresholds:
+
+| Level    | Threshold | Action                                    |
+|----------|-----------|-------------------------------------------|
+| OK       | < 80%     | No action required                        |
+| WARNING  | ≥ 80%     | Schedule cleanup or expansion within 48 h |
+| CRITICAL | ≥ 90%     | Immediate action required                 |
+
+## Cron-based disk check script
+
+`monitoring/disk-check.sh` is a lightweight shell script that checks all mounted
+filesystems and fires alerts via **email** and/or **Slack** when usage exceeds the
+configured thresholds.
+
+Deploy to each GCE instance:
+
+```sh
+sudo mkdir -p /opt/numbers-network/monitoring
+sudo cp monitoring/disk-check.sh /opt/numbers-network/monitoring/
+sudo chmod +x /opt/numbers-network/monitoring/disk-check.sh
+
+# Configure alert destinations and thresholds. Cron does not inherit shell
+# exports, so define the variables in the crontab itself (cron honours
+# VAR=value lines) or hard-code them in the script's CONFIG section.
+(crontab -l 2>/dev/null; cat <<'CRON'
+ALERT_EMAIL=ops@example.com
+SLACK_WEBHOOK_URL=https://hooks.slack.com/services/XXX/YYY/ZZZ
+DISK_WARNING_PCT=80
+DISK_CRITICAL_PCT=90
+# Run the disk check every 15 minutes
+*/15 * * * * /opt/numbers-network/monitoring/disk-check.sh >> /var/log/disk-check.log 2>&1
+CRON
+) | crontab -
+```
+
+## GCP Cloud Monitoring alerts
+
+`monitoring/setup-gcp-disk-alerts.sh` provisions GCP-native alerting policies
+(WARNING at 80%, CRITICAL at 90%) and notification channels using the `gcloud` CLI.
+
+```sh
+export GCP_PROJECT=your-gcp-project-id
+export ALERT_EMAIL=ops@example.com
+# Optional Slack integration:
+# export SLACK_CHANNEL_NAME=numbers-ops
+# export SLACK_AUTH_TOKEN=xoxb-...
+
+bash monitoring/setup-gcp-disk-alerts.sh
+```
+
+> **Note**: The [Ops Agent](https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation)
+> must be installed on each GCE instance to populate the
+> `agent.googleapis.com/disk/percent_used` metric used by these policies.
+ +## Disk Management Runbook + +For step-by-step instructions on disk expansion, Avalanchego chain data pruning, +and Blockscout/explorer database cleanup, see: + +[docs/runbooks/disk-management.md](docs/runbooks/disk-management.md) diff --git a/docs/runbooks/disk-management.md b/docs/runbooks/disk-management.md new file mode 100644 index 0000000..4ba28be --- /dev/null +++ b/docs/runbooks/disk-management.md @@ -0,0 +1,342 @@ +# Disk Management Runbook + +**Status**: Active +**Last Updated**: 2026-03-17 +**Applies to**: All Numbers Network GCE instances (mainnet validators, testnet validators, explorers) + +--- + +## Table of Contents + +- [Background](#background) +- [Alert Thresholds](#alert-thresholds) +- [Instance Inventory](#instance-inventory) +- [Immediate Triage](#immediate-triage) +- [Remediation: Expand a GCE Persistent Disk (Online)](#remediation-expand-a-gce-persistent-disk-online) +- [Remediation: Avalanchego Chain Data Pruning](#remediation-avalanchego-chain-data-pruning) +- [Remediation: Blockscout / Explorer Database Cleanup](#remediation-blockscout--explorer-database-cleanup) +- [Automated Monitoring Setup](#automated-monitoring-setup) +- [Incident History](#incident-history) + +--- + +## Background + +GCE instances running Avalanche validators accumulate blockchain data continuously. +Without monitoring, disks can silently fill to 100%, triggering an automatic OS-level +shutdown of the validator node. This causes: + +- Chain downtime for Numbers Network validators +- Transaction mempool backlog +- Potential double-sign risk if the node rejoins without catching up + +The 2026-03-15 incident on `numbers-mainnet-validator-1` (auto-shutdown at 97% disk) +and the 2026-03-17 situation on `numbers-testnet-validator-3` (96% disk) prompted +implementation of this runbook. 
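The alert levels used throughout this runbook reduce to a few lines of shell; a minimal sketch of the classification logic, using this runbook's default thresholds:

```bash
# Map a disk-usage percentage to the runbook's alert levels.
classify_usage() {
  local pct="$1"
  if   (( pct >= 97 )); then echo "DANGER"
  elif (( pct >= 90 )); then echo "CRITICAL"
  elif (( pct >= 80 )); then echo "WARNING"
  else                       echo "OK"
  fi
}

classify_usage 45   # → OK
classify_usage 96   # → CRITICAL
```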
+ +--- + +## Alert Thresholds + +| Level | Threshold | Action Required | +|----------|-----------|--------------------------------------------| +| OK | < 80% | No action | +| WARNING | ≥ 80% | Schedule cleanup or expansion within 48 h | +| CRITICAL | ≥ 90% | Immediate action — expand or prune today | +| DANGER | ≥ 97% | Emergency — node may auto-shutdown | + +--- + +## Instance Inventory + +| Instance | Typical Disk | Growth Rate | Notes | +|-----------------------------|-------------|------------------|--------------------------| +| numbers-mainnet-validator-1 | 3.4 T | ~20 GB/day | Primary mainnet validator | +| numbers-mainnet-validator-a1| 1.9 T | ~15 GB/day | | +| numbers-mainnet-validator-a2| 2.0 T | ~15 GB/day | | +| numbers-testnet-validator-3 | 497 G | ~5 GB/day | Testnet — smaller disk | +| testnet-explorer | 29 G | ~500 MB/day | Blockscout explorer | +| mainnet-explorer | 47 G | ~1 GB/day | Blockscout explorer | + +--- + +## Immediate Triage + +Run these commands on the affected instance to understand disk consumption: + +```bash +# Overall disk usage +df -h + +# Top disk consumers in the avalanche data directory +sudo du -sh /home/ubuntu/.avalanchego/* 2>/dev/null | sort -rh | head -20 + +# Top disk consumers in blockchain chain data +sudo du -sh /home/ubuntu/.avalanchego/db/*/chainData 2>/dev/null | sort -rh | head -10 + +# Log file sizes +sudo du -sh /var/log/* 2>/dev/null | sort -rh | head -10 + +# Docker volumes (if applicable) +docker system df 2>/dev/null || true +``` + +--- + +## Remediation: Expand a GCE Persistent Disk (Online) + +GCE supports **online disk expansion** — the instance stays running throughout. 
+ +### Step 1 — Resize the persistent disk in GCP + +```bash +# Using gcloud CLI (replace variables as appropriate) +INSTANCE_NAME="numbers-testnet-validator-3" +ZONE="us-central1-a" # adjust to the actual zone +DISK_NAME="${INSTANCE_NAME}" # disk usually shares the instance name +NEW_SIZE_GB=600 # desired new size in GiB + +gcloud compute disks resize "${DISK_NAME}" \ + --size="${NEW_SIZE_GB}GB" \ + --zone="${ZONE}" +``` + +You can also do this in the [GCP Console](https://console.cloud.google.com/compute/disks): +**Compute Engine → Disks → select disk → Edit → increase size**. + +> ⚠️ GCE disks can only be **increased** in size, never decreased. + +### Step 2 — Grow the partition on the instance + +SSH into the instance and run: + +```bash +# Confirm the disk device (usually /dev/sda or /dev/nvme0n1) +lsblk + +# Grow the partition (number 1 in most GCE instances) +sudo growpart /dev/sda 1 +# or for NVMe: +sudo growpart /dev/nvme0n1 1 +``` + +### Step 3 — Resize the filesystem (no unmounting required) + +```bash +# For ext4 filesystems (most GCE boot/data disks) +sudo resize2fs /dev/sda1 +# or for NVMe: +sudo resize2fs /dev/nvme0n1p1 + +# Verify the new size is reflected +df -h / +``` + +For **XFS** filesystems: + +```bash +sudo xfs_growfs / +``` + +### Verification + +```bash +df -h +# The mount point should now show the expanded capacity. +``` + +--- + +## Remediation: Avalanchego Chain Data Pruning + +Avalanchego accumulates historical state data under `~/.avalanchego/db/`. +The two main strategies are **state pruning** (EVM) and **network pruning** (P/X/C chains). + +### Check current data sizes + +```bash +sudo du -sh ~/.avalanchego/db/*/ +sudo du -sh ~/.avalanchego/db/*/*/ 2>/dev/null | sort -rh | head -20 +``` + +### Option A — Enable state pruning in the EVM chain config + +> Applies to **validators** only — do **not** enable on archive nodes. 
+
+Edit the chain config (see `avalanchego/configs/chains/<blockchain-id>/config-validator.json`):
+
+```json
+{
+  "pruning-enabled": true,
+  "state-sync-enabled": false
+}
+```
+
+Restart avalanchego:
+
+```bash
+sudo systemctl restart avalanchego
+```
+
+On first restart with pruning enabled, the node will begin compacting its LevelDB state.
+This takes several hours and temporarily increases CPU/disk I/O.
+
+### Option B — State-sync re-sync (fastest, but requires downtime)
+
+State-sync downloads only the latest state rather than replaying all historical blocks.
+This is the fastest way to reclaim large amounts of disk space.
+
+```bash
+# 1. Stop the node
+sudo systemctl stop avalanchego
+
+# 2. Remove the chain database (preserves staking keys)
+BLOCKCHAIN_ID="2PDRxzc6jMbZSTLb3sufkVszgQc2jtDnYZGtDTAAfom1CTwPsE"  # mainnet
+# BLOCKCHAIN_ID="2oo5UvYgFQikM7KBsMXFQE3RQv3xAFFc8JY2GEBNBF1tp4JaeZ"  # testnet
+sudo rm -rf ~/.avalanchego/db/*/chainData   # removes only chain data
+
+# 3. Enable state-sync in the chain config
+#    Set "state-sync-enabled": true in the relevant config.json
+
+# 4. Restart — the node will state-sync from peers
+sudo systemctl start avalanchego
+sudo journalctl -fu avalanchego
+```
+
+> ⚠️ The node will not be able to serve historical RPC requests until it fully catches up.
+> Do **not** use this on an archive node.
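Whichever option you pick, record usage before and after so the reclaimed space can be quantified. A minimal sketch against a scratch directory (in practice, measure `~/.avalanchego` and make the cleanup step one of the options in this section):

```bash
# Measure bytes reclaimed by a cleanup step (GNU du, apparent sizes).
TARGET_DIR=$(mktemp -d)
truncate -s 8M "${TARGET_DIR}/old.log"   # stand-in for prunable data

before=$(du -sb "${TARGET_DIR}" | awk '{print $1}')
rm -f "${TARGET_DIR}/old.log"            # the actual cleanup step goes here
after=$(du -sb "${TARGET_DIR}" | awk '{print $1}')

echo "Reclaimed: $(( (before - after) / 1024 / 1024 )) MiB"   # → Reclaimed: 8 MiB
rmdir "${TARGET_DIR}"
```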
+ +### Option C — Remove old log files + +```bash +# Rotate and compress logs immediately +sudo logrotate -f /etc/logrotate.d/avalanchego 2>/dev/null || true + +# Remove logs older than 7 days +sudo find ~/.avalanchego/logs/ -name "*.log" -mtime +7 -delete +sudo find /var/log/ -name "*.gz" -mtime +30 -delete +``` + +### Option D — Remove old snapshots + +```bash +# List and remove snapshots (keep at least one recent snapshot) +ls -lh ~/.avalanchego/db/*/snapshots/ 2>/dev/null +# Identify and delete old snapshots after confirming the node is healthy +sudo rm -rf ~/.avalanchego/db/*/snapshots/ +``` + +--- + +## Remediation: Blockscout / Explorer Database Cleanup + +Explorer instances (`testnet-explorer`, `mainnet-explorer`) run Blockscout backed by +PostgreSQL. The database is the primary disk consumer. + +### Check database size + +```bash +# Connect to PostgreSQL (adjust credentials) +sudo -u postgres psql -c "\l+" +sudo -u postgres psql -d blockscout -c " + SELECT pg_size_pretty(pg_database_size('blockscout')) AS db_size; +" + +# Largest tables +sudo -u postgres psql -d blockscout -c " + SELECT relname AS table, + pg_size_pretty(pg_total_relation_size(relid)) AS total_size + FROM pg_catalog.pg_statio_user_tables + ORDER BY pg_total_relation_size(relid) DESC + LIMIT 20; +" +``` + +### Option A — VACUUM and ANALYZE + +Running VACUUM reclaims space from deleted/updated rows without downtime: + +```bash +sudo -u postgres psql -d blockscout -c "VACUUM ANALYZE;" +# For a more aggressive reclaim (brief table locks): +sudo -u postgres psql -d blockscout -c "VACUUM FULL ANALYZE;" +``` + +### Option B — Prune old transaction data (if supported by Blockscout version) + +Some Blockscout versions support data pruning. 
Check:
+
+```bash
+grep -i prune /opt/blockscout/.env
+# If BLOCK_TRANSFORMER=base and version supports it, add:
+#   INDEXER_DISABLE_PENDING_TRANSACTIONS_FETCHER=true
+# to reduce ongoing growth
+```
+
+### Option C — Expand the explorer disk
+
+Follow the same [GCE disk expansion steps](#remediation-expand-a-gce-persistent-disk-online)
+above. Explorer disks are typically smaller, so expansion is inexpensive.
+
+### Option D — Docker volume cleanup (if using Docker deployment)
+
+```bash
+# Remove dangling images and stopped containers
+docker system prune -f
+
+# Show volume usage
+docker system df -v
+```
+
+---
+
+## Automated Monitoring Setup
+
+### Option 1 — Cron-based instance-level script (fallback)
+
+Deploy `monitoring/disk-check.sh` from this repository to each instance:
+
+```bash
+# On each GCE instance:
+sudo mkdir -p /opt/numbers-network/monitoring
+sudo cp /path/to/repo/monitoring/disk-check.sh /opt/numbers-network/monitoring/
+sudo chmod +x /opt/numbers-network/monitoring/disk-check.sh
+
+# Configure the variables in the crontab itself — cron does not read
+# /etc/environment.d (a systemd mechanism) or login-shell exports, but it
+# does honour VAR=value lines at the top of a crontab
+(crontab -l 2>/dev/null; cat <<'CRON'
+ALERT_EMAIL=ops@example.com
+SLACK_WEBHOOK_URL=https://hooks.slack.com/services/XXX/YYY/ZZZ
+DISK_WARNING_PCT=80
+DISK_CRITICAL_PCT=90
+# Run the disk check every 15 minutes
+*/15 * * * * /opt/numbers-network/monitoring/disk-check.sh >> /var/log/disk-check.log 2>&1
+CRON
+) | crontab -
+```
+
+### Option 2 — GCP Cloud Monitoring alerting policies
+
+Use `monitoring/setup-gcp-disk-alerts.sh` to provision GCP-native alerts:
+
+```bash
+export GCP_PROJECT=your-gcp-project-id
+export ALERT_EMAIL=ops@example.com
+# Optional Slack:
+# export SLACK_CHANNEL_NAME=numbers-ops
+# export SLACK_AUTH_TOKEN=xoxb-...
+ +bash monitoring/setup-gcp-disk-alerts.sh +``` + +> **Note**: The GCP script requires the [Ops Agent](https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation) +> to be installed on each instance to report disk metrics. Without it, the GCP +> monitoring metric `agent.googleapis.com/disk/percent_used` will not be populated. + +--- + +## Incident History + +| Date | Instance | Disk % | Event | Resolution | +|------------|-----------------------------|--------|--------------------------------------|--------------------| +| 2026-03-15 | numbers-mainnet-validator-1 | 97% | Auto-shutdown triggered by OS | Manual cleanup | +| 2026-03-17 | numbers-testnet-validator-3 | 96% | CRITICAL — approaching auto-shutdown | Disk expansion + pruning | diff --git a/monitoring/disk-check.sh b/monitoring/disk-check.sh new file mode 100755 index 0000000..6f04050 --- /dev/null +++ b/monitoring/disk-check.sh @@ -0,0 +1,138 @@ +#!/usr/bin/env bash +# disk-check.sh — Cron-deployable disk usage monitor for GCE instances. +# +# Checks every mounted filesystem and fires alerts when usage crosses +# configurable WARNING (default 80%) or CRITICAL (default 90%) thresholds. 
+# +# Alert channels supported: +# • Email — requires a local MTA (sendmail/postfix) or mailx +# • Slack — requires SLACK_WEBHOOK_URL env variable (or set below) +# +# Recommended cron entry (runs every 15 minutes): +# */15 * * * * /opt/numbers-network/monitoring/disk-check.sh >> /var/log/disk-check.log 2>&1 +# +# Environment variables (can also be hard-coded in the CONFIG section): +# DISK_WARNING_PCT — percentage threshold for WARNING (default 80) +# DISK_CRITICAL_PCT — percentage threshold for CRITICAL (default 90) +# ALERT_EMAIL — email address for alert delivery +# SLACK_WEBHOOK_URL — Slack incoming-webhook URL +# INSTANCE_NAME — human-readable instance name (defaults to hostname) + +set -euo pipefail + +# --------------------------------------------------------------------------- +# CONFIG — override via environment variables or edit here +# --------------------------------------------------------------------------- +WARNING_PCT="${DISK_WARNING_PCT:-80}" +CRITICAL_PCT="${DISK_CRITICAL_PCT:-90}" +ALERT_EMAIL="${ALERT_EMAIL:-}" +SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-}" +INSTANCE="${INSTANCE_NAME:-$(hostname)}" +TIMESTAMP="$(date -u '+%Y-%m-%dT%H:%M:%SZ')" + +# Filesystems to skip (space-separated list of mount-point prefixes) +SKIP_MOUNTS="${DISK_SKIP_MOUNTS:-/dev /proc /sys /run /snap}" + +# --------------------------------------------------------------------------- +# HELPER FUNCTIONS +# --------------------------------------------------------------------------- + +log() { + echo "[${TIMESTAMP}] $*" +} + +# Send an email alert. Requires mailx or sendmail. +send_email() { + local subject="$1" + local body="$2" + if [[ -z "${ALERT_EMAIL}" ]]; then + return + fi + if command -v mailx &>/dev/null; then + echo "${body}" | mailx -s "${subject}" "${ALERT_EMAIL}" + elif command -v sendmail &>/dev/null; then + { echo "Subject: ${subject}"; echo ""; echo "${body}"; } | sendmail "${ALERT_EMAIL}" + else + log "WARN: No mail client found — email alert skipped." 
+ fi +} + +# Post a message to Slack via incoming webhook. +send_slack() { + local message="$1" + if [[ -z "${SLACK_WEBHOOK_URL}" ]]; then + return + fi + if ! command -v curl &>/dev/null; then + log "WARN: curl not found — Slack alert skipped." + return + fi + local payload + payload=$(printf '{"text": "%s"}' "${message}") + curl -s -X POST -H 'Content-type: application/json' \ + --data "${payload}" \ + "${SLACK_WEBHOOK_URL}" >/dev/null +} + +# Determine if a mount point should be skipped. +should_skip() { + local mount="$1" + for prefix in ${SKIP_MOUNTS}; do + if [[ "${mount}" == "${prefix}" || "${mount}" == "${prefix}"/* ]]; then + return 0 + fi + done + return 1 +} + +# --------------------------------------------------------------------------- +# MAIN DISK CHECK +# --------------------------------------------------------------------------- + +ALERT_TRIGGERED=0 + +# Parse df output: filesystem, size, used, available, use%, mountpoint +while IFS= read -r line; do + # Skip header line + [[ "${line}" =~ ^Filesystem ]] && continue + + # Extract columns (df -P produces POSIX format: guaranteed single-line per FS) + read -r filesystem size used avail pct mount <<< "${line}" + + # Remove trailing '%' from pct + pct_num="${pct//%/}" + + # Skip non-numeric (e.g. 
headers that slipped through)
+  [[ "${pct_num}" =~ ^[0-9]+$ ]] || continue
+
+  # Skip excluded mount points
+  should_skip "${mount}" && continue
+
+  if (( pct_num >= CRITICAL_PCT )); then
+    level="CRITICAL"
+    ALERT_TRIGGERED=1
+  elif (( pct_num >= WARNING_PCT )); then
+    level="WARNING"
+    ALERT_TRIGGERED=1
+  else
+    level="OK"
+  fi
+
+  log "[${level}] ${mount} — ${pct_num}% used (${used}/${size}) on ${filesystem}"
+
+  if [[ "${level}" != "OK" ]]; then
+    subject="[Disk ${level}] ${INSTANCE} — ${mount} at ${pct_num}%"
+    body=$(printf "Instance : %s\nTimestamp: %s\nMount    : %s\nUsage    : %s%% (%s used of %s, %s free)\nFilesys  : %s\n\nThresholds: WARNING>=%s%% | CRITICAL>=%s%%" \
+      "${INSTANCE}" "${TIMESTAMP}" "${mount}" "${pct_num}" \
+      "${used}" "${size}" "${avail}" "${filesystem}" \
+      "${WARNING_PCT}" "${CRITICAL_PCT}")
+    slack_msg="[Disk ${level}] *${INSTANCE}* — \`${mount}\` is at *${pct_num}%* (${used}/${size}). Investigate immediately."
+
+    send_email "${subject}" "${body}"
+    send_slack "${slack_msg}"
+  fi
+# GNU df rejects combining -P with --output, so request the columns explicitly
+# and fall back to plain POSIX output where --output is unavailable.
+done < <(df -h --output=source,size,used,avail,pcent,target 2>/dev/null || df -P 2>/dev/null)
+
+if (( ALERT_TRIGGERED == 0 )); then
+  log "[OK] All filesystems below warning threshold (${WARNING_PCT}%)."
+fi
diff --git a/monitoring/setup-gcp-disk-alerts.sh b/monitoring/setup-gcp-disk-alerts.sh
new file mode 100755
index 0000000..ce77455
--- /dev/null
+++ b/monitoring/setup-gcp-disk-alerts.sh
@@ -0,0 +1,194 @@
+#!/usr/bin/env bash
+# setup-gcp-disk-alerts.sh — Provision GCP Cloud Monitoring alerting policies
+# for disk utilisation across all Numbers Network GCE instances.
+#
+# This script uses the gcloud CLI to create:
+#   1. (Optional) A Slack or email notification channel
+#   2. A WARNING alerting policy — fires when disk used > 80%
+#   3.
A CRITICAL alerting policy — fires when disk used > 90%
+#
+# Pre-requisites:
+#   • gcloud CLI installed and authenticated (`gcloud auth login`)
+#   • Target GCP project set (`gcloud config set project PROJECT_ID`)
+#   • roles/monitoring.alertPolicyEditor (or Owner) on the project
+#
+# Usage:
+#   export GCP_PROJECT=your-gcp-project-id
+#   export ALERT_EMAIL=ops@example.com            # optional
+#   export SLACK_CHANNEL_NAME=numbers-disk-alerts # optional
+#   export SLACK_AUTH_TOKEN=xoxb-...              # optional — required for Slack
+#   bash setup-gcp-disk-alerts.sh

+set -euo pipefail
+
+# ---------------------------------------------------------------------------
+# CONFIG
+# ---------------------------------------------------------------------------
+GCP_PROJECT="${GCP_PROJECT:-$(gcloud config get-value project 2>/dev/null)}"
+ALERT_EMAIL="${ALERT_EMAIL:-}"
+SLACK_CHANNEL_NAME="${SLACK_CHANNEL_NAME:-}"
+SLACK_AUTH_TOKEN="${SLACK_AUTH_TOKEN:-}"
+
+POLICY_PREFIX="numbers-network-disk"
+
+# Disk utilisation thresholds. The Ops Agent metric below reports a
+# percentage in the range 0–100, so the threshold for 80% is 80, not 0.80.
+WARNING_RATIO="80"
+CRITICAL_RATIO="90"
+
+# Alert duration: how long the threshold must be exceeded before firing
+DURATION_WARNING="300s"   # 5 minutes
+DURATION_CRITICAL="60s"   # 1 minute
+
+# GCP Ops Agent disk utilisation metric used by the alerting policies
+DISK_METRIC="agent.googleapis.com/disk/percent_used"
+
+if [[ -z "${GCP_PROJECT}" ]]; then
+  echo "ERROR: GCP_PROJECT is not set. Run: export GCP_PROJECT=<project-id>"
+  exit 1
+fi
+
+echo "==> Configuring GCP Monitoring alerts for project: ${GCP_PROJECT}"
+
+# ---------------------------------------------------------------------------
+# 1.
CREATE NOTIFICATION CHANNELS +# --------------------------------------------------------------------------- +NOTIFICATION_CHANNELS="" + +# --- Email channel --- +if [[ -n "${ALERT_EMAIL}" ]]; then + echo "==> Creating email notification channel for: ${ALERT_EMAIL}" + EMAIL_CHANNEL=$(gcloud beta monitoring channels create \ + --display-name="Numbers Disk Alerts (Email)" \ + --type=email \ + --channel-labels="email_address=${ALERT_EMAIL}" \ + --project="${GCP_PROJECT}" \ + --format="value(name)" 2>/dev/null) || true + if [[ -n "${EMAIL_CHANNEL}" ]]; then + echo " Channel created: ${EMAIL_CHANNEL}" + NOTIFICATION_CHANNELS="${EMAIL_CHANNEL}" + fi +fi + +# --- Slack channel --- +if [[ -n "${SLACK_CHANNEL_NAME}" && -n "${SLACK_AUTH_TOKEN}" ]]; then + echo "==> Creating Slack notification channel: #${SLACK_CHANNEL_NAME}" + SLACK_CHANNEL=$(gcloud beta monitoring channels create \ + --display-name="Numbers Disk Alerts (Slack #${SLACK_CHANNEL_NAME})" \ + --type=slack \ + --channel-labels="channel_name=${SLACK_CHANNEL_NAME}" \ + --sensitive-labels="auth_token=${SLACK_AUTH_TOKEN}" \ + --project="${GCP_PROJECT}" \ + --format="value(name)" 2>/dev/null) || true + if [[ -n "${SLACK_CHANNEL}" ]]; then + echo " Channel created: ${SLACK_CHANNEL}" + NOTIFICATION_CHANNELS="${NOTIFICATION_CHANNELS:+${NOTIFICATION_CHANNELS},}${SLACK_CHANNEL}" + fi +fi + +# --------------------------------------------------------------------------- +# 2. 
HELPER: create an alerting policy via inline JSON
+# ---------------------------------------------------------------------------
+create_policy() {
+  local display_name="$1"
+  local threshold="$2"
+  local duration="$3"
+  local severity="$4"   # WARNING | CRITICAL
+
+  # Build notification-channel array
+  local nc_array="[]"
+  if [[ -n "${NOTIFICATION_CHANNELS}" ]]; then
+    # Convert comma-separated list to JSON array
+    nc_array=$(echo "${NOTIFICATION_CHANNELS}" | tr ',' '\n' \
+      | awk '{printf "\"%s\",", $0}' | sed 's/,$//' | awk '{print "[" $0 "]"}')
+  fi
+
+  # The GCP metric for disk utilisation on GCE instances
+  local metric="${DISK_METRIC}"
+
+  cat > /tmp/gcp_alert_policy.json <<EOF
+{
+  "displayName": "${display_name}",
+  "conditions": [
+    {
+      "displayName": "Disk used > ${threshold}",
+      "conditionThreshold": {
+        "filter": "resource.type=\"gce_instance\" AND metric.type=\"${metric}\" AND metric.labels.state=\"used\"",
+        "aggregations": [
+          {
+            "alignmentPeriod": "60s",
+            "crossSeriesReducer": "REDUCE_MEAN",
+            "perSeriesAligner": "ALIGN_MEAN",
+            "groupByFields": ["resource.labels.instance_id", "resource.labels.zone"]
+          }
+        ],
+        "comparison": "COMPARISON_GT",
+        "thresholdValue": ${threshold},
+        "duration": "${duration}",
+        "trigger": {
+          "count": 1
+        }
+      }
+    }
+  ],
+  "alertStrategy": {
+    "notificationRateLimit": {
+      "period": "3600s"
+    },
+    "autoClose": "604800s"
+  },
+  "combiner": "OR",
+  "enabled": true,
+  "notificationChannels": ${nc_array},
+  "severity": "${severity}"
+}
+EOF
+
+  gcloud alpha monitoring policies create \
+    --policy-from-file=/tmp/gcp_alert_policy.json \
+    --project="${GCP_PROJECT}" \
+    --format="value(name)"
+}
+
+# ---------------------------------------------------------------------------
+# 3.
CREATE WARNING POLICY (>80% for 5 minutes) +# --------------------------------------------------------------------------- +echo "==> Creating WARNING alerting policy (>${WARNING_RATIO} for ${DURATION_WARNING})" +WARNING_POLICY=$(create_policy \ + "${POLICY_PREFIX}-warning" \ + "${WARNING_RATIO}" \ + "${DURATION_WARNING}" \ + "WARNING") || true +echo " Policy created: ${WARNING_POLICY}" + +# --------------------------------------------------------------------------- +# 4. CREATE CRITICAL POLICY (>90% for 1 minute) +# --------------------------------------------------------------------------- +echo "==> Creating CRITICAL alerting policy (>${CRITICAL_RATIO} for ${DURATION_CRITICAL})" +CRITICAL_POLICY=$(create_policy \ + "${POLICY_PREFIX}-critical" \ + "${CRITICAL_RATIO}" \ + "${DURATION_CRITICAL}" \ + "CRITICAL") || true +echo " Policy created: ${CRITICAL_POLICY}" + +# --------------------------------------------------------------------------- +# 5. SUMMARY +# --------------------------------------------------------------------------- +echo "" +echo "==> Done. Summary:" +echo " Project : ${GCP_PROJECT}" +echo " Notification chans: ${NOTIFICATION_CHANNELS:-}" +echo " Warning policy : ${WARNING_POLICY:-}" +echo " Critical policy : ${CRITICAL_POLICY:-}" +echo "" +echo "Next steps:" +echo " 1. Install the Ops Agent on each GCE instance so that the" +echo " '${DISK_METRIC}' metric is reported:" +echo " https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation" +echo " 2. Verify policies in the GCP Console:" +echo " https://console.cloud.google.com/monitoring/alerting?project=${GCP_PROJECT}" +echo " 3. Deploy monitoring/disk-check.sh to each instance as a cron fallback."