From 4c99426f49276e9e86975f027e2e571dd532d74b Mon Sep 17 00:00:00 2001
From: Jonathan Gimeno
Date: Thu, 16 Oct 2025 12:32:55 +0200
Subject: [PATCH] include reth backup

---
 scripts/reth-backup/Dockerfile    |  48 ++++++
 scripts/reth-backup/README.md     | 288 ++++++++++++++++++++++++++++++
 scripts/reth-backup/backup-lib.sh | 165 +++++++++++++++++
 scripts/reth-backup/backup.sh     | 220 +++++++++++++++++++++++
 4 files changed, 721 insertions(+)
 create mode 100644 scripts/reth-backup/Dockerfile
 create mode 100644 scripts/reth-backup/README.md
 create mode 100644 scripts/reth-backup/backup-lib.sh
 create mode 100755 scripts/reth-backup/backup.sh

diff --git a/scripts/reth-backup/Dockerfile b/scripts/reth-backup/Dockerfile
new file mode 100644
index 00000000..e328bae1
--- /dev/null
+++ b/scripts/reth-backup/Dockerfile
@@ -0,0 +1,48 @@
+# ev-reth image extended with the libmdbx command-line tools (mdbx_copy,
+# mdbx_dump, mdbx_chk) and jq. backup.sh invokes mdbx_copy in the target
+# environment to snapshot the database.
+#
+# The base image is a build argument so that it can be pinned to a release tag
+# or digest with --build-arg; the default keeps the previous behaviour.
+ARG EV_RETH_IMAGE=ghcr.io/evstack/ev-reth:latest
+
+# Build stage: compile the MDBX tools. It uses the same base as the final stage
+# so the binaries are linked against the libraries they will run with.
+FROM ${EV_RETH_IMAGE} AS mdbx-build
+
+ARG LIBMDBX_REPO=https://github.com/erthink/libmdbx.git
+# NOTE(review): "master" is a moving target; pin a release tag for reproducible
+# builds (presumably one compatible with the MDBX version ev-reth uses - confirm).
+ARG LIBMDBX_REF=master
+
+RUN set -eux; \
+    apt-get update; \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        git \
+    ; \
+    rm -rf /var/lib/apt/lists/*
+
+RUN set -eux; \
+    git clone --depth 1 --branch "${LIBMDBX_REF}" "${LIBMDBX_REPO}" /tmp/libmdbx; \
+    cmake -S /tmp/libmdbx -B /tmp/libmdbx/build -DCMAKE_BUILD_TYPE=Release; \
+    cmake --build /tmp/libmdbx/build --target mdbx_copy mdbx_dump mdbx_chk; \
+    mkdir -p /out; \
+    install -m 0755 /tmp/libmdbx/build/mdbx_copy /out/mdbx_copy; \
+    install -m 0755 /tmp/libmdbx/build/mdbx_dump /out/mdbx_dump; \
+    install -m 0755 /tmp/libmdbx/build/mdbx_chk /out/mdbx_chk
+
+# Final stage: ev-reth plus the tools, without the compiler toolchain.
+FROM ${EV_RETH_IMAGE}
+
+RUN set -eux; \
+    apt-get update; \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        ca-certificates \
+        jq \
+    ; \
+    rm -rf /var/lib/apt/lists/*
+
+COPY --from=mdbx-build /out/ /usr/local/bin/
diff --git a/scripts/reth-backup/README.md b/scripts/reth-backup/README.md
new file mode 100644
index 00000000..6d374c5f
--- /dev/null
+++ b/scripts/reth-backup/README.md
@@ -0,0 +1,288 @@
+# Reth Backup Helper
+
+Script to snapshot the `ev-reth` MDBX database while the node keeps running and
+record the block height contained in the
snapshot.
+
+The script supports two execution modes:
+
+- **local**: Backup a reth instance running directly on the host machine
+- **docker**: Backup a reth instance running in a Docker container
+
+## Prerequisites
+
+### Common requirements
+
+- The `mdbx_copy` binary available in the target environment (see [libmdbx
+  documentation](https://libmdbx.dqdkfa.ru/)).
+- `jq` installed on the host to parse the JSON output.
+
+### Docker mode
+
+- Docker access to the container running `ev-reth` (defaults to the service name
+  `ev-reth` from `docker-compose`).
+
+### Local mode
+
+- Direct filesystem access to the reth datadir.
+- Sufficient permissions to read the database files.
+
+## Usage
+
+### Local mode
+
+When reth is running directly on your machine:
+
+```bash
+./scripts/reth-backup/backup.sh \
+  --mode local \
+  --datadir /var/lib/reth \
+  --mdbx-copy /usr/local/bin/mdbx_copy \
+  /path/to/backups
+```
+
+### Docker mode
+
+When reth is running in a Docker container:
+
+```bash
+./scripts/reth-backup/backup.sh \
+  --mode docker \
+  --container ev-reth \
+  --datadir /home/reth/eth-home \
+  --mdbx-copy /usr/local/bin/mdbx_copy \
+  /path/to/backups
+```
+
+### Output structure
+
+Both modes create a timestamped folder under `/path/to/backups` with:
+
+- `db/mdbx.dat` – consistent MDBX snapshot.
+- `db/mdbx.lck` – placeholder lock file (empty).
+- `static_files/` – static files copied from the node.
+- `stage_checkpoints.json` – raw StageCheckpoints table.
+- `height.txt` – extracted block height (from the `Finish` stage).
+
+Additional flags:
+
+- `--tag LABEL` to override the timestamped folder name.
+- `--keep-remote` to leave the temporary snapshot in the target environment
+  (useful for debugging).
+
+The script outputs the height at the end so you can coordinate other backups
+with the same block number.
+ +## Architecture + +The backup script is split into two components: + +- **`backup-lib.sh`**: Abstract execution layer providing a common interface for + different execution modes (local, docker). This library defines functions like + `exec_remote`, `copy_from_remote`, `copy_to_remote`, and `cleanup_remote` + that are implemented differently for each backend. +- **`backup.sh`**: Main script that uses the library and orchestrates the backup + workflow. It's mode-agnostic and works with any backend that implements the + required interface. + +This separation allows easy extension to support additional execution +environments (SSH, Kubernetes, etc.) without modifying the core backup logic. + +## End-to-end workflow with `apps/evm/single` (Docker mode) + +### Prerequisites + +1. Build the reth image with MDBX tooling: + + ```bash + docker build -t ghcr.io/evstack/ev-reth:latest scripts/reth-backup + ``` + +2. Build the ev-node image with backup/restore commands: + + ```bash + docker build -t ghcr.io/evstack/ev-node-evm-single:main -f apps/evm/single/Dockerfile . + ``` + +3. Start the stack: + + ```bash + cd apps/evm/single && docker compose up -d + ``` + +### Backup + +1. Backup reth (captures MDBX snapshot at current height): + + ```bash + ./scripts/reth-backup/backup.sh --mode docker backups/full-run/reth + ``` + + Note the printed TAG (e.g., `20251013-104816`) and height. + +2. Backup ev-node (captures complete Badger datastore): + + ```bash + TAG= # from previous step + HEIGHT=$(cat backups/full-run/reth/${TAG}/height.txt) + + mkdir -p backups/full-run/ev-node + + docker exec evolveevm-ev-node-evm-single-1 \ + evm-single backup \ + --output /tmp/backup-${TAG}.badger \ + --force + + docker cp evolveevm-ev-node-evm-single-1:/tmp/backup-${TAG}.badger \ + backups/full-run/ev-node/ + + echo ${HEIGHT} > backups/full-run/ev-node/target-height.txt + ``` + +### Restore + +1. 
Stop services and recreate containers:
+
+   ```bash
+   cd apps/evm/single
+   docker compose down
+   docker compose up --no-start
+   ```
+
+2. Restore reth volume:
+
+   ```bash
+   TAG=20251013-104816  # replace with the tag printed by backup.sh
+
+   # From apps/evm/single, the backups directory is three levels up (repository root)
+   docker run --rm \
+     --volumes-from ev-reth \
+     -v "$PWD/../../../backups/full-run/reth/${TAG}:/backup:ro" \
+     alpine:3.18 \
+     sh -c 'rm -rf /home/reth/eth-home/db /home/reth/eth-home/static_files && \
+            mkdir -p /home/reth/eth-home/db /home/reth/eth-home/static_files && \
+            cp /backup/db/mdbx.dat /home/reth/eth-home/db/ && \
+            cp /backup/db/mdbx.lck /home/reth/eth-home/db/ && \
+            if [ -d /backup/static_files ]; then cp -a /backup/static_files/. /home/reth/eth-home/static_files/; fi'
+   ```
+
+3. Restore ev-node volume:
+
+   ```bash
+   TAG=20251013-104816  # replace with the tag printed by backup.sh
+
+   # From apps/evm/single, the backups directory is three levels up (repository root)
+   docker run --rm \
+     --volumes-from evolveevm-ev-node-evm-single-1 \
+     -v "$PWD/../../../backups/full-run/ev-node:/backup:ro" \
+     ghcr.io/evstack/ev-node-evm-single:main \
+     restore \
+     --input /backup/backup-${TAG}.badger \
+     --home /root/.evm-single \
+     --app-name evm-single \
+     --force
+   ```
+
+4. Align ev-node to reth height using rollback (before starting):
+
+   ```bash
+   HEIGHT=$(cat ../../../backups/full-run/ev-node/target-height.txt)
+
+   docker run --rm \
+     --volumes-from evolveevm-ev-node-evm-single-1 \
+     ghcr.io/evstack/ev-node-evm-single:main \
+     rollback \
+     --home /root/.evm-single \
+     --height ${HEIGHT} \
+     --sync-node
+   ```
+
+   > **Note:** The rollback may report errors for p2p header/data stores with invalid
+   > ranges. This is expected and can be ignored. The main state will be correctly
+   > rolled back to the target height. The `--sync-node` flag is required for
+   > non-aggregator mode rollback.
+
+5. Start reth and local-da services:
+
+   ```bash
+   docker compose start ev-reth local-da
+   ```
+
+6.
Start ev-node with cache cleared (first time only):
+
+   ```bash
+   # Remove the stopped container and start with --evnode.clear_cache
+   docker rm evolveevm-ev-node-evm-single-1
+
+   docker run -d \
+     --name evolveevm-ev-node-evm-single-1 \
+     --network evolveevm_evolve-network \
+     -p 7676:7676 -p 7331:7331 \
+     -v evolveevm_evm-single-data:/root/.evm-single/ \
+     -e EVM_ENGINE_URL=http://ev-reth:8551 \
+     -e EVM_ETH_URL=http://ev-reth:8545 \
+     -e EVM_JWT_SECRET=f747494bb0fb338a0d71f5f9fe5b5034c17cc988c229b59fd71e005ee692e9bf \
+     -e EVM_GENESIS_HASH=0x2b8bbb1ea1e04f9c9809b4b278a8687806edc061a356c7dbc491930d8e922503 \
+     -e EVM_BLOCK_TIME=1s \
+     -e EVM_SIGNER_PASSPHRASE=secret \
+     -e DA_ADDRESS=http://local-da:7980 \
+     ghcr.io/evstack/ev-node-evm-single:main \
+     start --evnode.clear_cache
+   ```
+
+   > **Important:** Use `--evnode.clear_cache` on first start after restore to clear
+   > any cached p2p data that may be inconsistent after rollback. On subsequent restarts,
+   > you can use `docker compose up -d` normally.
+
+7. Verify both nodes are at the same height:
+
+   ```bash
+   HEIGHT=$(cat ../../../backups/full-run/ev-node/target-height.txt)
+   echo "Expected restored height: ${HEIGHT}"
+
+   # Check ev-node is producing blocks from the restored height
+   docker logs evolveevm-ev-node-evm-single-1 2>&1 | grep "produced block" | head -10
+
+   # Check reth current height
+   docker exec ev-reth curl -s -X POST -H "Content-Type: application/json" \
+     --data '{"jsonrpc":"2.0","method":"eth_blockNumber","params":[],"id":1}' \
+     http://localhost:8545 | jq -r '.result' | xargs printf "%d\n"
+   ```
+
+## Known Limitations
+
+### Rollback P2P Store Errors
+
+When rolling back to a height significantly lower than the current state, the p2p
+header and data sync stores may report "invalid range" errors. This occurs because
+these stores track sync progress independently. The errors can be safely ignored as:
+
+1. The main blockchain state is correctly rolled back
+2.
Using `--evnode.clear_cache` on restart clears the inconsistent cache
+3. The node will resync p2p data from the restored height
+
+### Timestamp Consistency
+
+After a restore, if significant real-world time has passed since the backup was created,
+you may encounter timestamp validation errors when the node attempts to continue block
+production. This occurs because:
+
+- Reth stores block timestamps based on when blocks were originally created
+- After restore, the restored timestamps may be in the past relative to system time
+- Block validators may reject new blocks with timestamps earlier than parent blocks
+
+**Workaround:** In production environments, coordinate restore operations to minimize
+time between backup and restore, or ensure the entire network is restored simultaneously.
+
+## Summary
+
+This backup/restore workflow enables point-in-time recovery for both reth (MDBX) and
+ev-node (Badger) datastores. Key points:
+
+- **Backup**: Hot backup while nodes are running (no downtime)
+- **Restore**: Requires stopping services, restoring volumes, and aligning heights
+- **Rollback**: May show p2p store errors that can be safely ignored
+- **Production**: Test the full workflow in staging before deploying to production
+
+The process has been validated to correctly restore state and resume block production
+from the backup point, with known limitations around p2p store consistency and timestamp
+validation that can be mitigated with proper operational procedures.
diff --git a/scripts/reth-backup/backup-lib.sh b/scripts/reth-backup/backup-lib.sh
new file mode 100644
index 00000000..c445f145
--- /dev/null
+++ b/scripts/reth-backup/backup-lib.sh
@@ -0,0 +1,202 @@
+#!/usr/bin/env bash
+
+# backup-lib.sh - Abstract execution layer for reth backup operations
+# Provides a common interface for local and Docker-based executions.
+
+# Backend interface that must be implemented:
+#   - exec_remote <command>          Execute a command string in the target environment
+#   - copy_from_remote <src> <dst>   Copy a file/directory from target to local
+#   - copy_to_remote <src> <dst>     Copy a file/directory from local to target
+#   - cleanup_remote <path>          Remove a path in the target environment
+#
+# init_backend stores the name of the selected implementation in a variable of
+# the same name, so callers invoke it as: $exec_remote "command string"
+
+# Quote one argument for embedding in a command string that bash will parse,
+# so paths containing spaces or quote characters reach the command intact.
+quote_arg() {
+  printf '%q' "$1"
+}
+
+# ============================================================================
+# LOCAL BACKEND
+# ============================================================================
+
+# Run a command string with bash on the local machine.
+local_exec_remote() {
+  bash -c "$1"
+}
+
+# Copy SRC to DST on the local filesystem, preserving attributes.
+local_copy_from_remote() {
+  local src="$1"
+  local dst="$2"
+  cp -a "$src" "$dst"
+}
+
+# Same operation as local_copy_from_remote: both ends are the local filesystem.
+local_copy_to_remote() {
+  local src="$1"
+  local dst="$2"
+  cp -a "$src" "$dst"
+}
+
+# Recursively remove a path. Refuses an empty path or "/" as a safety net.
+local_cleanup_remote() {
+  local path="$1"
+  if [[ -z "$path" || "$path" == "/" ]]; then
+    echo "error: refusing to remove '$path'" >&2
+    return 1
+  fi
+  rm -rf -- "$path"
+}
+
+# The local backend has no external requirements.
+local_check_available() {
+  return 0
+}
+
+# ============================================================================
+# DOCKER BACKEND
+# ============================================================================
+# The docker_* functions act on the container named in $BACKEND_CONTAINER.
+
+# Run a command string with a bash login shell inside the container.
+docker_exec_remote() {
+  local container="$BACKEND_CONTAINER"
+  docker exec "$container" bash -lc "$1"
+}
+
+# Copy SRC inside the container to DST on the host.
+docker_copy_from_remote() {
+  local container="$BACKEND_CONTAINER"
+  local src="$1"
+  local dst="$2"
+  docker cp "${container}:${src}" "$dst"
+}
+
+# Copy SRC on the host to DST inside the container.
+docker_copy_to_remote() {
+  local container="$BACKEND_CONTAINER"
+  local src="$1"
+  local dst="$2"
+  docker cp "$src" "${container}:${dst}"
+}
+
+# Recursively remove a path inside the container. Refuses an empty path or "/".
+docker_cleanup_remote() {
+  local container="$BACKEND_CONTAINER"
+  local path="$1"
+  if [[ -z "$path" || "$path" == "/" ]]; then
+    echo "error: refusing to remove '$path'" >&2
+    return 1
+  fi
+  docker exec "$container" rm -rf -- "$path"
+}
+
+# Check that the docker CLI exists and that $BACKEND_CONTAINER is running.
+docker_check_available() {
+  if ! command -v docker >/dev/null 2>&1; then
+    echo "error: docker command not found" >&2
+    return 1
+  fi
+
+  local container="${BACKEND_CONTAINER:-}"
+  if [[ -z "$container" ]]; then
+    echo "error: container name is required for docker mode" >&2
+    return 1
+  fi
+
+  # Ask docker about this one container rather than grepping `docker ps`:
+  # the name is then never interpreted as a regular expression, and no
+  # pipeline is involved that could fail spuriously under `pipefail`.
+  local running
+  running="$(docker container inspect --format '{{.State.Running}}' "$container" 2>/dev/null)" || running=""
+  if [[ "$running" != "true" ]]; then
+    echo "error: container '$container' is not running" >&2
+    return 1
+  fi
+
+  return 0
+}
+
+# ============================================================================
+# BACKEND INITIALIZATION
+# ============================================================================
+
+# Set the backend mode and initialize function pointers.
+# Returns 1 (after printing an error) for an unknown mode.
+init_backend() {
+  local mode="$1"
+
+  case "$mode" in
+    local)
+      exec_remote=local_exec_remote
+      copy_from_remote=local_copy_from_remote
+      copy_to_remote=local_copy_to_remote
+      cleanup_remote=local_cleanup_remote
+      check_backend_available=local_check_available
+      ;;
+    docker)
+      exec_remote=docker_exec_remote
+      copy_from_remote=docker_copy_from_remote
+      copy_to_remote=docker_copy_to_remote
+      cleanup_remote=docker_cleanup_remote
+      check_backend_available=docker_check_available
+      ;;
+    *)
+      echo "error: unknown backend mode '$mode'" >&2
+      echo "supported modes: local, docker" >&2
+      return 1
+      ;;
+  esac
+
+  BACKEND_MODE="$mode"
+  return 0
+}
+
+# ============================================================================
+# HIGH-LEVEL BACKUP OPERATIONS
+# ============================================================================
+
+# Verify that a command is available in the target environment
+# (either resolvable on its PATH or an executable file path).
+verify_remote_command() {
+  local cmd="$1"
+  local quoted
+  quoted="$(quote_arg "$cmd")"
+  if ! $exec_remote "command -v $quoted >/dev/null 2>&1 || [ -x $quoted ]"; then
+    echo "error: command '$cmd' not found in target environment" >&2
+    return 1
+  fi
+  return 0
+}
+
+# Create a directory in the target environment
+create_remote_dir() {
+  local path="$1"
+  $exec_remote "mkdir -p $(quote_arg "$path")"
+}
+
+# Check if a path exists in the target environment
+remote_path_exists() {
+  local path="$1"
+  $exec_remote "test -e $(quote_arg "$path")"
+}
+
+# Run mdbx_copy in the target environment (-c asks it to compact the copy)
+run_mdbx_copy() {
+  local mdbx_copy="$1"
+  local source_db="$2"
+  local dest_file="$3"
+
+  echo "Running mdbx_copy..."
+  $exec_remote "$(quote_arg "$mdbx_copy") -c $(quote_arg "$source_db") $(quote_arg "$dest_file")"
+}
+
+# Query ev-reth for stage checkpoints. Prints the command output from the first
+# line that starts with "[" onwards, dropping anything printed before the JSON.
+query_stage_checkpoints() {
+  local datadir="$1"
+  $exec_remote "ev-reth db --datadir $(quote_arg "$datadir") list StageCheckpoints --len 20 --json" | sed -n '/^\[/,$p'
+}
diff --git a/scripts/reth-backup/backup.sh b/scripts/reth-backup/backup.sh
new file mode 100755
index 00000000..c254fecb
--- /dev/null
+++ b/scripts/reth-backup/backup.sh
@@ -0,0 +1,257 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# Load the backend library
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/backup-lib.sh"
+
+# Print the command-line help to stdout.
+usage() {
+  cat <<'EOF'
+Usage: backup.sh [OPTIONS] <destination-directory>
+
+Create a consistent backup of the ev-reth database using mdbx_copy and record
+the block height captured in the snapshot.
+
+Options:
+  --mode MODE           Execution mode: 'local' or 'docker' (default: docker)
+  --container NAME      Docker container name running ev-reth (default: ev-reth)
+                        Only used in docker mode.
+  --datadir PATH        Path to the reth datadir in the target environment
+                        (default docker: /home/reth/eth-home)
+                        (default local: /var/lib/reth)
+  --mdbx-copy CMD       Path to the mdbx_copy binary in the target environment
+                        (default: mdbx_copy; override if you compiled it elsewhere)
+  --tag LABEL           Custom label for the backup directory (default: timestamp)
+  --keep-remote         Leave the temporary snapshot in the target environment
+  -h, --help            Show this help message
+
+Modes:
+  local                 Run backup on the local machine (reth running locally)
+  docker                Run backup on a Docker container (default)
+
+Requirements:
+  - mdbx_copy available in the target environment (compile it once if necessary).
+  - jq installed on the host (used to parse StageCheckpoints JSON).
+  - For docker mode: Docker access to the container running ev-reth.
+  - For local mode: Direct filesystem access to reth datadir.
+
+The destination directory will receive:
+  <dest>/<tag>/db/mdbx.dat            MDBX snapshot
+  <dest>/<tag>/db/mdbx.lck            Empty lock file placeholder
+  <dest>/<tag>/static_files/...       Static files copied from the node
+  <dest>/<tag>/stage_checkpoints.json
+  <dest>/<tag>/height.txt             Height extracted from StageCheckpoints
+
+Examples:
+  # Backup from local reth instance
+  ./backup.sh --mode local --datadir /var/lib/reth /path/to/backups
+
+  # Backup from Docker container
+  ./backup.sh --mode docker --container ev-reth /path/to/backups
+EOF
+}
+
+# Exit with an error unless command $1 is on the host PATH.
+require_cmd() {
+  if ! command -v "$1" >/dev/null 2>&1; then
+    echo "error: required command '$1' not found in PATH" >&2
+    exit 1
+  fi
+}
+
+# Exit with a usage error unless option $1 is followed by a value; $2 is the
+# number of arguments left, including the option itself. Without this check,
+# `set -u` aborts on "$2" with an opaque "unbound variable" message.
+require_value() {
+  if [[ "$2" -lt 2 ]]; then
+    echo "error: option '$1' requires a value" >&2
+    usage >&2
+    exit 1
+  fi
+}
+
+# Record the destination directory; a second positional argument is an error.
+set_dest() {
+  if [[ -n "$DEST" ]]; then
+    echo "unexpected argument: $1" >&2
+    usage >&2
+    exit 1
+  fi
+  DEST="$1"
+}
+
+DEST=""
+MODE="docker"
+CONTAINER="ev-reth"
+DATADIR=""
+MDBX_COPY="mdbx_copy"
+BACKUP_TAG=""
+KEEP_REMOTE=0
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --mode|--container|--datadir|--mdbx-copy|--tag)
+      require_value "$1" "$#"
+      case "$1" in
+        --mode) MODE="$2" ;;
+        --container) CONTAINER="$2" ;;
+        --datadir) DATADIR="$2" ;;
+        --mdbx-copy) MDBX_COPY="$2" ;;
+        --tag) BACKUP_TAG="$2" ;;
+      esac
+      shift 2
+      ;;
+    --keep-remote)
+      KEEP_REMOTE=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    --)
+      # Everything after "--" is positional, even if it starts with a dash.
+      shift
+      while [[ $# -gt 0 ]]; do
+        set_dest "$1"
+        shift
+      done
+      ;;
+    -*)
+      echo "unknown option: $1" >&2
+      usage >&2
+      exit 1
+      ;;
+    *)
+      set_dest "$1"
+      shift
+      ;;
+  esac
+done
+
+if [[ -z "$DEST" ]]; then
+  echo "error: destination directory is required" >&2
+  usage >&2
+  exit 1
+fi
+
+# Validate and set defaults based on mode
+case "$MODE" in
+  local)
+    if [[ -z "$DATADIR" ]]; then
+      DATADIR="/var/lib/reth"
+    fi
+    ;;
+  docker)
+    if [[ -z "$DATADIR" ]]; then
+      DATADIR="/home/reth/eth-home"
+    fi
+    ;;
+  *)
+    echo "error: invalid mode '$MODE'. Use 'local' or 'docker'." >&2
+    exit 1
+    ;;
+esac
+
+# Initialize the backend
+if ! init_backend "$MODE"; then
+  exit 1
+fi
+
+# Set container for docker mode
+if [[ "$MODE" == "docker" ]]; then
+  BACKEND_CONTAINER="$CONTAINER"
+fi
+
+# Check backend availability
+if ! $check_backend_available; then
+  exit 1
+fi
+
+require_cmd jq
+
+if [[ -z "$BACKUP_TAG" ]]; then
+  BACKUP_TAG="$(date +'%Y%m%d-%H%M%S')"
+fi
+
+REMOTE_TMP="/tmp/reth-backup-${BACKUP_TAG}"
+HOST_DEST="$(mkdir -p "$DEST" && cd "$DEST" && pwd)/${BACKUP_TAG}"
+
+# Shell-quoted forms for embedding in remote command strings.
+Q_REMOTE_TMP="$(quote_arg "$REMOTE_TMP")"
+Q_DATADIR="$(quote_arg "$DATADIR")"
+
+echo "Mode: $MODE"
+echo "Creating backup tag '$BACKUP_TAG' into ${HOST_DEST}"
+
+# Remove the remote workspace unless --keep-remote was given. A failure here is
+# reported but does not change the outcome of the backup.
+remove_remote_tmp() {
+  if [[ "$KEEP_REMOTE" -ne 1 ]]; then
+    echo "Cleaning up temporary files..."
+    $cleanup_remote "$REMOTE_TMP" || echo "warning: could not remove $REMOTE_TMP in target environment" >&2
+  fi
+}
+
+# Clean up on every exit path so an aborted run (set -e) does not leave the
+# snapshot behind in the target environment.
+trap remove_remote_tmp EXIT
+
+# Prepare temporary workspace in target environment
+echo "Preparing temporary workspace..."
+$exec_remote "rm -rf $Q_REMOTE_TMP && mkdir -p $Q_REMOTE_TMP/db $Q_REMOTE_TMP/static_files"
+
+# Verify mdbx_copy availability
+if ! verify_remote_command "$MDBX_COPY"; then
+  exit 1
+fi
+
+echo "Running mdbx_copy in target environment..."
+run_mdbx_copy "$MDBX_COPY" "${DATADIR}/db" "$REMOTE_TMP/db/mdbx.dat"
+$exec_remote "touch $Q_REMOTE_TMP/db/mdbx.lck"
+
+# static_files are copied best-effort, but a failure is reported instead of
+# being silently discarded.
+echo "Copying static_files..."
+if ! $exec_remote "if [ -d $Q_DATADIR/static_files ]; then cp -a $Q_DATADIR/static_files/. $Q_REMOTE_TMP/static_files/; fi"; then
+  echo "warning: static_files could not be copied completely; the backup may be incomplete" >&2
+fi
+
+# The height is informational: if the query or the JSON parsing fails, keep the
+# snapshot that was already taken and report the missing height below.
+echo "Querying StageCheckpoints height..."
+STAGE_JSON="$(query_stage_checkpoints "$REMOTE_TMP")" || STAGE_JSON=""
+HEIGHT="$(echo "$STAGE_JSON" | jq -r '.[] | select(.[0]=="Finish") | .[1].block_number' | tr -d '\r\n')" || HEIGHT=""
+
+if [[ -z "$HEIGHT" || "$HEIGHT" == "null" ]]; then
+  echo "warning: could not determine height from StageCheckpoints" >&2
+fi
+
+echo "Copying snapshot to host..."
+mkdir -p "$HOST_DEST/db"
+$copy_from_remote "${REMOTE_TMP}/db/mdbx.dat" "$HOST_DEST/db/mdbx.dat"
+$copy_from_remote "${REMOTE_TMP}/db/mdbx.lck" "$HOST_DEST/db/mdbx.lck"
+
+if remote_path_exists "${REMOTE_TMP}/static_files"; then
+  mkdir -p "$HOST_DEST/static_files"
+  if ! $copy_from_remote "${REMOTE_TMP}/static_files/." "$HOST_DEST/static_files/"; then
+    echo "warning: static_files could not be copied to the host; the backup may be incomplete" >&2
+  fi
+fi
+
+echo "$STAGE_JSON" > "$HOST_DEST/stage_checkpoints.json"
+if [[ -n "$HEIGHT" && "$HEIGHT" != "null" ]]; then
+  echo "$HEIGHT" > "$HOST_DEST/height.txt"
+  echo "Backup height: $HEIGHT"
+else
+  echo "Height not captured (see stage_checkpoints.json for details)"
+fi
+
+# Success path: clean up now so the messages keep their order, then disarm the
+# trap so the cleanup does not run a second time.
+remove_remote_tmp
+trap - EXIT
+
+echo "Backup completed: $HOST_DEST"