Skip to content

Commit a3e7d8e

Browse files
authored
feat(infra): add HPMN backup workflow (#740)
* [infra] add HPMN Backup Workflow * [infra] harden HPMN backup and restore flow * [infra] add HPMN recovery runbook * [infra] capture Drive ABCI state in HPMN backups * [infra] fix HPMN restore volume ownership * update gitignore * [infra] fix HPMN ansible lint line lengths
1 parent b5d57e2 commit a3e7d8e

20 files changed

Lines changed: 1419 additions & 1 deletion

File tree

.github/workflows/hpmn-backup.yml

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
---
# Manually triggered workflow that backs up high-performance masternodes
# (HPMNs) to S3 via the hpmn_backup Ansible role.
#
# Hardening note: workflow inputs and secrets are forwarded to `run:` scripts
# through `env:` instead of being interpolated with `${{ }}` inside the
# script body, which prevents shell injection via crafted input values
# (see GitHub's "Security hardening for GitHub Actions" guidance).
name: Manual HP Masternode Backup

on:
  workflow_dispatch:
    inputs:
      network:
        description: "Network to back up"
        required: true
        type: string
        default: testnet
      backup_label:
        description: "Short label stamped into the S3 object path"
        required: true
        type: string
      host_limit:
        description: "Ansible limit expression (start with a single hp-masternode-N for validation)"
        required: true
        type: string
        default: hp-masternode-1
      batch_size:
        description: "How many HPMNs may run backup concurrently"
        required: true
        type: string
        default: "5"
      install_backup_tooling:
        description: "Install/update backup script prerequisites before running the backup"
        required: true
        type: boolean
        default: false

jobs:
  backup:
    name: Run HP masternode backup
    runs-on: ubuntu-22.04
    timeout-minutes: 90
    env:
      NETWORK_NAME: ${{ inputs.network }}
      HOST_LIMIT: ${{ inputs.host_limit }}
      # Serial batch size forwarded to the Ansible plays (kept in env so the
      # run scripts below never interpolate inputs directly).
      BATCH_SIZE: ${{ inputs.batch_size }}
      HPMN_BACKUP_TRIGGER_LABEL: ${{ inputs.backup_label }}
      HPMN_BACKUP_S3_BUCKET: ${{ vars.HPMN_BACKUP_S3_BUCKET }}
      HPMN_BACKUP_S3_PREFIX: ${{ vars.HPMN_BACKUP_S3_PREFIX }}
      HPMN_BACKUP_S3_SSE_MODE: ${{ vars.HPMN_BACKUP_S3_SSE_MODE }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
      AWS_REGION: ${{ secrets.AWS_REGION }}
      ANSIBLE_HOST_KEY_CHECKING: "false"
      ANSIBLE_CONFIG: ansible/ansible.cfg

    steps:
      - name: Checkout dash-network-deploy
        uses: actions/checkout@v4

      - name: Install controller dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y python3-pip python3-netaddr
          python3 -m pip install --upgrade pip
          python3 -m pip install ansible-core==2.16.3 jmespath

      - name: Install Ansible roles and collections
        run: |
          ansible-galaxy install -r ansible/requirements.yml
          mkdir -p ~/.ansible/roles
          cp -r ansible/roles/* ~/.ansible/roles/

      - name: Set up SSH keys
        env:
          DEPLOY_SERVER_KEY: ${{ secrets.DEPLOY_SERVER_KEY }}
          EVO_APP_DEPLOY_KEY: ${{ secrets.EVO_APP_DEPLOY_KEY }}
        run: |
          mkdir -p ~/.ssh

          # Host access key; derive the public half for tooling that needs it.
          printf '%s\n' "$DEPLOY_SERVER_KEY" > ~/.ssh/id_rsa
          chmod 600 ~/.ssh/id_rsa
          ssh-keygen -y -f ~/.ssh/id_rsa > ~/.ssh/id_rsa.pub
          chmod 644 ~/.ssh/id_rsa.pub

          # Deploy key used for git clones from github.com.
          printf '%s\n' "$EVO_APP_DEPLOY_KEY" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519

          cat > ~/.ssh/config << 'EOL'
          Host github.com
            IdentityFile ~/.ssh/id_ed25519
            StrictHostKeyChecking no

          Host *
            IdentityFile ~/.ssh/id_rsa
            User ubuntu
            StrictHostKeyChecking no
            UserKnownHostsFile=/dev/null
          EOL

          chmod 600 ~/.ssh/config

      - name: Clone network configs
        run: |
          rm -rf networks
          git clone git@github.com:dashpay/dash-network-configs.git networks

      - name: Validate network files and backup config
        env:
          # Only required when server-side encryption uses a customer KMS key;
          # kept in env so the secret is never interpolated into the script.
          KMS_KEY_ID: ${{ secrets.HPMN_BACKUP_S3_KMS_KEY_ID }}
        run: |
          test -f "networks/${NETWORK_NAME}.inventory"
          test -f "networks/${NETWORK_NAME}.yml"
          test -n "$HPMN_BACKUP_S3_BUCKET"
          test -n "$AWS_ACCESS_KEY_ID"
          test -n "$AWS_SECRET_ACCESS_KEY"
          test -n "$AWS_REGION"
          case "${HPMN_BACKUP_S3_SSE_MODE:-AES256}" in
            AES256)
              ;;
            aws:kms)
              test -n "$KMS_KEY_ID"
              ;;
            *)
              echo "Unsupported HPMN_BACKUP_S3_SSE_MODE: ${HPMN_BACKUP_S3_SSE_MODE}" >&2
              exit 1
              ;;
          esac

      - name: Install backup tooling on target HP masternodes
        if: ${{ inputs.install_backup_tooling }}
        run: |
          ansible-playbook \
            -i "networks/${NETWORK_NAME}.inventory" \
            ansible/hpmn_backup_install.yml \
            -e "@networks/${NETWORK_NAME}.yml" \
            -e "dash_network_name=${NETWORK_NAME}" \
            -e "hpmn_backup_install_serial=${BATCH_SIZE}" \
            --limit "${HOST_LIMIT}"

      - name: Trigger HP masternode backup
        env:
          KMS_KEY_ID: ${{ secrets.HPMN_BACKUP_S3_KMS_KEY_ID }}
        run: |
          # Export the KMS key id only when the SSE mode actually requires it,
          # so downstream tooling sees the same environment as before.
          if [[ "${HPMN_BACKUP_S3_SSE_MODE:-AES256}" == "aws:kms" ]]; then
            export HPMN_BACKUP_S3_KMS_KEY_ID="$KMS_KEY_ID"
          fi

          ansible-playbook \
            -i "networks/${NETWORK_NAME}.inventory" \
            ansible/hpmn_backup_run.yml \
            -e "@networks/${NETWORK_NAME}.yml" \
            -e "dash_network_name=${NETWORK_NAME}" \
            -e "hpmn_backup_serial=${BATCH_SIZE}" \
            -e "hpmn_backup_trigger_label=${HPMN_BACKUP_TRIGGER_LABEL}" \
            --limit "${HOST_LIMIT}"

.github/workflows/hpmn-restore.yml

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
---
# Manually triggered workflow that restores a single HPMN from an S3 backup
# archive via the hpmn_restore Ansible role, with an optional finalize pass
# that regenerates host-specific config and starts services.
#
# Hardening note: workflow inputs are forwarded to `run:` scripts through
# `env:` instead of being interpolated with `${{ }}` inside the script body,
# which prevents shell injection via crafted input values.
name: HP Masternode Restore

on:
  workflow_dispatch:
    inputs:
      network:
        description: "Network to restore"
        required: true
        type: string
        default: testnet
      target_host:
        description: "Single HP masternode host to restore"
        required: true
        type: string
        default: hp-masternode-1
      restore_s3_uri:
        description: "Full S3 URI of the backup archive to restore"
        required: true
        type: string
      install_restore_tooling:
        description: "Install/update restore prerequisites before running restore"
        required: true
        type: boolean
        default: false
      start_services:
        description: "Start dashmate services directly from the restore script"
        required: true
        type: boolean
        default: false
      finalize_restore:
        description: "Run the finalize playbook after restore to regenerate host-specific config and start services"
        required: true
        type: boolean
        default: true

jobs:
  restore:
    name: Restore HP masternode
    runs-on: ubuntu-22.04
    timeout-minutes: 120
    env:
      NETWORK_NAME: ${{ inputs.network }}
      TARGET_HOST: ${{ inputs.target_host }}
      HPMN_RESTORE_S3_URI: ${{ inputs.restore_s3_uri }}
      # Kept in env so the restore step never interpolates inputs directly.
      START_SERVICES: ${{ inputs.start_services }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
      AWS_REGION: ${{ secrets.AWS_REGION }}
      ANSIBLE_HOST_KEY_CHECKING: "false"
      ANSIBLE_CONFIG: ansible/ansible.cfg

    steps:
      - name: Checkout dash-network-deploy
        uses: actions/checkout@v4

      - name: Install controller dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y python3-pip python3-netaddr
          python3 -m pip install --upgrade pip
          python3 -m pip install ansible-core==2.16.3 jmespath

      - name: Install Ansible roles and collections
        run: |
          ansible-galaxy install -r ansible/requirements.yml
          mkdir -p ~/.ansible/roles
          cp -r ansible/roles/* ~/.ansible/roles/

      - name: Set up SSH keys
        env:
          DEPLOY_SERVER_KEY: ${{ secrets.DEPLOY_SERVER_KEY }}
          EVO_APP_DEPLOY_KEY: ${{ secrets.EVO_APP_DEPLOY_KEY }}
        run: |
          mkdir -p ~/.ssh

          # Host access key; derive the public half for tooling that needs it.
          printf '%s\n' "$DEPLOY_SERVER_KEY" > ~/.ssh/id_rsa
          chmod 600 ~/.ssh/id_rsa
          ssh-keygen -y -f ~/.ssh/id_rsa > ~/.ssh/id_rsa.pub
          chmod 644 ~/.ssh/id_rsa.pub

          # Deploy key used for git clones from github.com.
          printf '%s\n' "$EVO_APP_DEPLOY_KEY" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519

          cat > ~/.ssh/config << 'EOL'
          Host github.com
            IdentityFile ~/.ssh/id_ed25519
            StrictHostKeyChecking no

          Host *
            IdentityFile ~/.ssh/id_rsa
            User ubuntu
            StrictHostKeyChecking no
            UserKnownHostsFile=/dev/null
          EOL

          chmod 600 ~/.ssh/config

      - name: Clone network configs
        run: |
          rm -rf networks
          git clone git@github.com:dashpay/dash-network-configs.git networks

      - name: Validate restore config
        run: |
          test -f "networks/${NETWORK_NAME}.inventory"
          test -f "networks/${NETWORK_NAME}.yml"
          test -n "$HPMN_RESTORE_S3_URI"
          test -n "$AWS_REGION"

      - name: Install restore tooling on target host
        if: ${{ inputs.install_restore_tooling }}
        run: |
          ansible-playbook \
            -i "networks/${NETWORK_NAME}.inventory" \
            ansible/hpmn_restore_install.yml \
            -e "@networks/${NETWORK_NAME}.yml" \
            -e "dash_network_name=${NETWORK_NAME}" \
            --limit "${TARGET_HOST}"

      - name: Restore target host from S3 backup
        run: |
          ansible-playbook \
            -i "networks/${NETWORK_NAME}.inventory" \
            ansible/hpmn_restore_run.yml \
            -e "@networks/${NETWORK_NAME}.yml" \
            -e "dash_network_name=${NETWORK_NAME}" \
            -e "hpmn_restore_s3_uri=${HPMN_RESTORE_S3_URI}" \
            -e "hpmn_restore_start_services=${START_SERVICES}" \
            --limit "${TARGET_HOST}"

      - name: Finalize restored target host
        if: ${{ inputs.finalize_restore }}
        run: |
          ansible-playbook \
            -i "networks/${NETWORK_NAME}.inventory" \
            ansible/hpmn_restore_finalize.yml \
            -e "@networks/${NETWORK_NAME}.yml" \
            -e "dash_network_name=${NETWORK_NAME}" \
            -e "dash_network=${NETWORK_NAME}" \
            --limit "${TARGET_HOST}"

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,5 @@ networks
1919

2020
# Local docs
2121
INFRA.MD
22+
23+
.codex

ansible/deploy.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
- name: Update apt cache and install jq
5252
ansible.builtin.apt:
5353
pkg:
54+
- acl
5455
- jq
5556
- unzip
5657
update_cache: true

ansible/hpmn_backup_install.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
---
# Deploys the hpmn_backup role (backup scripts and their prerequisites) onto
# the HPMN hosts ahead of an actual backup run.
- name: Install HP masternode backup tooling
  hosts: hp_masternodes
  become: true
  gather_facts: true
  # Roll out to at most N hosts at a time; overridable from the CI workflow
  # via -e hpmn_backup_install_serial=N.
  serial: "{{ hpmn_backup_install_serial | default(5) }}"
  roles:
    - role: hpmn_backup
      tags:
        - hpmn_backup_install

ansible/hpmn_backup_run.yml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
---
# Executes the backup entry point (tasks_from: run) of the hpmn_backup role
# on demand across the HPMN hosts.
- name: Run HP masternode backup on demand
  hosts: hp_masternodes
  become: true
  # Facts are not needed for the run tasks; skipping keeps batches fast.
  gather_facts: false
  # Limit how many hosts back up concurrently; overridable from the CI
  # workflow via -e hpmn_backup_serial=N.
  serial: "{{ hpmn_backup_serial | default(5) }}"
  tasks:
    - name: Execute HP masternode backup role
      ansible.builtin.include_role:
        name: hpmn_backup
        tasks_from: run
      tags:
        - hpmn_backup_run

ansible/hpmn_restore_finalize.yml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
---
# Post-restore pass for a single HPMN: re-applies the dash_cli and dashmate
# roles so host-specific configuration is regenerated and services come up.
- name: Finalize restored HP masternode
  hosts: hp_masternodes
  become: true
  gather_facts: false
  # One host at a time; combined with the assert below this play is strictly
  # single-host.
  serial: 1
  pre_tasks:
    # Guard rail: refuse to run if --limit selected more than one host.
    - name: Enforce single-host finalize
      ansible.builtin.assert:
        that: ansible_play_hosts_all | length == 1
        fail_msg: Finalize must target exactly one HP masternode host.
      run_once: true

    # Expose this host's entry from the hp_masternodes config mapping (loaded
    # from the network vars file) under the `node` fact the roles expect.
    - name: Load HP masternode config for finalize
      ansible.builtin.set_fact:
        node: "{{ hp_masternodes[inventory_hostname] }}"
      when: inventory_hostname in hp_masternodes

  roles:
    - role: dash_cli
    - role: dashmate

  tags:
    - hpmn_restore_finalize

ansible/hpmn_restore_install.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
---
# Deploys the hpmn_restore role (restore scripts and their prerequisites)
# onto the target HPMN host ahead of an actual restore run.
- name: Install HP masternode restore tooling
  hosts: hp_masternodes
  become: true
  gather_facts: true
  # Restores are inherently single-host operations; install one at a time.
  serial: 1
  roles:
    - role: hpmn_restore
      tags:
        - hpmn_restore_install

ansible/hpmn_restore_run.yml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
---
# Executes the restore entry point (tasks_from: run) of the hpmn_restore role
# against exactly one HPMN host.
- name: Restore HP masternode from backup
  hosts: hp_masternodes
  become: true
  gather_facts: false
  # One host at a time; combined with the assert below this play is strictly
  # single-host.
  serial: 1
  pre_tasks:
    # Guard rail: refuse to run if --limit selected more than one host.
    - name: Enforce single-host restore
      ansible.builtin.assert:
        that: ansible_play_hosts_all | length == 1
        fail_msg: Restore must target exactly one HP masternode host.
      run_once: true
  tasks:
    - name: Execute HP masternode restore role
      ansible.builtin.include_role:
        name: hpmn_restore
        tasks_from: run
      tags:
        - hpmn_restore_run

0 commit comments

Comments
 (0)