Skip to content

Commit 4d74dee

Browse files
committed
Merge branch 'claude/rds-aurora-component-012oeo6D7EyNwxoupNfefuCS' into 'master'
RDS/Aurora logical refresh component See merge request postgres-ai/database-lab!1070
2 parents c44e064 + 0a8fb73 commit 4d74dee

19 files changed

Lines changed: 4353 additions & 3 deletions

engine/.gitlab-ci.yml

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,17 @@ build-image-feature-ci-checker:
204204
DOCKER_NAME: "${CI_REGISTRY}/${CI_PROJECT_NAMESPACE}/${CI_PROJECT_NAME}/dblab-ci-checker"
205205
TAGS: "${DOCKER_NAME}:${CI_COMMIT_REF_SLUG}"
206206

207+
build-image-feature-rds-refresh:
208+
<<: *build_image_definition
209+
<<: *only_feature
210+
variables:
211+
REGISTRY_USER: "${CI_REGISTRY_USER}"
212+
REGISTRY_PASSWORD: "${CI_REGISTRY_PASSWORD}"
213+
REGISTRY: "${CI_REGISTRY}"
214+
DOCKER_FILE: "Dockerfile.rds-refresh"
215+
DOCKER_NAME: "${CI_REGISTRY}/${CI_PROJECT_NAMESPACE}/${CI_PROJECT_NAME}/dblab-rds-refresh"
216+
TAGS: "${DOCKER_NAME}:${CI_COMMIT_REF_SLUG}"
217+
207218
build-image-feature-client:
208219
<<: *build_image_definition
209220
<<: *only_feature
@@ -239,6 +250,14 @@ build-image-master-ci-checker:
239250
DOCKER_NAME: "registry.gitlab.com/postgres-ai/database-lab/dblab-ci-checker"
240251
TAGS: "${DOCKER_NAME}:master,${DOCKER_NAME}:master-${CI_COMMIT_SHORT_SHA}"
241252

253+
build-image-master-rds-refresh:
254+
<<: *build_image_definition
255+
<<: *only_master
256+
variables:
257+
DOCKER_FILE: "Dockerfile.rds-refresh"
258+
DOCKER_NAME: "registry.gitlab.com/postgres-ai/database-lab/dblab-rds-refresh"
259+
TAGS: "${DOCKER_NAME}:master,${DOCKER_NAME}:master-${CI_COMMIT_SHORT_SHA}"
260+
242261
build-image-master-client:
243262
<<: *build_image_definition
244263
<<: *only_master
@@ -314,6 +333,33 @@ build-image-latest-ci-checker-dev:
314333
- export CLEAN_TAG=$(echo ${CI_COMMIT_TAG#"v"})
315334
- export TAGS="${DOCKER_NAME}:${CLEAN_TAG}"
316335

336+
build-image-latest-rds-refresh:
337+
<<: *build_image_definition
338+
<<: *only_tag_release
339+
variables:
340+
REGISTRY_USER: "${DH_CI_REGISTRY_USER}"
341+
REGISTRY_PASSWORD: "${DH_CI_REGISTRY_PASSWORD}"
342+
REGISTRY: "${DH_CI_REGISTRY}"
343+
DOCKER_FILE: "Dockerfile.rds-refresh"
344+
DOCKER_NAME: "postgresai/dblab-rds-refresh"
345+
before_script:
346+
- export CLEAN_TAG=$(echo ${CI_COMMIT_TAG#"v"})
347+
- export LATEST_TAG=$(echo ${CLEAN_TAG%.*}-latest)
348+
- export TAGS="${DOCKER_NAME}:${LATEST_TAG},${DOCKER_NAME}:${CLEAN_TAG}"
349+
350+
build-image-latest-rds-refresh-dev:
351+
<<: *build_image_definition
352+
<<: *only_tag_release
353+
variables:
354+
REGISTRY_USER: "${CI_REGISTRY_USER}"
355+
REGISTRY_PASSWORD: "${CI_REGISTRY_PASSWORD}"
356+
REGISTRY: "${CI_REGISTRY}"
357+
DOCKER_FILE: "Dockerfile.rds-refresh"
358+
DOCKER_NAME: "registry.gitlab.com/postgres-ai/database-lab/dblab-rds-refresh"
359+
before_script:
360+
- export CLEAN_TAG=$(echo ${CI_COMMIT_TAG#"v"})
361+
- export TAGS="${DOCKER_NAME}:${CLEAN_TAG}"
362+
317363
build-image-latest-client:
318364
<<: *build_image_definition
319365
<<: *only_tag_release
@@ -401,6 +447,33 @@ build-image-rc-ci-checker-dev:
401447
REGISTRY: "${CI_REGISTRY}"
402448
DOCKER_FILE: "Dockerfile.ci-checker"
403449
DOCKER_NAME: "registry.gitlab.com/postgres-ai/database-lab/dblab-ci-checker"
450+
451+
build-image-rc-rds-refresh:
452+
<<: *build_image_definition
453+
<<: *only_tag_rc
454+
before_script:
455+
- export CLEAN_TAG=$(echo ${CI_COMMIT_TAG#"v"})
456+
- export TAGS="${DOCKER_NAME}:${CLEAN_TAG}"
457+
variables:
458+
REGISTRY_USER: "${DH_CI_REGISTRY_USER}"
459+
REGISTRY_PASSWORD: "${DH_CI_REGISTRY_PASSWORD}"
460+
REGISTRY: "${DH_CI_REGISTRY}"
461+
DOCKER_FILE: "Dockerfile.rds-refresh"
462+
DOCKER_NAME: "postgresai/dblab-rds-refresh"
463+
464+
build-image-rc-rds-refresh-dev:
465+
<<: *build_image_definition
466+
<<: *only_tag_rc
467+
before_script:
468+
- export CLEAN_TAG=$(echo ${CI_COMMIT_TAG#"v"})
469+
- export TAGS="${DOCKER_NAME}:${CLEAN_TAG}"
470+
variables:
471+
REGISTRY_USER: "${CI_REGISTRY_USER}"
472+
REGISTRY_PASSWORD: "${CI_REGISTRY_PASSWORD}"
473+
REGISTRY: "${CI_REGISTRY}"
474+
DOCKER_FILE: "Dockerfile.rds-refresh"
475+
DOCKER_NAME: "registry.gitlab.com/postgres-ai/database-lab/dblab-rds-refresh"
476+
404477
build-image-rc-client:
405478
<<: *build_image_definition
406479
<<: *only_tag_rc

engine/Dockerfile.rds-refresh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
FROM alpine:3.23
2+
3+
RUN apk add --no-cache ca-certificates tzdata
4+
5+
# create non-root user
6+
RUN adduser -D -u 1000 appuser
7+
8+
WORKDIR /app
9+
10+
COPY ./bin/rds-refresh /usr/local/bin/rds-refresh
11+
12+
USER appuser
13+
14+
ENTRYPOINT ["/usr/local/bin/rds-refresh"]
15+
CMD ["--help"]

engine/Makefile

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
SERVER_BINARY = dblab-server
44
RUN_CI_BINARY = run-ci
55
CLI_BINARY = dblab
6+
RDS_REFRESH_BINARY = rds-refresh
67
GOARCH = amd64
78

89
PWD= $(shell pwd)
@@ -41,10 +42,11 @@ run-lint: ## Run linters
4142

4243
lint: install-lint run-lint ## Install and run linters
4344

44-
build: ## Build binary files of all Database Lab components (Engine, CI Checker, CLI)
45+
build: ## Build binary files of all Database Lab components (Engine, CI Checker, CLI, RDS Refresh)
4546
${GOBUILD} -o bin/${SERVER_BINARY} ./cmd/database-lab/main.go
4647
${GOBUILD} -o bin/${RUN_CI_BINARY} ./cmd/runci/main.go
4748
${GOBUILD} -o bin/${CLI_BINARY} ./cmd/cli/main.go
49+
${GOBUILD} -o bin/${RDS_REFRESH_BINARY} ./cmd/rds-refresh/main.go
4850

4951
build-debug: ## Build the Database Lab Server binary for debugging
5052
${GOBUILD} -ldflags "-X gitlab.com/postgres-ai/database-lab/v3/version.version=${VERSION} \
@@ -54,6 +56,9 @@ build-debug: ## Build the Database Lab Server binary for debugging
5456
build-ci-checker: ## Build the Database Lab CI Checker binary
5557
${GOBUILD} -o bin/${RUN_CI_BINARY} ./cmd/runci/main.go
5658

59+
build-rds-refresh: ## Build the RDS Refresh binary
60+
${GOBUILD} -o bin/${RDS_REFRESH_BINARY} ./cmd/rds-refresh/main.go
61+
5762
build-client: ## Build Database Lab CLI binaries for all supported operating systems and platforms
5863
$(foreach GOOS, $(CLIENT_PLATFORMS),\
5964
$(foreach GOARCH, $(ARCHITECTURES), \
@@ -95,4 +100,4 @@ run-dle: build-dle
95100
-p "2345:2345" \
96101
dblab_server:local
97102

98-
.PHONY: help all build test run-lint install-lint lint fmt clean build-image build-dle build-ci-checker build-client build-ci-checker run-dle
103+
.PHONY: help all build test run-lint install-lint lint fmt clean build-image build-dle build-ci-checker build-client build-rds-refresh run-dle

engine/cmd/rds-refresh/README.md

Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
# RDS/Aurora Refresh for DBLab
2+
3+
Perform full refresh from RDS/Aurora snapshots (logical mode).
4+
5+
## Why?
6+
7+
DBLab logical mode runs `pg_dump` against your database. On large databases, this:
8+
- **Holds xmin horizon for hours** → bloat accumulation
9+
- **Creates load on production**
10+
- **Requires direct network access** to production
11+
12+
This tool dumps from a **temporary RDS clone** instead. Production is never touched.
13+
14+
```
15+
Production ──RDS snapshot──► RDS Snapshot ──restore──► RDS Clone ──pg_dump──► DBLab
16+
(automated) (temporary)
17+
```
18+
19+
## Quick Start
20+
21+
```bash
22+
# 1. Configure
23+
cat > config.yaml << 'EOF'
24+
source:
25+
type: rds # or "aurora-cluster"
26+
identifier: my-prod-db
27+
dbName: postgres
28+
username: postgres
29+
password: ${DB_PASSWORD}
30+
31+
clone:
32+
instanceClass: db.t3.medium
33+
securityGroups: [sg-xxx] # must allow DBLab inbound
34+
35+
dblab:
36+
apiEndpoint: https://dblab:2345
37+
token: ${DBLAB_TOKEN}
38+
39+
aws:
40+
region: us-east-1
41+
EOF
42+
43+
# 2. Test
44+
docker run --rm \
45+
-v $PWD/config.yaml:/config.yaml \
46+
-e DB_PASSWORD -e DBLAB_TOKEN -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
47+
postgresai/dblab-rds-refresh -config /config.yaml -dry-run
48+
49+
# 3. Run
50+
docker run --rm \
51+
-v $PWD/config.yaml:/config.yaml \
52+
-e DB_PASSWORD -e DBLAB_TOKEN -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
53+
postgresai/dblab-rds-refresh -config /config.yaml
54+
```
55+
56+
## Configuration
57+
58+
| Field | Required | Description |
59+
|-------|----------|-------------|
60+
| `source.type` | Yes | `rds` or `aurora-cluster` |
61+
| `source.identifier` | Yes | RDS/Aurora identifier |
62+
| `source.dbName` | Yes | Database name |
63+
| `source.username` | Yes | Database user |
64+
| `source.password` | Yes | Password (use `${ENV_VAR}`) |
65+
| `clone.instanceClass` | Yes | RDS clone instance type |
66+
| `clone.securityGroups` | | SGs allowing DBLab access |
67+
| `clone.subnetGroup` | | DB subnet group |
68+
| `clone.maxAge` | | Max age before clone is stale (default: 48h) |
69+
| `dblab.apiEndpoint` | Yes | DBLab API URL |
70+
| `dblab.token` | Yes | DBLab verification token |
71+
| `dblab.timeout` | | Max refresh wait (default: 4h) |
72+
| `aws.region` | Yes | AWS region |
73+
74+
Full example: [config.example.rds_refresh.yaml](../../configs/config.example.rds_refresh.yaml)
75+
76+
## Scheduling
77+
78+
```bash
79+
# Cron (weekly, Sunday 2 AM)
80+
0 2 * * 0 docker run --rm -v /etc/dblab/config.yaml:/config.yaml \
81+
--env-file /etc/dblab/env postgresai/dblab-rds-refresh -config /config.yaml
82+
```
83+
84+
<details>
85+
<summary>Kubernetes CronJob</summary>
86+
87+
```yaml
88+
apiVersion: batch/v1
89+
kind: CronJob
90+
metadata:
91+
name: dblab-refresh
92+
spec:
93+
schedule: "0 2 * * 0"
94+
concurrencyPolicy: Forbid
95+
jobTemplate:
96+
spec:
97+
template:
98+
spec:
99+
serviceAccountName: dblab-refresh # IRSA
100+
containers:
101+
- name: refresh
102+
image: postgresai/dblab-rds-refresh
103+
args: ["-config", "/config/config.yaml"]
104+
envFrom:
105+
- secretRef:
106+
name: dblab-refresh-secrets
107+
volumeMounts:
108+
- name: config
109+
mountPath: /config
110+
volumes:
111+
- name: config
112+
configMap:
113+
name: dblab-refresh-config
114+
restartPolicy: Never
115+
```
116+
</details>
117+
118+
<details>
119+
<summary>ECS Scheduled Task</summary>
120+
121+
```bash
122+
aws events put-rule --name dblab-refresh --schedule-expression "cron(0 2 ? * SUN *)"
123+
aws events put-targets --rule dblab-refresh --targets '[{
124+
"Id": "1",
125+
"Arn": "arn:aws:ecs:REGION:ACCOUNT:cluster/CLUSTER",
126+
"RoleArn": "arn:aws:iam::ACCOUNT:role/ecsEventsRole",
127+
"EcsParameters": {
128+
"TaskDefinitionArn": "arn:aws:ecs:REGION:ACCOUNT:task-definition/dblab-refresh",
129+
"TaskCount": 1, "LaunchType": "FARGATE"
130+
}
131+
}]'
132+
```
133+
</details>
134+
135+
## IAM Policy
136+
137+
```json
138+
{
139+
"Version": "2012-10-17",
140+
"Statement": [
141+
{
142+
"Effect": "Allow",
143+
"Action": ["rds:DescribeDBSnapshots", "rds:DescribeDBClusterSnapshots",
144+
"rds:DescribeDBInstances", "rds:DescribeDBClusters"],
145+
"Resource": "*"
146+
},
147+
{
148+
"Effect": "Allow",
149+
"Action": ["rds:RestoreDBInstanceFromDBSnapshot", "rds:RestoreDBClusterFromSnapshot",
150+
"rds:CreateDBInstance", "rds:DeleteDBInstance", "rds:DeleteDBCluster",
151+
"rds:AddTagsToResource", "rds:ModifyDBInstance", "rds:ModifyDBCluster"],
152+
"Resource": ["arn:aws:rds:*:ACCOUNT:db:dblab-refresh-*",
153+
"arn:aws:rds:*:ACCOUNT:cluster:dblab-refresh-*",
154+
"arn:aws:rds:*:ACCOUNT:snapshot:*",
155+
"arn:aws:rds:*:ACCOUNT:cluster-snapshot:*",
156+
"arn:aws:rds:*:ACCOUNT:subgrp:*", "arn:aws:rds:*:ACCOUNT:pg:*"]
157+
}
158+
]
159+
}
160+
```
161+
162+
## Network
163+
164+
The RDS clone must be reachable from DBLab on port 5432, so it has to be in the same VPC as DBLab or in a peered VPC.
165+
166+
## DBLab Setup
167+
168+
DBLab must run in **logical mode**. The tool updates the DBLab config via the API (no SSH needed).
169+
170+
```yaml
171+
retrieval:
172+
refresh:
173+
timetable: "" # disable built-in scheduler
174+
jobs: [logicalDump, logicalRestore, logicalSnapshot]
175+
spec:
176+
logicalDump:
177+
options:
178+
source:
179+
connection:
180+
host: placeholder # updated by rds-refresh
181+
port: 5432
182+
```
183+
184+
## How It Works
185+
186+
1. **Startup cleanup**: Check for orphaned clones from previous runs
187+
2. Check DBLab health
188+
3. Find latest RDS snapshot
189+
4. Create RDS clone from RDS snapshot (`dblab-refresh-YYYYMMDD-HHMMSS`)
190+
5. Wait for RDS clone (~15 min)
191+
6. Update DBLab config via API
192+
7. Trigger refresh, wait for completion
193+
8. Delete RDS clone (always, even on error)
194+
195+
## Orphan Protection
196+
197+
The tool has multiple layers of protection against orphaned clones:
198+
199+
1. **Defer cleanup**: Clone is deleted when process exits normally
200+
2. **Signal handlers**: Catches SIGINT, SIGTERM, SIGHUP (SSH disconnect)
201+
3. **State file**: Tracks active clone in `./meta/rds-refresh.state` (same directory as DBLab meta files)
202+
4. **Tag scan**: Finds clones by `ManagedBy=dblab-rds-refresh` tag
203+
204+
### Manual Cleanup
205+
206+
```bash
207+
# Dry run - see what would be deleted
208+
rds-refresh cleanup -config config.yaml -dry-run
209+
210+
# Delete stale clones older than 24 hours
211+
rds-refresh cleanup -config config.yaml -max-age 24h
212+
213+
# Run in cron as safety net (weekly)
214+
0 3 * * 0 rds-refresh cleanup -config /etc/dblab/config.yaml -max-age 48h
215+
```
216+
217+
### Best Practice
218+
219+
Run inside `screen` or `tmux` to prevent SSH disconnections from orphaning clones:
220+
221+
```bash
222+
screen -S dblab-refresh
223+
rds-refresh -config config.yaml
224+
# Ctrl+A, D to detach
225+
```
226+
227+
## Troubleshooting
228+
229+
| Error | Fix |
230+
|-------|-----|
231+
| No snapshots | Enable automated backups on RDS |
232+
| RDS clone not accessible | Check security group allows 5432 from DBLab |
233+
| Config update failed | Verify DBLab endpoint and token |
234+
| Timeout | Increase `dblab.timeout`, check DBLab logs |
235+
236+
## Cost
237+
238+
RDS clone cost only while running (~2-5 hours):
239+
- db.t3.medium: ~$0.35
240+
- db.r5.large: ~$1.20
241+
242+
## License
243+
244+
Apache 2.0 — [Postgres.ai](https://postgres.ai)

0 commit comments

Comments
 (0)