Skip to content

Commit b395962

Browse files
committed
standardize snapshot functions
1 parent a233af0 commit b395962

6 files changed

Lines changed: 116 additions & 79 deletions

File tree

src/_util/backup_config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
import os
2+
3+
SNAPSHOT_TIMEOUT_SEC = int(os.environ.get("SNAPSHOT_TIMEOUT_SEC", "120"))
4+
SNAPSHOT_POLL_INTERVAL_SEC = int(os.environ.get("SNAPSHOT_POLL_INTERVAL_SEC", "5"))
5+
VOLUME_SNAPSHOT_CLASS = os.environ.get("VOLUME_SNAPSHOT_CLASS", "simplyblock-csi-snapshotclass")

src/api/_util/backups.py

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,7 @@
99

1010
from ...models.backups import BackupEntry, BackupSchedule, BackupScheduleRow, NextBackup
1111
from ...models.branch import Branch
12-
from ..backup_snapshots import (
13-
delete_branch_snapshot,
14-
)
12+
from ..backup_snapshots import build_snapshot_metadata, delete_snapshot
1513

1614
logger = logging.getLogger(__name__)
1715

@@ -114,16 +112,11 @@ async def delete_branch_backups(session: SessionDep, branch_id: Identifier) -> N
114112
return
115113

116114
for backup in backups:
115+
snapshot = build_snapshot_metadata(backup)
116+
if snapshot is None:
117+
logger.warning("Skipping snapshot deletion for backup %s because metadata was incomplete", backup.id)
118+
continue
117119
try:
118-
await delete_branch_snapshot(
119-
name=backup.snapshot_name,
120-
namespace=backup.snapshot_namespace,
121-
content_name=backup.snapshot_content_name,
122-
)
120+
await delete_snapshot(snapshot)
123121
except Exception:
124-
logger.exception(
125-
"Failed to delete snapshot %s/%s for backup %s",
126-
backup.snapshot_namespace,
127-
backup.snapshot_name,
128-
backup.id,
129-
)
122+
logger.exception("Failed to delete snapshots for branch %s", branch_id)

src/api/backup.py

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,7 @@
3333
from ..models.project import Project
3434
from ._util.backups import _remove_existing_schedule, _validate_project_retention_budget
3535
from .auth import authenticated_user
36-
from .backup_snapshots import (
37-
SNAPSHOT_POLL_INTERVAL_SEC,
38-
create_branch_snapshot,
39-
delete_branch_snapshot,
40-
)
36+
from .backup_snapshots import build_snapshot_metadata, create_branch_db_snapshot, delete_snapshot
4137
from .db import SessionDep
4238
from .dependencies import OrganizationDep
4339

@@ -46,7 +42,6 @@
4642
# ---------------------------
4743
# Constants
4844
# ---------------------------
49-
VOLUME_SNAPSHOT_CLASS = os.environ.get("VOLUME_SNAPSHOT_CLASS", "simplyblock-csi-snapshotclass")
5045
MANUAL_BACKUP_TIMEOUT_SEC = int(os.environ.get("MANUAL_BACKUP_TIMEOUT_SEC", "10"))
5146

5247
UNIT_MULTIPLIER = {
@@ -539,13 +534,10 @@ async def manual_backup(session: SessionDep, branch_id: Identifier) -> BackupCre
539534
recorded_at = datetime.now(UTC)
540535

541536
try:
542-
snapshot = await create_branch_snapshot(
537+
snapshot = await create_branch_db_snapshot(
543538
branch.id,
544539
backup_id=backup_id,
545-
snapshot_class=VOLUME_SNAPSHOT_CLASS,
546-
poll_interval=SNAPSHOT_POLL_INTERVAL_SEC,
547540
label="manual",
548-
time_limit=MANUAL_BACKUP_TIMEOUT_SEC,
549541
)
550542
except Exception as exc:
551543
logger.exception("Manual backup failed for branch %s within timeout", branch.id)
@@ -586,15 +578,13 @@ async def delete_backup(session: SessionDep, backup_id: Identifier) -> BackupDel
586578
if not backup:
587579
raise HTTPException(status_code=404, detail="Backup not found")
588580

589-
try:
590-
await delete_branch_snapshot(
591-
name=backup.snapshot_name,
592-
namespace=backup.snapshot_namespace,
593-
content_name=backup.snapshot_content_name,
594-
)
595-
except Exception as exc:
596-
logger.exception("Failed to delete snapshot for backup %s", backup_id)
597-
raise HTTPException(status_code=500, detail="Failed to delete backup snapshot") from exc
581+
metadata = build_snapshot_metadata(backup)
582+
if metadata is not None:
583+
try:
584+
await delete_snapshot(metadata)
585+
except Exception as exc:
586+
logger.exception("Failed to delete snapshot for backup %s", backup_id)
587+
raise HTTPException(status_code=500, detail="Failed to delete backup snapshot") from exc
598588

599589
await session.delete(backup)
600590

src/api/backup_snapshots.py

Lines changed: 81 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,22 @@
22

33
import asyncio
44
import logging
5-
import os
65
import re
76
from dataclasses import dataclass
87
from typing import TYPE_CHECKING
98

9+
from pydantic import BaseModel, Field
10+
1011
from .._util import Identifier, quantity_to_bytes
11-
from ..deployment import AUTOSCALER_PVC_SUFFIX, get_autoscaler_vm_identity
12+
from .._util.backup_config import (
13+
SNAPSHOT_POLL_INTERVAL_SEC,
14+
SNAPSHOT_TIMEOUT_SEC,
15+
VOLUME_SNAPSHOT_CLASS,
16+
)
17+
from ..deployment import (
18+
AUTOSCALER_PVC_SUFFIX,
19+
get_autoscaler_vm_identity,
20+
)
1221
from ..deployment.kubernetes.snapshot import (
1322
create_snapshot_from_pvc,
1423
ensure_snapshot_absent,
@@ -21,12 +30,38 @@
2130
if TYPE_CHECKING:
2231
from ulid import ULID
2332

24-
logger = logging.getLogger(__name__)
33+
from ..models.backups import BackupEntry
2534

26-
SNAPSHOT_TIMEOUT_SEC = int(os.environ.get("SNAPSHOT_TIMEOUT_SEC", "120"))
27-
SNAPSHOT_POLL_INTERVAL_SEC = int(os.environ.get("SNAPSHOT_POLL_INTERVAL_SEC", "5"))
35+
logger = logging.getLogger(__name__)
2836

2937
_K8S_NAME_MAX_LENGTH = 63
38+
DEFAULT_SNAPSHOT_TIMEOUT_SEC = float(SNAPSHOT_TIMEOUT_SEC)
39+
DEFAULT_SNAPSHOT_POLL_INTERVAL_SEC = float(SNAPSHOT_POLL_INTERVAL_SEC)
40+
41+
42+
class SnapshotMetadata(BaseModel):
43+
name: str = Field(..., min_length=1)
44+
namespace: str = Field(..., min_length=1)
45+
# content_name stays optional because there are runtime scenarios where the
46+
# VolumeSnapshotContent hasn’t been bound yet
47+
content_name: str | None
48+
49+
50+
def build_snapshot_metadata(backup: BackupEntry) -> SnapshotMetadata | None:
51+
name = backup.snapshot_name
52+
namespace = backup.snapshot_namespace
53+
if not name or not namespace:
54+
logger.debug(
55+
"Skipping metadata for missing snapshot identifiers (name=%r namespace=%r)",
56+
name,
57+
namespace,
58+
)
59+
return None
60+
return SnapshotMetadata(
61+
name=name,
62+
namespace=namespace,
63+
content_name=backup.snapshot_content_name,
64+
)
3065

3166

3267
@dataclass(frozen=True)
@@ -59,20 +94,18 @@ def _build_snapshot_name(*, label: str, backup_id: ULID) -> str:
5994
return f"{label_component}{separator}{backup_component}"
6095

6196

62-
async def create_branch_snapshot(
63-
branch_id: Identifier,
97+
async def _create_snapshot_from_pvc(
6498
*,
99+
namespace: str,
100+
pvc_name: str,
65101
backup_id: ULID,
66102
snapshot_class: str,
67-
poll_interval: float,
68103
label: str,
104+
poll_interval: float,
69105
time_limit: float,
70106
) -> SnapshotDetails:
71-
namespace, autoscaler_vm_name = get_autoscaler_vm_identity(branch_id)
72-
pvc_name = f"{autoscaler_vm_name}{AUTOSCALER_PVC_SUFFIX}"
73107
snapshot_name = _build_snapshot_name(label=label, backup_id=backup_id)
74-
75-
logger.info("Creating VolumeSnapshot %s/%s for branch %s", namespace, snapshot_name, branch_id)
108+
logger.info("Creating VolumeSnapshot %s/%s for branch PVC %s", namespace, snapshot_name, pvc_name)
76109
try:
77110
async with asyncio.timeout(time_limit):
78111
await create_snapshot_from_pvc(
@@ -89,14 +122,14 @@ async def create_branch_snapshot(
89122
)
90123
except TimeoutError as exc:
91124
logger.exception(
92-
"Timed out creating VolumeSnapshot %s/%s for branch %s within %s seconds",
125+
"Timed out creating VolumeSnapshot %s/%s for PVC %s within %s seconds",
93126
namespace,
94127
snapshot_name,
95-
branch_id,
128+
pvc_name,
96129
time_limit,
97130
)
98131
raise VelaSnapshotTimeoutError(
99-
f"Timed out creating VolumeSnapshot {namespace}/{snapshot_name} for branch {branch_id}"
132+
f"Timed out creating VolumeSnapshot {namespace}/{snapshot_name} for namespace {namespace}"
100133
) from exc
101134

102135
status = snapshot.get("status") or {}
@@ -118,29 +151,43 @@ async def create_branch_snapshot(
118151
)
119152

120153

121-
async def delete_branch_snapshot(
154+
async def create_branch_db_snapshot(
155+
branch_id: Identifier,
122156
*,
123-
name: str | None,
124-
namespace: str | None,
125-
content_name: str | None,
126-
time_limit: float = SNAPSHOT_TIMEOUT_SEC,
127-
poll_interval: float = SNAPSHOT_POLL_INTERVAL_SEC,
128-
) -> None:
129-
if not name or not namespace:
130-
logger.debug(
131-
"Skipping deletion for VolumeSnapshot with missing metadata (name=%s namespace=%s)",
132-
name,
133-
namespace,
134-
)
135-
return
157+
backup_id: ULID,
158+
snapshot_class: str = VOLUME_SNAPSHOT_CLASS,
159+
poll_interval: float = DEFAULT_SNAPSHOT_POLL_INTERVAL_SEC,
160+
label: str,
161+
time_limit: float = DEFAULT_SNAPSHOT_TIMEOUT_SEC,
162+
) -> SnapshotDetails:
163+
namespace, autoscaler_vm_name = get_autoscaler_vm_identity(branch_id)
164+
pvc_name = f"{autoscaler_vm_name}{AUTOSCALER_PVC_SUFFIX}"
165+
return await _create_snapshot_from_pvc(
166+
namespace=namespace,
167+
pvc_name=pvc_name,
168+
backup_id=backup_id,
169+
snapshot_class=snapshot_class,
170+
poll_interval=poll_interval,
171+
label=label,
172+
time_limit=time_limit,
173+
)
174+
136175

137-
derived_content_name = content_name
176+
async def delete_snapshot(
177+
metadata: SnapshotMetadata,
178+
*,
179+
time_limit: float = DEFAULT_SNAPSHOT_TIMEOUT_SEC,
180+
poll_interval: float = DEFAULT_SNAPSHOT_POLL_INTERVAL_SEC,
181+
) -> None:
182+
name = metadata.name
183+
namespace = metadata.namespace
184+
content_name = metadata.content_name
138185
try:
139186
async with asyncio.timeout(time_limit):
140187
snapshot = await read_snapshot(namespace, name)
141188
if snapshot is not None:
142189
status = snapshot.get("status") or {}
143-
derived_content_name = derived_content_name or status.get("boundVolumeSnapshotContentName")
190+
content_name = content_name or status.get("boundVolumeSnapshotContentName")
144191
logger.info("Deleting VolumeSnapshot %s/%s", namespace, name)
145192
await ensure_snapshot_absent(
146193
namespace,
@@ -151,10 +198,10 @@ async def delete_branch_snapshot(
151198
else:
152199
logger.info("VolumeSnapshot %s/%s already absent", namespace, name)
153200

154-
if derived_content_name:
155-
logger.info("Ensuring VolumeSnapshotContent %s is absent", derived_content_name)
201+
if content_name:
202+
logger.info("Ensuring VolumeSnapshotContent %s is absent", content_name)
156203
await ensure_snapshot_content_absent(
157-
derived_content_name,
204+
content_name,
158205
timeout=time_limit,
159206
poll_interval=poll_interval,
160207
)

src/api/backupmonitor.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from sqlmodel import SQLModel, asc, delete, select
1111
from ulid import ULID
1212

13+
from .._util.backup_config import SNAPSHOT_POLL_INTERVAL_SEC, SNAPSHOT_TIMEOUT_SEC, VOLUME_SNAPSHOT_CLASS
1314
from ..models.backups import (
1415
BackupEntry,
1516
BackupLog,
@@ -21,18 +22,16 @@
2122
from ..models.organization import Organization
2223
from ..models.project import Project
2324
from .backup_snapshots import (
24-
SNAPSHOT_POLL_INTERVAL_SEC,
25-
SNAPSHOT_TIMEOUT_SEC,
26-
create_branch_snapshot,
27-
delete_branch_snapshot,
25+
build_snapshot_metadata,
26+
create_branch_db_snapshot,
27+
delete_snapshot,
2828
)
2929
from .organization.project.branch import refresh_branch_status
3030
from .settings import get_settings
3131

3232
# ---------------------------
3333
# Config
3434
# ---------------------------
35-
VOLUME_SNAPSHOT_CLASS = os.environ.get("VOLUME_SNAPSHOT_CLASS", "simplyblock-csi-snapshotclass")
3635
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "60"))
3736

3837
logger = logging.getLogger(__name__)
@@ -204,11 +203,15 @@ async def _delete_many(
204203

205204
deleted_ids: list[ULID] = []
206205
for backup in backups:
206+
metadata = build_snapshot_metadata(backup)
207+
if metadata is None:
208+
logger.warning("Skipping snapshot deletion for backup %s because metadata was incomplete", backup.id)
209+
continue
207210
try:
208-
await delete_branch_snapshot(
209-
name=backup.snapshot_name,
210-
namespace=backup.snapshot_namespace,
211-
content_name=backup.snapshot_content_name,
211+
await delete_snapshot(
212+
metadata,
213+
time_limit=SNAPSHOT_TIMEOUT_SEC,
214+
poll_interval=SNAPSHOT_POLL_INTERVAL_SEC,
212215
)
213216
except Exception:
214217
context = {
@@ -303,13 +306,11 @@ async def execute_backup(self, db: AsyncSession, branch: Branch, row: BackupSche
303306
backup_id = ULID()
304307

305308
try:
306-
snapshot = await create_branch_snapshot(
309+
snapshot = await create_branch_db_snapshot(
307310
branch.id,
308311
backup_id=backup_id,
309312
snapshot_class=VOLUME_SNAPSHOT_CLASS,
310-
poll_interval=SNAPSHOT_POLL_INTERVAL_SEC,
311313
label=f"row-{row.row_index}",
312-
time_limit=SNAPSHOT_TIMEOUT_SEC,
313314
)
314315
except Exception:
315316
nb.next_at = next_due

src/deployment/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
DATABASE_PVC_SUFFIX = "-db-pvc"
7575
AUTOSCALER_PVC_SUFFIX = "-block-data"
7676
AUTOSCALER_WAL_PVC_SUFFIX = "-pg-wal"
77+
AUTOSCALER_PVC_SUFFIX = "-block-data"
7778
_LOAD_BALANCER_TIMEOUT_SECONDS = float(600)
7879
_LOAD_BALANCER_POLL_INTERVAL_SECONDS = float(2)
7980
_OVERLAY_IP_TIMEOUT_SECONDS = float(300)

0 commit comments

Comments
 (0)