Skip to content

Commit c08e968

Browse files
committed
standarize snapshot functions
1 parent ca8efee commit c08e968

9 files changed

Lines changed: 126 additions & 90 deletions

File tree

src/_util/backup_config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
import os
2+
3+
SNAPSHOT_TIMEOUT_SEC = int(os.environ.get("SNAPSHOT_TIMEOUT_SEC", "120"))
4+
SNAPSHOT_POLL_INTERVAL_SEC = int(os.environ.get("SNAPSHOT_POLL_INTERVAL_SEC", "5"))
5+
VOLUME_SNAPSHOT_CLASS = os.environ.get("VOLUME_SNAPSHOT_CLASS", "simplyblock-csi-snapshotclass")

src/api/_util/backups.py

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,7 @@
99

1010
from ...models.backups import BackupEntry, BackupSchedule, BackupScheduleRow, NextBackup
1111
from ...models.branch import Branch
12-
from ..backup_snapshots import (
13-
delete_branch_snapshot,
14-
)
12+
from ..backup_snapshots import build_snapshot_metadata, delete_snapshot
1513

1614
logger = logging.getLogger(__name__)
1715

@@ -114,16 +112,11 @@ async def delete_branch_backups(session: SessionDep, branch: Branch) -> None:
114112
return
115113

116114
for backup in backups:
115+
snapshot = build_snapshot_metadata(backup)
116+
if snapshot is None:
117+
logger.warning("Skipping snapshot deletion for backup %s because metadata was incomplete", backup.id)
118+
continue
117119
try:
118-
await delete_branch_snapshot(
119-
name=backup.snapshot_name,
120-
namespace=backup.snapshot_namespace,
121-
content_name=backup.snapshot_content_name,
122-
)
120+
await delete_snapshot(snapshot)
123121
except Exception:
124-
logger.exception(
125-
"Failed to delete snapshot %s/%s for backup %s",
126-
backup.snapshot_namespace,
127-
backup.snapshot_name,
128-
backup.id,
129-
)
122+
logger.exception("Failed to delete snapshots for branch %s", branch.id)

src/api/backup.py

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,7 @@
3333
from ..models.project import Project
3434
from ._util.backups import _remove_existing_schedule, _validate_project_retention_budget
3535
from .auth import authenticated_user
36-
from .backup_snapshots import (
37-
SNAPSHOT_POLL_INTERVAL_SEC,
38-
create_branch_snapshot,
39-
delete_branch_snapshot,
40-
)
36+
from .backup_snapshots import build_snapshot_metadata, create_branch_db_snapshot, delete_snapshot
4137
from .db import SessionDep
4238
from .dependencies import OrganizationDep
4339

@@ -46,7 +42,6 @@
4642
# ---------------------------
4743
# Constants
4844
# ---------------------------
49-
VOLUME_SNAPSHOT_CLASS = os.environ.get("VOLUME_SNAPSHOT_CLASS", "simplyblock-csi-snapshotclass")
5045
MANUAL_BACKUP_TIMEOUT_SEC = int(os.environ.get("MANUAL_BACKUP_TIMEOUT_SEC", "10"))
5146

5247
UNIT_MULTIPLIER = {
@@ -541,13 +536,10 @@ async def manual_backup(session: SessionDep, branch_id: Identifier) -> BackupCre
541536
recorded_at = datetime.now(UTC)
542537

543538
try:
544-
snapshot = await create_branch_snapshot(
539+
snapshot = await create_branch_db_snapshot(
545540
branch.id,
546541
backup_id=backup_id,
547-
snapshot_class=VOLUME_SNAPSHOT_CLASS,
548-
poll_interval=SNAPSHOT_POLL_INTERVAL_SEC,
549542
label="manual",
550-
time_limit=MANUAL_BACKUP_TIMEOUT_SEC,
551543
)
552544
except Exception as exc:
553545
logger.exception("Manual backup failed for branch %s within timeout", branch.id)
@@ -588,15 +580,13 @@ async def delete_backup(session: SessionDep, backup_id: Identifier) -> BackupDel
588580
if not backup:
589581
raise HTTPException(status_code=404, detail="Backup not found")
590582

591-
try:
592-
await delete_branch_snapshot(
593-
name=backup.snapshot_name,
594-
namespace=backup.snapshot_namespace,
595-
content_name=backup.snapshot_content_name,
596-
)
597-
except Exception as exc:
598-
logger.exception("Failed to delete snapshot for backup %s", backup_id)
599-
raise HTTPException(status_code=500, detail="Failed to delete backup snapshot") from exc
583+
metadata = build_snapshot_metadata(backup)
584+
if metadata is not None:
585+
try:
586+
await delete_snapshot(metadata)
587+
except Exception as exc:
588+
logger.exception("Failed to delete snapshot for backup %s", backup_id)
589+
raise HTTPException(status_code=500, detail="Failed to delete backup snapshot") from exc
600590

601591
await session.delete(backup)
602592

src/api/backup_snapshots.py

Lines changed: 80 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,22 @@
22

33
import asyncio
44
import logging
5-
import os
65
import re
76
from dataclasses import dataclass
87
from typing import TYPE_CHECKING
98

9+
from pydantic import BaseModel, Field
10+
1011
from .._util import Identifier, quantity_to_bytes
11-
from ..deployment import AUTOSCALER_PVC_SUFFIX, get_autoscaler_vm_identity
12+
from .._util.backup_config import (
13+
SNAPSHOT_POLL_INTERVAL_SEC,
14+
SNAPSHOT_TIMEOUT_SEC,
15+
VOLUME_SNAPSHOT_CLASS,
16+
)
17+
from ..deployment import (
18+
AUTOSCALER_DB_PVC_SUFFIX,
19+
get_autoscaler_vm_identity,
20+
)
1221
from ..deployment.kubernetes.snapshot import (
1322
create_snapshot_from_pvc,
1423
ensure_snapshot_absent,
@@ -20,12 +29,38 @@
2029
if TYPE_CHECKING:
2130
from ulid import ULID
2231

23-
logger = logging.getLogger(__name__)
32+
from ..models.backups import BackupEntry
2433

25-
SNAPSHOT_TIMEOUT_SEC = int(os.environ.get("SNAPSHOT_TIMEOUT_SEC", "120"))
26-
SNAPSHOT_POLL_INTERVAL_SEC = int(os.environ.get("SNAPSHOT_POLL_INTERVAL_SEC", "5"))
34+
logger = logging.getLogger(__name__)
2735

2836
_K8S_NAME_MAX_LENGTH = 63
37+
DEFAULT_SNAPSHOT_TIMEOUT_SEC = float(SNAPSHOT_TIMEOUT_SEC)
38+
DEFAULT_SNAPSHOT_POLL_INTERVAL_SEC = float(SNAPSHOT_POLL_INTERVAL_SEC)
39+
40+
41+
class SnapshotMetadata(BaseModel):
42+
name: str = Field(..., min_length=1)
43+
namespace: str = Field(..., min_length=1)
44+
# content_name stays optional because there are runtime scenarios where the
45+
# VolumeSnapshotContent hasn’t been bound yet
46+
content_name: str | None
47+
48+
49+
def build_snapshot_metadata(backup: BackupEntry) -> SnapshotMetadata | None:
50+
name = backup.snapshot_name
51+
namespace = backup.snapshot_namespace
52+
if not name or not namespace:
53+
logger.debug(
54+
"Skipping metadata for missing snapshot identifiers (name=%r namespace=%r)",
55+
name,
56+
namespace,
57+
)
58+
return None
59+
return SnapshotMetadata(
60+
name=name,
61+
namespace=namespace,
62+
content_name=backup.snapshot_content_name,
63+
)
2964

3065

3166
@dataclass(frozen=True)
@@ -58,20 +93,18 @@ def _build_snapshot_name(*, label: str, backup_id: ULID) -> str:
5893
return f"{label_component}{separator}{backup_component}"
5994

6095

61-
async def create_branch_snapshot(
62-
branch_id: Identifier,
96+
async def _create_snapshot_from_pvc(
6397
*,
98+
namespace: str,
99+
pvc_name: str,
64100
backup_id: ULID,
65101
snapshot_class: str,
66-
poll_interval: float,
67102
label: str,
103+
poll_interval: float,
68104
time_limit: float,
69105
) -> SnapshotDetails:
70-
namespace, autoscaler_vm_name = get_autoscaler_vm_identity(branch_id)
71-
pvc_name = f"{autoscaler_vm_name}{AUTOSCALER_PVC_SUFFIX}"
72106
snapshot_name = _build_snapshot_name(label=label, backup_id=backup_id)
73-
74-
logger.info("Creating VolumeSnapshot %s/%s for branch %s", namespace, snapshot_name, branch_id)
107+
logger.info("Creating VolumeSnapshot %s/%s for branch PVC %s", namespace, snapshot_name, pvc_name)
75108
try:
76109
async with asyncio.timeout(time_limit):
77110
await create_snapshot_from_pvc(
@@ -88,10 +121,10 @@ async def create_branch_snapshot(
88121
)
89122
except TimeoutError:
90123
logger.exception(
91-
"Timed out creating VolumeSnapshot %s/%s for branch %s within %s seconds",
124+
"Timed out creating VolumeSnapshot %s/%s for PVC %s within %s seconds",
92125
namespace,
93126
snapshot_name,
94-
branch_id,
127+
pvc_name,
95128
time_limit,
96129
)
97130
raise
@@ -115,29 +148,43 @@ async def create_branch_snapshot(
115148
)
116149

117150

118-
async def delete_branch_snapshot(
151+
async def create_branch_db_snapshot(
152+
branch_id: Identifier,
119153
*,
120-
name: str | None,
121-
namespace: str | None,
122-
content_name: str | None,
123-
time_limit: float = SNAPSHOT_TIMEOUT_SEC,
124-
poll_interval: float = SNAPSHOT_POLL_INTERVAL_SEC,
125-
) -> None:
126-
if not name or not namespace:
127-
logger.debug(
128-
"Skipping deletion for VolumeSnapshot with missing metadata (name=%s namespace=%s)",
129-
name,
130-
namespace,
131-
)
132-
return
154+
backup_id: ULID,
155+
snapshot_class: str = VOLUME_SNAPSHOT_CLASS,
156+
poll_interval: float = DEFAULT_SNAPSHOT_POLL_INTERVAL_SEC,
157+
label: str,
158+
time_limit: float = DEFAULT_SNAPSHOT_TIMEOUT_SEC,
159+
) -> SnapshotDetails:
160+
namespace, autoscaler_vm_name = get_autoscaler_vm_identity(branch_id)
161+
pvc_name = f"{autoscaler_vm_name}{AUTOSCALER_DB_PVC_SUFFIX}"
162+
return await _create_snapshot_from_pvc(
163+
namespace=namespace,
164+
pvc_name=pvc_name,
165+
backup_id=backup_id,
166+
snapshot_class=snapshot_class,
167+
poll_interval=poll_interval,
168+
label=label,
169+
time_limit=time_limit,
170+
)
171+
133172

134-
derived_content_name = content_name
173+
async def delete_snapshot(
174+
metadata: SnapshotMetadata,
175+
*,
176+
time_limit: float = DEFAULT_SNAPSHOT_TIMEOUT_SEC,
177+
poll_interval: float = DEFAULT_SNAPSHOT_POLL_INTERVAL_SEC,
178+
) -> None:
179+
name = metadata.name
180+
namespace = metadata.namespace
181+
content_name = metadata.content_name
135182
try:
136183
async with asyncio.timeout(time_limit):
137184
snapshot = await read_snapshot(namespace, name)
138185
if snapshot is not None:
139186
status = snapshot.get("status") or {}
140-
derived_content_name = derived_content_name or status.get("boundVolumeSnapshotContentName")
187+
content_name = content_name or status.get("boundVolumeSnapshotContentName")
141188
logger.info("Deleting VolumeSnapshot %s/%s", namespace, name)
142189
await ensure_snapshot_absent(
143190
namespace,
@@ -148,10 +195,10 @@ async def delete_branch_snapshot(
148195
else:
149196
logger.info("VolumeSnapshot %s/%s already absent", namespace, name)
150197

151-
if derived_content_name:
152-
logger.info("Ensuring VolumeSnapshotContent %s is absent", derived_content_name)
198+
if content_name:
199+
logger.info("Ensuring VolumeSnapshotContent %s is absent", content_name)
153200
await ensure_snapshot_content_absent(
154-
derived_content_name,
201+
content_name,
155202
timeout=time_limit,
156203
poll_interval=poll_interval,
157204
)

src/api/backupmonitor.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from sqlmodel import SQLModel, asc, delete, select
1111
from ulid import ULID
1212

13+
from .._util.backup_config import SNAPSHOT_POLL_INTERVAL_SEC, SNAPSHOT_TIMEOUT_SEC, VOLUME_SNAPSHOT_CLASS
1314
from ..models.backups import (
1415
BackupEntry,
1516
BackupLog,
@@ -21,18 +22,16 @@
2122
from ..models.organization import Organization
2223
from ..models.project import Project
2324
from .backup_snapshots import (
24-
SNAPSHOT_POLL_INTERVAL_SEC,
25-
SNAPSHOT_TIMEOUT_SEC,
26-
create_branch_snapshot,
27-
delete_branch_snapshot,
25+
build_snapshot_metadata,
26+
create_branch_db_snapshot,
27+
delete_snapshot,
2828
)
2929
from .organization.project.branch import refresh_branch_status
3030
from .settings import get_settings
3131

3232
# ---------------------------
3333
# Config
3434
# ---------------------------
35-
VOLUME_SNAPSHOT_CLASS = os.environ.get("VOLUME_SNAPSHOT_CLASS", "csi-snapshot-class")
3635
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "60"))
3736

3837
logging.basicConfig(
@@ -209,11 +208,15 @@ async def _delete_many(
209208

210209
deleted_ids: list[ULID] = []
211210
for backup in backups:
211+
metadata = build_snapshot_metadata(backup)
212+
if metadata is None:
213+
logger.warning("Skipping snapshot deletion for backup %s because metadata was incomplete", backup.id)
214+
continue
212215
try:
213-
await delete_branch_snapshot(
214-
name=backup.snapshot_name,
215-
namespace=backup.snapshot_namespace,
216-
content_name=backup.snapshot_content_name,
216+
await delete_snapshot(
217+
metadata,
218+
time_limit=SNAPSHOT_TIMEOUT_SEC,
219+
poll_interval=SNAPSHOT_POLL_INTERVAL_SEC,
217220
)
218221
except Exception:
219222
context = {
@@ -308,13 +311,11 @@ async def execute_backup(self, db: AsyncSession, branch: Branch, row: BackupSche
308311
backup_id = ULID()
309312

310313
try:
311-
snapshot = await create_branch_snapshot(
314+
snapshot = await create_branch_db_snapshot(
312315
branch.id,
313316
backup_id=backup_id,
314317
snapshot_class=VOLUME_SNAPSHOT_CLASS,
315-
poll_interval=SNAPSHOT_POLL_INTERVAL_SEC,
316318
label=f"row-{row.row_index}",
317-
time_limit=SNAPSHOT_TIMEOUT_SEC,
318319
)
319320
except Exception:
320321
nb.next_at = next_due

src/api/organization/project/branch/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from ....._util import DEFAULT_DB_NAME, DEFAULT_DB_USER, Identifier, bytes_to_gb
2323
from ....._util.crypto import encrypt_with_passphrase, generate_keys
2424
from .....deployment import (
25-
AUTOSCALER_PVC_SUFFIX,
25+
AUTOSCALER_DB_PVC_SUFFIX,
2626
STORAGE_PVC_SUFFIX,
2727
DeploymentParameters,
2828
ResizeParameters,
@@ -750,7 +750,7 @@ async def _apply_resize_operations(
750750
namespace, autoscaler_vm_name = get_autoscaler_vm_identity(branch.id)
751751
if "database_size" in effective_parameters:
752752
new_database_size = effective_parameters["database_size"]
753-
pvc_name = f"{autoscaler_vm_name}{AUTOSCALER_PVC_SUFFIX}"
753+
pvc_name = f"{autoscaler_vm_name}{AUTOSCALER_DB_PVC_SUFFIX}"
754754
storage_size_gb = f"{bytes_to_gb(new_database_size)}G"
755755
await kube_service.resize_pvc_storage(namespace, pvc_name, storage_size_gb)
756756

src/deployment/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
SIMPLYBLOCK_CSI_SECRET = "simplyblock-csi-secret"
7474
STORAGE_PVC_SUFFIX = "-db-storage-pvc"
7575
DATABASE_PVC_SUFFIX = "-db-pvc"
76-
AUTOSCALER_PVC_SUFFIX = "-block-data"
76+
AUTOSCALER_DB_PVC_SUFFIX = "-block-data"
7777
_LOAD_BALANCER_TIMEOUT_SECONDS = float(600)
7878
_LOAD_BALANCER_POLL_INTERVAL_SECONDS = float(2)
7979
DNSRecordType = Literal["AAAA", "CNAME"]
@@ -289,7 +289,7 @@ async def resolve_storage_volume_identifiers(namespace: str) -> tuple[str, str |
289289

290290

291291
async def resolve_autoscaler_volume_identifiers(namespace: str) -> tuple[str, str | None]:
292-
pvc_name = f"{_autoscaler_vm_name(namespace)}{AUTOSCALER_PVC_SUFFIX}"
292+
pvc_name = f"{_autoscaler_vm_name(namespace)}{AUTOSCALER_DB_PVC_SUFFIX}"
293293
return await _resolve_volume_identifiers(namespace, pvc_name)
294294

295295

@@ -422,7 +422,7 @@ def _configure_vela_values(
422422
autoscaler_resources["memorySlots"] = memory_slots
423423

424424
autoscaler_persistence = autoscaler_spec.setdefault("persistence", {})
425-
autoscaler_persistence["claimName"] = f"{_autoscaler_vm_name(namespace)}{AUTOSCALER_PVC_SUFFIX}"
425+
autoscaler_persistence["claimName"] = f"{_autoscaler_vm_name(namespace)}{AUTOSCALER_DB_PVC_SUFFIX}"
426426
autoscaler_persistence["size"] = f"{bytes_to_gb(parameters.database_size)}G"
427427
autoscaler_persistence["storageClassName"] = storage_class_name
428428
autoscaler_persistence.setdefault("accessModes", ["ReadWriteMany"])

0 commit comments

Comments
 (0)