Skip to content

Commit f26dff6

Browse files
committed
standarize snapshot functions
1 parent ca8efee commit f26dff6

8 files changed

Lines changed: 120 additions & 72 deletions

File tree

src/_util/backup_config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
import os
2+
3+
SNAPSHOT_TIMEOUT_SEC = int(os.environ.get("SNAPSHOT_TIMEOUT_SEC", "120"))
4+
SNAPSHOT_POLL_INTERVAL_SEC = int(os.environ.get("SNAPSHOT_POLL_INTERVAL_SEC", "5"))
5+
VOLUME_SNAPSHOT_CLASS = os.environ.get("VOLUME_SNAPSHOT_CLASS", "simplyblock-csi-snapshotclass")

src/api/backup.py

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,7 @@
3333
from ..models.project import Project
3434
from ._util.backups import _remove_existing_schedule, _validate_project_retention_budget
3535
from .auth import authenticated_user
36-
from .backup_snapshots import (
37-
SNAPSHOT_POLL_INTERVAL_SEC,
38-
create_branch_snapshot,
39-
delete_branch_snapshot,
40-
)
36+
from .backup_snapshots import build_snapshot_metadata, create_branch_db_snapshot, delete_snapshot
4137
from .db import SessionDep
4238
from .dependencies import OrganizationDep
4339

@@ -46,7 +42,6 @@
4642
# ---------------------------
4743
# Constants
4844
# ---------------------------
49-
VOLUME_SNAPSHOT_CLASS = os.environ.get("VOLUME_SNAPSHOT_CLASS", "simplyblock-csi-snapshotclass")
5045
MANUAL_BACKUP_TIMEOUT_SEC = int(os.environ.get("MANUAL_BACKUP_TIMEOUT_SEC", "10"))
5146

5247
UNIT_MULTIPLIER = {
@@ -541,13 +536,10 @@ async def manual_backup(session: SessionDep, branch_id: Identifier) -> BackupCre
541536
recorded_at = datetime.now(UTC)
542537

543538
try:
544-
snapshot = await create_branch_snapshot(
539+
snapshot = await create_branch_db_snapshot(
545540
branch.id,
546541
backup_id=backup_id,
547-
snapshot_class=VOLUME_SNAPSHOT_CLASS,
548-
poll_interval=SNAPSHOT_POLL_INTERVAL_SEC,
549542
label="manual",
550-
time_limit=MANUAL_BACKUP_TIMEOUT_SEC,
551543
)
552544
except Exception as exc:
553545
logger.exception("Manual backup failed for branch %s within timeout", branch.id)
@@ -588,15 +580,13 @@ async def delete_backup(session: SessionDep, backup_id: Identifier) -> BackupDel
588580
if not backup:
589581
raise HTTPException(status_code=404, detail="Backup not found")
590582

591-
try:
592-
await delete_branch_snapshot(
593-
name=backup.snapshot_name,
594-
namespace=backup.snapshot_namespace,
595-
content_name=backup.snapshot_content_name,
596-
)
597-
except Exception as exc:
598-
logger.exception("Failed to delete snapshot for backup %s", backup_id)
599-
raise HTTPException(status_code=500, detail="Failed to delete backup snapshot") from exc
583+
metadata = build_snapshot_metadata(backup)
584+
if metadata is not None:
585+
try:
586+
await delete_snapshot(metadata)
587+
except Exception as exc:
588+
logger.exception("Failed to delete snapshot for backup %s", backup_id)
589+
raise HTTPException(status_code=500, detail="Failed to delete backup snapshot") from exc
600590

601591
await session.delete(backup)
602592

src/api/backup_snapshots.py

Lines changed: 81 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,18 @@
77
from dataclasses import dataclass
88
from typing import TYPE_CHECKING
99

10+
from pydantic import BaseModel, Field
11+
1012
from .._util import Identifier, quantity_to_bytes
11-
from ..deployment import AUTOSCALER_PVC_SUFFIX, get_autoscaler_vm_identity
13+
from .._util.backup_config import (
14+
SNAPSHOT_POLL_INTERVAL_SEC,
15+
SNAPSHOT_TIMEOUT_SEC,
16+
VOLUME_SNAPSHOT_CLASS,
17+
)
18+
from ..deployment import (
19+
AUTOSCALER_DB_PVC_SUFFIX,
20+
get_autoscaler_vm_identity,
21+
)
1222
from ..deployment.kubernetes.snapshot import (
1323
create_snapshot_from_pvc,
1424
ensure_snapshot_absent,
@@ -18,14 +28,44 @@
1828
)
1929

2030
if TYPE_CHECKING:
31+
2132
from ulid import ULID
2233

34+
from ..models.backups import BackupEntry
35+
2336
logger = logging.getLogger(__name__)
2437

2538
SNAPSHOT_TIMEOUT_SEC = int(os.environ.get("SNAPSHOT_TIMEOUT_SEC", "120"))
2639
SNAPSHOT_POLL_INTERVAL_SEC = int(os.environ.get("SNAPSHOT_POLL_INTERVAL_SEC", "5"))
2740

2841
_K8S_NAME_MAX_LENGTH = 63
42+
DEFAULT_SNAPSHOT_TIMEOUT_SEC = float(SNAPSHOT_TIMEOUT_SEC)
43+
DEFAULT_SNAPSHOT_POLL_INTERVAL_SEC = float(SNAPSHOT_POLL_INTERVAL_SEC)
44+
45+
46+
class SnapshotMetadata(BaseModel):
47+
name: str = Field(..., min_length=1)
48+
namespace: str = Field(..., min_length=1)
49+
# content_name stays optional because there are runtime scenarios where the
50+
# VolumeSnapshotContent hasn’t been bound yet
51+
content_name: str | None
52+
53+
54+
def build_snapshot_metadata(backup: BackupEntry) -> SnapshotMetadata | None:
55+
name = backup.snapshot_name
56+
namespace = backup.snapshot_namespace
57+
if not name or not namespace:
58+
logger.debug(
59+
"Skipping metadata for missing snapshot identifiers (name=%r namespace=%r)",
60+
name,
61+
namespace,
62+
)
63+
return None
64+
return SnapshotMetadata(
65+
name=name,
66+
namespace=namespace,
67+
content_name=backup.snapshot_content_name,
68+
)
2969

3070

3171
@dataclass(frozen=True)
@@ -58,20 +98,18 @@ def _build_snapshot_name(*, label: str, backup_id: ULID) -> str:
5898
return f"{label_component}{separator}{backup_component}"
5999

60100

61-
async def create_branch_snapshot(
62-
branch_id: Identifier,
101+
async def _create_snapshot_from_pvc(
63102
*,
103+
namespace: str,
104+
pvc_name: str,
64105
backup_id: ULID,
65106
snapshot_class: str,
66-
poll_interval: float,
67107
label: str,
108+
poll_interval: float,
68109
time_limit: float,
69110
) -> SnapshotDetails:
70-
namespace, autoscaler_vm_name = get_autoscaler_vm_identity(branch_id)
71-
pvc_name = f"{autoscaler_vm_name}{AUTOSCALER_PVC_SUFFIX}"
72111
snapshot_name = _build_snapshot_name(label=label, backup_id=backup_id)
73-
74-
logger.info("Creating VolumeSnapshot %s/%s for branch %s", namespace, snapshot_name, branch_id)
112+
logger.info("Creating VolumeSnapshot %s/%s for branch PVC %s", namespace, snapshot_name, pvc_name)
75113
try:
76114
async with asyncio.timeout(time_limit):
77115
await create_snapshot_from_pvc(
@@ -88,10 +126,10 @@ async def create_branch_snapshot(
88126
)
89127
except TimeoutError:
90128
logger.exception(
91-
"Timed out creating VolumeSnapshot %s/%s for branch %s within %s seconds",
129+
"Timed out creating VolumeSnapshot %s/%s for PVC %s within %s seconds",
92130
namespace,
93131
snapshot_name,
94-
branch_id,
132+
pvc_name,
95133
time_limit,
96134
)
97135
raise
@@ -115,29 +153,43 @@ async def create_branch_snapshot(
115153
)
116154

117155

118-
async def delete_branch_snapshot(
156+
async def create_branch_db_snapshot(
157+
branch_id: Identifier,
119158
*,
120-
name: str | None,
121-
namespace: str | None,
122-
content_name: str | None,
123-
time_limit: float = SNAPSHOT_TIMEOUT_SEC,
124-
poll_interval: float = SNAPSHOT_POLL_INTERVAL_SEC,
125-
) -> None:
126-
if not name or not namespace:
127-
logger.debug(
128-
"Skipping deletion for VolumeSnapshot with missing metadata (name=%s namespace=%s)",
129-
name,
130-
namespace,
131-
)
132-
return
159+
backup_id: ULID,
160+
snapshot_class: str = VOLUME_SNAPSHOT_CLASS,
161+
poll_interval: float = DEFAULT_SNAPSHOT_POLL_INTERVAL_SEC,
162+
label: str,
163+
time_limit: float = DEFAULT_SNAPSHOT_TIMEOUT_SEC,
164+
) -> SnapshotDetails:
165+
namespace, autoscaler_vm_name = get_autoscaler_vm_identity(branch_id)
166+
pvc_name = f"{autoscaler_vm_name}{AUTOSCALER_DB_PVC_SUFFIX}"
167+
return await _create_snapshot_from_pvc(
168+
namespace=namespace,
169+
pvc_name=pvc_name,
170+
backup_id=backup_id,
171+
snapshot_class=snapshot_class,
172+
poll_interval=poll_interval,
173+
label=label,
174+
time_limit=time_limit,
175+
)
176+
133177

134-
derived_content_name = content_name
178+
async def delete_snapshot(
179+
metadata: SnapshotMetadata,
180+
*,
181+
time_limit: float = DEFAULT_SNAPSHOT_TIMEOUT_SEC,
182+
poll_interval: float = DEFAULT_SNAPSHOT_POLL_INTERVAL_SEC,
183+
) -> None:
184+
name = metadata.name
185+
namespace = metadata.namespace
186+
content_name = metadata.content_name
135187
try:
136188
async with asyncio.timeout(time_limit):
137189
snapshot = await read_snapshot(namespace, name)
138190
if snapshot is not None:
139191
status = snapshot.get("status") or {}
140-
derived_content_name = derived_content_name or status.get("boundVolumeSnapshotContentName")
192+
content_name = content_name or status.get("boundVolumeSnapshotContentName")
141193
logger.info("Deleting VolumeSnapshot %s/%s", namespace, name)
142194
await ensure_snapshot_absent(
143195
namespace,
@@ -148,10 +200,10 @@ async def delete_branch_snapshot(
148200
else:
149201
logger.info("VolumeSnapshot %s/%s already absent", namespace, name)
150202

151-
if derived_content_name:
152-
logger.info("Ensuring VolumeSnapshotContent %s is absent", derived_content_name)
203+
if content_name:
204+
logger.info("Ensuring VolumeSnapshotContent %s is absent", content_name)
153205
await ensure_snapshot_content_absent(
154-
derived_content_name,
206+
content_name,
155207
timeout=time_limit,
156208
poll_interval=poll_interval,
157209
)

src/api/backupmonitor.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from sqlmodel import SQLModel, asc, delete, select
1111
from ulid import ULID
1212

13+
from .._util.backup_config import SNAPSHOT_POLL_INTERVAL_SEC, SNAPSHOT_TIMEOUT_SEC, VOLUME_SNAPSHOT_CLASS
1314
from ..models.backups import (
1415
BackupEntry,
1516
BackupLog,
@@ -21,18 +22,16 @@
2122
from ..models.organization import Organization
2223
from ..models.project import Project
2324
from .backup_snapshots import (
24-
SNAPSHOT_POLL_INTERVAL_SEC,
25-
SNAPSHOT_TIMEOUT_SEC,
26-
create_branch_snapshot,
27-
delete_branch_snapshot,
25+
build_snapshot_metadata,
26+
create_branch_db_snapshot,
27+
delete_snapshot,
2828
)
2929
from .organization.project.branch import refresh_branch_status
3030
from .settings import get_settings
3131

3232
# ---------------------------
3333
# Config
3434
# ---------------------------
35-
VOLUME_SNAPSHOT_CLASS = os.environ.get("VOLUME_SNAPSHOT_CLASS", "csi-snapshot-class")
3635
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "60"))
3736

3837
logging.basicConfig(
@@ -209,11 +208,15 @@ async def _delete_many(
209208

210209
deleted_ids: list[ULID] = []
211210
for backup in backups:
211+
metadata = build_snapshot_metadata(backup)
212+
if metadata is None:
213+
logger.warning("Skipping snapshot deletion for backup %s because metadata was incomplete", backup.id)
214+
continue
212215
try:
213-
await delete_branch_snapshot(
214-
name=backup.snapshot_name,
215-
namespace=backup.snapshot_namespace,
216-
content_name=backup.snapshot_content_name,
216+
await delete_snapshot(
217+
metadata,
218+
time_limit=SNAPSHOT_TIMEOUT_SEC,
219+
poll_interval=SNAPSHOT_POLL_INTERVAL_SEC,
217220
)
218221
except Exception:
219222
context = {
@@ -308,13 +311,11 @@ async def execute_backup(self, db: AsyncSession, branch: Branch, row: BackupSche
308311
backup_id = ULID()
309312

310313
try:
311-
snapshot = await create_branch_snapshot(
314+
snapshot = await create_branch_db_snapshot(
312315
branch.id,
313316
backup_id=backup_id,
314317
snapshot_class=VOLUME_SNAPSHOT_CLASS,
315-
poll_interval=SNAPSHOT_POLL_INTERVAL_SEC,
316318
label=f"row-{row.row_index}",
317-
time_limit=SNAPSHOT_TIMEOUT_SEC,
318319
)
319320
except Exception:
320321
nb.next_at = next_due

src/api/organization/project/branch/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from ....._util import DEFAULT_DB_NAME, DEFAULT_DB_USER, Identifier, bytes_to_gb
2323
from ....._util.crypto import encrypt_with_passphrase, generate_keys
2424
from .....deployment import (
25-
AUTOSCALER_PVC_SUFFIX,
25+
AUTOSCALER_DB_PVC_SUFFIX,
2626
STORAGE_PVC_SUFFIX,
2727
DeploymentParameters,
2828
ResizeParameters,
@@ -750,7 +750,7 @@ async def _apply_resize_operations(
750750
namespace, autoscaler_vm_name = get_autoscaler_vm_identity(branch.id)
751751
if "database_size" in effective_parameters:
752752
new_database_size = effective_parameters["database_size"]
753-
pvc_name = f"{autoscaler_vm_name}{AUTOSCALER_PVC_SUFFIX}"
753+
pvc_name = f"{autoscaler_vm_name}{AUTOSCALER_DB_PVC_SUFFIX}"
754754
storage_size_gb = f"{bytes_to_gb(new_database_size)}G"
755755
await kube_service.resize_pvc_storage(namespace, pvc_name, storage_size_gb)
756756

src/deployment/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
SIMPLYBLOCK_CSI_SECRET = "simplyblock-csi-secret"
7474
STORAGE_PVC_SUFFIX = "-db-storage-pvc"
7575
DATABASE_PVC_SUFFIX = "-db-pvc"
76-
AUTOSCALER_PVC_SUFFIX = "-block-data"
76+
AUTOSCALER_DB_PVC_SUFFIX = "-block-data"
7777
_LOAD_BALANCER_TIMEOUT_SECONDS = float(600)
7878
_LOAD_BALANCER_POLL_INTERVAL_SECONDS = float(2)
7979
DNSRecordType = Literal["AAAA", "CNAME"]
@@ -289,7 +289,7 @@ async def resolve_storage_volume_identifiers(namespace: str) -> tuple[str, str |
289289

290290

291291
async def resolve_autoscaler_volume_identifiers(namespace: str) -> tuple[str, str | None]:
292-
pvc_name = f"{_autoscaler_vm_name(namespace)}{AUTOSCALER_PVC_SUFFIX}"
292+
pvc_name = f"{_autoscaler_vm_name(namespace)}{AUTOSCALER_DB_PVC_SUFFIX}"
293293
return await _resolve_volume_identifiers(namespace, pvc_name)
294294

295295

@@ -422,7 +422,7 @@ def _configure_vela_values(
422422
autoscaler_resources["memorySlots"] = memory_slots
423423

424424
autoscaler_persistence = autoscaler_spec.setdefault("persistence", {})
425-
autoscaler_persistence["claimName"] = f"{_autoscaler_vm_name(namespace)}{AUTOSCALER_PVC_SUFFIX}"
425+
autoscaler_persistence["claimName"] = f"{_autoscaler_vm_name(namespace)}{AUTOSCALER_DB_PVC_SUFFIX}"
426426
autoscaler_persistence["size"] = f"{bytes_to_gb(parameters.database_size)}G"
427427
autoscaler_persistence["storageClassName"] = storage_class_name
428428
autoscaler_persistence.setdefault("accessModes", ["ReadWriteMany"])

src/deployment/kubernetes/volume_clone.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from ..._util import Identifier
88
from ...exceptions import VelaKubernetesError
9-
from .. import AUTOSCALER_PVC_SUFFIX, get_autoscaler_vm_identity, kube_service
9+
from .. import AUTOSCALER_DB_PVC_SUFFIX, get_autoscaler_vm_identity, kube_service
1010
from ..settings import get_settings
1111
from .pvc import (
1212
build_pvc_manifest_from_existing,
@@ -112,8 +112,8 @@ class _VolumeCloneOperation:
112112
def __post_init__(self) -> None:
113113
source_ns, source_vm_name = get_autoscaler_vm_identity(self.source_branch_id)
114114
target_ns, target_vm_name = get_autoscaler_vm_identity(self.target_branch_id)
115-
pvc_name = f"{source_vm_name}{AUTOSCALER_PVC_SUFFIX}"
116-
target_pvc_name = f"{target_vm_name}{AUTOSCALER_PVC_SUFFIX}"
115+
pvc_name = f"{source_vm_name}{AUTOSCALER_DB_PVC_SUFFIX}"
116+
target_pvc_name = f"{target_vm_name}{AUTOSCALER_DB_PVC_SUFFIX}"
117117
if target_pvc_name != pvc_name:
118118
raise VelaKubernetesError(
119119
f"Autoscaler PVC name mismatch between source ({pvc_name}) and target ({target_pvc_name})"
@@ -314,8 +314,8 @@ def __post_init__(self) -> None:
314314
source_ns = self.snapshot_namespace
315315
_, source_vm_name = get_autoscaler_vm_identity(self.source_branch_id)
316316
target_ns, target_vm_name = get_autoscaler_vm_identity(self.target_branch_id)
317-
pvc_name = f"{source_vm_name}{AUTOSCALER_PVC_SUFFIX}"
318-
target_pvc_name = f"{target_vm_name}{AUTOSCALER_PVC_SUFFIX}"
317+
pvc_name = f"{source_vm_name}{AUTOSCALER_DB_PVC_SUFFIX}"
318+
target_pvc_name = f"{target_vm_name}{AUTOSCALER_DB_PVC_SUFFIX}"
319319
if target_pvc_name != pvc_name:
320320
raise VelaKubernetesError(
321321
f"Autoscaler PVC name mismatch between source ({pvc_name}) and target ({target_pvc_name})"

0 commit comments

Comments
 (0)