|
6 | 6 | from collections import OrderedDict |
7 | 7 | from dataclasses import dataclass |
8 | 8 | from pathlib import Path |
9 | | -from typing import Protocol |
10 | | -from urllib.parse import unquote, urlparse |
| 9 | +from typing import Any, Protocol |
| 10 | +from urllib.parse import quote, unquote, urlparse |
11 | 11 |
|
12 | 12 | EXTERNAL_PAYLOAD_REFERENCE_SCHEMA = "durable-workflow.v2.external-payload-reference.v1" |
13 | 13 |
|
@@ -174,6 +174,130 @@ def _path_from_uri(self, uri: str) -> Path: |
174 | 174 | return path |
175 | 175 |
|
176 | 176 |
|
class S3ExternalStorage:
    """External storage driver backed by a boto3-compatible S3 client.

    The SDK does not depend on boto3. Applications that need S3 pass an
    already-configured client exposing ``put_object``, ``get_object``, and
    ``delete_object``.
    """

    def __init__(self, client: Any, *, bucket: str, prefix: str = "") -> None:
        """Bind the driver to ``bucket`` and an optional key ``prefix``.

        Raises:
            ValueError: If ``bucket`` is empty or ``prefix`` fails
                normalization.
        """
        if not bucket:
            raise ValueError("s3 external storage bucket must be non-empty")
        self.client = client
        self.bucket = bucket
        self.prefix = _normalize_object_prefix(prefix)

    def put(self, data: bytes, *, sha256: str, codec: str) -> str:
        """Upload ``data`` and return the ``s3://`` URI of the stored object.

        The object key is derived from the configured prefix, ``codec`` and
        ``sha256``; both values are also attached as object metadata.
        """
        key = _object_key(self.prefix, sha256=sha256, codec=codec)
        self.client.put_object(
            Bucket=self.bucket,
            Key=key,
            Body=data,
            ContentType="application/octet-stream",
            Metadata={"sha256": sha256, "codec": codec},
        )
        return _object_uri("s3", self.bucket, key)

    def get(self, uri: str) -> bytes:
        """Download and return the bytes stored at ``uri``.

        Raises:
            ValueError: If ``uri`` does not pass ``_parse_object_uri`` for
                this bucket/prefix, or the client returns a non-bytes payload.
        """
        bucket, key = _parse_object_uri(
            uri,
            scheme="s3",
            expected_bucket=self.bucket,
            expected_prefix=self.prefix,
        )
        response = self.client.get_object(Bucket=bucket, Key=key)
        body = response["Body"]
        if hasattr(body, "read"):
            try:
                data = body.read()
            finally:
                # Release the underlying stream even when read() fails part
                # way through; bodies without close() are left alone.
                close = getattr(body, "close", None)
                if callable(close):
                    close()
        else:
            # Some boto3-compatible clients hand back the payload directly.
            data = body
        if not isinstance(data, bytes):
            raise ValueError("s3 external storage client returned non-bytes payload")
        return data

    def delete(self, uri: str) -> None:
        """Delete the object stored at ``uri``.

        Raises:
            ValueError: If ``uri`` does not pass ``_parse_object_uri`` for
                this bucket/prefix.
        """
        bucket, key = _parse_object_uri(
            uri,
            scheme="s3",
            expected_bucket=self.bucket,
            expected_prefix=self.prefix,
        )
        self.client.delete_object(Bucket=bucket, Key=key)
| 215 | + |
| 216 | + |
class GCSExternalStorage:
    """External storage driver backed by a google-cloud-storage client.

    The SDK does not depend on google-cloud-storage. Applications pass a
    configured client exposing ``bucket(name).blob(key)``.
    """

    def __init__(self, client: Any, *, bucket: str, prefix: str = "") -> None:
        """Bind the driver to ``bucket`` and an optional key ``prefix``."""
        if not bucket:
            raise ValueError("gcs external storage bucket must be non-empty")
        self.client = client
        self.bucket = bucket
        self.prefix = _normalize_object_prefix(prefix)

    def _locate(self, uri: str) -> tuple[str, str]:
        """Validate ``uri`` against this driver and return ``(bucket, key)``."""
        return _parse_object_uri(
            uri,
            scheme="gs",
            expected_bucket=self.bucket,
            expected_prefix=self.prefix,
        )

    def _blob(self, bucket_name: str, object_key: str) -> Any:
        """Return the client's blob handle for ``object_key`` in ``bucket_name``."""
        return self.client.bucket(bucket_name).blob(object_key)

    def put(self, data: bytes, *, sha256: str, codec: str) -> str:
        """Upload ``data`` and return the ``gs://`` URI of the stored object."""
        object_key = _object_key(self.prefix, sha256=sha256, codec=codec)
        target = self._blob(self.bucket, object_key)
        # Metadata is set on the handle before the upload call.
        target.metadata = {"sha256": sha256, "codec": codec}
        target.upload_from_string(data, content_type="application/octet-stream")
        return _object_uri("gs", self.bucket, object_key)

    def get(self, uri: str) -> bytes:
        """Download and return the bytes stored at ``uri``."""
        bucket_name, object_key = self._locate(uri)
        payload = self._blob(bucket_name, object_key).download_as_bytes()
        if isinstance(payload, bytes):
            return payload
        raise ValueError("gcs external storage client returned non-bytes payload")

    def delete(self, uri: str) -> None:
        """Delete the object stored at ``uri``."""
        bucket_name, object_key = self._locate(uri)
        self._blob(bucket_name, object_key).delete()
| 248 | + |
| 249 | + |
class AzureBlobExternalStorage:
    """External storage driver backed by an azure-storage-blob container client.

    The SDK does not depend on azure-storage-blob. Applications pass a
    configured container client exposing ``upload_blob``, ``download_blob``,
    and ``delete_blob``.
    """

    def __init__(self, container_client: Any, *, container: str, prefix: str = "") -> None:
        """Bind the driver to ``container`` and an optional key ``prefix``.

        Raises:
            ValueError: If ``container`` is empty or ``prefix`` fails
                normalization.
        """
        if not container:
            raise ValueError("azure external storage container must be non-empty")
        self.container_client = container_client
        self.container = container
        self.prefix = _normalize_object_prefix(prefix)

    def _blob_name(self, uri: str) -> str:
        """Validate ``uri`` against this driver and return its blob name.

        All URI checks happen here, before any call on the container client,
        so a rejected URI never triggers a download or delete.

        Raises:
            ValueError: If ``uri`` fails ``_parse_object_uri`` or names a
                different container.
        """
        container, key = _parse_object_uri(
            uri,
            scheme="azure-blob",
            expected_bucket=self.container,
            expected_prefix=self.prefix,
        )
        # The parser already compares the container; this guard is kept so
        # the driver stays safe if that contract ever changes.
        if container != self.container:
            raise ValueError("azure external storage URI uses a different container")
        return key

    def put(self, data: bytes, *, sha256: str, codec: str) -> str:
        """Upload ``data`` and return the ``azure-blob://`` URI of the blob.

        An existing blob with the same name is overwritten; ``sha256`` and
        ``codec`` are attached as blob metadata.
        """
        key = _object_key(self.prefix, sha256=sha256, codec=codec)
        self.container_client.upload_blob(
            name=key,
            data=data,
            overwrite=True,
            metadata={"sha256": sha256, "codec": codec},
        )
        return _object_uri("azure-blob", self.container, key)

    def get(self, uri: str) -> bytes:
        """Download and return the bytes stored at ``uri``.

        Raises:
            ValueError: If ``uri`` is rejected, or the client returns a
                non-bytes payload.
        """
        key = self._blob_name(uri)
        data = self.container_client.download_blob(key).readall()
        if not isinstance(data, bytes):
            raise ValueError("azure external storage client returned non-bytes payload")
        return data

    def delete(self, uri: str) -> None:
        """Delete the blob stored at ``uri``.

        Raises:
            ValueError: If ``uri`` is rejected.
        """
        self.container_client.delete_blob(self._blob_name(uri))
| 299 | + |
| 300 | + |
177 | 301 | def store_external_payload( |
178 | 302 | driver: ExternalStorageDriver, |
179 | 303 | data: bytes, |
@@ -230,3 +354,46 @@ def _safe_codec_segment(codec: str) -> str: |
230 | 354 | if not all(char.isalnum() or char in {"-", "_", "."} for char in codec): |
231 | 355 | raise ValueError("codec contains characters that are unsafe for local storage paths") |
232 | 356 | return codec |
| 357 | + |
| 358 | + |
| 359 | +def _normalize_object_prefix(prefix: str) -> str: |
| 360 | + cleaned = prefix.strip("/") |
| 361 | + if not cleaned: |
| 362 | + return "" |
| 363 | + parts = cleaned.split("/") |
| 364 | + if any(part in {"", ".", ".."} for part in parts): |
| 365 | + raise ValueError("external storage prefix contains an unsafe path segment") |
| 366 | + return "/".join(quote(part, safe="-_.~") for part in parts) |
| 367 | + |
| 368 | + |
def _object_key(prefix: str, *, sha256: str, codec: str) -> str:
    """Build the object key ``[prefix/]codec/<first two hash chars>/<hash>``.

    ``sha256`` is checked by ``_validate_sha256`` and ``codec`` by
    ``_safe_codec_segment`` before the key is assembled.
    """
    _validate_sha256(sha256)
    checked_codec = _safe_codec_segment(codec)
    encoded_codec = quote(checked_codec, safe="-_.~")
    shard = sha256[:2]
    relative_key = "{}/{}/{}".format(encoded_codec, shard, sha256)
    if prefix:
        return "{}/{}".format(prefix, relative_key)
    return relative_key
| 374 | + |
| 375 | + |
| 376 | +def _object_uri(scheme: str, bucket: str, key: str) -> str: |
| 377 | + return f"{scheme}://{bucket}/{key}" |
| 378 | + |
| 379 | + |
| 380 | +def _parse_object_uri( |
| 381 | + uri: str, |
| 382 | + *, |
| 383 | + scheme: str, |
| 384 | + expected_bucket: str, |
| 385 | + expected_prefix: str, |
| 386 | +) -> tuple[str, str]: |
| 387 | + parsed = urlparse(uri) |
| 388 | + if parsed.scheme != scheme or parsed.netloc != expected_bucket: |
| 389 | + raise ValueError(f"{scheme} external storage URI uses a different bucket or container") |
| 390 | + |
| 391 | + key = parsed.path.lstrip("/") |
| 392 | + if not key: |
| 393 | + raise ValueError(f"{scheme} external storage URI must include an object key") |
| 394 | + parts = key.split("/") |
| 395 | + if any(part in {"", ".", ".."} for part in parts): |
| 396 | + raise ValueError(f"{scheme} external storage URI contains an unsafe object key") |
| 397 | + if expected_prefix and not (key == expected_prefix or key.startswith(f"{expected_prefix}/")): |
| 398 | + raise ValueError(f"{scheme} external storage URI is outside the configured prefix") |
| 399 | + return parsed.netloc, key |
0 commit comments