Skip to content

Commit 351f12f

Browse files
committed
feat(scripts): Add script for hiding S3 objects that do not appear in latest tag
1 parent fda81e0 commit 351f12f

1 file changed

Lines changed: 110 additions & 0 deletions

File tree

scripts/s3-hide-old-files.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# /// script
2+
# requires-python = ">=3.13"
3+
# dependencies = [
4+
# "attrs>=25.4.0",
5+
# "boto3>=1.42.45",
6+
# "pygit2>=1.19.1",
7+
# "pyyaml>=6.0.3",
8+
# "structlog>=25.5.0",
9+
# "typer>=0.21.1",
10+
# ]
11+
# ///
12+
from __future__ import annotations

import logging
import os
import typing as ty
from pathlib import Path

import attrs
import boto3
import pygit2
import structlog
import typer
import yaml
24+
25+
26+
@attrs.define
class AWSConfig:
    """Credentials and location settings for talking to the S3 bucket."""

    AWS_ACCESS_KEY_ID: str
    AWS_SECRET_ACCESS_KEY: str
    AWS_S3_BUCKET_NAME: str
    AWS_REGION: str

    @classmethod
    def from_dict(cls, data: dict) -> ty.Self:
        """Build a config from a mapping such as ``os.environ`` or parsed YAML.

        The two credential keys are required (a missing one raises
        ``KeyError``); bucket name and region fall back to OpenNeuro
        defaults when absent.
        """
        required = {
            key: data[key]
            for key in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
        }
        return cls(
            AWS_S3_BUCKET_NAME=data.get("AWS_S3_PUBLIC_BUCKET", "openneuro.org"),
            AWS_REGION=data.get("AWS_REGION", "us-east-1"),
            **required,
        )
41+
42+
43+
def load_config(config_path: Path) -> AWSConfig:
    """Load AWS settings from a YAML config file.

    The file is expected to contain a ``secrets.aws`` mapping holding the
    keys consumed by ``AWSConfig.from_dict``.

    Raises:
        ValueError: If the ``secrets.aws`` section or a required key is
            missing (the underlying ``KeyError`` is chained as the cause).
    """
    config_data = yaml.safe_load(Path(config_path).read_text())
    try:
        return AWSConfig.from_dict(config_data["secrets"]["aws"])
    except KeyError as err:
        # Chain the cause so the missing key is visible in the traceback.
        raise ValueError("AWS credentials are missing in the config file.") from err
49+
50+
51+
def load_env_config() -> AWSConfig:
    """Load AWS settings from environment variables.

    Raises:
        ValueError: If a required ``AWS_*`` variable is not set (the
            underlying ``KeyError`` is chained as the cause).
    """
    try:
        return AWSConfig.from_dict(dict(os.environ))
    except KeyError as err:
        # Chain the cause so the missing variable name is visible.
        raise ValueError(
            "AWS credentials are missing from environment variables."
        ) from err
56+
57+
58+
def get_latest_tag(repo: pygit2.Repository) -> pygit2.Reference:
    """Return the reference for the highest-versioned tag in *repo*.

    Tag names are compared component-wise, numerically where possible, so
    ``1.0.10`` ranks above ``1.0.9`` — plain lexicographic ordering (the
    previous behavior) misranks multi-digit version components.

    Raises:
        ValueError: If the repository contains no tags.
    """
    prefix = "refs/tags/"

    def sort_key(ref_name: str) -> tuple:
        # "1.0.10" -> ((0, 1), (0, 0), (0, 10)); the leading discriminator
        # makes non-numeric components compare after numeric ones.
        parts = ref_name[len(prefix):].split(".")
        return tuple((0, int(p)) if p.isdigit() else (1, p) for p in parts)

    tag_names = [name for name in repo.references if name.startswith(prefix)]
    if not tag_names:
        raise ValueError("No tags found in the repository.")
    return repo.references[max(tag_names, key=sort_key)]
64+
65+
66+
def main(
    dataset: ty.Annotated[Path, typer.Argument()] = Path(),
    config: ty.Annotated[Path | None, typer.Option()] = None,
    dry_run: ty.Annotated[bool, typer.Option()] = False,
    debug: ty.Annotated[
        ty.Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], typer.Option()
    ] = "INFO",
) -> None:
    """Hide S3 objects of *dataset* that are absent from its latest git tag.

    Every object under the ``<dataset>/`` prefix whose relative key does not
    appear in the tree of the most recent tag is deleted (on a versioned
    bucket this inserts a delete marker, i.e. "hides" rather than destroys —
    TODO confirm the bucket has versioning enabled).

    Args:
        dataset: Path to a local dataset git repository; its directory name
            (``ds...``) doubles as the S3 key prefix.
        config: Optional YAML config file; falls back to environment
            variables when omitted.
        dry_run: Log what would be hidden without deleting anything.
        debug: Minimum log level name.

    Raises:
        ValueError: If *dataset* is not a directory named ``ds...``.
    """
    # structlog's filtering logger expects a numeric level, not a level
    # name, so translate the CLI string via the stdlib mapping (3.11+).
    min_level = logging.getLevelNamesMapping()[debug]
    structlog.configure(wrapper_class=structlog.make_filtering_bound_logger(min_level))
    logger = structlog.get_logger()

    dataset = dataset.resolve()
    if not dataset.is_dir() or not dataset.name.startswith("ds"):
        raise ValueError(
            "Provided dataset path must be a directory starting with 'ds'."
        )
    repo = pygit2.Repository(dataset)
    tag = get_latest_tag(repo)
    tree = tag.peel().tree
    log = logger.bind(dataset=dataset.name, tag=tag.shorthand)
    log.info("Loaded repository")

    log.debug("Accessing S3 bucket")
    conf = load_config(config) if config else load_env_config()
    s3 = boto3.resource(
        "s3",
        region_name=conf.AWS_REGION,
        aws_access_key_id=conf.AWS_ACCESS_KEY_ID,
        aws_secret_access_key=conf.AWS_SECRET_ACCESS_KEY,
    )
    # Address the bucket directly instead of listing every bucket the
    # credentials can see — the old next(...) scan required the
    # s3:ListAllMyBuckets permission and raised a bare StopIteration
    # when the bucket was not found.
    bucket = s3.Bucket(conf.AWS_S3_BUCKET_NAME)
    prefix = f"{dataset.name}/"

    for obj in bucket.objects.filter(Prefix=prefix):
        fname = obj.key[len(prefix) :]
        if fname not in tree:
            log.info(f"Hiding {fname}")
            if not dry_run:
                obj.delete()
        else:
            log.debug(f"Verified {fname}")
107+
108+
109+
# Entry point: let Typer parse CLI arguments and dispatch to main().
if __name__ == "__main__":
    typer.run(main)

0 commit comments

Comments
 (0)