-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathdata_upload.py
More file actions
104 lines (92 loc) · 2.87 KB
/
Copy pathdata_upload.py
File metadata and controls
104 lines (92 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from typing import List
from huggingface_hub import HfApi, CommitOperationAdd
from huggingface_hub.errors import RevisionNotFoundError
from google.cloud import storage
from pathlib import Path
from importlib import metadata
import google.auth
import logging
def upload_data_files(
    files: List[str],
    gcs_bucket_name: str = "policyengine-uk-data-private",
    hf_repo_name: str = "policyengine/policyengine-uk-data",
    hf_repo_type: str = "model",
    version: str = None,
):
    """
    Publish data files to Hugging Face first, then to Google Cloud Storage.

    Args:
        files: Local paths of the files to publish.
        gcs_bucket_name: Destination Google Cloud Storage bucket.
        hf_repo_name: Destination Hugging Face repository id.
        hf_repo_type: Hugging Face repository type.
        version: Version label passed to both uploaders. When ``None``, the
            installed version of the ``policyengine-uk-data`` distribution
            is looked up and used instead.
    """
    # Fall back to the installed package version only when none was given.
    release = (
        metadata.version("policyengine-uk-data")
        if version is None
        else version
    )

    hf_arguments = {
        "files": files,
        "version": release,
        "hf_repo_name": hf_repo_name,
        "hf_repo_type": hf_repo_type,
    }
    gcs_arguments = {
        "files": files,
        "version": release,
        "gcs_bucket_name": gcs_bucket_name,
    }

    # Order matters: the Hugging Face upload runs before the GCS upload.
    upload_files_to_hf(**hf_arguments)
    upload_files_to_gcs(**gcs_arguments)
def upload_files_to_hf(
    files: List[str],
    version: str,
    hf_repo_name: str = "policyengine/policyengine-uk-data-private",
    hf_repo_type: str = "model",
):
    """
    Commit the given files to a Hugging Face repository and tag that commit.

    All files go into a single commit; each one is stored in the repository
    under its base file name. The resulting commit is then tagged with
    ``version``.

    Args:
        files: Local paths of the files to upload.
        version: Used in the commit message and as the tag name.
        hf_repo_name: Hugging Face repository id to commit to.
        hf_repo_type: Hugging Face repository type.

    Raises:
        ValueError: If a path in ``files`` does not exist.
    """

    def _as_add_operation(raw_path):
        # Check and build one file at a time so a bad path aborts the
        # upload before any commit is attempted.
        local_file = Path(raw_path)
        if local_file.exists():
            return CommitOperationAdd(
                path_in_repo=local_file.name,
                path_or_fileobj=str(local_file),
            )
        raise ValueError(f"File {local_file} does not exist.")

    client = HfApi()
    additions = [_as_add_operation(entry) for entry in files]

    # Both API calls address the same repository.
    target_repo = {"repo_id": hf_repo_name, "repo_type": hf_repo_type}

    pushed = client.create_commit(
        operations=additions,
        commit_message=f"Upload data files for version {version}",
        **target_repo,
    )
    logging.info(f"Uploaded files to Hugging Face repository {hf_repo_name}.")

    # Pin the tag to the commit that was just created.
    client.create_tag(tag=version, revision=pushed.oid, **target_repo)
    logging.info(
        f"Tagged commit with {version} in Hugging Face repository {hf_repo_name}."
    )
def upload_files_to_gcs(
    files: List[str],
    version: str,
    gcs_bucket_name: str = "policyengine-uk-data-private",
):
    """
    Upload files to Google Cloud Storage and set metadata with the version.

    Each file is stored in the bucket under its base file name, with custom
    metadata ``{"version": version}``.

    Args:
        files: Local paths of the files to upload.
        version: Version string recorded in each blob's custom metadata.
        gcs_bucket_name: Name of the destination bucket.

    Raises:
        FileNotFoundError: If any path in ``files`` does not exist. Every
            path is checked before credentials are requested or anything
            is uploaded.
    """
    local_paths = [Path(file_path) for file_path in files]

    # Fail fast: a bad path must not leave the bucket holding only some of
    # the files, so all paths are checked before the first upload.
    missing = [str(path) for path in local_paths if not path.exists()]
    if missing:
        raise FileNotFoundError(
            f"File(s) do not exist: {', '.join(missing)}"
        )

    credentials, project_id = google.auth.default()
    storage_client = storage.Client(
        credentials=credentials, project=project_id
    )
    bucket = storage_client.bucket(gcs_bucket_name)

    for path in local_paths:
        blob = bucket.blob(path.name)
        # Metadata assigned before the upload is sent as part of the upload
        # request, so the object never exists without its version metadata
        # and no separate patch round-trip is needed.
        blob.metadata = {"version": version}
        blob.upload_from_filename(str(path))
        logging.info(
            f"Uploaded {path.name} to GCS bucket {gcs_bucket_name}."
        )
        logging.info(
            f"Set metadata for {path.name} in GCS bucket {gcs_bucket_name}."
        )