Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .github/workflows/changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Fails pull requests targeting main unless a non-empty
# changelog_entry.yaml exists at the repository root.
name: Versioning

on:
  pull_request:
    branches: [ main ]

jobs:
  check-changelog-entry:
    name: Changelog entry check
    runs-on: ubuntu-latest
    steps:
      # checkout@v3 runs on the deprecated Node 16 runtime; v4 is the
      # currently supported major release.
      - uses: actions/checkout@v4

      - name: Check for changelog entry
        run: |
          # Check that the file is present at all
          if [ ! -f "changelog_entry.yaml" ]; then
            echo "Error: changelog_entry.yaml file is missing."
            echo "Please add a changelog_entry.yaml file at the root of the repository."
            exit 1
          fi

          # Check if the file is empty
          if [ ! -s "changelog_entry.yaml" ]; then
            echo "Error: changelog_entry.yaml file is empty."
            echo "Please add content to the changelog_entry.yaml file."
            exit 1
          fi

          echo "Changelog entry found and is not empty."
23 changes: 6 additions & 17 deletions .github/workflows/push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ on:
push:
branches:
- main
paths:
- pyproject.toml

jobs:
lint:
Expand Down Expand Up @@ -70,27 +72,14 @@ jobs:
with:
branch: gh-pages
folder: docs/_build/html
publish-to-pypi:
name: Publish to PyPI
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0 # Fetch all history for all tags and branches
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: 3.12
- name: Install package
run: make install
- name: Build package
run: python -m build
- name: Publish a git tag
run: ".github/publish-git-tag.sh || true"
- name: Publish to PyPI
- name: Remove .whl files
run: rm dist/*.whl
- name: Publish a Python distribution to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
user: __token__
password: ${{ secrets.PYPI }}
skip-existing: true
verbose: true
37 changes: 37 additions & 0 deletions .github/workflows/versioning.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Runs when a push to main touches changelog_entry.yaml: rebuilds the
# changelog and commits the result back to the repository.

name: Versioning updates

on:
  push:
    branches: [main]
    paths:
      - changelog_entry.yaml
      - "!pyproject.toml"

jobs:
  Versioning:
    runs-on: ubuntu-latest
    # Skip pushes whose head commit is the one this workflow creates
    # (see the "Update changelog" step's commit message below).
    if: github.event.head_commit.message != 'Update package version'
    steps:
      - name: Checkout repo
        uses: actions/checkout@v4
        with:
          # NOTE(review): this workflow is push-triggered, so the
          # pull_request context below is presumably empty and the action
          # falls back to its defaults - confirm before relying on it.
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          ref: ${{ github.event.pull_request.head.ref }}
          token: ${{ secrets.POLICYENGINE_GITHUB }}
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: 3.12
      - name: Build changelog
        run: pip install yaml-changelog && make changelog
      - name: Preview changelog update
        run: ".github/get-changelog-diff.sh"
      - name: Update changelog
        uses: EndBug/add-and-commit@v9
        with:
          add: "."
          message: Update package version
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@
!incomes_projection.csv
!policyengine_uk_data/datasets/frs/local_areas/**/*.csv
**/_build
!policyengine_uk_data/storage/*.csv
!policyengine_uk_data/storage/*.csv
**/version.json
68 changes: 14 additions & 54 deletions policyengine_uk_data/storage/upload_completed_datasets.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,25 @@
from policyengine_uk_data.datasets import EnhancedFRS_2022_23, FRS_2022_23
from policyengine_uk_data.storage import STORAGE_FOLDER
from policyengine_uk_data.utils.huggingface import upload
from google.cloud import storage
import google.auth
from policyengine_uk_data.utils.data_upload import upload_data_files


def upload_datasets():
credentials, project_id = google.auth.default()
storage_client = storage.Client(
credentials=credentials, project=project_id
)
bucket = storage_client.bucket("policyengine-uk-data-private")
for dataset in [FRS_2022_23, EnhancedFRS_2022_23]:
dataset = dataset()
if not dataset.exists:
raise ValueError(
f"Dataset {dataset.name} does not exist at {dataset.file_path}."
)

upload(
dataset.file_path,
"policyengine/policyengine-uk-data",
dataset.file_path.name,
)
blob = dataset.file_path.name
blob = bucket.blob(blob)
blob.upload_from_filename(dataset.file_path)
print(
f"Uploaded {dataset.file_path.name} to GCS bucket policyengine-uk-data-private."
)

# Constituency weights:

upload(
dataset_files = [
FRS_2022_23.file_path,
EnhancedFRS_2022_23.file_path,
STORAGE_FOLDER / "parliamentary_constituency_weights.h5",
"policyengine/policyengine-uk-data",
"parliamentary_constituency_weights.h5",
)

blob = "parliamentary_constituency_weights.h5"
blob = bucket.blob(blob)
blob.upload_from_filename(
STORAGE_FOLDER / "parliamentary_constituency_weights.h5"
)
print(
f"Uploaded parliamentary_constituency_weights.h5 to GCS bucket policyengine-uk-data-private."
)

# Local authority weights:

upload(
STORAGE_FOLDER / "local_authority_weights.h5",
"policyengine/policyengine-uk-data",
"local_authority_weights.h5",
)
]

for file_path in dataset_files:
if not file_path.exists():
raise ValueError(f"File {file_path} does not exist.")

blob = "local_authority_weights.h5"
blob = bucket.blob(blob)
blob.upload_from_filename(STORAGE_FOLDER / "local_authority_weights.h5")
print(
f"Uploaded local_authority_weights.h5 to GCS bucket policyengine-uk-data-private."
upload_data_files(
files=dataset_files,
hf_repo_name="policyengine-uk-data/policyengine-uk-data-private",
Comment thread
nikhilwoodruff marked this conversation as resolved.
hf_repo_type="model",
gcs_bucket_name="policyengine-uk-data-private",
)


Expand Down
104 changes: 104 additions & 0 deletions policyengine_uk_data/utils/data_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from typing import List
from huggingface_hub import HfApi, CommitOperationAdd
from huggingface_hub.errors import RevisionNotFoundError
from google.cloud import storage
from pathlib import Path
from importlib import metadata
import google.auth
import logging


def upload_data_files(
    files: List[str],
    gcs_bucket_name: str = "policyengine-uk-data-private",
    hf_repo_name: str = "policyengine/policyengine-uk-data",
    hf_repo_type: str = "model",
    version: str = None,
):
    """
    Publish data files to Hugging Face and then to Google Cloud Storage.

    Both uploads receive the same file list and the same version string.

    Args:
        files: Paths of the files to upload.
        gcs_bucket_name: Name of the destination GCS bucket.
        hf_repo_name: ID of the destination Hugging Face repository.
        hf_repo_type: Hugging Face repository type, forwarded unchanged.
        version: Version label for this upload. When None, the installed
            version of the "policyengine-uk-data" distribution is used.
    """
    # Fall back to the installed package version when none was supplied.
    resolved_version = (
        metadata.version("policyengine-uk-data")
        if version is None
        else version
    )

    # Arguments shared by both destinations.
    common = {"files": files, "version": resolved_version}

    upload_files_to_hf(
        hf_repo_name=hf_repo_name,
        hf_repo_type=hf_repo_type,
        **common,
    )
    upload_files_to_gcs(gcs_bucket_name=gcs_bucket_name, **common)


def upload_files_to_hf(
    files: List[str],
    version: str,
    hf_repo_name: str = "policyengine/policyengine-uk-data-private",
    hf_repo_type: str = "model",
):
    """
    Commit files to a Hugging Face repository and tag the commit.

    Each file is added under its base name, all files go into one commit,
    and that commit is then tagged with ``version``.

    Args:
        files: Paths of the files to upload.
        version: Version string used in the commit message and as the tag.
        hf_repo_name: ID of the destination Hugging Face repository.
        hf_repo_type: Repository type forwarded to the Hugging Face API.

    Raises:
        ValueError: If a path in ``files`` does not exist.
    """
    hub = HfApi()
    additions = []

    for entry in files:
        local_path = Path(entry)
        if not local_path.exists():
            raise ValueError(f"File {local_path} does not exist.")
        addition = CommitOperationAdd(
            path_in_repo=local_path.name,
            path_or_fileobj=str(local_path),
        )
        additions.append(addition)

    # One commit for the whole batch, so the tag below covers every file.
    commit = hub.create_commit(
        repo_id=hf_repo_name,
        operations=additions,
        repo_type=hf_repo_type,
        commit_message=f"Upload data files for version {version}",
    )
    logging.info(f"Uploaded files to Hugging Face repository {hf_repo_name}.")

    # Tag commit with version
    hub.create_tag(
        repo_id=hf_repo_name,
        tag=version,
        revision=commit.oid,
        repo_type=hf_repo_type,
    )
    logging.info(
        f"Tagged commit with {version} in Hugging Face repository {hf_repo_name}."
    )


def upload_files_to_gcs(
    files: List[str],
    version: str,
    gcs_bucket_name: str = "policyengine-uk-data-private",
):
    """
    Upload files to Google Cloud Storage and set metadata with the version.

    Each file is stored in the bucket under its base name, with custom
    metadata ``{"version": version}``.

    Args:
        files: Paths of the files to upload.
        version: Version string recorded in each object's metadata.
        gcs_bucket_name: Name of the destination GCS bucket.
    """
    credentials, project_id = google.auth.default()
    storage_client = storage.Client(
        credentials=credentials, project=project_id
    )
    bucket = storage_client.bucket(gcs_bucket_name)

    for file_path in files:
        file_path = Path(file_path)
        blob = bucket.blob(file_path.name)

        # Set the metadata before uploading so it is written with the
        # object itself. A separate patch() afterwards costs an extra
        # request and, if it fails, leaves the object without a version.
        blob.metadata = {"version": version}
        blob.upload_from_filename(file_path)
        logging.info(
            f"Uploaded {file_path.name} to GCS bucket {gcs_bucket_name}."
        )
        logging.info(
            f"Set metadata for {file_path.name} in GCS bucket {gcs_bucket_name}."
        )
Loading