diff --git a/.github/workflows/changelog_entry.yaml b/.github/workflows/changelog_entry.yaml new file mode 100644 index 000000000..d0eb8574d --- /dev/null +++ b/.github/workflows/changelog_entry.yaml @@ -0,0 +1,29 @@ +name: Versioning + +on: + pull_request: + branches: [ main ] + +jobs: + check-changelog-entry: + name: Changelog entry check + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Check for changelog entry + run: | + if [ ! -f "changelog_entry.yaml" ]; then + echo "Error: changelog_entry.yaml file is missing." + echo "Please add a changelog_entry.yaml file at the root of the repository." + exit 1 + fi + + # Check if the file is empty + if [ ! -s "changelog_entry.yaml" ]; then + echo "Error: changelog_entry.yaml file is empty." + echo "Please add content to the changelog_entry.yaml file." + exit 1 + fi + + echo "Changelog entry found and is not empty." \ No newline at end of file diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml index 71864d9bc..7bd46d26d 100644 --- a/.github/workflows/push.yaml +++ b/.github/workflows/push.yaml @@ -6,6 +6,8 @@ on: push: branches: - main + paths: + - pyproject.toml jobs: lint: @@ -70,27 +72,14 @@ jobs: with: branch: gh-pages folder: docs/_build/html - publish-to-pypi: - name: Publish to PyPI - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 # Fetch all history for all tags and branches - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: 3.12 - - name: Install package - run: make install - - name: Build package - run: python -m build - name: Publish a git tag run: ".github/publish-git-tag.sh || true" - - name: Publish to PyPI + - name: Remove .whl files + run: rm dist/*.whl + - name: Publish a Python distribution to PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: user: __token__ password: ${{ secrets.PYPI }} skip-existing: true + verbose: true diff --git a/.github/workflows/versioning.yaml b/.github/workflows/versioning.yaml new file mode 100644 index 000000000..b4e0edb9e --- /dev/null +++ b/.github/workflows/versioning.yaml @@ -0,0 +1,37 @@ +# Workflow that runs on versioning metadata updates. + +name: Versioning updates +on: + push: + branches: + - main + + paths: + - changelog_entry.yaml + - "!pyproject.toml" + +jobs: + Versioning: + runs-on: ubuntu-latest + if: | + (!(github.event.head_commit.message == 'Update package version')) + steps: + - name: Checkout repo + uses: actions/checkout@v4 + with: + repository: ${{ github.event.pull_request.head.repo.full_name }} + ref: ${{ github.event.pull_request.head.ref }} + token: ${{ secrets.POLICYENGINE_GITHUB }} + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: 3.12 + - name: Build changelog + run: pip install yaml-changelog && make changelog + - name: Preview changelog update + run: ".github/get-changelog-diff.sh" + - name: Update changelog + uses: EndBug/add-and-commit@v9 + with: + add: "." + message: Update package version \ No newline at end of file diff --git a/.gitignore b/.gitignore index 52b67116f..9d52f6ca9 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,5 @@ !incomes_projection.csv !policyengine_uk_data/datasets/frs/local_areas/**/*.csv **/_build -!policyengine_uk_data/storage/*.csv \ No newline at end of file +!policyengine_uk_data/storage/*.csv +**/version.json diff --git a/policyengine_uk_data/storage/upload_completed_datasets.py b/policyengine_uk_data/storage/upload_completed_datasets.py index 21f61fe1d..3eb02385c 100644 --- a/policyengine_uk_data/storage/upload_completed_datasets.py +++ b/policyengine_uk_data/storage/upload_completed_datasets.py @@ -1,65 +1,25 @@ from policyengine_uk_data.datasets import EnhancedFRS_2022_23, FRS_2022_23 from policyengine_uk_data.storage import STORAGE_FOLDER -from policyengine_uk_data.utils.huggingface import upload -from google.cloud import storage -import google.auth +from policyengine_uk_data.utils.data_upload import upload_data_files def upload_datasets(): - credentials, project_id = google.auth.default() - storage_client = storage.Client( - credentials=credentials, project=project_id - ) - bucket = storage_client.bucket("policyengine-uk-data-private") - for dataset in [FRS_2022_23, EnhancedFRS_2022_23]: - dataset = dataset() - if not dataset.exists: - raise ValueError( - f"Dataset {dataset.name} does not exist at {dataset.file_path}." - ) - - upload( - dataset.file_path, - "policyengine/policyengine-uk-data", - dataset.file_path.name, - ) - blob = dataset.file_path.name - blob = bucket.blob(blob) - blob.upload_from_filename(dataset.file_path) - print( - f"Uploaded {dataset.file_path.name} to GCS bucket policyengine-uk-data-private." - ) - - # Constituency weights: - - upload( + dataset_files = [ + FRS_2022_23.file_path, + EnhancedFRS_2022_23.file_path, STORAGE_FOLDER / "parliamentary_constituency_weights.h5", - "policyengine/policyengine-uk-data", - "parliamentary_constituency_weights.h5", - ) - - blob = "parliamentary_constituency_weights.h5" - blob = bucket.blob(blob) - blob.upload_from_filename( - STORAGE_FOLDER / "parliamentary_constituency_weights.h5" - ) - print( - f"Uploaded parliamentary_constituency_weights.h5 to GCS bucket policyengine-uk-data-private." - ) - - # Local authority weights: - - upload( STORAGE_FOLDER / "local_authority_weights.h5", - "policyengine/policyengine-uk-data", - "local_authority_weights.h5", - ) + ] + + for file_path in dataset_files: + if not file_path.exists(): + raise ValueError(f"File {file_path} does not exist.") - blob = "local_authority_weights.h5" - blob = bucket.blob(blob) - blob.upload_from_filename(STORAGE_FOLDER / "local_authority_weights.h5") - print( - f"Uploaded local_authority_weights.h5 to GCS bucket policyengine-uk-data-private." + upload_data_files( + files=dataset_files, + hf_repo_name="policyengine-uk-data/policyengine-uk-data-private", + hf_repo_type="model", + gcs_bucket_name="policyengine-uk-data-private", ) diff --git a/policyengine_uk_data/utils/data_upload.py b/policyengine_uk_data/utils/data_upload.py new file mode 100644 index 000000000..42d0fec24 --- /dev/null +++ b/policyengine_uk_data/utils/data_upload.py @@ -0,0 +1,104 @@ +from typing import List +from huggingface_hub import HfApi, CommitOperationAdd +from huggingface_hub.errors import RevisionNotFoundError +from google.cloud import storage +from pathlib import Path +from importlib import metadata +import google.auth +import logging + + +def upload_data_files( + files: List[str], + gcs_bucket_name: str = "policyengine-uk-data-private", + hf_repo_name: str = "policyengine/policyengine-uk-data", + hf_repo_type: str = "model", + version: str = None, +): + if version is None: + version = metadata.version("policyengine-uk-data") + + upload_files_to_hf( + files=files, + version=version, + hf_repo_name=hf_repo_name, + hf_repo_type=hf_repo_type, + ) + + upload_files_to_gcs( + files=files, + version=version, + gcs_bucket_name=gcs_bucket_name, + ) + + +def upload_files_to_hf( + files: List[str], + version: str, + hf_repo_name: str = "policyengine/policyengine-uk-data-private", + hf_repo_type: str = "model", +): + """ + Upload files to Hugging Face repository and tag the commit with the version. + """ + api = HfApi() + hf_operations = [] + + for file_path in files: + file_path = Path(file_path) + if not file_path.exists(): + raise ValueError(f"File {file_path} does not exist.") + hf_operations.append( + CommitOperationAdd( + path_in_repo=file_path.name, + path_or_fileobj=str(file_path), + ) + ) + commit_info = api.create_commit( + repo_id=hf_repo_name, + operations=hf_operations, + repo_type=hf_repo_type, + commit_message=f"Upload data files for version {version}", + ) + logging.info(f"Uploaded files to Hugging Face repository {hf_repo_name}.") + + # Tag commit with version + api.create_tag( + repo_id=hf_repo_name, + tag=version, + revision=commit_info.oid, + repo_type=hf_repo_type, + ) + logging.info( + f"Tagged commit with {version} in Hugging Face repository {hf_repo_name}." + ) + + +def upload_files_to_gcs( + files: List[str], + version: str, + gcs_bucket_name: str = "policyengine-uk-data-private", +): + """ + Upload files to Google Cloud Storage and set metadata with the version. + """ + credentials, project_id = google.auth.default() + storage_client = storage.Client( + credentials=credentials, project=project_id + ) + bucket = storage_client.bucket(gcs_bucket_name) + + for file_path in files: + file_path = Path(file_path) + blob = bucket.blob(file_path.name) + blob.upload_from_filename(file_path) + logging.info( + f"Uploaded {file_path.name} to GCS bucket {gcs_bucket_name}." + ) + + # Set metadata + blob.metadata = {"version": version} + blob.patch() + logging.info( + f"Set metadata for {file_path.name} in GCS bucket {gcs_bucket_name}." + )