From 75518ccbdee2b0b4ffa87321ce9dcf78f8a7ce32 Mon Sep 17 00:00:00 2001 From: pkdash Date: Tue, 9 Apr 2024 15:08:34 -0400 Subject: [PATCH 1/9] [#124] error handling for s3 object read --- api/adapters/s3.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/api/adapters/s3.py b/api/adapters/s3.py index 43f1b91..0930c25 100644 --- a/api/adapters/s3.py +++ b/api/adapters/s3.py @@ -1,10 +1,14 @@ -import boto3 import json -from botocore.client import Config +from http import HTTPStatus + +import boto3 from botocore import UNSIGNED +from botocore.client import Config +from botocore.exceptions import ClientError as S3ClientError from api.adapters.base import AbstractRepositoryMetadataAdapter, AbstractRepositoryRequestHandler from api.adapters.utils import RepositoryType, register_adapter +from api.exceptions import RepositoryException from api.models.catalog import DatasetMetadataDOC from api.models.user import Submission @@ -17,12 +21,25 @@ def get_metadata(self, record_id: str): file_key = record_id.split("+")[2] s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED), endpoint_url=endpoint_url) - - response = s3.get_object(Bucket=bucket_name, Key=file_key) - json_content = response['Body'].read().decode('utf-8') + try: + response = s3.get_object(Bucket=bucket_name, Key=file_key) + except S3ClientError as ex: + if ex.response["Error"]["Code"] == "NoSuchKey": + raise RepositoryException( + detail=f"Specified metadata file was not found in S3: {bucket_name}/{file_key}", + status_code=HTTPStatus.NOT_FOUND + ) + else: + err_msg = f"Error accessing S3 file({bucket_name}/{file_key}): {str(ex)}" + raise RepositoryException(detail=err_msg, status_code=HTTPStatus.BAD_REQUEST) + json_content = response['Body'].read().decode('utf-8') # Parse the JSON content - data = json.loads(json_content) + try: + data = json.loads(json_content) + except json.JSONDecodeError as ex: + err_msg = f"Invalid JSON content in S3 file ({file_key}). Error: {str(ex)}" + raise RepositoryException(detail=err_msg, status_code=HTTPStatus.BAD_REQUEST) return data From a4f4957c6ac3c80c0932e65f216035b8e72cde8b Mon Sep 17 00:00:00 2001 From: pkdash Date: Tue, 9 Apr 2024 15:11:42 -0400 Subject: [PATCH 2/9] [#124] fix s3 object identifier as stored in catalog record --- api/routes/catalog.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/api/routes/catalog.py b/api/routes/catalog.py index 476dfc5..b5afcc3 100644 --- a/api/routes/catalog.py +++ b/api/routes/catalog.py @@ -135,9 +135,12 @@ async def register_s3_dataset(request_model: S3Path, user: Annotated[User, Depen path = request_model.path bucket = request_model.bucket endpoint_url = request_model.endpoint_url - identifier = f"{endpoint_url}+{bucket}+{path}" + endpoint_url = endpoint_url.rstrip("/") + identifier = f"{endpoint_url}/{bucket}/{path}" submission: Submission = user.submission_by_repository(repo_type=RepositoryType.S3, identifier=identifier) - dataset = await _save_to_db(repository_type=RepositoryType.S3, identifier=identifier, user=user, submission=submission) + identifier = f"{endpoint_url}+{bucket}+{path}" + dataset = await _save_to_db(repository_type=RepositoryType.S3, identifier=identifier, user=user, + submission=submission) return dataset @@ -145,6 +148,9 @@ async def _save_to_db(repository_type: RepositoryType, identifier: str, user: Us adapter = get_adapter_by_type(repository_type=repository_type) # fetch metadata from repository as catalog dataset repo_dataset: DatasetMetadataDOC = await _get_repo_meta_as_catalog_record(adapter=adapter, identifier=identifier) + if repository_type == RepositoryType.S3: + s3_endpoint_url, bucket, path = identifier.split("+") + identifier = f"{s3_endpoint_url}/{bucket}/{path}" if submission is None: # new registration await repo_dataset.insert() From 837931512af96a5f999a561a263d47a4a2ad427b Mon Sep 17 00:00:00 2001 From: pkdash Date: Tue, 9 Apr 2024 15:13:51 -0400 Subject: [PATCH 3/9] [#124] unit test for registering s3 dataset --- tests/test_dataset_routes.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/test_dataset_routes.py b/tests/test_dataset_routes.py index 426fd1c..1f05e53 100644 --- a/tests/test_dataset_routes.py +++ b/tests/test_dataset_routes.py @@ -179,6 +179,31 @@ async def test_get_datasets_exclude_none(client_test, dataset_data): assert "measurementTechnique" not in a_property +@pytest.mark.asyncio +async def test_register_minio_s3_dataset(client_test): + """Testing registering metadata for a generic dataset stored on minIO s3""" + + # set the path to the generic metadata file on minIO s3 + s3_path = { + "path": "data/.hs/dataset_metadata.json", + "bucket": "catalog-api-test", + "endpoint_url": "https://api.minio.cuahsi.io/", + } + + dataset_response = await client_test.put( + "api/catalog/repository/s3", json=s3_path + ) + assert dataset_response.status_code == 200 + ds_metadata = dataset_response.json() + expected_repository_identifier = f"{s3_path['endpoint_url']}{s3_path['bucket']}/{s3_path['path']}" + assert ds_metadata["repository_identifier"] == expected_repository_identifier + + # retrieve the record from the db + record_id = ds_metadata.get('_id') + response = await client_test.get(f"api/catalog/dataset/{record_id}") + assert response.status_code == 200 + + @pytest.mark.parametrize("multiple", [True, False]) @pytest.mark.asyncio async def test_get_submissions(client_test, dataset_data, multiple): From 5e2845981439fa64af9784d3fc6da2eee3d50ed7 Mon Sep 17 00:00:00 2001 From: pkdash Date: Wed, 29 May 2024 17:05:29 -0400 Subject: [PATCH 4/9] [#124] updating routes related to s3 dataset registration --- api/routes/catalog.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/api/routes/catalog.py b/api/routes/catalog.py index 13f584f..7ba1b5c 100644 --- a/api/routes/catalog.py +++ b/api/routes/catalog.py @@ -149,8 +149,13 @@ async def register_s3_dataset(s3_path: S3Path, user: Annotated[User, Depends(get identifier = s3_path.identifier submission: Submission = user.submission_by_repository(repo_type=RepositoryType.S3, identifier=identifier) - identifier = f"{endpoint_url}+{bucket}+{path}" - dataset = await _save_to_db(repository_type=RepositoryType.S3, identifier=identifier, user=user, + if submission is not None: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="This S3 dataset has already been submitted by this user", + ) + fetch_identifier = s3_path.fetch_identifier + dataset = await _save_to_db(repository_type=RepositoryType.S3, identifier=fetch_identifier, user=user, submission=submission) return dataset @@ -220,14 +225,17 @@ async def _save_to_db(repository_type: RepositoryType, identifier: str, user: Us adapter = get_adapter_by_type(repository_type=repository_type) # fetch metadata from repository as catalog dataset repo_dataset: DatasetMetadataDOC = await _get_repo_meta_as_catalog_record(adapter=adapter, identifier=identifier) + s3_path = None if repository_type == RepositoryType.S3: s3_endpoint_url, bucket, path = identifier.split("+") - identifier = f"{s3_endpoint_url}/{bucket}/{path}" + s3_path = S3Path(endpoint_url=s3_endpoint_url, bucket=bucket, path=path) + identifier = s3_path.identifier if submission is None: # new registration await repo_dataset.insert() submission = repo_dataset.as_submission() submission = adapter.update_submission(submission=submission, repo_record_id=identifier) + submission.s3_path = s3_path user.submissions.append(submission) await user.save(link_rule=WriteRules.WRITE) dataset = repo_dataset @@ -241,6 +249,7 @@ async def _save_to_db(repository_type: RepositoryType, identifier: str, user: Us updated_submission = adapter.update_submission(submission=updated_submission, repo_record_id=identifier) updated_submission.id = submission.id updated_submission.submitted = submission.submitted + updated_submission.s3_path = s3_path await updated_submission.replace() dataset = updated_dataset submission = updated_submission From fcff298098c998de85f50eee80a6fd0b3770852c Mon Sep 17 00:00:00 2001 From: pkdash Date: Wed, 29 May 2024 17:08:21 -0400 Subject: [PATCH 5/9] [#124] fixing make pre-post command script path --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f13b2c7..e3c130d 100644 --- a/Makefile +++ b/Makefile @@ -33,4 +33,4 @@ test: .PHONY: pre-post pre-post: - docker-compose run catalog-trigger python /app/triggers/management/change_streams_pre_and_post.py + docker-compose run catalog-trigger python /app/api/models/management/change_streams_pre_and_post.py From c3bd1fa6b37d2a24eaef7ddf65b7d53e8fc4d617 Mon Sep 17 00:00:00 2001 From: pkdash Date: Fri, 31 May 2024 16:39:38 -0400 Subject: [PATCH 6/9] [#140] using one workflow file for deploying to prod and dev (staging) --- .github/workflows/deploy-dev.yaml | 96 ------------------------------- .github/workflows/deploy.yaml | 35 ++++++----- 2 files changed, 21 insertions(+), 110 deletions(-) delete mode 100644 .github/workflows/deploy-dev.yaml diff --git a/.github/workflows/deploy-dev.yaml b/.github/workflows/deploy-dev.yaml deleted file mode 100644 index c764159..0000000 --- a/.github/workflows/deploy-dev.yaml +++ /dev/null @@ -1,96 +0,0 @@ -name: Deploy I-GUIDE to GKE Autopilot (Beta) - -on: - workflow_dispatch: - push: - branches: - - 'develop' - -env: - DOMAIN: iguide-dev.cuahsi.io - IP: iguide-dev - TESTING: false - OIDC_ISSUER: https://orcid.org - DATABASE_NAME: iguide_dev - DB_PROTOCOL: mongodb+srv - HYDROSHARE_META_READ_URL: https://www.hydroshare.org/hsapi2/resource/%s/json/ - HYDROSHARE_FILE_READ_URL: https://www.hydroshare.org/hsapi/resource/%s/files/ - VITE_APP_NAME: I-GUIDE - VITE_APP_URL: https://iguide-dev.cuahsi.io - VITE_APP_API_URL: https://iguide-dev.cuahsi.io/api - VITE_APP_LOGIN_URL: https://orcid.org/oauth/authorize - VITE_APP_SUPPORT_EMAIL: help@example.com - VITE_APP_CLIENT_ID: APP-4ZA8C8BYAH3QHNE9 - SEARCH_RELEVANCE_SCORE_THRESHOLD: 1.4 - - -jobs: - deploy: - runs-on: ubuntu-latest - steps: - - name: code checkout - uses: actions/checkout@v2 - - - name: Install the gcloud cli - uses: google-github-actions/setup-gcloud@v0 - with: - project_id: ${{ secrets.GOOGLE_PROJECT }} - service_account_key: ${{ secrets.GOOGLE_APPLICATION_CREDENTIALS }} - install_components: 'gke-gcloud-auth-plugin' - export_default_credentials: true - - - name: Compile the root env file - env: - DB_HOST: ${{ secrets.DB_HOST_BETA }} - DB_USERNAME: ${{ secrets.DB_USERNAME_BETA }} - DB_PASSWORD: ${{ secrets.DB_PASSWORD_BETA }} - run: | - variables=("OIDC_ISSUER" "DB_USERNAME" "DB_PASSWORD" "DB_HOST" "DATABASE_NAME" "DB_PROTOCOL" "TESTING" "VITE_APP_LOGIN_URL" "HYDROSHARE_META_READ_URL" "HYDROSHARE_FILE_READ_URL" "SEARCH_RELEVANCE_SCORE_THRESHOLD") - - # Empty the .env file - > .env - - # Loop through the variables and add them to the .env file - for var in "${variables[@]}"; do - echo "$var=${!var}" >> .env - done - - - name: Compile the frontend env file - env: - VITE_APP_GOOGLE_MAPS_API_KEY: ${{ secrets.VITE_APP_GOOGLE_MAPS_API_KEY }} - - run: | - variables=("VITE_APP_NAME" "VITE_APP_API_URL" "VITE_APP_SUPPORT_EMAIL" "VITE_APP_URL" "VITE_APP_LOGIN_URL" "VITE_APP_CLIENT_ID" "VITE_APP_GOOGLE_MAPS_API_KEY") - - # Empty the .env file - > frontend/.env - - # Loop through the variables and add them to the .env file - for var in "${variables[@]}"; do - echo "$var=${!var}" >> frontend/.env - done - - - name: Build and push docker images - env: - GOOGLE_PROJECT: ${{ secrets.GOOGLE_PROJECT }} - run: | - gcloud auth configure-docker us-central1-docker.pkg.dev - docker build -t us-central1-docker.pkg.dev/$GOOGLE_PROJECT/iguide/api:$GITHUB_SHA -f docker/api/Dockerfile . - docker push us-central1-docker.pkg.dev/$GOOGLE_PROJECT/iguide/api:$GITHUB_SHA - docker build -t us-central1-docker.pkg.dev/$GOOGLE_PROJECT/iguide/frontend:$GITHUB_SHA -f docker/frontend/Dockerfile . - docker push us-central1-docker.pkg.dev/$GOOGLE_PROJECT/iguide/frontend:$GITHUB_SHA - docker build -t us-central1-docker.pkg.dev/$GOOGLE_PROJECT/iguide/trigger:$GITHUB_SHA -f docker/triggers/Dockerfile . - docker push us-central1-docker.pkg.dev/$GOOGLE_PROJECT/iguide/trigger:$GITHUB_SHA - - - name: Deploy to GKE - env: - USE_GKE_GCLOUD_AUTH_PLUGIN: True - GOOGLE_PROJECT: ${{ secrets.GOOGLE_PROJECT }} - run: | - gcloud container clusters get-credentials iguide-dev --region us-central1 - find ./kubernetes -type f | xargs -i sed -i "s/GOOGLE_PROJECT/$GOOGLE_PROJECT/g" {} - find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_TAG/$GITHUB_SHA/g" {} - find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_DOMAIN/$DOMAIN/g" {} - kubectl apply -f kubernetes/ - # Refresh pods - kubectl delete pods --all diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml index 063bf22..4b08833 100644 --- a/.github/workflows/deploy.yaml +++ b/.github/workflows/deploy.yaml @@ -4,28 +4,30 @@ on: workflow_dispatch: push: branches: + - 'develop' - 'productionalization' env: - DOMAIN: iguide.cuahsi.io - TAG: latest + DEPLOY_TO_PRODUCTION: ${{ github.ref == 'refs/heads/productionalization' && true || false }} + DOMAIN: ${{ env.DEPLOY_TO_PRODUCTION == false && 'iguide-dev.cuahsi.io' || 'iguide.cuahsi.io' }} + IP: ${{ env.DEPLOY_TO_PRODUCTION == false && 'iguide-dev' || 'iguide' }} + KUBE_CLUSTER_NAME: ${{ env.DEPLOY_TO_PRODUCTION == false && 'iguide-dev' || 'iguide' }} + TAG: ${{ env.DEPLOY_TO_PRODUCTION == false && github.sha || 'latest' }} TESTING: false - IP: iguide OIDC_ISSUER: https://orcid.org - DATABASE_NAME: iguide_beta + # why are we using iguide_beta for production deployment? Should it be iguide_demo? + DATABASE_NAME: ${{ env.DEPLOY_TO_PRODUCTION == false && 'iguide_dev' || 'iguide_beta' }} DB_PROTOCOL: mongodb+srv HYDROSHARE_META_READ_URL: https://www.hydroshare.org/hsapi2/resource/%s/json/ HYDROSHARE_FILE_READ_URL: https://www.hydroshare.org/hsapi/resource/%s/files/ VITE_APP_NAME: I-GUIDE - VITE_APP_URL: https://iguide.cuahsi.io - VITE_APP_API_URL: https://iguide.cuahsi.io/api + VITE_APP_URL: ${{ env.DEPLOY_TO_PRODUCTION == false && 'https://iguide-dev.cuahsi.io' || 'https://iguide.cuahsi.io' }} + VITE_APP_API_URL: ${{ env.DEPLOY_TO_PRODUCTION == false && 'https://iguide-dev.cuahsi.io/api' || 'https://iguide.cuahsi.io/api' }} VITE_APP_LOGIN_URL: https://orcid.org/oauth/authorize - VITE_APP_GOOGLE_MAPS_API_KEY: "" VITE_APP_SUPPORT_EMAIL: help@example.com VITE_APP_CLIENT_ID: APP-4ZA8C8BYAH3QHNE9 SEARCH_RELEVANCE_SCORE_THRESHOLD: 1.4 - jobs: deploy: runs-on: ubuntu-latest @@ -43,9 +45,9 @@ jobs: - name: Compile the root env file env: - DB_HOST: ${{ secrets.DB_HOST }} - DB_USERNAME: ${{ secrets.DB_USERNAME }} - DB_PASSWORD: ${{ secrets.DB_PASSWORD }} + DB_HOST: ${{ env.DEPLOY_TO_PRODUCTION == false && secrets.DB_HOST_BETA || secrets.DB_HOST }} + DB_USERNAME: ${{ env.DEPLOY_TO_PRODUCTION == false && secrets.DB_USERNAME_BETA || secrets.DB_USERNAME }} + DB_PASSWORD: ${{ env.DEPLOY_TO_PRODUCTION == false && secrets.DB_PASSWORD_BETA || secrets.DB_PASSWORD }} run: | variables=("OIDC_ISSUER" "DB_USERNAME" "DB_PASSWORD" "DB_HOST" "DATABASE_NAME" "DB_PROTOCOL" "TESTING" "VITE_APP_LOGIN_URL" "HYDROSHARE_META_READ_URL" "HYDROSHARE_FILE_READ_URL" "SEARCH_RELEVANCE_SCORE_THRESHOLD") @@ -58,6 +60,9 @@ jobs: done - name: Compile the frontend env file + env: + VITE_APP_GOOGLE_MAPS_API_KEY: ${{env.DEPLOY_TO_PRODUCTION == false && secrets.VITE_APP_GOOGLE_MAPS_API_KEY || ''}} + run: | variables=("VITE_APP_NAME" "VITE_APP_API_URL" "VITE_APP_SUPPORT_EMAIL" "VITE_APP_URL" "VITE_APP_LOGIN_URL" "VITE_APP_CLIENT_ID" "VITE_APP_GOOGLE_MAPS_API_KEY") @@ -86,11 +91,13 @@ jobs: USE_GKE_GCLOUD_AUTH_PLUGIN: True GOOGLE_PROJECT: ${{ secrets.GOOGLE_PROJECT }} run: | - gcloud container clusters get-credentials iguide --region us-central1 - find ./kubernetes -type f | xargs -i sed -i "s/GOOGLE_PROJECT/$GOOGLE_PROJECT/g" {} + gcloud container clusters get-credentials $KUBE_CLUSTER_NAME --region us-central1 + find ./kubernetes -type f | xargs -i sed -i "s/GOOGLE_PROJECT/$GOOGLE_PROJECT/g" {} find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_TAG/$TAG/g" {} find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_DOMAIN/$DOMAIN/g" {} - find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_IP/$IP/g" {} + if [[ "${{ env.DEPLOY_TO_PRODUCTION }}" == true ]]; then + find ./kubernetes -type f | xargs -i sed -i "s/IGUIDE_IP/$IP/g" {} + fi kubectl apply -f kubernetes/ # Refresh pods kubectl delete pods --all From f0b8eb644b7792bb44bd2417abde61e0de6cb942 Mon Sep 17 00:00:00 2001 From: pkdash Date: Fri, 31 May 2024 17:56:17 -0400 Subject: [PATCH 7/9] [#140] updating ci workflow to trigger on push to feature branches only --- .github/workflows/ci.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a78d077..250fbab 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -3,7 +3,9 @@ name: Run I-GUIDE Builds and Config Saturations on: workflow_dispatch: push: - branches: ['*'] + branches-ignore: + - productionalization + - develop env: DOMAIN: iguide.cuahsi.io From 9ac1d04c93301312b8c20e026e9b06a102e40e63 Mon Sep 17 00:00:00 2001 From: pkdash Date: Fri, 31 May 2024 18:12:38 -0400 Subject: [PATCH 8/9] [#140] moving the dependabot.yml to .github folder --- .github/{workflows => }/dependabot.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/{workflows => }/dependabot.yml (100%) diff --git a/.github/workflows/dependabot.yml b/.github/dependabot.yml similarity index 100% rename from .github/workflows/dependabot.yml rename to .github/dependabot.yml From 99671eeb355aa9a98288a217405a4b16e4880790 Mon Sep 17 00:00:00 2001 From: pkdash Date: Tue, 4 Jun 2024 11:32:08 -0400 Subject: [PATCH 9/9] [#140] logging the actual exception message for triggers and scheduler jobs --- triggers/scheduler.py | 4 ++-- triggers/update_catalog.py | 4 ++-- triggers/update_typeahead.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/triggers/scheduler.py b/triggers/scheduler.py index 876da7e..4fe14df 100644 --- a/triggers/scheduler.py +++ b/triggers/scheduler.py @@ -71,8 +71,8 @@ async def do_daily(): else: # couldn't retrieve matching repository record await db["discovery"].delete_one({"_id": submission.identifier}) - except: - logger.exception(f"Failed to collect submission {submission.url}") + except Exception as exp: + logger.exception(f"Failed to collect submission {submission.url}, Error: {str(exp)}") def main(): diff --git a/triggers/update_catalog.py b/triggers/update_catalog.py index 135f837..4162efb 100644 --- a/triggers/update_catalog.py +++ b/triggers/update_catalog.py @@ -23,8 +23,8 @@ async def _main(): while True: try: await watch_catalog(db) - except: - logger.exception("Submission Watch Task failed, restarting the task") + except Exception as exp: + logger.exception(f"Submission Watch Task failed. Error:{str(exp)}, restarting the task") finally: db.close() diff --git a/triggers/update_typeahead.py b/triggers/update_typeahead.py index 8f227c4..b11097c 100644 --- a/triggers/update_typeahead.py +++ b/triggers/update_typeahead.py @@ -19,8 +19,8 @@ async def _main(): while True: try: await watch_discovery(db) - except: - logger.exception("Discovery Watch Task failed, restarting the task") + except Exception as exp: + logger.exception(f"Discovery Watch Task failed. Error:{str(exp)}, restarting the task") finally: db.close()