diff --git a/.circleci/validate_all_studies.sh b/.circleci/validate_all_studies.sh
index 38e50a9730..a6fcc96d14 100755
--- a/.circleci/validate_all_studies.sh
+++ b/.circleci/validate_all_studies.sh
@@ -5,7 +5,7 @@ STUDIES_DIRS=("public" "crdc/gdc")
 GIT_REMOTE_URL="git@github.com:cbioportal/datahub.git"
 test_reports_location="$HOME/test-reports"
 
-git remote add upstream "$GIT_REMOTE_URL"
+git remote get-url upstream >/dev/null 2>&1 || git remote add upstream "$GIT_REMOTE_URL"
 git fetch upstream master
 
 num_studies=${#list_of_study_dirs[@]}
@@ -30,4 +30,13 @@ for STUDIES_DIR in "${STUDIES_DIRS[@]}"; do
     done
 done
 
+# Validate resource URLs
+echo $'\n\nValidating resource URLs...'
+python3 "$HOME/repo/.circleci/validate_resource_urls.py"
+RESOURCE_VALIDATION_STATUS=$?
+
+if [ "$RESOURCE_VALIDATION_STATUS" -ne 0 ]; then
+    EXIT_STATUS=1
+fi
+
 exit "$EXIT_STATUS"
diff --git a/.circleci/validate_changed_resource_urls.py b/.circleci/validate_changed_resource_urls.py
new file mode 100644
index 0000000000..1152646e94
--- /dev/null
+++ b/.circleci/validate_changed_resource_urls.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""
+Validate resource URLs in changed data_resource_*.txt files.
+Only checks files that were modified in the current branch.
+"""
+import os
+import sys
+import subprocess
+import requests
+import csv
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+TIMEOUT = 10
+# Lower-cased cell values that mean "no URL to check"; such rows are skipped.
+SKIP_VALUES = frozenset([
+    '[not applicable]', '[not available]', '[pending]', '[discrepancy]',
+    '[completed]', '[null]', '', 'na',
+])
+
+def check_url(url, timeout=TIMEOUT):
+    """Return True if url is a placeholder or answers a GET with HTTP 200.
+
+    Redirects are followed. Any requests.RequestException counts as a
+    failed check and yields False.
+    """
+    url = (url or '').strip()
+    if url.lower() in SKIP_VALUES:
+        return True  # Skip empty/null values
+
+    try:
+        # The with-block closes the response on every path.
+        with requests.get(url, timeout=timeout, allow_redirects=True, stream=True) as response:
+            return response.status_code == 200
+    except requests.RequestException:
+        return False
+
+def validate_resource_file(filepath):
+    """Validate all URLs in a tab-separated resource file.
+
+    Returns a list of error dicts. A failed URL yields 'file', 'line', 'url'
+    and 'status' keys; an exception while processing the file yields 'file'
+    and 'error'. A file without a 'URL' column yields an empty list.
+    """
+    errors = []
+
+    try:
+        with open(filepath, 'r', encoding='utf-8') as f:
+            reader = csv.DictReader(f, delimiter='\t')
+
+            if not reader.fieldnames or 'URL' not in reader.fieldnames:
+                return errors
+
+            for row_num, row in enumerate(reader, start=2):  # start at 2 because row 1 is header
+                # DictReader fills the missing cells of a short row with
+                # None, so guard before calling .strip().
+                url = (row.get('URL') or '').strip()
+
+                if url.lower() in SKIP_VALUES:
+                    continue
+
+                if not check_url(url):
+                    errors.append({
+                        'file': filepath,
+                        'line': row_num,
+                        'url': url,
+                        'status': 'URL returned non-200 status code or is unreachable'
+                    })
+    except Exception as e:
+        errors.append({
+            'file': filepath,
+            'error': f'Error reading file: {str(e)}'
+        })
+
+    return errors
+
+def get_changed_files():
+    """Get repo-relative paths of changed data_resource_*.txt files from git.
+
+    Compares the working tree with upstream/master. If git cannot be run or
+    exits non-zero, a warning goes to stderr and an empty list is returned.
+    """
+    try:
+        result = subprocess.run(
+            ['git', 'diff', '--name-only', '--diff-filter=ACMRU', 'upstream/master'],
+            cwd=str(REPO_ROOT),
+            capture_output=True,
+            text=True,
+            check=True
+        )
+    except (OSError, subprocess.CalledProcessError) as e:
+        # Say why nothing gets validated instead of passing silently.
+        print(f"Warning: could not list changed files: {e}", file=sys.stderr)
+        return []
+
+    # Test the basename so a directory named data_resource_* cannot match.
+    return [
+        f for f in result.stdout.splitlines()
+        if Path(f).name.startswith('data_resource_') and f.endswith('.txt')
+    ]
+
+def main():
+    """Find and validate changed resource files; return the exit code (0 ok, 1 errors)."""
+    changed_files = get_changed_files()
+
+    if not changed_files:
+        print("No resource files changed")
+        return 0
+
+    all_errors = []
+
+    for filename in changed_files:
+        filepath = REPO_ROOT / filename
+        if filepath.is_file():
+            all_errors.extend(validate_resource_file(str(filepath)))
+
+    # Report errors
+    if all_errors:
+        print("Resource URL validation errors found:")
+        for error in all_errors:
+            if 'error' in error:
+                print(f" {error['file']}: {error['error']}")
+            else:
+                print(f" {error['file']}:{error['line']}: {error['status']}")
+                print(f" URL: {error['url']}")
+        return 1
+
+    print(f"Validated {len(changed_files)} changed resource file(s) - all URLs returned 200 status code")
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/.circleci/validate_changed_studies.sh b/.circleci/validate_changed_studies.sh
index 064a888aa9..6daabcbb9f 100755
--- a/.circleci/validate_changed_studies.sh
+++ b/.circleci/validate_changed_studies.sh
@@ -10,7 +10,7 @@ VALIDATION_SCRIPT="$HOME/cbioportal-core/scripts/importer/validateStudies.py"
 GIT_REMOTE_URL="git@github.com:cbioportal/datahub.git"
 MAX_THREADS=7
 
-git remote add upstream "$GIT_REMOTE_URL"
+git remote get-url upstream >/dev/null 2>&1 || git remote add upstream "$GIT_REMOTE_URL"
 git fetch upstream master
 
 mkdir -p "$LOG_DIR"
@@ -113,4 +113,13 @@ if [[ $num_studies > 0 ]]; then
     fi
 else
     echo "No studies were changed"
-fi
\ No newline at end of file
+fi
+
+# Validate changed resource URLs
+echo $'\n\nValidating changed resource URLs...'
+python3 "$REPO_DIR/.circleci/validate_changed_resource_urls.py"
+RESOURCE_VALIDATION_STATUS=$?
+
+if [ "$RESOURCE_VALIDATION_STATUS" -ne 0 ]; then
+    exit 1
+fi
diff --git a/.circleci/validate_resource_urls.py b/.circleci/validate_resource_urls.py
new file mode 100644
index 0000000000..075140f5d0
--- /dev/null
+++ b/.circleci/validate_resource_urls.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+"""
+Validate that all resource URLs in data_resource_*.txt files return HTTP 200 status code.
+"""
+import os
+import sys
+import requests
+import csv
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+TIMEOUT = 10
+# Lower-cased cell values that mean "no URL to check"; such rows are skipped.
+SKIP_VALUES = frozenset([
+    '[not applicable]', '[not available]', '[pending]', '[discrepancy]',
+    '[completed]', '[null]', '', 'na',
+])
+
+def check_url(url, timeout=TIMEOUT):
+    """Return True if url is a placeholder or answers a GET with HTTP 200.
+
+    Redirects are followed. Any requests.RequestException counts as a
+    failed check and yields False.
+    """
+    url = (url or '').strip()
+    if url.lower() in SKIP_VALUES:
+        return True  # Skip empty/null values
+
+    try:
+        # The with-block closes the response on every path.
+        with requests.get(url, timeout=timeout, allow_redirects=True, stream=True) as response:
+            return response.status_code == 200
+    except requests.RequestException:
+        return False
+
+def validate_resource_file(filepath):
+    """Validate all URLs in a tab-separated resource file.
+
+    Returns a list of error dicts. A failed URL yields 'file', 'line', 'url'
+    and 'status' keys; an exception while processing the file yields 'file'
+    and 'error'. A file without a 'URL' column yields an empty list.
+    """
+    errors = []
+
+    try:
+        with open(filepath, 'r', encoding='utf-8') as f:
+            reader = csv.DictReader(f, delimiter='\t')
+
+            if not reader.fieldnames or 'URL' not in reader.fieldnames:
+                return errors
+
+            for row_num, row in enumerate(reader, start=2):  # start at 2 because row 1 is header
+                # DictReader fills the missing cells of a short row with
+                # None, so guard before calling .strip().
+                url = (row.get('URL') or '').strip()
+
+                if url.lower() in SKIP_VALUES:
+                    continue
+
+                if not check_url(url):
+                    errors.append({
+                        'file': filepath,
+                        'line': row_num,
+                        'url': url,
+                        'status': 'URL returned non-200 status code or is unreachable'
+                    })
+    except Exception as e:
+        errors.append({
+            'file': filepath,
+            'error': f'Error reading file: {str(e)}'
+        })
+
+    return errors
+
+def main():
+    """Find and validate all resource files; return the exit code (0 ok, 1 errors)."""
+    study_dirs = [REPO_ROOT / 'public', REPO_ROOT / 'crdc' / 'gdc']
+    all_errors = []
+
+    for study_dir in study_dirs:
+        if not study_dir.is_dir():
+            continue
+
+        # Find all data_resource_*.txt files
+        for root, _dirs, files in os.walk(str(study_dir)):
+            for file in files:
+                if file.startswith('data_resource_') and file.endswith('.txt'):
+                    filepath = os.path.join(root, file)
+                    all_errors.extend(validate_resource_file(filepath))
+
+    # Report errors
+    if all_errors:
+        print("Resource URL validation errors found:")
+        for error in all_errors:
+            if 'error' in error:
+                print(f" {error['file']}: {error['error']}")
+            else:
+                print(f" {error['file']}:{error['line']}: {error['status']}")
+                print(f" URL: {error['url']}")
+        return 1
+
+    print("All resource URLs validated successfully (returned 200 status code)")
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())