Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion .circleci/validate_all_studies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ STUDIES_DIRS=("public" "crdc/gdc")
GIT_REMOTE_URL="git@github.com:cbioportal/datahub.git"
test_reports_location="$HOME/test-reports"

git remote add upstream "$GIT_REMOTE_URL"
git remote get-url upstream || git remote add upstream "$GIT_REMOTE_URL"
git fetch upstream master

num_studies=${#list_of_study_dirs[@]}
Expand All @@ -30,4 +30,13 @@ for STUDIES_DIR in "${STUDIES_DIRS[@]}"; do
done
done

# Validate resource URLs. A non-zero exit from the Python validator marks the
# run as failed via EXIT_STATUS, which is the code this script exits with.
echo $'\n\nValidating resource URLs...'
# Quote the path so a $HOME containing spaces or glob characters is passed
# to python3 as a single argument.
python3 "$HOME/repo/.circleci/validate_resource_urls.py"
RESOURCE_VALIDATION_STATUS=$?

if [ "$RESOURCE_VALIDATION_STATUS" -ne 0 ]; then
    EXIT_STATUS=1
fi

exit "$EXIT_STATUS"
106 changes: 106 additions & 0 deletions .circleci/validate_changed_resource_urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""
Validate resource URLs in changed data_resource_*.txt files.
Only checks files that were modified in the current branch.
"""
import os
import sys
import subprocess
import requests
import csv
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parents[1]
TIMEOUT = 10

# Cell values (compared lower-cased, after stripping) that stand in for
# "no URL here" and are therefore never requested.
_PLACEHOLDER_URLS = frozenset({
    '[not applicable]',
    '[not available]',
    '[pending]',
    '[discrepancy]',
    '[completed]',
    '[null]',
    '',
    'na',
})


def check_url(url, timeout=TIMEOUT):
    """Return True when *url* is a placeholder or answers with HTTP 200.

    Args:
        url: Candidate URL. Falsy values and placeholder markers are skipped.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        True if the value is skipped or the final response (redirects are
        followed) has status code 200; False for any other status code or
        when ``requests`` raises a ``RequestException``.
    """
    if not url:
        return True  # Skip empty/null values

    # Request the same stripped value that the placeholder test looks at, so
    # surrounding whitespace cannot make an otherwise valid URL fail.
    candidate = url.strip()
    if candidate.lower() in _PLACEHOLDER_URLS:
        return True  # Skip placeholder values

    try:
        # stream=True defers the body download; only the status is needed.
        # The context manager closes the connection on every exit path.
        with requests.get(candidate, timeout=timeout, allow_redirects=True, stream=True) as response:
            return response.status_code == 200
    except requests.RequestException:
        return False

def validate_resource_file(filepath):
    """Validate all URLs in a tab-separated resource file.

    Args:
        filepath: Path of the file to read (UTF-8, first row is the header).

    Returns:
        A list of error dicts. A URL that fails ``check_url`` yields the keys
        ``file``, ``line``, ``url`` and ``status``; a failure while reading
        the file yields ``file`` and ``error``. A file with no header or no
        ``URL`` column produces an empty list.
    """
    placeholders = {
        '[not applicable]', '[not available]', '[pending]', '[discrepancy]',
        '[completed]', '[null]', '', 'na',
    }
    errors = []

    try:
        # newline='' is what the csv module expects from its input file.
        with open(filepath, 'r', encoding='utf-8', newline='') as f:
            reader = csv.DictReader(f, delimiter='\t')

            if not reader.fieldnames or 'URL' not in reader.fieldnames:
                return errors

            for row in reader:
                # A row with fewer cells than the header carries None for the
                # missing columns; treat that like an empty URL instead of
                # letting .strip() raise and abort the rest of the file.
                url = (row.get('URL') or '').strip()

                if url.lower() in placeholders:
                    continue

                if not check_url(url):
                    errors.append({
                        'file': filepath,
                        # Physical line just read, so blank lines that
                        # DictReader skips do not shift the reported number.
                        'line': reader.line_num,
                        'url': url,
                        'status': 'URL returned non-200 status code or is unreachable'
                    })
    except Exception as e:
        # Best-effort boundary: one unreadable file is reported as an error
        # entry rather than stopping the whole validation run.
        errors.append({
            'file': filepath,
            'error': f'Error reading file: {e}'
        })

    return errors

def get_changed_files():
    """Get the changed ``data_resource_*.txt`` files from git.

    Runs ``git diff --name-only --diff-filter=ACMRU upstream/master`` inside
    REPO_ROOT and keeps the paths whose file name starts with
    ``data_resource_`` and ends with ``.txt``.

    Returns:
        A list of path strings as printed by git. The list is empty when
        nothing matches, and also when git cannot be run or exits non-zero;
        in those two cases a warning is written to stderr so the empty result
        is not mistaken for "nothing changed".
    """
    try:
        result = subprocess.run(
            ['git', 'diff', '--name-only', '--diff-filter=ACMRU', 'upstream/master'],
            cwd=str(REPO_ROOT),
            capture_output=True,
            text=True,
        )
    except Exception as exc:
        # Still best-effort (no crash), but no longer silent.
        print(f"Warning: could not run git diff: {exc}", file=sys.stderr)
        return []

    if result.returncode != 0:
        print(
            f"Warning: git diff exited with status {result.returncode}: "
            f"{result.stderr.strip()}",
            file=sys.stderr,
        )
        return []

    # Match on the file name only, so a directory that merely contains
    # 'data_resource_' in its name does not pull in unrelated .txt files.
    return [
        path for path in result.stdout.splitlines()
        if path.endswith('.txt') and Path(path).name.startswith('data_resource_')
    ]

def main():
    """Find and validate changed resource files.

    Returns:
        0 when no resource file changed or no error was collected, 1 when at
        least one URL or file error was collected. The value is used as the
        process exit code.
    """
    changed_files = get_changed_files()

    if not changed_files:
        print("No resource files changed")
        return 0

    all_errors = []
    validated_count = 0

    for filename in changed_files:
        filepath = REPO_ROOT / filename
        # Paths that are not regular files on disk are skipped, and must not
        # be counted as validated in the summary below.
        if not filepath.is_file():
            continue
        all_errors.extend(validate_resource_file(str(filepath)))
        validated_count += 1

    # Report errors
    if all_errors:
        print("Resource URL validation errors found:")
        for error in all_errors:
            if 'error' in error:
                print(f" {error['file']}: {error['error']}")
            else:
                print(f" {error['file']}:{error['line']}: {error['status']}")
                print(f" URL: {error['url']}")
        return 1

    print(f"Validated {validated_count} changed resource file(s) - all URLs returned 200 status code")
    return 0


if __name__ == '__main__':
    sys.exit(main())
11 changes: 10 additions & 1 deletion .circleci/validate_changed_studies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ VALIDATION_SCRIPT="$HOME/cbioportal-core/scripts/importer/validateStudies.py"
GIT_REMOTE_URL="git@github.com:cbioportal/datahub.git"
MAX_THREADS=7

git remote add upstream "$GIT_REMOTE_URL"
git remote get-url upstream || git remote add upstream "$GIT_REMOTE_URL"
git fetch upstream master

mkdir -p "$LOG_DIR"
Expand Down Expand Up @@ -113,4 +113,13 @@ if [[ $num_studies > 0 ]]; then
fi
else
echo "No studies were changed"
fi

# Run the URL validator for changed resource files; any non-zero exit from
# it makes this script exit with status 1.
echo $'\n\nValidating changed resource URLs...'
python3 "$REPO_DIR/.circleci/validate_changed_resource_urls.py"
RESOURCE_VALIDATION_STATUS=$?

[ "$RESOURCE_VALIDATION_STATUS" -eq 0 ] || exit 1
90 changes: 90 additions & 0 deletions .circleci/validate_resource_urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env python3
"""
Validate that all resource URLs in data_resource_*.txt files return HTTP 200 status code.
"""
import os
import sys
import requests
import csv
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parents[1]
TIMEOUT = 10

# Cell values (compared lower-cased, after stripping) that stand in for
# "no URL here" and are therefore never requested.
_PLACEHOLDER_URLS = frozenset({
    '[not applicable]',
    '[not available]',
    '[pending]',
    '[discrepancy]',
    '[completed]',
    '[null]',
    '',
    'na',
})


def check_url(url, timeout=TIMEOUT):
    """Check whether a URL is a placeholder or returns a 200 status code.

    Args:
        url: Candidate URL. Falsy values and placeholder markers are skipped.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        True if the value is skipped or the final response (redirects are
        followed) has status code 200; False for any other status code or
        when ``requests`` raises a ``RequestException``.
    """
    if not url:
        return True  # Skip empty/null values

    # The placeholder test and the request must see the same stripped value;
    # otherwise stray whitespace around a valid URL can make it fail.
    candidate = url.strip()
    if candidate.lower() in _PLACEHOLDER_URLS:
        return True  # Skip placeholder values

    try:
        # stream=True defers the body download; only the status is needed.
        # The context manager closes the connection on every exit path.
        with requests.get(candidate, timeout=timeout, allow_redirects=True, stream=True) as response:
            return response.status_code == 200
    except requests.RequestException:
        return False

def validate_resource_file(filepath):
    """Validate all URLs in a tab-separated resource file.

    Args:
        filepath: Path of the file to read (UTF-8, first row is the header).

    Returns:
        A list of error dicts. A URL that fails ``check_url`` yields the keys
        ``file``, ``line``, ``url`` and ``status``; a failure while reading
        the file yields ``file`` and ``error``. A file with no header or no
        ``URL`` column produces an empty list.
    """
    placeholders = {
        '[not applicable]', '[not available]', '[pending]', '[discrepancy]',
        '[completed]', '[null]', '', 'na',
    }
    errors = []

    try:
        # newline='' is what the csv module expects from its input file.
        with open(filepath, 'r', encoding='utf-8', newline='') as f:
            reader = csv.DictReader(f, delimiter='\t')

            if not reader.fieldnames or 'URL' not in reader.fieldnames:
                return errors

            for row in reader:
                # DictReader fills missing trailing cells with None. Without
                # the `or ''`, one short row raised AttributeError and the
                # remaining rows of the file were never checked.
                url = (row.get('URL') or '').strip()

                if url.lower() in placeholders:
                    continue

                if not check_url(url):
                    errors.append({
                        'file': filepath,
                        # Physical line just read, so blank lines that
                        # DictReader skips do not shift the reported number.
                        'line': reader.line_num,
                        'url': url,
                        'status': 'URL returned non-200 status code or is unreachable'
                    })
    except Exception as e:
        # Best-effort boundary: one unreadable file is reported as an error
        # entry rather than stopping the whole validation run.
        errors.append({
            'file': filepath,
            'error': f'Error reading file: {e}'
        })

    return errors

def main():
    """Scan the study trees for resource files and validate their URLs.

    Walks ``public`` and ``crdc/gdc`` under REPO_ROOT, passes every
    ``data_resource_*.txt`` file found to ``validate_resource_file``, prints
    a report, and returns the exit code: 1 if any error was collected,
    otherwise 0.
    """
    search_roots = (REPO_ROOT / 'public', REPO_ROOT / 'crdc' / 'gdc')
    failures = []

    for base in search_roots:
        # is_dir() is already False for a path that does not exist.
        if not base.is_dir():
            continue

        for dirpath, _subdirs, names in os.walk(str(base)):
            resource_names = [
                name for name in names
                if name.startswith('data_resource_') and name.endswith('.txt')
            ]
            for name in resource_names:
                failures += validate_resource_file(os.path.join(dirpath, name))

    # Nothing collected: report success and stop here.
    if not failures:
        print("All resource URLs validated successfully (returned 200 status code)")
        return 0

    print("Resource URL validation errors found:")
    for failure in failures:
        location = failure['file']
        if 'error' in failure:
            # File-level problem: there is no line or URL to show.
            print(f" {location}: {failure['error']}")
            continue
        print(f" {location}:{failure['line']}: {failure['status']}")
        print(f" URL: {failure['url']}")
    return 1


if __name__ == '__main__':
    sys.exit(main())