Skip to content

Commit bbc9d77

Browse files
avoid captcha issue with different version determination
1 parent 69330b6 commit bbc9d77

1 file changed

Lines changed: 29 additions & 54 deletions

File tree

parsers/CTD/src/loadCTD.py

Lines changed: 29 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44
import tarfile, gzip
55
import requests
66

7+
from datetime import datetime
78
from io import TextIOWrapper
8-
from bs4 import BeautifulSoup
99
from operator import itemgetter
10-
from orion.utils import GetData, GetDataPullError
10+
11+
from orion.utils import GetData
1112
from orion.loader_interface import SourceDataLoader, SourceDataFailedError
1213
from orion.kgxmodel import kgxnode, kgxedge
1314
from orion.prefixes import CTD, NCBITAXON, MESH
@@ -75,61 +76,35 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
7576
self.final_record_counter: int = 0
7677
self.final_skipped_record_counter: int = 0
7778

79+
# !!! !!! README !!! !!!
80+
# CTD implemented a CAPTCHA (ALTCHA) on ctdbase.org, which broke dependable programmatic access for determining
81+
# the version they publish from their website https://ctdbase.org/about/dataStatus.go. Here is a workaround which
82+
# accesses a path that is not currently blocked by the CAPTCHA.
7883
def get_latest_source_version(self) -> str:
79-
"""
80-
gets the version of the data
84+
"""Return the CTD data release label used as ``source_version`` in the pipeline.
8185
82-
:return:
86+
Scrapes the HTML at https://ctdbase.org/reports/ (which is not currently behind the CAPTCHA)
87+
and returns the latest modification date among the files included in this ingest,
88+
formatted as ``Month_Year`` (e.g. ``March_2026``).
8389
"""
84-
try:
85-
# load the web page for CTD
86-
html_page: requests.Response = requests.get('https://ctdbase.org/about/dataStatus.go')
87-
html_page.raise_for_status()
88-
89-
# get the html into a parsable object
90-
soup: BeautifulSoup = BeautifulSoup(html_page.content, 'html.parser')
91-
92-
# quick check for e.g. a "Human verification" page or captcha content
93-
lower_body = soup.text.lower()
94-
if "verify you are a human" in lower_body or "captcha" in lower_body:
95-
self.logger.warning("CTD dataStatus is blocked from programmatic access due to human verification / "
96-
"captcha. Trying to use the downloads page for CTD version instead..")
97-
98-
resp = requests.get('https://ctdbase.org/downloads')
99-
resp.raise_for_status()
100-
101-
soup = BeautifulSoup(resp.text, "html.parser")
102-
103-
# compile a loose regex for the target phrase
104-
pattern = re.compile(r"ctd\s+data\s+release", re.I)
105-
for p in soup.find_all("p"):
106-
# use get_text to include text from child tags as well
107-
p_text = p.get_text(" ", strip=True)
108-
if pattern.search(p_text):
109-
a = p.find("a")
110-
if not a:
111-
# found the <p> but no <a> inside it
112-
raise Exception("CTD downloads page version retrieval did not work. "
113-
"CTD data release <p> found but could not be parsed successfully.")
114-
link_text = a.get_text(strip=True).rstrip(".") # remove trailing dot if present
115-
version = link_text.replace(" ", "_")
116-
return version
117-
# the search for the version using regex failed
118-
raise Exception("CTD downloads page version retrieval did not work. "
119-
"Possibly blocked due to CAPTCHA.")
120-
else:
121-
# dataStatus page has something like:
122-
# <h1 id="pgheading">Data Status: October 2025</h1>
123-
#
124-
# find the pgheading and extract a version from it
125-
version: BeautifulSoup.Tag = soup.find(id='pgheading')
126-
if version is not None:
127-
# save the value
128-
return version.text.split(':')[1].strip().replace(' ', '_')
129-
else:
130-
raise Exception("pgheading could not be found on dataStatus page.")
131-
except Exception as e:
132-
raise GetDataPullError(error_message=f'Unable to determine latest version for CTD: {e}')
90+
91+
# The /reports/ page is formatted like an Apache autoindex listing, so we can use this regex to extract each file's date
92+
# Apache autoindex row: <a href="FILE">label</a> DD-Mon-YYYY HH:MM SIZE
93+
reports_regex = re.compile(r'<a href="([^"]+)">[^<]+</a>\s+(\d{2}-[A-Za-z]{3}-\d{4} \d{2}:\d{2})')
94+
95+
# parse the /reports/ page and find all the dates associated with the relevant files
96+
response = requests.get("https://ctdbase.org/reports/")
97+
response.raise_for_status()
98+
dates = [
99+
datetime.strptime(date_str, "%d-%b-%Y %H:%M")
100+
for href, date_str in reports_regex.findall(response.text)
101+
if href in self.ctd_data_files
102+
]
103+
if not dates:
104+
raise RuntimeError("Could not determine latest CTD version from https://ctdbase.org/reports/")
105+
106+
# return the most recent modification date, formatted as Month_Year, as the version
107+
return max(dates).strftime("%B_%Y")
133108

134109
def get_data(self):
135110
"""

0 commit comments

Comments
 (0)