|
4 | 4 | import tarfile, gzip |
5 | 5 | import requests |
6 | 6 |
|
| 7 | +from datetime import datetime |
7 | 8 | from io import TextIOWrapper |
8 | | -from bs4 import BeautifulSoup |
9 | 9 | from operator import itemgetter |
10 | | -from orion.utils import GetData, GetDataPullError |
| 10 | + |
| 11 | +from orion.utils import GetData |
11 | 12 | from orion.loader_interface import SourceDataLoader, SourceDataFailedError |
12 | 13 | from orion.kgxmodel import kgxnode, kgxedge |
13 | 14 | from orion.prefixes import CTD, NCBITAXON, MESH |
@@ -75,61 +76,35 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): |
75 | 76 | self.final_record_counter: int = 0 |
76 | 77 | self.final_skipped_record_counter: int = 0 |
77 | 78 |
|
| 79 | + # !!! !!! README !!! !!! |
| 80 | + # CTD implemented a CAPTCHA (ALTCHA) on ctdbase.org, which broke dependable programmatic access for determining |
| 81 | + # the version they publish from their website https://ctdbase.org/about/dataStatus.go. Here is a workaround which |
| 82 | + # accesses a path that is not currently blocked by the CAPTCHA. |
def get_latest_source_version(self) -> str:
    """Return the CTD data release label used as ``source_version`` in the pipeline.

    CTD put a CAPTCHA in front of https://ctdbase.org/about/dataStatus.go, so the
    release label is derived from the file listing at https://ctdbase.org/reports/
    instead (not behind the CAPTCHA at the time of writing). The listing is scanned
    for the files named in ``self.ctd_data_files`` and the most recent modification
    date among them is returned, formatted as ``Month_Year`` (e.g. ``March_2026``).

    :return: the most recent modification month of the ingested files, as ``Month_Year``
    :raises requests.RequestException: if the listing cannot be fetched
        (connection error, timeout, or HTTP error status)
    :raises RuntimeError: if no row for any of the ingested files is found in the listing
    """
    reports_url = "https://ctdbase.org/reports/"

    # The /reports/ page is formatted like an Apache autoindex; each row looks like
    #   <a href="FILE">label</a>  DD-Mon-YYYY HH:MM  SIZE
    # group 1 is the href (the file name), group 2 is the modification timestamp.
    reports_regex = re.compile(r'<a href="([^"]+)">[^<]+</a>\s+(\d{2}-[A-Za-z]{3}-\d{4} \d{2}:\d{2})')

    # Always pass a timeout: requests waits indefinitely by default, which would
    # stall the whole pipeline if ctdbase.org accepts the connection but never answers.
    response = requests.get(reports_url, timeout=60)
    response.raise_for_status()

    # NOTE: %b (parsing) and %B (formatting) follow the process locale; this relies on
    # English month names being in effect (true for the default "C" locale).
    latest_modified = max(
        (
            datetime.strptime(date_str, "%d-%b-%Y %H:%M")
            for href, date_str in reports_regex.findall(response.text)
            # only consider the files this ingest actually downloads
            if href in self.ctd_data_files
        ),
        default=None,
    )
    if latest_modified is None:
        # page layout changed, files were renamed, or a block page was served instead
        raise RuntimeError(f"Could not determine latest CTD version from {reports_url}")

    # the most recently updated file determines the version label
    return latest_modified.strftime("%B_%Y")
133 | 108 |
|
134 | 109 | def get_data(self): |
135 | 110 | """ |
|
0 commit comments