Skip to content

Commit 658ea37

Browse files
author
Alfred Nwolisa
committed
Feat: Add support for SSL exception handling
1 parent f1b8ad4 commit 658ea37

2 files changed

Lines changed: 71 additions & 58 deletions

File tree

.github/workflows/direct_download_urls_test_for_sources.yml

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -185,8 +185,19 @@ jobs:
185185
os.makedirs(os.path.dirname(zip_path), exist_ok=True)
186186
187187
try:
188+
# First attempt with SSL verification
188189
zip_file_req = requests.get(url, params=params, headers=headers, allow_redirects=True)
189190
zip_file_req.raise_for_status()
191+
except requests.exceptions.SSLError as ssl_err:
192+
print(f"{base}: SSL verification failed. Retrying without verification.")
193+
try:
194+
zip_file_req = requests.get(url, params=params, headers=headers, allow_redirects=True, verify=False)
195+
zip_file_req.raise_for_status()
196+
print(f"Warning: SSL verification was disabled for {url}. This is a security risk.")
197+
except Exception as retry_e:
198+
raise Exception(
199+
f"{base}: Exception {retry_e} occurred when downloading the URL {url} with SSL verification disabled.\n"
200+
)
190201
except Exception as e:
191202
raise Exception(
192203
f"{base}: Exception {e} occurred when downloading the URL {url}.\n"

tools/helpers.py

Lines changed: 60 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -78,76 +78,78 @@ def to_csv(path, catalog, columns):
7878
catalog.to_csv(path, sep=",", index=False)
7979

8080

81+
def get_fallback_headers(url, original_headers=None):
    """Generate browser-like fallback headers for a given URL.

    The result is built from the module-level ``FALLBACK_HEADERS`` (defined
    elsewhere in this file), overlaid with ``original_headers`` so that
    caller-supplied values (for example an API key header) win over the
    defaults, plus a ``Referer`` pointing at the root of the URL's own site.

    No ``Host`` header is set on purpose: ``requests`` derives it from the
    URL of each request, whereas an explicit value would be carried along
    unchanged when a redirect leads to a different host.

    :param url: The URL the headers are going to be sent to.
    :type url: str
    :param original_headers: Headers to preserve on top of the fallback
        defaults. ``None`` or an empty mapping means no extra headers.
    :type original_headers: dict, optional
    :return: A new dictionary of request headers; the inputs are not mutated.
    :rtype: dict
    """
    # Parse once instead of re-parsing the URL for every component.
    parsed_url = urlparse(url)
    return {
        **FALLBACK_HEADERS,
        **(original_headers or {}),
        "Referer": f"{parsed_url.scheme}://{parsed_url.netloc}/",
    }
89+
90+
8191
def download_dataset(url, authentication_type, api_key_parameter_name=None, api_key_parameter_value=None):
    """
    Downloads a dataset from the given URL using specified authentication mechanisms.

    The method performs a request to the URL with API key passed as either a query
    parameter or a header, based on the chosen authentication type. It implements
    adaptive fallback strategies for failed requests and SSL certificate errors:

    * A request that fails for any reason other than an SSL error (for example
      an HTTP 403) is retried once with the headers returned by
      ``get_fallback_headers``.
    * A request that fails with an SSL error is retried once with certificate
      verification disabled; a warning is emitted if that retry succeeds.

    Certificate verification is only ever disabled in response to an SSL error,
    never as a generic retry for unrelated failures.

    On success the response body is written to a uniquely named file in the
    current working directory.

    :param url: The dataset's source URL.
    :type url: str
    :param authentication_type: 1 sends the API key as a query parameter, 2 sends
        it as a request header; any other value sends no API key.
    :type authentication_type: int
    :param api_key_parameter_name: The name of the API key parameter/header.
    :type api_key_parameter_name: str, optional
    :param api_key_parameter_value: The value of the API key.
    :type api_key_parameter_value: str, optional
    :return: The file path where the downloaded dataset is stored.
    :rtype: str
    :raises requests.exceptions.RequestException: If all attempts to download the
        dataset fail. The last underlying error is chained as the cause.
    """
    import warnings

    file_path = os.path.join(os.getcwd(), str(uuid.uuid4()))

    params = {api_key_parameter_name: api_key_parameter_value} if authentication_type == 1 else None
    headers = {api_key_parameter_name: api_key_parameter_value} if authentication_type == 2 else None

    current_headers = headers
    verify_ssl = True
    fallback_headers_tried = False
    last_error = None

    # One plain attempt plus at most one retry per fallback strategy.
    for _ in range(3):
        try:
            response = requests.get(
                url,
                params=params,
                headers=current_headers,
                allow_redirects=True,
                verify=verify_ssl,
            )
            response.raise_for_status()
        except requests.exceptions.SSLError as e:
            # Must be handled before the generic RequestException clause below,
            # because SSLError is itself a RequestException.
            last_error = e
            if not verify_ssl:
                # Verification is already off, so nothing is left to relax.
                break
            verify_ssl = False
            continue
        except requests.exceptions.RequestException as e:
            # HTTP errors, timeouts, connection failures: these are unrelated
            # to certificates, so only the header fallback is attempted.
            last_error = e
            if fallback_headers_tried:
                break
            current_headers = get_fallback_headers(url, headers)
            fallback_headers_tried = True
            continue

        if not verify_ssl:
            warnings.warn(
                f"SSL verification was disabled when downloading {url}."
            )

        with open(file_path, "wb") as f:
            f.write(response.content)
        return file_path

    raise requests.exceptions.RequestException(
        f"FAILURE! All download attempts failed for {url}."
    ) from last_error
151153

152154

153155
#########################

0 commit comments

Comments
 (0)