Skip to content

Commit 79977d9

Browse files
author
Alfred Nwolisa
authored
Feat: Add fallback mechanism to handle ssl certification error (#727)
* Feat: Improved fallback mechanism to handle ssl certification error issues in make_request function * Feat: Add support for SSL exception handling * file cleanup * updated tests & import
1 parent 7ea3d5e commit 79977d9

3 files changed

Lines changed: 144 additions & 74 deletions

File tree

.github/workflows/direct_download_urls_test_for_sources.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,8 +185,19 @@ jobs:
185185
os.makedirs(os.path.dirname(zip_path), exist_ok=True)
186186
187187
try:
188+
# First attempt with SSL verification
188189
zip_file_req = requests.get(url, params=params, headers=headers, allow_redirects=True)
189190
zip_file_req.raise_for_status()
191+
except requests.exceptions.SSLError as ssl_err:
192+
print(f"{base}: SSL verification failed. Retrying without verification.")
193+
try:
194+
zip_file_req = requests.get(url, params=params, headers=headers, allow_redirects=True, verify=False)
195+
zip_file_req.raise_for_status()
196+
print(f"Warning: SSL verification was disabled for {url}. This is a security risk.")
197+
except Exception as retry_e:
198+
raise Exception(
199+
f"{base}: Exception {retry_e} occurred when downloading the URL {url} with SSL verification disabled.\n"
200+
)
190201
except Exception as e:
191202
raise Exception(
192203
f"{base}: Exception {e} occurred when downloading the URL {url}.\n"

tools/helpers.py

Lines changed: 80 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,25 @@
1+
import datetime
12
import json
23
import os
3-
import datetime
4+
import uuid
5+
from urllib.parse import urlparse
6+
47
import gtfs_kit
5-
import requests
6-
from requests.exceptions import RequestException, HTTPError
78
import pandas as pd
9+
import requests
810
from pandas.errors import ParserError
11+
from requests.exceptions import RequestException, HTTPError
912
from unidecode import unidecode
10-
import uuid
13+
1114
from tools.constants import (
1215
STOP_LAT,
1316
STOP_LON,
1417
MDB_ARCHIVES_LATEST_URL_TEMPLATE,
1518
MDB_SOURCE_FILENAME,
1619
ZIP,
1720
FALLBACK_HEADERS,
18-
1921
)
20-
from urllib.parse import urlparse
22+
2123

2224
#########################
2325
# I/O FUNCTIONS
@@ -75,60 +77,78 @@ def to_csv(path, catalog, columns):
7577
catalog.to_csv(path, sep=",", index=False)
7678

7779

80+
def get_fallback_headers(url, original_headers=None):
81+
"""Generate browser-like fallback headers for a given URL"""
82+
return {
83+
**FALLBACK_HEADERS,
84+
**(original_headers or {}),
85+
"Referer": f"{urlparse(url).scheme}://{urlparse(url).netloc}/",
86+
"Host": urlparse(url).netloc
87+
}
88+
89+
7890
def download_dataset(url, authentication_type, api_key_parameter_name=None, api_key_parameter_value=None):
7991
"""
8092
Downloads a dataset from the given URL using specified authentication mechanisms.
8193
The method performs a request to the URL with API key passed as either a query
82-
parameter or a header, based on the chosen authentication type. If the download
83-
fails with certain 403 errors, a fallback request with alternative headers is attempted.
84-
It writes the dataset contents to a temporary file and returns the file path.
85-
86-
:param url: The dataset's source URL.
87-
:type url: str
88-
:param authentication_type: The type of authentication mechanism to use (e.g.,
89-
1 for parameter-based, 2 for header-based).
90-
:type authentication_type: int
91-
:param api_key_parameter_name: The name of the API key parameter/header. It is
92-
optional if the dataset is publicly accessible or no authentication is
93-
required.
94-
:type api_key_parameter_name: str, optional
95-
:param api_key_parameter_value: The value of the API key to authenticate the
96-
request. It is optional if no authentication is required.
97-
:type api_key_parameter_value: str, optional
98-
:return: The file path where the downloaded dataset is temporarily stored.
99-
:type return: str
100-
:raises RequestException: If all attempts to download the dataset fail.
94+
parameter or a header, based on the chosen authentication type. It implements
95+
adaptive fallback strategies for HTTP 403 errors and SSL certificate errors.
10196
"""
97+
file_path = os.path.join(os.getcwd(), str(uuid.uuid4()))
10298

103-
def make_request(url, params=None, headers=None):
99+
params = {api_key_parameter_name: api_key_parameter_value} if authentication_type == 1 else None
100+
headers = {api_key_parameter_name: api_key_parameter_value} if authentication_type == 2 else None
101+
102+
tried_options = set()
103+
current_headers = headers
104+
verify_ssl = True
105+
106+
for attempt in range(3):
104107
try:
105-
response = requests.get(url, params=params, headers=headers, allow_redirects=True)
108+
response = requests.get(
109+
url,
110+
params=params,
111+
headers=current_headers,
112+
allow_redirects=True,
113+
verify=verify_ssl
114+
)
106115
response.raise_for_status()
107-
return response.content
108-
except HTTPError as e:
109-
return None if e.response.status_code == 403 else RequestException(
110-
f"HTTP error {e} when accessing {url}. A fallback attempt with alternative headers will be made.")
111-
except RequestException as e:
112-
raise RequestException(f"Request failed: {e}")
113116

114-
file_path = os.path.join(os.getcwd(), str(uuid.uuid4()))
117+
if not verify_ssl:
118+
import warnings
119+
warnings.warn(
120+
f"SSL verification was disabled when downloading {url}."
121+
)
115122

116-
params = {api_key_parameter_name: api_key_parameter_value} if authentication_type == 1 else None
117-
headers = {api_key_parameter_name: api_key_parameter_value} if authentication_type == 2 else None
123+
with open(file_path, "wb") as f:
124+
f.write(response.content)
125+
return file_path
126+
127+
except requests.exceptions.HTTPError as e:
128+
if e.response.status_code == 403 and "fallback_headers" not in tried_options:
129+
current_headers = get_fallback_headers(url, headers)
130+
tried_options.add("fallback_headers")
131+
continue
118132

119-
zip_file = make_request(url, params, headers) or (
120-
make_request(url, params, {**FALLBACK_HEADERS, **(headers or {}),
121-
"Referer": f"{urlparse(url).scheme}://{urlparse(url).netloc}/",
122-
"Host": urlparse(url).netloc})
123-
)
133+
except requests.exceptions.SSLError:
134+
if "disable_ssl" not in tried_options:
135+
verify_ssl = False
136+
tried_options.add("disable_ssl")
137+
continue
124138

125-
if zip_file is None:
126-
raise RequestException(f"FAILURE! Retry attempts failed for {url}.")
139+
except requests.exceptions.RequestException:
140+
pass
127141

128-
with open(file_path, "wb") as f:
129-
f.write(zip_file)
142+
if "fallback_headers" not in tried_options:
143+
current_headers = get_fallback_headers(url, headers)
144+
tried_options.add("fallback_headers")
145+
elif "disable_ssl" not in tried_options:
146+
verify_ssl = False
147+
tried_options.add("disable_ssl")
148+
else:
149+
break
130150

131-
return file_path
151+
raise requests.exceptions.RequestException(f"FAILURE! All download attempts failed for {url}.")
132152

133153

134154
#########################
@@ -137,14 +157,14 @@ def make_request(url, params=None, headers=None):
137157

138158

139159
def are_overlapping_boxes(
140-
source_minimum_latitude,
141-
source_maximum_latitude,
142-
source_minimum_longitude,
143-
source_maximum_longitude,
144-
filter_minimum_latitude,
145-
filter_maximum_latitude,
146-
filter_minimum_longitude,
147-
filter_maximum_longitude,
160+
source_minimum_latitude,
161+
source_maximum_latitude,
162+
source_minimum_longitude,
163+
source_maximum_longitude,
164+
filter_minimum_latitude,
165+
filter_maximum_latitude,
166+
filter_minimum_longitude,
167+
filter_maximum_longitude,
148168
):
149169
"""
150170
Verifies if two boxes are overlapping in two dimensions.
@@ -178,7 +198,7 @@ def are_overlapping_boxes(
178198

179199

180200
def are_overlapping_edges(
181-
source_minimum, source_maximum, filter_minimum, filter_maximum
201+
source_minimum, source_maximum, filter_minimum, filter_maximum
182202
):
183203
"""
184204
Verifies if two edges are overlapping in one dimension.
@@ -238,7 +258,7 @@ def is_readable(file_path, load_func):
238258

239259

240260
def create_latest_url(
241-
country_code, subdivision_name, provider, data_type, mdb_source_id
261+
country_code, subdivision_name, provider, data_type, mdb_source_id
242262
):
243263
"""
244264
Creates the latest URL for an MDB Source.
@@ -270,7 +290,7 @@ def create_latest_url(
270290

271291

272292
def create_filename(
273-
country_code, subdivision_name, provider, data_type, mdb_source_id, extension
293+
country_code, subdivision_name, provider, data_type, mdb_source_id, extension
274294
):
275295
"""
276296
Creates the filename for an MDB Source.
@@ -401,9 +421,9 @@ def extract_gtfs_bounding_box(file_path):
401421

402422
stops_required_columns = {STOP_LAT, STOP_LON}
403423
stops_are_present = (
404-
stops is not None
405-
and stops_required_columns.issubset(stops.columns)
406-
and not (stops[STOP_LAT].dropna().empty or stops[STOP_LON].dropna().empty)
424+
stops is not None
425+
and stops_required_columns.issubset(stops.columns)
426+
and not (stops[STOP_LAT].dropna().empty or stops[STOP_LON].dropna().empty)
407427
)
408428

409429
minimum_latitude = stops[STOP_LAT].dropna().min() if stops_are_present else None

tools/tests/test_helpers.py

Lines changed: 53 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
from unittest import TestCase, skip
22
from unittest.mock import patch, Mock
3+
4+
import pandas as pd
5+
import requests
6+
from freezegun import freeze_time
7+
from requests.exceptions import HTTPError
8+
39
from tools.helpers import (
410
are_overlapping_edges,
511
are_overlapping_boxes,
@@ -18,9 +24,6 @@
1824
normalize,
1925
download_dataset,
2026
)
21-
import pandas as pd
22-
from freezegun import freeze_time
23-
from requests.exceptions import HTTPError
2427

2528

2629
class TestVerificationFunctions(TestCase):
@@ -396,7 +399,7 @@ def test_to_csv(self):
396399
@patch("tools.helpers.os")
397400
@patch("tools.helpers.requests.get")
398401
def test_download_dataset_auth_type_empty(
399-
self, mock_requests, mock_os, mock_uuid4, mock_open
402+
self, mock_requests, mock_os, mock_uuid4, mock_open
400403
):
401404
test_authentication_type = None
402405
test_api_key_parameter_name = None
@@ -422,7 +425,7 @@ def test_download_dataset_auth_type_empty(
422425
@patch("tools.helpers.os")
423426
@patch("tools.helpers.requests.get")
424427
def test_download_dataset_auth_type_0(
425-
self, mock_requests, mock_os, mock_uuid4, mock_open
428+
self, mock_requests, mock_os, mock_uuid4, mock_open
426429
):
427430
test_authentication_type = 0
428431
test_api_key_parameter_name = None
@@ -448,7 +451,7 @@ def test_download_dataset_auth_type_0(
448451
@patch("tools.helpers.os")
449452
@patch("tools.helpers.requests.get")
450453
def test_download_dataset_auth_type_1(
451-
self, mock_requests, mock_os, mock_uuid4, mock_open
454+
self, mock_requests, mock_os, mock_uuid4, mock_open
452455
):
453456
test_authentication_type = 1
454457
test_api_key_parameter_name = "some_name"
@@ -477,7 +480,7 @@ def test_download_dataset_auth_type_1(
477480
@patch("tools.helpers.os")
478481
@patch("tools.helpers.requests.get")
479482
def test_download_dataset_auth_type_2(
480-
self, mock_requests, mock_os, mock_uuid4, mock_open
483+
self, mock_requests, mock_os, mock_uuid4, mock_open
481484
):
482485
test_authentication_type = 2
483486
test_api_key_parameter_name = "some_name"
@@ -506,7 +509,7 @@ def test_download_dataset_auth_type_2(
506509
@patch("tools.helpers.os")
507510
@patch("tools.helpers.requests.get")
508511
def test_download_dataset_exception(
509-
self, mock_requests, mock_os, mock_uuid4, mock_open
512+
self, mock_requests, mock_os, mock_uuid4, mock_open
510513
):
511514
test_authentication_type = None
512515
test_api_key_parameter_name = None
@@ -527,7 +530,6 @@ def test_download_dataset_exception(
527530
@patch("tools.helpers.os")
528531
@patch("tools.helpers.requests.get")
529532
def test_download_dataset_403_fallback_success(self, mock_requests, mock_os, mock_uuid4, mock_open):
530-
531533
response_403 = Mock(status_code=403)
532534
response_403.raise_for_status.side_effect = HTTPError(response=response_403)
533535

@@ -537,7 +539,7 @@ def test_download_dataset_403_fallback_success(self, mock_requests, mock_os, moc
537539
mock_os.path.join.return_value = self.test_path
538540

539541
under_test = download_dataset(url=self.test_url, authentication_type=0, api_key_parameter_name=None,
540-
api_key_parameter_value=None, )
542+
api_key_parameter_value=None, )
541543

542544
self.assertEqual(under_test, self.test_path)
543545
self.assertEqual(mock_requests.call_count, 2)
@@ -555,16 +557,53 @@ def test_download_dataset_403_fallback_failure(self, mock_requests, mock_os, moc
555557
response_403_1.raise_for_status.side_effect = HTTPError(response=response_403_1)
556558
response_403_2 = Mock(status_code=403)
557559
response_403_2.raise_for_status.side_effect = HTTPError(response=response_403_2)
560+
response_403_3 = Mock(status_code=403)
561+
response_403_3.raise_for_status.side_effect = HTTPError(response=response_403_3)
558562

559-
mock_requests.side_effect = [response_403_1, response_403_2]
563+
mock_requests.side_effect = [response_403_1, response_403_2, response_403_3]
560564

561565
mock_os.path.join.return_value = self.test_path
562566
self.assertRaises(RequestException, download_dataset, url=self.test_url,
563-
authentication_type=test_authentication_type, api_key_parameter_name=test_api_key_parameter_name,
564-
api_key_parameter_value=test_api_key_parameter_value, )
567+
authentication_type=test_authentication_type,
568+
api_key_parameter_name=test_api_key_parameter_name,
569+
api_key_parameter_value=test_api_key_parameter_value)
565570

566-
self.assertEqual(mock_requests.call_count, 2)
571+
self.assertEqual(mock_requests.call_count, 3)
567572
mock_os.path.join.assert_called_once()
568573
mock_os.getcwd.assert_called_once()
569574
mock_uuid4.assert_called_once()
570575
mock_open.assert_not_called()
576+
577+
@patch("tools.helpers.open")
578+
@patch("tools.helpers.uuid.uuid4")
579+
@patch("tools.helpers.os")
580+
@patch("tools.helpers.requests.get")
581+
def test_download_dataset_ssl_error_fallback(self, mock_requests, mock_os, mock_uuid4, mock_open):
582+
test_authentication_type = 0
583+
test_api_key_parameter_name = None
584+
test_api_key_parameter_value = None
585+
586+
ssl_error = requests.exceptions.SSLError("SSL Certificate Verification Failed")
587+
588+
response_200 = Mock(status_code=200, content=b"file_content")
589+
590+
mock_requests.side_effect = [ssl_error, response_200]
591+
mock_os.path.join.return_value = self.test_path
592+
593+
under_test = download_dataset(
594+
url=self.test_url,
595+
authentication_type=test_authentication_type,
596+
api_key_parameter_name=test_api_key_parameter_name,
597+
api_key_parameter_value=test_api_key_parameter_value,
598+
)
599+
600+
self.assertEqual(under_test, self.test_path)
601+
self.assertEqual(mock_requests.call_count, 2)
602+
603+
self.assertTrue(mock_requests.call_args_list[0].kwargs["verify"])
604+
self.assertFalse(mock_requests.call_args_list[1].kwargs["verify"])
605+
606+
mock_os.path.join.assert_called_once()
607+
mock_os.getcwd.assert_called_once()
608+
mock_uuid4.assert_called_once()
609+
mock_open.assert_called_once()

0 commit comments

Comments
 (0)