1+ import datetime
12import json
23import os
3- import datetime
4+ import uuid
5+ from urllib .parse import urlparse
6+
47import gtfs_kit
5- import requests
6- from requests .exceptions import RequestException , HTTPError
78import pandas as pd
9+ import requests
810from pandas .errors import ParserError
11+ from requests .exceptions import RequestException , HTTPError
912from unidecode import unidecode
10- import uuid
13+
1114from tools .constants import (
1215 STOP_LAT ,
1316 STOP_LON ,
1417 MDB_ARCHIVES_LATEST_URL_TEMPLATE ,
1518 MDB_SOURCE_FILENAME ,
1619 ZIP ,
1720 FALLBACK_HEADERS ,
18-
1921)
20- from urllib . parse import urlparse
22+
2123
2224#########################
2325# I/O FUNCTIONS
@@ -75,60 +77,78 @@ def to_csv(path, catalog, columns):
7577 catalog .to_csv (path , sep = "," , index = False )
7678
7779
def get_fallback_headers(url, original_headers=None):
    """Generate browser-like fallback headers for a given URL.

    Merges the project-wide FALLBACK_HEADERS with any caller-supplied
    headers (caller-supplied keys override the defaults), then sets a
    Referer pointing at the URL's origin and a matching Host header.

    :param url: URL the fallback request will target.
    :type url: str
    :param original_headers: Headers from the original request to carry
        over, if any.
    :type original_headers: dict, optional
    :return: Header dict to use for the fallback request.
    :rtype: dict
    """
    parsed = urlparse(url)  # parse once instead of three times
    return {
        **FALLBACK_HEADERS,
        **(original_headers or {}),
        "Referer": f"{parsed.scheme}://{parsed.netloc}/",
        "Host": parsed.netloc,
    }
88+
89+
def download_dataset(url, authentication_type, api_key_parameter_name=None, api_key_parameter_value=None):
    """
    Downloads a dataset from the given URL using specified authentication mechanisms.
    The method performs a request to the URL with the API key passed as either a query
    parameter or a header, based on the chosen authentication type. It implements
    adaptive fallback strategies: an HTTP 403 triggers a retry with browser-like
    headers (see get_fallback_headers), and an SSL certificate error triggers a
    retry with certificate verification disabled (a warning is emitted when that
    insecure retry succeeds). Up to three attempts are made in total, and the
    response body is written to a uniquely-named file in the current working
    directory.

    :param url: The dataset's source URL.
    :type url: str
    :param authentication_type: The type of authentication mechanism to use
        (1 for parameter-based, 2 for header-based; any other value sends no
        credentials).
    :type authentication_type: int
    :param api_key_parameter_name: The name of the API key parameter/header.
        Optional if no authentication is required.
    :type api_key_parameter_name: str, optional
    :param api_key_parameter_value: The value of the API key. Optional if no
        authentication is required.
    :type api_key_parameter_value: str, optional
    :return: The file path where the downloaded dataset is temporarily stored.
    :rtype: str
    :raises requests.exceptions.RequestException: If all attempts to download
        the dataset fail; the last underlying error is chained as the cause.
    """
    import warnings  # hoisted out of the success branch; needed for the insecure-SSL notice

    # Download target: a UUID-named file in the current working directory.
    file_path = os.path.join(os.getcwd(), str(uuid.uuid4()))

    params = {api_key_parameter_name: api_key_parameter_value} if authentication_type == 1 else None
    headers = {api_key_parameter_name: api_key_parameter_value} if authentication_type == 2 else None

    tried_options = set()  # fallbacks already applied: "fallback_headers", "disable_ssl"
    current_headers = headers
    verify_ssl = True
    last_error = None  # kept so the final failure can chain its cause

    for _ in range(3):
        try:
            response = requests.get(
                url,
                params=params,
                headers=current_headers,
                allow_redirects=True,
                verify=verify_ssl,
            )
            response.raise_for_status()

            if not verify_ssl:
                # The download only succeeded without certificate checks;
                # surface that so the caller knows the fetch was insecure.
                warnings.warn(
                    f"SSL verification was disabled when downloading {url}."
                )

            with open(file_path, "wb") as f:
                f.write(response.content)
            return file_path

        except requests.exceptions.HTTPError as e:
            last_error = e
            # A 403 often means bot detection: retry with browser-like headers.
            if e.response.status_code == 403 and "fallback_headers" not in tried_options:
                current_headers = get_fallback_headers(url, headers)
                tried_options.add("fallback_headers")
                continue
            # Other HTTP errors fall through to the generic fallback ladder below.

        except requests.exceptions.SSLError as e:
            last_error = e
            if "disable_ssl" not in tried_options:
                verify_ssl = False
                tried_options.add("disable_ssl")
                continue

        except requests.exceptions.RequestException as e:
            # Connection errors, timeouts, etc.: try the generic fallbacks.
            last_error = e

        # Generic fallback ladder: apply whichever strategy has not been tried
        # yet, and give up once both have been exhausted.
        if "fallback_headers" not in tried_options:
            current_headers = get_fallback_headers(url, headers)
            tried_options.add("fallback_headers")
        elif "disable_ssl" not in tried_options:
            verify_ssl = False
            tried_options.add("disable_ssl")
        else:
            break

    raise requests.exceptions.RequestException(
        f"FAILURE! All download attempts failed for {url}."
    ) from last_error
132152
133153
134154#########################
@@ -137,14 +157,14 @@ def make_request(url, params=None, headers=None):
137157
138158
139159def are_overlapping_boxes (
140- source_minimum_latitude ,
141- source_maximum_latitude ,
142- source_minimum_longitude ,
143- source_maximum_longitude ,
144- filter_minimum_latitude ,
145- filter_maximum_latitude ,
146- filter_minimum_longitude ,
147- filter_maximum_longitude ,
160+ source_minimum_latitude ,
161+ source_maximum_latitude ,
162+ source_minimum_longitude ,
163+ source_maximum_longitude ,
164+ filter_minimum_latitude ,
165+ filter_maximum_latitude ,
166+ filter_minimum_longitude ,
167+ filter_maximum_longitude ,
148168):
149169 """
150170 Verifies if two boxes are overlapping in two dimensions.
@@ -178,7 +198,7 @@ def are_overlapping_boxes(
178198
179199
180200def are_overlapping_edges (
181- source_minimum , source_maximum , filter_minimum , filter_maximum
201+ source_minimum , source_maximum , filter_minimum , filter_maximum
182202):
183203 """
184204 Verifies if two edges are overlapping in one dimension.
@@ -238,7 +258,7 @@ def is_readable(file_path, load_func):
238258
239259
240260def create_latest_url (
241- country_code , subdivision_name , provider , data_type , mdb_source_id
261+ country_code , subdivision_name , provider , data_type , mdb_source_id
242262):
243263 """
244264 Creates the latest URL for an MDB Source.
@@ -270,7 +290,7 @@ def create_latest_url(
270290
271291
272292def create_filename (
273- country_code , subdivision_name , provider , data_type , mdb_source_id , extension
293+ country_code , subdivision_name , provider , data_type , mdb_source_id , extension
274294):
275295 """
276296 Creates the filename for an MDB Source.
@@ -401,9 +421,9 @@ def extract_gtfs_bounding_box(file_path):
401421
402422 stops_required_columns = {STOP_LAT , STOP_LON }
403423 stops_are_present = (
404- stops is not None
405- and stops_required_columns .issubset (stops .columns )
406- and not (stops [STOP_LAT ].dropna ().empty or stops [STOP_LON ].dropna ().empty )
424+ stops is not None
425+ and stops_required_columns .issubset (stops .columns )
426+ and not (stops [STOP_LAT ].dropna ().empty or stops [STOP_LON ].dropna ().empty )
407427 )
408428
409429 minimum_latitude = stops [STOP_LAT ].dropna ().min () if stops_are_present else None
0 commit comments