@@ -78,76 +78,78 @@ def to_csv(path, catalog, columns):
7878 catalog .to_csv (path , sep = "," , index = False )
7979
8080
def get_fallback_headers(url, original_headers=None):
    """Build the header set used when retrying a download of ``url``.

    The result is assembled in increasing order of precedence:

    1. the module-level ``FALLBACK_HEADERS`` defaults,
    2. ``original_headers`` (e.g. an API-key header), overriding the defaults,
    3. ``Referer`` and ``Host`` derived from ``url``, overriding both.

    :param url: The URL that is about to be requested.
    :type url: str
    :param original_headers: Headers of the original request to carry over.
        ``None`` or an empty mapping means there is nothing to carry over.
    :type original_headers: dict, optional
    :return: A new dict of headers; neither input mapping is modified.
    :rtype: dict
    """
    parsed = urlparse(url)  # parse once instead of once per derived header

    # ``netloc`` may carry a ``user:password@`` prefix. Strip it so that
    # credentials embedded in the URL are not leaked through ``Referer`` and
    # ``Host`` stays a plain ``host[:port]`` value.
    host = parsed.netloc.rpartition("@")[2]

    return {
        **FALLBACK_HEADERS,
        **(original_headers or {}),
        "Referer": f"{parsed.scheme}://{host}/",
        # NOTE(review): an explicitly pinned Host header appears to be kept on
        # redirects to a different host - confirm against the HTTP client used.
        "Host": host,
    }
89+
90+
def download_dataset(url, authentication_type, api_key_parameter_name=None, api_key_parameter_value=None):
    """
    Download a dataset from ``url`` into a uniquely named file in the current
    working directory and return that file's path.

    The API key is sent as a query parameter when ``authentication_type`` is 1
    and as a request header when it is 2; any other value sends no key.

    At most three requests are made. The first uses the plain request; each
    failure then enables at most one not-yet-used fallback before retrying:

    * an SSL error retries with certificate verification disabled (a warning
      is emitted *before* the insecure request is sent);
    * any other request failure (including HTTP 403) retries with the headers
      produced by ``get_fallback_headers``.

    Certificate verification is only ever disabled in response to an actual
    SSL error, never because of an HTTP status or a connection problem.

    :param url: The dataset's source URL.
    :type url: str
    :param authentication_type: 1 for query-parameter authentication, 2 for
        header authentication.
    :type authentication_type: int
    :param api_key_parameter_name: Name of the API key parameter/header.
    :type api_key_parameter_name: str, optional
    :param api_key_parameter_value: Value of the API key.
    :type api_key_parameter_value: str, optional
    :return: Path of the file the response body was written to.
    :rtype: str
    :raises requests.exceptions.RequestException: If every attempt fails. The
        last underlying error is attached as ``__cause__``.
    """
    import warnings

    file_path = os.path.join(os.getcwd(), str(uuid.uuid4()))

    params = {api_key_parameter_name: api_key_parameter_value} if authentication_type == 1 else None
    headers = {api_key_parameter_name: api_key_parameter_value} if authentication_type == 2 else None

    tried_options = set()
    current_headers = headers
    verify_ssl = True
    last_error = None

    # One plain attempt plus one attempt per fallback option.
    for _ in range(3):
        try:
            response = requests.get(
                url,
                params=params,
                headers=current_headers,
                allow_redirects=True,
                verify=verify_ssl,
            )
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # HTTPError and SSLError are both RequestException subclasses, so
            # a single handler can pick the fallback that matches the failure.
            last_error = e
            is_ssl_failure = isinstance(e, requests.exceptions.SSLError)

            if is_ssl_failure and "disable_ssl" not in tried_options:
                # Warn up front: the retry sends the request (and any API key)
                # over a connection whose certificate is not verified.
                warnings.warn(
                    f"SSL verification failed for {url}; "
                    "retrying with certificate verification disabled."
                )
                verify_ssl = False
                tried_options.add("disable_ssl")
            elif "fallback_headers" not in tried_options:
                current_headers = get_fallback_headers(url, headers)
                tried_options.add("fallback_headers")
            else:
                # No applicable fallback left for this kind of failure.
                break
            continue

        with open(file_path, "wb") as f:
            f.write(response.content)
        return file_path

    raise requests.exceptions.RequestException(
        f"FAILURE! All download attempts failed for {url}."
    ) from last_error
151153
152154
153155#########################
0 commit comments