8585]
8686
8787
class NonRetryableHTTPError(Exception):
    """Wrap an HTTP error that should not be retried.

    The class derives from ``Exception`` directly rather than from
    ``requests.exceptions.RequestException``, so a retry policy keyed on
    ``RequestException`` does not match it.

    Attributes:
        response: The ``response`` attribute of the wrapped exception, or
            ``None`` when the wrapped exception does not carry one.
    """

    def __init__(self, e):
        """Build the error from the exception ``e`` being wrapped.

        Args:
            e: The original exception. Its string form becomes this error's
                message and its ``response`` attribute (if any) is kept.
        """
        super().__init__(str(e))
        # Tolerate wrapped exceptions without a ``response`` attribute so
        # that constructing this error can never itself raise AttributeError.
        self.response = getattr(e, "response", None)
93+
94+
# NOTE(review): `retry` is presumably the decorator from the `retry` package
# (tries / initial delay in seconds / backoff multiplier) -- confirm against
# the file's imports, which are not visible here.
@retry(tries=3,
       delay=5,
       backoff=2,
       exceptions=requests.exceptions.RequestException)
def retry_method(url, filepath, headers=None):
    """Download ``url`` to ``filepath``, streaming the body to disk.

    The response body is written in 8 KiB chunks, so the whole payload is
    never held in memory. The file is opened in ``'wb'`` mode, so every
    attempt starts from an empty file.

    Args:
        url: The URL to fetch with an HTTP GET.
        filepath: Destination path; created or truncated on each attempt.
        headers: Optional mapping of HTTP request headers.

    Raises:
        NonRetryableHTTPError: The server answered with a 4xx status. This
            type is not a ``RequestException``, so the retry decorator does
            not match it.
        requests.exceptions.RequestException: Any other request failure
            (including 5xx statuses), propagated from ``requests``.
    """
    try:
        # timeout=(connect, read) in seconds, per the requests API.
        with requests.get(url, stream=True, headers=headers,
                          timeout=(30, 300)) as response:
            response.raise_for_status()
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    # Skip empty chunks (e.g. keep-alive) instead of
                    # issuing zero-length writes.
                    if chunk:
                        f.write(chunk)
    except requests.exceptions.HTTPError as e:
        if e.response is not None and 400 <= e.response.status_code < 500:
            # Client errors will not succeed on retry; re-raise as a type
            # the decorator does not match, keeping the original as cause.
            raise NonRetryableHTTPError(e) from e
        raise
96112
97113
98114def download_files ():
@@ -109,17 +125,17 @@ def download_files():
109125 filename = name_template .format (last_two_digits_formatted )
110126 url = url_template .format (year , last_two_digits_formatted )
111127 logging .info (f"downloading url: { url } " )
128+
129+ # Temporary path to save the zip file instead of keeping it in memory
130+ temp_zip_path = os .path .join (_LOCAL_OUTPUT_PATH , f"temp_{ filename } " )
112131 try :
113- response = retry_method (url )
114- zip_content_stream = io .BytesIO (response .content )
115- with zipfile .ZipFile (zip_content_stream , 'r' ) as zip_ref :
132+ retry_method (url , temp_zip_path )
133+ with zipfile .ZipFile (temp_zip_path , 'r' ) as zip_ref :
116134 for member in zip_ref .namelist ():
117135 if not member .endswith ('/' ) and member .lower ().endswith (
118136 '.txt' ):
119137 extract_path = os .path .join (
120- _LOCAL_OUTPUT_PATH ,
121- os .path .join (_LOCAL_OUTPUT_PATH ,
122- os .path .basename (member )))
138+ _LOCAL_OUTPUT_PATH , os .path .basename (member ))
123139 abs_extract_path = os .path .abspath (extract_path )
124140 abs_target_dir = os .path .abspath (_LOCAL_OUTPUT_PATH )
125141
@@ -136,22 +152,33 @@ def download_files():
136152 logging .info (
137153 f" Skipping non-txt file/folder in zip: '{ member } '"
138154 )
139- except (requests .exceptions .RequestException ,
140- zipfile .BadZipFile ) as e :
141- # Check if this is the latest year which might not be published yet (404)
142- is_404 = (isinstance (e , requests .exceptions .HTTPError ) and
143- e .response .status_code == 404 )
144- if year == latest_year and is_404 :
155+ except (requests .exceptions .RequestException , zipfile .BadZipFile ,
156+ NonRetryableHTTPError ) as e :
157+ status_code = None
158+ if isinstance (e , NonRetryableHTTPError ):
159+ status_code = e .response .status_code if e .response is not None else None
160+ elif hasattr (e , 'response' ) and e .response is not None :
161+ status_code = getattr (e .response , 'status_code' , None )
162+ if year == latest_year and (status_code == 404 or
163+ isinstance (e , zipfile .BadZipFile )):
145164 logging .warning (
146- f"Latest year { year } not yet available at { url } . Skipping."
165+ f"Latest year { year } data is invalid or not yet available at { url } . Skipping."
147166 )
148167 continue
149168 else :
150169 # For historical years or non-404 errors, we want the script to fail
151170 logging .error (
152- f"Critical failure: Could not download historical data for { year } at { url } ."
171+ f"Critical failure: Could not download historical data for { year } at { url } . Error: { e } "
153172 )
154173 raise e
174+ finally :
175+ if os .path .exists (temp_zip_path ):
176+ try :
177+ os .remove (temp_zip_path )
178+ except OSError as cleanup_error :
179+ logging .warning (
180+ f"Failed to delete temp file { temp_zip_path } : { cleanup_error } "
181+ )
155182
156183
157184def main (argv ):
0 commit comments