Skip to content

Commit 327588e

Browse files
authored
CensusCountyBusinessPatterns_fix_read_timout (#2060)
1 parent bb92f76 commit 327588e

1 file changed

Lines changed: 46 additions & 19 deletions

File tree

  • scripts/census_county_business_patterns

scripts/census_county_business_patterns/main.py

Lines changed: 46 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -85,14 +85,30 @@
8585
]
8686

8787

88+
class NonRetryableHTTPError(Exception):
89+
90+
def __init__(self, e):
91+
super().__init__(str(e))
92+
self.response = e.response
93+
94+
8895
@retry(tries=3,
8996
delay=5,
9097
backoff=2,
91-
exceptions=requests.exceptions.ConnectionError)
92-
def retry_method(url, headers=None):
93-
response = requests.get(url, stream=True, headers=headers, timeout=120)
94-
response.raise_for_status()
95-
return response
98+
exceptions=requests.exceptions.RequestException)
99+
def retry_method(url, filepath, headers=None):
100+
try:
101+
with requests.get(url, stream=True, headers=headers,
102+
timeout=(30, 300)) as response:
103+
response.raise_for_status()
104+
with open(filepath, 'wb') as f:
105+
for chunk in response.iter_content(chunk_size=8192):
106+
if chunk:
107+
f.write(chunk)
108+
except requests.exceptions.HTTPError as e:
109+
if e.response is not None and 400 <= e.response.status_code < 500:
110+
raise NonRetryableHTTPError(e)
111+
raise
96112

97113

98114
def download_files():
@@ -109,17 +125,17 @@ def download_files():
109125
filename = name_template.format(last_two_digits_formatted)
110126
url = url_template.format(year, last_two_digits_formatted)
111127
logging.info(f"downloading url: {url}")
128+
129+
# Temporary path to save the zip file instead of keeping it in memory
130+
temp_zip_path = os.path.join(_LOCAL_OUTPUT_PATH, f"temp_{filename}")
112131
try:
113-
response = retry_method(url)
114-
zip_content_stream = io.BytesIO(response.content)
115-
with zipfile.ZipFile(zip_content_stream, 'r') as zip_ref:
132+
retry_method(url, temp_zip_path)
133+
with zipfile.ZipFile(temp_zip_path, 'r') as zip_ref:
116134
for member in zip_ref.namelist():
117135
if not member.endswith('/') and member.lower().endswith(
118136
'.txt'):
119137
extract_path = os.path.join(
120-
_LOCAL_OUTPUT_PATH,
121-
os.path.join(_LOCAL_OUTPUT_PATH,
122-
os.path.basename(member)))
138+
_LOCAL_OUTPUT_PATH, os.path.basename(member))
123139
abs_extract_path = os.path.abspath(extract_path)
124140
abs_target_dir = os.path.abspath(_LOCAL_OUTPUT_PATH)
125141

@@ -136,22 +152,33 @@ def download_files():
136152
logging.info(
137153
f" Skipping non-txt file/folder in zip: '{member}'"
138154
)
139-
except (requests.exceptions.RequestException,
140-
zipfile.BadZipFile) as e:
141-
# Check if this is the latest year which might not be published yet (404)
142-
is_404 = (isinstance(e, requests.exceptions.HTTPError) and
143-
e.response.status_code == 404)
144-
if year == latest_year and is_404:
155+
except (requests.exceptions.RequestException, zipfile.BadZipFile,
156+
NonRetryableHTTPError) as e:
157+
status_code = None
158+
if isinstance(e, NonRetryableHTTPError):
159+
status_code = e.response.status_code if e.response is not None else None
160+
elif hasattr(e, 'response') and e.response is not None:
161+
status_code = getattr(e.response, 'status_code', None)
162+
if year == latest_year and (status_code == 404 or
163+
isinstance(e, zipfile.BadZipFile)):
145164
logging.warning(
146-
f"Latest year {year} not yet available at {url}. Skipping."
165+
f"Latest year {year} data is invalid or not yet available at {url}. Skipping."
147166
)
148167
continue
149168
else:
150169
# For historical years or non-404 errors, we want the script to fail
151170
logging.error(
152-
f"Critical failure: Could not download historical data for {year} at {url}."
171+
f"Critical failure: Could not download historical data for {year} at {url}. Error: {e}"
153172
)
154173
raise e
174+
finally:
175+
if os.path.exists(temp_zip_path):
176+
try:
177+
os.remove(temp_zip_path)
178+
except OSError as cleanup_error:
179+
logging.warning(
180+
f"Failed to delete temp file {temp_zip_path}: {cleanup_error}"
181+
)
155182

156183

157184
def main(argv):

0 commit comments

Comments
 (0)