55import pandas as pd
66import requests
77from sqlmodel import Session , create_engine
8+ from tenacity import (
9+ before_sleep_log ,
10+ retry ,
11+ retry_if_exception_type ,
12+ stop_after_attempt ,
13+ wait_exponential ,
14+ )
815
916from policyengine_us_data .storage import STORAGE_FOLDER
1017from policyengine_us_data .db .create_database_tables import (
1926logger = logging .getLogger (__name__ )
2027
2128ACF_DATA_YEAR = 2024
22- CASELOAD_PAGE_URL = "https://www.acf.hhs.gov/ofa/data/tanf-caseload-data-2024"
23- FINANCIAL_PAGE_URL = "https://www.acf.hhs.gov/ofa/data/tanf-financial-data-fy-2024"
24- CASELOAD_URL_PATTERN = re .compile (
25- r"https://acf\.gov/sites/default/files/documents/ofa/fy\d{4}_tanf_caseload\.xlsx"
26- )
27- FINANCIAL_URL_PATTERN = re .compile (
28- r"https://acf\.gov/sites/default/files/documents/ofa/fy-\d{4}-tanf-moe-financial-data\.xlsx"
# Timeout, in seconds, passed to every HTTP request made against ACF.
ACF_REQUEST_TIMEOUT = 60

# Direct links to the fiscal-year-stamped ACF workbooks.
#
# Earlier versions of this module discovered these links by scraping the
# HTML landing page (`acf.gov/ofa/data/tanf-...`). That page is
# intermittently unreachable on `acf.gov` and was the dominant cause of
# `make database` build failures (see #852). The workbook files under
# `acf.gov/sites/default/files/documents/ofa/` return 200 reliably, so they
# are requested directly and the landing page is skipped entirely.
#
# This mapping needs a manual update when:
#   - ACF publishes workbooks for a new fiscal year: add a new top-level key
#     and bump `ACF_DATA_YEAR` / `_validate_supported_year`;
#   - ACF renames an existing fiscal year's workbook on disk. A 404 from
#     `_acf_get` is the early signal for this: it is an authoritative file
#     rename rather than a transient outage, and the new path has to be
#     copied in by hand from the corresponding ACF page.
#
# Each URL is kept as a single literal so it can be grepped and pasted
# into a browser as-is.
TANF_WORKBOOK_URLS: dict[int, dict[str, str]] = {
    2024: {
        "caseload": "https://acf.gov/sites/default/files/documents/ofa/fy2024_tanf_caseload.xlsx",
        "financial": "https://acf.gov/sites/default/files/documents/ofa/fy-2024-tanf-moe-financial-data.xlsx",
    },
}
57+
58+
@retry(
    retry=retry_if_exception_type(
        (
            requests.exceptions.Timeout,
            requests.exceptions.ConnectionError,
            requests.exceptions.ChunkedEncodingError,
        )
    ),
    wait=wait_exponential(multiplier=2, min=5, max=60),
    stop=stop_after_attempt(5),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    reraise=True,
)
def _acf_get(session: requests.Session, url: str) -> requests.Response:
    """GET ``url`` through ``session`` and return the successful response.

    The call is wrapped in a tenacity retry policy: up to 5 attempts, with
    exponential backoff bounded between 5 and 60 seconds, and a WARNING
    logged before each sleep. Only timeouts, connection errors and chunked
    encoding errors trigger a retry.

    Args:
        session: The ``requests`` session used to issue the request.
        url: Absolute URL to fetch.

    Returns:
        The response, after ``raise_for_status()`` has accepted it.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status. This type is not in the retried set, so it propagates
            on the first occurrence.
        requests.exceptions.Timeout, requests.exceptions.ConnectionError,
        requests.exceptions.ChunkedEncodingError: Re-raised as-is
            (``reraise=True``) once the attempts are exhausted.
    """
    reply = session.get(url, timeout=ACF_REQUEST_TIMEOUT)
    reply.raise_for_status()
    return reply
3076
3177
3278def _validate_supported_year (year : int ) -> None :
@@ -37,9 +83,7 @@ def _validate_supported_year(year: int) -> None:
3783 )
3884
3985
40- def _download_acf_excel (
41- page_url : str , cache_file : str , url_pattern : re .Pattern
42- ) -> bytes :
86+ def _download_acf_excel (workbook_url : str , cache_file : str ) -> bytes :
4387 if is_cached (cache_file ):
4488 logger .info ("Using cached %s" , cache_file )
4589 return load_bytes (cache_file )
@@ -54,25 +98,16 @@ def _download_acf_excel(
5498 }
5599 )
56100
57- page_response = session .get (page_url , timeout = 30 )
58- page_response .raise_for_status ()
59- match = url_pattern .search (page_response .text )
60- if match is None :
61- raise ValueError (f"Could not find TANF workbook URL on { page_url } " )
62-
63- workbook_url = match .group (0 )
64- workbook_response = session .get (workbook_url , timeout = 60 )
65- workbook_response .raise_for_status ()
101+ workbook_response = _acf_get (session , workbook_url )
66102 save_bytes (cache_file , workbook_response .content )
67103 return workbook_response .content
68104
69105
def extract_tanf_caseload_data(year: int) -> pd.DataFrame:
    """Return the ``TFam`` sheet of the ACF TANF caseload workbook.

    The workbook bytes come from ``_download_acf_excel`` and are parsed
    in memory with ``header=3``.

    NOTE(review): ``year`` is only passed to ``_validate_supported_year``;
    the workbook URL and the cache file name are both keyed on
    ``ACF_DATA_YEAR``. Presumably the validator restricts ``year`` to that
    value -- confirm before adding further fiscal years.

    Args:
        year: Requested data year; checked by ``_validate_supported_year``.

    Returns:
        The ``TFam`` sheet as a DataFrame.
    """
    _validate_supported_year(year)

    caseload_url = TANF_WORKBOOK_URLS[ACF_DATA_YEAR]["caseload"]
    cache_name = f"tanf_caseload_{ACF_DATA_YEAR}.xlsx"
    raw_workbook = _download_acf_excel(caseload_url, cache_name)

    buffer = io.BytesIO(raw_workbook)
    return pd.read_excel(buffer, sheet_name="TFam", header=3)
78113
@@ -115,9 +150,8 @@ def extract_tanf_financial_data(
115150) -> tuple [pd .DataFrame , dict [str , pd .DataFrame ]]:
116151 _validate_supported_year (year )
117152 workbook = _download_acf_excel (
118- FINANCIAL_PAGE_URL ,
153+ TANF_WORKBOOK_URLS [ ACF_DATA_YEAR ][ "financial" ] ,
119154 f"tanf_financial_{ ACF_DATA_YEAR } .xlsx" ,
120- FINANCIAL_URL_PATTERN ,
121155 )
122156 xls = pd .ExcelFile (io .BytesIO (workbook ))
123157 national_df = pd .read_excel (
0 commit comments