PolicyEngine
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 7 additions & 0 deletions
diff --git a/policyengine_us_data/datasets/org/org.py b/policyengine_us_data/datasets/org/org.py
Lines changed: 79 additions & 45 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,10 @@
+## [1.78.2] - 2026-04-12
+
+### Fixed
+
+- Harden CPS basic ORG donor loading against transient fetch failures and concurrent cache builds.
+
+
 ## [1.78.1] - 2026-04-12
 
 ### Changed
 
@@ -6,11 +6,16 @@
 imputation onto CPS records.
 """
 
+from contextlib import contextmanager
 from functools import lru_cache
+from io import BytesIO
+from pathlib import Path
+import fcntl
 
 from microimpute.models.qrf import QRF
 import numpy as np
 import pandas as pd
+import requests
 
 from policyengine_us_data.storage import STORAGE_FOLDER
 
@@ -181,11 +186,13 @@ def _cps_basic_org_month_url(year: int, month: str) -> str:
     )
 
 
-def _select_cps_basic_org_columns(month_df: pd.DataFrame) -> pd.DataFrame:
-    """Normalize CPS basic-month columns onto the ORG schema."""
+def _resolve_cps_basic_org_column_names(
+    columns: pd.Index | list[str],
+) -> list[str]:
+    """Resolve CPS basic-month columns onto the expected ORG schema order."""
     column_lookup = {
-        str(column).lower(): column
-        for column in month_df.columns
+        str(column).lower(): str(column)
+        for column in columns
         if isinstance(column, str)
     }
     missing = [
@@ -196,35 +203,14 @@ def _select_cps_basic_org_columns(month_df: pd.DataFrame) -> pd.DataFrame:
     if missing:
         raise ValueError(f"CPS basic ORG month is missing required columns: {missing}")
 
-    selected = month_df[
-        [column_lookup[column.lower()] for column in CPS_BASIC_MONTHLY_ORG_COLUMNS]
-    ].copy()
-    selected.columns = CPS_BASIC_MONTHLY_ORG_COLUMNS
-    return selected
-
+    return [column_lookup[column.lower()] for column in CPS_BASIC_MONTHLY_ORG_COLUMNS]
 
-def _resolve_cps_basic_org_usecols(url: str) -> list[str]:
-    """Resolve the exact remote column names before reading the full CPS month.
 
-    Pandas' callable `usecols` path against remote CSVs can intermittently
-    mis-handle the header row and return an empty selection. Resolving the
-    concrete header first avoids that parser path while keeping the full read
-    column-limited.
-    """
-    header_df = pd.read_csv(url, nrows=0)
-    column_lookup = {
-        str(column).lower(): column
-        for column in header_df.columns
-        if isinstance(column, str)
-    }
-    missing = [
-        column
-        for column in CPS_BASIC_MONTHLY_ORG_COLUMNS
-        if column.lower() not in column_lookup
-    ]
-    if missing:
-        raise ValueError(f"CPS basic ORG month is missing required columns: {missing}")
-    return [column_lookup[column.lower()] for column in CPS_BASIC_MONTHLY_ORG_COLUMNS]
+def _select_cps_basic_org_columns(month_df: pd.DataFrame) -> pd.DataFrame:
+    """Normalize CPS basic-month columns onto the ORG schema."""
+    selected = month_df[_resolve_cps_basic_org_column_names(month_df.columns)].copy()
+    selected.columns = CPS_BASIC_MONTHLY_ORG_COLUMNS
+    return selected
 
 
 def _load_cps_basic_org_month(
@@ -239,10 +225,14 @@ def _load_cps_basic_org_month(
 
     for _ in range(max_attempts):
         try:
-            usecols = _resolve_cps_basic_org_usecols(url)
+            response = requests.get(url, timeout=60)
+            response.raise_for_status()
+            content = response.content
+            header = pd.read_csv(BytesIO(content), nrows=0)
+            selected_columns = _resolve_cps_basic_org_column_names(header.columns)
             month_df = pd.read_csv(
-                url,
-                usecols=usecols,
+                BytesIO(content),
+                usecols=selected_columns,
                 low_memory=False,
             )
             return _select_cps_basic_org_columns(month_df)
@@ -255,6 +245,36 @@ def _load_cps_basic_org_month(
     ) from last_error
 
 
+@contextmanager
+def _org_cache_build_lock(lock_path: Path):
+    lock_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(lock_path, "w") as lock_file:
+        fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
+        try:
+            yield
+        finally:
+            fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
+
+
+def _load_valid_cached_org_training_data(cache_path: Path) -> pd.DataFrame | None:
+    """Return the cached ORG training frame, or None when it must be rebuilt.
+
+    None is returned when the cache is missing, unreadable, truncated,
+    unparseable, empty, or lacks any column named in ORG_PREDICTORS,
+    ORG_QRF_IMPUTED_VARIABLES, or "sample_weight".
+    """
+    required_columns = {*ORG_PREDICTORS, *ORG_QRF_IMPUTED_VARIABLES, "sample_weight"}
+    try:
+        cached = pd.read_csv(cache_path)
+    except (OSError, EOFError, pd.errors.EmptyDataError, pd.errors.ParserError):
+        # OSError covers a missing file; EOFError is raised for a truncated
+        # gzip stream; ParserError for malformed CSV. All mean "rebuild".
+        return None
+    if cached.empty or not required_columns.issubset(cached.columns):
+        return None
+    return cached
+
+
 def _transform_cps_basic_org_month(month_df: pd.DataFrame) -> pd.DataFrame:
     """Convert one monthly CPS basic file into ORG donor rows.
 
@@ -473,17 +493,31 @@ def _predict_union_coverage_from_bls_tables(
 def load_org_training_data() -> pd.DataFrame:
     """Load ORG donor rows built from official CPS basic monthly files."""
     cache_path = STORAGE_FOLDER / ORG_FILENAME
-    if cache_path.exists():
-        return pd.read_csv(cache_path)
-
-    months = []
-    for month in ORG_MONTHS:
-        month_df = _load_cps_basic_org_month(ORG_YEAR, month)
-        months.append(_transform_cps_basic_org_month(month_df))
-
-    org = pd.concat(months, ignore_index=True)
-    org.to_csv(cache_path, index=False, compression="gzip")
-    return org
+    lock_path = cache_path.parent / f"{cache_path.name}.lock"
+    cached = _load_valid_cached_org_training_data(cache_path)
+    if cached is not None:
+        return cached
+
+    with _org_cache_build_lock(lock_path):
+        cached = _load_valid_cached_org_training_data(cache_path)
+        if cached is not None:
+            return cached
+        if cache_path.exists():
+            cache_path.unlink()
+
+        months = []
+        for month in ORG_MONTHS:
+            month_df = _load_cps_basic_org_month(ORG_YEAR, month)
+            months.append(_transform_cps_basic_org_month(month_df))
+
+        org = pd.concat(months, ignore_index=True)
+        temp_path = cache_path.parent / f"{cache_path.name}.tmp.gz"
+        org.to_csv(temp_path, index=False, compression="gzip")
+        temp_path.replace(cache_path)
+        cached = _load_valid_cached_org_training_data(cache_path)
+        if cached is None:
+            raise ValueError("Failed to build a valid cached ORG donor file")
+        return cached
 
 
 @lru_cache(maxsize=1)
 
@@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "policyengine_us_data"
-version = "1.78.1"
+version = "1.78.2"
 description = "A package to create representative microdata for the US."
 readme = "README.md"
 authors = [