Skip to content

Commit 0ce59ea

Browse files
committed
Fix release manifest and ORG CI regressions
1 parent: 1bbfc33 · commit: 0ce59ea

4 files changed

Lines changed: 145 additions & 16 deletions

File tree

policyengine_us_data/datasets/org/org.py

Lines changed: 26 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -203,6 +203,30 @@ def _select_cps_basic_org_columns(month_df: pd.DataFrame) -> pd.DataFrame:
203203
return selected
204204

205205

206+
def _resolve_cps_basic_org_usecols(url: str) -> list[str]:
207+
"""Resolve the exact remote column names before reading the full CPS month.
208+
209+
Pandas' callable `usecols` path against remote CSVs can intermittently
210+
mis-handle the header row and return an empty selection. Resolving the
211+
concrete header first avoids that parser path while keeping the full read
212+
column-limited.
213+
"""
214+
header_df = pd.read_csv(url, nrows=0)
215+
column_lookup = {
216+
str(column).lower(): column
217+
for column in header_df.columns
218+
if isinstance(column, str)
219+
}
220+
missing = [
221+
column
222+
for column in CPS_BASIC_MONTHLY_ORG_COLUMNS
223+
if column.lower() not in column_lookup
224+
]
225+
if missing:
226+
raise ValueError(f"CPS basic ORG month is missing required columns: {missing}")
227+
return [column_lookup[column.lower()] for column in CPS_BASIC_MONTHLY_ORG_COLUMNS]
228+
229+
206230
def _load_cps_basic_org_month(
207231
year: int,
208232
month: str,
@@ -211,16 +235,14 @@ def _load_cps_basic_org_month(
211235
) -> pd.DataFrame:
212236
"""Load one CPS basic-month file with light retry around transient fetch/parser issues."""
213237
url = _cps_basic_org_month_url(year, month)
214-
required_columns = {column.lower() for column in CPS_BASIC_MONTHLY_ORG_COLUMNS}
215238
last_error: Exception | None = None
216239

217240
for _ in range(max_attempts):
218241
try:
242+
usecols = _resolve_cps_basic_org_usecols(url)
219243
month_df = pd.read_csv(
220244
url,
221-
usecols=lambda column: (
222-
isinstance(column, str) and column.lower() in required_columns
223-
),
245+
usecols=usecols,
224246
low_memory=False,
225247
)
226248
return _select_cps_basic_org_columns(month_df)

policyengine_us_data/utils/data_upload.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -380,6 +380,14 @@ def get_matching_finalized_release_manifest(
380380
],
381381
existing_manifest=finalized_manifest,
382382
)
383+
if "created_at" in finalized_manifest:
384+
candidate_manifest["created_at"] = finalized_manifest["created_at"]
385+
finalized_build = finalized_manifest.get("build")
386+
if isinstance(finalized_build, dict):
387+
candidate_build = candidate_manifest.setdefault("build", {})
388+
for field in ("build_id", "built_at"):
389+
if field in finalized_build:
390+
candidate_build[field] = finalized_build[field]
383391
if candidate_manifest != finalized_manifest:
384392
raise RuntimeError(
385393
f"Release {version} is already finalized on {hf_repo_name}. "

tests/unit/datasets/test_org.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -172,11 +172,14 @@ def test_load_cps_basic_org_month_retries_after_transient_parser_failure(
172172
"PEIO1COW": [1],
173173
}
174174
)
175+
header_df = pd.DataFrame(columns=month_df.columns)
175176

176177
def fake_read_csv(*args, **kwargs):
177178
calls.append(kwargs)
178179
if len(calls) == 1:
179-
raise ValueError("Usecols do not match columns")
180+
raise ValueError("Temporary header parse failure")
181+
if kwargs.get("nrows") == 0:
182+
return header_df
180183
return month_df
181184

182185
monkeypatch.setattr(
@@ -185,8 +188,10 @@ def fake_read_csv(*args, **kwargs):
185188

186189
loaded = _load_cps_basic_org_month(2024, "may", max_attempts=2)
187190

188-
assert len(calls) == 2
189-
assert callable(calls[0]["usecols"])
191+
assert len(calls) == 3
192+
assert calls[0]["nrows"] == 0
193+
assert calls[1]["nrows"] == 0
194+
assert calls[2]["usecols"] == month_df.columns.tolist()
190195
assert loaded.columns.tolist() == CPS_BASIC_MONTHLY_ORG_COLUMNS
191196

192197

tests/unit/test_release_manifest.py

Lines changed: 103 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,9 @@ def _sha256(content: bytes) -> str:
2828
return hashlib.sha256(content).hexdigest()
2929

3030

31-
EXPECTED_COMPATIBLE_MODEL_PACKAGES = [{"name": "policyengine-us", "version": "1.634.4"}]
31+
EXPECTED_COMPATIBLE_MODEL_PACKAGES = [
32+
{"name": "policyengine-us", "specifier": "==1.634.4"}
33+
]
3234

3335

3436
def _build_local_area_manifest(
@@ -64,6 +66,8 @@ def test_build_release_manifest_tracks_uploaded_artifacts(tmp_path):
6466
version="1.73.0",
6567
repo_id="policyengine/policyengine-us-data",
6668
model_package_version="1.634.4",
69+
model_package_git_sha="deadbeef",
70+
model_package_data_build_fingerprint="sha256:fingerprint",
6771
created_at="2026-04-10T12:00:00Z",
6872
)
6973

@@ -73,6 +77,16 @@ def test_build_release_manifest_tracks_uploaded_artifacts(tmp_path):
7377
}
7478
assert manifest["schema_version"] == RELEASE_MANIFEST_SCHEMA_VERSION
7579
assert manifest["compatible_model_packages"] == EXPECTED_COMPATIBLE_MODEL_PACKAGES
80+
assert manifest["build"] == {
81+
"build_id": "policyengine-us-data-1.73.0",
82+
"built_at": "2026-04-10T12:00:00Z",
83+
"built_with_model_package": {
84+
"name": "policyengine-us",
85+
"version": "1.634.4",
86+
"git_sha": "deadbeef",
87+
"data_build_fingerprint": "sha256:fingerprint",
88+
},
89+
}
7690
assert manifest["default_datasets"] == {"national": "enhanced_cps_2024"}
7791

7892
assert manifest["artifacts"]["enhanced_cps_2024"] == {
@@ -122,13 +136,25 @@ def test_build_release_manifest_merges_existing_release_same_version(tmp_path):
122136
version="1.73.0",
123137
repo_id="policyengine/policyengine-us-data",
124138
model_package_version="1.634.4",
139+
model_package_git_sha="deadbeef",
140+
model_package_data_build_fingerprint="sha256:fingerprint",
125141
existing_manifest=existing_manifest,
126142
created_at="2026-04-10T12:00:00Z",
127143
)
128144

129145
assert set(manifest["artifacts"]) == {"enhanced_cps_2024", "districts/NC-01"}
130146
assert manifest["default_datasets"] == {"national": "enhanced_cps_2024"}
131-
assert manifest["created_at"] == "2026-04-09T12:00:00Z"
147+
assert manifest["created_at"] == "2026-04-10T12:00:00Z"
148+
assert manifest["build"] == {
149+
"build_id": "policyengine-us-data-1.73.0",
150+
"built_at": "2026-04-10T12:00:00Z",
151+
"built_with_model_package": {
152+
"name": "policyengine-us",
153+
"version": "1.634.4",
154+
"git_sha": "deadbeef",
155+
"data_build_fingerprint": "sha256:fingerprint",
156+
},
157+
}
132158
assert manifest["artifacts"]["districts/NC-01"]["sha256"] == _sha256(district_bytes)
133159

134160

@@ -167,8 +193,12 @@ def test_upload_files_to_hf_adds_release_manifest_operations(tmp_path):
167193
return_value=None,
168194
),
169195
patch(
170-
"policyengine_us_data.utils.data_upload.metadata.version",
171-
return_value="1.634.4",
196+
"policyengine_us_data.utils.data_upload._get_model_package_build_metadata",
197+
return_value={
198+
"version": "1.634.4",
199+
"git_sha": "deadbeef",
200+
"data_build_fingerprint": "sha256:fingerprint",
201+
},
172202
),
173203
patch.dict(
174204
"policyengine_us_data.utils.data_upload.os.environ",
@@ -215,8 +245,12 @@ def test_upload_files_to_hf_does_not_tag_until_finalize(tmp_path):
215245
return_value=None,
216246
),
217247
patch(
218-
"policyengine_us_data.utils.data_upload.metadata.version",
219-
return_value="1.634.4",
248+
"policyengine_us_data.utils.data_upload._get_model_package_build_metadata",
249+
return_value={
250+
"version": "1.634.4",
251+
"git_sha": "deadbeef",
252+
"data_build_fingerprint": "sha256:fingerprint",
253+
},
220254
),
221255
patch.dict(
222256
"policyengine_us_data.utils.data_upload.os.environ",
@@ -250,6 +284,16 @@ def test_publish_release_manifest_to_hf_can_finalize_and_tag(tmp_path):
250284
"compatible_model_packages": EXPECTED_COMPATIBLE_MODEL_PACKAGES,
251285
"default_datasets": {"national": "enhanced_cps_2024"},
252286
"created_at": "2026-04-10T12:00:00Z",
287+
"build": {
288+
"build_id": "policyengine-us-data-1.73.0",
289+
"built_at": "2026-04-10T12:00:00Z",
290+
"built_with_model_package": {
291+
"name": "policyengine-us",
292+
"version": "1.634.4",
293+
"git_sha": "deadbeef",
294+
"data_build_fingerprint": "sha256:fingerprint",
295+
},
296+
},
253297
"artifacts": {
254298
"enhanced_cps_2024": {
255299
"kind": "microdata",
@@ -271,22 +315,36 @@ def test_publish_release_manifest_to_hf_can_finalize_and_tag(tmp_path):
271315
),
272316
),
273317
patch(
274-
"policyengine_us_data.utils.data_upload.metadata.version",
275-
return_value="1.634.4",
318+
"policyengine_us_data.utils.data_upload._get_model_package_build_metadata",
319+
return_value={
320+
"version": "1.634.4",
321+
"git_sha": "deadbeef",
322+
"data_build_fingerprint": "sha256:fingerprint",
323+
},
276324
),
277325
patch.dict(
278326
"policyengine_us_data.utils.data_upload.os.environ",
279327
{"HUGGING_FACE_TOKEN": "token"},
280328
clear=False,
281329
),
282330
):
283-
publish_release_manifest_to_hf(
331+
manifest = publish_release_manifest_to_hf(
284332
[(state_path, "states/AL.h5")],
285333
version="1.73.0",
286334
create_tag=True,
287335
)
288336

289337
mock_api.create_tag.assert_called_once()
338+
assert manifest["build"] == {
339+
"build_id": "policyengine-us-data-1.73.0",
340+
"built_at": "2026-04-10T12:00:00Z",
341+
"built_with_model_package": {
342+
"name": "policyengine-us",
343+
"version": "1.634.4",
344+
"git_sha": "deadbeef",
345+
"data_build_fingerprint": "sha256:fingerprint",
346+
},
347+
}
290348

291349

292350
def test_missing_release_prefixes_requires_full_local_area_bundle():
@@ -373,6 +431,16 @@ def test_publish_release_manifest_to_hf_rejects_finalized_release(tmp_path):
373431
"compatible_model_packages": EXPECTED_COMPATIBLE_MODEL_PACKAGES,
374432
"default_datasets": {"national": "enhanced_cps_2024"},
375433
"created_at": "2026-04-10T12:00:00Z",
434+
"build": {
435+
"build_id": "policyengine-us-data-1.73.0",
436+
"built_at": "2026-04-10T12:00:00Z",
437+
"built_with_model_package": {
438+
"name": "policyengine-us",
439+
"version": "1.634.4",
440+
"git_sha": "deadbeef",
441+
"data_build_fingerprint": "sha256:fingerprint",
442+
},
443+
},
376444
"artifacts": {
377445
"states/AL": {
378446
"kind": "microdata",
@@ -396,6 +464,14 @@ def test_publish_release_manifest_to_hf_rejects_finalized_release(tmp_path):
396464
"policyengine_us_data.utils.data_upload._get_model_package_version",
397465
return_value="1.634.4",
398466
),
467+
patch(
468+
"policyengine_us_data.utils.data_upload._get_model_package_build_metadata",
469+
return_value={
470+
"version": "1.634.4",
471+
"git_sha": "deadbeef",
472+
"data_build_fingerprint": "sha256:fingerprint",
473+
},
474+
),
399475
):
400476
manifest = publish_release_manifest_to_hf(
401477
[(state_path, "states/AL.h5")],
@@ -420,6 +496,16 @@ def test_publish_release_manifest_to_hf_rejects_mutating_finalized_release(tmp_p
420496
"compatible_model_packages": EXPECTED_COMPATIBLE_MODEL_PACKAGES,
421497
"default_datasets": {"national": "enhanced_cps_2024"},
422498
"created_at": "2026-04-10T12:00:00Z",
499+
"build": {
500+
"build_id": "policyengine-us-data-1.73.0",
501+
"built_at": "2026-04-10T12:00:00Z",
502+
"built_with_model_package": {
503+
"name": "policyengine-us",
504+
"version": "1.634.4",
505+
"git_sha": "deadbeef",
506+
"data_build_fingerprint": "sha256:fingerprint",
507+
},
508+
},
423509
"artifacts": {
424510
"states/AL": {
425511
"kind": "microdata",
@@ -443,6 +529,14 @@ def test_publish_release_manifest_to_hf_rejects_mutating_finalized_release(tmp_p
443529
"policyengine_us_data.utils.data_upload._get_model_package_version",
444530
return_value="1.634.4",
445531
),
532+
patch(
533+
"policyengine_us_data.utils.data_upload._get_model_package_build_metadata",
534+
return_value={
535+
"version": "1.634.4",
536+
"git_sha": "deadbeef",
537+
"data_build_fingerprint": "sha256:fingerprint",
538+
},
539+
),
446540
):
447541
try:
448542
publish_release_manifest_to_hf(

0 commit comments

Comments
 (0)