Skip to content

Commit 7ac01eb

Browse files
MaxGhenisclaude
andauthored
Accept run_id in upload_to_staging_hf, unblock build-and-test (#794)
* Accept run_id in upload_to_staging_hf, scope staging prefix by run Main's build-and-test has been failing on every real-content push with `TypeError: upload_to_staging_hf() got an unexpected keyword argument 'run_id'`. The four callers (`storage/upload_completed_datasets.py`, `calibration/promote_local_h5s.py`, `modal_app/local_area.py`, `modal_app/pipeline.py`) all pass `run_id=`, but the function's signature was never updated. Existing tests mocked the function with `**kwargs`, which hid the divergence. Add `run_id: str = ""` to the signature and, when set, stage files under `staging/{run_id}/{rel_path}` — same convention the surrounding helpers (`_download_staged_dataset_artifacts`, `_promote_staging_to_hf`) already use. Fall back to bare `staging/{rel_path}` when `run_id` is empty so behavior is unchanged for the no-run_id call path. New unit tests in `tests/unit/utils/test_data_upload.py` exercise the real function (not a mock) with and without `run_id` and assert on the resulting `path_in_repo` values, so the next caller drift will surface in unit CI rather than on a 3-hour integration run. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * Rename changelog fragment to PR number Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 64873b1 commit 7ac01eb

3 files changed

Lines changed: 81 additions & 1 deletion

File tree

changelog.d/794.fixed.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
Add missing `run_id` parameter to `upload_to_staging_hf` in
2+
`policyengine_us_data.utils.data_upload`, matching the kwarg its
3+
callers have been passing. Files now land under
4+
`staging/{run_id}/{rel_path}` when `run_id` is set and under
5+
`staging/{rel_path}` otherwise, mirroring the prefix convention used
6+
elsewhere in the package. Unblocks the main `build-and-test` workflow,
7+
which had been failing with `TypeError: upload_to_staging_hf() got an
8+
unexpected keyword argument 'run_id'` on every real-content push.

policyengine_us_data/utils/data_upload.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -751,6 +751,7 @@ def upload_to_staging_hf(
751751
hf_repo_name: str = "policyengine/policyengine-us-data",
752752
hf_repo_type: str = "model",
753753
batch_size: int = 50,
754+
run_id: str = "",
754755
) -> int:
755756
"""
756757
Upload files to staging/ paths in HuggingFace.
@@ -762,12 +763,16 @@ def upload_to_staging_hf(
762763
hf_repo_name: HuggingFace repository name
763764
hf_repo_type: Repository type
764765
batch_size: Number of files per commit batch
766+
run_id: Optional per-run scope. When set, files land under
767+
``staging/{run_id}/{rel_path}`` so concurrent runs do not
768+
collide; otherwise they land under ``staging/{rel_path}``.
765769
766770
Returns:
767771
Number of files uploaded
768772
"""
769773
token = os.environ.get("HUGGING_FACE_TOKEN")
770774
api = HfApi()
775+
staging_prefix = f"staging/{run_id}" if run_id else "staging"
771776

772777
total_uploaded = 0
773778
for i in range(0, len(files_with_paths), batch_size):
@@ -780,7 +785,7 @@ def upload_to_staging_hf(
780785
continue
781786
operations.append(
782787
CommitOperationAdd(
783-
path_in_repo=f"staging/{rel_path}",
788+
path_in_repo=f"{staging_prefix}/{rel_path}",
784789
path_or_fileobj=str(local_path),
785790
)
786791
)
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from pathlib import Path
2+
3+
from policyengine_us_data.utils import data_upload
4+
5+
6+
class _FakeHfApi:
7+
def __init__(self):
8+
self.commits = []
9+
10+
11+
def _install_fake_hf(monkeypatch, tmp_path):
12+
fake = _FakeHfApi()
13+
monkeypatch.setattr(data_upload, "HfApi", lambda: fake)
14+
15+
captured_ops = []
16+
17+
def fake_commit(api, operations, repo_id, repo_type, token, commit_message):
18+
captured_ops.extend(operations)
19+
20+
monkeypatch.setattr(data_upload, "hf_create_commit_with_retry", fake_commit)
21+
return captured_ops
22+
23+
24+
def _make_files(tmp_path, rel_paths):
25+
files = []
26+
for rel in rel_paths:
27+
local = tmp_path / Path(rel).name
28+
local.write_text("stub")
29+
files.append((local, rel))
30+
return files
31+
32+
33+
def test_upload_to_staging_hf_accepts_run_id_kwarg(monkeypatch, tmp_path):
34+
captured_ops = _install_fake_hf(monkeypatch, tmp_path)
35+
files = _make_files(tmp_path, ["states/AL.h5"])
36+
37+
n = data_upload.upload_to_staging_hf(
38+
files,
39+
version="1.73.0",
40+
run_id="abc123",
41+
)
42+
43+
assert n == 1
44+
assert len(captured_ops) == 1
45+
46+
47+
def test_upload_to_staging_hf_run_id_scopes_staging_prefix(monkeypatch, tmp_path):
48+
captured_ops = _install_fake_hf(monkeypatch, tmp_path)
49+
files = _make_files(tmp_path, ["states/AL.h5", "states/CA.h5"])
50+
51+
data_upload.upload_to_staging_hf(files, version="1.73.0", run_id="abc123")
52+
53+
assert [op.path_in_repo for op in captured_ops] == [
54+
"staging/abc123/states/AL.h5",
55+
"staging/abc123/states/CA.h5",
56+
]
57+
58+
59+
def test_upload_to_staging_hf_without_run_id_uses_bare_staging_prefix(
60+
monkeypatch, tmp_path
61+
):
62+
captured_ops = _install_fake_hf(monkeypatch, tmp_path)
63+
files = _make_files(tmp_path, ["states/AL.h5"])
64+
65+
data_upload.upload_to_staging_hf(files, version="1.73.0")
66+
67+
assert [op.path_in_repo for op in captured_ops] == ["staging/states/AL.h5"]

0 commit comments

Comments
 (0)