Skip to content

Commit deda9e1

Browse files
committed
Allow explicit local dataset paths
1 parent eb81739 commit deda9e1

2 files changed

Lines changed: 72 additions & 0 deletions

File tree

src/policyengine/provenance/manifest.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,23 @@ def resolve_dataset_reference(country_id: str, dataset: str) -> str:
438438
return artifact.uri
439439

440440

441+
def _existing_local_dataset_path(dataset: str) -> Optional[Path]:
442+
path = Path(dataset).expanduser()
443+
if not path.exists():
444+
return None
445+
446+
is_path_like = (
447+
path.is_absolute()
448+
or dataset.startswith(("~", "."))
449+
or os.sep in dataset
450+
or (os.altsep is not None and os.altsep in dataset)
451+
or path.suffix.lower() in {".h5", ".hdf5"}
452+
)
453+
if not is_path_like:
454+
return None
455+
return path.resolve()
456+
457+
441458
def resolve_managed_dataset_reference(
442459
country_id: str,
443460
dataset: Optional[str] = None,
@@ -472,6 +489,17 @@ def resolve_managed_dataset_reference(
472489
"bypass bundle enforcement."
473490
)
474491

492+
local_dataset_path = _existing_local_dataset_path(dataset)
493+
if local_dataset_path is not None:
494+
if allow_unmanaged:
495+
return str(local_dataset_path)
496+
raise ValueError(
497+
"Local dataset paths bypass the policyengine.py release bundle. "
498+
"Pass a manifest dataset name or omit `dataset` to use the certified "
499+
"default dataset. Set `allow_unmanaged=True` only if you intend to "
500+
"run against a local dataset outside the bundle."
501+
)
502+
475503
return resolve_dataset_reference(country_id, dataset)
476504

477505

tests/test_release_manifests.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,31 @@ def test__given_explicit_uri__then_managed_resolution_requires_opt_in(self):
166166
== dataset
167167
)
168168

169+
def test__given_local_dataset_path__then_managed_resolution_requires_opt_in(
    self,
    tmp_path,
):
    """A local dataset file is rejected unless ``allow_unmanaged=True``."""
    local_file = tmp_path / "local_2100.h5"
    # Only reference resolution is exercised, so the bytes need not be HDF5.
    local_file.write_bytes(b"not a real h5; resolution only")
    local_file_arg = str(local_file)

    # Without the opt-in, the call must fail with the bundle-bypass message.
    try:
        resolve_managed_dataset_reference("us", local_file_arg)
    except ValueError as error:
        expected_fragment = (
            "Local dataset paths bypass the policyengine.py release bundle"
        )
        assert expected_fragment in str(error)
    else:
        raise AssertionError("Expected local dataset path to be rejected")

    # With the opt-in, the resolved local path is returned as a string.
    opted_in_reference = resolve_managed_dataset_reference(
        "us",
        local_file_arg,
        allow_unmanaged=True,
    )
    assert opted_in_reference == str(local_file.resolve())
193+
169194
def test__given_versioned_dataset_url__then_logical_name_drops_version(self):
170195
dataset = "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0"
171196

@@ -633,6 +658,25 @@ def test__given_us_unmanaged_dataset_uri__then_source_is_not_rewritten(self):
633658
assert microsim.policyengine_bundle["runtime_dataset_uri"] == dataset
634659
assert microsim.policyengine_bundle["runtime_dataset_source"] == dataset
635660

661+
def test__given_us_unmanaged_local_dataset__then_source_is_local_path(
    self,
    tmp_path,
):
    """An opted-in local dataset is passed through as its resolved path."""
    local_file = tmp_path / "local_2100.h5"
    # Only the dataset-source plumbing is exercised; contents are irrelevant.
    local_file.write_bytes(b"not a real h5; source plumbing only")

    with patch("policyengine_us.Microsimulation") as mock_microsimulation:
        microsim = managed_us_microsimulation(
            dataset=str(local_file),
            allow_unmanaged=True,
        )

    expected_path = str(local_file.resolve())

    # The underlying Microsimulation receives the resolved local path.
    passed_kwargs = mock_microsimulation.call_args.kwargs
    assert passed_kwargs["dataset"] == expected_path

    # The bundle metadata records the file stem as the dataset name and the
    # resolved path as both URI and source.
    assert microsim.policyengine_bundle["runtime_dataset"] == "local_2100"
    for bundle_key in ("runtime_dataset_uri", "runtime_dataset_source"):
        assert microsim.policyengine_bundle[bundle_key] == expected_path
679+
636680
def test__given_uk_managed_dataset_name__then_resolves_within_bundle(self):
637681
with patch("policyengine_uk.Microsimulation") as mock_microsimulation:
638682
microsim = managed_uk_microsimulation(dataset="enhanced_frs_2023_24")

0 commit comments

Comments
 (0)