Skip to content

Commit 9c31db7

Browse files
committed
Merge branch 'codex/fix-private-default-dataset' of https://github.com/PolicyEngine/policyengine-uk into codex/merge-1549
2 parents 321ecae + 6b3aeae commit 9c31db7

5 files changed

Lines changed: 60 additions & 10 deletions

File tree

policyengine_uk/simulation.py

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
# Standard library imports
2+
import os
23
from typing import Dict, Optional, Union, Type, List
34

45
# Third-party imports
@@ -27,7 +28,7 @@
2728
from policyengine_uk.utils.dependencies import get_variable_dependencies
2829
from policyengine_uk.reforms import create_structural_reforms_from_parameters
2930

30-
from .tax_benefit_system import CountryTaxBenefitSystem
31+
from .tax_benefit_system import CountryTaxBenefitSystem, DEFAULT_DATASET_ENV_VAR
3132

3233
from microdf import MicroDataFrame
3334

@@ -37,6 +38,16 @@
3738
_url_dataset_cache: dict = {}
3839

3940

41+
def get_default_dataset_url() -> str:
42+
dataset_url = os.environ.get(DEFAULT_DATASET_ENV_VAR)
43+
if dataset_url:
44+
return dataset_url
45+
raise ValueError(
46+
"Simulation() requires an explicit dataset when no situation is provided. "
47+
f"Pass dataset=..., or set {DEFAULT_DATASET_ENV_VAR} to opt into a default dataset."
48+
)
49+
50+
4051
def _pre_encode_enum_columns(
4152
dataset: UKMultiYearDataset, tbs: "CountryTaxBenefitSystem"
4253
) -> None:
@@ -143,9 +154,7 @@ def __init__(
143154
elif isinstance(dataset, UKMultiYearDataset):
144155
self.build_from_multi_year_dataset(dataset)
145156
elif dataset is None:
146-
self.build_from_url(
147-
"hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5"
148-
)
157+
self.build_from_url(get_default_dataset_url())
149158
else:
150159
raise ValueError(f"Unsupported dataset type: {dataset.__class__}")
151160

policyengine_uk/tax_benefit_system.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@
3737

3838
# Module constants
3939
COUNTRY_DIR = Path(__file__).parent
40-
ENHANCED_FRS = "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5"
40+
DEFAULT_DATASET_ENV_VAR = "POLICYENGINE_UK_DEFAULT_DATASET"
4141

4242
# Cache for fully-processed parameter tree, so convert_to_fiscal_year_parameters
4343
# (22,538 param.update() calls) only runs once per process.

policyengine_uk/tests/conftest.py

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,28 @@
1+
import os
2+
3+
import pytest
4+
5+
DEFAULT_TEST_DATASET_URL = (
6+
"hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.3"
7+
)
8+
9+
if os.environ.get("HUGGING_FACE_TOKEN") and not os.environ.get(
10+
"POLICYENGINE_UK_DEFAULT_DATASET"
11+
):
12+
os.environ["POLICYENGINE_UK_DEFAULT_DATASET"] = DEFAULT_TEST_DATASET_URL
13+
14+
15+
def pytest_collection_modifyitems(config, items):
16+
has_default_dataset = bool(os.environ.get("POLICYENGINE_UK_DEFAULT_DATASET"))
17+
if has_default_dataset:
18+
return
19+
20+
skip_microsimulation = pytest.mark.skip(
21+
reason=(
22+
"Requires POLICYENGINE_UK_DEFAULT_DATASET or HUGGING_FACE_TOKEN "
23+
"for microsimulation dataset access"
24+
)
25+
)
26+
for item in items:
27+
if "microsimulation" in item.keywords:
28+
item.add_marker(skip_microsimulation)

policyengine_uk/tests/test_behavioral_responses.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,8 +16,11 @@
1616
from policyengine_uk import Microsimulation
1717
from policyengine_uk.model_api import Scenario
1818

19-
# Check if HF token is available for data-dependent tests
20-
HF_TOKEN_AVAILABLE = bool(os.environ.get("HUGGING_FACE_TOKEN"))
19+
# Check if a default dataset is available for data-dependent tests
20+
HF_TOKEN_AVAILABLE = bool(
21+
os.environ.get("HUGGING_FACE_TOKEN")
22+
or os.environ.get("POLICYENGINE_UK_DEFAULT_DATASET")
23+
)
2124
requires_hf_data = pytest.mark.skipif(
2225
not HF_TOKEN_AVAILABLE,
2326
reason="Requires HUGGING_FACE_TOKEN for private data access",

policyengine_uk/tests/test_deterministic_variables.py

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -155,9 +155,15 @@ def test_household_owns_tv_can_be_set_false(self):
155155

156156

157157
class TestDefaultDatasetUrl:
158-
"""Test that the default dataset URL points at the private HF repo."""
158+
"""Test explicit handling of the default dataset URL."""
159159

160-
def test_simulation_defaults_to_private_hf_repo(self, monkeypatch):
160+
def test_simulation_requires_explicit_default_dataset(self, monkeypatch):
161+
monkeypatch.delenv("POLICYENGINE_UK_DEFAULT_DATASET", raising=False)
162+
163+
with pytest.raises(ValueError, match="requires an explicit dataset"):
164+
Simulation()
165+
166+
def test_simulation_uses_opt_in_default_dataset(self, monkeypatch):
161167
captured = {}
162168

163169
class _StopDefaultDatasetLoad(Exception):
@@ -168,12 +174,16 @@ def fake_build_from_url(self, url):
168174
raise _StopDefaultDatasetLoad
169175

170176
monkeypatch.setattr(Simulation, "build_from_url", fake_build_from_url)
177+
monkeypatch.setenv(
178+
"POLICYENGINE_UK_DEFAULT_DATASET",
179+
"hf://policyengine/policyengine-uk-data/enhanced_frs_2022_23.h5",
180+
)
171181

172182
with pytest.raises(_StopDefaultDatasetLoad):
173183
Simulation()
174184

175185
assert captured["url"] == (
176-
"hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5"
186+
"hf://policyengine/policyengine-uk-data/enhanced_frs_2022_23.h5"
177187
)
178188

179189

0 commit comments

Comments
 (0)