Skip to content

Commit 56c8b82

Browse files
committed
Fix subsampling with formula-backed IDs
1 parent ddc57fc commit 56c8b82

3 files changed

Lines changed: 46 additions & 2 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Preserve computed structural dataset variables when subsampling simulations.

policyengine_core/simulations/simulation.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1753,8 +1753,9 @@ def subsample(
17531753
if time_period is None:
17541754
time_period = self.default_calculation_period
17551755

1756-
# Convert simulation inputs to DataFrame
1757-
df = self.to_input_dataframe()
1756+
# Subsampling rebuilds the complete dataset, so preserve computed
1757+
# structural variables such as formula-backed IDs.
1758+
df = self.to_input_dataframe(include_computed_variables=True)
17581759

17591760
# Extract time period from DataFrame columns
17601761
df_time_period = (

tests/core/test_subsample_invalidates_cache.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,24 @@
1818
import pandas as pd
1919

2020
from policyengine_core.country_template import Microsimulation
21+
from policyengine_core.country_template import Simulation as CountryTemplateSimulation
22+
from policyengine_core.country_template.entities import Person
2123
from policyengine_core.data import Dataset
24+
from policyengine_core.model_api import Variable
25+
from policyengine_core.periods import YEAR
2226
from policyengine_core.periods import period as make_period
2327

2428

29+
class person_id(Variable):
    # Test-only variable: a person-level integer ID that is produced by a
    # formula rather than supplied as a dataset input, used by the subsample
    # regression test in this module.
    label = "Formula-backed person ID for subsample regression tests."
    entity = Person
    value_type = int
    definition_period = YEAR

    def formula(person, period):
        # One sequential ID per person: 0, 1, ..., person.count - 1.
        population_size = person.count
        return np.arange(population_size)
37+
38+
2539
def _build_mini_dataset() -> Dataset:
2640
"""Build a 5-household / 10-person in-memory dataset for subsample tests."""
2741
df = pd.DataFrame(
@@ -40,6 +54,34 @@ def _build_mini_dataset() -> Dataset:
4054
return Dataset.from_dataframe(df, "2022")
4155

4256

57+
def _build_formula_backed_id_simulation(
    isolated_tax_benefit_system,
) -> CountryTemplateSimulation:
    """Return a country-template simulation whose ``person_id`` is formula-backed.

    Calls ``replace_variable`` on the given tax-benefit system to install this
    module's ``person_id`` class, then builds the simulation on top of the
    mini dataset from ``_build_mini_dataset``.
    """
    system = isolated_tax_benefit_system
    system.replace_variable(person_id)

    mini_dataset = _build_mini_dataset()
    simulation = CountryTemplateSimulation(
        tax_benefit_system=system,
        dataset=mini_dataset,
    )
    return simulation
65+
66+
67+
def test_subsample_preserves_formula_backed_structural_ids(
    isolated_tax_benefit_system,
) -> None:
    """Check that ``subsample`` works when ``person_id`` comes from a formula.

    The default input-dataframe export is expected to leave the formula-backed
    ID column out, while the export with ``include_computed_variables=True``
    is expected to contain it. Subsampling down to one household must then
    leave one household and two persons in the simulation.
    """
    id_column = "person_id__2022"
    sim = _build_formula_backed_id_simulation(isolated_tax_benefit_system)

    default_export = sim.to_input_dataframe()
    computed_export = sim.to_input_dataframe(include_computed_variables=True)

    # The formula-backed ID only appears when computed variables are requested.
    assert id_column not in default_export.columns
    assert id_column in computed_export.columns

    sim.subsample(n=1, seed="formula-backed-person-id")

    household_count = sim.populations["household"].count
    person_count = sim.persons.count
    assert household_count == 1
    assert person_count == 2
83+
84+
4385
def test_subsample_clears_stale_fast_cache_entries() -> None:
4486
"""A pre-subsample entry in ``_fast_cache`` must not survive subsample.
4587

0 commit comments

Comments
 (0)