diff --git a/changelog.d/fix-consumption-weeks-rng.fixed.md b/changelog.d/fix-consumption-weeks-rng.fixed.md new file mode 100644 index 000000000..06e3a1955 --- /dev/null +++ b/changelog.d/fix-consumption-weeks-rng.fixed.md @@ -0,0 +1 @@ +Replace bare `* 52` weekly-to-annual conversion in LCFS imputation with the shared `WEEKS_IN_YEAR = 365.25 / 7` constant used by `datasets/frs.py`, and replace two `np.random.seed(42)` calls with local `np.random.default_rng(42)` so consumption imputation stops mutating the process-wide RNG state. diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py index 368da0d44..fc6aeaf71 100644 --- a/policyengine_uk_data/datasets/frs.py +++ b/policyengine_uk_data/datasets/frs.py @@ -28,6 +28,13 @@ from policyengine_uk_data.parameters import load_take_up_rate, load_parameter +# Canonical weeks-per-year conversion factor for annualising weekly survey +# variables. 365.25 / 7 ≈ 52.1786 accounts for leap years; using the rounded +# integer 52 would under-count by ~0.34%. Exposed at module level so sibling +# loaders (e.g. LCFS/ETB in `datasets/imputations/consumption.py`) can import +# the same value rather than re-defining `* 52` locally and drifting. +WEEKS_IN_YEAR = 365.25 / 7 + LEGACY_JOBSEEKER_MIN_AGE = 18 HOURS_WORKED_WEEKS_PER_YEAR = 52 ESA_MIN_AGE = 16 diff --git a/policyengine_uk_data/datasets/imputations/consumption.py b/policyengine_uk_data/datasets/imputations/consumption.py index efbf2aff6..09bee2090 100644 --- a/policyengine_uk_data/datasets/imputations/consumption.py +++ b/policyengine_uk_data/datasets/imputations/consumption.py @@ -25,9 +25,16 @@ from policyengine_uk_data.storage import STORAGE_FOLDER from policyengine_uk.data import UKSingleYearDataset from policyengine_uk import Microsimulation +from policyengine_uk_data.datasets.frs import WEEKS_IN_YEAR LCFS_TAB_FOLDER = STORAGE_FOLDER / "lcfs_2021_22" +# Default seed for the stochastic ICE-vehicle flag drawn from +# `NTS_2024_ICE_VEHICLE_SHARE`. Kept at 42 for backward compatibility with +# existing artefact fingerprints; callers can override via the fixture's +# local RNG rather than the process-wide np.random global. +_HAS_FUEL_SEED = 42 + # EV/ICE vehicle mix from NTS 2024 NTS_2024_ICE_VEHICLE_SHARE = 0.90 @@ -406,9 +413,12 @@ def create_has_fuel_model(): num_vehicles = was["vcarnr7"].fillna(0).clip(lower=0) has_vehicle = num_vehicles > 0 - np.random.seed(42) + # Use a local RNG so we don't mutate the global np.random state (which + # would silently change any unrelated consumer of np.random that runs + # after this function). + rng = np.random.default_rng(_HAS_FUEL_SEED) has_fuel = ( - has_vehicle & (np.random.random(len(was)) < NTS_2024_ICE_VEHICLE_SHARE) + has_vehicle & (rng.random(len(was)) < NTS_2024_ICE_VEHICLE_SHARE) ).astype(float) was_df = pd.DataFrame( @@ -481,7 +491,10 @@ def generate_lcfs_table(lcfs_person: pd.DataFrame, lcfs_household: pd.DataFrame) household = household.rename(columns=CONSUMPTION_VARIABLE_RENAMES) - # Annualise weekly LCFS values (× 52) + # Annualise weekly LCFS values. Use the same WEEKS_IN_YEAR constant + # (365.25 / 7 ≈ 52.1786) as `datasets/frs.py` rather than a bare `* 52`, + # which underestimates annual totals by ~0.34% and skews VAT / energy + # imputation targets against FRS income. annualise = list(CONSUMPTION_VARIABLE_RENAMES.values()) + [ "hbai_household_net_income", "household_gross_income", @@ -489,10 +502,10 @@ def generate_lcfs_table(lcfs_person: pd.DataFrame, lcfs_household: pd.DataFrame) "gas_consumption", ] for variable in annualise: - household[variable] = household[variable] * 52 + household[variable] = household[variable] * WEEKS_IN_YEAR for variable in PERSON_LCF_RENAMES.values(): household[variable] = ( - person[variable].groupby(person.case).sum()[household.case] * 52 + person[variable].groupby(person.case).sum()[household.case] * WEEKS_IN_YEAR ) household.household_weight *= 1_000 @@ -577,9 +590,10 @@ def impute_consumption(dataset: UKSingleYearDataset) -> UKSingleYearDataset: sim = Microsimulation(dataset=dataset) num_vehicles = sim.calculate("num_vehicles", map_to="household").values - np.random.seed(42) + # Local RNG — see note at module level (_HAS_FUEL_SEED). + rng = np.random.default_rng(_HAS_FUEL_SEED) has_vehicle = num_vehicles > 0 - is_ice = np.random.random(len(num_vehicles)) < NTS_2024_ICE_VEHICLE_SHARE + is_ice = rng.random(len(num_vehicles)) < NTS_2024_ICE_VEHICLE_SHARE has_fuel_consumption = (has_vehicle & is_ice).astype(float) dataset.household["has_fuel_consumption"] = has_fuel_consumption diff --git a/policyengine_uk_data/tests/test_consumption_weeks_rng.py b/policyengine_uk_data/tests/test_consumption_weeks_rng.py new file mode 100644 index 000000000..b885785d8 --- /dev/null +++ b/policyengine_uk_data/tests/test_consumption_weeks_rng.py @@ -0,0 +1,76 @@ +"""Regression tests for `datasets/imputations/consumption.py`. + +Covers the two sub-fixes in bug-hunt finding U6: + +- Weekly → annual conversion used a bare ``* 52`` while the sibling FRS + loader uses ``WEEKS_IN_YEAR = 365.25 / 7 ≈ 52.1786``. The ~0.34% gap + skews VAT / energy imputation targets against FRS income. +- Two ``np.random.seed(42)`` calls mutated the global RNG state, so any + unrelated numpy random call after consumption imputation changed output. +""" + +from __future__ import annotations + +import importlib.util + +import numpy as np +import pytest + +if importlib.util.find_spec("policyengine_uk") is None: + pytest.skip( + "policyengine_uk not available in test environment", + allow_module_level=True, + ) + + +def test_consumption_imports_shared_weeks_in_year(): + """consumption.py should use the same WEEKS_IN_YEAR as frs.py.""" + from policyengine_uk_data.datasets.frs import WEEKS_IN_YEAR as frs_value + from policyengine_uk_data.datasets.imputations import ( + consumption as consumption_module, + ) + + assert frs_value == 365.25 / 7 + # Imported at module scope, not re-computed locally with a different value. + assert consumption_module.WEEKS_IN_YEAR == frs_value + + +def test_consumption_module_no_longer_calls_np_random_seed(): + """The fixed module must not reach into the process-wide RNG state. + + The buggy version called `np.random.seed(42)` in two places, which + mutated the global RNG and silently changed output for any later + numpy random consumer in the same process. + """ + import inspect + + from policyengine_uk_data.datasets.imputations import ( + consumption as consumption_module, + ) + + src = inspect.getsource(consumption_module) + assert "np.random.seed(" not in src, ( + "consumption.py still mutates the global RNG state via " + "np.random.seed(...) — use np.random.default_rng(seed) instead." + ) + + +def test_consumption_does_not_perturb_global_rng_on_import(): + """Importing consumption.py must not change global RNG state. + + Regression against the original bug where running module-level code + paths (e.g. during test collection) could have reseeded the global RNG. + """ + np.random.seed(1234) + before = np.random.random(3) + + # Force reimport to exercise module-level side effects. + import importlib + + from policyengine_uk_data.datasets.imputations import consumption + + importlib.reload(consumption) + + np.random.seed(1234) + after = np.random.random(3) + np.testing.assert_allclose(before, after)