Skip to content

Commit 959d47f

Browse files
authored
Use shared WEEKS_IN_YEAR and local RNG in consumption imputation (#353)
* Use shared WEEKS_IN_YEAR and local RNG in consumption imputation * Apply ruff format
1 parent 54ee554 commit 959d47f

4 files changed

Lines changed: 105 additions & 7 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Replace bare `* 52` weekly-to-annual conversion in LCFS imputation with the shared `WEEKS_IN_YEAR = 365.25 / 7` constant used by `datasets/frs.py`, and replace two `np.random.seed(42)` calls with local `np.random.default_rng(42)` so consumption imputation stops mutating the process-wide RNG state.

policyengine_uk_data/datasets/frs.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@
2828
from policyengine_uk_data.parameters import load_take_up_rate, load_parameter
2929

3030

31+
# Canonical weeks-per-year conversion factor for annualising weekly survey
32+
# variables. 365.25 / 7 ≈ 52.1786 accounts for leap years; using the rounded
33+
# integer 52 would under-count by ~0.34%. Exposed at module level so sibling
34+
# loaders (e.g. LCFS/ETB in `datasets/imputations/consumption.py`) can import
35+
# the same value rather than re-defining `* 52` locally and drifting.
36+
WEEKS_IN_YEAR = 365.25 / 7
37+
3138
LEGACY_JOBSEEKER_MIN_AGE = 18
3239
HOURS_WORKED_WEEKS_PER_YEAR = 52
3340
ESA_MIN_AGE = 16

policyengine_uk_data/datasets/imputations/consumption.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,16 @@
2525
from policyengine_uk_data.storage import STORAGE_FOLDER
2626
from policyengine_uk.data import UKSingleYearDataset
2727
from policyengine_uk import Microsimulation
28+
from policyengine_uk_data.datasets.frs import WEEKS_IN_YEAR
2829

2930
LCFS_TAB_FOLDER = STORAGE_FOLDER / "lcfs_2021_22"
3031

32+
# Default seed for the stochastic ICE-vehicle flag drawn from
33+
# `NTS_2024_ICE_VEHICLE_SHARE`. Kept at 42 for backward compatibility with
34+
# existing artefact fingerprints; callers can override via the fixture's
35+
# local RNG rather than the process-wide np.random global.
36+
_HAS_FUEL_SEED = 42
37+
3138
# EV/ICE vehicle mix from NTS 2024
3239
NTS_2024_ICE_VEHICLE_SHARE = 0.90
3340

@@ -406,9 +413,12 @@ def create_has_fuel_model():
406413

407414
num_vehicles = was["vcarnr7"].fillna(0).clip(lower=0)
408415
has_vehicle = num_vehicles > 0
409-
np.random.seed(42)
416+
# Use a local RNG so we don't mutate the global np.random state (which
417+
# would silently change any unrelated consumer of np.random that runs
418+
# after this function).
419+
rng = np.random.default_rng(_HAS_FUEL_SEED)
410420
has_fuel = (
411-
has_vehicle & (np.random.random(len(was)) < NTS_2024_ICE_VEHICLE_SHARE)
421+
has_vehicle & (rng.random(len(was)) < NTS_2024_ICE_VEHICLE_SHARE)
412422
).astype(float)
413423

414424
was_df = pd.DataFrame(
@@ -481,18 +491,21 @@ def generate_lcfs_table(lcfs_person: pd.DataFrame, lcfs_household: pd.DataFrame)
481491

482492
household = household.rename(columns=CONSUMPTION_VARIABLE_RENAMES)
483493

484-
# Annualise weekly LCFS values (× 52)
494+
# Annualise weekly LCFS values. Use the same WEEKS_IN_YEAR constant
495+
# (365.25 / 7 ≈ 52.1786) as `datasets/frs.py` rather than a bare `* 52`,
496+
# which underestimates annual totals by ~0.34% and skews VAT / energy
497+
# imputation targets against FRS income.
485498
annualise = list(CONSUMPTION_VARIABLE_RENAMES.values()) + [
486499
"hbai_household_net_income",
487500
"household_gross_income",
488501
"electricity_consumption",
489502
"gas_consumption",
490503
]
491504
for variable in annualise:
492-
household[variable] = household[variable] * 52
505+
household[variable] = household[variable] * WEEKS_IN_YEAR
493506
for variable in PERSON_LCF_RENAMES.values():
494507
household[variable] = (
495-
person[variable].groupby(person.case).sum()[household.case] * 52
508+
person[variable].groupby(person.case).sum()[household.case] * WEEKS_IN_YEAR
496509
)
497510
household.household_weight *= 1_000
498511

@@ -577,9 +590,10 @@ def impute_consumption(dataset: UKSingleYearDataset) -> UKSingleYearDataset:
577590
sim = Microsimulation(dataset=dataset)
578591
num_vehicles = sim.calculate("num_vehicles", map_to="household").values
579592

580-
np.random.seed(42)
593+
# Local RNG — see note at module level (_HAS_FUEL_SEED).
594+
rng = np.random.default_rng(_HAS_FUEL_SEED)
581595
has_vehicle = num_vehicles > 0
582-
is_ice = np.random.random(len(num_vehicles)) < NTS_2024_ICE_VEHICLE_SHARE
596+
is_ice = rng.random(len(num_vehicles)) < NTS_2024_ICE_VEHICLE_SHARE
583597
has_fuel_consumption = (has_vehicle & is_ice).astype(float)
584598
dataset.household["has_fuel_consumption"] = has_fuel_consumption
585599

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""Regression tests for `datasets/imputations/consumption.py`.
2+
3+
Covers the two sub-fixes in bug-hunt finding U6:
4+
5+
- Weekly → annual conversion used a bare ``* 52`` while the sibling FRS
6+
loader uses ``WEEKS_IN_YEAR = 365.25 / 7 ≈ 52.1786``. The ~0.34% gap
7+
skews VAT / energy imputation targets against FRS income.
8+
- Two ``np.random.seed(42)`` calls mutated the global RNG state, so any
9+
unrelated numpy random call after consumption imputation changed output.
10+
"""
11+
12+
from __future__ import annotations
13+
14+
import importlib.util
15+
16+
import numpy as np
17+
import pytest
18+
19+
if importlib.util.find_spec("policyengine_uk") is None:
20+
pytest.skip(
21+
"policyengine_uk not available in test environment",
22+
allow_module_level=True,
23+
)
24+
25+
26+
def test_consumption_imports_shared_weeks_in_year():
27+
"""consumption.py should use the same WEEKS_IN_YEAR as frs.py."""
28+
from policyengine_uk_data.datasets.frs import WEEKS_IN_YEAR as frs_value
29+
from policyengine_uk_data.datasets.imputations import (
30+
consumption as consumption_module,
31+
)
32+
33+
assert frs_value == 365.25 / 7
34+
# Imported at module scope, not re-computed locally with a different value.
35+
assert consumption_module.WEEKS_IN_YEAR == frs_value
36+
37+
38+
def test_consumption_module_no_longer_calls_np_random_seed():
39+
"""The fixed module must not reach into the process-wide RNG state.
40+
41+
The buggy version called `np.random.seed(42)` in two places, which
42+
mutated the global RNG and silently changed output for any later
43+
numpy random consumer in the same process.
44+
"""
45+
import inspect
46+
47+
from policyengine_uk_data.datasets.imputations import (
48+
consumption as consumption_module,
49+
)
50+
51+
src = inspect.getsource(consumption_module)
52+
assert "np.random.seed(" not in src, (
53+
"consumption.py still mutates the global RNG state via "
54+
"np.random.seed(...) — use np.random.default_rng(seed) instead."
55+
)
56+
57+
58+
def test_consumption_does_not_perturb_global_rng_on_import():
59+
"""Importing consumption.py must not change global RNG state.
60+
61+
Regression against the original bug where running module-level code
62+
paths (e.g. during test collection) could have reseeded the global RNG.
63+
"""
64+
np.random.seed(1234)
65+
before = np.random.random(3)
66+
67+
# Force reimport to exercise module-level side effects.
68+
import importlib
69+
70+
from policyengine_uk_data.datasets.imputations import consumption
71+
72+
importlib.reload(consumption)
73+
74+
np.random.seed(1234)
75+
after = np.random.random(3)
76+
np.testing.assert_allclose(before, after)

0 commit comments

Comments
 (0)