Skip to content

Commit b0814b9

Browse files
committed
Generate random seeds in dataset for reproducible stochastic simulations
This change moves random number generation from policyengine-uk into the dataset generation, following the pattern established in policyengine-us-data.

Changes:
- Add random seed generation in FRS dataset for 11 independent random decisions (4 person-level, 4 benunit-level, 3 household-level seeds)
- Update SPI dataset to use seeded generator for age assignment
- Update income imputation to use seeded generator for age assignment
- Update capital gains imputation to use seeded generator for quantile sampling
- Update childcare assumptions to use seeded generator

All random generation now uses `np.random.default_rng(seed=100)` for full reproducibility across dataset builds. Each seed corresponds to a specific independent random decision to avoid artificial correlations between unrelated stochastic processes.

Related: policyengine-uk PR (must be merged after this)
1 parent a655ff7 commit b0814b9

4 files changed

Lines changed: 40 additions & 40 deletions

File tree

policyengine_uk_data/datasets/frs.py

Lines changed: 30 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -751,48 +751,41 @@ def determine_education_level(fted_val, typeed2_val, age_val):
751751
paragraph_3 | paragraph_4 | paragraph_5
752752
)
753753

754-
# Add random variables which are for now in policyengine-uk.
755-
756-
RANDOM_VARIABLES = [
757-
"would_evade_tv_licence_fee",
758-
"would_claim_pc",
759-
"would_claim_uc",
760-
"would_claim_child_benefit",
761-
"main_residential_property_purchased_is_first_home",
762-
"household_owns_tv",
763-
"is_higher_earner",
764-
"attends_private_school",
765-
]
766-
767-
for variable in RANDOM_VARIABLES:
768-
value = sim.calculate(variable).values
769-
entity = sim.tax_benefit_system.variables[variable].entity.key
770-
if entity == "person":
771-
pe_person[variable] = value
772-
elif entity == "household":
773-
pe_household[variable] = value
774-
elif entity == "benunit":
775-
pe_benunit[variable] = value
776-
777-
# Add Tax-Free Childcare assumptions
778-
779-
count_benunits = len(pe_benunit)
780-
781-
extended_would_claim = np.random.random(count_benunits) < 0.812
782-
tfc_would_claim = np.random.random(count_benunits) < 0.586
783-
universal_would_claim = np.random.random(count_benunits) < 0.563
784-
targeted_would_claim = np.random.random(count_benunits) < 0.597
754+
# Add random seed variables for stochastic simulation
755+
# Each seed is for a specific independent random decision to avoid artificial correlations
756+
# Random seeds are generated once during dataset creation and stored
757+
758+
generator = np.random.default_rng(seed=100)
759+
760+
# Person-level seeds
761+
pe_person["is_disabled_for_benefits_seed"] = generator.random(len(pe_person))
762+
pe_person["marriage_allowance_take_up_seed"] = generator.random(len(pe_person))
763+
pe_person["is_higher_earner_seed"] = generator.random(len(pe_person))
764+
pe_person["attends_private_school_seed"] = generator.random(len(pe_person))
765+
766+
# Benefit unit-level seeds
767+
pe_benunit["child_benefit_take_up_seed"] = generator.random(len(pe_benunit))
768+
pe_benunit["child_benefit_opts_out_seed"] = generator.random(len(pe_benunit))
769+
pe_benunit["pension_credit_take_up_seed"] = generator.random(len(pe_benunit))
770+
pe_benunit["universal_credit_take_up_seed"] = generator.random(len(pe_benunit))
771+
772+
# Household-level seeds
773+
pe_household["first_home_purchase_seed"] = generator.random(len(pe_household))
774+
pe_household["household_owns_tv_seed"] = generator.random(len(pe_household))
775+
pe_household["tv_licence_evasion_seed"] = generator.random(len(pe_household))
776+
777+
# Add childcare take-up seeds
778+
# These will be used by the formulas in policyengine-uk with parameters
779+
pe_benunit["tax_free_childcare_take_up_seed"] = generator.random(len(pe_benunit))
780+
pe_benunit["extended_childcare_take_up_seed"] = generator.random(len(pe_benunit))
781+
pe_benunit["universal_childcare_take_up_seed"] = generator.random(len(pe_benunit))
782+
pe_benunit["targeted_childcare_take_up_seed"] = generator.random(len(pe_benunit))
785783

786784
# Generate extended childcare hours usage values with mean 15.019 and sd 4.972
787-
extended_hours_values = np.random.normal(15.019, 4.972, count_benunits)
785+
extended_hours_values = generator.normal(15.019, 4.972, len(pe_benunit))
788786
# Clip values to be between 0 and 30 hours
789787
extended_hours_values = np.clip(extended_hours_values, 0, 30)
790788

791-
pe_benunit["would_claim_extended_childcare"] = extended_would_claim
792-
pe_benunit["would_claim_tfc"] = tfc_would_claim
793-
pe_benunit["would_claim_universal_childcare"] = universal_would_claim
794-
pe_benunit["would_claim_targeted_childcare"] = targeted_would_claim
795-
796789
# Add the maximum extended childcare hours usage
797790
pe_benunit["maximum_extended_childcare_hours_usage"] = (
798791
extended_hours_values

policyengine_uk_data/datasets/imputations/capital_gains.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,9 @@ def loss(blend_factor):
125125

126126
logging.info("Imputing capital gains among those with gains")
127127

128+
# Use seeded generator for reproducibility
129+
generator = np.random.default_rng(seed=100)
130+
128131
for i in range(len(capital_gains)):
129132
row = capital_gains.iloc[i]
130133
spline = UnivariateSpline(
@@ -136,7 +139,7 @@ def loss(blend_factor):
136139
upper = row.maximum_total_income
137140
ti_in_range = (ti >= lower) * (ti < upper)
138141
in_target_range = has_cg * ti_in_range > 0
139-
quantiles = np.random.random(int(in_target_range.sum()))
142+
quantiles = generator.random(int(in_target_range.sum()))
140143
pred_capital_gains = spline(quantiles)
141144
new_cg[in_target_range] = pred_capital_gains
142145

policyengine_uk_data/datasets/imputations/income.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,9 @@ def generate_spi_table(spi: pd.DataFrame):
5151
LOWER = np.array([0, 16, 25, 35, 45, 55, 65, 75])
5252
UPPER = np.array([16, 25, 35, 45, 55, 65, 75, 80])
5353
age_range = spi.AGERANGE
54-
spi["age"] = LOWER[age_range] + np.random.rand(len(spi)) * (
54+
# Use seeded generator for reproducibility
55+
generator = np.random.default_rng(seed=100)
56+
spi["age"] = LOWER[age_range] + generator.random(len(spi)) * (
5557
UPPER[age_range] - LOWER[age_range]
5658
)
5759

policyengine_uk_data/datasets/spi.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,10 @@ def create_spi(
7373
age_range = df.AGERANGE
7474

7575
# Randomly assign ages in age ranges
76+
# Use seeded generator for reproducibility
7677

77-
percent_along_age_range = np.random.rand(len(df))
78+
generator = np.random.default_rng(seed=100)
79+
percent_along_age_range = generator.random(len(df))
7880
min_age = np.array([AGE_RANGES[age][0] for age in age_range])
7981
max_age = np.array([AGE_RANGES[age][1] for age in age_range])
8082
person["age"] = (

0 commit comments

Comments
 (0)