Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 14 additions & 26 deletions policyengine_uk_data/datasets/frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -751,40 +751,28 @@ def determine_education_level(fted_val, typeed2_val, age_val):
paragraph_3 | paragraph_4 | paragraph_5
)

# Add random variables that, for now, are still defined in policyengine-uk.

RANDOM_VARIABLES = [
"would_evade_tv_licence_fee",
"would_claim_pc",
"would_claim_uc",
"would_claim_child_benefit",
"main_residential_property_purchased_is_first_home",
"household_owns_tv",
"is_higher_earner",
"attends_private_school",
]
# Add random seed variables for stochastic simulation.
# These replace the old approach of calculating the random variables directly:
# the seeds are generated once, during dataset creation, and stored.

generator = np.random.default_rng(seed=100)

for variable in RANDOM_VARIABLES:
value = sim.calculate(variable).values
entity = sim.tax_benefit_system.variables[variable].entity.key
if entity == "person":
pe_person[variable] = value
elif entity == "household":
pe_household[variable] = value
elif entity == "benunit":
pe_benunit[variable] = value
pe_person["person_random_seed"] = generator.random(len(pe_person))
pe_benunit["benunit_random_seed"] = generator.random(len(pe_benunit))
pe_household["household_random_seed"] = generator.random(len(pe_household))

# Add Tax-Free Childcare assumptions
# Use seeded generator for reproducibility

count_benunits = len(pe_benunit)

extended_would_claim = np.random.random(count_benunits) < 0.812
tfc_would_claim = np.random.random(count_benunits) < 0.586
universal_would_claim = np.random.random(count_benunits) < 0.563
targeted_would_claim = np.random.random(count_benunits) < 0.597
extended_would_claim = generator.random(count_benunits) < 0.812
tfc_would_claim = generator.random(count_benunits) < 0.586
universal_would_claim = generator.random(count_benunits) < 0.563
targeted_would_claim = generator.random(count_benunits) < 0.597

# Draw extended childcare hours from a normal distribution (mean 15.019, standard deviation 4.972)
extended_hours_values = np.random.normal(15.019, 4.972, count_benunits)
extended_hours_values = generator.normal(15.019, 4.972, count_benunits)
# Clip values to be between 0 and 30 hours
extended_hours_values = np.clip(extended_hours_values, 0, 30)

Expand Down
5 changes: 4 additions & 1 deletion policyengine_uk_data/datasets/imputations/capital_gains.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ def loss(blend_factor):

logging.info("Imputing capital gains among those with gains")

# Use seeded generator for reproducibility
generator = np.random.default_rng(seed=100)

for i in range(len(capital_gains)):
row = capital_gains.iloc[i]
spline = UnivariateSpline(
Expand All @@ -136,7 +139,7 @@ def loss(blend_factor):
upper = row.maximum_total_income
ti_in_range = (ti >= lower) * (ti < upper)
in_target_range = has_cg * ti_in_range > 0
quantiles = np.random.random(int(in_target_range.sum()))
quantiles = generator.random(int(in_target_range.sum()))
pred_capital_gains = spline(quantiles)
new_cg[in_target_range] = pred_capital_gains

Expand Down
4 changes: 3 additions & 1 deletion policyengine_uk_data/datasets/imputations/income.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@ def generate_spi_table(spi: pd.DataFrame):
LOWER = np.array([0, 16, 25, 35, 45, 55, 65, 75])
UPPER = np.array([16, 25, 35, 45, 55, 65, 75, 80])
age_range = spi.AGERANGE
spi["age"] = LOWER[age_range] + np.random.rand(len(spi)) * (
# Use seeded generator for reproducibility
generator = np.random.default_rng(seed=100)
spi["age"] = LOWER[age_range] + generator.random(len(spi)) * (
UPPER[age_range] - LOWER[age_range]
)

Expand Down
4 changes: 3 additions & 1 deletion policyengine_uk_data/datasets/spi.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,10 @@ def create_spi(
age_range = df.AGERANGE

# Randomly assign each record an age within its age range
# Use seeded generator for reproducibility

percent_along_age_range = np.random.rand(len(df))
generator = np.random.default_rng(seed=100)
percent_along_age_range = generator.random(len(df))
min_age = np.array([AGE_RANGES[age][0] for age in age_range])
max_age = np.array([AGE_RANGES[age][1] for age in age_range])
person["age"] = (
Expand Down
Loading