Skip to content

Commit 3c502c8

Browse files
Merge pull request #136 from PolicyEngine/nikhilwoodruff/issue135
Local employment income targets do not sum to national totals
2 parents 074f546 + 4223657 commit 3c502c8

6 files changed

Lines changed: 98 additions & 158 deletions

File tree

.github/workflows/pull_request.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,7 @@ jobs:
5656
- name: Save calibration log
5757
uses: actions/upload-artifact@v4
5858
with:
59-
name: training_log
59+
name: calibration_log
6060
path: calibration_log.csv
6161
- name: Run tests
6262
run: pytest

.github/workflows/push.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,7 +60,7 @@ jobs:
6060
- name: Save calibration log
6161
uses: actions/upload-artifact@v4
6262
with:
63-
name: training_log
63+
name: calibration_log
6464
path: calibration_log.csv
6565
- name: Run tests
6666
run: pytest

changelog_entry.yaml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
1+
- bump: patch
2+
changes:
3+
added:
4+
- Calibration improvements.

policyengine_uk_data/datasets/frs/local_areas/constituencies/calibrate.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@
2828
def calibrate(
2929
epochs: int = 128,
3030
excluded_training_targets=[],
31-
log_csv="training_log.csv",
31+
log_csv="calibration_log.csv",
3232
overwrite_efrs=True,
3333
):
3434
matrix_, y_, country_mask = create_constituency_target_matrix(

policyengine_uk_data/datasets/frs/local_areas/constituencies/loss.py

Lines changed: 46 additions & 77 deletions
Original file line number | Diff line number | Diff line change
@@ -34,11 +34,13 @@ def create_constituency_target_matrix(
3434
sim = Microsimulation(dataset=dataset, reform=reform)
3535
sim.default_calculation_period = time_period
3636

37+
national_incomes = pd.read_csv(STORAGE_FOLDER / "incomes_projection.csv")
38+
national_incomes = national_incomes[national_incomes.year == 2025]
39+
3740
matrix = pd.DataFrame()
3841
y = pd.DataFrame()
3942

4043
INCOME_VARIABLES = [
41-
"total_income",
4244
"self_employment_income",
4345
"employment_income",
4446
]
@@ -49,15 +51,31 @@ def create_constituency_target_matrix(
4951
matrix[f"hmrc/{income_variable}/amount"] = sim.map_result(
5052
income_values * in_spi_frame, "person", "household"
5153
)
52-
y[f"hmrc/{income_variable}/amount"] = incomes[
53-
f"{income_variable}_amount"
54-
].values
54+
local_targets = incomes[f"{income_variable}_amount"].values
55+
local_target_sum = local_targets.sum()
56+
national_target = national_incomes[
57+
(national_incomes.total_income_lower_bound == 12_570)
58+
& (national_incomes.total_income_upper_bound == np.inf)
59+
][income_variable + "_amount"].iloc[0]
60+
national_consistency_adjustment_factor = (
61+
national_target / local_target_sum
62+
)
63+
y[f"hmrc/{income_variable}/amount"] = (
64+
local_targets * national_consistency_adjustment_factor
65+
)
5566
matrix[f"hmrc/{income_variable}/count"] = sim.map_result(
5667
(income_values != 0) * in_spi_frame, "person", "household"
5768
)
58-
y[f"hmrc/{income_variable}/count"] = incomes[
59-
f"{income_variable}_count"
60-
].values
69+
local_targets = incomes[f"{income_variable}_count"].values
70+
local_target_sum = local_targets.sum()
71+
national_target = national_incomes[
72+
(national_incomes.total_income_lower_bound == 12_570)
73+
& (national_incomes.total_income_upper_bound == np.inf)
74+
][income_variable + "_count"].iloc[0]
75+
y[f"hmrc/{income_variable}/count"] = (
76+
incomes[f"{income_variable}_count"].values
77+
* national_consistency_adjustment_factor
78+
)
6179

6280
age = sim.calculate("age").values
6381
for lower_age in range(0, 80, 10):
@@ -82,60 +100,31 @@ def create_constituency_target_matrix(
82100
employment_incomes.employment_income_lower_bound.sort_values().unique()
83101
) + [np.inf]
84102

85-
employment_incomes_all = (
86-
employment_incomes.groupby("code")[
87-
["employment_income_count", "employment_income_amount"]
88-
]
89-
.sum()
90-
.reset_index()
91-
)
92-
93-
hmrc_all_count_target = incomes["employment_income_count"].values
94-
ons_all_count_target = employment_incomes_all[
95-
"employment_income_count"
96-
].values
97-
count_scaling_factors = hmrc_all_count_target / ons_all_count_target
98-
99-
hmrc_all_amount_target = incomes["employment_income_amount"].values
100-
ons_all_amount_target = employment_incomes_all[
101-
"employment_income_amount"
102-
].values
103-
amount_scaling_factors = hmrc_all_amount_target / ons_all_amount_target
104-
105103
for lower_bound, upper_bound in zip(bounds[:-1], bounds[1:]):
106104
if (
107105
lower_bound <= 15_000
108106
): # Skip some targets with very small sample sizes
109107
continue
110108
if upper_bound >= 200_000:
111109
continue
112-
count_target = (
113-
employment_incomes[
114-
(
115-
employment_incomes.employment_income_lower_bound
116-
== lower_bound
117-
)
118-
& (
119-
employment_incomes.employment_income_upper_bound
120-
== upper_bound
121-
)
122-
].employment_income_count.values
123-
* count_scaling_factors
124-
)
125110

126-
amount_target = (
127-
employment_incomes[
128-
(
129-
employment_incomes.employment_income_lower_bound
130-
== lower_bound
131-
)
132-
& (
133-
employment_incomes.employment_income_upper_bound
134-
== upper_bound
135-
)
136-
].employment_income_amount.values
137-
* amount_scaling_factors
138-
)
111+
national_data_row = national_incomes[
112+
national_incomes.total_income_lower_bound == lower_bound
113+
]["employment_income_amount"].iloc[0]
114+
115+
count_target = employment_incomes[
116+
(employment_incomes.employment_income_lower_bound == lower_bound)
117+
& (employment_incomes.employment_income_upper_bound == upper_bound)
118+
].employment_income_count.values
119+
120+
amount_target = employment_incomes[
121+
(employment_incomes.employment_income_lower_bound == lower_bound)
122+
& (employment_incomes.employment_income_upper_bound == upper_bound)
123+
].employment_income_amount.values
124+
125+
sum_of_local_area_values = amount_target.sum()
126+
127+
adjustment = national_data_row / sum_of_local_area_values
139128

140129
if count_target.mean() < 200:
141130
print(
@@ -159,7 +148,9 @@ def create_constituency_target_matrix(
159148
matrix[f"hmrc/employment_income/amount/{band_str}"] = sim.map_result(
160149
employment_income * in_bound, "person", "household"
161150
)
162-
y[f"hmrc/employment_income/amount/{band_str}"] = amount_target
151+
y[f"hmrc/employment_income/amount/{band_str}"] = (
152+
amount_target * adjustment
153+
)
163154

164155
if uprate:
165156
y = uprate_targets(y, time_period)
@@ -243,29 +234,7 @@ def uprate_targets(y: pd.DataFrame, target_year: int = 2025) -> pd.DataFrame:
243234
is_uprated_from_2020
244235
]
245236

246-
rel_change_21_final = (weights_final @ matrix_final) / (
247-
weights_21 @ matrix_21
248-
) - 1
249-
is_uprated_from_2021 = [
250-
col.startswith("hmrc/") for col in matrix_21.columns
251-
]
252-
uprating_from_2021 = np.zeros_like(matrix_21.columns, dtype=float)
253-
uprating_from_2021[is_uprated_from_2021] = rel_change_21_final[
254-
is_uprated_from_2021
255-
]
256-
257-
rel_change_23_final = (weights_final @ matrix_final) / (
258-
weights_23 @ matrix_23
259-
) - 1
260-
is_uprated_from_2023 = [
261-
col.startswith("hmrc/") for col in matrix_23.columns
262-
]
263-
uprating_from_2023 = np.zeros_like(matrix_23.columns, dtype=float)
264-
uprating_from_2023[is_uprated_from_2023] = rel_change_23_final[
265-
is_uprated_from_2023
266-
]
267-
268-
uprating = uprating_from_2020 + uprating_from_2021 + uprating_from_2023
237+
uprating = uprating_from_2020
269238
y = y * (1 + uprating)
270239

271240
return y

policyengine_uk_data/datasets/frs/local_areas/local_authorities/loss.py

Lines changed: 45 additions & 78 deletions
Original file line number | Diff line number | Diff line change
@@ -32,25 +32,44 @@ def create_local_authority_target_matrix(
3232
y = pd.DataFrame()
3333

3434
INCOME_VARIABLES = [
35-
"total_income",
3635
"self_employment_income",
36+
"employment_income",
3737
]
3838

39+
national_incomes = pd.read_csv(STORAGE_FOLDER / "incomes_projection.csv")
40+
national_incomes = national_incomes[national_incomes.year == 2025]
41+
3942
for income_variable in INCOME_VARIABLES:
4043
income_values = sim.calculate(income_variable).values
4144
in_spi_frame = sim.calculate("income_tax").values > 0
4245
matrix[f"hmrc/{income_variable}/amount"] = sim.map_result(
4346
income_values * in_spi_frame, "person", "household"
4447
)
45-
y[f"hmrc/{income_variable}/amount"] = incomes[
46-
f"{income_variable}_amount"
47-
].values
48+
local_targets = incomes[f"{income_variable}_amount"].values
49+
local_target_sum = local_targets.sum()
50+
national_target = national_incomes[
51+
(national_incomes.total_income_lower_bound == 12_570)
52+
& (national_incomes.total_income_upper_bound == np.inf)
53+
][income_variable + "_amount"].iloc[0]
54+
national_consistency_adjustment_factor = (
55+
national_target / local_target_sum
56+
)
57+
y[f"hmrc/{income_variable}/amount"] = (
58+
local_targets * national_consistency_adjustment_factor
59+
)
4860
matrix[f"hmrc/{income_variable}/count"] = sim.map_result(
4961
(income_values != 0) * in_spi_frame, "person", "household"
5062
)
51-
y[f"hmrc/{income_variable}/count"] = incomes[
52-
f"{income_variable}_count"
53-
].values
63+
local_targets = incomes[f"{income_variable}_count"].values
64+
local_target_sum = local_targets.sum()
65+
national_target = national_incomes[
66+
(national_incomes.total_income_lower_bound == 12_570)
67+
& (national_incomes.total_income_upper_bound == np.inf)
68+
][income_variable + "_count"].iloc[0]
69+
y[f"hmrc/{income_variable}/count"] = (
70+
incomes[f"{income_variable}_count"].values
71+
* national_consistency_adjustment_factor
72+
)
5473

5574
age = sim.calculate("age").values
5675
for lower_age in range(0, 80, 10):
@@ -75,61 +94,30 @@ def create_local_authority_target_matrix(
7594
employment_incomes.employment_income_lower_bound.sort_values().unique()
7695
) + [np.inf]
7796

78-
employment_incomes_all = (
79-
employment_incomes.groupby("code")[
80-
["employment_income_count", "employment_income_amount"]
81-
]
82-
.sum()
83-
.reset_index()
84-
)
85-
86-
hmrc_all_count_target = incomes["employment_income_count"].values
87-
ons_all_count_target = employment_incomes_all[
88-
"employment_income_count"
89-
].values
90-
count_scaling_factors = hmrc_all_count_target / ons_all_count_target
91-
92-
hmrc_all_amount_target = incomes["employment_income_amount"].values
93-
ons_all_amount_target = employment_incomes_all[
94-
"employment_income_amount"
95-
].values
96-
amount_scaling_factors = hmrc_all_amount_target / ons_all_amount_target
97-
9897
for lower_bound, upper_bound in zip(bounds[:-1], bounds[1:]):
9998
if (
10099
lower_bound <= 15_000
101100
): # Skip some targets with very small sample sizes
102101
continue
103102
if upper_bound >= 200_000:
104103
continue
105-
count_target = (
106-
employment_incomes[
107-
(
108-
employment_incomes.employment_income_lower_bound
109-
== lower_bound
110-
)
111-
& (
112-
employment_incomes.employment_income_upper_bound
113-
== upper_bound
114-
)
115-
].employment_income_count.values
116-
* count_scaling_factors
117-
)
118104

119-
amount_target = (
120-
employment_incomes[
121-
(
122-
employment_incomes.employment_income_lower_bound
123-
== lower_bound
124-
)
125-
& (
126-
employment_incomes.employment_income_upper_bound
127-
== upper_bound
128-
)
129-
].employment_income_amount.values
130-
* amount_scaling_factors
131-
)
105+
national_data_row = national_incomes[
106+
national_incomes.total_income_lower_bound == lower_bound
107+
]["employment_income_amount"].iloc[0]
108+
109+
count_target = employment_incomes[
110+
(employment_incomes.employment_income_lower_bound == lower_bound)
111+
& (employment_incomes.employment_income_upper_bound == upper_bound)
112+
].employment_income_count.values
132113

114+
amount_target = employment_incomes[
115+
(employment_incomes.employment_income_lower_bound == lower_bound)
116+
& (employment_incomes.employment_income_upper_bound == upper_bound)
117+
].employment_income_amount.values
118+
sum_of_local_area_values = amount_target.sum()
119+
120+
adjustment = national_data_row / sum_of_local_area_values
133121
if count_target.mean() < 200:
134122
print(
135123
f"Skipping employment income band {lower_bound} to {upper_bound} due to low count target mean: {count_target.mean()}"
@@ -152,7 +140,9 @@ def create_local_authority_target_matrix(
152140
matrix[f"hmrc/employment_income/amount/{band_str}"] = sim.map_result(
153141
employment_income * in_bound, "person", "household"
154142
)
155-
y[f"hmrc/employment_income/amount/{band_str}"] = amount_target
143+
y[f"hmrc/employment_income/amount/{band_str}"] = (
144+
amount_target * adjustment
145+
)
156146

157147
if uprate:
158148
y = uprate_targets(y, time_period)
@@ -221,30 +211,7 @@ def uprate_targets(y: pd.DataFrame, target_year: int = 2025) -> pd.DataFrame:
221211
uprating_from_2020[is_uprated_from_2020] = rel_change_20_final[
222212
is_uprated_from_2020
223213
]
224-
225-
rel_change_21_final = (weights_final @ matrix_final) / (
226-
weights_21 @ matrix_21
227-
) - 1
228-
is_uprated_from_2021 = [
229-
col.startswith("hmrc/") for col in matrix_21.columns
230-
]
231-
uprating_from_2021 = np.zeros_like(matrix_21.columns, dtype=float)
232-
uprating_from_2021[is_uprated_from_2021] = rel_change_21_final[
233-
is_uprated_from_2021
234-
]
235-
236-
rel_change_23_final = (weights_final @ matrix_final) / (
237-
weights_23 @ matrix_23
238-
) - 1
239-
is_uprated_from_2023 = [
240-
col.startswith("hmrc/") for col in matrix_23.columns
241-
]
242-
uprating_from_2023 = np.zeros_like(matrix_23.columns, dtype=float)
243-
uprating_from_2023[is_uprated_from_2023] = rel_change_23_final[
244-
is_uprated_from_2023
245-
]
246-
247-
uprating = uprating_from_2020 + uprating_from_2021 + uprating_from_2023
214+
uprating = uprating_from_2020
248215
y = y * (1 + uprating)
249216

250217
return y

0 commit comments

Comments
 (0)