Merge pull request #120 from PolicyEngine/nikhilwoodruff/issue119

nikhilwoodruff · web-flow · commit c231a889d519 · 2025-06-09T21:25:22.000+01:00
Enforce consistency between constituency targets
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: minor
+  changes:
+    fixed:
+    - Inconsistent local area targets removed.
diff --git a/policyengine_uk_data/datasets/frs/local_areas/constituencies/calibrate.py b/policyengine_uk_data/datasets/frs/local_areas/constituencies/calibrate.py
@@ -46,6 +46,8 @@ def calibrate(
     # Weights - 650 x 100180
     original_weights = np.log(
         sim.calculate("household_weight", 2025).values / COUNT_CONSTITUENCIES
+        + np.random.random(len(sim.calculate("household_weight", 2025).values))
+        * 0.01
     )
     weights = torch.tensor(
         np.ones((COUNT_CONSTITUENCIES, len(original_weights)))
@@ -123,7 +125,7 @@ def dropout_weights(weights, p):
         masked_weights[mask] = mean
         return masked_weights
 
-    optimizer = torch.optim.Adam([weights], lr=0.15)
+    optimizer = torch.optim.Adam([weights], lr=1e-1)
 
     desc = range(128) if os.environ.get("DATA_LITE") else range(epochs)
     final_weights = (torch.exp(weights) * r).detach().numpy()
@@ -133,10 +135,8 @@ def dropout_weights(weights, p):
         optimizer.zero_grad()
         weights_ = torch.exp(dropout_weights(weights, 0.05)) * r
         l = loss(weights_)
-        l.backward()
-        optimizer.step()
-        c_close = pct_close(weights_, constituency=True, national=False)
-        n_close = pct_close(weights_, constituency=False, national=True)
+        c_close = pct_close(weights_, constituency=True, national=False, t=0.1)
+        n_close = pct_close(weights_, constituency=False, national=True, t=0.1)
         if epoch % 1 == 0:
             if dropout_targets:
                 validation_loss = loss(weights_, validation=True)
@@ -181,6 +181,8 @@ def dropout_weights(weights, p):
                     f.create_dataset(
                         "household_weight/2025", data=final_weights.sum(axis=0)
                     )
+        l.backward()
+        optimizer.step()
 
     return final_weights
 
diff --git a/policyengine_uk_data/datasets/frs/local_areas/constituencies/loss.py b/policyengine_uk_data/datasets/frs/local_areas/constituencies/loss.py
@@ -40,6 +40,7 @@ def create_constituency_target_matrix(
     INCOME_VARIABLES = [
         "total_income",
         "self_employment_income",
+        "employment_income",
     ]
 
     for income_variable in INCOME_VARIABLES:
@@ -81,31 +82,84 @@ def create_constituency_target_matrix(
         employment_incomes.employment_income_lower_bound.sort_values().unique()
     ) + [np.inf]
 
+    employment_incomes_all = (
+        employment_incomes.groupby("code")[
+            ["employment_income_count", "employment_income_amount"]
+        ]
+        .sum()
+        .reset_index()
+    )
+
+    hmrc_all_count_target = incomes["employment_income_count"].values
+    ons_all_count_target = employment_incomes_all[
+        "employment_income_count"
+    ].values
+    count_scaling_factors = hmrc_all_count_target / ons_all_count_target
+
+    hmrc_all_amount_target = incomes["employment_income_amount"].values
+    ons_all_amount_target = employment_incomes_all[
+        "employment_income_amount"
+    ].values
+    amount_scaling_factors = hmrc_all_amount_target / ons_all_amount_target
+
     for lower_bound, upper_bound in zip(bounds[:-1], bounds[1:]):
-        if lower_bound < 12_570 or upper_bound > 70_000:
+        if (
+            lower_bound <= 15_000
+        ):  # Skip bands starting at or below 15,000 (very small sample sizes)
+            continue
+        if upper_bound >= 200_000:
+            continue
+        count_target = (
+            employment_incomes[
+                (
+                    employment_incomes.employment_income_lower_bound
+                    == lower_bound
+                )
+                & (
+                    employment_incomes.employment_income_upper_bound
+                    == upper_bound
+                )
+            ].employment_income_count.values
+            * count_scaling_factors
+        )
+
+        amount_target = (
+            employment_incomes[
+                (
+                    employment_incomes.employment_income_lower_bound
+                    == lower_bound
+                )
+                & (
+                    employment_incomes.employment_income_upper_bound
+                    == upper_bound
+                )
+            ].employment_income_amount.values
+            * amount_scaling_factors
+        )
+
+        if count_target.mean() < 200:
+            print(
+                f"Skipping employment income band {lower_bound} to {upper_bound} due to low count target mean: {count_target.mean()}"
+            )
+            continue
+
+        if amount_target.mean() < 200 * 30e3:
+            print(
+                f"Skipping employment income band {lower_bound} to {upper_bound} due to low amount target mean: {amount_target.mean()}"
+            )
             continue
+
         in_bound = (
             (employment_income >= lower_bound)
             & (employment_income < upper_bound)
             & (employment_income != 0)
             & (age >= 16)
         )
         band_str = f"{lower_bound}_{upper_bound}"
-        matrix[f"hmrc/employment_income/count/{band_str}"] = sim.map_result(
-            in_bound, "person", "household"
-        )
-        y[f"hmrc/employment_income/count/{band_str}"] = employment_incomes[
-            (employment_incomes.employment_income_lower_bound == lower_bound)
-            & (employment_incomes.employment_income_upper_bound == upper_bound)
-        ].employment_income_count.values
-
         matrix[f"hmrc/employment_income/amount/{band_str}"] = sim.map_result(
             employment_income * in_bound, "person", "household"
         )
-        y[f"hmrc/employment_income/amount/{band_str}"] = employment_incomes[
-            (employment_incomes.employment_income_lower_bound == lower_bound)
-            & (employment_incomes.employment_income_upper_bound == upper_bound)
-        ].employment_income_amount.values
+        y[f"hmrc/employment_income/amount/{band_str}"] = amount_target
 
     if uprate:
         y = uprate_targets(y, time_period)
@@ -128,7 +182,6 @@ def create_constituency_target_matrix(
         household_countries=sim.calculate("country").values,
         codes=const_2024.code,
     )
-
     return matrix, y, country_mask