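Skip low-sample employment income targets (bands with lower bound <= 15,000, and per-band counts) and restore the national loss term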

nikhilwoodruff · nikhilwoodruff · commit ce8e19e2f1dd · 2025-06-09T15:33:28.000+01:00
diff --git a/policyengine_uk_data/datasets/frs/local_areas/constituencies/calibrate.py b/policyengine_uk_data/datasets/frs/local_areas/constituencies/calibrate.py
@@ -46,7 +46,8 @@ def calibrate(
     # Weights - 650 x 100180
     original_weights = np.log(
         sim.calculate("household_weight", 2025).values / COUNT_CONSTITUENCIES
-        + np.random.random(len(sim.calculate("household_weight", 2025).values)) * 0.1
+        + np.random.random(len(sim.calculate("household_weight", 2025).values))
+        * 0.01
     )
     weights = torch.tensor(
         np.ones((COUNT_CONSTITUENCIES, len(original_weights)))
@@ -90,7 +91,7 @@ def loss(w, validation: bool = False):
         else:
             mse_n = torch.mean((pred_n / (1 + y_national) - 1) ** 2)
 
-        return mse_c# + mse_n
+        return mse_c + mse_n
 
     def pct_close(w, t=0.1, constituency=True, national=True):
         # Return the percentage of metrics that are within t% of the target
@@ -124,7 +125,7 @@ def dropout_weights(weights, p):
         masked_weights[mask] = mean
         return masked_weights
 
-    optimizer = torch.optim.Adam([weights], lr=0.15)
+    optimizer = torch.optim.Adam([weights], lr=1e-1)
 
     desc = range(128) if os.environ.get("DATA_LITE") else range(epochs)
     final_weights = (torch.exp(weights) * r).detach().numpy()
@@ -134,10 +135,8 @@ def dropout_weights(weights, p):
         optimizer.zero_grad()
         weights_ = torch.exp(dropout_weights(weights, 0.05)) * r
         l = loss(weights_)
-        l.backward()
-        optimizer.step()
-        c_close = pct_close(weights_, constituency=True, national=False)
-        n_close = pct_close(weights_, constituency=False, national=True)
+        c_close = pct_close(weights_, constituency=True, national=False, t=0.1)
+        n_close = pct_close(weights_, constituency=False, national=True, t=0.1)
         if epoch % 1 == 0:
             if dropout_targets:
                 validation_loss = loss(weights_, validation=True)
@@ -182,6 +181,8 @@ def dropout_weights(weights, p):
                     f.create_dataset(
                         "household_weight/2025", data=final_weights.sum(axis=0)
                     )
+        l.backward()
+        optimizer.step()
 
     return final_weights
 
diff --git a/policyengine_uk_data/datasets/frs/local_areas/constituencies/loss.py b/policyengine_uk_data/datasets/frs/local_areas/constituencies/loss.py
@@ -82,39 +82,60 @@ def create_constituency_target_matrix(
         employment_incomes.employment_income_lower_bound.sort_values().unique()
     ) + [np.inf]
 
-    employment_incomes_all = employment_incomes.groupby("code")[["employment_income_count","employment_income_amount"]].sum().reset_index()
-
+    employment_incomes_all = (
+        employment_incomes.groupby("code")[
+            ["employment_income_count", "employment_income_amount"]
+        ]
+        .sum()
+        .reset_index()
+    )
 
     hmrc_all_count_target = incomes["employment_income_count"].values
-    ons_all_count_target = employment_incomes_all["employment_income_count"].values
+    ons_all_count_target = employment_incomes_all[
+        "employment_income_count"
+    ].values
     count_scaling_factors = hmrc_all_count_target / ons_all_count_target
 
     hmrc_all_amount_target = incomes["employment_income_amount"].values
-    ons_all_amount_target = employment_incomes_all["employment_income_amount"].values
+    ons_all_amount_target = employment_incomes_all[
+        "employment_income_amount"
+    ].values
     amount_scaling_factors = hmrc_all_amount_target / ons_all_amount_target
 
-    print(f"Average count scaling factor: {count_scaling_factors.mean():.1%}")
-    print(f"Average count (HMRC): {hmrc_all_count_target.mean()/1e3:,.0f} (thousands)")
-    print(f"Average count (ONS): {ons_all_count_target.mean()/1e3:,.0f} (thousands)")
-    print(f"Average amount scaling factor: {amount_scaling_factors.mean():.1%}")
-    print(f"Average amount (HMRC): {hmrc_all_amount_target.mean()/1e6:,.0f} (millions)")
-    print(f"Average amount (ONS): {ons_all_amount_target.mean()/1e6:,.0f} (millions)")
-
     for lower_bound, upper_bound in zip(bounds[:-1], bounds[1:]):
-        continue
-        if lower_bound <= 12_570:
+        if (
+            lower_bound <= 15_000
+        ):  # Skip some targets with very small sample sizes
             continue
         if upper_bound >= 200_000:
             continue
-        count_target = employment_incomes[
-            (employment_incomes.employment_income_lower_bound == lower_bound)
-            & (employment_incomes.employment_income_upper_bound == upper_bound)
-        ].employment_income_count.values * count_scaling_factors
+        count_target = (
+            employment_incomes[
+                (
+                    employment_incomes.employment_income_lower_bound
+                    == lower_bound
+                )
+                & (
+                    employment_incomes.employment_income_upper_bound
+                    == upper_bound
+                )
+            ].employment_income_count.values
+            * count_scaling_factors
+        )
 
-        amount_target = employment_incomes[
-            (employment_incomes.employment_income_lower_bound == lower_bound)
-            & (employment_incomes.employment_income_upper_bound == upper_bound)
-        ].employment_income_amount.values * amount_scaling_factors
+        amount_target = (
+            employment_incomes[
+                (
+                    employment_incomes.employment_income_lower_bound
+                    == lower_bound
+                )
+                & (
+                    employment_incomes.employment_income_upper_bound
+                    == upper_bound
+                )
+            ].employment_income_amount.values
+            * amount_scaling_factors
+        )
 
         if count_target.mean() < 200:
             print(
@@ -135,11 +156,6 @@ def create_constituency_target_matrix(
             & (age >= 16)
         )
         band_str = f"{lower_bound}_{upper_bound}"
-        matrix[f"hmrc/employment_income/count/{band_str}"] = sim.map_result(
-            in_bound, "person", "household"
-        )
-        y[f"hmrc/employment_income/count/{band_str}"] = count_target
-
         matrix[f"hmrc/employment_income/amount/{band_str}"] = sim.map_result(
             employment_income * in_bound, "person", "household"
         )