Merge pull request #129 from PolicyEngine/fixes

nikhilwoodruff · web-flow · commit a06f23923a87 · 2025-06-13T15:51:24.000+01:00
Add tests for calibration improvements
diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml
@@ -68,11 +68,11 @@ jobs:
       - name: Build Jupyter Book
         run: make documentation
       - name: Deploy documentation
-        uses: JamesIves/github-pages-deploy-action@releases/v4
+        uses: JamesIves/github-pages-deploy-action@releases/v3
         with:
-          token: ${{ secrets.GITHUB_TOKEN }}
-          branch: gh-pages
-          folder: docs/_build/html
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          BRANCH: gh-pages  # The branch the action should deploy to.
+          FOLDER: docs/_build/html
       - name: Publish a git tag
         run: ".github/publish-git-tag.sh || true"
       - name: Remove .whl files
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,6 @@
+- bump: patch
+  changes:
+    fixed:
+    - Documentation publishing to GitHub Pages.
+    - Local authority calibration is now consistent with constituency calibration.
+    - Domestic rates are nonzero.
diff --git a/policyengine_uk_data/datasets/frs/frs.py b/policyengine_uk_data/datasets/frs/frs.py
@@ -86,6 +86,15 @@ def generate(self):
         for variable in frs:
             frs[variable] = {self.dwp_frs.time_period: np.array(frs[variable])}
 
+        # Domestic rates need to be set for 2025 too
+        domestic_rates = np.array(
+            frs["domestic_rates"][self.dwp_frs.time_period]
+        )
+        frs["domestic_rates"] = {
+            self.dwp_frs.time_period: domestic_rates,
+            "2025": domestic_rates,
+        }
+
         self.save_dataset(frs)
 
         impute_brmas(self, frs)
@@ -414,7 +423,7 @@ def add_household_variables(frs: h5py.File, household: DataFrame, year: int):
             ],
         )
         * 52
-    )
+    ).astype(float)
 
 
 def add_market_income(
diff --git a/policyengine_uk_data/datasets/frs/local_areas/local_authorities/calibrate.py b/policyengine_uk_data/datasets/frs/local_areas/local_authorities/calibrate.py
@@ -32,8 +32,9 @@ def calibrate():
 
     # Weights - 360 x 100180
     original_weights = np.log(
-        (sim.calculate("household_weight", 2025).values + 1e-3)
-        / count_local_authority
+        sim.calculate("household_weight", 2025).values / count_local_authority
+        + np.random.random(len(sim.calculate("household_weight", 2025).values))
+        * 0.01
     )
     weights = torch.tensor(
         np.ones((count_local_authority, len(original_weights)))
@@ -93,7 +94,7 @@ def dropout_weights(weights, p):
         masked_weights[mask] = mean
         return masked_weights
 
-    optimizer = torch.optim.Adam([weights], lr=0.15)
+    optimizer = torch.optim.Adam([weights], lr=1e-1)
 
     desc = range(32) if os.environ.get("DATA_LITE") else range(128)
 
diff --git a/policyengine_uk_data/datasets/frs/local_areas/local_authorities/loss.py b/policyengine_uk_data/datasets/frs/local_areas/local_authorities/loss.py
@@ -75,31 +75,84 @@ def create_local_authority_target_matrix(
         employment_incomes.employment_income_lower_bound.sort_values().unique()
     ) + [np.inf]
 
+    employment_incomes_all = (
+        employment_incomes.groupby("code")[
+            ["employment_income_count", "employment_income_amount"]
+        ]
+        .sum()
+        .reset_index()
+    )
+
+    hmrc_all_count_target = incomes["employment_income_count"].values
+    ons_all_count_target = employment_incomes_all[
+        "employment_income_count"
+    ].values
+    count_scaling_factors = hmrc_all_count_target / ons_all_count_target
+
+    hmrc_all_amount_target = incomes["employment_income_amount"].values
+    ons_all_amount_target = employment_incomes_all[
+        "employment_income_amount"
+    ].values
+    amount_scaling_factors = hmrc_all_amount_target / ons_all_amount_target
+
     for lower_bound, upper_bound in zip(bounds[:-1], bounds[1:]):
-        if lower_bound >= 70_000 or lower_bound < 12_570:
+        if (
+            lower_bound <= 15_000
+        ):  # Skip some targets with very small sample sizes
             continue
+        if upper_bound >= 200_000:
+            continue
+        count_target = (
+            employment_incomes[
+                (
+                    employment_incomes.employment_income_lower_bound
+                    == lower_bound
+                )
+                & (
+                    employment_incomes.employment_income_upper_bound
+                    == upper_bound
+                )
+            ].employment_income_count.values
+            * count_scaling_factors
+        )
+
+        amount_target = (
+            employment_incomes[
+                (
+                    employment_incomes.employment_income_lower_bound
+                    == lower_bound
+                )
+                & (
+                    employment_incomes.employment_income_upper_bound
+                    == upper_bound
+                )
+            ].employment_income_amount.values
+            * amount_scaling_factors
+        )
+
+        if count_target.mean() < 200:
+            print(
+                f"Skipping employment income band {lower_bound} to {upper_bound} due to low count target mean: {count_target.mean()}"
+            )
+            continue
+
+        if amount_target.mean() < 200 * 30e3:
+            print(
+                f"Skipping employment income band {lower_bound} to {upper_bound} due to low amount target mean: {amount_target.mean()}"
+            )
+            continue
+
         in_bound = (
             (employment_income >= lower_bound)
             & (employment_income < upper_bound)
             & (employment_income != 0)
             & (age >= 16)
         )
         band_str = f"{lower_bound}_{upper_bound}"
-        matrix[f"hmrc/employment_income/count/{band_str}"] = sim.map_result(
-            in_bound, "person", "household"
-        )
-        y[f"hmrc/employment_income/count/{band_str}"] = employment_incomes[
-            (employment_incomes.employment_income_lower_bound == lower_bound)
-            & (employment_incomes.employment_income_upper_bound == upper_bound)
-        ].employment_income_count.values
-
         matrix[f"hmrc/employment_income/amount/{band_str}"] = sim.map_result(
             employment_income * in_bound, "person", "household"
         )
-        y[f"hmrc/employment_income/amount/{band_str}"] = employment_incomes[
-            (employment_incomes.employment_income_lower_bound == lower_bound)
-            & (employment_incomes.employment_income_upper_bound == upper_bound)
-        ].employment_income_amount.values
+        y[f"hmrc/employment_income/amount/{band_str}"] = amount_target
 
     if uprate:
         y = uprate_targets(y, time_period)