PolicyEngine
diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/Makefile b/Makefile
Lines changed: 6 additions & 5 deletions
diff --git a/changelog.d/753.changed.md b/changelog.d/753.changed.md
Lines changed: 42 additions & 0 deletions
diff --git a/docs/appendix.md b/docs/appendix.md
Lines changed: 4 additions & 3 deletions
diff --git a/docs/calibration.md b/docs/calibration.md
Lines changed: 6 additions & 7 deletions
diff --git a/modal_app/pipeline.py b/modal_app/pipeline.py
Lines changed: 5 additions & 5 deletions
@@ -15,7 +15,7 @@ on:
         type: string
       national_epochs:
         description: "Epochs for national calibration"
-        default: "4000"
+        default: "1000"
         type: string
       num_workers:
         description: "Number of parallel H5 workers"
@@ -63,7 +63,7 @@ jobs:
 
           GPU="${{ inputs.gpu || 'T4' }}"
           EPOCHS="${{ inputs.epochs || '1000' }}"
-          NATIONAL_EPOCHS="${{ inputs.national_epochs || '4000' }}"
+          NATIONAL_EPOCHS="${{ inputs.national_epochs || '1000' }}"
           NUM_WORKERS="${{ inputs.num_workers || '50' }}"
           SKIP_NATIONAL="${{ inputs.skip_national || 'false' }}"
           RESUME_RUN_ID="${{ inputs.resume_run_id || '' }}"
 
@@ -8,7 +8,7 @@ YEAR ?= 2024
 GPU ?= T4
 EPOCHS ?= 1000
 NATIONAL_GPU ?= T4
-NATIONAL_EPOCHS ?= 4000
+NATIONAL_EPOCHS ?= 1000
 BRANCH ?= $(shell git rev-parse --abbrev-ref HEAD)
 NUM_WORKERS ?= 8
 N_CLONES ?= 430
@@ -116,7 +116,8 @@ data-legacy: data
 
 calibrate: data
 	python -m policyengine_us_data.calibration.unified_calibration \
-		--target-config policyengine_us_data/calibration/target_config.yaml
+		--target-config policyengine_us_data/calibration/target_config.yaml \
+		--log-freq 100
 
 calibrate-build: data
 	python -m policyengine_us_data.calibration.unified_calibration \
@@ -187,15 +188,15 @@ build-matrices:
 calibrate-modal:
 	modal run --detach modal_app/remote_calibration_runner.py::main \
 		--branch $(BRANCH) --gpu $(GPU) --epochs $(EPOCHS) \
-		--beta 0.65 --lambda-l0 1e-7 --lambda-l2 1e-8 --log-freq 500 \
+		--beta 0.65 --lambda-l0 1e-7 --lambda-l2 1e-8 --log-freq 100 \
 		--target-config policyengine_us_data/calibration/target_config.yaml \
 		--push-results
 
 calibrate-modal-national:
 	modal run --detach modal_app/remote_calibration_runner.py::main \
 		--branch $(BRANCH) --gpu $(NATIONAL_GPU) \
 		--epochs $(NATIONAL_EPOCHS) \
-		--beta 0.65 --lambda-l0 1e-4 --lambda-l2 1e-12 --log-freq 500 \
+		--beta 0.65 --lambda-l0 2e-2 --lambda-l2 1e-12 --log-freq 100 \
 		--target-config policyengine_us_data/calibration/target_config.yaml \
 		--push-results --national
 
@@ -258,7 +259,7 @@ pipeline:
 clean:
 	rm -f policyengine_us_data/storage/*.h5
 	rm -f policyengine_us_data/storage/*.db
-	git clean -fX -- '*.csv'
+	git ls-files -z --others --ignored --exclude-standard -- '*.csv' | grep -azEv '(^|/)(\.venv|venv|env|\.tox|\.nox|node_modules)/' | xargs -0 -r rm -f
 	rm -rf policyengine_us_data/docs/_build
 
 build:
 
@@ -0,0 +1,42 @@
+Add a chunked mixed-geography matrix builder for memory-bounded national
+calibration (`--chunked-matrix`) that streams matrix columns in clone-household
+chunks with resumable per-chunk COO shards, progress logging (running average,
+elapsed, ETA), and a shared `entity_clone` module for household-subset
+materialization.
+
+Fix three target-input integrity bugs surfaced by a new
+`analyze_target_consistency` diagnostic that flags cross-level and
+AGI-bucket-coverage inconsistencies:
+
+- Drop the IRS workbook override for `total_self_employment_income`,
+  `tax_unit_partnership_s_corp_income`, and `net_capital_gains`. The workbook
+  columns `business_net_profits` / `partnership_and_s_corp_income` /
+  `capital_gains_gross` are gross-only, while the geography-file line codes
+  00900 / 26270 / 01000 already report net-of-loss. The override inflated
+  these national targets by +40.7% / +26.1% / +3.1% at 2023 values. After
+  the fix, all three reconcile to the penny across national, state, and
+  district levels.
+- Remove the self-employment QRF winsorization in `puf_impute.py`. QRF predictions
+  are already bounded by training support; the 0.5/99.5 percentile clip
+  was discarding the top 0.5% of legitimate signal and truncating imputed
+  self-employment income at ~$1.1M vs the PUF training max of $74.6M.
+- Replace percentile-based top selection in `create_stratified_cps` with
+  per-bracket caps (400/400/400/300/300 for the $500k-$1M through $10M+
+  bands). Stops PUF templates from piling up above $10M and starving the
+  middle-high $1M-$10M range.
+
+Split calibration checkpoint signature validation into fatal structural
+mismatches and soft hyperparameter mismatches, letting callers tune
+`lambda_l0`, `beta`, `lambda_l2`, and `learning_rate` across resume phases.
+
+Add `income_tax` national and state SOI targets, drop the unachievable
+JCT `deductible_mortgage_interest` target, and preserve positive mortgage
+interest inputs through structural conversion.
+
+Retune the national Modal calibration from `lambda_l0=1e-4` at 4000 epochs to `lambda_l0=2e-2`
+at 1000 epochs, and lower `log_freq` from 500 to 100 in `modal_app/pipeline.py` and the Makefile.
+
+Harden `make clean` so its ignored-CSV cleanup skips local environment and
+dependency directories such as `.venv/`, `venv/`, `env/`, `.tox/`, `.nox/`,
+and `node_modules/`, avoiding accidental deletion of package data inside local
+virtual environments.
@@ -122,9 +122,10 @@ for iteration in range(5000):
 - farm_operations_income_would_be_qualified
 - farm_rent_income_would_be_qualified
 
-The current PUF/calibration pipeline uses the legacy `business_is_sstb` flag to
-split these SSTB variables on an all-or-nothing basis. It does not yet infer
-mixed SSTB and non-SSTB allocations within the same record.
+The current PUF/calibration pipeline uses the legacy `business_is_sstb` flag to split these SSTB
+variables on an all-or-nothing basis. It does not yet infer mixed SSTB and non-SSTB allocations
+within the same record.
+
 - partnership_s_corp_income_would_be_qualified
 - rental_income_would_be_qualified
 - self_employment_income_would_be_qualified
 
@@ -95,10 +95,9 @@ You can re-run Step 2 as many times as you want with different hyperparameters.
 build only happens once.
 
 Every fit now also writes a checkpoint next to the weights output
-(`calibration_weights.checkpoint.pt` by default). To continue the same fit,
-pass `--resume-from` with the weights file or checkpoint path. If a sibling
-checkpoint exists next to the weights file, it is used automatically so the
-L0 gate state is restored as well.
+(`calibration_weights.checkpoint.pt` by default). To continue the same fit, pass `--resume-from`
+with the weights file or checkpoint path. If a sibling checkpoint exists next to the weights file,
+it is used automatically so the L0 gate state is restored as well.
 
 ```bash
 python -m policyengine_us_data.calibration.unified_calibration \
@@ -114,9 +113,9 @@ python -m policyengine_us_data.calibration.unified_calibration \
   --resume-from policyengine_us_data/storage/calibration/national/weights.npy
 ```
 
-When `--resume-from` points to a checkpoint, `--epochs` means additional epochs
-to run beyond the saved checkpoint epoch count. If only a `.npy` weights file
-exists, the run warm-starts from those weights.
+When `--resume-from` points to a checkpoint, `--epochs` means additional epochs to run beyond the
+saved checkpoint epoch count. If only a `.npy` weights file exists, the run warm-starts from those
+weights.
 
 ### 2. Full pipeline with PUF
 
 
@@ -605,7 +605,7 @@ def run_pipeline(
     gpu: str = "T4",
     epochs: int = 1000,
     national_gpu: str = "T4",
-    national_epochs: int = 4000,
+    national_epochs: int = 1000,
     num_workers: int = 50,
     n_clones: int = 430,
     skip_national: bool = False,
@@ -795,7 +795,7 @@ def run_pipeline(
                 beta=0.65,
                 lambda_l0=1e-7,
                 lambda_l2=1e-8,
-                log_freq=500,
+                log_freq=100,
             )
             print(f"    → regional fit fc: {regional_handle.object_id}")
 
@@ -814,9 +814,9 @@ def run_pipeline(
                     volume_package_path=vol_path,
                     target_config=target_cfg,
                     beta=0.65,
-                    lambda_l0=1e-4,
+                    lambda_l0=2e-2,
                     lambda_l2=1e-12,
-                    log_freq=500,
+                    log_freq=100,
                 )
                 print(f"    → national fit fc: {national_handle.object_id}")
 
@@ -1283,7 +1283,7 @@ def main(
     gpu: str = "T4",
     epochs: int = 1000,
     national_gpu: str = "T4",
-    national_epochs: int = 4000,
+    national_epochs: int = 1000,
     num_workers: int = 50,
     n_clones: int = 430,
     skip_national: bool = False,