Skip to content

Commit 667f45a

Browse files
authored
Merge pull request #660 from PolicyEngine/codex/update-soi-targets-2023
Update SOI-backed calibration targets through TY2023
2 parents c750c7b + 96f7cc6 commit 667f45a

19 files changed

Lines changed: 8708 additions & 125 deletions

.github/CONTRIBUTING.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
11
## Updating data
22

33
If your changes present a non-bugfix change to one or more datasets which are cloud-hosted (CPS, ECPS and PUF), then please change both the filename and URL (in both the class definition file and in `storage/upload_completed_datasets.py`). This enables us to store historical versions of datasets separately and reproducibly.
4+
5+
## Opening PRs
6+
7+
Push PR branches to the upstream `PolicyEngine/policyengine-us-data` repository, not to a personal fork. From the repo root, run:
8+
9+
`make push-pr-branch`
10+
11+
This avoids the fork-only CI failure path and sets the upstream tracking branch correctly before opening the PR.

CLAUDE.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,14 @@
2828
- **CRITICAL**: NEVER create PRs from personal forks - ALL PRs MUST be created from branches pushed to the upstream PolicyEngine repository
2929
- CI requires access to secrets that are not available to fork PRs for security reasons
3030
- Fork PRs will fail on data download steps and cannot be merged
31+
- Before opening a PR, always run `make push-pr-branch` from the repo root. This pushes the current branch to the `upstream` remote and sets the upstream tracking branch correctly for PR creation.
32+
- Do not prefix PR titles with `[codex]` or any other agent label. Use the plain descriptive title.
3133
- Always create branches directly on the upstream repository:
3234
```bash
3335
git checkout main
3436
git pull upstream main
3537
git checkout -b your-branch-name
36-
git push -u upstream your-branch-name
38+
make push-pr-branch
3739
```
3840
- Use descriptive branch names like `fix-issue-123` or `add-feature-name`
3941
- Always run `make format` before committing
@@ -62,4 +64,4 @@
6264
- Blacklisting from future publications
6365
- Damage to institutional reputation
6466
- Legal consequences in funded research
65-
- Career-ending academic misconduct charges
67+
- Career-ending academic misconduct charges

Makefile

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
.PHONY: all format test install download upload docker documentation data validate-data calibrate calibrate-build publish-local-area upload-calibration upload-dataset upload-database push-to-modal build-data-modal build-matrices calibrate-modal calibrate-modal-national calibrate-both stage-h5s stage-national-h5 stage-all-h5s pipeline validate-staging validate-staging-full upload-validation check-staging check-sanity clean build paper clean-paper presentations database database-refresh promote-database promote-dataset promote build-h5s validate-local
1+
.PHONY: all format test install download upload docker documentation data validate-data calibrate calibrate-build publish-local-area upload-calibration upload-dataset upload-database push-to-modal build-data-modal build-matrices calibrate-modal calibrate-modal-national calibrate-both stage-h5s stage-national-h5 stage-all-h5s pipeline validate-staging validate-staging-full upload-validation check-staging check-sanity clean build paper clean-paper presentations database database-refresh promote-database promote-dataset promote build-h5s validate-local refresh-soi-targets push-pr-branch
2+
3+
SOI_SOURCE_YEAR ?= 2021
4+
SOI_TARGET_YEAR ?= 2023
25

36
GPU ?= T4
47
EPOCHS ?= 1000
@@ -8,6 +11,8 @@ BRANCH ?= $(shell git rev-parse --abbrev-ref HEAD)
811
NUM_WORKERS ?= 8
912
N_CLONES ?= 430
1013
VERSION ?=
14+
SOI_SOURCE_YEAR ?= 2021
15+
SOI_TARGET_YEAR ?= 2023
1116

1217
HF_CLONE_DIR ?= $(HOME)/huggingface/policyengine-us-data
1318

@@ -139,6 +144,18 @@ validate-local:
139144
validate-data:
140145
python -c "from policyengine_us_data.storage.upload_completed_datasets import validate_all_datasets; validate_all_datasets()"
141146

147+
refresh-soi-targets:
148+
python policyengine_us_data/storage/calibration_targets/refresh_soi_table_targets.py \
149+
--source-year $(SOI_SOURCE_YEAR) \
150+
--target-year $(SOI_TARGET_YEAR)
151+
152+
push-pr-branch:
153+
@if [ "$(BRANCH)" = "main" ]; then \
154+
echo "Refusing to push main as a PR branch."; \
155+
exit 1; \
156+
fi
157+
@git push -u upstream $(BRANCH)
158+
142159
upload-calibration:
143160
python -c "from policyengine_us_data.utils.huggingface import upload_calibration_artifacts; \
144161
upload_calibration_artifacts()"
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Refresh tracked national SOI workbook targets through TY2023, backfill TY2022,
2+
teach `get_soi()` to pick the best available source year per variable, and
3+
overlay the national DB IRS-SOI targets that can now use the newer workbook
4+
release instead of staying stuck on the TY2022 geography file.

policyengine_us_data/calibration/unified_matrix_builder.py

Lines changed: 85 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -934,6 +934,7 @@ def __init__(
934934
self.time_period = time_period
935935
self.dataset_path = dataset_path
936936
self._entity_rel_cache = None
937+
self._target_overview_columns = None
937938

938939
# ---------------------------------------------------------------
939940
# Entity relationships
@@ -962,8 +963,8 @@ def _build_state_values(
962963
sim,
963964
target_vars: set,
964965
constraint_vars: set,
965-
reform_vars: set,
966-
geography,
966+
reform_vars: set = None,
967+
geography=None,
967968
rerandomize_takeup: bool = True,
968969
workers: int = 1,
969970
) -> dict:
@@ -1000,6 +1001,9 @@ def _build_state_values(
10001001
TAKEUP_AFFECTED_TARGETS,
10011002
)
10021003

1004+
if geography is None:
1005+
raise ValueError("geography is required")
1006+
10031007
unique_states = sorted(set(int(s) for s in geography.state_fips))
10041008
n_hh = geography.n_records
10051009

@@ -1025,7 +1029,7 @@ def _build_state_values(
10251029
# Convert sets to sorted lists for deterministic iteration
10261030
target_vars_list = sorted(target_vars)
10271031
constraint_vars_list = sorted(constraint_vars)
1028-
reform_vars_list = sorted(reform_vars)
1032+
reform_vars_list = sorted(reform_vars or set())
10291033

10301034
state_values = {}
10311035

@@ -1521,63 +1525,103 @@ def _get_stratum_constraints(self, stratum_id: int) -> List[dict]:
15211525
)
15221526
return df.to_dict("records")
15231527

1528+
def _get_target_overview_columns(self) -> set:
1529+
if self._target_overview_columns is None:
1530+
with self.engine.connect() as conn:
1531+
rows = conn.execute(
1532+
text("PRAGMA table_info(target_overview)")
1533+
).fetchall()
1534+
self._target_overview_columns = {row[1] for row in rows}
1535+
return self._target_overview_columns
1536+
15241537
def _query_targets(self, target_filter: dict) -> pd.DataFrame:
15251538
"""Query targets via target_overview view with
15261539
best-period selection."""
1527-
or_conditions = []
1540+
and_conditions = []
15281541

15291542
if "domain_variables" in target_filter:
15301543
dvs = target_filter["domain_variables"]
15311544
ph = ",".join(f"'{dv}'" for dv in dvs)
1532-
or_conditions.append(f"tv.domain_variable IN ({ph})")
1545+
and_conditions.append(f"tv.domain_variable IN ({ph})")
15331546

15341547
if "variables" in target_filter:
15351548
vs = ",".join(f"'{v}'" for v in target_filter["variables"])
1536-
or_conditions.append(f"tv.variable IN ({vs})")
1549+
and_conditions.append(f"tv.variable IN ({vs})")
15371550

15381551
if "target_ids" in target_filter:
15391552
ids = ",".join(map(str, target_filter["target_ids"]))
1540-
or_conditions.append(f"tv.target_id IN ({ids})")
1553+
and_conditions.append(f"tv.target_id IN ({ids})")
15411554

15421555
if "stratum_ids" in target_filter:
15431556
ids = ",".join(map(str, target_filter["stratum_ids"]))
1544-
or_conditions.append(f"tv.stratum_id IN ({ids})")
1557+
and_conditions.append(f"tv.stratum_id IN ({ids})")
15451558

1546-
if not or_conditions:
1559+
if not and_conditions:
15471560
where_clause = "1=1"
15481561
else:
1549-
where_clause = " OR ".join(f"({c})" for c in or_conditions)
1550-
1551-
query = f"""
1552-
WITH filtered_targets AS (
1553-
SELECT tv.target_id, tv.stratum_id, tv.variable, tv.reform_id,
1554-
tv.value, tv.period, tv.geo_level,
1555-
tv.geographic_id, tv.domain_variable
1556-
FROM target_overview tv
1557-
WHERE tv.active = 1
1558-
AND ({where_clause})
1559-
),
1560-
best_periods AS (
1561-
SELECT stratum_id, variable, reform_id,
1562-
CASE
1563-
WHEN MAX(CASE WHEN period <= :time_period
1564-
THEN period END) IS NOT NULL
1565-
THEN MAX(CASE WHEN period <= :time_period
1566-
THEN period END)
1567-
ELSE MIN(period)
1568-
END as best_period
1569-
FROM filtered_targets
1570-
GROUP BY stratum_id, variable, reform_id
1571-
)
1572-
SELECT ft.*
1573-
FROM filtered_targets ft
1574-
JOIN best_periods bp
1575-
ON ft.stratum_id = bp.stratum_id
1576-
AND ft.variable = bp.variable
1577-
AND ft.reform_id = bp.reform_id
1578-
AND ft.period = bp.best_period
1579-
ORDER BY ft.target_id
1580-
"""
1562+
where_clause = " AND ".join(f"({c})" for c in and_conditions)
1563+
1564+
if "reform_id" in self._get_target_overview_columns():
1565+
query = f"""
1566+
WITH filtered_targets AS (
1567+
SELECT tv.target_id, tv.stratum_id, tv.variable, tv.reform_id,
1568+
tv.value, tv.period, tv.geo_level,
1569+
tv.geographic_id, tv.domain_variable
1570+
FROM target_overview tv
1571+
WHERE tv.active = 1
1572+
AND ({where_clause})
1573+
),
1574+
best_periods AS (
1575+
SELECT stratum_id, variable, reform_id,
1576+
CASE
1577+
WHEN MAX(CASE WHEN period <= :time_period
1578+
THEN period END) IS NOT NULL
1579+
THEN MAX(CASE WHEN period <= :time_period
1580+
THEN period END)
1581+
ELSE MIN(period)
1582+
END as best_period
1583+
FROM filtered_targets
1584+
GROUP BY stratum_id, variable, reform_id
1585+
)
1586+
SELECT ft.*
1587+
FROM filtered_targets ft
1588+
JOIN best_periods bp
1589+
ON ft.stratum_id = bp.stratum_id
1590+
AND ft.variable = bp.variable
1591+
AND ft.reform_id = bp.reform_id
1592+
AND ft.period = bp.best_period
1593+
ORDER BY ft.target_id
1594+
"""
1595+
else:
1596+
query = f"""
1597+
WITH filtered_targets AS (
1598+
SELECT tv.target_id, tv.stratum_id, tv.variable,
1599+
0 AS reform_id, tv.value, tv.period, tv.geo_level,
1600+
tv.geographic_id, tv.domain_variable
1601+
FROM target_overview tv
1602+
WHERE tv.active = 1
1603+
AND ({where_clause})
1604+
),
1605+
best_periods AS (
1606+
SELECT stratum_id, variable,
1607+
CASE
1608+
WHEN MAX(CASE WHEN period <= :time_period
1609+
THEN period END) IS NOT NULL
1610+
THEN MAX(CASE WHEN period <= :time_period
1611+
THEN period END)
1612+
ELSE MIN(period)
1613+
END as best_period
1614+
FROM filtered_targets
1615+
GROUP BY stratum_id, variable
1616+
)
1617+
SELECT ft.*
1618+
FROM filtered_targets ft
1619+
JOIN best_periods bp
1620+
ON ft.stratum_id = bp.stratum_id
1621+
AND ft.variable = bp.variable
1622+
AND ft.period = bp.best_period
1623+
ORDER BY ft.target_id
1624+
"""
15811625

15821626
with self.engine.connect() as conn:
15831627
return pd.read_sql(

0 commit comments

Comments
 (0)