Skip to content

Commit c56196b

Browse files
committed
Merge upstream main into capital gains branch
2 parents 6c69223 + 7e5a3f7 commit c56196b

15 files changed

Lines changed: 539 additions & 166 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Export a data-backed FLSA overtime premium proxy from CPS and enhanced CPS.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"base_release_version": "1.115.5",
3+
"candidate_scope": "1.115.5-minor",
4+
"release_bump": "minor",
5+
"run_id": "usdata-gha26379540291-a1",
6+
"would_release_as_at_build_time": "1.116.0"
7+
}

.github/publication_scope.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"base_release_version": "1.115.5",
3-
"candidate_scope": "1.115.5-patch",
4-
"release_bump": "patch",
5-
"run_id": "usdata-gha26360054055-a1",
6-
"would_release_as_at_build_time": "1.115.6"
3+
"candidate_scope": "1.115.5-minor",
4+
"release_bump": "minor",
5+
"run_id": "usdata-gha26379540291-a1",
6+
"would_release_as_at_build_time": "1.116.0"
77
}

.github/scripts/check_policyengine_us_dependency.py

Lines changed: 59 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@
1717
REPO_ROOT = Path(__file__).resolve().parents[2]
1818
PYPI_JSON_TIMEOUT_SECONDS = 20
1919
POLICYENGINE_US = "policyengine-us"
20+
POLICYENGINE_US_GITHUB_REPO = "github.com/PolicyEngine/policyengine-us"
2021
STALE_LOCK_PREFIX = "uv.lock has policyengine-us "
21-
LOCK_GIT_REF_PREFIX = "uv.lock resolves policyengine-us from a Git ref."
22-
PROJECT_GIT_REF_PREFIX = "pyproject.toml pins policyengine-us to a Git ref."
22+
GIT_REF_PREFIX = "uv.lock resolves policyengine-us from a Git ref"
2323

2424

2525
def _annotation(level: str, message: str) -> str:
@@ -85,11 +85,39 @@ def _latest_pypi_version() -> str:
8585
return version
8686

8787

88+
def _is_policyengine_us_git_source(source: dict[str, object]) -> bool:
    """Return whether a lock-file ``source`` table points at the policyengine-us repo.

    Args:
        source: The ``source`` mapping of the locked package entry. Only its
            ``"git"`` value is inspected.

    Returns:
        ``True`` when ``source["git"]`` is a string in which
        ``POLICYENGINE_US_GITHUB_REPO`` appears as a whole host/path component
        (optionally followed by ``.git``). ``False`` otherwise, including when
        the key is missing or its value is not a string.
    """
    git_source = source.get("git")
    if not isinstance(git_source, str):
        return False

    head, found, tail = git_source.partition(POLICYENGINE_US_GITHUB_REPO)
    if not found:
        return False

    # A bare substring test also accepts look-alike hosts ("notgithub.com/...")
    # and sibling repositories whose names merely start with the same text
    # ("policyengine-us-data"), so require URL delimiters on both sides.
    host_is_exact = not head or head.endswith(("/", "@"))
    tail = tail.removeprefix(".git")
    repo_is_exact = not tail or tail[0] in "/?#@"
    return host_is_exact and repo_is_exact
91+
92+
93+
def _is_policyengine_us_git_dependency(dependency: str) -> bool:
    """Return whether a requirement pins policyengine-us to a commit of its repo.

    The accepted shape is
    ``policyengine-us @ git+<url ending in the repo>[.git]@<40-hex commit sha>``.

    Args:
        dependency: The policyengine-us requirement string from pyproject.toml.

    Returns:
        ``True`` only when the requirement is a ``git+`` direct reference whose
        URL ends with ``POLICYENGINE_US_GITHUB_REPO`` (optionally ``.git``) and
        whose revision is a full 40-character lowercase hexadecimal SHA.
    """
    prefix = f"{POLICYENGINE_US} @ git+"
    if not dependency.startswith(prefix):
        return False

    # Split on the last "@" so credentials such as "user@host" stay in the URL.
    url, at_sign, revision = dependency[len(prefix):].rpartition("@")
    if not at_sign or re.fullmatch(r"[0-9a-f]{40}", revision) is None:
        return False

    # A bare substring test would also accept sibling repositories
    # ("policyengine-us-data") and look-alike hosts ("notgithub.com/..."), so
    # the repository must be the final URL component and start on a delimiter.
    head, found, tail = url.removesuffix(".git").rpartition(
        POLICYENGINE_US_GITHUB_REPO
    )
    return (
        found != ""
        and tail == ""
        and (head == "" or head.endswith(("/", "@")))
    )
99+
100+
101+
def _allows_temporary_git_ref(
    locked_version: str,
    source: dict[str, object],
    project_dependency: str,
    latest_version: str | None,
) -> bool:
    """Decide whether a Git pin of policyengine-us is tolerated for now.

    All of the following must hold, checked in this order:

    1. ``latest_version`` is known (not ``None``).
    2. ``_compare_versions(locked_version, latest_version)`` is positive
       (presumably meaning the locked version is newer than the latest
       release; confirm against ``_compare_versions``).
    3. The lock-file ``source`` is a policyengine-us Git source.
    4. ``project_dependency`` is a policyengine-us Git dependency.

    Args:
        locked_version: Version of policyengine-us recorded in the lock file.
        source: The ``source`` mapping of the locked package entry.
        project_dependency: The policyengine-us requirement from pyproject.toml.
        latest_version: Latest known release version, or ``None`` if unknown.

    Returns:
        ``True`` when every condition above is satisfied, else ``False``.
    """
    if latest_version is None:
        return False

    locked_is_ahead = _compare_versions(locked_version, latest_version) > 0
    if not locked_is_ahead:
        return False

    if not _is_policyengine_us_git_source(source):
        return False

    return _is_policyengine_us_git_dependency(project_dependency)
113+
114+
88115
def check_dependency(root: Path, latest_version: str | None = None) -> list[str]:
89116
locked_version, source = _locked_policyengine_us(root)
90117
project_dependency = _project_policyengine_us_dependency(root)
91-
lock_uses_git_ref = "git" in source
92-
project_uses_git_ref = "@" in project_dependency and "git+" in project_dependency
118+
git_ref_allowed = _allows_temporary_git_ref(
119+
locked_version, source, project_dependency, latest_version
120+
)
93121

94122
violations: list[str] = []
95123
if (
@@ -103,40 +131,31 @@ def check_dependency(root: Path, latest_version: str | None = None) -> list[str]
103131
)
104132

105133
expected_dependency = f"{POLICYENGINE_US}=={locked_version}"
106-
if not project_uses_git_ref and project_dependency != expected_dependency:
134+
if project_dependency != expected_dependency and not git_ref_allowed:
107135
violations.append(
108136
f"pyproject.toml must pin {expected_dependency} to match uv.lock; "
109137
f"found {project_dependency!r}."
110138
)
111139

112-
if lock_uses_git_ref:
140+
if "git" in source and not git_ref_allowed:
113141
violations.append(
114-
f"{LOCK_GIT_REF_PREFIX} Prefer an exact "
142+
f"{GIT_REF_PREFIX}. Prefer an exact "
115143
f"PyPI release pin once policyengine-us {locked_version} is published."
116144
)
117145

118-
if project_uses_git_ref:
146+
if (
147+
"@" in project_dependency
148+
and "git+" in project_dependency
149+
and not git_ref_allowed
150+
):
119151
violations.append(
120-
f"{PROJECT_GIT_REF_PREFIX} Prefer an exact "
152+
"pyproject.toml pins policyengine-us to a Git ref. Prefer an exact "
121153
"PyPI release pin for production data builds."
122154
)
123155

124156
return violations
125157

126158

127-
def _is_unreleased_git_ref_violation(
128-
violation: str,
129-
locked_version: str,
130-
latest_version: str | None,
131-
) -> bool:
132-
if latest_version is None:
133-
return False
134-
git_ref_violation = violation.startswith(
135-
LOCK_GIT_REF_PREFIX
136-
) or violation.startswith(PROJECT_GIT_REF_PREFIX)
137-
return git_ref_violation and _compare_versions(locked_version, latest_version) > 0
138-
139-
140159
def main() -> int:
141160
parser = argparse.ArgumentParser()
142161
parser.add_argument(
@@ -176,34 +195,35 @@ def main() -> int:
176195
return 0
177196

178197
if not violations:
179-
locked_version, _source = _locked_policyengine_us(REPO_ROOT)
198+
locked_version, source = _locked_policyengine_us(REPO_ROOT)
180199
print(f"policyengine-us dependency is current at {locked_version}.")
200+
if _allows_temporary_git_ref(
201+
locked_version,
202+
source,
203+
_project_policyengine_us_dependency(REPO_ROOT),
204+
latest_version,
205+
):
206+
print(
207+
_annotation(
208+
"warning",
209+
f"policyengine-us {locked_version} is temporarily pinned to "
210+
"GitHub because it is newer than the latest PyPI release. "
211+
"Replace it with an exact PyPI release pin once published.",
212+
)
213+
)
181214
return 0
182215

183-
locked_version, _source = _locked_policyengine_us(REPO_ROOT)
184216
has_blocking_violation = False
185217
allowed_stale_version = False
186-
allowed_unreleased_git_ref = False
187218
for violation in violations:
188219
stale_version_violation = violation.startswith(STALE_LOCK_PREFIX)
189220
allowed_by_override = allow_stale and stale_version_violation
190-
allowed_git_ref = _is_unreleased_git_ref_violation(
191-
violation,
192-
locked_version,
193-
latest_version,
194-
)
195-
level = (
196-
"warning"
197-
if args.mode == "warn" or allowed_by_override or allowed_git_ref
198-
else "error"
199-
)
221+
level = "warning" if args.mode == "warn" or allowed_by_override else "error"
200222
print(_annotation(level, violation))
201-
if args.mode == "fail" and not allowed_by_override and not allowed_git_ref:
223+
if args.mode == "fail" and not allowed_by_override:
202224
has_blocking_violation = True
203225
if allowed_by_override:
204226
allowed_stale_version = True
205-
if allowed_git_ref:
206-
allowed_unreleased_git_ref = True
207227

208228
if allowed_stale_version:
209229
print(
@@ -213,18 +233,10 @@ def main() -> int:
213233
"policyengine-us lagging the latest PyPI release.",
214234
)
215235
)
216-
if allowed_unreleased_git_ref:
217-
print(
218-
_annotation(
219-
"warning",
220-
"policyengine-us is pinned to an unreleased Git ref; switch to "
221-
f"policyengine-us=={locked_version} once that PyPI release exists.",
222-
)
223-
)
224236

225237
if has_blocking_violation:
226238
return 1
227-
if allowed_stale_version or allowed_unreleased_git_ref:
239+
if allowed_stale_version:
228240
return 0
229241

230242
return 1 if args.mode == "fail" else 0

docs/generated/pipeline_api.json

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161
"docstring": "\"Add auto loan balance, interest and net_worth variable.",
6262
"id": "add_auto_loan",
6363
"kind": "function",
64-
"line": 2951,
64+
"line": 3074,
6565
"metadata": {
6666
"api_refs": [
6767
"policyengine_us_data.datasets.cps.cps.add_auto_loan_interest_and_net_worth"
@@ -88,7 +88,7 @@
8888
"docstring": "Populate household-level geography variables used by PolicyEngine US.\n\nArgs:\n cps: Output CPS H5 group receiving derived household variables.\n household: Raw CPS household table.",
8989
"id": "add_household_variables",
9090
"kind": "function",
91-
"line": 1531,
91+
"line": 1667,
9292
"metadata": {
9393
"api_refs": [
9494
"policyengine_us_data.datasets.cps.cps.add_household_variables"
@@ -115,7 +115,7 @@
115115
"docstring": "Add basic ID and weight variables.\n\nArgs:\n cps (h5py.File): The CPS dataset file.\n person (DataFrame): The person table of the ASEC.\n tax_unit (DataFrame): The tax unit table created from the person table\n of the ASEC.\n family (DataFrame): The family table of the ASEC.\n spm_unit (DataFrame): The SPM unit table created from the person table\n of the ASEC.\n household (DataFrame): The household table of the ASEC.",
116116
"id": "add_id_variables",
117117
"kind": "function",
118-
"line": 997,
118+
"line": 1043,
119119
"metadata": {
120120
"api_refs": [
121121
"policyengine_us_data.datasets.cps.cps.add_id_variables"
@@ -139,15 +139,15 @@
139139
"source_file": "policyengine_us_data/datasets/cps/cps.py"
140140
},
141141
"add_org_inputs": {
142-
"docstring": "Impute ORG-derived wage and union inputs onto CPS persons.",
142+
"docstring": "Impute ORG-derived labor-market inputs and derive overtime premium.",
143143
"id": "add_org_inputs",
144144
"kind": "function",
145-
"line": 2835,
145+
"line": 2974,
146146
"metadata": {
147147
"api_refs": [
148148
"policyengine_us_data.datasets.cps.cps.add_org_labor_market_inputs"
149149
],
150-
"description": "Impute hourly wage, hourly-pay status, and union coverage from CPS ORG donors.",
150+
"description": "Impute hourly wage, hourly-pay status, and union coverage from CPS ORG donors, then derive FLSA overtime premium.",
151151
"id": "add_org_inputs",
152152
"label": "ORG Labor-Market Inputs",
153153
"node_type": "library",
@@ -162,14 +162,14 @@
162162
]
163163
},
164164
"object_path": "policyengine_us_data.datasets.cps.cps.add_org_labor_market_inputs",
165-
"signature": "def add_org_labor_market_inputs(cps: h5py.File) -> None",
165+
"signature": "def add_org_labor_market_inputs(cps: h5py.File, time_period: int) -> None",
166166
"source_file": "policyengine_us_data/datasets/cps/cps.py"
167167
},
168168
"add_personal_income_variables": {
169169
"docstring": "Add income variables.\n\nArgs:\n cps (h5py.File): The CPS dataset file.\n person (DataFrame): The CPS person table.\n year (int): The CPS year",
170170
"id": "add_personal_income_variables",
171171
"kind": "function",
172-
"line": 1206,
172+
"line": 1342,
173173
"metadata": {
174174
"api_refs": [
175175
"policyengine_us_data.datasets.cps.cps.add_personal_income_variables"
@@ -196,7 +196,7 @@
196196
"docstring": "Add personal demographic variables.\n\nArgs:\n cps (h5py.File): The CPS dataset file.\n person (DataFrame): The CPS person table.",
197197
"id": "add_personal_variables",
198198
"kind": "function",
199-
"line": 1059,
199+
"line": 1105,
200200
"metadata": {
201201
"api_refs": [
202202
"policyengine_us_data.datasets.cps.cps.add_personal_variables"
@@ -223,7 +223,7 @@
223223
"docstring": "",
224224
"id": "add_previous_year_income",
225225
"kind": "function",
226-
"line": 1573,
226+
"line": 1709,
227227
"metadata": {
228228
"api_refs": [
229229
"policyengine_us_data.datasets.cps.cps.add_previous_year_income"
@@ -250,7 +250,7 @@
250250
"docstring": "",
251251
"id": "add_rent",
252252
"kind": "function",
253-
"line": 371,
253+
"line": 417,
254254
"metadata": {
255255
"api_refs": [
256256
"policyengine_us_data.datasets.cps.cps.add_rent"
@@ -277,7 +277,7 @@
277277
"docstring": "",
278278
"id": "add_spm_variables",
279279
"kind": "function",
280-
"line": 1492,
280+
"line": 1628,
281281
"metadata": {
282282
"api_refs": [
283283
"policyengine_us_data.datasets.cps.cps.add_spm_variables"
@@ -304,7 +304,7 @@
304304
"docstring": "Assign SSN card type using PRCITSHP, employment status, and ASEC-UA conditions.\nCodes:\n- 0: \"NONE\" - Likely undocumented immigrants\n- 1: \"CITIZEN\" - US citizens (born or naturalized)\n- 2: \"NON_CITIZEN_VALID_EAD\" - Non-citizens with work/study authorization\n- 3: \"OTHER_NON_CITIZEN\" - Non-citizens with indicators of legal status",
305305
"id": "add_ssn_card_type",
306306
"kind": "function",
307-
"line": 1679,
307+
"line": 1815,
308308
"metadata": {
309309
"api_refs": [
310310
"policyengine_us_data.datasets.cps.cps.add_ssn_card_type"
@@ -331,7 +331,7 @@
331331
"docstring": "",
332332
"id": "add_takeup",
333333
"kind": "function",
334-
"line": 519,
334+
"line": 565,
335335
"metadata": {
336336
"api_refs": [
337337
"policyengine_us_data.datasets.cps.cps.add_takeup"
@@ -358,7 +358,7 @@
358358
"docstring": "",
359359
"id": "add_tips",
360360
"kind": "function",
361-
"line": 2578,
361+
"line": 2714,
362362
"metadata": {
363363
"api_refs": [
364364
"policyengine_us_data.datasets.cps.cps.add_tips"
@@ -815,7 +815,7 @@
815815
"docstring": "Replace clone-half person-level feature variables with donor matches.",
816816
"id": "clone_features",
817817
"kind": "function",
818-
"line": 585,
818+
"line": 604,
819819
"metadata": {
820820
"api_refs": [
821821
"policyengine_us_data.datasets.cps.extended_cps._splice_clone_feature_predictions"
@@ -878,7 +878,7 @@
878878
"docstring": "Assert that final exported variables are leaf inputs.",
879879
"id": "computed_export_contract",
880880
"kind": "function",
881-
"line": 1775,
881+
"line": 1795,
882882
"metadata": {
883883
"api_refs": [
884884
"policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._assert_no_computed_variables_exported"
@@ -972,7 +972,7 @@
972972
"docstring": "Second-stage QRF: train on CPS, predict for PUF clones.\n\nFor the PUF clone half of the extended CPS we need plausible values\nof CPS-only variables (retirement distributions, transfers, hours,\nSPM components, etc.) that are consistent with the clone's\nPUF-imputed income -- not just naively copied from the CPS donor.\n\nWe train a QRF on CPS person-level data where:\n * predictors = demographics + key income variables\n * outputs = CPS-only variables listed in\n ``CPS_ONLY_IMPUTED_VARIABLES``\n\nFor PUF clone prediction we use the PUF-imputed income values\nfrom the second half of ``data`` (the clone half, which already\nhas PUF-imputed income from stage 1).\n\nUses ``fit_predict()`` with ``max_train_samples`` instead of\nmanual sampling + separate fit/predict.\n\nArgs:\n data: Extended dataset dict after ``puf_clone_dataset()`` --\n already doubled, with PUF-imputed income in the second half.\n time_period: Tax year.\n dataset_path: Path to the CPS h5 file for Microsimulation.\n\nReturns:\n DataFrame with one column per CPS-only variable, containing\n predicted values for the PUF clone half (person-level).",
973973
"id": "cps_only",
974974
"kind": "function",
975-
"line": 624,
975+
"line": 643,
976976
"metadata": {
977977
"api_refs": [
978978
"policyengine_us_data.datasets.cps.extended_cps._impute_cps_only_variables"
@@ -1064,7 +1064,7 @@
10641064
"docstring": "Subsample the loaded CPS dataset and preserve downsampled arrays.\n\nArgs:\n frac: Fraction of records to retain.",
10651065
"id": "downsample",
10661066
"kind": "function",
1067-
"line": 338,
1067+
"line": 384,
10681068
"metadata": {
10691069
"api_refs": [
10701070
"policyengine_us_data.datasets.cps.cps.CPS.downsample"
@@ -1325,7 +1325,7 @@
13251325
"docstring": "Check formula-reconstructed housing assistance before export.\n\nThe final H5 must not export formula outputs such as ``housing_assistance``.\nThis guard verifies that the remaining leaf inputs still make those\nformulas produce nonzero values before the export contract strips or\nrejects computed variables.",
13261326
"id": "housing_assistance_microsim_validation",
13271327
"kind": "function",
1328-
"line": 1545,
1328+
"line": 1565,
13291329
"metadata": {
13301330
"api_refs": [
13311331
"policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._validate_housing_assistance_microsimulation"
@@ -3216,7 +3216,7 @@
32163216
"docstring": "Replace PUF clone half of CPS-only variables with QRF predictions.\n\nAfter ``puf_clone_dataset()`` the CPS-only variables in the second\nhalf are naive copies of the CPS donor values. This function\nreplaces them with the second-stage QRF predictions that are\nconsistent with the clone's PUF-imputed income.\n\nArgs:\n data: Extended dataset dict (already doubled).\n predictions: DataFrame from ``_impute_cps_only_variables()``.\n time_period: Tax year.\n dataset_path: Path to CPS h5 file for entity mapping.\n\nReturns:\n Modified data dict with CPS-only variables spliced in.",
32173217
"id": "qrf_pass2",
32183218
"kind": "function",
3219-
"line": 1015,
3219+
"line": 1034,
32203220
"metadata": {
32213221
"api_refs": [
32223222
"policyengine_us_data.datasets.cps.extended_cps._splice_cps_only_predictions"

docs/generated/pipeline_map.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2739,7 +2739,7 @@
27392739
"api_refs": [
27402740
"policyengine_us_data.datasets.cps.cps.add_org_labor_market_inputs"
27412741
],
2742-
"description": "Impute hourly wage, hourly-pay status, and union coverage from CPS ORG donors.",
2742+
"description": "Impute hourly wage, hourly-pay status, and union coverage from CPS ORG donors, then derive FLSA overtime premium.",
27432743
"id": "add_org_inputs",
27442744
"label": "ORG Labor-Market Inputs",
27452745
"node_type": "library",

0 commit comments

Comments
 (0)