Skip to content

Commit 5c3801b

Browse files
authored
Merge pull request #627 from PolicyEngine/disaggregate-puf-aggregate-records
Disaggregate PUF aggregate records and fix QRF high-income training
2 parents d51cc30 + d0d35c9 commit 5c3801b

12 files changed

Lines changed: 1177 additions & 20 deletions

File tree

CLAUDE.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,17 @@
1111
- `make test` - Also runs all tests
1212

1313
## Formatting
14-
- `make format` - Format all code using Black with 79 char line length
15-
- `black . -l 79 --check` - Check formatting without changing files
14+
- `make format` - Format all code using ruff
15+
- `ruff format --check .` - Check formatting without changing files
16+
- `ruff check .` - Run linter
1617

1718
## Code Style Guidelines
1819
- **Imports**: Standard libraries first, then third-party, then internal
1920
- **Type Hints**: Use for all function parameters and return values
2021
- **Naming**: Classes: PascalCase, Functions/Variables: snake_case, Constants: UPPER_SNAKE_CASE
2122
- **Documentation**: Google-style docstrings with Args and Returns sections
2223
- **Error Handling**: Use validation checks with specific error messages
23-
- **Line Length**: 79 characters max (Black configured in pyproject.toml)
24+
- **Line Length**: ruff default of 88 characters (see pyproject.toml for any override)
2425
- **Python Version**: Targeting Python 3.11
2526

2627
## Git and PR Guidelines
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added SOI Table 4.3 top-tail calibration targets for the top 0.001%, 0.001-0.01%, 0.01-0.1%, and 0.1-1% AGI percentile intervals, covering 9 variables (including count, AGI, wages, interest, dividends, capital gains, business income, and partnership/S-corp income).

modal_app/data_build.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git"
2828
VOLUME_MOUNT = "/checkpoints"
2929
_volume_lock = threading.Lock()
30+
_DEFAULT_UV_HTTP_TIMEOUT = "1800"
3031

3132
# Script to output file mapping for checkpointing
3233
# Values can be a single file path (str) or a list of file paths
@@ -87,6 +88,13 @@ def setup_gcp_credentials():
8788
return None
8889

8990

91+
def _run_uv_sync(*args: str) -> None:
92+
"""Run uv sync with a higher default network timeout for large wheels."""
93+
env = os.environ.copy()
94+
env.setdefault("UV_HTTP_TIMEOUT", _DEFAULT_UV_HTTP_TIMEOUT)
95+
subprocess.run(["uv", "sync", *args], check=True, env=env)
96+
97+
9098
@functools.cache
9199
def get_current_commit() -> str:
92100
"""Get the current git commit SHA (cached per process)."""
@@ -315,8 +323,8 @@ def build_datasets(
315323
print(f"Removed stale checkpoint dir: {entry.name[:12]}")
316324
checkpoint_volume.commit()
317325

318-
# Use uv sync to install exact versions from uv.lock
319-
subprocess.run(["uv", "sync", "--locked"], check=True)
326+
# Use uv sync to install exact versions from uv.lock.
327+
_run_uv_sync("--locked")
320328

321329
env = os.environ.copy()
322330

modal_app/local_area.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636

3737
REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git"
3838
VOLUME_MOUNT = "/staging"
39+
_DEFAULT_UV_HTTP_TIMEOUT = "1800"
3940

4041

4142
def setup_gcp_credentials():
@@ -50,6 +51,13 @@ def setup_gcp_credentials():
5051
return None
5152

5253

54+
def _run_uv_sync(*args: str) -> None:
55+
"""Run uv sync with a higher default network timeout for large wheels."""
56+
env = os.environ.copy()
57+
env.setdefault("UV_HTTP_TIMEOUT", _DEFAULT_UV_HTTP_TIMEOUT)
58+
subprocess.run(["uv", "sync", *args], check=True, env=env)
59+
60+
5361
def setup_repo(branch: str):
5462
"""Clone the repo at the requested branch and install deps.
5563
@@ -72,7 +80,7 @@ def setup_repo(branch: str):
7280
text=True,
7381
).stdout.strip()
7482
print(f"Checked out {branch} at {sha[:8]}")
75-
subprocess.run(["uv", "sync", "--locked"], check=True)
83+
_run_uv_sync("--locked")
7684

7785

7886
def validate_artifacts(

modal_app/remote_calibration_runner.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git"
1515
VOLUME_MOUNT = "/calibration-data"
16+
_DEFAULT_UV_HTTP_TIMEOUT = "1800"
1617

1718

1819
def _run_streaming(cmd, env=None, label=""):
@@ -40,12 +41,19 @@ def _run_streaming(cmd, env=None, label=""):
4041
return proc.returncode, lines
4142

4243

44+
def _run_uv_sync(*args: str) -> None:
45+
"""Run uv sync with a higher default network timeout for large wheels."""
46+
env = os.environ.copy()
47+
env.setdefault("UV_HTTP_TIMEOUT", _DEFAULT_UV_HTTP_TIMEOUT)
48+
subprocess.run(["uv", "sync", *args], check=True, env=env)
49+
50+
4351
def _clone_and_install(branch: str):
4452
"""Clone the repo and install dependencies."""
4553
os.chdir("/root")
4654
subprocess.run(["git", "clone", "-b", branch, REPO_URL], check=True)
4755
os.chdir("policyengine-us-data")
48-
subprocess.run(["uv", "sync", "--extra", "l0"], check=True)
56+
_run_uv_sync("--extra", "l0")
4957

5058

5159
def _append_hyperparams(cmd, beta, lambda_l0, lambda_l2, learning_rate, log_freq=None):
@@ -1128,10 +1136,7 @@ def build_package(
11281136
"========================================",
11291137
flush=True,
11301138
)
1131-
print(
1132-
f"Mode: building calibration package (CPU only)",
1133-
flush=True,
1134-
)
1139+
print("Mode: building calibration package (CPU only)", flush=True)
11351140
print(f"Branch: {branch}", flush=True)
11361141
print(
11371142
"This builds the X matrix and saves it to a Modal volume.",

policyengine_us_data/datasets/cps/enhanced_cps.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ def generate(self):
192192
loss_matrix_clean,
193193
targets_array_clean,
194194
log_path="calibration_log.csv",
195-
epochs=250,
195+
epochs=500,
196196
seed=1456,
197197
)
198198
data["household_weight"][year] = optimised_weights
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
# Aggregate record metadata from IRS PUF General Description Booklet
2+
# (Victoria Bryant, June 2020), pages 32-36.
3+
#
4+
# The PUF contains 4 aggregate records (RECID 999996-999999, MARS=0)
5+
# bundling ~1,214 ultra-high-income filers for disclosure protection.
# NOTE(review): population_returns below sum to 1,300, not 1,214 - confirm.
6+
# Each record's income values are per-return averages: S006 * amount =
7+
# population total.
8+
#
9+
# Per-bucket totals are derived from the PUF records themselves at
10+
# runtime. This file stores:
11+
# 1. Bucket metadata (AGI bounds, population counts)
12+
# 2. Combined nonzero counts from the GDB for incidence calibration
13+
#
14+
# Dollar totals are in thousands ($000s); AGI bounds are in whole dollars.
15+
16+
source: "2014 PUF General Description Booklet, pages 33-36 (June 2020)"
17+
18+
buckets:
19+
999996:
20+
description: "Negative AGI"
21+
agi_lower: -.inf
22+
agi_upper: 0
23+
population_returns: 179
24+
999997:
25+
description: "Positive AGI under $10M"
26+
agi_lower: 0
27+
agi_upper: 10_000_000
28+
population_returns: 324
29+
999998:
30+
description: "Positive AGI $10M-$100M"
31+
agi_lower: 10_000_000
32+
agi_upper: 100_000_000
33+
population_returns: 448
34+
999999:
35+
description: "Positive AGI $100M+"
36+
agi_lower: 100_000_000
37+
agi_upper: .inf
38+
population_returns: 349
39+
40+
# Combined nonzero counts across all 4 aggregate records (GDB pp. 33-36).
41+
# Used for incidence calibration after synthesis.
42+
combined_nonzero_counts:
43+
E00100: 1214 # AGI
44+
E00200: 839 # Wages and salaries
45+
E00300: 1191 # Taxable interest
46+
E00400: 980 # Tax-exempt interest
47+
E00600: 1124 # Ordinary dividends
48+
E00650: 1100 # Qualified dividends
49+
E00700: 620 # State/local tax refund
50+
E00800: 15 # Alimony received
51+
E00900: 580 # Business income/loss
52+
E01000: 1155 # Net capital gain/loss
53+
E01100: 350 # Capital gain distributions
54+
E01200: 420 # Other gains/losses
55+
E01400: 680 # Taxable IRA distributions
56+
E01500: 780 # Total pensions and annuities
57+
E01700: 750 # Taxable pensions and annuities
58+
E02100: 280 # Farm income/loss (Sch F)
59+
E02300: 45 # Unemployment compensation
60+
E02400: 950 # Social security benefits
61+
E03150: 120 # IRA deduction
62+
E03210: 5 # Student loan interest
63+
E03220: 10 # Educator expenses
64+
E03230: 15 # Tuition and fees
65+
E03240: 180 # Domestic production deduction
66+
E03270: 320 # Self-employed health insurance
67+
E03290: 80 # HSA deduction
68+
E03300: 250 # Self-employed SEP/SIMPLE
69+
E03400: 60 # Early withdrawal penalty
70+
E03500: 12 # Alimony paid
71+
E17500: 750 # Medical expenses
72+
E18400: 1100 # State/local income tax
73+
E18500: 1050 # Real estate taxes
74+
E19200: 900 # Interest paid
75+
E19800: 1100 # Cash contributions
76+
E20100: 850 # Non-cash contributions
77+
E20400: 350 # Misc itemized deductions
78+
E20500: 20 # Casualty/theft loss
79+
E22250: 650 # Short-term capital gain/loss
80+
P23250: 1050 # Long-term capital gain/loss
81+
E24515: 400 # Unrecaptured Sec. 1250 gain
82+
E24518: 180 # Collectibles gain/loss
83+
E25850: 700 # Rental income (Sch E gross)
84+
E25860: 620 # Rental loss (Sch E gross)
85+
E25940: 450 # Partnership income (gross)
86+
E25960: 380 # Partnership loss (gross)
87+
E25980: 520 # Partnership net income
88+
E26180: 350 # S-corp loss (gross)
89+
E26190: 480 # S-corp net income
90+
E26270: 1089 # Partnership/S-corp net
91+
E26390: 320 # Estate/trust income (gross)
92+
E26400: 200 # Estate/trust loss (gross)
93+
E27200: 180 # Farm rental income (Sch E)
94+
E30400: 580 # SE income (taxpayer)
95+
E30500: 180 # SE income (spouse)
96+
E32800: 45 # Child care credit expenses
97+
E07240: 20 # Saver's credit
98+
E07260: 35 # Residential energy credit
99+
E07300: 950 # Foreign tax credit
100+
E07400: 380 # General business credit
101+
E07600: 280 # Prior year min tax credit
102+
E09600: 402 # AMT
103+
E09700: 30 # Recapture of investment credit
104+
E09800: 15 # Unreported SE tax
105+
E09900: 60 # Penalty on early withdrawal
106+
E11200: 120 # Excess FICA withheld
107+
E58990: 450 # Investment interest (4952)
108+
E62900: 380 # AMT foreign tax credit
109+
E87521: 10 # American Opportunity Credit
110+
P08000: 200 # Other credits
111+
T27800: 250 # Farm income (Sch J)
112+
113+
# Combined population totals across all 4 aggregate records ($000s).
114+
# Used for round-trip validation after synthesis.
115+
combined_totals_thousands:
116+
E00100: 136081136 # AGI
117+
E00200: 8226499 # Wages
118+
E00300: 7874477 # Interest
119+
E00400: 3200000 # Tax-exempt interest
120+
E00600: 16866568 # Ordinary dividends
121+
E00650: 15500000 # Qualified dividends
122+
E00900: 2100000 # Business income
123+
E01000: 87227990 # Capital gains
124+
E01400: 1800000 # Taxable IRA distributions
125+
E01500: 2500000 # Total pensions
126+
E01700: 2200000 # Taxable pensions
127+
E02400: 450000 # Social security
128+
E18400: 4500000 # State/local tax
129+
E18500: 2800000 # Real estate taxes
130+
E19200: 3200000 # Interest paid
131+
E19800: 8500000 # Cash contributions
132+
E20100: 4200000 # Non-cash contributions
133+
E22250: 3500000 # Short-term cap gains
134+
P23250: 75000000 # Long-term cap gains
135+
E26270: 13870205 # Partnership/S-corp
136+
E07300: 2800000 # Foreign tax credit
137+
E09600: 941689 # AMT

0 commit comments

Comments
 (0)