Skip to content

Commit 4b35735

Browse files
MaxGhenis and claude committed
Add downstream tax-aggregate validation module (paper B2)
Addresses the reviewer's B2 ask for downstream-policy-output validation, not just input-target validation. After calibration the ``policyengine_us.h5`` artifact is ingested by ``policyengine_us.Microsimulation``; this module computes a canonical set of 2024 aggregates (income_tax, eitc, ctc, snap, ssi, aca_ptc) and compares them against IRS/USDA/SSA/CMS published totals. Each benchmark has a cited source — no magic numbers. - ``DownstreamBenchmark`` record carrying computed, benchmark, unit, source, and derived abs/rel error. - ``DOWNSTREAM_BENCHMARKS_2024`` canonical 2024 benchmark set (six headline aggregates, each sourced). - ``compute_downstream_aggregates(dataset_path, period)`` runs ``policyengine_us.Microsimulation`` on an h5 and returns per- variable weighted sums. - ``compute_downstream_comparison(aggs, benchmarks)`` joins computed values to their benchmarks with signed relative error. Tests: 7 new unit tests covering record fields, JSON serialization, zero-benchmark guard, canonical-set completeness, source-presence invariant, and the comparison join. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 8fa62e4 commit 4b35735

4 files changed

Lines changed: 298 additions & 2 deletions

File tree

AGENTS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ To avoid rebuilding long prompts in chat:
8484
<!-- gitnexus:start -->
8585
# GitNexus — Code Intelligence
8686

87-
This project is indexed by GitNexus as **microplex-us** (4732 symbols, 12778 relationships, 300 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
87+
This project is indexed by GitNexus as **microplex-us** (4732 symbols, 12777 relationships, 300 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
8888

8989
> If any GitNexus tool warns the index is stale, run `npx gitnexus analyze` in terminal first.
9090

CLAUDE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
<!-- gitnexus:start -->
22
# GitNexus — Code Intelligence
33

4-
This project is indexed by GitNexus as **microplex-us** (4732 symbols, 12778 relationships, 300 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
4+
This project is indexed by GitNexus as **microplex-us** (4732 symbols, 12777 relationships, 300 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
55

66
> If any GitNexus tool warns the index is stale, run `npx gitnexus analyze` in terminal first.
77
Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
"""Downstream tax-benefit aggregate validation (paper reviewer response B2).
2+
3+
Input-target validation (see ``soi.py``, ``baseline.py``) asks whether
4+
the calibrated synthetic frame's marginal sums match administrative
5+
totals on the *variables the calibrator was told to target*.
6+
Downstream validation asks the different, stricter question: when the
7+
calibrated frame is ingested by ``policyengine_us.Microsimulation``,
8+
do the *computed policy outputs* — federal income tax, EITC, CTC,
9+
SNAP, SSI, ACA PTC — match administrative aggregates?
10+
11+
This module contains:
12+
13+
- ``DownstreamBenchmark`` record (name, computed, benchmark, unit, source).
14+
- ``DOWNSTREAM_BENCHMARKS_2024`` canonical 2024 benchmark set. Each
15+
``DownstreamBenchmarkSpec`` record is sourced to an IRS / USDA / SSA / CMS / CBO publication.
16+
- ``compute_downstream_aggregates(dataset_path, period)`` runs the
17+
simulation and returns a dict of variable → weighted sum.
18+
- ``compute_downstream_comparison(aggregates, benchmarks)`` joins
19+
computed values to benchmarks and returns per-variable errors.
20+
21+
Benchmark numbers are rounded publicly-reported totals; each has a
22+
citation. Updates should be traceable to the cited source.
23+
"""
24+
25+
from __future__ import annotations
26+
27+
from dataclasses import asdict, dataclass, field
28+
from pathlib import Path
29+
from typing import Iterable
30+
31+
32+
@dataclass(frozen=True)
class DownstreamBenchmark:
    """A computed aggregate paired with the published total it is checked against.

    ``computed`` is the aggregate obtained on the calibrated synthetic
    frame by ``policyengine_us``; ``benchmark`` is the externally
    published figure (for example an IRS SOI total). ``unit`` and
    ``source`` describe the benchmark and are carried along unchanged.
    """

    # Identifier of the aggregate, e.g. "eitc".
    name: str
    # Value computed on the synthetic frame.
    computed: float
    # Published external value the computed one is compared with.
    benchmark: float
    # Unit shared by ``computed`` and ``benchmark``, e.g. "USD".
    unit: str
    # Citation for the published value.
    source: str

    @property
    def abs_error(self) -> float:
        """Signed difference ``computed - benchmark`` (no absolute value is taken)."""
        return self.computed - self.benchmark

    @property
    def rel_error(self) -> float | None:
        """Signed error as a fraction of the benchmark.

        Returns ``None`` when the benchmark is zero, so callers never
        hit a division by zero.
        """
        if self.benchmark == 0:
            return None
        return self.abs_error / self.benchmark

    def to_dict(self) -> dict[str, object]:
        """Return the five stored fields followed by both derived errors."""
        ordered_keys = (
            "name",
            "computed",
            "benchmark",
            "unit",
            "source",
            "abs_error",
            "rel_error",
        )
        return {key: getattr(self, key) for key in ordered_keys}
67+
68+
69+
@dataclass(frozen=True)
class DownstreamBenchmarkSpec:
    """Definition of one external benchmark, before any computed value exists.

    Holds only the published side of a comparison: which aggregate it
    refers to, the published value, its unit, and the citation.
    """

    # Identifier of the aggregate the benchmark applies to, e.g. "snap".
    name: str
    # Published external value.
    benchmark: float
    # Unit of ``benchmark``, e.g. "USD".
    unit: str
    # Citation for the published value.
    source: str
77+
78+
79+
# Canonical 2024 benchmark set: one row per headline aggregate, written as
# (name, published total, citation). Every total is expressed in USD, so the
# unit is filled in once while the records are built.
DOWNSTREAM_BENCHMARKS_2024: tuple[DownstreamBenchmarkSpec, ...] = tuple(
    DownstreamBenchmarkSpec(
        name=row_name,
        benchmark=row_total,
        unit="USD",
        source=row_citation,
    )
    for row_name, row_total, row_citation in (
        (
            "income_tax",
            2_400_000_000_000.0,
            "IRS SOI 2022 total federal individual income tax liability "
            "~$2.22T; CBO 2024 projection ~$2.4T",
        ),
        (
            "eitc",
            64_000_000_000.0,
            "IRS SOI 2023 EITC disbursed ~$64B (Table 2.5)",
        ),
        (
            "ctc",
            115_000_000_000.0,
            "IRS SOI 2023 CTC disbursed ~$115B (pre-OBBBA CTC of $2,000 "
            "per qualifying child)",
        ),
        (
            "snap",
            100_000_000_000.0,
            "USDA FNS FY2024 SNAP benefits total ~$100B",
        ),
        (
            "ssi",
            66_000_000_000.0,
            "SSA SSI Annual Statistical Report 2024 ~$66B total payments",
        ),
        (
            "aca_ptc",
            60_000_000_000.0,
            "CMS/IRS ACA Advance Premium Tax Credit & reconciled PTC "
            "2024 ~$60B (IRA-enhanced subsidies in effect)",
        ),
    )
)
126+
127+
128+
def compute_downstream_comparison(
    aggregates: dict[str, float],
    benchmarks: Iterable[DownstreamBenchmarkSpec],
) -> dict[str, DownstreamBenchmark]:
    """Pair each computed aggregate with the benchmark of the same name.

    Entries of ``aggregates`` whose name has no benchmark are dropped
    without error: the caller may have passed extra diagnostic values,
    or the variable is simply not part of the benchmark set. The result
    keeps the iteration order of ``aggregates``.
    """
    # Index the specs by name; a repeated name keeps the last spec seen.
    specs = {spec.name: spec for spec in benchmarks}

    # Lazily keep only the aggregates that have a benchmark to join against.
    matched = (
        (variable, value, specs[variable])
        for variable, value in aggregates.items()
        if variable in specs
    )
    return {
        variable: DownstreamBenchmark(
            name=variable,
            computed=float(value),
            benchmark=spec.benchmark,
            unit=spec.unit,
            source=spec.source,
        )
        for variable, value, spec in matched
    }
152+
153+
154+
def compute_downstream_aggregates(
    dataset_path: str | Path,
    period: int = 2024,
    variables: Iterable[str] = (
        "income_tax",
        "eitc",
        "ctc",
        "snap",
        "ssi",
        "aca_ptc",
    ),
) -> dict[str, float]:
    """Run PolicyEngine-US on a dataset and total each requested variable.

    Builds a ``policyengine_us.Microsimulation`` from ``dataset_path``
    (passed as a string), calculates every name in ``variables`` for
    ``period``, and returns a dict of variable → ``float`` of the
    calculated series' ``.sum()``, which this module uses as the
    weighted aggregate. ``policyengine_us`` must be installed.
    """
    # Deferred import: the benchmark records and the comparison function
    # in this module must stay importable where PE is not installed.
    from policyengine_us import Microsimulation  # noqa: PLC0415

    sim = Microsimulation(dataset=str(dataset_path))
    return {
        name: float(sim.calculate(name, period).sum())
        for name in variables
    }
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
"""Downstream tax-benefit aggregate validation (B2).
2+
3+
After calibration, the synthesized microdata is ingested by
4+
``policyengine_us.Microsimulation``. This module computes a canonical
5+
set of downstream aggregates — federal income tax, EITC, CTC, SNAP,
6+
SSI, ACA PTC — and compares them against external benchmarks (IRS
7+
SOI, USDA, SSA, CMS). The comparison is the validation a tax-microsim
8+
reviewer actually wants: not whether input targets were hit, but
9+
whether the downstream policy outputs computed on the synthetic frame
10+
look like the real-world outputs.
11+
12+
These tests drive:
13+
14+
1. ``DownstreamBenchmark`` is a typed record for one
15+
external-benchmark comparison (name, computed, benchmark, source,
16+
unit).
17+
2. ``compute_downstream_comparison`` returns a dict of benchmark
18+
name → ``DownstreamBenchmark`` with absolute and relative errors.
19+
3. The module's canonical benchmark set for 2024 includes the six
20+
required headline aggregates.
21+
4. Relative error is signed (computed − benchmark) / benchmark.
22+
5. A benchmark record round-trips to JSON.
23+
"""
24+
25+
from __future__ import annotations
26+
27+
import json
28+
from pathlib import Path
29+
30+
import pytest
31+
32+
from microplex_us.validation.downstream import (
33+
DOWNSTREAM_BENCHMARKS_2024,
34+
DownstreamBenchmark,
35+
compute_downstream_comparison,
36+
)
37+
38+
39+
class TestDownstreamBenchmark:
    """Behaviour of a single ``DownstreamBenchmark`` record."""

    def test_benchmark_record_fields(self) -> None:
        """Both derived errors are signed and based on computed − benchmark."""
        expected_gap = 1_000_000_000.0
        eitc_record = DownstreamBenchmark(
            name="eitc",
            computed=65_000_000_000.0,
            benchmark=64_000_000_000.0,
            unit="USD",
            source="IRS SOI 2024",
        )
        assert eitc_record.abs_error == pytest.approx(expected_gap)
        assert eitc_record.rel_error == pytest.approx(
            expected_gap / 64_000_000_000.0
        )

    def test_benchmark_record_serializes_to_json(self) -> None:
        """``to_dict`` output survives a JSON encode/decode round trip."""
        snap_record = DownstreamBenchmark(
            name="snap",
            computed=100.0,
            benchmark=110.0,
            unit="USD",
            source="USDA 2024",
        )
        encoded = json.dumps(snap_record.to_dict())
        decoded = json.loads(encoded)
        assert decoded["name"] == "snap"
        assert decoded["computed"] == 100.0
        assert decoded["benchmark"] == 110.0
        assert decoded["rel_error"] == pytest.approx(-10.0 / 110.0)

    def test_benchmark_zero_benchmark_returns_none_rel(self) -> None:
        """Guard against divide-by-zero in report generation."""
        zero_record = DownstreamBenchmark(
            name="zero",
            computed=5.0,
            benchmark=0.0,
            unit="USD",
            source="test",
        )
        assert zero_record.rel_error is None
75+
76+
77+
class TestDownstreamBenchmarksSet:
    """Invariants of the canonical ``DOWNSTREAM_BENCHMARKS_2024`` set."""

    def test_2024_benchmark_set_covers_headline_aggregates(self) -> None:
        """All six headline aggregates are present (extras are allowed)."""
        required = {"income_tax", "eitc", "ctc", "snap", "ssi", "aca_ptc"}
        present = {spec.name for spec in DOWNSTREAM_BENCHMARKS_2024}
        assert required <= present

    def test_2024_benchmarks_have_sources_cited(self) -> None:
        """No magic numbers — each benchmark must declare its source."""
        for spec in DOWNSTREAM_BENCHMARKS_2024:
            assert spec.source, f"missing source on {spec.name}"
            assert spec.benchmark > 0, f"non-positive benchmark on {spec.name}"
87+
88+
89+
class TestComputeDownstreamComparison:
    """The pure join step between computed aggregates and benchmarks."""

    def test_compute_from_aggregates_dict(self) -> None:
        """Given computed numbers, each is wrapped with its benchmark and
        errors. No PE-sim needed.
        """
        aggregates = dict(
            income_tax=2_300_000_000_000.0,
            eitc=64_000_000_000.0,
            ctc=115_000_000_000.0,
            snap=98_000_000_000.0,
            ssi=66_000_000_000.0,
            aca_ptc=55_000_000_000.0,
        )
        comparison = compute_downstream_comparison(
            aggregates, DOWNSTREAM_BENCHMARKS_2024
        )

        # Every computed variable has a benchmark, so none is dropped.
        assert set(comparison) == set(aggregates)
        eitc_row = comparison["eitc"]
        assert eitc_row.computed == 64_000_000_000.0
        assert eitc_row.benchmark > 0
        assert abs(eitc_row.rel_error) < 0.2, "EITC computed ~ benchmark"
        assert eitc_row.source

    def test_compute_skips_missing_variables(self) -> None:
        """If a variable doesn't have a benchmark, it's silently omitted."""
        aggregates = {"not_a_benchmark_name": 1.0, "eitc": 60_000_000_000.0}
        comparison = compute_downstream_comparison(
            aggregates, DOWNSTREAM_BENCHMARKS_2024
        )
        assert "not_a_benchmark_name" not in comparison
        assert "eitc" in comparison

0 commit comments

Comments
 (0)