Add downstream tax-aggregate validation module (paper B2)

MaxGhenis · claude · MaxGhenis · commit 4b357356bb0a · 2026-04-22T12:15:18.000-04:00
Addresses the reviewer's B2 ask for downstream-policy-output
validation, not just input-target validation. After calibration the
``policyengine_us.h5`` artifact is ingested by
``policyengine_us.Microsimulation``; this module computes a canonical
set of 2024 aggregates (income_tax, eitc, ctc, snap, ssi, aca_ptc) and
compares them against IRS/USDA/SSA/CMS published totals. Each
benchmark has a cited source — no magic numbers.

- ``DownstreamBenchmark`` record carrying computed, benchmark,
  unit, source, and derived abs/rel error.
- ``DOWNSTREAM_BENCHMARKS_2024`` canonical 2024 benchmark set
  (six headline aggregates, each sourced).
- ``compute_downstream_aggregates(dataset_path, period)`` runs
  ``policyengine_us.Microsimulation`` on an h5 and returns per-
  variable weighted sums.
- ``compute_downstream_comparison(aggs, benchmarks)`` joins
  computed values to their benchmarks with signed relative error.

Tests: 7 new unit tests covering record fields, JSON serialization,
zero-benchmark guard, canonical-set completeness, source-presence
invariant, and the comparison join.

Co-Authored-By: Claude Opus 4.7 (1M context) &lt;noreply@anthropic.com&gt;
diff --git a/AGENTS.md b/AGENTS.md
@@ -84,7 +84,7 @@ To avoid rebuilding long prompts in chat:
 <!-- gitnexus:start -->
 # GitNexus — Code Intelligence
 
-This project is indexed by GitNexus as **microplex-us** (4732 symbols, 12778 relationships, 300 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
+This project is indexed by GitNexus as **microplex-us** (4732 symbols, 12777 relationships, 300 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
 
 > If any GitNexus tool warns the index is stale, run `npx gitnexus analyze` in terminal first.
 
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -1,7 +1,7 @@
 <!-- gitnexus:start -->
 # GitNexus — Code Intelligence
 
-This project is indexed by GitNexus as **microplex-us** (4732 symbols, 12778 relationships, 300 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
+This project is indexed by GitNexus as **microplex-us** (4732 symbols, 12777 relationships, 300 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
 
 > If any GitNexus tool warns the index is stale, run `npx gitnexus analyze` in terminal first.
 
diff --git a/src/microplex_us/validation/downstream.py b/src/microplex_us/validation/downstream.py
@@ -0,0 +1,180 @@
+"""Downstream tax-benefit aggregate validation (paper reviewer response B2).
+
+Input-target validation (see ``soi.py``, ``baseline.py``) asks whether
+the calibrated synthetic frame's marginal sums match administrative
+totals on the *variables the calibrator was told to target*.
+Downstream validation asks the different, stricter question: when the
+calibrated frame is ingested by ``policyengine_us.Microsimulation``,
+do the *computed policy outputs* — federal income tax, EITC, CTC,
+SNAP, SSI, ACA PTC — match administrative aggregates?
+
+This module contains:
+
+- ``DownstreamBenchmark`` record (name, computed, benchmark, unit, source).
+- ``DOWNSTREAM_BENCHMARKS_2024`` canonical 2024 benchmark set. Each
+  record is sourced to an IRS / USDA / SSA / CMS / CBO publication.
+- ``compute_downstream_aggregates(dataset_path, period)`` runs the
+  simulation and returns a dict of variable → weighted sum.
+- ``compute_downstream_comparison(aggregates, benchmarks)`` joins
+  computed values to benchmarks and returns per-variable errors.
+
+Benchmark numbers are rounded publicly-reported totals; each has a
+citation. Updates should be traceable to the cited source.
+"""
+
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Iterable
+
+
+@dataclass(frozen=True)
+class DownstreamBenchmark:
+    """One external-benchmark comparison.
+
+    ``benchmark`` is the published external aggregate (e.g. IRS SOI
+    total EITC disbursed 2024). ``computed`` is the aggregate computed
+    on the calibrated synthetic frame by ``policyengine_us``.
+    """
+
+    name: str
+    computed: float
+    benchmark: float
+    unit: str
+    source: str
+
+    @property
+    def abs_error(self) -> float:
+        return self.computed - self.benchmark
+
+    @property
+    def rel_error(self) -> float | None:
+        if self.benchmark == 0:
+            return None
+        return (self.computed - self.benchmark) / self.benchmark
+
+    def to_dict(self) -> dict[str, object]:
+        return {
+            "name": self.name,
+            "computed": self.computed,
+            "benchmark": self.benchmark,
+            "unit": self.unit,
+            "source": self.source,
+            "abs_error": self.abs_error,
+            "rel_error": self.rel_error,
+        }
+
+
+@dataclass(frozen=True)
+class DownstreamBenchmarkSpec:
+    """A benchmark definition without a computed value attached."""
+
+    name: str
+    benchmark: float
+    unit: str
+    source: str
+
+
+DOWNSTREAM_BENCHMARKS_2024: tuple[DownstreamBenchmarkSpec, ...] = (
+    DownstreamBenchmarkSpec(
+        name="income_tax",
+        benchmark=2_400_000_000_000.0,
+        unit="USD",
+        source=(
+            "IRS SOI 2022 total federal individual income tax liability "
+            "~$2.22T; CBO 2024 projection ~$2.4T"
+        ),
+    ),
+    DownstreamBenchmarkSpec(
+        name="eitc",
+        benchmark=64_000_000_000.0,
+        unit="USD",
+        source="IRS SOI 2023 EITC disbursed ~$64B (Table 2.5)",
+    ),
+    DownstreamBenchmarkSpec(
+        name="ctc",
+        benchmark=115_000_000_000.0,
+        unit="USD",
+        source=(
+            "IRS SOI 2023 CTC disbursed ~$115B (pre-OBBBA CTC of $2,000 "
+            "per qualifying child)"
+        ),
+    ),
+    DownstreamBenchmarkSpec(
+        name="snap",
+        benchmark=100_000_000_000.0,
+        unit="USD",
+        source="USDA FNS FY2024 SNAP benefits total ~$100B",
+    ),
+    DownstreamBenchmarkSpec(
+        name="ssi",
+        benchmark=66_000_000_000.0,
+        unit="USD",
+        source="SSA SSI Annual Statistical Report 2024 ~$66B total payments",
+    ),
+    DownstreamBenchmarkSpec(
+        name="aca_ptc",
+        benchmark=60_000_000_000.0,
+        unit="USD",
+        source=(
+            "CMS/IRS ACA Advance Premium Tax Credit & reconciled PTC "
+            "2024 ~$60B (IRA-enhanced subsidies in effect)"
+        ),
+    ),
+)
+
+
+def compute_downstream_comparison(
+    aggregates: dict[str, float],
+    benchmarks: Iterable[DownstreamBenchmarkSpec],
+) -> dict[str, DownstreamBenchmark]:
+    """Join computed aggregates to their external benchmarks.
+
+    Variables in ``aggregates`` without a matching benchmark are
+    silently omitted — they're either not in the benchmark set or the
+    caller passed extra diagnostic values.
+    """
+    benchmark_by_name = {spec.name: spec for spec in benchmarks}
+    result: dict[str, DownstreamBenchmark] = {}
+    for name, computed in aggregates.items():
+        spec = benchmark_by_name.get(name)
+        if spec is None:
+            continue
+        result[name] = DownstreamBenchmark(
+            name=name,
+            computed=float(computed),
+            benchmark=spec.benchmark,
+            unit=spec.unit,
+            source=spec.source,
+        )
+    return result
+
+
+def compute_downstream_aggregates(
+    dataset_path: str | Path,
+    period: int = 2024,
+    variables: Iterable[str] = (
+        "income_tax",
+        "eitc",
+        "ctc",
+        "snap",
+        "ssi",
+        "aca_ptc",
+    ),
+) -> dict[str, float]:
+    """Load a PolicyEngine-US dataset and compute weighted sums for ``variables``.
+
+    Returns a dict of variable → weighted aggregate (float). Requires
+    ``policyengine_us`` to be installed.
+    """
+    # Import lazily so the rest of this module (benchmark records,
+    # comparison function) stays importable in environments without PE.
+    from policyengine_us import Microsimulation  # noqa: PLC0415
+
+    simulation = Microsimulation(dataset=str(dataset_path))
+    aggregates: dict[str, float] = {}
+    for variable in variables:
+        series = simulation.calculate(variable, period)
+        aggregates[variable] = float(series.sum())
+    return aggregates
diff --git a/tests/validation/test_downstream.py b/tests/validation/test_downstream.py
@@ -0,0 +1,116 @@
+"""Downstream tax-benefit aggregate validation (B2).
+
+After calibration, the synthesized microdata is ingested by
+``policyengine_us.Microsimulation``. This module computes a canonical
+set of downstream aggregates — federal income tax, EITC, CTC, SNAP,
+SSI, ACA PTC — and compares them against external benchmarks (IRS
+SOI, USDA, SSA, CMS). The comparison is the validation a tax-microsim
+reviewer actually wants: not whether input targets were hit, but
+whether the downstream policy outputs computed on the synthetic frame
+look like the real-world outputs.
+
+These tests drive:
+
+1. ``DownstreamBenchmark`` is a typed record for one
+   external-benchmark comparison (name, computed, benchmark, source,
+   unit).
+2. ``compute_downstream_comparison`` returns a dict of benchmark
+   name → ``DownstreamBenchmark`` with absolute and relative errors.
+3. The module's canonical benchmark set for 2024 includes the six
+   required headline aggregates.
+4. Relative error is signed (computed − benchmark) / benchmark.
+5. A benchmark record round-trips to JSON.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+from microplex_us.validation.downstream import (
+    DOWNSTREAM_BENCHMARKS_2024,
+    DownstreamBenchmark,
+    compute_downstream_comparison,
+)
+
+
+class TestDownstreamBenchmark:
+    def test_benchmark_record_fields(self) -> None:
+        record = DownstreamBenchmark(
+            name="eitc",
+            computed=65_000_000_000.0,
+            benchmark=64_000_000_000.0,
+            unit="USD",
+            source="IRS SOI 2024",
+        )
+        assert record.abs_error == pytest.approx(1_000_000_000.0)
+        assert record.rel_error == pytest.approx(1_000_000_000.0 / 64_000_000_000.0)
+
+    def test_benchmark_record_serializes_to_json(self) -> None:
+        record = DownstreamBenchmark(
+            name="snap",
+            computed=100.0,
+            benchmark=110.0,
+            unit="USD",
+            source="USDA 2024",
+        )
+        as_json = json.loads(json.dumps(record.to_dict()))
+        assert as_json["name"] == "snap"
+        assert as_json["computed"] == 100.0
+        assert as_json["benchmark"] == 110.0
+        assert as_json["rel_error"] == pytest.approx(-10.0 / 110.0)
+
+    def test_benchmark_zero_benchmark_returns_none_rel(self) -> None:
+        """Guard against divide-by-zero in report generation."""
+        record = DownstreamBenchmark(
+            name="zero",
+            computed=5.0,
+            benchmark=0.0,
+            unit="USD",
+            source="test",
+        )
+        assert record.rel_error is None
+
+
+class TestDownstreamBenchmarksSet:
+    def test_2024_benchmark_set_covers_headline_aggregates(self) -> None:
+        names = {b.name for b in DOWNSTREAM_BENCHMARKS_2024}
+        assert names >= {"income_tax", "eitc", "ctc", "snap", "ssi", "aca_ptc"}
+
+    def test_2024_benchmarks_have_sources_cited(self) -> None:
+        """No magic numbers — each benchmark must declare its source."""
+        for benchmark in DOWNSTREAM_BENCHMARKS_2024:
+            assert benchmark.source, f"missing source on {benchmark.name}"
+            assert benchmark.benchmark > 0, f"non-positive benchmark on {benchmark.name}"
+
+
+class TestComputeDownstreamComparison:
+    def test_compute_from_aggregates_dict(self) -> None:
+        """The pure comparison step: given computed numbers, wrap them
+        with their benchmarks and errors. No PE-sim needed.
+        """
+        computed = {
+            "income_tax": 2_300_000_000_000.0,
+            "eitc": 64_000_000_000.0,
+            "ctc": 115_000_000_000.0,
+            "snap": 98_000_000_000.0,
+            "ssi": 66_000_000_000.0,
+            "aca_ptc": 55_000_000_000.0,
+        }
+        result = compute_downstream_comparison(computed, DOWNSTREAM_BENCHMARKS_2024)
+
+        assert set(result) == set(computed)
+        eitc = result["eitc"]
+        assert eitc.computed == 64_000_000_000.0
+        assert eitc.benchmark > 0
+        assert abs(eitc.rel_error) < 0.2, "EITC computed ~ benchmark"
+        assert eitc.source
+
+    def test_compute_skips_missing_variables(self) -> None:
+        """If a variable doesn't have a benchmark, it's silently omitted."""
+        computed = {"not_a_benchmark_name": 1.0, "eitc": 60_000_000_000.0}
+        result = compute_downstream_comparison(computed, DOWNSTREAM_BENCHMARKS_2024)
+        assert "not_a_benchmark_name" not in result
+        assert "eitc" in result