PolicyEngine
diff --git a/changelog.d/add-cap-gains-agi-targets.fixed.md b/changelog.d/add-cap-gains-agi-targets.fixed.md
Lines changed: 1 addition & 0 deletions
diff --git a/policyengine_us_data/calibration/target_config.yaml b/policyengine_us_data/calibration/target_config.yaml
Lines changed: 60 additions & 4 deletions
diff --git a/policyengine_us_data/datasets/puf/aggregate_record_utils.py b/policyengine_us_data/datasets/puf/aggregate_record_utils.py
Lines changed: 7 additions & 1 deletion
diff --git a/policyengine_us_data/datasets/puf/forbes_backbone.py b/policyengine_us_data/datasets/puf/forbes_backbone.py
Lines changed: 141 additions & 32 deletions
@@ -0,0 +1 @@
+Add CBO aggregate and AGI-bracket targets for capital gains, dividends, and interest income, and scale Forbes top-tail SCF draws to Forbes AGI estimates instead of Forbes wealth, to constrain inflated capital-income aggregates (issues #555, #866).
@@ -182,6 +182,57 @@ include:
   - variable: net_capital_gains
     geo_level: national
     domain_variable: net_capital_gains
+  # Per-AGI-bracket capital-income targets — without these the optimizer can
+  # match the single national aggregate while letting individual records
+  # blow up (see issues #555, #866).
+  - variable: net_capital_gains
+    geo_level: national
+    domain_variable: adjusted_gross_income,net_capital_gains
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: net_capital_gains
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: adjusted_gross_income,net_capital_gains
+  # Re-include dividend / interest aggregates that were previously dropped
+  # for "high error or tension". A ~30% relative error on a soft target is
+  # still far better than the 5-15x inflation we get with no constraint.
+  - variable: qualified_dividend_income
+    geo_level: national
+    domain_variable: qualified_dividend_income
+  - variable: qualified_dividend_income
+    geo_level: national
+    domain_variable: adjusted_gross_income,qualified_dividend_income
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: qualified_dividend_income
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: adjusted_gross_income,qualified_dividend_income
+  - variable: dividend_income
+    geo_level: national
+    domain_variable: dividend_income
+  - variable: dividend_income
+    geo_level: national
+    domain_variable: adjusted_gross_income,dividend_income
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: dividend_income
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: adjusted_gross_income,dividend_income
+  - variable: taxable_interest_income
+    geo_level: national
+    domain_variable: taxable_interest_income
+  - variable: taxable_interest_income
+    geo_level: national
+    domain_variable: adjusted_gross_income,taxable_interest_income
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: taxable_interest_income
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: adjusted_gross_income,taxable_interest_income
   - variable: refundable_ctc
     geo_level: national
     domain_variable: refundable_ctc
@@ -272,8 +323,13 @@ include:
 
   # NOT INCLUDED — high error or tension (from prior validation)
   # =====================================================================
-  # dividend_income (26%, tension), qualified_dividend_income (29%, tension),
   # rental_income (20%), income_tax_before_credits (21%),
-  # salt SOI (102%), taxable_interest_income (61%),
-  # tax_exempt_interest_income (61%), taxable_ira_distributions (68%),
-  # taxable_social_security (55%), person_count by AGI bins (100%)
+  # salt SOI (102%), tax_exempt_interest_income (61%),
+  # taxable_ira_distributions (68%), taxable_social_security (55%),
+  # person_count by AGI bins (100%)
+  #
+  # Re-included above (despite higher error) because excluding them lets
+  # aggregates run wild — see issues #555 and #866:
+  #   dividend_income (was 26% rel-err)
+  #   qualified_dividend_income (was 29%)
+  #   taxable_interest_income (was 61%)
@@ -95,6 +95,12 @@ def _get_bucket_targets(row: pd.Series) -> tuple[float, float, float]:
     return pop_weight, target_mean_agi, target_total_agi
 
 
+def _finite_amount(value) -> float:
+    """Return a finite aggregate amount, treating missing targets as zero.
+
+    ``value`` is converted with ``float()``. NaN and +/-infinity are mapped
+    to ``0.0``; any other number is returned unchanged as a float. Inputs
+    that ``float()`` cannot convert (for example ``None``) still raise.
+    """
+    value = float(value)
+    # np.isfinite is False for NaN and for +/-inf, so both fall back to 0.0.
+    return value if np.isfinite(value) else 0.0
+
+
 def _get_donor_bucket(regular: pd.DataFrame, recid: int) -> pd.DataFrame:
     """Return donor records for one aggregate bucket, with a safe fallback."""
     donor_bucket = regular[_get_bucket_mask(regular, recid)].copy()
@@ -406,7 +412,7 @@ def _calibrate_amount_columns(
         if column == "E00100":
             continue
 
-        target_total = pop_weight * float(row.get(column, 0))
+        target_total = pop_weight * _finite_amount(row.get(column, 0))
         synthetic[column] = _allocate_weighted_values(
             base_values=selected[column].to_numpy(dtype=float),
             weights=synthetic_weights,
 
@@ -227,10 +227,14 @@ def build_forbes_top_tail_artifact(
         target_n=target_n,
         scf_donors=scf_donors,
     )
+    if selected_forbes.empty:
+        raise ValueError("Forbes backbone produced no eligible units.")
     if len(selected_forbes) < target_n:
-        raise ValueError(
-            "Forbes backbone produced only "
-            f"{len(selected_forbes)} eligible units for target {target_n}."
+        logger.warning(
+            "Forbes backbone produced %s eligible units for target %s; "
+            "scaling replicate weights to the aggregate-row population.",
+            len(selected_forbes),
+            target_n,
         )
 
     forbes_draws = expand_forbes_replicates(
@@ -265,16 +269,16 @@ def build_forbes_top_tail_artifact(
         next_recid + len(scf_draws),
         dtype=int,
     )
-    synthetic["S006"] = config.unit_weight_hundredths
+    synthetic_weight_hundredths = _scaled_replicate_weight_hundredths(
+        total_units=target_n,
+        row_count=len(scf_draws),
+    )
+    synthetic["S006"] = synthetic_weight_hundredths
 
     utils._apply_structural_templates(synthetic, donor_templates)
     apply_forbes_structural_overrides(synthetic, scf_draws)
 
-    synthetic_weights = np.full(
-        len(scf_draws),
-        1.0 / config.replicate_count,
-        dtype=float,
-    )
+    synthetic_weights = synthetic_weight_hundredths.astype(float) / 100
     utils._calibrate_amount_columns(
         synthetic=synthetic,
         selected=puf_priors,
@@ -285,7 +289,7 @@ def build_forbes_top_tail_artifact(
         amount_columns=amount_columns,
         synthetic_weights=synthetic_weights,
     )
-    synthetic["S006"] = config.unit_weight_hundredths
+    synthetic["S006"] = synthetic_weight_hundredths
 
     artifact = ForbesTopTailArtifact(
         source_forbes=source_forbes,
@@ -343,6 +347,34 @@ def build_forbes_top_tail_diagnostics(
     }
 
 
+def _scaled_replicate_weight_hundredths(
+    total_units: int,
+    row_count: int,
+) -> np.ndarray:
+    """Return integer hundredth weights that exactly sum to a unit target.
+
+    The target ``total_units`` is expressed in hundredths
+    (``int(total_units * 100)``) and split across ``row_count`` rows. Every
+    row receives the floor share; the leftover hundredths are handed out one
+    apiece to rows spaced evenly through the array.
+
+    Args:
+        total_units: Number of whole units the weights must add up to.
+        row_count: Number of rows (weights) to produce.
+
+    Returns:
+        Integer array of length ``row_count`` whose sum is
+        ``int(total_units * 100)``.
+
+    Raises:
+        ValueError: If ``total_units`` or ``row_count`` is not positive, or
+            if there are more rows than available hundredths (the floor
+            share would be zero).
+    """
+
+    if total_units <= 0:
+        raise ValueError("Forbes synthetic target units must be positive.")
+    if row_count <= 0:
+        raise ValueError("Forbes synthetic row count must be positive.")
+
+    # NOTE(review): int() truncates, so a non-integer total_units would lose
+    # any fraction of a hundredth here — the annotation says int; confirm
+    # callers never pass a float.
+    total_hundredths = int(total_units * 100)
+    base = total_hundredths // row_count
+    remainder = total_hundredths - base * row_count
+    if base <= 0:
+        raise ValueError(
+            "Forbes synthetic row count exceeds available hundredth weights."
+        )
+
+    weights = np.full(row_count, base, dtype=int)
+    if remainder:
+        # remainder < row_count, so consecutive positions differ by at least
+        # one and are therefore distinct. That matters because ``+= 1`` with
+        # an index array bumps a repeated index only once, which would break
+        # the exact-sum guarantee.
+        remainder_positions = (
+            np.arange(remainder, dtype=np.int64) * row_count // remainder
+        )
+        weights[remainder_positions] += 1
+    return weights
+
+
 def build_forbes_top_tail_diagnostic_tables(
     artifact: ForbesTopTailArtifact,
     row: pd.Series,
@@ -509,12 +541,15 @@ def validate_forbes_top_tail_artifact(
 
     config.validate()
     expected_units = int(round(pop_weight))
-    expected_draws = expected_units * config.replicate_count
+    selected_units = len(artifact.selected_forbes)
+    expected_draws = selected_units * config.replicate_count
 
-    if len(artifact.selected_forbes) != expected_units:
+    if selected_units <= 0:
+        raise ValueError("Forbes artifact selected no units.")
+    if selected_units > expected_units:
         raise ValueError(
             "Forbes artifact selected "
-            f"{len(artifact.selected_forbes)} units for target {expected_units}."
+            f"{selected_units} units for target {expected_units}."
         )
     for name, frame in {
         "scf_draws": artifact.scf_draws,
@@ -534,8 +569,8 @@ def validate_forbes_top_tail_artifact(
             "Forbes synthetic weights sum to "
             f"{synthetic_weight}; expected {expected_units}."
         )
-    if not artifact.synthetic["S006"].eq(config.unit_weight_hundredths).all():
-        raise ValueError("Forbes synthetic replicate weights are not uniform.")
+    if not (artifact.synthetic["S006"] > 0).all():
+        raise ValueError("Forbes synthetic replicate weights must be positive.")
 
     required_columns = {"RECID", "S006", "E00100", *amount_columns}
     missing_columns = required_columns.difference(artifact.synthetic.columns)
@@ -554,7 +589,7 @@ def validate_forbes_top_tail_artifact(
 
     weights = artifact.synthetic["S006"].to_numpy(dtype=float) / 100
     for column in amount_columns:
-        target_total = pop_weight * float(row.get(column, 0.0))
+        target_total = pop_weight * utils._finite_amount(row.get(column, 0.0))
         actual_total = float(
             np.dot(artifact.synthetic[column].to_numpy(dtype=float), weights)
         )
@@ -957,7 +992,7 @@ def score_forbes_selection_with_scf(
     for row in prepared_forbes.itertuples(index=False):
         candidates = scf_candidates_for_receiver(scf_donors, row)
         probabilities = scf_match_probabilities(candidates, row)
-        agi_values = scf_implied_agi_values(candidates, row)
+        agi_values = scf_wealth_ratio_agi_values(candidates, row)
         tail_probabilities.append(
             float(probabilities[agi_values >= FORBES_TOP_TAIL_AGI_THRESHOLD].sum())
         )
@@ -966,10 +1001,6 @@ def score_forbes_selection_with_scf(
     scored = prepared_forbes.copy()
     scored["scf_tail_probability"] = tail_probabilities
     scored["scf_expected_agi"] = expected_agi
-    scored["estimated_agi"] = np.maximum(
-        scored["scf_expected_agi"].to_numpy(dtype=float),
-        1.0,
-    )
     return scored
 
 
@@ -1068,24 +1099,56 @@ def scf_implied_component_values(
     candidates: pd.DataFrame,
     receiver,
 ) -> dict[str, np.ndarray]:
-    """Scale SCF donor ratios up to one Forbes receiver's wealth level."""
+    """Scale SCF donor income composition to one Forbes receiver's AGI level."""
 
-    networth = float(getattr(receiver, "networth_dollars", 0.0))
+    receiver_agi = _receiver_estimated_agi(receiver)
+    employment_base = np.maximum(
+        0.0,
+        candidates["wageinc"].to_numpy(dtype=float),
+    )
+    capital_gains_base = candidates["kginc"].to_numpy(dtype=float)
+    interest_dividend_base = np.maximum(
+        0.0,
+        candidates["intdivinc"].to_numpy(dtype=float),
+    )
+    business_farm_base = candidates["bussefarminc"].to_numpy(dtype=float)
+    pension_base = np.maximum(
+        0.0,
+        candidates["ssretinc"].to_numpy(dtype=float),
+    )
+    donor_agi_base = (
+        employment_base
+        + capital_gains_base
+        + interest_dividend_base
+        + business_farm_base
+        + 0.5 * pension_base
+    )
+    donor_abs_income_base = (
+        np.abs(employment_base)
+        + np.abs(capital_gains_base)
+        + np.abs(interest_dividend_base)
+        + np.abs(business_farm_base)
+        + 0.5 * np.abs(pension_base)
+    )
+    scale_base = np.where(
+        donor_agi_base > 1.0,
+        donor_agi_base,
+        np.maximum(donor_abs_income_base, 1.0),
+    )
+    scale = receiver_agi / scale_base
     employment_income = np.maximum(
         0.0,
-        networth * candidates["wageinc_ratio"].to_numpy(dtype=float),
+        employment_base * scale,
     )
-    capital_gains = networth * candidates["kginc_ratio"].to_numpy(dtype=float)
+    capital_gains = capital_gains_base * scale
     interest_dividend_income = np.maximum(
         0.0,
-        networth * candidates["intdivinc_ratio"].to_numpy(dtype=float),
-    )
-    business_farm_income = networth * candidates["bussefarminc_ratio"].to_numpy(
-        dtype=float
+        interest_dividend_base * scale,
     )
+    business_farm_income = business_farm_base * scale
     pension_income = np.maximum(
         0.0,
-        networth * candidates["ssretinc_ratio"].to_numpy(dtype=float),
+        pension_base * scale,
     )
     agi = np.maximum(
         employment_income
@@ -1105,6 +1168,52 @@ def scf_implied_component_values(
     }
 
 
+def scf_wealth_ratio_agi_values(
+    candidates: pd.DataFrame,
+    receiver,
+) -> np.ndarray:
+    """Return wealth-ratio AGI values for selection only, not amount priors.
+
+    Each candidate's ``*_ratio`` columns are multiplied by the receiver's
+    ``networth_dollars`` (``0.0`` when the attribute is absent) to get
+    per-candidate income components, which are then summed into one AGI
+    figure per candidate row.
+
+    Args:
+        candidates: Donor rows; must provide ``wageinc_ratio``,
+            ``kginc_ratio``, ``intdivinc_ratio``, ``bussefarminc_ratio`` and
+            ``ssretinc_ratio``.
+        receiver: Object read via ``getattr`` for ``networth_dollars``.
+
+    Returns:
+        Array with one non-negative AGI value per candidate row.
+    """
+
+    networth = float(getattr(receiver, "networth_dollars", 0.0))
+    # Wage, interest/dividend and pension components are floored at zero;
+    # capital gains and business/farm income are left signed so losses can
+    # offset the other components.
+    employment_income = np.maximum(
+        0.0,
+        networth * candidates["wageinc_ratio"].to_numpy(dtype=float),
+    )
+    capital_gains = networth * candidates["kginc_ratio"].to_numpy(dtype=float)
+    interest_dividend_income = np.maximum(
+        0.0,
+        networth * candidates["intdivinc_ratio"].to_numpy(dtype=float),
+    )
+    business_farm_income = networth * candidates["bussefarminc_ratio"].to_numpy(
+        dtype=float
+    )
+    pension_income = np.maximum(
+        0.0,
+        networth * candidates["ssretinc_ratio"].to_numpy(dtype=float),
+    )
+    # NOTE(review): the 0.5 weight on the pension component presumably
+    # approximates the taxable share of Social Security / retirement income
+    # — confirm. The total is floored at zero, so net losses yield 0.0.
+    return np.maximum(
+        employment_income
+        + capital_gains
+        + interest_dividend_income
+        + business_farm_income
+        + 0.5 * pension_income,
+        0.0,
+    )
+
+
+def _receiver_estimated_agi(receiver) -> float:
+    """Return the Forbes receiver AGI anchor used for SCF composition draws.
+
+    Uses ``receiver.estimated_agi`` when it is present, finite and strictly
+    positive. Otherwise the anchor is rebuilt as
+    ``networth_dollars * agi_ratio * self_made_scale`` and floored at 1.0.
+
+    Args:
+        receiver: Object read via ``getattr`` for ``estimated_agi``,
+            ``networth_dollars``, ``agi_ratio`` and ``self_made_flag``.
+
+    Returns:
+        A positive AGI value in the same units as ``estimated_agi`` /
+        ``networth_dollars``.
+    """
+
+    # A missing attribute defaults to NaN, which fails the finiteness test
+    # and drops through to the net-worth fallback below.
+    estimated_agi = float(getattr(receiver, "estimated_agi", np.nan))
+    if np.isfinite(estimated_agi) and estimated_agi > 0:
+        return estimated_agi
+
+    networth = float(getattr(receiver, "networth_dollars", 0.0))
+    agi_ratio = float(getattr(receiver, "agi_ratio", DEFAULT_PROFILE.agi_ratio))
+    # Self-made receivers are scaled up by 5%, all others down by 5%.
+    self_made_scale = 1.05 if bool(getattr(receiver, "self_made_flag", False)) else 0.95
+    # Floor at 1.0 so the anchor is never zero or negative.
+    return max(networth * agi_ratio * self_made_scale, 1.0)
+
+
 def scf_implied_agi_values(
     candidates: pd.DataFrame,
     receiver,
@@ -1530,7 +1639,7 @@ def _build_calibration_diagnostics(
 ) -> pd.DataFrame:
     rows = []
     for column in amount_columns:
-        target_total = pop_weight * float(row.get(column, 0.0))
+        target_total = pop_weight * utils._finite_amount(row.get(column, 0.0))
         synthetic_total = _weighted_columns_total(synthetic, (column,), weights)
         absolute_error = synthetic_total - target_total
         if abs(target_total) > ARTIFACT_NUMERIC_TOL:
@@ -1577,7 +1686,7 @@ def _build_composition_diagnostics(
             continue
 
         target_total = pop_weight * sum(
-            float(row.get(column, 0.0)) for column in columns
+            utils._finite_amount(row.get(column, 0.0)) for column in columns
         )
         synthetic_total = _weighted_columns_total(
             artifact.synthetic,
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Add CBO aggregate and AGI-bracket targets for capital gains, dividends, and interest income, and scale Forbes top-tail SCF draws to Forbes AGI estimates instead of Forbes wealth, to constrain inflated capital-income aggregates (issues #555, #866).`