Update publication candidate

MaxGhenis · MaxGhenis · commit b117a6c773ff · 2026-05-16T15:44:34.000Z
diff --git a/.github/publication_candidates/usdata-gha25966054079-a1/changelog.d/974.fixed b/.github/publication_candidates/usdata-gha25966054079-a1/changelog.d/974.fixed
diff --git a/.github/publication_candidates/usdata-gha25966054079-a1/publication_scope.json b/.github/publication_candidates/usdata-gha25966054079-a1/publication_scope.json
@@ -0,0 +1,7 @@
+{
+  "base_release_version": "1.115.2",
+  "candidate_scope": "1.115.2-patch",
+  "release_bump": "patch",
+  "run_id": "usdata-gha25966054079-a1",
+  "would_release_as_at_build_time": "1.115.3"
+}
diff --git a/.github/publication_scope.json b/.github/publication_scope.json
@@ -2,6 +2,6 @@
   "base_release_version": "1.115.2",
   "candidate_scope": "1.115.2-patch",
   "release_bump": "patch",
-  "run_id": "usdata-gha25953131994-a1",
+  "run_id": "usdata-gha25966054079-a1",
   "would_release_as_at_build_time": "1.115.3"
 }
diff --git a/docs/engineering/pipeline-map.md b/docs/engineering/pipeline-map.md
@@ -184,7 +184,7 @@ Merge CPS + PUF via cloning, rematch clone features, QRF-impute incomes and CPS-
 | `qrf_pass2` Splice CPS-Only Predictions | `process` | `transitional` | `moving` | `policyengine_us_data.datasets.cps.extended_cps._splice_cps_only_predictions` |
 | `mortgage_hints` Mortgage Balance Hint Imputation | `library` | `current` | `moving` | `policyengine_us_data.utils.mortgage_interest.impute_tax_unit_mortgage_balance_hints` |
 | `mortgage_convert` Structural Mortgage Conversion | `library` | `current` | `moving` | `policyengine_us_data.utils.mortgage_interest.convert_mortgage_interest_to_structural_inputs` |
-| `formula_drop` Drop Formula Variables | `process` | `transitional` | `moving` | `policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._drop_formula_variables` |
+| `computed_export_contract` Validate Leaf-Input Export | `process` | `transitional` | `moving` | `policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._assert_no_computed_variables_exported` |
 
 #### Edges
 
@@ -204,8 +204,8 @@ Merge CPS + PUF via cloning, rematch clone features, QRF-impute incomes and CPS-
 - `qrf_pass2` -> `mortgage_hints` `data_flow`
 - `in_scf_s2` -> `mortgage_hints` `data_flow` (SCF donor sample)
 - `mortgage_hints` -> `mortgage_convert` `data_flow`
-- `mortgage_convert` -> `formula_drop` `data_flow`
-- `formula_drop` -> `out_ext` `produces_artifact`
+- `mortgage_convert` -> `computed_export_contract` `data_flow`
+- `computed_export_contract` -> `out_ext` `produces_artifact`
 - `util_qrf_s2` -> `puf_qrf_pass` `uses_utility`
 - `util_qrf_s2` -> `cps_only` `uses_utility`
 - `util_qrf_s2` -> `mortgage_hints` `uses_utility`
diff --git a/docs/generated/pipeline_api.json b/docs/generated/pipeline_api.json
@@ -810,7 +810,7 @@
     "docstring": "Replace clone-half person-level feature variables with donor matches.",
     "id": "clone_features",
     "kind": "function",
-    "line": 400,
+    "line": 403,
     "metadata": {
       "api_refs": [
         "policyengine_us_data.datasets.cps.extended_cps._splice_clone_feature_predictions"
@@ -869,6 +869,37 @@
     "signature": "class CloneWeightMatrix",
     "source_file": "policyengine_us_data/build_outputs/weights.py"
   },
+  "computed_export_contract": {
+    "docstring": "Assert that final exported variables are leaf inputs.",
+    "id": "computed_export_contract",
+    "kind": "function",
+    "line": 1266,
+    "metadata": {
+      "api_refs": [
+        "policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._assert_no_computed_variables_exported"
+      ],
+      "artifacts_in": [
+        "extended_cps_stage2"
+      ],
+      "artifacts_out": [
+        "validated_extended_cps"
+      ],
+      "description": "Fails the build if the final export still contains variables computed by policyengine-us formulas, adds, or subtracts.",
+      "id": "computed_export_contract",
+      "label": "Validate Leaf-Input Export",
+      "node_type": "process",
+      "pathways": [
+        "data_build"
+      ],
+      "pydoc": true,
+      "source_file": "policyengine_us_data/datasets/cps/extended_cps.py",
+      "stability": "moving",
+      "status": "transitional"
+    },
+    "object_path": "policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._assert_no_computed_variables_exported",
+    "signature": "def _assert_no_computed_variables_exported(cls, data, time_period)",
+    "source_file": "policyengine_us_data/datasets/cps/extended_cps.py"
+  },
   "coordinate_publish": {
     "docstring": "Coordinate the full publishing workflow.",
     "id": "coordinate_publish",
@@ -936,7 +967,7 @@
     "docstring": "Second-stage QRF: train on CPS, predict for PUF clones.\n\nFor the PUF clone half of the extended CPS we need plausible values\nof CPS-only variables (retirement distributions, transfers, hours,\nSPM components, etc.) that are consistent with the clone's\nPUF-imputed income -- not just naively copied from the CPS donor.\n\nWe train a QRF on CPS person-level data where:\n  * predictors = demographics + key income variables\n  * outputs    = CPS-only variables listed in\n                 ``CPS_ONLY_IMPUTED_VARIABLES``\n\nFor PUF clone prediction we use the PUF-imputed income values\nfrom the second half of ``data`` (the clone half, which already\nhas PUF-imputed income from stage 1).\n\nUses ``fit_predict()`` with ``max_train_samples`` instead of\nmanual sampling + separate fit/predict.\n\nArgs:\n    data: Extended dataset dict after ``puf_clone_dataset()`` --\n        already doubled, with PUF-imputed income in the second half.\n    time_period: Tax year.\n    dataset_path: Path to the CPS h5 file for Microsimulation.\n\nReturns:\n    DataFrame with one column per CPS-only variable, containing\n    predicted values for the PUF clone half (person-level).",
     "id": "cps_only",
     "kind": "function",
-    "line": 439,
+    "line": 442,
     "metadata": {
       "api_refs": [
         "policyengine_us_data.datasets.cps.extended_cps._impute_cps_only_variables"
@@ -1085,37 +1116,6 @@
     "signature": "def fit_l0_weights(X_sparse, targets: np.ndarray, lambda_l0: float, epochs: int = DEFAULT_EPOCHS, device: str = 'cpu', verbose_freq: Optional[int] = None, beta: float = BETA, lambda_l2: float = LAMBDA_L2, learning_rate: float = LEARNING_RATE, log_freq: int = None, log_path: str = None, target_names: list = None, initial_weights: np.ndarray = None, targets_df: 'pd.DataFrame' = None, achievable: np.ndarray = None, target_groups: Optional[np.ndarray] = None, resume_from: str = None, checkpoint_path: str = None) -> np.ndarray",
     "source_file": "policyengine_us_data/calibration/unified_calibration.py"
   },
-  "formula_drop": {
-    "docstring": "Remove variables that are computed by policyengine-us.\n\nVariables with formulas, ``adds``, or ``subtracts`` are\nrecomputed by the simulation engine, so storing them wastes\nspace and can mislead validation.\n\nAggregate variables whose ``adds`` include a behavioral-\nresponse input (e.g. ``employment_income_before_lsr``) are\nrenamed to that input before dropping so the raw data is\npreserved under the correct input-variable name.",
-    "id": "formula_drop",
-    "kind": "function",
-    "line": 1197,
-    "metadata": {
-      "api_refs": [
-        "policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._drop_formula_variables"
-      ],
-      "artifacts_in": [
-        "extended_cps_stage2"
-      ],
-      "artifacts_out": [
-        "formula_pruned_extended_cps"
-      ],
-      "description": "Removes variables computed by policyengine-us formulas, while preserving selected imputed inputs under canonical leaf variable names.",
-      "id": "formula_drop",
-      "label": "Drop Formula Variables",
-      "node_type": "process",
-      "pathways": [
-        "data_build"
-      ],
-      "pydoc": true,
-      "source_file": "policyengine_us_data/datasets/cps/extended_cps.py",
-      "stability": "moving",
-      "status": "transitional"
-    },
-    "object_path": "policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._drop_formula_variables",
-    "signature": "def _drop_formula_variables(cls, data)",
-    "source_file": "policyengine_us_data/datasets/cps/extended_cps.py"
-  },
   "geo_assign": {
     "docstring": "Assign random census block geography to cloned\nCPS records.\n\nEach of n_records * n_clones total records gets a\nrandom census block sampled from the global\npopulation-weighted distribution. State and CD are\nderived from the block GEOID.\n\nArgs:\n    n_records: Number of households in the base CPS\n        dataset.\n    n_clones: Number of clones (default 10).\n    seed: Random seed for reproducibility.\n    fixed_state_fips: Optional state FIPS per base record. Positive\n        values constrain every clone of that record to blocks in the\n        requested state; zero or missing values remain unrestricted.\n\nReturns:\n    GeographyAssignment with arrays of length\n    n_records * n_clones.",
     "id": "geo_assign",
@@ -2619,7 +2619,7 @@
     "docstring": "Replace PUF clone half of CPS-only variables with QRF predictions.\n\nAfter ``puf_clone_dataset()`` the CPS-only variables in the second\nhalf are naive copies of the CPS donor values. This function\nreplaces them with the second-stage QRF predictions that are\nconsistent with the clone's PUF-imputed income.\n\nArgs:\n    data: Extended dataset dict (already doubled).\n    predictions: DataFrame from ``_impute_cps_only_variables()``.\n    time_period: Tax year.\n    dataset_path: Path to CPS h5 file for entity mapping.\n\nReturns:\n    Modified data dict with CPS-only variables spliced in.",
     "id": "qrf_pass2",
     "kind": "function",
-    "line": 717,
+    "line": 748,
     "metadata": {
       "api_refs": [
         "policyengine_us_data.datasets.cps.extended_cps._splice_cps_only_predictions"
diff --git a/docs/generated/pipeline_map.json b/docs/generated/pipeline_map.json
@@ -2260,11 +2260,11 @@
         {
           "edge_type": "data_flow",
           "source": "mortgage_convert",
-          "target": "formula_drop"
+          "target": "computed_export_contract"
         },
         {
           "edge_type": "produces_artifact",
-          "source": "formula_drop",
+          "source": "computed_export_contract",
           "target": "out_ext"
         },
         {
@@ -2543,17 +2543,17 @@
         },
         {
           "api_refs": [
-            "policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._drop_formula_variables"
+            "policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._assert_no_computed_variables_exported"
           ],
           "artifacts_in": [
             "extended_cps_stage2"
           ],
           "artifacts_out": [
-            "formula_pruned_extended_cps"
+            "validated_extended_cps"
           ],
-          "description": "Removes variables computed by policyengine-us formulas, while preserving selected imputed inputs under canonical leaf variable names.",
-          "id": "formula_drop",
-          "label": "Drop Formula Variables",
+          "description": "Fails the build if the final export still contains variables computed by policyengine-us formulas, adds, or subtracts.",
+          "id": "computed_export_contract",
+          "label": "Validate Leaf-Input Export",
           "node_type": "process",
           "pathways": [
             "data_build"

Original file line number	Diff line number	Diff line change
`@@ -2,6 +2,6 @@`
`2`	`2`	`"base_release_version": "1.115.2",`
`3`	`3`	`"candidate_scope": "1.115.2-patch",`
`4`	`4`	`"release_bump": "patch",`
`5`		`- "run_id": "usdata-gha25953131994-a1",`
	`5`	`+ "run_id": "usdata-gha25966054079-a1",`
`6`	`6`	`"would_release_as_at_build_time": "1.115.3"`
`7`	`7`	`}`