|
810 | 810 | "docstring": "Replace clone-half person-level feature variables with donor matches.", |
811 | 811 | "id": "clone_features", |
812 | 812 | "kind": "function", |
813 | | - "line": 400, |
| 813 | + "line": 403, |
814 | 814 | "metadata": { |
815 | 815 | "api_refs": [ |
816 | 816 | "policyengine_us_data.datasets.cps.extended_cps._splice_clone_feature_predictions" |
|
869 | 869 | "signature": "class CloneWeightMatrix", |
870 | 870 | "source_file": "policyengine_us_data/build_outputs/weights.py" |
871 | 871 | }, |
| 872 | + "computed_export_contract": { |
| 873 | + "docstring": "Assert that final exported variables are leaf inputs.", |
| 874 | + "id": "computed_export_contract", |
| 875 | + "kind": "function", |
| 876 | + "line": 1266, |
| 877 | + "metadata": { |
| 878 | + "api_refs": [ |
| 879 | + "policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._assert_no_computed_variables_exported" |
| 880 | + ], |
| 881 | + "artifacts_in": [ |
| 882 | + "extended_cps_stage2" |
| 883 | + ], |
| 884 | + "artifacts_out": [ |
| 885 | + "validated_extended_cps" |
| 886 | + ], |
| 887 | + "description": "Fails the build if the final export still contains variables computed by policyengine-us formulas, adds, or subtracts.", |
| 888 | + "id": "computed_export_contract", |
| 889 | + "label": "Validate Leaf-Input Export", |
| 890 | + "node_type": "process", |
| 891 | + "pathways": [ |
| 892 | + "data_build" |
| 893 | + ], |
| 894 | + "pydoc": true, |
| 895 | + "source_file": "policyengine_us_data/datasets/cps/extended_cps.py", |
| 896 | + "stability": "moving", |
| 897 | + "status": "transitional" |
| 898 | + }, |
| 899 | + "object_path": "policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._assert_no_computed_variables_exported", |
| 900 | + "signature": "def _assert_no_computed_variables_exported(cls, data, time_period)", |
| 901 | + "source_file": "policyengine_us_data/datasets/cps/extended_cps.py" |
| 902 | + }, |
872 | 903 | "coordinate_publish": { |
873 | 904 | "docstring": "Coordinate the full publishing workflow.", |
874 | 905 | "id": "coordinate_publish", |
|
936 | 967 | "docstring": "Second-stage QRF: train on CPS, predict for PUF clones.\n\nFor the PUF clone half of the extended CPS we need plausible values\nof CPS-only variables (retirement distributions, transfers, hours,\nSPM components, etc.) that are consistent with the clone's\nPUF-imputed income -- not just naively copied from the CPS donor.\n\nWe train a QRF on CPS person-level data where:\n * predictors = demographics + key income variables\n * outputs = CPS-only variables listed in\n ``CPS_ONLY_IMPUTED_VARIABLES``\n\nFor PUF clone prediction we use the PUF-imputed income values\nfrom the second half of ``data`` (the clone half, which already\nhas PUF-imputed income from stage 1).\n\nUses ``fit_predict()`` with ``max_train_samples`` instead of\nmanual sampling + separate fit/predict.\n\nArgs:\n data: Extended dataset dict after ``puf_clone_dataset()`` --\n already doubled, with PUF-imputed income in the second half.\n time_period: Tax year.\n dataset_path: Path to the CPS h5 file for Microsimulation.\n\nReturns:\n DataFrame with one column per CPS-only variable, containing\n predicted values for the PUF clone half (person-level).", |
937 | 968 | "id": "cps_only", |
938 | 969 | "kind": "function", |
939 | | - "line": 439, |
| 970 | + "line": 442, |
940 | 971 | "metadata": { |
941 | 972 | "api_refs": [ |
942 | 973 | "policyengine_us_data.datasets.cps.extended_cps._impute_cps_only_variables" |
|
1085 | 1116 | "signature": "def fit_l0_weights(X_sparse, targets: np.ndarray, lambda_l0: float, epochs: int = DEFAULT_EPOCHS, device: str = 'cpu', verbose_freq: Optional[int] = None, beta: float = BETA, lambda_l2: float = LAMBDA_L2, learning_rate: float = LEARNING_RATE, log_freq: int = None, log_path: str = None, target_names: list = None, initial_weights: np.ndarray = None, targets_df: 'pd.DataFrame' = None, achievable: np.ndarray = None, target_groups: Optional[np.ndarray] = None, resume_from: str = None, checkpoint_path: str = None) -> np.ndarray", |
1086 | 1117 | "source_file": "policyengine_us_data/calibration/unified_calibration.py" |
1087 | 1118 | }, |
1088 | | - "formula_drop": { |
1089 | | - "docstring": "Remove variables that are computed by policyengine-us.\n\nVariables with formulas, ``adds``, or ``subtracts`` are\nrecomputed by the simulation engine, so storing them wastes\nspace and can mislead validation.\n\nAggregate variables whose ``adds`` include a behavioral-\nresponse input (e.g. ``employment_income_before_lsr``) are\nrenamed to that input before dropping so the raw data is\npreserved under the correct input-variable name.", |
1090 | | - "id": "formula_drop", |
1091 | | - "kind": "function", |
1092 | | - "line": 1197, |
1093 | | - "metadata": { |
1094 | | - "api_refs": [ |
1095 | | - "policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._drop_formula_variables" |
1096 | | - ], |
1097 | | - "artifacts_in": [ |
1098 | | - "extended_cps_stage2" |
1099 | | - ], |
1100 | | - "artifacts_out": [ |
1101 | | - "formula_pruned_extended_cps" |
1102 | | - ], |
1103 | | - "description": "Removes variables computed by policyengine-us formulas, while preserving selected imputed inputs under canonical leaf variable names.", |
1104 | | - "id": "formula_drop", |
1105 | | - "label": "Drop Formula Variables", |
1106 | | - "node_type": "process", |
1107 | | - "pathways": [ |
1108 | | - "data_build" |
1109 | | - ], |
1110 | | - "pydoc": true, |
1111 | | - "source_file": "policyengine_us_data/datasets/cps/extended_cps.py", |
1112 | | - "stability": "moving", |
1113 | | - "status": "transitional" |
1114 | | - }, |
1115 | | - "object_path": "policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._drop_formula_variables", |
1116 | | - "signature": "def _drop_formula_variables(cls, data)", |
1117 | | - "source_file": "policyengine_us_data/datasets/cps/extended_cps.py" |
1118 | | - }, |
1119 | 1119 | "geo_assign": { |
1120 | 1120 | "docstring": "Assign random census block geography to cloned\nCPS records.\n\nEach of n_records * n_clones total records gets a\nrandom census block sampled from the global\npopulation-weighted distribution. State and CD are\nderived from the block GEOID.\n\nArgs:\n n_records: Number of households in the base CPS\n dataset.\n n_clones: Number of clones (default 10).\n seed: Random seed for reproducibility.\n fixed_state_fips: Optional state FIPS per base record. Positive\n values constrain every clone of that record to blocks in the\n requested state; zero or missing values remain unrestricted.\n\nReturns:\n GeographyAssignment with arrays of length\n n_records * n_clones.", |
1121 | 1121 | "id": "geo_assign", |
|
2619 | 2619 | "docstring": "Replace PUF clone half of CPS-only variables with QRF predictions.\n\nAfter ``puf_clone_dataset()`` the CPS-only variables in the second\nhalf are naive copies of the CPS donor values. This function\nreplaces them with the second-stage QRF predictions that are\nconsistent with the clone's PUF-imputed income.\n\nArgs:\n data: Extended dataset dict (already doubled).\n predictions: DataFrame from ``_impute_cps_only_variables()``.\n time_period: Tax year.\n dataset_path: Path to CPS h5 file for entity mapping.\n\nReturns:\n Modified data dict with CPS-only variables spliced in.", |
2620 | 2620 | "id": "qrf_pass2", |
2621 | 2621 | "kind": "function", |
2622 | | - "line": 717, |
| 2622 | + "line": 748, |
2623 | 2623 | "metadata": { |
2624 | 2624 | "api_refs": [ |
2625 | 2625 | "policyengine_us_data.datasets.cps.extended_cps._splice_cps_only_predictions" |
|
0 commit comments