|
34 | 34 | "docstring": "Impute rent and real_estate_taxes from ACS with state.\n\nArgs:\n data: CPS data dict.\n state_fips: State FIPS per household.\n time_period: Tax year.\n dataset_path: Path to CPS h5 for Microsimulation.\n\nReturns:\n Updated data dict.", |
35 | 35 | "id": "acs_qrf", |
36 | 36 | "kind": "function", |
37 | | - "line": 508, |
| 37 | + "line": 524, |
38 | 38 | "metadata": { |
39 | 39 | "api_refs": [ |
40 | 40 | "policyengine_us_data.calibration.source_impute._impute_acs" |
|
61 | 61 | "docstring": "\"Add auto loan balance, interest and net_worth variable.", |
62 | 62 | "id": "add_auto_loan", |
63 | 63 | "kind": "function", |
64 | | - "line": 2938, |
| 64 | + "line": 2951, |
65 | 65 | "metadata": { |
66 | 66 | "api_refs": [ |
67 | 67 | "policyengine_us_data.datasets.cps.cps.add_auto_loan_interest_and_net_worth" |
|
88 | 88 | "docstring": "Populate household-level geography variables used by PolicyEngine US.\n\nArgs:\n cps: Output CPS H5 group receiving derived household variables.\n household: Raw CPS household table.", |
89 | 89 | "id": "add_household_variables", |
90 | 90 | "kind": "function", |
91 | | - "line": 1519, |
| 91 | + "line": 1531, |
92 | 92 | "metadata": { |
93 | 93 | "api_refs": [ |
94 | 94 | "policyengine_us_data.datasets.cps.cps.add_household_variables" |
|
115 | 115 | "docstring": "Add basic ID and weight variables.\n\nArgs:\n cps (h5py.File): The CPS dataset file.\n person (DataFrame): The person table of the ASEC.\n tax_unit (DataFrame): The tax unit table created from the person table\n of the ASEC.\n family (DataFrame): The family table of the ASEC.\n spm_unit (DataFrame): The SPM unit table created from the person table\n of the ASEC.\n household (DataFrame): The household table of the ASEC.", |
116 | 116 | "id": "add_id_variables", |
117 | 117 | "kind": "function", |
118 | | - "line": 988, |
| 118 | + "line": 997, |
119 | 119 | "metadata": { |
120 | 120 | "api_refs": [ |
121 | 121 | "policyengine_us_data.datasets.cps.cps.add_id_variables" |
|
142 | 142 | "docstring": "Impute ORG-derived wage and union inputs onto CPS persons.", |
143 | 143 | "id": "add_org_inputs", |
144 | 144 | "kind": "function", |
145 | | - "line": 2822, |
| 145 | + "line": 2835, |
146 | 146 | "metadata": { |
147 | 147 | "api_refs": [ |
148 | 148 | "policyengine_us_data.datasets.cps.cps.add_org_labor_market_inputs" |
|
169 | 169 | "docstring": "Add income variables.\n\nArgs:\n cps (h5py.File): The CPS dataset file.\n person (DataFrame): The CPS person table.\n year (int): The CPS year", |
170 | 170 | "id": "add_personal_income_variables", |
171 | 171 | "kind": "function", |
172 | | - "line": 1194, |
| 172 | + "line": 1206, |
173 | 173 | "metadata": { |
174 | 174 | "api_refs": [ |
175 | 175 | "policyengine_us_data.datasets.cps.cps.add_personal_income_variables" |
|
196 | 196 | "docstring": "Add personal demographic variables.\n\nArgs:\n cps (h5py.File): The CPS dataset file.\n person (DataFrame): The CPS person table.", |
197 | 197 | "id": "add_personal_variables", |
198 | 198 | "kind": "function", |
199 | | - "line": 1050, |
| 199 | + "line": 1059, |
200 | 200 | "metadata": { |
201 | 201 | "api_refs": [ |
202 | 202 | "policyengine_us_data.datasets.cps.cps.add_personal_variables" |
|
223 | 223 | "docstring": "", |
224 | 224 | "id": "add_previous_year_income", |
225 | 225 | "kind": "function", |
226 | | - "line": 1561, |
| 226 | + "line": 1573, |
227 | 227 | "metadata": { |
228 | 228 | "api_refs": [ |
229 | 229 | "policyengine_us_data.datasets.cps.cps.add_previous_year_income" |
|
250 | 250 | "docstring": "", |
251 | 251 | "id": "add_rent", |
252 | 252 | "kind": "function", |
253 | | - "line": 362, |
| 253 | + "line": 371, |
254 | 254 | "metadata": { |
255 | 255 | "api_refs": [ |
256 | 256 | "policyengine_us_data.datasets.cps.cps.add_rent" |
|
277 | 277 | "docstring": "", |
278 | 278 | "id": "add_spm_variables", |
279 | 279 | "kind": "function", |
280 | | - "line": 1480, |
| 280 | + "line": 1492, |
281 | 281 | "metadata": { |
282 | 282 | "api_refs": [ |
283 | 283 | "policyengine_us_data.datasets.cps.cps.add_spm_variables" |
|
304 | 304 | "docstring": "Assign SSN card type using PRCITSHP, employment status, and ASEC-UA conditions.\nCodes:\n- 0: \"NONE\" - Likely undocumented immigrants\n- 1: \"CITIZEN\" - US citizens (born or naturalized)\n- 2: \"NON_CITIZEN_VALID_EAD\" - Non-citizens with work/study authorization\n- 3: \"OTHER_NON_CITIZEN\" - Non-citizens with indicators of legal status", |
305 | 305 | "id": "add_ssn_card_type", |
306 | 306 | "kind": "function", |
307 | | - "line": 1667, |
| 307 | + "line": 1679, |
308 | 308 | "metadata": { |
309 | 309 | "api_refs": [ |
310 | 310 | "policyengine_us_data.datasets.cps.cps.add_ssn_card_type" |
|
331 | 331 | "docstring": "", |
332 | 332 | "id": "add_takeup", |
333 | 333 | "kind": "function", |
334 | | - "line": 510, |
| 334 | + "line": 519, |
335 | 335 | "metadata": { |
336 | 336 | "api_refs": [ |
337 | 337 | "policyengine_us_data.datasets.cps.cps.add_takeup" |
|
358 | 358 | "docstring": "", |
359 | 359 | "id": "add_tips", |
360 | 360 | "kind": "function", |
361 | | - "line": 2566, |
| 361 | + "line": 2578, |
362 | 362 | "metadata": { |
363 | 363 | "api_refs": [ |
364 | 364 | "policyengine_us_data.datasets.cps.cps.add_tips" |
|
727 | 727 | "docstring": "", |
728 | 728 | "id": "calibration_diagnostics", |
729 | 729 | "kind": "function", |
730 | | - "line": 1249, |
| 730 | + "line": 1246, |
731 | 731 | "metadata": { |
732 | 732 | "api_refs": [ |
733 | 733 | "policyengine_us_data.calibration.unified_calibration.compute_diagnostics" |
|
815 | 815 | "docstring": "Replace clone-half person-level feature variables with donor matches.", |
816 | 816 | "id": "clone_features", |
817 | 817 | "kind": "function", |
818 | | - "line": 412, |
| 818 | + "line": 585, |
819 | 819 | "metadata": { |
820 | 820 | "api_refs": [ |
821 | 821 | "policyengine_us_data.datasets.cps.extended_cps._splice_clone_feature_predictions" |
|
878 | 878 | "docstring": "Assert that final exported variables are leaf inputs.", |
879 | 879 | "id": "computed_export_contract", |
880 | 880 | "kind": "function", |
881 | | - "line": 1589, |
| 881 | + "line": 1775, |
882 | 882 | "metadata": { |
883 | 883 | "api_refs": [ |
884 | 884 | "policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._assert_no_computed_variables_exported" |
|
972 | 972 | "docstring": "Second-stage QRF: train on CPS, predict for PUF clones.\n\nFor the PUF clone half of the extended CPS we need plausible values\nof CPS-only variables (retirement distributions, transfers, hours,\nSPM components, etc.) that are consistent with the clone's\nPUF-imputed income -- not just naively copied from the CPS donor.\n\nWe train a QRF on CPS person-level data where:\n * predictors = demographics + key income variables\n * outputs = CPS-only variables listed in\n ``CPS_ONLY_IMPUTED_VARIABLES``\n\nFor PUF clone prediction we use the PUF-imputed income values\nfrom the second half of ``data`` (the clone half, which already\nhas PUF-imputed income from stage 1).\n\nUses ``fit_predict()`` with ``max_train_samples`` instead of\nmanual sampling + separate fit/predict.\n\nArgs:\n data: Extended dataset dict after ``puf_clone_dataset()`` --\n already doubled, with PUF-imputed income in the second half.\n time_period: Tax year.\n dataset_path: Path to the CPS h5 file for Microsimulation.\n\nReturns:\n DataFrame with one column per CPS-only variable, containing\n predicted values for the PUF clone half (person-level).", |
973 | 973 | "id": "cps_only", |
974 | 974 | "kind": "function", |
975 | | - "line": 451, |
| 975 | + "line": 624, |
976 | 976 | "metadata": { |
977 | 977 | "api_refs": [ |
978 | 978 | "policyengine_us_data.datasets.cps.extended_cps._impute_cps_only_variables" |
|
1031 | 1031 | "docstring": "Create a stratified sample of CPS data preserving high-income households\nwhile maintaining low-income diversity for poverty analysis.\n\nArgs:\n target_households: Target number of households in output (approximate)\n oversample_poor: If True, boost sampling rate for bottom 25% by 1.5x\n seed: Random seed for reproducibility (default: None for random)\n base_dataset: Path to source h5 file (default: extended_cps_2024.h5)\n output_path: Where to save the stratified h5 file\n high_agi_brackets: List of (lo, hi, cap) tuples defining per-bracket\n caps for the high-AGI tail. Defaults to HIGH_AGI_BRACKETS.", |
1032 | 1032 | "id": "create_stratified", |
1033 | 1033 | "kind": "function", |
1034 | | - "line": 85, |
| 1034 | + "line": 145, |
1035 | 1035 | "metadata": { |
1036 | 1036 | "api_refs": [ |
1037 | 1037 | "policyengine_us_data.calibration.create_stratified_cps.create_stratified_cps_dataset" |
|
1064 | 1064 | "docstring": "Subsample the loaded CPS dataset and preserve downsampled arrays.\n\nArgs:\n frac: Fraction of records to retain.", |
1065 | 1065 | "id": "downsample", |
1066 | 1066 | "kind": "function", |
1067 | | - "line": 329, |
| 1067 | + "line": 338, |
1068 | 1068 | "metadata": { |
1069 | 1069 | "api_refs": [ |
1070 | 1070 | "policyengine_us_data.datasets.cps.cps.CPS.downsample" |
|
1091 | 1091 | "docstring": "Fit L0-regularized calibration weights.\n\nArgs:\n X_sparse: Sparse matrix (targets x records).\n targets: Target values array.\n lambda_l0: L0 regularization strength.\n epochs: Training epochs.\n device: Torch device.\n verbose_freq: Print frequency. Defaults to 10%.\n beta: L0 gate temperature.\n lambda_l2: L2 regularization strength.\n learning_rate: Optimizer learning rate.\n log_freq: Epochs between per-target CSV logs.\n None disables logging.\n log_path: Path for the per-target calibration log CSV.\n target_names: Human-readable target names for the log.\n initial_weights: Pre-computed initial weights. If None,\n computed from targets_df age targets.\n targets_df: Targets DataFrame, used to compute\n initial_weights when not provided.\n target_groups: Optional group ID per target row for balanced loss.\n resume_from: Path to a `.checkpoint.pt` file or `.npy`\n weights file to continue fitting from.\n checkpoint_path: Where to save resumable fit checkpoints.\n\nReturns:\n Weight array of shape (n_records,).", |
1092 | 1092 | "id": "fit_model", |
1093 | 1093 | "kind": "function", |
1094 | | - "line": 893, |
| 1094 | + "line": 890, |
1095 | 1095 | "metadata": { |
1096 | 1096 | "api_refs": [ |
1097 | 1097 | "policyengine_us_data.calibration.unified_calibration.fit_l0_weights" |
|
1325 | 1325 | "docstring": "Check formula-reconstructed housing assistance before export.\n\nThe final H5 must not export formula outputs such as ``housing_assistance``.\nThis guard verifies that the remaining leaf inputs still make those\nformulas produce nonzero values before the export contract strips or\nrejects computed variables.", |
1326 | 1326 | "id": "housing_assistance_microsim_validation", |
1327 | 1327 | "kind": "function", |
1328 | | - "line": 1359, |
| 1328 | + "line": 1545, |
1329 | 1329 | "metadata": { |
1330 | 1330 | "api_refs": [ |
1331 | 1331 | "policyengine_us_data.datasets.cps.extended_cps.ExtendedCPS._validate_housing_assistance_microsimulation" |
|
1410 | 1410 | "docstring": "Compute population-based initial weights from age targets.\n\nFor each congressional district, sums person_count targets where\ndomain_variable == \"age\" to get district population, then divides\nby the number of columns (households) active in that district.\n\nArgs:\n X_sparse: Sparse matrix (targets x records).\n targets_df: Targets DataFrame with columns: variable,\n domain_variable, geo_level, geographic_id, value.\n\nReturns:\n Weight array of shape (n_records,).", |
1411 | 1411 | "id": "init_weights", |
1412 | 1412 | "kind": "function", |
1413 | | - "line": 814, |
| 1413 | + "line": 811, |
1414 | 1414 | "metadata": { |
1415 | 1415 | "api_refs": [ |
1416 | 1416 | "policyengine_us_data.calibration.unified_calibration.compute_initial_weights" |
|
3216 | 3216 | "docstring": "Replace PUF clone half of CPS-only variables with QRF predictions.\n\nAfter ``puf_clone_dataset()`` the CPS-only variables in the second\nhalf are naive copies of the CPS donor values. This function\nreplaces them with the second-stage QRF predictions that are\nconsistent with the clone's PUF-imputed income.\n\nArgs:\n data: Extended dataset dict (already doubled).\n predictions: DataFrame from ``_impute_cps_only_variables()``.\n time_period: Tax year.\n dataset_path: Path to CPS h5 file for entity mapping.\n\nReturns:\n Modified data dict with CPS-only variables spliced in.", |
3217 | 3217 | "id": "qrf_pass2", |
3218 | 3218 | "kind": "function", |
3219 | | - "line": 829, |
| 3219 | + "line": 1015, |
3220 | 3220 | "metadata": { |
3221 | 3221 | "api_refs": [ |
3222 | 3222 | "policyengine_us_data.datasets.cps.extended_cps._splice_cps_only_predictions" |
|
3540 | 3540 | "docstring": "Run unified calibration pipeline.\n\nArgs:\n dataset_path: Path to CPS h5 file.\n db_path: Path to policy_data.db.\n n_clones: Number of dataset clones.\n lambda_l0: L0 regularization strength.\n epochs: Training epochs.\n device: Torch device.\n seed: Random seed.\n domain_variables: Filter targets by domain variable.\n hierarchical_domains: Domains for hierarchical\n uprating + CD reconciliation.\n skip_takeup_rerandomize: Skip takeup step.\n skip_source_impute: Skip ACS/SIPP/SCF imputations.\n target_config: Parsed target config dict.\n target_config_path: Path to target config, for provenance.\n target_config_identity: Resolved target config path/checksum identity.\n build_only: If True, save package and skip fitting.\n package_path: Load pre-built package (skip build).\n package_output_path: Where to save calibration package.\n beta: L0 gate temperature.\n lambda_l2: L2 regularization strength.\n learning_rate: Optimizer learning rate.\n log_freq: Epochs between per-target CSV logs.\n log_path: Path for per-target calibration log CSV.\n resume_from: Path to a checkpoint or weights file to\n continue fitting from.\n checkpoint_path: Where to save resumable fit checkpoints.\n chunked_matrix: Build matrix in clone-household chunks.\n chunk_size: Clone-household columns per chunk.\n chunk_dir: Directory for chunked COO/H5 artifacts.\n keep_chunks: Keep temporary chunk H5 files.\n resume_chunks: Reuse existing chunk COO files.\n\nReturns:\n (weights, targets_df, X_sparse, target_names, geography_info)\n weights is None when build_only=True.\n geography_info is a dict with cd_geoid and base_n_records.", |
3541 | 3541 | "id": "run_calibration", |
3542 | 3542 | "kind": "function", |
3543 | | - "line": 1375, |
| 3543 | + "line": 1372, |
3544 | 3544 | "metadata": { |
3545 | 3545 | "api_refs": [ |
3546 | 3546 | "policyengine_us_data.calibration.unified_calibration.run_calibration" |
|
3670 | 3670 | "docstring": "Impute net_worth and auto_loan from SCF.\n\nArgs:\n data: CPS data dict.\n state_fips: State FIPS per household.\n time_period: Tax year.\n dataset_path: Path to CPS h5 for Microsimulation.\n\nReturns:\n Updated data dict.", |
3671 | 3671 | "id": "scf_qrf", |
3672 | 3672 | "kind": "function", |
3673 | | - "line": 1092, |
| 3673 | + "line": 1108, |
3674 | 3674 | "metadata": { |
3675 | 3675 | "api_refs": [ |
3676 | 3676 | "policyengine_us_data.calibration.source_impute._impute_scf" |
|
3724 | 3724 | "docstring": "Impute tip_income, liquid assets, and vehicle signals from SIPP.\n\nArgs:\n data: CPS data dict.\n state_fips: State FIPS per household.\n time_period: Tax year.\n dataset_path: Path to CPS h5 for Microsimulation.\n\nReturns:\n Updated data dict.", |
3725 | 3725 | "id": "sipp_qrf", |
3726 | 3726 | "kind": "function", |
3727 | | - "line": 633, |
| 3727 | + "line": 649, |
3728 | 3728 | "metadata": { |
3729 | 3729 | "api_refs": [ |
3730 | 3730 | "policyengine_us_data.calibration.source_impute._impute_sipp" |
|
3751 | 3751 | "docstring": "Re-impute ACS/SIPP/ORG/SCF variables from donor surveys.\n\nOverwrites existing imputed values in data. ACS uses\nstate_fips as a QRF predictor; ORG uses state plus labor-market\npredictors; SIPP and SCF use only demographic and financial\npredictors (no state data).\n\nArgs:\n data: CPS dataset dict {variable: {time_period: array}}.\n state_fips: State FIPS per household.\n time_period: Tax year.\n dataset_path: Path to CPS h5 for Microsimulation.\n skip_acs: Skip ACS imputation.\n skip_sipp: Skip SIPP imputation.\n skip_org: Skip ORG imputation.\n skip_scf: Skip SCF imputation.\n\nReturns:\n Updated data dict with re-imputed variables.", |
3752 | 3752 | "id": "source_impute", |
3753 | 3753 | "kind": "function", |
3754 | | - "line": 203, |
| 3754 | + "line": 219, |
3755 | 3755 | "metadata": { |
3756 | 3756 | "api_refs": [ |
3757 | 3757 | "policyengine_us_data.calibration.source_impute.impute_source_variables" |
|
3928 | 3928 | "docstring": "Save calibration package to pickle.\n\nArgs:\n path: Output file path.\n X_sparse: Sparse matrix.\n targets_df: Targets DataFrame.\n target_names: Target name list.\n metadata: Run metadata dict.\n initial_weights: Pre-computed initial weight array.\n cd_geoid: CD GEOID array from geography assignment.\n block_geoid: Block GEOID array from geography assignment.", |
3929 | 3929 | "id": "stage2_calibration_package_writer", |
3930 | 3930 | "kind": "function", |
3931 | | - "line": 661, |
| 3931 | + "line": 658, |
3932 | 3932 | "metadata": { |
3933 | 3933 | "api_refs": [ |
3934 | 3934 | "policyengine_us_data.calibration.unified_calibration.save_calibration_package" |
|
3986 | 3986 | "docstring": "Filter target rows before matrix construction.", |
3987 | 3987 | "id": "stage2_target_config_apply", |
3988 | 3988 | "kind": "function", |
3989 | | - "line": 631, |
| 3989 | + "line": 628, |
3990 | 3990 | "metadata": { |
3991 | 3991 | "api_refs": [ |
3992 | 3992 | "policyengine_us_data.calibration.unified_calibration.apply_target_config_to_targets" |
|
4041 | 4041 | "docstring": "Load target include/exclude config from YAML.\n\nArgs:\n path: Path to YAML config file.\n\nReturns:\n Parsed config dict with include and exclude lists.", |
4042 | 4042 | "id": "stage2_target_config_load", |
4043 | 4043 | "kind": "function", |
4044 | | - "line": 525, |
| 4044 | + "line": 522, |
4045 | 4045 | "metadata": { |
4046 | 4046 | "api_refs": [ |
4047 | 4047 | "policyengine_us_data.calibration.unified_calibration.load_target_config" |
|
0 commit comments