|
9 | 9 | from policyengine_us_data.calibration.formulaic_inputs import ( |
10 | 10 | FORMULAIC_SPM_INPUTS_TO_DROP, |
11 | 11 | ) |
| 12 | +from policyengine_us_data.calibration.puf_impute import ( |
| 13 | + CLONE_ORIGIN_FLAGS, |
| 14 | + IMPUTED_VARIABLES, |
| 15 | + OVERRIDDEN_IMPUTED_VARIABLES, |
| 16 | +) |
12 | 17 | from policyengine_us_data.datasets.cps.cps import ( |
13 | 18 | CPS, |
14 | 19 | CPS_2024, |
@@ -91,6 +96,8 @@ def _supports_structural_mortgage_inputs() -> bool: |
91 | 96 | if has_policyengine_us_variables("treasury_tipped_occupation_code"): |
92 | 97 | CPS_CLONE_FEATURE_VARIABLES.append("treasury_tipped_occupation_code") |
93 | 98 |
|
# Union of the PUF imputation's IMPUTED_VARIABLES and
# OVERRIDDEN_IMPUTED_VARIABLES, kept as a set for membership tests.
PUF_IMPUTED_VARIABLES = {*IMPUTED_VARIABLES, *OVERRIDDEN_IMPUTED_VARIABLES}
| 100 | + |
94 | 101 | # Predictors used to rematch CPS features onto the PUF clone half. |
95 | 102 | # These are all available on the CPS half and on the doubled extended CPS. |
96 | 103 | CPS_CLONE_FEATURE_PREDICTORS = [ |
@@ -208,6 +215,27 @@ def _supports_structural_mortgage_inputs() -> bool: |
208 | 215 | # Set for O(1) lookup in the splice loop. |
209 | 216 | _CPS_ONLY_SET = set(CPS_ONLY_IMPUTED_VARIABLES) |
210 | 217 |
|
# Geography fields that _is_structural_clone_variable treats as structural:
# they are excluded from the automatically discovered clone-rematch outputs.
_CLONE_REFRESH_GEOGRAPHY_VARIABLES = {
    "block_geoid",
    "cbsa_code",
    "congressional_district_geoid",
    "county",
    "county_fips",
    "place_fips",
    "puma",
    "sldl",
    "sldu",
    "state_fips",
    "tract_geoid",
    "vtd",
    "zcta",
    "zip_code",
}
| 234 | + |
# Anchor fields that _is_structural_clone_variable treats as structural,
# so they are never picked up as automatically discovered rematch outputs.
_CLONE_REFRESH_ANCHOR_VARIABLES = {"age"}
| 238 | + |
211 | 239 | # Predictors used for the second-stage CPS-only imputation: demographics |
212 | 240 | # plus key income variables that were already imputed from PUF data. |
213 | 241 | CPS_STAGE2_DEMOGRAPHIC_PREDICTORS = [ |
@@ -259,6 +287,93 @@ def _clone_half_person_values(data: dict, variable: str, time_period: int): |
259 | 287 | return None |
260 | 288 |
|
261 | 289 |
|
def _first_half_person_values(data: dict, variable: str, time_period: int):
    """Slice the original-CPS half out of a person-level variable.

    The stored values are read from ``data[variable][time_period]`` and
    their length is compared with ``data["person_id"][time_period]``.

    Returns:
        ``None`` when ``variable`` is not a key of ``data`` or when its
        length differs from the person count; otherwise a NumPy array with
        the first ``n_persons // 2`` entries.
    """
    if variable in data:
        column = data[variable][time_period]
        person_count = len(data["person_id"][time_period])
        # Only arrays with one entry per person are treated as person-level.
        if len(column) == person_count:
            return np.asarray(column[: person_count // 2])
    return None
| 301 | + |
| 302 | + |
def _is_structural_clone_variable(variable: str) -> bool:
    """Tell whether ``variable`` stays copied on the clone instead of rematched.

    A variable is structural when its name ends in ``_id`` or ``_weight``,
    or when it is a clone-refresh geography variable, one of the
    ``CLONE_ORIGIN_FLAGS`` values, a clone-refresh anchor variable, or a
    stage-2 computed predictor.
    """
    # Cheap name-based checks first; these need no lookup tables.
    if variable.endswith(("_id", "_weight")):
        return True
    if variable in _CLONE_REFRESH_GEOGRAPHY_VARIABLES:
        return True
    if variable in CLONE_ORIGIN_FLAGS.values():
        return True
    return (
        variable in _CLONE_REFRESH_ANCHOR_VARIABLES
        or variable in _STAGE2_COMPUTED_PREDICTORS
    )
| 313 | + |
| 314 | + |
def _cps_clone_feature_variables_for_data(
    data: dict,
    time_period: int,
) -> list[str]:
    """List the CPS-only fields to donor-rematch onto the PUF clones.

    The PUF clone starts as a literal copy of each CPS donor, after which
    selected tax/income fields are replaced with PUF-imputed values. Every
    other person-level CPS-only field should be refreshed from CPS donors
    unless it is structural, PUF-imputed, or a QRF-handled CPS-only output.

    Candidates are the entries of ``CPS_CLONE_FEATURE_VARIABLES`` followed
    by the keys of ``data``, without duplicates and in that order. Explicit
    clone features are kept unless they are PUF-imputed or CPS-only-imputed;
    keys discovered from ``data`` must additionally be non-structural and
    stored with one value per person.
    """
    explicit = set(CPS_CLONE_FEATURE_VARIABLES)

    def _should_rematch(name: str) -> bool:
        # Fields already produced by the PUF or CPS-only imputations are
        # never rematched, even when listed explicitly.
        if name in PUF_IMPUTED_VARIABLES or name in _CPS_ONLY_SET:
            return False
        if name in explicit:
            return True
        if _is_structural_clone_variable(name):
            return False
        return _first_half_person_values(data, name, time_period) is not None

    # dict.fromkeys drops repeats while keeping first-seen order.
    candidates = dict.fromkeys([*CPS_CLONE_FEATURE_VARIABLES, *data])
    return [name for name in candidates if _should_rematch(name)]
| 345 | + |
| 346 | + |
def _build_cps_train_frame(
    cps_sim,
    data: dict,
    time_period: int,
    variables: list[str],
) -> pd.DataFrame:
    """Assemble original-CPS-half training values from PE or stored data.

    Args:
        cps_sim: Object exposing ``calculate_dataframe`` and, optionally, a
            ``tax_benefit_system`` whose ``variables`` supports ``in``.
        data: Mapping of variable name to per-period stored values.
        time_period: Period key used to read stored values from ``data``.
        variables: Column names wanted in the training frame.

    Returns:
        A frame holding every requested variable that the simulation could
        calculate, plus any remaining variable that ``data`` stores with one
        value per person (first half only). Variables available from neither
        source are left out.
    """
    tax_benefit_system = getattr(cps_sim, "tax_benefit_system", None)
    if tax_benefit_system is not None:
        from_simulation = [
            name for name in variables if name in tax_benefit_system.variables
        ]
    else:
        # Without a tax-benefit system to consult, ask for everything.
        from_simulation = variables

    if not from_simulation:
        # Nothing to calculate: start from an empty frame with one row per
        # person in the first half of the stored data.
        half_count = len(data["person_id"][time_period]) // 2
        frame = pd.DataFrame(index=np.arange(half_count))
    else:
        frame = cps_sim.calculate_dataframe(from_simulation).copy()

    # Fill columns the simulation did not provide from the stored arrays.
    for name in variables:
        if name not in frame.columns:
            stored = _first_half_person_values(data, name, time_period)
            if stored is not None:
                frame[name] = stored

    return frame
| 375 | + |
| 376 | + |
262 | 377 | def _build_clone_test_frame( |
263 | 378 | cps_sim, |
264 | 379 | data: dict, |
@@ -321,13 +436,15 @@ def _impute_clone_cps_features( |
321 | 436 | from sklearn.neighbors import NearestNeighbors |
322 | 437 |
|
323 | 438 | cps_sim = Microsimulation(dataset=dataset_path) |
324 | | - X_train = cps_sim.calculate_dataframe( |
325 | | - CPS_CLONE_FEATURE_PREDICTORS + CPS_CLONE_FEATURE_VARIABLES |
| 439 | + feature_variables = _cps_clone_feature_variables_for_data(data, time_period) |
| 440 | + X_train = _build_cps_train_frame( |
| 441 | + cps_sim, |
| 442 | + data, |
| 443 | + time_period, |
| 444 | + CPS_CLONE_FEATURE_PREDICTORS + feature_variables, |
326 | 445 | ) |
327 | 446 | available_outputs = [ |
328 | | - variable |
329 | | - for variable in CPS_CLONE_FEATURE_VARIABLES |
330 | | - if variable in X_train.columns |
| 447 | + variable for variable in feature_variables if variable in X_train.columns |
331 | 448 | ] |
332 | 449 | if not available_outputs: |
333 | 450 | n_half = len(data["person_id"][time_period]) // 2 |
|
0 commit comments