Skip to content

Commit 18de498

Browse files
authored
Filter invalid SIPP imputation weights (#1108)
* Filter invalid SIPP imputation weights * Fix SIPP status flag filtering
1 parent e6fdcaf commit 18de498

10 files changed

Lines changed: 350 additions & 31 deletions

File tree

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Filter non-positive SIPP donor weights before fitting source imputation models.
2+
Interpret SIPP status flags with Census status semantics when filtering observed donor targets.
3+
Bump policyengine-us to 1.703.1.

policyengine_us_data/calibration/source_impute.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282
from policyengine_us_data.pipeline_schema import PipelineNode
8383
from policyengine_us_data.utils.source_quality import (
8484
cap_training_sample,
85+
filter_positive_finite_weight_rows,
8586
require_columns_present,
8687
target_observed_source_masks,
8788
)
@@ -710,6 +711,12 @@ def _impute_sipp(
710711
"household_weight",
711712
]
712713
tip_train = sipp_df[tip_cols].dropna()
714+
tip_train, tip_target_filters = filter_positive_finite_weight_rows(
715+
tip_train,
716+
weight_col="household_weight",
717+
target_filters=tip_target_filters,
718+
context_name="SIPP source tip donor",
719+
)
713720
tip_train, tip_target_filters = cap_training_sample(
714721
tip_train,
715722
max_train_samples=10_000,
@@ -849,6 +856,12 @@ def _impute_sipp(
849856
target_source_columns=SIPP_ASSET_TARGET_SOURCE_COLUMNS,
850857
target_allocation_flag_columns=SIPP_ASSET_TARGET_ALLOCATION_COLUMNS,
851858
)
859+
asset_train, asset_target_filters = filter_positive_finite_weight_rows(
860+
asset_train,
861+
weight_col="household_weight",
862+
target_filters=asset_target_filters,
863+
context_name="SIPP source asset donor",
864+
)
852865
asset_train, asset_target_filters = cap_training_sample(
853866
asset_train,
854867
max_train_samples=20_000,
@@ -1013,6 +1026,12 @@ def _impute_sipp(
10131026
targets=vehicle_vars,
10141027
target_allocation_flag_columns=SIPP_VEHICLE_TARGET_ALLOCATION_COLUMNS,
10151028
)
1029+
vehicle_train, vehicle_target_filters = filter_positive_finite_weight_rows(
1030+
vehicle_train,
1031+
weight_col="household_weight",
1032+
target_filters=vehicle_target_filters,
1033+
context_name="SIPP source vehicle donor",
1034+
)
10161035
vehicle_train, vehicle_target_filters = cap_training_sample(
10171036
vehicle_train,
10181037
max_train_samples=20_000,

policyengine_us_data/datasets/sipp/sipp.py

Lines changed: 59 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
)
1212
from policyengine_us_data.utils.source_quality import (
1313
cap_training_sample,
14+
filter_positive_finite_weight_rows,
1415
filter_observed_source_rows,
1516
require_columns_present,
1617
sipp_allocation_flag_for,
@@ -188,6 +189,12 @@ def train_tip_model():
188189
]
189190

190191
sipp = sipp[~sipp.isna().any(axis=1)]
192+
sipp, tip_target_filters = filter_positive_finite_weight_rows(
193+
sipp,
194+
weight_col="household_weight",
195+
target_filters=tip_target_filters,
196+
context_name="SIPP tip donor",
197+
)
191198
sipp, tip_target_filters = cap_training_sample(
192199
sipp,
193200
max_train_samples=10_000,
@@ -232,9 +239,40 @@ def get_tip_model() -> QRF:
232239
"stock_assets": ["TVAL_STMF"],
233240
"bond_assets": ["TVAL_BOND"],
234241
}
242+
SIPP_BANK_ACCOUNT_ASSET_ALLOCATION_COLUMNS = [
243+
"AJSSAVVAL",
244+
"AJOSAVVAL",
245+
"AOSAVVAL",
246+
"AJSMMVAL",
247+
"AJOMMVAL",
248+
"AOMMVAL",
249+
"AJSCDVAL",
250+
"AJOCDVAL",
251+
"AOCDVAL",
252+
"AJSCHKVAL",
253+
"AJOCHKVAL",
254+
"AOCHKVAL",
255+
]
256+
SIPP_STOCK_ASSET_ALLOCATION_COLUMNS = [
257+
"AJSSTVAL",
258+
"AJOSTVAL",
259+
"AOSTVAL",
260+
"AJSMFVAL",
261+
"AJOMFVAL",
262+
"AOMFVAL",
263+
]
264+
SIPP_BOND_ASSET_ALLOCATION_COLUMNS = [
265+
"AJSGOVSVAL",
266+
"AJOGOVSVAL",
267+
"AOGOVSVAL",
268+
"AJSMCBDVAL",
269+
"AJOMCBDVAL",
270+
"AOMCBDVAL",
271+
]
235272
SIPP_ASSET_TARGET_ALLOCATION_COLUMNS = {
236-
target: [sipp_allocation_flag_for(column) for column in columns]
237-
for target, columns in SIPP_ASSET_TARGET_SOURCE_COLUMNS.items()
273+
"bank_account_assets": SIPP_BANK_ACCOUNT_ASSET_ALLOCATION_COLUMNS,
274+
"stock_assets": SIPP_STOCK_ASSET_ALLOCATION_COLUMNS,
275+
"bond_assets": SIPP_BOND_ASSET_ALLOCATION_COLUMNS,
238276
}
239277
SIPP_ASSET_ALLOCATION_COLUMNS = sorted(
240278
{
@@ -326,7 +364,7 @@ def get_tip_model() -> QRF:
326364

327365
SIPP_VEHICLE_TARGET_ALLOCATION_COLUMNS = {
328366
"household_vehicles_owned": [sipp_allocation_flag_for("TVEH_NUM")],
329-
"household_vehicles_value": [sipp_allocation_flag_for("THVAL_VEH")],
367+
"household_vehicles_value": ["AVEH1VAL", "AVEH2VAL", "AVEH3VAL"],
330368
}
331369

332370
VEHICLE_COLUMNS = [
@@ -347,6 +385,9 @@ def get_tip_model() -> QRF:
347385
"THVAL_HOME",
348386
"AVEH_NUM",
349387
"AHVAL_VEH",
388+
"AVEH1VAL",
389+
"AVEH2VAL",
390+
"AVEH3VAL",
350391
]
351392

352393

@@ -652,6 +693,12 @@ def train_asset_model():
652693
target_source_columns=SIPP_ASSET_TARGET_SOURCE_COLUMNS,
653694
target_allocation_flag_columns=SIPP_ASSET_TARGET_ALLOCATION_COLUMNS,
654695
)
696+
sipp, asset_target_filters = filter_positive_finite_weight_rows(
697+
sipp,
698+
weight_col="household_weight",
699+
target_filters=asset_target_filters,
700+
context_name="SIPP asset donor",
701+
)
655702
sipp, asset_target_filters = cap_training_sample(
656703
sipp,
657704
max_train_samples=20_000,
@@ -799,6 +846,9 @@ def build_vehicle_training_frame() -> pd.DataFrame:
799846
"household_vehicles_value": grouped["THVAL_VEH"].first().fillna(0),
800847
"AVEH_NUM": grouped["AVEH_NUM"].max().fillna(0),
801848
"AHVAL_VEH": grouped["AHVAL_VEH"].first().fillna(0),
849+
"AVEH1VAL": grouped["AVEH1VAL"].max().fillna(0),
850+
"AVEH2VAL": grouped["AVEH2VAL"].max().fillna(0),
851+
"AVEH3VAL": grouped["AVEH3VAL"].max().fillna(0),
802852
"is_homeowner": (grouped["THVAL_HOME"].first().fillna(0) > 0).astype(
803853
np.float32
804854
),
@@ -839,6 +889,12 @@ def train_vehicle_model():
839889
targets=vehicle_vars,
840890
target_allocation_flag_columns=SIPP_VEHICLE_TARGET_ALLOCATION_COLUMNS,
841891
)
892+
sipp, vehicle_target_filters = filter_positive_finite_weight_rows(
893+
sipp,
894+
weight_col="household_weight",
895+
target_filters=vehicle_target_filters,
896+
context_name="SIPP vehicle donor",
897+
)
842898
sipp, vehicle_target_filters = cap_training_sample(
843899
sipp,
844900
max_train_samples=20_000,

policyengine_us_data/utils/source_quality.py

Lines changed: 77 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,18 @@
1212

1313
logger = logging.getLogger(__name__)
1414

15+
# Census SIPP status-flag codes that count as observed (not imputed) donor
# values: 0 = not in universe, 1 = reported, 9 = derivable from component flags.
SIPP_OBSERVED_STATUS_VALUES = frozenset({0, 1, 9})

# Column-name prefixes used to recognise a column as a Census SIPP status flag.
SIPP_STATUS_FLAG_PREFIXES = (
    "AJB", "AJS", "AJO", "AO",
    "ASSI", "AVAL", "AVEH", "AHVAL",
)
26+
1527

1628
def sipp_allocation_flag_for(source_column: str) -> str:
1729
"""Return the SIPP allocation flag name for a source variable."""
@@ -20,6 +32,11 @@ def sipp_allocation_flag_for(source_column: str) -> str:
2032
return f"A{source_column[1:]}"
2133

2234

35+
def is_sipp_status_flag_column(column: str) -> bool:
    """Tell whether ``column`` is named like a Census SIPP status flag.

    The check is purely name-based: the column matches when it begins with
    any entry of ``SIPP_STATUS_FLAG_PREFIXES``.
    """
    return any(column.startswith(prefix) for prefix in SIPP_STATUS_FLAG_PREFIXES)
38+
39+
2340
def require_columns_present(
2441
available_columns: Container[str],
2542
required_columns: Sequence[str],
@@ -47,9 +64,13 @@ def observed_source_mask(
4764
) -> pd.Series:
4865
"""Mask rows whose donor source values are observed for one target.
4966
50-
Source-survey allocation flags conventionally use ``0`` for not allocated
51-
and non-zero values for allocated/imputed. Missing flag columns are ignored
52-
so callers can use this helper across sources with different flag coverage.
67+
Generic allocation flags use ``0`` for not allocated and non-zero values
68+
for allocated/imputed. Census SIPP ``A*`` status flags instead encode
69+
``0`` as not in universe, ``1`` as reported, and ``9`` as derivable from
70+
component flags; values ``2`` through ``8`` indicate imputation.
71+
72+
Missing flag columns are ignored so callers can use this helper across
73+
sources with different flag coverage.
5374
"""
5475
mask = pd.Series(True, index=df.index)
5576

@@ -62,7 +83,10 @@ def observed_source_mask(
6283
if column not in df:
6384
continue
6485
flag = pd.to_numeric(df[column], errors="coerce").fillna(0)
65-
mask &= flag.eq(0)
86+
if is_sipp_status_flag_column(column):
87+
mask &= flag.isin(SIPP_OBSERVED_STATUS_VALUES)
88+
else:
89+
mask &= flag.eq(0)
6690

6791
return mask
6892

@@ -225,3 +249,52 @@ def cap_training_sample(
225249
for target, mask in filters.items()
226250
}
227251
return sampled_df, sampled_filters
252+
253+
254+
def filter_positive_finite_weight_rows(
255+
df: pd.DataFrame,
256+
*,
257+
weight_col: str,
258+
target_filters: Mapping[str, pd.Series] | None = None,
259+
context_name: str = "donor training frame",
260+
) -> tuple[pd.DataFrame, dict[str, pd.Series]]:
261+
"""Drop rows whose fit weight cannot be passed to microimpute."""
262+
if weight_col not in df:
263+
raise KeyError(f"{context_name} is missing weight column {weight_col!r}")
264+
265+
filters = {}
266+
for target, mask in (target_filters or {}).items():
267+
aligned = mask.reindex(df.index)
268+
if aligned.isna().any():
269+
raise ValueError(f"target_filters[{target!r}] contains missing values")
270+
filters[target] = aligned.astype(bool)
271+
272+
weights = pd.to_numeric(df[weight_col], errors="coerce")
273+
valid_weight = np.isfinite(weights) & weights.gt(0)
274+
dropped = int((~valid_weight).sum())
275+
if dropped:
276+
logger.info(
277+
"Dropped %d/%d %s rows with non-positive or non-finite %s",
278+
dropped,
279+
len(df),
280+
context_name,
281+
weight_col,
282+
)
283+
284+
filtered_df = df.loc[valid_weight].copy().reset_index(drop=True)
285+
filtered_filters = {
286+
target: pd.Series(
287+
mask.loc[valid_weight].to_numpy(dtype=bool),
288+
index=filtered_df.index,
289+
)
290+
for target, mask in filters.items()
291+
}
292+
293+
for target, mask in filtered_filters.items():
294+
if not mask.any():
295+
raise ValueError(
296+
f"No observed donor rows with positive finite {weight_col} "
297+
f"available for {target}"
298+
)
299+
300+
return filtered_df, filtered_filters

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ classifiers = [
2222
"Programming Language :: Python :: 3.14",
2323
]
2424
dependencies = [
25-
"policyengine-us==1.702.1",
25+
"policyengine-us==1.703.1",
2626
# policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for
2727
# PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost
2828
# after _invalidate_all_caches) and is required by policyengine-us 1.682.1+.

tests/unit/calibration/test_source_impute.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ def test_calibration_sipp_qrf_passes_target_filters(self, monkeypatch):
410410
for column in source_impute.SIPP_TIP_AMOUNT_COLUMNS:
411411
tip_columns[column] = [10.0, 5.0, 0.0]
412412
for column in source_impute.SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN.values():
413-
tip_columns[column] = [0, 1, 0]
413+
tip_columns[column] = [1, 2, 0]
414414
for column in source_impute.SIPP_JOB_OCCUPATION_COLUMNS:
415415
tip_columns[column] = [0, 0, 0]
416416
tip_source = pd.DataFrame(tip_columns)
@@ -437,8 +437,8 @@ def test_calibration_sipp_qrf_passes_target_filters(self, monkeypatch):
437437
asset_columns[column] = [1_000.0, 2_000.0, 0.0]
438438
for column in source_impute.SIPP_ASSET_ALLOCATION_COLUMNS:
439439
asset_columns[column] = [0, 0, 0]
440-
asset_columns["AVAL_BANK"] = [0, 1, 0]
441-
asset_columns["AVAL_STMF"] = [0, 0, 1]
440+
asset_columns["AJSSAVVAL"] = [0, 2, 0]
441+
asset_columns["AJSSTVAL"] = [0, 0, 6]
442442
asset_source = pd.DataFrame(asset_columns)
443443

444444
vehicle_train = pd.DataFrame(
@@ -449,8 +449,10 @@ def test_calibration_sipp_qrf_passes_target_filters(self, monkeypatch):
449449
},
450450
"household_vehicles_owned": [1.0, 2.0, 3.0],
451451
"household_vehicles_value": [5_000.0, 10_000.0, 15_000.0],
452-
"AVEH_NUM": [0, 1, 0],
453-
"AHVAL_VEH": [0, 0, 1],
452+
"AVEH_NUM": [1, 2, 1],
453+
"AVEH1VAL": [1, 1, 5],
454+
"AVEH2VAL": [0, 0, 0],
455+
"AVEH3VAL": [0, 0, 0],
454456
"household_weight": [1.0, 1.0, 1.0],
455457
}
456458
)

tests/unit/datasets/test_sipp_ssi_disability.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def test_ssi_disability_training_usecols_include_label_and_income_columns():
7474

7575
def test_build_ssi_disability_training_frame_excludes_allocated_label_source():
7676
frame = _base_sipp_frame()
77-
frame.loc[0, "ASSI_YRYN"] = 1
77+
frame.loc[0, "ASSI_YRYN"] = 3
7878
frame.loc[1:, "ASSI_YRYN"] = 0
7979
frame["ASSI_BRSN"] = 0
8080

0 commit comments

Comments
 (0)