Skip to content

Commit 22076cb

Browse files
MaxGhenis and claude committed
Fix validation to accept both employment_income variable names
The CPS dataset stores employment_income (older name), while employment_income_before_lsr is only present after policyengine-us formula processing. The assertions now accept either variable name. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 36b15fc commit 22076cb

2 files changed

Lines changed: 50 additions & 33 deletions

File tree

policyengine_us_data/datasets/cps/enhanced_cps.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -224,15 +224,23 @@ def generate(self):
224224
f"{int(np.sum(w > 0))} non-zero"
225225
)
226226

227-
# Validate critical variables exist in data
228-
if "employment_income_before_lsr" not in data:
227+
# Validate critical income variable exists in data.
228+
# The CPS stores employment_income_before_lsr (or the older
229+
# employment_income key); at least one must be present with data.
230+
income_key = None
231+
for k in ("employment_income_before_lsr", "employment_income"):
232+
if k in data and data[k]:
233+
income_key = k
234+
break
235+
if income_key is None:
229236
raise ValueError(
230-
"employment_income_before_lsr missing from dataset"
237+
"Neither employment_income_before_lsr nor "
238+
"employment_income found with data in dataset"
231239
)
232-
eib_periods = data["employment_income_before_lsr"]
233-
if not eib_periods:
234-
raise ValueError("employment_income_before_lsr has no period data")
235-
logging.info("Post-generation validation passed for EnhancedCPS")
240+
logging.info(
241+
f"Post-generation validation passed for EnhancedCPS "
242+
f"(income key: {income_key})"
243+
)
236244

237245
self.save_dataset(data)
238246

policyengine_us_data/storage/upload_completed_datasets.py

Lines changed: 35 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,12 @@
2424
# H5 groups that must exist and contain data.
2525
REQUIRED_GROUPS = [
2626
"household_weight",
27+
]
28+
29+
# At least one of these income groups must exist with data.
30+
INCOME_GROUPS = [
2731
"employment_income_before_lsr",
32+
"employment_income",
2833
]
2934

3035
# Aggregate thresholds for sanity checks (year 2024).
@@ -70,34 +75,38 @@ def validate_dataset(file_path: Path) -> None:
7075
)
7176

7277
# 2. H5 structure check - verify critical groups exist with data
78+
def _check_group_has_data(f, name):
79+
"""Return True if the H5 group/dataset has non-empty data."""
80+
if name not in f:
81+
return False
82+
group = f[name]
83+
if isinstance(group, h5py.Group):
84+
if len(group.keys()) == 0:
85+
return False
86+
first_key = list(group.keys())[0]
87+
return len(group[first_key][:]) > 0
88+
elif isinstance(group, h5py.Dataset):
89+
return group.size > 0
90+
return False
91+
7392
try:
7493
with h5py.File(file_path, "r") as f:
7594
for group_name in REQUIRED_GROUPS:
76-
if group_name not in f:
95+
if not _check_group_has_data(f, group_name):
7796
errors.append(
78-
f"Required group '{group_name}' missing from H5 file."
97+
f"Required group '{group_name}' missing "
98+
f"or empty in H5 file."
7999
)
80-
continue
81-
group = f[group_name]
82-
# Group should have at least one year key with data
83-
if isinstance(group, h5py.Group):
84-
if len(group.keys()) == 0:
85-
errors.append(
86-
f"Group '{group_name}' exists but has no year keys."
87-
)
88-
else:
89-
# Check first year key has non-empty data
90-
first_key = list(group.keys())[0]
91-
data = group[first_key][:]
92-
if len(data) == 0:
93-
errors.append(
94-
f"Group '{group_name}/{first_key}' has empty data."
95-
)
96-
elif isinstance(group, h5py.Dataset):
97-
if group.size == 0:
98-
errors.append(
99-
f"Dataset '{group_name}' has empty data."
100-
)
100+
101+
# At least one income group must have data
102+
has_income = any(
103+
_check_group_has_data(f, g) for g in INCOME_GROUPS
104+
)
105+
if not has_income:
106+
errors.append(
107+
f"No income data found. Need at least one of "
108+
f"{INCOME_GROUPS} with data in H5 file."
109+
)
101110
except Exception as e:
102111
errors.append(f"Failed to read H5 file: {e}")
103112

@@ -115,10 +124,10 @@ def validate_dataset(file_path: Path) -> None:
115124
sim = Microsimulation(dataset=file_path)
116125
year = 2024
117126

118-
emp_income = sim.calculate("employment_income_before_lsr", year).sum()
127+
emp_income = sim.calculate("employment_income", year).sum()
119128
if emp_income < MIN_EMPLOYMENT_INCOME_SUM:
120129
errors.append(
121-
f"employment_income_before_lsr sum = ${emp_income:,.0f}, "
130+
f"employment_income sum = ${emp_income:,.0f}, "
122131
f"expected > ${MIN_EMPLOYMENT_INCOME_SUM:,.0f}. "
123132
f"Data may have dropped employment income."
124133
)
@@ -145,7 +154,7 @@ def validate_dataset(file_path: Path) -> None:
145154

146155
print(f" ✓ Validation passed for {filename}")
147156
print(f" File size: {file_size / 1024 / 1024:.1f} MB")
148-
print(f" Employment income sum: ${emp_income:,.0f}")
157+
print(f" employment_income sum: ${emp_income:,.0f}")
149158
print(f" Household weight sum: {hh_weight:,.0f}")
150159

151160

0 commit comments

Comments
 (0)