Skip to content

Commit 8868ad3

Browse files
authored
Allow raw CPS source artifact through publication contract (#1012)
1 parent 49c22a5 commit 8868ad3

5 files changed

Lines changed: 137 additions & 3 deletions

File tree

changelog.d/1012.fixed.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Allow raw CPS source artifacts to pass publication validation while retaining the leaf-input export guard for final enhanced CPS datasets.

policyengine_us_data/storage/upload_completed_datasets.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,15 @@
4646
"cps_2024.h5",
4747
}
4848

49+
# Raw CPS is a source artifact used to construct final simulation datasets.
50+
# It intentionally carries some Census source measures whose names now map to
51+
# formula variables in policyengine-us. The leaf-input export contract is
52+
# enforced on final simulation datasets instead.
53+
ENFORCE_LEAF_INPUT_EXPORT_BY_FILENAME = {
54+
"enhanced_cps_2024.h5": True,
55+
"cps_2024.h5": False,
56+
}
57+
4958
# Minimum file sizes in bytes for validated datasets.
5059
MIN_FILE_SIZES = {
5160
"enhanced_cps_2024.h5": 95 * 1024 * 1024, # 95 MB
@@ -424,7 +433,13 @@ def validate_dataset(file_path: Path) -> None:
424433
)
425434

426435
try:
427-
contract_summary = validate_dataset_contract(file_path)
436+
contract_summary = validate_dataset_contract(
437+
file_path,
438+
enforce_no_computed_policyengine_us_variables=ENFORCE_LEAF_INPUT_EXPORT_BY_FILENAME.get(
439+
filename,
440+
True,
441+
),
442+
)
428443
except DatasetContractError as e:
429444
errors.append(f"Dataset contract validation failed: {e}")
430445
raise DatasetValidationError(

policyengine_us_data/utils/dataset_validation.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,7 @@ def validate_dataset_contract(
268268
microsimulation_cls=None,
269269
dataset_loader=None,
270270
smoke_test_variable: str = "household_weight",
271+
enforce_no_computed_policyengine_us_variables: bool = True,
271272
) -> DatasetContractSummary:
272273
file_path = Path(file_path)
273274
policyengine_us_info = assert_locked_policyengine_us_version()
@@ -281,7 +282,7 @@ def validate_dataset_contract(
281282

282283
dataset_lengths = _dataset_lengths(file_path)
283284
time_period = _infer_time_period_from_file(file_path)
284-
if time_period is not None:
285+
if time_period is not None and enforce_no_computed_policyengine_us_variables:
285286
assert_no_computed_policyengine_us_variables_exported(
286287
variable_names=dataset_lengths.keys(),
287288
time_period=time_period,

tests/unit/test_dataset_validation.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,39 @@ def test_validate_dataset_contract_rejects_computed_policyengine_variables(
239239
)
240240

241241

242+
def test_validate_dataset_contract_can_skip_computed_variable_check(
243+
tmp_path,
244+
monkeypatch,
245+
):
246+
file_path = tmp_path / "cps_2024.h5"
247+
_write_test_h5(
248+
file_path,
249+
{
250+
"person_id": np.array([101, 102], dtype=np.int32),
251+
"household_id": np.array([501], dtype=np.int32),
252+
"computed_income": np.array([10_000.0, 20_000.0], dtype=np.float32),
253+
"household_weight": np.array([1.5], dtype=np.float32),
254+
},
255+
)
256+
monkeypatch.setattr(
257+
"policyengine_us_data.utils.dataset_validation.assert_locked_policyengine_us_version",
258+
lambda: PolicyEngineUSBuildInfo(version="1.587.0"),
259+
)
260+
tbs = _fake_tax_benefit_system()
261+
tbs.variables["computed_income"] = _fake_variable(
262+
"person",
263+
adds=["computed_income_before_response"],
264+
)
265+
266+
validate_dataset_contract(
267+
file_path,
268+
tax_benefit_system=tbs,
269+
microsimulation_cls=_FakeMicrosimulation,
270+
dataset_loader=lambda path: path,
271+
enforce_no_computed_policyengine_us_variables=False,
272+
)
273+
274+
242275
def test_validate_dataset_contract_allows_future_period_formulas(tmp_path, monkeypatch):
243276
file_path = tmp_path / "enhanced_cps_2024.h5"
244277
_write_test_h5(

tests/unit/test_upload_completed_datasets.py

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,15 @@ def _fake_tax_benefit_system():
109109
)
110110

111111

112+
def _fake_variable(entity_key, *, formulas=None, adds=None, subtracts=None):
113+
return SimpleNamespace(
114+
entity=SimpleNamespace(key=entity_key),
115+
formulas=formulas or {},
116+
adds=adds,
117+
subtracts=subtracts,
118+
)
119+
120+
112121
def _write_h5(path, datasets: dict[str, np.ndarray]) -> None:
113122
with h5py.File(path, "w") as h5_file:
114123
for name, values in datasets.items():
@@ -151,11 +160,12 @@ def patch_contract_validation(monkeypatch):
151160
monkeypatch.setattr(
152161
upload_module,
153162
"validate_dataset_contract",
154-
lambda file_path: validate_dataset_contract(
163+
lambda file_path, **kwargs: validate_dataset_contract(
155164
file_path,
156165
tax_benefit_system=_fake_tax_benefit_system(),
157166
microsimulation_cls=_FakeMicrosimulation,
158167
dataset_loader=lambda path: path,
168+
**kwargs,
159169
),
160170
)
161171

@@ -221,6 +231,80 @@ def test_validate_dataset_infers_time_period_for_flat_h5(tmp_path, monkeypatch):
221231
assert _TimePeriodCheckingAggregateMicrosimulation.last_dataset.time_period == 2024
222232

223233

234+
def test_validate_cps_allows_source_computed_policyengine_variables(
235+
tmp_path,
236+
monkeypatch,
237+
):
238+
file_path = tmp_path / "cps_2024.h5"
239+
_write_h5(
240+
file_path,
241+
{
242+
"person_id": np.array([101], dtype=np.int32),
243+
"household_id": np.array([201], dtype=np.int32),
244+
"employment_income": np.array([50_000.0], dtype=np.float32),
245+
"household_weight": np.array([1.0], dtype=np.float32),
246+
},
247+
)
248+
tbs = _fake_tax_benefit_system()
249+
tbs.variables["employment_income"] = _fake_variable(
250+
"person",
251+
adds=["employment_income_before_lsr"],
252+
)
253+
monkeypatch.setattr(
254+
upload_module,
255+
"validate_dataset_contract",
256+
lambda file_path, **kwargs: validate_dataset_contract(
257+
file_path,
258+
tax_benefit_system=tbs,
259+
microsimulation_cls=_FakeMicrosimulation,
260+
dataset_loader=lambda path: path,
261+
**kwargs,
262+
),
263+
)
264+
monkeypatch.setattr(
265+
"policyengine_us.Microsimulation",
266+
_TimePeriodCheckingAggregateMicrosimulation,
267+
)
268+
269+
validate_dataset(file_path)
270+
271+
272+
def test_validate_enhanced_cps_rejects_computed_policyengine_variables(
273+
tmp_path,
274+
monkeypatch,
275+
):
276+
file_path = tmp_path / "enhanced_cps_2024.h5"
277+
_write_h5(file_path, _minimal_enhanced_cps_contract_datasets())
278+
tbs = _fake_tax_benefit_system()
279+
tbs.variables["employment_income"] = _fake_variable(
280+
"person",
281+
adds=["employment_income_before_lsr"],
282+
)
283+
monkeypatch.setattr(
284+
upload_module,
285+
"REQUIRED_VARIABLES_BY_FILENAME",
286+
{},
287+
)
288+
monkeypatch.setattr(
289+
upload_module,
290+
"validate_dataset_contract",
291+
lambda file_path, **kwargs: validate_dataset_contract(
292+
file_path,
293+
tax_benefit_system=tbs,
294+
microsimulation_cls=_FakeMicrosimulation,
295+
dataset_loader=lambda path: path,
296+
**kwargs,
297+
),
298+
)
299+
monkeypatch.setattr(
300+
"policyengine_us.Microsimulation",
301+
_TimePeriodCheckingAggregateMicrosimulation,
302+
)
303+
304+
with pytest.raises(DatasetValidationError, match="employment_income"):
305+
validate_dataset(file_path)
306+
307+
224308
def test_validate_dataset_rejects_temporary_reported_source_variables(
225309
tmp_path,
226310
monkeypatch,

0 commit comments

Comments
 (0)