1- import h5py
21from pathlib import Path
32
4- from policyengine_us_data .datasets import (
5- EnhancedCPS_2024 ,
6- )
3+ import h5py
4+ from policyengine_core .data import Dataset
5+
6+ from policyengine_us_data .datasets import EnhancedCPS_2024
77from policyengine_us_data .datasets .cps .cps import CPS_2024
88from policyengine_us_data .storage import STORAGE_FOLDER
99from policyengine_us_data .utils .data_upload import upload_data_files
10+ from policyengine_us_data .utils .dataset_validation import (
11+ DatasetContractError ,
12+ load_dataset_for_validation ,
13+ validate_dataset_contract ,
14+ )
1015
1116# Datasets that require full validation before upload.
1217# These are the main datasets used in production simulations.
1520 "cps_2024.h5" ,
1621}
1722
18- FILENAME_TO_DATASET = {
19- "enhanced_cps_2024.h5" : EnhancedCPS_2024 ,
20- "cps_2024.h5" : CPS_2024 ,
21- }
22-
# Minimum file sizes in bytes for validated datasets.
# Keys are dataset filenames; values are lower bounds in bytes.
MIN_FILE_SIZES = {
    "enhanced_cps_2024.h5": 95 * 1024 * 1024,  # 95 MB
    "cps_2024.h5": 50 * 1024 * 1024,  # 50 MB
}
2828
@@ -118,15 +118,23 @@ def _check_group_has_data(f, name):
118118 + "\n " .join (f" - { e } " for e in errors )
119119 )
120120
121+ try :
122+ contract_summary = validate_dataset_contract (file_path )
123+ except DatasetContractError as e :
124+ errors .append (f"Dataset contract validation failed: { e } " )
125+ raise DatasetValidationError (
126+ f"Validation failed for { filename } :\n "
127+ + "\n " .join (f" - { e } " for e in errors )
128+ ) from e
129+
121130 # 3. Aggregate statistics check via Microsimulation
122131 # Import here to avoid heavy import at module level.
123132 from policyengine_us import Microsimulation
124133
125134 try :
126- dataset_cls = FILENAME_TO_DATASET .get (filename )
127- if dataset_cls is None :
128- raise DatasetValidationError (f"No dataset class registered for { filename } " )
129- sim = Microsimulation (dataset = dataset_cls )
135+ sim = Microsimulation (
136+ dataset = load_dataset_for_validation (file_path , Dataset .from_file )
137+ )
130138 year = 2024
131139
132140 emp_income = sim .calculate ("employment_income" , year ).sum ()
@@ -159,6 +167,15 @@ def _check_group_has_data(f, name):
159167
160168 print (f" ✓ Validation passed for { filename } " )
161169 print (f" File size: { file_size / 1024 / 1024 :.1f} MB" )
170+ print (
171+ " policyengine-us: "
172+ f"{ contract_summary .policyengine_us .version } "
173+ + (
174+ f" (locked { contract_summary .policyengine_us .locked_version } )"
175+ if contract_summary .policyengine_us .locked_version
176+ else ""
177+ )
178+ )
162179 print (f" employment_income sum: ${ emp_income :,.0f} " )
163180 print (f" Household weight sum: { hh_weight :,.0f} " )
164181
@@ -210,14 +227,18 @@ def upload_datasets(require_enhanced_cps: bool = True):
210227
def validate_all_datasets():
    """Validate all main datasets in storage. Called by `make validate-data`.

    Delegates to ``validate_built_datasets`` with the enhanced CPS dataset
    treated as required, so any exception raised there propagates to the
    caller unchanged.
    """
    validate_built_datasets(require_enhanced_cps=True)
231+
232+
def validate_built_datasets(require_enhanced_cps: bool = True):
    """Validate the built dataset files without uploading them.

    The ``CPS_2024`` file is always required. The ``EnhancedCPS_2024`` file
    is required (and validated) only when ``require_enhanced_cps`` is true.

    Args:
        require_enhanced_cps: When False, the enhanced CPS file is neither
            checked for existence nor validated.

    Raises:
        FileNotFoundError: If any required dataset file does not exist.
            Raised before any file is validated.

    Any exception raised by ``validate_dataset`` propagates unchanged.
    """
    required_files = [CPS_2024.file_path]
    if require_enhanced_cps:
        required_files.append(EnhancedCPS_2024.file_path)

    # Confirm every required file is present before validating any of them,
    # so a missing file is reported immediately rather than after another
    # file's validation (which runs a Microsimulation) has already completed.
    for file_path in required_files:
        if not file_path.exists():
            raise FileNotFoundError(f"Expected dataset not found at {file_path}")

    for file_path in required_files:
        validate_dataset(file_path)
    print("\nAll dataset validations passed.")
222243
223244
@@ -230,5 +251,13 @@ def validate_all_datasets():
230251 action = "store_true" ,
231252 help = "Treat enhanced_cps and small_enhanced_cps as optional." ,
232253 )
254+ parser .add_argument (
255+ "--validate-only" ,
256+ action = "store_true" ,
257+ help = "Validate built datasets without uploading them." ,
258+ )
233259 args = parser .parse_args ()
234- upload_datasets (require_enhanced_cps = not args .no_require_enhanced_cps )
260+ if args .validate_only :
261+ validate_built_datasets (require_enhanced_cps = not args .no_require_enhanced_cps )
262+ else :
263+ upload_datasets (require_enhanced_cps = not args .no_require_enhanced_cps )
0 commit comments