# H5 groups that must exist and contain data.
REQUIRED_GROUPS = ["household_weight"]

# At least one of these income groups must exist with data; a file is
# accepted if either name is present and non-empty.
INCOME_GROUPS = [
    "employment_income_before_lsr",
    "employment_income",
]
2934
3035# Aggregate thresholds for sanity checks (year 2024).
@@ -70,34 +75,38 @@ def validate_dataset(file_path: Path) -> None:
7075 )
7176
7277 # 2. H5 structure check - verify critical groups exist with data
78+ def _check_group_has_data (f , name ):
79+ """Return True if the H5 group/dataset has non-empty data."""
80+ if name not in f :
81+ return False
82+ group = f [name ]
83+ if isinstance (group , h5py .Group ):
84+ if len (group .keys ()) == 0 :
85+ return False
86+ first_key = list (group .keys ())[0 ]
87+ return len (group [first_key ][:]) > 0
88+ elif isinstance (group , h5py .Dataset ):
89+ return group .size > 0
90+ return False
91+
7392 try :
7493 with h5py .File (file_path , "r" ) as f :
7594 for group_name in REQUIRED_GROUPS :
76- if group_name not in f :
95+ if not _check_group_has_data ( f , group_name ) :
7796 errors .append (
78- f"Required group '{ group_name } ' missing from H5 file."
97+ f"Required group '{ group_name } ' missing "
98+ f"or empty in H5 file."
7999 )
80- continue
81- group = f [group_name ]
82- # Group should have at least one year key with data
83- if isinstance (group , h5py .Group ):
84- if len (group .keys ()) == 0 :
85- errors .append (
86- f"Group '{ group_name } ' exists but has no year keys."
87- )
88- else :
89- # Check first year key has non-empty data
90- first_key = list (group .keys ())[0 ]
91- data = group [first_key ][:]
92- if len (data ) == 0 :
93- errors .append (
94- f"Group '{ group_name } /{ first_key } ' has empty data."
95- )
96- elif isinstance (group , h5py .Dataset ):
97- if group .size == 0 :
98- errors .append (
99- f"Dataset '{ group_name } ' has empty data."
100- )
100+
101+ # At least one income group must have data
102+ has_income = any (
103+ _check_group_has_data (f , g ) for g in INCOME_GROUPS
104+ )
105+ if not has_income :
106+ errors .append (
107+ f"No income data found. Need at least one of "
108+ f"{ INCOME_GROUPS } with data in H5 file."
109+ )
101110 except Exception as e :
102111 errors .append (f"Failed to read H5 file: { e } " )
103112
@@ -115,10 +124,10 @@ def validate_dataset(file_path: Path) -> None:
115124 sim = Microsimulation (dataset = file_path )
116125 year = 2024
117126
118- emp_income = sim .calculate ("employment_income_before_lsr " , year ).sum ()
127+ emp_income = sim .calculate ("employment_income " , year ).sum ()
119128 if emp_income < MIN_EMPLOYMENT_INCOME_SUM :
120129 errors .append (
121- f"employment_income_before_lsr sum = ${ emp_income :,.0f} , "
130+ f"employment_income sum = ${ emp_income :,.0f} , "
122131 f"expected > ${ MIN_EMPLOYMENT_INCOME_SUM :,.0f} . "
123132 f"Data may have dropped employment income."
124133 )
@@ -145,7 +154,7 @@ def validate_dataset(file_path: Path) -> None:
145154
146155 print (f" ✓ Validation passed for { filename } " )
147156 print (f" File size: { file_size / 1024 / 1024 :.1f} MB" )
148- print (f" Employment income sum: ${ emp_income :,.0f} " )
157+ print (f" employment_income sum: ${ emp_income :,.0f} " )
149158 print (f" Household weight sum: { hh_weight :,.0f} " )
150159
151160
0 commit comments