1414
1515logger = logging .getLogger (__name__ )
1616
# Assumptions used to annualize last-week hours into implied annual wages:
# every week of the year is worked at last week's hours, and hours above the
# standard week are paid at the overtime multiplier.
WEEKS_IN_YEAR: int = 52
STANDARD_HOURS_PER_WEEK: int = 40
OVERTIME_RATE_MULTIPLIER: float = 1.5
# Relative gap between implied annual wages and employment_income beyond
# which a single record is counted as a mismatch.
HOURLY_WAGE_INCOME_RELATIVE_TOLERANCE: float = 0.10
# The diagnostic reports WARN when the weighted share of mismatching records
# exceeds this value ...
HOURLY_WAGE_INCOME_MISMATCH_SHARE_WARN_THRESHOLD: float = 0.25
# ... or when the weighted mean absolute relative gap exceeds this value.
HOURLY_WAGE_INCOME_MEAN_ABS_REL_ERROR_WARN_THRESHOLD: float = 0.20
23+
1724KEY_MONETARY_VARS = [
1825 "employment_income" ,
1926 "adjusted_gross_income" ,
3643]
3744
3845
46+ def _weighted_mean (values : np .ndarray , weights : np .ndarray ) -> float :
47+ if values .size == 0 or weights .sum () <= 0 :
48+ return 0.0
49+ return float (np .average (values , weights = weights ))
50+
51+
52+ def _weighted_quantile (
53+ values : np .ndarray ,
54+ weights : np .ndarray ,
55+ quantile : float ,
56+ ) -> float :
57+ if values .size == 0 or weights .sum () <= 0 :
58+ return 0.0
59+
60+ sorter = np .argsort (values )
61+ values = values [sorter ]
62+ weights = weights [sorter ]
63+ cumulative_weights = np .cumsum (weights )
64+ cutoff = quantile * cumulative_weights [- 1 ]
65+ return float (values [np .searchsorted (cumulative_weights , cutoff , side = "left" )])
66+
67+
68+ def _format_hourly_wage_income_detail (
69+ * ,
70+ comparable_count : int ,
71+ comparable_weight : float ,
72+ mismatch_count : int ,
73+ mismatch_share : float ,
74+ mean_abs_rel_error : float ,
75+ p90_abs_rel_error : float ,
76+ over_income_share : float ,
77+ tolerance : float ,
78+ ) -> str :
79+ return (
80+ f"{ mismatch_count :,} /{ comparable_count :,} unweighted mismatches; "
81+ f"{ mismatch_share :.1%} weighted mismatch share at "
82+ f">{ tolerance :.0%} tolerance among { comparable_weight :,.0f} weighted workers; "
83+ f"mean absolute relative gap { mean_abs_rel_error :.1%} ; "
84+ f"p90 absolute relative gap { p90_abs_rel_error :.1%} ; "
85+ f"{ over_income_share :.1%} imply annual wages above employment_income "
86+ f"by >{ tolerance :.0%} "
87+ )
88+
89+
def build_hourly_wage_income_consistency_diagnostics(
    employment_income: np.ndarray,
    hourly_wage: np.ndarray,
    hours_worked_last_week: np.ndarray,
    is_paid_hourly: np.ndarray,
    weights: np.ndarray | None = None,
    *,
    relative_tolerance: float = HOURLY_WAGE_INCOME_RELATIVE_TOLERANCE,
    mismatch_share_warn_threshold: float = (
        HOURLY_WAGE_INCOME_MISMATCH_SHARE_WARN_THRESHOLD
    ),
    mean_abs_rel_error_warn_threshold: float = (
        HOURLY_WAGE_INCOME_MEAN_ABS_REL_ERROR_WARN_THRESHOLD
    ),
) -> List[dict]:
    """Compare hourly facts with annual employment income.

    Implied annual wages are ``hourly_wage`` times ``WEEKS_IN_YEAR`` weeks of
    last week's hours, with hours above ``STANDARD_HOURS_PER_WEEK`` scaled by
    ``OVERTIME_RATE_MULTIPLIER``. Each comparable record's relative gap is
    ``(implied - employment_income) / employment_income``.

    With the default thresholds, the check warns when more than 25 percent
    of comparable hourly workers differ by more than 10 percent, or when the
    weighted mean absolute relative gap exceeds 20 percent. These thresholds
    flag broad inconsistencies while allowing last-week hours and annual
    wages to differ for normal reasons.

    Args:
        employment_income: Annual employment income per record.
        hourly_wage: Hourly wage per record.
        hours_worked_last_week: Hours worked last week per record.
        is_paid_hourly: Truthy where the record is paid hourly.
        weights: Optional per-record weights aligned with the other arrays;
            every record is weighted 1.0 when omitted.
        relative_tolerance: A record is a mismatch when its absolute
            relative gap is strictly greater than this value.
        mismatch_share_warn_threshold: WARN when the weighted mismatch share
            is strictly greater than this value.
        mean_abs_rel_error_warn_threshold: WARN when the weighted mean
            absolute relative gap is strictly greater than this value.

    Returns:
        Two result dicts with keys ``"check"``, ``"status"`` and
        ``"detail"``: one for all comparable hourly workers and one for the
        subset with overtime hours. ``"status"`` is ``"SKIP"`` when the
        subset is empty, otherwise ``"WARN"`` or ``"PASS"``.
    """
    employment_income = np.asarray(employment_income, dtype=float)
    hourly_wage = np.asarray(hourly_wage, dtype=float)
    hours_worked_last_week = np.asarray(hours_worked_last_week, dtype=float)
    is_paid_hourly = np.asarray(is_paid_hourly, dtype=bool)

    if weights is None:
        weights = np.ones_like(employment_income, dtype=float)
    else:
        weights = np.asarray(weights, dtype=float)

    # Split last week's hours at the standard week so that overtime hours
    # can be converted to straight-time-equivalent hours.
    straight_time_hours = np.minimum(hours_worked_last_week, STANDARD_HOURS_PER_WEEK)
    overtime_hours = np.maximum(hours_worked_last_week - STANDARD_HOURS_PER_WEEK, 0)
    straight_time_equivalent_hours = WEEKS_IN_YEAR * (
        straight_time_hours + overtime_hours * OVERTIME_RATE_MULTIPLIER
    )
    implied_annual_wages = hourly_wage * straight_time_equivalent_hours

    # Only hourly-paid records with positive, finite inputs are comparable.
    # Requiring positive income also keeps the relative-gap division safe.
    base_mask = (
        is_paid_hourly
        & (hourly_wage > 0)
        & (hours_worked_last_week > 0)
        & (employment_income > 0)
        & np.isfinite(implied_annual_wages)
        & np.isfinite(employment_income)
        & np.isfinite(weights)
        & (weights > 0)
    )

    results = []
    subsets = [
        ("hourly_wage_income_consistency", base_mask),
        (
            "hourly_wage_income_consistency_overtime",
            base_mask & (overtime_hours > 0),
        ),
    ]

    for check_name, mask in subsets:
        if not mask.any():
            results.append(
                {
                    "check": check_name,
                    "status": "SKIP",
                    "detail": "no comparable hourly workers",
                }
            )
            continue

        rel_gap = (
            implied_annual_wages[mask] - employment_income[mask]
        ) / employment_income[mask]
        subset_weights = weights[mask]
        # Strict comparisons: the docstring and the emitted detail both
        # describe gaps of "more than" / ">" the tolerance, so a gap exactly
        # at the tolerance is not a mismatch.
        mismatch = np.abs(rel_gap) > relative_tolerance
        over_income = rel_gap > relative_tolerance
        mismatch_share = _weighted_mean(mismatch.astype(float), subset_weights)
        mean_abs_rel_error = _weighted_mean(np.abs(rel_gap), subset_weights)
        p90_abs_rel_error = _weighted_quantile(
            np.abs(rel_gap),
            subset_weights,
            0.9,
        )
        over_income_share = _weighted_mean(
            over_income.astype(float),
            subset_weights,
        )

        warn = (
            mismatch_share > mismatch_share_warn_threshold
            or mean_abs_rel_error > mean_abs_rel_error_warn_threshold
        )
        results.append(
            {
                "check": check_name,
                "status": "WARN" if warn else "PASS",
                "detail": _format_hourly_wage_income_detail(
                    comparable_count=int(mask.sum()),
                    comparable_weight=float(subset_weights.sum()),
                    mismatch_count=int(mismatch.sum()),
                    mismatch_share=mismatch_share,
                    mean_abs_rel_error=mean_abs_rel_error,
                    p90_abs_rel_error=p90_abs_rel_error,
                    over_income_share=over_income_share,
                    tolerance=relative_tolerance,
                ),
            }
        )

    return results
200+
201+
39202def run_sanity_checks (
40203 h5_path : str ,
41204 period : int = 2024 ,
@@ -61,6 +224,32 @@ def _get(f, path):
61224 except KeyError :
62225 return None
63226
227+ def _get_person_weights (f , period , person_count , household_weights ):
228+ if household_weights is None :
229+ return None
230+ if len (household_weights ) == person_count :
231+ return household_weights
232+
233+ person_hh_arr = _get (f , f"person_household_id/{ period } " )
234+ if person_hh_arr is None :
235+ person_hh_arr = _get (f , "person_household_id" )
236+ hh_id_arr = _get (f , f"household_id/{ period } " )
237+ if hh_id_arr is None :
238+ hh_id_arr = _get (f , "household_id" )
239+ if person_hh_arr is None or hh_id_arr is None :
240+ return None
241+ if len (hh_id_arr ) != len (household_weights ):
242+ return None
243+
244+ household_weight_by_id = dict (zip (hh_id_arr .tolist (), household_weights ))
245+ try :
246+ return np .array (
247+ [household_weight_by_id [hh_id ] for hh_id in person_hh_arr .tolist ()],
248+ dtype = float ,
249+ )
250+ except KeyError :
251+ return None
252+
64253 with h5py .File (h5_path , "r" ) as f :
65254 # 1. Weight non-negativity
66255 w_key = f"household_weight/{ period } "
@@ -249,6 +438,48 @@ def _get(f, path):
249438 }
250439 )
251440
441+ employment_income = _get (f , f"employment_income/{ period } " )
442+ hourly_wage = _get (f , f"hourly_wage/{ period } " )
443+ hours_worked_last_week = _get (f , f"hours_worked_last_week/{ period } " )
444+ is_paid_hourly = _get (f , f"is_paid_hourly/{ period } " )
445+ hourly_inputs = [
446+ employment_income ,
447+ hourly_wage ,
448+ hours_worked_last_week ,
449+ is_paid_hourly ,
450+ ]
451+ if any (value is None for value in hourly_inputs ):
452+ results .append (
453+ {
454+ "check" : "hourly_wage_income_consistency" ,
455+ "status" : "SKIP" ,
456+ "detail" : "missing one or more hourly wage consistency inputs" ,
457+ }
458+ )
459+ results .append (
460+ {
461+ "check" : "hourly_wage_income_consistency_overtime" ,
462+ "status" : "SKIP" ,
463+ "detail" : "missing one or more hourly wage consistency inputs" ,
464+ }
465+ )
466+ else :
467+ person_weights = _get_person_weights (
468+ f ,
469+ period ,
470+ len (employment_income ),
471+ weights ,
472+ )
473+ results .extend (
474+ build_hourly_wage_income_consistency_diagnostics (
475+ employment_income = employment_income ,
476+ hourly_wage = hourly_wage ,
477+ hours_worked_last_week = hours_worked_last_week ,
478+ is_paid_hourly = is_paid_hourly ,
479+ weights = person_weights ,
480+ )
481+ )
482+
252483 return results
253484
254485
0 commit comments