@@ -522,6 +522,7 @@ def generate_report(
522522
523523 # Load TMD data and weight files (once, shared by all diagnostics)
524524 report_data = _load_report_data (areas , weight_dir )
525+ per_area_metrics = {}
525526 if report_data is not None :
526527 tmd , s006 , state_weights , n_loaded = report_data
527528 # Weight distribution by AGI stub (old vs new)
@@ -530,6 +531,31 @@ def generate_report(
530531 tmd , s006 , state_weights , n_loaded , target_dir
531532 )
532533 )
534+ pop_shares = {
535+ r ["area" ]: r .get ("pop_share" )
536+ for _ , r in df .iterrows ()
537+ if r .get ("pop_share" ) is not None
538+ }
539+ per_area_metrics = _compute_per_area_metrics (
540+ areas , target_dir , tmd , s006 , state_weights , pop_shares
541+ )
542+
543+ # Attach per-area metrics to the dataframe
544+ for col in ("returns" , "agi" , "avg_agi" , "ess" , "unusualness" ):
545+ df [col ] = df ["area" ].map (
546+ lambda a , c = col : (per_area_metrics .get (a ) or {}).get (c )
547+ )
548+
549+ # Compute pct_zero column for CSV / table
550+ def _pct_zero (row ):
551+ dist = row .get ("dist_bins" , {})
552+ n_rec = row .get ("n_records" , 0 ) or 0
553+ if not isinstance (dist , dict ) or n_rec == 0 :
554+ return None
555+ n_zero = dist .get ((0.0 , 0.0 ), {}).get ("count" , 0 )
556+ return 100.0 * n_zero / n_rec
557+
558+ df ["pct_zero" ] = df .apply (_pct_zero , axis = 1 )
533559
534560 # Per-area table
535561 # For small area counts (states), show all areas.
@@ -568,6 +594,16 @@ def generate_report(
568594 " wP05/wMed/wP95/wMax = multiplier percentiles."
569595 " %zero = records excluded."
570596 )
597+ lines .append (
598+ " AvgAGI = weighted AGI per return ($K)."
599+ " Unusual% = mean |area_target - pop_share*national|"
600+ " / |pop_share*national| across targets"
601+ )
602+ lines .append (
603+ " (0% = looks exactly like the nation; higher ="
604+ " more unlike the nation)."
605+ " ESS = Kish effective sample size (sum w)^2 / sum(w^2)."
606+ )
571607 max_id = max (
572608 (len (str (row ["area" ])) for _ , row in display_df .iterrows ()),
573609 default = 4 ,
@@ -577,7 +613,8 @@ def generate_report(
577613 f"{ 'Area' :<{id_w }} { 'Status' :<14} { 'Hit' :>5} { 'Tot' :>5} "
578614 f"{ 'Viol' :>5} { 'MeanErr' :>8} { 'MaxErr' :>8} "
579615 f"{ 'wRMSE' :>7} { 'wP05' :>7} { 'wMed' :>7} "
580- f"{ 'wP95' :>7} { 'wMax' :>8} { '%zero' :>6} "
616+ f"{ 'wP95' :>7} { 'wMax' :>8} { '%zero' :>6} "
617+ f"{ 'AvgAGI' :>8} { 'Unusual%' :>8} { 'ESS' :>9} "
581618 )
582619 lines .append (header )
583620 lines .append ("-" * len (header ))
@@ -592,18 +629,26 @@ def generate_report(
592629 med = row .get ("w_median" , 0 )
593630 p95 = row .get ("w_p95" , 0 )
594631 wmax = row .get ("w_max" , 0 )
595- dist = row .get ("dist_bins" , {})
596- n_rec = row .get ("n_records" , 0 )
597- n_zero = 0
598- if isinstance (dist , dict ):
599- n_zero = dist .get ((0.0 , 0.0 ), {}).get ("count" , 0 )
600- pct_zero = 100 * n_zero / n_rec if n_rec > 0 else 0
632+ pct_zero = row .get ("pct_zero" )
633+ if pct_zero is None :
634+ pct_zero = 0.0
635+ avg_agi = row .get ("avg_agi" )
636+ unusual = row .get ("unusualness" )
637+ ess = row .get ("ess" )
638+ avg_agi_s = (
639+ f"${ avg_agi / 1000 :>6.1f} K" if avg_agi is not None else " -"
640+ )
641+ unusual_s = (
642+ f"{ unusual * 100 :>7.1f} %" if unusual is not None else " -"
643+ )
644+ ess_s = f"{ ess :>9,.0f} " if ess is not None else " -"
601645 area_str = str (row ["area" ])
602646 lines .append (
603647 f"{ area_str :<{id_w }} { row ['status' ]:<14} { hit :>5} { tot :>5} "
604648 f"{ viol :>5} { me :>8.4f} { mx :>8.4f} "
605649 f"{ rmse :>7.3f} { p5 :>7.3f} { med :>7.3f} "
606- f"{ p95 :>7.3f} { wmax :>8.1f} { pct_zero :>5.1f} %"
650+ f"{ p95 :>7.3f} { wmax :>8.1f} { pct_zero :>5.1f} % "
651+ f"{ avg_agi_s } { unusual_s } { ess_s } "
607652 )
608653 if not show_all :
609654 n_omitted = n_areas - len (display_df )
@@ -697,6 +742,39 @@ def generate_report(
697742 )
698743 )
699744
745+ # Write per-area CSV for ALL areas (regardless of which were
746+ # displayed in the table above) for further analysis.
747+ csv_cols = [
748+ "area" ,
749+ "status" ,
750+ "pop_share" ,
751+ "solve_time" ,
752+ "targets_hit" ,
753+ "targets_total" ,
754+ "n_violated" ,
755+ "mean_err" ,
756+ "max_err" ,
757+ "w_rmse" ,
758+ "w_min" ,
759+ "w_p5" ,
760+ "w_median" ,
761+ "w_p95" ,
762+ "w_max" ,
763+ "pct_zero" ,
764+ "n_records" ,
765+ "returns" ,
766+ "agi" ,
767+ "avg_agi" ,
768+ "unusualness" ,
769+ "ess" ,
770+ ]
771+ csv_df = df .reindex (columns = csv_cols )
772+ csv_path = weight_dir / "quality_report_per_area.csv"
773+ csv_df .to_csv (csv_path , index = False )
774+ lines .append ("" )
775+ lines .append (f"Per-area metrics for ALL { n_areas } areas saved to:" )
776+ lines .append (f" { csv_path } " )
777+
700778 report = "\n " .join (lines )
701779 return report
702780
@@ -784,6 +862,117 @@ def _load_report_data(areas, weight_dir):
784862 return tmd , s006 , state_weights , n_loaded
785863
786864
865+ def _compute_per_area_metrics (
866+ areas , target_dir , tmd , s006 , state_weights , pop_shares
867+ ):
868+ """
869+ Per-area metrics computed from weights and target files:
870+ - returns: sum of area weights
871+ - agi: weighted total AGI
872+ - avg_agi: AGI per return
873+ - ess: Kish effective sample size = (sum w)^2 / sum(w^2)
874+ - unusualness: mean over targets of
875+ |area_target - pop_share * national| / |pop_share * national|
876+ where national is computed by applying the target's recipe
877+ (varname, count, scope, agi range, fstatus) to the full TMD
878+ with s006 weighting. The XTOT population target is excluded
879+ because by definition pop_share * national_pop = area_pop.
880+ """
881+ metrics = {}
882+ if "c00100" not in tmd .columns :
883+ return metrics
884+
885+ c00100 = tmd ["c00100" ].values
886+ mars_arr = tmd ["MARS" ].values .astype (float )
887+ data_source_arr = tmd ["data_source" ].values
888+ n_records = len (tmd )
889+ ones_arr = np .ones (n_records , dtype = float )
890+
891+ # Cache national totals per recipe to avoid recomputation across areas
892+ nat_cache = {}
893+
894+ def national_for_recipe (varname , count , scope , lo , hi , fstatus ):
895+ key = (varname , count , scope , lo , hi , fstatus )
896+ if key in nat_cache :
897+ return nat_cache [key ]
898+ if varname not in tmd .columns :
899+ nat_cache [key ] = None
900+ return None
901+ var_arr = tmd [varname ].values .astype (float )
902+ if count == 0 :
903+ vals = var_arr
904+ elif count == 1 :
905+ vals = ones_arr
906+ elif count == 2 :
907+ vals = (var_arr != 0 ).astype (float )
908+ elif count == 3 :
909+ vals = (var_arr > 0 ).astype (float )
910+ elif count == 4 :
911+ vals = (var_arr < 0 ).astype (float )
912+ else :
913+ nat_cache [key ] = None
914+ return None
915+ if scope == 1 :
916+ mask = (data_source_arr == 1 ).astype (float )
917+ elif scope == 2 :
918+ mask = (data_source_arr == 0 ).astype (float )
919+ else :
920+ mask = ones_arr .copy ()
921+ mask = mask * ((c00100 >= lo ) & (c00100 < hi ))
922+ if fstatus > 0 :
923+ mask = mask * (mars_arr == fstatus )
924+ nat = float ((s006 * vals * mask ).sum ())
925+ nat_cache [key ] = nat
926+ return nat
927+
928+ for area in areas :
929+ if area not in state_weights :
930+ continue
931+ w = state_weights [area ]
932+ sum_w = float (w .sum ())
933+ sum_w2 = float ((w * w ).sum ())
934+ returns = sum_w
935+ agi = float ((w * c00100 ).sum ())
936+ avg_agi = agi / returns if returns > 0 else 0.0
937+ ess = (sum_w * sum_w ) / sum_w2 if sum_w2 > 0 else 0.0
938+
939+ unusual = None
940+ pop_share = pop_shares .get (area )
941+ tgt_path = target_dir / f"{ area } _targets.csv"
942+ if pop_share is not None and tgt_path .exists ():
943+ tdf = pd .read_csv (tgt_path , comment = "#" )
944+ gaps = []
945+ for _ , row in tdf .iterrows ():
946+ # Skip the XTOT population row (gap is zero by construction)
947+ if row .varname == "XTOT" and float (row .agilo ) < - 8e99 :
948+ continue
949+ nat = national_for_recipe (
950+ row .varname ,
951+ int (row ["count" ]),
952+ int (row .scope ),
953+ float (row .agilo ),
954+ float (row .agihi ),
955+ int (row .fstatus ),
956+ )
957+ if nat is None :
958+ continue
959+ prop = pop_share * nat
960+ if abs (prop ) < 1.0 :
961+ continue
962+ gaps .append (abs (float (row .target ) - prop ) / abs (prop ))
963+ if gaps :
964+ unusual = float (np .mean (gaps ))
965+
966+ metrics [area ] = {
967+ "returns" : returns ,
968+ "agi" : agi ,
969+ "avg_agi" : avg_agi ,
970+ "ess" : ess ,
971+ "unusualness" : unusual ,
972+ }
973+ return metrics
974+
975+
787976def _weight_distribution_by_stub (
788977 tmd , s006 , state_weights , n_loaded , target_dir
789978):
0 commit comments