Skip to content

Commit af9dfb7

Browse files
authored
Merge pull request #509 from PSLmodels/improve-quality-report
Add average AGI, unusualness, and Kish ESS to quality report and write a per-area CSV for every area on every run
2 parents 73fd9f8 + c486880 commit af9dfb7

1 file changed

Lines changed: 197 additions & 8 deletions

File tree

tmd/areas/quality_report.py

Lines changed: 197 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -522,6 +522,7 @@ def generate_report(
522522

523523
# Load TMD data and weight files (once, shared by all diagnostics)
524524
report_data = _load_report_data(areas, weight_dir)
525+
per_area_metrics = {}
525526
if report_data is not None:
526527
tmd, s006, state_weights, n_loaded = report_data
527528
# Weight distribution by AGI stub (old vs new)
@@ -530,6 +531,31 @@ def generate_report(
530531
tmd, s006, state_weights, n_loaded, target_dir
531532
)
532533
)
534+
pop_shares = {
535+
r["area"]: r.get("pop_share")
536+
for _, r in df.iterrows()
537+
if r.get("pop_share") is not None
538+
}
539+
per_area_metrics = _compute_per_area_metrics(
540+
areas, target_dir, tmd, s006, state_weights, pop_shares
541+
)
542+
543+
# Attach per-area metrics to the dataframe
544+
for col in ("returns", "agi", "avg_agi", "ess", "unusualness"):
545+
df[col] = df["area"].map(
546+
lambda a, c=col: (per_area_metrics.get(a) or {}).get(c)
547+
)
548+
549+
# Compute pct_zero column for CSV / table
550+
def _pct_zero(row):
551+
dist = row.get("dist_bins", {})
552+
n_rec = row.get("n_records", 0) or 0
553+
if not isinstance(dist, dict) or n_rec == 0:
554+
return None
555+
n_zero = dist.get((0.0, 0.0), {}).get("count", 0)
556+
return 100.0 * n_zero / n_rec
557+
558+
df["pct_zero"] = df.apply(_pct_zero, axis=1)
533559

534560
# Per-area table
535561
# For small area counts (states), show all areas.
@@ -568,6 +594,16 @@ def generate_report(
568594
" wP05/wMed/wP95/wMax = multiplier percentiles."
569595
" %zero = records excluded."
570596
)
597+
lines.append(
598+
" AvgAGI = weighted AGI per return ($K)."
599+
" Unusual% = mean |area_target - pop_share*national|"
600+
" / |pop_share*national| across targets"
601+
)
602+
lines.append(
603+
" (0% = looks exactly like the nation; higher ="
604+
" more unlike the nation)."
605+
" ESS = Kish effective sample size (sum w)^2 / sum(w^2)."
606+
)
571607
max_id = max(
572608
(len(str(row["area"])) for _, row in display_df.iterrows()),
573609
default=4,
@@ -577,7 +613,8 @@ def generate_report(
577613
f"{'Area':<{id_w}} {'Status':<14} {'Hit':>5} {'Tot':>5} "
578614
f"{'Viol':>5} {'MeanErr':>8} {'MaxErr':>8} "
579615
f"{'wRMSE':>7} {'wP05':>7} {'wMed':>7} "
580-
f"{'wP95':>7} {'wMax':>8} {'%zero':>6}"
616+
f"{'wP95':>7} {'wMax':>8} {'%zero':>6} "
617+
f"{'AvgAGI':>8} {'Unusual%':>8} {'ESS':>9}"
581618
)
582619
lines.append(header)
583620
lines.append("-" * len(header))
@@ -592,18 +629,26 @@ def generate_report(
592629
med = row.get("w_median", 0)
593630
p95 = row.get("w_p95", 0)
594631
wmax = row.get("w_max", 0)
595-
dist = row.get("dist_bins", {})
596-
n_rec = row.get("n_records", 0)
597-
n_zero = 0
598-
if isinstance(dist, dict):
599-
n_zero = dist.get((0.0, 0.0), {}).get("count", 0)
600-
pct_zero = 100 * n_zero / n_rec if n_rec > 0 else 0
632+
pct_zero = row.get("pct_zero")
633+
if pct_zero is None:
634+
pct_zero = 0.0
635+
avg_agi = row.get("avg_agi")
636+
unusual = row.get("unusualness")
637+
ess = row.get("ess")
638+
avg_agi_s = (
639+
f"${avg_agi / 1000:>6.1f}K" if avg_agi is not None else " -"
640+
)
641+
unusual_s = (
642+
f"{unusual * 100:>7.1f}%" if unusual is not None else " -"
643+
)
644+
ess_s = f"{ess:>9,.0f}" if ess is not None else " -"
601645
area_str = str(row["area"])
602646
lines.append(
603647
f"{area_str:<{id_w}} {row['status']:<14} {hit:>5} {tot:>5} "
604648
f"{viol:>5} {me:>8.4f} {mx:>8.4f} "
605649
f"{rmse:>7.3f} {p5:>7.3f} {med:>7.3f} "
606-
f"{p95:>7.3f} {wmax:>8.1f} {pct_zero:>5.1f}%"
650+
f"{p95:>7.3f} {wmax:>8.1f} {pct_zero:>5.1f}% "
651+
f"{avg_agi_s} {unusual_s} {ess_s}"
607652
)
608653
if not show_all:
609654
n_omitted = n_areas - len(display_df)
@@ -697,6 +742,39 @@ def generate_report(
697742
)
698743
)
699744

745+
# Write per-area CSV for ALL areas (regardless of which were
746+
# displayed in the table above) for further analysis.
747+
csv_cols = [
748+
"area",
749+
"status",
750+
"pop_share",
751+
"solve_time",
752+
"targets_hit",
753+
"targets_total",
754+
"n_violated",
755+
"mean_err",
756+
"max_err",
757+
"w_rmse",
758+
"w_min",
759+
"w_p5",
760+
"w_median",
761+
"w_p95",
762+
"w_max",
763+
"pct_zero",
764+
"n_records",
765+
"returns",
766+
"agi",
767+
"avg_agi",
768+
"unusualness",
769+
"ess",
770+
]
771+
csv_df = df.reindex(columns=csv_cols)
772+
csv_path = weight_dir / "quality_report_per_area.csv"
773+
csv_df.to_csv(csv_path, index=False)
774+
lines.append("")
775+
lines.append(f"Per-area metrics for ALL {n_areas} areas saved to:")
776+
lines.append(f" {csv_path}")
777+
700778
report = "\n".join(lines)
701779
return report
702780

@@ -784,6 +862,117 @@ def _load_report_data(areas, weight_dir):
784862
return tmd, s006, state_weights, n_loaded
785863

786864

865+
def _compute_per_area_metrics(
866+
areas, target_dir, tmd, s006, state_weights, pop_shares
867+
):
868+
"""
869+
Per-area metrics computed from weights and target files:
870+
- returns: sum of area weights
871+
- agi: weighted total AGI
872+
- avg_agi: AGI per return
873+
- ess: Kish effective sample size = (sum w)^2 / sum(w^2)
874+
- unusualness: mean over targets of
875+
|area_target - pop_share * national| / |pop_share * national|
876+
where national is computed by applying the target's recipe
877+
(varname, count, scope, agi range, fstatus) to the full TMD
878+
with s006 weighting. The XTOT population target is excluded
879+
because by definition pop_share * national_pop = area_pop.
880+
"""
881+
metrics = {}
882+
if "c00100" not in tmd.columns:
883+
return metrics
884+
885+
c00100 = tmd["c00100"].values
886+
mars_arr = tmd["MARS"].values.astype(float)
887+
data_source_arr = tmd["data_source"].values
888+
n_records = len(tmd)
889+
ones_arr = np.ones(n_records, dtype=float)
890+
891+
# Cache national totals per recipe to avoid recomputation across areas
892+
nat_cache = {}
893+
894+
def national_for_recipe(varname, count, scope, lo, hi, fstatus):
895+
key = (varname, count, scope, lo, hi, fstatus)
896+
if key in nat_cache:
897+
return nat_cache[key]
898+
if varname not in tmd.columns:
899+
nat_cache[key] = None
900+
return None
901+
var_arr = tmd[varname].values.astype(float)
902+
if count == 0:
903+
vals = var_arr
904+
elif count == 1:
905+
vals = ones_arr
906+
elif count == 2:
907+
vals = (var_arr != 0).astype(float)
908+
elif count == 3:
909+
vals = (var_arr > 0).astype(float)
910+
elif count == 4:
911+
vals = (var_arr < 0).astype(float)
912+
else:
913+
nat_cache[key] = None
914+
return None
915+
if scope == 1:
916+
mask = (data_source_arr == 1).astype(float)
917+
elif scope == 2:
918+
mask = (data_source_arr == 0).astype(float)
919+
else:
920+
mask = ones_arr.copy()
921+
mask = mask * ((c00100 >= lo) & (c00100 < hi))
922+
if fstatus > 0:
923+
mask = mask * (mars_arr == fstatus)
924+
nat = float((s006 * vals * mask).sum())
925+
nat_cache[key] = nat
926+
return nat
927+
928+
for area in areas:
929+
if area not in state_weights:
930+
continue
931+
w = state_weights[area]
932+
sum_w = float(w.sum())
933+
sum_w2 = float((w * w).sum())
934+
returns = sum_w
935+
agi = float((w * c00100).sum())
936+
avg_agi = agi / returns if returns > 0 else 0.0
937+
ess = (sum_w * sum_w) / sum_w2 if sum_w2 > 0 else 0.0
938+
939+
unusual = None
940+
pop_share = pop_shares.get(area)
941+
tgt_path = target_dir / f"{area}_targets.csv"
942+
if pop_share is not None and tgt_path.exists():
943+
tdf = pd.read_csv(tgt_path, comment="#")
944+
gaps = []
945+
for _, row in tdf.iterrows():
946+
# Skip the XTOT population row (gap is zero by construction)
947+
if row.varname == "XTOT" and float(row.agilo) < -8e99:
948+
continue
949+
nat = national_for_recipe(
950+
row.varname,
951+
int(row["count"]),
952+
int(row.scope),
953+
float(row.agilo),
954+
float(row.agihi),
955+
int(row.fstatus),
956+
)
957+
if nat is None:
958+
continue
959+
prop = pop_share * nat
960+
if abs(prop) < 1.0:
961+
continue
962+
gaps.append(abs(float(row.target) - prop) / abs(prop))
963+
if gaps:
964+
unusual = float(np.mean(gaps))
965+
966+
metrics[area] = {
967+
"returns": returns,
968+
"agi": agi,
969+
"avg_agi": avg_agi,
970+
"ess": ess,
971+
"unusualness": unusual,
972+
}
973+
return metrics
974+
975+
787976
def _weight_distribution_by_stub(
788977
tmd, s006, state_weights, n_loaded, target_dir
789978
):

0 commit comments

Comments
 (0)