Skip to content

Commit e32f170

Browse files
committed
Better metadata for label recognition - see HEA-807
1 parent 2bf0134 commit e32f170

1 file changed

Lines changed: 56 additions & 12 deletions

File tree

pipelines/assets/livelihood_activity.py

Lines changed: 56 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -432,7 +432,7 @@ def livelihood_activity_label_recognition_dataframe(
432432
all_other_cash_income_labels_dataframe: pd.DataFrame,
433433
all_wild_foods_labels_dataframe: pd.DataFrame,
434434
all_livelihood_summary_labels_dataframe: pd.DataFrame,
435-
):
435+
) -> Output[dict[str, pd.DataFrame]]:
436436
"""
437437
A saved spreadsheet showing how each BSS label is recognized, either from the ActivityLabel model or a regex.
438438
"""
@@ -570,8 +570,34 @@ def livelihood_activity_label_recognition_dataframe(
570570
how="inner",
571571
)
572572

573-
# Save the dataframes to an Excel workbook
574-
return {"Label Summary": summary_label_df, "All Labels": all_labels_df}
573+
return Output(
574+
{"Label Summary": summary_label_df, "All Labels": all_labels_df},
575+
metadata={
576+
"num_labels": len(all_labels_df),
577+
"num_distinct_labels": len(summary_label_df),
578+
"num_used_labels": len(summary_label_df[summary_label_df["datapoint_count"] > 0]),
579+
"num_recognized_labels": len(summary_label_df[summary_label_df["activity_label"] != ""]),
580+
"pct_recognized_labels": (
581+
round(len(summary_label_df[summary_label_df["activity_label"] != ""]) / len(summary_label_df) * 100, 1)
582+
if len(summary_label_df) > 0
583+
else 0
584+
),
585+
"pct_recognized_used_labels": (
586+
round(
587+
len(
588+
summary_label_df[
589+
(summary_label_df["datapoint_count"] > 0) & (summary_label_df["activity_label"] != "")
590+
]
591+
)
592+
/ len(summary_label_df[summary_label_df["datapoint_count"] > 0])
593+
* 100,
594+
1,
595+
)
596+
if len(summary_label_df) > 0
597+
else 0
598+
),
599+
},
600+
)
575601

576602

577603
def get_instances_from_dataframe(
@@ -642,15 +668,15 @@ def get_instances_from_dataframe(
642668
# Keep the same shape as the non-empty case (label, rows, datapoint_count, in_summary)
643669
unrecognized_labels = pd.DataFrame(columns=["label", "rows", "datapoint_count", "in_summary"])
644670
else:
645-
# Boolean mask of which cells are numeric (coerce non-numeric to NaN then notna)
646-
numeric_mask = unrecognized_labels.loc[:, "B":].apply(lambda col: pd.to_numeric(col, errors="coerce").notna())
647-
# Count numeric datapoints per row
648-
unrecognized_labels["datapoint_count"] = numeric_mask.sum(axis=1)
649-
# Count numeric datapoints per row that are in the summary columns
650-
summary_numeric_mask = unrecognized_labels.loc[:, summary_columns].apply(
651-
lambda col: pd.to_numeric(col, errors="coerce").notna()
671+
# Boolean mask of which cells are used
672+
# Count datapoints per row
673+
unrecognized_labels["datapoint_count"] = unrecognized_labels.loc[:, "B":].apply(
674+
lambda row: sum((row != 0) & (row != "")), axis="columns"
675+
)
676+
# Count datapoints per row that are in the summary columns
677+
unrecognized_labels["summary_datapoint_count"] = unrecognized_labels.loc[:, summary_columns].apply(
678+
lambda row: sum((row != 0) & (row != "")), axis="columns"
652679
)
653-
unrecognized_labels["summary_datapoint_count"] = summary_numeric_mask.sum(axis=1)
654680
# Aggregate datapoint count by label
655681
unrecognized_labels.loc[:, "label"] = prepare_lookup(unrecognized_labels["A"])
656682
unrecognized_labels = (
@@ -1396,7 +1422,25 @@ def get_instances_from_dataframe(
13961422
"pct_rows_recognized": round(
13971423
(
13981424
1
1399-
- len(df.iloc[num_header_rows:][df.iloc[num_header_rows:]["A"].isin(unrecognized_labels["label"])])
1425+
- len(
1426+
df.iloc[num_header_rows:][
1427+
prepare_lookup(df.iloc[num_header_rows:]["A"]).isin(unrecognized_labels["label"])
1428+
]
1429+
)
1430+
/ len(df.iloc[num_header_rows:])
1431+
)
1432+
* 100
1433+
),
1434+
"pct_used_rows_recognized": round(
1435+
(
1436+
1
1437+
- len(
1438+
df.iloc[num_header_rows:][
1439+
prepare_lookup(df.iloc[num_header_rows:]["A"]).isin(
1440+
unrecognized_labels[unrecognized_labels["datapoints"] > 0]["label"]
1441+
)
1442+
]
1443+
)
14001444
/ len(df.iloc[num_header_rows:])
14011445
)
14021446
* 100

0 commit comments

Comments
 (0)