@@ -432,7 +432,7 @@ def livelihood_activity_label_recognition_dataframe(
432432 all_other_cash_income_labels_dataframe : pd .DataFrame ,
433433 all_wild_foods_labels_dataframe : pd .DataFrame ,
434434 all_livelihood_summary_labels_dataframe : pd .DataFrame ,
435- ):
435+ ) -> Output [ dict [ str , pd . DataFrame ]] :
436436 """
437437 A saved spreadsheet showing how each BSS label is recognized, either from the ActivityLabel model or a regex.
438438 """
@@ -570,8 +570,34 @@ def livelihood_activity_label_recognition_dataframe(
570570 how = "inner" ,
571571 )
572572
573- # Save the dataframes to an Excel workbook
574- return {"Label Summary" : summary_label_df , "All Labels" : all_labels_df }
573+ return Output (
574+ {"Label Summary" : summary_label_df , "All Labels" : all_labels_df },
575+ metadata = {
576+ "num_labels" : len (all_labels_df ),
577+ "num_distinct_labels" : len (summary_label_df ),
578+ "num_used_labels" : len (summary_label_df [summary_label_df ["datapoint_count" ] > 0 ]),
579+ "num_recognized_labels" : len (summary_label_df [summary_label_df ["activity_label" ] != "" ]),
580+ "pct_recognized_labels" : (
581+ round (len (summary_label_df [summary_label_df ["activity_label" ] != "" ]) / len (summary_label_df ) * 100 , 1 )
582+ if len (summary_label_df ) > 0
583+ else 0
584+ ),
585+ "pct_recognized_used_labels" : (
586+ round (
587+ len (
588+ summary_label_df [
589+ (summary_label_df ["datapoint_count" ] > 0 ) & (summary_label_df ["activity_label" ] != "" )
590+ ]
591+ )
592+ / len (summary_label_df [summary_label_df ["datapoint_count" ] > 0 ])
593+ * 100 ,
594+ 1 ,
595+ )
596+ if len (summary_label_df ) > 0
597+ else 0
598+ ),
599+ },
600+ )
575601
576602
577603def get_instances_from_dataframe (
@@ -642,15 +668,15 @@ def get_instances_from_dataframe(
642668 # Keep the same shape as the non-empty case (label, rows, datapoint_count, in_summary)
643669 unrecognized_labels = pd .DataFrame (columns = ["label" , "rows" , "datapoint_count" , "in_summary" ])
644670 else :
645- # Boolean mask of which cells are numeric (coerce non-numeric to NaN then notna)
646- numeric_mask = unrecognized_labels .loc [:, "B" :].apply (lambda col : pd .to_numeric (col , errors = "coerce" ).notna ())
647- # Count numeric datapoints per row
648- unrecognized_labels ["datapoint_count" ] = numeric_mask .sum (axis = 1 )
649- # Count numeric datapoints per row that are in the summary columns
650- summary_numeric_mask = unrecognized_labels .loc [:, summary_columns ].apply (
651- lambda col : pd .to_numeric (col , errors = "coerce" ).notna ()
671+ # A cell counts as a datapoint when it is neither 0 nor an empty string
672+ # Count datapoints per row
673+ unrecognized_labels ["datapoint_count" ] = unrecognized_labels .loc [:, "B" :].apply (
674+ lambda row : sum ((row != 0 ) & (row != "" )), axis = "columns"
675+ )
676+ # Count datapoints per row that are in the summary columns
677+ unrecognized_labels ["summary_datapoint_count" ] = unrecognized_labels .loc [:, summary_columns ].apply (
678+ lambda row : sum ((row != 0 ) & (row != "" )), axis = "columns"
652679 )
653- unrecognized_labels ["summary_datapoint_count" ] = summary_numeric_mask .sum (axis = 1 )
654680 # Aggregate datapoint count by label
655681 unrecognized_labels .loc [:, "label" ] = prepare_lookup (unrecognized_labels ["A" ])
656682 unrecognized_labels = (
@@ -1396,7 +1422,25 @@ def get_instances_from_dataframe(
13961422 "pct_rows_recognized" : round (
13971423 (
13981424 1
1399- - len (df .iloc [num_header_rows :][df .iloc [num_header_rows :]["A" ].isin (unrecognized_labels ["label" ])])
1425+ - len (
1426+ df .iloc [num_header_rows :][
1427+ prepare_lookup (df .iloc [num_header_rows :]["A" ]).isin (unrecognized_labels ["label" ])
1428+ ]
1429+ )
1430+ / len (df .iloc [num_header_rows :])
1431+ )
1432+ * 100
1433+ ),
1434+ "pct_used_rows_recognized" : round (
1435+ (
1436+ 1
1437+ - len (
1438+ df .iloc [num_header_rows :][
1439+ prepare_lookup (df .iloc [num_header_rows :]["A" ]).isin (
1440+ unrecognized_labels [unrecognized_labels ["datapoints" ] > 0 ]["label" ]
1441+ )
1442+ ]
1443+ )
14001444 / len (df .iloc [num_header_rows :])
14011445 )
14021446 * 100
0 commit comments