Skip to content

Commit 706b03f

Browse files
authored
Merge pull request #286 from American-Institutes-for-Research/HEA-1084/better_completeness_summary
Improved completeness summary - see HEA-1084
2 parents bd00b7d + 8295410 commit 706b03f

1 file changed

Lines changed: 126 additions & 77 deletions

File tree

pipelines/assets/livelihood_activity.py

Lines changed: 126 additions & 77 deletions
Original file line number | Diff line number | Diff line change
@@ -91,6 +91,7 @@
9191
ActivityLabel,
9292
LivelihoodActivityScenario,
9393
LivelihoodStrategyType,
94+
WealthGroupCategory,
9495
)
9596

9697
# Indexes of header rows in the Data3 dataframe (wealth_group_category, district, village, household size)
@@ -2002,110 +2003,158 @@ def get_annotated_instances_from_dataframe(
20022003
)
20032004

20042005
# Annotate the output metadata with completeness information
2005-
# Get the summary dataframe, grouped by strategy_type
2006+
# Get the summary dataframe, filtered to the BaselineLivelihoodActivities and grouped by strategy_type and wealth_group_category
20062007
summary_df = pd.DataFrame(reported_summary_output.value["LivelihoodActivity"])
2008+
summary_df = summary_df[
2009+
(summary_df["scenario"] == LivelihoodActivityScenario.BASELINE) # Baseline activities only
2010+
& summary_df["wealth_group"].apply(lambda x: x[3] == "") # Baseline-level activities, not community ones.
2011+
]
20072012
for col in ["income", "expenditure", "percentage_kcals", "kcals_consumed"]:
20082013
summary_df[col] = pd.to_numeric(summary_df[col], errors="coerce").fillna(0)
2014+
summary_df["wealth_group_category"] = summary_df["wealth_group"].apply(lambda x: x[2])
20092015
summary_df = (
2010-
summary_df[["strategy_type", "income", "expenditure", "percentage_kcals", "kcals_consumed"]]
2011-
.groupby("strategy_type")
2016+
summary_df[
2017+
[
2018+
"strategy_type",
2019+
"wealth_group_category",
2020+
"income",
2021+
"expenditure",
2022+
"percentage_kcals",
2023+
"kcals_consumed",
2024+
]
2025+
]
2026+
.groupby(["strategy_type", "wealth_group_category"])
20122027
.sum()
20132028
)
20142029

2015-
# Add the recognized Livelihood Activities, also grouped by strategy_type
2030+
# Add the recognized Livelihood Activities, also filtered to the BaselineLivelihoodActivities and grouped by strategy_type and wealth_group_category
20162031
recognized_activities_df = pd.DataFrame(output.value["LivelihoodActivity"])
2017-
for column in ["income", "expenditure", "kcals_consumed"]:
2032+
recognized_activities_df = recognized_activities_df[
2033+
(recognized_activities_df["scenario"] == LivelihoodActivityScenario.BASELINE)
2034+
& recognized_activities_df["wealth_group"].apply(
2035+
lambda x: x[3] == ""
2036+
) # Baseline-level activities, not community ones.
2037+
]
2038+
for column in ["income", "expenditure", "percentage_kcals", "kcals_consumed"]:
20182039
if column in recognized_activities_df:
20192040
recognized_activities_df[column] = pd.to_numeric(
20202041
recognized_activities_df[column], errors="coerce"
20212042
).fillna(0)
20222043
else:
20232044
recognized_activities_df[column] = 0
2045+
recognized_activities_df["wealth_group_category"] = recognized_activities_df["wealth_group"].apply(
2046+
lambda x: x[2]
2047+
)
2048+
recognized_activities_df = (
2049+
recognized_activities_df[
2050+
[
2051+
"strategy_type",
2052+
"wealth_group_category",
2053+
"income",
2054+
"expenditure",
2055+
"percentage_kcals",
2056+
"kcals_consumed",
2057+
]
2058+
]
2059+
.groupby(["strategy_type", "wealth_group_category"])
2060+
.sum()
2061+
)
2062+
20242063
summary_df = summary_df.join(
2025-
recognized_activities_df[["strategy_type", "income", "expenditure", "kcals_consumed"]]
2026-
.groupby("strategy_type")
2027-
.sum(),
2028-
on="strategy_type",
2064+
recognized_activities_df,
2065+
on=["strategy_type", "wealth_group_category"],
20292066
lsuffix="_summary",
20302067
rsuffix="_recognized",
20312068
).fillna(0)
20322069

2033-
# Add a totals row at the end
2034-
summary_df.loc["Total"] = summary_df.sum(numeric_only=True)
2035-
2036-
# Add completeness percentages
2037-
summary_df = summary_df.round(0)
2038-
summary_df["income_completeness"] = summary_df.apply(
2039-
lambda row: (
2040-
round(row["income_recognized"] / row["income_summary"] * 100, 1)
2041-
if row["income_summary"] > 0
2042-
else pd.NA
2043-
),
2044-
axis=1,
2045-
)
2046-
summary_df["expenditure_completeness"] = summary_df.apply(
2047-
lambda row: (
2048-
round(row["expenditure_recognized"] / row["expenditure_summary"] * 100, 1)
2049-
if row["expenditure_summary"] > 0
2050-
else pd.NA
2051-
),
2052-
axis=1,
2053-
)
2054-
summary_df["kcals_consumed_completeness"] = summary_df.apply(
2055-
lambda row: (
2056-
round(row["kcals_consumed_recognized"] / row["kcals_consumed_summary"] * 100, 1)
2057-
if row["kcals_consumed_summary"] > 0
2058-
else pd.NA
2059-
),
2060-
axis=1,
2061-
)
2062-
# Format the numbers as integers, for better display in the markdown table
2063-
for column in ["income", "expenditure", "kcals_consumed"]:
2064-
for source in ["recognized", "summary"]:
2065-
summary_df[f"{column}_{source}"] = summary_df.apply(
2066-
lambda row: (
2067-
int(row[f"{column}_{source}"])
2068-
if (pd.notna(row[f"{column}_recognized"]) and row[f"{column}_recognized"] > 0)
2069-
or (pd.notna(row[f"{column}_summary"]) and row[f"{column}_summary"] > 0)
2070-
else pd.NA
2071-
),
2072-
axis="columns",
2073-
)
2074-
2075-
# Transpose and reorder the columns and rows
2076-
# Sort the rows so that Strategy Types appear in the same order as in the BSS
2070+
# Sort the rows so that Strategy Types and Wealth Group Categories appear in the same order as in the BSS
2071+
summary_df = summary_df.reset_index(drop=False)
20772072
ordered_strategy_types = ["LivestockProduction"] + [x for x in LivelihoodStrategyType] + ["Total"]
20782073
summary_df["strategy_type"] = pd.Categorical(
2079-
summary_df.reset_index(drop=False)["strategy_type"],
2074+
summary_df["strategy_type"],
20802075
categories=ordered_strategy_types,
20812076
ordered=True,
20822077
)
2083-
summary_df = summary_df.reset_index(drop=True).sort_values(by="strategy_type")
2084-
summary_df = summary_df[
2085-
[
2086-
"strategy_type",
2087-
"kcals_consumed_recognized",
2088-
"kcals_consumed_summary",
2089-
"kcals_consumed_completeness",
2090-
"income_recognized",
2091-
"income_summary",
2092-
"income_completeness",
2093-
"expenditure_recognized",
2094-
"expenditure_summary",
2095-
"expenditure_completeness",
2078+
ordered_wealth_group_categories = list(
2079+
WealthGroupCategory.objects.all().order_by("ordering").values_list("code", flat=True)
2080+
) + ["All"]
2081+
summary_df["wealth_group_category"] = pd.Categorical(
2082+
summary_df["wealth_group_category"],
2083+
categories=ordered_wealth_group_categories,
2084+
ordered=True,
2085+
)
2086+
summary_df = summary_df.sort_values(by=["strategy_type", "wealth_group_category"])
2087+
2088+
completeness_dfs = {}
2089+
for column in ["income", "expenditure", "percentage_kcals", "kcals_consumed"]:
2090+
completeness_df = summary_df[
2091+
["strategy_type", "wealth_group_category", f"{column}_recognized", f"{column}_summary"]
2092+
]
2093+
completeness_df.columns = ["strategy_type", "wealth_group_category", "recognized", "summary"]
2094+
2095+
# Ignore irrelevant strategy types, i.e. those whose combined recognized and summary total for this column is not greater than zero
2096+
completeness_df = completeness_df[
2097+
completeness_df["strategy_type"].isin(
2098+
completeness_df.groupby("strategy_type")[["recognized", "summary"]]
2099+
.sum()
2100+
.sum(axis=1)
2101+
.loc[lambda x: x > 0]
2102+
.index
2103+
)
20962104
]
2097-
]
2098-
summary_df = summary_df.set_index("strategy_type").transpose()
2105+
2106+
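# Add a "Total" row for each wealth group category: unstack so the categories become columns, sum over the strategy types, then stack back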
2107+
completeness_df = completeness_df.set_index(["strategy_type", "wealth_group_category"])
2108+
completeness_df = completeness_df.unstack()
2109+
completeness_df.loc["Total"] = completeness_df.sum()
2110+
completeness_df = completeness_df.stack()
2111+
# Add the difference
2112+
completeness_df["unrecognized"] = completeness_df["summary"] - completeness_df["recognized"]
2113+
# Add the completeness percentage
2114+
completeness_df[f"{column}_completeness"] = completeness_df.apply(
2115+
lambda row: (round(row["recognized"] / row["summary"] * 100, 1) if row["summary"] > 0 else pd.NA),
2116+
axis=1,
2117+
)
2118+
# Format the numbers as integers, for better display in the markdown table
2119+
if column in ["income", "expenditure", "kcals_consumed"]:
2120+
for source in ["recognized", "summary", "unrecognized"]:
2121+
completeness_df[source] = completeness_df.apply(
2122+
lambda row: (
2123+
int(row[source])
2124+
if (pd.notna(row["recognized"]) and row["recognized"] > 0)
2125+
or (pd.notna(row["summary"]) and row["summary"] > 0)
2126+
else pd.NA
2127+
),
2128+
axis=1,
2129+
)
2130+
else:
2131+
# Format percentage_kcals as percentages.
2132+
for source in ["recognized", "summary", "unrecognized"]:
2133+
completeness_df[source] = (completeness_df[source] * 100).round(1).replace(-0.0, 0.0)
2134+
2135+
completeness_dfs[column] = completeness_df
20992136

21002137
# Add the completeness summary to the output metadata
2101-
output.metadata["pct_kcals_consumed_recognized"] = float(
2102-
summary_df.loc["kcals_consumed_completeness", "Total"]
2103-
)
2104-
output.metadata["pct_income_recognized"] = float(summary_df.loc["income_completeness", "Total"])
2105-
output.metadata["pct_expenditure_recognized"] = float(summary_df.loc["expenditure_completeness", "Total"])
2106-
output.metadata["completeness_summary"] = MetadataValue.md(
2107-
summary_df.replace(pd.NA, None).to_markdown(floatfmt=",.0f")
2108-
)
2138+
def get_overall_recognized_percentage(metric: str) -> int:
2139+
recognized_total = completeness_dfs[metric].loc["Total", "recognized"].sum()
2140+
summary_total = completeness_dfs[metric].loc["Total", "summary"].sum()
2141+
return round(recognized_total / summary_total * 100) if summary_total > 0 else 0
2142+
2143+
output.metadata["pct_income_recognized"] = get_overall_recognized_percentage("income")
2144+
output.metadata["pct_expenditure_recognized"] = get_overall_recognized_percentage("expenditure")
2145+
output.metadata["pct_kcals_consumed_recognized"] = get_overall_recognized_percentage("kcals_consumed")
2146+
completeness_summary = "### Completeness of recognized Livelihood Activities compared to the summary for Baseline-level activities\n"
2147+
for column in ["income", "expenditure", "percentage_kcals", "kcals_consumed"]:
2148+
completeness_summary += f"<details>\n\n<summary>{column.replace('_', ' ').title()}</summary>\n\n"
2149+
completeness_summary += f"#### {column.replace('_', ' ').title()}\n\n"
2150+
completeness_summary += (
2151+
completeness_dfs[column]
2152+
.replace(pd.NA, None)
2153+
.reset_index()
2154+
.to_markdown(index=False, floatfmt=",.1f", intfmt=",")
2155+
)
2156+
completeness_summary += "\n\n</details>\n\n"
2157+
output.metadata["completeness_summary"] = MetadataValue.md(completeness_summary)
21092158

21102159
# Move the preview and errors metadata item to the end of the dict
21112160
if "errors" in output.metadata:

0 commit comments

Comments
 (0)