Skip to content

Commit 0a15130

Browse files
authored
Merge pull request #198 from American-Institutes-for-Research/HEA-807/recognize_bare_product_labels
Better handling of summary activities - see HEA-807
2 parents a4a0d9f + a3e319c commit 0a15130

1 file changed

Lines changed: 92 additions & 53 deletions

File tree

pipelines/assets/livelihood_activity.py

Lines changed: 92 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -744,8 +744,14 @@ def get_instances_from_dataframe(
744744
continue
745745

746746
# When we process the values for the LivelihoodActivity records, we need to know the actual attribute
747-
# that the values in this row are for
748-
activity_attribute = label_attributes["attribute"]
747+
# that the values in this row are for. Livelihood Summary rows are grouped by percentage_kcals, income and
748+
# expenditure, so we can keep the activity_attribute from the previous Livelihood Strategy if it hasn't
749+
# been set by the label_attributes.
750+
if (
751+
activity_type != ActivityLabel.LivelihoodActivityType.LIVELIHOOD_SUMMARY
752+
or label_attributes["attribute"]
753+
):
754+
activity_attribute = label_attributes["attribute"]
749755

750756
if label_attributes["is_start"]:
751757
# We are starting a new livelihood activity, so append the previous livelihood strategy
@@ -789,30 +795,14 @@ def get_instances_from_dataframe(
789795
"Found Livelihood Activities from row %s, but there is no Livelihood Strategy defined." % row
790796
)
791797

792-
# Copy the attribute from the previous livelihood strategy if this is a Livelihood Summary and the
793-
# attribute hasn't been set by the label_attributes.
794-
if (
795-
activity_type == ActivityLabel.LivelihoodActivityType.LIVELIHOOD_SUMMARY
796-
and not activity_attribute
797-
and previous_livelihood_strategy
798-
and previous_livelihood_activities_for_strategy
799-
):
800-
for attribute in ["income", "expenditure", "percentage_kcals"]:
801-
if attribute in previous_livelihood_activities_for_strategy[0]:
802-
activity_attribute = attribute
803-
break
804-
if not activity_attribute:
805-
raise ValueError(
806-
f"Could not determine attribute for Livelihood Summary strategy from row {row}"
807-
)
808-
809798
# Copy the product_id for MilkProduction and ButterProduction from the previous livelihood strategy
810799
# if necessary.
811800
if (
812801
livelihood_strategy["strategy_type"] in ["MilkProduction", "ButterProduction"]
813802
and ("product_id" not in livelihood_strategy or not livelihood_strategy["product_id"])
814803
and livelihood_strategy["season"] == season2_name
815804
and previous_livelihood_strategy
805+
and "product_id" in previous_livelihood_strategy
816806
and previous_livelihood_strategy["product_id"]
817807
):
818808
livelihood_strategy["attribute_rows"]["product_id"] = row
@@ -1071,15 +1061,45 @@ def get_instances_from_dataframe(
10711061

10721062
# Headings like CROP PRODUCTION: set the strategy type for subsequent rows.
10731063
# Some other labels imply specific strategy types, such as MilkProduction, MeatProduction or LivestockSales
1074-
if label_attributes["strategy_type"]:
1064+
# For Livelihood Summary activities, the strategy_type is always set from the label_attributes.
1065+
if (
1066+
label_attributes["strategy_type"]
1067+
or activity_type == ActivityLabel.LivelihoodActivityType.LIVELIHOOD_SUMMARY
1068+
):
10751069
strategy_type = label_attributes["strategy_type"]
1070+
1071+
# In the Summary section at the top of the Data worksheet, many of the labels are ambiguous but the
1072+
# rows are organized into percentage_kcals, income and expenditure sections. Therefore, we can set the
1073+
# Strategy Type based on the activity_attribute.
1074+
if strategy_type == "ReliefGift_or_Purchase":
1075+
if activity_attribute in ("percentage_kcals", "income"):
1076+
strategy_type = LivelihoodStrategyType.RELIEF_GIFT_OTHER
1077+
elif activity_attribute == "expenditure":
1078+
strategy_type = LivelihoodStrategyType.OTHER_PURCHASE
1079+
else:
1080+
errors.append(
1081+
"Invalid strategy_type %s for attribute %s from label '%s'"
1082+
% (strategy_type, activity_attribute, label)
1083+
)
1084+
activity_attribute = None
1085+
elif strategy_type == "CashIncome_or_Purchase":
1086+
if activity_attribute == "income":
1087+
strategy_type = LivelihoodStrategyType.OTHER_CASH_INCOME
1088+
elif activity_attribute == "expenditure":
1089+
strategy_type = LivelihoodStrategyType.OTHER_PURCHASE
1090+
else:
1091+
errors.append(
1092+
"Invalid strategy_type %s for attribute %s from label '%s'"
1093+
% (strategy_type, activity_attribute, label)
1094+
)
1095+
activity_attribute = None
1096+
10761097
# Get the valid fields names so we can determine if the attribute is stored in LivelihoodActivity.extra
1077-
# LivestockProduction is an artificial, composite strategy type representing the sum of
1078-
# MilkProduction, ButterProduction and MeatProduction. It isn't stored in the database, and it only
1079-
# requires income, expenditure and kcals_consumed, so we use the base LivelihoodActivity model.
1098+
# Livelihood Summary activities only contain kcals, income and expenditure, and aren't stored in
1099+
# the database, so can use the base LivelihoodActivity model.
10801100
model = (
10811101
LivelihoodActivity
1082-
if strategy_type == "LivestockProduction"
1102+
if activity_type == ActivityLabel.LivelihoodActivityType.LIVELIHOOD_SUMMARY
10831103
else class_from_name(f"baseline.models.{strategy_type}")
10841104
)
10851105
activity_field_names = [field.name for field in model._meta.concrete_fields]
@@ -1419,36 +1439,53 @@ def get_instances_from_dataframe(
14191439
"num_livelihood_strategies": len(livelihood_strategies),
14201440
"num_livelihood_activities": len(livelihood_activities),
14211441
"num_unrecognized_labels": len(unrecognized_labels),
1422-
"pct_rows_recognized": round(
1423-
(
1424-
1
1425-
- len(
1426-
df.iloc[num_header_rows:][
1427-
prepare_lookup(df.iloc[num_header_rows:]["A"]).isin(unrecognized_labels["label"])
1428-
]
1429-
)
1430-
/ len(df.iloc[num_header_rows:])
1431-
)
1432-
* 100
1433-
),
1434-
"pct_used_rows_recognized": round(
1435-
(
1436-
1
1437-
- len(
1438-
df.iloc[num_header_rows:][
1439-
prepare_lookup(df.iloc[num_header_rows:]["A"]).isin(
1440-
unrecognized_labels[unrecognized_labels["datapoints"] > 0]["label"]
1441-
)
1442-
]
1443-
)
1444-
/ len(df.iloc[num_header_rows:])
1445-
)
1446-
* 100
1447-
),
1448-
"preview": MetadataValue.md(f"```json\n{json.dumps(result, indent=4, ensure_ascii=False)}\n```"),
14491442
}
14501443
if not unrecognized_labels.empty:
14511444
metadata["unrecognized_labels"] = MetadataValue.md(unrecognized_labels.to_markdown(index=False))
1445+
metadata["pct_rows_recognized"] = round(
1446+
(
1447+
1
1448+
- len(
1449+
df.iloc[num_header_rows:][
1450+
prepare_lookup(df.iloc[num_header_rows:]["A"]).isin(unrecognized_labels["label"])
1451+
]
1452+
)
1453+
/ len(df.iloc[num_header_rows:])
1454+
)
1455+
* 100,
1456+
1,
1457+
)
1458+
metadata["pct_used_rows_recognized"] = round(
1459+
(
1460+
1
1461+
- len(
1462+
df.iloc[num_header_rows:][
1463+
prepare_lookup(df.iloc[num_header_rows:]["A"]).isin(
1464+
unrecognized_labels[unrecognized_labels["datapoints"] > 0]["label"]
1465+
)
1466+
]
1467+
)
1468+
/ len(df.iloc[num_header_rows:])
1469+
)
1470+
* 100,
1471+
1,
1472+
)
1473+
metadata["pct_used_summary_rows_recognized"] = round(
1474+
(
1475+
1
1476+
- len(
1477+
df.iloc[num_header_rows:][
1478+
prepare_lookup(df.iloc[num_header_rows:]["A"]).isin(
1479+
unrecognized_labels[unrecognized_labels["summary_datapoints"] > 0]["label"]
1480+
)
1481+
]
1482+
)
1483+
/ len(df.iloc[num_header_rows:])
1484+
)
1485+
* 100,
1486+
1,
1487+
)
1488+
metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(result, indent=4, ensure_ascii=False)}\n```")
14521489

14531490
if errors:
14541491
if config.strict:
@@ -1515,10 +1552,12 @@ def get_annotated_instances_from_dataframe(
15151552
# Annotate the output metadata with completeness information
15161553
# Get the summary dataframe, grouped by strategy_type
15171554
summary_df = pd.DataFrame(reported_summary_output.value["LivelihoodActivity"])
1518-
for col in ["income", "expenditure", "kcals_consumed"]:
1555+
for col in ["income", "expenditure", "percentage_kcals", "kcals_consumed"]:
15191556
summary_df[col] = pd.to_numeric(summary_df[col], errors="coerce").fillna(0)
15201557
summary_df = (
1521-
summary_df[["strategy_type", "income", "expenditure", "kcals_consumed"]].groupby("strategy_type").sum()
1558+
summary_df[["strategy_type", "income", "expenditure", "percentage_kcals", "kcals_consumed"]]
1559+
.groupby("strategy_type")
1560+
.sum()
15221561
)
15231562

15241563
# Add the recognized Livelihood Activities, also grouped by strategy_type

0 commit comments

Comments
 (0)