diff --git a/src/scripts/costing/cost_estimation.py b/src/scripts/costing/cost_estimation.py index 02d0971955..594c2d1d01 100644 --- a/src/scripts/costing/cost_estimation.py +++ b/src/scripts/costing/cost_estimation.py @@ -156,7 +156,8 @@ def get_discount_factor(year): # Compute the cumulative discount factor as the product of (1 + discount_rate) for all previous years discount_factor = 1 for y in range(_initial_year + 1, - year + 1): # only starting from initial year + 1 as the discount factor for initial year should be 1 + year + 1): # only starting from initial year + 1 as the discount factor for initial year + # should be 1 discount_factor *= (1 + _discount_rate.get(y, 0)) # Default to 0 if year not in dictionary return discount_factor else: @@ -282,6 +283,7 @@ def clean_equipment_name(name: str, equipment_drop_list = None) -> str: def estimate_input_cost_of_scenarios(results_folder: Path, resourcefilepath: Path, + suspended_results_folder: Path = None, _draws: Optional[list[int]] = None, _runs: Optional[list[int]] = None, summarize: bool = False, @@ -298,6 +300,10 @@ def estimate_input_cost_of_scenarios(results_folder: Path, Path to the directory containing simulation output files. resourcefilepath : Path, optional Path to the resource files + suspended_results_folder: Path, optional + Path to the directory containing suspended simulation output files (using the suspend and resume functionality), + This is used to extract the scaling_factor to scale result to actual population size. If None, then the + 'scaling_factor' is obtained from the results_folder. _draws : list, optional Specific draws to include in the cost estimation. Defaults to all available draws. _runs : list, optional @@ -316,8 +322,10 @@ def estimate_input_cost_of_scenarios(results_folder: Path, Returns: ------- pd.DataFrame - A dataframe containing discounted costs disaggregated by category, sub-category, category-specific subgroup, year, draw, and run. 
- Note that if a discount rate is used, the dataframe will provide cost as the NPV during the first year of the dataframe + A dataframe containing discounted costs disaggregated by category, sub-category, category-specific subgroup, + year, draw, and run. + Note that if a discount rate is used, the dataframe will provide cost as the NPV during the first year of the + dataframe """ # Useful common functions @@ -343,6 +351,8 @@ def melt_model_output_draws_and_runs(_df, id_vars): _draws = range(0, info['number_of_draws']) if _runs is None: _runs = range(0, info['runs_per_draw']) + if suspended_results_folder is None: + suspended_results_folder = results_folder # Load cost input files # ------------------------ @@ -354,7 +364,8 @@ def melt_model_output_draws_and_runs(_df, id_vars): facility_id_levels_dict = dict(zip(mfl['Facility_ID'], mfl['Facility_Level'])) fac_levels = set(mfl.Facility_Level) - # If variable discount rate is provided, use the average across the relevant years for the purpose of annuitization of HR and equipment costs + # If variable discount rate is provided, use the average across the relevant years for the purpose of annuitization + # of HR and equipment costs def calculate_annuitization_rate(_discount_rate, _years): if isinstance(_discount_rate, (int, float)): # Single discount rate, return as is @@ -477,25 +488,68 @@ def merge_cost_and_model_data(cost_df, model_df, varnames): return merged_df # Get available staff count for each year and draw - def get_staff_count_by_facid_and_officer_type(_df: pd.Series) -> pd.Series: - """Summarise the parsed logged-key results for one draw (as dataframe) into a pd.Series.""" - _df = _df.set_axis(_df['date'].dt.year).drop(columns=['date']) - _df.index.name = 'year' + def get_staff_count_by_facid_and_officer_type(_df: pd.DataFrame) -> pd.Series: + """ + Convert logged staff dictionary output into tidy format, + summing staff counts across all clinic columns. 
+ + Returns pd.Series indexed by: + (year, FacilityID, Officer) + """ - def change_to_standard_flattened_index_format(col): - parts = col.split("_", 3) # Split by "_" only up to 3 parts - if len(parts) > 2: - return parts[0] + "=" + parts[1] + "|" + parts[2] + "=" + parts[ - 3] # Rejoin with "I" at the second occurrence - return col # If there's no second underscore, return the string as it is + df = _df.copy() + df["year"] = df["date"].dt.year + df = df.drop(columns=["date"]) - _df.columns = [change_to_standard_flattened_index_format(col) for col in _df.columns] + clinic_cols = df.columns.difference(["year"]) - return unflatten_flattened_multi_index_in_logging(_df).stack(level=[0, 1]) # expanded flattened axis + long_frames = [] + + for clinic in clinic_cols: + expanded = df[[clinic, "year"]].copy() + expanded = expanded[expanded[clinic].notna()] + + expanded_dict = expanded[clinic].apply(pd.Series) + expanded_dict["year"] = expanded["year"].values + + long_frames.append(expanded_dict) + + # Combine all clinics + combined = pd.concat(long_frames, ignore_index=True) + + # Melt to long format + long_df = ( + combined + .melt(id_vars=["year"], + var_name="facility_officer", + value_name="count") + .dropna(subset=["count"]) + ) + + # Split FacilityID and Officer + parts = long_df["facility_officer"].str.split("_Officer_", expand=True) + + long_df["FacilityID"] = ( + parts[0] + .str.replace("FacilityID_", "", regex=False) + .astype(int) + ) + long_df["Officer"] = parts[1] + + # SUM ACROSS CLINICS HERE + result = ( + long_df + .groupby(["year", "FacilityID", "Officer"])["count"] + .sum() + .sort_index() + ) + + return result # Staff count by Facility ID available_staff_count_by_facid_and_officertype = extract_results( Path(results_folder), + suspended_results_folder=suspended_results_folder, module='tlo.methods.healthsystem.summary', key='number_of_hcw_staff', custom_generate_series=get_staff_count_by_facid_and_officer_type, @@ -519,22 +573,71 @@ def 
change_to_standard_flattened_index_format(col): 'Facility_Level'].astype(str) # make sure facility level is stored as string available_staff_count_by_level_and_officer_type = available_staff_count_by_level_and_officer_type.drop( available_staff_count_by_level_and_officer_type[available_staff_count_by_level_and_officer_type[ - 'Facility_Level'] == '5'].index) # drop headquarters because we're only concerned with staff engaged in service delivery + 'Facility_Level'] == '5'].index) # drop headquarters + # because we're only concerned with staff engaged in service delivery available_staff_count_by_level_and_officer_type.rename(columns={'value': 'staff_count'}, inplace=True) # Get list of cadres which were utilised in each run to get the count of staff used in the simulation - # Note that we still cost the full staff count for any cadre-Facility_Level combination that was ever used in a run, and - # not the amount of time which was used - def get_capacity_used_by_officer_type_and_facility_level(_df: pd.Series) -> pd.Series: - """Summarise the parsed logged-key results for one draw (as dataframe) into a pd.Series.""" - _df = _df.set_axis(_df['date'].dt.year).drop(columns=['date']) - _df.index.name = 'year' - return unflatten_flattened_multi_index_in_logging(_df).stack(level=[0, 1]) # expanded flattened axis + # Note that we still cost the full staff count for any cadre-Facility_Level combination that was ever used in a run, + # and not the amount of time which was used + def get_capacity_used_by_officer_type_and_facility_level( + _df: pd.DataFrame + ) -> pd.Series: + """ + Parse logging output and return a Series indexed by: + (year, OfficerType, FacilityLevel) + + Collapses (sums) across clinics. + Uses facility_id_levels_dict to map FacilityID → FacilityLevel. + """ + + # ---- 1. Set year index ---- + _df = _df.set_axis(_df["date"].dt.year).drop(columns=["date"]) + _df.index.name = "year" + + # ---- 2. 
Unflatten logging columns ---- + _df = unflatten_flattened_multi_index_in_logging(_df) + + # Expect columns like: + # ('Clinic', 'facID_and_officer') + + col_df = _df.columns.to_frame(index=False) + + # ---- 3. Extract OfficerType ---- + col_df["OfficerType"] = ( + col_df["facID_and_officer"] + .str.split("_Officer_") + .str[-1] + ) + + # ---- 4. Extract FacilityID ---- + col_df["FacilityID"] = ( + col_df["facID_and_officer"] + .str.split("_Officer_") + .str[0] + .str.replace("FacilityID_", "", regex=False) + .astype(int) + ) + + # ---- 5. Map to FacilityLevel ---- + col_df["FacilityLevel"] = col_df["FacilityID"].map(facility_id_levels_dict) + + # ---- 6. Rebuild MultiIndex (drop clinic level) ---- + _df.columns = pd.MultiIndex.from_frame( + col_df[["OfficerType", "FacilityLevel"]] + ) + + # ---- 7. Collapse across clinics ---- + _df = _df.groupby(level=["OfficerType", "FacilityLevel"], axis=1).sum() + + # ---- 8. Return stacked format ---- + return _df.stack(["OfficerType", "FacilityLevel"]) annual_capacity_used_by_cadre_and_level = extract_results( Path(results_folder), + suspended_results_folder=suspended_results_folder, module='tlo.methods.healthsystem.summary', - key='Capacity_By_OfficerType_And_FacilityLevel', + key='Capacity_By_FacID_and_Officer', custom_generate_series=get_capacity_used_by_officer_type_and_facility_level, do_scaling=False, ) @@ -552,7 +655,10 @@ def get_capacity_used_by_officer_type_and_facility_level(_df: pd.Series) -> pd.S average_capacity_used_by_cadre_and_level[average_capacity_used_by_cadre_and_level['capacity_used'] != 0][ ['OfficerType', 'FacilityLevel', 'draw', 'run']] print( - f"Out of {average_capacity_used_by_cadre_and_level.groupby(['OfficerType', 'FacilityLevel']).size().count()} cadre and level combinations available, {list_of_cadre_and_level_combinations_used.groupby(['OfficerType', 'FacilityLevel']).size().count()} are used across the simulations") + f"Out of {average_capacity_used_by_cadre_and_level.groupby(['OfficerType', 
'FacilityLevel']).size().count()} " + f"cadre and level combinations available, " + f"{list_of_cadre_and_level_combinations_used.groupby(['OfficerType', 'FacilityLevel']).size().count()} " + f"are used across the simulations") list_of_cadre_and_level_combinations_used = list_of_cadre_and_level_combinations_used.rename( columns={'FacilityLevel': 'Facility_Level'}) @@ -564,11 +670,13 @@ def get_capacity_used_by_officer_type_and_facility_level(_df: pd.Series) -> pd.S if (cost_only_used_staff): print( - "The input for 'cost_only_used_staff' implies that only cadre-level combinations which have been used in the run are costed") + "The input for 'cost_only_used_staff' implies that only cadre-level combinations which have been used in " + "the run are costed") staff_size_chosen_for_costing = used_staff_count_by_level_and_officer_type else: print( - "The input for 'cost_only_used_staff' implies that all staff are costed regardless of the cadre-level combinations which have been used in the run are costed") + "The input for 'cost_only_used_staff' implies that all staff are costed regardless of the cadre-level " + "combinations which have been used in the run are costed") staff_size_chosen_for_costing = available_staff_count_by_level_and_officer_type # Calculate various components of HR cost @@ -607,7 +715,8 @@ def calculate_npv_past_training_expenses_by_row(row, r=_discount_rate): if partial_year > 0: npv += annual_cost * partial_year * (1 + r) ** (1 + r) - # Add recruitment cost assuming this happens during the partial year or the year after graduation if partial year == 0 + # Add recruitment cost assuming this happens during the partial year or the year after graduation if + # partial year == 0 npv += row['recruitment_cost_per_person_recruited_usd'] * (1 + r) return npv @@ -619,34 +728,33 @@ def calculate_npv_past_training_expenses_by_row(row, r=_discount_rate): npv_values.append(npv) preservice_training_cost['npv_of_training_and_recruitment_cost'] = npv_values - 
preservice_training_cost['npv_of_training_and_recruitment_cost_per_recruit'] = preservice_training_cost[ - 'npv_of_training_and_recruitment_cost'] * \ - (1 / (preservice_training_cost[ - 'absorption_rate_of_students_into_public_workforce'] + - preservice_training_cost[ - 'proportion_of_workforce_recruited_from_abroad'])) * \ - (1 / preservice_training_cost[ - 'graduation_rate']) * (1 / - preservice_training_cost[ - 'licensure_exam_passing_rate']) - if _discount_rate == 0: # if the discount rate is 0, then the pre-service + recruitment cost simply needs to be divided by the number of years in tenure + preservice_training_cost['npv_of_training_and_recruitment_cost_per_recruit'] \ + = (preservice_training_cost['npv_of_training_and_recruitment_cost'] * + (1 / (preservice_training_cost['absorption_rate_of_students_into_public_workforce'] + + preservice_training_cost['proportion_of_workforce_recruited_from_abroad'])) * + (1 / preservice_training_cost['graduation_rate']) * + (1 /preservice_training_cost['licensure_exam_passing_rate'])) + if _discount_rate == 0: # if the discount rate is 0, then the pre-service + recruitment cost simply + # needs to be divided by the number of years in tenure preservice_training_cost['annuitisation_rate'] = preservice_training_cost[ 'average_length_of_tenure_in_the_public_sector'] else: preservice_training_cost['annuitisation_rate'] = 1 + (1 - (1 + annuitization_rate) ** ( -preservice_training_cost[ 'average_length_of_tenure_in_the_public_sector'] + 1)) / annuitization_rate - preservice_training_cost['annuitised_training_and_recruitment_cost_per_recruit'] = preservice_training_cost[ - 'npv_of_training_and_recruitment_cost_per_recruit'] / \ - preservice_training_cost[ - 'annuitisation_rate'] - - # Cost per student trained * 1/Rate of absorption from the local and foreign graduates * 1/Graduation rate * attrition rate - # the inverse of attrition rate is the average expected tenure; and the preservice training cost needs to be divided by the 
average tenure + preservice_training_cost['annuitised_training_and_recruitment_cost_per_recruit'] = \ + (preservice_training_cost['npv_of_training_and_recruitment_cost_per_recruit'] / + preservice_training_cost['annuitisation_rate']) + + # Cost per student trained * 1/Rate of absorption from the local and foreign graduates + # * 1/Graduation rate * attrition rate + # the inverse of attrition rate is the average expected tenure; and the preservice training cost needs to + # be divided by the average tenure preservice_training_cost['cost'] = preservice_training_cost[ 'annuitised_training_and_recruitment_cost_per_recruit'] * \ preservice_training_cost['staff_count'] * preservice_training_cost[ - 'annual_attrition_rate'] # not multiplied with attrition rate again because this is already factored into 'Annual_cost_per_staff_recruited' + 'annual_attrition_rate'] # not multiplied with attrition rate again + # because this is already factored into 'Annual_cost_per_staff_recruited' preservice_training_cost = preservice_training_cost[ ['draw', 'run', 'year', 'OfficerType', 'Facility_Level', 'cost']] @@ -680,7 +788,8 @@ def label_rows_of_cost_dataframe(_df, label_var, label): # Initialize HR with the salary data if (cost_only_used_staff): human_resource_costs = retain_relevant_column_subset( - label_rows_of_cost_dataframe(salary_for_staff, 'cost_subcategory', 'salary_for_cadres_used'), 'OfficerType') + label_rows_of_cost_dataframe(salary_for_staff, 'cost_subcategory', 'salary_for_cadres_used'), + 'OfficerType') # Concatenate additional cost categories additional_costs = [ (preservice_training_cost, 'preservice_training_and_recruitment_cost_for_attrited_workers'), @@ -689,7 +798,8 @@ def label_rows_of_cost_dataframe(_df, label_var, label): ] else: human_resource_costs = retain_relevant_column_subset( - label_rows_of_cost_dataframe(salary_for_staff, 'cost_subcategory', 'salary_for_all_staff'), 'OfficerType') + label_rows_of_cost_dataframe(salary_for_staff, 'cost_subcategory', 
'salary_for_all_staff'), + 'OfficerType') # Concatenate additional cost categories additional_costs = [ (preservice_training_cost, 'preservice_training_and_recruitment_cost_for_attrited_workers'), @@ -741,6 +851,7 @@ def get_counts_of_items_requested(_df): cons_req = extract_results( results_folder, + suspended_results_folder=suspended_results_folder, module='tlo.methods.healthsystem.summary', key='Consumables', custom_generate_series=get_counts_of_items_requested, @@ -762,7 +873,8 @@ def get_counts_of_items_requested(_df): # 2.1 Cost of consumables dispensed # --------------------------------------------------------------------------------------------------------------- # Multiply number of items needed by cost of consumable - # consumables_dispensed.columns = consumables_dispensed.columns.get_level_values(0).str() + "_" + consumables_dispensed.columns.get_level_values(1) # Flatten multi-level columns for pandas merge + # consumables_dispensed.columns = consumables_dispensed.columns.get_level_values(0).str() + "_" + + # consumables_dispensed.columns.get_level_values(1) # Flatten multi-level columns for pandas merge unit_costs['consumables'].columns = pd.MultiIndex.from_arrays( [unit_costs['consumables'].columns, [''] * len(unit_costs['consumables'].columns)]) cost_of_consumables_dispensed = consumables_dispensed.merge(unit_costs['consumables'], on=idx['Item_Code'], @@ -796,8 +908,20 @@ def get_counts_of_items_requested(_df): left_on='Item_Code', right_on='item_code', validate='m:1', how='left') + # Identify rows where excess_stock_proportion_of_dispensed is NaN + missing_excess_stock = ( + cost_of_excess_consumables_stocked + ['excess_stock_proportion_of_dispensed'] + .isna() + ) + + # Fill missing values with the average inflow-to-outflow ratio minus 1 + fill_value = average_inflow_to_outflow_ratio_ratio - 1 + cost_of_excess_consumables_stocked.loc[ - cost_of_excess_consumables_stocked.excess_stock_proportion_of_dispensed.isna(), 
'excess_stock_proportion_of_dispensed'] = average_inflow_to_outflow_ratio_ratio - 1 # TODO disaggregate the average by program + missing_excess_stock, + 'excess_stock_proportion_of_dispensed' + ] = fill_value # TODO: disaggregate the average by program cost_of_excess_consumables_stocked[quantity_columns] = cost_of_excess_consumables_stocked[ quantity_columns].multiply(cost_of_excess_consumables_stocked[idx[price_column]], axis=0) cost_of_excess_consumables_stocked[quantity_columns] = cost_of_excess_consumables_stocked[ @@ -815,7 +939,8 @@ def get_counts_of_items_requested(_df): def melt_and_label_consumables_cost(_df, label): multi_index = pd.MultiIndex.from_tuples(_df.columns) _df.columns = multi_index - # Select 'Item_Code', 'year', and all columns where both levels of the MultiIndex are numeric (these are the (draw,run) columns with cost values) + # Select 'Item_Code', 'year', and all columns where both levels of the MultiIndex are numeric + # (these are the (draw,run) columns with cost values) selected_columns = [col for col in _df.columns if (col[0] in ['Item_Code', 'year']) or (isinstance(col[0], int) and isinstance(col[1], int))] _df = _df[selected_columns] # Subset the dataframe with the selected columns @@ -827,13 +952,14 @@ def melt_and_label_consumables_cost(_df, label): melted_df['consumable'] = melted_df['Item_Code'].map(consumables_dict) melted_df['cost_subcategory'] = label melted_df[ - 'Facility_Level'] = 'all' # TODO this is temporary until 'tlo.methods.healthsystem.summary' only logs consumable at the aggregate level + 'Facility_Level'] = 'all' + # TODO this is temporary until 'tlo.methods.healthsystem.summary' only logs consumable at the aggregate level melted_df = melted_df.rename(columns={'value': 'cost'}) return melted_df def disaggregate_separately_managed_medical_supplies_from_consumable_costs(_df, _consumables_dict, - # This is a dictionary mapping codes to names + # This is a dictionary mapping codes to names 
list_of_unique_medical_products): reversed_consumables_dict = {value: key for key, value in _consumables_dict.items()} # reverse dictionary to map names to codes @@ -849,24 +975,26 @@ def disaggregate_separately_managed_medical_supplies_from_consumable_costs(_df, columns='item_code') separately_managed_medical_supplies = [127, 141, 161] # Oxygen, Blood, IRS - cost_of_consumables_dispensed, cost_of_separately_managed_medical_supplies_dispensed = disaggregate_separately_managed_medical_supplies_from_consumable_costs( + cost_of_consumables_dispensed, cost_of_separately_managed_medical_supplies_dispensed = ( + disaggregate_separately_managed_medical_supplies_from_consumable_costs( _df=retain_relevant_column_subset( melt_and_label_consumables_cost(cost_of_consumables_dispensed, 'cost_of_consumables_dispensed'), 'consumable'), _consumables_dict=consumables_dict, - list_of_unique_medical_products=separately_managed_medical_supplies) - cost_of_excess_consumables_stocked, cost_of_separately_managed_medical_supplies_excess_stock = disaggregate_separately_managed_medical_supplies_from_consumable_costs( + list_of_unique_medical_products=separately_managed_medical_supplies)) + cost_of_excess_consumables_stocked, cost_of_separately_managed_medical_supplies_excess_stock = ( + disaggregate_separately_managed_medical_supplies_from_consumable_costs( _df=retain_relevant_column_subset( melt_and_label_consumables_cost(cost_of_excess_consumables_stocked, 'cost_of_excess_consumables_stocked'), 'consumable'), _consumables_dict=consumables_dict, - list_of_unique_medical_products=separately_managed_medical_supplies) + list_of_unique_medical_products=separately_managed_medical_supplies)) consumable_costs = pd.concat([cost_of_consumables_dispensed, cost_of_excess_consumables_stocked]) # 2.4 Supply chain costs # --------------------------------------------------------------------------------------------------------------- - # Assume that the cost of procurement, warehousing and distribution is 
a fixed proportion of consumable purchase costs + # Assume the cost of procurement, warehousing and distribution is a fixed proportion of consumable purchase costs # The fixed proportion is based on Resource Mapping Expenditure data from 2018 resource_mapping_data = unit_costs['actual_expenditure_data'] # Make sure values are numeric @@ -922,7 +1050,8 @@ def disaggregate_separately_managed_medical_supplies_from_consumable_costs(_df, # -------------------------------------------- print("Now estimating Medical equipment costs...") - # Total cost of equipment required as per SEL (HSSP-III) only at facility IDs where it has been used in the simulation + # Total cost of equipment required as per SEL (HSSP-III) only at facility IDs where it has been used in the + # simulation # Get list of equipment used in the simulation by district and level def get_equipment_used_by_district_and_facility(_df: pd.Series) -> pd.Series: """Summarise the parsed logged-key results for one draw (as dataframe) into a pd.Series.""" @@ -987,7 +1116,8 @@ def get_equipment_used_by_district_and_facility(_df: pd.Series) -> pd.Series: on=['District', 'Facility_Level'], how='left') equipment_df.loc[equipment_df.Facility_Count.isna(), 'Facility_Count'] = 0 - # Because levels 1b and 2 are collapsed together, we assume that the same equipment is used by level 1b as that recorded for level 2 + # Because levels 1b and 2 are collapsed together, we assume that the same equipment is used by level 1b as + # that recorded for level 2 def update_itemuse_for_level1b_using_level2_data(_df): # Create a list of District and Item_code combinations for which use == True list_of_equipment_used_at_level2 = \ @@ -1043,7 +1173,8 @@ def update_itemuse_for_level1b_using_level2_data(_df): # Assume that the annual costs are constant each year of the simulation equipment_costs = pd.concat([equipment_costs.assign(year=year) for year in years]) - # TODO If the logger is updated to include year, we may wish to calculate equipment
costs by year - currently we assume the same annuitised equipment cost each year + # TODO If the logger is updated to include year, we may wish to calculate equipment costs by year + # (currently we assume the same annuitised equipment cost each year) equipment_costs = equipment_costs.reset_index(drop=True) equipment_costs = equipment_costs.rename(columns={'Equipment_tlo': 'Equipment'}) equipment_costs = prepare_cost_dataframe(equipment_costs, _category_specific_group='Equipment', @@ -1137,8 +1268,8 @@ def update_itemuse_for_level1b_using_level2_data(_df): # Define a function to summarize cost data from -# Note that the dataframe needs to have draw as index and run as columns. if the dataframe is long with draw and run as index, then -# first unstack the dataframe and subsequently apply the summarize function +# Note that the dataframe needs to have draw as index and run as columns. if the dataframe is long with draw and run as +# index, then first unstack the dataframe and subsequently apply the summarize function def summarize_cost_data(_df, _metric: Literal['mean', 'median'] = 'mean') -> pd.DataFrame: """ @@ -1194,8 +1325,8 @@ def estimate_projected_health_spending(resourcefilepath: Path, """ Estimate total projected health spending for a simulation period. - Combines health spending per capita projections (Dieleman et al, 2019) with simulated population estimates to calculate - total health expenditure, optionally applying a discount rate and summarizing across runs. + Combines health spending per capita projections (Dieleman et al, 2019) with simulated population estimates to + calculate total health expenditure, optionally applying a discount rate and summarizing across runs. 
Parameters: ---------- @@ -1424,7 +1555,8 @@ def do_stacked_bar_plot_of_cost_by_category(_df: pd.DataFrame, if (_disaggregate_by_subgroup is True): for name, df in dfs.items(): dfs[name] = df.copy() # Choose the dataframe to modify - # If sub-groups are more than 10 in number, then disaggregate the top 10 and group the rest into an 'other' category + # If sub-groups are more than 10 in number, then disaggregate the top 10 and group the rest into an + # 'other' category if (len(dfs[name]['cost_subgroup'].unique()) > 10): # Calculate total cost per subgroup subgroup_totals = dfs[name].groupby('cost_subgroup')['cost'].sum() @@ -1870,7 +2002,8 @@ def wrap_text(text, width=15): if (len(_df['cost_subgroup'].unique()) > 10): # Step 2: Group all other consumables into "Other" other_cost = _df.iloc[10:]["cost"].sum() - top_10 = pd.concat([top_10, pd.DataFrame([{"cost_subgroup": "Other", "cost": other_cost}])], ignore_index=True) + top_10 = pd.concat([top_10, pd.DataFrame([{"cost_subgroup": "Other", "cost": other_cost}])], + ignore_index=True) # Prepare data for the treemap total_cost = top_10["cost"].sum() @@ -2015,7 +2148,8 @@ def generate_multiple_scenarios_roi_plot(_monetary_value_of_incremental_health: # Initialize an empty DataFrame to store values for each 'run' all_run_values = pd.DataFrame() - # Create an array of implementation costs ranging from 0 to the max value of max ability to pay for the current draw + # Create an array of implementation costs ranging from 0 to the max value of max ability to pay for the current + # draw implementation_costs = np.linspace(0, max_ability_to_pay_for_implementation.loc[draw_index].max(), 50) # Add fixed values for ROI ratio calculation additional_costs = np.array([1_000_000_000, 3_000_000_000]) @@ -2121,7 +2255,8 @@ def generate_multiple_scenarios_roi_plot(_monetary_value_of_incremental_health: # Replace specific x-ticks with % of health spending values if _projected_health_spending: xtick_labels[ - 1] = 
1] = f'{xticks[1]:,.0f}\n({xticks[1] / (_projected_health_spending / 1e6) :.2%} of \n projected total \n health spend)' + 1] = (f'{xticks[1]:,.0f}\n({xticks[1] / (_projected_health_spending / 1e6) :.2%} of \n projected total ' + f'\n health spend)') for i, tick in enumerate(xticks): if (i != 0) & (i != 1): # Replace for 4000 xtick_labels[i] = f'{tick:,.0f}\n({tick / (_projected_health_spending / 1e6) :.2%})' diff --git a/src/tlo/analysis/utils.py b/src/tlo/analysis/utils.py index 6b4d2cbf9b..487f965ea9 100644 --- a/src/tlo/analysis/utils.py +++ b/src/tlo/analysis/utils.py @@ -295,6 +295,7 @@ def extract_results(results_folder: Path, index: str = None, custom_generate_series=None, do_scaling: bool = False, + suspended_results_folder: Path = None, ) -> pd.DataFrame: """Utility function to unpack results. @@ -307,16 +308,19 @@ def extract_results(results_folder: Path, `custom_generate_series`. Optionally, with `do_scaling=True`, each element is multiplied by the scaling_factor recorded in the simulation. + If the suspend-and-resume functionality is used, the scaling factor may be available in the folder where the logs + of the suspended run are stored. Note that if runs in the batch have failed (such that logs have not been generated), these are dropped silently. """ - def get_multiplier(_draw, _run): + + def get_multiplier(results_folder, _draw, _run): """Helper function to get the multiplier from the simulation.
Note that if the scaling factor cannot be found a `KeyError` is thrown.""" return load_pickled_dataframes( - results_folder, _draw, _run, 'tlo.methods.population' - )['tlo.methods.population']['scaling_factor']['scaling_factor'].values[0] + results_folder, _draw, _run, 'tlo.methods.demography' + )['tlo.methods.demography']['scaling_factor']['scaling_factor'].values[0] if custom_generate_series is None: # If there is no `custom_generate_series` provided, it implies that function required selects the specified @@ -352,7 +356,10 @@ def generate_series(dataframe: pd.DataFrame) -> pd.Series: 'Custom command does not generate a pd.Series' ) if do_scaling: - res[draw_run] = output_from_eval * get_multiplier(draw, run) + if suspended_results_folder is not None: + res[draw_run] = output_from_eval * get_multiplier(suspended_results_folder, 0, 0) + else: + res[draw_run] = output_from_eval * get_multiplier(results_folder, draw, run) else: res[draw_run] = output_from_eval @@ -386,7 +393,7 @@ def check_info_value_changes(df): prev_info = row["Info"] return problems - + def remove_events_for_individual_after_death(df): rows_to_drop = [] @@ -430,8 +437,8 @@ def reconstruct_individual_histories(df): if len(problems)>0: print("Values didn't change but were still detected") print(problems) - - + + return df_final