diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv index 5e2b85db4b..4b16f40a25 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_Item_Designations.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c03fd1b3ad2a7352dcbc41ab891744fb8b82f37c131798596e46fe3c00d0a15 -size 72294 +oid sha256:41bd104db59bf4b7ccd35639c58024ed94d6584d69b9ef648381aa56a33e2461 +size 74345 diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv index 2cf60fec13..4301af678d 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ab824924a4c0e798eeec6c2514c50577ba83185120ec257d5065c583fe1dfa6 -size 57341233 +oid sha256:b0922f6c12c1303bdd1ce5f1a3c1e212b1635069664113a18a80ce77742ac031 +size 71215257 diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_original.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_original.csv index 6afeff00bb..9a944ff91f 100644 --- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_original.csv +++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small_original.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3bc044868ad9f1c618fc9b3fd6fc0ed23f32b153c8438f52fd4b72e40d8f503f -size 55831644 +oid sha256:e8661e85489bf7422dd466b25af898220777e8c16e22c6ba3f8d7bfc2531f833 +size 70405856 diff --git a/src/scripts/consumables_analyses/descriptive_analysis.py b/src/scripts/consumables_analyses/descriptive_analysis.py new file mode 100644 index 0000000000..fe151394ac --- /dev/null +++ b/src/scripts/consumables_analyses/descriptive_analysis.py @@ -0,0 +1,460 @@ +''' +Produce plots and estimates for the manuscript "Estimating the health gains and value for money of reducing drug stock-outs in Malawi: an individual-based modelling study" +''' + +import datetime +import os +import textwrap +from pathlib import Path + +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +from typing import Literal, Sequence, Optional, Union, List + +from tlo import Date +from tlo.analysis.utils import ( + extract_params, + get_scenario_info, + get_scenario_outputs, + load_pickled_dataframes, +) + +# Define a timestamp for script outputs +timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") + +# Print the start time of the script +print('Script Start', datetime.datetime.now().strftime('%H:%M')) + +# Create folders to store results +resourcefilepath = Path("./resources") +consumable_resourcefilepath = resourcefilepath / "healthsystem/consumables" +simulationfilepath = Path('./outputs/sakshi.mohan@york.ac.uk') +outputfilepath = Path('./outputs/consumables_impact_analysis') +if not os.path.exists(outputfilepath): + os.makedirs(outputfilepath) + +# Utility functions +# Assign item names to item codes +def assign_consumable_names_to_item_codes(df): + # Create dictionary mapping item_codes to consumables names + consumables_df = pd.read_csv(consumable_resourcefilepath 
/ "ResourceFile_Consumables_Items_and_Packages.csv")[['Item_Code', 'Items']] + consumables_df = consumables_df[consumables_df['Item_Code'].notna()] + consumables_dict = dict(zip(consumables_df['Item_Code'], consumables_df['Items'])) + + # Add consumable_name to df + df = df.copy() + df['item_name'] = df['item_code'].map(consumables_dict) + + return df + +# Prepare availability data +def prepare_availability_dataset_for_plots( + _df: pd.DataFrame, + scenario_list: Optional[list[int]] = None, + scenario_names_dict: Optional[dict[str, str]] = None, + consumable_resourcefilepath: Path = None, + resourcefilepath: Path = None +) -> pd.DataFrame: + """ + Prepares a consumable availability dataset by merging facility and item category data, + renaming columns for scenarios, and cleaning category names for plotting. + """ + if scenario_list is None: + scenario_list = [] + if scenario_names_dict is None: + scenario_names_dict = {} + + # Load item category mapping + program_item_mapping = pd.read_csv( + consumable_resourcefilepath / 'ResourceFile_Consumables_Item_Designations.csv', + usecols=['Item_Code', 'item_category'] + ) + program_item_mapping = program_item_mapping.rename(columns={'Item_Code': 'item_code'}) + program_item_mapping = program_item_mapping[program_item_mapping['item_category'].notna()] + + # Load facility list + mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") + + # Merge facility and program info + _df = _df.merge( + mfl[['District', 'Facility_Level', 'Facility_ID']], + on='Facility_ID', how='left' + ) + _df = _df.merge(program_item_mapping, on='item_code', how='left') + + # Rename scenario columns + _df = _df.rename(columns=scenario_names_dict) + + # Clean item category names + clean_category_names = { + 'cancer': 'Cancer', + 'cardiometabolicdisorders': 'Cardiometabolic Disorders', + 'contraception': 'Contraception', + 'general': 'General', + 'hiv': 'HIV', + 'malaria': 'Malaria', + 'ncds': 'Non-communicable Diseases', + 'neonatal_health': 'Neonatal Health', + 'other_childhood_illnesses': 'Other Childhood Illnesses', + 'reproductive_health': 'Reproductive Health', + 'road_traffic_injuries': 'Road Traffic Injuries', + 'tb': 'Tuberculosis', + 'undernutrition': 'Undernutrition', + 'epi': 'Expanded programme on immunization' + } + _df['item_category'] = _df['item_category'].map(clean_category_names) + + return _df + +# Wrap Labels +def wrap_labels(labels, width=15): + """Wrap each label to the given character width.""" + return [textwrap.fill(str(lab), width) if lab is not None else "" for lab in labels] + +# Generate heatmaps of average availability +def generate_heatmap( + df: pd.DataFrame, + include_levels: Optional[List[str]] = None, + value_col: str = "Actual", + row: str = "item_category", + col: str = "Facility_Level", + row_order: Optional[Sequence[str]] = None, + col_order: Optional[Sequence[str]] = None, + figurespath: Optional[Path] = None, + filename: str = "heatmap_consumable_availability.png", + figsize: tuple[int, int] = (10, 8), + cmap: str = "RdBu_r", + annot: bool = True, + fmt: Optional[str] = None, # None -> auto choose + font_scale: float = 0.75, + cbar_label: str = "Proportion of days on which consumable is available", + xlabel: Optional[str] = None, + ylabel: Optional[str] = None, + as_pct: bool = True, # format annotations as percentages + round_decimals: int = 2, + # option to plot scenarios on the x_axis + scenario_axis: bool = False, # if True, columns become scenarios + scenario_cols: 
Optional[Sequence[str]] = None, +): + """ + Build a heatmap either by a single column (e.g., Facility_Level) or across multiple + scenario columns placed on the x-axis. + """ + if include_levels is not None: + df = df[df.Facility_Level.isin(include_levels)] + if scenario_axis: + aggregated = ( + df.groupby([row], dropna=True)[scenario_cols] + .mean() + .reset_index() + ) + # Add perfect scenario + aggregated['Perfect'] = 1 # Add a column representing the perfect scenario + heatmap_df = aggregated.set_index('item_category') + else: + # Standard mode: columns = `col`, values = mean(value_col) + aggregated = ( + df.groupby([row, col], dropna=True)[value_col] + .mean() + .reset_index() + ) + heatmap_df = aggregated.pivot(index=row, columns=col, values=value_col) + + # Optional ordering + if row_order is not None: + heatmap_df = heatmap_df.reindex(row_order) + if col_order is not None: + heatmap_df = heatmap_df.reindex(columns=col_order) + + # 2) Totals (means across the raw data, not the pivot means) + if scenario_axis: + # Means by row across all programs for each scenario + row_means = heatmap_df.mean(axis=0) # average per scenario across programs + avg_row = row_means.copy() + heatmap_df.loc["Average"] = avg_row + else: + # Compute from raw df to avoid double-averaging + col_means = df.groupby(row, dropna=False)[value_col].mean() + row_means = df.groupby(col, dropna=False)[value_col].mean() + overall_mean = df[value_col].mean() + + heatmap_df["Average"] = col_means.reindex(heatmap_df.index) + avg_row = row_means.reindex(heatmap_df.columns).copy() + avg_row.loc["Average"] = overall_mean + heatmap_df.loc["Average"] = avg_row + + # 3) Annotation formatting + if as_pct: + # If values are 0–1 proportions, annotate as percentages + display_df = (heatmap_df * 100).round(round_decimals) + if fmt is None: + fmt = ".0f" if round_decimals == 0 else f".{round_decimals}f" + annot_kws = {"fmt": fmt} + # Build string labels with % sign + annot_data = display_df.astype(float) + else: + display_df = heatmap_df.round(round_decimals) + if fmt is None: + fmt = f".{round_decimals}f" + annot_kws = {"fmt": fmt} + annot_data = display_df.astype(float) + + # 4) Plot + sns.set(font_scale=font_scale) + fig, ax = plt.subplots(figsize=figsize) + hm = sns.heatmap( + annot_data, + annot=annot, + cmap=cmap, + cbar_kws={"label": cbar_label}, + ax=ax, + fmt=".2f" + ) + + # If percentage labels requested, overwrite text with % suffix + if annot and as_pct: + for t in ax.texts: + t.set_text(f"{t.get_text()}%") + + # 5) Labels & ticks + xlab = (xlabel or ("Scenario" if scenario_axis else col.replace("_", " ").title())) + ylab = (ylabel or row.replace("_", " ").title()) + ax.set_xlabel(xlab) + ax.set_ylabel(ylab) + wrapped_xticklabels = [ + "\n".join(textwrap.wrap(label.get_text(), 20)) # adjust width as needed + for label in ax.get_xticklabels() + ] + + ax.set_xticklabels(wrapped_xticklabels, rotation=90) + ax.set_yticklabels(ax.get_yticklabels(), rotation=0) + + # 6) Save (optional) + if figurespath is not None: + figurespath = Path(figurespath) + figurespath.mkdir(parents=True, exist_ok=True) + outpath = figurespath / filename + plt.savefig(outpath, dpi=300, bbox_inches="tight") + + plt.show() + plt.close(fig) + return fig, ax, heatmap_df + +# Generate LaTex-compatible detailed table of availability +def generate_detail_availability_table_by_scenario( + df: pd.DataFrame, + groupby_var: str, + scenario_cols: Sequence[str], + include_levels: Union[str, Sequence[str]], + longtable: bool = False, + outputpath: Path = None, + 
decimals: int = 2,
+    caption: Optional[str] = None,
+    label_prefix: str = "tab:availability_by_",
+    col_width_groupbyvar: str = "4cm",
+    col_width_scenario: str = "1.8cm",
+) -> str:
+    """
+    Create a LaTeX table of average availability (as percentages) for each consumable across scenarios,
+    filtered to selected facility levels.
+
+    Returns the LaTeX string and writes it to
+    outputpath / f"availability_by_{groupby_var}_{'_'.join(include_levels)}.tex".
+    """
+
+    # --- Setup ---
+    if outputpath is None:
+        outputpath = outputfilepath / "appendix"  # fall back to the default appendix folder
+    outputpath.mkdir(parents=True, exist_ok=True)
+
+    # Accept str OR list for include_levels
+    if isinstance(include_levels, str):
+        include_levels = [include_levels]
+
+    table_df = df.copy()
+
+    # Filter by facility level if the column is present
+    if "Facility_Level" in table_df.columns:
+        table_df = table_df[table_df["Facility_Level"].isin(include_levels)]
+
+    # Aggregate means per item
+    grouped = (
+        table_df
+        .groupby([groupby_var], dropna=False)[list(scenario_cols)]
+        .mean()
+        .reset_index()
+    )
+
+    # Rename first column to "Consumable"
+    grouped.rename(columns={groupby_var: "Consumable"}, inplace=True)
+
+    # Escape LaTeX special characters in Consumable names
+    def _latex_escape(s):
+        if pd.isna(s):
+            return ""
+        s = str(s)
+        # Escape backslashes first, with a replacement that the later substitutions do not touch
+        s = s.replace("\\", r"\textbackslash ")
+        s = s.replace("&", r"\&").replace("%", r"\%").replace("_", r"\_")
+        s = s.replace("#", r"\#").replace("{", r"\{").replace("}", r"\}")
+        s = s.replace("$", r"\$").replace("^", r"\^{}").replace("~", r"\~{}")
+        return s
+
+    grouped["Consumable"] = grouped["Consumable"].map(_latex_escape)
+
+    # Convert proportions -> percentage strings with escaped %
+    def pct_format(x: float) -> str:
+        if pd.isna(x):
+            return ""
+        return f"{x * 100:.{decimals}f}\\%"
+
+    for c in scenario_cols:
+        grouped[c] = grouped[c].map(pct_format)
+
+    # Build column format dynamically:
+    # first column wider for names, then one column per scenario
+    column_format = (
+        f"|R{{{col_width_groupbyvar}}}|"
+        + "|".join([f"R{{{col_width_scenario}}}"] * len(scenario_cols))
+        + "|"
+    )
+
+    # Caption/label
+    if caption is None:
+        caption = "Summarized availability by consumable"
+    levels_tag = "_".join(include_levels)
+    label = f"{label_prefix}{groupby_var}_{levels_tag}"
+
+    # Export to LaTeX (escape=False since we already escaped)
+    latex_table = grouped.to_latex(
+        longtable=longtable,
+        column_format=column_format,
+        caption=caption,
+        label=label,
+        position="h",
+        index=False,
+        escape=False,
+        header=True,
+    )
+
+    # Add \hline after each row
+    latex_table = latex_table.replace("\\\\\n", "\\\\ \\hline\n")
+
+    # Save
+    outpath = outputpath / f"availability_by_{groupby_var}_{levels_tag}.tex"
+    with open(outpath, "w", encoding="utf-8") as f:
+        f.write(latex_table)
+
+    return latex_table
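+
+# Usage note: the R{<width>} column type in `column_format` above is not built into LaTeX;
+# the document that \input{}s these .tex files is assumed to define it, e.g. with the
+# `array` package:
+#   \usepackage{array}
+#   \newcolumntype{R}[1]{>{\raggedright\arraybackslash}p{#1}}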
+
+
+# Import and clean data files
+#**********************************
+# Import TLO model availability data
+tlo_availability_df = pd.read_csv(consumable_resourcefilepath / "ResourceFile_Consumables_availability_small_original.csv")
+scenario_names_dict = {
+    'available_prop': 'Actual',
+    'available_prop_scenario1': 'Non-therapeutic consumables (NTC)',
+    'available_prop_scenario2': 'NTC + Vital medicines (VM)',
+    'available_prop_scenario3': 'NTC + VM + Pharmacist-managed',
+    'available_prop_scenario4': 'Level 1b',
+    'available_prop_scenario5': 'CHAM',
+    'available_prop_scenario6': '75th percentile facility',
+    'available_prop_scenario7': '90th percentile facility',
+    'available_prop_scenario8': 'Best facility',
+    'available_prop_scenario9': 'Best facility (including DHO)',
+    'available_prop_scenario10': 'HIV supply chain',
+    'available_prop_scenario11': 'EPI supply chain',
+    'available_prop_scenario12': 'HIV moved to Govt supply chain (Avg by Level)',
+    'available_prop_scenario13': 'HIV moved to Govt supply chain (Avg by Facility_ID)',
+    'available_prop_scenario14': 'HIV moved to Govt supply chain (Avg by Facility_ID times 1.25)',
+    'available_prop_scenario15': 'HIV moved to Govt supply chain (Avg by Facility_ID times 0.75)',
+    'available_prop_scenario16': 'District pooling',
+    'available_prop_scenario17': 'Neighbourhood pooling',
+    'available_prop_scenario18': 'Pairwise exchange (Large radius)',
+    'available_prop_scenario19': 'Redistribution (Small radius)'
+}
+
+tlo_availability_df = prepare_availability_dataset_for_plots(
+    _df=tlo_availability_df,
+    scenario_list=[1, 2, 3, 6, 7, 8, 16, 17, 18, 19],
+    scenario_names_dict=scenario_names_dict,
+    consumable_resourcefilepath=consumable_resourcefilepath,
+    resourcefilepath=resourcefilepath
+)
+
+# Generate figures for manuscript
+#**********************************
+# TODO Consider redoing these plots with the legacy ResourceFile
+# Figure 1: Average probability of consumable availability in public and CHAM health facilities in Malawi
+_ = generate_heatmap(
+    df=tlo_availability_df,
+    value_col="Actual",
+    row="item_category",
+    col="Facility_Level",
+    figurespath=outputfilepath / 'manuscript',
+    filename="heatmap_program_and_level_actual.png",
+    figsize=(10, 8),
+    cmap="RdBu",
+    round_decimals=4,
+    cbar_label="Proportion of days on which consumable is available",
+    xlabel="Facility Level",
+    ylabel="Program",
+)
+
+# Figure 3: Comparison of consumable availability across modelled scenarios
+scenario_cols = [
+    'Actual', 'Non-therapeutic consumables (NTC)', 'NTC + Vital medicines (VM)',
+    'NTC + VM + Pharmacist-managed',
+    '75th percentile facility', '90th percentile facility', 'Best facility',
+    'District pooling', 'Neighbourhood pooling',
+    'Pairwise exchange (Large radius)',
+    'Redistribution (Small radius)',
+]
+for level in ['1a', '1b']:
+    _ = generate_heatmap(
+        df=tlo_availability_df,
+        include_levels=[level],
+        value_col="Actual",
+        row="item_category",
+        col="Facility_Level",
+        figurespath=outputfilepath / 'manuscript',
+        filename=f"scenarios_heatmap_{level}.png",
+        figsize=(10, 8),
+        cmap="RdBu",
+        round_decimals=4,
+        cbar_label="Proportion of days on which consumable is available",
+        xlabel="Facility Level",
+        ylabel="Program",
+        scenario_axis=True,  # columns become scenarios
+        scenario_cols=scenario_cols,
+    )
+
+
+# Figure A.1: Trend in average consumable availability by facility level
+
+# Figure A.2: Comparison of consumable availability as per the Open Logistics Management Information System
+# (OpenLMIS), 2018, and the Harmonised Health Facility Assessment, 2018-19
+
+# Assign item names to item codes for the appendix tables
+tlo_availability_df = assign_consumable_names_to_item_codes(tlo_availability_df)
+
+# Table B.1: Average probability of availability for each consumable under all scenarios (Level 1a)
+availability_by_item_1a = generate_detail_availability_table_by_scenario(
+    df=tlo_availability_df,
+    groupby_var="item_name",
+    scenario_cols=scenario_cols,
+    include_levels="1a",
+    longtable=True,
+    outputpath=outputfilepath / "appendix",
+    decimals=2,
+    caption="Average probability of availability for each consumable under all scenarios (Level 1a)",
+)
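+
+# The .tex files written above and below are intended to be pulled straight into the
+# manuscript appendix, e.g. \input{availability_by_item_name_1a.tex}; longtable=True
+# additionally assumes \usepackage{longtable} in the preamble.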
+
+# Table B.2: Average probability of availability for each consumable under all scenarios (Level 1b)
+availability_by_item_1b = generate_detail_availability_table_by_scenario(
+    df=tlo_availability_df,
+    groupby_var="item_name",
+    scenario_cols=scenario_cols,
+    include_levels="1b",
+    longtable=True,
+    outputpath=outputfilepath / "appendix",
+    decimals=2,
+    caption="Average probability of availability for each consumable under all scenarios (Level 1b)",
+)
diff --git a/src/scripts/consumables_analyses/manuscript/analysis_improved_consumable_availability.py b/src/scripts/consumables_analyses/manuscript/analysis_improved_consumable_availability.py
new file mode 100644
index 0000000000..c30722a2b4
--- /dev/null
+++ b/src/scripts/consumables_analyses/manuscript/analysis_improved_consumable_availability.py
@@ -0,0 +1,1443 @@
+"""Produce outputs for the Impact of Improved Consumables Availability paper."""
+import datetime
+import os
+import textwrap
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+import numpy as np
+
+from scripts.costing.cost_estimation import (
+    clean_consumable_name,
+    create_summary_treemap_by_cost_subgroup,
+    do_line_plot_of_cost,
+    do_stacked_bar_plot_of_cost_by_category,
+    estimate_input_cost_of_scenarios,
+    summarize_cost_data
+)
+from tlo import Date
+from tlo.analysis.utils import (
+    compute_summary_statistics,
+    extract_params,
+    extract_results,
+    get_scenario_info,
+    get_scenario_outputs,
+    load_pickled_dataframes,
+    create_pickles_locally,
+    summarize,
+)
+
+# Define a timestamp for script outputs
+timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M")
+
+# Print the start time of the script
+print('Script Start', datetime.datetime.now().strftime('%H:%M'))
+
+# Create folders to store results
+resourcefilepath = Path("./resources")
+outputfilepath = Path('./outputs/')
+figurespath = Path('./outputs/consumables_impact_analysis/manuscript')
+if not os.path.exists(figurespath):
+    os.makedirs(figurespath)
+path_for_consumable_resourcefiles = resourcefilepath / "healthsystem/consumables"
+
+# Load result files
+# ------------------------------------------------------------------------------------------------------------------
+results_folder = get_scenario_outputs('consumables_impact-2026-02-14T203020Z.py', outputfilepath)[0]  # Dec 2025 runs
+suspended_results_folder = get_scenario_outputs('consumables_impact-2026-02-13T183325Z.py', outputfilepath)[0]
+# create_pickles_locally(scenario_output_dir="./outputs/consumables_impact-2026-02-14T203020Z")  # from .log files
+scaling_factor = load_pickled_dataframes(
+    suspended_results_folder, draw=0, run=0, name='tlo.methods.demography'
+)['tlo.methods.demography']['scaling_factor']['scaling_factor'].values[0]
+
+# Check that results can be read from draw=0, run=0
+log = load_pickled_dataframes(results_folder, 0, 0)  # look at one log (to decide what to extract)
+params = extract_params(results_folder)
+info = get_scenario_info(results_folder)
+
+# Declare default parameters for cost analysis
+# ------------------------------------------------------------------------------------------------------------------
+# Period relevant for costing
+TARGET_PERIOD = (Date(2025, 1, 1), Date(2029, 12, 31))  # TODO change to 2040
+relevant_period_for_costing = [i.year for i in TARGET_PERIOD]
+list_of_relevant_years_for_costing = list(range(relevant_period_for_costing[0], relevant_period_for_costing[1] + 1))
+list_of_years_for_plot = list(range(2025, 2030))
+number_of_years_costed = 
relevant_period_for_costing[1] - 2025 + 1 + +discount_rate_health = 0 +chosen_metric = 'mean' +chosen_cet = 65 + +# Scenarios +cons_scenarios = { + 0: "Baseline availability – Default health system", + 1: "Baseline availability – Perfect health system", + + 2: "Non-therapeutic consumables (NTC) – Default health system", + 3: "Non-therapeutic consumables (NTC) – Perfect health system", + + 4: "NTC + Vital medicines (VM) – Default health system", + 5: "NTC + Vital medicines (VM) – Perfect health system", + + 6: "NTC + VM + Pharmacist-managed stocks – Default health system", + 7: "NTC + VM + Pharmacist-managed stocks – Perfect health system", + + 8: "75th percentile facility – Default health system", + 9: "75th percentile facility – Perfect health system", + + 10: "90th percentile facility – Default health system", + 11: "90th percentile facility – Perfect health system", + + 12: "Best facility – Default health system", + 13: "Best facility – Perfect health system", + + 14: "District pooling – Default health system", + 15: "District pooling – Perfect health system", + + 16: "Neighbourhood pooling – Default health system", + 17: "Neighbourhood pooling – Perfect health system", + + 18: "Pairwise exchange (Large radius) – Default health system", + 19: "Pairwise exchange (Large radius) – Perfect health system", + + 20: "Pairwise exchange (Small radius) – Default health system", + 21: "Pairwise exchange (Small radius) – Perfect health system", + + 22: "Perfect availability – Default health system", + 23: "Perfect availability – Perfect health system", +} + +main_analysis_subset = [ + k for k, v in cons_scenarios.items() + if "Default health system" in v +] + +perfect_analysis_subset = [ + k for k, v in cons_scenarios.items() + if "Perfect health system" in v +] + +cons_scenarios_main = { + k: v.replace(" – Default health system", "") + for k, v in cons_scenarios.items() + if k in main_analysis_subset +} + +cons_scenarios_perfect = { + k: v.replace(" – Perfect health system", "") + for k, v in cons_scenarios.items() + if k in perfect_analysis_subset +} + +# Dict to assign DALY causes to disease groups +disease_groups = { + # --- HIV / TB / Malaria --- + "HIV/AIDS": [ + "AIDS", + ], + "Malaria": [ + "Malaria", + ], + "TB (non-AIDS)": [ + "TB (non-AIDS)", + ], + + # --- MNCH --- + "RMNCH": [ + "Maternal Disorders", + "Neonatal Disorders", + "Congenital birth defects", + "Childhood Diarrhoea", + "Childhood Undernutrition", + "Lower respiratory infections", + "Measles", + ], + + # --- NCDs --- + "Cardiometabolic": [ + "Heart Disease", + "Stroke", + "Diabetes", + "Kidney Disease", + ], + "Cancer": [ + "Cancer (Bladder)", + "Cancer (Breast)", + "Cancer (Cervix)", + "Cancer (Oesophagus)", + "Cancer (Prostate)", + "Cancer (Other)", + ], + "Mental & Neurological": [ + "Depression / Self-harm", + "Epilepsy", + "Lower Back Pain", + ], + "Other": [ + "COPD", + "Schistosomiasis", + "Other", + ], + + # --- Injuries --- + "Injuries": [ + "Transport Injuries", + ], +} +# Dict to assign colours to disease groups +disease_colors = { + "HIV/AIDS": "#e41a1c", + "Malaria": "#377eb8", + "RMNCH": "#4daf4a", + "Cardiometabolic": "#984ea3", + "Cancer": "#ff7f00", + "Mental & Neurological": "#a65628", + "Injuries": "#f781bf", + "Other": "#999999", +} +# Dict to recategorize above TREATMENT_IDs into disease groups (as used to classify DALYs averted) +service_to_group = { + # --- HIV / TB / Malaria --- + "Hiv*": "HIV/AIDS", + "Tb*": "HIV/AIDS", # TB grouped with HIV/AIDS in DALYs + "Malaria*": "Malaria", + + # --- RMNCH --- + 
"AntenatalCare*": "RMNCH", + "DeliveryCare*": "RMNCH", + "PostnatalCare*": "RMNCH", + "Contraception*": "RMNCH", + "Diarrhoea*": "RMNCH", + "Undernutrition*": "RMNCH", + "Alri*": "RMNCH", + "Measles*": "RMNCH", + "Epi*": "RMNCH", + + # --- Cardiometabolic --- + "CardioMetabolicDisorders*": "Cardiometabolic", + + # --- Cancer --- + "BladderCancer*": "Cancer", + "BreastCancer*": "Cancer", + "CervicalCancer*": "Cancer", + "OesophagealCancer*": "Cancer", + "ProstateCancer*": "Cancer", + "OtherAdultCancer*": "Cancer", + + # --- Mental & Neurological --- + "Depression*": "Mental & Neurological", + "Epilepsy*": "Mental & Neurological", + + # --- Other --- + "Copd*": "Other", + "Inpatient*": "Other", + "Schisto*": "Other", + + # --- Injuries --- + "Rti*": "Injuries", +} + + +# Function to get incremental values +def find_difference_relative_to_comparison( + df: pd.DataFrame, + comparison: str, + scaled: bool = False, + drop_comparison: bool = True, +): + """ + Compute difference relative to a comparison draw + for a DataFrame with MultiIndex columns (draw, run). + """ + + # Ensure draw is first level + if df.columns.names[0] != "draw": + df = df.swaplevel(0, 1, axis=1).sort_index(axis=1) + + # Extract comparison values + comp_df = df.xs(comparison, level="draw", axis=1) + + # Broadcast subtraction across draws + if scaled: + result = (df - comp_df) / comp_df + else: + result = df - comp_df + + if drop_comparison: + result = result.drop(columns=comparison, level="draw") + + return result + +# ---------------------------- +# Define utility functions +# ---------------------------- + +# Define a function to create bar plots +def do_standard_bar_plot_with_ci(_df: pd.DataFrame, set_colors=None, annotations=None, + xticklabels_wrapped=False, + put_labels_in_legend=True, scenarios_dict = None, + offset=1e6): + """Make a vertical bar plot for each row of _df, using the columns to identify the height of the bar and the + extent of the error bar.""" + + substitute_labels = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + + yerr = np.array([ + (_df[chosen_metric] - _df['lower']).values, + (_df['upper'] - _df[chosen_metric]).values, + ]) + + xticks = {(i + 0.5): k for i, k in enumerate(_df.index)} + + if set_colors is not None: + # dict mapping -> use index keys; list/tuple/Series -> use as-is + if isinstance(set_colors, dict): + colors = [set_colors.get(k, 'grey') for k in _df.index] + # Optional debug: + # missing = [k for k in _df.index if k not in set_colors] + # if missing: print("No color for:", missing) + else: + colors = list(set_colors) + else: + cmap = sns.color_palette('Spectral', as_cmap=True) + rescale = lambda y: (y - np.min(y)) / (np.max(y) - np.min(y)) # noqa: E731 + colors = list(map(cmap, rescale(np.array(list(xticks.keys()))))) if put_labels_in_legend else None + + fig, ax = plt.subplots(figsize=(10, 5)) + ax.bar( + xticks.keys(), + _df[chosen_metric].values, + yerr=yerr, + ecolor='black', + color=colors, + capsize=10, + label=xticks.values() + ) + + if annotations: + for xpos, (ypos, text) in zip(xticks.keys(), zip(_df['upper'].values.flatten(), annotations)): + annotation_y = ypos + offset + + ax.text( + xpos, + annotation_y, + '\n'.join(text.split(' ', 1)), + horizontalalignment='center', + verticalalignment='bottom', # Aligns text at the bottom of the annotation position + fontsize='x-small', + rotation='horizontal' + ) + + ax.set_xticks(list(xticks.keys())) + + if put_labels_in_legend: + # Update xticks label with substitute labels + # Insert legend with updated labels that shows correspondence between 
substitute label and original label + # Use all_manuscript_scenarios for the legend + xtick_legend = [f'{letter}: {scenarios_dict.get(label, label)}' for letter, label in + zip(substitute_labels, xticks.values())] + xtick_values = [letter for letter, label in zip(substitute_labels, xticks.values())] + + h, legs = ax.get_legend_handles_labels() + ax.legend(h, xtick_legend, loc='center left', fontsize='small', bbox_to_anchor=(1, 0.5)) + ax.set_xticklabels(xtick_values) + else: + # Use scenarios_dict if provided, otherwise fall back to original labels + if scenarios_dict is not None: + labels = [scenarios_dict.get(label, label) for label in xticks.values()] + else: + labels = list(xticks.values()) + + if not xticklabels_wrapped: + ax.set_xticklabels(labels, rotation=90) + else: + wrapped_labs = ["\n".join(textwrap.wrap(_lab, 20)) for _lab in labels] + ax.set_xticklabels(wrapped_labs, rotation=90) + + # Extend ylim to accommodate data labels + ymin, ymax = ax.get_ylim() + extension = 0.1 * (ymax - ymin) # 10% of range + ax.set_ylim(ymin - extension, ymax + extension) # Set new y-axis limits with the extended range + + ax.grid(axis="y") + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + # fig.tight_layout() + fig.tight_layout(pad=2.0) + + if put_labels_in_legend: + # Leave space on right for legend + plt.subplots_adjust(left=0.15, right=0.5, top=0.88) + else: + # Use full width of figure + plt.subplots_adjust(left=0.15, right=0.95, top=0.88) + + return fig, ax + +def plot_stacked_mean_with_total_ci( + summary_df, + colors: dict, + scenario_labels: dict | None = None, + ylabel: str = "", + xlabel: str = "Scenario", + title: str | None = None, + figsize=(12, 6), + legend_outside: bool = True, + xticklabels_wrapped: bool = False, + wrap_width: int = 20, +): + """ + Plot stacked bars using mean values by disease group, with a confidence + interval for the total (lower/upper summed across groups). 
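+
+    Only the stacked total receives an error bar; per-group lower/upper bounds are not
+    additive across groups, so group-level uncertainty is deliberately not drawn.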
+ + Parameters + ---------- + summary_df : pd.DataFrame + Index: disease_group + Columns: MultiIndex (draw, stat) where stat ∈ {'lower','mean','upper'} + + colors : dict + Mapping {disease_group: color} + + scenario_labels : dict, optional + Mapping {draw: label} for x-axis + + ylabel : str + Y-axis label + + xlabel : str + X-axis label + + title : str, optional + Figure title + + figsize : tuple + Figure size + + legend_outside : bool + Whether to place legend outside the plot + """ + + # ---- Extract mean values (for stacking) ---- + mean_df = summary_df.xs("mean", level="stat", axis=1) + + # ---- Extract totals for CI ---- + total_mean = mean_df.sum(axis=0) + total_lower = ( + summary_df.xs("lower", level="stat", axis=1) + .sum(axis=0) + ) + total_upper = ( + summary_df.xs("upper", level="stat", axis=1) + .sum(axis=0) + ) + + # ---- X axis ---- + draws = mean_df.columns + x = np.arange(len(draws)) + + # ---- Plot ---- + fig, ax = plt.subplots(figsize=figsize) + + bottom = np.zeros(len(draws)) + + for group in mean_df.index: + vals = mean_df.loc[group].values + ax.bar( + x, + vals, + bottom=bottom, + color=colors.get(group, "grey"), + label=group + ) + bottom += vals + + # ---- CI for total only ---- + yerr = np.vstack([ + total_mean - total_lower, + total_upper - total_mean + ]) + + ax.errorbar( + x, + total_mean, + yerr=yerr, + fmt="none", + ecolor="black", + elinewidth=1.5, + capsize=4, + zorder=5 + ) + + # ---- Formatting ---- + ax.axhline(0, color="black", linewidth=0.8) + + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + + if title: + ax.set_title(title, pad=10) + + # ---- X tick labels ---- + rotation = 90 + if scenario_labels: + labels = [scenario_labels.get(d, d) for d in draws] + else: + labels = list(draws) + + if xticklabels_wrapped: + labels = [ + "\n".join(textwrap.wrap(str(l), wrap_width)) + for l in labels + ] + ha = "center" + else: + ha = "right" + + ax.set_xticks(x) + ax.set_xticklabels(labels, rotation=rotation, ha=ha) + + if legend_outside: + ax.legend( + title="Disease group", + bbox_to_anchor=(1.02, 1), + loc="upper left", + frameon=False + ) + else: + ax.legend(frameon=False) + + fig.tight_layout() + return fig, ax + + +def plot_percentage_change_with_ci( + summary_df, + colors: dict, + scenario_labels: dict | None = None, + ylabel: str = "Percentage change relative to baseline", + xlabel: str = "Scenario", + title: str | None = None, + figsize=(12, 6), + xticklabels_wrapped: bool = False, + wrap_width: int = 20, + markers: list | None = None, +): + """ + Dot + 95% CI plot for percentage change outcomes (non-additive). 
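+
+    Each disease group is drawn as a slightly jittered point with its own 95% interval,
+    rather than as a stacked bar, because percentage changes are not additive across groups.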
+ + Parameters + ---------- + summary_df : pd.DataFrame + Index: disease_group + Columns: MultiIndex (draw, stat) where stat ∈ {'mean','lower','upper'} + + colors : dict + Mapping {disease_group: color} + + scenario_labels : dict, optional + Mapping {draw: label} + + markers : list, optional + Custom marker list per disease group + """ + + # ---- Extract statistics ---- + mean_df = summary_df.xs("mean", level="stat", axis=1) + lower_df = summary_df.xs("lower", level="stat", axis=1) + upper_df = summary_df.xs("upper", level="stat", axis=1) + + draws = mean_df.columns + x = np.arange(len(draws)) + + disease_groups = mean_df.index.tolist() + + # Default markers if not provided + if markers is None: + markers = ["o", "s", "D", "^", "P", "X", "v", "*", "<", ">"] + marker_map = { + dg: markers[i % len(markers)] + for i, dg in enumerate(disease_groups) + } + + fig, ax = plt.subplots(figsize=figsize) + + # Slight horizontal jitter to prevent overlap + jitter_strength = 0.15 + offsets = np.linspace( + -jitter_strength, jitter_strength, len(disease_groups) + ) + + # ---- Plot each disease group ---- + for i, group in enumerate(disease_groups): + + y = mean_df.loc[group].values + yerr = np.vstack([ + y - lower_df.loc[group].values, + upper_df.loc[group].values - y + ]) + + ax.errorbar( + x + offsets[i], + y, + yerr=yerr, + fmt=marker_map[group], + color=colors.get(group, "black"), + markersize=6, + capsize=3, + linestyle="none", + alpha=0.85, + label=group + ) + + # Reference line at zero + ax.axhline(0, color="black", linestyle="--", linewidth=1) + + # ---- X tick labels ---- + if scenario_labels: + labels = [scenario_labels.get(d, d) for d in draws] + else: + labels = list(draws) + + if xticklabels_wrapped: + labels = [ + "\n".join(textwrap.wrap(str(l), wrap_width)) + for l in labels + ] + ha = "center" + else: + ha = "right" + + ax.set_xticks(x) + ax.set_xticklabels(labels, rotation=90, ha=ha) + + ax.set_ylabel(ylabel) + ax.set_xlabel(xlabel) + + if title: + ax.set_title(title, pad=12) + + ax.grid(axis="y", alpha=0.3) + + ax.legend( + title="Disease group", + bbox_to_anchor=(1.02, 1), + loc="upper left", + frameon=False + ) + + fig.tight_layout() + return fig, ax + + +def plot_change_in_cons_unavailability_by_scenario( + nat_mean, + nat_lower, + nat_upper, + scenario_labels =cons_scenarios_main, + figsize=(14, 6), + wrap_width=20, +): + + fig, ax = plt.subplots(figsize=figsize) + + draws = nat_mean.index + x = np.arange(len(draws)) + + # Map draw → scenario name + labels = [ + "\n".join(textwrap.wrap(scenario_labels.get(d, str(d)), wrap_width)) + for d in draws + ] + + # CI + yerr = np.vstack([ + nat_mean - nat_lower, + nat_upper - nat_mean + ]) + + ax.errorbar( + x, + nat_mean, + yerr=yerr, + fmt="o", + capsize=4, + color="black" + ) + + ax.axhline(0, linestyle="--", color="black", linewidth=1) + + ax.set_xticks(x) + ax.set_xticklabels(labels, rotation=90, ha="right") + + ax.set_ylabel( + "Change in consumable unavailability \n across consumables (percentage points)" + ) + ax.set_xlabel("Scenario") + + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + + fig.tight_layout() + +def plot_change_in_cons_unavailability_by_program( + delta_mean, + figsize=(14, 6), + wrap_width=20, +): + """ + Plot distribution of programme-specific change in consumable + unavailability across scenarios. 
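+
+    Each box summarises, for one programme, the spread of mean changes across all
+    scenario draws (one observation per scenario).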
+ + Parameters + ---------- + delta_mean : pd.DataFrame + Index = disease_group + Columns = draw (scenarios) + Values = change in percentage points + """ + + fig, ax = plt.subplots(figsize=figsize) + + # Transpose so each box represents a programme + data_to_plot = delta_mean.T + + # Clean item category names + clean_category_names = { + 'cancer': 'Cancer', + 'cardiometabolicdisorders': 'Cardiometabolic Disorders', + 'contraception': 'Contraception', + 'general': 'General', + 'hiv': 'HIV', + 'malaria': 'Malaria', + 'ncds': 'Non-communicable Diseases', + 'neonatal_health': 'Neonatal Health', + 'other_childhood_illnesses': 'Other Childhood Illnesses', + 'reproductive_health': 'Reproductive Health', + 'road_traffic_injuries': 'Road Traffic Injuries', + 'tb': 'Tuberculosis', + 'undernutrition': 'Undernutrition', + 'epi': 'Expanded programme on immunization' + } + delta_mean.index = delta_mean.index.map(clean_category_names) + + sns.boxplot( + data=data_to_plot, + showfliers=False, + ax=ax + ) + + # Wrap programme names + wrapped_labels = [ + "\n".join(textwrap.wrap(str(label), wrap_width)) + for label in delta_mean.index + ] + + ax.set_xticklabels( + wrapped_labels, + rotation=90, + ha="right" + ) + + ax.axhline(0, linestyle="--", color="black", linewidth=1) + + ax.set_ylabel( + "Change in consumable unavailability \n across scenarios (percentage points)" + ) + ax.set_xlabel("Disease programme") + + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + + fig.tight_layout() + +def plot_heatmap_delta(delta_mean, + scenario_labels=None, + figsize=(12, 6), + baseline_draw=0, + wrap_xticks=True, + wrap_width=20): + + df = delta_mean.copy() + + # Clean item category names + clean_category_names = { + 'cancer': 'Cancer', + 'cardiometabolicdisorders': 'Cardiometabolic Disorders', + 'contraception': 'Contraception', + 'general': 'General', + 'hiv': 'HIV', + 'malaria': 'Malaria', + 'ncds': 'Non-communicable Diseases', + 'neonatal_health': 'Neonatal Health', + 'other_childhood_illnesses': 'Other Childhood Illnesses', + 'reproductive_health': 'Reproductive Health', + 'road_traffic_injuries': 'Road Traffic Injuries', + 'tb': 'Tuberculosis', + 'undernutrition': 'Undernutrition', + 'epi': 'Expanded programme on immunization' + } + df.index = df.index.map(clean_category_names) + + # Drop baseline + df = df.drop(columns=baseline_draw) + + # Rename scenarios if provided + if scenario_labels: + df = df.rename(columns=scenario_labels) + + fig, ax = plt.subplots(figsize=figsize) + + sns.heatmap( + df, + cmap="RdBu_r", # blue = reduction (good), red = increase (bad) + center=0, + linewidths=0.5, + cbar_kws={"label": "Change in % unavailable (vs baseline)"}, + ax=ax + ) + + # Wrap x tick labels + if wrap_xticks: + wrapped_labels = [ + "\n".join(textwrap.wrap(label.get_text(), wrap_width)) + for label in ax.get_xticklabels() + ] + ax.set_xticklabels(wrapped_labels, rotation=90, ha="center") + else: + ax.set_xticklabels(ax.get_xticklabels(), rotation=90) + + ax.set_xlabel("Scenario") + ax.set_ylabel("Disease group") + ax.set_title("Change in consumable unavailability relative to baseline") + + plt.tight_layout() + return fig, ax + + +def aggregate_by_disease_group(df: pd.DataFrame, + disease_groups: dict) -> pd.DataFrame: + """ + Aggregate disease-level rows into disease-group rows. 
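+
+    Diseases listed in a group but absent from df.index are skipped, and a group with
+    no matching diseases is omitted from the output entirely.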
+ + Parameters + ---------- + df : pd.DataFrame + Index: disease names + Columns: MultiIndex (draw, run) + Values: DALYs + + disease_groups : dict + Mapping {group_name: [list of disease names]} + + Returns + ------- + pd.DataFrame + Index: disease groups + Columns: same as df (draw, run) + Values: summed DALYs per group + """ + grouped_rows = {} + + for group_name, diseases in disease_groups.items(): + # Select diseases that actually exist in the index + present = [d for d in diseases if d in df.index] + + if not present: + continue + + grouped_rows[group_name] = df.loc[present].sum(axis=0) + + return pd.DataFrame.from_dict(grouped_rows, orient="index") + +def summarize_aggregated_results_for_figure( + df, + main_analysis_subset, + chosen_metric="mean" +): + """ + Prepare a draw/run DataFrame for plotting: + - drop redundant column level if present + - restrict to main_analysis_subset + - summarize across runs + """ + + # If columns have extra level (e.g. ('mean', run)), drop first level + if isinstance(df.columns, pd.MultiIndex) and df.columns.nlevels > 1: + df = df.copy() + df.columns = df.columns.droplevel(0) + + # Restrict to subset of draws + df = df[df.index.isin(main_analysis_subset)] + + # Summarize across runs + summarized = summarize_cost_data(df, _metric=chosen_metric) + + return summarized + +def summarize_disaggregated_results_for_figure( + df_grouped, + main_analysis_subset, + chosen_metric="mean" +): + """ + Summarize grouped (e.g., disease_group) results + from wide draw/run format into MultiIndex columns (draw, stat). + """ + + # Ensure long format: index = (group, draw, run) + df_long = df_grouped.stack(level=["draw", "run"]) + df_long.index.names = ["group", "draw", "run"] + + summaries = {} + + for group in df_long.index.get_level_values("group").unique(): + ser = df_long.xs(group, level="group") + df_wide = ser.unstack("run") + summaries[group] = summarize_cost_data(df_wide, _metric=chosen_metric) + + result = pd.concat(summaries, names=["group"]) + + # Reformat columns to (draw, stat) + result = result.unstack() + result.columns = result.columns.swaplevel("stat", "draw") + + # Restrict draws + result = result.loc[ + :, + result.columns.get_level_values("draw").isin(main_analysis_subset) + ] + + return result + +def set_param_names_as_column_index_level_0(_df): + """Set the columns index (level 0) as the param_names.""" + ordered_param_names_no_prefix = {i: x for i, x in enumerate(cons_scenarios)} + names_of_cols_level0 = [ordered_param_names_no_prefix.get(col) for col in _df.columns.levels[0]] + assert len(names_of_cols_level0) == len(_df.columns.levels[0]) + _df.columns = _df.columns.set_levels(names_of_cols_level0, level=0) + return _df + +# Functions to extract results +def get_num_dalys(_df): + """Return total number of DALYS (Stacked) by label (total within the TARGET_PERIOD). + Throw error if not a record for every year in the TARGET PERIOD (to guard against inadvertently using + results from runs that crashed mid-way through the simulation. + """ + years_needed = relevant_period_for_costing + assert set(_df.year.unique()).issuperset(years_needed), "Some years are not recorded." 
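+    # Collapse to one total per calendar year (summing over sex, age group and cause),
+    # then discount back to the first year of the target period below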
+ _df = _df.loc[_df.year.between(*years_needed)].drop(columns=['date', 'sex', 'age_range']).groupby( + 'year').sum().sum(axis=1) + + # Initial year and discount rate + initial_year = min(_df.index.unique()) + + # Calculate the discounted values + discounted_values = _df / (1 + discount_rate_health) ** (_df.index - initial_year) + + return pd.Series(discounted_values.sum()) + +def get_num_dalys_by_disease(_df): + """ + Return discounted total DALYs by disease over the TARGET_PERIOD. + Output: Series indexed by disease name. + """ + years_needed = relevant_period_for_costing + assert set(_df.year.unique()).issuperset(years_needed), \ + "Some years are not recorded." + + # Keep only years of interest + _df = _df.loc[_df.year.between(*years_needed)] + + # Drop non-disease columns + disease_cols = _df.columns.difference( + ['date', 'sex', 'age_range', 'year'] + ) + + # Sum by year × disease + by_year_disease = ( + _df[['year'] + list(disease_cols)] + .groupby('year') + .sum() + ) + + # Discounting + initial_year = by_year_disease.index.min() + discount_factors = (1 + discount_rate_health) ** ( + by_year_disease.index - initial_year + ) + + discounted = by_year_disease.div(discount_factors, axis=0) + + # Sum over time → total DALYs by disease + return discounted.sum() + +def get_num_treatments_total(_df): + """Return the number of treatments in total of all treatments (total within the TARGET_PERIOD)""" + _df = _df.loc[pd.to_datetime(_df.date).between(*TARGET_PERIOD), 'TREATMENT_ID'].apply(pd.Series).sum() + _df.index = _df.index.map(lambda x: x.split('_')[0] + "*") + _df = _df.groupby(level=0).sum().sum() + return pd.Series(_df) + +def get_num_treatments_by_disease_group(_df): + """Return the number of treatments by short treatment id (total within the TARGET_PERIOD)""" + _df = _df.loc[pd.to_datetime(_df.date).between(*TARGET_PERIOD), 'TREATMENT_ID'].apply(pd.Series).sum() + _df.index = _df.index.map(lambda x: x.split('_')[0] + "*") + _df = _df.rename(index=disease_groups) + _df = _df.groupby(level=0).sum() + return _df + +def get_monetary_value_of_incremental_health(_num_dalys_averted, _chosen_value_of_life_year): + monetary_value_of_incremental_health = (_num_dalys_averted * _chosen_value_of_life_year) + return monetary_value_of_incremental_health + +def get_percentage_unavailable_by_program(_df): + """ + Compute percentage of times consumables were unavailable + by disease program within TARGET_PERIOD. + """ + + # Restrict to target period + _df = _df.loc[ + pd.to_datetime(_df.date).between(*TARGET_PERIOD), + ['Item_Available', 'Item_NotAvailable'] + ] + + # ---- Sum dictionaries across rows ---- + available = ( + _df['Item_Available'] + .apply(pd.Series) + .sum() + ) + + not_available = ( + _df['Item_NotAvailable'] + .apply(pd.Series) + .sum() + ) + + # Align indices + total = available.add(not_available, fill_value=0) + + # % unavailable per item + pct_unavailable = ( + not_available / total.replace(0, np.nan) + ) + + # Map items to program + pct_unavailable.index = pct_unavailable.index.astype(str) + pct_unavailable = pct_unavailable.rename(index=item_to_program_map) + + # Aggregate to program level + pct_unavailable = ( + pct_unavailable + .groupby(level=0) + .mean() + ) + + return pct_unavailable + +def compute_delta_unavailability_from_baseline(summary_df, comparator_draw=0): + """ + Convert absolute % unavailable to change relative to baseline. + Negative = improvement. 
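+
+    Only the 'mean' statistic is differenced here; lower/upper bounds are handled in
+    compute_national_unavailability_summary below.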
+ """ + mean_df = summary_df.xs("mean", level="stat", axis=1) + baseline = mean_df[comparator_draw] + delta_mean = mean_df.subtract(baseline, axis=0) + + return delta_mean + +def compute_national_unavailability_summary(summary_df, comparator_draw=0): + + mean_df = summary_df.xs("mean", level="stat", axis=1) + lower_df = summary_df.xs("lower", level="stat", axis=1) + upper_df = summary_df.xs("upper", level="stat", axis=1) + + baseline = mean_df[comparator_draw] + + delta_mean = mean_df.subtract(baseline, axis=0) + delta_lower = lower_df.subtract(baseline, axis=0) + delta_upper = upper_df.subtract(baseline, axis=0) + + national_mean = delta_mean.mean(axis=0) + national_lower = delta_lower.mean(axis=0) + national_upper = delta_upper.mean(axis=0) + + return delta_mean, national_mean, national_lower, national_upper + +def reformat_with_draw_as_index_and_stat_as_column(_df): + df = _df.copy() + df.index = df.index.set_names(["stat", "draw"]) + formatted = df.unstack("stat") + formatted.columns = formatted.columns.droplevel(0) + return formatted + + +def generate_all_consumable_figures( + scenario_dict: dict, + results_folder: Path, + suspended_results_folder: Path, + comparator_draw: int, + figurespath: Path, +): + """ + Generate all consumables impact figures for a given scenario set. + + Parameters + ---------- + scenario_dict : dict + Mapping {draw: scenario_name} + + results_folder : Path + Folder containing model results + + suspended_results_folder : Path + Folder containing suspended results (if applicable) + + figurespath : Path + Output folder for figures + """ + figurespath.mkdir(parents=True, exist_ok=True) + scenario_subset = list(scenario_dict.keys()) + + # -------------------------------------------------- + # 1) TOTAL DALYs AVERTED + # -------------------------------------------------- + num_dalys = extract_results( + results_folder, + module='tlo.methods.healthburden', + key='dalys_stacked', + custom_generate_series=get_num_dalys, + do_scaling=True, + suspended_results_folder=suspended_results_folder, + ) + + # Absolute + num_dalys_averted = ( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys, + comparison=comparator_draw, + scaled=False + ) + ).T.unstack(level='run') + ) + + num_dalys_averted_summarized = summarize_aggregated_results_for_figure( + num_dalys_averted, + scenario_subset, + chosen_metric + ) + + # Percentage + num_dalys_averted_percent = ( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys, + comparison=comparator_draw, + scaled=True + ) + ).T.unstack(level='run') + ) + + num_dalys_averted_percent_summarized = summarize_aggregated_results_for_figure( + num_dalys_averted_percent, + scenario_subset, + chosen_metric + ) + + fig, ax = do_standard_bar_plot_with_ci( + (num_dalys_averted_summarized / 1e6).clip(0.0), + annotations=[ + f"{row[chosen_metric]*100:.1f}% " + f"({row['lower']*100:.1f}–{row['upper']*100:.1f}%)" + for _, row in num_dalys_averted_percent_summarized.iterrows() + ], + xticklabels_wrapped=True, + put_labels_in_legend=False, + offset=0.05, + scenarios_dict=scenario_dict + ) + ax.set_ylabel('DALYs (Millions)') + ax.set_ylim(bottom=0) + fig.savefig(figurespath / 'dalys_averted_total.png', dpi=600) + plt.close(fig) + + + # -------------------------------------------------- + # 2) DALYs BY DISEASE GROUP + # -------------------------------------------------- + num_dalys_by_disease = extract_results( + results_folder, + module='tlo.methods.healthburden', + key='dalys_stacked', + 
custom_generate_series=get_num_dalys_by_disease, + do_scaling=True, + suspended_results_folder=suspended_results_folder, + ) + + num_dalys_by_group = aggregate_by_disease_group( + num_dalys_by_disease, + disease_groups + ) + + num_dalys_by_group.columns.names = ["draw", "run"] + + num_dalys_averted_by_group = ( + -1.0 * + pd.DataFrame( + find_difference_relative_to_comparison( + num_dalys_by_group, + comparison=comparator_draw, + scaled=True, + drop_comparison=True + ) + ) + ) + + num_dalys_averted_by_group_summarized = summarize_disaggregated_results_for_figure( + num_dalys_averted_by_group, + scenario_subset, + chosen_metric + ) + + fig, ax = plot_percentage_change_with_ci( + summary_df=num_dalys_averted_by_group_summarized, + colors=disease_colors, + scenario_labels=scenario_dict, + ylabel="DALYs averted (% relative to baseline)", + xticklabels_wrapped=True, + ) + fig.savefig(figurespath / "dalys_averted_by_disease_group.png", + dpi=300, bbox_inches="tight") + plt.close(fig) + + + # -------------------------------------------------- + # 3) TOTAL SERVICES DELIVERED + # -------------------------------------------------- + num_treatments_total = extract_results( + results_folder, + module='tlo.methods.healthsystem.summary', + key='HSI_Event_non_blank_appt_footprint', + custom_generate_series=get_num_treatments_total, + do_scaling=True, + suspended_results_folder=suspended_results_folder, + ).pipe(set_param_names_as_column_index_level_0) + + num_incremental_treatments = ( + pd.DataFrame( + find_difference_relative_to_comparison( + num_treatments_total, + comparison = comparator_draw, + scaled=True + ) + ).T.unstack(level='run') + ) + + num_incremental_treatments_summarized = summarize_aggregated_results_for_figure( + num_incremental_treatments, + scenario_subset, + chosen_metric + ) + + fig, ax = do_standard_bar_plot_with_ci( + num_incremental_treatments_summarized.clip(0.0), + annotations=[ + f"{row[chosen_metric]*100:.1f}% " + f"({row['lower']*100:.1f}–{row['upper']*100:.1f}%)" + for _, row in num_incremental_treatments_summarized.iterrows() + ], + xticklabels_wrapped=True, + put_labels_in_legend=False, + offset=0.05, + scenarios_dict=scenario_dict + ) + ax.set_ylabel('Additional services delivered (% relative to baseline)') + ax.set_ylim(bottom=0) + fig.savefig(figurespath / 'incremental_services_delivered_total.png', dpi=600) + plt.close(fig) + + + # -------------------------------------------------- + # 4) SERVICES BY DISEASE GROUP + # -------------------------------------------------- + num_treatments_by_disease_group = extract_results( + results_folder, + module='tlo.methods.healthsystem.summary', + key='HSI_Event_non_blank_appt_footprint', + custom_generate_series=get_num_treatments_by_disease_group, + do_scaling=True, + suspended_results_folder=suspended_results_folder, + ).pipe(set_param_names_as_column_index_level_0) + + num_treatments_by_disease_group['disease_group'] = ( + num_treatments_by_disease_group.index.map(service_to_group) + ) + + num_treatments_by_disease_group = ( + num_treatments_by_disease_group + .set_index("disease_group", append=True) + .groupby(level="disease_group") + .sum() + ) + + num_incremental_by_group = pd.DataFrame( + find_difference_relative_to_comparison( + num_treatments_by_disease_group, + comparison=comparator_draw, + scaled=True + ) + ) + + summarized = summarize_disaggregated_results_for_figure( + num_incremental_by_group, + scenario_subset, + chosen_metric + ) + + fig, ax = plot_percentage_change_with_ci( + summary_df=summarized, + 
colors=disease_colors, + scenario_labels=scenario_dict, + ylabel="Additional services (% relative to baseline)", + xticklabels_wrapped=True, + ) + fig.savefig(figurespath / "incremental_services_delivered_by_disease_group.png", + dpi=300, bbox_inches="tight") + plt.close(fig) + + + # -------------------------------------------------- + # 5) MAXIMUM ABILITY TO PAY + # -------------------------------------------------- + max_ability_to_pay = ( + get_monetary_value_of_incremental_health( + num_dalys_averted, + _chosen_value_of_life_year=chosen_cet + ) + ) + + max_ability_to_pay = max_ability_to_pay[ + max_ability_to_pay.index.get_level_values('draw').isin(scenario_subset) + ] + + max_ability_to_pay_summarized = summarize_cost_data( + max_ability_to_pay, + _metric=chosen_metric + ).clip(lower=0.0) + + max_ability_to_pay_summarized = reformat_with_draw_as_index_and_stat_as_column( + max_ability_to_pay_summarized + ) + + fig, ax = do_standard_bar_plot_with_ci( + (max_ability_to_pay_summarized / 1e6), + annotations=[ + f"{row[chosen_metric] / 1e6 :.2f} " + f"({row['lower'] / 1e6 :.2f}–{row['upper'] / 1e6:.2f})" + for _, row in max_ability_to_pay_summarized.iterrows() + ], + xticklabels_wrapped=True, + put_labels_in_legend=False, + offset=3, + scenarios_dict=scenario_dict + ) + ax.set_ylabel('Maximum ability to pay (USD millions)') + ax.set_ylim(bottom=0) + fig.savefig(figurespath / 'max_ability_to_pay.png', + dpi=300, bbox_inches="tight") + plt.close(fig) + + + # -------------------------------------------------- + # 6) CONSUMABLE UNAVAILABILITY + # -------------------------------------------------- + item_to_program_df = pd.read_csv( + resourcefilepath / 'healthsystem' / 'consumables' / 'ResourceFile_Consumables_Item_Designations.csv' + )[['Item_Code', 'item_category']] + + item_to_program_map = dict( + zip( + item_to_program_df['Item_Code'].astype(str), + item_to_program_df['item_category'] + ) + ) + + # Plot the proportion of instances that a consumable was not available when requested + pct_unavailable_by_program = extract_results( + results_folder, + module='tlo.methods.healthsystem.summary', + key='Consumables', + custom_generate_series=get_percentage_unavailable_by_program, + do_scaling=False, + suspended_results_folder=suspended_results_folder, + ) + + pct_unavailable_by_program_summarized = summarize_disaggregated_results_for_figure( + pct_unavailable_by_program, + scenario_dict, + chosen_metric + ) + + fig, ax = plot_percentage_change_with_ci( + summary_df=pct_unavailable_by_program_summarized, + colors=disease_colors, + scenario_labels=scenario_dict, + ylabel="% instances of consumables being unavailable", + title="", + xticklabels_wrapped=True, + ) + fig.savefig(figurespath / "pct_unavailable_by_program.png", + dpi=300, bbox_inches="tight") + + delta_mean = compute_delta_unavailability_from_baseline(pct_unavailable_by_program_summarized, + comparator_draw = comparator_draw) + + plot_heatmap_delta(delta_mean, scenario_labels=scenario_dict, baseline_draw=comparator_draw) + plt.savefig(figurespath / "pct_change_in_availability_by_scenario_and_program_heatmap.png", + dpi=300, bbox_inches="tight") + + delta_mean, nat_mean, nat_lower, nat_upper = compute_national_unavailability_summary( + pct_unavailable_by_program_summarized, + comparator_draw = comparator_draw + ) + + plot_change_in_cons_unavailability_by_program( + delta_mean, + ) + plt.savefig(figurespath / "change_in_cons_unavailability_by_program.png", + dpi=300, bbox_inches="tight") + + plot_change_in_cons_unavailability_by_scenario( 
+        nat_mean,
+        nat_lower,
+        nat_upper,
+        scenario_labels=scenario_dict
+    )
+    plt.savefig(figurespath / "change_in_cons_unavailability_by_scenario.png",
+                dpi=300, bbox_inches="tight")
+
+    print("✓ All figures generated.")
+
+
+generate_all_consumable_figures(
+    scenario_dict=cons_scenarios_main,
+    results_folder=results_folder,
+    suspended_results_folder=suspended_results_folder,
+    figurespath=figurespath / "main",
+    comparator_draw=0,
+)
+
+generate_all_consumable_figures(
+    scenario_dict=cons_scenarios_perfect,
+    results_folder=results_folder,
+    suspended_results_folder=suspended_results_folder,
+    figurespath=figurespath / "perfect",
+    comparator_draw=1,
+)
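+
+# Note: each call above compares scenarios against the baseline run under the matching
+# health-system assumption: draw 0 ('Baseline availability' under the default health system)
+# anchors the main analysis, and draw 1 ('Baseline availability' under the perfect health
+# system) anchors the perfect-health-system analysis.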
+from tlo import Date, logging
+from tlo.methods.fullmodel import fullmodel
+from tlo.methods.scenario_switcher import ImprovedHealthSystemAndCareSeekingScenarioSwitcher
+from tlo.scenario import BaseScenario
+
+class ConsumablesImpact(BaseScenario):
+    # -----------------------------
+    # 1) DEFINE SCENARIOS EXPLICITLY
+    # -----------------------------
+    CONSUMABLE_SCENARIOS = [
+        'default',
+        'scenario1', 'scenario2', 'scenario3',  # Predictive factors
+        'scenario6', 'scenario7', 'scenario8',  # Benchmark facilities
+        'scenario16', 'scenario17', 'scenario18', 'scenario19',  # Redistribution
+        'all'  # Perfect
+    ]
+
+    SYSTEM_MODES = [
+        {
+            "mode_appt_constraints": 2,
+            "max_healthsystem_function": [False, False],
+            "max_healthcare_seeking": [False, False],
+        },
+        {
+            "mode_appt_constraints": 1,
+            "max_healthsystem_function": [False, True],
+            "max_healthcare_seeking": [False, True],
+        },
+    ]
+
+    def __init__(self):
+        super().__init__()
+        self.seed = 0
+        self.start_date = Date(2010, 1, 1)
+        self.end_date = Date(2030, 1, 1)  # TODO change to 2041
+        # Run until 2040 even though the analysis may focus only on the years up to 2030
+        self.pop_size = 10_000  # TODO change to 100_000
+
+        # Build cartesian product of scenarios
+        self.SCENARIOS = [
+            (cons, sys)
+            for cons in self.CONSUMABLE_SCENARIOS
+            for sys in self.SYSTEM_MODES
+        ]
+
+        self.number_of_draws = len(self.SCENARIOS)
+        self.scenarios = list(range(self.number_of_draws))
+
+        self.runs_per_draw = 2  # TODO change to 5
+
+    def log_configuration(self):
+        return {
+            'filename': 'consumables_impact',
+            'directory': './outputs',
+            'custom_levels': {
+                '*': logging.WARNING,
+                "tlo.methods.demography": logging.INFO,
+                "tlo.methods.healthburden": logging.INFO,
+                "tlo.methods.healthsystem.summary": logging.INFO,
+            }
+        }
+
+    def modules(self):
+        return (
+            fullmodel()
+            + [ImprovedHealthSystemAndCareSeekingScenarioSwitcher()]
+        )
+
+    def draw_parameters(self, draw_number, rng):
+        cons_scenario, sys = self.SCENARIOS[draw_number]
+
+        return {
+            'HealthSystem': {
+                'cons_availability': 'default',
+                'data_source_for_cons_availability_estimates': 'updated',
+                'year_cons_availability_switch': 2026,
+                'cons_availability_postSwitch': cons_scenario,
+                'mode_appt_constraints': 1,
+                'mode_appt_constraints_postSwitch': sys["mode_appt_constraints"],  # once without HR constraints and once with HR constraints
+                'year_mode_switch': 2026,
+                'policy_name': 'EHP_III',
+                'use_funded_or_actual_staffing': 'actual',
+                'scale_to_effective_capabilities': True,  # <-- Transition into Mode 2 with the effective capabilities in HRH 'revealed' in Mode 1
+                'yearly_HR_scaling_mode': 'historical_scaling',  # allow historical HRH scaling to occur 2018-2024
+                'equip_availability': 'all',
+                'beds_availability': 'all',
+            },
+            "ImprovedHealthSystemAndCareSeekingScenarioSwitcher": {
+                "max_healthsystem_function": sys["max_healthsystem_function"],
+                "max_healthcare_seeking": sys["max_healthcare_seeking"],
+                "year_of_switch": 2026,
+            }
+        }
+
+if __name__ == '__main__':
+    from tlo.cli import scenario_run
+
+    scenario_run([__file__])
diff --git a/src/scripts/costing/cost_estimation.py b/src/scripts/costing/cost_estimation.py
index 02d0971955..9110bbb6c0 100644
--- a/src/scripts/costing/cost_estimation.py
+++ b/src/scripts/costing/cost_estimation.py
@@ 
-14,6 +14,7 @@ import pandas as pd import squarify +from scripts.consumables_analyses.manuscript.analysis_improved_consumable_availability import suspended_results_folder from tlo import Date from tlo.analysis.utils import ( extract_results, @@ -282,6 +283,7 @@ def clean_equipment_name(name: str, equipment_drop_list = None) -> str: def estimate_input_cost_of_scenarios(results_folder: Path, resourcefilepath: Path, + suspended_results_folder: Path = None, _draws: Optional[list[int]] = None, _runs: Optional[list[int]] = None, summarize: bool = False, @@ -298,6 +300,10 @@ def estimate_input_cost_of_scenarios(results_folder: Path, Path to the directory containing simulation output files. resourcefilepath : Path, optional Path to the resource files + suspended_results_folder: Path, optional + Path to the directory containing suspended simulation output files (using the suspend and resume functionality), + This is used to extract the scaling_factor to scale result to actual population size. If None, then the + 'scaling_factor' is obtained from the results_folder. _draws : list, optional Specific draws to include in the cost estimation. Defaults to all available draws. _runs : list, optional @@ -477,25 +483,68 @@ def merge_cost_and_model_data(cost_df, model_df, varnames): return merged_df # Get available staff count for each year and draw - def get_staff_count_by_facid_and_officer_type(_df: pd.Series) -> pd.Series: - """Summarise the parsed logged-key results for one draw (as dataframe) into a pd.Series.""" - _df = _df.set_axis(_df['date'].dt.year).drop(columns=['date']) - _df.index.name = 'year' + def get_staff_count_by_facid_and_officer_type(_df: pd.DataFrame) -> pd.Series: + """ + Convert logged staff dictionary output into tidy format, + summing staff counts across all clinic columns. 
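+
+        Each clinic cell is assumed to hold a dict keyed like
+        'FacilityID_12_Officer_Nursing_and_Midwifery' (illustrative key), which
+        apply(pd.Series) expands into one column per facility-officer pair.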
+ + Returns pd.Series indexed by: + (year, FacilityID, Officer) + """ + + df = _df.copy() + df["year"] = df["date"].dt.year + df = df.drop(columns=["date"]) + + clinic_cols = df.columns.difference(["year"]) + + long_frames = [] + + for clinic in clinic_cols: + expanded = df[[clinic, "year"]].copy() + expanded = expanded[expanded[clinic].notna()] - def change_to_standard_flattened_index_format(col): - parts = col.split("_", 3) # Split by "_" only up to 3 parts - if len(parts) > 2: - return parts[0] + "=" + parts[1] + "|" + parts[2] + "=" + parts[ - 3] # Rejoin with "I" at the second occurrence - return col # If there's no second underscore, return the string as it is + expanded_dict = expanded[clinic].apply(pd.Series) + expanded_dict["year"] = expanded["year"].values - _df.columns = [change_to_standard_flattened_index_format(col) for col in _df.columns] + long_frames.append(expanded_dict) - return unflatten_flattened_multi_index_in_logging(_df).stack(level=[0, 1]) # expanded flattened axis + # Combine all clinics + combined = pd.concat(long_frames, ignore_index=True) + + # Melt to long format + long_df = ( + combined + .melt(id_vars=["year"], + var_name="facility_officer", + value_name="count") + .dropna(subset=["count"]) + ) + + # Split FacilityID and Officer + parts = long_df["facility_officer"].str.split("_Officer_", expand=True) + + long_df["FacilityID"] = ( + parts[0] + .str.replace("FacilityID_", "", regex=False) + .astype(int) + ) + long_df["Officer"] = parts[1] + + # SUM ACROSS CLINICS HERE + result = ( + long_df + .groupby(["year", "FacilityID", "Officer"])["count"] + .sum() + .sort_index() + ) + + return result # Staff count by Facility ID available_staff_count_by_facid_and_officertype = extract_results( Path(results_folder), + suspended_results_folder=suspended_results_folder, module='tlo.methods.healthsystem.summary', key='number_of_hcw_staff', custom_generate_series=get_staff_count_by_facid_and_officer_type, @@ -525,16 +574,64 @@ def change_to_standard_flattened_index_format(col): # Get list of cadres which were utilised in each run to get the count of staff used in the simulation # Note that we still cost the full staff count for any cadre-Facility_Level combination that was ever used in a run, and # not the amount of time which was used - def get_capacity_used_by_officer_type_and_facility_level(_df: pd.Series) -> pd.Series: - """Summarise the parsed logged-key results for one draw (as dataframe) into a pd.Series.""" - _df = _df.set_axis(_df['date'].dt.year).drop(columns=['date']) - _df.index.name = 'year' - return unflatten_flattened_multi_index_in_logging(_df).stack(level=[0, 1]) # expanded flattened axis + def get_capacity_used_by_officer_type_and_facility_level( + _df: pd.DataFrame + ) -> pd.Series: + """ + Parse logging output and return a Series indexed by: + (year, OfficerType, FacilityLevel) + + Collapses (sums) across clinics. + Uses facility_id_levels_dict to map FacilityID → FacilityLevel. + """ + + # ---- 1. Set year index ---- + _df = _df.set_axis(_df["date"].dt.year).drop(columns=["date"]) + _df.index.name = "year" + + # ---- 2. Unflatten logging columns ---- + _df = unflatten_flattened_multi_index_in_logging(_df) + + # Expect columns like: + # ('Clinic', 'facID_and_officer') + + col_df = _df.columns.to_frame(index=False) + + # ---- 3. Extract OfficerType ---- + col_df["OfficerType"] = ( + col_df["facID_and_officer"] + .str.split("_Officer_") + .str[-1] + ) + + # ---- 4. 
Extract FacilityID ---- + col_df["FacilityID"] = ( + col_df["facID_and_officer"] + .str.split("_Officer_") + .str[0] + .str.replace("FacilityID_", "", regex=False) + .astype(int) + ) + + # ---- 5. Map to FacilityLevel ---- + col_df["FacilityLevel"] = col_df["FacilityID"].map(facility_id_levels_dict) + + # ---- 6. Rebuild MultiIndex (drop clinic level) ---- + _df.columns = pd.MultiIndex.from_frame( + col_df[["OfficerType", "FacilityLevel"]] + ) + + # ---- 7. Collapse across clinics ---- + _df = _df.groupby(level=["OfficerType", "FacilityLevel"], axis=1).sum() + + # ---- 8. Return stacked format ---- + return _df.stack(["OfficerType", "FacilityLevel"]) annual_capacity_used_by_cadre_and_level = extract_results( Path(results_folder), + suspended_results_folder=suspended_results_folder, module='tlo.methods.healthsystem.summary', - key='Capacity_By_OfficerType_And_FacilityLevel', + key='Capacity_By_FacID_and_Officer', custom_generate_series=get_capacity_used_by_officer_type_and_facility_level, do_scaling=False, ) @@ -741,6 +838,7 @@ def get_counts_of_items_requested(_df): cons_req = extract_results( results_folder, + suspended_results_folder=suspended_results_folder, module='tlo.methods.healthsystem.summary', key='Consumables', custom_generate_series=get_counts_of_items_requested, diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py index a5862a02a9..9fb8b135ee 100644 --- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py +++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py @@ -39,16 +39,19 @@ import os from collections import defaultdict from pathlib import Path -from typing import List, Optional import matplotlib.pyplot as plt import numpy as np import pandas as pd +from typing import Optional, List +import re -from scripts.data_file_processing.healthsystem.consumables.generating_consumable_scenarios import ( +from scripts.data_file_processing.healthsystem.consumables.generating_consumable_scenarios.generate_consumable_availability_scenarios_for_impact_analysis import ( generate_alternative_availability_scenarios, - generate_descriptive_consumable_availability_plots, + generate_descriptive_consumable_availability_plots ) +from scripts.data_file_processing.healthsystem.consumables.generating_consumable_scenarios.create_consumable_redistribution_scenarios import generate_redistribution_scenarios + from tlo.methods.consumables import check_format_of_consumables_file # Set local shared folder source @@ -70,8 +73,8 @@ resourcefilepath = Path("./resources") path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables" + # Define necessary functions -# Functions to clean LMIS data def change_colnames(df, NameChangeList): # Change column names ColNames = df.columns ColNames2 = ColNames @@ -328,6 +331,15 @@ def update_level1b_availability( return availability_df +# Function to count scenarios at any stage of generating the cons availability RF +def get_max_scenario_number(df: pd.DataFrame) -> int: + scenario_nums = [ + int(m.group(1)) + for c in df.columns + if (m := re.match(r"available_prop_scenario(\d+)$", c)) + ] + return max(scenario_nums) if scenario_nums else 0 + # Function to 
compute average availability by facility level def compute_avg_availability_by_var(df: pd.DataFrame = None, # TLO availability dataframe with each row representing one Facility_ID, item, month, mfl: Optional[pd.DataFrame] = None, # Master Facility list mapping Facility_Level to Faciility_ID @@ -567,7 +579,9 @@ def comparison_plot_by_level(fac_type): months_withdata = ['January', 'February', 'April', 'October', 'November'] months_interpolated = ['March', 'May', 'June', 'July', 'August', 'September', 'December'] -# 1B. RESHAPE AND REORDER +# 2. RESHAPE AND REORDER ## +######################################################################################### + # Reshape dataframe so that each row represent a unique consumable and facility lmis_df_wide = lmis_df.pivot_table(index=['district', 'fac_type_tlo', 'fac_name', 'program', 'item'], columns='month', values=['closing_bal', 'dispensed', 'received', 'stkout_days', 'amc'], @@ -582,7 +596,8 @@ def comparison_plot_by_level(fac_type): num = lmis_df_wide._get_numeric_data() lmis_df_wide[num < 0] = np.nan -# 1C. INTERPOLATE MISSING VALUES ## +# 3. INTERPOLATE MISSING VALUES ## +######################################################################################### # When stkout_days is empty but closing balance, dispensed and received entries are available lmis_df_wide_flat = lmis_df_wide.reset_index() count_stkout_entries = lmis_df_wide_flat['stkout_days'].count(axis=1).sum() @@ -658,6 +673,45 @@ def comparison_plot_by_level(fac_type): # TODO check whether there is any issue with the above items_introduced_in_september which only show up from September # onwards +def rename_items_to_address_inconsistentencies(_df, item_dict): + """Return a dataframe with rows for the same item with inconsistent names collapsed into one""" + # Recode item names appearing from Jan to Aug to the new names adopted from September onwards + old_unique_item_count = _df.item.nunique() + for item in item_dict: + print(len(_df[_df.item == item_dict[item]]), ''' instances of "''', item_dict[item], '''"''' + ''' changed to "''', item, + '''"''') + # row_newname = _df.item == item + row_oldname = _df.item == item_dict[item] + _df.loc[row_oldname, 'item'] = item + + # Make a list of column names to be collapsed using different methods + columns_to_sum = [col for col in _df.columns if + col[0].startswith(('amc', 'closing_bal', 'dispensed', 'received', 'stkout_days'))] + columns_to_preserve = [col for col in _df.columns if + col[0].startswith(('data_source'))] + + # Define aggregation function to be applied to collapse data by item + def custom_agg(x): + if x.name in columns_to_sum: + return x.sum(skipna=True) if np.any( + x.notnull() & (x >= 0)) else np.nan # this ensures that the NaNs are retained + # , i.e. not changed to 0, when the corresponding data for both item name variations are NaN, and when there + # is a 0 or positive value for one or both item name variation, the sum is taken. 
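+            # (Illustrative: closing_bal values [NaN, NaN] across the two name
+            #  variants stay NaN, while [NaN, 4.0] collapses to 4.0 because at
+            #  least one entry is non-missing and non-negative.)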
+ elif x.name in columns_to_preserve: + return x.str.cat( + sep='') # for the data_source column, this function concatenates the string values + + # Collapse dataframe + _collapsed_df = _df.groupby(['program', 'item', 'district', 'fac_type_tlo', 'fac_name']).agg( + {col: custom_agg for col in columns_to_preserve + columns_to_sum} + ).reset_index() + + # Test that all items in the dictionary have been found in the dataframe + new_unique_item_count = _collapsed_df.item.nunique() + assert len(item_dict) == old_unique_item_count - new_unique_item_count + return _collapsed_df + # Hold out the dataframe with no naming inconsistencies list_of_items_with_inconsistent_names_zipped = set(zip(inconsistent_item_names_mapping.keys(), inconsistent_item_names_mapping.values())) list_of_items_with_inconsistent_names = [item for sublist in list_of_items_with_inconsistent_names_zipped for item in sublist] @@ -670,7 +724,7 @@ def comparison_plot_by_level(fac_type): lmis_df_wide_flat = pd.concat([df_without_consistent_item_names_corrected, df_with_consistent_item_names], ignore_index=True) -# 1. --- RULE: 1.If i) stockout is missing, ii) closing_bal, amc and received are not missing , and iii) amc !=0 and, +# --- 3.1 RULE: 1.If i) stockout is missing, ii) closing_bal, amc and received are not missing , and iii) amc !=0 and, # then stkout_days[m] = (amc[m] - closing_bal[m-1] - received)/amc * number of days in the month --- # (Note that the number of entries for closing balance, dispensed and received is always the same) for m in range(2, 13): @@ -706,7 +760,7 @@ def comparison_plot_by_level(fac_type): count_stkout_entries = lmis_df_wide_flat['stkout_days'].count(axis=1).sum() print(count_stkout_entries, "stockout entries after first interpolation") -# 2. --- If any stockout_days < 0 after the above interpolation, update to stockout_days = 0 --- +# 3.2 --- If any stockout_days < 0 after the above interpolation, update to stockout_days = 0 --- # RULE: If closing balance[previous month] - dispensed[this month] + received[this month] > 0, stockout == 0 for m in range(1, 13): cond1 = lmis_df_wide_flat['stkout_days', months_dict[m]] < 0 @@ -724,7 +778,7 @@ def comparison_plot_by_level(fac_type): # Flatten multilevel columns lmis_df_wide_flat.columns = [' '.join(col).strip() for col in lmis_df_wide_flat.columns.values] -# 3. --- If the consumable was previously reported and during a given month, if any consumable was reported, assume +# 3.3 --- If the consumable was previously reported and during a given month, if any consumable was reported, assume # 100% days of stckout --- # RULE: If the balance on a consumable is ever reported and if any consumables are reported during the month, stkout_ # days = number of days of the month @@ -752,7 +806,9 @@ def comparison_plot_by_level(fac_type): count_stkout_entries = count_stkout_entries + lmis_df_wide_flat['stkout_days ' + months_dict[m]].count().sum() print(count_stkout_entries, "stockout entries after third interpolation") -# 1D. CALCULATE STOCK OUT RATES BY MONTH and FACILITY +# 4. CALCULATE STOCK OUT RATES BY MONTH and FACILITY ## +######################################################################################### + lmis = lmis_df_wide_flat # choose dataframe # Generate variables denoting the stockout proportion in each month @@ -771,9 +827,10 @@ def comparison_plot_by_level(fac_type): sep=' ', suffix=r'\w+') lmis = lmis.reset_index() -# 2. 
LOAD CLEANED MATCHED CONSUMABLE LIST FROM TLO MODEL AND MERGE WITH LMIS DATA -######################################################################################################################## -# 1. --- Load and clean data --- +# 5. LOAD CLEANED MATCHED CONSUMABLE LIST FROM TLO MODEL AND MERGE WITH LMIS DATA ## +######################################################################################### + +# 5.1 --- Load and clean data --- # Import matched list of consumanbles consumables_df = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_consumables_matched.csv', low_memory=False, encoding="ISO-8859-1") @@ -792,12 +849,49 @@ def comparison_plot_by_level(fac_type): # Update matched consumable name where the name in the OpenLMIS data was updated in September +def replace_old_item_names_in_lmis_data(_df, item_dict): + """Return a dataframe with old LMIS consumable names replaced with the new name""" + for item in item_dict: + cond_oldname = _df.item == item_dict[item] + _df.loc[cond_oldname, 'item'] = item + return _df + + matched_consumables = replace_old_item_names_in_lmis_data(matched_consumables, inconsistent_item_names_mapping) -# 2. --- Merge data with LMIS data --- +# 5.2 --- Merge data with LMIS data --- lmis_matched_df = pd.merge(lmis, matched_consumables, how='inner', on=['item']) lmis_matched_df = lmis_matched_df.sort_values('data_source') + +def collapse_stockout_data(_df, groupby_list, var): + """Return a dataframe with rows for the same TLO model item code collapsed into 1""" + # Define column lists based on the aggregation function to be applied + columns_to_multiply = [var] + columns_to_sum = ['closing_bal', 'amc', 'dispensed', 'received'] + columns_to_preserve = ['data_source', 'consumable_reporting_freq', 'consumables_reported_in_mth'] + + # Define aggregation function to be applied to collapse data by item + def custom_agg_stkout(x): + if x.name in columns_to_multiply: + return x.prod(skipna=True) if np.any( + x.notnull() & (x >= 0)) else np.nan # this ensures that the NaNs are retained + elif x.name in columns_to_sum: + return x.sum(skipna=True) if np.any( + x.notnull() & (x >= 0)) else np.nan # this ensures that the NaNs are retained + # , i.e. not changed to 1, when the corresponding data for both item name variations are NaN, and when there + # is a 0 or positive value for one or both item name variation, the sum is taken. + elif x.name in columns_to_preserve: + return x.iloc[0] # this function extracts the first value + + # Collapse dataframe + _collapsed_df = _df.groupby(groupby_list).agg( + {col: custom_agg_stkout for col in columns_to_multiply + columns_to_sum + columns_to_preserve} + ).reset_index() + + return _collapsed_df + + # 2.i. For substitable drugs (within drug category), collapse by taking the product of stkout_prop (OR condition) # This represents Pr(all substitutes with the item code are stocked out) groupby_list1 = ['module_name', 'district', 'fac_type_tlo', 'fac_name', 'month', 'item_code', 'consumable_name_tlo', @@ -845,9 +939,10 @@ def comparison_plot_by_level(fac_type): 'available_prop', 'closing_bal', 'amc', 'dispensed', 'received', 'data_source', 'consumable_reporting_freq', 'consumables_reported_in_mth']] -# 3. ADD STOCKOUT DATA FROM OTHER SOURCES TO COMPLETE STOCKOUT DATAFRAME -######################################################################################################################## -# --- 1. Generate a dataframe of stock availability for consumables which were not found in the OpenLMIS data but +# 6. 
ADD STOCKOUT DATA FROM OTHER SOURCES TO COMPLETE STOCKOUT DATAFRAME ## +######################################################################################### + +# --- 6.1. Generate a dataframe of stock availability for consumables which were not found in the OpenLMIS data but # available in the HHFA 2018/19 --- # # Save the list of items for which a match was not found in the OpenLMIS data unmatched_consumables = consumables_df.drop_duplicates(['item_code']) @@ -923,13 +1018,13 @@ def comparison_plot_by_level(fac_type): ('available_prop_hhfa', 'available_prop')] change_colnames(unmatched_consumables_df, NameChangeList) -# --- 2. Append OpenLMIS stockout dataframe with HHFA stockout dataframe and Extract in .csv format --- # +# --- 6.2 Append OpenLMIS stockout dataframe with HHFA stockout dataframe and Extract in .csv format --- # # Append common consumables stockout dataframe with the main dataframe cond = unmatched_consumables_df['available_prop'].notna() unmatched_consumables_df.loc[~cond, 'data_source'] = 'Not available' stkout_df = pd.concat([stkout_df, unmatched_consumables_df], axis=0, ignore_index=True) -# --- 3. Append stockout rate for facility level 0 from HHFA --- # +# --- 6.3 Append stockout rate for facility level 0 from HHFA --- # cond = hhfa_df['item_code'].notna() hhfa_fac0 = hhfa_df[cond][ ['item_code', 'consumable_name_tlo', 'fac_count_Facility_level_0', 'available_prop_hhfa_Facility_level_0']] @@ -946,7 +1041,47 @@ def comparison_plot_by_level(fac_type): stkout_df = stkout_df[~cond] stkout_df = pd.concat([stkout_df, hhfa_fac0], axis=0, ignore_index=True) -# --- 4. Generate new category variable for analysis --- # +# --- 6.4 Generate new category variable for analysis --- # +def recategorize_modules_into_consumable_categories(_df): + _df['item_category'] = _df['module_name'].str.lower() + cond_RH = (_df['item_category'].str.contains('care_of_women_during_pregnancy')) | \ + (_df['item_category'].str.contains('labour')) + cond_newborn = (_df['item_category'].str.contains('newborn')) + cond_newborn[cond_newborn.isna()] = False + cond_childhood = (_df['item_category'] == 'acute lower respiratory infections') | \ + (_df['item_category'] == 'measles') | \ + (_df['item_category'] == 'diarrhoea') + cond_rti = _df['item_category'] == 'road traffic injuries' + cond_cancer = _df['item_category'].str.contains('cancer') + cond_cancer[cond_cancer.isna()] = False + cond_ncds = (_df['item_category'] == 'epilepsy') | \ + (_df['item_category'] == 'depression') + _df.loc[cond_RH, 'item_category'] = 'reproductive_health' + _df.loc[cond_cancer, 'item_category'] = 'cancer' + _df.loc[cond_newborn, 'item_category'] = 'neonatal_health' + _df.loc[cond_childhood, 'item_category'] = 'other_childhood_illnesses' + _df.loc[cond_rti, 'item_category'] = 'road_traffic_injuries' + _df.loc[cond_ncds, 'item_category'] = 'ncds' + cond_condom = _df['item_code'] == 2 + _df.loc[cond_condom, 'item_category'] = 'contraception' + + # Create a general consumables category + general_cons_list = [300, 33, 57, 58, 141, 5, 6, 10, 21, 23, 127, 24, 80, 93, 144, 149, 154, 40, 67, 73, 76, + 82, 101, 103, 88, 126, 135, 71, 98, 171, 133, 134, 244, 247, 49, 112, 1933, 1960] + cond_general = _df['item_code'].isin(general_cons_list) + _df.loc[cond_general, 'item_category'] = 'general' + + # Fill gaps in categories + dict_for_missing_categories = {292: 'acute lower respiratory infections', 293: 'acute lower respiratory infections', + 307: 'reproductive_health', 2019: 'reproductive_health', + 2678: 'tb', 1171: 
'other_childhood_illnesses', 1237: 'cancer', 1239: 'cancer'} + # Use map to create a new series from item_code to fill missing values in category + mapped_categories = _df['item_code'].map(dict_for_missing_categories) + # Use fillna on the 'item_category' column to fill missing values using the mapped_categories + _df['item_category'] = _df['item_category'].fillna(mapped_categories) + + return _df + stkout_df = recategorize_modules_into_consumable_categories(stkout_df) item_code_category_mapping = stkout_df[['item_category', 'item_code']].drop_duplicates() @@ -956,12 +1091,12 @@ def comparison_plot_by_level(fac_type): item_designations = item_designations.merge(item_code_category_mapping, left_on = 'Item_Code', right_on = 'item_code', how = 'left', validate = '1:1') item_designations.drop(columns = 'item_code').to_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv', index = False) -# --- 5. Replace district/fac_name/month entries where missing --- # +# --- 6.5 Replace district/fac_name/month entries where missing --- # for var in ['district', 'fac_name', 'month']: cond = stkout_df[var].isna() stkout_df.loc[cond, var] = 'Aggregate' -# --- 6. Export final stockout dataframe --- # +# --- 6.6 Export final stockout dataframe --- # # stkout_df.to_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_and_usage.csv") # <-- this line commented out as the file is very large. @@ -971,6 +1106,26 @@ def comparison_plot_by_level(fac_type): lmis_consumable_usage = stkout_df.copy() # TODO Generate a smaller version of this file # Collapse individual facilities +def get_inflow_to_outflow_ratio_by_item_and_facilitylevel(_df): + df_by_item_level_month = \ + _df.groupby(['item_category', 'item_code', 'district', 'fac_type_tlo', 'month'])[ + ['closing_bal', 'dispensed', 'received']].sum() + df_by_item_level_month = df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') != "Aggregate"] + # Opening balance in January is the closing balance for the month minus what was received during the month plus what was dispensed + opening_bal_january = df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'January', 'closing_bal'] + \ + df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'January', 'dispensed'] - \ + df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'January', 'received'] + closing_bal_december = df_by_item_level_month.loc[df_by_item_level_month.index.get_level_values('month') == 'December', 'closing_bal'] + # the consumable inflow during the year is the opening balance in January + what was received throughout the year - what was transferred to the next year (i.e. 
closing bal of December) + total_consumables_inflow_during_the_year = df_by_item_level_month['received'].groupby(level=['item_category', 'item_code', 'district', 'fac_type_tlo']).sum() +\ + opening_bal_january.reset_index(level='month', drop=True) -\ + closing_bal_december.reset_index(level='month', drop=True) + total_consumables_outflow_during_the_year = df_by_item_level_month['dispensed'].groupby(level=['item_category', 'item_code', 'district', 'fac_type_tlo']).sum() + inflow_to_outflow_ratio = total_consumables_inflow_during_the_year.div(total_consumables_outflow_during_the_year, fill_value=1) + inflow_to_outflow_ratio.loc[inflow_to_outflow_ratio < 1] = 1 # Ratio can't be less than 1 + + return inflow_to_outflow_ratio + inflow_to_outflow_ratio = get_inflow_to_outflow_ratio_by_item_and_facilitylevel(lmis_consumable_usage) # Clean values for analysis inflow_to_outflow_ratio.loc[inflow_to_outflow_ratio < 1] = 1 # Ratio can't be less than 1 @@ -992,6 +1147,7 @@ def comparison_plot_by_level(fac_type): # the Master Facilities List. # unify the set within each facility_id + mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District']) fac_levels = {'0', '1a', '1b', '2', '3', '4'} @@ -1017,8 +1173,7 @@ def comparison_plot_by_level(fac_type): # Take averages (now that 'Mzimba' is mapped-to by both 'Mzimba South' and 'Mzimba North'.) sf = sf.groupby(by=['district_std', 'fac_type_tlo', 'month', 'item_code'])['available_prop'].mean().reset_index() -# 4. INTERPOLATE MISSING DATA TO ENSURE DATA IS AVAILABLE FOR ALL ITEMS, MONTHS, LEVELS, DISTRICTS -######################################################################################################################## +# Fill in missing data: # 1) Cities to get same results as their respective regions copy_source_to_destination = { 'Mzimba': 'Mzuzu City', @@ -1118,10 +1273,26 @@ def comparison_plot_by_level(fac_type): full_set = full_set.combine_first(sf_final.set_index(['Facility_ID', 'month', 'item_code'])['available_prop']) # Fill in the blanks with rules for interpolation. + facilities_by_level = defaultdict(set) for ix, row in mfl.iterrows(): facilities_by_level[row['Facility_Level']].add(row['Facility_ID']) + +def get_other_facilities_of_same_level(_fac_id): + """Return a set of facility_id for other facilities that are of the same level as that provided.""" + for v in facilities_by_level.values(): + if _fac_id in v: + return v - {_fac_id} + + +def interpolate_missing_with_mean(_ser): + """Return a series in which any values that are null are replaced with the mean of the non-missing.""" + if pd.isnull(_ser).all(): + raise ValueError + return _ser.fillna(_ser.mean()) + + # Create new dataset that include the interpolations (The operation is not done "in place", because the logic is based # on what results are missing before the interpolations in other facilities). 
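+# (Illustrative behaviour of the helpers above, assuming a 3-element series:
+#  interpolate_missing_with_mean(pd.Series([0.2, np.nan, 0.4])) -> [0.2, 0.3, 0.4];
+#  an all-NaN series raises ValueError, forcing the fallback rules below to handle it.)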
full_set_interpolated = full_set * np.nan @@ -1168,11 +1339,22 @@ def comparison_plot_by_level(fac_type): full_set_interpolated = full_set_interpolated.reset_index() #full_set_interpolated = full_set_interpolated.reset_index().merge(item_code_category_mapping, on = 'item_code', how = 'left', validate = 'm:1') +full_set_interpolated.to_csv( + path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv", + index=False +) # Save as .csv - this file is then read to apply the following functions - generate_alternative_availability_scenarios +# & generate_redistribution_scenarios # 5. ADD ALTERNATIVE AVAILABILITY SCENARIOS ######################################################################################################################## # Add alternative availability scenarios to represent improved or reduce consumable availability full_set_interpolated_with_scenarios = generate_alternative_availability_scenarios(full_set_interpolated) +max_scenario = get_max_scenario_number(full_set_interpolated_with_scenarios) # Get current scenario count +full_set_interpolated_with_scenarios = generate_redistribution_scenarios(full_set_interpolated_with_scenarios, + scenario_count = max_scenario, + outputfilepath = Path("./outputs/consumables_impact_analysis")) +available_cols = [c for c in full_set_interpolated_with_scenarios.columns if c.startswith("available_prop")] +full_set_interpolated_with_scenarios = full_set_interpolated_with_scenarios[['Facility_ID', 'month', 'item_code'] + available_cols] full_set_interpolated_with_scenarios_level1b_fixed = update_level1b_availability( availability_df=full_set_interpolated_with_scenarios, @@ -1182,8 +1364,13 @@ def comparison_plot_by_level(fac_type): weighting = 'district_1b_to_2_ratio', ) +# Verify that the shape of this dataframe is identical to the original availability dataframe +assert sorted(set(full_set_interpolated_with_scenarios_level1b_fixed.Facility_ID)) == sorted(set(pd.unique(full_set_interpolated.Facility_ID))) +assert sorted(set(full_set_interpolated_with_scenarios_level1b_fixed.month)) == sorted(set(pd.unique(full_set_interpolated.month))) +assert sorted(set(full_set_interpolated_with_scenarios_level1b_fixed.item_code)) == sorted(set(pd.unique(full_set_interpolated.item_code))) +assert len(full_set_interpolated_with_scenarios_level1b_fixed) == len(full_set_interpolated.item_code) + # Compare availability averages by Facility_Level before and after the 1b fix -available_cols = [c for c in full_set_interpolated_with_scenarios.columns if c.startswith("available_prop")] level1b_fix_plots_path = outputfilepath / 'comparison_plots' figurespath_scenarios = outputfilepath / 'consumable_scenarios' if not os.path.exists(level1b_fix_plots_path): @@ -1206,9 +1393,16 @@ def comparison_plot_by_level(fac_type): index=False ) +# Save legacy availability resourcefile before level 1b-2 fix +check_format_of_consumables_file(df=full_set_interpolated_with_scenarios, fac_ids=fac_ids) +full_set_interpolated_with_scenarios.to_csv( + path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small_original.csv", + index=False +) + # %% -# 7. COMPARISON WITH HHFA DATA, 2018/19 -######################################################################################################################## +# 7. 
COMPARISON WITH HHFA DATA, 2018/19 ## +######################################################################################### # --- 7.1 Prepare comparison dataframe --- ## # Note that this only plot consumables for which data is available in the HHFA # i. Prepare data from HHFA @@ -1223,7 +1417,7 @@ def comparison_plot_by_level(fac_type): hhfa_comparison_df = hhfa_comparison_df.rename({'fac_type_tlo': 'Facility_Level'}, axis=1) # ii. Collapse final model availability data by facility level -final_availability_df = full_set_interpolated_with_scenarios_level1b_fixed +final_availability_df = full_set_interpolated mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv") final_availability_df = pd.merge(final_availability_df, mfl[['District', 'Facility_Level', 'Facility_ID']], how="left", on=['Facility_ID'], @@ -1245,7 +1439,39 @@ def comparison_plot_by_level(fac_type): size = 10 comparison_df['consumable_labels'] = comparison_df['consumable_name_tlo'].str[:10] +# Define function to draw calibration plots at different levels of disaggregation +def comparison_plot(level_of_disaggregation, group_by_var, colour): + comparison_df_agg = comparison_df.groupby([group_by_var], + as_index=False).agg({'available_prop': 'mean', + 'available_prop_hhfa': 'mean', + 'Facility_Level': 'first', + 'consumable_labels': 'first'}) + comparison_df_agg['labels'] = comparison_df_agg[level_of_disaggregation] + + ax = comparison_df_agg.plot.scatter('available_prop', 'available_prop_hhfa', c=colour) + ax.axline([0, 0], [1, 1]) + for i, label in enumerate(comparison_df_agg['labels']): + plt.annotate(label, + (comparison_df_agg['available_prop'][i] + 0.005, + comparison_df_agg['available_prop_hhfa'][i] + 0.005), + fontsize=6, rotation=38) + if level_of_disaggregation != 'aggregate': + plt.title('Disaggregated by ' + level_of_disaggregation, fontsize=size, weight="bold") + else: + plt.title('Aggregate', fontsize=size, weight="bold") + plt.xlabel('Pr(drug available) as per TLO model') + plt.ylabel('Pr(drug available) as per HHFA') + save_name = 'comparison_plots/calibration_to_hhfa_' + level_of_disaggregation + '.png' + plt.savefig(outputfilepath / save_name) + + # 7.2.1 Aggregate plot +# First create folder in which to store the plots + +if not os.path.exists(outputfilepath / 'comparison_plots'): + os.makedirs(outputfilepath / 'comparison_plots') + print("folder to store Model-HHFA comparison plots created") + comparison_df['aggregate'] = 'aggregate' level_of_disaggregation = 'aggregate' colour = 'red' @@ -1264,7 +1490,23 @@ def comparison_plot_by_level(fac_type): colour = 'yellow' comparison_plot(level_of_disaggregation, group_by_var, colour) + # 7.2.4 Plot by item and facility level +def comparison_plot_by_level(fac_type): + cond_fac_type = comparison_df['Facility_Level'] == fac_type + comparison_df_by_level = comparison_df[cond_fac_type].reset_index() + plt.scatter(comparison_df_by_level['available_prop'], + comparison_df_by_level['available_prop_hhfa']) + plt.axline([0, 0], [1, 1]) + for i, label in enumerate(comparison_df_by_level['consumable_labels']): + plt.annotate(label, (comparison_df_by_level['available_prop'][i] + 0.005, + comparison_df_by_level['available_prop_hhfa'][i] + 0.005), + fontsize=6, rotation=27) + plt.title(fac_type, fontsize=size, weight="bold") + plt.xlabel('Pr(drug available) as per TLO model') + plt.ylabel('Pr(drug available) as per HHFA') + + fig = plt.figure(figsize=(22, 22)) plt.subplot(421) 
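# (The 4x2 grid below draws one model-vs-HHFA calibration panel per facility level.)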
comparison_plot_by_level(comparison_df['Facility_Level'].unique()[1]) @@ -1291,7 +1533,10 @@ def comparison_plot_by_level(fac_type): 'available_prop_scenario9': 'Best facility \n (including DHO)','available_prop_scenario10': 'HIV supply \n chain', 'available_prop_scenario11': 'EPI supply \n chain', 'available_prop_scenario12': 'HIV moved to \n Govt supply chain \n (Avg by Level)', 'available_prop_scenario13': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID)', 'available_prop_scenario14': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 1.25)', - 'available_prop_scenario15': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 0.75)'} + 'available_prop_scenario15': 'HIV moved to \n Govt supply chain \n (Avg by Facility_ID times 0.75)', + 'available_prop_scenario16': 'Redistribution: District pooling', 'available_prop_scenario17': 'Redistribution: Cluster pooling', + 'available_prop_scenario18': 'Redistribution: Pairwise (large radius)', 'available_prop_scenario19': 'Redistribution: Pairwise (small radius)', +} # Generate descriptive plots of consumable availability program_item_mapping = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']] @@ -1300,7 +1545,7 @@ def comparison_plot_by_level(fac_type): figurespath = figurespath_scenarios, mfl = mfl, program_item_mapping = program_item_mapping, - chosen_availability_columns = None, + chosen_availability_columns = chosen_availability_columns, scenario_names_dict = scenario_names_dict,) diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py new file mode 100644 index 0000000000..c544a3d0e1 --- /dev/null +++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/create_consumable_redistribution_scenarios.py @@ -0,0 +1,2184 @@ +import datetime +from pathlib import Path +import pickle +import calendar + +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np +import pandas as pd +import time +import matplotlib.patches as mpatches +import matplotlib.lines as mlines + +from typing import Literal, Optional, Dict, Tuple, Iterable +import textwrap +from functools import reduce +import requests +from collections import defaultdict + +from pulp import LpProblem, LpMaximize, LpVariable, LpBinary, LpStatus, value, lpSum, LpContinuous, PULP_CBC_CMD +from math import ceil + +from scripts.costing.cost_estimation import clean_consumable_name + +# define a timestamp for script outputs +timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M") + +# print the start time of the script +print('Script Start', datetime.datetime.now().strftime('%H:%M')) + +# define folder pathways +outputfilepath = Path("./outputs/consumables_impact_analysis") +resourcefilepath = Path("./resources") +path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables" +# Set local shared drive source +path_to_share = Path( # <-- point to the shared folder + '/Users/sm2511/CloudStorage/OneDrive-SharedLibraries-ImperialCollegeLondon/TLOModel - WP - Documents/' +) + +def generate_redistribution_scenarios(tlo_availability_df: pd.DataFrame, + scenario_count: int, + outputfilepath: Path = Path("./outputs/consumables_impact_analysis")) -> pd.DataFrame: + # 1. 
Import and clean data files + #********************************** + # Import Cleaned OpenLMIS data from 2018 + lmis = (pd.read_csv(outputfilepath / "ResourceFile_Consumables_availability_and_usage.csv") + [['district', 'fac_type_tlo', 'fac_name', 'month', 'item_code', 'available_prop', + 'closing_bal', 'amc', 'dispensed', 'received']]) + + # Drop duplicated facility, item, month combinations + print(lmis.shape, "rows before collapsing duplicates") + key_cols = ["district", "item_code", "fac_name", "month"] # keys that define a unique record + + # helper to keep one facility level per group (mode → most common; fallback to first non-null) + def _mode_or_first(s: pd.Series): + s = s.dropna() + if s.empty: + return np.nan + m = s.mode() + return m.iloc[0] if not m.empty else s.iloc[0] + + lmis = ( + lmis + .groupby(key_cols, as_index=False) + .agg( + closing_bal=("closing_bal", "sum"), + dispensed=("dispensed", "sum"), + received=("received", "sum"), + amc=("amc", "sum"), + available_prop=("available_prop", "mean"), + fac_type_tlo=("fac_type_tlo", _mode_or_first), # optional; remove if not needed + ) + ) + + print(lmis.shape, "rows after collapsing duplicates") + + # Import data on facility location + location = (pd.read_excel(path_to_share / "07 - Data/Facility_GPS_Coordinates/gis_data_for_openlmis/LMISFacilityLocations_raw.xlsx") + [['LMIS Facility List', 'LATITUDE', 'LONGITUDE']]) + # Find duplicates in facilty names in the location dataset + duplicates = location[location['LMIS Facility List'].duplicated(keep=False)] + location = location.drop(duplicates[duplicates['LATITUDE'].isna()].index).reset_index(drop=True) # Drop those duplicates where location is missing + # Import ownership data + ownership = (pd.read_csv(path_to_share / "07 - Data/Consumables data/OpenLMIS/lmis_facility_ownership.csv"))[['fac_name', 'fac_owner']] + ownership = ownership.drop_duplicates(subset=['fac_name']) + + # Merge OpenLMIS and location and ownership data + lmis = lmis.merge(location, left_on='fac_name', right_on = 'LMIS Facility List', how = 'left', validate='m:1') + lmis = lmis.merge(ownership, on='fac_name', how = 'left', validate='m:1') + lmis.rename(columns = {'LATITUDE':'lat', 'LONGITUDE':'long', 'fac_type_tlo': 'Facility_Level'}, inplace = True) + + # Cleaning to match date to the same format as consumable availability RF in TLO model + month_map = { + "January": 1, "February": 2, "March": 3, "April": 4, + "May": 5, "June": 6, "July": 7, "August": 8, + "September": 9, "October": 10, "November": 11, "December": 12 + } + lmis["month"] = lmis["month"].map(month_map) + lmis["Facility_Level"] = lmis["Facility_Level"].str.replace("Facility_level_", "", regex=False) + + # Clean data types before analysis + # 1) Normalize fac_name + lmis["fac_name"] = ( + lmis["fac_name"] + .astype("string") # Pandas string dtype (not just object) + .str.normalize("NFKC") # unify unicode forms + .str.strip() # trim leading/trailing spaces + .str.replace(r"\s+", "_", regex=True) # collapse internal whitespace + ) + + # 2) Normalize other key columns used in grouping/joins + lmis["item_code"] = lmis["item_code"].astype("string").str.strip() + lmis["district"] = lmis["district"].astype("string").str.strip().str.replace(r"\s+", "_", regex=True) + lmis["Facility_Level"] = lmis["Facility_Level"].astype("string").str.strip() + + # 3) Ensure numeric types (quietly coerce bad strings to NaN) + lmis["amc"] = pd.to_numeric(lmis["amc"], errors="coerce") + lmis["closing_bal"] = pd.to_numeric(lmis["closing_bal"], errors="coerce") + + # 
Keep only those facilities whose location is available + old_facility_count = lmis.fac_name.nunique() + lmis = lmis[lmis.lat.notna()] + new_facility_count = lmis.fac_name.nunique() + print(f"{old_facility_count - new_facility_count} facilities out of {old_facility_count} in the lmis data dropped due to " + f"missing location information") + + # Explore missingness + def compute_opening_balance(df: pd.DataFrame) -> pd.Series: + """ + Compute opening balance from same-month records. + + Formula: + OB = closing_bal - received + dispensed + Any negative OB values are replaced with 0. + Equivalent to: OB_(m) = CB_(m-1) + """ + ob = df["closing_bal"] - df["received"] + df["dispensed"] + return ob.clip(lower=0) + + # 1. Compute opening balance + lmis["opening_bal"] = compute_opening_balance(lmis).replace([np.inf, -np.inf], np.nan).fillna(0.0) + # Mechanistic probability (p_mech = OB / AMC) + amc_safe = np.maximum(1e-6, lmis["amc"].astype(float)) + lmis["p_mech"] = np.clip(lmis["opening_bal"] / amc_safe, 0.0, 1.0) + # Identify inconsistent rows (where reported p > mechanistic p) + mask_inconsistent = lmis["p_mech"] < lmis["available_prop"] + # Adjust opening balance upward to match reported availability + lmis.loc[mask_inconsistent, "opening_bal"] = ( + lmis.loc[mask_inconsistent, "available_prop"] * lmis.loc[mask_inconsistent, "amc"] + ) + print(f"Adjusted {mask_inconsistent.sum():,} rows " + f"({mask_inconsistent.mean()*100:.2f}%) where recorded availability " + f"exceeded mechanistic availability.") + + lmis.reset_index(inplace=True, drop = True) + + # ---------------------------------------------------------------------------------------------------------------------- + # 1) Data exploration + # ---------------------------------------------------------------------------------------------------------------------- + def generate_stock_adequacy_heatmap( + df: pd.DataFrame, + figures_path: Path = Path("figures"), + filename: str = "heatmap_adequacy_opening_vs_3xamc.png", + y_var: str = "district", # the variable on the y-axis of the heatmap + value_var: str = "item_code", # the count variable on the basis of which the values are calculated + value_label: str = "", # label describing the values in the heatmap + include_missing_as_fail: bool = False, # if True, items with NaN opening/amc count as NOT adequate + amc_threshold: float = 3.0, + compare: str = "ge" , # "ge" for >= threshold*AMC, "le" for <= threshold*AMC + decimals: int = 0, + cmap: str = "RdYlGn", + figsize= None, + xtick_rotation: int = 45, + ytick_rotation: int = 0, + annotation: bool = True, + footnote: str = None, + ): + """ + Heatmap values: for each (month, district), the % of item_code groups where + sum(opening_balance over Facility_ID) >= 3 * sum(amc over Facility_ID). + """ + df = df.copy() + + # --- 1. Ensure month is int and build label --- + df["month"] = pd.to_numeric(df["month"], errors="coerce").astype("Int64") + df = df.dropna(subset=["month"]) + df["month"] = df["month"].astype(int) + + df["_month_label"] = df["month"].map(lambda m: calendar.month_abbr[m]) + + # ---- 2) Aggregate to (month, district, item_code) over facilities ---- + agg = ( + df.groupby(["month", "_month_label", y_var, value_var], dropna=False) + .agg(opening_bal=("opening_bal", "sum"), + amc=("amc", "sum")) + .reset_index() + ) + + # Keep: + # - 1. all rows where amc != 0 + # - 2. rows where the (fac_name, item_code) pair never had any non-zero amc + # (because this would indicate that their AMC may in fact be zero) + # - 3. 
rows where both Opening balance and AMC are not zero
+        agg = agg[(agg["amc"] != 0)]
+        # (This second filter is redundant once amc == 0 rows are dropped, but is kept as a guard.)
+        agg = agg[~((agg["amc"] == 0) & (agg["opening_bal"] == 0))]
+
+        # ---- 3) Adequacy indicator per (month, district, item_code) ----
+        if include_missing_as_fail:
+            # NaNs treated as fail -> NaN-safe compare: set False when either value is missing
+            ok = agg[["opening_bal", "amc"]].notna().all(axis=1)
+            left = agg["opening_bal"]
+            right = amc_threshold * agg["amc"]
+            if compare == "le":
+                cond = (left <= right) & ok
+            else:  # default to ">="
+                cond = (left >= right) & ok
+        else:
+            valid = agg.dropna(subset=["opening_bal", "amc"])
+            cond = pd.Series(False, index=agg.index)
+            left = valid["opening_bal"]
+            right = amc_threshold * valid["amc"]
+            if compare == "le":
+                cond.loc[valid.index] = left <= right
+            else:
+                cond.loc[valid.index] = left >= right
+
+        agg["condition_met"] = cond.astype(int)
+
+        # ---- 4) % meeting condition per (month, district) across item_code ----
+        if include_missing_as_fail:
+            denom = agg.groupby(["month", "_month_label", y_var])[value_var].nunique()
+            numer = agg.groupby(["month", "_month_label", y_var])["condition_met"].sum()
+        else:
+            valid_mask = agg[["opening_bal", "amc"]].notna().all(axis=1)
+            denom = agg[valid_mask].groupby(["month", "_month_label", y_var])[value_var].nunique()
+            numer = agg[valid_mask].groupby(["month", "_month_label", y_var])["condition_met"].sum()
+
+        pct = (numer / denom * 100).replace([np.inf, -np.inf], np.nan).reset_index(name="pct_meeting")
+
+        # ---- 5) Pivot: districts (rows) x months (columns) ----
+        # Sort months chronologically and use _month_label as the displayed column name
+        month_order = (
+            pct[["month", "_month_label"]]
+            .drop_duplicates()
+            .sort_values("month")
+            ["_month_label"]
+            .tolist()
+        )
+        heatmap_df = pct.pivot(index=y_var, columns="_month_label", values="pct_meeting")
+        heatmap_df = heatmap_df.reindex(columns=month_order)
+
+        # --- Add average row and column ---
+        # Column average (mean of each month)
+        heatmap_df.loc["Average"] = heatmap_df.mean(axis=0)
+        # Row average (mean of each y_var); the bottom-right cell thereby becomes the
+        # overall average (the mean of the monthly column means)
+        heatmap_df["Average"] = heatmap_df.mean(axis=1)
+
+        # Optional rounding for nicer colorbar ticks (doesn't affect color)
+        if decimals is not None:
+            heatmap_df = heatmap_df.round(decimals)
+
+        # --- Dynamic figure size ---
+        if figsize is None:
+            n_rows = len(heatmap_df)
+            n_cols = len(heatmap_df.columns)
+            height = max(6, n_rows * 0.2)  # taller if many rows
+            width = max(8, n_cols * 0.6)
+            figsize = (width, height)
+
+        # ---- 6) Plot heatmap ----
+        sns.set(font_scale=1.0)
+        fig, ax = plt.subplots(figsize=figsize)
+
+        sns.heatmap(
+            heatmap_df,
+            cmap=cmap,
+            cbar_kws={"label": value_label},
+            ax=ax,
+            annot=annotation, annot_kws={"size": 10},
+            vmin=0, vmax=100)
+
+        ax.set_xlabel("Month")
+        ax.set_ylabel(f"{y_var}")
+        ax.set_xticklabels(ax.get_xticklabels(), rotation=xtick_rotation)
+        ax.set_yticklabels(ax.get_yticklabels(), rotation=ytick_rotation)
+
+        # Keep colorbar ticks plain (no scientific notation)
+        try:
+            cbar_ax = ax.figure.axes[-1]
+            cbar_ax.ticklabel_format(style="plain")
+        except Exception:
+            pass
+
+        # ---- Footnote ----
+        if footnote is not None:
+            fig.subplots_adjust(bottom=0.08)
+            fig.text(
+                0.5, 0.035,
+                footnote,
+                ha="center",
+                va="top",
+                fontsize=10
+            )
+
+        # ---- 7) Save & return ----
+        figures_path.mkdir(parents=True, exist_ok=True)
+        outpath 
= figures_path / filename + plt.savefig(outpath, dpi=300, bbox_inches="tight") + plt.close(fig) + + return fig, ax, heatmap_df + + # ---------------------------------------------------------------------------------------------------------------------- + # 2) Estimate travel time matrix + # ---------------------------------------------------------------------------------------------------------------------- + def _chunk_indices(n: int, chunk: int): + """Yield (start, end) index pairs for chunking 0..n-1.""" + for s in range(0, n, chunk): + e = min(n, s + chunk) + yield s, e + + def build_travel_time_matrix( + fac_df: pd.DataFrame, + *, + id_col: str = "fac_name", + lat_col: str = "lat", + lon_col: str = "long", + mode: Literal["car", "bicycle"] = "car", + backend: Literal["ors", "osrm"] = "ors", + # ORS options + ors_api_key: Optional[str] = None, + ors_base_url: str = "https://api.openrouteservice.org/v2/matrix", + # OSRM options (self-hosted or public; note: public often has only 'car') + osrm_base_url: str = "https://router.project-osrm.org", + osrm_profile_map: dict = None, + # matrix request chunking (keeps requests within API limits) + max_chunk: int = 40, + timeout: int = 60, + ) -> pd.DataFrame: + """ + Build an NxN *road* travel-time matrix (minutes) for facilities, by CAR or BICYCLE + + backends: + - 'ors' -> uses OpenRouteService Matrix API (profiles: driving-car, cycling-regular). + Requires ors_api_key. Has rate/size limits; we auto-chunk. + - 'osrm' -> uses OSRM 'table' service (profiles typically 'car' or 'bike'). + For bicycle, you'll likely need a self-hosted OSRM with the bicycle profile. + + Parameters + ---------- + mode : 'car' | 'bicycle' + Travel mode on roads. + max_chunk : int + Max #origins (and #destinations) per sub-matrix request to keep within API limits. + + Returns + ------- + pd.DataFrame + Square DataFrame (minutes), index/columns = facility names. 
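+
+        Illustrative call (assumes a reachable OSRM server):
+            T = build_travel_time_matrix(facs, mode="car", backend="osrm")
+            T.loc["fac_a", "fac_b"]  # minutes by road; inf where unroutable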
+ """ + facs = fac_df[[id_col, lat_col, lon_col]].dropna().drop_duplicates().reset_index(drop=True) + ids = facs[id_col].tolist() + lats = facs[lat_col].to_numpy() + lons = facs[lon_col].to_numpy() + n = len(ids) + + T = pd.DataFrame(np.full((n, n), np.nan, dtype=float), index=ids, columns=ids) + np.fill_diagonal(T.values, 0.0) + + if n == 0: + return T + + if backend == "ors": + if ors_api_key is None: + raise ValueError("OpenRouteService requires ors_api_key.") + profile = "driving-car" if mode == "car" else "cycling-regular" + + # ORS expects [lon, lat] + coords = [[float(lons[i]), float(lats[i])] for i in range(n)] + + headers = {"Authorization": ors_api_key, "Content-Type": "application/json"} + + # Chunk both sources and destinations to stay under limits + for si, sj in _chunk_indices(n, max_chunk): + for di, dj in _chunk_indices(n, max_chunk): + sources = list(range(si, sj)) + destinations = list(range(di, dj)) + + body = { + "locations": coords, + "sources": sources, + "destinations": destinations, + "metrics": ["duration"], + } + url = f"{ors_base_url}/{profile}" + r = requests.post(url, json=body, headers=headers, timeout=timeout) + r.raise_for_status() + data = r.json() + + # durations in seconds; fill submatrix + durs = data.get("durations") + if durs is None: + raise RuntimeError(f"ORS returned no durations for block {si}:{sj} x {di}:{dj}") + sub = (np.array(durs, dtype=float) / 60.0) # minutes + T.iloc[si:sj, di:dj] = sub + + elif backend == "osrm": + # Map desired mode to OSRM profile names + if osrm_profile_map is None: + osrm_profile_map = {"car": "car", "bicycle": "bike"} + profile = osrm_profile_map.get(mode) + if profile is None: + raise ValueError(f"No OSRM profile mapped for mode='{mode}'.") + + # NOTE: The public OSRM demo often supports *car only*. + # For bicycle, run your own OSRM with the bicycle profile. + # We use the OSRM 'table' service; POST with 'sources' and 'destinations' indices. + + coords = ";".join([f"{lons[i]},{lats[i]}" for i in range(n)]) # lon,lat + + for si, sj in _chunk_indices(n, max_chunk): + for di, dj in _chunk_indices(n, max_chunk): + sources = ";".join(map(str, range(si, sj))) + destinations = ";".join(map(str, range(di, dj))) + + url = ( + f"{osrm_base_url}/table/v1/{profile}/{coords}" + f"?sources={sources}&destinations={destinations}&annotations=duration" + ) + r = requests.get(url, timeout=timeout) + r.raise_for_status() + data = r.json() + + durs = data.get("durations") + if durs is None: + raise RuntimeError(f"OSRM returned no durations for block {si}:{sj} x {di}:{dj}") + sub = (np.array(durs, dtype=float) / 60.0) # minutes + T.iloc[si:sj, di:dj] = sub + + else: + raise ValueError("backend must be 'ors' or 'osrm'.") + + # Safety: replace any residual NaNs (unroutable pairs) with inf or a large number + T = T.fillna(np.inf) + return T + + # Because ORS and ORSM can only handle a limited number of elements at a time, it's better to run this by district + # each of which already has under 50 facilities + def build_time_matrices_by_district( + df: pd.DataFrame, + *, + district_col: str = "district", + id_col: str = "fac_name", + lat_col: str = "lat", + lon_col: str = "long", + mode: str = "car", + backend: str = "osrm", + osrm_base_url: str = "https://router.project-osrm.org", + ors_api_key: str | None = None, + max_chunk: int = 50, # safe for both OSRM/ORS + ) -> dict[str, pd.DataFrame]: + """ + Returns a dict: {district -> square minutes matrix DataFrame}, computed within each district only. 
+ """ + matrices = {} + # unique facilities per district (drop duplicates in case of repeated rows) + fac_cols = [district_col, id_col, lat_col, lon_col] + fac_coords = df[fac_cols].dropna().drop_duplicates() + + for d, facs_d in fac_coords.groupby(district_col): + # Skip tiny groups + if len(facs_d) < 2: + continue + + T = build_travel_time_matrix( + fac_df=facs_d[[id_col, lat_col, lon_col]], + id_col=id_col, lat_col=lat_col, lon_col=lon_col, + mode=mode, backend=backend, + osrm_base_url=osrm_base_url, + ors_api_key=ors_api_key, + max_chunk=max_chunk, + ) + matrices[d] = T + + return matrices + + # ----------------------------------------------- + # 3) Data prep for redistribution linear program + # ----------------------------------------------- + def presumed_availability(ob, amc, eps=1e-9) -> float: + """ + Presumed likelihood of cons availability = p = min(1, OB/AMC) at month start (no additional receipts considered + at this point in time). + """ + return float(min(1.0, max(0.0, (ob / max(eps, amc))))) + + + def build_edges_within_radius( + time_matrix: pd.DataFrame, + max_minutes: float + ) -> Dict[str, set]: + """ + For each facility g, return set of receivers f such that T[g,f] <= max_minutes, f != g. + """ + edges = {} + for g in time_matrix.index: + feasible = set(time_matrix.columns[(time_matrix.loc[g] <= max_minutes) & (time_matrix.columns != g)]) + edges[g] = feasible + return edges + + def build_edges_within_radius_flat(T_by_dist: dict, max_minutes: float) -> dict[str, set[str]]: + """ + Takes the district-wise dictionary of time travel matrices and converts it into a flat dictionary of facilities + and their edge neighbours depending on the maximum allowable travel distance. + T_by_dist: {district -> square DataFrame of minutes (index/cols = facility IDs)} + Returns: {facility_id -> set(of facility_ids)} for all districts combined. + """ + edges: dict[str, set[str]] = {} + for _, T in T_by_dist.items(): + for g in T.index: + row = T.loc[g].to_numpy() + feasible_mask = (row <= max_minutes) & np.isfinite(row) + # Exclude self + feasible = [f for f in T.columns[feasible_mask] if f != g] + if g not in edges: + edges[g] = set() + edges[g].update(feasible) + return edges + + # a = build_edges_within_radius(T_car, max_minutes = 18) + + # Defining clusters of health facilities within district + # This function helps find the facilities which would be appropriate cluster centers + def _farthest_first_seeds(T: pd.DataFrame, k: int, big: float = 1e9) -> list: + """ + Pick k seed medoids via farthest-first traversal on a travel-time matrix. + Treat inf/NaN distances as 'big' so disconnected components get separate seeds. 
+ """ + n = T.shape[0] + facs = T.index.tolist() + D = T.to_numpy().astype(float) + D[~np.isfinite(D)] = big + + # Start at the row with largest average distance (covers sparse areas first) + start = int(np.nanargmax(np.nanmean(D, axis=1))) # the remotest facility + seeds_idx = [start] + + # Iteratively add the point with max distance to its nearest seed + for _ in range(1, k): + # min distance to any existing seed for every point + min_to_seed = np.min(D[:, seeds_idx], axis=1) # this has a dimension of [facs, number of seeds] + next_idx = int(np.argmax(min_to_seed)) # for each facility find the distance to its nearest seed + # and the facility farthest from the nearest seed becomes the next seed + if next_idx in seeds_idx: + # Fallback: pick any non-seed with highest min_to_seed + candidates = [i for i in range(n) if i not in seeds_idx] + if not candidates: + break + next_idx = int(candidates[np.argmax(min_to_seed[candidates])]) + seeds_idx.append(next_idx) + + return [facs[i] for i in seeds_idx] # list of length k representing the clustering points + + # Assign each facility to its nearest seed subject to a hard cluster capacity (≤ X members) + def _assign_to_cluster_with_fixed_capacity(T: pd.DataFrame, seeds: list, capacity: int, big: float = 1e9) -> Dict[str, int]: + """ + Greedy assignment of facilities to nearest seed that still has capacity (based on maximum cluster size). + Returns: mapping facility -> seed_index (position in seeds list). + """ + facs = T.index.tolist() + D = T.loc[facs, seeds].to_numpy().astype(float) # Distance of all facilities from the k seeds + D[~np.isfinite(D)] = big + + # each facility: nearest distance to any seed (for stable ordering) + nearest = D.min(axis=1) # find the shortest distance to a cluster for each facility + order = np.argsort(nearest) # sort all facilities in ascending order of their distance from the nearest facility + + cap_left = {j: capacity for j in range(len(seeds))} # the capacity left for each seed + assign = {} + + for idx in order: + f = facs[idx] + # try seeds in ascending distance + seq = np.argsort(D[idx, :]) # the sequence of seeds most suitable for idx + placed = False + for j in seq: + if cap_left[j] > 0: + assign[f] = j + cap_left[j] -= 1 + placed = True + break + if not placed: + # total capacity >= n, so this should be rare; if it happens, put in least-loaded seed + j = min(cap_left, key=lambda jj: cap_left[jj]) + assign[f] = j + cap_left[j] -= 1 + + return assign + + def capacity_clusters_for_district( + T_d: pd.DataFrame, cluster_size: int = 5, big: float = 1e9, refine_swaps: int = 0 + ) -> Dict[str, str]: + """ + Build ~equal-size clusters (size<=cluster_size) from a district's travel-time matrix via + capacity-constrained k-medoids (farthest-first seeds + greedy capacity assignment). + + Returns: {facility_id -> cluster_id} (cluster ids like 'C00','C01',...) + """ + facs = T_d.index.tolist() + n = len(facs) + if n == 0: + return {} + if n <= cluster_size: + return {f: "C00" for f in facs} + + k = ceil(n / cluster_size) + seeds = _farthest_first_seeds(T_d, k=k, big=big) + assign_seed_idx = _assign_to_cluster_with_fixed_capacity(T_d, seeds=seeds, capacity=cluster_size, big=big) + + # Optional tiny refinement: (off by default) + # You could add 1–2 passes of medoid swap within clusters to reduce intra-cluster travel. 
+ + # Build cluster ids in seed order + seed_to_cid = {j: f"C{j:02d}" for j in range(len(seeds))} + return {f: seed_to_cid[assign_seed_idx[f]] for f in facs} + + def build_capacity_clusters_all( + T_by_dist: Dict[str, pd.DataFrame], cluster_size: int = 5 + ) -> pd.Series: + """ + Apply capacity clustering to all districts. + Args: + T_by_dist: {'DistrictName': time_matrix (minutes, square DF with facility ids)} + cluster_size: desired max cluster size (e.g., 5) + + Returns: + pd.Series mapping facility_id -> cluster_id names scoped by district, e.g. 'Nkhotakota#C03' + """ + mappings = [] + for d, T_d in T_by_dist.items(): + if T_d is None or T_d.empty: + continue + local_map = capacity_clusters_for_district(T_d, cluster_size=cluster_size) + if not local_map: + continue + s = pd.Series(local_map, name="cluster_id") + s = s.map(lambda cid: f"{d}#{cid}") # scope cluster name by district + mappings.append(s) + if not mappings: + return pd.Series(dtype=object) + return pd.concat(mappings) + + # ----------------------------------------------- + # 3) LP/MILP Redistribution + # ----------------------------------------------- + def redistribute_radius_lp( + df: pd.DataFrame, + time_matrix: Dict[str, pd.DataFrame] | pd.DataFrame, + radius_minutes: float, + # policy knobs + tau_keep: float = 1.0, # donors must keep ≥ tau_keep * AMC + tau_tar: float = 1.0, # receivers target OB = AMC + K_in: int = 1, # per-item: max donors per receiver + K_out: int = 10, # per-item: max receivers per donor + Qmin_proportion: float = 0.25, # min lot as a fraction of receiver AMC (e.g., 0.25 ≈ 7–8 days) + eligible_levels: Iterable[str] = ("1a","1b"), + # schema + id_cols=("district","month","item_code"), + facility_col="fac_name", + level_col="Facility_Level", + amc_col="amc", + # outputs/behaviour + return_edge_log: bool = True, + floor_to_baseline: bool = True, # if True, never let reported availability drop below baseline + # numerics + amc_eps: float = 1e-6, + eps: float = 1e-9, + ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: + """ + Pairwise (radius) redistribution with per-item degree caps. + + MILP per (district, month, item): + variables: t[g,f] ≥ 0 (transfer), y[g,f] ∈ {0,1} (edge activation) + objective: maximize Σ_p + key constraints: + - donors keep ≥ tau_keep * AMC + - receivers (only eligible levels) limited to deficit: tau_tar*AMC - OB + - travel-time ≤ radius + - min lot: t[g,f] ≥ Qmin * y[g,f], with Qmin = Qmin_proportion * AMC_receiver + - edge capacity: t[g,f] ≤ min(surplus_g, deficit_f) * y[g,f] + - degree caps per item: inbound ≤ K_in, outbound ≤ K_out + Availability is recomputed mechanistically and written back **only** where a transfer occurred. 
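+
+        Example
+        -------
+        Illustrative call, mirroring the parameter values used further below
+        (not executed here)::
+
+            out, edges = redistribute_radius_lp(
+                df=lmis, time_matrix=T_car, radius_minutes=60,
+                tau_keep=1.5, tau_tar=1.0, K_in=2, K_out=10,
+                Qmin_proportion=0.25, eligible_levels=("1a", "1b"),
+            )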
+ """ + out = df.copy() + + # Opening balance + out["OB"] = out["opening_bal"] + # Preserve only necessary columns + selected_cols = list(id_cols) + [level_col, facility_col, 'OB', amc_col, 'available_prop'] + out = out[selected_cols] + + # clean AMC & level + out[amc_col] = pd.to_numeric(out[amc_col], errors="coerce").fillna(0.0) + out[level_col] = out[level_col].astype(str) + + # Container for updated OB and edge log + out["OB_prime"] = out["OB"] + edge_rows = [] if return_edge_log else None + + # Group at (district, month, item_code) + group_cols = list(id_cols) + + skipped_nodes = [] + for (d, m, i), g in out.groupby(group_cols, sort=False): + g = g.copy() + + # --- Pick the travel-time matrix slice --- + if isinstance(time_matrix, dict): + T_d = time_matrix.get(d) + if T_d is None or T_d.empty: + continue + else: + T_d = time_matrix # single matrix for all, if you pass one + + facs_slice = g[facility_col].dropna().unique().tolist() + facs = [f for f in facs_slice if f in T_d.index and f in T_d.columns] + if len(facs) < 2: + continue + + T_sub = T_d.loc[facs, facs].replace(np.nan, np.inf) + + # --- Pull per-fac data for this item --- + AMC = (g.set_index(facility_col)[amc_col].astype(float) + .replace([np.inf, -np.inf], np.nan).fillna(0.0)) + OB0 = (g.set_index(facility_col)["OB"].astype(float) + .replace([np.inf, -np.inf], np.nan).fillna(0.0)) + LVL = (g.set_index(facility_col)[level_col].astype(str)) + + + # Align to facs and guard AMC + AMC = AMC.reindex(facs).fillna(0.0) + AMC_guard = AMC.copy() + AMC_guard[AMC_guard <= 0.0] = amc_eps + OB0 = OB0.reindex(facs).fillna(0.0) + LVL = LVL.reindex(facs) + + # --- Surplus / deficit --- + surplus = np.maximum(0.0, OB0.values - tau_keep * AMC_guard.values) # donors + deficit = np.maximum(0.0, tau_tar * AMC_guard.values - OB0.values) # receivers + + # Leave AMC == 0 untouched + #recv_pos_mask = AMC.values > amc_eps # forbid AMC≈0 from receiving + #deficit = np.where(recv_pos_mask, deficit0, 0.0) + + # Only eligible levels can receive + elig_recv = LVL.isin(eligible_levels).values + deficit = np.where(elig_recv, deficit, 0.0) + + donors = [f for f, s in zip(facs, surplus) if s > eps] + recvs = [f for f, h in zip(facs, deficit) if h > eps] + if not donors or not recvs: + continue + + s_map = dict(zip(facs, surplus)) + h_map = dict(zip(facs, deficit)) + qmin_map = dict(zip(facs, Qmin_proportion * AMC_guard.values)) + + # --- Feasible edges (within radius), HARD PRUNE if capacity < qmin --- + M_edge = {} # capacity per edge + Qmin = {} # min lot per edge + for g_fac in donors: + row = T_sub.loc[g_fac].to_numpy() + feas_idx = np.where((row <= radius_minutes) & np.isfinite(row))[0] + for idx in feas_idx: + f_fac = T_sub.columns[idx] + if f_fac == g_fac or f_fac not in recvs: + continue + M = min(s_map[g_fac], h_map[f_fac]) + if not np.isfinite(M) or M <= eps: + continue + qmin = float(qmin_map[f_fac]) + if not np.isfinite(qmin) or qmin <= eps or M < qmin: + # cannot move at least qmin -> drop the edge + continue + M_edge[(g_fac, f_fac)] = float(M) + Qmin[(g_fac, f_fac)] = float(qmin) + + if not M_edge: + continue + + # --- MILP (per item) --- + prob = LpProblem(f"Radius_{d}_{m}_{i}", LpMaximize) + + # decision vars + t = {e: LpVariable(f"t_{e[0]}->{e[1]}", lowBound=0, upBound=M_edge[e], cat=LpContinuous) + for e in M_edge.keys()} + y = {e: LpVariable(f"y_{e[0]}->{e[1]}", lowBound=0, upBound=1, cat=LpBinary) + for e in M_edge.keys()} + p = {f: LpVariable(f"p_{f}", lowBound=0, upBound=1) for f in + facs} # or only for eligible receivers + + # 
objective: maximize total availability (sum of p[f] over receivers)
+            prob += lpSum(p[f] for f in recvs)  # or for level-eligible facilities
+
+            AMC_guard = AMC.reindex(facs).fillna(0.0)
+            AMC_guard[AMC_guard <= 0.0] = amc_eps
+
+            # donor outflow caps (per item)
+            for g_fac in donors:
+                vars_out = [t[(g_fac, f_fac)] for (gg, f_fac) in t.keys() if gg == g_fac]
+                if vars_out:
+                    s_cap = float(max(0.0, OB0[g_fac] - tau_keep * AMC_guard[g_fac]))
+                    prob += lpSum(vars_out) <= s_cap
+
+            # receiver inflow caps (per item; eligibility already enforced)
+            for f_fac in recvs:
+                vars_in = [t[(g_fac, f_fac)] for (g_fac, ff) in t.keys() if ff == f_fac]
+                if vars_in:
+                    h_cap = float(max(0.0, tau_tar * AMC_guard[f_fac] - OB0[f_fac]))
+                    prob += lpSum(vars_in) <= h_cap
+
+            # link t & y + min-lot
+            for e, M in M_edge.items():
+                prob += t[e] <= M * y[e]
+                # Qmin[e] <= M_edge[e] by construction (edges with M < qmin were
+                # pruned above), so the min() below is only a safeguard
+                qmin = min(Qmin[e], M_edge[e])
+                if qmin > eps:
+                    prob += t[e] >= qmin * y[e]
+
+            # per-item degree caps
+            for f_fac in recvs:
+                inbound_y = [y[(g_fac, f_fac)] for (g_fac, ff) in y.keys() if ff == f_fac]
+                if inbound_y:
+                    prob += lpSum(inbound_y) <= K_in
+            for g_fac in donors:
+                outbound_y = [y[(g_fac, f_fac)] for (gg, f_fac) in y.keys() if gg == g_fac]
+                if outbound_y:
+                    prob += lpSum(outbound_y) <= K_out
+
+            # availability linearization per facility: AMC * p <= OB + inflow - outflow
+            for f_fac in facs:
+                inflow = lpSum(t[(g_fac, f_fac)] for (g_fac, ff) in t.keys() if ff == f_fac)
+                outflow = lpSum(t[(f_fac, h_fac)] for (gg, h_fac) in t.keys() if gg == f_fac)
+                prob += float(AMC_guard[f_fac]) * p[f_fac] <= float(OB0.get(f_fac, 0.0)) + inflow - outflow
+
+            # solve
+            prob.solve(PULP_CBC_CMD(msg=False, cuts=0, presolve=True, threads=1))
+            if LpStatus[prob.status] != "Optimal":
+                skipped_nodes.append((d, m, i))
+                continue
+
+            # --- Apply transfers & log ---
+            delta = {f: 0.0 for f in facs}  # net change in OB by facility (this item)
+            any_transfer = False
+            for (g_fac, f_fac), var in t.items():
+                moved = float(value(var) or 0.0)
+                if moved > eps:
+                    any_transfer = True
+                    delta[g_fac] -= moved
+                    delta[f_fac] += moved
+                    if return_edge_log:
+                        tm = float(T_sub.loc[g_fac, f_fac]) if np.isfinite(T_sub.loc[g_fac, f_fac]) else np.nan
+                        edge_rows.append({
+                            "district": d, "month": m, "item_code": i,
+                            "donor_fac": g_fac, "receiver_fac": f_fac,
+                            "units_moved": moved, "travel_minutes": tm
+                        })
+
+            if not any_transfer:
+                continue
+
+            sel = (out["district"].eq(d) & out["month"].eq(m) & out["item_code"].eq(i))
+            out.loc[sel, "OB_prime"] = out.loc[sel].apply(
+                lambda r: r["OB"] + delta.get(r[facility_col], 0.0),
+                axis=1
+            )
+        print("Skipped", len(skipped_nodes), "district-month-item combinations - no optimal solution")
+
+        # ---------- Availability: update ONLY where positive transfers happened ----------
+        changed_mask = (out["OB_prime"] - out["OB"]) > 1e-6
+        denom = np.maximum(amc_eps, out[amc_col].astype(float).values)
+        p_mech = np.minimum(1.0, np.maximum(0.0, out["OB_prime"].values / denom))
+
+        # start from baseline
+        new_p = out["available_prop"].astype(float).values if floor_to_baseline else out["available_prop"].astype(float).values.copy()
+        # update only changed rows; optionally floor to baseline
+        if floor_to_baseline:
+            new_p[changed_mask] = np.maximum(p_mech[changed_mask], out["available_prop"].astype(float).values[changed_mask])
+        else:
+            new_p[changed_mask] = p_mech[changed_mask]
+
+        # force non-eligible levels to keep baseline (mirrors pooling)
+        non_elig = ~out[level_col].isin(eligible_levels)
+        new_p[non_elig] = out.loc[non_elig, "available_prop"].astype(float).values
out.loc[non_elig, "available_prop"].astype(float).values + + out["available_prop_redis"] = new_p + + edge_df = pd.DataFrame(edge_rows) if return_edge_log else None + return out, edge_df + + def redistribute_pooling_lp( + df: pd.DataFrame, + tau_min: float = 0.25, # lower floor in "months of demand" (≈ 7–8 days) - minimum transfer required + tau_max: float = 3.0, # upper ceiling (storage/policy max) + tau_donor_keep: float = 3.0, # minimum the donor keeps before donating + id_cols=("district","month","item_code"), + facility_col="fac_name", + level_col="Facility_Level", + amc_col="amc", + close_cols=("closing_bal","received","dispensed"), + amc_eps: float = 1e-6, # threshold to treat AMC as "zero" + return_move_log: bool = True, # return a detailed df showing net movement of consumables after redistribution + pooling_level: str = "district", # "district" or "cluster" + cluster_map: pd.Series | None = None, # required if pooling_level=="cluster"; this specifes which cluster each facility belongs to + floor_to_baseline: bool = True # if True, report availability floored to baseline (no decrease in outputs) + ) -> pd.DataFrame: + """ + Scenario 3: district-level pooling/push . + Maximizes total availability with: + - NaN/inf guards on AMC/OB + - duplicate facility IDs collapsed within group + - floors scaled if total stock < sum floors + - optional 'excess' sink if total stock > sum ceilings + - availability computed safely; AMC≈0 rows keep baseline (optional) + + Pooling redistribution that can operate at the district level (default) + or within fixed-size clusters inside districts. + + If pooling_level == "cluster", you must pass cluster_map: Series mapping facility_id -> cluster_id + (cluster ids should already be district-scoped, e.g., "Dedza#C01"). 
+
+        Returns:
+          - out: original df plus columns:
+                OB, OB_prime, available_prop_redis, received_from_pool
+            where received_from_pool = OB_prime - OB (positive = received, negative = donated)
+          - (optional) move_log: per (district, month, item, facility) net movement summary
+        """
+        if pooling_level not in ("district", "cluster"):
+            raise ValueError("pooling_level must be 'district' or 'cluster'.")
+
+        closing_bal, received, dispensed = close_cols
+        out = df.copy()
+
+        # Opening balance
+        # Ensure OB is consistent with observed availability: if the mechanistic
+        # probability implied by OB/AMC is below the observed availability, raise OB
+        amc_safe = np.maximum(1e-6, out[amc_col].astype(float))
+        out["p_mech"] = np.clip(out["opening_bal"] / amc_safe, 0.0, 1.0)
+
+        mask_inconsistent = out["p_mech"] < out["available_prop"]
+        out.loc[mask_inconsistent, "opening_bal"] = (
+            out.loc[mask_inconsistent, "available_prop"] * out.loc[mask_inconsistent, amc_col]
+        )
+        out["OB"] = out["opening_bal"]
+
+        # Default (will overwrite per group)
+        out["OB_prime"] = out["OB"]
+
+        # Attach cluster if needed
+        if pooling_level == "cluster":
+            if cluster_map is None:
+                raise ValueError("cluster_map is required when pooling_level='cluster'.")
+            # cluster_map: index = facility_id (facility_col), value = cluster_id (already district-scoped)
+            out = out.merge(
+                cluster_map.rename("cluster_id"),
+                how="left",
+                left_on=facility_col,
+                right_index=True,
+            )
+            if out["cluster_id"].isna().any():
+                # facilities missing a cluster: assign singleton clusters to keep them
+                out["cluster_id"] = out["cluster_id"].fillna(
+                    out["district"].astype(str) + "#CXX_" + out[facility_col].astype(str))
+
+        group_cols = list(id_cols)
+        node_label = "district"
+        if pooling_level == "cluster":
+            group_cols = ["cluster_id", "month", "item_code"]
+            node_label = "cluster_id"
+        move_rows = []  # optional movement log
+        # TODO could remove the movement log
+
+        skipped_nodes = []  # collect nodes that did NOT solve optimally
+        for keys, g in out.groupby(group_cols, sort=False):
+            g = g.copy()
+            # Resolve node ID for logging and selection masks
+            if pooling_level == "district":
+                node_val, m, i = g["district"].iloc[0], keys[1], keys[2]
+            else:
+                node_val, m, i = keys
+
+            # Build per-facility Series (unique index) for AMC, OB, Level, Baseline p
+            AMC = (g.set_index(facility_col)[amc_col]
+                   .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0))
+            OB0 = (g.set_index(facility_col)["OB"]
+                   .astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0))
+            LVL = (g.set_index(facility_col)[level_col].astype(str)
+                   .replace({np.nan: ""}))
+
+            # collapse duplicates if any
+            if AMC.index.duplicated().any():
+                AMC = AMC[~AMC.index.duplicated(keep="first")]
+            if LVL.index.duplicated().any():
+                LVL = LVL[~LVL.index.duplicated(keep="first")]
+            if OB0.index.duplicated().any():
+                OB0 = OB0.groupby(level=0).sum()
+
+            total_stock = float(OB0.sum())
+            if total_stock <= 1e-9:
+                continue
+
+            # Participants (positive demand)
+            mask_pos = AMC >= amc_eps
+            facs_pos = AMC.index[mask_pos].tolist()
+            if not facs_pos:
+                # nothing to reallocate to; these facilities are donors only (handled by OB' defaults)
+                continue
+
+            AMC_pos = AMC.loc[facs_pos]
+            OB0_pos = OB0.loc[facs_pos]
+            LVL_pos = LVL.reindex(facs_pos)
+
+            # policy floors/ceilings
+            tau_min_floor = (tau_min * AMC_pos).astype(float)
+            donor_protect = np.minimum(OB0_pos, tau_donor_keep * AMC_pos)  # retain min(OB, tau_donor_keep*AMC)
+            LB0 = np.maximum(tau_min_floor, donor_protect)
+
+            UB = (tau_max * AMC_pos).astype(float)
+            UB.loc[~LVL_pos.isin(["1a", "1b"])] = np.minimum(
+                OB0_pos.loc[~LVL_pos.isin(["1a", "1b"])],
+                UB.loc[~LVL_pos.isin(["1a", "1b"])]
+            )
+
+            # Feasibility: scale only the tau_min component if sum LB > total_stock
+            sum_LB0 = float(LB0.sum())
+            if total_stock + 1e-9 < sum_LB0:
+                # Scale down the tau_min part (not the donor protection)
+                base_guard = donor_protect
+                extra = np.maximum(0.0, tau_min_floor - np.minimum(tau_min_floor, base_guard))
+                need = float(extra.sum())
+                budget = total_stock - float(base_guard.sum())
+                scale = 0.0 if need <= 1e-12 else max(0.0, min(1.0, budget / max(1e-9, need)))
+                tau_min_scaled = np.minimum(base_guard, tau_min_floor) + extra * scale
+                LB = np.maximum(base_guard, tau_min_scaled)
+            else:
+                LB = LB0
+
+            # ---- Excess sink if ceilings bind
+            sum_UB = float(UB.sum())
+            allow_excess_sink = total_stock > sum_UB + 1e-9
+
+            # Per-facility feasibility guard
+            bad = LB > UB + 1e-12
+            if bad.any():
+                # clip LB to UB; if that still leaves negative room, the facility is degenerate
+                LB = np.minimum(LB, UB - 1e-9)
+
+            # ---------- LP ----------
+            prob = LpProblem(f"Pooling_{node_val}_{m}_{i}", LpMaximize)
+            x = {f: LpVariable(f"x_{f}", lowBound=0) for f in facs_pos}
+            p = {f: LpVariable(f"p_{f}", lowBound=0, upBound=1) for f in facs_pos}
+            excess = LpVariable("excess", lowBound=0) if allow_excess_sink else None
+            # note that even though facilities with AMC == 0 are not considered for optimisation, their positive OB is
+            # included in the total stock
+
+            # Objective: maximize total availability
+            prob += lpSum(p.values())
+
+            # Conservation
+            if excess is None:
+                prob += lpSum(x.values()) == total_stock
+            else:
+                prob += lpSum(x.values()) + excess == total_stock
+
+            # Bounds + linearization
+            for f in facs_pos:
+                prob += x[f] >= float(LB.loc[f])   # donor protection + tau_min (scaled) + (optional) no-harm
+                prob += x[f] <= float(UB.loc[f])   # eligibility-aware ceiling
+                prob += float(max(AMC_pos.loc[f], amc_eps)) * p[f] <= x[f]  # TODO CHECK max(AMC_pos.loc[f], amc_eps) or just AMC_pos
+
+            # Solve
+            prob.solve(PULP_CBC_CMD(msg=False, cuts=0, presolve=True, threads=1))
+            if LpStatus[prob.status] != "Optimal":
+                skipped_nodes.append((node_val, m, i))
+                #print("NO Optimal solution found", node_val, m, i)
+                continue
+            #else:
+                #print("Optimal solution found", node_val, m, i)
+
+            # Apply solution to OB'
+            x_sol = {f: float(value(var) or 0.0) for f, var in x.items()}
+
+            # Selection mask for writing back
+            if pooling_level == "district":
+                sel = (out["district"].eq(node_val) & out["month"].eq(m) & out["item_code"].eq(i))
+            else:
+                sel = (out["cluster_id"].eq(node_val) & out["month"].eq(m) & out["item_code"].eq(i))
+
+            # Facilities with AMC >= amc_eps get x_f
+            mask_rows_pos = sel & out[facility_col].isin(facs_pos)
+            out.loc[mask_rows_pos, "OB_prime"] = out.loc[mask_rows_pos, facility_col].map(x_sol).values
+
+            # Facilities with AMC < amc_eps get OB' = 0
+            # (this matches the "donate to pool" assumption)
+            mask_rows_zero = sel & ~out[facility_col].isin(facs_pos)
+            out.loc[mask_rows_zero, "OB_prime"] = 0.0
+
+            if return_move_log:
+                for f in AMC.index:  # include AMC≈0 facilities (x_f=0)
+                    x_f = x_sol.get(f, 0.0) if f in facs_pos else 0.0
+                    net = x_f - float(OB0.get(f, 0.0))
+                    move_rows.append({
+                        node_label: node_val,
+                        "month": m,
+                        "item_code": i,
+                        "facility": f,
+                        "received_from_pool": net,
+                        "x_allocated": x_f,
+                        "OB0_agg": float(OB0.get(f, 0.0)),
+                        "eligible_receiver": bool(LVL.get(f, "") in {"1a", "1b"}),
+                    })
+
+        # --- Availability after redistribution: update ONLY where OB' changed ---
+        amc_safe_all = out[amc_col].astype(float).replace([np.inf, -np.inf], np.nan).fillna(0.0)
+        denom = 
np.maximum(amc_eps, amc_safe_all.values) + + # Start from baseline everywhere + out["available_prop_redis"] = out["available_prop"].astype(float).values + + # Change availability only for those rows where OB has increased + changed = (out["OB_prime"] - out["OB"]) > 1e-6 + p_new = np.minimum(1.0, np.maximum(0.0, out.loc[changed, "OB_prime"].values / denom[changed])) + if floor_to_baseline: + p_new = np.maximum(p_new, out.loc[changed, "available_prop"].astype(float).values) + + out.loc[changed, "available_prop_redis"] = p_new + out["received_from_pool"] = out["OB_prime"] - out["OB"] + + move_log = pd.DataFrame(move_rows) if return_move_log else None + + # Force non-eligible levels back to baseline (mirror analysis scope) + non_elig = ~out[level_col].isin(["1a", "1b"]) + out.loc[non_elig, "available_prop_redis"] = out.loc[non_elig, "available_prop"].values # this should ideally happen automatically + # however, there are facilities at levels 2-4 whether some stock out was experienced even though OB > AMC + # We want to retain the original probability in these cases because our overall analysis is restricted to levels 1a and 1b + + # Per-row movement + out["received_from_pool"] = out["OB_prime"] - out["OB"] + + # Check if the rules are correctly applied + # This section until if return_move_log:, is not required for the solution + #--------------------------------------------------------------------------------- + # --- Build masks for skipping --- + # 1. Nodes that failed to solve optimally + # Exclude any node/month/item combinations that didn't solve optimally or which were skipped due to AMC == 0 + if skipped_nodes: + skipped_df = pd.DataFrame(skipped_nodes, columns=[node_label, "month", "item_code"]) + + # Merge to flag rows belonging to skipped groups + out = out.merge( + skipped_df.assign(skip_flag=True), + on=[node_label, "month", "item_code"], + how="left", + ) + mask_skip_solution = out["skip_flag"].fillna(False) + else: + mask_skip_solution = pd.Series(False, index=out.index) + + #out[mask_skip_solution].to_csv(outputfilepath / 'skipped_nodes_no_optiimal_soln.csv', index = False) + + # 2. 
Facilities with AMC effectively zero + mask_skip_amc = out["amc"].astype(float) <= 1e-9 + + # Combined skip mask + mask_skip = mask_skip_solution | mask_skip_amc + mask_solved = ~mask_skip + print(f"Skipping {mask_skip.sum()} rows due to non-optimal LPs or AMC≈0") + + # No facility should end below min(OB, tau_donor_keep*AMC) (# Lower bound check) + tol = 1e-6 #tolerance + viol_lb = ( + (out.loc[mask_solved, "OB_prime"] < + (np.minimum(out.loc[mask_solved, "OB"], tau_donor_keep * out.loc[mask_solved, "amc"]) - tol)) + ) + + # No facility ends up above upper bounds (# Upper bound check) + elig = out.loc[mask_solved, "Facility_Level"].isin(["1a", "1b"]).values + ub = np.where( + elig, + tau_max * out.loc[mask_solved, "amc"], + np.minimum(out.loc[mask_solved, "OB"], tau_max * out.loc[mask_solved, "amc"]) + ) + viol_ub = out.loc[mask_solved, "OB_prime"].values > (ub + tol) + + temp = out[mask_solved] + if viol_lb.any(): + print("For the following rows (facility, item and month combinations), unclear why OB_prime < tau_donor_keep * AMC " + "which violates a constraint in the LPP") + print(temp[viol_lb][['Facility_Level', 'amc', 'OB', 'OB_prime']]) + temp[viol_lb][['Facility_Level', 'fac_name', 'amc', 'OB', 'OB_prime']].to_csv('violates_lb.csv') + if viol_ub.any(): + print("For the following rows (facility, item and month combinations), unclear why OB_prime > tau_max * AMC " + "which violates a constraint in the LPP") + print(temp[viol_ub][['Facility_Level', 'amc', 'OB', 'OB_prime']]) + temp[viol_ub][['Facility_Level', 'fac_name', 'amc', 'OB', 'OB_prime']].to_csv('violates_ub.csv') + + if return_move_log: + move_log = pd.DataFrame(move_rows) + return out, move_log + + return out + # pooled_df, pool_moves = redistribute_pooling_lp(lmis, tau_min=0.25, tau_max=3.0, return_move_log=True) + + # Functions to generate summary plots of the outcomes of redistribution + def prep_violin_df(df, scenario_name, keep_facs_with_no_change = True): + out = df.copy() + out["delta_p"] = out["available_prop_redis"] - out["available_prop"] + + if (keep_facs_with_no_change == True): + mask = ( + (out["Facility_Level"].isin(["1a", "1b"])) & + (out["amc"] > 0) + ) + else: + mask = ( + (out["OB_prime"] > out["OB"]) & + (out["Facility_Level"].isin(["1a", "1b"])) & + (out["amc"] > 0) + ) + + return ( + out.loc[mask, ["district", "delta_p"]] + .assign(scenario=scenario_name) + ) + + def _add_custom_legend(fig=None, legend_location="upper right"): + iqr_patch = mpatches.Rectangle( + (0, 0), 1, 1, + facecolor="grey", + edgecolor="black", + linewidth=1, + label="Interquartile range (IQR)" + ) + median_patch = mlines.Line2D( + [], [], color="#b2182b", marker="o", linestyle="None", + markersize=5, label="Median" + ) + mean_patch = mlines.Line2D( + [], [], color="#b2182b", marker="D", linestyle="None", + markersize=6, label="Mean" + ) + + if fig is None: + plt.legend(handles=[iqr_patch, median_patch, mean_patch], + loc=legend_location, fontsize=8, frameon=True) + else: + fig.legend( + handles=[iqr_patch, median_patch, mean_patch], + loc=legend_location, + ncol=3, + fontsize=8, + frameon=True + ) + + def do_violin_plot_change_in_p( + violin_df: pd.DataFrame, + figname: str, + by_district: bool = False, + district_col: str = "district", + ncol: int = 4, + legend_location = "upper right", + ): + """ + Create violin + box + mean/median overlay plots of change in availability. + + If by_district=False: + Single national-level plot. + + If by_district=True: + One combined faceted figure with one panel per district. 
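+
+    Example
+    -------
+    Illustrative, mirroring the calls made further below (file names are
+    placeholders)::
+
+        do_violin_plot_change_in_p(violin_df, figname="violin_national.png")
+        do_violin_plot_change_in_p(violin_df, figname="violin_by_district.png",
+                                   by_district=True, ncol=4)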
+ """ + + # ---------- National-level plot ---------- + if not by_district: + mean_df = violin_df.groupby("scenario", as_index=False)["delta_p"].mean() + median_df = violin_df.groupby("scenario", as_index=False)["delta_p"].median() + + plt.figure(figsize=(10, 5)) + + sns.violinplot( + data=violin_df, + x="scenario", + y="delta_p", + cut=0, + scale="width", + inner=None, + linewidth=0.8, + color="#4C72B0", + alpha=0.6 + ) + + sns.boxplot( + data=violin_df, + x="scenario", + y="delta_p", + width=0.03, + showcaps=True, + showfliers=False, + boxprops={"facecolor": "grey", "edgecolor": "black", "linewidth": 1}, + whiskerprops={"linewidth": 1}, + medianprops={"linewidth": 0} + ) + + sns.scatterplot( + data=mean_df, + x="scenario", + y="delta_p", + color="#b2182b", + marker="D", + s=60, + zorder=10, + label="Mean" + ) + + sns.scatterplot( + data=median_df, + x="scenario", + y="delta_p", + color="#b2182b", + marker="o", + s=45, + zorder=11, + label="Median" + ) + + plt.axhline(0, color="black", linewidth=0.8, linestyle="--") + plt.ylabel("Change in probability of availability (Δp)") + plt.xlabel("") + + _add_custom_legend(legend_location=legend_location) + plt.tight_layout() + plt.savefig(outputfilepath / figname, dpi=600) + plt.close() + return + + # ---------- District-faceted plot ---------- + g = sns.catplot( + data=violin_df, + x="scenario", + y="delta_p", + col=district_col, + col_wrap=ncol, + kind="violin", + cut=0, + scale="width", + inner=None, + linewidth=0.6, + color="#4C72B0", + alpha=0.6, + height=3, + aspect=1 + ) + + # Overlay boxplots, means, medians per facet + for ax, (district, df_d) in zip(g.axes.flat, violin_df.groupby(district_col)): + mean_df = df_d.groupby("scenario", as_index=False)["delta_p"].mean() + median_df = df_d.groupby("scenario", as_index=False)["delta_p"].median() + + sns.boxplot( + data=df_d, + x="scenario", + y="delta_p", + width=0.03, + showcaps=True, + showfliers=False, + boxprops={"facecolor": "grey", "edgecolor": "black", "linewidth": 0.8}, + whiskerprops={"linewidth": 0.8}, + medianprops={"linewidth": 0}, + ax=ax + ) + + sns.scatterplot( + data=mean_df, + x="scenario", + y="delta_p", + color="#b2182b", + marker="D", + s=35, + zorder=10, + ax=ax + ) + + sns.scatterplot( + data=median_df, + x="scenario", + y="delta_p", + color="#b2182b", + marker="o", + s=30, + zorder=11, + ax=ax + ) + + ax.axhline(0, color="black", linewidth=0.6, linestyle="--") + ax.set_xlabel("") + ax.set_ylabel("Δp") + ax.tick_params(axis="x", labelrotation=45, labelsize=8) + ax.tick_params(axis="y", labelsize=8) + ax.set_title(district, fontsize=9) + + _add_custom_legend(fig=g.fig, legend_location = legend_location) + g.fig.tight_layout() + g.fig.savefig(outputfilepath / figname, dpi=600) + plt.close() + + + # IMPLEMENT + # 1) Build a time matrix + fac_coords = lmis[['fac_name', 'district', 'lat','long']] + #T_car = build_time_matrices_by_district( + # fac_coords, + # mode="car", + # backend="osrm", + # osrm_base_url="https://router.project-osrm.org", + # max_chunk=50) + + # Store dictionary in pickle format + #with open(outputfilepath / "T_car2.pkl", "wb") as f: + # pickle.dump(T_car, f) + # -> Commented out because it takes long to run. 
The result has been stored in pickle format + + # Load pre-generated dictionary + with open(outputfilepath / "T_car2.pkl", "rb") as f: + T_car = pickle.load(f) + # T_car2 was created after cleaning fac names and getting rid of spaces in the text + + #edges_flat = build_edges_within_radius_flat(T_car, max_minutes= 60) + + # 2) Explore the availability and distances to make decisions about optimisation rules + # Attach clean item names to lmis + consumables_dict = \ + pd.read_csv(resourcefilepath / 'healthsystem' / 'consumables' / 'ResourceFile_Consumables_Items_and_Packages.csv', + low_memory=False, + encoding="ISO-8859-1")[['Items', 'Item_Code']] + consumables_dict = dict(zip(consumables_dict['Item_Code'], consumables_dict['Items'])) + lmis['item_name'] = pd.to_numeric(lmis["item_code"], errors="coerce").map(consumables_dict) + lmis['item_name'] = ( + lmis['item_name'] + .astype(str) + .apply(clean_consumable_name) + ) + + # Plot stock adequacy by district and month to assess what bounds to set when pooling + empty_cell_note = "Note: Grey cells in the heatmap indicate missing data." + fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, + y_var = 'district', value_var = 'item_name', + value_label= f"% of consumables with Opening Balance ≥ 3 × AMC", + amc_threshold = 3, compare = "ge", + filename = "mth_district_stock_adequacy_3amc.png", figsize = (12,10)) + fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, + y_var = 'district', value_var = 'item_name', + value_label= f"% of consumables with Opening Balance ≥ 1.5 × AMC", + amc_threshold = 1.5, compare = "ge", + filename = "mth_district_stock_adequacy_1.5amc.png", figsize = (12,10)) + fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, + y_var = 'district', value_var = 'item_name', + value_label= f"% of consumables with Opening Balance <= 1 × AMC", + amc_threshold = 1, compare = "le", + filename = "mth_district_stock_inadequacy_1amc.png", figsize = (12,10)) + fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, + y_var = 'item_name', value_var = 'fac_name', + value_label= f"% of facilities with Opening Balance ≥ 3 × AMC", + amc_threshold = 3, compare = "ge", + footnote = empty_cell_note, + filename = "mth_item_stock_adequacy_3amc.png") + fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, + y_var = 'item_name', value_var = 'fac_name', + value_label= f"% of facilities with Opening Balance ≥ 1.5 × AMC", + amc_threshold = 1.5, compare = "ge", + footnote = empty_cell_note, + filename = "mth_item_stock_adequacy_1.5amc.png") + fig, ax, hm_df = generate_stock_adequacy_heatmap(df = lmis, figures_path = outputfilepath, + y_var = 'item_name', value_var = 'fac_name', + value_label= f"% of facilities with Opening Balance <= 1 × AMC", + amc_threshold = 1, compare = "le", + footnote = empty_cell_note, + filename = "mth_item_stock_inadequacy_1amc.png") + + + # Browse the number of eligible neighbours depending on allowable travel time + results = [] + for mins in [30, 60, 90, 120]: + edges_flat = build_edges_within_radius_flat(T_car, max_minutes=mins) + neighbors_count = pd.Series({fac: len(neigh) for fac, neigh in edges_flat.items()}) + mean = neighbors_count.mean() + sem = neighbors_count.sem() # standard error of mean + ci95 = 1.96 * sem + results.append({"radius": mins, "mean": mean, "ci95": ci95}) + + results_df = pd.DataFrame(results) + + # Plot + 
plt.figure(figsize=(6,4)) + plt.bar(results_df["radius"], results_df["mean"], yerr=results_df["ci95"], capsize=5, color="skyblue") + plt.xlabel("Travel time radius (minutes)") + plt.ylabel("Average number of facilities within radius") + plt.title("Average connectivity of facilities with 95% CI") + plt.xticks(results_df["radius"]) + plt.savefig(outputfilepath / "neighbour_count_by_max_travel_time") + + # A manual check shows that for distances greater than 60 minutes ORS underestimates the travel time a little + # TODO consider using google maps API + + #Drop NAs + # TODO find a more generalisable solution for this issue (within the optimisation functions) + #lmis = lmis[(lmis.amc != 0) & (lmis.amc.notna())] + + # ---------------------------------------------------------------------------------------------------------------------- + # 3) Implement pooled redistribution + # ---------------------------------------------------------------------------------------------------------------------- + # Build clusters from per-district travel-time matrices + # T_car_by_dist: {"District A": DF(index=fac_ids, cols=fac_ids), ...} + cluster_size = 3 + cluster_series = build_capacity_clusters_all(T_car, cluster_size=cluster_size) + # cluster_series is a pd.Series: index=facility_id, value like "District A#C00", "District A#C01", ... + + # a) Run optimisation at district level + ''' + # Commented out for quicker runs + print("Now running Pooled Redistribution at District level") + start = time.time() + pooled_district_df, cluster_district_moves = redistribute_pooling_lp( + df=lmis, # the LMIS dataframe + tau_min=0.25, tau_max=3.0, + tau_donor_keep = 1.5, + pooling_level="district", + cluster_map=None, + return_move_log=True, + floor_to_baseline=True + ) + print(pooled_district_df.drop(columns = ['LMIS Facility List', 'lat', 'long', 'fac_owner']).groupby('Facility_Level')[['available_prop_redis', 'available_prop']].mean()) + pooled_district_df[['district', 'item_code', 'fac_name', 'month', 'amc', 'available_prop', 'Facility_Level', + 'OB', 'OB_prime', 'available_prop_redis', 'received_from_pool']].to_csv( + outputfilepath/ 'clustering_district_df.csv', index=False) + end = time.time() + print(f"District redistribution completed in {end - start:.3f} seconds") # 1.1 hour + ''' + pooled_district_df = pd.read_csv(outputfilepath / 'clustering_district_df.csv') + tlo_pooled_district = ( + pooled_district_df + .groupby(["item_code", "district", "Facility_Level", "month"], as_index=False) + .agg(available_prop_scenario16=("available_prop_redis", "mean")) + .sort_values(["item_code","district","Facility_Level","month"]) + ) + + + # b) Run optimisation at cluster (size = 3) level + ''' + # Commented out for quicker runs + print("Now running pooled redistribution at Cluster (Size = 3) level") + start = time.time() + pooled_cluster_df, cluster_moves = redistribute_pooling_lp( + df=lmis, # the LMIS dataframe + tau_min=0, tau_max=3.0, + tau_donor_keep = 1.5, + pooling_level="cluster", + cluster_map=cluster_series, + return_move_log=True, + floor_to_baseline=True + ) + print(pooled_cluster_df.drop(columns = ['LMIS Facility List', 'lat', 'long', 'fac_owner']).groupby('Facility_Level')[['available_prop_redis', 'available_prop']].mean()) + pooled_cluster_df[['district', 'item_code', 'fac_name', 'month', 'amc', 'available_prop', 'Facility_Level', + 'OB', 'OB_prime', 'available_prop_redis', 'received_from_pool']].to_csv( + outputfilepath/ 'clustering_n3_df.csv', index=False) + + end = time.time() + print(f"Cluster 
redistribution completed in {end - start:.3f} seconds")  # 18 hours
+    '''
+    pooled_cluster_df = pd.read_csv(outputfilepath / 'clustering_n3_df.csv')
+
+    tlo_pooled_cluster = (
+        pooled_cluster_df
+        .groupby(["item_code", "district", "Facility_Level", "month"], as_index=False)
+        .agg(available_prop_scenario17=("available_prop_redis", "mean"))
+        .sort_values(["item_code","district","Facility_Level","month"])
+    )
+
+
+    # c) Implement pairwise redistribution
+    '''
+    # Commented out for quicker runs
+    print("Now running pairwise redistribution with maximum radius 60 minutes")
+    start = time.time()
+    # c.i) 1-hour radius
+    large_radius_df, large_radius_moves = redistribute_radius_lp(
+        df=lmis,
+        time_matrix=T_car,
+        radius_minutes=60,             # facilities within 1 hour by car
+        tau_keep=1.5,                  # donor must keep 1.5 × AMC
+        tau_tar=1.0,                   # receivers target 1 × AMC
+        K_in=2,                        # at most 2 inbound transfers per item
+        K_out=10,                      # at most 10 outbound transfers # TODO could increase this
+        Qmin_proportion=0.25,          # min lot = one week of demand
+        eligible_levels=("1a", "1b"),  # only 1a/1b can receive
+    )
+    print(large_radius_df.groupby('Facility_Level')[['available_prop_redis', 'available_prop']].mean())
+    large_radius_df.to_csv(outputfilepath/ 'large_radius_df.csv', index=False)
+    end = time.time()
+    print(f"Large radius exchange redistribution completed in {end - start:.3f} seconds")
+    '''
+    large_radius_df = pd.read_csv(outputfilepath / 'large_radius_df.csv')
+    tlo_large_radius = (
+        large_radius_df
+        .groupby(["item_code", "district", "Facility_Level", "month"], as_index=False)
+        .agg(available_prop_scenario18=("available_prop_redis", "mean"))
+        .sort_values(["item_code","district","Facility_Level","month"])
+    )
+
+
+    # c.ii) 30-minute radius
+    '''
+    print("Now running pairwise redistribution with maximum radius 30 minutes")
+    start = time.time()
+    small_radius_df, small_radius_moves = redistribute_radius_lp(
+        df=lmis,
+        time_matrix=T_car,
+        radius_minutes=30,             # facilities within 30 minutes by car
+        tau_keep=1.5,                  # donor must keep 1.5 × AMC
+        tau_tar=1.0,                   # receivers target 1 × AMC
+        K_in=2,                        # at most 2 inbound transfers per item
+        K_out=10,                      # at most 10 outbound transfers
+        Qmin_proportion=0.25,          # min lot = one week of demand
+        eligible_levels=("1a", "1b"),  # only 1a/1b can receive
+    )
+    print(small_radius_df.groupby('Facility_Level')[['available_prop_redis', 'available_prop']].mean())
+    small_radius_df.to_csv(outputfilepath/ 'small_radius_df.csv', index=False)
+    end = time.time()
+    print(f"Small radius exchange redistribution completed in {end - start:.3f} seconds")
+    '''
+    small_radius_df = pd.read_csv(outputfilepath / 'small_radius_df.csv')
+    tlo_small_radius = (
+        small_radius_df
+        .groupby(["item_code", "district", "Facility_Level", "month"], as_index=False)
+        .agg(available_prop_scenario19=("available_prop_redis", "mean"))
+        .sort_values(["item_code","district","Facility_Level","month"])
+    )
+
+    # Summarise the outcome of the redistribution in violin plots
+    violin_df_all_facs = pd.concat([
+        prep_violin_df(pooled_district_df, "District pooling", keep_facs_with_no_change = True),
+        prep_violin_df(pooled_cluster_df, "Cluster pooling", keep_facs_with_no_change = True),
+        prep_violin_df(large_radius_df, "Pairwise (large radius)", keep_facs_with_no_change = True),
+        prep_violin_df(small_radius_df, "Pairwise (small radius)", keep_facs_with_no_change = True)
+    ], ignore_index=True)
+    violin_df_only_facs_with_change = pd.concat([
+        prep_violin_df(pooled_district_df, "District pooling", keep_facs_with_no_change = False),
+        prep_violin_df(pooled_cluster_df, "Cluster pooling", keep_facs_with_no_change = False),
+        prep_violin_df(large_radius_df, "Pairwise (large radius)", keep_facs_with_no_change = False),
+        prep_violin_df(small_radius_df, "Pairwise (small radius)", keep_facs_with_no_change = False)
+    ], ignore_index=True)
+
+    do_violin_plot_change_in_p(
+        violin_df = violin_df_all_facs,
+        figname="violin_redistribution_national_all_facs.png",
+        legend_location="upper right"
+    )
+    do_violin_plot_change_in_p(
+        violin_df = violin_df_only_facs_with_change,
+        figname="violin_redistribution_national_only_facs_with_change.png",
+        legend_location="lower right"
+    )
+
+    do_violin_plot_change_in_p(
+        violin_df = violin_df_all_facs,
+        figname="violin_by_district_all_facs.png",
+        by_district=True,
+        ncol=4
+    )
+
+    do_violin_plot_change_in_p(
+        violin_df = violin_df_only_facs_with_change,
+        figname="violin_by_district_only_facs_with_change.png",
+        by_district=True,
+        ncol=4
+    )
+
+
+    # ----------------------------------------------------------------------------------------------------------------------
+    # 4) Compile updated probabilities and merge with ResourceFile
+    # ----------------------------------------------------------------------------------------------------------------------
+    # 1) Merge the new dataframes together
+    # ----------------------------------------------------------------------------------------------------------------------
+    tlo_redis = reduce(
+        lambda left, right: pd.merge(
+            left, right,
+            on=["item_code", "district", "Facility_Level", "month"],
+            how="outer"
+        ),
+        [tlo_pooled_district, tlo_pooled_cluster, tlo_large_radius, tlo_small_radius]
+    )
+
+    tlo_redis.to_csv(outputfilepath/ 'tlo_redis.csv', index=False)
+
+    # Edit new dataframe to match mfl formatting
+    list_of_new_scenario_variables = ['available_prop_scenario16', 'available_prop_scenario17',
+                                      'available_prop_scenario18', 'available_prop_scenario19']
+    tlo_redis = tlo_redis[['item_code', 'month', 'district', 'Facility_Level'] + list_of_new_scenario_variables].dropna()
+    tlo_redis["item_code"] = tlo_redis["item_code"].astype(float).astype(int)
+
+    # Load master facility list
+    mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv")
+    mfl["District"] = mfl["District"].astype("string").str.strip().str.replace(r"\s+", "_", regex=True)
+    districts = set(mfl[mfl.District.notna()]["District"].unique())
+    kch = (mfl.Region == 'Central') & (mfl.Facility_Level == '3')
+    qech = (mfl.Region == 'Southern') & (mfl.Facility_Level == '3')
+    mch = (mfl.Region == 'Northern') & (mfl.Facility_Level == '3')
+    zmh = mfl.Facility_Level == '4'
+    mfl.loc[kch, "District"] = "Lilongwe"
+    mfl.loc[qech, "District"] = "Blantyre"
+    mfl.loc[mch, "District"] = "Mzimba"
+    mfl.loc[zmh, "District"] = "Zomba"
+
+    # Do some mapping to make the Districts line up with the definition of Districts in the model
+    rename_and_collapse_to_model_districts = {
+        'Nkhota_Kota': 'Nkhotakota',
+        'Mzimba_South': 'Mzimba',
+        'Mzimba_North': 'Mzimba',
+        'Nkhata_bay': 'Nkhata_Bay',
+    }
+
+    tlo_redis['district_std'] = tlo_redis['district'].replace(rename_and_collapse_to_model_districts)
+    # Take averages (now that 'Mzimba' is mapped to by both 'Mzimba_South' and 'Mzimba_North')
+ tlo_redis = tlo_redis.groupby(by=['district_std', 'Facility_Level', 'month', 'item_code'])[list_of_new_scenario_variables].mean().reset_index() + + # Fill in missing data: + # 1) Cities to get same results as their respective regions + copy_source_to_destination = { + 'Mzimba': 'Mzuzu_City', + 'Lilongwe': 'Lilongwe_City', + 'Zomba': 'Zomba_City', + 'Blantyre': 'Blantyre_City' + } + + for source, destination in copy_source_to_destination.items(): + new_rows = tlo_redis.loc[(tlo_redis.district_std == source) & (tlo_redis.Facility_Level.isin(['1a', '1b', '2']))].copy() + new_rows.district_std = destination + tlo_redis = pd.concat([tlo_redis, new_rows], axis=0, ignore_index=True) + + # 2) Fill in Likoma (for which no data) with the means + means = tlo_redis.loc[tlo_redis.Facility_Level.isin(['1a', '1b', '2'])].groupby(by=['Facility_Level', 'month', 'item_code'])[ + list_of_new_scenario_variables].mean().reset_index() + new_rows = means.copy() + new_rows['district_std'] = 'Likoma' + tlo_redis = pd.concat([tlo_redis, new_rows], axis=0, ignore_index=True) + assert sorted(set(districts)) == sorted(set(pd.unique(tlo_redis.district_std))) + + # 3) copy the results for 'Mwanza/1b' to be equal to 'Mwanza/1a'. + mwanza_1a = tlo_redis.loc[(tlo_redis.district_std == 'Mwanza') & (tlo_redis.Facility_Level == '1a')] + mwanza_1b = tlo_redis.loc[(tlo_redis.district_std == 'Mwanza') & (tlo_redis.Facility_Level == '1a')].copy().assign(Facility_Level='1b') + tlo_redis= pd.concat([tlo_redis, mwanza_1b], axis=0, ignore_index=True) + + # 4) Copy all the results to create a level 0 with an availability equal to half that in the respective 1a + all_1a = tlo_redis.loc[tlo_redis.Facility_Level == '1a'] + all_0 = tlo_redis.loc[tlo_redis.Facility_Level == '1a'].copy().assign(Facility_Level='0') + all_0[list_of_new_scenario_variables] *= 0.5 + tlo_redis = pd.concat([tlo_redis, all_0], axis=0, ignore_index=True) + + # Now, merge-in facility_id + tlo_redis = tlo_redis.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], + left_on=['district_std', 'Facility_Level'], + right_on=['District', 'Facility_Level'], how='left', indicator=True, validate = 'm:1') + tlo_redis = tlo_redis[tlo_redis.Facility_ID.notna()].rename(columns = {'district_std': 'district'}) + assert sorted(set(mfl.loc[mfl.Facility_Level != '5','Facility_ID'].unique())) == sorted(set(pd.unique(tlo_redis.Facility_ID))) + + # Load original availability dataframe + # ---------------------------------------------------------------------------------------------------------------------- + tlo_availability_df = tlo_availability_df + list_of_old_scenario_variables = [f"available_prop_scenario{i}" for i in range(1, scenario_count + 1)] + tlo_availability_df = tlo_availability_df[['Facility_ID', 'month', 'item_code', 'available_prop'] + list_of_old_scenario_variables] + + # Attach district, facility level and item_category to this dataset + program_item_mapping = pd.read_csv(path_for_new_resourcefiles / 'ResourceFile_Consumables_Item_Designations.csv')[['Item_Code', 'item_category']] # Import item_category + program_item_mapping = program_item_mapping.rename(columns ={'Item_Code': 'item_code'})[program_item_mapping.item_category.notna()] + fac_levels = {'0', '1a', '1b', '2', '3', '4'} + tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']], + on = ['Facility_ID'], how='left').rename(columns = {'District': 'district'}) + tlo_availability_df = tlo_availability_df.merge(program_item_mapping, + on = ['item_code'], how='left') 
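+    # After these merges each row of tlo_availability_df is keyed by
+    # (Facility_ID, month, item_code) with district, Facility_Level and
+    # item_category attached, e.g. (values illustrative only):
+    #   Facility_ID  month  item_code  available_prop  district  Facility_Level
+    #   5            1      73         0.62            Dedza     1a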
+    #tlo_availability_df = tlo_availability_df[~tlo_availability_df[['District', 'Facility_Level', 'month', 'item_code']].duplicated()]
+
+    # Because some of the availability data in the original availability dataframe comes from data sources other than OpenLMIS, there are
+    # more unique item codes in tlo_availability_df than in tlo_redis. For these items, assume that the proportion of 'uplift'
+    # is the same as the average 'uplift' experienced across the consumables in tlo_redis disaggregated by district,
+    # facility level, and month.
+
+    # First fix any unexpected changes in availability probability
+    # Merge the old and new dataframe
+    redis_levels = ['1a','1b']
+    tlo_redis = tlo_redis[tlo_redis.Facility_Level.isin(redis_levels)]
+
+    tlo_redis = tlo_redis.merge(
+        tlo_availability_df[["district", "Facility_Level", "item_code", "month", "available_prop"]],
+        on=["district", "Facility_Level", "item_code", "month"],
+        how="left",
+        validate="one_to_one"
+    )
+
+    for redis_scenario_col in list_of_new_scenario_variables:
+        pre = (
+            tlo_redis[redis_scenario_col] < tlo_redis["available_prop"]
+        ).mean()
+        print(f"Pre-fix {redis_scenario_col}: {pre:.3%}")
+
+        # Enforce no-harm
+        tlo_redis[redis_scenario_col] = np.maximum(
+            tlo_redis[redis_scenario_col],
+            tlo_redis["available_prop"]
+        )
+
+        post = (
+            tlo_redis[redis_scenario_col] < tlo_redis["available_prop"]
+        ).mean()
+        print(f"Post-fix {redis_scenario_col}: {post:.3%}")
+
+    # Next create an uplift dataframe
+    modelled_items = tlo_redis["item_code"].unique()
+    # Compute uplift once per scenario, store in a dict
+    uplift_maps = {}
+
+    for scenario_col in list_of_new_scenario_variables:
+        uplift_maps[scenario_col] = (
+            tlo_redis.assign(
+                uplift=lambda x: np.where(
+                    x["available_prop"] > 0,
+                    x[scenario_col] / x["available_prop"],
+                    np.nan
+                )
+            )
+            .groupby(["district", "Facility_Level", "month"], as_index=False)["uplift"]
+            .mean()
+            .rename(columns={"uplift": f"uplift_{scenario_col}"})
+        )
+
+    # Get baseline rows for missing items
+    missing_mask = ~tlo_availability_df["item_code"].isin(modelled_items)
+
+    df_missing = (
+        tlo_availability_df[
+            (tlo_availability_df["Facility_Level"].isin(redis_levels)) &
+            missing_mask
+        ]
+        .copy()
+    )
+
+    # Merge all uplifts horizontally
+    for scenario_col, uplift_df in uplift_maps.items():
+        df_missing = df_missing.merge(
+            uplift_df,
+            on=["district", "Facility_Level", "month"],
+            how="left"
+        )
+        df_missing[scenario_col] = np.minimum(
+            1.0,
+            df_missing["available_prop"] * df_missing[f"uplift_{scenario_col}"]
+        )
+        df_missing.drop(columns=[f"uplift_{scenario_col}"], inplace=True)
+
+    # Concatenate
+    tlo_redis = pd.concat(
+        [tlo_redis, df_missing],
+        ignore_index=True
+    )
+
+    dupes = tlo_redis.duplicated(
+        ["district", "Facility_Level", "item_code", "month"]
+    )
+    assert (dupes.sum() == 0)
+
+    for scenario_col in list_of_new_scenario_variables:
+        assert ((tlo_redis[scenario_col] <
+                 tlo_redis["available_prop"]).sum()) == 0
+
+    tlo_redis = tlo_redis[['Facility_ID', 'month', 'item_code'] + list_of_new_scenario_variables]
+
+    # Interpolate missing values in tlo_redis for the redistribution levels (1a and 1b)
+    # ----------------------------------------------------------------------------------------------------------------------
+    # Generate the dataframe that has the desired size and shape
+    fac_ids = set(mfl.loc[mfl.Facility_Level.isin(redis_levels)].Facility_ID)
+    item_codes = set(tlo_availability_df.item_code.unique())
+    months = range(1, 13)
+
+    # Create a MultiIndex from the product of fac_ids, months, and item_codes
+    index = pd.MultiIndex.from_product([fac_ids, months, item_codes], names=['Facility_ID', 'month', 'item_code'])
+
+    # Initialize a DataFrame with the MultiIndex and columns, filled with NaN
+    full_set = pd.DataFrame(index=index, columns=list_of_new_scenario_variables)
+    full_set = full_set.astype(float)  # Ensure all columns are float type and filled with NaN
+
+    # Insert the data, where it is available.
+    full_set = full_set.combine_first(tlo_redis.set_index(['Facility_ID', 'month', 'item_code'])[list_of_new_scenario_variables])
+
+    # Fill in the blanks with rules for interpolation.
+    facilities_by_level = defaultdict(set)
+    for ix, row in mfl.iterrows():
+        facilities_by_level[row['Facility_Level']].add(row['Facility_ID'])
+
+    items_by_category = defaultdict(set)
+    for ix, row in program_item_mapping.iterrows():
+        items_by_category[row['item_category']].add(row['item_code'])
+
+    def get_other_facilities_of_same_level(_fac_id):
+        """Return a set of facility_id for other facilities that are of the same level as that provided."""
+        for v in facilities_by_level.values():
+            if _fac_id in v:
+                return v - {_fac_id}
+
+    def get_other_items_of_same_category(_item_code):
+        """Return a set of item_codes for other items that are in the same category/program as that provided."""
+        for v in items_by_category.values():
+            if _item_code in v:
+                return v - {_item_code}
+
+    def interpolate_missing_with_mean(_ser):
+        """Return a series in which any values that are null are replaced with the mean of the non-missing."""
+        if pd.isnull(_ser).all():
+            raise ValueError
+        return _ser.fillna(_ser.mean())
+
+    # Create a new dataset that includes the interpolations (the operation is not done "in place", because the logic is based
+    # on what results are missing before the interpolations in other facilities).
+    full_set_interpolated = full_set * np.nan
+    full_set_interpolated[list_of_new_scenario_variables] = full_set[list_of_new_scenario_variables]
+
+    for fac in fac_ids:
+        for item in item_codes:
+            for col in list_of_new_scenario_variables:
+                print(f"Now doing: fac={fac}, item={item}, column={col}")
+
+                # Get records of the availability of this item in this facility.
+                _monthly_records = full_set.loc[(fac, slice(None), item), col].copy()
+
+                if pd.notnull(_monthly_records).any():
+                    # If there is at least one record of this item at this facility, then interpolate the missing months from
+                    # the months for which there are data on this item in this facility. (If none are missing, this has no effect.)
+    # Create a new dataset that includes the interpolations. (The operation is not done "in place", because the
+    # logic is based on what results are missing before the interpolations in other facilities.)
+    full_set_interpolated = full_set * np.nan
+    full_set_interpolated[list_of_new_scenario_variables] = full_set[list_of_new_scenario_variables]
+
+    for fac in fac_ids:
+        for item in item_codes:
+            for col in list_of_new_scenario_variables:
+                print(f"Now doing: fac={fac}, item={item}, column={col}")
+
+                # Get records of the availability of this item in this facility.
+                _monthly_records = full_set.loc[(fac, slice(None), item), col].copy()
+
+                if pd.notnull(_monthly_records).any():
+                    # If there is at least one record of this item at this facility, interpolate the missing
+                    # months from the months for which there are data on this item in this facility. (If none
+                    # are missing, this has no effect.)
+                    _monthly_records = interpolate_missing_with_mean(_monthly_records)
+
+                else:
+                    # If there is no record of this item at this facility, check whether it is recorded at other
+                    # facilities of the same level; failing that, check whether other items of the same category
+                    # are recorded at this facility.
+                    facilities = list(get_other_facilities_of_same_level(fac))
+
+                    other_items = get_other_items_of_same_category(item)
+                    items = list(other_items) if other_items else []
+
+                    recorded_at_other_facilities_of_same_level = pd.notnull(
+                        full_set.loc[(facilities, slice(None), item), col]
+                    ).any()
+
+                    if not items:
+                        valid_items = []
+                        category_recorded_at_other_facilities_of_same_level = False
+                    else:
+                        # Keep only items that exist in the MultiIndex at this facility
+                        valid_items = [
+                            itm for itm in items
+                            if any((fac, m, itm) in full_set.index for m in months)
+                        ]
+
+                        category_recorded_at_other_facilities_of_same_level = pd.notnull(
+                            full_set.loc[(fac, slice(None), valid_items), col]
+                        ).any()
+
+                    if recorded_at_other_facilities_of_same_level:
+                        # If the item is recorded at other facilities of the same level, use its average
+                        # availability at those facilities.
+                        print("Data for facility ", fac, " extrapolated from other facilities within level - ", facilities)
+                        _monthly_records = interpolate_missing_with_mean(
+                            full_set.loc[(facilities, slice(None), item), col].groupby(level=1).mean()
+                        )
+
+                    elif category_recorded_at_other_facilities_of_same_level and valid_items:
+                        # Otherwise, if other items of the same category are recorded at this facility, use
+                        # their average availability.
+                        print("Data for item ", item, " extrapolated from other items within category - ", valid_items)
+
+                        _monthly_records = interpolate_missing_with_mean(
+                            full_set.loc[(fac, slice(None), valid_items), col].groupby(level=1).mean()
+                        )
+
+                    else:
+                        # If neither fallback yields any data, then assume that there is no change
+                        print("No interpolation worked")
+                        _monthly_records = _monthly_records.fillna(1.0)
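+                # Note on the indexing used above: full_set has a (Facility_ID, month, item_code) MultiIndex,
+                # so full_set.loc[(fac, slice(None), item), col] selects all twelve monthly records for one
+                # facility/item pair, and .groupby(level=1).mean() collapses a multi-facility (or multi-item)
+                # selection back down to one value per month.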
+                # Insert values (including corrections) into the resulting dataset.
+                full_set_interpolated.loc[(fac, slice(None), item), col] = _monthly_records.values
+                # Temporary sanity check
+                assert full_set_interpolated.loc[(fac, slice(None), item), col].mean() >= 0
+
+    # Check that there are no missing values
+    assert not pd.isnull(full_set_interpolated).any().any()
+
+    full_set_interpolated = full_set_interpolated.reset_index()
+
+    # Add to this dataset the original availability for all the other levels of care
+    base_other_levels = tlo_availability_df[
+        ~tlo_availability_df["Facility_Level"].isin(redis_levels)
+    ].copy()
+    for col in list_of_new_scenario_variables:
+        base_other_levels[col] = base_other_levels["available_prop"]
+    base_other_levels = base_other_levels[['Facility_ID', 'month', 'item_code'] + list_of_new_scenario_variables]
+    tlo_redis_final = pd.concat(
+        [full_set_interpolated, base_other_levels],
+        ignore_index=True,
+    )
+    #tlo_redis_final.to_csv(outputfilepath / 'ResourceFile_consumable_availability_after_redistribution.csv', index=False)
+
+    # Verify that the shape of this dataframe is identical to the original availability dataframe
+    assert sorted(set(tlo_redis_final.Facility_ID)) == sorted(set(pd.unique(tlo_availability_df.Facility_ID)))
+    assert sorted(set(tlo_redis_final.month)) == sorted(set(pd.unique(tlo_availability_df.month)))
+    assert sorted(set(tlo_redis_final.item_code)) == sorted(set(pd.unique(tlo_availability_df.item_code)))
+    assert len(tlo_redis_final) == len(tlo_availability_df)
+
+    tlo_redis_final = tlo_availability_df.merge(tlo_redis_final, on=['Facility_ID', 'item_code', 'month'],
+                                                how='left', validate="1:1")
+
+    return tlo_redis_final
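+# Sketch of how the function above is meant to be used (the name below is a placeholder; the enclosing
+# function's actual name and inputs are defined earlier in this script):
+#
+#     df_redis = <redistribution_function>(tlo_availability_df, tlo_redis)
+#     # One column per redistribution scenario sits alongside the baseline 'available_prop':
+#     print(df_redis[['available_prop'] + list_of_new_scenario_variables].mean())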
+ """ + if filter_dict: + for k, v in filter_dict.items(): + if isinstance(v, (list, tuple, set)): + df = df[df[k].isin(v)] + else: + df = df[df[k] == v] + + aggregated_df = df.groupby([y_var])[scenario_cols].mean().reset_index() + heatmap_data = aggregated_df.set_index(y_var) + + # Calculate aggregate column (true overall mean) + aggregate_col = df[scenario_cols].mean() + if rename_scenarios_dict: + aggregate_col = aggregate_col.rename(index=rename_scenarios_dict) + + # Apply column renames (fix variable name) + if rename_scenarios_dict: + heatmap_data = heatmap_data.rename(columns=rename_dict) + heatmap_data.loc['Average'] = aggregate_col + + # Generate the heatmap + sns.set(font_scale=1) + plt.figure(figsize=figsize) + ax = sns.heatmap( # <── assign to ax + heatmap_data, + annot=annot, + cmap=cmap, + vmin=vmin, + vmax=vmax, + cbar_kws={'label': 'Proportion of days on which consumable is available'} + ) + + # Customize the plot + plt.title(title) + plt.xlabel('Scenarios') + plt.ylabel(y_var) + plt.xticks(rotation=90, fontsize=12) + plt.yticks(rotation=0, fontsize=11) + ax.set_xticklabels( + [textwrap.fill(label.get_text(), 20) for label in ax.get_xticklabels()], + rotation=90, ha='center' + ) + ax.set_yticklabels( + [textwrap.fill(label.get_text(), 25) for label in ax.get_yticklabels()], + rotation=0, va='center' + ) + + if figname: + plt.savefig(outputfilepath / figname, dpi=500, bbox_inches='tight') + plt.close() + else: + plt.show() + plt.close() + +''' +# Clean item category +clean_category_names = {'cancer': 'Cancer', 'cardiometabolicdisorders': 'Cardiometabolic Disorders', + 'contraception': 'Contraception', 'general': 'General', 'hiv': 'HIV', 'malaria': 'Malaria', + 'ncds': 'Non-communicable Diseases', 'neonatal_health': 'Neonatal Health', + 'other_childhood_illnesses': 'Other Childhood Illnesses', 'reproductive_health': 'Reproductive Health', + 'road_traffic_injuries': 'Road Traffic Injuries', 'tb': 'Tuberculosis', + 'undernutrition': 'Undernutrition', 'epi': 'Expanded programme on immunization'} +df_for_plots['item_category_clean'] = df_for_plots['item_category'].map(clean_category_names) + +scenario_cols = ['available_prop', 'available_prop_scenario1', 'available_prop_scenario2', 'available_prop_scenario3', + 'available_prop_scenario6', 'available_prop_scenario7', 'available_prop_scenario8', + 'available_prop_scenario16', 'available_prop_scenario17', 'available_prop_scenario18', 'available_prop_scenario19'] +rename_dict = {'available_prop': 'Actual', + 'available_prop_scenario1': 'Non-therapeutic consumables', + 'available_prop_scenario2': 'Vital medicines', + 'available_prop_scenario3': 'Pharmacist-managed', + 'available_prop_scenario6': '75th percentile facility', + 'available_prop_scenario7': '90th percentile acility', + 'available_prop_scenario8': 'Best facility', + 'available_prop_scenario16': 'District Pooling', + 'available_prop_scenario17' : 'Cluster Pooling', + 'available_prop_scenario18': 'Pairwise exchange (60-min radius)', + 'available_prop_scenario19': 'Pairwise exchange (30-min radius)'} +scenario_names = list(rename_dict.values()) + +# Plot heatmap for level 1a +plot_availability_heatmap( + df=df_for_plots, + scenario_cols=scenario_cols, + y_var="item_category_clean", + filter_dict={"Facility_Level": ["1a"]}, + title="Availability across Scenarios — Level 1a", + rename_scenarios_dict = rename_dict, + cmap = "RdYlGn", + figname = 'availability_1a.png' +) + +# Plot heatmap for level 1b +plot_availability_heatmap( + df=df_for_plots, + scenario_cols=scenario_cols, + 
+    y_var="item_category_clean",
+    filter_dict={"Facility_Level": ["1b"]},
+    title="Availability across Scenarios — Level 1b",
+    rename_scenarios_dict=rename_dict,
+    cmap="RdYlGn",
+    figname='availability_1b.png'
+)
+'''
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
index 6af7387301..32281bbe96 100644
--- a/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
+++ b/src/scripts/data_file_processing/healthsystem/consumables/generating_consumable_scenarios/generate_consumable_availability_scenarios_for_impact_analysis.py
@@ -37,6 +37,7 @@
 Consumable availability is measured as probability of consumable being available at any point in time.
 """
 import datetime
+import os
 from collections import defaultdict
 from pathlib import Path
@@ -44,11 +45,9 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
-
-from tlo.methods.consumables import check_format_of_consumables_file
-
 #from plotnine import aes, element_text, geom_bar, ggplot, labs, theme, ylim # for ggplots from R
+from tlo.methods.consumables import check_format_of_consumables_file
 
 # define a timestamp for script outputs
 timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M")
@@ -79,7 +78,7 @@ def generate_alternative_availability_scenarios(tlo_availability_df: pd.DataFram
     # Get TLO Facility_ID for each district and facility level
     mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv")
     districts = set(pd.read_csv(resourcefilepath / 'demography' / 'ResourceFile_Population_2010.csv')['District'])
-    fac_levels = {'0', '1a', '1b', '2', '3', '4'}  # noqa: F841
+    fac_levels = {'0', '1a', '1b', '2', '3', '4'}
     tlo_availability_df = tlo_availability_df.merge(mfl[['District', 'Facility_Level', 'Facility_ID']],
                                                     on=['Facility_ID'], how='left')
@@ -600,7 +599,7 @@ def interpolate_missing_with_mean(_ser):
     #------------------------------------------------------
     # Combine all scenario suffixes into a single list
     scenario_suffixes = list_of_scenario_suffixes + [f'scenario{i}' for i in range(6, 16)]
-    scenario_vars = [f'available_prop_{s}' for s in scenario_suffixes]  # noqa: F841
+    scenario_vars = [f'available_prop_{s}' for s in scenario_suffixes]
     old_vars = ['Facility_ID', 'month', 'item_code']
 
     # Prepare the full base dataframe from scenarios 6–8
diff --git a/src/tlo/analysis/utils.py b/src/tlo/analysis/utils.py
index 6b4d2cbf9b..dc7f07e511 100644
--- a/src/tlo/analysis/utils.py
+++ b/src/tlo/analysis/utils.py
@@ -70,7 +70,11 @@ def parse_log_file(log_filepath, level: int = logging.INFO):
     for line in log_file:
         # only parse lines that are json log lines (old-style logging is not supported)
         if line.startswith('{'):
-            log_data_json = json.loads(line)
+            try:
+                log_data_json = json.loads(line)
+            except json.JSONDecodeError:
+                # Skip malformed or partial JSON log lines
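+                # (e.g. a final line truncated mid-write when a run is killed, such as
+                # '{"uuid": "0x1a2b", "date": "2023-01-0'; json.loads raises JSONDecodeError
+                # on it, and skipping it keeps the rest of the log parseable)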
+                continue
             uuid = log_data_json['uuid']
             # if this is a header line (only header lines have a `type` key)
             if 'type' in log_data_json:
@@ -295,6 +299,7 @@ def extract_results(results_folder: Path,
                     index: str = None,
                     custom_generate_series=None,
                     do_scaling: bool = False,
+                    suspended_results_folder: Path = None,
                     ) -> pd.DataFrame:
     """Utility function to unpack results.
@@ -307,16 +312,18 @@ def extract_results(results_folder: Path,
     `custom_generate_series`.
 
     Optionally, with `do_scaling=True`, each element is multiplied by the scaling_factor recorded in the simulation.
+    If the suspend-and-resume functionality is used, the scaling factor may be available in the folder where the logs of the suspended run are stored.
 
     Note that if runs in the batch have failed (such that logs have not been generated), these are dropped silently.
     """
-    def get_multiplier(_draw, _run):
+
+    def get_multiplier(results_folder, _draw, _run):
         """Helper function to get the multiplier from the simulation.
         Note that if the scaling factor cannot be found a `KeyError` is thrown."""
         return load_pickled_dataframes(
-            results_folder, _draw, _run, 'tlo.methods.population'
-        )['tlo.methods.population']['scaling_factor']['scaling_factor'].values[0]
+            results_folder, _draw, _run, 'tlo.methods.demography'
+        )['tlo.methods.demography']['scaling_factor']['scaling_factor'].values[0]
 
     if custom_generate_series is None:
         # If there is no `custom_generate_series` provided, it implies that function required selects the specified
@@ -352,7 +359,10 @@ def generate_series(dataframe: pd.DataFrame) -> pd.Series:
                     'Custom command does not generate a pd.Series'
                 )
             if do_scaling:
-                res[draw_run] = output_from_eval * get_multiplier(draw, run)
+                if suspended_results_folder is not None:
+                    res[draw_run] = output_from_eval * get_multiplier(suspended_results_folder, 0, 0)
+                else:
+                    res[draw_run] = output_from_eval * get_multiplier(results_folder, draw, run)
             else:
                 res[draw_run] = output_from_eval
@@ -386,7 +396,7 @@ def check_info_value_changes(df):
             prev_info = row["Info"]
     return problems
-    
+
 def remove_events_for_individual_after_death(df):
     rows_to_drop = []
@@ -430,8 +440,8 @@ def reconstruct_individual_histories(df):
     if len(problems)>0:
         print("Values didn't change but were still detected")
         print(problems)
-    
-    
+
+
     return df_final
@@ -643,10 +653,9 @@ def turn_log_into_pickles(logfile):
     print(f"Opening {logfile}")
     outputs = parse_log_file(logfile)
     for key, output in outputs.items():
-        if key.startswith("tlo."):
-            print(f" - Writing {key}.pickle")
-            with open(logfile.parent / f"{key}.pickle", "wb") as f:
-                pickle.dump(output, f)
+        print(f" - Writing {key}.pickle")
+        with open(logfile.parent / f"{key}.pickle", "wb") as f:
+            pickle.dump(output, f)
 
 def uncompress_and_save_logfile(compressed_file) -> Path:
     """Uncompress and save a log file and return its path."""
@@ -662,13 +671,21 @@
     for run_folder in run_folders:
         # Find the original log-file written by the simulation
         if compressed_file_name_prefix is None:
-            logfile = [x for x in os.listdir(run_folder) if x.endswith('.log')][0]
+            logfile = [Path(run_folder) / x for x in os.listdir(run_folder) if x.endswith('.log')][0]
        else:
-            compressed_file_name = [
-                x for x in os.listdir(run_folder) if x.startswith(compressed_file_name_prefix)
-            ][0]
-            logfile = uncompress_and_save_logfile(Path(run_folder) / compressed_file_name)
-
+            logfiles = [
+                Path(run_folder) / x
+                for x in os.listdir(run_folder)
+                if x.endswith(".log.gz")
+                and not x.startswith("stdout")
+                and not x.startswith("stderr")
+            ]
+
+            for compressed_log in logfiles:
+                logfile = uncompress_and_save_logfile(compressed_log)
+                turn_log_into_pickles(logfile)
+
+            # `logfile` is now the last uncompressed log; the call below re-pickles it, which is
+            # harmless because turn_log_into_pickles simply overwrites the existing pickle files.
+            logfile = Path(logfile).resolve()
         turn_log_into_pickles(logfile)
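Note on the hunk above: `suspended_results_folder` only changes where `get_multiplier` reads the scaling factor (always draw 0, run 0 of the suspended run's logs); the results themselves still come from `results_folder`. A minimal usage sketch, assuming hypothetical output paths and a hypothetical log key/column:

    from pathlib import Path
    from tlo.analysis.utils import extract_results

    deaths = extract_results(
        results_folder=Path('outputs/resumed_run'),               # hypothetical path
        module='tlo.methods.demography',
        key='death',                                              # hypothetical log key
        column='person_id',                                       # hypothetical column
        do_scaling=True,
        suspended_results_folder=Path('outputs/suspended_run'),   # hypothetical path
    )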
diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py
index 4b622cbea7..20255f6432 100644
--- a/src/tlo/methods/consumables.py
+++ b/src/tlo/methods/consumables.py
@@ -64,6 +64,10 @@ def __init__(
             "scenario13",
             "scenario14",
             "scenario15",
+            "scenario16",
+            "scenario17",
+            "scenario18",
+            "scenario19",
         }
 
         # Create internal items:
@@ -166,6 +170,10 @@ def _update_prob_item_codes_available(self, availability: str):
             "scenario13",
             "scenario14",
             "scenario15",
+            "scenario16",
+            "scenario17",
+            "scenario18",
+            "scenario19",
         ):
             pass  # change already picked up in `self._process_consumables_data()`
         elif availability == "all":
@@ -228,6 +236,10 @@ def _process_consumables_data(self, availability_data: pd.DataFrame, availabilit
             "scenario13",
             "scenario14",
             "scenario15",
+            "scenario16",
+            "scenario17",
+            "scenario18",
+            "scenario19",
         ):
             return availability_data.set_index(["month", "Facility_ID", "item_code"])[f"available_prop_{availability}"]
diff --git a/src/tlo/methods/healthsystem.py b/src/tlo/methods/healthsystem.py
index 29519a015c..12a36ca8ba 100644
--- a/src/tlo/methods/healthsystem.py
+++ b/src/tlo/methods/healthsystem.py
@@ -207,7 +207,8 @@ class HealthSystem(Module):
             "consumables in the merged 1b/2 facility level."
         ),
         "availability_estimates": Parameter(
-            Types.DATA_FRAME, "Estimated availability of consumables in the LMIS dataset."
+            Types.DICT, "Estimated availability of consumables in the LMIS dataset. The dict contains all the datasets "
+                        "that might be selected using `data_source_for_cons_availability_estimates`."
        ),
         "cons_availability": Parameter(
             Types.STRING,
@@ -646,18 +647,12 @@ def read_parameters(self, resourcefilepath: Optional[Path] = None):
             dtype={"Item_Code": int, "is_diagnostic": bool, "is_medicine": bool, "is_other": bool},
         ).set_index("Item_Code")
 
-        # Choose to read-in the updated availabilty estimates or the legacy availability estimates
-        if self.parameters["data_source_for_cons_availability_estimates"] == 'original':
-            filename_for_cons_availability_estimates = "ResourceFile_Consumables_availability_small_original.csv"
-        elif self.parameters["data_source_for_cons_availability_estimates"] == 'updated':
-            filename_for_cons_availability_estimates = "ResourceFile_Consumables_availability_small.csv"
-        else:
-            raise ValueError("data_source_for_cons_availability_estimates should be either 'original' or 'updated'")
-
-        self.parameters["availability_estimates"] = pd.read_csv(
-            path_to_resourcefiles_for_healthsystem / "consumables" / filename_for_cons_availability_estimates
-        )
-
+        # Read in both availability datasets; the one to use is selected later via
+        # `data_source_for_cons_availability_estimates`.
+        def read_consumables(filename):
+            return pd.read_csv(path_to_resourcefiles_for_healthsystem / "consumables" / filename)
+        self.parameters["availability_estimates"] = {
+            "original": read_consumables("ResourceFile_Consumables_availability_small_original.csv"),
+            "updated": read_consumables("ResourceFile_Consumables_availability_small.csv"),
+        }
 
         # Data on the number of beds available of each type by facility_id
         self.parameters["BedCapacity"] = pd.read_csv(
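With `availability_estimates` now a dict of DataFrames keyed by data source, selecting the active dataset becomes a plain dictionary lookup, as the next hunk does before constructing `Consumables`. A standalone sketch of the pattern with toy stand-in frames (note that an unknown source now surfaces as a `KeyError` at lookup time, where the removed code raised a `ValueError` at read time):

    import pandas as pd

    parameters = {
        "availability_estimates": {
            "original": pd.DataFrame({"available_prop": [0.5]}),  # toy stand-in
            "updated": pd.DataFrame({"available_prop": [0.7]}),   # toy stand-in
        },
        "data_source_for_cons_availability_estimates": "updated",
    }

    # Select the active dataset; unknown sources raise KeyError here.
    availability_data = parameters["availability_estimates"][
        parameters["data_source_for_cons_availability_estimates"]
    ]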
@@ -797,11 +792,14 @@ def pre_initialise_population(self):
         self.bed_days = BedDays(hs_module=self, availability=self.get_beds_availability())
         self.bed_days.pre_initialise_population()
 
+        # Select the requested availability dataset and adapt it to represent the merging of levels 1b and 2
+        _availability_data = self.update_consumables_availability_to_represent_merging_of_levels_1b_and_2(
+            self.parameters["availability_estimates"][self.parameters["data_source_for_cons_availability_estimates"]]
+        )
+
         # Initialise the Consumables class
         self.consumables = Consumables(
-            availability_data=self.update_consumables_availability_to_represent_merging_of_levels_1b_and_2(
-                self.parameters["availability_estimates"]
-            ),
+            availability_data=_availability_data,
             item_code_designations=self.parameters["consumables_item_designations"],
             rng=rng_for_consumables,
             availability=self.get_cons_availability(),