feat: add multi-variant convergence and Pareto comparison visualizations

MichalRedm · MichalRedm · commit b963c094e59a · 2026-04-18T14:45:29.000+02:00
diff --git a/scripts/experiment_moead_variants.py b/scripts/experiment_moead_variants.py
@@ -0,0 +1,232 @@
+import os
+import sys
+import random
+import hashlib
+import numpy as np
+import pandas as pd
+import time
+from datetime import datetime
+
+# Add project root to sys.path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from src.config import (
+    ASSET_NAMES,
+    DATA_FOLDER,
+    OUTPUT_FOLDER,
+    TRAINING_ITERATIONS,
+    NUM_MIXTURES,
+    METRIC_TYPE,
+)
+from src.data_handler import load_data
+from src.predictors import ExactGPPredictor
+from src.portfolio import (
+    calculate_expected_returns_and_cov,
+    EpsilonConstrainedOptimizer,
+    MOEADOptimizer,
+    MOEADDRAOptimizer,
+    MOEADAWAOptimizer,
+)
+from src.portfolio.metrics import calculate_igd, calculate_hypervolume
+from src.visualization import (
+    plot_variants_convergence,
+    plot_variants_comparison,
+)
+
+
+def get_data_hash(folder_path):
+    """Return a SHA-256 hex digest of the contents of every file under a folder.
+
+    The tree is traversed in a deterministic order (sub-directories and file
+    names are both sorted), so identical data yields the same digest no matter
+    in which order the file system happens to list directory entries.
+
+    Args:
+        folder_path: Root directory that is walked recursively.
+
+    Returns:
+        The hexadecimal SHA-256 digest as a string. Only file *contents* are
+        fed to the hash; file names and paths are not mixed in. If the walk
+        yields no files, the digest of empty input is returned.
+    """
+    hasher = hashlib.sha256()
+    for root, dirs, files in os.walk(folder_path):
+        # os.walk (top-down) descends into `dirs` in list order; sorting the
+        # list in place makes that order reproducible across platforms.
+        dirs.sort()
+        for name in sorted(files):
+            file_path = os.path.join(root, name)
+            with open(file_path, "rb") as f:
+                # Stream in fixed-size chunks so large files are not loaded
+                # into memory at once.
+                while chunk := f.read(8192):
+                    hasher.update(chunk)
+    return hasher.hexdigest()
+
+
+def seed_everything(seed=42):
+    """Seed the Python, NumPy and (if installed) PyTorch random generators.
+
+    Args:
+        seed: Integer seed handed to every generator.
+    """
+    for seed_fn in (random.seed, np.random.seed):
+        seed_fn(seed)
+    try:
+        from torch import manual_seed
+        manual_seed(seed)
+    except ImportError:
+        # torch is an optional dependency; without it there is nothing to seed.
+        pass
+
+
+def run_comparative_experiment(num_runs=5, pop_size=100, gens=200, seed=42):
+    """Benchmark several MOEA/D variants against an ECM reference front.
+
+    The experiment seeds the RNGs once, derives expected returns and the
+    covariance matrix from the loaded data, builds a 100-point reference front
+    with the epsilon-constrained optimizer, runs every variant ``num_runs``
+    times while recording the per-generation quality metric, and finally
+    writes convergence/Pareto plots plus CSV summaries into a timestamped
+    sub-folder of ``OUTPUT_FOLDER``.
+
+    Args:
+        num_runs: Number of independent repetitions per variant (at least 1).
+        pop_size: Passed to each optimizer as ``num_points``.
+        gens: Passed to each optimizer as ``generations``.
+        seed: Seed handed to ``seed_everything`` before anything else runs.
+
+    Raises:
+        ValueError: If ``num_runs`` is smaller than 1 (no run would provide a
+            final front or statistics to report).
+    """
+    if num_runs < 1:
+        raise ValueError("num_runs must be at least 1")
+
+    seed_everything(seed)
+
+    # Setup folders
+    run_id = datetime.now().strftime("%Y%m%d_%H%M%S_moead_variants_comp")
+    run_folder = os.path.join(OUTPUT_FOLDER, run_id)
+    os.makedirs(run_folder, exist_ok=True)
+
+    print(f"Starting MOEA/D Variants Comparison. Results will be saved to {run_folder}")
+
+    # 1. Load data and calculate expected returns/cov
+    data_hash = get_data_hash(DATA_FOLDER)
+    raw_data_dict = load_data(DATA_FOLDER, ASSET_NAMES, verbose=False)
+    predictor = ExactGPPredictor(
+        training_iterations=TRAINING_ITERATIONS,
+        num_mixtures=NUM_MIXTURES,
+        verbose=False,
+        global_data_hash=data_hash,
+    )
+
+    sample_timestamps = raw_data_dict[ASSET_NAMES[0]]["timestamps"]
+    target_timestamp = sample_timestamps[-1] + 100.0  # 100 steps ahead
+
+    expected_returns, cov_matrix = calculate_expected_returns_and_cov(
+        raw_data_dict, predictor, target_timestamp, ASSET_NAMES, verbose=False
+    )
+
+    data_kwargs = {"expected_returns": expected_returns, "cov_matrix": cov_matrix}
+
+    # 2. Generate Reference Front (2D) using ECM
+    print("Generating Reference Front using Epsilon-Constrained Method...")
+    ecm = EpsilonConstrainedOptimizer()
+    ref_metrics, _ = ecm.generate_pareto_front(100, verbose=False, **data_kwargs)
+    ref_front_2d = np.column_stack([ref_metrics["Return"], ref_metrics["Risk"]])
+
+    # 3. Define Variants
+    variants = {
+        "Standard MOEA/D": MOEADOptimizer,
+        "MOEA/D-DRA": MOEADDRAOptimizer,
+        "MOEA/D-AWA": MOEADAWAOptimizer,
+    }
+
+    if METRIC_TYPE == "hypervolume":
+        metric_func = calculate_hypervolume
+        metric_name = "Hypervolume"
+    else:
+        metric_func = calculate_igd
+        metric_name = "IGD"
+
+    results = {}
+
+    # Final metrics storage for summary
+    summary_data = []
+
+    for name, opt_class in variants.items():
+        print(f"Evaluating {name}...")
+        all_metric_histories = []
+        all_times = []
+        final_metrics_list = []
+        # After the loop these hold the output of the LAST run, which is the
+        # front used for the comparison plot and the CSV export.
+        metrics = weights = None
+
+        for run in range(num_runs):
+            print(f"  Run {run+1}/{num_runs}...", end="\r")
+            # perf_counter is monotonic, so the measured interval cannot be
+            # distorted by system clock adjustments.
+            start_time = time.perf_counter()
+            optimizer = opt_class()
+
+            # Run optimizer
+            metrics, weights, _vectors, history = optimizer.generate_pareto_front(
+                num_points=pop_size,
+                generations=gens,
+                verbose=False,
+                record_history=True,
+                **data_kwargs
+            )
+            all_times.append(time.perf_counter() - start_time)
+
+            # Process history
+            # gen_f contains objective values
+            # objectives are Return (max) and Risk (min)
+            all_metric_histories.append(
+                [metric_func(gen_f[:, :2], ref_front_2d) for gen_f in history]
+            )
+
+            # Calculate final metrics for this run
+            final_front = np.column_stack([metrics["Return"], metrics["Risk"]])
+            final_metrics_list.append(metric_func(final_front, ref_front_2d))
+
+        print(f"  {name} Finished. Avg Time: {np.mean(all_times):.2f}s")
+
+        # Mean/std are taken across runs (axis 0), giving one value per
+        # recorded generation.
+        results[name] = {
+            "metrics": metrics,
+            "weights": weights,
+            "history_mean": np.mean(all_metric_histories, axis=0),
+            "history_std": np.std(all_metric_histories, axis=0),
+            "avg_time": np.mean(all_times),
+            "std_time": np.std(all_times),
+            "avg_metric": np.mean(final_metrics_list),
+            "std_metric": np.std(final_metrics_list),
+        }
+
+        summary_data.append({
+            "Variant": name,
+            f"Avg {metric_name}": results[name]["avg_metric"],
+            f"Std {metric_name}": results[name]["std_metric"],
+            "Avg Time (s)": results[name]["avg_time"],
+            "Std Time (s)": results[name]["std_time"]
+        })
+
+    # 4. Generate Visualizations
+    print("\nGenerating comparative visualizations...")
+
+    # Convergence Plot
+    histories_dict = {
+        name: (res["history_mean"], res["history_std"])
+        for name, res in results.items()
+    }
+    plot_variants_convergence(
+        histories_dict,
+        metric_name=metric_name,
+        save_path=os.path.join(run_folder, "convergence_comparison.png")
+    )
+
+    # Pareto Front Comparison
+    variants_fronts = {
+        name: res["metrics"] for name, res in results.items()
+    }
+    plot_variants_comparison(
+        ref_metrics,
+        variants_fronts,
+        save_path=os.path.join(run_folder, "pareto_comparison.png")
+    )
+
+    # 5. Export Results
+    print("Exporting results...")
+    df_summary = pd.DataFrame(summary_data)
+    df_summary.to_csv(os.path.join(run_folder, "metrics_comparison.csv"), index=False)
+
+    # Export Pareto fronts for each variant: one row per front point holding
+    # every metric value followed by one weight column per asset.
+    for name, res in results.items():
+        safe_name = name.lower().replace("/", "_").replace(" ", "_")
+        records = []
+        metrics = res["metrics"]
+        weights = res["weights"]
+        num_points = len(next(iter(metrics.values())))
+        for i in range(num_points):
+            rec = {m_name: m_vals[i] for m_name, m_vals in metrics.items()}
+            for j, w in enumerate(weights[i]):
+                rec[ASSET_NAMES[j]] = w
+            records.append(rec)
+        df_front = pd.DataFrame(records)
+        df_front.to_csv(os.path.join(run_folder, f"pareto_front_{safe_name}.csv"), index=False)
+
+    print(f"\nExperiment finished successfully. All results are in: {run_folder}")
+
+
+# Run the full comparison only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    run_comparative_experiment()
diff --git a/src/visualization.py b/src/visualization.py
@@ -78,13 +78,22 @@ def plot_pareto_front(risks, returns, save_path=None):
         plt.savefig(save_path)
 
 
-def plot_moead_convergence(avg_metric_history, metric_name="IGD", save_path=None):
+def plot_moead_convergence(mean_history, std_history=None, metric_name="IGD", save_path=None):
+    """Plot a mean convergence curve, optionally with a shaded std-dev band.
+
+    Args:
+        mean_history: Mean metric value per generation (plotted as the line).
+            When ``std_history`` is given, the two are combined with ``-`` and
+            ``+``, so both must support element-wise arithmetic (e.g. NumPy
+            arrays; plain Python lists would not work).
+        std_history: Optional spread per generation. When not ``None``, the
+            band ``mean_history - std_history`` .. ``mean_history + std_history``
+            is filled.
+        metric_name: Metric label used in the title and on the y-axis.
+        save_path: If truthy, the figure is written there via ``plt.savefig``.
+    """
     plt.figure(figsize=(10, 6))
-    plt.plot(avg_metric_history)
-    plt.title(f"MOEA/D Convergence (Average {metric_name} over 5 runs)")
+    plt.plot(mean_history, label="Mean")
+    if std_history is not None:
+        plt.fill_between(
+            range(len(mean_history)),
+            mean_history - std_history,
+            mean_history + std_history,
+            alpha=0.2,
+            label="Std Dev"
+        )
+    plt.title(f"MOEA/D Convergence ({metric_name})")
     plt.xlabel("Generation")
     plt.ylabel(metric_name)
-    plt.grid(True)
+    plt.legend()
+    plt.grid(True, alpha=0.3)
     if save_path:
         plt.savefig(save_path)
 
@@ -305,3 +314,90 @@ def plot_moead_sensitivity_heatmap(
 
     if save_path:
         plt.savefig(save_path)
+
+
+def plot_variants_convergence(
+    histories_dict, metric_name="IGD", save_path=None
+):
+    """
+    Plots convergence history for multiple algorithm variants.
+
+    Each variant gets one line (its mean history) and, when a std history is
+    supplied, a translucent band of mean +/- std in the same color. Colors are
+    taken from a fixed five-entry palette and reused cyclically.
+
+    Args:
+        histories_dict: Dict mapping {name: (mean_history, std_history)}.
+            Histories may be NumPy arrays or plain sequences of numbers;
+            std_history may be None to skip the band.
+        metric_name: Name of the metric (e.g., "IGD")
+        save_path: Path to save the plot
+    """
+    # Local import keeps this function self-contained regardless of what the
+    # module imports at the top.
+    import numpy as np
+
+    plt.figure(figsize=(12, 7))
+    colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"]
+
+    for i, (name, (mean, std)) in enumerate(histories_dict.items()):
+        color = colors[i % len(colors)]
+        # Coerce to arrays so the +/- below is element-wise even when the
+        # caller passes plain lists (list - list raises TypeError).
+        mean = np.asarray(mean, dtype=float)
+        plt.plot(mean, label=name, linewidth=2, color=color)
+        if std is not None:
+            std = np.asarray(std, dtype=float)
+            plt.fill_between(
+                range(len(mean)),
+                mean - std,
+                mean + std,
+                alpha=0.15,
+                color=color
+            )
+
+    plt.title(f"Convergence Comparison ({metric_name})")
+    plt.xlabel("Generation")
+    plt.ylabel(metric_name)
+    plt.legend()
+    plt.grid(True, alpha=0.3)
+    if save_path:
+        plt.savefig(save_path)
+
+
+def plot_variants_comparison(
+    ref_metrics, variants_results, save_path=None
+):
+    """
+    Draws the final fronts of several variants over an optional reference front.
+
+    Every front is scattered using its "Return" values on the x-axis and its
+    "Risk" values on the y-axis.
+
+    Args:
+        ref_metrics: Metrics dict of the reference front (ECM), or None to
+            omit the reference points
+        variants_results: Dict mapping {name: metrics_dict}
+        save_path: Path to save the plot
+    """
+    palette = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"]
+    plt.figure(figsize=(12, 8))
+
+    # Reference front: faint black crosses drawn beneath the variants
+    if ref_metrics is not None:
+        reference_style = dict(
+            c="black",
+            marker="x",
+            s=30,
+            label="Reference (ECM)",
+            alpha=0.4,
+            zorder=1,
+        )
+        plt.scatter(ref_metrics["Return"], ref_metrics["Risk"], **reference_style)
+
+    # Variant fronts: filled circles, palette reused cyclically
+    for idx, (variant_name, front) in enumerate(variants_results.items()):
+        variant_style = dict(
+            marker="o",
+            s=40,
+            label=variant_name,
+            alpha=0.6,
+            color=palette[idx % len(palette)],
+            edgecolors="white",
+            linewidth=0.5,
+            zorder=2,
+        )
+        plt.scatter(front["Return"], front["Risk"], **variant_style)
+
+    plt.title("Pareto Front Comparison: MOEA/D Variants")
+    plt.xlabel("Expected Return")
+    plt.ylabel("Expected Risk")
+    plt.legend()
+    plt.grid(True, alpha=0.3)
+    if save_path:
+        plt.savefig(save_path)