|
""" pyplots.ai
gain-curve: Cumulative Gains Chart
Library: altair 6.0.0 | Python 3.13.11
Quality: 91/100 | Created: 2025-12-29
"""
| 6 | + |
| 7 | +import altair as alt |
| 8 | +import numpy as np |
| 9 | +import pandas as pd |
| 10 | + |
| 11 | + |
# Data - simulated model scores and observed responses for 1,000 customers.
# The fixed seed on NumPy's global generator keeps every run reproducible;
# the order of the three random draws below must not change.
np.random.seed(42)
n_samples = 1000

# Scores: a skewed Beta(2, 5) base plus Gaussian noise, clipped to [0, 1].
base_score = np.random.beta(2, 5, n_samples)
noise = np.random.normal(0, 0.15, n_samples)
y_score = np.clip(base_score + noise, 0, 1)

# Outcomes: response probability rises linearly with the score
# (0.05 at score 0, 0.35 at score 1), so higher scores respond more often.
response_prob = 0.3 * y_score + 0.05
y_true = (np.random.random(n_samples) < response_prob).astype(int)

# Cumulative gains: rank customers from highest to lowest score, then track
# how many of the positives have been seen after each additional customer.
sorted_indices = np.argsort(y_score)[::-1]
y_true_sorted = y_true[sorted_indices]

cumulative_positives = np.cumsum(y_true_sorted)
total_positives = y_true_sorted.sum()
if total_positives == 0:
    # Without this guard the division below would yield a NaN/inf curve.
    raise ValueError("cumulative gains are undefined: the sample contains no positive cases")

pct_population = np.arange(1, n_samples + 1) / n_samples * 100
pct_gain = cumulative_positives / total_positives * 100

# Prepend the origin so the curve starts at (0 %, 0 %).
pct_population = np.insert(pct_population, 0, 0)
pct_gain = np.insert(pct_gain, 0, 0)

# Thin the curve for a smoother line: keep the origin, every 10th point,
# and the final point.
last_idx = len(pct_population) - 1
sample_idx = np.concatenate([[0], np.arange(10, last_idx, 10), [last_idx]])
pct_population_smooth = pct_population[sample_idx]
pct_gain_smooth = pct_gain[sample_idx]

# Gains curve of the model.
df_gains = pd.DataFrame({"population": pct_population_smooth, "gain": pct_gain_smooth, "Type": "Model"})

# Diagonal reference: selecting customers at random captures positives in
# proportion to the share of the population targeted.
df_baseline = pd.DataFrame({"population": [0, 100], "gain": [0, 100], "Type": "Random (Baseline)"})

# Long-form table holding both series, distinguished by the "Type" column.
df_combined = pd.concat([df_gains, df_baseline], ignore_index=True)
| 53 | + |
# Core chart layers. Each series is selected from the combined table once;
# the model rows feed both the line and the shaded area, so the two marks are
# always drawn from identical data.
model_rows = df_combined[df_combined["Type"] == "Model"]
baseline_rows = df_combined[df_combined["Type"] == "Random (Baseline)"]

# Gains curve. This is the only layer that spells out axis titles, the
# 0-100 domains and font sizes; the other layers use shorthand encodings.
model_curve = (
    alt.Chart(model_rows)
    .mark_line(
        strokeWidth=4,
        color="#306998",
        interpolate="monotone",  # smooth curve through the subsampled points
    )
    .encode(
        x=alt.X(
            "population:Q",
            title="Population Targeted (%)",
            scale=alt.Scale(domain=[0, 100]),
            axis=alt.Axis(titleFontSize=22, labelFontSize=18, tickCount=10),
        ),
        y=alt.Y(
            "gain:Q",
            title="Positive Cases Captured (%)",
            scale=alt.Scale(domain=[0, 100]),
            axis=alt.Axis(titleFontSize=22, labelFontSize=18, tickCount=10),
        ),
    )
)

# Light fill under the model curve, using the same interpolation as the line
# so the fill edge follows the stroke.
area_model = (
    alt.Chart(model_rows)
    .mark_area(opacity=0.15, color="#306998", interpolate="monotone")
    .encode(x="population:Q", y="gain:Q")
)

# Dashed grey diagonal for the random-selection baseline.
baseline_line = (
    alt.Chart(baseline_rows)
    .mark_line(strokeWidth=3, strokeDash=[8, 4], color="#888888")
    .encode(x="population:Q", y="gain:Q")
)
| 91 | + |
# Hand-drawn legend: each entry is a short line sample plus a text label,
# positioned in the same x/y data coordinates as the curves (upper-left area
# of the plot) rather than produced by an Altair legend.
def _legend_entry(y, label, **line_style):
    """Build one legend row at height ``y``.

    Args:
        y: Vertical position of the row, in the chart's y data units.
        label: Text shown to the right of the line sample.
        **line_style: Keyword arguments forwarded to ``mark_line`` so the
            sample matches the series it describes.

    Returns:
        A ``(line, text)`` tuple of Altair charts: the sample spans x = 8..18
        and the label is left-aligned at x = 20.
    """
    line = (
        alt.Chart(pd.DataFrame({"x": [8, 18], "y": [y, y]}))
        .mark_line(**line_style)
        .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"))
    )
    text = (
        alt.Chart(pd.DataFrame({"x": [20], "y": [y], "text": [label]}))
        .mark_text(align="left", fontSize=18, color="#333333")
        .encode(x="x:Q", y="y:Q", text="text:N")
    )
    return line, text


# Line styles repeat those of the model curve and the baseline diagonal.
legend_model_line, legend_model_text = _legend_entry(94, "Model", strokeWidth=4, color="#306998")
legend_baseline_line, legend_baseline_text = _legend_entry(
    88, "Random (Baseline)", strokeWidth=3, strokeDash=[8, 4], color="#888888"
)
| 116 | + |
# Combine all layers. Order is the drawing order: the shaded area goes first,
# then the baseline, then the model curve, with the legend pieces drawn last.
chart = (
    alt.layer(
        area_model,
        baseline_line,
        model_curve,
        legend_model_line,
        legend_model_text,
        legend_baseline_line,
        legend_baseline_text,
    )
    .properties(
        width=1600,
        height=900,
        title=alt.Title("gain-curve · altair · pyplots.ai", fontSize=28, anchor="middle"),
    )
    # Faint dashed grid on every axis; no border around the plotting area.
    .configure_axis(grid=True, gridOpacity=0.3, gridDash=[2, 2])
    .configure_view(strokeWidth=0)
)

# Write the outputs to the working directory: a PNG exported with
# scale_factor=3.0 and an HTML version of the same chart.
chart.save("plot.png", scale_factor=3.0)
chart.save("plot.html")
0 commit comments