| 1 | +""" pyplots.ai |
| 2 | +learning-curve-basic: Model Learning Curve |
| 3 | +Library: letsplot 4.8.2 | Python 3.13.11 |
| 4 | +Quality: 92/100 | Created: 2025-12-26 |
| 5 | +""" |
| 6 | + |
| 7 | +import numpy as np |
| 8 | +import pandas as pd |
| 9 | +from lets_plot import * # noqa: F403 |
| 10 | +from lets_plot import ggsave |
| 11 | + |
| 12 | + |
# One-time lets-plot initialization: enables HTML rendering for the session.
LetsPlot.setup_html()  # noqa: F405
| 14 | + |
# Data - Simulate learning curve for a model showing slight overfitting pattern
np.random.seed(42)  # fixed seed so the simulated figure is reproducible

# Training set sizes (10 sizes, denser at the small end where curves change fastest)
train_sizes = np.array([50, 100, 200, 400, 600, 800, 1000, 1200, 1400, 1600])

# Simulate 5 cross-validation folds
n_folds = 5
n_sizes = len(train_sizes)

# Training scores: start high and stay high (the model fits its training data well).
# A single vectorized (n_folds, n_sizes) draw replaces the per-fold Python loop;
# the legacy RandomState stream fills the 2-D array in C order, which yields the
# exact same values as n_folds sequential randn(n_sizes) calls.
train_scores_mean = 0.99 - 0.15 * np.exp(-train_sizes / 200)
train_scores = train_scores_mean + np.random.randn(n_folds, n_sizes) * 0.01

# Validation scores: start lower and improve with more data (learning effect);
# the gap to the training curve narrows as the training set grows.
validation_scores_mean = 0.65 + 0.20 * (1 - np.exp(-train_sizes / 500))
validation_scores = validation_scores_mean + np.random.randn(n_folds, n_sizes) * 0.02

# Per-size mean and standard deviation across folds (ribbon half-width = 1 SD)
train_mean = np.mean(train_scores, axis=0)
train_std = np.std(train_scores, axis=0)
val_mean = np.mean(validation_scores, axis=0)
val_std = np.std(validation_scores, axis=0)
| 45 | + |
# Tidy DataFrames for plotting: one row per (training size, curve type),
# with Lower/Upper = mean ± 1 SD for the uncertainty ribbon.
def _score_band(mean, std, label):
    """Build one curve's frame: mean line plus ±1 SD ribbon bounds."""
    return pd.DataFrame(
        {
            "Training Set Size": train_sizes,
            "Score": mean,
            "Lower": mean - std,
            "Upper": mean + std,
            "Type": label,
        }
    )


df_train = _score_band(train_mean, train_std, "Training Score")
df_val = _score_band(val_mean, val_std, "Validation Score")

df = pd.concat([df_train, df_val], ignore_index=True)
| 68 | + |
# Plot: layered learning curve — ribbons underneath, then lines, then points.
plot = (
    ggplot(df, aes(x="Training Set Size", y="Score", color="Type", fill="Type"))
    # ±1 SD band per curve; fully transparent outline so only the fill shows
    + geom_ribbon(aes(ymin="Lower", ymax="Upper"), alpha=0.2, color="rgba(0,0,0,0)")
    + geom_line(size=2)
    + geom_point(size=4)
    # Python-logo palette: blue = Training Score, yellow = Validation Score
    + scale_color_manual(values=["#306998", "#FFD43B"])
    + scale_fill_manual(values=["#306998", "#FFD43B"])
    # y upper limit 1.02 leaves headroom so markers near 1.0 are not clipped
    + scale_y_continuous(limits=[0.55, 1.02])
    + scale_x_continuous(limits=[0, 1700], breaks=list(range(0, 1800, 200)))
    + labs(
        x="Training Set Size (samples)",
        y="Accuracy Score",
        title="learning-curve-basic · letsplot · pyplots.ai",
        # empty legend titles: the series labels are self-explanatory
        color="",
        fill="",
    )
    + theme_minimal()
    + theme(
        plot_title=element_text(size=24),
        axis_title=element_text(size=20),
        axis_text=element_text(size=16),
        legend_text=element_text(size=16),
        legend_position="bottom",
        panel_grid_major=element_line(color="#CCCCCC", size=0.5),
        panel_grid_minor=element_blank(),
    )
    + ggsize(1600, 900)  # 16:9 canvas
)
| 98 | + |
# Save as PNG (scale 3x = 4800 x 2700 px) and HTML
# (interactive HTML keeps the native 1600 x 900 canvas)
ggsave(plot, "plot.png", path=".", scale=3)
ggsave(plot, "plot.html", path=".")