|
1 | 1 | """ pyplots.ai |
2 | 2 | box-basic: Basic Box Plot |
3 | | -Library: letsplot 4.8.1 | Python 3.13.11 |
4 | | -Quality: 98/100 | Created: 2025-12-23 |
| 3 | +Library: letsplot 4.8.2 | Python 3.14 |
| 4 | +Quality: 90/100 | Created: 2025-12-23 |
5 | 5 | """ |
6 | 6 |
|
7 | 7 | import numpy as np |
8 | 8 | import pandas as pd |
9 | 9 | from lets_plot import ( |
10 | 10 | LetsPlot, |
11 | 11 | aes, |
| 12 | + as_discrete, |
| 13 | + element_blank, |
| 14 | + element_line, |
| 15 | + element_rect, |
12 | 16 | element_text, |
| 17 | + flavor_high_contrast_light, |
13 | 18 | geom_boxplot, |
| 19 | + geom_hline, |
| 20 | + geom_text, |
14 | 21 | ggplot, |
15 | 22 | ggsave, |
16 | 23 | ggsize, |
17 | 24 | labs, |
| 25 | + layer_tooltips, |
18 | 26 | scale_fill_manual, |
| 27 | + scale_y_continuous, |
19 | 28 | theme, |
20 | | - theme_minimal, |
21 | 29 | ) |
22 | 30 |
|
23 | 31 |
|
|
27 | 35 | np.random.seed(42) |
28 | 36 | categories = ["Engineering", "Marketing", "Sales", "HR", "Finance"] |
29 | 37 | data = [] |
30 | | -# Realistic salary distributions for each department |
31 | 38 | distributions = { |
32 | 39 | "Engineering": (85000, 15000), |
33 | 40 | "Marketing": (65000, 12000), |
|
40 | 47 | mean, std = distributions[cat] |
41 | 48 | n = np.random.randint(50, 100) |
42 | 49 | values = np.random.normal(mean, std, n) |
43 | | - # Add a few outliers |
44 | 50 | outliers = np.random.choice([mean + 3.5 * std, mean - 2.5 * std], size=3) |
45 | 51 | values = np.concatenate([values, outliers]) |
46 | 52 | data.extend([(cat, v) for v in values]) |
47 | 53 |
|
48 | | -df = pd.DataFrame(data, columns=["category", "value"]) |
| 54 | +df = pd.DataFrame(data, columns=["department", "salary"]) |
| 55 | + |
| 56 | +# Compute medians for annotation labels |
| 57 | +medians = df.groupby("department")["salary"].median().reset_index() |
| 58 | +medians.columns = ["department", "median_salary"] |
| 59 | +medians["label"] = medians["median_salary"].apply(lambda x: f"${x:,.0f}") |
| 60 | + |
| 61 | +# Insight: percentage by which the highest department median exceeds the lowest |
| 62 | +sorted_medians = medians.sort_values("median_salary") |
| 63 | +low_dept = sorted_medians.iloc[0] |
| 64 | +high_dept = sorted_medians.iloc[-1] |
| 65 | +pct_diff = (high_dept["median_salary"] - low_dept["median_salary"]) / low_dept["median_salary"] |
| 66 | +insight_text = f"+{pct_diff:.0%} vs. {low_dept['department']}" |
| 67 | + |
| 68 | +# Overall mean for reference line |
| 69 | +overall_mean = df["salary"].mean() |
| 70 | + |
| 71 | +# Annotation dataframes |
| 72 | +insight_df = pd.DataFrame( |
| 73 | + { |
| 74 | + "department": [high_dept["department"]], |
| 75 | + "y": [high_dept["median_salary"] + 22000], |
| 76 | + "lbl": [f"{high_dept['department'][:3]}. {insight_text}"], |
| 77 | + } |
| 78 | +) |
| 79 | +mean_label_df = pd.DataFrame( |
| 80 | + {"department": [high_dept["department"]], "y": [overall_mean + 3000], "lbl": [f"Avg: ${overall_mean:,.0f}"]} |
| 81 | +) |
49 | 82 |
|
50 | 83 | # Plot |
51 | | -colors = ["#306998", "#FFD43B", "#DC2626", "#16A34A", "#9333EA"] |
| 84 | +# Five hues from the Wong colorblind-safe palette (only one blue, so no two hues are similar) |
| 85 | +colors = ["#0072B2", "#E69F00", "#D55E00", "#009E73", "#CC79A7"] |
52 | 86 |
|
53 | 87 | plot = ( |
54 | | - ggplot(df, aes(x="category", y="value", fill="category")) |
55 | | - + geom_boxplot(alpha=0.8, size=1.5, outlier_size=4) |
| 88 | + ggplot(df, aes(x=as_discrete("department", order=1, order_by="..middle.."), y="salary", fill="department")) |
| 89 | + + geom_boxplot( |
| 90 | + alpha=0.85, |
| 91 | + size=1.2, |
| 92 | + outlier_size=5, |
| 93 | + outlier_shape=21, |
| 94 | + outlier_color="#333333", |
| 95 | + width=0.72, |
| 96 | + tooltips=layer_tooltips() |
| 97 | + .title("@department") |
| 98 | + .line("Median|$@{..middle..}") |
| 99 | + .line("Q1|$@{..lower..}") |
| 100 | + .line("Q3|$@{..upper..}") |
| 101 | + .line("Min|$@{..ymin..}") |
| 102 | + .line("Max|$@{..ymax..}"), |
| 103 | + ) |
56 | 104 | + scale_fill_manual(values=colors) |
57 | | - + labs(x="Department", y="Salary ($)", title="box-basic · letsplot · pyplots.ai") |
58 | | - + theme_minimal() |
| 105 | + # Median value labels, nudged 5,000 above each median line |
| 106 | + + geom_text( |
| 107 | + aes(x="department", y="median_salary", label="label"), |
| 108 | + data=medians, |
| 109 | + size=11, |
| 110 | + color="#333333", |
| 111 | + fontface="bold", |
| 112 | + nudge_y=5000, |
| 113 | + inherit_aes=False, |
| 114 | + ) |
| 115 | + # Overall mean reference line |
| 116 | + + geom_hline(yintercept=overall_mean, color="#888888", size=0.8, linetype="dashed") |
| 117 | + + geom_text( |
| 118 | + aes(x="department", y="y", label="lbl"), |
| 119 | + data=mean_label_df, |
| 120 | + size=10, |
| 121 | + color="#666666", |
| 122 | + fontface="italic", |
| 123 | + hjust=0.5, |
| 124 | + inherit_aes=False, |
| 125 | + ) |
| 126 | + # Key insight annotation |
| 127 | + + geom_text( |
| 128 | + aes(x="department", y="y", label="lbl"), |
| 129 | + data=insight_df, |
| 130 | + size=11, |
| 131 | + color="#1E4F72", |
| 132 | + fontface="bold italic", |
| 133 | + inherit_aes=False, |
| 134 | + ) |
| 135 | + + scale_y_continuous(format="${,.0f}") |
| 136 | + + labs( |
| 137 | + x="Department", |
| 138 | + y="Annual Salary (USD)", |
| 139 | + title="box-basic \u00b7 letsplot \u00b7 pyplots.ai", |
| 140 | + subtitle="Salary distributions across five departments, ordered by median", |
| 141 | + ) |
| 142 | + + flavor_high_contrast_light() |
59 | 143 | + theme( |
| 144 | + plot_title=element_text(size=24, face="bold"), |
| 145 | + plot_subtitle=element_text(size=16, color="#555555"), |
60 | 146 | axis_title=element_text(size=20), |
61 | 147 | axis_text=element_text(size=16), |
62 | | - plot_title=element_text(size=24), |
| 148 | + axis_ticks=element_blank(), |
| 149 | + panel_grid_major_x=element_blank(), |
| 150 | + panel_grid_minor=element_blank(), |
| 151 | + panel_grid_major_y=element_line(color="#DDDDDD", size=0.5), |
63 | 152 | legend_position="none", |
| 153 | + plot_background=element_rect(fill="white", color="white"), |
| 154 | + plot_margin=[10, 35, 10, 10], |
64 | 155 | ) |
65 | 156 | + ggsize(1600, 900) |
66 | 157 | ) |
|
0 commit comments