|
1 | 1 | """ pyplots.ai |
2 | 2 | violin-basic: Basic Violin Plot |
3 | | -Library: bokeh 3.8.1 | Python 3.13.11 |
4 | | -Quality: 91/100 | Created: 2025-12-23 |
| 3 | +Library: bokeh 3.8.2 | Python 3.14.3 |
| 4 | +Quality: 92/100 | Updated: 2026-02-21 |
5 | 5 | """ |
6 | 6 |
|
7 | 7 | import numpy as np |
8 | 8 | from bokeh.io import export_png, output_file, save |
| 9 | +from bokeh.models import ColumnDataSource, HoverTool, NumeralTickFormatter |
9 | 10 | from bokeh.plotting import figure |
| 11 | +from scipy.stats import gaussian_kde |
10 | 12 |
|
11 | 13 |
|
12 | 14 | # Data - Salary distributions by department (realistic scenario) |
13 | 15 | np.random.seed(42) |
14 | 16 | categories = ["Engineering", "Marketing", "Sales", "Support"] |
15 | | -data = { |
16 | | - "Engineering": np.random.normal(85000, 15000, 150), |
17 | | - "Marketing": np.random.normal(65000, 12000, 150), |
18 | | - "Sales": np.random.normal(70000, 20000, 150), # Higher variance |
19 | | - "Support": np.random.normal(50000, 8000, 150), # Lower variance |
20 | | -} |
21 | 17 |
|
22 | | -# Colors - Python Blue and Yellow first, then accessible colors |
23 | | -colors = ["#306998", "#FFD43B", "#4B8BBE", "#FFE873"] |
| 18 | +# Engineering: normal, high mean — represents typical salaried professionals |
| 19 | +eng = np.random.normal(85000, 15000, 150) |
24 | 20 |
|
25 | | -# Create figure with categorical x-axis |
| 21 | +# Marketing: normal, mid-range |
| 22 | +mkt = np.random.normal(65000, 12000, 150) |
| 23 | + |
| 24 | +# Sales: right-skewed — most earn base salary, some earn high commissions |
| 25 | +sales_base = np.random.exponential(15000, 150) + 45000 |
| 26 | +sales = np.clip(sales_base, 30000, 150000) |
| 27 | + |
| 28 | +# Support: bimodal — junior vs senior tiers with distinct pay bands |
| 29 | +support_junior = np.random.normal(42000, 5000, 90) |
| 30 | +support_senior = np.random.normal(62000, 6000, 60) |
| 31 | +support = np.concatenate([support_junior, support_senior]) |
| 32 | + |
| 33 | +data = {"Engineering": eng, "Marketing": mkt, "Sales": sales, "Support": support} |
| 34 | + |
| 35 | +# Colors - four distinct colorblind-safe hues |
| 36 | +colors = ["#306998", "#E8943A", "#2A9D8F", "#E76F6F"] |
| 37 | + |
| 38 | +# Visual hierarchy: emphasize non-normal distributions to guide the viewer |
| 39 | +alphas = [0.55, 0.55, 0.85, 0.85] |
| 40 | + |
| 41 | +# Distribution type labels for data storytelling |
| 42 | +dist_labels = ["normal", "normal", "right-skewed", "bimodal"] |
| 43 | + |
| 44 | +# Create figure with subtle warm background tint |
26 | 45 | p = figure( |
27 | 46 | width=4800, |
28 | 47 | height=2700, |
|
31 | 50 | y_axis_label="Annual Salary (USD)", |
32 | 51 | x_range=categories, |
33 | 52 | toolbar_location=None, |
| 53 | + background_fill_color="#FAFAF8", |
34 | 54 | ) |
35 | 55 |
|
36 | | -# Styling for 4800x2700 px |
| 56 | +# Title styling — bold, dark charcoal text so the title carries the most visual weight |
37 | 57 | p.title.text_font_size = "36pt" |
| 58 | +p.title.text_color = "#2D3436" |
| 59 | +p.title.text_font_style = "bold" |
| 60 | + |
| 61 | +# Text sizing for 4800x2700 px |
38 | 62 | p.xaxis.axis_label_text_font_size = "28pt" |
39 | 63 | p.yaxis.axis_label_text_font_size = "28pt" |
40 | 64 | p.xaxis.major_label_text_font_size = "22pt" |
41 | 65 | p.yaxis.major_label_text_font_size = "22pt" |
| 66 | +p.xaxis.axis_label_text_color = "#555555" |
| 67 | +p.yaxis.axis_label_text_color = "#555555" |
| 68 | + |
| 69 | +# Format y-axis as readable currency |
| 70 | +p.yaxis.formatter = NumeralTickFormatter(format="$0,0") |
42 | 71 |
|
43 | | -# Grid styling |
| 72 | +# Visual refinement - clean, polished design |
44 | 73 | p.xgrid.grid_line_color = None |
45 | | -p.ygrid.grid_line_alpha = 0.3 |
| 74 | +p.ygrid.grid_line_alpha = 0.15 |
46 | 75 | p.ygrid.grid_line_dash = "dashed" |
47 | | - |
48 | | -# Violin width scaling (0.4 = 40% of category spacing) |
| 76 | +p.ygrid.grid_line_color = "#B0B0B0" |
| 77 | +p.outline_line_color = None |
| 78 | +p.axis.minor_tick_line_color = None |
| 79 | +p.axis.major_tick_line_color = None |
| 80 | +p.axis.axis_line_color = "#D5D5D5" |
| 81 | +p.border_fill_color = "#FAFAF8" |
| 82 | + |
| 83 | +# Tighten y-axis to data range with room for annotations |
| 84 | +all_values = np.concatenate(list(data.values())) |
| 85 | +y_pad = (all_values.max() - all_values.min()) * 0.12 |
| 86 | +p.y_range.start = all_values.min() - y_pad |
| 87 | +p.y_range.end = all_values.max() + y_pad |
| 88 | + |
| 89 | +# Violin half-width: each KDE is normalized so its peak sits at a ±0.4 categorical offset |
49 | 90 | violin_width = 0.4 |
50 | 91 |
|
51 | 92 | # Draw violins for each category |
52 | 93 | for i, cat in enumerate(categories): |
53 | 94 | values = data[cat] |
54 | | - n = len(values) |
55 | | - |
56 | | - # Compute KDE using Gaussian kernel (Silverman's rule for bandwidth) |
57 | | - std = np.std(values) |
58 | | - iqr = np.percentile(values, 75) - np.percentile(values, 25) |
59 | | - bandwidth = 0.9 * min(std, iqr / 1.34) * n ** (-0.2) |
60 | | - bandwidth = max(bandwidth, 0.1) |
61 | 95 |
|
62 | | - y_grid = np.linspace(values.min() - std, values.max() + std, 100) |
63 | | - density = np.zeros_like(y_grid, dtype=float) |
64 | | - for xi in values: |
65 | | - density += np.exp(-0.5 * ((y_grid - xi) / bandwidth) ** 2) |
66 | | - density /= n * bandwidth * np.sqrt(2 * np.pi) |
| 96 | + # Compute KDE with scipy's gaussian_kde (default bandwidth, i.e. Scott's rule) |
| 97 | + kde = gaussian_kde(values) |
| 98 | + y_grid = np.linspace(values.min() - np.std(values) * 0.5, values.max() + np.std(values) * 0.5, 100) |
| 99 | + density = kde(y_grid) |
67 | 100 |
|
68 | 101 | # Scale density to violin width |
69 | 102 | density_scaled = density / density.max() * violin_width |
70 | 103 |
|
71 | | - # Create violin shape (mirrored on both sides) |
72 | | - x_left = -density_scaled |
73 | | - x_right = density_scaled |
74 | | - |
75 | | - # Convert to categorical offset format for bokeh |
76 | | - xs_left = [(cat, float(xl)) for xl in x_left] |
77 | | - xs_right = [(cat, float(xr)) for xr in x_right[::-1]] |
| 104 | + # Create mirrored violin shape using categorical offset tuples |
| 105 | + xs_left = [(cat, float(-d)) for d in density_scaled] |
| 106 | + xs_right = [(cat, float(d)) for d in density_scaled[::-1]] |
78 | 107 |
|
79 | | - # Draw violin patch |
| 108 | + # Draw violin patch via ColumnDataSource with varying alpha for hierarchy |
| 109 | + violin_source = ColumnDataSource(data={"x": xs_left + xs_right, "y": list(y_grid) + list(y_grid[::-1])}) |
80 | 110 | p.patch( |
81 | | - xs_left + xs_right, |
82 | | - list(y_grid) + list(y_grid[::-1]), |
| 111 | + x="x", |
| 112 | + y="y", |
| 113 | + source=violin_source, |
83 | 114 | fill_color=colors[i], |
84 | | - fill_alpha=0.7, |
| 115 | + fill_alpha=alphas[i], |
85 | 116 | line_color=colors[i], |
| 117 | + line_alpha=min(alphas[i] + 0.15, 1.0), |
86 | 118 | line_width=3, |
87 | 119 | ) |
88 | 120 |
|
89 | | - # Compute quartiles |
| 121 | + # Quartiles and median |
90 | 122 | q1, median, q3 = np.percentile(values, [25, 50, 75]) |
91 | 123 |
|
92 | | - # Draw thin box inside violin (quartile markers) |
| 124 | + # Inner box (Q1-Q3) with ColumnDataSource for HoverTool |
93 | 125 | box_width = 0.06 |
94 | | - p.quad( |
95 | | - left=[(cat, -box_width)], |
96 | | - right=[(cat, box_width)], |
97 | | - top=[q3], |
98 | | - bottom=[q1], |
| 126 | + box_source = ColumnDataSource( |
| 127 | + data={ |
| 128 | + "left": [(cat, -box_width)], |
| 129 | + "right": [(cat, box_width)], |
| 130 | + "top": [q3], |
| 131 | + "bottom": [q1], |
| 132 | + "dept": [cat], |
| 133 | + "median_val": [f"${median:,.0f}"], |
| 134 | + "q1_val": [f"${q1:,.0f}"], |
| 135 | + "q3_val": [f"${q3:,.0f}"], |
| 136 | + "n": [str(len(values))], |
| 137 | + } |
| 138 | + ) |
| 139 | + box_renderer = p.quad( |
| 140 | + left="left", |
| 141 | + right="right", |
| 142 | + top="top", |
| 143 | + bottom="bottom", |
| 144 | + source=box_source, |
99 | 145 | fill_color="white", |
100 | 146 | fill_alpha=0.9, |
101 | 147 | line_color="black", |
102 | 148 | line_width=3, |
103 | 149 | ) |
104 | 150 |
|
105 | | - # Draw median line |
106 | | - p.segment( |
107 | | - x0=[(cat, -box_width * 1.5)], |
108 | | - y0=[median], |
109 | | - x1=[(cat, box_width * 1.5)], |
110 | | - y1=[median], |
111 | | - line_color="black", |
112 | | - line_width=5, |
| 151 | + # Add HoverTool for interactive HTML output |
| 152 | + hover = HoverTool( |
| 153 | + renderers=[box_renderer], |
| 154 | + tooltips=[ |
| 155 | + ("Department", "@dept"), |
| 156 | + ("Median", "@median_val"), |
| 157 | + ("Q1", "@q1_val"), |
| 158 | + ("Q3", "@q3_val"), |
| 159 | + ("N", "@n"), |
| 160 | + ], |
| 161 | + ) |
| 162 | + p.add_tools(hover) |
| 163 | + |
| 164 | + # Median line |
| 165 | + med_source = ColumnDataSource( |
| 166 | + data={"x0": [(cat, -box_width * 1.5)], "y0": [median], "x1": [(cat, box_width * 1.5)], "y1": [median]} |
113 | 167 | ) |
| 168 | + p.segment(x0="x0", y0="y0", x1="x1", y1="y1", source=med_source, line_color="black", line_width=5) |
114 | 169 |
|
115 | | - # Whiskers (to 1.5*IQR or data extent) |
| 170 | + # Whiskers: extend 1.5*IQR beyond the box, clamped to the data min/max |
116 | 171 | iqr_val = q3 - q1 |
117 | 172 | whisker_low = max(values.min(), q1 - 1.5 * iqr_val) |
118 | 173 | whisker_high = min(values.max(), q3 + 1.5 * iqr_val) |
119 | 174 |
|
120 | | - # Vertical whisker lines |
121 | | - p.segment(x0=[cat], y0=[q1], x1=[cat], y1=[whisker_low], line_color="black", line_width=3) |
122 | | - p.segment(x0=[cat], y0=[q3], x1=[cat], y1=[whisker_high], line_color="black", line_width=3) |
| 175 | + whisker_source = ColumnDataSource( |
| 176 | + data={"x0": [cat, cat], "y0": [q1, q3], "x1": [cat, cat], "y1": [whisker_low, whisker_high]} |
| 177 | + ) |
| 178 | + p.segment(x0="x0", y0="y0", x1="x1", y1="y1", source=whisker_source, line_color="black", line_width=3) |
123 | 179 |
|
124 | 180 | # Whisker caps |
125 | 181 | cap_width = 0.04 |
126 | | - p.segment( |
127 | | - x0=[(cat, -cap_width)], |
128 | | - y0=[whisker_low], |
129 | | - x1=[(cat, cap_width)], |
130 | | - y1=[whisker_low], |
131 | | - line_color="black", |
132 | | - line_width=3, |
133 | | - ) |
134 | | - p.segment( |
135 | | - x0=[(cat, -cap_width)], |
136 | | - y0=[whisker_high], |
137 | | - x1=[(cat, cap_width)], |
138 | | - y1=[whisker_high], |
139 | | - line_color="black", |
140 | | - line_width=3, |
| 182 | + cap_source = ColumnDataSource( |
| 183 | + data={ |
| 184 | + "x0": [(cat, -cap_width), (cat, -cap_width)], |
| 185 | + "y0": [whisker_low, whisker_high], |
| 186 | + "x1": [(cat, cap_width), (cat, cap_width)], |
| 187 | + "y1": [whisker_low, whisker_high], |
| 188 | + } |
141 | 189 | ) |
| 190 | + p.segment(x0="x0", y0="y0", x1="x1", y1="y1", source=cap_source, line_color="black", line_width=3) |
| 191 | + |
| 192 | +# Distribution type annotations — guide the viewer to the data story |
| 193 | +annotation_y = all_values.min() - y_pad * 0.65 |
| 194 | +ann_source = ColumnDataSource(data={"x": categories, "y": [annotation_y] * len(categories), "text": dist_labels}) |
| 195 | +p.text( |
| 196 | + x="x", |
| 197 | + y="y", |
| 198 | + text="text", |
| 199 | + source=ann_source, |
| 200 | + text_font_size="18pt", |
| 201 | + text_font_style="italic", |
| 202 | + text_color="#999999", |
| 203 | + text_align="center", |
| 204 | + text_baseline="top", |
| 205 | +) |
142 | 206 |
|
143 | 207 | # Save outputs |
144 | 208 | export_png(p, filename="plot.png") |
|
0 commit comments