Skip to content
210 changes: 137 additions & 73 deletions plots/violin-basic/implementations/bokeh.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,47 @@
""" pyplots.ai
violin-basic: Basic Violin Plot
Library: bokeh 3.8.1 | Python 3.13.11
Quality: 91/100 | Created: 2025-12-23
Library: bokeh 3.8.2 | Python 3.14.3
Quality: 92/100 | Updated: 2026-02-21
"""

import numpy as np
from bokeh.io import export_png, output_file, save
from bokeh.models import ColumnDataSource, HoverTool, NumeralTickFormatter
from bokeh.plotting import figure
from scipy.stats import gaussian_kde


# Data - Salary distributions by department (realistic scenario)
np.random.seed(42)
categories = ["Engineering", "Marketing", "Sales", "Support"]
data = {
"Engineering": np.random.normal(85000, 15000, 150),
"Marketing": np.random.normal(65000, 12000, 150),
"Sales": np.random.normal(70000, 20000, 150), # Higher variance
"Support": np.random.normal(50000, 8000, 150), # Lower variance
}

# Colors - Python Blue and Yellow first, then accessible colors
colors = ["#306998", "#FFD43B", "#4B8BBE", "#FFE873"]
# Engineering: normal, high mean — represents typical salaried professionals
eng = np.random.normal(85000, 15000, 150)

# Create figure with categorical x-axis
# Marketing: normal, mid-range
mkt = np.random.normal(65000, 12000, 150)

# Sales: right-skewed — most earn base salary, some earn high commissions
sales_base = np.random.exponential(15000, 150) + 45000
sales = np.clip(sales_base, 30000, 150000)

# Support: bimodal — junior vs senior tiers with distinct pay bands
support_junior = np.random.normal(42000, 5000, 90)
support_senior = np.random.normal(62000, 6000, 60)
support = np.concatenate([support_junior, support_senior])

data = {"Engineering": eng, "Marketing": mkt, "Sales": sales, "Support": support}

# Colors - one hue per department, in the same order as `categories`
# (blue, orange, teal, coral); chosen with colorblind-safety in mind
colors = ["#306998", "#E8943A", "#2A9D8F", "#E76F6F"]

# Visual hierarchy: per-department fill opacity, in the same order as `categories`;
# the non-normal distributions (Sales, Support) are more opaque to draw the viewer's eye
alphas = [0.55, 0.55, 0.85, 0.85]

# Distribution type labels for data storytelling
dist_labels = ["normal", "normal", "right-skewed", "bimodal"]

# Create figure with subtle warm background tint
p = figure(
width=4800,
height=2700,
Expand All @@ -31,114 +50,159 @@
y_axis_label="Annual Salary (USD)",
x_range=categories,
toolbar_location=None,
background_fill_color="#FAFAF8",
)

# Styling for 4800x2700 px
# Title styling — bold, dark charcoal text so the title carries the most visual weight
p.title.text_font_size = "36pt"
p.title.text_color = "#2D3436"
p.title.text_font_style = "bold"

# Text sizing for 4800x2700 px
p.xaxis.axis_label_text_font_size = "28pt"
p.yaxis.axis_label_text_font_size = "28pt"
p.xaxis.major_label_text_font_size = "22pt"
p.yaxis.major_label_text_font_size = "22pt"
p.xaxis.axis_label_text_color = "#555555"
p.yaxis.axis_label_text_color = "#555555"

# Format y-axis as readable currency
p.yaxis.formatter = NumeralTickFormatter(format="$0,0")

# Grid styling
# Visual refinement - clean, polished design
p.xgrid.grid_line_color = None
p.ygrid.grid_line_alpha = 0.3
p.ygrid.grid_line_alpha = 0.15
p.ygrid.grid_line_dash = "dashed"

# Violin width scaling (0.4 = 40% of category spacing)
p.ygrid.grid_line_color = "#B0B0B0"
p.outline_line_color = None
p.axis.minor_tick_line_color = None
p.axis.major_tick_line_color = None
p.axis.axis_line_color = "#D5D5D5"
p.border_fill_color = "#FAFAF8"

# Tighten y-axis to data range with room for annotations
all_values = np.concatenate(list(data.values()))
y_pad = (all_values.max() - all_values.min()) * 0.12
p.y_range.start = all_values.min() - y_pad
p.y_range.end = all_values.max() + y_pad

# Violin half-width as a categorical offset: the peak density reaches ±0.4 around each category
violin_width = 0.4

# Draw violins for each category
for i, cat in enumerate(categories):
values = data[cat]
n = len(values)

# Compute KDE using Gaussian kernel (Silverman's rule for bandwidth)
std = np.std(values)
iqr = np.percentile(values, 75) - np.percentile(values, 25)
bandwidth = 0.9 * min(std, iqr / 1.34) * n ** (-0.2)
bandwidth = max(bandwidth, 0.1)

y_grid = np.linspace(values.min() - std, values.max() + std, 100)
density = np.zeros_like(y_grid, dtype=float)
for xi in values:
density += np.exp(-0.5 * ((y_grid - xi) / bandwidth) ** 2)
density /= n * bandwidth * np.sqrt(2 * np.pi)
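# Compute KDE with scipy's gaussian_kde (default bandwidth: Scott's rule), evaluated on a
# 100-point grid padded by half a standard deviation beyond the data min and max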
kde = gaussian_kde(values)
y_grid = np.linspace(values.min() - np.std(values) * 0.5, values.max() + np.std(values) * 0.5, 100)
density = kde(y_grid)

# Scale density to violin width
density_scaled = density / density.max() * violin_width

# Create violin shape (mirrored on both sides)
x_left = -density_scaled
x_right = density_scaled

# Convert to categorical offset format for bokeh
xs_left = [(cat, float(xl)) for xl in x_left]
xs_right = [(cat, float(xr)) for xr in x_right[::-1]]
# Create mirrored violin shape using categorical offset tuples
xs_left = [(cat, float(-d)) for d in density_scaled]
xs_right = [(cat, float(d)) for d in density_scaled[::-1]]

# Draw violin patch
# Draw violin patch via ColumnDataSource with varying alpha for hierarchy
violin_source = ColumnDataSource(data={"x": xs_left + xs_right, "y": list(y_grid) + list(y_grid[::-1])})
p.patch(
xs_left + xs_right,
list(y_grid) + list(y_grid[::-1]),
x="x",
y="y",
source=violin_source,
fill_color=colors[i],
fill_alpha=0.7,
fill_alpha=alphas[i],
line_color=colors[i],
line_alpha=min(alphas[i] + 0.15, 1.0),
line_width=3,
)

# Compute quartiles
# Quartiles and median
q1, median, q3 = np.percentile(values, [25, 50, 75])

# Draw thin box inside violin (quartile markers)
# Inner box (Q1-Q3) with ColumnDataSource for HoverTool
box_width = 0.06
p.quad(
left=[(cat, -box_width)],
right=[(cat, box_width)],
top=[q3],
bottom=[q1],
box_source = ColumnDataSource(
data={
"left": [(cat, -box_width)],
"right": [(cat, box_width)],
"top": [q3],
"bottom": [q1],
"dept": [cat],
"median_val": [f"${median:,.0f}"],
"q1_val": [f"${q1:,.0f}"],
"q3_val": [f"${q3:,.0f}"],
"n": [str(len(values))],
}
)
box_renderer = p.quad(
left="left",
right="right",
top="top",
bottom="bottom",
source=box_source,
fill_color="white",
fill_alpha=0.9,
line_color="black",
line_width=3,
)

# Draw median line
p.segment(
x0=[(cat, -box_width * 1.5)],
y0=[median],
x1=[(cat, box_width * 1.5)],
y1=[median],
line_color="black",
line_width=5,
# Add HoverTool for interactive HTML output
hover = HoverTool(
renderers=[box_renderer],
tooltips=[
("Department", "@dept"),
("Median", "@median_val"),
("Q1", "@q1_val"),
("Q3", "@q3_val"),
("N", "@n"),
],
)
p.add_tools(hover)

# Median line
med_source = ColumnDataSource(
data={"x0": [(cat, -box_width * 1.5)], "y0": [median], "x1": [(cat, box_width * 1.5)], "y1": [median]}
)
p.segment(x0="x0", y0="y0", x1="x1", y1="y1", source=med_source, line_color="black", line_width=5)

# Whiskers (to 1.5*IQR or data extent)
# Whiskers extend to the 1.5*IQR fences, clamped to the data min/max
iqr_val = q3 - q1
whisker_low = max(values.min(), q1 - 1.5 * iqr_val)
whisker_high = min(values.max(), q3 + 1.5 * iqr_val)

# Vertical whisker lines
p.segment(x0=[cat], y0=[q1], x1=[cat], y1=[whisker_low], line_color="black", line_width=3)
p.segment(x0=[cat], y0=[q3], x1=[cat], y1=[whisker_high], line_color="black", line_width=3)
whisker_source = ColumnDataSource(
data={"x0": [cat, cat], "y0": [q1, q3], "x1": [cat, cat], "y1": [whisker_low, whisker_high]}
)
p.segment(x0="x0", y0="y0", x1="x1", y1="y1", source=whisker_source, line_color="black", line_width=3)

# Whisker caps
cap_width = 0.04
p.segment(
x0=[(cat, -cap_width)],
y0=[whisker_low],
x1=[(cat, cap_width)],
y1=[whisker_low],
line_color="black",
line_width=3,
)
p.segment(
x0=[(cat, -cap_width)],
y0=[whisker_high],
x1=[(cat, cap_width)],
y1=[whisker_high],
line_color="black",
line_width=3,
cap_source = ColumnDataSource(
data={
"x0": [(cat, -cap_width), (cat, -cap_width)],
"y0": [whisker_low, whisker_high],
"x1": [(cat, cap_width), (cat, cap_width)],
"y1": [whisker_low, whisker_high],
}
)
p.segment(x0="x0", y0="y0", x1="x1", y1="y1", source=cap_source, line_color="black", line_width=3)

# Distribution type annotations — guide the viewer to the data story
annotation_y = all_values.min() - y_pad * 0.65
ann_source = ColumnDataSource(data={"x": categories, "y": [annotation_y] * len(categories), "text": dist_labels})
p.text(
x="x",
y="y",
text="text",
source=ann_source,
text_font_size="18pt",
text_font_style="italic",
text_color="#999999",
text_align="center",
text_baseline="top",
)

# Save outputs
export_png(p, filename="plot.png")
Expand Down
Loading