|
1 | 1 | """ pyplots.ai |
2 | 2 | dendrogram-basic: Basic Dendrogram |
3 | | -Library: pygal 3.1.0 | Python 3.13.11 |
4 | | -Quality: 90/100 | Created: 2025-12-23 |
| 3 | +Library: pygal 3.1.0 | Python 3.14.3 |
| 4 | +Quality: 82/100 | Updated: 2026-04-05 |
5 | 5 | """ |
6 | 6 |
|
7 | 7 | import numpy as np |
8 | 8 | import pygal |
9 | 9 | from pygal.style import Style |
10 | | -from scipy.cluster.hierarchy import linkage |
| 10 | +from scipy.cluster.hierarchy import fcluster, linkage |
11 | 11 |
|
12 | 12 |
|
# Data - simulated iris-like measurements (sepal length, sepal width,
# petal length, petal width): 4 features for 15 samples, 5 per species.
np.random.seed(42)
samples_per_species = 5

# Per-species (mean, standard deviation) of each of the four features.
species_profiles = {
    # Setosa: shorter petals, wider sepals
    "Setosa": [(5.0, 0.35), (3.4, 0.35), (1.5, 0.25), (0.3, 0.12)],
    # Versicolor: medium measurements
    "Versicolor": [(5.9, 0.5), (2.8, 0.35), (4.3, 0.5), (1.3, 0.25)],
    # Virginica: longer petals and sepals
    "Virginica": [(6.6, 0.55), (3.0, 0.35), (5.5, 0.55), (2.0, 0.3)],
}

labels = []
measurements = []

# One normal deviate is drawn per feature, sample by sample and species by
# species, so the sequence of random draws (and hence the seeded data) is the
# same as with three hand-unrolled per-species loops.
for species, profile in species_profiles.items():
    for i in range(samples_per_species):
        labels.append(f"{species}-{i + 1}")
        measurements.append([mean + np.random.randn() * spread for mean, spread in profile])

# Rows are samples, columns are the four features.
measurements = np.array(measurements)
63 | 57 |
|
# Number of leaves (one per labelled sample)
n = len(labels)

# Agglomerative clustering of the samples with Ward linkage
linkage_matrix = linkage(measurements, method="ward")

# Flat cluster id per sample, obtained by cutting the tree into at most
# 3 clusters (one per simulated species)
cluster_ids = fcluster(linkage_matrix, criterion="maxclust", t=3)
70 | 64 |
|
71 | | -# Compute leaf order from linkage for proper x-axis positioning (iterative approach) |
72 | | -root_node = 2 * n - 2 |
73 | | -stack = [root_node] |
| 65 | +# Build leaf ordering from linkage (iterative traversal) |
74 | 66 | leaf_order = [] |
| 67 | +stack = [2 * n - 2] |
75 | 68 | while stack: |
76 | 69 | node_id = stack.pop() |
77 | 70 | if node_id < n: |
|
80 | 73 | idx = node_id - n |
81 | 74 | left = int(linkage_matrix[idx, 0]) |
82 | 75 | right = int(linkage_matrix[idx, 1]) |
83 | | - # Push right first so left is processed first (maintains order) |
84 | 76 | stack.append(right) |
85 | 77 | stack.append(left) |
86 | 78 |
|
# Per-node bookkeeping, seeded with the leaves:
#   node_x       - horizontal slot of the node (leaves take their rank in leaf_order)
#   node_height  - merge distance at which the node exists (leaves sit at 0)
#   node_cluster - flat cluster id of the node (leaves take their fcluster id)
node_x = {leaf: slot for slot, leaf in enumerate(leaf_order)}
node_height = dict.fromkeys(leaf_order, 0)
node_cluster = {leaf: cluster_ids[leaf] for leaf in leaf_order}
| 88 | + |
# Map each flat cluster id to the species that dominates it.
# A plain "last leaf wins" assignment would name a mixed cluster after whichever
# member happens to have the highest sample index, so count the members per
# species and keep the most frequent one (ties go to the species seen first).
species_votes = {}
for cid, label in zip(cluster_ids, labels):
    species = label.rsplit("-", 1)[0]  # "Setosa-3" -> "Setosa"
    tally = species_votes.setdefault(cid, {})
    tally[species] = tally.get(species, 0) + 1

cluster_species = {cid: max(tally, key=tally.get) for cid, tally in species_votes.items()}

# Species palette: blue, amber, purple (avoids a red-green pairing)
species_colors = {"Setosa": "#306998", "Versicolor": "#D4872C", "Virginica": "#7B4EA3"}
# Neutral grey for merges that join different clusters
mixed_color = "#5C6370"
| 99 | + |
# Tallest merge in the tree; used to normalise stroke widths (and later the y-range)
max_dist = linkage_matrix[:, 2].max()

# One entry per merge: (color, stroke width, merge distance, 4-point U outline)
u_shapes = []

for step_index, (child_a, child_b, height, _) in enumerate(linkage_matrix):
    child_a, child_b = int(child_a), int(child_b)
    parent = n + step_index  # merged nodes are numbered after the n leaves

    xa, xb = node_x[child_a], node_x[child_b]
    ya, yb = node_height[child_a], node_height[child_b]

    # The parent sits midway between its children, at the merge distance
    node_x[parent] = (xa + xb) / 2
    node_height[parent] = height

    # A merge keeps its children's cluster (and species color) only when both
    # children belong to the same cluster; otherwise it is marked as mixed (-1)
    if node_cluster[child_a] == node_cluster[child_b]:
        shared_cluster = node_cluster[child_a]
        color = species_colors.get(cluster_species.get(shared_cluster, ""), mixed_color)
    else:
        shared_cluster = -1
        color = mixed_color
    node_cluster[parent] = shared_cluster

    # Stroke width scales with merge distance for visual hierarchy
    stroke_w = 3.5 + 6 * (height / max_dist)

    # Outline: up from the left child, across at the merge height, down to the right child
    outline = [(xa, ya), (xa, height), (xb, height), (xb, yb)]
    u_shapes.append((color, stroke_w, height, outline))
114 | 131 |
|
# Ordered labels for x-axis (leaf names in left-to-right dendrogram order)
ordered_labels = [labels[i] for i in leaf_order]

# pygal hands out style colors to series by position. The palette below has one
# entry per U-shape, so any series added after them (the two dashed reference
# lines) would run off the end of the palette and reuse shades of the first
# U-shape colors, i.e. species colors. Reserve explicit neutral colors for them.
# NOTE(review): palette-cycling behaviour taken from pygal's Style; confirm
# against the installed pygal version.
reference_line_colors = ("#8A8F98", "#3B3F46")

# Shared font stack for every text element
font_stack = "Helvetica, Arial, sans-serif"

# Style - refined for publication quality at 4800x2700
custom_style = Style(
    background="#FFFFFF",
    plot_background="#FAFAFA",
    foreground="#2d2d2d",
    foreground_strong="#1a1a1a",
    foreground_subtle="#e0e0e0",
    # One color per U-shape series (in insertion order), then the reference lines
    colors=tuple(color for color, _, _, _ in u_shapes) + reference_line_colors,
    title_font_size=56,
    label_font_size=38,
    major_label_font_size=36,
    legend_font_size=34,
    value_font_size=28,
    stroke_width=4,
    opacity=1.0,
    guide_stroke_color="#e8e8e8",
    major_guide_stroke_color="#d8d8d8",
    title_font_family=font_stack,
    label_font_family=font_stack,
    major_label_font_family=font_stack,
    legend_font_family=font_stack,
    value_font_family=font_stack,
)
135 | 158 |
|
# Chart - pygal XY configured group by group, then built in one call

# Canvas size, styling and outer spacing
canvas_options = {
    "width": 4800,
    "height": 2700,
    "style": custom_style,
    "spacing": 35,
    "margin_top": 50,
    "margin_bottom": 140,
    "margin_left": 100,
    "margin_right": 80,
}

# Title and axis captions
text_options = {
    "title": "Iris Species Clustering · dendrogram-basic · pygal · pyplots.ai",
    "x_title": "Sample",
    "y_title": "Distance (Ward's Method)",
}

# Axis extents, guides and tick-label layout
axis_options = {
    "xrange": (-1.0, n + 0.2),
    "range": (0, max_dist * 1.05),
    "show_x_guides": False,
    "show_y_guides": True,
    "show_minor_x_labels": False,
    "x_label_rotation": 35,
    "truncate_label": 30,
}

# Legend placement
legend_options = {
    "show_legend": True,
    "legend_at_bottom": True,
    "legend_box_size": 30,
}

# Series rendering: plain unfilled lines without dots or printed values
series_options = {
    "show_dots": False,
    "fill": False,
    "print_values": False,
    "tooltip_border_radius": 10,
    "js": [],
}

chart = pygal.XY(
    **canvas_options,
    **text_options,
    **axis_options,
    **legend_options,
    **series_options,
)
154 | 188 |
|
# X-axis: one major tick per leaf slot, rendered as the leaf's name
leaf_slots = list(range(n))
chart.x_labels = leaf_slots
chart.x_labels_major = list(leaf_slots)


def _leaf_label(x):
    """Return the leaf name for the slot nearest to x, or "" outside the leaf range."""
    slot = int(round(x))
    if 0 <= slot < n:
        return ordered_labels[slot]
    return ""


chart.x_value_formatter = _leaf_label

# Y-axis: integer distance ticks from 0 up to the rounded-up tallest merge,
# every unit for short axes and every second unit otherwise
y_max_nice = int(np.ceil(max_dist))
step = 2 if y_max_nice > 6 else 1
chart.y_labels = [{"value": tick, "label": f"{tick:.0f}"} for tick in range(0, y_max_nice + 1, step)]
| 198 | + |
# Draw dendrogram - every U-shape is its own series so it can carry its own stroke width

# Legend text per color: the species names plus a label for mixed merges
legend_names = {hex_code: species_name for species_name, hex_code in species_colors.items()}
legend_names[mixed_color] = "Inter-cluster"

# Only the first series of each color is given a title; the rest are added untitled
titled_colors = set()
for color, stroke_w, merge_dist, outline in u_shapes:
    title = None if color in titled_colors else legend_names.get(color, "Other")
    titled_colors.add(color)

    # Each point carries the merge distance as its value label
    tooltip = f"d={merge_dist:.2f}"
    chart.add(
        title,
        [{"value": point, "label": tooltip} for point in outline],
        show_dots=False,
        stroke_style={"width": stroke_w, "linecap": "round", "linejoin": "round"},
        allow_interruptions=False,
    )
# Dashed horizontal reference lines at two key merge distances, added as
# ordinary titled series so they also show up in the legend.
# With n leaves there are n - 1 merges. Assuming the tree is cut into 3 clusters,
# the two tallest merges join different clusters, so in ascending order index
# n - 4 is the tallest merge inside a cluster and index -2 is the lower of the
# two between-cluster merges.
merge_heights = sorted(linkage_matrix[:, 2])
within_cluster_max = merge_heights[n - 4]
between_cluster = merge_heights[-2]

dash_style = {"width": 2, "dasharray": "12, 8", "linecap": "butt"}
line_start, line_end = -0.8, n - 0.2

chart.add(
    f"Within-species max (d={within_cluster_max:.1f})",
    [(line_start, within_cluster_max), (line_end, within_cluster_max)],
    show_dots=False,
    stroke_style=dict(dash_style),
)
chart.add(
    f"Between-group merge (d={between_cluster:.1f})",
    [(line_start, between_cluster), (line_end, between_cluster)],
    show_dots=False,
    stroke_style=dict(dash_style),
)
163 | 236 |
|
# Save - the HTML rendering first, then the PNG, both relative to the working directory
for render, filename in (
    (chart.render_to_file, "plot.html"),
    (chart.render_to_png, "plot.png"),
):
    render(filename)
0 commit comments