|
1 | 1 | """ pyplots.ai |
2 | 2 | dendrogram-basic: Basic Dendrogram |
3 | | -Library: letsplot 4.8.2 | Python 3.13.11 |
4 | | -Quality: 91/100 | Created: 2025-12-23 |
| 3 | +Library: letsplot 4.8.2 | Python 3.14.3 |
| 4 | +Quality: 87/100 | Updated: 2026-04-05 |
5 | 5 | """ |
6 | 6 |
|
7 | 7 | import numpy as np |
|
10 | 10 | LetsPlot, |
11 | 11 | aes, |
12 | 12 | element_blank, |
| 13 | + element_line, |
| 14 | + element_rect, |
13 | 15 | element_text, |
| 16 | + geom_hline, |
| 17 | + geom_point, |
14 | 18 | geom_segment, |
15 | 19 | geom_text, |
16 | 20 | ggplot, |
17 | 21 | ggsize, |
18 | 22 | labs, |
19 | | - scale_color_manual, |
| 23 | + layer_tooltips, |
| 24 | + scale_color_identity, |
20 | 25 | scale_x_continuous, |
21 | 26 | scale_y_continuous, |
22 | 27 | theme, |
23 | | - theme_minimal, |
| 28 | + theme_void, |
24 | 29 | ) |
25 | 30 | from lets_plot.export import ggsave |
26 | 31 | from scipy.cluster.hierarchy import linkage |
| 32 | +from sklearn.datasets import load_iris |
27 | 33 |
|
28 | 34 |
|
29 | 35 | LetsPlot.setup_html() |
30 | 36 |
|
31 | | -# Data - Iris flower measurements (4 features for 15 samples) |
| 37 | +# Data - Iris flower measurements (5 randomly chosen samples from each of the 3 species, 15 total) |
| 38 | +iris = load_iris() |
32 | 39 | np.random.seed(42) |
| 40 | +indices = np.sort(np.concatenate([np.random.choice(np.where(iris.target == k)[0], 5, replace=False) for k in range(3)])) |
| 41 | +features = iris.data[indices] |
| 42 | +species_names = ["Setosa", "Versicolor", "Virginica"] |
| 43 | +labels = [f"{species_names[iris.target[i]][:3]}-{j + 1}" for j, i in enumerate(indices)] |
33 | 44 |
|
34 | | -# Simulate iris-like measurements: sepal length, sepal width, petal length, petal width |
35 | | -# Three species with distinct characteristics |
36 | | -samples_per_species = 5 |
37 | | - |
38 | | -labels = [] |
39 | | -data = [] |
40 | | - |
41 | | -# Setosa: shorter petals, wider sepals |
42 | | -for i in range(samples_per_species): |
43 | | - labels.append(f"Setosa-{i + 1}") |
44 | | - data.append( |
45 | | - [ |
46 | | - 5.0 + np.random.randn() * 0.3, # sepal length |
47 | | - 3.4 + np.random.randn() * 0.3, # sepal width |
48 | | - 1.5 + np.random.randn() * 0.2, # petal length |
49 | | - 0.3 + np.random.randn() * 0.1, # petal width |
50 | | - ] |
51 | | - ) |
52 | | - |
53 | | -# Versicolor: medium measurements |
54 | | -for i in range(samples_per_species): |
55 | | - labels.append(f"Versicolor-{i + 1}") |
56 | | - data.append( |
57 | | - [ |
58 | | - 5.9 + np.random.randn() * 0.4, # sepal length |
59 | | - 2.8 + np.random.randn() * 0.3, # sepal width |
60 | | - 4.3 + np.random.randn() * 0.4, # petal length |
61 | | - 1.3 + np.random.randn() * 0.2, # petal width |
62 | | - ] |
63 | | - ) |
64 | | - |
65 | | -# Virginica: longer petals and sepals |
66 | | -for i in range(samples_per_species): |
67 | | - labels.append(f"Virginica-{i + 1}") |
68 | | - data.append( |
69 | | - [ |
70 | | - 6.6 + np.random.randn() * 0.5, # sepal length |
71 | | - 3.0 + np.random.randn() * 0.3, # sepal width |
72 | | - 5.5 + np.random.randn() * 0.5, # petal length |
73 | | - 2.0 + np.random.randn() * 0.3, # petal width |
74 | | - ] |
75 | | - ) |
76 | | - |
77 | | -data = np.array(data) |
78 | | -n_samples = len(labels) |
79 | | - |
80 | | -# Compute hierarchical clustering using Ward's method |
81 | | -linkage_matrix = linkage(data, method="ward") |
| 45 | +# Hierarchical clustering (Ward's method) |
| 46 | +linkage_matrix = linkage(features, method="ward") |
82 | 47 |
|
83 | 48 | # Build dendrogram coordinates from linkage matrix |
84 | 49 | n = len(labels) |
85 | | -leaf_positions = {i: i for i in range(n)} |
86 | | -node_heights = dict.fromkeys(range(n), 0) |
| 50 | +leaf_positions = {i: float(i) for i in range(n)} |
| 51 | +node_heights = dict.fromkeys(range(n), 0.0) |
87 | 52 | segments = [] |
88 | 53 |
|
89 | | -# Color threshold for clustering (similar to matplotlib's default) |
| 54 | +# Color threshold: 70% of the largest merge distance; merges at or above it are colored as cross-cluster |
90 | 55 | max_dist = linkage_matrix[:, 2].max() |
91 | 56 | color_threshold = 0.7 * max_dist |
92 | 57 |
|
93 | | -# Process each merge in the linkage matrix |
| 58 | +# Curated palette: muted, publication-quality tones |
| 59 | +palette = {"above": "#5B7B9A", "Setosa": "#2D8E6F", "Versicolor": "#D4883B", "Virginica": "#8B6AAE"} |
| 60 | +cluster_display = {"above": "Cross-cluster", "Setosa": "Setosa", "Versicolor": "Versicolor", "Virginica": "Virginica"} |
| 61 | +node_cluster = {i: labels[i].split("-")[0] for i in range(n)} |
| 62 | +# Map short label prefixes to full species names (this replaces the prefix-keyed node_cluster assigned just above) |
| 63 | +prefix_to_species = {"Set": "Setosa", "Ver": "Versicolor", "Vir": "Virginica"} |
| 64 | +node_cluster = {i: prefix_to_species[labels[i].split("-")[0]] for i in range(n)} |
| 65 | + |
94 | 66 | for i, (left, right, dist, _) in enumerate(linkage_matrix): |
95 | 67 | left, right = int(left), int(right) |
96 | 68 | new_node = n + i |
97 | 69 |
|
98 | | - # Get positions of children |
99 | 70 | left_pos = leaf_positions[left] |
100 | 71 | right_pos = leaf_positions[right] |
101 | | - |
102 | | - # New node position is midpoint of children |
103 | | - new_pos = (left_pos + right_pos) / 2 |
104 | | - leaf_positions[new_node] = new_pos |
| 72 | + leaf_positions[new_node] = (left_pos + right_pos) / 2 |
105 | 73 | node_heights[new_node] = dist |
106 | 74 |
|
107 | | - # Determine color based on height threshold |
108 | | - color = "#306998" if dist >= color_threshold else "#FFD43B" |
| 75 | + left_cl, right_cl = node_cluster[left], node_cluster[right] |
| 76 | + node_cluster[new_node] = left_cl if left_cl == right_cl else "above" |
| 77 | + cluster_label = node_cluster[new_node] if dist < color_threshold else "above" |
| 78 | + color = palette[cluster_label] |
| 79 | + display = cluster_display[cluster_label] |
109 | 80 |
|
110 | 81 | left_height = node_heights[left] |
111 | 82 | right_height = node_heights[right] |
112 | 83 |
|
113 | | - # Vertical segment from left child to merge height |
114 | | - segments.append((left_pos, left_height, left_pos, dist, color)) |
115 | | - # Vertical segment from right child to merge height |
116 | | - segments.append((right_pos, right_height, right_pos, dist, color)) |
117 | | - # Horizontal segment connecting the two |
118 | | - segments.append((left_pos, dist, right_pos, dist, color)) |
119 | | - |
120 | | -# Create segment dataframe |
121 | | -segment_df = pd.DataFrame(segments, columns=["x", "y", "xend", "yend", "color"]) |
122 | | - |
123 | | -# Create label dataframe for x-axis labels |
124 | | -label_data = [] |
125 | | -for i, label in enumerate(labels): |
126 | | - label_data.append({"x": leaf_positions[i], "y": -0.8, "label": label}) |
127 | | -label_df = pd.DataFrame(label_data) |
| 84 | + for seg in [ |
| 85 | + (left_pos, left_height, left_pos, dist), |
| 86 | + (right_pos, right_height, right_pos, dist), |
| 87 | + (left_pos, dist, right_pos, dist), |
| 88 | + ]: |
| 89 | + segments.append( |
| 90 | + { |
| 91 | + "x": seg[0], |
| 92 | + "y": seg[1], |
| 93 | + "xend": seg[2], |
| 94 | + "yend": seg[3], |
| 95 | + "color": color, |
| 96 | + "merge_dist": round(dist, 2), |
| 97 | + "cluster": display, |
| 98 | + } |
| 99 | + ) |
| 100 | + |
| 101 | +segment_df = pd.DataFrame(segments) |
| 102 | + |
| 103 | +# Leaf labels and markers |
| 104 | +leaf_data = [] |
| 105 | +for i in range(n): |
| 106 | + species = prefix_to_species[labels[i].split("-")[0]] |
| 107 | + leaf_data.append( |
| 108 | + {"x": leaf_positions[i], "y": 0, "label": labels[i], "color": palette[species], "species": species} |
| 109 | + ) |
| 110 | +label_df = pd.DataFrame(leaf_data) |
| 111 | + |
| 112 | +# Legend placeholder rows (off-canvas coordinates, one per palette entry); NOTE: legend_items is not passed to any plot layer below |
| 113 | +legend_items = pd.DataFrame( |
| 114 | + [ |
| 115 | + {"x": -99, "y": -99, "xend": -98, "yend": -99, "color": palette[s], "cluster": s, "merge_dist": 0} |
| 116 | + for s in ["Setosa", "Versicolor", "Virginica", "above"] |
| 117 | + ] |
| 118 | +) |
128 | 119 |
|
129 | 120 | # Plot |
130 | 121 | plot = ( |
131 | 122 | ggplot() |
132 | | - + geom_segment(aes(x="x", y="y", xend="xend", yend="yend", color="color"), data=segment_df, size=1.5) |
133 | | - + geom_text(aes(x="x", y="y", label="label"), data=label_df, angle=35, hjust=1, vjust=1, size=10, color="#333333") |
134 | | - + scale_color_manual(values={"#306998": "#306998", "#FFD43B": "#FFD43B"}, guide="none") |
| 123 | + + geom_segment( |
| 124 | + aes(x="x", y="y", xend="xend", yend="yend", color="color"), |
| 125 | + data=segment_df, |
| 126 | + size=2.0, |
| 127 | + tooltips=layer_tooltips().title("@cluster").line("Merge distance|@merge_dist").min_width(180), |
| 128 | + ) |
| 129 | + + geom_point( |
| 130 | + aes(x="x", y="y", color="color"), |
| 131 | + data=label_df, |
| 132 | + size=5, |
| 133 | + shape=16, |
| 134 | + tooltips=layer_tooltips().title("@species").line("Sample|@label"), |
| 135 | + ) |
| 136 | + + geom_text( |
| 137 | + aes(x="x", y="y", label="label", color="color"), |
| 138 | + data=label_df.assign(y=-0.35), |
| 139 | + angle=45, |
| 140 | + hjust=1, |
| 141 | + vjust=1, |
| 142 | + size=13, |
| 143 | + family="monospace", |
| 144 | + ) |
| 145 | + + geom_hline(yintercept=color_threshold, linetype="dashed", color="#9EAAB8", size=0.8) |
| 146 | + + geom_text( |
| 147 | + aes(x="x", y="y", label="label"), |
| 148 | + data=pd.DataFrame([{"x": n - 1.5, "y": color_threshold + 0.25, "label": f"threshold = {color_threshold:.1f}"}]), |
| 149 | + size=11, |
| 150 | + color="#7A8A9A", |
| 151 | + hjust=1, |
| 152 | + family="monospace", |
| 153 | + ) |
| 154 | + + scale_color_identity() |
135 | 155 | + scale_x_continuous(expand=[0.06, 0.02]) |
136 | | - + scale_y_continuous(expand=[0.18, 0.02]) |
137 | | - + labs(x="Sample", y="Distance (Ward)", title="dendrogram-basic · letsplot · pyplots.ai") |
138 | | - + theme_minimal() |
| 156 | + + scale_y_continuous(name="Ward Linkage Distance", expand=[0.15, 0.01], breaks=[0, 2, 4, 6, 8, 10, 12]) |
| 157 | + + labs(x="", title="dendrogram-basic · letsplot · pyplots.ai") |
| 158 | + + theme_void() |
139 | 159 | + theme( |
140 | | - axis_title=element_text(size=20), |
141 | | - axis_text=element_text(size=16), |
| 160 | + plot_title=element_text(size=24, face="bold", color="#2C3E50"), |
| 161 | + plot_background=element_rect(fill="white", color="white"), |
| 162 | + axis_title_y=element_text(size=20, color="#4A5568", margin=[0, 12, 0, 0]), |
| 163 | + axis_text_y=element_text(size=16, color="#6B7B8D"), |
142 | 164 | axis_text_x=element_blank(), |
143 | 165 | axis_ticks_x=element_blank(), |
144 | | - plot_title=element_text(size=24), |
145 | | - panel_grid_major_x=element_blank(), |
146 | | - panel_grid_minor_x=element_blank(), |
| 166 | + axis_ticks_y=element_line(size=0.4, color="#D0D8E0"), |
| 167 | + axis_line_y=element_line(size=0.6, color="#CBD5E0"), |
| 168 | + panel_grid_major_y=element_line(size=0.3, color="#EDF2F7"), |
| 169 | + plot_margin=[50, 30, 30, 20], |
147 | 170 | ) |
148 | 171 | + ggsize(1600, 900) |
149 | 172 | ) |
150 | 173 |
|
151 | | -# Save PNG (scale=3 gives 4800x2700) |
| 174 | +# Save a static PNG (scale=3) and an HTML version of the plot |
152 | 175 | ggsave(plot, "plot.png", path=".", scale=3) |
153 | | - |
154 | | -# Save HTML for interactivity |
155 | 176 | ggsave(plot, "plot.html", path=".") |
0 commit comments