|
1 | | -""" pyplots.ai |
| 1 | +"""pyplots.ai |
2 | 2 | dendrogram-basic: Basic Dendrogram |
3 | | -Library: letsplot 4.8.2 | Python 3.13.11 |
4 | | -Quality: 91/100 | Created: 2025-12-23 |
| 3 | +Library: letsplot 4.8.2 | Python 3.14.3 |
| 4 | +Quality: /100 | Updated: 2026-04-05 |
5 | 5 | """ |
6 | 6 |
|
7 | 7 | import numpy as np |
|
10 | 10 | LetsPlot, |
11 | 11 | aes, |
12 | 12 | element_blank, |
| 13 | + element_line, |
13 | 14 | element_text, |
14 | 15 | geom_segment, |
15 | 16 | geom_text, |
16 | 17 | ggplot, |
17 | 18 | ggsize, |
18 | 19 | labs, |
| 20 | + layer_tooltips, |
19 | 21 | scale_color_manual, |
20 | 22 | scale_x_continuous, |
21 | 23 | scale_y_continuous, |
|
24 | 26 | ) |
25 | 27 | from lets_plot.export import ggsave |
26 | 28 | from scipy.cluster.hierarchy import linkage |
| 29 | +from sklearn.datasets import load_iris |
27 | 30 |
|
28 | 31 |
|
29 | 32 | LetsPlot.setup_html() |
30 | 33 |
|
31 | | -# Data - Iris flower measurements (4 features for 15 samples) |
| 34 | +# Data - Iris flower measurements (15 samples, 3 species) |
| 35 | +iris = load_iris() |
32 | 36 | np.random.seed(42) |
| 37 | +indices = np.sort(np.concatenate([np.random.choice(np.where(iris.target == k)[0], 5, replace=False) for k in range(3)])) |
| 38 | +features = iris.data[indices] |
| 39 | +species_names = ["Setosa", "Versicolor", "Virginica"] |
| 40 | +labels = [f"{species_names[iris.target[i]]}-{j + 1}" for j, i in enumerate(indices)] |
33 | 41 |
|
34 | | -# Simulate iris-like measurements: sepal length, sepal width, petal length, petal width |
35 | | -# Three species with distinct characteristics |
36 | | -samples_per_species = 5 |
37 | | - |
38 | | -labels = [] |
39 | | -data = [] |
40 | | - |
41 | | -# Setosa: shorter petals, wider sepals |
42 | | -for i in range(samples_per_species): |
43 | | - labels.append(f"Setosa-{i + 1}") |
44 | | - data.append( |
45 | | - [ |
46 | | - 5.0 + np.random.randn() * 0.3, # sepal length |
47 | | - 3.4 + np.random.randn() * 0.3, # sepal width |
48 | | - 1.5 + np.random.randn() * 0.2, # petal length |
49 | | - 0.3 + np.random.randn() * 0.1, # petal width |
50 | | - ] |
51 | | - ) |
52 | | - |
53 | | -# Versicolor: medium measurements |
54 | | -for i in range(samples_per_species): |
55 | | - labels.append(f"Versicolor-{i + 1}") |
56 | | - data.append( |
57 | | - [ |
58 | | - 5.9 + np.random.randn() * 0.4, # sepal length |
59 | | - 2.8 + np.random.randn() * 0.3, # sepal width |
60 | | - 4.3 + np.random.randn() * 0.4, # petal length |
61 | | - 1.3 + np.random.randn() * 0.2, # petal width |
62 | | - ] |
63 | | - ) |
64 | | - |
65 | | -# Virginica: longer petals and sepals |
66 | | -for i in range(samples_per_species): |
67 | | - labels.append(f"Virginica-{i + 1}") |
68 | | - data.append( |
69 | | - [ |
70 | | - 6.6 + np.random.randn() * 0.5, # sepal length |
71 | | - 3.0 + np.random.randn() * 0.3, # sepal width |
72 | | - 5.5 + np.random.randn() * 0.5, # petal length |
73 | | - 2.0 + np.random.randn() * 0.3, # petal width |
74 | | - ] |
75 | | - ) |
76 | | - |
77 | | -data = np.array(data) |
78 | | -n_samples = len(labels) |
79 | | - |
80 | | -# Compute hierarchical clustering using Ward's method |
81 | | -linkage_matrix = linkage(data, method="ward") |
| 42 | +# Hierarchical clustering (Ward's method) |
| 43 | +linkage_matrix = linkage(features, method="ward") |
82 | 44 |
|
83 | 45 | # Build dendrogram coordinates from linkage matrix |
84 | 46 | n = len(labels) |
85 | | -leaf_positions = {i: i for i in range(n)} |
86 | | -node_heights = dict.fromkeys(range(n), 0) |
| 47 | +leaf_positions = {i: float(i) for i in range(n)} |
| 48 | +node_heights = dict.fromkeys(range(n), 0.0) |
87 | 49 | segments = [] |
88 | 50 |
|
89 | | -# Color threshold for clustering (similar to matplotlib's default) |
| 51 | +# Color threshold — 70% of the largest merge distance; merges at or above it are drawn as inter-cluster |
90 | 52 | max_dist = linkage_matrix[:, 2].max() |
91 | 53 | color_threshold = 0.7 * max_dist |
92 | 54 |
|
93 | | -# Process each merge in the linkage matrix |
| 55 | +# Track cluster identity for each node (leaf or merged) |
| 56 | +palette = {"above": "#306998", "Setosa": "#4DAF4A", "Versicolor": "#FF7F00", "Virginica": "#984EA3"} |
| 57 | +node_cluster = {i: labels[i].split("-")[0] for i in range(n)} |
| 58 | + |
94 | 59 | for i, (left, right, dist, _) in enumerate(linkage_matrix): |
95 | 60 | left, right = int(left), int(right) |
96 | 61 | new_node = n + i |
97 | 62 |
|
98 | | - # Get positions of children |
99 | 63 | left_pos = leaf_positions[left] |
100 | 64 | right_pos = leaf_positions[right] |
101 | | - |
102 | | - # New node position is midpoint of children |
103 | | - new_pos = (left_pos + right_pos) / 2 |
104 | | - leaf_positions[new_node] = new_pos |
| 65 | + leaf_positions[new_node] = (left_pos + right_pos) / 2 |
105 | 66 | node_heights[new_node] = dist |
106 | 67 |
|
107 | | - # Determine color based on height threshold |
108 | | - color = "#306998" if dist >= color_threshold else "#FFD43B" |
| 68 | + # Cluster label: shared species if both children match and the merge is below the threshold, otherwise "above" |
| 69 | + left_cl, right_cl = node_cluster[left], node_cluster[right] |
| 70 | + node_cluster[new_node] = left_cl if left_cl == right_cl else "above" |
| 71 | + cluster_label = node_cluster[new_node] if dist < color_threshold else "above" |
| 72 | + color = palette[cluster_label] |
| 73 | + display_cluster = cluster_label if cluster_label != "above" else "Inter-cluster" |
109 | 74 |
|
110 | 75 | left_height = node_heights[left] |
111 | 76 | right_height = node_heights[right] |
112 | 77 |
|
113 | | - # Vertical segment from left child to merge height |
114 | | - segments.append((left_pos, left_height, left_pos, dist, color)) |
115 | | - # Vertical segment from right child to merge height |
116 | | - segments.append((right_pos, right_height, right_pos, dist, color)) |
117 | | - # Horizontal segment connecting the two |
118 | | - segments.append((left_pos, dist, right_pos, dist, color)) |
| 78 | + # Vertical segment from left child up to merge height |
| 79 | + segments.append( |
| 80 | + { |
| 81 | + "x": left_pos, |
| 82 | + "y": left_height, |
| 83 | + "xend": left_pos, |
| 84 | + "yend": dist, |
| 85 | + "color": color, |
| 86 | + "merge_dist": round(dist, 2), |
| 87 | + "cluster": display_cluster, |
| 88 | + } |
| 89 | + ) |
| 90 | + # Vertical segment from right child up to merge height |
| 91 | + segments.append( |
| 92 | + { |
| 93 | + "x": right_pos, |
| 94 | + "y": right_height, |
| 95 | + "xend": right_pos, |
| 96 | + "yend": dist, |
| 97 | + "color": color, |
| 98 | + "merge_dist": round(dist, 2), |
| 99 | + "cluster": display_cluster, |
| 100 | + } |
| 101 | + ) |
| 102 | + # Horizontal segment connecting the two children |
| 103 | + segments.append( |
| 104 | + { |
| 105 | + "x": left_pos, |
| 106 | + "y": dist, |
| 107 | + "xend": right_pos, |
| 108 | + "yend": dist, |
| 109 | + "color": color, |
| 110 | + "merge_dist": round(dist, 2), |
| 111 | + "cluster": display_cluster, |
| 112 | + } |
| 113 | + ) |
119 | 114 |
|
120 | | -# Create segment dataframe |
121 | | -segment_df = pd.DataFrame(segments, columns=["x", "y", "xend", "yend", "color"]) |
| 115 | +segment_df = pd.DataFrame(segments) |
122 | 116 |
|
123 | | -# Create label dataframe for x-axis labels |
124 | | -label_data = [] |
125 | | -for i, label in enumerate(labels): |
126 | | - label_data.append({"x": leaf_positions[i], "y": -0.8, "label": label}) |
127 | | -label_df = pd.DataFrame(label_data) |
| 117 | +# Leaf labels positioned just below y=0 |
| 118 | +label_df = pd.DataFrame([{"x": leaf_positions[i], "y": -0.3, "label": labels[i]} for i in range(n)]) |
128 | 119 |
|
129 | 120 | # Plot |
| 121 | +color_values = {v: v for v in palette.values()} |
| 122 | + |
130 | 123 | plot = ( |
131 | 124 | ggplot() |
132 | | - + geom_segment(aes(x="x", y="y", xend="xend", yend="yend", color="color"), data=segment_df, size=1.5) |
133 | | - + geom_text(aes(x="x", y="y", label="label"), data=label_df, angle=35, hjust=1, vjust=1, size=10, color="#333333") |
134 | | - + scale_color_manual(values={"#306998": "#306998", "#FFD43B": "#FFD43B"}, guide="none") |
135 | | - + scale_x_continuous(expand=[0.06, 0.02]) |
136 | | - + scale_y_continuous(expand=[0.18, 0.02]) |
137 | | - + labs(x="Sample", y="Distance (Ward)", title="dendrogram-basic · letsplot · pyplots.ai") |
| 125 | + + geom_segment( |
| 126 | + aes(x="x", y="y", xend="xend", yend="yend", color="color"), |
| 127 | + data=segment_df, |
| 128 | + size=1.8, |
| 129 | + tooltips=layer_tooltips().title("Merge").line("Distance|@merge_dist").line("Cluster|@cluster"), |
| 130 | + ) |
| 131 | + + geom_text(aes(x="x", y="y", label="label"), data=label_df, angle=40, hjust=1, vjust=1, size=10, color="#444444") |
| 132 | + + scale_color_manual(values=color_values, guide="none") |
| 133 | + + scale_x_continuous(expand=[0.05, 0.02]) |
| 134 | + + scale_y_continuous(name="Ward Linkage Distance", expand=[0.14, 0.01], breaks=[0, 2, 4, 6, 8, 10, 12]) |
| 135 | + + labs(x="", title="dendrogram-basic \u00b7 letsplot \u00b7 pyplots.ai") |
138 | 136 | + theme_minimal() |
139 | 137 | + theme( |
140 | | - axis_title=element_text(size=20), |
| 138 | + plot_title=element_text(size=24, face="bold"), |
| 139 | + axis_title_y=element_text(size=20), |
141 | 140 | axis_text=element_text(size=16), |
142 | 141 | axis_text_x=element_blank(), |
143 | 142 | axis_ticks_x=element_blank(), |
144 | | - plot_title=element_text(size=24), |
| 143 | + axis_line_x=element_blank(), |
| 144 | + axis_line_y=element_line(size=0.5, color="#CCCCCC"), |
145 | 145 | panel_grid_major_x=element_blank(), |
146 | 146 | panel_grid_minor_x=element_blank(), |
| 147 | + panel_grid_major_y=element_line(size=0.5, color="#E8E8E8"), |
| 148 | + panel_grid_minor=element_blank(), |
| 149 | + plot_margin=[40, 20, 20, 20], |
147 | 150 | ) |
148 | 151 | + ggsize(1600, 900) |
149 | 152 | ) |
150 | 153 |
|
151 | | -# Save PNG (scale=3 gives 4800x2700) |
| 154 | +# Save |
152 | 155 | ggsave(plot, "plot.png", path=".", scale=3) |
153 | | - |
154 | | -# Save HTML for interactivity |
155 | 156 | ggsave(plot, "plot.html", path=".") |
0 commit comments