Skip to content
173 changes: 87 additions & 86 deletions plots/dendrogram-basic/implementations/letsplot.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
""" pyplots.ai
dendrogram-basic: Basic Dendrogram
Comment on lines 1 to 2
Copy link

Copilot AI Apr 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file uses `"""pyplots.ai` while many other letsplot implementations in the repo use `""" pyplots.ai` (note the space after the opening quotes) in the standard 4-line header. Consider matching the common header formatting for consistency across generated plot scripts.

Copilot uses AI. Check for mistakes.
Library: letsplot 4.8.2 | Python 3.13.11
Quality: 91/100 | Created: 2025-12-23
Library: letsplot 4.8.2 | Python 3.14.3
Quality: 85/100 | Updated: 2026-04-05
Copy link

Copilot AI Apr 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The implementation header has an invalid quality field (`Quality: /100`, with no score before the slash), which will likely break any tooling that parses the numeric quality score from the 4-line docstring. Set this to a valid value (e.g., a number in the form `NN/100`, or `pending`) consistent with other plot implementations.

Copilot uses AI. Check for mistakes.
"""

import numpy as np
Expand All @@ -10,12 +10,14 @@
LetsPlot,
aes,
element_blank,
element_line,
element_text,
geom_segment,
geom_text,
ggplot,
ggsize,
labs,
layer_tooltips,
scale_color_manual,
scale_x_continuous,
scale_y_continuous,
Expand All @@ -24,132 +26,131 @@
)
from lets_plot.export import ggsave
from scipy.cluster.hierarchy import linkage
from sklearn.datasets import load_iris


LetsPlot.setup_html()

# Data - Iris flower measurements (4 features for 15 samples)
# Data - Iris flower measurements (15 samples, 3 species)
iris = load_iris()
np.random.seed(42)
indices = np.sort(np.concatenate([np.random.choice(np.where(iris.target == k)[0], 5, replace=False) for k in range(3)]))
features = iris.data[indices]
species_names = ["Setosa", "Versicolor", "Virginica"]
labels = [f"{species_names[iris.target[i]]}-{j + 1}" for j, i in enumerate(indices)]
Copy link

Copilot AI Apr 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Leaf labels are numbered using the global `enumerate` index over the sorted `indices` array, so Versicolor labels start at 6 and Virginica labels start at 11. For readability and consistency with other dendrogram-basic implementations, consider numbering samples within each species (e.g., `Versicolor-1` to `Versicolor-5`, `Virginica-1` to `Virginica-5`).

Suggested change
labels = [f"{species_names[iris.target[i]]}-{j + 1}" for j, i in enumerate(indices)]
species_counts = {k: 0 for k in range(len(species_names))}
labels = []
for i in indices:
    species_id = int(iris.target[i])
    species_counts[species_id] += 1
    labels.append(f"{species_names[species_id]}-{species_counts[species_id]}")

Copilot uses AI. Check for mistakes.

# Simulate iris-like measurements: sepal length, sepal width, petal length, petal width
# Three species with distinct characteristics
samples_per_species = 5

labels = []
data = []

# Setosa: shorter petals, wider sepals
for i in range(samples_per_species):
labels.append(f"Setosa-{i + 1}")
data.append(
[
5.0 + np.random.randn() * 0.3, # sepal length
3.4 + np.random.randn() * 0.3, # sepal width
1.5 + np.random.randn() * 0.2, # petal length
0.3 + np.random.randn() * 0.1, # petal width
]
)

# Versicolor: medium measurements
for i in range(samples_per_species):
labels.append(f"Versicolor-{i + 1}")
data.append(
[
5.9 + np.random.randn() * 0.4, # sepal length
2.8 + np.random.randn() * 0.3, # sepal width
4.3 + np.random.randn() * 0.4, # petal length
1.3 + np.random.randn() * 0.2, # petal width
]
)

# Virginica: longer petals and sepals
for i in range(samples_per_species):
labels.append(f"Virginica-{i + 1}")
data.append(
[
6.6 + np.random.randn() * 0.5, # sepal length
3.0 + np.random.randn() * 0.3, # sepal width
5.5 + np.random.randn() * 0.5, # petal length
2.0 + np.random.randn() * 0.3, # petal width
]
)

data = np.array(data)
n_samples = len(labels)

# Compute hierarchical clustering using Ward's method
linkage_matrix = linkage(data, method="ward")
# Hierarchical clustering (Ward's method)
linkage_matrix = linkage(features, method="ward")

# Build dendrogram coordinates from linkage matrix
n = len(labels)
leaf_positions = {i: i for i in range(n)}
node_heights = dict.fromkeys(range(n), 0)
leaf_positions = {i: float(i) for i in range(n)}
node_heights = dict.fromkeys(range(n), 0.0)
segments = []

# Color threshold for clustering (similar to matplotlib's default)
# Color threshold at 70% of the largest merge distance; merges at or above it use the neutral "above" color
max_dist = linkage_matrix[:, 2].max()
color_threshold = 0.7 * max_dist

# Process each merge in the linkage matrix
# Track cluster identity for each node (leaf or merged)
palette = {"above": "#306998", "Setosa": "#4DAF4A", "Versicolor": "#FF7F00", "Virginica": "#984EA3"}
node_cluster = {i: labels[i].split("-")[0] for i in range(n)}

for i, (left, right, dist, _) in enumerate(linkage_matrix):
left, right = int(left), int(right)
new_node = n + i

# Get positions of children
left_pos = leaf_positions[left]
right_pos = leaf_positions[right]

# New node position is midpoint of children
new_pos = (left_pos + right_pos) / 2
leaf_positions[new_node] = new_pos
leaf_positions[new_node] = (left_pos + right_pos) / 2
node_heights[new_node] = dist

# Determine color based on height threshold
color = "#306998" if dist >= color_threshold else "#FFD43B"
# Cluster label: same species if both children match, otherwise "above"
left_cl, right_cl = node_cluster[left], node_cluster[right]
node_cluster[new_node] = left_cl if left_cl == right_cl else "above"
cluster_label = node_cluster[new_node] if dist < color_threshold else "above"
color = palette[cluster_label]
display_cluster = cluster_label if cluster_label != "above" else "Inter-cluster"

left_height = node_heights[left]
right_height = node_heights[right]

# Vertical segment from left child to merge height
segments.append((left_pos, left_height, left_pos, dist, color))
# Vertical segment from right child to merge height
segments.append((right_pos, right_height, right_pos, dist, color))
# Horizontal segment connecting the two
segments.append((left_pos, dist, right_pos, dist, color))
# Vertical segment from left child up to merge height
segments.append(
{
"x": left_pos,
"y": left_height,
"xend": left_pos,
"yend": dist,
"color": color,
"merge_dist": round(dist, 2),
"cluster": display_cluster,
}
)
# Vertical segment from right child up to merge height
segments.append(
{
"x": right_pos,
"y": right_height,
"xend": right_pos,
"yend": dist,
"color": color,
"merge_dist": round(dist, 2),
"cluster": display_cluster,
}
)
# Horizontal segment connecting the two children
segments.append(
{
"x": left_pos,
"y": dist,
"xend": right_pos,
"yend": dist,
"color": color,
"merge_dist": round(dist, 2),
"cluster": display_cluster,
}
)

# Create segment dataframe
segment_df = pd.DataFrame(segments, columns=["x", "y", "xend", "yend", "color"])
segment_df = pd.DataFrame(segments)

# Create label dataframe for x-axis labels
label_data = []
for i, label in enumerate(labels):
label_data.append({"x": leaf_positions[i], "y": -0.8, "label": label})
label_df = pd.DataFrame(label_data)
# Leaf labels positioned just below y=0
label_df = pd.DataFrame([{"x": leaf_positions[i], "y": -0.3, "label": labels[i]} for i in range(n)])

# Plot
color_values = {v: v for v in palette.values()}

plot = (
ggplot()
+ geom_segment(aes(x="x", y="y", xend="xend", yend="yend", color="color"), data=segment_df, size=1.5)
+ geom_text(aes(x="x", y="y", label="label"), data=label_df, angle=35, hjust=1, vjust=1, size=10, color="#333333")
+ scale_color_manual(values={"#306998": "#306998", "#FFD43B": "#FFD43B"}, guide="none")
+ scale_x_continuous(expand=[0.06, 0.02])
+ scale_y_continuous(expand=[0.18, 0.02])
+ labs(x="Sample", y="Distance (Ward)", title="dendrogram-basic · letsplot · pyplots.ai")
+ geom_segment(
aes(x="x", y="y", xend="xend", yend="yend", color="color"),
data=segment_df,
size=1.8,
tooltips=layer_tooltips().title("Merge").line("Distance|@merge_dist").line("Cluster|@cluster"),
)
+ geom_text(aes(x="x", y="y", label="label"), data=label_df, angle=40, hjust=1, vjust=1, size=10, color="#444444")
+ scale_color_manual(values=color_values, guide="none")
+ scale_x_continuous(expand=[0.05, 0.02])
+ scale_y_continuous(name="Ward Linkage Distance", expand=[0.14, 0.01], breaks=[0, 2, 4, 6, 8, 10, 12])
+ labs(x="", title="dendrogram-basic \u00b7 letsplot \u00b7 pyplots.ai")
+ theme_minimal()
+ theme(
axis_title=element_text(size=20),
plot_title=element_text(size=24, face="bold"),
axis_title_y=element_text(size=20),
axis_text=element_text(size=16),
axis_text_x=element_blank(),
axis_ticks_x=element_blank(),
plot_title=element_text(size=24),
axis_line_x=element_blank(),
axis_line_y=element_line(size=0.5, color="#CCCCCC"),
panel_grid_major_x=element_blank(),
panel_grid_minor_x=element_blank(),
panel_grid_major_y=element_line(size=0.5, color="#E8E8E8"),
panel_grid_minor=element_blank(),
plot_margin=[40, 20, 20, 20],
)
+ ggsize(1600, 900)
)

# Save PNG (scale=3 gives 4800x2700)
# Save
ggsave(plot, "plot.png", path=".", scale=3)

# Save HTML for interactivity
ggsave(plot, "plot.html", path=".")
Loading
Loading