diff --git a/plots/dendrogram-basic/implementations/plotnine.py b/plots/dendrogram-basic/implementations/plotnine.py index 7640a9abb4..c059a7277d 100644 --- a/plots/dendrogram-basic/implementations/plotnine.py +++ b/plots/dendrogram-basic/implementations/plotnine.py @@ -1,19 +1,26 @@ """ pyplots.ai dendrogram-basic: Basic Dendrogram -Library: plotnine 0.15.2 | Python 3.13.11 -Quality: 91/100 | Created: 2025-12-23 +Library: plotnine 0.15.3 | Python 3.14.3 +Quality: 89/100 | Updated: 2026-04-05 """ import numpy as np import pandas as pd from plotnine import ( aes, + annotate, + coord_cartesian, element_blank, element_line, + element_rect, element_text, + geom_hline, + geom_point, geom_segment, geom_text, ggplot, + guide_legend, + guides, labs, scale_color_manual, scale_x_continuous, @@ -22,112 +29,184 @@ theme_minimal, ) from scipy.cluster.hierarchy import dendrogram, linkage +from sklearn.datasets import load_iris -# Data - Iris flower measurements (4 features for 15 samples) +# Data - Real iris flower measurements (15 samples, 5 per species) +iris = load_iris() np.random.seed(42) +species_names = ["Setosa", "Versicolor", "Virginica"] +species_counts = dict.fromkeys(species_names, 0) +sample_labels = [] +indices = np.concatenate([np.random.choice(np.where(iris.target == i)[0], 5, replace=False) for i in range(3)]) +for i in indices: + name = species_names[iris.target[i]] + species_counts[name] += 1 + sample_labels.append(f"{name}-{species_counts[name]}") +features = iris.data[indices] + +# Hierarchical clustering with Ward's method +linkage_matrix = linkage(features, method="ward") +palette = {"Setosa": "#306998", "Versicolor": "#E8833A", "Virginica": "#55A868"} + +# Extract dendrogram coordinates +dend = dendrogram(linkage_matrix, labels=sample_labels, no_plot=True) + +# Track species composition of each node for branch coloring +n = len(sample_labels) +leaf_species = {lbl: lbl.rsplit("-", 1)[0] for lbl in sample_labels} +node_species = {} +for i, label in 
enumerate(sample_labels): + node_species[i] = {leaf_species[label]} +for i, row in enumerate(linkage_matrix): + left, right = int(row[0]), int(row[1]) + node_species[n + i] = node_species[left] | node_species[right] + +# Branch type label for each merge: species name if pure, "Mixed" if mixed +branch_type_labels = {"Setosa": "Setosa (pure)", "Versicolor": "Versicolor (pure)", "Virginica": "Virginica (pure)"} +merge_branch_types = [] +for i in range(len(linkage_matrix)): + sp = node_species[n + i] + if len(sp) == 1: + merge_branch_types.append(branch_type_labels[next(iter(sp))]) + else: + merge_branch_types.append("Mixed species") -# Simulate iris-like measurements: sepal length, sepal width, petal length, petal width -# Three species with distinct characteristics -samples_per_species = 5 - -labels = [] -data = [] - -# Setosa: shorter petals, wider sepals -for i in range(samples_per_species): - labels.append(f"Setosa-{i + 1}") - data.append( - [ - 5.0 + np.random.randn() * 0.3, # sepal length - 3.4 + np.random.randn() * 0.3, # sepal width - 1.5 + np.random.randn() * 0.2, # petal length - 0.3 + np.random.randn() * 0.1, # petal width - ] - ) - -# Versicolor: medium measurements -for i in range(samples_per_species): - labels.append(f"Versicolor-{i + 1}") - data.append( - [ - 5.9 + np.random.randn() * 0.4, # sepal length - 2.8 + np.random.randn() * 0.3, # sepal width - 4.3 + np.random.randn() * 0.4, # petal length - 1.3 + np.random.randn() * 0.2, # petal width - ] - ) - -# Virginica: longer petals and sepals -for i in range(samples_per_species): - labels.append(f"Virginica-{i + 1}") - data.append( - [ - 6.6 + np.random.randn() * 0.5, # sepal length - 3.0 + np.random.randn() * 0.3, # sepal width - 5.5 + np.random.randn() * 0.5, # petal length - 2.0 + np.random.randn() * 0.3, # petal width - ] - ) - -data = np.array(data) - -# Compute hierarchical clustering using Ward's method -linkage_matrix = linkage(data, method="ward") - -# Extract dendrogram coordinates using scipy 
(no_plot=True returns coordinates only) -dend = dendrogram(linkage_matrix, labels=labels, no_plot=True) +# Map dendrogram order to linkage order via merge heights +height_to_merge = {} +for i, h in enumerate(linkage_matrix[:, 2]): + height_to_merge.setdefault(round(h, 10), []).append(i) -# Convert dendrogram coordinates to segment data for plotnine -# icoord contains x coords (pairs of 4 for each merge) -# dcoord contains y coords (pairs of 4 for each merge) +# Build segment dataframe segments = [] -color_threshold = 0.7 * max(linkage_matrix[:, 2]) - for xs, ys in zip(dend["icoord"], dend["dcoord"], strict=True): - # Each merge has 4 points forming a U-shape: [x1, x2, x3, x4], [y1, y2, y3, y4] - # We need 3 segments: left vertical, horizontal, right vertical - - # Determine color based on height (merge distance) - merge_height = max(ys) - if merge_height > color_threshold: - color = "#306998" # Python Blue for high-level merges + h = round(max(ys), 10) + if h in height_to_merge and height_to_merge[h]: + merge_idx = height_to_merge[h].pop(0) + btype = merge_branch_types[merge_idx] else: - color = "#FFD43B" # Python Yellow for low-level merges - - segments.append({"x": xs[0], "xend": xs[1], "y": ys[0], "yend": ys[1], "color": color}) - segments.append({"x": xs[1], "xend": xs[2], "y": ys[1], "yend": ys[2], "color": color}) - segments.append({"x": xs[2], "xend": xs[3], "y": ys[2], "yend": ys[3], "color": color}) + btype = "Mixed species" + segments.append({"x": xs[0], "xend": xs[1], "y": ys[0], "yend": ys[1], "branch_type": btype}) + segments.append({"x": xs[1], "xend": xs[2], "y": ys[1], "yend": ys[2], "branch_type": btype}) + segments.append({"x": xs[2], "xend": xs[3], "y": ys[2], "yend": ys[3], "branch_type": btype}) segments_df = pd.DataFrame(segments) -# Create label data using the actual leaf positions from dendrogram -# dend['leaves'] gives the order, and x positions are at 5, 15, 25, ... 
(spacing of 10) -leaf_positions = [(i + 1) * 10 - 5 for i in range(len(dend["ivl"]))] -ivl = dend["ivl"] # Reordered labels from dendrogram -label_df = pd.DataFrame({"x": leaf_positions, "label": ivl, "y": [-0.8] * len(ivl)}) +# Leaf labels with species-based coloring +n_leaves = len(dend["ivl"]) +leaf_positions = [(i + 1) * 10 - 5 for i in range(n_leaves)] +leaf_labels = dend["ivl"] +leaf_btypes = [branch_type_labels[leaf_species[lbl]] for lbl in leaf_labels] +label_df = pd.DataFrame({"x": leaf_positions, "label": leaf_labels, "y": [0.0] * n_leaves, "branch_type": leaf_btypes}) + +# Ordered category for consistent legend +category_order = ["Setosa (pure)", "Versicolor (pure)", "Virginica (pure)", "Mixed species"] +color_map = { + "Setosa (pure)": palette["Setosa"], + "Versicolor (pure)": palette["Versicolor"], + "Virginica (pure)": palette["Virginica"], + "Mixed species": "#888888", +} +segments_df["branch_type"] = pd.Categorical(segments_df["branch_type"], categories=category_order, ordered=True) +label_df["branch_type"] = pd.Categorical(label_df["branch_type"], categories=category_order, ordered=True) + +# Merge node points (geom_point) - branch type taken from segments[::3], which follows dendrogram (not linkage) order +merge_nodes = [] +for xs, ys, seg in zip(dend["icoord"], dend["dcoord"], segments[::3], strict=True): + cx = (xs[1] + xs[2]) / 2 + cy = max(ys) + merge_nodes.append({"x": cx, "y": cy, "branch_type": seg["branch_type"]}) +merge_df = pd.DataFrame(merge_nodes) +merge_df["branch_type"] = pd.Categorical(merge_df["branch_type"], categories=category_order, ordered=True) + +# Key merge threshold: where Setosa separates from the rest +setosa_sep_height = linkage_matrix[-2, 2] +threshold_df = pd.DataFrame({"yintercept": [setosa_sep_height]}) + +# Plot +y_max = max(linkage_matrix[:, 2]) * 1.08 +x_min = min(segments_df["x"].min(), segments_df["xend"].min()) +x_max = max(segments_df["x"].max(), segments_df["xend"].max()) +x_pad = (x_max - x_min) * 0.06 -# Plot using plotnine's native geom_segment plot 
= ( ggplot() - + geom_segment(aes(x="x", xend="xend", y="y", yend="yend", color="color"), data=segments_df, size=1.8) - + geom_text(aes(x="x", y="y", label="label"), data=label_df, angle=45, ha="right", va="top", size=9) - + scale_color_manual(values={"#306998": "#306998", "#FFD43B": "#FFD43B"}, guide=None) - + scale_x_continuous(breaks=[], expand=(0.12, 0.05)) - + scale_y_continuous(expand=(0.25, 0.02)) - + labs(x="Sample", y="Distance (Ward)", title="dendrogram-basic · plotnine · pyplots.ai") + # Dendrogram branches - thicker for HD visibility + + geom_segment(aes(x="x", xend="xend", y="y", yend="yend", color="branch_type"), data=segments_df, size=2.2) + # Threshold line using idiomatic geom_hline + + geom_hline(aes(yintercept="yintercept"), data=threshold_df, linetype="dashed", color="#AAAAAA", size=0.8) + # Threshold annotation using plotnine annotate + + annotate( + "text", + x=x_max - x_pad, + y=setosa_sep_height + 0.35, + label="Setosa separates", + size=13, + color="#555555", + fontstyle="italic", + ha="right", + ) + # Intermixing annotation - data storytelling for Versicolor/Virginica + + annotate( + "text", + x=x_max - x_pad, + y=linkage_matrix[-1, 2] * 0.55, + label="Versicolor & Virginica intermixed", + size=12, + color="#888888", + fontstyle="italic", + ha="right", + ) + # Leaf labels - larger for readability + + geom_text( + aes(x="x", y="y", label="label", color="branch_type"), + data=label_df, + angle=45, + ha="right", + va="top", + size=13, + nudge_y=-0.3, + show_legend=False, + ) + # Merge node markers - emphasize join points + + geom_point(aes(x="x", y="y", color="branch_type"), data=merge_df, size=3.5, show_legend=False) + + scale_color_manual(values=color_map, name="Branch Type") + + guides(color=guide_legend(override_aes={"size": 4, "alpha": 1})) + + scale_x_continuous(breaks=[], expand=(0.04, 0)) + + scale_y_continuous(breaks=np.arange(0, y_max, 2).tolist(), expand=(0.10, 0)) + + coord_cartesian(xlim=(x_min - x_pad, x_max + x_pad), 
ylim=(-2.5, y_max)) + + labs( + x="", + y="Ward Linkage Distance", + title="Iris Species Clustering · dendrogram-basic · plotnine · pyplots.ai", + subtitle="Hierarchical clustering of 15 iris samples using Ward's minimum variance method", + ) + theme_minimal() + theme( figure_size=(16, 9), - text=element_text(size=14), - axis_title=element_text(size=20), - axis_text=element_text(size=16), + text=element_text(size=14, family="sans-serif"), + axis_title_x=element_blank(), + axis_title_y=element_text(size=20, margin={"r": 12}), + axis_text=element_text(size=16, color="#444444"), axis_text_x=element_blank(), - plot_title=element_text(size=24), + axis_ticks_major_x=element_blank(), + plot_title=element_text(size=24, weight="bold", margin={"b": 4}), + plot_subtitle=element_text(size=15, color="#666666", margin={"b": 12}), + plot_background=element_rect(fill="#FAFAFA", color="none"), + panel_background=element_rect(fill="#FAFAFA", color="none"), panel_grid_major_x=element_blank(), panel_grid_minor_x=element_blank(), - panel_grid_major_y=element_line(alpha=0.3, linetype="dashed"), + panel_grid_minor_y=element_blank(), + panel_grid_major_y=element_line(alpha=0.2, size=0.5, color="#CCCCCC"), + legend_title=element_text(size=16, weight="bold"), + legend_text=element_text(size=14), + legend_position="right", + legend_background=element_rect(fill="#FAFAFA", color="#DDDDDD", size=0.5), + legend_key=element_rect(fill="none", color="none"), + plot_margin=0.02, ) ) -plot.save("plot.png", dpi=300) +# Save with tight layout +fig = plot.draw() +fig.savefig("plot.png", dpi=300, bbox_inches="tight") diff --git a/plots/dendrogram-basic/metadata/plotnine.yaml b/plots/dendrogram-basic/metadata/plotnine.yaml index 4a4eb9d3a6..f7dc07d52e 100644 --- a/plots/dendrogram-basic/metadata/plotnine.yaml +++ b/plots/dendrogram-basic/metadata/plotnine.yaml @@ -1,167 +1,184 @@ library: plotnine specification_id: dendrogram-basic created: '2025-12-23T10:00:38Z' -updated: '2025-12-23T10:06:36Z' 
-generated_by: claude-opus-4-5-20251101 +updated: '2026-04-05T21:11:21Z' +generated_by: claude-opus-4-6 workflow_run: 20457534445 issue: 0 -python_version: 3.13.11 -library_version: 0.15.2 +python_version: 3.14.3 +library_version: 0.15.3 preview_url: https://storage.googleapis.com/pyplots-images/plots/dendrogram-basic/plotnine/plot.png preview_html: null -quality_score: 91 +quality_score: 89 impl_tags: dependencies: - scipy - techniques: [] + - sklearn + techniques: + - annotations + - layer-composition + - custom-legend patterns: - - data-generation + - dataset-loading - iteration-over-groups dataprep: - hierarchical-clustering - styling: [] + styling: + - grid-styling review: strengths: - - Excellent use of scipy dendrogram() with no_plot=True to extract coordinates, - then rendering with plotnine native geom_segment - - Clean two-color scheme using Python branding colors that distinguishes high-level - from low-level merges - - Proper 45-degree label rotation prevents overlap while maintaining readability - - Well-structured code following KISS principle with clear data generation for three - iris species - - Appropriate figure sizing (16x9) and font scaling for high-resolution output + - Excellent data storytelling with threshold annotation and species intermixing + note — creates immediate narrative + - Species-purity branch coloring is a sophisticated touch that adds real analytical + value + - Very idiomatic plotnine grammar of graphics usage with well-layered geoms + - Clean, polished visual design with custom palette, subtle grid, and styled legend + - Real iris dataset provides authentic scientific context weaknesses: - - Y-axis label could include units or clarify the distance metric more explicitly - - The color threshold logic (0.7 * max) is somewhat arbitrary and unexplained visually - - Bottom margin has extra whitespace due to rotated labels extending into the plot - area - image_description: 'The plot displays a hierarchical dendrogram visualizing 
clustering - of 15 iris flower samples (5 each of Setosa, Versicolor, and Virginica). The tree - structure uses a two-color scheme: Python Blue (#306998) for high-level merges - at the top and Python Yellow (#FFD43B) for lower-level merges. The Y-axis shows - "Distance (Ward)" ranging from 0 to 12, and the X-axis is labeled "Sample". Sample - labels are displayed at 45-degree angles at the bottom (e.g., Setosa-4, Setosa-3, - Versicolor-1, Virginica-1, etc.). The title reads "dendrogram-basic · plotnine - · pyplots.ai". The layout is clean with subtle dashed grid lines on the Y-axis. - The dendrogram correctly shows Setosa samples clustering together on the left, - while Versicolor and Virginica samples cluster on the right side, reflecting their - biological similarity.' + - Leaf label font size (13) is slightly below the recommended 16pt minimum for tick-level + text + - Title format prepends descriptive text before the standard {spec-id} · {library} + · pyplots.ai format + image_description: 'The plot shows a dendrogram (hierarchical clustering tree) of + 15 iris flower samples on a light gray (#FAFAFA) background. The y-axis is labeled + "Ward Linkage Distance" ranging from 0 to ~11. Leaf labels at the bottom are rotated + 45° showing sample names (Setosa-1 through Setosa-5, Virginica-3 through Virginica-5, + Versicolor-1 through Versicolor-5, Virginica-1, Virginica-2). Branches are color-coded: + blue (#306998) for pure Setosa clusters, orange (#E8833A) for pure Versicolor + clusters, green (#55A868) for pure Virginica clusters, and gray (#888888) for + mixed-species merges. A dashed horizontal threshold line at ~4.5 is annotated + with "Setosa separates" in italic gray text. Above it, "Versicolor & Virginica + intermixed" annotation adds context. Merge node dots appear at cluster join points. + A legend on the right shows "Branch Type" with four categories. 
The title reads + "Iris Species Clustering · dendrogram-basic · plotnine · pyplots.ai" in bold, + with a subtitle describing the method.' criteria_checklist: visual_quality: - score: 36 - max: 40 + score: 28 + max: 30 items: - id: VQ-01 name: Text Legibility - score: 10 - max: 10 + score: 7 + max: 8 passed: true - comment: Title at 24pt, axis labels at 20pt, tick labels at 16pt, all perfectly - readable + comment: Font sizes explicitly set (title 24pt, axis title 20pt, ticks 16pt). + Leaf labels at size 13 slightly below 16pt guideline but readable. - id: VQ-02 name: No Overlap - score: 8 - max: 8 + score: 6 + max: 6 passed: true - comment: No overlapping text, labels are well-spaced with 45-degree rotation + comment: No overlapping text. Rotated leaf labels well-spaced, annotations + don't collide. - id: VQ-03 name: Element Visibility - score: 8 - max: 8 + score: 5 + max: 6 passed: true - comment: Dendrogram branches clearly visible with size=1.8 + comment: Segments at size 2.2 clearly visible. Merge dots adequate but could + be slightly more prominent. - id: VQ-04 name: Color Accessibility - score: 5 - max: 5 + score: 4 + max: 4 passed: true - comment: Blue/yellow color scheme is colorblind-safe + comment: Blue/orange/green/gray palette is colorblind-safe with good contrast. - id: VQ-05 - name: Layout Balance - score: 3 - max: 5 + name: Layout & Canvas + score: 4 + max: 4 passed: true - comment: Good proportions but slight extra whitespace at bottom due to label - rotation + comment: Good 16:9 proportions, plot fills canvas well, legend positioned + cleanly. - id: VQ-06 - name: Axis Labels - score: 1 + name: Axis Labels & Title + score: 2 max: 2 passed: true - comment: Y-axis has descriptive label with method "Distance (Ward)" but no - units; X-axis just "Sample" - - id: VQ-07 - name: Grid & Legend - score: 1 - max: 2 + comment: Y-axis 'Ward Linkage Distance' is descriptive. X-axis intentionally + blank with sample names as labels. 
+ design_excellence: + score: 16 + max: 20 + items: + - id: DE-01 + name: Aesthetic Sophistication + score: 6 + max: 8 + passed: true + comment: 'Custom color palette, #FAFAFA background, styled legend with border. + Clearly above defaults.' + - id: DE-02 + name: Visual Refinement + score: 5 + max: 6 passed: true - comment: Grid is subtle with alpha=0.3 and dashed style; no legend needed - but color meaning unexplained + comment: theme_minimal removes spines, subtle y-grid (alpha=0.2), custom background, + generous whitespace. + - id: DE-03 + name: Data Storytelling + score: 5 + max: 6 + passed: true + comment: Threshold annotation and intermixing note create clear narrative. + Color-coded branches guide viewer. spec_compliance: - score: 25 - max: 25 + score: 14 + max: 15 items: - id: SC-01 name: Plot Type - score: 8 - max: 8 - passed: true - comment: Correct dendrogram/hierarchical clustering visualization - - id: SC-02 - name: Data Mapping score: 5 max: 5 passed: true - comment: Branch heights correctly represent merge distances - - id: SC-03 + comment: Correct dendrogram showing hierarchical clustering tree. + - id: SC-02 name: Required Features - score: 5 - max: 5 + score: 4 + max: 4 passed: true - comment: Shows hierarchical structure, merge distances, labeled leaves - - id: SC-04 - name: Data Range + comment: Ward's method, proportional branch heights, species-colored branches, + labeled leaves. + - id: SC-03 + name: Data Mapping score: 3 max: 3 passed: true - comment: Y-axis shows full range of distances - - id: SC-05 - name: Legend Accuracy - score: 2 - max: 2 - passed: true - comment: No legend needed; color distinction is decorative - - id: SC-06 - name: Title Format + comment: X positions from dendrogram coords, Y shows linkage distance. All + 15 samples visible. 
+ - id: SC-04 + name: Title & Legend score: 2 - max: 2 + max: 3 passed: true - comment: Correctly uses "dendrogram-basic · plotnine · pyplots.ai" + comment: Title contains required elements but prepends descriptive text. Legend + labels are correct. data_quality: - score: 18 - max: 20 + score: 14 + max: 15 items: - id: DQ-01 name: Feature Coverage - score: 7 - max: 8 + score: 5 + max: 6 passed: true - comment: Shows species clustering well; could show more variation in merge - heights within species + comment: Shows pure and mixed clusters at different distance levels. Clear + Setosa separation vs Versicolor/Virginica intermixing. - id: DQ-02 name: Realistic Context - score: 7 - max: 7 + score: 5 + max: 5 passed: true - comment: Iris flower classification is a classic, realistic ML scenario + comment: Real iris dataset from sklearn — classic, neutral, scientific dataset. - id: DQ-03 name: Appropriate Scale score: 4 - max: 5 + max: 4 passed: true - comment: Ward distances are reasonable; values make sense for normalized data + comment: Ward linkage distances realistic for iris measurements. code_quality: - score: 9 + score: 10 max: 10 items: - id: CQ-01 @@ -169,41 +186,49 @@ review: score: 3 max: 3 passed: true - comment: 'Linear flow: imports → data → linkage → plot → save' + comment: 'Linear flow: imports → data → clustering → coordinate extraction + → plot → save.' - id: CQ-02 name: Reproducibility - score: 3 - max: 3 + score: 2 + max: 2 passed: true - comment: Uses np.random.seed(42) + comment: np.random.seed(42) set for sample selection. - id: CQ-03 name: Clean Imports score: 2 max: 2 passed: true - comment: All imports are used + comment: All imports used, no unused imports. - id: CQ-04 - name: No Deprecated API - score: 1 - max: 1 + name: Code Elegance + score: 2 + max: 2 passed: true - comment: Uses current plotnine API + comment: Well-structured, appropriate complexity. Height-to-merge mapping + is clean. 
- id: CQ-05 - name: Output Correct - score: 0 + name: Output & API + score: 1 max: 1 - passed: false - comment: Missing bbox_inches equivalent for tight layout - library_features: - score: 3 - max: 5 + passed: true + comment: Saves as plot.png with dpi=300 and bbox_inches='tight'. + library_mastery: + score: 7 + max: 10 items: - - id: LF-01 - name: Uses distinctive library features + - id: LM-01 + name: Idiomatic Usage + score: 4 + max: 5 + passed: true + comment: 'Excellent ggplot grammar: layered geoms, aes mappings, scale_color_manual, + coord_cartesian, guide_legend.' + - id: LM-02 + name: Distinctive Features score: 3 max: 5 passed: true - comment: Good use of geom_segment, geom_text, theme customization, and scale_color_manual. - However, relies heavily on scipy for dendrogram coordinates rather than - showcasing more plotnine-specific grammar of graphics features. - verdict: APPROVED + comment: pd.Categorical for legend ordering, guide_legend with override_aes, + annotate(), show_legend=False — distinctly plotnine patterns. + verdict: REJECTED