feat(plotnine): implement parallel-categories-basic (#2852)

github-actions[bot] · claude · web-flow · commit e82efb10379f · 2025-12-30T22:03:06.000Z
## Implementation: `parallel-categories-basic` - plotnine Implements the **plotnine** version of `parallel-categories-basic`. **File:** `plots/parallel-categories-basic/implementations/plotnine.py` --- :robot: *[impl-generate workflow](https://github.com/MarkusNeusinger/pyplots/actions/runs/20606634555)* --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
diff --git a/plots/parallel-categories-basic/implementations/plotnine.py b/plots/parallel-categories-basic/implementations/plotnine.py
@@ -0,0 +1,297 @@
+""" pyplots.ai
+parallel-categories-basic: Basic Parallel Categories Plot
+Library: plotnine 0.15.2 | Python 3.13.11
+Quality: 90/100 | Created: 2025-12-30
+"""
+
+import sys
+
+
+# Prevent current directory from shadowing the plotnine package
+sys.path = [p for p in sys.path if not p.endswith("implementations")]
+
+import numpy as np  # noqa: E402
+import pandas as pd  # noqa: E402
+from plotnine import (  # noqa: E402
+    aes,
+    annotate,
+    coord_cartesian,
+    element_blank,
+    element_text,
+    geom_polygon,
+    geom_rect,
+    geom_text,
+    ggplot,
+    labs,
+    scale_fill_manual,
+    theme,
+    theme_minimal,
+)
+
+
+# Data - hand-specified customer-journey counts over four categorical dimensions.
+# Each tuple is one complete path (channel, product, customer type, outcome)
+# followed by the number of customers observed on that path.
+# The seed is set even though no random draws are made in this script.
+np.random.seed(42)
+
+path_columns = ["channel", "product", "customer_type", "outcome", "count"]
+
+path_data = [
+    # Channel -> Product Category -> Customer Type -> Outcome
+    ("Online", "Electronics", "New", "Purchased", 145),
+    ("Online", "Electronics", "New", "Abandoned", 98),
+    ("Online", "Electronics", "Returning", "Purchased", 187),
+    ("Online", "Electronics", "Returning", "Abandoned", 42),
+    ("Online", "Clothing", "New", "Purchased", 112),
+    ("Online", "Clothing", "New", "Abandoned", 76),
+    ("Online", "Clothing", "Returning", "Purchased", 156),
+    ("Online", "Clothing", "Returning", "Abandoned", 38),
+    ("Online", "Home", "New", "Purchased", 67),
+    ("Online", "Home", "New", "Abandoned", 54),
+    ("Online", "Home", "Returning", "Purchased", 89),
+    ("Online", "Home", "Returning", "Abandoned", 23),
+    ("Store", "Electronics", "New", "Purchased", 78),
+    ("Store", "Electronics", "New", "Abandoned", 32),
+    ("Store", "Electronics", "Returning", "Purchased", 124),
+    ("Store", "Electronics", "Returning", "Abandoned", 18),
+    ("Store", "Clothing", "New", "Purchased", 95),
+    ("Store", "Clothing", "New", "Abandoned", 28),
+    ("Store", "Clothing", "Returning", "Purchased", 142),
+    ("Store", "Clothing", "Returning", "Abandoned", 15),
+    ("Store", "Home", "New", "Purchased", 56),
+    ("Store", "Home", "New", "Abandoned", 21),
+    ("Store", "Home", "Returning", "Purchased", 78),
+    ("Store", "Home", "Returning", "Abandoned", 12),
+    ("Mobile", "Electronics", "New", "Purchased", 89),
+    ("Mobile", "Electronics", "New", "Abandoned", 112),
+    ("Mobile", "Electronics", "Returning", "Purchased", 134),
+    ("Mobile", "Electronics", "Returning", "Abandoned", 67),
+    ("Mobile", "Clothing", "New", "Purchased", 76),
+    ("Mobile", "Clothing", "New", "Abandoned", 94),
+    ("Mobile", "Clothing", "Returning", "Purchased", 118),
+    ("Mobile", "Clothing", "Returning", "Abandoned", 52),
+    ("Mobile", "Home", "New", "Purchased", 45),
+    ("Mobile", "Home", "New", "Abandoned", 58),
+    ("Mobile", "Home", "Returning", "Purchased", 67),
+    ("Mobile", "Home", "Returning", "Abandoned", 34),
+]
+
+# One row per path; "count" is the only numeric column
+path_counts = pd.DataFrame(path_data, columns=path_columns)
+
+# Axes of the diagram, listed left to right. The category order inside each
+# axis is the top-to-bottom stacking order of its nodes (ordered to minimize
+# ribbon crossings).
+dimensions = [
+    {"name": name, "label": label, "categories": categories}
+    for name, label, categories in (
+        ("channel", "Channel", ["Online", "Store", "Mobile"]),
+        ("product", "Product", ["Electronics", "Clothing", "Home"]),
+        ("customer_type", "Customer", ["Returning", "New"]),
+        ("outcome", "Outcome", ["Purchased", "Abandoned"]),
+    )
+]
+
+# Ribbon colors keyed by outcome - Yellow for purchased, Python Blue for abandoned
+outcome_colors = dict(Purchased="#FFD43B", Abandoned="#306998")
+
+# Layout parameters (all in the plot's 0..1 coordinate space)
+n_dims = len(dimensions)
+x_positions = np.linspace(0.1, 0.9, num=n_dims)  # one evenly spaced column per dimension
+node_width, node_gap = 0.04, 0.03  # rectangle width; vertical gap between stacked nodes
+total_height, y_start = 0.82, 0.92  # summed node height per column; top edge of the first node
+
+# Calculate node positions for each dimension.
+# node_positions maps (dimension index, category) to that node's geometry plus
+# two running offsets that are advanced later, when ribbons are stacked along
+# the node's right ("out") and left ("in") edges.
+node_positions = {}
+for dim_idx, dim in enumerate(dimensions):
+    # Total count per category; every dimension is aggregated the same way
+    totals = path_counts.groupby(dim["name"])["count"].sum()
+    grand_total = totals.sum()
+
+    # Stack the nodes downward from y_start, in the declared category order
+    current_y = y_start
+    for cat in dim["categories"]:
+        # A declared category with no rows in the data gets a zero-height node
+        count = totals.get(cat, 0)
+        height = (count / grand_total) * total_height if grand_total > 0 else 0
+
+        node_positions[(dim_idx, cat)] = {
+            "x": x_positions[dim_idx],
+            "y_top": current_y,
+            "y_bottom": current_y - height,
+            "height": height,
+            "count": count,
+            "flow_offset_out": 0,  # For outgoing flows (right side)
+            "flow_offset_in": 0,  # For incoming flows (left side)
+        }
+        current_y = current_y - height - node_gap
+
+# Build node rectangles dataframe: one row per (dimension, category) node,
+# holding the rectangle corners centered on the node's x position, the vertical
+# midpoint used for in-node labels, and the node's total count.
+nodes_df = pd.DataFrame(
+    [
+        {
+            "dim_idx": dim_idx,
+            "category": cat,
+            "xmin": pos["x"] - node_width / 2,
+            "xmax": pos["x"] + node_width / 2,
+            "ymin": pos["y_bottom"],
+            "ymax": pos["y_top"],
+            "label_y": (pos["y_top"] + pos["y_bottom"]) / 2,
+            "count": pos["count"],
+            "display_label": str(cat),
+            # Only outcome categories have an entry in outcome_colors; others fall back to gray
+            "fill_color": outcome_colors.get(cat, "#888888"),
+        }
+        for (dim_idx, cat), pos in node_positions.items()
+    ]
+)
+
+# Build flow polygons between adjacent dimensions.
+# Every path row yields one ribbon per adjacent pair of dimensions. Ribbons are
+# stacked top-down along each node edge in path_counts row order, tracked by the
+# node's running "flow_offset_out" / "flow_offset_in" values.
+dim_names = [dim["name"] for dim in dimensions]
+
+# Smooth cubic easing (3t^2 - 2t^3), sampled once because it is the same for every ribbon edge
+n_points = 30
+t_param = np.linspace(0, 1, n_points)
+ease = 3 * t_param**2 - 2 * t_param**3
+
+flow_polygons = []
+flow_id_counter = 0
+
+for _, path_row in path_counts.iterrows():
+    # Category taken by this path in each dimension, in left-to-right order
+    path_values = [path_row[name] for name in dim_names]
+    count = path_row["count"]
+    outcome = path_row["outcome"]
+
+    # Draw flows between each adjacent pair of dimensions
+    for dim_idx in range(n_dims - 1):
+        src_pos = node_positions[(dim_idx, path_values[dim_idx])]
+        tgt_pos = node_positions[(dim_idx + 1, path_values[dim_idx + 1])]
+
+        # Ribbon thickness is this path's share of each node. The node's stored
+        # "count" is already the total over all paths through that category, so
+        # there is no need to re-filter path_counts for every ribbon.
+        src_total = src_pos["count"]
+        flow_height_src = (count / src_total) * src_pos["height"] if src_total > 0 else 0
+
+        tgt_total = tgt_pos["count"]
+        flow_height_tgt = (count / tgt_total) * tgt_pos["height"] if tgt_total > 0 else 0
+
+        # Source connection point (right side of node)
+        src_y_top = src_pos["y_top"] - src_pos["flow_offset_out"]
+        src_y_bottom = src_y_top - flow_height_src
+        src_pos["flow_offset_out"] += flow_height_src
+
+        # Target connection point (left side of node)
+        tgt_y_top = tgt_pos["y_top"] - tgt_pos["flow_offset_in"]
+        tgt_y_bottom = tgt_y_top - flow_height_tgt
+        tgt_pos["flow_offset_in"] += flow_height_tgt
+
+        # Ribbon spans the gap between the facing edges of the two node rectangles
+        flow_x_left = x_positions[dim_idx] + node_width / 2
+        flow_x_right = x_positions[dim_idx + 1] - node_width / 2
+
+        # Top edge runs left-to-right, bottom edge right-to-left, so the
+        # concatenated points trace the outline in one continuous direction
+        x_top = flow_x_left + (flow_x_right - flow_x_left) * t_param
+        y_top = src_y_top + (tgt_y_top - src_y_top) * ease
+
+        x_bottom = flow_x_right + (flow_x_left - flow_x_right) * t_param
+        y_bottom = tgt_y_bottom + (src_y_bottom - tgt_y_bottom) * ease
+
+        # Combine into polygon
+        x_polygon = np.concatenate([x_top, x_bottom])
+        y_polygon = np.concatenate([y_top, y_bottom])
+
+        flow_id = f"flow_{flow_id_counter}"
+        flow_id_counter += 1
+
+        # One long-format row per vertex; flow_id groups vertices into a polygon
+        flow_polygons.extend(
+            {"x": x, "y": y, "flow_id": flow_id, "outcome": outcome} for x, y in zip(x_polygon, y_polygon)
+        )
+
+flows_df = pd.DataFrame(flow_polygons)
+
+# Counts are printed only on nodes with at least 20 observations. The label
+# x-coordinate is stored as a column of the filtered frame itself: a Series
+# computed from the full nodes_df and handed to aes() would stop matching the
+# layer's rows as soon as the filter dropped a node.
+count_labels_df = nodes_df[nodes_df["count"] >= 20].copy()
+count_labels_df["label_x"] = (count_labels_df["xmin"] + count_labels_df["xmax"]) / 2
+
+# Create the plot
+plot = (
+    ggplot()
+    # Flow polygons with transparency - colored by outcome
+    + geom_polygon(flows_df, aes(x="x", y="y", group="flow_id", fill="outcome"), alpha=0.5)
+    # Node rectangles - use neutral gray for all nodes
+    + geom_rect(
+        nodes_df, aes(xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax"), fill="#555555", color="white", size=0.8
+    )
+    # Count labels centered on the nodes
+    + geom_text(
+        count_labels_df,
+        aes(x="label_x", y="label_y", label="count"),
+        ha="center",
+        va="center",
+        size=10,
+        color="white",
+        fontweight="bold",
+    )
+    + scale_fill_manual(values=outcome_colors, name="Outcome", breaks=["Purchased", "Abandoned"])
+    + labs(title="parallel-categories-basic · plotnine · pyplots.ai", x="", y="")
+    # Fixed 0..1 canvas: all node, ribbon and label coordinates are laid out in this space
+    + coord_cartesian(xlim=(0, 1), ylim=(-0.02, 1.02))
+    + theme_minimal()
+    + theme(
+        figure_size=(16, 9),
+        plot_title=element_text(size=24, ha="center", weight="bold"),
+        # Axes carry no meaning in this diagram, so hide text, ticks and grid
+        axis_text=element_blank(),
+        axis_ticks=element_blank(),
+        panel_grid=element_blank(),
+        legend_title=element_text(size=16, weight="bold"),
+        legend_text=element_text(size=14),
+        legend_position="right",
+    )
+)
+
+# Add dimension labels at top, one heading centered above each column of nodes
+for x_center, dim in zip(x_positions, dimensions):
+    plot = plot + annotate(
+        "text", x=x_center, y=0.98, label=dim["label"], size=14, color="#333333", fontweight="bold", ha="center"
+    )
+
+# Add category labels beside each node (all dimensions)
+for (dim_idx, cat), pos in node_positions.items():
+    node_x = x_positions[dim_idx]
+    mid_y = (pos["y_top"] + pos["y_bottom"]) / 2
+
+    if dim_idx == 0:
+        # First dimension: label sits to the left of the node
+        placement = {"x": node_x - node_width / 2 - 0.01, "y": mid_y, "size": 10, "ha": "right", "va": "center"}
+    elif dim_idx == n_dims - 1:
+        # Last dimension: label sits to the right of the node
+        placement = {"x": node_x + node_width / 2 + 0.01, "y": mid_y, "size": 10, "ha": "left", "va": "center"}
+    else:
+        # Middle dimensions: label sits just below the node, slightly smaller
+        placement = {"x": node_x, "y": pos["y_bottom"] - 0.015, "size": 9, "ha": "center", "va": "top"}
+
+    plot = plot + annotate("text", label=str(cat), color="#333333", **placement)
+
+# Render the figure to plot.png (path is relative to the working directory) at 300 dpi
+plot.save("plot.png", dpi=300, verbose=False)
diff --git a/plots/parallel-categories-basic/metadata/plotnine.yaml b/plots/parallel-categories-basic/metadata/plotnine.yaml
@@ -0,0 +1,27 @@
+library: plotnine
+specification_id: parallel-categories-basic
+created: '2025-12-30T21:54:08Z'
+updated: '2025-12-30T22:02:46Z'
+generated_by: claude-opus-4-5-20251101
+workflow_run: 20606634555
+issue: 0
+python_version: 3.13.11
+library_version: 0.15.2
+preview_url: https://storage.googleapis.com/pyplots-images/plots/parallel-categories-basic/plotnine/plot.png
+preview_thumb: https://storage.googleapis.com/pyplots-images/plots/parallel-categories-basic/plotnine/plot_thumb.png
+preview_html: null
+quality_score: 90
+review:
+  strengths:
+  - Creative implementation of parallel categories using plotnine basic geoms (geom_polygon,
+    geom_rect)
+  - Smooth cubic interpolation for ribbon curves creates professional appearance
+  - Clear visual distinction between outcomes with yellow/blue color scheme
+  - Effective use of transparency (alpha=0.5) for overlapping ribbons
+  - Well-organized data structure with explicit path counts
+  - Category labels positioned intelligently based on dimension position
+  - Counts displayed inside nodes for quantitative reference
+  weaknesses:
+  - Middle dimension category labels positioned below nodes are slightly smaller (9pt)
+    and could be harder to read
+  - Some imports may be unused (coord_cartesian could be replaced with theme settings)