|
| 1 | +""" pyplots.ai |
| 2 | +ma-differential-expression: MA Plot for Differential Expression |
| 3 | +Library: seaborn 0.13.2 | Python 3.14.3 |
| 4 | +Quality: 94/100 | Created: 2026-03-20 |
| 5 | +""" |
| 6 | + |
| 7 | +import matplotlib.pyplot as plt |
| 8 | +import numpy as np |
| 9 | +import pandas as pd |
| 10 | +import seaborn as sns |
| 11 | + |
| 12 | + |
| 13 | +# Seaborn theme and style |
| 14 | +sns.set_theme(style="ticks", context="talk", font_scale=1.1) |
| 15 | +palette = sns.color_palette("colorblind") |
| 16 | +TEAL = palette[0] # colorblind-safe blue/teal |
| 17 | +GRAY = "#C0C0C0" |
| 18 | +ACCENT = palette[2] # green for LOESS |
| 19 | +DARK = "#2C3E50" |
| 20 | +THRESHOLD_COLOR = palette[4] # muted purple for threshold lines |
| 21 | + |
| 22 | +# Data - Simulated RNA-seq differential expression results (~15,000 genes) |
| 23 | +np.random.seed(42) |
| 24 | +n_genes = 15000 |
| 25 | + |
| 26 | +mean_expression = np.random.exponential(scale=3, size=n_genes) + 1 |
| 27 | +log_fold_change = np.random.normal(0, 0.5, n_genes) |
| 28 | + |
| 29 | +# Add truly differentially expressed genes (~8% of genes) |
| 30 | +n_de = int(n_genes * 0.08) |
| 31 | +de_indices = np.random.choice(n_genes, n_de, replace=False) |
| 32 | +log_fold_change[de_indices] += np.random.choice([-1, 1], n_de) * np.random.uniform(1.5, 4, n_de) |
| 33 | + |
| 34 | +# Simulate p-values: DE genes get small p-values, others get uniform |
| 35 | +p_values = np.ones(n_genes) |
| 36 | +p_values[de_indices] = 10 ** (-np.random.uniform(2, 10, n_de)) |
| 37 | +p_values[~np.isin(np.arange(n_genes), de_indices)] = np.random.uniform(0.01, 1.0, n_genes - n_de) |
| 38 | + |
| 39 | +significant = p_values < 0.05 |
| 40 | + |
| 41 | +# Categorize genes for seaborn hue mapping |
| 42 | +status = np.where( |
| 43 | + ~significant, |
| 44 | + "Not significant", |
| 45 | + np.where(log_fold_change > 1, "Up-regulated", np.where(log_fold_change < -1, "Down-regulated", "Significant")), |
| 46 | +) |
| 47 | + |
| 48 | +# Gene names for top hits - spread across expression range for better storytelling |
| 49 | +gene_names = [None] * n_genes |
| 50 | +top_gene_labels = ["BRCA1", "TP53", "MYC", "EGFR", "VEGFA", "IL6"] |
| 51 | +sig_de_mask = significant & (np.abs(log_fold_change) > 1) |
| 52 | +sig_de_indices = np.where(sig_de_mask)[0] |
| 53 | +sig_de_expr = mean_expression[sig_de_indices] |
| 54 | +sig_de_abs_lfc = np.abs(log_fold_change[sig_de_indices]) |
| 55 | + |
| 56 | +# Use evenly-spaced expression bins across full range to spread labels |
| 57 | +expr_min, expr_max = sig_de_expr.min(), sig_de_expr.max() |
| 58 | +n_labels = len(top_gene_labels) |
| 59 | +expr_edges = np.linspace(expr_min, expr_max + 0.01, n_labels + 1) |
| 60 | +top_sig = [] |
| 61 | +for b in range(n_labels): |
| 62 | + in_bin = (sig_de_expr >= expr_edges[b]) & (sig_de_expr < expr_edges[b + 1]) |
| 63 | + if not np.any(in_bin): |
| 64 | + continue |
| 65 | + bin_idx = np.where(in_bin)[0] |
| 66 | + best = bin_idx[np.argmax(sig_de_abs_lfc[bin_idx])] |
| 67 | + top_sig.append(sig_de_indices[best]) |
| 68 | + |
| 69 | +for i, idx in enumerate(top_sig[:n_labels]): |
| 70 | + gene_names[idx] = top_gene_labels[i] |
| 71 | + |
| 72 | +df = pd.DataFrame( |
| 73 | + { |
| 74 | + "Mean Expression (A)": mean_expression, |
| 75 | + "Log₂ Fold Change (M)": log_fold_change, |
| 76 | + "Status": pd.Categorical( |
| 77 | + status, categories=["Not significant", "Significant", "Up-regulated", "Down-regulated"] |
| 78 | + ), |
| 79 | + "gene_name": gene_names, |
| 80 | + } |
| 81 | +) |
| 82 | + |
| 83 | +# Color palette for categories - colorblind-safe |
| 84 | +status_palette = { |
| 85 | + "Not significant": GRAY, |
| 86 | + "Significant": sns.desaturate(TEAL, 0.6), |
| 87 | + "Up-regulated": TEAL, |
| 88 | + "Down-regulated": palette[3], # red from colorblind palette |
| 89 | +} |
| 90 | + |
| 91 | +# Plot using seaborn's hue-based scatter |
| 92 | +fig, ax = plt.subplots(figsize=(16, 9)) |
| 93 | + |
| 94 | +sns.scatterplot( |
| 95 | + data=df, |
| 96 | + x="Mean Expression (A)", |
| 97 | + y="Log₂ Fold Change (M)", |
| 98 | + hue="Status", |
| 99 | + hue_order=["Not significant", "Significant", "Up-regulated", "Down-regulated"], |
| 100 | + palette=status_palette, |
| 101 | + size="Status", |
| 102 | + sizes={"Not significant": 20, "Significant": 40, "Up-regulated": 55, "Down-regulated": 55}, |
| 103 | + alpha=0.3, |
| 104 | + edgecolor="none", |
| 105 | + legend="full", |
| 106 | + ax=ax, |
| 107 | +) |
| 108 | + |
| 109 | +# Emphasize up/down-regulated with white edges via a second scatter layer |
| 110 | +de_data = df[df["Status"].isin(["Up-regulated", "Down-regulated"])] |
| 111 | +sns.scatterplot( |
| 112 | + data=de_data, |
| 113 | + x="Mean Expression (A)", |
| 114 | + y="Log₂ Fold Change (M)", |
| 115 | + hue="Status", |
| 116 | + hue_order=["Up-regulated", "Down-regulated"], |
| 117 | + palette={"Up-regulated": TEAL, "Down-regulated": palette[3]}, |
| 118 | + s=55, |
| 119 | + alpha=0.6, |
| 120 | + edgecolor="white", |
| 121 | + linewidth=0.4, |
| 122 | + legend=False, |
| 123 | + ax=ax, |
| 124 | +) |
| 125 | + |
| 126 | +# Reference lines |
| 127 | +ax.axhline(y=0, color=DARK, linewidth=2, alpha=0.5) |
| 128 | +ax.axhline(y=1, color=THRESHOLD_COLOR, linewidth=1.5, linestyle="--", alpha=0.7) |
| 129 | +ax.axhline(y=-1, color=THRESHOLD_COLOR, linewidth=1.5, linestyle="--", alpha=0.7) |
| 130 | +ax.text(ax.get_xlim()[1] * 0.97, 1.12, "2-fold ↑", fontsize=12, color=THRESHOLD_COLOR, ha="right", fontstyle="italic") |
| 131 | +ax.text(ax.get_xlim()[1] * 0.97, -1.35, "2-fold ↓", fontsize=12, color=THRESHOLD_COLOR, ha="right", fontstyle="italic") |
| 132 | + |
| 133 | +# LOESS smoothing curve using seaborn regplot with lowess |
| 134 | +sns.regplot( |
| 135 | + data=df, |
| 136 | + x="Mean Expression (A)", |
| 137 | + y="Log₂ Fold Change (M)", |
| 138 | + lowess=True, |
| 139 | + scatter=False, |
| 140 | + line_kws={"color": ACCENT, "linewidth": 3, "alpha": 0.85, "label": "LOESS trend"}, |
| 141 | + ax=ax, |
| 142 | +) |
| 143 | + |
| 144 | +# Label top differentially expressed genes - spread across expression range |
| 145 | +labeled = df[df["gene_name"].notna()].copy() |
| 146 | +label_positions = [] |
| 147 | +for _, row in labeled.iterrows(): |
| 148 | + x_val = row["Mean Expression (A)"] |
| 149 | + y_val = row["Log₂ Fold Change (M)"] |
| 150 | + # Place labels outward from the dense center: above for up-regulated, below for down |
| 151 | + y_off = -30 if y_val > 0 else 30 |
| 152 | + x_off = 25 if x_val < df["Mean Expression (A)"].median() else -25 |
| 153 | + # Avoid overlapping previously placed labels |
| 154 | + for px, py in label_positions: |
| 155 | + if abs(x_val - px) < 2 and abs(y_val - py) < 1: |
| 156 | + y_off = y_off + (35 if y_off > 0 else -35) |
| 157 | + break |
| 158 | + label_positions.append((x_val, y_val)) |
| 159 | + ax.annotate( |
| 160 | + row["gene_name"], |
| 161 | + xy=(x_val, y_val), |
| 162 | + xytext=(x_off, y_off), |
| 163 | + textcoords="offset points", |
| 164 | + fontsize=13, |
| 165 | + fontweight="bold", |
| 166 | + color=DARK, |
| 167 | + arrowprops={"arrowstyle": "->", "color": DARK, "lw": 1.2, "connectionstyle": "arc3,rad=0.2"}, |
| 168 | + bbox={"boxstyle": "round,pad=0.25", "facecolor": "white", "edgecolor": DARK, "alpha": 0.85, "linewidth": 0.8}, |
| 169 | + ) |
| 170 | + |
| 171 | +# Style - use seaborn's despine |
| 172 | +sns.despine(ax=ax) |
| 173 | +ax.set_xlabel("Mean Expression (A)", fontsize=20) |
| 174 | +ax.set_ylabel("Log₂ Fold Change (M)", fontsize=20) |
| 175 | +ax.set_title("ma-differential-expression · seaborn · pyplots.ai", fontsize=24, fontweight="medium", pad=15) |
| 176 | +ax.tick_params(axis="both", labelsize=16) |
| 177 | +ax.yaxis.grid(True, alpha=0.12, linewidth=0.8, color="#888888") |
| 178 | + |
| 179 | +# Refine legend with seaborn |
| 180 | +sns.move_legend(ax, "upper right", fontsize=13, framealpha=0.92, title="Gene Status", title_fontsize=14) |
| 181 | + |
| 182 | +plt.tight_layout() |
| 183 | +plt.savefig("plot.png", dpi=300, bbox_inches="tight") |
0 commit comments