|
| 1 | +""" pyplots.ai |
| 2 | +ma-differential-expression: MA Plot for Differential Expression |
| 3 | +Library: pygal 3.1.0 | Python 3.14.3 |
| 4 | +Quality: 84/100 | Created: 2026-03-20 |
| 5 | +""" |
| 6 | + |
| 7 | +import numpy as np |
| 8 | +import pygal |
| 9 | +from pygal.style import Style |
| 10 | + |
| 11 | + |
| 12 | +# Data - Simulated RNA-seq differential expression results |
| 13 | +np.random.seed(42) |
| 14 | +n_genes = 15000 |
| 15 | + |
| 16 | +# Mean expression (A values) - log2 scale, typical RNA-seq range |
| 17 | +mean_expression = np.random.exponential(scale=3, size=n_genes) + 1 |
| 18 | + |
| 19 | +# Log fold change (M values) - most genes near zero, some truly differentially expressed |
| 20 | +log_fold_change = np.random.normal(0, 0.3, n_genes) |
| 21 | + |
| 22 | +# Add truly differentially expressed genes (~8% upregulated, ~7% downregulated) |
| 23 | +n_up = 1200 |
| 24 | +n_down = 1050 |
| 25 | +up_idx = np.random.choice(n_genes, n_up, replace=False) |
| 26 | +remaining = np.setdiff1d(np.arange(n_genes), up_idx) |
| 27 | +down_idx = np.random.choice(remaining, n_down, replace=False) |
| 28 | + |
| 29 | +log_fold_change[up_idx] = np.random.normal(2.5, 0.8, n_up) |
| 30 | +log_fold_change[down_idx] = np.random.normal(-2.2, 0.7, n_down) |
| 31 | + |
| 32 | +# Simulate p-values (significant for DE genes, noise for others) |
| 33 | +p_values = np.ones(n_genes) |
| 34 | +p_values[up_idx] = 10 ** (-np.random.uniform(2, 10, n_up)) |
| 35 | +p_values[down_idx] = 10 ** (-np.random.uniform(2, 10, n_down)) |
| 36 | +noise_idx = np.setdiff1d(np.arange(n_genes), np.concatenate([up_idx, down_idx])) |
| 37 | +p_values[noise_idx] = np.random.uniform(0.01, 1.0, len(noise_idx)) |
| 38 | + |
| 39 | +# Significance threshold |
| 40 | +significant = p_values < 0.05 |
| 41 | + |
| 42 | +# Gene names for top DE genes |
| 43 | +gene_names = [f"Gene{i}" for i in range(n_genes)] |
| 44 | +top_genes = ["BRCA1", "TP53", "MYC", "EGFR", "KRAS", "PTEN", "CDK2", "RB1", "AKT1", "VEGFA"] |
| 45 | +top_idx = np.argsort(p_values)[: len(top_genes)] |
| 46 | +for i, name in zip(top_idx, top_genes, strict=False): |
| 47 | + gene_names[i] = name |
| 48 | + |
| 49 | +# LOESS-like smoothing curve (binned moving average with additional smoothing) |
| 50 | +sort_order = np.argsort(mean_expression) |
| 51 | +sorted_a = mean_expression[sort_order] |
| 52 | +sorted_m = log_fold_change[sort_order] |
| 53 | + |
| 54 | +n_bins = 30 |
| 55 | +bin_edges = np.percentile(sorted_a, np.linspace(0, 100, n_bins + 1)) |
| 56 | +raw_x = [] |
| 57 | +raw_y = [] |
| 58 | +for b in range(n_bins): |
| 59 | + mask = (sorted_a >= bin_edges[b]) & (sorted_a < bin_edges[b + 1]) |
| 60 | + if mask.sum() > 20: |
| 61 | + raw_x.append(float(np.median(sorted_a[mask]))) |
| 62 | + raw_y.append(float(np.mean(sorted_m[mask]))) |
| 63 | + |
| 64 | +# Apply moving average smoothing |
| 65 | +smooth_y = np.array(raw_y) |
| 66 | +for _ in range(4): |
| 67 | + smoothed = np.copy(smooth_y) |
| 68 | + for j in range(1, len(smoothed) - 1): |
| 69 | + smoothed[j] = (smooth_y[j - 1] + smooth_y[j] + smooth_y[j + 1]) / 3 |
| 70 | + smooth_y = smoothed |
| 71 | +smooth_x = raw_x |
| 72 | + |
| 73 | +# Refined style - publication-quality aesthetics |
| 74 | +custom_style = Style( |
| 75 | + background="#FFFFFF", |
| 76 | + plot_background="#FAFBFC", |
| 77 | + foreground="#34495E", |
| 78 | + foreground_strong="#2C3E50", |
| 79 | + foreground_subtle="#E8ECF0", |
| 80 | + colors=( |
| 81 | + "#B0C4DE", # non-significant: light steel blue (muted background) |
| 82 | + "#E74C3C", # upregulated: vibrant red |
| 83 | + "#2980B9", # downregulated: ocean blue |
| 84 | + "#2C3E50", # M=0 line: charcoal |
| 85 | + "#7F8C8D", # +1 threshold: warm gray |
| 86 | + "#7F8C8D", # -1 threshold: warm gray |
| 87 | + "#E67E22", # LOESS curve: bold orange |
| 88 | + "#1A1A2E", # top DE genes: near-black |
| 89 | + ), |
| 90 | + title_font_size=52, |
| 91 | + label_font_size=40, |
| 92 | + major_label_font_size=38, |
| 93 | + legend_font_size=28, |
| 94 | + value_font_size=22, |
| 95 | + tooltip_font_size=26, |
| 96 | + stroke_width=2, |
| 97 | + opacity=0.6, |
| 98 | + opacity_hover=0.95, |
| 99 | +) |
| 100 | + |
| 101 | +# Subsample for performance (pygal renders each point as SVG element) |
| 102 | +np.random.seed(42) |
| 103 | +sig_mask = significant |
| 104 | +nonsig_mask = ~significant |
| 105 | + |
| 106 | +sig_indices = np.where(sig_mask)[0] |
| 107 | +nonsig_indices = np.where(nonsig_mask)[0] |
| 108 | +n_nonsig_display = 1800 |
| 109 | +nonsig_sample = np.random.choice(nonsig_indices, min(n_nonsig_display, len(nonsig_indices)), replace=False) |
| 110 | + |
| 111 | +# Split significant into up/down for visual storytelling |
| 112 | +sig_up_points = [] |
| 113 | +sig_down_points = [] |
| 114 | +nonsig_points = [] |
| 115 | + |
| 116 | +for i in nonsig_sample: |
| 117 | + nonsig_points.append( |
| 118 | + { |
| 119 | + "value": (round(float(mean_expression[i]), 2), round(float(log_fold_change[i]), 2)), |
| 120 | + "label": f"{gene_names[i]} | A={mean_expression[i]:.1f}, M={log_fold_change[i]:.2f}, p={p_values[i]:.2e}", |
| 121 | + } |
| 122 | + ) |
| 123 | + |
| 124 | +top_idx_set = set(top_idx.tolist()) |
| 125 | +for i in sig_indices: |
| 126 | + if i in top_idx_set: |
| 127 | + continue |
| 128 | + point = { |
| 129 | + "value": (round(float(mean_expression[i]), 2), round(float(log_fold_change[i]), 2)), |
| 130 | + "label": f"{gene_names[i]} | A={mean_expression[i]:.1f}, M={log_fold_change[i]:.2f}, p={p_values[i]:.2e}", |
| 131 | + } |
| 132 | + if log_fold_change[i] > 0: |
| 133 | + sig_up_points.append(point) |
| 134 | + else: |
| 135 | + sig_down_points.append(point) |
| 136 | + |
| 137 | +# Top 10 most significant genes with star markers in tooltips |
| 138 | +labeled_points = [] |
| 139 | +for i in top_idx: |
| 140 | + labeled_points.append( |
| 141 | + { |
| 142 | + "value": (round(float(mean_expression[i]), 2), round(float(log_fold_change[i]), 2)), |
| 143 | + "label": f"\u2605 {gene_names[i]} | A={mean_expression[i]:.1f}, M={log_fold_change[i]:.2f}, p={p_values[i]:.2e}", |
| 144 | + } |
| 145 | + ) |
| 146 | + |
| 147 | +# Reference line endpoints |
| 148 | +x_min = 0 |
| 149 | +x_max = float(np.percentile(mean_expression, 99.5)) |
| 150 | + |
| 151 | +# Create XY chart |
| 152 | +chart = pygal.XY( |
| 153 | + width=4800, |
| 154 | + height=2700, |
| 155 | + style=custom_style, |
| 156 | + title="ma-differential-expression \u00b7 pygal \u00b7 pyplots.ai", |
| 157 | + x_title="Mean Expression (A)", |
| 158 | + y_title="Log\u2082 Fold Change (M)", |
| 159 | + show_legend=True, |
| 160 | + legend_at_bottom=True, |
| 161 | + legend_at_bottom_columns=8, |
| 162 | + legend_box_size=22, |
| 163 | + dots_size=5, |
| 164 | + stroke=False, |
| 165 | + show_x_guides=False, |
| 166 | + show_y_guides=False, |
| 167 | + truncate_legend=-1, |
| 168 | + print_values=False, |
| 169 | + dynamic_print_values=True, |
| 170 | + js=[], |
| 171 | + x_label_rotation=0, |
| 172 | + margin_bottom=80, |
| 173 | +) |
| 174 | + |
| 175 | +# Non-significant genes - small muted dots for background density |
| 176 | +chart.add("Not Significant", nonsig_points, dots_size=4) |
| 177 | + |
| 178 | +# Upregulated significant - vibrant red |
| 179 | +chart.add("Upregulated (p<0.05)", sig_up_points, dots_size=7) |
| 180 | + |
| 181 | +# Downregulated significant - ocean blue |
| 182 | +chart.add("Downregulated (p<0.05)", sig_down_points, dots_size=7) |
| 183 | + |
| 184 | +# M = 0 line (no change) |
| 185 | +chart.add("M = 0", [(x_min, 0), (x_max, 0)], stroke=True, show_dots=False, stroke_style={"width": 5}) |
| 186 | + |
| 187 | +# M = +1 threshold (2-fold up) |
| 188 | +chart.add( |
| 189 | + "+2-fold", [(x_min, 1), (x_max, 1)], stroke=True, show_dots=False, stroke_style={"width": 3, "dasharray": "14, 8"} |
| 190 | +) |
| 191 | + |
| 192 | +# M = -1 threshold (2-fold down) |
| 193 | +chart.add( |
| 194 | + "\u22122-fold", |
| 195 | + [(x_min, -1), (x_max, -1)], |
| 196 | + stroke=True, |
| 197 | + show_dots=False, |
| 198 | + stroke_style={"width": 3, "dasharray": "14, 8"}, |
| 199 | +) |
| 200 | + |
| 201 | +# LOESS smoothing curve - bold orange |
| 202 | +chart.add( |
| 203 | + "LOESS trend", |
| 204 | + [(round(x, 2), round(y, 3)) for x, y in zip(smooth_x, smooth_y, strict=False)], |
| 205 | + stroke=True, |
| 206 | + show_dots=True, |
| 207 | + dots_size=5, |
| 208 | + stroke_style={"width": 7}, |
| 209 | +) |
| 210 | + |
| 211 | +# Top DE genes - large prominent dots |
| 212 | +chart.add("Top DE genes", labeled_points, dots_size=16, stroke=False) |
| 213 | + |
| 214 | +# Save |
| 215 | +chart.render_to_file("plot.html") |
| 216 | +chart.render_to_png("plot.png") |
0 commit comments