|
| 1 | +""" pyplots.ai |
| 2 | +manhattan-gwas: Manhattan Plot for GWAS |
| 3 | +Library: plotnine 0.15.2 | Python 3.13.11 |
| 4 | +Quality: 91/100 | Created: 2025-12-31 |
| 5 | +""" |
| 6 | + |
| 7 | +import numpy as np |
| 8 | +import pandas as pd |
| 9 | +from plotnine import ( |
| 10 | + aes, |
| 11 | + element_blank, |
| 12 | + element_text, |
| 13 | + geom_hline, |
| 14 | + geom_point, |
| 15 | + ggplot, |
| 16 | + labs, |
| 17 | + scale_color_manual, |
| 18 | + scale_x_continuous, |
| 19 | + scale_y_continuous, |
| 20 | + theme, |
| 21 | + theme_minimal, |
| 22 | +) |
| 23 | + |
| 24 | + |
| 25 | +# Data - Simulated GWAS results with significant peaks |
| 26 | +np.random.seed(42) |
| 27 | + |
| 28 | +# Chromosome sizes (approximate in Mb, scaled for visualization) |
| 29 | +chr_sizes = { |
| 30 | + "1": 249, |
| 31 | + "2": 243, |
| 32 | + "3": 198, |
| 33 | + "4": 191, |
| 34 | + "5": 182, |
| 35 | + "6": 171, |
| 36 | + "7": 159, |
| 37 | + "8": 145, |
| 38 | + "9": 138, |
| 39 | + "10": 134, |
| 40 | + "11": 135, |
| 41 | + "12": 133, |
| 42 | + "13": 115, |
| 43 | + "14": 107, |
| 44 | + "15": 102, |
| 45 | + "16": 90, |
| 46 | + "17": 83, |
| 47 | + "18": 80, |
| 48 | + "19": 59, |
| 49 | + "20": 64, |
| 50 | + "21": 47, |
| 51 | + "22": 51, |
| 52 | +} |
| 53 | + |
| 54 | +# Generate SNPs per chromosome (proportional to size) |
| 55 | +chromosomes = list(chr_sizes.keys()) |
| 56 | +snp_data = [] |
| 57 | + |
| 58 | +# Cumulative positions for x-axis |
| 59 | +cumulative_offset = 0 |
| 60 | +chr_offsets = {} |
| 61 | +chr_midpoints = {} |
| 62 | + |
| 63 | +for chrom in chromosomes: |
| 64 | + chr_offsets[chrom] = cumulative_offset |
| 65 | + size = chr_sizes[chrom] |
| 66 | + n_snps = int(size * 40) # ~40 SNPs per Mb = ~8000 total |
| 67 | + |
| 68 | + # Generate positions |
| 69 | + positions = np.sort(np.random.uniform(0, size * 1e6, n_snps)) |
| 70 | + |
| 71 | + # Generate p-values (mostly non-significant, with some peaks) |
| 72 | + p_values = np.random.uniform(0.001, 1, n_snps) |
| 73 | + |
| 74 | + # Add significant peaks on some chromosomes |
| 75 | + if chrom in ["2", "6", "11", "17"]: |
| 76 | + # Add 20-40 highly significant SNPs |
| 77 | + n_sig = np.random.randint(20, 40) |
| 78 | + peak_idx = np.random.choice(n_snps, n_sig, replace=False) |
| 79 | + p_values[peak_idx] = 10 ** np.random.uniform(-10, -7.3, n_sig) |
| 80 | + |
| 81 | + # Add some suggestive signals on other chromosomes |
| 82 | + if chrom in ["4", "9", "15", "20"]: |
| 83 | + n_sug = np.random.randint(10, 20) |
| 84 | + sug_idx = np.random.choice(n_snps, n_sug, replace=False) |
| 85 | + p_values[sug_idx] = 10 ** np.random.uniform(-7, -5, n_sug) |
| 86 | + |
| 87 | + # Calculate cumulative position |
| 88 | + cumulative_positions = positions + cumulative_offset |
| 89 | + |
| 90 | + chr_midpoints[chrom] = cumulative_offset + (size * 1e6) / 2 |
| 91 | + |
| 92 | + for i in range(n_snps): |
| 93 | + snp_data.append( |
| 94 | + { |
| 95 | + "chromosome": chrom, |
| 96 | + "position": positions[i], |
| 97 | + "cumulative_pos": cumulative_positions[i], |
| 98 | + "p_value": p_values[i], |
| 99 | + } |
| 100 | + ) |
| 101 | + |
| 102 | + cumulative_offset += size * 1e6 |
| 103 | + |
| 104 | +# Create DataFrame |
| 105 | +df = pd.DataFrame(snp_data) |
| 106 | + |
| 107 | +# Calculate -log10(p-value) |
| 108 | +df["neg_log_p"] = -np.log10(df["p_value"]) |
| 109 | + |
| 110 | +# Assign alternating colors based on chromosome |
| 111 | +chr_order = {c: i for i, c in enumerate(chromosomes)} |
| 112 | +df["chr_num"] = df["chromosome"].map(chr_order) |
| 113 | +df["color_group"] = df["chr_num"].apply(lambda x: "odd" if x % 2 == 0 else "even") |
| 114 | + |
| 115 | +# Threshold lines |
| 116 | +genome_wide_threshold = -np.log10(5e-8) # ~7.3 |
| 117 | +suggestive_threshold = -np.log10(1e-5) # 5 |
| 118 | + |
| 119 | +# Identify top SNPs for potential labeling (above genome-wide significance) |
| 120 | +df["significant"] = df["neg_log_p"] > genome_wide_threshold |
| 121 | + |
| 122 | +# Create chromosome tick positions and labels |
| 123 | +chr_ticks = [chr_midpoints[c] / 1e6 for c in chromosomes] # Convert to Mb for display |
| 124 | +chr_labels = chromosomes |
| 125 | + |
| 126 | +# Scale positions to Mb for cleaner axis |
| 127 | +df["cumulative_pos_mb"] = df["cumulative_pos"] / 1e6 |
| 128 | + |
| 129 | +# Plot |
| 130 | +plot = ( |
| 131 | + ggplot(df, aes(x="cumulative_pos_mb", y="neg_log_p", color="color_group")) |
| 132 | + + geom_point(size=1.2, alpha=0.7, show_legend=False) |
| 133 | + + geom_hline(yintercept=genome_wide_threshold, linetype="dashed", color="#E31A1C", size=1) |
| 134 | + + geom_hline(yintercept=suggestive_threshold, linetype="dotted", color="#FF7F00", size=0.8) |
| 135 | + + scale_color_manual(values={"odd": "#306998", "even": "#636363"}) |
| 136 | + + scale_x_continuous(breaks=chr_ticks, labels=chr_labels) |
| 137 | + + scale_y_continuous(limits=(0, max(df["neg_log_p"]) * 1.05)) |
| 138 | + + labs(x="Chromosome", y="-log₁₀(p-value)", title="manhattan-gwas · plotnine · pyplots.ai") |
| 139 | + + theme_minimal() |
| 140 | + + theme( |
| 141 | + figure_size=(16, 9), |
| 142 | + plot_title=element_text(size=24, ha="center"), |
| 143 | + axis_title_x=element_text(size=20), |
| 144 | + axis_title_y=element_text(size=20), |
| 145 | + axis_text_x=element_text(size=12), |
| 146 | + axis_text_y=element_text(size=16), |
| 147 | + panel_grid_major_x=element_blank(), |
| 148 | + panel_grid_minor=element_blank(), |
| 149 | + ) |
| 150 | +) |
| 151 | + |
| 152 | +# Save |
| 153 | +plot.save("plot.png", dpi=300, width=16, height=9) |
0 commit comments