|
| 1 | +""" pyplots.ai |
| 2 | +manhattan-gwas: Manhattan Plot for GWAS |
| 3 | +Library: seaborn 0.13.2 | Python 3.13.11 |
| 4 | +Quality: 92/100 | Created: 2025-12-31 |
| 5 | +""" |
| 6 | + |
| 7 | +import matplotlib.pyplot as plt |
| 8 | +import numpy as np |
| 9 | +import pandas as pd |
| 10 | +import seaborn as sns |
| 11 | + |
| 12 | + |
| 13 | +# Data - Simulate GWAS data with realistic structure |
| 14 | +np.random.seed(42) |
| 15 | + |
| 16 | +# Define chromosomes with approximate sizes (in Mb) |
| 17 | +chromosomes = [str(i) for i in range(1, 23)] |
| 18 | +chrom_sizes = [250, 243, 198, 190, 182, 171, 159, 145, 138, 133, 135, 133, 114, 107, 102, 90, 83, 80, 59, 64, 47, 51] |
| 19 | + |
| 20 | +# Generate SNPs for each chromosome |
| 21 | +data = [] |
| 22 | +cumulative_pos = 0 |
| 23 | +chrom_centers = {} |
| 24 | +chrom_boundaries = [0] |
| 25 | + |
| 26 | +for chrom, size in zip(chromosomes, chrom_sizes, strict=True): |
| 27 | + # Number of SNPs proportional to chromosome size |
| 28 | + n_snps = int(size * 40) # ~40 SNPs per Mb, total ~10k SNPs |
| 29 | + |
| 30 | + # Random positions along chromosome |
| 31 | + positions = np.sort(np.random.randint(0, size * 1_000_000, n_snps)) |
| 32 | + |
| 33 | + # Generate p-values - mostly non-significant with some peaks |
| 34 | + # Use beta distribution to get realistic p-value distribution |
| 35 | + p_values = np.random.beta(1, 1, n_snps) |
| 36 | + |
| 37 | + # Add significant peaks on specific chromosomes |
| 38 | + if chrom == "6": # Major peak on chr6 (like MHC region) |
| 39 | + peak_region = (positions > 25_000_000) & (positions < 35_000_000) |
| 40 | + p_values[peak_region] = 10 ** (-np.random.uniform(7, 12, peak_region.sum())) |
| 41 | + elif chrom == "11": # Moderate peak |
| 42 | + peak_region = (positions > 60_000_000) & (positions < 70_000_000) |
| 43 | + p_values[peak_region] = 10 ** (-np.random.uniform(6, 9, peak_region.sum())) |
| 44 | + elif chrom == "2": # Smaller peak |
| 45 | + peak_region = (positions > 100_000_000) & (positions < 110_000_000) |
| 46 | + p_values[peak_region] = 10 ** (-np.random.uniform(5.5, 8, peak_region.sum())) |
| 47 | + |
| 48 | + # Calculate cumulative position |
| 49 | + cumulative_positions = positions + cumulative_pos |
| 50 | + |
| 51 | + # Store center for axis label |
| 52 | + chrom_centers[chrom] = cumulative_pos + (size * 1_000_000) / 2 |
| 53 | + |
| 54 | + for pos, cum_pos, pval in zip(positions, cumulative_positions, p_values, strict=True): |
| 55 | + data.append( |
| 56 | + { |
| 57 | + "chromosome": chrom, |
| 58 | + "position": pos, |
| 59 | + "cumulative_position": cum_pos, |
| 60 | + "p_value": pval, |
| 61 | + "neg_log_p": -np.log10(pval), |
| 62 | + } |
| 63 | + ) |
| 64 | + |
| 65 | + cumulative_pos += size * 1_000_000 |
| 66 | + chrom_boundaries.append(cumulative_pos) |
| 67 | + |
| 68 | +df = pd.DataFrame(data) |
| 69 | + |
| 70 | +# Create alternating color groups for chromosomes |
| 71 | +df["color_group"] = df["chromosome"].apply(lambda x: int(x) % 2) |
| 72 | + |
| 73 | +# Plot |
| 74 | +fig, ax = plt.subplots(figsize=(16, 9)) |
| 75 | + |
| 76 | +# Create custom palette - Python Blue and a muted gray |
| 77 | +palette = {0: "#306998", 1: "#8B9DC3"} |
| 78 | + |
| 79 | +# Plot points using seaborn scatterplot with hue for alternating colors |
| 80 | +sns.scatterplot( |
| 81 | + data=df, |
| 82 | + x="cumulative_position", |
| 83 | + y="neg_log_p", |
| 84 | + hue="color_group", |
| 85 | + palette=palette, |
| 86 | + s=15, |
| 87 | + alpha=0.7, |
| 88 | + edgecolor="none", |
| 89 | + legend=False, |
| 90 | + ax=ax, |
| 91 | +) |
| 92 | + |
| 93 | +# Highlight significant SNPs (above genome-wide threshold) |
| 94 | +significant = df[df["neg_log_p"] > 7.3] |
| 95 | +if len(significant) > 0: |
| 96 | + ax.scatter( |
| 97 | + significant["cumulative_position"], |
| 98 | + significant["neg_log_p"], |
| 99 | + c="#FFD43B", # Python Yellow for significant hits |
| 100 | + s=40, |
| 101 | + edgecolor="black", |
| 102 | + linewidth=0.5, |
| 103 | + zorder=5, |
| 104 | + ) |
| 105 | + |
| 106 | +# Genome-wide significance threshold |
| 107 | +ax.axhline(y=7.3, color="#D62728", linestyle="--", linewidth=2, label="Genome-wide significance (p < 5×10⁻⁸)") |
| 108 | + |
| 109 | +# Suggestive threshold |
| 110 | +ax.axhline(y=5, color="#7F7F7F", linestyle=":", linewidth=1.5, label="Suggestive (p < 1×10⁻⁵)") |
| 111 | + |
| 112 | +# Set x-axis ticks at chromosome centers |
| 113 | +ax.set_xticks([chrom_centers[c] for c in chromosomes]) |
| 114 | +ax.set_xticklabels(chromosomes) |
| 115 | + |
| 116 | +# Styling |
| 117 | +ax.set_xlabel("Chromosome", fontsize=20) |
| 118 | +ax.set_ylabel("-log₁₀(p-value)", fontsize=20) |
| 119 | +ax.set_title("manhattan-gwas · seaborn · pyplots.ai", fontsize=24) |
| 120 | +ax.tick_params(axis="both", labelsize=14) |
| 121 | +ax.tick_params(axis="x", labelsize=12) |
| 122 | + |
| 123 | +# Set axis limits |
| 124 | +ax.set_xlim(0, cumulative_pos) |
| 125 | +ax.set_ylim(0, max(df["neg_log_p"]) * 1.05) |
| 126 | + |
| 127 | +# Remove top and right spines for cleaner look |
| 128 | +ax.spines["top"].set_visible(False) |
| 129 | +ax.spines["right"].set_visible(False) |
| 130 | + |
| 131 | +# Add legend |
| 132 | +ax.legend(loc="upper right", fontsize=14, framealpha=0.9) |
| 133 | + |
| 134 | +# Subtle grid on y-axis only |
| 135 | +ax.yaxis.grid(True, alpha=0.3, linestyle="-") |
| 136 | +ax.xaxis.grid(False) |
| 137 | + |
| 138 | +plt.tight_layout() |
| 139 | +plt.savefig("plot.png", dpi=300, bbox_inches="tight") |
0 commit comments