-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexample_gene_subset_analysis.py
More file actions
148 lines (120 loc) · 5.42 KB
/
example_gene_subset_analysis.py
File metadata and controls
148 lines (120 loc) · 5.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python3
"""
Example: Using Gene Subsetting in Stability Analysis
This script demonstrates how to use the new gene subsetting functionality
in gather_stability_columns to analyze only highly variable genes.
"""
from gather_stability_columns import gather_stability_columns, get_highly_variable_genes
from plot_stability_analysis import plot_multi_tool_correlation_vs_parameter
import logging
# Set up logging to see the gene subsetting process
logging.basicConfig(level=logging.INFO)
# ==============================================================================
# Method 1: Using highly variable genes from original dataset
# ==============================================================================
# Extract highly variable genes from the original aging dataset
# Replace this path with the actual path to your original aging dataset
original_aging_path = "path/to/original_aging_data.h5ad" # Update this path
try:
# Get highly variable genes from the original dataset
hvg_genes = get_highly_variable_genes(
original_aging_path,
hvg_column='highly_variable' # or 'highly_variable_genes' depending on your data
)
print(f"Found {len(hvg_genes)} highly variable genes")
# Example of first few genes
print(f"First 10 HVG genes: {hvg_genes[:10]}")
except Exception as e:
print(f"Could not load HVG genes from original dataset: {e}")
print("Using a mock list for demonstration purposes")
# Mock highly variable genes for demonstration
hvg_genes = ["GENE1", "GENE2", "GENE3"] # Replace with actual gene names
# ==============================================================================
# Method 2: Using a predefined gene list
# ==============================================================================
# You can also manually specify genes of interest
immune_genes = [
"CD3D", "CD3E", "CD3G", "CD4", "CD8A", "CD8B",
"IL2", "IL4", "IL10", "IFNG", "TNF", "CCL2"
]
# Choose which gene subset to use
selected_genes = hvg_genes # or immune_genes for immune-specific analysis
# ==============================================================================
# Gather data with gene subsetting
# ==============================================================================
print("\n=== Gathering stability data with gene subsetting ===")
# For DE comparison with gene subsetting
kompot_regular_de_df, _ = gather_stability_columns(
'data/processed/aging_subsampling',
'kompot_de_mahalanobis_Young_to_Old',
'var',
subset_genes=selected_genes # NEW: Apply gene subsetting
)
seurat_de_df, _ = gather_stability_columns(
'data/processed/aging_seurat_subsampling',
'seurat_de_neg_log10_pvalue_Young_to_Old',
'var',
subset_genes=selected_genes # NEW: Apply gene subsetting
)
scanpy_de_df, _ = gather_stability_columns(
'data/processed/aging_scanpy_subsampling',
'scanpy_de_neg_log10_pvalue_Young_to_Old',
'var',
subset_genes=selected_genes # NEW: Apply gene subsetting
)
kompot_precomputed_de_df, _ = gather_stability_columns(
'data/processed/aging_precomputed_embeddings_subsampling',
'kompot_de_mahalanobis_Young_to_Old',
'var',
subset_genes=selected_genes # NEW: Apply gene subsetting
)
print(f"Data shapes after gene subsetting:")
print(f" Kompot Regular: {kompot_regular_de_df.shape}")
print(f" Seurat: {seurat_de_df.shape}")
print(f" Scanpy: {scanpy_de_df.shape}")
print(f" Kompot Precomputed: {kompot_precomputed_de_df.shape}")
# ==============================================================================
# Create comparison plot with subsetted genes
# ==============================================================================
print("\n=== Creating comparison plot ===")
fig = plot_multi_tool_correlation_vs_parameter(
tool_data={
'Kompot DE (Regular)': kompot_regular_de_df,
'Kompot DE (Precomputed Embeddings)': kompot_precomputed_de_df,
'Seurat': seurat_de_df,
'Scanpy': scanpy_de_df,
},
correlation_method='spearman',
title=f'Aging Dataset: DE Comparison (HVG subset, n={len(selected_genes)} genes)',
reference_index=0,
save_path='plots/kompot_de_embedding_comparison_hvg_subset.png'
)
print("Analysis complete!")
print(f"Plot saved to: plots/kompot_de_embedding_comparison_hvg_subset.png")
# ==============================================================================
# Additional example: Compare HVG vs All genes
# ==============================================================================
print("\n=== Bonus: Comparing HVG subset vs All genes ===")
# Load the same data without gene subsetting for comparison
kompot_all_genes_df, _ = gather_stability_columns(
'data/processed/aging_subsampling',
'kompot_de_mahalanobis_Young_to_Old',
'var'
# No subset_genes parameter = use all genes
)
print(f"Kompot data shapes:")
print(f" All genes: {kompot_all_genes_df.shape}")
print(f" HVG subset: {kompot_regular_de_df.shape}")
print(f" Reduction: {kompot_all_genes_df.shape[0] - kompot_regular_de_df.shape[0]} genes filtered out")
# You could create additional plots comparing HVG vs all genes
fig2 = plot_multi_tool_correlation_vs_parameter(
tool_data={
'Kompot (All genes)': kompot_all_genes_df,
'Kompot (HVG only)': kompot_regular_de_df,
},
correlation_method='spearman',
title='Gene Subsetting Impact: All genes vs HVG',
reference_index=0,
save_path='plots/all_vs_hvg_comparison.png'
)
print("Bonus analysis complete!")