-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnotebook_example_hvg_subset.py
More file actions
130 lines (109 loc) · 4.96 KB
/
notebook_example_hvg_subset.py
File metadata and controls
130 lines (109 loc) · 4.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python3
"""
Notebook Example: Gene Subsetting with Highly Variable Genes
Copy this code into your notebook to use gene subsetting functionality.
"""
from gather_stability_columns import gather_stability_columns, get_highly_variable_genes, plot_multi_tool_correlation_vs_parameter
# ==============================================================================
# Step 1: Extract highly variable genes from original dataset
# ==============================================================================
# You'll need to update this path to point to your original aging dataset
# This should be the full dataset before any subsampling was done
original_aging_path = "path/to/original_aging_data.h5ad" # UPDATE THIS PATH!
try:
# Extract highly variable genes from the original dataset
hvg_genes = get_highly_variable_genes(original_aging_path)
print(f"Loaded {len(hvg_genes)} highly variable genes from original dataset")
except:
# If you can't load the original dataset, you can also check one of the subsampled files
# to see what genes are available and manually select important ones
print("Could not load original dataset - using manual gene selection")
# Example: Use some common marker genes for demonstration
# Replace these with actual gene names from your dataset
hvg_genes = [
"CD3D", "CD3E", "CD4", "CD8A", "IL7R", "CCR7", # T cell markers
"MS4A1", "CD79A", "CD79B", # B cell markers
"CD14", "LYZ", "FCGR3A", # Myeloid markers
"PPBP", "PF4", # Platelet markers
# Add more genes as needed...
]
print(f"Using {len(hvg_genes)} genes for analysis")
print(f"Example genes: {hvg_genes[:5]}")
# ==============================================================================
# Step 2: Gather data with gene subsetting (YOUR MODIFIED CODE)
# ==============================================================================
# For DE comparison WITH GENE SUBSETTING
kompot_regular_de_df, _ = gather_stability_columns(
'data/processed/aging_subsampling',
'kompot_de_mahalanobis_Young_to_Old',
'var',
subset_genes=hvg_genes # NEW: Only analyze HVG genes
)
seurat_de_df, _ = gather_stability_columns(
'data/processed/aging_seurat_subsampling',
'seurat_de_neg_log10_pvalue_Young_to_Old',
'var',
subset_genes=hvg_genes # NEW: Only analyze HVG genes
)
scanpy_de_df, _ = gather_stability_columns(
'data/processed/aging_scanpy_subsampling',
'scanpy_de_neg_log10_pvalue_Young_to_Old',
'var',
subset_genes=hvg_genes # NEW: Only analyze HVG genes
)
kompot_precomputed_de_df, _ = gather_stability_columns(
'data/processed/aging_precomputed_embeddings_subsampling',
'kompot_de_mahalanobis_Young_to_Old',
'var',
subset_genes=hvg_genes # NEW: Only analyze HVG genes
)
print(f"Data loaded with gene subsetting:")
print(f" Kompot Regular: {kompot_regular_de_df.shape}")
print(f" Seurat: {seurat_de_df.shape}")
print(f" Scanpy: {scanpy_de_df.shape}")
print(f" Kompot Precomputed: {kompot_precomputed_de_df.shape}")
# ==============================================================================
# Step 3: Create the plot (YOUR ORIGINAL CODE WITH UPDATED TITLE)
# ==============================================================================
fig = plot_multi_tool_correlation_vs_parameter(
tool_data={
'Kompot DE (Regular)': kompot_regular_de_df,
'Kompot DE (Precomputed Embeddings)': kompot_precomputed_de_df,
'Seurat': seurat_de_df,
'Scanpy': scanpy_de_df,
},
correlation_method='spearman',
title=f'Aging Dataset: Kompot DE Embedding Strategy Comparison (HVG subset, n={len(hvg_genes)} genes)',
reference_index=0,
save_path='plots/kompot_de_embedding_comparison_hvg_subset.png'
)
# ==============================================================================
# Optional: Compare with all genes analysis
# ==============================================================================
print("\n=== Optional: Compare with all genes ===")
# Load the same data without gene subsetting
kompot_all_genes_df, _ = gather_stability_columns(
'data/processed/aging_subsampling',
'kompot_de_mahalanobis_Young_to_Old',
'var'
# No subset_genes = all genes
)
print(f"Comparison:")
print(f" All genes: {kompot_all_genes_df.shape[0]} genes")
print(f" HVG subset: {kompot_regular_de_df.shape[0]} genes")
print(f" Genes filtered out: {kompot_all_genes_df.shape[0] - kompot_regular_de_df.shape[0]}")
# Create side-by-side comparison
fig2 = plot_multi_tool_correlation_vs_parameter(
tool_data={
'Kompot (All genes)': kompot_all_genes_df,
'Kompot (HVG subset)': kompot_regular_de_df,
},
correlation_method='spearman',
title='Impact of Gene Subsetting on Stability',
reference_index=0,
save_path='plots/all_vs_hvg_genes_comparison.png'
)
print("Analysis complete!")
print("Files saved:")
print(" - plots/kompot_de_embedding_comparison_hvg_subset.png")
print(" - plots/all_vs_hvg_genes_comparison.png")