-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
54 lines (41 loc) · 2.61 KB
/
Copy pathpreprocessing.py
File metadata and controls
54 lines (41 loc) · 2.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""
Preprocess datasets and save them to disk to avoid redundant and expensive compute (like UMAP over large datasets).

Inputs, selected by --method and --dataset:
    for methods ["raw", "umap"]:
        data/imru_{dataset}.h5ad
    for methods ["scimilarity", "scgpt", "geneformer", "uce"]:
        data/{method}_imru_{dataset}.h5ad is the output from heist-methods, where .obsm[f"{opt.method}_embeddings"] is the DL embedding
    for state:
        data/state_imru_{dataset}.h5ad is the output from create_state_precursor_h5ad() followed by the state emb transform call, where .obsm["X_state"] is the embedding

Output:
    data/{method}_imru_{dataset}_processed.h5ad, such that .X is an np array of the method's embedding and .obsm["umap"] is the UMAP of that embedding
    (in the case of the umap method, .X is the UMAP of raw and .obsm["umap"] is just a copy of it)
"""
from library import *
## command-line options: which embedding to preprocess and which dataset to load
parser = argparse.ArgumentParser()
parser.add_argument("--method", type=str, default="raw", help="which method to use for embedding: raw, umap, scimilarity, geneformer, scgpt, uce, state")
parser.add_argument("--dataset", type=str, default="full", help="full, Alsaigh, Tellides, or Pass; determines which dataset to load and process")
opt = parser.parse_args()

##read in h5ad depending on method and dataset
if opt.method in ["raw", "umap"]:
    from scipy import sparse  ##local import: only this branch needs it

    adata = sc.read_h5ad(f"data/imru_{opt.dataset}.h5ad")
    ##densify only when .X is actually sparse; a dense ndarray has no .toarray() and would raise AttributeError
    if sparse.issparse(adata.X):
        adata.X = adata.X.toarray()
if opt.method in ["scimilarity", "scgpt", "geneformer", "uce", "state"]:  ##DL method
    adata = sc.read_h5ad(f"data/{opt.method}_imru_{opt.dataset}.h5ad")

##get umap of embedding and add to adata.obsm["umap"]
if opt.method in ["raw", "umap"]:
    umap_embedding = get_umap(adata.X)
    adata.obsm["umap"] = umap_embedding
    if opt.method == "umap":
        ##for the umap method the UMAP coordinates themselves become .X
        adata = AnnData(umap_embedding, obs=adata.obs)
        adata.obsm["umap"] = umap_embedding  ##for sake of consistency and ease of syntax later
elif opt.method in ["scgpt", "scimilarity", "geneformer", "uce", "state"]:
    ##heist-methods outputs keep the deep embedding in .obsm["{opt.method}_embeddings"];
    ##the state output keeps it in .obsm["X_state"]
    embedding_key = "X_state" if opt.method == "state" else f"{opt.method}_embeddings"
    embedding = adata.obsm[embedding_key]
    umap_embedding = get_umap(embedding)
    adata = AnnData(embedding, obs=adata.obs)  ##can't assign .X directly, so create new adata object instead
    adata.obsm["umap"] = umap_embedding
else:
    ##unknown method: nothing was loaded above, so stop before trying to write
    raise ValueError(f"{opt.method} not found")

##write final h5ad to be used by analysis.py
adata.write(f"data/{opt.method}_imru_{opt.dataset}_processed.h5ad")

##also create the random baseline file once, if it is not already on disk
if not os.path.isfile("data/random_imru_full_processed.h5ad"):
    create_random_adata()