forked from BostonGene/MFP
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassification_example.py
More file actions
61 lines (48 loc) · 2.5 KB
/
classification_example.py
File metadata and controls
61 lines (48 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Python3.7
import pandas as pd
import numpy as np
from portraits.classification import KNeighborsClusterClassifier
from portraits.utils import read_gene_sets, ssgsea_formula, median_scale
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Classification')
parser.add_argument('refsign', type=str,
help='Reference signatures')
parser.add_argument('refannot', type=str,
help='Reference annotation, Should contain MFP column')
parser.add_argument('gmt', type=str,
help='Signatures')
parser.add_argument('exp', type=str,
help='Expression matrix to process')
parser.add_argument('result', type=str,
help='File to save labels')
args = parser.parse_args()
# Example of how to classify another cohort having clusters on TCGA for example
# Load Training cohort with known MFP labels
TCGA_signature_scores_scaled = pd.read_csv(args.refsign, sep='\t', index_col=0).T # Signatures in rows
print(f'Reference signatures provided for {len(TCGA_signature_scores_scaled)} samples')
TCGA_annotation = pd.read_csv(args.refannot, sep='\t', index_col=0) # Contains MFP cluster labels in MFP column
print(f'Reference annotation provided for {len(TCGA_signature_scores_scaled)} samples')
# Fit the model
MODEL = KNeighborsClusterClassifier(norm=False, scale=False, clip=2, k=35).fit(TCGA_signature_scores_scaled,
TCGA_annotation.MFP)
# Load the cohort of interest
# Read signatures
gmt = read_gene_sets(args.gmt) # GMT format like in MSIGdb
print(f'Loaded {len(gmt)} signatures')
# Read expressions
exp = pd.read_csv(args.exp, sep='\t', header=0, index_col=0).T # log2+1 transformed; Genes should appear to be in rows
print(f'Classifying cohort, N={len(exp)} samples')
if exp.max().max() > 35:
print('Performing log2+1 transformation')
exp = np.log2(1+exp)
# Calc signature scores
signature_scores = ssgsea_formula(exp, gmt)
# Scale signatures
signature_scores_scaled = median_scale(signature_scores, 2)
# Predict clusters
cluster_labels = MODEL.predict(signature_scores_scaled[MODEL.X.columns]).rename('MFP')
print('Predicted labels count:')
print(cluster_labels.value_counts())
# Output the clusters
cluster_labels.to_csv(args.result, sep='\t', index=True)