OmicsTIDE2.0/testk.py at master · Integrative-Transcriptomics/OmicsTIDE2.0 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import math

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pandas as pd
import sys
import os


def best_k_plots(X, name):
    """
    Tests different k with silhouette and ellbow method and creats plots.
    Parameters:
        X (pandas dataframe): A pandas data frame with the data
        name (string): The path were to save the file
    """
    X = X.to_numpy(copy=True)
    sse = []
    silhouette_scores = []
    used_k = []
    max = 11
    if len(X) > 1:
        fig, axs = plt.subplots(6, 2)
        fig.set_size_inches(10, 30)
        if len(X) < max:
            max = len(X)
        for k in range(1, max):
            kmeans = KMeans(n_clusters=k)
            cluster_labels = kmeans.fit_predict(X)
            # ellbow method
            sse.append(kmeans.inertia_)
            # silhouette
            if len(set(cluster_labels)) > 1:
                used_k.append(k)
                silhouette_avg = silhouette_score(X, cluster_labels)
                silhouette_scores.append(silhouette_avg)

                sample_silhouette_values = silhouette_samples(X, cluster_labels)
                ax_sil = axs[math.floor(k / 2), k % 2]
                ax_sil.set_xlim([-0.1, 1])
                ax_sil.set_ylim([0, len(X) + (k + 1) * 10])
                y_lower = 10

                # silhouette coefficient plots
                for i in range(k):
                    ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
                    ith_cluster_silhouette_values.sort()
                    size_cluster_i = ith_cluster_silhouette_values.shape[0]
                    y_upper = y_lower + size_cluster_i
                    color = cm.nipy_spectral(float(i) / k)
                    ax_sil.fill_betweenx(
                        np.arange(y_lower, y_upper),
                        0,
                        ith_cluster_silhouette_values,
                        facecolor=color,
                        edgecolor=color,
                        alpha=0.7,
                    )
                    ax_sil.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
                    y_lower = y_upper + 10  # 10 for the 0 samples

                ax_sil.set_title("k=" + str(k))
                ax_sil.set_xlabel("Silhouette coefficient")
                ax_sil.set_ylabel("Cluster label")

                ax_sil.axvline(x=silhouette_avg, color="red", linestyle="--")

                ax_sil.set_yticks([])
                ax_sil.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
        axs[5, 1].set_axis_off()

        axs[0, 0].plot(range(1, max), sse)
        axs[0, 0].set_title('Elbow method')
        axs[0, 0].set_xlabel('k')
        axs[0, 0].set_ylabel('Distortion')

        axs[0, 1].plot(used_k, silhouette_scores)
        axs[0, 1].set_ylim([0, 1])
        axs[0, 1].set_xlim([2, max - 1])
        axs[0, 1].set_title('Silhouette method')
        axs[0, 1].set_xlabel('k')
        axs[0, 1].set_ylabel('Silhouette score')
        fig.tight_layout()
        plt.savefig(name + ".png")


def main(path_to_data_folder):
    comparison_folder = os.path.join(path_to_data_folder, "comparisons")
    # create results folder if it does not exist
    if not os.path.exists("results"):
        os.mkdir("results")
    # go through downloaded normalized counts folder
    for folder in os.listdir(comparison_folder):
        if os.path.isdir(os.path.join(comparison_folder, folder)):
            for file in os.listdir(os.path.join(comparison_folder, folder)):
                if file.endswith(".csv"):
                    data = pd.read_csv(os.path.join(comparison_folder, folder, file))
                    data["gene"] = data['gene'] + "_" + data['dataset'].astype(str)
                    data.index = data["gene"]
                    data = data.drop(columns=["dataset", "gene"])

                    # create best k for every method
                    best_k_plots(data, os.path.join("results", file))


if __name__ == "__main__":
    try:
        main(sys.argv[1])
    except:
        print("Usage: python testk.py normalized_data")
        print("You can download the normalized data at http://omicstide-tuevis.cs.uni-tuebingen.de/. Upload your data "
              "and click \"download processed data\".")