Bioinformatics/bioinformatics/algorithms/clustering.py at main · A-F-V/Bioinformatics · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from random import random
from math import inf, exp


def distance(x, y):
    return sum([(a - b)**2 for a, b in zip(x, y)])**0.5


def dist_to_cluster(point, centres):  # how close is the point to the cluster it is assigned to (i.e. the nearest centre)?
    return min([distance(point, centre) for centre in centres])


def distortion(points, centres):
    return sum([dist_to_cluster(point, centres)**2 for point in points])/len(points)


def farthest_first_clustering(points, k):
    centres = [points[0]]
    points = points[1:]
    for _ in range(k - 1):
        point_dists = [(dist_to_cluster(point, centres), point) for point in points]
        furtherst_away = max(point_dists)
        centres.append(furtherst_away[1])
    return centres


def assign_to_cluster(point, centres):
    best = (inf, 0)
    for centre in range(len(centres)):
        dist = distance(point, centres[centre])
        if dist < best[0]:
            best = (dist, centre)
    return best[1]


def add_vector(a, b):
    if b == None:
        print("STOP")
    return tuple(map(lambda x: x[0]+x[1], zip(a, b)))


def scale_vector(a, q):
    return tuple(map(lambda x: x*q, a))


def centre_of_mass(points):
    n = len(points)
    k = len(points[0])
    o = tuple([0]*k)
    for point in points:
        o = add_vector(o, point)
    return scale_vector(o, 1/n)


def lloyd_kmeans(points, k, initializer=False):
    points = list(points)
    if initializer:
        random.shuffle(points)
    centres = points[:k]
    change = inf
    dist = inf
    while abs(change) > 1e-6:
        assignments = [(point, assign_to_cluster(point, centres)) for point in points]
        clusters = {i: [] for i in range(len(centres))}
        for p, i in assignments:
            clusters[i].append(p)
        centres = [centre_of_mass(clusters[i]) for i in range(len(centres))]
        temp = distortion(points, centres)
        change = dist - temp
        dist = temp
    return centres


def get_responsibilities(point, centres, beta):
    h_vals = []
    for centre in centres:
        h_vals.append(exp(-beta*distance(point, centre)))
    return [h_val/sum(h_vals) for h_val in h_vals]


def weighted_centre_of_mass(centre: int, responsibilites, k):
    n = len(responsibilites)
    o = tuple([0]*k)
    tot = 0
    for p, r in responsibilites:
        tot += r[centre]
        o = add_vector(o, scale_vector(p, r[centre]))
    return scale_vector(o, 1/tot)


def soft_kmeans(points, k, beta, initializer=False):
    points = list(points)
    if initializer:
        random.shuffle(points)
    centres = points[:k]
    change = inf
    dist = inf
    while abs(change) > 1e-6:
        # E PHASE - Gte hidden
        responsibilities = [(point, get_responsibilities(point, centres, beta)) for point in points]
        # M PHASE - Update centres
        centres = [weighted_centre_of_mass(i, responsibilities, k) for i in range(len(centres))]
        # check convergance speed
        temp = distortion(points, centres)
        change = dist - temp
        dist = temp
    return centres