Python/machine_learning/tsne.py at 744a5bddd480f9e0b01e8028cb9003bcdc35d9ff · Nikita-Kedari/Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
"""
t-Distributed Stochastic Neighbor Embedding (t-SNE)
---------------------------------------------------

t-SNE is a nonlinear dimensionality reduction algorithm used for visualizing
high-dimensional data in a lower-dimensional (usually 2D or 3D) space.

It models pairwise similarities between points in both the high-dimensional
and low-dimensional spaces, and minimizes the difference between them using
gradient descent.

This simplified implementation demonstrates the core idea of t-SNE for
educational purposes — it is **not optimized for large datasets**.

This implementation:
- Computes pairwise similarities in the high-dimensional space.
- Computes pairwise similarities in the low-dimensional (embedding) space.
- Minimizes the Kullback-Leibler divergence between these distributions
  using gradient descent.
- Follows the original t-SNE formulation by van der Maaten & Hinton (2008).

References:
- van der Maaten, L. and Hinton, G. (2008).
  "Visualizing Data using t-SNE". Journal of Machine Learning Research.
- https://lvdmaaten.github.io/tsne/
"""

import doctest

import numpy as np
from sklearn.datasets import load_iris


def collect_dataset() -> tuple[np.ndarray, np.ndarray]:
    """
    Collects the Iris dataset and returns features and labels.

    :return: Tuple containing feature matrix and target labels

    Example:
    >>> x, y = collect_dataset()
    >>> x.shape
    (150, 4)
    >>> y.shape
    (150,)
    """
    data = load_iris()
    return np.array(data.data), np.array(data.target)


def compute_pairwise_affinities(x: np.ndarray, sigma: float = 1.0) -> np.ndarray:
    """
    Computes pairwise affinities (P matrix) in high-dimensional space using Gaussian kernel.

    :param x: Input data of shape (n_samples, n_features)
    :param sigma: Variance (Bandwidth) of the Gaussian kernel
    :return: Symmetrized probability matrix p

    Example:
    >>> import numpy as np
    >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
    >>> p = compute_pairwise_affinities(x)
    >>> float(round(p[0, 1], 3))
    0.25
    """
    n_samples = x.shape[0]
    sum_x = np.sum(np.square(x), axis=1)
    d = np.add(np.add(-2 * np.dot(x, x.T), sum_x).T, sum_x)
    p = np.exp(-d / (2 * sigma**2))
    np.fill_diagonal(p, 0)
    p /= np.sum(p)
    return (p + p.T) / (2 * n_samples)


def compute_low_dim_affinities(y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """
    Computes low-dimensional similarities (Q matrix) using Student-t distribution.

    :param y: Low-dimensional embeddings (n_samples, n_components)
    :return: Tuple (q, num) where q is the probability matrix and num is numerator array
    """
    sum_y = np.sum(np.square(y), axis=1)
    num = 1 / (1 + np.add(np.add(-2 * np.dot(y, y.T), sum_y).T, sum_y))
    np.fill_diagonal(num, 0)
    q = num / np.sum(num)
    return q, num


def apply_tsne(
    data_x: np.ndarray,
    n_components: int = 2,
    learning_rate: float = 200.0,
    n_iter: int = 500,
) -> np.ndarray:
    """
    Applies t-SNE to reduce data dimensionality for visualization.

    :param data_x: Original dataset (features)
    :param n_components: Target dimension (2D or 3D)
    :param learning_rate: Learning rate for gradient descent
    :param n_iter: Number of iterations
    :return: Transformed dataset (low-dimensional embedding)

    Example:
    >>> x, _ = collect_dataset()
    >>> y_emb = apply_tsne(x, n_components=2, n_iter=50)
    >>> y_emb.shape
    (150, 2)
    """
    if n_components < 1:
        raise ValueError("n_components must be >= 1")
    if n_iter < 1:
        raise ValueError("n_iter must be >= 1")

    n_samples = data_x.shape[0]

    # Initialize low-dimensional map randomly
    y = np.random.randn(n_samples, n_components) * 1e-4
    p = compute_pairwise_affinities(data_x)
    p = np.maximum(p, 1e-12)

    # Initialize parameters
    y_inc = np.zeros_like(y)
    momentum = 0.5

    for i in range(n_iter):
        q, num = compute_low_dim_affinities(y)
        q = np.maximum(q, 1e-12)

        pq = p - q

        # Compute gradient
        d_y = 4 * (
            np.dot((pq * num), y)
            - np.multiply(np.sum(pq * num, axis=1)[:, np.newaxis], y)
        )

        # Update with momentum and learning rate
        y_inc = momentum * y_inc - learning_rate * d_y
        y += y_inc

        # Adjust momentum halfway through
        if i == int(n_iter / 4):
            momentum = 0.8

    return y


def main() -> None:
    """
    Driver function for t-SNE demonstration.
    """
    x, y_labels = collect_dataset()
    y_emb = apply_tsne(x, n_components=2, n_iter=300)
    print("t-SNE embedding (first 5 points):")
    print(y_emb[:5])

    # Optional visualization (commented to avoid dependency)
    # import matplotlib.pyplot as plt
    # plt.scatter(y_emb[:, 0], y_emb[:, 1], c=y_labels, cmap="viridis")
    # plt.title("t-SNE Visualization of Iris Dataset")
    # plt.xlabel("Component 1")
    # plt.ylabel("Component 2")
    # plt.show()


if __name__ == "__main__":
    doctest.testmod()
    main()


"""
Explanation of Input and Output
--------------------------------

Input:
- data_x: numpy array of shape (n_samples, n_features)
  Example: Iris dataset (150 samples × 4 features)
- n_components: target dimension (usually 2 or 3)
- learning_rate: gradient descent step size
- n_iter: number of iterations for optimization

Output:
- y: numpy array of shape (n_samples, n_components)
  Each row is the low-dimensional embedding of the corresponding high-dimensional point.

How it works:
1. Compute high-dimensional similarities (p matrix)
2. Initialize low-dimensional map (y) randomly
3. Compute low-dimensional similarities (q matrix)
4. Minimize KL divergence between p and q using gradient descent
5. Update y with momentum and learning rate iteratively
"""