"""
t-Distributed Stochastic Neighbor Embedding (t-SNE)
---------------------------------------------------

t-SNE is a nonlinear dimensionality reduction algorithm for visualizing
high-dimensional data in a low-dimensional space (2D or 3D).

It computes pairwise similarities in both spaces and minimizes the
Kullback-Leibler divergence using gradient descent.

References:
- van der Maaten, L. & Hinton, G. (2008), JMLR.
- https://lvdmaaten.github.io/tsne/
"""
13 | 15 |
|
| 16 | +import doctest |
| 17 | +from typing import Tuple |
| 18 | + |
14 | 19 | import numpy as np |
15 | 20 | from numpy import ndarray |
16 | 21 | from sklearn.datasets import load_iris |
17 | 22 |
|
18 | | -def _compute_pairwise_affinities(data_x: ndarray, sigma: float = 1.0) -> ndarray: |
| 23 | + |
def collect_dataset() -> Tuple[ndarray, ndarray]:
    """
    Load the Iris dataset and return its features and labels.

    Returns:
        Tuple[ndarray, ndarray]: feature matrix and target labels

    Example:
        >>> x, y = collect_dataset()
        >>> x.shape
        (150, 4)
        >>> y.shape
        (150,)
    """
    iris = load_iris()
    features = np.array(iris.data)
    labels = np.array(iris.target)
    return features, labels
| 40 | + |
| 41 | + |
def compute_pairwise_affinities(
    data_x: ndarray, sigma: float = 1.0
) -> ndarray:
    """
    Compute high-dimensional affinities (P matrix) using a Gaussian kernel.

    Args:
        data_x: Input data of shape (n_samples, n_features)
        sigma: Gaussian kernel bandwidth

    Returns:
        ndarray: Symmetrized probability matrix

    Example:
        >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
        >>> p = compute_pairwise_affinities(x)
        >>> float(round(p[0, 1], 3))
        0.25
    """
    n_samples = data_x.shape[0]
    squared_norms = np.sum(data_x**2, axis=1)
    # Squared Euclidean distances via ||a||^2 + ||b||^2 - 2 * a.b
    pairwise_sq_dist = (
        squared_norms[:, np.newaxis]
        + squared_norms[np.newaxis, :]
        - 2.0 * data_x @ data_x.T
    )
    affinities = np.exp(-pairwise_sq_dist / (2.0 * sigma**2))
    # A point has no affinity with itself.
    np.fill_diagonal(affinities, 0.0)
    affinities = affinities / affinities.sum()
    # Symmetrize so p_ij == p_ji and the matrix sums to 1/n per row pair.
    return (affinities + affinities.T) / (2 * n_samples)
42 | 68 |
|
43 | 69 |
|
44 | | -def _compute_low_dim_affinities(low_dim_embedding: ndarray) -> tuple[ndarray, ndarray]: |
def compute_low_dim_affinities(
    low_dim_embedding: ndarray,
) -> Tuple[ndarray, ndarray]:
    """
    Compute low-dimensional affinities (Q matrix) using a Student-t kernel.

    Args:
        low_dim_embedding: shape (n_samples, n_components)

    Returns:
        Tuple[ndarray, ndarray]: Q probability matrix and numerator

    Example:
        >>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
        >>> q, num = compute_low_dim_affinities(y)
        >>> q.shape
        (2, 2)
    """
    squared_norms = np.sum(low_dim_embedding**2, axis=1)
    sq_dist = (
        squared_norms[:, np.newaxis]
        + squared_norms[np.newaxis, :]
        - 2.0 * low_dim_embedding @ low_dim_embedding.T
    )
    # Student-t kernel with one degree of freedom (heavy tails).
    kernel = 1.0 / (1.0 + sq_dist)
    np.fill_diagonal(kernel, 0.0)
    q = kernel / kernel.sum()
    return q, kernel
| 99 | + |
| 100 | + |
def apply_tsne(
    data_x: ndarray,
    n_components: int = 2,
    learning_rate: float = 200.0,
    n_iter: int = 500,
    random_state: int | None = None,
) -> ndarray:
    """
    Apply t-SNE for dimensionality reduction.

    Args:
        data_x: Original dataset (features)
        n_components: Target dimension (2D or 3D)
        learning_rate: Step size for gradient descent
        n_iter: Number of iterations
        random_state: Seed for the initial embedding; None gives a
            non-deterministic initialization (previous behavior)

    Returns:
        ndarray: Low-dimensional embedding of the data

    Raises:
        ValueError: If n_components or n_iter is less than 1.

    Example:
        >>> x, _ = collect_dataset()
        >>> y_emb = apply_tsne(x, n_components=2, n_iter=50)
        >>> y_emb.shape
        (150, 2)
    """
    if n_components < 1 or n_iter < 1:
        raise ValueError("n_components and n_iter must be >= 1")

    n_samples = data_x.shape[0]
    # Seedable RNG so results can be reproduced; random_state=None keeps
    # the original non-deterministic behavior.
    rng = np.random.default_rng(random_state)
    # Small random initialization, as in the original t-SNE paper.
    y = rng.standard_normal((n_samples, n_components)) * 1e-4

    p = compute_pairwise_affinities(data_x)
    p = np.maximum(p, 1e-12)  # avoid log/division issues from zeros

    y_inc = np.zeros_like(y)
    momentum = 0.5  # early momentum; raised after a quarter of the iterations

    for i in range(n_iter):
        q, num = compute_low_dim_affinities(y)
        q = np.maximum(q, 1e-12)

        # Gradient of KL(P || Q) w.r.t. the embedding (van der Maaten 2008).
        pq = p - q
        d_y = 4 * (
            np.dot((pq * num), y)
            - np.multiply(np.sum(pq * num, axis=1)[:, np.newaxis], y)
        )

        y_inc = momentum * y_inc - learning_rate * d_y
        y += y_inc

        if i == int(n_iter / 4):
            momentum = 0.8

    return y
175 | 155 |
|
176 | 156 |
|
def main() -> None:
    """
    Run t-SNE on Iris dataset and display the first 5 embeddings.

    Example:
        >>> main() # runs without errors
    """
    features, _ = collect_dataset()
    embedding = apply_tsne(features, n_components=2, n_iter=300)

    if not isinstance(embedding, np.ndarray):
        raise TypeError("t-SNE embedding must be an ndarray")

    print("t-SNE embedding (first 5 points):")
    print(embedding[:5])

    # Optional visualization (commented, Ruff/mypy compliant)
    # import matplotlib.pyplot as plt
    # plt.scatter(
    #     embedding[:, 0],
    #     embedding[:, 1],
    #     c=_labels,
    #     cmap="viridis"
    # )
    # plt.show()
194 | 182 |
|
195 | 183 |
|
# Script entry point: run the module's doctests, then the Iris demo.
if __name__ == "__main__":
    doctest.testmod()
    main()
0 commit comments