Skip to content

Commit 9cc4692

Browse files
committed
Changed tsne.py
1 parent 1aa6b33 commit 9cc4692

File tree

1 file changed

+113
-126
lines changed

1 file changed

+113
-126
lines changed

machine_learning/tsne.py

Lines changed: 113 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -2,198 +2,185 @@
22
t-Distributed Stochastic Neighbor Embedding (t-SNE)
33
---------------------------------------------------
44
5-
Nonlinear dimensionality reduction for visualizing high-dimensional data
6-
in 2D or 3D. Computes pairwise similarities in high and low-dimensional
7-
spaces and minimizes Kullback-Leibler divergence using gradient descent.
5+
t-SNE is a nonlinear dimensionality reduction algorithm for visualizing
6+
high-dimensional data in a low-dimensional space (2D or 3D).
7+
8+
It computes pairwise similarities in both spaces and minimizes the
9+
Kullback-Leibler divergence using gradient descent.
810
911
References:
1012
- van der Maaten, L. & Hinton, G. (2008), JMLR.
1113
- https://lvdmaaten.github.io/tsne/
1214
"""
1315

import doctest
from typing import Optional, Tuple

import numpy as np
from numpy import ndarray
from sklearn.datasets import load_iris

18-
def _compute_pairwise_affinities(data_x: ndarray, sigma: float = 1.0) -> ndarray:
23+
24+
def collect_dataset() -> Tuple[ndarray, ndarray]:
    """
    Load the Iris dataset and return its features and labels.

    Returns:
        Tuple[ndarray, ndarray]: feature matrix and target labels

    Example:
    >>> x, y = collect_dataset()
    >>> x.shape
    (150, 4)
    >>> y.shape
    (150,)
    """
    iris = load_iris()
    features = np.array(iris.data)
    labels = np.array(iris.target)
    return features, labels
40+
41+
42+
def compute_pairwise_affinities(
    data_x: ndarray, sigma: float = 1.0
) -> ndarray:
    """
    Build the symmetrized high-dimensional affinity matrix (P matrix).

    Pairwise squared Euclidean distances are pushed through a Gaussian
    kernel, the diagonal is zeroed, and the result is normalized and
    symmetrized.

    Args:
        data_x: Input data of shape (n_samples, n_features)
        sigma: Gaussian kernel bandwidth

    Returns:
        ndarray: Symmetrized probability matrix

    Example:
    >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
    >>> p = compute_pairwise_affinities(x)
    >>> float(round(p[0, 1], 3))
    0.25
    """
    n_samples = data_x.shape[0]
    squared_norms = np.sum(np.square(data_x), axis=1)
    gram = np.dot(data_x, data_x.T)
    # ||a - b||^2 = ||a||^2 - 2<a, b> + ||b||^2, via broadcasting.
    dist_sq = np.add(np.add(-2 * gram, squared_norms).T, squared_norms)
    affinities = np.exp(-dist_sq / (2 * sigma**2))
    np.fill_diagonal(affinities, 0)
    affinities = affinities / np.sum(affinities)
    return (affinities + affinities.T) / (2 * n_samples)
4268

4369

44-
def _compute_low_dim_affinities(low_dim_embedding: ndarray) -> tuple[ndarray, ndarray]:
70+
def compute_low_dim_affinities(
    low_dim_embedding: ndarray,
) -> Tuple[ndarray, ndarray]:
    """
    Build the low-dimensional affinity matrix (Q matrix) with a
    Student-t kernel (one degree of freedom).

    Args:
        low_dim_embedding: shape (n_samples, n_components)

    Returns:
        Tuple[ndarray, ndarray]: Q probability matrix and numerator

    Example:
    >>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
    >>> q, num = compute_low_dim_affinities(y)
    >>> q.shape
    (2, 2)
    """
    squared_norms = np.sum(np.square(low_dim_embedding), axis=1)
    # Pairwise squared distances via ||a||^2 - 2<a, b> + ||b||^2.
    dist_sq = np.add(
        np.add(-2 * np.dot(low_dim_embedding, low_dim_embedding.T), squared_norms).T,
        squared_norms,
    )
    numerator = 1 / (1 + dist_sq)
    np.fill_diagonal(numerator, 0)
    q = numerator / np.sum(numerator)
    return q, numerator
99+
100+
101+
def apply_tsne(
    data_x: ndarray,
    n_components: int = 2,
    learning_rate: float = 200.0,
    n_iter: int = 500,
    random_state: Optional[int] = None,
) -> ndarray:
    """
    Apply t-SNE for dimensionality reduction.

    Args:
        data_x: Original dataset (features)
        n_components: Target dimension (2D or 3D)
        learning_rate: Step size for gradient descent
        n_iter: Number of iterations
        random_state: Seed for the random initial embedding; pass an int
            for reproducible output (default None keeps prior behavior)

    Returns:
        ndarray: Low-dimensional embedding of the data

    Raises:
        ValueError: If n_components or n_iter is less than 1.

    Example:
    >>> x, _ = collect_dataset()
    >>> y_emb = apply_tsne(x, n_components=2, n_iter=50)
    >>> y_emb.shape
    (150, 2)
    """
    if n_components < 1 or n_iter < 1:
        raise ValueError("n_components and n_iter must be >= 1")

    n_samples = data_x.shape[0]
    rng = np.random.default_rng(random_state)
    # Small random initialization, as in van der Maaten & Hinton (2008).
    y = rng.standard_normal((n_samples, n_components)) * 1e-4

    p = compute_pairwise_affinities(data_x)
    p = np.maximum(p, 1e-12)  # floor to keep divisions numerically safe

    y_inc = np.zeros_like(y)
    momentum = 0.5

    for i in range(n_iter):
        q, num = compute_low_dim_affinities(y)
        q = np.maximum(q, 1e-12)

        pq = p - q
        # Gradient of the KL divergence between P and Q (t-SNE gradient).
        d_y = 4 * (
            np.dot((pq * num), y)
            - np.multiply(np.sum(pq * num, axis=1)[:, np.newaxis], y)
        )

        # Gradient descent with momentum.
        y_inc = momentum * y_inc - learning_rate * d_y
        y += y_inc

        # Raise momentum after the early phase, as in the original paper.
        if i == int(n_iter / 4):
            momentum = 0.8

    return y
175155

176156

177157
def main() -> None:
    """
    Run t-SNE on Iris and display the first 5 embedded points.

    Example:
    >>> main() # runs without errors
    """
    features, _ = collect_dataset()
    embedding = apply_tsne(features, n_components=2, n_iter=300)

    if not isinstance(embedding, np.ndarray):
        raise TypeError("t-SNE embedding must be an ndarray")

    print("t-SNE embedding (first 5 points):")
    print(embedding[:5])

    # Optional visualization (commented, Ruff/mypy compliant)
    # import matplotlib.pyplot as plt
    # plt.scatter(
    #     embedding[:, 0],
    #     embedding[:, 1],
    #     c=_labels,
    #     cmap="viridis"
    # )
    # plt.show()
194182

195183

196184
if __name__ == "__main__":
    # Run the module doctests first, then the demo entry point.
    doctest.testmod()
    main()

0 commit comments

Comments
 (0)