"""
t-Distributed Stochastic Neighbor Embedding (t-SNE)
---------------------------------------------------

45t-SNE is a nonlinear dimensionality reduction algorithm used for visualizing
56high-dimensional data in a lower-dimensional (usually 2D or 3D) space.
67
78It models pairwise similarities between points in both the high-dimensional
and low-dimensional spaces, and minimizes the difference between them using
gradient descent.
1011
1112This simplified implementation demonstrates the core idea of t-SNE for
1213educational purposes — it is **not optimized for large datasets**.
1314
1415This implementation:
1516- Computes pairwise similarities in the high-dimensional space.
1617- Computes pairwise similarities in the low-dimensional (embedding) space.
- Minimizes the Kullback–Leibler divergence between these distributions
1819 using gradient descent.
1920- Follows the original t-SNE formulation by van der Maaten & Hinton (2008).
2021
2122References:
2223- van der Maaten, L. and Hinton, G. (2008).
2324 "Visualizing Data using t-SNE". Journal of Machine Learning Research.
2425- https://lvdmaaten.github.io/tsne/

Key Steps:
1. Compute pairwise similarities (P) in high-dimensional space.
2. Initialize low-dimensional map (Y) randomly.
3. Compute pairwise similarities (Q) in low-dimensional space using the
   Student-t distribution.
4. Minimize KL-divergence between P and Q using gradient descent.
3226"""
27+
3328import doctest
29+
3430import numpy as np
3531from sklearn .datasets import load_iris
3632
33+
def collect_dataset() -> tuple[np.ndarray, np.ndarray]:
    """
    Collects the Iris dataset and returns features and labels.

    :return: Tuple containing feature matrix and target labels

    Example:
    >>> x, y = collect_dataset()
    >>> x.shape
    (150, 4)
    >>> y.shape
    (150,)
    """
    # Load the bundled Iris dataset and copy it into plain numpy arrays.
    iris = load_iris()
    features = np.array(iris.data)
    labels = np.array(iris.target)
    return features, labels
5249
def compute_pairwise_affinities(x: np.ndarray, sigma: float = 1.0) -> np.ndarray:
    """
    Computes pairwise affinities (P matrix) in high-dimensional space using a
    Gaussian kernel.

    :param x: Input data of shape (n_samples, n_features)
    :param sigma: Variance (Bandwidth) of the Gaussian kernel
    :return: Symmetrized probability matrix p

    Example:
    >>> import numpy as np
    >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
    >>> p = compute_pairwise_affinities(x)
    >>> float(round(p[0, 1], 3))
    0.25
    """
    n_samples = x.shape[0]

    # Squared Euclidean distances via ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    sum_x = np.sum(np.square(x), axis=1)
    d = np.add(np.add(-2 * np.dot(x, x.T), sum_x).T, sum_x)

    # Gaussian kernel; a point has zero affinity with itself.
    p = np.exp(-d / (2 * sigma**2))
    np.fill_diagonal(p, 0)

    # Normalize to a probability distribution, then symmetrize.
    p /= np.sum(p)
    return (p + p.T) / (2 * n_samples)
def compute_low_dim_affinities(y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """
    Computes low-dimensional similarities (Q matrix) using the Student-t
    distribution (one degree of freedom).

    :param y: Low-dimensional embeddings (n_samples, n_components)
    :return: Tuple (q, num) where q is the probability matrix and num is the
        unnormalized numerator array
    """
    # Squared pairwise distances via ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    sum_y = np.sum(np.square(y), axis=1)
    num = 1 / (1 + np.add(np.add(-2 * np.dot(y, y.T), sum_y).T, sum_y))

    # Heavy-tailed kernel; self-similarity is excluded before normalizing.
    np.fill_diagonal(num, 0)
    q = num / np.sum(num)
    return q, num
8887
8988
9089def apply_tsne (
@@ -103,9 +102,9 @@ def apply_tsne(
103102 :return: Transformed dataset (low-dimensional embedding)
104103
105104 Example:
106- >>> X , _ = collect_dataset()
107- >>> Y = apply_tsne(X , n_components=2, n_iter=250 )
108- >>> Y .shape
105+ >>> x , _ = collect_dataset()
106+ >>> y_emb = apply_tsne(x , n_components=2, n_iter=50 )
107+ >>> y_emb .shape
109108 (150, 2)
110109 """
111110 if n_components < 1 :
@@ -116,50 +115,49 @@ def apply_tsne(
116115 n_samples = data_x .shape [0 ]
117116
118117 # Initialize low-dimensional map randomly
119- Y = np .random .randn (n_samples , n_components ) * 1e-4
120- P = compute_pairwise_affinities (data_x )
121- P = np .maximum (P , 1e-12 )
118+ y = np .random .randn (n_samples , n_components ) * 1e-4
119+ p = compute_pairwise_affinities (data_x )
120+ p = np .maximum (p , 1e-12 )
122121
123122 # Initialize parameters
124- Y_inc = np .zeros_like (Y )
123+ y_inc = np .zeros_like (y )
125124 momentum = 0.5
126125
127126 for i in range (n_iter ):
128- Q , num = compute_low_dim_affinities (Y )
129- Q = np .maximum (Q , 1e-12 )
127+ q , num = compute_low_dim_affinities (y )
128+ q = np .maximum (q , 1e-12 )
130129
131- PQ = P - Q
130+ pq = p - q
132131
133132 # Compute gradient
134- dY = 4 * (
135- np .dot ((PQ * num ), Y )
136- - np .multiply (np .sum (PQ * num , axis = 1 )[:, np .newaxis ], Y )
133+ d_y = 4 * (
134+ np .dot ((pq * num ), y )
135+ - np .multiply (np .sum (pq * num , axis = 1 )[:, np .newaxis ], y )
137136 )
138137
139138 # Update with momentum and learning rate
140- Y_inc = momentum * Y_inc - learning_rate * dY
141- Y += Y_inc
139+ y_inc = momentum * y_inc - learning_rate * d_y
140+ y += y_inc
142141
143142 # Adjust momentum halfway through
144143 if i == int (n_iter / 4 ):
145144 momentum = 0.8
146145
147- return Y
146+ return y
148147
149148
def main() -> None:
    """
    Driver function for t-SNE demonstration.
    """
    features, labels = collect_dataset()
    embedding = apply_tsne(features, n_components=2, n_iter=300)
    print("t-SNE embedding (first 5 points):")
    print(embedding[:5])

    # Optional visualization (commented to avoid dependency)
    # import matplotlib.pyplot as plt
    # plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis")
    # plt.title("t-SNE Visualization of Iris Dataset")
    # plt.xlabel("Component 1")
    # plt.ylabel("Component 2")
@@ -170,35 +168,26 @@ def main() -> None:
170168 doctest .testmod ()
171169 main ()
172170
171+
173172"""
Explanation of Input and Output
--------------------------------
176175
177176Input:
178177- data_x: numpy array of shape (n_samples, n_features)
179178 Example: Iris dataset (150 samples × 4 features)
- n_components: target dimension (usually 2 or 3 for visualization)
- learning_rate: controls the gradient descent step size
182181- n_iter: number of iterations for optimization
183182
184183Output:
- y: numpy array of shape (n_samples, n_components)
186185 Each row is the low-dimensional embedding of the corresponding high-dimensional point.
187186
188187How it works:
1. Compute high-dimensional similarities (p matrix):
   - Measures how likely points are neighbors in the original space.
2. Initialize the low-dimensional map (y) randomly.
3. Compute low-dimensional similarities (q matrix) using the Student-t
   distribution, whose heavy tail prevents distant points from crowding
   together.
4. Minimize the KL divergence between p and q using gradient descent:
   - If points are too far apart in low dimensions (q < p), pull them closer.
   - If points are too close in low dimensions (q > p), push them apart.
5. Update y with momentum and learning rate, repeating for n_iter iterations.

Why it works:
- t-SNE tries to preserve **local structure**: neighbors stay close in the embedding.
- Distant points may not be perfectly preserved (global structure is secondary).
"""
0 commit comments