added class-conformity metrics in metric_utils; integrated kmeans-probe in utils.analysis

Alexander Ororbia · Alexander Ororbia · commit 22ba7aaee944 · 2026-05-19T12:52:55.000-04:00
diff --git a/ngclearn/utils/analysis/__init__.py b/ngclearn/utils/analysis/__init__.py
@@ -2,4 +2,4 @@
 from .linear_probe import LinearProbe
 from .attentive_probe import AttentiveProbe
 from .knn_probe import KNNProbe
-
+from .kmeans_probe import KMeansProbe
diff --git a/ngclearn/utils/analysis/kmeans_probe.py b/ngclearn/utils/analysis/kmeans_probe.py
@@ -0,0 +1,109 @@
+import jax
+from ngcsimlib import deprecate_args
+from ngclearn.utils.analysis.probe import Probe
+from jax import jit, random, numpy as jnp, lax, nn
+from functools import partial as bind
+from ngclearn.utils.metric_utils import measure_ARI
+
+@bind(jax.jit, static_argnums=[2])
+def _run_kmeans_probe(_embeddings, centroids, n_clusters):
+    ## One K-means step: assign each embedding row to its nearest centroid, then re-average
+    ## Broadcast distances: (n_samples, 1, n_features) - (1, n_clusters, n_features)
+    distances = jnp.sum((_embeddings[:, None, :] - centroids[None, :, :]) ** 2, axis=-1)
+    one_hot_preds = jnp.argmin(distances, axis=1)[:, None] == jnp.arange(n_clusters)
+    counts = one_hot_preds.sum(axis=0)[:, None] ## (n_clusters, 1) members per cluster
+    means = jnp.dot(one_hot_preds.T.astype(jnp.float32), _embeddings) / jnp.maximum(counts, 1)
+    ## an empty cluster keeps its previous centroid rather than collapsing onto the origin
+    return jnp.where(counts > 0, means, centroids)
+
+@bind(jax.jit, static_argnums=[2])
+def _predict_with_probe(_embeddings, centroids, n_clusters):
+    ## hard-assign every embedding row to its closest centroid (squared L2 distance)
+    deltas = _embeddings[:, None, :] - centroids[None, :, :]
+    nearest = jnp.argmin(jnp.sum(deltas ** 2, axis=-1), axis=1)
+    ## returns the integer cluster ids alongside their one-hot encoding
+    return nearest, nn.one_hot(nearest, n_clusters)
+
+class KMeansProbe(Probe):
+    """
+    This implements a K-means clustering probe, which is useful for evaluating the quality of
+    encodings/embeddings in light of the ability to cluster downstream data. Currently, this
+    probe only supports L2/Euclidean distance-based clustering.
+
+    Args:
+        dkey: init seed key
+
+        source_seq_length: length of input sequence (e.g., height x width of the image feature)
+
+        input_dim: input dimensionality of probe
+
+        out_dim: output dimensionality of probe - number of clusters for this probe to create
+
+        batch_size: <Unused>
+
+    """
+
+    def __init__(
+            self,
+            dkey,
+            source_seq_length,
+            input_dim,
+            out_dim=2, ## number of clusters/centroids to uncover
+            batch_size=1,
+            **kwargs
+    ):
+        super().__init__(dkey, batch_size, **kwargs)
+        self.dkey, *subkeys = random.split(self.dkey, 3)
+        self.source_seq_length = source_seq_length
+        self.input_dim = input_dim
+        self.n_clusters = self.out_dim = out_dim
+        ## centroids that will be uncovered by this probe
+        self.centroids : jax.Array = None
+
+    @staticmethod
+    def _flatten_embeddings(embeddings): ## collapse every non-batch axis: (B, ...) -> (B, D)
+        if len(embeddings.shape) > 2:
+            return jnp.reshape(embeddings, (embeddings.shape[0], -1))
+        return embeddings
+
+    def _init(self, embeddings):
+        _embeddings = self._flatten_embeddings(embeddings)
+        ## choose random data-points to serve as centroids at iteration 0
+        self.dkey, *subkeys = random.split(self.dkey, 15)
+        random_indices = random.choice(
+            subkeys[0], _embeddings.shape[0], shape=(self.n_clusters,), replace=False
+        )
+        self.centroids = _embeddings[random_indices]
+
+    def process(self, embeddings, dkey=None):
+        ## one-hot assignment of every embedding to its nearest current centroid
+        _, Y_pred = _predict_with_probe(
+            self._flatten_embeddings(embeddings), self.centroids, self.n_clusters
+        )
+        return Y_pred ## (B, C)
+
+    def update(self, embeddings, labels, dkey=None):
+        _embeddings = self._flatten_embeddings(embeddings)
+        if self.centroids is None: ## allow update() to be called before fit()/_init()
+            self._init(_embeddings)
+        self.centroids = _run_kmeans_probe(_embeddings, self.centroids, self.n_clusters)
+        L = 0. ## FIXME: should be clustering loss
+        predictions = self.process(_embeddings)
+        return L, predictions
+
+    def fit(self, dataset, dev_dataset=None, n_iter=20, patience=20):
+        ## NOTE: `dev_dataset` and `patience` are accepted but not used by this routine
+        data, labels = dataset
+        _labels = jnp.argmax(labels, axis=-1)
+
+        self._init(data) ## init K-means centroids
+        ari = None ## best ARI seen so far (ARI can be negative, so it is not floored at 0)
+        for i in range(n_iter): ## Run vectorized K-Means optimization loop
+            _L, py = self.update(data, labels)
+            labels_pred = jnp.argmax(py, axis=1)
+            ari_i = measure_ARI(_labels, labels_pred)
+            print(f"\r{i}: ARI = {ari_i}", end="")
+            if ari is None or ari_i > ari:
+                ari = ari_i
+        print()
+        return 0. if ari is None else ari ## 0. is returned when no iteration was run
diff --git a/ngclearn/utils/analysis/knn_probe.py b/ngclearn/utils/analysis/knn_probe.py
@@ -139,25 +139,25 @@ def update(self, embeddings, labels, dkey=None):
         Wy = labels
         self.probe_params = (Wx, Wy)
 
-if __name__ == '__main__':
-    seed = 42
-    D = 7
-    C = 5
-    dkey = random.PRNGKey(seed)
-    dkey, *subkeys = random.split(dkey, 3)
-    knn = KNNProbe(
-        subkeys[0], 1, input_dim=D, out_dim=C, K=1, dist_function="euclidean"
-    )
-    X = random.uniform(subkeys[1], shape=(10, D))
-    Y = jnp.concat(
-        [
-            jnp.ones((2, C)) * jnp.array([[1., 0., 0., 0., 0.]]),
-            jnp.ones((2, C)) * jnp.array([[0., 1., 0., 0., 0.]]),
-            jnp.ones((2, C)) * jnp.array([[0., 0., 1., 0., 0.]]),
-            jnp.ones((2, C)) * jnp.array([[0., 0., 0., 1., 0.]]),
-            jnp.ones((2, C)) * jnp.array([[0., 0., 0., 0., 1.]])
-         ],
-        axis=0
-    )
-    knn.update(X, Y) ## fit KNN to data
-    print(knn.process(X)) ## should construct the (smeared) identity matrix, exactly same as Y
+# if __name__ == '__main__':
+#     seed = 42
+#     D = 7
+#     C = 5
+#     dkey = random.PRNGKey(seed)
+#     dkey, *subkeys = random.split(dkey, 3)
+#     knn = KNNProbe(
+#         subkeys[0], 1, input_dim=D, out_dim=C, K=1, dist_function="euclidean"
+#     )
+#     X = random.uniform(subkeys[1], shape=(10, D))
+#     Y = jnp.concat(
+#         [
+#             jnp.ones((2, C)) * jnp.array([[1., 0., 0., 0., 0.]]),
+#             jnp.ones((2, C)) * jnp.array([[0., 1., 0., 0., 0.]]),
+#             jnp.ones((2, C)) * jnp.array([[0., 0., 1., 0., 0.]]),
+#             jnp.ones((2, C)) * jnp.array([[0., 0., 0., 1., 0.]]),
+#             jnp.ones((2, C)) * jnp.array([[0., 0., 0., 0., 1.]])
+#          ],
+#         axis=0
+#     )
+#     knn.update(X, Y) ## fit KNN to data
+#     print(knn.process(X)) ## should construct the (smeared) identity matrix, exactly same as Y
diff --git a/ngclearn/utils/metric_utils.py b/ngclearn/utils/metric_utils.py
@@ -463,3 +463,230 @@ def measure_BCE(p, x, offset=1e-7, preserve_batch=False): #1e-10
     if not preserve_batch:
         bce = jnp.mean(bce)
     return bce
+
+
+@partial(jit, static_argnums=[2, 3])
+def _compute_contingency_table( ## vectorized construction of contingency matrix
+        labels_true: jnp.ndarray,
+        labels_pred: jnp.ndarray,
+        n_classes: int,
+        n_clusters: int
+) -> jnp.ndarray:
+    ## Builds the (n_classes x n_clusters) contingency table: entry (i, j) counts the
+    ## samples with true label `i` that were assigned predicted label `j`.
+    ## This routine expects true integer labels and predicted integer labels (1D arrays of size N)
+    ## `n_classes`/`n_clusters` are static jit arguments since they set the output shape;
+    ## a label outside `[0, n_classes)` / `[0, n_clusters)` matches no column of its
+    ## indicator matrix and is therefore left out of the counts.
+
+    ## one-hot membership indicators, cast to floats for the matrix product
+    class_indicator = (labels_true[:, None] == jnp.arange(n_classes)).astype(jnp.float32)
+    cluster_indicator = (labels_pred[:, None] == jnp.arange(n_clusters)).astype(jnp.float32)
+
+    ## contracting over the sample axis yields the class/cluster co-occurrence counts
+    ## (the table is returned as float32)
+    table = jnp.dot(class_indicator.T, cluster_indicator)
+    return table
+
+
+def measure_ARI(
+        labels_true: jnp.ndarray,
+        labels_pred: jnp.ndarray
+) -> jnp.ndarray:
+    """
+    Computes the adjusted Rand index (ARI), which measures similarity between two
+    sets of indices (ground truth against a clustering's produced indices) via counting the
+    pairs of data points assigned to same or different clusters (adjusted for chance). This
+    measurement is at most `1` (perfect agreement), is close to `0` for a random
+    labeling/assignment, and can be negative for worse-than-chance agreement.
+
+    Args:
+        labels_true: 1D array of shape (n_samples,) with true integer class labels.
+        labels_pred: 1D array of shape (n_samples,) with predicted integer cluster labels.
+
+    Returns:
+        scalar ARI of these two sets of indices (`1.0` when given empty label arrays)
+    """
+    if jnp.size(labels_true) == 0: ## max() of an empty array fails; mirror the <= 1 sample case
+        return jnp.array(1.0)
+    n_classes = int(jnp.max(labels_true) + 1) ## concrete python ints: these are static jit args
+    n_clusters = int(jnp.max(labels_pred) + 1)
+    return _calc_adjusted_rand_index(labels_true, labels_pred, n_classes, n_clusters)
+
+
+@partial(jit, static_argnums=[2, 3])
+def _calc_adjusted_rand_index(  ## ARI
+        labels_true: jnp.ndarray,
+        labels_pred: jnp.ndarray,
+        n_classes: int,
+        n_clusters: int
+) -> jnp.ndarray:
+    ## Jitted ARI kernel; `n_classes`/`n_clusters` are static as they fix the table's shape.
+    ## Computes ARI = (index - expected_index) / (max_index - expected_index) from pair counts.
+    def _n_choose_2_total(counts): ## sum over all entries of "count choose 2"
+        return jnp.sum((counts * (counts - 1.0)) / 2.0)
+
+    n_samples = labels_true.shape[0]
+    if n_samples <= 1: ## fewer than two samples: the score is defined as 1 here
+        return jnp.array(1.0)
+
+    ## (n_classes x n_clusters) co-occurrence counts
+    table = _compute_contingency_table(
+        labels_true,
+        labels_pred,
+        n_classes,
+        n_clusters
+    )
+
+    ## pair combinations within the table cells (the raw index)
+    pairs_joint = _n_choose_2_total(table)
+    ## pair combinations within the row margins / within the column margins
+    pairs_true = _n_choose_2_total(jnp.sum(table, axis=1))
+    pairs_pred = _n_choose_2_total(jnp.sum(table, axis=0))
+
+    ## expected index and max index
+    pairs_total = (n_samples * (n_samples - 1.0)) / 2.0
+    chance_level = (pairs_true * pairs_pred) / pairs_total
+    ceiling = (pairs_true + pairs_pred) / 2.0
+
+    ## a zero gap would divide by zero; the score is set to 1 in that case
+    gap = ceiling - chance_level
+    score = jnp.where(gap == 0.0, 1.0, (pairs_joint - chance_level) / gap)
+    return score
+
+
+def measure_FMI(
+        labels_true: jnp.ndarray,
+        labels_pred: jnp.ndarray
+) -> jnp.ndarray:
+    """
+    Calculates the Fowlkes-Mallows Index (FMI), which measures similarity between two sets of
+    indices - this score is the geometric mean of pair-wise recall and precision.
+    This measurement lies in `[0, 1]`, where higher is better (indicating greater similarity between
+    two clustering sets of identifiers).
+
+    Args:
+        labels_true: 1D array of shape (n_samples,) with true integer class labels.
+        labels_pred: 1D array of shape (n_samples,) with predicted integer cluster labels.
+
+    Returns:
+        scalar FMI of these two sets of indices (`0.0` when given empty label arrays)
+    """
+    if jnp.size(labels_true) == 0: ## max() of an empty array fails; mirror the <= 1 sample case
+        return jnp.array(0.0, dtype=jnp.float32)
+    n_classes = int(jnp.max(labels_true) + 1) ## concrete python ints: these are static jit args
+    n_clusters = int(jnp.max(labels_pred) + 1)
+    return _measure_fowlkes_mallows_index(labels_true, labels_pred, n_classes, n_clusters)
+
+
+@partial(jit, static_argnums=[2, 3])
+def _measure_fowlkes_mallows_index(  ## FMI
+        labels_true: jnp.ndarray,
+        labels_pred: jnp.ndarray,
+        n_classes: int,
+        n_clusters: int
+) -> jnp.ndarray:
+    ## Jitted FMI kernel; `n_classes`/`n_clusters` are static as they fix the table's shape.
+    n_samples = labels_true.shape[0]
+    ## fewer than two samples: the score is defined as 0 here
+    if n_samples <= 1:
+        return jnp.array(0.0, dtype=jnp.float32)
+
+    def _pairs_together(counts):
+        ## shortcut 0.5 * (sum(x^2) - N) for sum(x choose 2), given that the counts total N
+        return 0.5 * (jnp.sum(counts ** 2) - n_samples)
+
+    ## (n_classes x n_clusters) co-occurrence counts
+    table = _compute_contingency_table(labels_true, labels_pred, n_classes, n_clusters)
+
+    ## pairs grouped together by both labelings (tk)
+    tk = _pairs_together(table)
+    ## pairs grouped together in the ground truth (tr), from the row totals
+    tr = _pairs_together(jnp.sum(table, axis=1))
+    ## pairs grouped together in the predictions (tc), from the column totals
+    tc = _pairs_together(jnp.sum(table, axis=0))
+
+    ## FMI = tk / sqrt(tr * tc); a zero denominator is mapped to a score of 0
+    scale = jnp.sqrt(tr * tc)
+    score = jnp.where(scale == 0.0, 0.0, tk / scale)
+    return score
+
+
+def measure_Vmeasure(  ## V-Measure
+        labels_true: jnp.ndarray,
+        labels_pred: jnp.ndarray,
+        beta: float = 1.0
+) -> jnp.ndarray:
+    """
+    Calculates the V-Measure scoring metric for class conformity. This measurement compares
+    predicted cluster indices ("labels_pred") against ground truth indices ("labels_true") and
+    represents the harmonic mean of homogeneity (where each cluster contains only members of a single class)
+    as well as completeness (where all members of a given class are assigned to the same cluster).
+    This measurement (higher is better) lies in `[0,1]` where `1` indicates perfect, correct clustering.
+
+    Args:
+        labels_true: 1D array of shape (n_samples,) with true integer class labels
+        labels_pred: 1D array of shape (n_samples,) with predicted integer cluster labels
+        beta: Weight factor. Ratios > 1.0 favor completeness, < 1.0 favor homogeneity.
+
+    Returns:
+        scalar V-measure of these two sets of indices (`0.0` when given empty label arrays)
+    """
+    ## max() of an empty array fails, so mirror the kernel's <= 1 sample case up front
+    if jnp.size(labels_true) == 0:
+        return jnp.array(0.0, dtype=jnp.float32)
+    n_classes = int(jnp.max(labels_true) + 1) ## concrete python ints: these are static jit args
+    n_clusters = int(jnp.max(labels_pred) + 1)
+    return _measure_v_measure_score(labels_true, labels_pred, n_classes, n_clusters, beta)
+
+
+@partial(jit, static_argnums=[2, 3, 4])
+def _measure_v_measure_score(  ## V-Measure
+        labels_true: jnp.ndarray,
+        labels_pred: jnp.ndarray,
+        n_classes: int,
+        n_clusters: int,
+        beta: float = 1.0
+) -> jnp.ndarray:
+    ## Jitted V-measure kernel; `n_classes`, `n_clusters`, and `beta` are static jit arguments.
+    ## The score is the beta-weighted harmonic mean of homogeneity and completeness, both
+    ## derived from the entropies of the (n_classes x n_clusters) contingency table.
+    def _entropy(probs): ## Shannon entropy (in nats); zero-probability entries contribute 0
+        return -jnp.sum(jnp.where(probs > 0.0, probs * jnp.log(probs), 0.0))
+
+    n_samples = labels_true.shape[0]
+
+    ## fewer than two samples: the score is defined as 0 here
+    if n_samples <= 1:
+        return jnp.array(0.0, dtype=jnp.float32)
+
+    ## class/cluster co-occurrence counts
+    table = _compute_contingency_table(labels_true, labels_pred, n_classes, n_clusters)
+
+    ## marginal entropies: H(True) from the row totals, H(Pred) from the column totals
+    h_true = _entropy(jnp.sum(table, axis=1) / n_samples)
+    h_pred = _entropy(jnp.sum(table, axis=0) / n_samples)
+
+    ## joint entropy H(True, Pred) from the normalized table itself
+    h_joint = _entropy(table / n_samples)
+
+    ## conditional entropies via the identity H(A|B) = H(A, B) - H(B)
+    h_true_given_pred = h_joint - h_pred
+    h_pred_given_true = h_joint - h_true
+
+    ## homogeneity = 1 - H(True|Pred) / H(True); set to 1 when H(True) is exactly 0
+    homogeneity = jnp.where(h_true == 0.0, 1.0, 1.0 - (h_true_given_pred / h_true))
+    ## completeness = 1 - H(Pred|True) / H(Pred); set to 1 when H(Pred) is exactly 0
+    completeness = jnp.where(h_pred == 0.0, 1.0, 1.0 - (h_pred_given_true / h_pred))
+
+    ## weighted harmonic mean: (1 + beta) * h * c / (beta * h + c)
+    weighted_sum = beta * homogeneity + completeness
+    weighted_product = (1.0 + beta) * homogeneity * completeness
+
+    ## a zero denominator is mapped to a score of 0
+    score = jnp.where(
+        weighted_sum == 0.0,
+        0.0,
+        weighted_product / weighted_sum
+    )
+    return score
diff --git a/ngclearn/utils/viz/classification_analysis.py b/ngclearn/utils/viz/classification_analysis.py