From 2c1b0bc1e951b74684b99ae9bd30bb766a9c1526 Mon Sep 17 00:00:00 2001
From: Sohaib-Ahmed21 <sohaibahmed1919@gmail.com>
Date: Sun, 25 Jan 2026 10:17:30 -0800
Subject: [PATCH 01/11] Add depth score metric end to end

---
 docs/source/links.rst                         |   1 +
 docs/source/text/depth_score.rst              |  21 +
 src/torchmetrics/functional/text/__init__.py  |   3 +-
 .../functional/text/depth_score.py            | 682 ++++++++++++++++++
 src/torchmetrics/text/__init__.py             |   3 +-
 src/torchmetrics/text/depth_score.py          | 358 +++++++++
 tests/unittests/text/test_depth_score.py      | 291 ++++++++
 7 files changed, 1357 insertions(+), 2 deletions(-)
 create mode 100644 docs/source/text/depth_score.rst
 create mode 100644 src/torchmetrics/functional/text/depth_score.py
 create mode 100644 src/torchmetrics/text/depth_score.py
 create mode 100644 tests/unittests/text/test_depth_score.py

diff --git a/docs/source/links.rst b/docs/source/links.rst
index 539d2728e74..95d5bd19712 100644
--- a/docs/source/links.rst
+++ b/docs/source/links.rst
@@ -50,6 +50,7 @@
 .. _Mean Reciprocal Rank: https://en.wikipedia.org/wiki/Mean_reciprocal_rank
 .. _BERT_score: https://github.com/Tiiiger/bert_score/blob/master/bert_score/utils.py
 .. _Bert_score Evaluating Text Generation: https://arxiv.org/abs/1904.09675
+.. _DepthScore Evaluating Text Generation: https://arxiv.org/abs/2103.12711
 .. _BLEU score: https://en.wikipedia.org/wiki/BLEU
 .. _BLEU: https://www.semanticscholar.org/paper/Bleu%3A-a-Method-for-Automatic-Evaluation-of-Machine-Papineni-Roukos/d7da009f457917aa381619facfa5ffae9329a6e9
 .. _SacreBLEU: https://github.com/mjpost/sacrebleu
diff --git a/docs/source/text/depth_score.rst b/docs/source/text/depth_score.rst
new file mode 100644
index 00000000000..68e695698bf
--- /dev/null
+++ b/docs/source/text/depth_score.rst
@@ -0,0 +1,21 @@
+.. customcarditem::
+   :header: Depth Score
+   :image: https://pl-flash-data.s3.amazonaws.com/assets/thumbnails/summarization.svg
+   :tags: Text
+
+.. include:: ../links.rst
+
+###########
+Depth Score
+###########
+
+Module Interface
+________________
+
+.. autoclass:: torchmetrics.text.depth_score.DepthScore
+    :exclude-members: update, compute
+
+Functional Interface
+____________________
+
+.. autofunction:: torchmetrics.functional.text.depth_score.depth_score
diff --git a/src/torchmetrics/functional/text/__init__.py b/src/torchmetrics/functional/text/__init__.py
index 9282be6fbae..acc84a0373d 100644
--- a/src/torchmetrics/functional/text/__init__.py
+++ b/src/torchmetrics/functional/text/__init__.py
@@ -48,6 +48,7 @@
 
 if _TRANSFORMERS_GREATER_EQUAL_4_4:
     from torchmetrics.functional.text.bert import bert_score
+    from torchmetrics.functional.text.depth_score import depth_score
     from torchmetrics.functional.text.infolm import infolm
 
-    __all__ += ["bert_score", "infolm"]
+    __all__ += ["bert_score", "depth_score", "infolm"]
diff --git a/src/torchmetrics/functional/text/depth_score.py b/src/torchmetrics/functional/text/depth_score.py
new file mode 100644
index 00000000000..d5f310526de
--- /dev/null
+++ b/src/torchmetrics/functional/text/depth_score.py
@@ -0,0 +1,682 @@
+# Copyright The Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import logging
+from collections.abc import Iterator, Sequence
+from contextlib import contextmanager
+from typing import Any, Callable, List, Optional, Tuple, Union, cast
+
+import numpy as np
+import torch
+from torch import Tensor
+from torch.nn import Module
+from torch.utils.data import DataLoader
+
+# DepthScore deps (imported unconditionally: POT, geomloss and scikit-learn must be installed)
+import ot  # pip install POT
+import geomloss
+from sklearn.preprocessing import normalize
+from sklearn.covariance import MinCovDet as MCD
+from sklearn.decomposition import PCA
+
+# TorchMetrics text helpers (same style as BERTScore)
+from torchmetrics.functional.text.helper_embedding_metric import (
+    TextDataset,
+    TokenizedDataset,
+    _check_shape_of_model_output,
+    _get_progress_bar,
+    _input_data_collator,
+    _output_data_collator,
+)
+from torchmetrics.utilities import rank_zero_warn
+from torchmetrics.utilities.checks import _SKIP_SLOW_DOCTEST, _try_proceed_with_timeout
+from torchmetrics.utilities.imports import _TQDM_AVAILABLE, _TRANSFORMERS_GREATER_EQUAL_4_4
+
+log = logging.getLogger(__name__)
+
+
+@contextmanager
+def _ignore_transformers_finetune_warning() -> Iterator[None]:
+    """Raise the ``transformers.modeling_utils`` logger to ERROR within the block and restore it on exit."""
+    logger = logging.getLogger("transformers.modeling_utils")
+    original_level = logger.level  # the logger's own level (may be NOTSET), not the inherited effective one
+    logger.setLevel(logging.ERROR)
+    try:
+        yield
+    finally:
+        logger.setLevel(original_level)
+
+
+# Default model recommended in the original implementation.
+_DEFAULT_MODEL = "bert-base-uncased"
+
+if _TRANSFORMERS_GREATER_EQUAL_4_4:
+    from transformers import AutoModel, AutoTokenizer
+
+    def _download_model_for_depth_score() -> None:
+        """Load the default tokenizer and model once with the transformers loading warnings silenced."""
+        with _ignore_transformers_finetune_warning():
+            AutoTokenizer.from_pretrained(_DEFAULT_MODEL)
+            AutoModel.from_pretrained(_DEFAULT_MODEL)
+    # Skip the doctest when `_SKIP_SLOW_DOCTEST` is set and `_try_proceed_with_timeout` returns a falsy result.
+    if _SKIP_SLOW_DOCTEST and not _try_proceed_with_timeout(_download_model_for_depth_score):
+        __doctest_skip__ = ["depth_score"]
+else:
+    __doctest_skip__ = ["depth_score"]
+
+
+def _preprocess_multiple_references(
+    preds: List[str], target: List[Union[str, Sequence[str]]]
+) -> Tuple[List[str], List[str], Optional[List[Tuple[int, int]]]]:
+    """Flatten multi-reference targets into aligned ``(pred, ref)`` pairs.
+
+    Every prediction is repeated once per reference it is paired with, so the two returned lists
+    can be scored index by index. The slice occupied by each original prediction in the flattened
+    lists is recorded so that the scores can later be reduced back to one value per prediction.
+    A target item that is a plain string is treated as a group holding a single reference.
+
+    Args:
+        preds: A list of predictions.
+        target: A list of targets, where each item is a string or a list/tuple of strings.
+
+    Returns:
+        Tuple: (preds, target, ref_group_boundaries)
+            - preds: Flattened list of `str` where each prediction is repeated once per reference.
+            - target: Flattened list of `str` containing all references.
+            - ref_group_boundaries: List of half-open `(start, end)` index pairs, one per original
+              prediction, or `None` when no item of `target` is a list/tuple (`preds` and `target`
+              are then returned unchanged).
+
+    Raises:
+        ValueError:
+            If `preds` is not a list of strings.
+        ValueError:
+            If a prediction is paired with an empty list/tuple of references.
+
+    """
+    if not all(isinstance(item, str) for item in preds):
+        raise ValueError("Invalid input provided.")
+
+    if not any(isinstance(item, (list, tuple)) for item in target):
+        return preds, cast(List[str], target), None
+
+    ref_group_boundaries: List[Tuple[int, int]] = []
+    new_preds: List[str] = []
+    new_target: List[str] = []
+
+    for pred, ref_group in zip(preds, target):
+        refs = list(ref_group) if isinstance(ref_group, (list, tuple)) else [ref_group]
+        if not refs:
+            # An empty group would produce an empty slice that cannot be reduced to a distance later.
+            raise ValueError(f"Expected at least one reference per prediction, but got none for {pred!r}.")
+
+        start = len(new_target)
+        new_preds.extend([pred] * len(refs))
+        new_target.extend(cast(List[str], refs))
+        # Half-open range of this prediction's pairs inside the flattened lists.
+        ref_group_boundaries.append((start, start + len(refs)))
+
+    return new_preds, new_target, ref_group_boundaries
+
+
+def _postprocess_multiple_references_distance(
+    distances: Tensor,
+    ref_group_boundaries: List[Tuple[int, int]],
+    reduction: str = "min",
+) -> Tensor:
+    """Collapse flattened per-(pred, ref) distances into one distance per original prediction.
+
+    This is the counterpart of `_preprocess_multiple_references`: the distances computed on the
+    flattened pairs are sliced group by group, and every slice is reduced to a single scalar so
+    that the output is aligned with the original (unflattened) predictions.
+
+    DepthScore is a distance (lower is better), hence `"min"` (the closest reference) is the
+    default. The other reductions are mostly useful for diagnostics.
+
+    Args:
+        distances: A 1D tensor of distances aligned with the flattened (pred, ref) pairs, as
+            produced for the lists returned by `_preprocess_multiple_references`.
+        ref_group_boundaries: List of `(start, end)` tuples; `distances[start:end]` holds the
+            distances between one original prediction and each of its references.
+        reduction: Reduction applied inside each group. One of `{"min", "max", "mean"}`.
+            - `"min"`: distance to the closest reference (default)
+            - `"max"`: distance to the farthest reference
+            - `"mean"`: average distance over the references
+
+    Returns:
+        A 1D tensor of shape `(num_predictions,)` holding the reduced distance of every group,
+        in the order of `ref_group_boundaries`.
+
+    Raises:
+        ValueError:
+            If `distances` is not 1D.
+        ValueError:
+            If `reduction` is not one of `{"min","max","mean"}`.
+
+    """
+    # Validate the inputs before touching any group.
+    if distances.dim() != 1:
+        raise ValueError("Expected 1D tensor of distances.")
+
+    # Map every supported reduction name onto the torch function implementing it.
+    reducers = {"min": torch.min, "max": torch.max, "mean": torch.mean}
+    if reduction not in reducers:
+        raise ValueError("reduction must be one of {'min','max','mean'}.")
+    reduce_group = reducers[reduction]
+
+    # One scalar tensor per (start, end) group, in the order the groups were given.
+    per_prediction = [reduce_group(distances[lo:hi]) for lo, hi in ref_group_boundaries]
+    return torch.stack(per_prediction, dim=0)
+
+
+def cov_matrix(X: np.ndarray, robust: bool = False) -> np.ndarray:
+    """Return the covariance of the rows of ``X``; uses the ``MinCovDet`` estimate when ``robust`` is set."""
+    if not robust:
+        return np.cov(X.T)
+    return MCD().fit(X).covariance_
+
+
+def standardize(X: np.ndarray, robust: bool = False) -> np.ndarray:
+    """Whiten ``X`` by right-multiplying it with ``u / sqrt(s)`` from the SVD of its covariance matrix."""
+    sigma = cov_matrix(X, robust)
+    _, n_features = X.shape
+    rank = np.linalg.matrix_rank(X)
+    # Rank-deficient cloud: project on `rank` principal components and re-estimate a (non-robust) covariance.
+    if rank < n_features:
+        X = PCA(rank).fit_transform(X)
+        sigma = cov_matrix(X)
+    # NOTE(review): `s` is not clamped; a (near-)zero singular value would make the scaling blow up — confirm.
+    u, s, _ = np.linalg.svd(sigma)
+    square_inv = u / np.sqrt(s)
+    return X @ square_inv
+
+
+def sampled_sphere(n_dirs: int, d: int) -> np.ndarray:
+    """Draw ``n_dirs`` standard Gaussian vectors in dimension ``d`` and pass them through ``normalize``."""
+    gaussian_draws = np.random.multivariate_normal(mean=np.zeros(d), cov=np.eye(d), size=n_dirs)
+    return normalize(gaussian_draws)
+
+
+def Wasserstein(X: np.ndarray, Y: np.ndarray) -> float:
+    """Exact OT cost (``ot.emd2``) between ``X`` and ``Y`` with uniform weights and the ``ot.dist`` cost matrix."""
+    n_x, n_y = len(X), len(Y)
+    uniform_x = np.ones(n_x) / n_x
+    uniform_y = np.ones(n_y) / n_y
+    return float(ot.emd2(uniform_x, uniform_y, ot.dist(X, Y)))
+
+
+def SW(X: np.ndarray, Y: np.ndarray, ndirs: int, p: int = 2) -> float:
+    """Sliced Wasserstein distance: ``1/p`` root of the mean 1D OT cost over ``ndirs`` random directions."""
+    # Both clouds are projected on the same directions so that the 1D costs are comparable.
+    U = sampled_sphere(ndirs, X.shape[1])
+    Zx = X @ U.T
+    Zy = Y @ U.T
+    # `p` only takes effect with the Minkowski ground cost; with the default squared-Euclidean metric
+    # it is ignored. For `p=2` the Minkowski cost `|x - y| ** p` equals the squared-Euclidean one.
+    sliced = [ot.emd2_1d(Zx[:, k], Zy[:, k], metric="minkowski", p=p) for k in range(ndirs)]
+    return float(np.mean(sliced) ** (1 / p))
+
+
+def MMD(X: np.ndarray, Y: np.ndarray) -> float:
+    """Evaluate ``geomloss.SamplesLoss("gaussian")`` on the two samples and return it as a Python float."""
+    return float(geomloss.SamplesLoss(loss="gaussian")(torch.tensor(X), torch.tensor(Y)))
+
+
+def ai_irw(
+    X: np.ndarray, AI: bool = True, robust: bool = False, n_dirs: Optional[int] = None, random_state: int = 0
+) -> np.ndarray:
+    """Monte-Carlo integrated rank-weighted (IRW) depth of every row of ``X`` w.r.t. the cloud ``X`` itself.
+
+    With ``AI=True`` the cloud is first passed through ``standardize`` (affine-invariant variant). ``n_dirs``
+    random unit directions are drawn (``100 * d`` when ``None``) and one depth value per row is returned.
+    """
+    # NOTE: this reseeds NumPy's *global* RNG, which is what makes the direction sampling reproducible.
+    np.random.seed(random_state)
+    if AI:
+        X = standardize(X, robust)
+    n, d = X.shape
+    n_dirs = d * 100 if n_dirs is None else n_dirs
+
+    proj = X @ sampled_sphere(n_dirs, d).T
+    # Double argsort gives the 0-based rank of every row along each direction (replaces a Python loop).
+    depth = (np.argsort(np.argsort(proj, axis=0), axis=0) + 1) / n
+    # Per direction the depth is the smaller of the two tail proportions; average it over the directions.
+    return np.mean(np.minimum(depth, 1 - depth), axis=1)
+
+
+def dr_distance(
+    X: np.ndarray,
+    Y: np.ndarray,
+    n_alpha: int = 5,
+    n_dirs: int = 10000,
+    data_depth: str = "irw",
+    eps_min: float = 0.3,
+    eps_max: float = 1.0,
+    p: int = 5,
+    random_state: int = 0,
+) -> float:
+    """Compute the depth-based pseudo-metric between two point clouds.
+
+    This function implements the DepthScore "DR distance" between two empirical
+    distributions represented by token-embedding point clouds `X` and `Y`. The distance
+    is computed by (1) choosing a data depth / distributional discrepancy backend
+    (e.g., IRW depth, affine-invariant IRW, Wasserstein, sliced Wasserstein, or MMD),
+    and (2) integrating over depth level sets between `eps_min` and `eps_max`, while
+    approximating the supremum over directions on the unit sphere by Monte Carlo.
+
+    Args:
+        X: Array of shape `(n_samples_x, n_features)` representing the first point cloud.
+        Y: Array of shape `(n_samples_y, n_features)` representing the second point cloud.
+        n_alpha: Monte-Carlo parameter controlling the approximation of the integral
+            over alpha (number of level-set thresholds between `eps_min` and `eps_max`).
+        n_dirs: Number of random directions used to approximate the supremum over the
+            unit sphere (and for depth estimation when applicable).
+        data_depth: Depth / discrepancy measure to use. One of
+            `{"irw", "ai_irw", "wasserstein", "sliced", "mmd"}`.
+            - `"irw"` / `"ai_irw"` compute depth values and then integrate level sets.
+            - `"wasserstein"` returns the (unsliced) OT cost directly.
+            - `"sliced"` returns the sliced Wasserstein distance directly.
+            - `"mmd"` returns the Gaussian MMD directly.
+        eps_min: Lower level-set bound in `[0, eps_max]` (lowest alpha / quantile level).
+        eps_max: Upper level-set bound in `[eps_min, 1]` (highest alpha / quantile level).
+        p: Power applied to the largest absolute gap between the two support functions at
+            every level; the `1/p` root is taken of the average over the levels.
+        random_state: Seed for NumPy's global RNG, which drives the direction sampling.
+
+    Returns:
+        The computed pseudo-metric score as a Python `float`.
+
+    Raises:
+        ValueError:
+            If `data_depth` is unsupported.
+        ValueError:
+            If a depth backend is used and `0 <= eps_min <= eps_max <= 1` does not hold.
+
+    """
+    np.random.seed(random_state)
+
+    # Backends that compare the two clouds directly and do not involve depth level sets.
+    if data_depth == "wasserstein":
+        return Wasserstein(X, Y)
+    if data_depth == "sliced":
+        return SW(X, Y, ndirs=n_dirs)
+    if data_depth == "mmd":
+        return MMD(X, Y)
+    if data_depth not in ("irw", "ai_irw"):
+        raise ValueError("Unsupported depth")
+    # Validate before the depth computation so that invalid bounds fail fast.
+    if not (0.0 <= eps_min <= eps_max <= 1.0):
+        raise ValueError("Expected 0 <= eps_min <= eps_max <= 1")
+
+    affine_invariant = data_depth == "ai_irw"
+    depth_X = ai_irw(X, AI=affine_invariant, n_dirs=n_dirs, random_state=random_state)
+    depth_Y = ai_irw(Y, AI=affine_invariant, n_dirs=n_dirs, random_state=random_state)
+
+    _, d = X.shape
+    U = sampled_sphere(n_dirs, d)
+    proj_X = X @ U.T
+    proj_Y = Y @ U.T
+
+    # Percentile levels. Rounding (rather than truncating with `int`) keeps e.g. `0.29 * 100` at 29
+    # instead of 28, since the product evaluates to 28.999999999999996 in floating point.
+    alphas = np.linspace(round(eps_min * 100, 10), round(eps_max * 100, 10), n_alpha)
+
+    score = 0.0
+    for alpha in alphas:
+        # Support function (largest projection per direction) of each cloud's upper level set.
+        supp_X = np.max(proj_X[depth_X >= np.percentile(depth_X, alpha)], axis=0)
+        supp_Y = np.max(proj_Y[depth_Y >= np.percentile(depth_Y, alpha)], axis=0)
+        # The absolute gap keeps the score symmetric in (X, Y) and non-negative for odd `p`; a negative
+        # total would otherwise make the fractional power below complex-valued.
+        score += float(np.max(np.abs(supp_X - supp_Y) ** p))
+
+    return float((score / n_alpha) ** (1 / p))
+
+
+def _get_embeddings_and_mask(
+    dataloader: DataLoader,
+    target_len: int,
+    model: Module,
+    device: Optional[Union[str, torch.device]] = None,
+    num_layers: Optional[int] = None,
+    all_layers: bool = False,
+    verbose: bool = False,
+    user_forward_fn: Optional[Callable[[Module, dict[str, Tensor]], Tensor]] = None,
+) -> Tuple[Tensor, Tensor]:
+    """Embed every batch of `dataloader` and return the masked embeddings with their attention mask.
+
+    Args:
+        dataloader: Dataloader over `TextDataset` or `TokenizedDataset`.
+        target_len: Length of the longest sequence in the dataset (handed to the output collator
+            for collation/padding).
+        model: Transformer model used for embedding extraction.
+        device: Device to run inference on.
+        num_layers: Index into the model's `hidden_states`. If `None`, the last entry is used.
+        all_layers: Whether to keep the hidden states of all layers.
+            If `True`, `num_layers` is ignored.
+        verbose: Whether to show a progress bar during embedding extraction.
+        user_forward_fn:
+            Optional user-defined forward function. If provided, it must:
+            - accept `(model, batch_dict)` where `batch_dict` contains `"input_ids"` and `"attention_mask"`
+            - return a tensor shaped like `(batch, seq_len, hidden_dim)`.
+
+    Returns:
+        A tuple `(embeddings, attention_mask)` of CPU tensors concatenated over all batches:
+            - embeddings: Tensor shaped `(batch, 1, seq_len, hidden_dim)` when `all_layers=False`,
+              or `(batch, num_layers, seq_len, hidden_dim)` when `all_layers=True`. Embeddings are
+              L2-normalized over the hidden dimension and multiplied by `attention_mask`.
+            - attention_mask: Tensor shaped `(batch, seq_len)` aligned with `embeddings`.
+
+    Raises:
+        ValueError:
+            If `user_forward_fn` output shape does not match the expected model output shape.
+        ValueError:
+            If `all_layers=True` is used with a custom `user_forward_fn`.
+
+    """
+    embeddings_per_batch: List[Tensor] = []
+    masks_per_batch: List[Tensor] = []
+
+    for batch in _get_progress_bar(dataloader, verbose):
+        with torch.no_grad():
+            batch = _input_data_collator(batch, device)
+
+            if all_layers:
+                if user_forward_fn is not None:
+                    raise ValueError(
+                        "The option `all_layers=True` can be used only with default `transformers` models."
+                    )
+                model_out = model(batch["input_ids"], batch["attention_mask"], output_hidden_states=True)
+                # one slice per hidden state along a new layer axis: (b, l, s, d)
+                hidden = torch.stack(list(model_out.hidden_states), dim=1)
+            elif user_forward_fn is not None:
+                hidden = user_forward_fn(model, batch)
+                _check_shape_of_model_output(hidden, batch["input_ids"])
+                # add a singleton layer axis: (b, 1, s, d)
+                hidden = hidden.unsqueeze(1)
+            else:
+                model_out = model(batch["input_ids"], batch["attention_mask"], output_hidden_states=True)
+                layer_index = -1 if num_layers is None else num_layers
+                hidden = model_out.hidden_states[layer_index].unsqueeze(1)
+
+        # L2-normalise over the hidden dimension; the clamp guards against division by zero
+        hidden = hidden / hidden.norm(dim=-1, keepdim=True).clamp_min(1e-12)
+
+        hidden, attention_mask = _output_data_collator(hidden, batch["attention_mask"], target_len)
+
+        # zero the embeddings at every position whose attention-mask entry is 0
+        hidden = torch.einsum("blsd, bs -> blsd", hidden, attention_mask)
+
+        embeddings_per_batch.append(hidden.cpu())
+        masks_per_batch.append(attention_mask.cpu())
+
+    return torch.cat(embeddings_per_batch, dim=0), torch.cat(masks_per_batch, dim=0)
+
+
+def depth_score(
+    preds: Union[str, Sequence[str], dict[str, Tensor]],
+    target: Union[str, Sequence[str], Sequence[Sequence[str]], dict[str, Tensor]],
+    model_name_or_path: Optional[str] = None,
+    num_layers: Optional[int] = None,
+    all_layers: bool = False,
+    model: Optional[Module] = None,
+    user_tokenizer: Any = None,
+    user_forward_fn: Optional[Callable[[Module, dict[str, Tensor]], Tensor]] = None,
+    verbose: bool = False,
+    device: Optional[Union[str, torch.device]] = None,
+    max_length: int = 512,
+    batch_size: int = 64,
+    num_threads: int = 0,
+    truncation: bool = False,
+    # DepthScore-specific knobs
+    n_alpha: int = 5,
+    n_dirs: int = 10000,
+    eps: float = 0.3,
+    p: int = 5,
+    measure: str = "irw",
+    # Multi-ref postprocess for a distance metric (best = min by default)
+    multi_ref_reduction: str = "min",
+) -> Tensor:
+    """`DepthScore Evaluating Text Generation`_ for text similarity matching.
+
+    DepthScore measures the distance between two sentences by comparing the distributions
+    of their contextual token embeddings using a depth-based pseudo-metric. Lower values
+    indicate that the predicted sentence is closer to the reference sentence.
+
+    The function supports:
+    - Single string inputs (`str`)
+    - Lists of strings (`Sequence[str]`)
+    - Tokenized dict inputs (`dict[str, Tensor]`) (used internally by the Metric class)
+    - Multi-reference evaluation (`Sequence[Sequence[str]]`), reduced per prediction by
+      `multi_ref_reduction` (default `"min"` for distance metrics).
+
+    Args:
+        preds: Predicted sentence(s) as `str`, `Sequence[str]`, or tokenized dict
+            containing `"input_ids"` and `"attention_mask"`.
+        target: Reference sentence(s) as `str`, `Sequence[str]`, multi-reference
+            `Sequence[Sequence[str]]`, or tokenized dict containing `"input_ids"` and `"attention_mask"`.
+        model_name_or_path: Hugging Face model name/path used when `model` is not provided.
+        num_layers: Hidden layer index to use for contextual embeddings. If `None`, the last layer is used.
+        all_layers:
+            Whether the hidden states of all layers are extracted (``num_layers`` is then ignored). The
+            distance is currently computed on index 0 of the layer axis only.
+        model: Optional user-provided model. If provided, `user_tokenizer` must also be provided.
+        user_tokenizer: Tokenizer to use with a user-provided model. Ignored when `model` is `None`.
+        user_forward_fn:
+            Optional user-defined forward function producing embeddings from `(model, batch_dict)`.
+        verbose: Whether to show a progress bar during embedding extraction.
+        device: Device to run embedding extraction on. Defaults to CUDA when available, else CPU.
+        max_length: Maximum input sequence length. Longer sequences are trimmed if `truncation=True`.
+        batch_size: Batch size used for model processing.
+        num_threads: Number of dataloader workers.
+        truncation: Whether to truncate input sequences to `max_length`.
+        n_alpha: Number of alpha levels used by the depth-based distance computation.
+        n_dirs: Number of random projection directions used by depth/sliced computations.
+        eps: Lower quantile bound (eps_min) used in the depth distance integration (upper bound fixed at 1.0).
+        p: Power used in the distance aggregation.
+        measure: Depth/distance backend to use. One of:
+            `"irw"`, `"ai_irw"`, `"wasserstein"`, `"sliced"`, `"mmd"`.
+        multi_ref_reduction: Reduction to apply across multiple references per prediction.
+            One of `"min"`, `"max"`, `"mean"`. Default `"min"` (best match) since this is a distance metric.
+
+    Returns:
+        A 1D tensor of distances of shape `(num_predictions,)`. For multi-reference input,
+        the output is reduced per original prediction according to `multi_ref_reduction`.
+        A pair in which a sentence has no unmasked token gets `inf`; empty inputs yield a single zero.
+
+    Raises:
+        ValueError:
+            If `len(preds) != len(target)`.
+        ValueError:
+            If `measure` is not a supported backend, or if `multi_ref_reduction` is not supported
+            while multi-reference targets are given.
+        ModuleNotFoundError:
+            If `verbose=True` but `tqdm` is not installed.
+        ModuleNotFoundError:
+            If default transformers model is required but `transformers` is not installed.
+        ValueError:
+            If invalid input is provided for `preds`/`target`.
+        ValueError:
+            If `num_layers` is larger than the number of model layers (when detectable).
+
+    Example:
+        >>> from torchmetrics.functional.text.depth_score import depth_score
+        >>> preds = ["hello there", "general kenobi"]
+        >>> target = ["hello there", "master kenobi"]
+        >>> depth_score(preds, target, model_name_or_path="distilbert-base-uncased", num_layers=4, device="cpu")
+        tensor([...])
+
+    Example:
+        >>> from torchmetrics.functional.text.depth_score import depth_score
+        >>> preds = ["hello there", "general kenobi"]
+        >>> target = [["hello there", "master kenobi"], ["hello there", "master kenobi"]]
+        >>> depth_score(preds, target, model_name_or_path="distilbert-base-uncased", num_layers=4, device="cpu")
+        tensor([...])
+
+    """
+    ref_group_boundaries: Optional[List[Tuple[int, int]]] = None
+
+    if isinstance(preds, str):
+        preds = [preds]
+    if isinstance(target, str):
+        target = [target]
+    if not isinstance(preds, (list, dict)):
+        preds = list(preds)
+    if not isinstance(target, (list, dict)):
+        target = list(target)
+
+    if len(preds) != len(target):
+        raise ValueError(
+            "Expected number of predicted and reference sentences to be the same, but got"
+            f" {len(preds)} and {len(target)}"
+        )
+
+    if isinstance(preds, list) and len(preds) > 0 and isinstance(target, list) and len(target) > 0:
+        preds, target, ref_group_boundaries = _preprocess_multiple_references(preds, target)
+
+    # Fail fast on unsupported options, before a model is loaded and embeddings are computed.
+    if measure not in ("irw", "ai_irw", "wasserstein", "sliced", "mmd"):
+        raise ValueError("Unsupported depth")
+    # The reduction is only applied to multi-reference input, hence it is only validated in that case.
+    if ref_group_boundaries is not None and multi_ref_reduction not in ("min", "max", "mean"):
+        raise ValueError("reduction must be one of {'min','max','mean'}.")
+
+    if verbose and (not _TQDM_AVAILABLE):
+        raise ModuleNotFoundError(
+            "An argument `verbose = True` requires `tqdm` package be installed. Install with `pip install tqdm`."
+        )
+
+    if model is None:
+        if not _TRANSFORMERS_GREATER_EQUAL_4_4:
+            raise ModuleNotFoundError(
+                "`depth_score` metric with default models requires `transformers` package be installed."
+                " Either install with `pip install transformers>=4.4` or `pip install torchmetrics[text]`."
+            )
+        if model_name_or_path is None:
+            rank_zero_warn(
+                "The argument `model_name_or_path` was not specified while it is required when default"
+                " `transformers` model are used."
+                f" It is, therefore, used the default recommended model - {_DEFAULT_MODEL}."
+            )
+        from transformers import AutoModel, AutoTokenizer
+
+        with _ignore_transformers_finetune_warning():
+            tokenizer = AutoTokenizer.from_pretrained(model_name_or_path or _DEFAULT_MODEL)
+            model = AutoModel.from_pretrained(model_name_or_path or _DEFAULT_MODEL)
+    else:
+        if user_tokenizer is None:
+            raise ValueError("When `model` is provided, `user_tokenizer` must also be provided.")
+        tokenizer = user_tokenizer
+
+    if device is None:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    model.eval()
+    model.to(device)
+
+    try:
+        if hasattr(model.config, "num_hidden_layers") and isinstance(model.config.num_hidden_layers, int):
+            if num_layers and num_layers > model.config.num_hidden_layers:
+                raise ValueError(
+                    f"num_layers={num_layers} is forbidden for {model_name_or_path}."
+                    f" Please use num_layers <= {model.config.num_hidden_layers}"
+                )
+        else:
+            rank_zero_warn(
+                "Model config does not have `num_hidden_layers` as an integer attribute. "
+                "Unable to validate `num_layers`."
+            )
+    except AttributeError:
+        rank_zero_warn("It was not possible to retrieve the parameter `num_layers` from the model specification.")
+
+    _are_empty_lists = all(isinstance(text, list) and len(text) == 0 for text in (preds, target))
+    _are_valid_lists = all(
+        isinstance(text, list) and len(text) > 0 and isinstance(text[0], str) for text in (preds, target)
+    )
+    _are_valid_tensors = all(
+        isinstance(text, dict) and isinstance(text["input_ids"], Tensor) for text in (preds, target)
+    )
+
+    if _are_empty_lists:
+        rank_zero_warn("Predictions and references are empty.")
+        return torch.zeros(1, dtype=torch.float32)
+
+    if _are_valid_lists:
+        target_dataset = TextDataset(target, tokenizer, max_length, truncation=truncation)
+        preds_dataset = TextDataset(preds, tokenizer, max_length, truncation=truncation)
+    elif _are_valid_tensors:
+        target_dataset = TokenizedDataset(**cast(dict, target))
+        preds_dataset = TokenizedDataset(**cast(dict, preds))
+    else:
+        raise ValueError("Invalid input provided.")
+
+    target_loader = DataLoader(target_dataset, batch_size=batch_size, num_workers=num_threads)
+    preds_loader = DataLoader(preds_dataset, batch_size=batch_size, num_workers=num_threads)
+
+    target_embeddings, target_mask = _get_embeddings_and_mask(
+        target_loader,
+        target_dataset.max_length,
+        model,
+        device=device,
+        num_layers=num_layers,
+        all_layers=all_layers,
+        verbose=verbose,
+        user_forward_fn=user_forward_fn,
+    )
+    preds_embeddings, preds_mask = _get_embeddings_and_mask(
+        preds_loader,
+        preds_dataset.max_length,
+        model,
+        device=device,
+        num_layers=num_layers,
+        all_layers=all_layers,
+        verbose=verbose,
+        user_forward_fn=user_forward_fn,
+    )
+
+    # Reorder back (TextDataset sorts by length internally)
+    target_embeddings = target_embeddings[target_loader.dataset.sorting_indices]
+    preds_embeddings = preds_embeddings[preds_loader.dataset.sorting_indices]
+    target_mask = target_mask[target_loader.dataset.sorting_indices]
+    preds_mask = preds_mask[preds_loader.dataset.sorting_indices]
+
+    # Pairwise (same index) distances
+    distances: List[float] = []
+    n = preds_embeddings.shape[0]
+
+    for i in range(n):
+        pm = preds_mask[i].bool()
+        tm = target_mask[i].bool()
+        # Unmasked tokens at index 0 of the layer axis (the only layer present when `all_layers=False`).
+        X = preds_embeddings[i, 0, pm, :].numpy()
+        Y = target_embeddings[i, 0, tm, :].numpy()
+        # A side without any token cannot be compared: report an infinite distance.
+        if X.shape[0] == 0 or Y.shape[0] == 0:
+            distances.append(float("inf"))
+            continue
+
+        distances.append(
+            dr_distance(
+                X, Y, n_alpha=n_alpha, n_dirs=n_dirs, data_depth=measure, eps_min=eps, eps_max=1.0, p=p, random_state=0
+            )
+        )
+
+    out = torch.tensor(distances, dtype=torch.float32)
+
+    # Multi-reference reduction (distance metric: default "min" = best ref)
+    if ref_group_boundaries is not None:
+        out = _postprocess_multiple_references_distance(out, ref_group_boundaries, reduction=multi_ref_reduction)
+
+    return out
diff --git a/src/torchmetrics/text/__init__.py b/src/torchmetrics/text/__init__.py
index 6af056246cd..9b9767f7ec5 100644
--- a/src/torchmetrics/text/__init__.py
+++ b/src/torchmetrics/text/__init__.py
@@ -46,6 +46,7 @@
 
 if _TRANSFORMERS_GREATER_EQUAL_4_4:
     from torchmetrics.text.bert import BERTScore
+    from torchmetrics.text.depth_score import DepthScore
     from torchmetrics.text.infolm import InfoLM
 
-    __all__ += ["BERTScore", "InfoLM"]
+    __all__ += ["BERTScore", "DepthScore", "InfoLM"]
\ No newline at end of file
diff --git a/src/torchmetrics/text/depth_score.py b/src/torchmetrics/text/depth_score.py
new file mode 100644
index 00000000000..2280f505ea5
--- /dev/null
+++ b/src/torchmetrics/text/depth_score.py
@@ -0,0 +1,358 @@
+# Copyright The Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+from collections.abc import Sequence
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
+
+import torch
+from torch import Tensor
+from torch.nn import Module
+
+from torchmetrics.functional.text.helper_embedding_metric import _preprocess_text
+from torchmetrics.metric import Metric
+from torchmetrics.utilities import rank_zero_warn
+from torchmetrics.utilities.checks import _SKIP_SLOW_DOCTEST, _try_proceed_with_timeout
+from torchmetrics.utilities.imports import _MATPLOTLIB_AVAILABLE, _TRANSFORMERS_GREATER_EQUAL_4_4
+from torchmetrics.utilities.plot import _AX_TYPE, _PLOT_OUT_TYPE
+from torchmetrics.utilities.data import dim_zero_cat
+
+from torchmetrics.functional.text.depth_score import (
+    _postprocess_multiple_references_distance,
+    _preprocess_multiple_references,
+    depth_score,
+)
+
+if not _MATPLOTLIB_AVAILABLE:
+    __doctest_skip__ = ["DepthScore.plot"]
+
+# Default model recommended in the original implementation.
+_DEFAULT_MODEL: str = "bert-base-uncased"
+
+if _SKIP_SLOW_DOCTEST and _TRANSFORMERS_GREATER_EQUAL_4_4:
+    from transformers import AutoModel, AutoTokenizer
+
+    def _download_model_for_depth_score() -> None:
+        """Download intensive operations."""
+        AutoTokenizer.from_pretrained(_DEFAULT_MODEL, resume_download=True)
+        AutoModel.from_pretrained(_DEFAULT_MODEL, resume_download=True)
+
+    if not _try_proceed_with_timeout(_download_model_for_depth_score):
+        __doctest_skip__ = ["DepthScore", "DepthScore.plot"]
+else:
+    __doctest_skip__ = ["DepthScore", "DepthScore.plot"]
+
+
+def _get_input_dict(input_ids: List[Tensor], attention_mask: List[Tensor]) -> dict[str, Tensor]:
+    """Create an input dictionary of ``input_ids`` and ``attention_mask`` for DepthScore calculation."""
+    return {"input_ids": torch.cat(input_ids), "attention_mask": torch.cat(attention_mask)}
+
+
+class DepthScore(Metric):
+    """`DepthScore Evaluating Text Generation`_ for measuring text similarity.
+
+    DepthScore leverages pre-trained contextual token embeddings (e.g., from BERT-like models) and compares candidate and
+    reference sentences by treating their token embeddings as point clouds and computing a depth-based pseudo-metric
+    between the two distributions. This distance is designed to capture distributional mismatches between contextual
+    representations and can be used for evaluating text generation tasks where *lower* distance indicates a better match.
+    This implementation follows the reference DepthScore formulation introduced by Colombo et al. and mirrors the
+    TorchMetrics-style API used by embedding-based text metrics.
+
+    As input to ``forward`` and ``update`` the metric accepts the following input:
+
+    - ``preds``: Predicted sentence(s). Can be one of:
+
+        * A single predicted sentence as a string (``str``)
+        * A sequence of predicted sentences (``Sequence[str]``)
+
+    - ``target``: Target/reference sentence(s). Can be one of:
+
+        * A single reference sentence as a string (``str``)
+        * A sequence of reference sentences (``Sequence[str]``)
+        * A sequence of sequences of reference sentences for multi-reference evaluation (``Sequence[Sequence[str]]``)
+
+    As output of ``forward`` and ``compute`` the metric returns the following output:
+
+    - ``score`` (:class:`~torch.Tensor`): A 1D tensor of distances of shape `(num_predictions,)`. For multi-reference input,
+      the output is reduced per original prediction according to `multi_ref_reduction`.
+
+    Args:
+        preds (Union[str, Sequence[str]]): A single predicted sentence or a sequence of predicted sentences.
+        target (Union[str, Sequence[str], Sequence[Sequence[str]]]): A single target sentence, a sequence of target
+            sentences, or a sequence of sequences of target sentences for multiple references per prediction.
+        model_name_or_path: A name or a model path used to load ``transformers`` pretrained model.
+        num_layers: A layer of representation to use.
+        all_layers:
+            An indication of whether the representation from all model's layers should be used.
+            If ``all_layers=True``, the argument ``num_layers`` is ignored.
+        model: A user's own model. Must be of `torch.nn.Module` instance.
+        user_tokenizer:
+            A user's own tokenizer used with the own model. This must be an instance with the ``__call__`` method.
+            This method must take an iterable of sentences (`List[str]`) and must return a python dictionary
+            containing `"input_ids"` and `"attention_mask"` represented by :class:`~torch.Tensor`.
+            It is up to the user's model of whether `"input_ids"` is a :class:`~torch.Tensor` of input ids or embedding
+            vectors. This tokenizer must prepend an equivalent of ``[CLS]`` token and append an equivalent of ``[SEP]``
+            token as ``transformers`` tokenizer does.
+        user_forward_fn:
+            A user's own forward function used in combination with ``model``. This function must take
+            ``model`` and a python dictionary containing ``"input_ids"`` and ``"attention_mask"`` represented
+            by :class:`~torch.Tensor` as an input and return the model's output represented by a single
+            :class:`~torch.Tensor`.
+        verbose: An indication of whether a progress bar to be displayed during the embeddings' calculation.
+        device: A device to be used for calculation.
+        max_length: A maximum length of input sequences. Sequences longer than ``max_length`` are to be trimmed.
+        batch_size: A batch size used for model processing.
+        num_threads: A number of threads to use for a dataloader.
+        n_alpha: The Monte-Carlo parameter for the approximation of the integral over alpha (number of level-set
+            thresholds between ``eps`` and 1.0).
+        eps: The lowest level-set bound in [0, 1]. The highest level set is fixed to 1.0 in this implementation.
+        p: The power of the ground cost.
+        measure: Depth / discrepancy measure to use (e.g. ``"irw"`` or ``"ai_irw"``).
+        truncation: An indication of whether the input sequences should be truncated to the ``max_length``.
+        kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info.
+
+    Example:
+        >>> from pprint import pprint
+        >>> from torchmetrics.text.depth_score import DepthScore
+        >>> preds = ["hello there", "general kenobi"]
+        >>> target = ["hello there", "master kenobi"]
+        >>> depthscore = DepthScore()
+        >>> pprint(depthscore(preds, target))
+        tensor([...])
+
+    Example:
+        >>> from pprint import pprint
+        >>> from torchmetrics.text.depth_score import DepthScore
+        >>> preds = ["hello there", "general kenobi"]
+        >>> target = [["hello there", "master kenobi"], ["hello there", "master kenobi"]]
+        >>> depthscore = DepthScore()
+        >>> pprint(depthscore(preds, target))
+        tensor([...])
+
+    """
+
+    is_differentiable: bool = False
+    higher_is_better: bool = False  # distance
+    full_state_update: bool = False
+    plot_lower_bound: float = 0.0
+    plot_upper_bound: float = 1.0  # not truly bounded; used only for plotting convenience
+
+    preds_input_ids: List[Tensor]
+    preds_attention_mask: List[Tensor]
+    target_input_ids: List[Tensor]
+    target_attention_mask: List[Tensor]
+
+    def __init__(
+        self,
+        model_name_or_path: Optional[str] = None,
+        num_layers: Optional[int] = None,
+        all_layers: bool = False,
+        model: Optional[Module] = None,
+        user_tokenizer: Optional[Any] = None,
+        user_forward_fn: Optional[Callable[[Module, dict[str, Tensor]], Tensor]] = None,
+        verbose: bool = False,
+        device: Optional[Union[str, torch.device]] = None,
+        max_length: int = 512,
+        batch_size: int = 64,
+        num_threads: int = 0,
+        n_alpha: int = 5,
+        eps: float = 0.3,
+        p: int = 5,
+        measure: str = "irw",
+        truncation: bool = False,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(**kwargs)
+
+        if not _TRANSFORMERS_GREATER_EQUAL_4_4 and user_tokenizer is None:
+            raise ModuleNotFoundError(
+                "`DepthScore` metric with default tokenizers requires `transformers` package be installed."
+                " Either install with `pip install transformers>=4.4` or `pip install torchmetrics[text]`."
+            )
+
+        self.model_name_or_path = model_name_or_path or _DEFAULT_MODEL
+        self.num_layers = num_layers
+        self.all_layers = all_layers
+        self.model = model
+        self.user_forward_fn = user_forward_fn
+        self.verbose = verbose
+        self.embedding_device = device
+        self.max_length = max_length
+        self.batch_size = batch_size
+        self.num_threads = num_threads
+        self.n_alpha = n_alpha
+        self.eps = eps
+        self.p = p
+        self.measure = measure
+        self.truncation = truncation
+
+        self.ref_group_boundaries: Optional[List[Tuple[int, int]]] = None  # (re)assigned on every `update` call
+
+        if user_tokenizer is not None:  # identity check, consistent with the guard above; tokenizers may be falsy
+            self.tokenizer = user_tokenizer
+            self.user_tokenizer = True
+        else:
+            from transformers import AutoTokenizer
+
+            if model_name_or_path is None:
+                rank_zero_warn(
+                    "The argument `model_name_or_path` was not specified while it is required when the default"
+                    f" `transformers` model is used. It will use the default recommended model - {_DEFAULT_MODEL!r}."
+                )
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
+            self.user_tokenizer = False
+
+        self.add_state("preds_input_ids", [], dist_reduce_fx="cat")
+        self.add_state("preds_attention_mask", [], dist_reduce_fx="cat")
+        self.add_state("target_input_ids", [], dist_reduce_fx="cat")
+        self.add_state("target_attention_mask", [], dist_reduce_fx="cat")
+
+    def update(
+        self, preds: Union[str, Sequence[str]], target: Union[str, Sequence[str], Sequence[Sequence[str]]]
+    ) -> None:
+        """Tokenize predictions/references and store them for computing DepthScore.
+
+        Sentences are stored in tokenized (tensor) form so that the states can be synchronized in DDP mode.
+        """
+        if isinstance(preds, str):
+            preds = [preds]
+        if isinstance(target, str):
+            target = [target]
+        if not isinstance(preds, list):
+            preds = list(preds)
+        if not isinstance(target, list):
+            target = list(target)
+
+        if len(preds) != len(target):
+            raise ValueError(
+                "Expected number of predicted and reference sentences to be the same, but got "
+                f"{len(preds)} and {len(target)}"
+            )
+
+        if isinstance(preds, list) and len(preds) > 0 and isinstance(target, list) and len(target) > 0:
+            preds, target, self.ref_group_boundaries = _preprocess_multiple_references(preds, target)
+
+        preds_dict, _ = _preprocess_text(
+            preds,
+            self.tokenizer,
+            self.max_length,
+            truncation=self.truncation,
+            sort_according_length=False,
+            own_tokenizer=self.user_tokenizer,
+        )
+        target_dict, _ = _preprocess_text(
+            cast(List[str], target),
+            self.tokenizer,
+            self.max_length,
+            truncation=self.truncation,
+            sort_according_length=False,
+            own_tokenizer=self.user_tokenizer,
+        )
+
+        self.preds_input_ids.append(preds_dict["input_ids"])
+        self.preds_attention_mask.append(preds_dict["attention_mask"])
+        self.target_input_ids.append(target_dict["input_ids"])
+        self.target_attention_mask.append(target_dict["attention_mask"])
+
+    def compute(self) -> Tensor:
+        """Calculate DepthScore."""
+        preds = {
+            "input_ids": dim_zero_cat(self.preds_input_ids),
+            "attention_mask": dim_zero_cat(self.preds_attention_mask),
+        }
+        target = {
+            "input_ids": dim_zero_cat(self.target_input_ids),
+            "attention_mask": dim_zero_cat(self.target_attention_mask),
+        }
+
+        out = depth_score(
+            preds=preds,  # supports dict input (tokenized)
+            target=target,
+            model_name_or_path=self.model_name_or_path,
+            num_layers=self.num_layers,
+            all_layers=self.all_layers,
+            n_alpha=self.n_alpha,
+            eps=self.eps,
+            p=self.p,
+            measure=self.measure,
+            device=self.embedding_device,
+            model=self.model,
+            user_tokenizer=self.tokenizer if self.user_tokenizer else None,
+            user_forward_fn=self.user_forward_fn,
+            max_length=self.max_length,
+            batch_size=self.batch_size,
+            num_threads=self.num_threads,
+            truncation=self.truncation,
+            verbose=self.verbose,
+        )
+
+        # `out` is a 1D tensor of distances, aligned with the flattened references if multi-ref input was used
+        if self.ref_group_boundaries is not None:
+            out = _postprocess_multiple_references_distance(
+                out,
+                self.ref_group_boundaries,
+                reduction="min",   # distance metric → best match is smallest distance
+            )
+
+        return out
+
+    def plot(
+        self, val: Optional[Union[Tensor, Sequence[Tensor]]] = None, ax: Optional[_AX_TYPE] = None
+    ) -> _PLOT_OUT_TYPE:
+        """Plot a single or multiple values from the metric.
+
+        Args:
+            val: Either a single result from calling `metric.forward` or `metric.compute` or a list of these results.
+                If no value is provided, will automatically call `metric.compute` and plot that result.
+            ax: A matplotlib axis object. If provided will add plot to that axis.
+
+        Returns:
+            Figure and Axes object
+
+        Raises:
+            ModuleNotFoundError:
+                If `matplotlib` is not installed
+
+        .. plot::
+            :scale: 75
+
+            >>> # Example plotting a single value
+            >>> from torchmetrics.text.depth_score import DepthScore
+            >>> preds = ["hello there", "general kenobi"]
+            >>> target = ["hello there", "master kenobi"]
+            >>> metric = DepthScore()
+            >>> metric.update(preds, target)
+            >>> fig_, ax_ = metric.plot()
+
+        .. plot::
+            :scale: 75
+
+            >>> # Example plotting multiple values
+            >>> from torch import tensor
+            >>> from torchmetrics.text.depth_score import DepthScore
+            >>> preds = ["hello there", "general kenobi"]
+            >>> target = ["hello there", "master kenobi"]
+            >>> metric = DepthScore()
+            >>> values = []
+            >>> for _ in range(10):
+            ...     val = metric(preds, target)
+            ...     val = val.mean()  # convert into a single scalar
+            ...     values.append(val)
+            >>> fig_, ax_ = metric.plot(values)
+
+        """
+        if val is None:  # default average score across sentences
+            val = self.compute()  # type: ignore
+            val = val.mean()  # type: ignore
+        return self._plot(val, ax)
\ No newline at end of file
diff --git a/tests/unittests/text/test_depth_score.py b/tests/unittests/text/test_depth_score.py
new file mode 100644
index 00000000000..45b7fb87aee
--- /dev/null
+++ b/tests/unittests/text/test_depth_score.py
@@ -0,0 +1,291 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from collections.abc import Sequence
+from functools import partial
+
+import pytest
+import torch
+from torch import Tensor
+
+from torchmetrics.functional.text.depth_score import depth_score
+from torchmetrics.text.depth_score import DepthScore
+from torchmetrics.utilities.imports import _TRANSFORMERS_GREATER_EQUAL_4_4
+from unittests._helpers import (
+    _IS_WINDOWS,
+    _TORCH_LESS_THAN_2_1,
+    _TRANSFORMERS_GREATER_EQUAL_4_54,
+    _TRANSFORMERS_RANGE_GE_4_50_LT_4_54,
+    skip_on_connection_issues,
+)
+from unittests.text._helpers import TextTester
+from unittests.text._inputs import (
+    _inputs_multiple_references,
+    _inputs_single_reference,
+    _inputs_single_sentence_multiple_references,
+)
+
+MODEL_NAME = "albert-base-v2"
+
+# Disable tokenizers parallelism (forking not friendly with parallelism)
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+@skip_on_connection_issues()
+@pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
+def _reference_depth_score(
+    preds: Sequence[str],
+    target: Sequence[str],
+    num_layers: int,
+) -> Tensor:
+    # Reference source code depthscore implementation
+    try:
+        from nlg_eval_via_simi_measures.depth_score import DepthScoreMetric
+    except ImportError:
+        pytest.skip("test requires `nlg_eval_via_simi_measures` to be installed.")
+
+    metric_call = DepthScoreMetric(MODEL_NAME, layers_to_consider=num_layers)
+    out = metric_call.evaluate_batch(list(target), list(preds))
+    return torch.as_tensor(out["depth_score"], dtype=torch.float32)
+
+
+@pytest.mark.parametrize("num_layers", [4, 8])
+@pytest.mark.parametrize(
+    ("preds", "targets"),
+    [(_inputs_single_reference.preds, _inputs_single_reference.target)],
+)
+@pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
+@pytest.mark.xfail(
+    RuntimeError,
+    condition=_TORCH_LESS_THAN_2_1 and _TRANSFORMERS_RANGE_GE_4_50_LT_4_54,
+    reason="could be due to torch compatibility issues with transformers",
+)
+@pytest.mark.xfail(
+    ImportError,
+    condition=_TORCH_LESS_THAN_2_1 and _IS_WINDOWS and _TRANSFORMERS_GREATER_EQUAL_4_54,
+    reason="another strange behaviour of transformers on windows",
+)
+class TestDepthScore(TextTester):
+    """Tests for DepthScore."""
+
+    @pytest.mark.parametrize("ddp", [pytest.param(True, marks=pytest.mark.DDP), False])
+    @skip_on_connection_issues()
+    def test_depthscore_class(self, ddp, preds, targets, num_layers):
+        """Test the depth score class."""
+        metric_args = {
+            "model_name_or_path": MODEL_NAME,
+            "num_layers": num_layers,
+            "device": "cpu",
+            "batch_size": 8,
+            "max_length": 128,
+            "truncation": True, # nlg_eval reference always truncates
+        }
+        reference_depth_score_metric = partial(
+            _reference_depth_score,
+            num_layers=num_layers,
+        )
+
+        self.run_class_metric_test(
+            ddp=ddp,
+            preds=preds,
+            targets=targets,
+            metric_class=DepthScore,
+            reference_metric=reference_depth_score_metric,
+            metric_args=metric_args,
+            check_scriptable=False,  # huggingface transformers are not usually scriptable
+            ignore_order=ddp,  # ignore order of predictions when DDP is used
+        )
+
+    @skip_on_connection_issues()
+    def test_depthscore_functional(self, preds, targets, num_layers):
+        """Test the depthscore functional."""
+        metric_args = {
+            "model_name_or_path": MODEL_NAME,
+            "num_layers": num_layers,
+            "truncation": True, # nlg_eval reference always truncates
+        }
+        reference_depth_score_metric = partial(
+            _reference_depth_score,
+            num_layers=num_layers,
+        )
+
+        self.run_functional_metric_test(
+            preds,
+            targets,
+            metric_functional=depth_score,
+            reference_metric=reference_depth_score_metric,
+            metric_args=metric_args,
+        )
+
+    @skip_on_connection_issues()
+    def test_depthscore_differentiability(self, preds, targets, num_layers):
+        """Test the depthscore differentiability."""
+        metric_args = {
+            "model_name_or_path": MODEL_NAME,
+            "num_layers": num_layers,
+            "truncation": True, # nlg_eval reference always truncates
+        }
+
+        self.run_differentiability_test(
+            preds=preds,
+            targets=targets,
+            metric_module=DepthScore,
+            metric_functional=depth_score,
+            metric_args=metric_args,
+        )
+
+
+@skip_on_connection_issues()
+@pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
+@pytest.mark.xfail(
+    RuntimeError,
+    condition=_TORCH_LESS_THAN_2_1 and _TRANSFORMERS_RANGE_GE_4_50_LT_4_54,
+    reason="could be due to torch compatibility issues with transformers",
+)
+@pytest.mark.xfail(
+    ImportError,
+    condition=_TORCH_LESS_THAN_2_1 and _IS_WINDOWS and _TRANSFORMERS_GREATER_EQUAL_4_54,
+    reason="another strange behaviour of transformers on windows",
+)
+def test_depthscore_sorting():
+    """Test that the internal sorting by length does not shuffle the order of the returned scores."""
+    short = "Short text"
+    long = "This is a longer text"
+
+    preds = [long, long]
+    targets = [long, short]
+
+    metric = DepthScore(model_name_or_path=MODEL_NAME, num_layers=4, device="cpu", batch_size=2, max_length=64)
+    score = metric(preds, targets)
+
+    # First index should be the self-comparison - sorting by length should not shuffle this.
+    # Distance metric: self-comparison should have a smaller distance than mismatched pair.
+    assert score[0] < score[1]
+
+
+@skip_on_connection_issues()
+@pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
+@pytest.mark.xfail(
+    RuntimeError,
+    condition=_TORCH_LESS_THAN_2_1 and _TRANSFORMERS_RANGE_GE_4_50_LT_4_54,
+    reason="could be due to torch compatibility issues with transformers",
+)
+@pytest.mark.xfail(
+    ImportError,
+    condition=_TORCH_LESS_THAN_2_1 and _IS_WINDOWS and _TRANSFORMERS_GREATER_EQUAL_4_54,
+    reason="another strange behaviour of transformers on windows",
+)
+@pytest.mark.parametrize("truncation", [True, False])
+def test_depthscore_truncation(truncation: bool):
+    """Test that DepthScore truncation works as expected."""
+    pred = ["abc " * 2000]
+    gt = ["def " * 2000]
+    metric = DepthScore(model_name_or_path=MODEL_NAME, num_layers=4, device="cpu", batch_size=1, max_length=64, truncation=truncation)
+
+    if truncation:
+        res = metric(pred, gt)
+        # Should produce a finite tensor (not error). Value itself is not bounded.
+        assert torch.isfinite(res).all()
+    else:
+        with pytest.raises(RuntimeError, match="The expanded size of the tensor.*must match.*"):
+            metric(pred, gt)
+
+
+@skip_on_connection_issues()
+@pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
+@pytest.mark.xfail(
+    RuntimeError,
+    condition=_TORCH_LESS_THAN_2_1 and _TRANSFORMERS_RANGE_GE_4_50_LT_4_54,
+    reason="could be due to torch compatibility issues with transformers",
+)
+@pytest.mark.xfail(
+    ImportError,
+    condition=_TORCH_LESS_THAN_2_1 and _IS_WINDOWS and _TRANSFORMERS_GREATER_EQUAL_4_54,
+    reason="another strange behaviour of transformers on windows",
+)
+def test_depthscore_single_str_input():
+    """Test if DepthScore works with single string preds and target."""
+    preds = "hello there"
+    target = "hello there"
+
+    metric = DepthScore(model_name_or_path=MODEL_NAME, num_layers=4, device="cpu", batch_size=1, max_length=64)
+    score_class = metric(preds, target)
+
+    # Distance for identical text should be smaller than for different text.
+    score_class_ident = score_class.item()
+
+    score_functional = depth_score(preds, target, model_name_or_path=MODEL_NAME, num_layers=4, device="cpu", batch_size=1, max_length=64)
+    score_func_ident = score_functional.item()
+
+    assert score_class_ident == pytest.approx(score_func_ident, abs=1e-6)
+
+    # Compare to a different target to assert "identical is better"
+    score_diff = metric("hello there", "general kenobi").item()
+    assert score_class_ident <= score_diff
+
+
+@pytest.mark.parametrize(
+    ("preds", "target"),
+    [
+        (
+            _inputs_single_sentence_multiple_references.preds,
+            _inputs_single_sentence_multiple_references.target,
+        ),
+        (
+            ["hello there", "I'm in the middle", "general kenobi"],
+            (["hello there", "master kenobi"], "I'm here", ("hello there", "master kenobi")),
+        ),
+    ],
+)
+@skip_on_connection_issues()
+@pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
+@pytest.mark.xfail(
+    RuntimeError,
+    condition=_TORCH_LESS_THAN_2_1 and _TRANSFORMERS_RANGE_GE_4_50_LT_4_54,
+    reason="could be due to torch compatibility issues with transformers",
+)
+@pytest.mark.xfail(
+    ImportError,
+    condition=_TORCH_LESS_THAN_2_1 and _IS_WINDOWS and _TRANSFORMERS_GREATER_EQUAL_4_54,
+    reason="another strange behaviour of transformers on windows",
+)
+def test_depthscore_multiple_references(preds, target):
+    """Test both functional and class APIs with multiple references."""
+    # Both the functional and the class API return a 1D tensor of distances.
+    result_func = depth_score(preds, target)
+    metric = DepthScore()
+    result_class = metric(preds, target)
+
+    # They should match exactly (same code path), and output should be per-pred after reduction (min across refs).
+    assert torch.allclose(result_func, result_class, atol=1e-6)
+
+    # Sanity: output length equals number of predictions (not flattened refs)
+    if isinstance(preds, str):
+        assert result_func.numel() == 1
+    else:
+        assert result_func.numel() == len(preds)
+
+
+@pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
+def test_depthscore_invalid_references():
+    """Test both functional and class APIs with invalid references."""
+    preds = _inputs_multiple_references.preds
+    target = _inputs_multiple_references.target
+
+    with pytest.raises(ValueError, match="Invalid input provided."):
+        depth_score(preds, target)
+
+    metric = DepthScore()
+    with pytest.raises(ValueError, match="Invalid input provided."):
+        metric(preds, target)
\ No newline at end of file

From c8be46a12866055d1cd17c22d62267707a573c18 Mon Sep 17 00:00:00 2001
From: Sohaib-Ahmed21 <sohaibahmed1919@gmail.com>
Date: Sun, 25 Jan 2026 11:00:53 -0800
Subject: [PATCH 02/11] Fix pre-commit failures and add reduction param to main
 interface

---
 .../functional/text/depth_score.py            | 148 +++++++++---------
 src/torchmetrics/text/__init__.py             |   2 +-
 src/torchmetrics/text/depth_score.py          |  48 +++---
 tests/unittests/text/test_depth_score.py      |  27 +++-
 4 files changed, 119 insertions(+), 106 deletions(-)

diff --git a/src/torchmetrics/functional/text/depth_score.py b/src/torchmetrics/functional/text/depth_score.py
index d5f310526de..3af211359c9 100644
--- a/src/torchmetrics/functional/text/depth_score.py
+++ b/src/torchmetrics/functional/text/depth_score.py
@@ -11,26 +11,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import annotations
-
 import logging
 from collections.abc import Iterator, Sequence
 from contextlib import contextmanager
 from typing import Any, Callable, List, Optional, Tuple, Union, cast
 
+import geomloss
 import numpy as np
+
+# DepthScore deps
+import ot  # pip install POT  # codespell:ignore ot
 import torch
+from sklearn.covariance import MinCovDet as MCD  # noqa: N817
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import normalize
 from torch import Tensor
 from torch.nn import Module
 from torch.utils.data import DataLoader
 
-# DepthScore deps
-import ot  # pip install POT
-import geomloss
-from sklearn.preprocessing import normalize
-from sklearn.covariance import MinCovDet as MCD
-from sklearn.decomposition import PCA
-
 # TorchMetrics text helpers (same style as BERTScore)
 from torchmetrics.functional.text.helper_embedding_metric import (
     TextDataset,
@@ -102,7 +100,7 @@ def _preprocess_multiple_references(
     Raises:
         ValueError:
             If `preds` is not a list of strings.
-            
+
     """
     if not all(isinstance(item, str) for item in preds):
         raise ValueError("Invalid input provided.")
@@ -181,70 +179,72 @@ def _postprocess_multiple_references_distance(
     return torch.stack(out, dim=0)
 
 
-def cov_matrix(X: np.ndarray, robust: bool = False) -> np.ndarray:
+def cov_matrix(x: np.ndarray, robust: bool = False) -> np.ndarray:
     """Covariance matrix (optionally robust)."""
     if robust:
-        return MCD().fit(X).covariance_
-    return np.cov(X.T)
+        return MCD().fit(x).covariance_
+    return np.cov(x.T)
 
 
-def standardize(X: np.ndarray, robust: bool = False) -> np.ndarray:
+def standardize(x: np.ndarray, robust: bool = False) -> np.ndarray:
     """Affine standardization using inverse sqrt covariance."""
-    sigma = cov_matrix(X, robust)
-    _, n_features = X.shape
-    rank = np.linalg.matrix_rank(X)
+    sigma = cov_matrix(x, robust)
+    _, n_features = x.shape
+    rank = np.linalg.matrix_rank(x)
 
     if rank < n_features:
-        X = PCA(rank).fit_transform(X)
-        sigma = cov_matrix(X)
+        x = PCA(rank).fit_transform(x)
+        sigma = cov_matrix(x)
 
     u, s, _ = np.linalg.svd(sigma)
     square_inv = u / np.sqrt(s)
-    return X @ square_inv
+    return x @ square_inv
 
 
 def sampled_sphere(n_dirs: int, d: int) -> np.ndarray:
     """Uniform samples on unit sphere."""
-    U = np.random.multivariate_normal(np.zeros(d), np.eye(d), size=n_dirs)
-    return normalize(U)
+    u = np.random.multivariate_normal(np.zeros(d), np.eye(d), size=n_dirs)
+    return normalize(u)
 
 
-def Wasserstein(X: np.ndarray, Y: np.ndarray) -> float:
-    """OT cost with uniform weights."""
-    M = ot.dist(X, Y)
-    wX = np.ones(len(X)) / len(X)
-    wY = np.ones(len(Y)) / len(Y)
-    return float(ot.emd2(wX, wY, M))
+def wasserstein(x: np.ndarray, y: np.ndarray) -> float:
+    """Optimal transport cost with uniform weights."""
+    m = ot.dist(x, y)  # codespell:ignore ot
+    w_x = np.ones(len(x)) / len(x)
+    w_y = np.ones(len(y)) / len(y)
+    return float(ot.emd2(w_x, w_y, m))  # codespell:ignore ot
 
 
-def SW(X: np.ndarray, Y: np.ndarray, ndirs: int, p: int = 2) -> float:
+def sw(x: np.ndarray, y: np.ndarray, ndirs: int, p: int = 2) -> float:
     """Sliced Wasserstein distance."""
-    n, d = X.shape
-    U = sampled_sphere(ndirs, d)
-    Zx = X @ U.T
-    Zy = Y @ U.T
+    n, d = x.shape
+    u = sampled_sphere(ndirs, d)
+    z_x = x @ u.T
+    z_y = y @ u.T
     sliced = np.zeros(ndirs)
     for k in range(ndirs):
-        sliced[k] = ot.emd2_1d(Zx[:, k], Zy[:, k], p=2)
+        sliced[k] = ot.emd2_1d(z_x[:, k], z_y[:, k], p=2)  # codespell:ignore ot
     return float((np.mean(sliced)) ** (1 / p))
 
 
-def MMD(X: np.ndarray, Y: np.ndarray) -> float:
+def mmd(x: np.ndarray, y: np.ndarray) -> float:
     """Gaussian MMD via geomloss."""
-    return float(geomloss.SamplesLoss("gaussian")(torch.tensor(X), torch.tensor(Y)).item())
+    return float(geomloss.SamplesLoss("gaussian")(torch.tensor(x), torch.tensor(y)).item())
 
 
-def ai_irw(X: np.ndarray, AI: bool = True, robust: bool = False, n_dirs: Optional[int] = None, random_state: int = 0) -> np.ndarray:
+def ai_irw(
+    x: np.ndarray, ai: bool = True, robust: bool = False, n_dirs: Optional[int] = None, random_state: int = 0
+) -> np.ndarray:
     """(Affine-invariant) integrated rank-weighted depth."""
     np.random.seed(random_state)
-    if AI:
-        X = standardize(X, robust)
+    if ai:
+        x = standardize(x, robust)
 
-    n, d = X.shape
+    n, d = x.shape
     n_dirs = d * 100 if n_dirs is None else n_dirs
 
-    U = sampled_sphere(n_dirs, d)
-    proj = X @ U.T
+    u = sampled_sphere(n_dirs, d)
+    proj = x @ u.T
     ranks = np.argsort(proj, axis=0)
 
     depth = np.zeros_like(proj)
@@ -257,8 +257,8 @@ def ai_irw(X: np.ndarray, AI: bool = True, robust: bool = False, n_dirs: Optiona
 
 
 def dr_distance(
-    X: np.ndarray,
-    Y: np.ndarray,
+    x: np.ndarray,
+    y: np.ndarray,
     n_alpha: int = 5,
     n_dirs: int = 10000,
     data_depth: str = "irw",
@@ -270,15 +270,15 @@ def dr_distance(
     """Compute the depth-based pseudo-metric between two point clouds.
 
     This function implements the DepthScore "DR distance" between two empirical
-    distributions represented by token-embedding point clouds `X` and `Y`. The distance
+    distributions represented by token-embedding point clouds `x` and `y`. The distance
     is computed by (1) choosing a data depth / distributional discrepancy backend
     (e.g., IRW depth, affine-invariant IRW, Wasserstein, sliced Wasserstein, or MMD),
     and (2) integrating over depth level sets between `eps_min` and `eps_max`, while
     approximating the supremum over directions on the unit sphere by Monte Carlo.
 
     Args:
-        X: Array of shape `(n_samples, n_features)` representing the first point cloud.
-        Y: Array of shape `(n_samples, n_features)` representing the second point cloud.
+        x: Array of shape `(n_samples, n_features)` representing the first point cloud.
+        y: Array of shape `(n_samples, n_features)` representing the second point cloud.
         n_alpha: Monte-Carlo parameter controlling the approximation of the integral
             over alpha (number of level-set thresholds between `eps_min` and `eps_max`).
         n_dirs: Number of random directions used to approximate the supremum over the
@@ -286,7 +286,7 @@ def dr_distance(
         data_depth: Depth / discrepancy measure to use. One of
             `{"irw", "ai_irw", "wasserstein", "sliced", "mmd"}`.
             - `"irw"` / `"ai_irw"` compute depth values and then integrate level sets.
-            - `"wasserstein"` returns the (unsliced) OT cost directly.
+            - `"wasserstein"` returns the (unsliced) optimal transport cost directly.
             - `"sliced"` returns the sliced Wasserstein distance directly.
             - `"mmd"` returns the Gaussian MMD directly.
         eps_min: Lower level-set bound in `[0, eps_max]` (lowest alpha / quantile level).
@@ -308,39 +308,39 @@ def dr_distance(
     np.random.seed(random_state)
 
     if data_depth == "irw":
-        depth_X = ai_irw(X, AI=False, n_dirs=n_dirs, random_state=random_state)
-        depth_Y = ai_irw(Y, AI=False, n_dirs=n_dirs, random_state=random_state)
+        depth_x = ai_irw(x, ai=False, n_dirs=n_dirs, random_state=random_state)
+        depth_y = ai_irw(y, ai=False, n_dirs=n_dirs, random_state=random_state)
     elif data_depth == "ai_irw":
-        depth_X = ai_irw(X, AI=True, n_dirs=n_dirs, random_state=random_state)
-        depth_Y = ai_irw(Y, AI=True, n_dirs=n_dirs, random_state=random_state)
+        depth_x = ai_irw(x, ai=True, n_dirs=n_dirs, random_state=random_state)
+        depth_y = ai_irw(y, ai=True, n_dirs=n_dirs, random_state=random_state)
     elif data_depth == "wasserstein":
-        return Wasserstein(X, Y)
+        return wasserstein(x, y)
     elif data_depth == "sliced":
-        return SW(X, Y, ndirs=n_dirs)
+        return sw(x, y, ndirs=n_dirs)
     elif data_depth == "mmd":
-        return MMD(X, Y)
+        return mmd(x, y)
     else:
         raise ValueError("Unsupported depth")
 
     if not (0.0 <= eps_min <= eps_max <= 1.0):
         raise ValueError("Expected 0 <= eps_min <= eps_max <= 1")
 
-    _, d = X.shape
-    U = sampled_sphere(n_dirs, d)
-    proj_X = X @ U.T
-    proj_Y = Y @ U.T
+    _, d = x.shape
+    u = sampled_sphere(n_dirs, d)
+    proj_x = x @ u.T
+    proj_y = y @ u.T
 
     alphas = np.linspace(int(eps_min * 100), int(eps_max * 100), n_alpha)
-    qX = [np.percentile(depth_X, a) for a in alphas]
-    qY = [np.percentile(depth_Y, a) for a in alphas]
+    q_x = [np.percentile(depth_x, a) for a in alphas]
+    q_y = [np.percentile(depth_y, a) for a in alphas]
 
     score = 0.0
     for i in range(n_alpha):
-        idx_X = np.where(depth_X >= qX[i])[0]
-        idx_Y = np.where(depth_Y >= qY[i])[0]
-        supp_X = np.max(proj_X[idx_X], axis=0)
-        supp_Y = np.max(proj_Y[idx_Y], axis=0)
-        score += float(np.max((supp_X - supp_Y) ** p))
+        idx_x = np.where(depth_x >= q_x[i])[0]
+        idx_y = np.where(depth_y >= q_y[i])[0]
+        supp_x = np.max(proj_x[idx_x], axis=0)
+        supp_y = np.max(proj_y[idx_y], axis=0)
+        score += float(np.max((supp_x - supp_y) ** p))
 
     return float((score / n_alpha) ** (1 / p))
 
@@ -522,7 +522,6 @@ def depth_score(
         tensor([...])
 
     """
-
     ref_group_boundaries: Optional[List[Tuple[int, int]]] = None
 
     if isinstance(preds, str):
@@ -539,7 +538,7 @@ def depth_score(
             "Expected number of predicted and reference sentences to be the same, but got"
             f" {len(preds)} and {len(target)}"
         )
-    
+
     if isinstance(preds, list) and len(preds) > 0 and isinstance(target, list) and len(target) > 0:
         preds, target, ref_group_boundaries = _preprocess_multiple_references(preds, target)
 
@@ -608,7 +607,6 @@ def depth_score(
         preds_dataset = TextDataset(preds, tokenizer, max_length, truncation=truncation)
 
     elif _are_valid_tensors:
-
         target_dataset = TokenizedDataset(**cast(dict, target))
         preds_dataset = TokenizedDataset(**cast(dict, preds))
     else:
@@ -652,17 +650,17 @@ def depth_score(
         pm = preds_mask[i].bool()
         tm = target_mask[i].bool()
 
-        X = preds_embeddings[i, 0, pm, :].numpy()
-        Y = target_embeddings[i, 0, tm, :].numpy()
+        x = preds_embeddings[i, 0, pm, :].numpy()
+        y = target_embeddings[i, 0, tm, :].numpy()
 
-        if X.shape[0] == 0 or Y.shape[0] == 0:
+        if x.shape[0] == 0 or y.shape[0] == 0:
             distances.append(float("inf"))
             continue
 
         distances.append(
             dr_distance(
-                X,
-                Y,
+                x,
+                y,
                 n_alpha=n_alpha,
                 n_dirs=n_dirs,
                 data_depth=measure,
@@ -679,4 +677,4 @@ def depth_score(
     if ref_group_boundaries is not None:
         out = _postprocess_multiple_references_distance(out, ref_group_boundaries, reduction=multi_ref_reduction)
 
-    return out
\ No newline at end of file
+    return out
diff --git a/src/torchmetrics/text/__init__.py b/src/torchmetrics/text/__init__.py
index 9b9767f7ec5..ca297acc152 100644
--- a/src/torchmetrics/text/__init__.py
+++ b/src/torchmetrics/text/__init__.py
@@ -49,4 +49,4 @@
     from torchmetrics.text.depth_score import DepthScore
     from torchmetrics.text.infolm import InfoLM
 
-    __all__ += ["BERTScore", "DepthScore", "InfoLM"]
\ No newline at end of file
+    __all__ += ["BERTScore", "DepthScore", "InfoLM"]
diff --git a/src/torchmetrics/text/depth_score.py b/src/torchmetrics/text/depth_score.py
index 2280f505ea5..4d6a55c4ead 100644
--- a/src/torchmetrics/text/depth_score.py
+++ b/src/torchmetrics/text/depth_score.py
@@ -11,28 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import annotations
-
 from collections.abc import Sequence
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Callable, List, Optional, Tuple, Union, cast
 
 import torch
 from torch import Tensor
 from torch.nn import Module
 
+from torchmetrics.functional.text.depth_score import (
+    _postprocess_multiple_references_distance,
+    _preprocess_multiple_references,
+    depth_score,
+)
 from torchmetrics.functional.text.helper_embedding_metric import _preprocess_text
 from torchmetrics.metric import Metric
 from torchmetrics.utilities import rank_zero_warn
 from torchmetrics.utilities.checks import _SKIP_SLOW_DOCTEST, _try_proceed_with_timeout
+from torchmetrics.utilities.data import dim_zero_cat
 from torchmetrics.utilities.imports import _MATPLOTLIB_AVAILABLE, _TRANSFORMERS_GREATER_EQUAL_4_4
 from torchmetrics.utilities.plot import _AX_TYPE, _PLOT_OUT_TYPE
-from torchmetrics.utilities.data import dim_zero_cat
-
-from torchmetrics.functional.text.depth_score import (
-    _postprocess_multiple_references_distance,
-    _preprocess_multiple_references,
-    depth_score,
-)
 
 if not _MATPLOTLIB_AVAILABLE:
     __doctest_skip__ = ["DepthScore.plot"]
@@ -54,19 +51,16 @@ def _download_model_for_depth_score() -> None:
     __doctest_skip__ = ["DepthScore", "DepthScore.plot"]
 
 
-def _get_input_dict(input_ids: List[Tensor], attention_mask: List[Tensor]) -> dict[str, Tensor]:
-    """Create an input dictionary of ``input_ids`` and ``attention_mask`` for DepthScore calculation."""
-    return {"input_ids": torch.cat(input_ids), "attention_mask": torch.cat(attention_mask)}
-
-
 class DepthScore(Metric):
     """`DepthScore Evaluating Text Generation`_ for measuring text similarity.
 
-    DepthScore leverages pre-trained contextual token embeddings (e.g., from BERT-like models) and compares candidate and
-    reference sentences by treating their token embeddings as point clouds and computing a depth-based pseudo-metric
-    between the two distributions. This distance is designed to capture distributional mismatches between contextual
-    representations and can be used for evaluating text generation tasks where *lower* distance indicates a better match.
-    This implementation follows the reference DepthScore formulation introduced by Colombo et al. and mirrors the
+    DepthScore leverages pre-trained contextual token embeddings (e.g., from BERT-like models) and compares
+    candidate and reference sentences by treating their token embeddings as point clouds and computing a
+    depth-based pseudo-metric between the two distributions. This distance is designed to capture distributional
+    mismatches between contextual representations and can be used for evaluating text generation tasks where
+    *lower* distance indicates a better match.
+
+    This implementation follows the reference DepthScore formulation introduced by ``Colombo et al.`` and mirrors the
     TorchMetrics-style API used by embedding-based text metrics.
 
     As input to ``forward`` and ``update`` the metric accepts the following input:
@@ -84,8 +78,8 @@ class DepthScore(Metric):
 
     As output of ``forward`` and ``compute`` the metric returns the following output:
 
-    - ``score`` (:class:`~torch.Tensor`): A 1D tensor of distances of shape `(num_predictions,)`. For multi-reference input,
-      the output is reduced per original prediction according to `multi_ref_reduction`.
+    - ``score`` (:class:`~torch.Tensor`): A 1D tensor of distances of shape `(num_predictions,)`. For multi-reference
+      input, the output is reduced per original prediction according to `multi_ref_reduction`.
 
     Args:
         preds (Union[str, Sequence[str]]): A single predicted sentence or a sequence of predicted sentences.
@@ -120,6 +114,8 @@ class DepthScore(Metric):
         p: The power of the ground cost.
         measure: Depth / discrepancy measure to use (e.g. ``"irw"`` or ``"ai_irw"``).
         truncation: An indication of whether the input sequences should be truncated to the ``max_length``.
+        multi_ref_reduction: Reduction to apply across multiple references per prediction.
+            Default ``"min"`` (best match) since this is a distance metric. Options: ``"min"``, ``"max"``, ``"mean"``.
         kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info.
 
     Example:
@@ -171,6 +167,7 @@ def __init__(
         p: int = 5,
         measure: str = "irw",
         truncation: bool = False,
+        multi_ref_reduction: str = "min",
         **kwargs: Any,
     ) -> None:
         super().__init__(**kwargs)
@@ -196,6 +193,7 @@ def __init__(
         self.p = p
         self.measure = measure
         self.truncation = truncation
+        self.multi_ref_reduction = multi_ref_reduction
 
         self.ref_group_boundaries: Optional[List[Tuple[int, int]]] = None
 
@@ -224,6 +222,7 @@ def update(
         """Store predictions/references for computing DepthScore.
 
         It is necessary to store sentences in a tokenized form to ensure the DDP mode working.
+
         """
         if isinstance(preds, str):
             preds = [preds]
@@ -295,6 +294,7 @@ def compute(self) -> Tensor:
             num_threads=self.num_threads,
             truncation=self.truncation,
             verbose=self.verbose,
+            multi_ref_reduction=self.multi_ref_reduction,
         )
 
         # out expected: {"depth_score": Tensor} aligned with flattened refs if multi-ref used
@@ -302,7 +302,7 @@ def compute(self) -> Tensor:
             out = _postprocess_multiple_references_distance(
                 out,
                 self.ref_group_boundaries,
-                reduction="min",   # distance metric → best match is smallest distance
+                reduction=self.multi_ref_reduction,
             )
 
         return out
@@ -355,4 +355,4 @@ def plot(
         if val is None:  # default average score across sentences
             val = self.compute()  # type: ignore
             val = val.mean()  # type: ignore
-        return self._plot(val, ax)
\ No newline at end of file
+        return self._plot(val, ax)
diff --git a/tests/unittests/text/test_depth_score.py b/tests/unittests/text/test_depth_score.py
index 45b7fb87aee..3beb4b3e44f 100644
--- a/tests/unittests/text/test_depth_score.py
+++ b/tests/unittests/text/test_depth_score.py
@@ -89,7 +89,7 @@ def test_depthscore_class(self, ddp, preds, targets, num_layers):
             "device": "cpu",
             "batch_size": 8,
             "max_length": 128,
-            "truncation": True, # nlg_eval reference always truncates
+            "truncation": True,  # nlg_eval reference always truncates
         }
         reference_depth_score_metric = partial(
             _reference_depth_score,
@@ -113,7 +113,7 @@ def test_depthscore_functional(self, preds, targets, num_layers):
         metric_args = {
             "model_name_or_path": MODEL_NAME,
             "num_layers": num_layers,
-            "truncation": True, # nlg_eval reference always truncates
+            "truncation": True,  # nlg_eval reference always truncates
         }
         reference_depth_score_metric = partial(
             _reference_depth_score,
@@ -134,7 +134,7 @@ def test_depthscore_differentiability(self, preds, targets, num_layers):
         metric_args = {
             "model_name_or_path": MODEL_NAME,
             "num_layers": num_layers,
-            "truncation": True, # nlg_eval reference always truncates
+            "truncation": True,  # nlg_eval reference always truncates
         }
 
         self.run_differentiability_test(
@@ -191,7 +191,14 @@ def test_depthscore_truncation(truncation: bool):
     """Test that DepthScore truncation works as expected."""
     pred = ["abc " * 2000]
     gt = ["def " * 2000]
-    metric = DepthScore(model_name_or_path=MODEL_NAME, num_layers=4, device="cpu", batch_size=1, max_length=64, truncation=truncation)
+    metric = DepthScore(
+        model_name_or_path=MODEL_NAME,
+        num_layers=4,
+        device="cpu",
+        batch_size=1,
+        max_length=64,
+        truncation=truncation,
+    )
 
     if truncation:
         res = metric(pred, gt)
@@ -225,7 +232,15 @@ def test_depthscore_single_str_input():
     # Distance for identical text should be smaller than for different text.
     score_class_ident = score_class.item()
 
-    score_functional = depth_score(preds, target, model_name_or_path=MODEL_NAME, num_layers=4, device="cpu", batch_size=1, max_length=64)
+    score_functional = depth_score(
+        preds,
+        target,
+        model_name_or_path=MODEL_NAME,
+        num_layers=4,
+        device="cpu",
+        batch_size=1,
+        max_length=64,
+    )
     score_func_ident = score_functional.item()
 
     assert score_class_ident == pytest.approx(score_func_ident, abs=1e-6)
@@ -288,4 +303,4 @@ def test_depthscore_invalid_references():
 
     metric = DepthScore()
     with pytest.raises(ValueError, match="Invalid input provided."):
-        metric(preds, target)
\ No newline at end of file
+        metric(preds, target)

From b57be3f040c1bc15c4243c0bc52c7125678af05f Mon Sep 17 00:00:00 2001
From: Sohaib-Ahmed21 <sohaibahmed1919@gmail.com>
Date: Sun, 25 Jan 2026 12:48:39 -0800
Subject: [PATCH 03/11] Rename measure param to depth_measure and provide test
 coverage for all depth measures

---
 src/torchmetrics/functional/text/depth_score.py | 10 +++++++---
 src/torchmetrics/text/depth_score.py            |  8 ++++----
 tests/unittests/text/test_depth_score.py        | 15 +++++++++++----
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/src/torchmetrics/functional/text/depth_score.py b/src/torchmetrics/functional/text/depth_score.py
index 3af211359c9..2606c5edb84 100644
--- a/src/torchmetrics/functional/text/depth_score.py
+++ b/src/torchmetrics/functional/text/depth_score.py
@@ -307,6 +307,10 @@ def dr_distance(
     """
     np.random.seed(random_state)
 
+    # Match reference numerics: many reference code paths end up in float64.
+    x = np.asarray(x, dtype=np.float64)
+    y = np.asarray(y, dtype=np.float64)
+
     if data_depth == "irw":
         depth_x = ai_irw(x, ai=False, n_dirs=n_dirs, random_state=random_state)
         depth_y = ai_irw(y, ai=False, n_dirs=n_dirs, random_state=random_state)
@@ -445,7 +449,7 @@ def depth_score(
     n_dirs: int = 10000,
     eps: float = 0.3,
     p: int = 5,
-    measure: str = "irw",
+    depth_measure: str = "irw",
     # Multi-ref postprocess for a distance metric (best = min by default)
     multi_ref_reduction: str = "min",
 ) -> Tensor:
@@ -486,7 +490,7 @@ def depth_score(
         n_dirs: Number of random projection directions used by depth/sliced computations.
         eps: Lower quantile bound (eps_min) used in the depth distance integration (upper bound fixed at 1.0).
         p: Power used in the distance aggregation.
-        measure: Depth/distance backend to use. One of:
+        depth_measure: Depth/distance backend to use. One of:
             `"irw"`, `"ai_irw"`, `"wasserstein"`, `"sliced"`, `"mmd"`.
         multi_ref_reduction: Reduction to apply across multiple references per prediction.
             Default `"min"` (best match) since this is a distance metric.
@@ -663,7 +667,7 @@ def depth_score(
                 y,
                 n_alpha=n_alpha,
                 n_dirs=n_dirs,
-                data_depth=measure,
+                data_depth=depth_measure,
                 eps_min=eps,
                 eps_max=1.0,
                 p=p,
diff --git a/src/torchmetrics/text/depth_score.py b/src/torchmetrics/text/depth_score.py
index 4d6a55c4ead..db90e865cf1 100644
--- a/src/torchmetrics/text/depth_score.py
+++ b/src/torchmetrics/text/depth_score.py
@@ -112,7 +112,7 @@ class DepthScore(Metric):
             thresholds between ``eps`` and 1.0).
         eps: The lowest level-set bound in [0, 1]. The highest level set is fixed to 1.0 in this implementation.
         p: The power of the ground cost.
-        measure: Depth / discrepancy measure to use (e.g. ``"irw"`` or ``"ai_irw"``).
+        depth_measure: Depth / discrepancy measure to use (e.g. ``"irw"`` or ``"ai_irw"``).
         truncation: An indication of whether the input sequences should be truncated to the ``max_length``.
         multi_ref_reduction: Reduction to apply across multiple references per prediction.
             Default ``"min"`` (best match) since this is a distance metric. Options: ``"min"``, ``"max"``, ``"mean"``.
@@ -165,7 +165,7 @@ def __init__(
         n_alpha: int = 5,
         eps: float = 0.3,
         p: int = 5,
-        measure: str = "irw",
+        depth_measure: str = "irw",
         truncation: bool = False,
         multi_ref_reduction: str = "min",
         **kwargs: Any,
@@ -191,7 +191,7 @@ def __init__(
         self.n_alpha = n_alpha
         self.eps = eps
         self.p = p
-        self.measure = measure
+        self.depth_measure = depth_measure
         self.truncation = truncation
         self.multi_ref_reduction = multi_ref_reduction
 
@@ -284,7 +284,7 @@ def compute(self) -> Tensor:
             n_alpha=self.n_alpha,
             eps=self.eps,
             p=self.p,
-            measure=self.measure,
+            depth_measure=self.depth_measure,
             device=self.embedding_device if self.embedding_device is not None else None,
             model=self.model,
             user_tokenizer=self.tokenizer if self.user_tokenizer else None,
diff --git a/tests/unittests/text/test_depth_score.py b/tests/unittests/text/test_depth_score.py
index 3beb4b3e44f..93d83799a96 100644
--- a/tests/unittests/text/test_depth_score.py
+++ b/tests/unittests/text/test_depth_score.py
@@ -48,6 +48,7 @@ def _reference_depth_score(
     preds: Sequence[str],
     target: Sequence[str],
     num_layers: int,
+    depth_measure: str = "irw",
 ) -> Tensor:
     # Reference source code depthscore implementation
     try:
@@ -55,12 +56,13 @@ def _reference_depth_score(
     except ImportError:
         pytest.skip("test requires `nlg_eval_via_simi_measures` to be installed.")
 
-    metric_call = DepthScoreMetric(MODEL_NAME, layers_to_consider=num_layers)
+    metric_call = DepthScoreMetric(MODEL_NAME, layers_to_consider=num_layers, considered_measure=depth_measure)
     out = metric_call.evaluate_batch(list(target), list(preds))
     return torch.as_tensor(out["depth_score"], dtype=torch.float32)
 
 
 @pytest.mark.parametrize("num_layers", [4, 8])
+@pytest.mark.parametrize("depth_measure", ["irw", "ai_irw", "sliced", "wasserstein", "mmd"])
 @pytest.mark.parametrize(
     ("preds", "targets"),
     [(_inputs_single_reference.preds, _inputs_single_reference.target)],
@@ -81,11 +83,12 @@ class TestDepthScore(TextTester):
 
     @pytest.mark.parametrize("ddp", [pytest.param(True, marks=pytest.mark.DDP), False])
     @skip_on_connection_issues()
-    def test_depthscore_class(self, ddp, preds, targets, num_layers):
+    def test_depthscore_class(self, ddp, preds, targets, num_layers, depth_measure):
         """Test the depth score class."""
         metric_args = {
             "model_name_or_path": MODEL_NAME,
             "num_layers": num_layers,
+            "depth_measure": depth_measure,
             "device": "cpu",
             "batch_size": 8,
             "max_length": 128,
@@ -94,6 +97,7 @@ def test_depthscore_class(self, ddp, preds, targets, num_layers):
         reference_depth_score_metric = partial(
             _reference_depth_score,
             num_layers=num_layers,
+            depth_measure=depth_measure,
         )
 
         self.run_class_metric_test(
@@ -108,16 +112,18 @@ def test_depthscore_class(self, ddp, preds, targets, num_layers):
         )
 
     @skip_on_connection_issues()
-    def test_depthscore_functional(self, preds, targets, num_layers):
+    def test_depthscore_functional(self, preds, targets, num_layers, depth_measure):
         """Test the depthscore functional."""
         metric_args = {
             "model_name_or_path": MODEL_NAME,
             "num_layers": num_layers,
+            "depth_measure": depth_measure,
             "truncation": True,  # nlg_eval reference always truncates
         }
         reference_depth_score_metric = partial(
             _reference_depth_score,
             num_layers=num_layers,
+            depth_measure=depth_measure,
         )
 
         self.run_functional_metric_test(
@@ -129,11 +135,12 @@ def test_depthscore_functional(self, preds, targets, num_layers):
         )
 
     @skip_on_connection_issues()
-    def test_depthscore_differentiability(self, preds, targets, num_layers):
+    def test_depthscore_differentiability(self, preds, targets, num_layers, depth_measure):
         """Test the depthscore differentiability."""
         metric_args = {
             "model_name_or_path": MODEL_NAME,
             "num_layers": num_layers,
+            "depth_measure": depth_measure,
             "truncation": True,  # nlg_eval reference always truncates
         }
 

From 0b7a8f59b51427fbe727b23fdee358348950509f Mon Sep 17 00:00:00 2001
From: Sohaib-Ahmed21 <sohaibahmed1919@gmail.com>
Date: Sun, 25 Jan 2026 13:07:32 -0800
Subject: [PATCH 04/11] Add reference to original implementation properly

---
 docs/source/links.rst                           | 1 +
 src/torchmetrics/functional/text/depth_score.py | 7 +------
 src/torchmetrics/text/depth_score.py            | 3 +--
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/docs/source/links.rst b/docs/source/links.rst
index 95d5bd19712..0bdde6a0a91 100644
--- a/docs/source/links.rst
+++ b/docs/source/links.rst
@@ -51,6 +51,7 @@
 .. _BERT_score: https://github.com/Tiiiger/bert_score/blob/master/bert_score/utils.py
 .. _Bert_score Evaluating Text Generation: https://arxiv.org/abs/1904.09675
 .. _DepthScore Evaluating Text Generation: https://arxiv.org/abs/2103.12711
+.. _DEPTH_score: https://github.com/PierreColombo/nlg_eval_via_simi_measures/blob/main/nlg_eval_via_simi_measures/depth_score.py
 .. _BLEU score: https://en.wikipedia.org/wiki/BLEU
 .. _BLEU: https://www.semanticscholar.org/paper/Bleu%3A-a-Method-for-Automatic-Evaluation-of-Machine-Papineni-Roukos/d7da009f457917aa381619facfa5ffae9329a6e9
 .. _SacreBLEU: https://github.com/mjpost/sacrebleu
diff --git a/src/torchmetrics/functional/text/depth_score.py b/src/torchmetrics/functional/text/depth_score.py
index 2606c5edb84..aa4908cdb12 100644
--- a/src/torchmetrics/functional/text/depth_score.py
+++ b/src/torchmetrics/functional/text/depth_score.py
@@ -459,12 +459,7 @@ def depth_score(
     of their contextual token embeddings using a depth-based pseudo-metric. Lower values
     indicate that the predicted sentence is closer to the reference sentence.
 
-    The function supports:
-    - Single string inputs (`str`)
-    - Lists of strings (`Sequence[str]`)
-    - Tokenized dict inputs (`dict[str, Tensor]`) (used internally by the Metric class)
-    - Multi-reference evaluation (`Sequence[Sequence[str]]`), reduced per prediction by
-      `multi_ref_reduction` (default `"min"` for distance metrics).
+    This implementation follows the original implementation from `DEPTH_score`_.
 
     Args:
         preds: Predicted sentence(s) as `str`, `Sequence[str]`, or tokenized dict
diff --git a/src/torchmetrics/text/depth_score.py b/src/torchmetrics/text/depth_score.py
index db90e865cf1..7ced84ceeae 100644
--- a/src/torchmetrics/text/depth_score.py
+++ b/src/torchmetrics/text/depth_score.py
@@ -60,8 +60,7 @@ class DepthScore(Metric):
     mismatches between contextual representations and can be used for evaluating text generation tasks where
     *lower* distance indicates a better match.
 
-    This implementation follows the reference DepthScore formulation introduced by ``Colombo et al.`` and mirrors the
-    TorchMetrics-style API used by embedding-based text metrics.
+    This implementation follows the original implementation from `DEPTH_score`_.
 
     As input to ``forward`` and ``update`` the metric accepts the following input:
 

From 7e0929cb99f1deb30eb8bb19423a95967636c024 Mon Sep 17 00:00:00 2001
From: Sohaib-Ahmed21 <sohaibahmed1919@gmail.com>
Date: Sun, 25 Jan 2026 13:31:05 -0800
Subject: [PATCH 05/11] Add new depthscore specific dependencies to text
 dependencies in repo

---
 requirements/text.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/requirements/text.txt b/requirements/text.txt
index 188e4dc1e77..4ad9338c7dc 100644
--- a/requirements/text.txt
+++ b/requirements/text.txt
@@ -9,3 +9,7 @@ transformers >=4.43.0,<4.57
 mecab-python3 >=1.0.6, <1.1.0
 ipadic >=1.0.0, <1.1.0
 sentencepiece >=0.2.0, <0.3.0
+
+git scikit-learn >1.5.0, <1.8.0
+POT >=0.4.0, <=0.9.6
+geomloss ==0.2.6  # strict

From e514181c7701719166e18616ae4f259ca6627c6d Mon Sep 17 00:00:00 2001
From: Sohaib-Ahmed21 <sohaibahmed1919@gmail.com>
Date: Sun, 25 Jan 2026 14:37:13 -0800
Subject: [PATCH 06/11] Fix typo

---
 requirements/text.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/text.txt b/requirements/text.txt
index 4ad9338c7dc..64bb5be4a92 100644
--- a/requirements/text.txt
+++ b/requirements/text.txt
@@ -10,6 +10,6 @@ mecab-python3 >=1.0.6, <1.1.0
 ipadic >=1.0.0, <1.1.0
 sentencepiece >=0.2.0, <0.3.0
 
-git scikit-learn >1.5.0, <1.8.0
+scikit-learn >1.5.0, <1.8.0
 POT >=0.4.0, <=0.9.6
 geomloss ==0.2.6  # strict

From cd1a94ff535c064bba20965bcb1e90023cecfc58 Mon Sep 17 00:00:00 2001
From: Sohaib-Ahmed21 <sohaibahmed1919@gmail.com>
Date: Mon, 26 Jan 2026 01:05:31 -0800
Subject: [PATCH 07/11] Handle depth score specific dependencies, their imports
 and related test cases through utils properly

---
 .../functional/text/depth_score.py            | 60 ++++++++++++++++---
 src/torchmetrics/utilities/imports.py         |  3 +
 tests/unittests/text/test_depth_score.py      | 30 +++++++++-
 3 files changed, 82 insertions(+), 11 deletions(-)

diff --git a/src/torchmetrics/functional/text/depth_score.py b/src/torchmetrics/functional/text/depth_score.py
index aa4908cdb12..a9727306b13 100644
--- a/src/torchmetrics/functional/text/depth_score.py
+++ b/src/torchmetrics/functional/text/depth_score.py
@@ -16,15 +16,8 @@
 from contextlib import contextmanager
 from typing import Any, Callable, List, Optional, Tuple, Union, cast
 
-import geomloss
 import numpy as np
-
-# DepthScore deps
-import ot  # pip install POT  # codespell:ignore ot
 import torch
-from sklearn.covariance import MinCovDet as MCD  # noqa: N817
-from sklearn.decomposition import PCA
-from sklearn.preprocessing import normalize
 from torch import Tensor
 from torch.nn import Module
 from torch.utils.data import DataLoader
@@ -40,7 +33,13 @@
 )
 from torchmetrics.utilities import rank_zero_warn
 from torchmetrics.utilities.checks import _SKIP_SLOW_DOCTEST, _try_proceed_with_timeout
-from torchmetrics.utilities.imports import _TQDM_AVAILABLE, _TRANSFORMERS_GREATER_EQUAL_4_4
+from torchmetrics.utilities.imports import (
+    _GEOMLOSS_AVAILABLE,
+    _POT_AVAILABLE,
+    _SKLEARN_AVAILABLE,
+    _TQDM_AVAILABLE,
+    _TRANSFORMERS_GREATER_EQUAL_4_4,
+)
 
 log = logging.getLogger(__name__)
 
@@ -182,6 +181,13 @@ def _postprocess_multiple_references_distance(
 def cov_matrix(x: np.ndarray, robust: bool = False) -> np.ndarray:
     """Covariance matrix (optionally robust)."""
     if robust:
+        if not _SKLEARN_AVAILABLE:
+            raise ModuleNotFoundError(
+                "Robust covariance requires that `scikit-learn` is installed. "
+                "Use `pip install scikit-learn` or `pip install torchmetrics[text]`."
+            )
+        from sklearn.covariance import MinCovDet as MCD  # noqa: N817
+
         return MCD().fit(x).covariance_
     return np.cov(x.T)
 
@@ -193,6 +199,13 @@ def standardize(x: np.ndarray, robust: bool = False) -> np.ndarray:
     rank = np.linalg.matrix_rank(x)
 
     if rank < n_features:
+        if not _SKLEARN_AVAILABLE:
+            raise ModuleNotFoundError(
+                "Affine-invariant DepthScore requires that `scikit-learn` is installed. "
+                "Use `pip install scikit-learn` or `pip install torchmetrics[text]`."
+            )
+        from sklearn.decomposition import PCA
+
         x = PCA(rank).fit_transform(x)
         sigma = cov_matrix(x)
 
@@ -204,11 +217,26 @@ def standardize(x: np.ndarray, robust: bool = False) -> np.ndarray:
 def sampled_sphere(n_dirs: int, d: int) -> np.ndarray:
     """Uniform samples on unit sphere."""
     u = np.random.multivariate_normal(np.zeros(d), np.eye(d), size=n_dirs)
-    return normalize(u)
+    # The reference implementation uses `sklearn.preprocessing.normalize`. It is re-implemented here
+    # with NumPy so the default `irw` measure runs without any additional dependencies being installed.
+    return _normalize_l2_rows_exact(u)
+
+
+def _normalize_l2_rows_exact(x: np.ndarray) -> np.ndarray:
+    norms = np.sqrt(np.einsum("ij,ij->i", x, x))
+    norms[norms == 0.0] = 1.0
+    return x / norms[:, None]
 
 
 def wasserstein(x: np.ndarray, y: np.ndarray) -> float:
     """Optimal transport cost with uniform weights."""
+    if not _POT_AVAILABLE:
+        raise ModuleNotFoundError(
+            "The `wasserstein` backend requires that `POT` is installed. "
+            "Use `pip install POT` or `pip install torchmetrics[text]`."
+        )
+    import ot  # pip install POT  # codespell:ignore ot
+
     m = ot.dist(x, y)  # codespell:ignore ot
     w_x = np.ones(len(x)) / len(x)
     w_y = np.ones(len(y)) / len(y)
@@ -217,6 +245,13 @@ def wasserstein(x: np.ndarray, y: np.ndarray) -> float:
 
 def sw(x: np.ndarray, y: np.ndarray, ndirs: int, p: int = 2) -> float:
     """Sliced Wasserstein distance."""
+    if not _POT_AVAILABLE:
+        raise ModuleNotFoundError(
+            "The `sliced` backend requires that `POT` is installed. "
+            "Use `pip install POT` or `pip install torchmetrics[text]`."
+        )
+    import ot  # pip install POT  # codespell:ignore ot
+
     n, d = x.shape
     u = sampled_sphere(ndirs, d)
     z_x = x @ u.T
@@ -229,6 +264,13 @@ def sw(x: np.ndarray, y: np.ndarray, ndirs: int, p: int = 2) -> float:
 
 def mmd(x: np.ndarray, y: np.ndarray) -> float:
     """Gaussian MMD via geomloss."""
+    if not _GEOMLOSS_AVAILABLE:
+        raise ModuleNotFoundError(
+            "The `mmd` backend requires that `geomloss` is installed. "
+            "Use `pip install geomloss` or `pip install torchmetrics[text]`."
+        )
+    import geomloss
+
     return float(geomloss.SamplesLoss("gaussian")(torch.tensor(x), torch.tensor(y)).item())
 
 
diff --git a/src/torchmetrics/utilities/imports.py b/src/torchmetrics/utilities/imports.py
index 637b8a5468b..fa9538ecf2b 100644
--- a/src/torchmetrics/utilities/imports.py
+++ b/src/torchmetrics/utilities/imports.py
@@ -28,8 +28,11 @@
 _NLTK_AVAILABLE = RequirementCache("nltk")
 _ROUGE_SCORE_AVAILABLE = RequirementCache("rouge_score")
 _BERTSCORE_AVAILABLE = RequirementCache("bert_score")
+_GEOMLOSS_AVAILABLE = RequirementCache("geomloss")
+_POT_AVAILABLE = RequirementCache("POT")
 _SCIPY_AVAILABLE = RequirementCache("scipy")
 _SCIPY_GREATER_EQUAL_1_8 = RequirementCache("scipy>=1.8.0")
+_SKLEARN_AVAILABLE = RequirementCache("scikit-learn")
 _TORCH_FIDELITY_AVAILABLE = RequirementCache("torch_fidelity")
 _LPIPS_AVAILABLE = RequirementCache("lpips")
 _PYCOCOTOOLS_AVAILABLE = RequirementCache("pycocotools")
diff --git a/tests/unittests/text/test_depth_score.py b/tests/unittests/text/test_depth_score.py
index 93d83799a96..6ad34ed9ae4 100644
--- a/tests/unittests/text/test_depth_score.py
+++ b/tests/unittests/text/test_depth_score.py
@@ -21,7 +21,12 @@
 
 from torchmetrics.functional.text.depth_score import depth_score
 from torchmetrics.text.depth_score import DepthScore
-from torchmetrics.utilities.imports import _TRANSFORMERS_GREATER_EQUAL_4_4
+from torchmetrics.utilities.imports import (
+    _GEOMLOSS_AVAILABLE,
+    _POT_AVAILABLE,
+    _SKLEARN_AVAILABLE,
+    _TRANSFORMERS_GREATER_EQUAL_4_4,
+)
 from unittests._helpers import (
     _IS_WINDOWS,
     _TORCH_LESS_THAN_2_1,
@@ -38,6 +43,27 @@
 
 MODEL_NAME = "albert-base-v2"
 
+
+_DEPTH_MEASURES = [
+    "irw",
+    pytest.param(
+        "ai_irw",
+        marks=pytest.mark.skipif(not _SKLEARN_AVAILABLE, reason="test requires scikit-learn"),
+    ),
+    pytest.param(
+        "sliced",
+        marks=pytest.mark.skipif(not _POT_AVAILABLE, reason="test requires POT"),
+    ),
+    pytest.param(
+        "wasserstein",
+        marks=pytest.mark.skipif(not _POT_AVAILABLE, reason="test requires POT"),
+    ),
+    pytest.param(
+        "mmd",
+        marks=pytest.mark.skipif(not _GEOMLOSS_AVAILABLE, reason="test requires geomloss"),
+    ),
+]
+
 # Disable tokenizers parallelism (forking not friendly with parallelism)
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
@@ -62,7 +88,7 @@ def _reference_depth_score(
 
 
 @pytest.mark.parametrize("num_layers", [4, 8])
-@pytest.mark.parametrize("depth_measure", ["irw", "ai_irw", "sliced", "wasserstein", "mmd"])
+@pytest.mark.parametrize("depth_measure", _DEPTH_MEASURES)
 @pytest.mark.parametrize(
     ("preds", "targets"),
     [(_inputs_single_reference.preds, _inputs_single_reference.target)],

From 52a3d3e25ebfa34feff2770374bebe43b5862c8c Mon Sep 17 00:00:00 2001
From: Sohaib-Ahmed21 <sohaibahmed1919@gmail.com>
Date: Mon, 26 Jan 2026 02:01:45 -0800
Subject: [PATCH 08/11] Fix RST formatting error

---
 docs/source/text/depth_score.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/text/depth_score.rst b/docs/source/text/depth_score.rst
index 68e695698bf..f73665e2886 100644
--- a/docs/source/text/depth_score.rst
+++ b/docs/source/text/depth_score.rst
@@ -5,9 +5,9 @@
 
 .. include:: ../links.rst
 
-##########
+###########
 Depth Score
-##########
+###########
 
 Module Interface
 ________________

From 8ef1573f21eadb504ef22d7f0d3bf473b32aba4b Mon Sep 17 00:00:00 2001
From: Sohaib-Ahmed21 <sohaibahmed1919@gmail.com>
Date: Mon, 26 Jan 2026 02:29:53 -0800
Subject: [PATCH 09/11] Change POT version to >=0.9.0 to fix Cython dependency
 issue in CI

---
 requirements/text.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/text.txt b/requirements/text.txt
index 64bb5be4a92..672650c99c6 100644
--- a/requirements/text.txt
+++ b/requirements/text.txt
@@ -11,5 +11,5 @@ ipadic >=1.0.0, <1.1.0
 sentencepiece >=0.2.0, <0.3.0
 
 scikit-learn >1.5.0, <1.8.0
-POT >=0.4.0, <=0.9.6
+POT >=0.9.0, <=0.9.6
 geomloss ==0.2.6  # strict

From 0454a489318d07bfd6d1f5872c0352349a21b037 Mon Sep 17 00:00:00 2001
From: Sohaib-Ahmed21 <sohaibahmed1919@gmail.com>
Date: Tue, 27 Jan 2026 10:51:03 -0800
Subject: [PATCH 10/11] Proper test skipping for DepthScore when
 nlg_eval_via_simi_measures is missing, fixes ddp Skipped exception failures

---
 tests/unittests/text/test_depth_score.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tests/unittests/text/test_depth_score.py b/tests/unittests/text/test_depth_score.py
index 6ad34ed9ae4..8a5634c9e71 100644
--- a/tests/unittests/text/test_depth_score.py
+++ b/tests/unittests/text/test_depth_score.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# `importlib.util` is used below to check whether `nlg_eval_via_simi_measures` (reference metric) is installed
+import importlib.util
 import os
 from collections.abc import Sequence
 from functools import partial
@@ -41,6 +43,8 @@
     _inputs_single_sentence_multiple_references,
 )
 
+_NLG_EVAL_AVAILABLE = importlib.util.find_spec("nlg_eval_via_simi_measures") is not None
+
 MODEL_NAME = "albert-base-v2"
 
 
@@ -77,10 +81,7 @@ def _reference_depth_score(
     depth_measure: str = "irw",
 ) -> Tensor:
     # Reference source code depthscore implementation
-    try:
-        from nlg_eval_via_simi_measures.depth_score import DepthScoreMetric
-    except ImportError:
-        pytest.skip("test requires `nlg_eval_via_simi_measures` to be installed.")
+    from nlg_eval_via_simi_measures.depth_score import DepthScoreMetric
 
     metric_call = DepthScoreMetric(MODEL_NAME, layers_to_consider=num_layers, considered_measure=depth_measure)
     out = metric_call.evaluate_batch(list(target), list(preds))
@@ -107,6 +108,7 @@ def _reference_depth_score(
 class TestDepthScore(TextTester):
     """Tests for DepthScore."""
 
+    @pytest.mark.skipif(not _NLG_EVAL_AVAILABLE, reason="test requires nlg_eval_via_simi_measures to be installed")
     @pytest.mark.parametrize("ddp", [pytest.param(True, marks=pytest.mark.DDP), False])
     @skip_on_connection_issues()
     def test_depthscore_class(self, ddp, preds, targets, num_layers, depth_measure):
@@ -137,6 +139,7 @@ def test_depthscore_class(self, ddp, preds, targets, num_layers, depth_measure):
             ignore_order=ddp,  # ignore order of predictions when DDP is used
         )
 
+    @pytest.mark.skipif(not _NLG_EVAL_AVAILABLE, reason="test requires nlg_eval_via_simi_measures to be installed")
     @skip_on_connection_issues()
     def test_depthscore_functional(self, preds, targets, num_layers, depth_measure):
         """Test the depthscore functional."""

From dc90a5528bdc45a13df40a41b3571b3cd5a01994 Mon Sep 17 00:00:00 2001
From: Sohaib-Ahmed21 <sohaibahmed1919@gmail.com>
Date: Thu, 29 Jan 2026 07:57:36 -0800
Subject: [PATCH 11/11] Fix code quality check failures

---
 src/torchmetrics/functional/text/depth_score.py | 8 ++++----
 src/torchmetrics/text/depth_score.py            | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/torchmetrics/functional/text/depth_score.py b/src/torchmetrics/functional/text/depth_score.py
index a9727306b13..f27055bc46f 100644
--- a/src/torchmetrics/functional/text/depth_score.py
+++ b/src/torchmetrics/functional/text/depth_score.py
@@ -644,12 +644,12 @@ def depth_score(
         return torch.zeros(1, dtype=torch.float32)
 
     if _are_valid_lists:
-        target_dataset = TextDataset(target, tokenizer, max_length, truncation=truncation)
-        preds_dataset = TextDataset(preds, tokenizer, max_length, truncation=truncation)
+        target_dataset = TextDataset(target, tokenizer, max_length, truncation=truncation)  # type: ignore
+        preds_dataset = TextDataset(preds, tokenizer, max_length, truncation=truncation)  # type: ignore
 
     elif _are_valid_tensors:
-        target_dataset = TokenizedDataset(**cast(dict, target))
-        preds_dataset = TokenizedDataset(**cast(dict, preds))
+        target_dataset = TokenizedDataset(**target)  # type: ignore
+        preds_dataset = TokenizedDataset(**preds)  # type: ignore
     else:
         raise ValueError("Invalid input provided.")
 
diff --git a/src/torchmetrics/text/depth_score.py b/src/torchmetrics/text/depth_score.py
index 7ced84ceeae..5d8b6f5ec0f 100644
--- a/src/torchmetrics/text/depth_score.py
+++ b/src/torchmetrics/text/depth_score.py
@@ -352,6 +352,6 @@ def plot(
 
         """
         if val is None:  # default average score across sentences
-            val = self.compute()  # type: ignore
-            val = val.mean()  # type: ignore
+            val = self.compute()
+            val = val.mean()
         return self._plot(val, ax)