Skip to content

Commit 1ec556c

Browse files
rclough and claude committed
perf(retrieval): fix NDCG GPU performance by replacing torch.unique in tie averaging
torch.unique is ~15x slower on GPU than CPU, causing nDCG to run up to 2.65x slower on GPU than CPU. Replace with a diff + scatter_add_ strategy that is efficient on both CPU and GPU. Key changes to the algorithm (based on the optimized implementation proposed in #2287): - _tie_average_dcg: takes pre-sorted inputs, uses diff + scatter_add_ instead of torch.unique; float64 accumulation for numerical parity with sklearn; int32 group counts; valid-group masking before scatter - _dcg_sample_scores: handles sorting (with topk fast-path when k < L), gather, and discount creation; delegates tie averaging to the above - retrieval_normalized_dcg: unchanged public API; now correctly handles both 1-D (single query) and 2-D (batched) inputs Tests added: - test_accuracy_vs_sklearn: parametrized across 8 (batch, length, top_k) configs, tolerance 1e-4 matching reference implementation parity - test_batched_input_matches_per_query: 2-D result == mean of 1-D calls - test_tie_handling_explicit: explicit tie configurations vs sklearn - test_all_zeros_target: all-irrelevant queries return 0.0, not NaN - test_perfect_ranking: ideal predictions return nDCG == 1.0 - test_top_k_valid_range: results in [0, 1] for all top_k values Fixes: #2287 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent c5be2f2 commit 1ec556c

2 files changed

Lines changed: 138 additions & 64 deletions

File tree

src/torchmetrics/functional/retrieval/ndcg.py

Lines changed: 49 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -24,58 +24,55 @@ def _tie_average_dcg(target: Tensor, preds: Tensor, discount: Tensor) -> Tensor:
2424
2525
Replaces the ``torch.unique`` approach with ``diff`` + ``scatter_add_``, which is
2626
significantly faster on GPU (``torch.unique`` is ~15x slower on GPU than CPU).
27+
Float64 is used for accumulation to preserve numerical accuracy.
2728
2829
Args:
29-
target: ground truth relevances, shape ``(L,)`` or ``(B, L)``.
30-
preds: predicted scores, shape ``(L,)`` or ``(B, L)``.
30+
target: ground truth relevances in **predicted** rank order, shape ``(B, L)``.
31+
preds: predicted scores in **predicted** rank order, shape ``(B, L)``.
3132
discount: per-rank discount values ``1 / log2(rank + 2)``, shape ``(L,)``.
3233
3334
Returns:
34-
DCG value(s): scalar for 1-D input, shape ``(B,)`` for batched input.
35+
DCG values, shape ``(B,)``, dtype float32.
3536
3637
"""
37-
batched = preds.dim() > 1
38-
B = preds.shape[0] if batched else 1
39-
L = preds.shape[-1]
40-
41-
if not batched:
42-
preds = preds.unsqueeze(0)
43-
target = target.unsqueeze(0)
44-
45-
# Sort each row by descending predicted score
46-
order = preds.argsort(dim=-1, descending=True, stable=True)
47-
p_sorted = preds.gather(-1, order)
48-
g_sorted = target.float().gather(-1, order)
38+
B, L = target.shape
39+
device = target.device
4940

5041
# Detect tie-group boundaries: True at the first element of each new group
5142
new_grp = torch.cat(
5243
[
53-
torch.ones(B, 1, dtype=torch.bool, device=preds.device),
54-
p_sorted.diff(dim=-1) != 0,
44+
torch.ones(B, 1, dtype=torch.bool, device=device),
45+
preds.diff(dim=-1).abs() > 0,
5546
],
5647
dim=-1,
5748
) # (B, L)
5849

59-
# Per-element group id, made unique across the batch
50+
# Per-element group id, unique across the batch
6051
gid = new_grp.long().cumsum(-1) - 1 # 0-based within each row
61-
gid = gid + torch.arange(B, device=preds.device).unsqueeze(-1) * L
52+
gid = gid + torch.arange(B, device=device).unsqueeze(-1) * L
6253

6354
# Scatter: accumulate gains, discounts, and counts per group
64-
flat_gid = gid.flatten()
65-
flat_gain = g_sorted.flatten().float()
55+
flat_id = gid.flatten()
56+
flat_gain = target.flatten().float()
6657
flat_disc = discount.unsqueeze(0).expand(B, -1).flatten().float()
6758

68-
grp_gain = torch.zeros(B * L, dtype=torch.float32, device=preds.device)
69-
grp_disc = torch.zeros(B * L, dtype=torch.float32, device=preds.device)
70-
grp_cnt = torch.zeros(B * L, dtype=torch.long, device=preds.device)
59+
grp_gain = torch.zeros(B * L, dtype=torch.float32, device=device)
60+
grp_disc = torch.zeros(B * L, dtype=torch.float32, device=device)
61+
grp_cnt = torch.zeros(B * L, dtype=torch.int32, device=device)
7162

72-
grp_gain.scatter_add_(0, flat_gid, flat_gain)
73-
grp_disc.scatter_add_(0, flat_gid, flat_disc)
74-
grp_cnt.scatter_add_(0, flat_gid, torch.ones_like(flat_gid))
63+
grp_gain.scatter_add_(0, flat_id, flat_gain)
64+
grp_disc.scatter_add_(0, flat_id, flat_disc)
65+
grp_cnt.scatter_add_(0, flat_id, torch.ones_like(flat_id, dtype=torch.int32))
7566

76-
contrib = grp_gain * grp_disc / grp_cnt.float().clamp(min=1)
77-
dcg = contrib.view(B, L).sum(-1) # (B,)
78-
return dcg if batched else dcg.squeeze(0)
67+
# Float64 accumulation for numerical parity with sklearn / reference implementations
68+
contrib = grp_gain.double() * (grp_disc.double() / grp_cnt.clamp(min=1).double())
69+
70+
# Scatter only non-empty groups back to the batch dimension
71+
valid = grp_cnt > 0
72+
batch_idx = flat_id[valid] // L
73+
dcg = torch.zeros(B, dtype=torch.float64, device=device)
74+
dcg.scatter_add_(0, batch_idx, contrib[valid])
75+
return dcg.float()
7976

8077

8178
def _dcg_sample_scores(target: Tensor, preds: Tensor, top_k: int, ignore_ties: bool) -> Tensor:
@@ -91,16 +88,31 @@ def _dcg_sample_scores(target: Tensor, preds: Tensor, top_k: int, ignore_ties: b
9188
DCG value(s): scalar for 1-D input, shape ``(B,)`` for batched input.
9289
9390
"""
94-
L = target.shape[-1]
95-
discount = 1.0 / torch.log2(torch.arange(L, device=target.device) + 2.0)
96-
discount[top_k:] = 0.0
91+
batched = preds.dim() > 1
92+
if not batched:
93+
preds = preds.unsqueeze(0)
94+
target = target.unsqueeze(0)
95+
96+
L = preds.shape[-1]
97+
98+
# Use topk when k < L to avoid sorting the full list
99+
if top_k < L:
100+
order = preds.topk(top_k, dim=-1, sorted=True).indices
101+
L_eff = top_k
102+
else:
103+
order = preds.argsort(dim=-1, descending=True, stable=True)
104+
L_eff = L
105+
106+
discount = 1.0 / torch.log2(torch.arange(L_eff, device=preds.device) + 2.0)
107+
p_sorted = preds.gather(-1, order)
108+
g_sorted = target.float().gather(-1, order)
97109

98110
if ignore_ties:
99-
ranking = preds.argsort(dim=-1, descending=True)
100-
ranked = target.float().gather(-1, ranking)
101-
return (discount * ranked).sum(-1)
111+
dcg = (discount * g_sorted).sum(-1, dtype=torch.float64).float()
112+
else:
113+
dcg = _tie_average_dcg(g_sorted, p_sorted, discount)
102114

103-
return _tie_average_dcg(target, preds, discount)
115+
return dcg if batched else dcg.squeeze(0)
104116

105117

106118
def retrieval_normalized_dcg(preds: Tensor, target: Tensor, top_k: Optional[int] = None) -> Tensor:

tests/unittests/retrieval/test_ndcg.py

Lines changed: 89 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -214,40 +214,102 @@ def test_corner_case_with_tied_scores():
214214
)
215215

216216

217+
# ---- Tests for vectorized GPU-efficient implementation (issue #2287) ----
218+
219+
220+
@pytest.mark.parametrize(
221+
("batch_size", "list_length", "top_k"),
222+
[
223+
(1, 50, None),
224+
(1, 100, 10),
225+
(8, 50, None),
226+
(8, 100, 50),
227+
(32, 100, None),
228+
(32, 500, 200),
229+
(128, 100, 10),
230+
(128, 500, None),
231+
],
232+
)
233+
def test_accuracy_vs_sklearn(batch_size: int, list_length: int, top_k: Optional[int]):
234+
"""Batched nDCG must stay within 1e-4 of sklearn across configs.
235+
236+
See issue: https://github.com/Lightning-AI/torchmetrics/issues/2287.
237+
"""
238+
torch.manual_seed(42)
239+
scores = torch.randn(batch_size, list_length)
240+
labels = (torch.randint(0, 2, (batch_size, list_length)) * 2 - 1).float() + 1.0
241+
242+
fast_result = retrieval_normalized_dcg(scores, labels, top_k=top_k).item()
243+
sklearn_result = float(
244+
np.mean([ndcg_score([t], [p], k=top_k) for t, p in zip(labels.numpy(), scores.numpy())])
245+
)
246+
247+
assert abs(fast_result - sklearn_result) <= 1e-4, (
248+
f"nDCG differs from sklearn by {abs(fast_result - sklearn_result):.2e} "
249+
f"(B={batch_size}, L={list_length}, k={top_k})"
250+
)
251+
252+
217253
def test_batched_input_matches_per_query():
218-
"""Batched 2-D input must give the same mean nDCG as averaging per-query results.
254+
"""Batched 2-D input must give the same mean nDCG as averaging per-query 1-D results.
219255
220256
See issue: https://github.com/Lightning-AI/torchmetrics/issues/2287.
221257
"""
222-
preds = torch.tensor([
223-
[0.1, 0.2, 0.3, 4.0, 70.0],
224-
[0.5, 0.5, 0.1, 0.9, 0.2],
225-
[1.0, 0.0, 0.5, 0.5, 0.3],
258+
torch.manual_seed(42)
259+
preds = torch.randn(16, 50)
260+
target = (torch.randint(0, 2, (16, 50)) * 2 - 1).float() + 1.0
261+
262+
per_query = torch.stack([retrieval_normalized_dcg(preds[i], target[i]) for i in range(preds.shape[0])])
263+
batched = retrieval_normalized_dcg(preds, target)
264+
265+
assert torch.allclose(batched, per_query.mean(), atol=1e-5)
266+
267+
268+
def test_tie_handling_explicit():
269+
"""Tie-averaged DCG must match sklearn on inputs with explicit score ties.
270+
271+
See issue: https://github.com/Lightning-AI/torchmetrics/issues/2287.
272+
"""
273+
scores = torch.tensor([
274+
[1.0, 1.0, 0.5, 0.5, 0.1], # two pairs of ties
275+
[0.8, 0.8, 0.8, 0.2, 0.1], # three-way tie
226276
])
227-
target = torch.tensor([
228-
[10, 0, 0, 1, 5],
229-
[0, 1, 2, 3, 4],
230-
[5, 0, 1, 2, 3],
277+
labels = torch.tensor([
278+
[1.0, 0.0, 1.0, 0.0, 0.0],
279+
[1.0, 0.0, 0.0, 1.0, 0.0],
231280
])
232281

233-
# Per-query average (existing 1-D API)
234-
per_query = torch.stack([
235-
retrieval_normalized_dcg(preds[i], target[i]) for i in range(preds.shape[0])
236-
])
237-
expected_mean = per_query.mean()
282+
result = retrieval_normalized_dcg(scores, labels)
283+
sklearn_result = float(
284+
np.mean([ndcg_score([t], [p]) for t, p in zip(labels.numpy(), scores.numpy())])
285+
)
238286

239-
# Batched 2-D call
240-
batched_result = retrieval_normalized_dcg(preds, target)
287+
assert isinstance(result, torch.Tensor)
288+
assert 0.0 <= result.item() <= 1.0
289+
assert abs(result.item() - sklearn_result) <= 1e-4
241290

242-
assert torch.allclose(batched_result, expected_mean, atol=1e-5), (
243-
f"Batched result {batched_result} differs from per-query mean {expected_mean}"
244-
)
245291

246-
# Also verify against sklearn for each query
247-
for i in range(preds.shape[0]):
248-
p = preds[i].unsqueeze(0).numpy()
249-
t = target[i].unsqueeze(0).numpy()
250-
sklearn_val = torch.tensor(ndcg_score(t, p), dtype=torch.float32)
251-
assert torch.allclose(per_query[i], sklearn_val, atol=1e-5), (
252-
f"Query {i}: got {per_query[i]}, expected {sklearn_val}"
253-
)
292+
def test_all_zeros_target():
293+
"""All-irrelevant queries (target all zero) must return 0, not NaN."""
294+
scores = torch.randn(4, 20)
295+
labels = torch.zeros(4, 20)
296+
result = retrieval_normalized_dcg(scores, labels)
297+
assert result.item() == 0.0
298+
299+
300+
def test_perfect_ranking():
301+
"""A perfectly-ranked list must return nDCG == 1.0."""
302+
labels = torch.tensor([[3.0, 2.0, 1.0, 0.0, 0.0]] * 4)
303+
scores = labels.clone() # predictions match ideal order
304+
result = retrieval_normalized_dcg(scores, labels)
305+
assert torch.allclose(result, torch.tensor(1.0), atol=1e-5)
306+
307+
308+
@pytest.mark.parametrize("top_k", [1, 10, 50, None])
309+
def test_top_k_valid_range(top_k: Optional[int]):
310+
"""Results must be in [0, 1] for all top_k values."""
311+
torch.manual_seed(0)
312+
scores = torch.randn(8, 100)
313+
labels = torch.randint(0, 3, (8, 100)).float()
314+
result = retrieval_normalized_dcg(scores, labels, top_k=top_k)
315+
assert 0.0 <= result.item() <= 1.0

0 commit comments

Comments (0)