Use linear space in multiclass stats computation when num_classes >= 1000

barakugav · barakugav · commit c610c46a15eb · 2026-03-21T19:28:03.000+02:00
diff --git a/src/torchmetrics/functional/classification/stat_scores.py b/src/torchmetrics/functional/classification/stat_scores.py
@@ -441,13 +441,21 @@ def _multiclass_stat_scores_update(
             idx = target != ignore_index
             preds = preds[idx]
             target = target[idx]
-        unique_mapping = target.to(torch.long) * num_classes + preds.to(torch.long)
-        bins = _bincount(unique_mapping, minlength=num_classes**2)
-        confmat = bins.reshape(num_classes, num_classes)
-        tp = confmat.diag()
-        fp = confmat.sum(0) - tp
-        fn = confmat.sum(1) - tp
-        tn = confmat.sum() - (fp + fn + tp)
+        if num_classes < 1000:
+            unique_mapping = target.to(torch.long) * num_classes + preds.to(torch.long)
+            bins = _bincount(unique_mapping, minlength=num_classes**2)
+            confmat = bins.reshape(num_classes, num_classes)
+            tp = confmat.diag()
+            fp = confmat.sum(0) - tp
+            fn = confmat.sum(1) - tp
+            tn = confmat.sum() - (fp + fn + tp)
+        else:
+            # The above approach requires num_classes**2 memory. For large num_classes, we can calculate the
+            # statistics separately using linear memory.
+            tp = _bincount(preds[target == preds], minlength=num_classes)
+            fp = _bincount(preds, minlength=num_classes) - tp
+            fn = _bincount(target, minlength=num_classes) - tp
+            tn = target.numel() - (tp + fp + fn)
     return tp, fp, tn, fn
 
 
diff --git a/tests/unittests/classification/test_accuracy.py b/tests/unittests/classification/test_accuracy.py
@@ -408,6 +408,25 @@ def test_multiclass_accuracy_gpu_sync_points_uptodate(
             )
 
 
+def test_multiclass_accuracy_large_num_classes():
+    """Test that accuracy is correct when num_classes>=1000, exercising the linear-space code path."""
+    num_classes = 1_000_000
+    n = 500
+    generator = torch.Generator().manual_seed(42)
+    target = torch.randint(0, num_classes, (n,), generator=generator)
+    preds = torch.randint(0, num_classes, (n,), generator=generator)
+
+    # With so many classes, the accuracy would most likely be 0 in this test, so we artificially
+    # set 20% of the predictions to be correct.
+    artificially_correct = torch.randperm(n, generator=generator)[: n // 5]
+    preds[artificially_correct] = target[artificially_correct]
+
+    # Expected: fraction of exactly correct predictions
+    expected = (preds == target).float().mean()
+    result = multiclass_accuracy(preds, target, num_classes=num_classes, average="micro")
+    assert torch.isclose(result, expected), f"Expected {expected}, got {result}"
+
+
 _mc_k_target = torch.tensor([0, 1, 2])
 _mc_k_preds = torch.tensor([[0.35, 0.4, 0.25], [0.1, 0.5, 0.4], [0.2, 0.1, 0.7]])