def compute_leaf_similarity(
    self,
    data: DMatrix,
    reference: DMatrix,
    weight_type: str = "gain",
) -> np.ndarray:
    """Compute similarity between observations based on leaf node co-occurrence.

    Two samples are similar if they land in the same leaf nodes across trees.
    This is similar to Random Forest proximity matrices.

    Every tree contributes a weight equal to the sum of the selected column
    ("Gain" or "Cover") over its split nodes, as reported by
    :py:meth:`trees_to_dataframe`. Trees without any split node get a weight
    of zero. If the weights of all trees sum to zero, every tree is weighted
    equally instead. The similarity of a query/reference pair is the
    weight-normalised sum over the trees in which both samples reach the same
    leaf.

    Parameters
    ----------
    data :
        Query dataset (m samples).
    reference :
        Reference dataset (n samples).
    weight_type :
        How to weight trees: "gain" (by loss improvement) or "cover"
        (by hessian sum, approximately sample count for regression).

    Returns
    -------
    similarity : ndarray of shape (m, n)
        Similarity scores in [0, 1], stored as float32.

    Raises
    ------
    ValueError
        If ``weight_type`` is neither "gain" nor "cover".

    """
    if weight_type not in ("gain", "cover"):
        raise ValueError(
            f"weight_type must be 'gain' or 'cover', got '{weight_type}'"
        )

    query_leaves = self.predict(data, pred_leaf=True)
    ref_leaves = self.predict(reference, pred_leaf=True)

    # A 1-D leaf prediction is treated as a single column (one tree).
    if query_leaves.ndim == 1:
        query_leaves = query_leaves.reshape(-1, 1)
    if ref_leaves.ndim == 1:
        ref_leaves = ref_leaves.reshape(-1, 1)

    n_trees = query_leaves.shape[1]

    trees_df = self.trees_to_dataframe()
    split_nodes = trees_df[trees_df["Feature"] != "Leaf"]
    col = "Gain" if weight_type == "gain" else "Cover"
    tree_weights = split_nodes.groupby("Tree")[col].sum()

    # Accumulate in float64: the weights are normalised once up front so that
    # rounding in the per-row sums cannot drift away from the [0, 1] range.
    weights = np.zeros(n_trees, dtype=np.float64)
    tree_ids = tree_weights.index.to_numpy().astype(np.int64)
    tree_values = tree_weights.to_numpy(dtype=np.float64)
    # Ignore trees that have no matching column in the leaf predictions.
    in_range = tree_ids < n_trees
    weights[tree_ids[in_range]] = tree_values[in_range]

    if weights.sum() == 0:
        # No split carries any weight; fall back to uniform weights.
        weights = np.ones(n_trees, dtype=np.float64)

    total_weight = weights.sum()
    if total_weight != 0:
        weights = weights / total_weight

    m, n = len(query_leaves), len(ref_leaves)

    similarity = np.empty((m, n), dtype=np.float32)
    for i, leaves in enumerate(query_leaves):
        # (n, n_trees) boolean match matrix times the weight vector gives the
        # weighted fraction of trees in which row i shares a leaf with each
        # reference row.
        similarity[i] = (leaves == ref_leaves) @ weights

    # Guard against the last-bit rounding error of the dot product.
    np.clip(similarity, 0.0, 1.0, out=similarity)
    return similarity
class TestLeafSimilarity:
    """Exercise Booster.compute_leaf_similarity() on the agaricus data."""

    def test_leaf_similarity(self) -> None:
        """Check shape, value range, self-similarity, weight types and errors."""
        train_dm, _unused = tm.load_agaricus(__file__)
        booster = xgb.train(
            {"max_depth": 4, "eta": 0.3, "objective": "binary:logistic"},
            train_dm,
            num_boost_round=10,
        )

        features = train_dm.get_data()
        query_dm = xgb.DMatrix(features[:10])
        ref_dm = xgb.DMatrix(features[100:150])

        # 10 query rows against 50 reference rows, every score within [0, 1].
        scores = booster.compute_leaf_similarity(query_dm, ref_dm)
        assert scores.shape == (10, 50)
        assert 0.0 <= scores.min()
        assert scores.max() <= 1.0

        # A dataset compared with itself must have a diagonal of ones.
        same_dm = xgb.DMatrix(features[:20])
        diagonal = np.diag(booster.compute_leaf_similarity(same_dm, same_dm))
        np.testing.assert_allclose(diagonal, 1.0, rtol=1e-5)

        # Both supported weightings produce matrices of the same shape.
        by_weight = {
            kind: booster.compute_leaf_similarity(query_dm, ref_dm, weight_type=kind)
            for kind in ("gain", "cover")
        }
        assert by_weight["gain"].shape == by_weight["cover"].shape

        # Omitting weight_type must be equivalent to asking for "gain".
        np.testing.assert_array_equal(
            booster.compute_leaf_similarity(query_dm, ref_dm), by_weight["gain"]
        )

        # Anything other than the two known weightings is rejected.
        with pytest.raises(ValueError, match="weight_type must be"):
            booster.compute_leaf_similarity(query_dm, ref_dm, weight_type="invalid")