Skip to content

Commit 250cc53

Browse files
committed
[mathcore] Fix TKDTreeBinning::FindBin returning out-of-range bins
TKDTreeBinning computed fNBins independently from the kd-tree it builds. SetNBins picks a bucket size of fDataSize / nBins (integer division), but the underlying TKDTree creates ceil(fNPoints / bucketSize) terminal nodes. When the requested number of bins does not divide the data size evenly, the bucket size is rounded down and the tree ends up with more terminal nodes than fNBins. Since FindBin returns a terminal-node index, it could return a value >= GetNBins(), pointing at a bin with no edges and no content: ```c++ binning.GetNBins() # 1001 binning.FindBin([1.2, 0.5, 0.5, 0.5, 0.5]) # 1004 binning.GetBinContent(1004) # RuntimeWarning: No such bin ``` Set fNBins to the actual number of terminal nodes (GetNNodes() + 1) after building the tree, so FindBin always returns an index within [0, GetNBins()) and every bin has valid edges and content. This also self-corrects existing objects on read-back, as the Streamer rebuilds the tree via SetNBins. Also fix SetBinsContent to read the per-node point counts directly from the tree (GetNPointsNode) instead of an incorrect last-bin heuristic, so bin contents are exact and sum to the data size. Add a regression test covering the non-evenly-dividing case. Closes #10786. 🤖 Done with the help of AI.
1 parent d761977 commit 250cc53

2 files changed

Lines changed: 73 additions & 4 deletions

File tree

math/mathcore/src/TKDTreeBinning.cxx

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,12 @@ void TKDTreeBinning::SetNBins(UInt_t bins) {
136136
fDataBins = new TKDTreeID(fDataSize, fDim, fDataSize / (fNBins - remainingData)); // TKDTree input is data size, data dimension and the content size of bins ("bucket" size)
137137
SetTreeData();
138138
fDataBins->Build();
139+
// The kd-tree determines the actual number of bins: every terminal node
140+
// is a bin and there are GetNNodes() + 1 of them. When the requested bin
141+
// count does not divide the data size evenly, the bucket size is rounded
142+
// down and the tree ends up with more terminal nodes than the requested
143+
// number of bins. We must therefore use the real number of terminal nodes.
144+
fNBins = fDataBins->GetNNodes() + 1;
139145
SetBinsEdges();
140146
SetBinsContent();
141147
} else {
@@ -238,12 +244,13 @@ void TKDTreeBinning::SetTreeData() {
238244
}
239245

240246
void TKDTreeBinning::SetBinsContent() {
241-
// Sets the bins' content
247+
// Sets the bins' content from the actual number of points in each terminal
248+
// node of the kd-tree. All terminal nodes hold fBucketSize points except
249+
// possibly the last one, so query the tree directly instead of guessing.
242250
fBinsContent.resize(fNBins);
251+
UInt_t nTreeNodes = fDataBins->GetNNodes();
243252
for (UInt_t i = 0; i < fNBins; ++i)
244-
fBinsContent[i] = fDataBins->GetBucketSize();
245-
if ( fDataSize % fNBins != 0 )
246-
fBinsContent[fNBins - 1] = fDataSize % (fNBins-1);
253+
fBinsContent[i] = fDataBins->GetNPointsNode(i + nTreeNodes);
247254
}
248255

249256
void TKDTreeBinning::SetBinsEdges() {

math/mathcore/test/testkdTreeBinning.cxx

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,66 @@ int testkdTreeBinning() {
169169

170170
}
171171

172+
// Regression test for the case where the requested number of bins does not
173+
// divide the data size evenly. In that situation the kd-tree builds more
174+
// terminal nodes (bins) than naively expected, and FindBin() must never return
175+
// an index outside [0, GetNBins()). See
176+
// https://github.com/root-project/root/issues/10784 about
177+
// TKDTreeBinning::FindBin returning non-existent bins. Returns the number of
178+
// detected failures (0 on success) so that the caller can turn it into a
179+
// non-zero process exit code: ROOT's Error() only prints, it does not by
180+
// itself make the test fail under ctest.
181+
int testkdTreeBinningFindBinRange() {
182+
183+
int nfail = 0;
184+
185+
const UInt_t DATASZ = 100500; // deliberately NOT a multiple of NBINS
186+
const UInt_t DATADIM = 5;
187+
const UInt_t NBINS = 1000;
188+
189+
std::vector<Double_t> smp(DATASZ * DATADIM);
190+
TRandom3 r;
191+
r.SetSeed(1);
192+
for (UInt_t i = 0; i < DATADIM; ++i)
193+
for (UInt_t j = 0; j < DATASZ; ++j)
194+
smp[DATASZ * i + j] = r.Uniform(-1., 1.);
195+
196+
TKDTreeBinning kdBins(DATASZ, DATADIM, smp, NBINS);
197+
198+
const UInt_t nbins = kdBins.GetNBins();
199+
200+
// The number of bins must match the number of terminal nodes of the kd-tree.
201+
if ((int)nbins != kdBins.GetTree()->GetNNodes() + 1) {
202+
Error("testkdTreeBinningFindBinRange", "GetNBins() (%u) != number of kd-tree terminal nodes (%d)",
203+
nbins, kdBins.GetTree()->GetNNodes() + 1);
204+
++nfail;
205+
}
206+
207+
// Every data point must be assigned to a valid bin.
208+
std::vector<Double_t> point(DATADIM);
209+
for (UInt_t j = 0; j < DATASZ; ++j) {
210+
for (UInt_t i = 0; i < DATADIM; ++i)
211+
point[i] = smp[DATASZ * i + j];
212+
UInt_t bin = kdBins.FindBin(point.data());
213+
if (bin >= nbins) {
214+
Error("testkdTreeBinningFindBinRange", "FindBin returned out-of-range bin %u (NBins = %u)", bin, nbins);
215+
++nfail;
216+
break;
217+
}
218+
}
219+
220+
// The total bin content must add up to the data size.
221+
Long64_t total = 0;
222+
for (UInt_t i = 0; i < nbins; ++i)
223+
total += kdBins.GetBinContent(i);
224+
if (total != (Long64_t)DATASZ) {
225+
Error("testkdTreeBinningFindBinRange", "Sum of bin contents (%lld) != data size (%u)", total, DATASZ);
226+
++nfail;
227+
}
228+
229+
return nfail;
230+
}
231+
172232
int main(int argc, char **argv)
173233
{
174234
// Parse command line arguments
@@ -197,6 +257,8 @@ int main(int argc, char **argv)
197257

198258
int nfail = testkdTreeBinning();
199259

260+
nfail += testkdTreeBinningFindBinRange();
261+
200262
if ( showGraphics )
201263
{
202264
theApp->Run();

0 commit comments

Comments
 (0)