diff --git a/math/mathcore/src/TKDTreeBinning.cxx b/math/mathcore/src/TKDTreeBinning.cxx index 055add4c6fead..c7407b25fb0a3 100644 --- a/math/mathcore/src/TKDTreeBinning.cxx +++ b/math/mathcore/src/TKDTreeBinning.cxx @@ -136,6 +136,12 @@ void TKDTreeBinning::SetNBins(UInt_t bins) { fDataBins = new TKDTreeID(fDataSize, fDim, fDataSize / (fNBins - remainingData)); // TKDTree input is data size, data dimension and the content size of bins ("bucket" size) SetTreeData(); fDataBins->Build(); + // The kd-tree determines the actual number of bins: every terminal node + // is a bin and there are GetNNodes() + 1 of them. When the requested bin + // count does not divide the data size evenly, the bucket size is rounded + // down and the tree ends up with more terminal nodes than the requested + // number of bins. We must therefore use the real number of terminal nodes. + fNBins = fDataBins->GetNNodes() + 1; SetBinsEdges(); SetBinsContent(); } else { @@ -238,12 +244,13 @@ void TKDTreeBinning::SetTreeData() { } void TKDTreeBinning::SetBinsContent() { - // Sets the bins' content + // Sets the bins' content from the actual number of points in each terminal + // node of the kd-tree. All terminal nodes hold fBucketSize points except + // possibly the last one, so query the tree directly instead of guessing. fBinsContent.resize(fNBins); + UInt_t nTreeNodes = fDataBins->GetNNodes(); for (UInt_t i = 0; i < fNBins; ++i) - fBinsContent[i] = fDataBins->GetBucketSize(); - if ( fDataSize % fNBins != 0 ) - fBinsContent[fNBins - 1] = fDataSize % (fNBins-1); + fBinsContent[i] = fDataBins->GetNPointsNode(i + nTreeNodes); } void TKDTreeBinning::SetBinsEdges() { diff --git a/math/mathcore/test/testkdTreeBinning.cxx b/math/mathcore/test/testkdTreeBinning.cxx index ba763375a6bb4..e4eb67c2bf73a 100644 --- a/math/mathcore/test/testkdTreeBinning.cxx +++ b/math/mathcore/test/testkdTreeBinning.cxx @@ -28,7 +28,13 @@ using std::cout; using std::cerr; using std::endl; -void testkdTreeBinning() { +// Returns the number of detected failures (0 on success) so that the caller can +// turn it into a non-zero process exit code: ROOT's Error() only prints, it does +// not by itself make the test fail under ctest. +int testkdTreeBinning() +{ + + int nfail = 0; // ----------------------------------------------------------------------------------------------- // C r e a t e r a n d o m s a m p l e @@ -83,14 +89,22 @@ void testkdTreeBinning() { std::cout << "Bin with minimum density: " << ibinMin << " density = " << kdBins->GetBinDensity(ibinMin) << " content = " << kdBins->GetBinContent(ibinMin) << std::endl; std::cout << "Bin with maximum density: " << ibinMax << " density = " << kdBins->GetBinDensity(ibinMax) << " content = " << kdBins->GetBinContent(ibinMax) << std::endl; - if (kdBins->GetBinContent(ibinMax) != DATASZ/NBINS) + if (kdBins->GetBinContent(ibinMax) != DATASZ / NBINS) { Error("testkdTreeBinning","Wrong bin content"); + ++nfail; + } // order bins by density kdBins->SortBinsByDensity(true); - if (kdBins->GetBinMinDensity() != 0) Error("testkdTreeBinning","Wrong minimum bin after sorting"); - if (kdBins->GetBinMaxDensity() != nbins-1) Error("testkdTreeBinning","Wrong maximum bin after sorting"); + if (kdBins->GetBinMinDensity() != 0) { + Error("testkdTreeBinning", "Wrong minimum bin after sorting"); + ++nfail; + } + if (kdBins->GetBinMaxDensity() != nbins - 1) { + Error("testkdTreeBinning", "Wrong maximum bin after sorting"); + ++nfail; + } if (showGraphics) { new TCanvas(); @@ -104,7 +118,6 @@ void testkdTreeBinning() { double point[2] = {0,0}; // double binCenter[2]; gRandom->SetSeed(0); - bool ok = true; for (int itimes = 0; itimes < ntimes; itimes++) { // generate a random point in 2D @@ -126,12 +139,12 @@ void testkdTreeBinning() { std::cout << " point y " << point[1] << " BIN CENTER is " << binCenter[1] << " min " << binMin[1] << " max " << binMax[1] << std::endl; } - ok &= point[0] > binMin[0] && point[0] < binMax[0]; - ok &= point[1] > binMin[1] && point[1] < binMax[1]; + bool ok = point[0] > binMin[0] && point[0] < binMax[0] && point[1] > binMin[1] && point[1] < binMax[1]; if (!ok) { Error ("testkdTreeBinning::FindBin"," Point is not in the right bin " ); std::cout << " point x " << point[0] << " BIN CENTER is " << binCenter[0] << " min " << binMin[0] << " max " << binMax[0] << std::endl; std::cout << " point y " << point[1] << " BIN CENTER is " << binCenter[1] << " min " << binMin[1] << " max " << binMax[1] << std::endl; + ++nfail; } if (itimes < 2 && showGraphics ) { @@ -143,15 +156,74 @@ void testkdTreeBinning() { } delete [] binCenter; - } - else + } else { Error("testkdTreeBinning::FindBin"," Bin %d is not existing ",ibin); + ++nfail; + } + } + + return nfail; +} + +// Regression test for the case where the requested number of bins does not +// divide the data size evenly. In that situation the kd-tree builds more +// terminal nodes (bins) than naively expected, and FindBin() must never return +// an index outside [0, GetNBins()). See +// https://github.com/root-project/root/issues/10784 about +// TKDTreeBinning::FindBin returning non-existent bins. Returns the number of +// detected failures (0 on success) so that the caller can turn it into a +// non-zero process exit code: ROOT's Error() only prints, it does not by +// itself make the test fail under ctest. +int testkdTreeBinningFindBinRange() +{ + + int nfail = 0; + const UInt_t DATASZ = 100500; // deliberately NOT a multiple of NBINS + const UInt_t DATADIM = 5; + const UInt_t NBINS = 1000; + std::vector smp(DATASZ * DATADIM); + TRandom3 r; + r.SetSeed(1); + for (UInt_t i = 0; i < DATADIM; ++i) + for (UInt_t j = 0; j < DATASZ; ++j) + smp[DATASZ * i + j] = r.Uniform(-1., 1.); + + TKDTreeBinning kdBins(DATASZ, DATADIM, smp, NBINS); + + const UInt_t nbins = kdBins.GetNBins(); + + // The number of bins must match the number of terminal nodes of the kd-tree. + if ((int)nbins != kdBins.GetTree()->GetNNodes() + 1) { + Error("testkdTreeBinningFindBinRange", "GetNBins() (%u) != number of kd-tree terminal nodes (%d)", nbins, + kdBins.GetTree()->GetNNodes() + 1); + ++nfail; } - return; + // Every data point must be assigned to a valid bin. + std::vector point(DATADIM); + for (UInt_t j = 0; j < DATASZ; ++j) { + for (UInt_t i = 0; i < DATADIM; ++i) + point[i] = smp[DATASZ * i + j]; + UInt_t bin = kdBins.FindBin(point.data()); + if (bin >= nbins) { + Error("testkdTreeBinningFindBinRange", "FindBin returned out-of-range bin %u (NBins = %u)", bin, nbins); + ++nfail; + break; + } + } + + // The total bin content must add up to the data size. + Long64_t total = 0; + for (UInt_t i = 0; i < nbins; ++i) + total += kdBins.GetBinContent(i); + if (total != (Long64_t)DATASZ) { + Error("testkdTreeBinningFindBinRange", "Sum of bin contents (%lld) != data size (%u)", total, DATASZ); + ++nfail; + } + return nfail; } int main(int argc, char **argv) @@ -180,7 +252,9 @@ int main(int argc, char **argv) if ( showGraphics ) theApp = new TApplication("App",&argc,argv); - testkdTreeBinning(); + int nfail = testkdTreeBinning(); + + nfail += testkdTreeBinningFindBinRange(); if ( showGraphics ) { @@ -189,7 +263,6 @@ int main(int argc, char **argv) theApp = nullptr; } - return 0; - + return nfail; }