Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
GoFTest.h
Go to the documentation of this file.
1// @(#)root/mathcore:$Id$
2// Authors: Bartolomeu Rabacal 05/2010
3/**********************************************************************
4 * *
5 * Copyright (c) 2006 , LCG ROOT MathLib Team *
6 * *
7 * *
8 **********************************************************************/
9// Header file for GoFTest
10
11#ifndef ROOT_Math_GoFTest
12#define ROOT_Math_GoFTest
13
15#include "TMath.h"
16
17#include <memory>
18#include <vector>
19
20/*
21*/
22
23namespace ROOT {
24
25 namespace Fit {
26 class BinData;
27 }
28namespace Math {
29
30
31/**
32 @defgroup GoFClasses Goodness of Fit Tests
33 Classical one-dimensional goodness of fit tests for unbinned data.
34 ROOT provides 1 sample goodness of fit test (comparison of data with a theoretical distribution) and
35 2-sample test (comparison of two data sets) through the class ROOT::Math::GoFTest
36 The algorithms provided are the Kolmogorov-Smirnov and Anderson-Darling.
37 These tests could be applied approximately also to binned data, assuming the bin size is much smaller than the intrinsic
38 data variations. It is assumed that a bin is equivalent to many data points at the same bin center value.
39 For the binned versions of these tests, see `TH1::KolmogorovTest` and `TH1::AndersonDarlingTest`.
40 @ingroup MathCore
41 */
42
43/**
44 * GoFTest class implementing the 1 sample and 2 sample goodness of fit tests
45 * for uni-variate distributions and data.
46 * The class implements the AndersonDarling and the KolmogorovSmirnov tests
47 *
48 * In the case of the 1-sample test the user needs to provide:
49 * - input data
50 * - theoretical distribution. The distribution can be provided as a function object (functor) or an object implementing
51 * the `ROOT::Math::IGenFunction` interface. One can provide either the PDF (default) or the CDF (cumulative distribution).
52 * One can also provide a pre-defined function. In that case one needs to give also the distribution parameters otherwise the default values will be used.
53 * The pre-defined distributions are:
54 * - kGaussian with default parameter mean=0, sigma=1
55 * - kExponential with default parameter rate=1
56 * - kLogNormal with default parameter meanlog=0, sigmalog=1
57 *
58 * Note that one should not use distribution parameters computed from the data, otherwise the test will be biased.
59 * The 1-sample KS test using quantities computed from the data is called the Lilliefors test (see https://en.wikipedia.org/wiki/Lilliefors_test).
60 *
61 * @ingroup GoFClasses
62 */
63
64
65class GoFTest {
66public:
67
68 /// H0 distributions for using only with 1-sample tests.
69 /// One should provide the distribution parameters otherwise the default values will be used
71 kUndefined, ///< Default value for non templated 1-sample test. Set with SetDistribution
72 kUserDefined, ///< For internal use only within the class's template constructor
73 kGaussian, ///< Gaussian distribution with default mean=0, sigma=1
74 kLogNormal, ///< Lognormal distribution with default meanlog=0, sigmalog=1
75 kExponential ///< Exponential distribution with default rate=1
76 };
77
78 /// User input distribution option
80 kCDF, ///< Input distribution is a CDF : cumulative distribution function
81 kPDF ///< Input distribution is a PDF (Default value)
82 };
83
84 /// Goodness of Fit test types for using with the class's unary functions as a shorthand for the in-built methods
85 enum ETestType {
86 kAD, ///< Anderson-Darling Test. Default value
87 kAD2s, ///< Anderson-Darling 2-Samples Test
88 kKS, ///< Kolmogorov-Smirnov Test
89 kKS2s ///< Kolmogorov-Smirnov 2-Samples Test
90 };
91
92 /// Constructor for 2-samples tests
93 GoFTest(size_t sample1Size, const Double_t* sample1, size_t sample2Size, const Double_t* sample2);
94
95 /// Constructor for 1-sample tests with a specified distribution.
96 /// If a specific distribution is not specified it can be set later using SetDistribution.
97 GoFTest(size_t sampleSize, const Double_t* sample, EDistribution dist = kUndefined, const std::vector<double> & distParams = {});
98
99 /// Templated constructor for 1-sample tests with a user specified distribution as a functor object implementing `double operator()(double x)`.
100 template<class Dist>
101 GoFTest(size_t sampleSize, const Double_t* sample, Dist& dist, EUserDistribution userDist = kPDF,
102 Double_t xmin = 1, Double_t xmax = 0)
103 {
104 Instantiate(sample, sampleSize);
105 SetUserDistribution<Dist>(dist, userDist, xmin, xmax);
106 }
107
108 /// Constructor for 1-sample tests with a user specified distribution implementing the ROOT::Math::IGenFunction interface.
109 GoFTest(size_t sampleSize, const Double_t* sample, const IGenFunction& dist, EUserDistribution userDist = kPDF,
110 Double_t xmin = 1, Double_t xmax = 0)
111 {
112 Instantiate(sample, sampleSize);
113 SetUserDistribution(dist, userDist, xmin, xmax);
114 }
115
116 /// Sets the user input distribution function for 1-sample test as a generic functor object.
117 template<class Dist>
118 void SetUserDistribution(Dist& dist, EUserDistribution userDist = kPDF, Double_t xmin = 1, Double_t xmax = 0) {
119 WrappedFunction<Dist&> wdist(dist);
120 SetDistributionFunction(wdist, userDist, xmin, xmax);
121 }
122
123 /// Sets the user input distribution function for 1-sample test using the ROOT::Math::IGenFunction interface.
125 SetDistributionFunction(dist, userDist, xmin, xmax);
126 }
127
128 /// Sets the user input distribution as a probability density function for 1-sample tests.
129 template<class Dist>
130 void SetUserPDF(Dist& pdf, Double_t xmin = 1, Double_t xmax = 0) {
131 SetUserDistribution<Dist>(pdf, kPDF, xmin, xmax);
132 }
133
134 /// Specialization to set the user input distribution as a probability density function for 1-sample tests using the ROOT::Math::IGenFunction interface.
135 void SetUserPDF(const IGenFunction& pdf, Double_t xmin = 1, Double_t xmax = 0) {
137 }
138
139 /// Sets the user input distribution as a cumulative distribution function for 1-sample tests.
140 /// The CDF must return zero for x=xmin and 1 for x=xmax.
141 template<class Dist>
142 void SetUserCDF(Dist& cdf, Double_t xmin = 1, Double_t xmax = 0) {
143 SetUserDistribution<Dist>(cdf, kCDF, xmin, xmax);
144 }
145
146 /// Specialization to set the user input distribution as a cumulative distribution function for 1-sample tests.
147 void SetUserCDF(const IGenFunction& cdf, Double_t xmin = 1, Double_t xmax = 0) {
149 }
150
151
152 /// Sets the distribution for the predefined distribution types and optionally its parameters for 1-sample tests.
153 void SetDistribution(EDistribution dist, const std::vector<double> & distParams = {});
154
155
156 virtual ~GoFTest();
157
158 /// Performs the Anderson-Darling 2-Sample Test.
159 /// The Anderson-Darling K-Sample Test algorithm is described and taken from
160 /// http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/andeksam.htm
161 /// and from
162 /// (1) Scholz F.W., Stephens M.A. (1987), K-sample Anderson-Darling Tests, Journal of the American Statistical Association, 82, 918–924.
163 /// (2-samples variant implemented).
164 void AndersonDarling2SamplesTest(Double_t& pvalue, Double_t& testStat) const;
165
166 /// Anderson-Darling 2-Sample Test.
167 /// Returns by default the p-value; when using option "t" returns the test statistic value "A2".
168 Double_t AndersonDarling2SamplesTest(const Char_t* option = "p") const;
169
170 /**
171 Performs the Anderson-Darling 1-Sample Test.
172 The Anderson-Darling 1-Sample Test algorithm for a specific distribution is described at
173 http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/andedarl.htm
174 and described and taken from (2)
175 Marsaglia J.C.W., Marsaglia G. (2004), Evaluating the Anderson-Darling Distribution, Journal of Statistical Software, Volume 09, Issue i02.
176 and described and taken from (3)
177 Lewis P.A.W. (1961), The Annals of Mathematical Statistics, Distribution of the Anderson-Darling Statistic, Volume 32, Number 4, 1118-1124.
178 */
179 void AndersonDarlingTest(Double_t& pvalue, Double_t& testStat) const;
180
181 /// Anderson-Darling 1-Sample Test.
182 /// Returns by default the p-value; option "t" returns the test statistic value "A2".
183 Double_t AndersonDarlingTest(const Char_t* option = "p") const;
184
185 /**
186 * @brief Kolmogorov-Smirnov 2-Samples Test.
187 The Kolmogorov-Smirnov 2-Samples Test algorithm is described at
188 http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ks2samp.htm
189 and described and taken from
190 https://root.cern/doc/master/namespaceTMath.html
191 */
192 void KolmogorovSmirnov2SamplesTest(Double_t& pvalue, Double_t& testStat) const;
193
194 /// Kolmogorov-Smirnov 2-Samples Test.
195 /// Returns by default the p-value; option "t" returns the test statistic value "Dn".
196 Double_t KolmogorovSmirnov2SamplesTest(const Char_t* option = "p") const;
197
198 /**
199 * @brief Kolmogorov-Smirnov 1-Sample Test.
200 *
201 The Kolmogorov-Smirnov 1-Sample Test algorithm for a specific distribution is described at
202 http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/kstest.htm
203 and described and taken from (4)
204 Press W. H., Teukolsky S.A., Vetterling W.T., Flannery B.P. (2007), Numerical Recipes -
205 The Art of Scientific Computing (Third Edition), Cambridge University Press
206 */
207 void KolmogorovSmirnovTest(Double_t& pvalue, Double_t& testStat) const;
208
209 /// Kolmogorov-Smirnov 1-Sample Test.
210 /// Returns default p-value; option "t" returns the test statistic value "Dn".
211 Double_t KolmogorovSmirnovTest(const Char_t* option = "p") const;
212
213 /// The class's unary functions performing the goodness-of-fit test according to the ETestType provided.
214 void operator()(ETestType test, Double_t& pvalue, Double_t& testStat) const;
215
216 /// Returns default Anderson Darling 1-Sample Test and default p-value; option "t" returns the test statistic value
217 /// specific to the test type.
218 Double_t operator()(ETestType test = kAD, const Char_t* option = "p") const;
219
220 /// Computation of the K-Sample Anderson-Darling Test's p-value as described in (1)
221 /// given a normalized test statistic. The first variant described in the paper is used.
222 static Double_t PValueADKSamples(size_t nsamples, Double_t A2 );
223
224 /// Compute the 2-Sample Anderson Darling test for binned data
225 /// assuming equal data are present at the bin center values.
226 /// Used by `TH1::AndersonDarlingTest`
227 static void AndersonDarling2SamplesTest(const ROOT::Fit::BinData & data1, const ROOT::Fit::BinData & data2, Double_t& pvalue, Double_t& testStat);
228
229private:
230
231 GoFTest(); ///< Disallowed default constructor
232 GoFTest(GoFTest& gof); ///< Disallowed copy constructor
233 GoFTest operator=(GoFTest& gof); ///< Disallowed assign operator
234
235 std::unique_ptr<IGenFunction> fCDF; ///< Pointer to CDF used in 1-sample test
236
237
238 EDistribution fDist; ///< Type of distribution
239 std::vector<Double_t> fParams; ///< The distribution parameters (e.g. fParams[0] = mean, fParams[1] = sigma for a Gaussian)
240
241 std::vector<Double_t> fCombinedSamples; ///< The combined data
242
243 std::vector<std::vector<Double_t> > fSamples; ///< The input data
244
246
247 void SetCDF();
249
250 void Instantiate(const Double_t* sample, size_t sampleSize);
251
252
256
257 /// Computation of sigma_N as described in (1)
258 static Double_t GetSigmaN(const std::vector<size_t> & ns, size_t N);
259
260 /// Linear interpolation used in GoFTest::PValueAD2Samples
261 static Double_t InterpolatePValues(int nsamples,Double_t A2);
262
263 /// Computation of the 1-Sample Anderson-Darling Test's p-value
265
266 /// Applies the logarithm to the sample when the specified distribution to test is LogNormal
267 void LogSample();
268
269 /// set a vector of samples
270 void SetSamples(std::vector<const Double_t*> samples, const std::vector<size_t> samplesSizes);
271
272 /// Sets the distribution parameters
273 void SetParameters(const std::vector<double> & params);
274
275}; // end GoFTest class
276
277
278} // Math namespace
279} // ROOT namespace
280#endif
char Char_t
Definition RtypesCore.h:37
double Double_t
Definition RtypesCore.h:59
#define N
float xmin
float xmax
Class describing the binned data sets : vectors of x coordinates, y values and optionally error on y ...
Definition BinData.h:52
GoFTest class implementing the 1 sample and 2 sample goodness of fit tests for uni-variate distributi...
Definition GoFTest.h:65
static Double_t GetSigmaN(const std::vector< size_t > &ns, size_t N)
Computation of sigma_N as described in (1)
Definition GoFTest.cxx:311
void SetUserDistribution(Dist &dist, EUserDistribution userDist=kPDF, Double_t xmin=1, Double_t xmax=0)
Sets the user input distribution function for 1-sample test as a generic functor object.
Definition GoFTest.h:118
void operator()(ETestType test, Double_t &pvalue, Double_t &testStat) const
The class's unary functions performing the goodness-of-fit test according to the ETestType provided.
Definition GoFTest.cxx:208
void SetDistributionFunction(const IGenFunction &cdf, Bool_t isPDF, Double_t xmin, Double_t xmax)
Definition GoFTest.cxx:267
GoFTest(GoFTest &gof)
Disallowed copy constructor.
std::unique_ptr< IGenFunction > fCDF
Pointer to CDF used in 1-sample test.
Definition GoFTest.h:235
Bool_t fTestSampleFromH0
Definition GoFTest.h:245
void SetUserPDF(const IGenFunction &pdf, Double_t xmin=1, Double_t xmax=0)
Specialization to set the user input distribution as a probability density function for 1-sample test...
Definition GoFTest.h:135
EDistribution
H0 distributions for using only with 1-sample tests.
Definition GoFTest.h:70
@ kLogNormal
Lognormal distribution with default meanlog=0, sigmalog=1.
Definition GoFTest.h:74
@ kExponential
Exponential distribution with default rate=1.
Definition GoFTest.h:75
@ kGaussian
Gaussian distribution with default mean=0, sigma=1.
Definition GoFTest.h:73
@ kUserDefined
For internal use only within the class's template constructor.
Definition GoFTest.h:72
void SetUserPDF(Dist &pdf, Double_t xmin=1, Double_t xmax=0)
Sets the user input distribution as a probability density function for 1-sample tests.
Definition GoFTest.h:130
EDistribution fDist
Type of distribution.
Definition GoFTest.h:238
GoFTest operator=(GoFTest &gof)
Disallowed assign operator.
void SetUserCDF(const IGenFunction &cdf, Double_t xmin=1, Double_t xmax=0)
Specialization to set the user input distribution as a cumulative distribution function for 1-sample ...
Definition GoFTest.h:147
void Instantiate(const Double_t *sample, size_t sampleSize)
Definition GoFTest.cxx:279
std::vector< Double_t > fCombinedSamples
The combined data.
Definition GoFTest.h:241
void KolmogorovSmirnovTest(Double_t &pvalue, Double_t &testStat) const
Kolmogorov-Smirnov 1-Sample Test.
Definition GoFTest.cxx:921
std::vector< Double_t > fParams
The distribution parameters (e.g. fParams[0] = mean, fParams[1] = sigma for a Gaussian)
Definition GoFTest.h:239
ETestType
Goodness of Fit test types for using with the class's unary functions as a shorthand for the in-built...
Definition GoFTest.h:85
@ kKS
Kolmogorov-Smirnov Test.
Definition GoFTest.h:88
@ kKS2s
Kolmogorov-Smirnov 2-Samples Test.
Definition GoFTest.h:89
@ kAD2s
Anderson-Darling 2-Samples Test.
Definition GoFTest.h:87
void SetSamples(std::vector< const Double_t * > samples, const std::vector< size_t > samplesSizes)
set a vector of samples
Definition GoFTest.cxx:181
static Double_t PValueADKSamples(size_t nsamples, Double_t A2)
Computation of the K-Sample Anderson-Darling Test's p-value as described in (1)
Definition GoFTest.cxx:353
EUserDistribution
User input distribution option.
Definition GoFTest.h:79
@ kPDF
Input distribution is a PDF (Default value).
Definition GoFTest.h:81
Double_t LogNormalCDF(Double_t x) const
void LogSample()
Applies the logarithm to the sample when the specified distribution to test is LogNormal.
Definition GoFTest.cxx:303
void SetDistribution(EDistribution dist, const std::vector< double > &distParams={})
Sets the distribution for the predefined distribution types and optionally its parameters for 1-sampl...
Definition GoFTest.cxx:124
Double_t GaussianCDF(Double_t x) const
Definition GoFTest.cxx:295
void SetUserDistribution(const IGenFunction &dist, GoFTest::EUserDistribution userDist=kPDF, Double_t xmin=1, Double_t xmax=0)
Sets the user input distribution function for 1-sample test using the ROOT::Math::IGenFunction interf...
Definition GoFTest.h:124
void AndersonDarling2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Performs the Anderson-Darling 2-Sample Test.
Definition GoFTest.cxx:646
GoFTest()
Disallowed default constructor.
void KolmogorovSmirnov2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Kolmogorov-Smirnov 2-Samples Test.
Definition GoFTest.cxx:896
std::vector< std::vector< Double_t > > fSamples
The input data.
Definition GoFTest.h:243
Double_t PValueAD1Sample(Double_t A2) const
Computation of the 1-Sample Anderson-Darling Test's p-value.
Definition GoFTest.cxx:483
GoFTest(size_t sampleSize, const Double_t *sample, Dist &dist, EUserDistribution userDist=kPDF, Double_t xmin=1, Double_t xmax=0)
Templated constructor for 1-sample tests with a user specified distribution as a functor object imple...
Definition GoFTest.h:101
void AndersonDarlingTest(Double_t &pvalue, Double_t &testStat) const
Performs the Anderson-Darling 1-Sample Test.
Definition GoFTest.cxx:862
GoFTest(size_t sampleSize, const Double_t *sample, const IGenFunction &dist, EUserDistribution userDist=kPDF, Double_t xmin=1, Double_t xmax=0)
Constructor for 1-sample tests with a user specified distribution implementing the ROOT::Math::IGenFu...
Definition GoFTest.h:109
Double_t ExponentialCDF(Double_t x) const
Definition GoFTest.cxx:299
void SetParameters(const std::vector< double > &params)
Sets the distribution parameters.
Definition GoFTest.cxx:204
static Double_t InterpolatePValues(int nsamples, Double_t A2)
Linear interpolation used in GoFTest::PValueAD2Samples.
void SetUserCDF(Dist &cdf, Double_t xmin=1, Double_t xmax=0)
Sets the user input distribution as a cumulative distribution function for 1-sample tests.
Definition GoFTest.h:142
Interface (abstract class) for generic functions objects of one-dimension Provides a method to evalua...
Definition IFunction.h:112
Template class to wrap any C++ callable object which takes one argument i.e.
Double_t x[n]
Definition legend1.C:17
TFitResultPtr Fit(FitObject *h1, TF1 *f1, Foption_t &option, const ROOT::Math::MinimizerOptions &moption, const char *goption, ROOT::Fit::DataRange &range)
Definition HFitImpl.cxx:133
Namespace for new Math classes and functions.
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...