Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
GoFTest.cxx
Go to the documentation of this file.
1// @(#)root/mathcore:$Id$
2// Authors: Bartolomeu Rabacal 05/2010
3/**********************************************************************
4 * *
5 * Copyright (c) 2006 , LCG ROOT MathLib Team *
6 * *
7 * *
8 **********************************************************************/
9// implementation file for GoFTest
10
11
12#include <algorithm>
13#include <functional>
14#include <iostream>
15#include <map>
16#include <memory>
17#include <numeric>
18#include <cstring>
19#include <cassert>
20
21#include "Math/Error.h"
22#include "Math/Math.h"
23#include "Math/IFunction.h"
24#include "Math/IFunctionfwd.h"
25#include "Math/Integrator.h"
28
29#include "Math/GoFTest.h"
30
31#include "Fit/BinData.h"
32
33#include "TStopwatch.h"
34
35/* Note: The references mentioned here are stated in GoFTest.h */
36
37namespace ROOT {
38namespace Math {
39
40 struct CDFWrapper : public IGenFunction {
41 // wrapper around a cdf function to re-scale for the range
42 Double_t fXmin; // lower range for x
43 Double_t fXmax; // lower range for x
44 Double_t fNorm; // normalization
45 const IGenFunction* fCDF; // cdf pointer (owned by the class)
46
47
48 ~CDFWrapper() override { if (fCDF) delete fCDF; }
49
51 fCDF(cdf.Clone())
52 {
53 if (xmin >= xmax) {
54 fNorm = 1;
55 fXmin = -std::numeric_limits<double>::infinity();
56 fXmax = std::numeric_limits<double>::infinity();
57 }
58 else {
59 fNorm = cdf(xmax) - cdf(xmin);
60 fXmin = xmin;
61 fXmax = xmax;
62 }
63 }
64
65 Double_t DoEval(Double_t x) const override {
66 if (x <= fXmin) return 0;
67 if (x >= fXmax) return 1.0;
68 return (*fCDF)(x)/fNorm;
69 }
70
71 IGenFunction* Clone() const override {
72 return new CDFWrapper(*fCDF,fXmin,fXmax);
73 }
74 };
75
76
77 class PDFIntegral : public IGenFunction {
78 Double_t fXmin; // lower range for x
79 Double_t fXmax; // lower range for x
80 Double_t fNorm; // normalization
82 const IGenFunction* fPDF; // pdf pointer (owned by the class)
83 public:
84
85 ~PDFIntegral() override { if (fPDF) delete fPDF; }
86
88 fXmin(xmin),
89 fXmax(xmax),
90 fNorm(1),
91 fPDF(pdf.Clone())
92 {
93 // compute normalization
94 fIntegral.SetFunction(*fPDF); // N.B. must be fPDF (the cloned copy) and not pdf which can disappear
95 if (fXmin >= fXmax) {
96 fXmin = -std::numeric_limits<double>::infinity();
97 fXmax = std::numeric_limits<double>::infinity();
98 }
99 if (fXmin == -std::numeric_limits<double>::infinity() && fXmax == std::numeric_limits<double>::infinity() ) {
101 }
102 else if (fXmin == -std::numeric_limits<double>::infinity() )
104 else if (fXmax == std::numeric_limits<double>::infinity() )
106 else
108 }
109
110 Double_t DoEval(Double_t x) const override {
111 if (x <= fXmin) return 0;
112 if (x >= fXmax) return 1.0;
113 if (fXmin == -std::numeric_limits<double>::infinity() )
115 else
117 }
118
119 IGenFunction* Clone() const override {
120 return new PDFIntegral(*fPDF, fXmin, fXmax);
121 }
122 };
123
124 void GoFTest::SetDistribution(EDistribution dist, const std::vector<double> & distParams ) {
125 if (!(kGaussian <= dist && dist <= kExponential)) {
126 MATH_ERROR_MSG("SetDistribution", "Cannot set distribution type! Distribution type option must be enabled.");
127 return;
128 }
129 fDist = dist;
131 SetCDF();
132 }
133
135 : fDist(kUndefined),
136 fSamples(std::vector<std::vector<Double_t> >(2)),
137 fTestSampleFromH0(kFALSE) {
138 Bool_t badSampleArg = sample1 == nullptr || sample1Size == 0;
139 if (badSampleArg) {
140 std::string msg = "'sample1";
141 msg += !sample1Size ? "Size' cannot be zero" : "' cannot be zero-length";
142 MATH_ERROR_MSG("GoFTest", msg.c_str());
144 }
145 badSampleArg = sample2 == nullptr || sample2Size == 0;
146 if (badSampleArg) {
147 std::string msg = "'sample2";
148 msg += !sample2Size ? "Size' cannot be zero" : "' cannot be zero-length";
149 MATH_ERROR_MSG("GoFTest", msg.c_str());
151 }
152 std::vector<const Double_t*> samples(2);
153 std::vector<size_t> samplesSizes(2);
154 samples[0] = sample1;
155 samples[1] = sample2;
159 }
160
161 GoFTest::GoFTest(size_t sampleSize, const Double_t* sample, EDistribution dist, const std::vector<double> & distParams)
162 : fDist(dist),
163 fSamples(std::vector<std::vector<Double_t> >(1)),
164 fTestSampleFromH0(kTRUE) {
165 Bool_t badSampleArg = sample == nullptr || sampleSize == 0;
166 if (badSampleArg) {
167 std::string msg = "'sample";
168 msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
169 MATH_ERROR_MSG("GoFTest", msg.c_str());
171 }
172 std::vector<const Double_t*> samples(1, sample);
173 std::vector<size_t> samplesSizes(1, sampleSize);
176 SetCDF();
177 }
178
180
181 void GoFTest::SetSamples(std::vector<const Double_t*> samples, const std::vector<size_t> samplesSizes) {
182 fCombinedSamples.assign(std::accumulate(samplesSizes.begin(), samplesSizes.end(), 0u), 0.0);
183 size_t combinedSamplesSize = 0;
184 for (size_t i = 0; i < samples.size(); ++i) {
185 fSamples[i].assign(samples[i], samples[i] + samplesSizes[i]);
186 std::sort(fSamples[i].begin(), fSamples[i].end());
187 for (size_t j = 0; j < samplesSizes[i]; ++j) {
189 }
191 }
192 std::sort(fCombinedSamples.begin(), fCombinedSamples.end());
193
194 Bool_t degenerateSamples = *(fCombinedSamples.begin()) == *(fCombinedSamples.end() - 1);
195 if (degenerateSamples) {
196 std::string msg = "Degenerate sample";
197 msg += samplesSizes.size() > 1 ? "s!" : "!";
198 msg += " Sampling values all identical.";
199 MATH_ERROR_MSG("SetSamples", msg.c_str());
201 }
202 }
203
204 void GoFTest::SetParameters(const std::vector<double> & distParams) {
206 }
207
209 switch (test) {
210 default:
211 case kAD:
213 break;
214 case kAD2s:
216 break;
217 case kKS:
219 break;
220 case kKS2s:
222 }
223 }
224
226 Double_t result = 0.0;
227 switch (test) {
228 default:
229 case kAD:
231 break;
232 case kAD2s:
234 break;
235 case kKS:
237 break;
238 case kKS2s:
240 }
241 return result;
242 }
243
244 void GoFTest::SetCDF() { // Setting parameter-free distributions
245 IGenFunction* cdf = nullptr;
246 switch (fDist) {
247 case kLogNormal:
248 LogSample();
249 if (fParams.empty()) fParams = {0,1};
250 /* fall through */
251 case kGaussian :
253 if (fParams.empty()) fParams = {0,1};
254 break;
255 case kExponential:
257 if (fParams.empty()) fParams = {1};
258 break;
259 case kUserDefined:
260 case kUndefined:
261 default:
262 break;
263 }
264 fCDF.reset(cdf);
265 }
266
268 if (fDist > kUserDefined) {
269 MATH_WARN_MSG("SetDistributionFunction","Distribution type is changed to user defined");
270 }
272 // function will be cloned inside the wrapper PDFIntegral of CDFWrapper classes
273 if (isPDF)
274 fCDF = std::make_unique<PDFIntegral>(f, xmin, xmax );
275 else
276 fCDF = std::make_unique<CDFWrapper>(f, xmin, xmax );
277 }
278
280 // initialization function for the template constructors
281 Bool_t badSampleArg = sample == nullptr || sampleSize == 0;
282 if (badSampleArg) {
283 std::string msg = "'sample";
284 msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
285 MATH_ERROR_MSG("GoFTest", msg.c_str());
287 }
288 fCDF.reset((IGenFunction*)nullptr);
290 fSamples = std::vector<std::vector<Double_t> >(1);
292 SetSamples(std::vector<const Double_t*>(1, sample), std::vector<size_t>(1, sampleSize));
293 }
294
298
302
304 transform(fSamples[0].begin(), fSamples[0].end(), fSamples[0].begin(),
305 std::function<Double_t(Double_t)>(TMath::Log));
306 }
307
308/*
309 Taken from (1)
310*/
311 Double_t GoFTest::GetSigmaN(const std::vector<size_t> & ns, size_t N) {
312 // compute moments of AD distribution (from Scholz-Stephen paper, paragraph 3)
313
314 Double_t sigmaN = 0.0, h = 0.0, H = 0.0, g = 0.0, a, b, c, d, k = ns.size();
315
316 for (size_t i = 0; i < ns.size(); ++i) {
317 H += 1.0 / double( ns[i] );
318 }
319
320 // use approximate formulas for large N
321 // cache Sum( 1 / i)
322 if (N < 2000) {
323 std::vector<double> invI(N);
324 for (size_t i = 1; i <= N - 1; ++i) {
325 invI[i] = 1.0 / i;
326 h += invI[i];
327 }
328 for (size_t i = 1; i <= N - 2; ++i) {
329 double tmp = invI[N-i];
330 for (size_t j = i + 1; j <= N - 1; ++j) {
331 g += tmp * invI[j];
332 }
333 }
334 }
335 else {
336 // for N larger than 2000 error difference in g is ~ 5 10^-3 while in h is at the level of 10^-5
337 const double emc = 0.5772156649015328606065120900824024; // Euler-Mascheroni constant
338 h = std::log(double(N-1) ) + emc;
339 g = (M_PI)*(M_PI)/6.0;
340 }
341 double k2 = std::pow(k,2);
342 a = (4 * g - 6) * k + (10 - 6 * g) * H - 4 * g + 6;
343 b = (2 * g - 4) * k2 + 8 * h * k + (2 * g - 14 * h - 4) * H - 8 * h + 4 * g - 6;
344 c = (6 * h + 2 * g - 2) * k2 + (4 * h - 4 *g + 6) * k + (2 * h - 6) * H + 4 * h;
345 d = (2 * h + 6) * k2 - 4 * h * k;
346 sigmaN += a * std::pow(double(N),3) + b * std::pow(double(N),2) + c * N + d;
347 sigmaN /= ( double(N - 1) * double(N - 2) * double(N - 3) );
349 return sigmaN;
350 }
351
352
354
355 /*
356 Computation of p-values according to
357 "K-Sample Anderson-Darling Tests" by F.W. Scholz
358 and M.A. Stephens (1987), Journal of the American Statistical Association,
359 Vol 82, No. 399, pp 918-924.
360 Code from kSamples package from R (author F. Scholtz)
361
362 This function uses the upper T_m quantiles as obtained via simulation of
363 the Anderson-Darling test statistics (Nsim = 2*10^6) with sample sizes n=500
364 for each sample, and after standardization, in order to emulate the Table 1
365 values given in the above reference. However, here we estimate p-quantiles
366 for p = .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,
367 .1,.2,.3,.4,.5,.6,.7,.8,.9,.925,.95,.975,.99,.9925,.995,.9975,.999,
368 .99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999
369 First the appropriate p-quantiles are determined from those simulated
370 for ms = 1,2,3,4,6,8,10, Inf, interpolating to the given value of m.
371 Since we use only m=2 we avoid this interpolation.
372
373 Next linear inetrpolation to find the observed p value given the observed test statistic value.
374 We use interpolation in the test statistic -> log((1-p)/p) domain
375 and we extrapolatelinearly) beyond p = .00001 and .99999.
376 */
377
378 // sample values
379 //double ms[] = { 1, 2, 3, 4, 6, 8, 10, TMath::Infinity() };
380 //int ns = ms.size();
381 const int ns = 8;
382 double ts[ ] = { -1.1954, -1.5806, -1.8172,
383 -2.0032, -2.2526, -2.4204, -2.5283, -4.2649, -1.1786, -1.5394,
384 -1.7728, -1.9426, -2.1685, -2.3288, -2.4374, -3.8906, -1.166,
385 -1.5193, -1.7462, -1.9067, -2.126, -2.2818, -2.3926, -3.719,
386 -1.1407, -1.4659, -1.671, -1.8105, -2.0048, -2.1356, -2.2348,
387 -3.2905, -1.1253, -1.4371, -1.6314, -1.7619, -1.9396, -2.0637,
388 -2.1521, -3.0902, -1.0777, -1.3503, -1.5102, -1.6177, -1.761,
389 -1.8537, -1.9178, -2.5758, -1.0489, -1.2984, -1.4415, -1.5355,
390 -1.6625, -1.738, -1.7936, -2.3263, -0.9978, -1.2098, -1.3251,
391 -1.4007, -1.4977, -1.5555, -1.5941, -1.96, -0.9417, -1.1187,
392 -1.209, -1.2671, -1.3382, -1.379, -1.405, -1.6449, -0.8981, -1.0491,
393 -1.1235, -1.1692, -1.2249, -1.2552, -1.2755, -1.4395, -0.8598,
394 -0.9904, -1.0513, -1.0879, -1.1317, -1.155, -1.1694, -1.2816,
395 -0.7258, -0.7938, -0.8188, -0.8312, -0.8435, -0.8471, -0.8496,
396 -0.8416, -0.5966, -0.617, -0.6177, -0.6139, -0.6073, -0.5987,
397 -0.5941, -0.5244, -0.4572, -0.4383, -0.419, -0.4033, -0.3834,
398 -0.3676, -0.3587, -0.2533, -0.2966, -0.2428, -0.2078, -0.1844,
399 -0.1548, -0.1346, -0.1224, 0, -0.1009, -0.0169, 0.0304, 0.0596,
400 0.0933, 0.1156, 0.1294, 0.2533, 0.1571, 0.2635, 0.3169, 0.348,
401 0.3823, 0.4038, 0.4166, 0.5244, 0.5357, 0.6496, 0.6992, 0.7246,
402 0.7528, 0.7683, 0.7771, 0.8416, 1.2255, 1.2989, 1.3202, 1.3254,
403 1.3305, 1.3286, 1.3257, 1.2816, 1.5262, 1.5677, 1.5709, 1.5663,
404 1.5561, 1.5449, 1.5356, 1.4395, 1.9633, 1.943, 1.919, 1.8975,
405 1.8641, 1.8389, 1.8212, 1.6449, 2.7314, 2.5899, 2.5, 2.4451,
406 2.3664, 2.3155, 2.2823, 1.96, 3.7825, 3.4425, 3.2582, 3.1423,
407 3.0036, 2.9101, 2.8579, 2.3263, 4.1241, 3.716, 3.4984, 3.3651,
408 3.2003, 3.0928, 3.0311, 2.4324, 4.6044, 4.0847, 3.8348, 3.6714,
409 3.4721, 3.3453, 3.2777, 2.5758, 5.409, 4.7223, 4.4022, 4.1791,
410 3.9357, 3.7809, 3.6963, 2.807, 6.4954, 5.5823, 5.1456, 4.8657,
411 4.5506, 4.3275, 4.2228, 3.0902, 6.8279, 5.8282, 5.3658, 5.0749,
412 4.7318, 4.4923, 4.3642, 3.1747, 7.2755, 6.197, 5.6715, 5.3642,
413 4.9991, 4.7135, 4.5945, 3.2905, 8.1885, 6.8537, 6.2077, 5.8499,
414 5.4246, 5.1137, 4.9555, 3.4808, 9.3061, 7.6592, 6.85, 6.4806,
415 5.9919, 5.6122, 5.5136, 3.719, 9.6132, 7.9234, 7.1025, 6.6731,
416 6.1549, 5.8217, 5.7345, 3.7911, 10.0989, 8.2395, 7.4326, 6.9567,
417 6.3908, 6.011, 5.9566, 3.8906, 10.8825, 8.8994, 7.8934, 7.4501,
418 6.9009, 6.4538, 6.2705, 4.0556, 11.8537, 9.5482, 8.5568, 8.0283,
419 7.4418, 6.9524, 6.6195, 4.2649 };
420
421
422
423
424
425 // p values bins
426 double p[] = { .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,.1,.2,.3,.4,.5,.6,.7,.8,.9,
427 .925,.95,.975,.99,.9925,.995,.9975,.999,.99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999 };
428
429 //int nbins = p.size();
430 const int nbins = 35;
431 //assert ( nbins*ns == ts.size() );
432
433 // get ts values for nsamples = 2
434 // corresponding value is for m=nsamples-1
435 int offset = 0; // for m = 1 (i.e. for nsamples = 2)
436 if (nsamples != 2) {
437 MATH_ERROR_MSG("InterpolatePValues", "Interpolation not implemented for nsamples not equal to 2");
438 return 0;
439 }
440 std::vector<double> ts2(nbins); // ts values for nsamples = 2
441 std::vector<double> lp(nbins); // log ( p / (1-p) )
442 for (int i = 0; i < nbins; ++i)
443 {
444 ts2[i] = ts[offset+ i * ns];
445 p[i] = 1.-p[i];
446 lp[i] = std::log( p[i]/(1.-p[i] ) );
447 }
448 // do linear interpolation to find right lp value for given observed test staistic value
449 //auto it = std::lower_bound(ts2.begin(), ts2.end(), tx );
450 int i1 = std::distance(ts2.begin(), std::lower_bound(ts2.begin(), ts2.end(), tx ) ) - 1;
451 int i2 = i1+1;
452 // if tx is before min of tabulated data
453 if (i1 < 0) {
454 i1 = 0;
455 i2 = 1;
456 }
457 // if tx is after max of tabulated data
458 if (i2 >= int(ts2.size()) ) {
459 i1 = ts2.size()-2;
460 i2 = ts2.size()-1;
461 }
462
463 //std::cout << i1 << " , " << i2 << std::endl;
464 assert(i1 < (int) lp.size() && i2 < (int) lp.size() );
465 double lp1 = lp[i1];
466 double lp2 = lp[i2];
467 double tx1 = ts2[i1];
468 double tx2 = ts2[i2];
469
470 //std::cout << " tx1,2 " << tx1 << " " << tx2 << std::endl;
471 /// find interpolated (or extrapolated value)(
472 double lp0 = (lp1-lp2) * (tx - tx2)/ ( tx1-tx2) + lp2;
473
474
475 double p0 = exp(lp0)/(1. + exp(lp0) );
476 return p0;
477
478 }
479
480
481/*
482 Taken from (2)
484 Double_t pvalue = 0.0;
485 if (A2 <= 0.0) {
486 return pvalue;
487 } else if (A2 < 2.) {
488 pvalue = std::pow(A2, -0.5) * std::exp(-1.2337141 / A2) * (2.00012 + (0.247105 - (0.0649821 - (0.0347962 - (0.011672 - 0.00168691 * A2) * A2) * A2) * A2) * A2);
489 } else {
490 pvalue = std::exp(-1. * std::exp(1.0776 - (2.30695 - (0.43424 - (.082433 - (0.008056 - 0.0003146 * A2) * A2) * A2) * A2) * A2));
491 }
492 return 1. - pvalue;
493 }
494
495
496// code from kSamples (R) F. Scholz
497
498/* computes the k-sample Anderson-Darling test statistics in both original
499 and alternative versions for the nonparametric (rank) test described in
500 Scholz F.W. and Stephens M.A. (1987), K-sample Anderson-Darling Tests,
501 Journal of the American Statistical Association, Vol 82, No. 399,
502 pp. 918-924
503
504 Arguments:
505 adk: double array with length 2, stores AkN2 and AakN2
506 k: integer, number of samples being compared
507 x: double array storing the concatenated samples in the same order as ns
508 ns: integer array storing the k sample sizes, corresponding to x
509 zstar: double array storing the l distinct ordered observations in the
510 pooled sample
511 l: integer, length of zstar
512
513 Outputs:
514 when the computation ends, AkN2 and AakN2 are stored in the given memory
515 pointed by adk
516*/
517
518/* counts and returns the number of occurrence of a given number
519 in a double array */
520int getCount(double z, const double *dat, int n) {
521 int i;
522 int count = 0;
523
524 for (i = 0; i < n; i++) {
525 if (dat[i] == z) {
526 count++;
527 }
528 }
529
530 return(count);
531}
532
533/* computes and returns the sum of elements in a given integer array */
534int getSum(const int *x, int n) {
535 int i;
536 int sum = 0;
537
538 for (i = 0; i < n; i++) {
539 sum += x[i];
540 }
541
542 return(sum);
543}
544
545
546void adkTestStat(double *adk, const std::vector<std::vector<double> > & samples, const std::vector<double> & zstar) {
547
548 int i;
549 int j;
550
551 int nsum; /* total sample size = n_1 + ... + n_k */
552 int k = samples.size();
553 int l = zstar.size();
554
555 /* fij records the number of observations in the ith sample coinciding
556 with zstar[j], where i = 1, ..., k, and j = 1, ..., l */
557 std::vector<int> fij (k*l);
558 /* lvec is an integer vector with length l,
559 whose jth entry = \sum_{i=1}^{k} f_{ij}, i.e., the multiplicity
560 of zstar[j] */
561 std::vector<int> lvec(l);
562
563 /* for computation */
564 double mij;
565 double maij;
566 double innerSum;
567 double aInnerSum;
568 double bj;
569 double baj;
570 double tmp;
571
572 /* samples is a two-dimensional double array with length k;
573 it stores an array of k pointers to double arrays which are
574 the k samples being compared */
575// double **samples;
576
577 /* dynamically allocate memory */
578 //std::vector< std::vector<double> > samples(k);
579 std::vector<int> ns(k);
580 nsum = 0;
581 for (i = 0; i < k; i++) {
582 ns[i] = samples[i].size();
583 nsum += ns[i];
584 }
585
586 /* fij: k*l integer matrix, where l is the length of zstar and
587 k is the number of samples being compared
588 lvec: integer vector of length l, records the multiplicity of
589 each element of zstar */
590 for (j = 0; j < l; j++) {
591 lvec[j] = 0;
592 for (i = 0; i < k; i++) {
593 fij[i + j*k] = getCount(zstar[j], &samples[i][0], ns[i]);
594 lvec[j] += fij[i + j*k];
595 }
596 }
597
598 // loop on samples to compute the adk's
599 // Formula (6) and (7) of the paper
600 adk[0] = adk[1] = 0;
601 for (i = 0; i < k; i++) {
602 mij = 0;
603 maij = 0;
604 innerSum = 0;
605 aInnerSum = 0;
606
607 for (j = 0; j < l; j++) {
608 mij += fij[i + j*k];
609 maij = mij - (double) fij[i + j*k] / 2.0;
610 bj = getSum(&lvec[0], j + 1);
611 baj = bj - (double) lvec[j] / 2.0;
612
613 if (j < l - 1) {
614 tmp = (double) nsum * mij - (double) ns[i] * bj;
615 innerSum = innerSum + (double) lvec[j] * tmp * tmp /
616 (bj * ((double) nsum - bj));
617 }
618
619 tmp = (double) nsum * maij - (double) ns[i] * baj;
620 aInnerSum = aInnerSum + (double) lvec[j] * tmp * tmp /
621 (baj * (nsum - baj) - nsum * (double) lvec[j] / 4.0);
622 }
623
624 adk[0] = adk[0] + innerSum / ns[i]; /* AkN2*/
625 adk[1] = adk[1] + aInnerSum / ns[i]; /* AakN2 */
626 }
627
628 /* k-sample Anderson-Darling test statistics in both original and
629 alternative versions, AkN2 and AakN2, are stored in the given
630 double array adk */
631 adk[0] = adk[0] / (double) nsum; /* AkN2*/
632 adk[1] = (nsum - 1) * adk[1] / ((double) nsum * (double) nsum); /* AakN2 */
633
634 // /* free pointers */
635 // for (i = 0; i < k; i++) {
636 // free(samples[i]);
637 // }
638 // free(samples);
639
640}
641
642
643/*
644 Taken from (1) -- Named for 2 samples but implemented for K. Restricted to K = 2 by the class's constructors
645*/
647 pvalue = -1;
648 testStat = -1;
649 if (fTestSampleFromH0) {
650 MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
651 return;
652 }
653 std::vector<Double_t> z(fCombinedSamples);
654 // unique removes all consecutives duplicates elements. This is exactly what we wants
655 // for example unique of v={1,2,2,3,1,2,3,3} results in {1,2,3,1,2,3} which is exactly what we wants
656 std::vector<Double_t>::iterator endUnique = std::unique(z.begin(), z.end()); //z_j's in (1)
657 z.erase(endUnique, z.end() );
658 std::vector<size_t> h; // h_j's in (1)
659 std::vector<Double_t> H; // H_j's in (1)
660 size_t N = fCombinedSamples.size();
661 Double_t A2 = 0.0; // Anderson-Darling A^2 Test Statistic
662
663#ifdef USE_OLDIMPL
664
665 TStopwatch w; w.Start();
666
667 unsigned int nSamples = fSamples.size();
668
669 // old implementation
670 for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
671 size_t n = std::count(fCombinedSamples.begin(), fCombinedSamples.end(), *data);
672 h.push_back(n);
673 H.push_back(std::count_if(fCombinedSamples.begin(), fCombinedSamples.end(),
674 std::bind(std::less<Double_t>(), std::placeholders::_1, *data)) + n / 2.);
675 }
676 std::cout << "time for H";
677 w.Print();
678 w.Reset(); w.Start();
679 std::vector<std::vector<Double_t> > F(nSamples); // F_ij's in (1)
680 for (size_t i = 0; i < nSamples; ++i) {
681 for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
682 size_t n = std::count(fSamples[i].begin(), fSamples[i].end(), *data);
683 F[i].push_back(std::count_if(fSamples[i].begin(), fSamples[i].end(),
684 std::bind(std::less<Double_t>(), std::placeholders::_1, *data)) + n / 2.);
685 }
686 }
687 std::cout << "time for F";
688 w.Print();
689 for (size_t i = 0; i < nSamples; ++i) {
690 Double_t sum_result = 0.0;
691 size_t j = 0;
692 w.Reset(); w.Start();
693 for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
694 sum_result += h[j] * TMath::Power(N * F[i][j]- fSamples[i].size() * H[j], 2) / (H[j] * (N - H[j]) - N * h[j] / 4.0);
695 ++j;
696 }
697 std::cout << "time for sum_result";
698 w.Print();
699 std::cout << "sum_result " << sum_result << std::endl;
700 A2 += 1.0 / fSamples[i].size() * sum_result;
701 }
702 A2 *= (N - 1) / (TMath::Power(N, 2)); // A2_akN in (1)
703
704 std::cout << "A2 - old Bartolomeo code " << A2 << std::endl;
705#endif
706 // w.Reset();
707 // w.Start();
708
709 double adk[2] = {0,0};
710
711 //debug
712 // std::cout << "combined samples\n";
713 // for (int i = 0; i < fCombinedSamples.size(); ++i)
714 // std::cout << fCombinedSamples[i] << " ,";
715 // std::cout << std::endl;
716 // std::cout << ns[0] << " " << ns[1] << std::endl;
717 // std::cout << "Z\n";
718 // for (int i = 0; i < z.size(); ++i)
719 // std::cout << z[i] << " ,";
720 // std::cout << std::endl;
721
722 // use function from kSamples code
724 // w.Print();
725 // std::cout << "A2 - new kSamples code " << adk[0] << " " << adk[1] << std::endl;
726
727 A2 = adk[0];
728
729 // compute the normalized test statistic
730
731 std::vector<size_t> ns(fSamples.size());
732 for (unsigned int k = 0; k < ns.size(); ++k) ns[k] = fSamples[k].size();
733 Double_t sigmaN = GetSigmaN(ns, N);
734 A2 -= fSamples.size() - 1;
735 A2 /= sigmaN; // standardized test statistic
736
738 testStat = A2;
739 return;
740 }
741
742
743/*
744 Compute Anderson Darling test for two binned data set.
745 A binned data set can be seen as many identical observation happening at the center of the bin
746 In this way it is trivial to apply the formula (6) in the paper of W. Scholz, M. Stephens, "K-Sample Anderson-Darling Tests"
747 to the case of histograms. See also http://arxiv.org/pdf/0804.0380v1.pdf paragraph 3.3.5
748 It is important that empty bins are not present
749*/
751 pvalue = -1;
752 testStat = -1;
753 //
754 // compute cumulative sum of bin counts
755 // std::vector<double> sum1(data1.Size() );
756 // std::vector<double> sum2(data2.Size() );
757 // std::vector<double> sumAll(data1.Size() + data2.Size() );
758
759 if (data1.NDim() != 1 && data2.NDim() != 1) {
760 MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Bin Data set must be one-dimensional ");
761 return;
762 }
763 unsigned int n1 = data1.Size();
764 unsigned int n2 = data2.Size();
765 double ntot1 = 0;
766 double ntot2 = 0;
767
768
769 // make a combined data set and sort it
770 std::vector<double> xdata(n1+n2);
771 for (unsigned int i = 0; i < n1; ++i) {
772 double value = 0;
773 const double * x = data1.GetPoint(i, value);
774 xdata[i] = *x;
775 ntot1 += value;
776 }
777 for (unsigned int i = 0; i < n2; ++i) {
778 double value = 0;
779 const double * x = data2.GetPoint(i, value);
780 xdata[n1+i] = *x;
781 ntot2 += value;
782 }
783 double nall = ntot1+ntot2;
784 // sort the combined data
785 std::vector<unsigned int> index(n1+n2);
786 TMath::Sort(n1+n2, &xdata[0], &index[0], false );
787
788 // now compute the sums for the tests
789 double sum1 = 0;
790 double sum2 = 0;
791 double sumAll = 0;
792 double adsum = 0;
793 unsigned int j = 0;
794
795 while( j < n1+n2 ) {
796// for (unsigned int j = 0; j < n1+n2; ++j) {
797 // skip equal observations
798 double x = xdata[ index[j] ];
799 unsigned int k = j;
800 // loop on the bins with the same center value
801 double t = 0;
802 do {
803 unsigned int i = index[k];
804 double value = 0;
805 if (i < n1 ) {
806 value = data1.Value(i);
807 sum1 += value;
808 }
809 else {
810 // from data2
811 i -= n1;
812 assert(i < n2);
813 value = data2.Value(i);
814 sum2 += value;
815 }
816 sumAll += value;
817 t += value;
818 //std::cout << "j " << j << " k " << k << " data " << x << " index " << index[k] << " value " << value << std::endl;
819 k++;
820 } while ( k < n1+n2 && xdata[ index[k] ] == x );
821
822
823 j = k;
824 // skip last point
825 if (j < n1+n2) {
826 double tmp1 = ( nall * sum1 - ntot1 * sumAll );
827 double tmp2 = ( nall * sum2 - ntot2 * sumAll );
828 adsum += t * (tmp1*tmp1/ntot1 + tmp2*tmp2/ntot2) / ( sumAll * (nall - sumAll) ) ;
829
830 //std::cout << "comp sum " << adsum << " " << t << " " << sumAll << " s1 " << sum1 << " s2 " << sum2 << " tmp1 " << tmp1 << " tmp2 " << tmp2 << std::endl;
831 }
832 }
833 double A2 = adsum / nall;
834
835 // compute the normalized test statistic
836 std::vector<size_t> ns(2);
837 ns[0] = ntot1;
838 ns[1] = ntot2;
839 //std::cout << " ad2 = " << A2 << " nall " << nall;
840
842 A2 -= 1;
843 A2 /= sigmaN; // standardized test statistic
844
845 //std::cout << " sigmaN " << sigmaN << " new A2 " << A2;
846
848 //std::cout << " pvalue = " << pvalue << std::endl;
849 testStat = A2;
850 return;
851 }
852
853
857 return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
858 }
859
860/*
861 Taken from (3)
863 pvalue = -1;
864 testStat = -1;
865 if (!fTestSampleFromH0) {
866 MATH_ERROR_MSG("AndersonDarlingTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
867 return;
868 }
869 if (fDist == kUndefined) {
870 MATH_ERROR_MSG("AndersonDarlingTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
871 return;
872 }
873 Double_t A2 = 0.0;
874 Int_t n = fSamples[0].size();
875 for (Int_t i = 0; i < n ; ++i) {
876 Double_t x1 = fSamples[0][i];
877 Double_t w1 = (*fCDF)(x1);
878 Double_t result = (2 * (i + 1) - 1) * TMath::Log(w1) + (2 * (n - (i + 1)) + 1) * TMath::Log(1 - w1);
879 A2 += result;
880 }
881 (A2 /= -n) -= n;
882 if (A2 != A2) {
883 MATH_ERROR_MSG("AndersonDarlingTest", "Cannot compute p-value: data below or above the distribution's thresholds. Check sample consistency.");
884 return;
885 }
887 testStat = A2;
888 }
889
893 return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
894 }
895
897 pvalue = -1;
898 testStat = -1;
899 if (fTestSampleFromH0) {
900 MATH_ERROR_MSG("KolmogorovSmirnov2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
901 return;
902 }
903 const size_t na = fSamples[0].size();
904 const size_t nb = fSamples[1].size();
905 std::vector<Double_t> a(na);
906 std::vector<Double_t> b(nb);
907 std::copy(fSamples[0].begin(), fSamples[0].end(), a.begin());
908 std::copy(fSamples[1].begin(), fSamples[1].end(), b.begin());
909 pvalue = TMath::KolmogorovTest(na, a.data(), nb, b.data(), nullptr);
910 testStat = TMath::KolmogorovTest(na, a.data(), nb, b.data(), "M");
911 }
912
918
919/*
920 Algorithm taken from (3) in page 737
922 pvalue = -1;
923 testStat = -1;
924 if (!fTestSampleFromH0) {
925 MATH_ERROR_MSG("KolmogorovSmirnovTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
926 return;
927 }
928 if (fDist == kUndefined) {
929 MATH_ERROR_MSG("KolmogorovSmirnovTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
930 return;
931 }
932 Double_t Fo = 0.0, Dn = 0.0;
933 size_t n = fSamples[0].size();
934 for (size_t i = 0; i < n; ++i) {
935 Double_t Fn = (i + 1.0) / n;
936 Double_t F = (*fCDF)(fSamples[0][i]);
937 Double_t result = std::max(TMath::Abs(Fn - F), TMath::Abs(Fo - F));
938 if (result > Dn) Dn = result;
939 Fo = Fn;
940 }
941 pvalue = TMath::KolmogorovProb(Dn * (TMath::Sqrt(n) + 0.12 + 0.11 / TMath::Sqrt(n)));
942 testStat = Dn;
943 }
944
948 return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
949 }
950
951
952
953
954
955} // ROOT namespace
956} // Math namespace
957
#define MATH_ERROR_MSG(loc, str)
Definition Error.h:83
#define MATH_WARN_MSG(loc, str)
Definition Error.h:80
#define d(i)
Definition RSha256.hxx:102
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
#define c(i)
Definition RSha256.hxx:101
#define g(i)
Definition RSha256.hxx:105
#define a(i)
Definition RSha256.hxx:99
#define h(i)
Definition RSha256.hxx:106
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
#define M_PI
Definition Rotated.cxx:105
char Char_t
Definition RtypesCore.h:37
constexpr Bool_t kFALSE
Definition RtypesCore.h:94
double Double_t
Definition RtypesCore.h:59
constexpr Bool_t kTRUE
Definition RtypesCore.h:93
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
#define N
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t option
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void data
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
Option_t Option_t TPoint TPoint const char x1
float xmin
float xmax
Class describing the binned data sets : vectors of x coordinates, y values and optionally error on y ...
Definition BinData.h:52
static Double_t GetSigmaN(const std::vector< size_t > &ns, size_t N)
Computation of sigma_N as described in (1)
Definition GoFTest.cxx:311
void operator()(ETestType test, Double_t &pvalue, Double_t &testStat) const
The class's unary functions performing the gif test according to the ETestType provided.
Definition GoFTest.cxx:208
void SetDistributionFunction(const IGenFunction &cdf, Bool_t isPDF, Double_t xmin, Double_t xmax)
Definition GoFTest.cxx:267
std::unique_ptr< IGenFunction > fCDF
Pointer to CDF used in 1-sample test.
Definition GoFTest.h:235
Bool_t fTestSampleFromH0
Definition GoFTest.h:245
EDistribution
H0 distributions for using only with 1-sample tests.
Definition GoFTest.h:70
@ kLogNormal
Gaussian distribution with default mean=0, sigma=1.
Definition GoFTest.h:74
@ kExponential
Lognormal distribution with default meanlog=0, sigmalog=1.
Definition GoFTest.h:75
@ kGaussian
For internal use only within the class's template constructor.
Definition GoFTest.h:73
@ kUserDefined
Default value for non templated 1-sample test. Set with SetDistribution.
Definition GoFTest.h:72
EDistribution fDist
Type of distribution.
Definition GoFTest.h:238
void Instantiate(const Double_t *sample, size_t sampleSize)
Definition GoFTest.cxx:279
std::vector< Double_t > fCombinedSamples
The combined data.
Definition GoFTest.h:241
void KolmogorovSmirnovTest(Double_t &pvalue, Double_t &testStat) const
Kolmogorov-Smirnov 1-Sample Test.
Definition GoFTest.cxx:921
std::vector< Double_t > fParams
The distribution parameters (e.g. fParams[0] = mean, fParams[1] = sigma for a Gaussian)
Definition GoFTest.h:239
ETestType
Goodness of Fit test types for using with the class's unary functions as a shorthand for the in-built...
Definition GoFTest.h:85
@ kKS
Anderson-Darling 2-Samples Test.
Definition GoFTest.h:88
@ kKS2s
Kolmogorov-Smirnov Test.
Definition GoFTest.h:89
@ kAD2s
Anderson-Darling Test. Default value.
Definition GoFTest.h:87
void SetSamples(std::vector< const Double_t * > samples, const std::vector< size_t > samplesSizes)
set a vector of samples
Definition GoFTest.cxx:181
static Double_t PValueADKSamples(size_t nsamples, Double_t A2)
Computation of the K-Sample Anderson-Darling Test's p-value as described in (1)
Definition GoFTest.cxx:353
void LogSample()
Applies the logarithm to the sample when the specified distribution to test is LogNormal.
Definition GoFTest.cxx:303
void SetDistribution(EDistribution dist, const std::vector< double > &distParams={})
Sets the distribution for the predefined distribution types and optionally its parameters for 1-sampl...
Definition GoFTest.cxx:124
Double_t GaussianCDF(Double_t x) const
Definition GoFTest.cxx:295
void AndersonDarling2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Performs the Anderson-Darling 2-Sample Test.
Definition GoFTest.cxx:646
GoFTest()
Disallowed default constructor.
void KolmogorovSmirnov2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Kolmogorov-Smirnov 2-Samples Test.
Definition GoFTest.cxx:896
std::vector< std::vector< Double_t > > fSamples
The input data.
Definition GoFTest.h:243
Double_t PValueAD1Sample(Double_t A2) const
Computation of the 1-Sample Anderson-Darling Test's p-value.
Definition GoFTest.cxx:483
void AndersonDarlingTest(Double_t &pvalue, Double_t &testStat) const
Performs the Anderson-Darling 1-Sample Test.
Definition GoFTest.cxx:862
Double_t ExponentialCDF(Double_t x) const
Definition GoFTest.cxx:299
void SetParameters(const std::vector< double > &params)
Sets the distribution parameters.
Definition GoFTest.cxx:204
Interface (abstract class) for generic functions objects of one-dimension Provides a method to evalua...
Definition IFunction.h:112
User Class for performing numerical integration of a function in one dimension.
Definition Integrator.h:98
double IntegralUp(const IGenFunction &f, double a)
evaluate the Integral of a function f over the semi-infinite interval (a,+inf)
Definition Integrator.h:278
void SetFunction(Function &f)
method to set the a generic integration function
Definition Integrator.h:492
double Integral(Function &f, double a, double b)
evaluate the Integral of a function f over the defined interval (a,b)
Definition Integrator.h:499
double IntegralLow(const IGenFunction &f, double b)
evaluate the Integral of a function f over the over the semi-infinite interval (-inf,...
Definition Integrator.h:296
PDFIntegral(const IGenFunction &pdf, Double_t xmin=0, Double_t xmax=-1)
Definition GoFTest.cxx:87
Double_t DoEval(Double_t x) const override
implementation of the evaluation function. Must be implemented by derived classes
Definition GoFTest.cxx:110
const IGenFunction * fPDF
Definition GoFTest.cxx:82
~PDFIntegral() override
Definition GoFTest.cxx:85
IntegratorOneDim fIntegral
Definition GoFTest.cxx:81
IGenFunction * Clone() const override
Clone a function.
Definition GoFTest.cxx:119
Template class to wrap any member function of a class taking a double and returning a double in a 1D ...
const_iterator begin() const
const_iterator end() const
Stopwatch class.
Definition TStopwatch.h:28
double normal_cdf(double x, double sigma=1, double x0=0)
Cumulative distribution function of the normal (Gaussian) distribution (lower tail).
double exponential_cdf(double x, double lambda, double x0=0)
Cumulative distribution function of the exponential distribution (lower tail).
Double_t x[n]
Definition legend1.C:17
const Int_t n
Definition legend1.C:16
#define F(x, y, z)
#define H(x, y, z)
Namespace for new Math classes and functions.
void adkTestStat(double *adk, const std::vector< std::vector< double > > &samples, const std::vector< double > &zstar)
Definition GoFTest.cxx:546
int getCount(double z, const double *dat, int n)
Definition GoFTest.cxx:520
int getSum(const int *x, int n)
Definition GoFTest.cxx:534
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Double_t KolmogorovTest(Int_t na, const Double_t *a, Int_t nb, const Double_t *b, Option_t *option)
Statistical test whether two one-dimensional sets of points are compatible with coming from the same ...
Definition TMath.cxx:805
Double_t Log(Double_t x)
Returns the natural logarithm of x.
Definition TMath.h:760
Double_t Sqrt(Double_t x)
Returns the square root of x.
Definition TMath.h:666
LongDouble_t Power(LongDouble_t x, LongDouble_t y)
Returns x raised to the power y.
Definition TMath.h:725
Double_t KolmogorovProb(Double_t z)
Calculates the Kolmogorov distribution function,.
Definition TMath.cxx:679
void Sort(Index n, const Element *a, Index *index, Bool_t down=kTRUE)
Sort the n elements of the array a of generic templated type Element.
Definition TMathBase.h:431
Short_t Abs(Short_t d)
Returns the absolute value of parameter Short_t d.
Definition TMathBase.h:123
CDFWrapper(const IGenFunction &cdf, Double_t xmin=0, Double_t xmax=-1)
Definition GoFTest.cxx:50
Double_t DoEval(Double_t x) const override
implementation of the evaluation function. Must be implemented by derived classes
Definition GoFTest.cxx:65
IGenFunction * Clone() const override
Clone a function.
Definition GoFTest.cxx:71
const IGenFunction * fCDF
Definition GoFTest.cxx:45
~CDFWrapper() override
Definition GoFTest.cxx:48
TLine l
Definition textangle.C:4
static uint64_t sum(uint64_t i)
Definition Factory.cxx:2345