ROOT  6.06/09
Reference Guide
GoFTest.cxx
Go to the documentation of this file.
1 // @(#)root/mathcore:$Id$
2 // Authors: Bartolomeu Rabacal 05/2010
3 /**********************************************************************
4  * *
5  * Copyright (c) 2006 , LCG ROOT MathLib Team *
6  * *
7  * *
8  **********************************************************************/
9 // implementation file for GoFTest
10 
11 
12 #include <algorithm>
13 #include <functional>
14 #include <iostream>
15 #include <map>
16 #include <numeric>
17 #include <string.h>
18 #include <cassert>
19 
20 #include "Math/Error.h"
21 #include "Math/Math.h"
22 #include "Math/Integrator.h"
23 #include "Math/ProbFuncMathCore.h"
24 
25 #include "Math/GoFTest.h"
26 
27 #include "Fit/BinData.h"
28 
29 #include "TStopwatch.h"
30 
31 /* Note: The references mentioned here are stated in GoFTest.h */
32 
33 namespace ROOT {
34 namespace Math {
35 
36  struct CDFWrapper : public IGenFunction {
37  // wrapper around a cdf funciton to re-scale for the range
38  Double_t fXmin; // lower range for x
39  Double_t fXmax; // lower range for x
40  Double_t fNorm; // normalization
41  const IGenFunction* fCDF; // cdf pointer (owned by the class)
42 
43 
44  virtual ~CDFWrapper() { if (fCDF) delete fCDF; }
45 
46  CDFWrapper(const IGenFunction& cdf, Double_t xmin=0, Double_t xmax=-1) :
47  fCDF(cdf.Clone())
48  {
49  if (xmin >= xmax) {
50  fNorm = 1;
53  }
54  else {
55  fNorm = cdf(xmax) - cdf(xmin);
56  fXmin = xmin;
57  fXmax = xmax;
58  }
59  }
60 
61  Double_t DoEval(Double_t x) const {
62  if (x <= fXmin) return 0;
63  if (x >= fXmax) return 1.0;
64  return (*fCDF)(x)/fNorm;
65  }
66 
67  IGenFunction* Clone() const {
68  return new CDFWrapper(*fCDF,fXmin,fXmax);
69  }
70  };
71 
72 
73  class PDFIntegral : public IGenFunction {
74  Double_t fXmin; // lower range for x
75  Double_t fXmax; // lower range for x
76  Double_t fNorm; // normalization
77  mutable IntegratorOneDim fIntegral;
78  const IGenFunction* fPDF; // pdf pointer (owned by the class)
79  public:
80 
81  virtual ~PDFIntegral() { if (fPDF) delete fPDF; }
82 
83  PDFIntegral(const IGenFunction& pdf, Double_t xmin = 0, Double_t xmax = -1) :
84  fXmin(xmin),
85  fXmax(xmax),
86  fNorm(1),
87  fPDF(pdf.Clone())
88  {
89  // compute normalization
90  fIntegral.SetFunction(*fPDF); // N.B. must be fPDF (the cloned copy) and not pdf which can disappear
91  if (fXmin >= fXmax) {
94  }
96  fNorm = fIntegral.Integral();
97  }
98  else if (fXmin == -std::numeric_limits<double>::infinity() )
99  fNorm = fIntegral.IntegralLow(fXmax);
100  else if (fXmax == std::numeric_limits<double>::infinity() )
101  fNorm = fIntegral.IntegralUp(fXmin);
102  else
103  fNorm = fIntegral.Integral(fXmin, fXmax);
104  }
105 
106  Double_t DoEval(Double_t x) const {
107  if (x <= fXmin) return 0;
108  if (x >= fXmax) return 1.0;
109  if (fXmin == -std::numeric_limits<double>::infinity() )
110  return fIntegral.IntegralLow(x)/fNorm;
111  else
112  return fIntegral.Integral(fXmin,x)/fNorm;
113  }
114 
115  IGenFunction* Clone() const {
116  return new PDFIntegral(*fPDF, fXmin, fXmax);
117  }
118  };
119 
121  if (!(kGaussian <= dist && dist <= kExponential)) {
122  MATH_ERROR_MSG("SetDistribution", "Cannot set distribution type! Distribution type option must be ennabled.");
123  return;
124  }
125  fDist = dist;
126  SetCDF();
127  }
128 
129  GoFTest::GoFTest( UInt_t sample1Size, const Double_t* sample1, UInt_t sample2Size, const Double_t* sample2 )
130  : fCDF(std::auto_ptr<IGenFunction>((IGenFunction*)0)),
131  fDist(kUndefined),
132  fSamples(std::vector<std::vector<Double_t> >(2)),
133  fTestSampleFromH0(kFALSE) {
134  Bool_t badSampleArg = sample1 == 0 || sample1Size == 0;
135  if (badSampleArg) {
136  std::string msg = "'sample1";
137  msg += !sample1Size ? "Size' cannot be zero" : "' cannot be zero-length";
138  MATH_ERROR_MSG("GoFTest", msg.c_str());
139  assert(!badSampleArg);
140  }
141  badSampleArg = sample2 == 0 || sample2Size == 0;
142  if (badSampleArg) {
143  std::string msg = "'sample2";
144  msg += !sample2Size ? "Size' cannot be zero" : "' cannot be zero-length";
145  MATH_ERROR_MSG("GoFTest", msg.c_str());
146  assert(!badSampleArg);
147  }
148  std::vector<const Double_t*> samples(2);
149  std::vector<UInt_t> samplesSizes(2);
150  samples[0] = sample1;
151  samples[1] = sample2;
152  samplesSizes[0] = sample1Size;
153  samplesSizes[1] = sample2Size;
154  SetSamples(samples, samplesSizes);
155  SetParameters();
156  }
157 
159  : fCDF(std::auto_ptr<IGenFunction>((IGenFunction*)0)),
160  fDist(dist),
161  fSamples(std::vector<std::vector<Double_t> >(1)),
162  fTestSampleFromH0(kTRUE) {
163  Bool_t badSampleArg = sample == 0 || sampleSize == 0;
164  if (badSampleArg) {
165  std::string msg = "'sample";
166  msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
167  MATH_ERROR_MSG("GoFTest", msg.c_str());
168  assert(!badSampleArg);
169  }
170  std::vector<const Double_t*> samples(1, sample);
171  std::vector<UInt_t> samplesSizes(1, sampleSize);
172  SetSamples(samples, samplesSizes);
173  SetParameters();
174  SetCDF();
175  }
176 
178 
179  void GoFTest::SetSamples(std::vector<const Double_t*> samples, const std::vector<UInt_t> samplesSizes) {
180  fCombinedSamples.assign(std::accumulate(samplesSizes.begin(), samplesSizes.end(), 0), 0.0);
181  UInt_t combinedSamplesSize = 0;
182  for (UInt_t i = 0; i < samples.size(); ++i) {
183  fSamples[i].assign(samples[i], samples[i] + samplesSizes[i]);
184  std::sort(fSamples[i].begin(), fSamples[i].end());
185  for (UInt_t j = 0; j < samplesSizes[i]; ++j) {
186  fCombinedSamples[combinedSamplesSize + j] = samples[i][j];
187  }
188  combinedSamplesSize += samplesSizes[i];
189  }
190  std::sort(fCombinedSamples.begin(), fCombinedSamples.end());
191 
192  Bool_t degenerateSamples = *(fCombinedSamples.begin()) == *(fCombinedSamples.end() - 1);
193  if (degenerateSamples) {
194  std::string msg = "Degenerate sample";
195  msg += samplesSizes.size() > 1 ? "s!" : "!";
196  msg += " Sampling values all identical.";
197  MATH_ERROR_MSG("SetSamples", msg.c_str());
198  assert(!degenerateSamples);
199  }
200  }
201 
203  fMean = std::accumulate(fSamples[0].begin(), fSamples[0].end(), 0.0) / fSamples[0].size();
204  fSigma = TMath::Sqrt(1. / (fSamples[0].size() - 1) * (std::inner_product(fSamples[0].begin(), fSamples[0].end(), fSamples[0].begin(), 0.0) - fSamples[0].size() * TMath::Power(fMean, 2)));
205  }
206 
207  void GoFTest::operator()(ETestType test, Double_t& pvalue, Double_t& testStat) const {
208  switch (test) {
209  default:
210  case kAD:
211  AndersonDarlingTest(pvalue, testStat);
212  break;
213  case kAD2s:
214  AndersonDarling2SamplesTest(pvalue, testStat);
215  break;
216  case kKS:
217  KolmogorovSmirnovTest(pvalue, testStat);
218  break;
219  case kKS2s:
220  KolmogorovSmirnov2SamplesTest(pvalue, testStat);
221  }
222  }
223 
225  Double_t result = 0.0;
226  switch (test) {
227  default:
228  case kAD:
229  result = AndersonDarlingTest(option);
230  break;
231  case kAD2s:
232  result = AndersonDarling2SamplesTest(option);
233  break;
234  case kKS:
235  result = KolmogorovSmirnovTest(option);
236  break;
237  case kKS2s:
238  result = KolmogorovSmirnov2SamplesTest(option);
239  }
240  return result;
241  }
242 
243  void GoFTest::SetCDF() { // Setting parameter-free distributions
244  IGenFunction* cdf = 0;
245  switch (fDist) {
246  case kLogNormal:
247  LogSample();
248  case kGaussian :
250  break;
251  case kExponential:
253  break;
254  case kUserDefined:
255  case kUndefined:
256  default:
257  break;
258  }
259  fCDF = std::auto_ptr<IGenFunction>(cdf);
260  }
261 
263  if (fDist > kUserDefined) {
264  MATH_WARN_MSG("SetDistributionFunction","Distribution type is changed to user defined");
265  }
267  // function will be cloned inside the wrapper PDFIntegral of CDFWrapper classes
268  if (isPDF)
269  fCDF = std::auto_ptr<IGenFunction>(new PDFIntegral(f, xmin, xmax) );
270  else
271  fCDF = std::auto_ptr<IGenFunction>(new CDFWrapper(f, xmin, xmax) );
272  }
273 
275  // initialization function for the template constructors
276  Bool_t badSampleArg = sample == 0 || sampleSize == 0;
277  if (badSampleArg) {
278  std::string msg = "'sample";
279  msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
280  MATH_ERROR_MSG("GoFTest", msg.c_str());
281  assert(!badSampleArg);
282  }
283  fCDF = std::auto_ptr<IGenFunction>((IGenFunction*)0);
285  fMean = 0;
286  fSigma = 0;
287  fSamples = std::vector<std::vector<Double_t> >(1);
289  SetSamples(std::vector<const Double_t*>(1, sample), std::vector<UInt_t>(1, sampleSize));
290  }
291 
293  return ROOT::Math::normal_cdf(x, fSigma, fMean);
294  }
295 
297  return ROOT::Math::exponential_cdf(x, 1.0 / fMean);
298  }
299 
301  transform(fSamples[0].begin(), fSamples[0].end(), fSamples[0].begin(), std::ptr_fun<Double_t, Double_t>(TMath::Log));
302  SetParameters();
303  }
304 
305 /*
306  Taken from (1)
307 */
308  Double_t GoFTest::GetSigmaN(const std::vector<UInt_t> & ns, UInt_t N) {
309  // compute moments of AD distribution (from Scholz-Stephen paper, paragraph 3)
310 
311  Double_t sigmaN = 0.0, h = 0.0, H = 0.0, g = 0.0, a, b, c, d, k = ns.size();
312 
313  for (UInt_t i = 0; i < ns.size(); ++i) {
314  H += 1.0 / double( ns[i] );
315  }
316 
317  // use approximate formulas for large N
318  // cache Sum( 1 / i)
319  if (N < 2000) {
320  std::vector<double> invI(N);
321  for (UInt_t i = 1; i <= N - 1; ++i) {
322  invI[i] = 1.0 / i;
323  h += invI[i];
324  }
325  for (UInt_t i = 1; i <= N - 2; ++i) {
326  double tmp = invI[N-i];
327  for (UInt_t j = i + 1; j <= N - 1; ++j) {
328  g += tmp * invI[j];
329  }
330  }
331  }
332  else {
333  // for N larger than 2000 error difference in g is ~ 5 10^-3 while in h is at the level of 10^-5
334  const double emc = 0.5772156649015328606065120900824024; // Euler-Mascheroni constant
335  h = std::log(double(N-1) ) + emc;
336  g = (M_PI)*(M_PI)/6.0;
337  }
338  double k2 = std::pow(k,2);
339  a = (4 * g - 6) * k + (10 - 6 * g) * H - 4 * g + 6;
340  b = (2 * g - 4) * k2 + 8 * h * k + (2 * g - 14 * h - 4) * H - 8 * h + 4 * g - 6;
341  c = (6 * h + 2 * g - 2) * k2 + (4 * h - 4 *g + 6) * k + (2 * h - 6) * H + 4 * h;
342  d = (2 * h + 6) * k2 - 4 * h * k;
343  sigmaN += a * std::pow(double(N),3) + b * std::pow(double(N),2) + c * N + d;
344  sigmaN /= ( double(N - 1) * double(N - 2) * double(N - 3) );
345  sigmaN = TMath::Sqrt(sigmaN);
346  return sigmaN;
347  }
348 
349 
351 
352  /*
353  Computation of p-values according to
354  "K-Sample Anderson-Darling Tests" by F.W. Scholz
355  and M.A. Stephens (1987), Journal of the American Statistical Association,
356  Vol 82, No. 399, pp 918-924.
357  Code from kSamples package from R (author F. Scholtz)
358 
359  This function uses the upper T_m quantiles as obtained via simulation of
360  the Anderson-Darling test statistics (Nsim = 2*10^6) with sample sizes n=500
361  for each sample, and after standardization, in order to emulate the Table 1
362  values given in the above reference. However, here we estimate p-quantiles
363  for p = .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,
364  .1,.2,.3,.4,.5,.6,.7,.8,.9,.925,.95,.975,.99,.9925,.995,.9975,.999,
365  .99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999
366  First the appropriate p-quantiles are determined from those simulated
367  for ms = 1,2,3,4,6,8,10, Inf, interpolating to the given value of m.
368  Since we use only m=2 we avoid this interpolation.
369 
370  Next linear inetrpolation to find the observed p value given the observed test statistic value.
371  We use interpolation in the test statistic -> log((1-p)/p) domain
372  and we extrapolatelinearly) beyond p = .00001 and .99999.
373  */
374 
375  // sample values
376  //double ms[] = { 1, 2, 3, 4, 6, 8, 10, TMath::Infinity() };
377  //int ns = ms.size();
378  const int ns = 8;
379  double ts[ ] = { -1.1954, -1.5806, -1.8172,
380  -2.0032, -2.2526, -2.4204, -2.5283, -4.2649, -1.1786, -1.5394,
381  -1.7728, -1.9426, -2.1685, -2.3288, -2.4374, -3.8906, -1.166,
382  -1.5193, -1.7462, -1.9067, -2.126, -2.2818, -2.3926, -3.719,
383  -1.1407, -1.4659, -1.671, -1.8105, -2.0048, -2.1356, -2.2348,
384  -3.2905, -1.1253, -1.4371, -1.6314, -1.7619, -1.9396, -2.0637,
385  -2.1521, -3.0902, -1.0777, -1.3503, -1.5102, -1.6177, -1.761,
386  -1.8537, -1.9178, -2.5758, -1.0489, -1.2984, -1.4415, -1.5355,
387  -1.6625, -1.738, -1.7936, -2.3263, -0.9978, -1.2098, -1.3251,
388  -1.4007, -1.4977, -1.5555, -1.5941, -1.96, -0.9417, -1.1187,
389  -1.209, -1.2671, -1.3382, -1.379, -1.405, -1.6449, -0.8981, -1.0491,
390  -1.1235, -1.1692, -1.2249, -1.2552, -1.2755, -1.4395, -0.8598,
391  -0.9904, -1.0513, -1.0879, -1.1317, -1.155, -1.1694, -1.2816,
392  -0.7258, -0.7938, -0.8188, -0.8312, -0.8435, -0.8471, -0.8496,
393  -0.8416, -0.5966, -0.617, -0.6177, -0.6139, -0.6073, -0.5987,
394  -0.5941, -0.5244, -0.4572, -0.4383, -0.419, -0.4033, -0.3834,
395  -0.3676, -0.3587, -0.2533, -0.2966, -0.2428, -0.2078, -0.1844,
396  -0.1548, -0.1346, -0.1224, 0, -0.1009, -0.0169, 0.0304, 0.0596,
397  0.0933, 0.1156, 0.1294, 0.2533, 0.1571, 0.2635, 0.3169, 0.348,
398  0.3823, 0.4038, 0.4166, 0.5244, 0.5357, 0.6496, 0.6992, 0.7246,
399  0.7528, 0.7683, 0.7771, 0.8416, 1.2255, 1.2989, 1.3202, 1.3254,
400  1.3305, 1.3286, 1.3257, 1.2816, 1.5262, 1.5677, 1.5709, 1.5663,
401  1.5561, 1.5449, 1.5356, 1.4395, 1.9633, 1.943, 1.919, 1.8975,
402  1.8641, 1.8389, 1.8212, 1.6449, 2.7314, 2.5899, 2.5, 2.4451,
403  2.3664, 2.3155, 2.2823, 1.96, 3.7825, 3.4425, 3.2582, 3.1423,
404  3.0036, 2.9101, 2.8579, 2.3263, 4.1241, 3.716, 3.4984, 3.3651,
405  3.2003, 3.0928, 3.0311, 2.4324, 4.6044, 4.0847, 3.8348, 3.6714,
406  3.4721, 3.3453, 3.2777, 2.5758, 5.409, 4.7223, 4.4022, 4.1791,
407  3.9357, 3.7809, 3.6963, 2.807, 6.4954, 5.5823, 5.1456, 4.8657,
408  4.5506, 4.3275, 4.2228, 3.0902, 6.8279, 5.8282, 5.3658, 5.0749,
409  4.7318, 4.4923, 4.3642, 3.1747, 7.2755, 6.197, 5.6715, 5.3642,
410  4.9991, 4.7135, 4.5945, 3.2905, 8.1885, 6.8537, 6.2077, 5.8499,
411  5.4246, 5.1137, 4.9555, 3.4808, 9.3061, 7.6592, 6.85, 6.4806,
412  5.9919, 5.6122, 5.5136, 3.719, 9.6132, 7.9234, 7.1025, 6.6731,
413  6.1549, 5.8217, 5.7345, 3.7911, 10.0989, 8.2395, 7.4326, 6.9567,
414  6.3908, 6.011, 5.9566, 3.8906, 10.8825, 8.8994, 7.8934, 7.4501,
415  6.9009, 6.4538, 6.2705, 4.0556, 11.8537, 9.5482, 8.5568, 8.0283,
416  7.4418, 6.9524, 6.6195, 4.2649 };
417 
418 
419 
420 
421 
422  // p values bins
423  double p[] = { .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,.1,.2,.3,.4,.5,.6,.7,.8,.9,
424  .925,.95,.975,.99,.9925,.995,.9975,.999,.99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999 };
425 
426  //int nbins = p.size();
427  const int nbins = 35;
428  //assert ( nbins*ns == ts.size() );
429 
430  // get ts values for nsamples = 2
431  // corresponding value is for m=nsamples-1
432  int offset = 0; // for m = 1 (i.e. for nsamples = 2)
433  if (nsamples != 2) {
434  MATH_ERROR_MSG("InterpolatePValues", "Interpolation not implemented for nsamples not equal to 2");
435  return 0;
436  }
437  std::vector<double> ts2(nbins); // ts values for nsamples = 2
438  std::vector<double> lp(nbins); // log ( p / (1-p) )
439  for (int i = 0; i < nbins; ++i)
440  {
441  ts2[i] = ts[offset+ i * ns];
442  p[i] = 1.-p[i];
443  lp[i] = std::log( p[i]/(1.-p[i] ) );
444  }
445  // do linear interpolation to find right lp value for given observed test staistic value
446  //auto it = std::lower_bound(ts2.begin(), ts2.end(), tx );
447  int i1 = std::distance(ts2.begin(), std::lower_bound(ts2.begin(), ts2.end(), tx ) ) - 1;
448  int i2 = i1+1;
449  // if tx is before min of tabluated data
450  if (i1 < 0) {
451  i1 = 0;
452  i2 = 1;
453  }
454  // if tx is after max of tabulated data
455  if (i2 >= int(ts2.size()) ) {
456  i1 = ts2.size()-2;
457  i2 = ts2.size()-1;
458  }
459 
460  //std::cout << i1 << " , " << i2 << std::endl;
461  assert(i1 < (int) lp.size() && i2 < (int) lp.size() );
462  double lp1 = lp[i1];
463  double lp2 = lp[i2];
464  double tx1 = ts2[i1];
465  double tx2 = ts2[i2];
466 
467  //std::cout << " tx1,2 " << tx1 << " " << tx2 << std::endl;
468  /// find interpolated (or extrapolated value)(
469  double lp0 = (lp1-lp2) * (tx - tx2)/ ( tx1-tx2) + lp2;
470 
471 
472  double p0 = exp(lp0)/(1. + exp(lp0) );
473  return p0;
474 
475  }
476 
477 
478 /*
479  Taken from (2)
481  Double_t pvalue = 0.0;
482  if (A2 <= 0.0) {
483  return pvalue;
484  } else if (A2 < 2.) {
485  pvalue = std::pow(A2, -0.5) * std::exp(-1.2337141 / A2) * (2.00012 + (0.247105 - (0.0649821 - (0.0347962 - (0.011672 - 0.00168691 * A2) * A2) * A2) * A2) * A2);
486  } else {
487  pvalue = std::exp(-1. * std::exp(1.0776 - (2.30695 - (0.43424 - (.082433 - (0.008056 - 0.0003146 * A2) * A2) * A2) * A2) * A2));
488  }
489  return 1. - pvalue;
490  }
491 
492 
493 // code from kSamples (R) F. Scholz
494 
495 /* computes the k-sample Anderson-Darling test statistics in both original
496  and alternative versions for the nonparametric (rank) test described in
497  Scholz F.W. and Stephens M.A. (1987), K-sample Anderson-Darling Tests,
498  Journal of the American Statistical Association, Vol 82, No. 399,
499  pp. 918-924
500 
501  Arguments:
502  adk: double array with length 2, stores AkN2 and AakN2
503  k: integer, number of samples being compared
504  x: double array storing the concatenated samples in the same order as ns
505  ns: integer array storing the k sample sizes, corresponding to x
506  zstar: double array storing the l distinct ordered observations in the
507  pooled sample
508  l: integer, length of zstar
509 
510  Outputs:
511  when the computation ends, AkN2 and AakN2 are stored in the given memory
512  pointed by adk
513 */
514 
515 /* counts and returns the number of occurrence of a given number
516  in a double array */
517 int getCount(double z, const double *dat, int n) {
518  int i;
519  int count = 0;
520 
521  for (i = 0; i < n; i++) {
522  if (dat[i] == z) {
523  count++;
524  }
525  }
526 
527  return(count);
528 }
529 
530 /* computes and returns the sum of elements in a given integer array */
531 int getSum(const int *x, int n) {
532  int i;
533  int sum = 0;
534 
535  for (i = 0; i < n; i++) {
536  sum += x[i];
537  }
538 
539  return(sum);
540 }
541 
542 
543  void adkTestStat(double *adk, const std::vector<std::vector<double> > & samples, const std::vector<double> & zstar) {
544 
545  int i;
546  int j;
547 
548  int nsum; /* total sample size = n_1 + ... + n_k */
549  int k = samples.size();
550  int l = zstar.size();
551 
552  /* fij records the number of observations in the ith sample coinciding
553  with zstar[j], where i = 1, ..., k, and j = 1, ..., l */
554  std::vector<int> fij (k*l);
555  /* lvec is an integer vector with length l,
556  whose jth entry = \sum_{i=1}^{k} f_{ij}, i.e., the multiplicity
557  of zstar[j] */
558  std::vector<int> lvec(l);
559 
560  /* for computation */
561  double mij;
562  double maij;
563  double innerSum;
564  double aInnerSum;
565  double bj;
566  double baj;
567  double tmp;
568 
569  /* samples is a two-dimensional double array with length k;
570  it stores an array of k pointers to double arrays which are
571  the k samples beeing compared */
572 // double **samples;
573 
574  /* dynamically allocate memory */
575  //std::vector< std::vector<double> > samples(k);
576  std::vector<int> ns(k);
577  nsum = 0;
578  for (i = 0; i < k; i++) {
579  ns[i] = samples[i].size();
580  nsum += ns[i];
581  }
582 
583  /* fij: k*l integer matrix, where l is the length of zstar and
584  k is the number of samples being compared
585  lvec: integer vector of length l, records the multiplicity of
586  each element of zstar */
587  for (j = 0; j < l; j++) {
588  lvec[j] = 0;
589  for (i = 0; i < k; i++) {
590  fij[i + j*k] = getCount(zstar[j], &samples[i][0], ns[i]);
591  lvec[j] += fij[i + j*k];
592  }
593  }
594 
595  // loop on samples to compute the adk's
596  // Formula (6) and (7) of the paper
597  adk[0] = adk[1] = 0;
598  for (i = 0; i < k; i++) {
599  mij = 0;
600  maij = 0;
601  innerSum = 0;
602  aInnerSum = 0;
603 
604  for (j = 0; j < l; j++) {
605  mij += fij[i + j*k];
606  maij = mij - (double) fij[i + j*k] / 2.0;
607  bj = getSum(&lvec[0], j + 1);
608  baj = bj - (double) lvec[j] / 2.0;
609 
610  if (j < l - 1) {
611  tmp = (double) nsum * mij - (double) ns[i] * bj;
612  innerSum = innerSum + (double) lvec[j] * tmp * tmp /
613  (bj * ((double) nsum - bj));
614  }
615 
616  tmp = (double) nsum * maij - (double) ns[i] * baj;
617  aInnerSum = aInnerSum + (double) lvec[j] * tmp * tmp /
618  (baj * (nsum - baj) - nsum * (double) lvec[j] / 4.0);
619  }
620 
621  adk[0] = adk[0] + innerSum / ns[i]; /* AkN2*/
622  adk[1] = adk[1] + aInnerSum / ns[i]; /* AakN2 */
623  }
624 
625  /* k-sample Anderson-Darling test statistics in both original and
626  alternative versions, AkN2 and AakN2, are stored in the given
627  double array adk */
628  adk[0] = adk[0] / (double) nsum; /* AkN2*/
629  adk[1] = (nsum - 1) * adk[1] / ((double) nsum * (double) nsum); /* AakN2 */
630 
631  // /* free pointers */
632  // for (i = 0; i < k; i++) {
633  // free(samples[i]);
634  // }
635  // free(samples);
636 
637 }
638 
639 
640 /*
641  Taken from (1) -- Named for 2 samples but implemented for K. Restricted to K = 2 by the class's constructors
642 */
644  pvalue = -1;
645  testStat = -1;
646  if (fTestSampleFromH0) {
647  MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
648  return;
649  }
650  std::vector<Double_t> z(fCombinedSamples);
651  // unique removes all consecutives duplicates elements. This is exactly what we wants
652  // for example unique of v={1,2,2,3,1,2,3,3} results in {1,2,3,1,2,3} which is exactly what we wants
653  std::vector<Double_t>::iterator endUnique = std::unique(z.begin(), z.end()); //z_j's in (1)
654  z.erase(endUnique, z.end() );
655  std::vector<UInt_t> h; // h_j's in (1)
656  std::vector<Double_t> H; // H_j's in (1)
657  UInt_t N = fCombinedSamples.size();
658  Double_t A2 = 0.0; // Anderson-Darling A^2 Test Statistic
659 
660 #ifdef USE_OLDIMPL
661 
662  TStopwatch w; w.Start();
663 
664  unsigned int nSamples = fSamples.size();
665 
666  // old implementation
667  for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
668  UInt_t n = std::count(fCombinedSamples.begin(), fCombinedSamples.end(), *data);
669  h.push_back(n);
670  H.push_back(std::count_if(fCombinedSamples.begin(), fCombinedSamples.end(), bind2nd(std::less<Double_t>(), *data)) + n / 2.);
671  }
672  std::cout << "time for H";
673  w.Print();
674  w.Reset(); w.Start();
675  std::vector<std::vector<Double_t> > F(nSamples); // F_ij's in (1)
676  for (UInt_t i = 0; i < nSamples; ++i) {
677  for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
678  UInt_t n = std::count(fSamples[i].begin(), fSamples[i].end(), *data);
679  F[i].push_back(std::count_if(fSamples[i].begin(), fSamples[i].end(), bind2nd(std::less<Double_t>(), *data)) + n / 2.);
680  }
681  }
682  std::cout << "time for F";
683  w.Print();
684  for (UInt_t i = 0; i < nSamples; ++i) {
685  Double_t sum_result = 0.0;
686  UInt_t j = 0;
687  w.Reset(); w.Start();
688  for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
689  sum_result += h[j] * TMath::Power(N * F[i][j]- fSamples[i].size() * H[j], 2) / (H[j] * (N - H[j]) - N * h[j] / 4.0);
690  ++j;
691  }
692  std::cout << "time for sum_resut";
693  w.Print();
694  std::cout << "sum_result " << sum_result << std::endl;
695  A2 += 1.0 / fSamples[i].size() * sum_result;
696  }
697  A2 *= (N - 1) / (TMath::Power(N, 2)); // A2_akN in (1)
698 
699  std::cout << "A2 - old Bartolomeo code " << A2 << std::endl;
700 #endif
701  // w.Reset();
702  // w.Start();
703 
704  double adk[2] = {0,0};
705 
706  //debug
707  // std::cout << "combined samples\n";
708  // for (int i = 0; i < fCombinedSamples.size(); ++i)
709  // std::cout << fCombinedSamples[i] << " ,";
710  // std::cout << std::endl;
711  // std::cout << ns[0] << " " << ns[1] << std::endl;
712  // std::cout << "Z\n";
713  // for (int i = 0; i < z.size(); ++i)
714  // std::cout << z[i] << " ,";
715  // std::cout << std::endl;
716 
717  // use function from kSamples code
718  adkTestStat(adk, fSamples, z );
719  // w.Print();
720  // std::cout << "A2 - new kSamples code " << adk[0] << " " << adk[1] << std::endl;
721 
722  A2 = adk[0];
723 
724  // compute the normalized test statistic
725 
726  std::vector<UInt_t> ns(fSamples.size());
727  for (unsigned int k = 0; k < ns.size(); ++k) ns[k] = fSamples[k].size();
728  Double_t sigmaN = GetSigmaN(ns, N);
729  A2 -= fSamples.size() - 1;
730  A2 /= sigmaN; // standartized test statistic
731 
732  pvalue = PValueADKSamples(2,A2);
733  testStat = A2;
734  return;
735  }
736 
737 
738 /*
739  Compute Anderson Darling test for two binned data set.
740  A binned data set can be seen as many identical observation happening at the center of the bin
741  In this way it is trivial to apply the formula (6) in the paper of W. Scholz, M. Stephens, "K-Sample Anderson-Darling Tests"
742  to the case of histograms. See also http://arxiv.org/pdf/0804.0380v1.pdf paragraph 3.3.5
743  It is importat that empty bins are not present
744 */
745  void GoFTest::AndersonDarling2SamplesTest(const ROOT::Fit::BinData &data1, const ROOT::Fit::BinData & data2, Double_t& pvalue, Double_t& testStat) {
746  pvalue = -1;
747  testStat = -1;
748  //
749  // compute cumulative sum of bin counts
750  // std::vector<double> sum1(data1.Size() );
751  // std::vector<double> sum2(data2.Size() );
752  // std::vector<double> sumAll(data1.Size() + data2.Size() );
753 
754  if (data1.NDim() != 1 && data2.NDim() != 1) {
755  MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Bin Data set must be one-dimensional ");
756  return;
757  }
758  unsigned int n1 = data1.Size();
759  unsigned int n2 = data2.Size();
760  double ntot1 = 0;
761  double ntot2 = 0;
762 
763 
764  // make a combined data set and sort it
765  std::vector<double> xdata(n1+n2);
766  for (unsigned int i = 0; i < n1; ++i) {
767  double value = 0;
768  const double * x = data1.GetPoint(i, value);
769  xdata[i] = *x;
770  ntot1 += value;
771  }
772  for (unsigned int i = 0; i < n2; ++i) {
773  double value = 0;
774  const double * x = data2.GetPoint(i, value);
775  xdata[n1+i] = *x;
776  ntot2 += value;
777  }
778  double nall = ntot1+ntot2;
779  // sort the combined data
780  std::vector<unsigned int> index(n1+n2);
781  TMath::Sort(n1+n2, &xdata[0], &index[0], false );
782 
783  // now compute the sums for the tests
784  double sum1 = 0;
785  double sum2 = 0;
786  double sumAll = 0;
787  double adsum = 0;
788  unsigned int j = 0;
789 
790  while( j < n1+n2 ) {
791 // for (unsigned int j = 0; j < n1+n2; ++j) {
792  // skip equal observations
793  double x = xdata[ index[j] ];
794  unsigned int k = j;
795  // loop on the bins with the same center value
796  double t = 0;
797  do {
798  unsigned int i = index[k];
799  double value = 0;
800  if (i < n1 ) {
801  value = data1.Value(i);
802  sum1 += value;
803  }
804  else {
805  // from data2
806  i -= n1;
807  assert(i < n2);
808  value = data2.Value(i);
809  sum2 += value;
810  }
811  sumAll += value;
812  t += value;
813  //std::cout << "j " << j << " k " << k << " data " << x << " index " << index[k] << " value " << value << std::endl;
814  k++;
815  } while ( k < n1+n2 && xdata[ index[k] ] == x );
816 
817 
818  j = k;
819  // skip last point
820  if (j < n1+n2) {
821  double tmp1 = ( nall * sum1 - ntot1 * sumAll );
822  double tmp2 = ( nall * sum2 - ntot2 * sumAll );
823  adsum += t * (tmp1*tmp1/ntot1 + tmp2*tmp2/ntot2) / ( sumAll * (nall - sumAll) ) ;
824 
825  //std::cout << "comp sum " << adsum << " " << t << " " << sumAll << " s1 " << sum1 << " s2 " << sum2 << " tmp1 " << tmp1 << " tmp2 " << tmp2 << std::endl;
826  }
827  }
828  double A2 = adsum / nall;
829 
830  // compute the normalized test statistic
831  std::vector<unsigned int> ns(2);
832  ns[0] = ntot1;
833  ns[1] = ntot2;
834  //std::cout << " ad2 = " << A2 << " nall " << nall;
835 
836  Double_t sigmaN = GetSigmaN(ns,nall);
837  A2 -= 1;
838  A2 /= sigmaN; // standartized test statistic
839 
840  //std::cout << " sigmaN " << sigmaN << " new A2 " << A2;
841 
842  pvalue = PValueADKSamples(2,A2);
843  //std::cout << " pvalue = " << pvalue << std::endl;
844  testStat = A2;
845  return;
846  }
847 
848 
850  Double_t pvalue, testStat;
851  AndersonDarling2SamplesTest(pvalue, testStat);
852  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
853  }
854 
855 /*
856  Taken from (3)
857 */ void GoFTest::AndersonDarlingTest(Double_t& pvalue, Double_t& testStat) const {
858  pvalue = -1;
859  testStat = -1;
860  if (!fTestSampleFromH0) {
861  MATH_ERROR_MSG("AndersonDarlingTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
862  return;
863  }
864  if (fDist == kUndefined) {
865  MATH_ERROR_MSG("AndersonDarlingTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
866  return;
867  }
868  Double_t A2 = 0.0;
869  Int_t n = fSamples[0].size();
870  for (Int_t i = 0; i < n ; ++i) {
871  Double_t x1 = fSamples[0][i];
872  Double_t w1 = (*fCDF)(x1);
873  Double_t result = (2 * (i + 1) - 1) * TMath::Log(w1) + (2 * (n - (i + 1)) + 1) * TMath::Log(1 - w1);
874  A2 += result;
875  }
876  (A2 /= -n) -= n;
877  if (A2 != A2) {
878  MATH_ERROR_MSG("AndersonDarlingTest", "Cannot compute p-value: data below or above the distribution's thresholds. Check sample consistency.");
879  return;
880  }
881  pvalue = PValueAD1Sample(A2);
882  testStat = A2;
883  }
884 
886  Double_t pvalue, testStat;
887  AndersonDarlingTest(pvalue, testStat);
888  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
889  }
890 
892  pvalue = -1;
893  testStat = -1;
894  if (fTestSampleFromH0) {
895  MATH_ERROR_MSG("KolmogorovSmirnov2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
896  return;
897  }
898  const UInt_t na = fSamples[0].size();
899  const UInt_t nb = fSamples[1].size();
900  std::vector<Double_t> a(na);
901  std::vector<Double_t> b(nb);
902  std::copy(fSamples[0].begin(), fSamples[0].end(), a.begin());
903  std::copy(fSamples[1].begin(), fSamples[1].end(), b.begin());
904  pvalue = TMath::KolmogorovTest(na, a.data(), nb, b.data(), 0);
905  testStat = TMath::KolmogorovTest(na, a.data(), nb, b.data(), "M");
906  }
907 
909  Double_t pvalue, testStat;
910  KolmogorovSmirnov2SamplesTest(pvalue, testStat);
911  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
912  }
913 
914 /*
915  Algorithm taken from (3) in page 737
916 */ void GoFTest::KolmogorovSmirnovTest(Double_t& pvalue, Double_t& testStat) const {
917  pvalue = -1;
918  testStat = -1;
919  if (!fTestSampleFromH0) {
920  MATH_ERROR_MSG("KolmogorovSmirnovTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
921  return;
922  }
923  if (fDist == kUndefined) {
924  MATH_ERROR_MSG("KolmogorovSmirnovTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
925  return;
926  }
927  Double_t Fo = 0.0, Dn = 0.0;
928  UInt_t n = fSamples[0].size();
929  for (UInt_t i = 0; i < n; ++i) {
930  Double_t Fn = (i + 1.0) / n;
931  Double_t F = (*fCDF)(fSamples[0][i]);
932  Double_t result = std::max(TMath::Abs(Fn - F), TMath::Abs(Fo - Fn));
933  if (result > Dn) Dn = result;
934  Fo = Fn;
935  }
936  pvalue = TMath::KolmogorovProb(Dn * (TMath::Sqrt(n) + 0.12 + 0.11 / TMath::Sqrt(n)));
937  testStat = Dn;
938  }
939 
941  Double_t pvalue, testStat;
942  KolmogorovSmirnovTest(pvalue, testStat);
943  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
944  }
945 
946 
947 
948 
949 
950 } // ROOT namespace
951 } // Math namespace
952 
void SetDistribution(EDistribution dist)
Definition: GoFTest.cxx:120
Double_t PValueAD1Sample(Double_t A2) const
Definition: GoFTest.cxx:480
double dist(Rotation3D const &r1, Rotation3D const &r2)
Definition: 3DDistances.cxx:48
Bool_t fTestSampleFromH0
Definition: GoFTest.h:197
void Print(Option_t *option="") const
Print the real and cpu time passed between the start and stop events.
Definition: TStopwatch.cxx:217
int getCount(double z, const double *dat, int n)
Definition: GoFTest.cxx:517
Double_t fMean
Definition: GoFTest.h:190
float xmin
Definition: THbookFile.cxx:93
Interface (abstract class) for generic functions objects of one-dimension Provides a method to evalua...
Definition: IFunction.h:133
void Start(Bool_t reset=kTRUE)
Start the stopwatch.
Definition: TStopwatch.cxx:56
Namespace for new ROOT classes and functions.
Definition: ROOT.py:1
Double_t Log(Double_t x)
Definition: TMath.h:526
Double_t KolmogorovProb(Double_t z)
Calculates the Kolmogorov distribution function, Begin_Html.
Definition: TMath.cxx:635
std::vector< Double_t > fCombinedSamples
Definition: GoFTest.h:193
#define assert(cond)
Definition: unittest.h:542
Double_t distance(const TPoint2 &p1, const TPoint2 &p2)
Definition: CsgOps.cxx:467
TH1 * h
Definition: legend2.C:5
#define N
#define H(x, y, z)
double inner_product(const LAVector &, const LAVector &)
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
TArc * a
Definition: textangle.C:12
const Bool_t kFALSE
Definition: Rtypes.h:92
int nbins[3]
STL namespace.
void KolmogorovSmirnovTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:916
#define MATH_WARN_MSG(loc, str)
Definition: Error.h:47
Short_t Abs(Short_t d)
Definition: TMathBase.h:110
LongDouble_t Power(LongDouble_t x, LongDouble_t y)
Definition: TMath.h:501
void operator()(ETestType test, Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:207
unsigned int Size() const
return number of fit points
Definition: BinData.h:447
int sampleSize
Definition: unuranDistr.cxx:34
Double_t x[n]
Definition: legend1.C:17
double cdf(double *x, double *p)
Definition: unuranDistr.cxx:44
double normal_cdf(double x, double sigma=1, double x0=0)
Cumulative distribution function of the normal (Gaussian) distribution (lower tail).
static Double_t PValueADKSamples(UInt_t nsamples, Double_t A2)
Definition: GoFTest.cxx:350
EDistribution fDist
Definition: GoFTest.h:188
double pow(double, double)
void Sort(Index n, const Element *a, Index *index, Bool_t down=kTRUE)
Definition: TMath.h:1002
static Double_t GetSigmaN(const std::vector< UInt_t > &ns, UInt_t N)
Definition: GoFTest.cxx:308
#define MATH_ERROR_MSG(loc, str)
Definition: Error.h:50
void AndersonDarling2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:643
int getSum(const int *x, int n)
Definition: GoFTest.cxx:531
#define F(x, y, z)
#define M_PI
Definition: Rotated.cxx:105
IBaseFunctionOneDim IGenFunction
Definition: IFunctionfwd.h:24
Double_t GaussianCDF(Double_t x) const
Definition: GoFTest.cxx:292
unsigned int UInt_t
Definition: RtypesCore.h:42
Double_t KolmogorovTest(Int_t na, const Double_t *a, Int_t nb, const Double_t *b, Option_t *option)
Statistical test whether two one-dimensional sets of points are compatible with coming from the same ...
Definition: TMath.cxx:784
TLine * l
Definition: textangle.C:4
virtual ~GoFTest()
Definition: GoFTest.cxx:177
Class describing the binned data sets : vectors of x coordinates, y values and optionally error on y ...
Definition: BinData.h:61
Double_t fSigma
Definition: GoFTest.h:191
float xmax
Definition: THbookFile.cxx:93
const Double_t infinity
Definition: CsgOps.cxx:85
std::vector< std::vector< Double_t > > fSamples
Definition: GoFTest.h:195
static const double x1[5]
double f(double x)
double Double_t
Definition: RtypesCore.h:55
void Instantiate(const Double_t *sample, UInt_t sampleSize)
Definition: GoFTest.cxx:274
double Value(unsigned int ipoint) const
return the value for the given fit point
Definition: BinData.h:236
static Vc_ALWAYS_INLINE int_v max(const int_v &x, const int_v &y)
Definition: vector.h:440
Namespace for new Math classes and functions.
char Char_t
Definition: RtypesCore.h:29
void SetSamples(std::vector< const Double_t * > samples, const std::vector< UInt_t > samplesSizes)
Definition: GoFTest.cxx:179
void Reset()
Definition: TStopwatch.h:54
double result[121]
const double * GetPoint(unsigned int ipoint, double &value) const
retrieve at the same time a pointer to the coordinate data and the fit value More efficient than call...
Definition: BinData.h:304
Double_t Sqrt(Double_t x)
Definition: TMath.h:464
double exp(double)
void SetDistributionFunction(const IGenFunction &cdf, Bool_t isPDF, Double_t xmin, Double_t xmax)
Definition: GoFTest.cxx:262
const Bool_t kTRUE
Definition: Rtypes.h:91
float value
Definition: math.cpp:443
const Int_t n
Definition: legend1.C:16
void AndersonDarlingTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:857
Template class to wrap any member function of a class taking a double and returning a double in a 1D ...
double log(double)
void KolmogorovSmirnov2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:891
void adkTestStat(double *adk, const std::vector< std::vector< double > > &samples, const std::vector< double > &zstar)
Definition: GoFTest.cxx:543
unsigned int NDim() const
return coordinate data dimension
Definition: BinData.h:452
double exponential_cdf(double x, double lambda, double x0=0)
Cumulative distribution function of the exponential distribution (lower tail).
Stopwatch class.
Definition: TStopwatch.h:30
Double_t ExponentialCDF(Double_t x) const
Definition: GoFTest.cxx:296