Logo ROOT   6.13/01
Reference Guide
GoFTest.cxx
Go to the documentation of this file.
1 // @(#)root/mathcore:$Id$
2 // Authors: Bartolomeu Rabacal 05/2010
3 /**********************************************************************
4  * *
5  * Copyright (c) 2006 , LCG ROOT MathLib Team *
6  * *
7  * *
8  **********************************************************************/
9 // implementation file for GoFTest
10 
11 
12 #include <algorithm>
13 #include <functional>
14 #include <iostream>
15 #include <map>
16 #include <numeric>
17 #include <string.h>
18 #include <cassert>
19 
20 #include "Math/Error.h"
21 #include "Math/Math.h"
22 #include "Math/IFunction.h"
23 #include "Math/IFunctionfwd.h"
24 #include "Math/Integrator.h"
25 #include "Math/ProbFuncMathCore.h"
26 #include "Math/WrappedFunction.h"
27 
28 #include "Math/GoFTest.h"
29 
30 #include "Fit/BinData.h"
31 
32 #include "TStopwatch.h"
33 
34 /* Note: The references mentioned here are stated in GoFTest.h */
35 
36 namespace ROOT {
37 namespace Math {
38 
39  struct CDFWrapper : public IGenFunction {
40  // wrapper around a cdf funciton to re-scale for the range
41  Double_t fXmin; // lower range for x
42  Double_t fXmax; // lower range for x
43  Double_t fNorm; // normalization
44  const IGenFunction* fCDF; // cdf pointer (owned by the class)
45 
46 
47  virtual ~CDFWrapper() { if (fCDF) delete fCDF; }
48 
49  CDFWrapper(const IGenFunction& cdf, Double_t xmin=0, Double_t xmax=-1) :
50  fCDF(cdf.Clone())
51  {
52  if (xmin >= xmax) {
53  fNorm = 1;
54  fXmin = -std::numeric_limits<double>::infinity();
55  fXmax = std::numeric_limits<double>::infinity();
56  }
57  else {
58  fNorm = cdf(xmax) - cdf(xmin);
59  fXmin = xmin;
60  fXmax = xmax;
61  }
62  }
63 
64  Double_t DoEval(Double_t x) const {
65  if (x <= fXmin) return 0;
66  if (x >= fXmax) return 1.0;
67  return (*fCDF)(x)/fNorm;
68  }
69 
70  IGenFunction* Clone() const {
71  return new CDFWrapper(*fCDF,fXmin,fXmax);
72  }
73  };
74 
75 
76  class PDFIntegral : public IGenFunction {
77  Double_t fXmin; // lower range for x
78  Double_t fXmax; // lower range for x
79  Double_t fNorm; // normalization
80  mutable IntegratorOneDim fIntegral;
81  const IGenFunction* fPDF; // pdf pointer (owned by the class)
82  public:
83 
84  virtual ~PDFIntegral() { if (fPDF) delete fPDF; }
85 
86  PDFIntegral(const IGenFunction& pdf, Double_t xmin = 0, Double_t xmax = -1) :
87  fXmin(xmin),
88  fXmax(xmax),
89  fNorm(1),
90  fPDF(pdf.Clone())
91  {
92  // compute normalization
93  fIntegral.SetFunction(*fPDF); // N.B. must be fPDF (the cloned copy) and not pdf which can disappear
94  if (fXmin >= fXmax) {
95  fXmin = -std::numeric_limits<double>::infinity();
96  fXmax = std::numeric_limits<double>::infinity();
97  }
98  if (fXmin == -std::numeric_limits<double>::infinity() && fXmax == std::numeric_limits<double>::infinity() ) {
99  fNorm = fIntegral.Integral();
100  }
101  else if (fXmin == -std::numeric_limits<double>::infinity() )
102  fNorm = fIntegral.IntegralLow(fXmax);
103  else if (fXmax == std::numeric_limits<double>::infinity() )
104  fNorm = fIntegral.IntegralUp(fXmin);
105  else
106  fNorm = fIntegral.Integral(fXmin, fXmax);
107  }
108 
109  Double_t DoEval(Double_t x) const {
110  if (x <= fXmin) return 0;
111  if (x >= fXmax) return 1.0;
112  if (fXmin == -std::numeric_limits<double>::infinity() )
113  return fIntegral.IntegralLow(x)/fNorm;
114  else
115  return fIntegral.Integral(fXmin,x)/fNorm;
116  }
117 
118  IGenFunction* Clone() const {
119  return new PDFIntegral(*fPDF, fXmin, fXmax);
120  }
121  };
122 
124  if (!(kGaussian <= dist && dist <= kExponential)) {
125  MATH_ERROR_MSG("SetDistribution", "Cannot set distribution type! Distribution type option must be ennabled.");
126  return;
127  }
128  fDist = dist;
129  SetCDF();
130  }
131 
132  GoFTest::GoFTest( UInt_t sample1Size, const Double_t* sample1, UInt_t sample2Size, const Double_t* sample2 )
133  : fDist(kUndefined),
134  fSamples(std::vector<std::vector<Double_t> >(2)),
135  fTestSampleFromH0(kFALSE) {
136  Bool_t badSampleArg = sample1 == 0 || sample1Size == 0;
137  if (badSampleArg) {
138  std::string msg = "'sample1";
139  msg += !sample1Size ? "Size' cannot be zero" : "' cannot be zero-length";
140  MATH_ERROR_MSG("GoFTest", msg.c_str());
141  assert(!badSampleArg);
142  }
143  badSampleArg = sample2 == 0 || sample2Size == 0;
144  if (badSampleArg) {
145  std::string msg = "'sample2";
146  msg += !sample2Size ? "Size' cannot be zero" : "' cannot be zero-length";
147  MATH_ERROR_MSG("GoFTest", msg.c_str());
148  assert(!badSampleArg);
149  }
150  std::vector<const Double_t*> samples(2);
151  std::vector<UInt_t> samplesSizes(2);
152  samples[0] = sample1;
153  samples[1] = sample2;
154  samplesSizes[0] = sample1Size;
155  samplesSizes[1] = sample2Size;
156  SetSamples(samples, samplesSizes);
157  SetParameters();
158  }
159 
160  GoFTest::GoFTest(UInt_t sampleSize, const Double_t* sample, EDistribution dist)
161  : fDist(dist),
162  fSamples(std::vector<std::vector<Double_t> >(1)),
163  fTestSampleFromH0(kTRUE) {
164  Bool_t badSampleArg = sample == 0 || sampleSize == 0;
165  if (badSampleArg) {
166  std::string msg = "'sample";
167  msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
168  MATH_ERROR_MSG("GoFTest", msg.c_str());
169  assert(!badSampleArg);
170  }
171  std::vector<const Double_t*> samples(1, sample);
172  std::vector<UInt_t> samplesSizes(1, sampleSize);
173  SetSamples(samples, samplesSizes);
174  SetParameters();
175  SetCDF();
176  }
177 
179 
180  void GoFTest::SetSamples(std::vector<const Double_t*> samples, const std::vector<UInt_t> samplesSizes) {
181  fCombinedSamples.assign(std::accumulate(samplesSizes.begin(), samplesSizes.end(), 0), 0.0);
182  UInt_t combinedSamplesSize = 0;
183  for (UInt_t i = 0; i < samples.size(); ++i) {
184  fSamples[i].assign(samples[i], samples[i] + samplesSizes[i]);
185  std::sort(fSamples[i].begin(), fSamples[i].end());
186  for (UInt_t j = 0; j < samplesSizes[i]; ++j) {
187  fCombinedSamples[combinedSamplesSize + j] = samples[i][j];
188  }
189  combinedSamplesSize += samplesSizes[i];
190  }
191  std::sort(fCombinedSamples.begin(), fCombinedSamples.end());
192 
193  Bool_t degenerateSamples = *(fCombinedSamples.begin()) == *(fCombinedSamples.end() - 1);
194  if (degenerateSamples) {
195  std::string msg = "Degenerate sample";
196  msg += samplesSizes.size() > 1 ? "s!" : "!";
197  msg += " Sampling values all identical.";
198  MATH_ERROR_MSG("SetSamples", msg.c_str());
199  assert(!degenerateSamples);
200  }
201  }
202 
204  fMean = std::accumulate(fSamples[0].begin(), fSamples[0].end(), 0.0) / fSamples[0].size();
205  fSigma = TMath::Sqrt(1. / (fSamples[0].size() - 1) * (std::inner_product(fSamples[0].begin(), fSamples[0].end(), fSamples[0].begin(), 0.0) - fSamples[0].size() * TMath::Power(fMean, 2)));
206  }
207 
208  void GoFTest::operator()(ETestType test, Double_t& pvalue, Double_t& testStat) const {
209  switch (test) {
210  default:
211  case kAD:
212  AndersonDarlingTest(pvalue, testStat);
213  break;
214  case kAD2s:
215  AndersonDarling2SamplesTest(pvalue, testStat);
216  break;
217  case kKS:
218  KolmogorovSmirnovTest(pvalue, testStat);
219  break;
220  case kKS2s:
221  KolmogorovSmirnov2SamplesTest(pvalue, testStat);
222  }
223  }
224 
225  Double_t GoFTest::operator()(ETestType test, const Char_t* option) const {
226  Double_t result = 0.0;
227  switch (test) {
228  default:
229  case kAD:
230  result = AndersonDarlingTest(option);
231  break;
232  case kAD2s:
233  result = AndersonDarling2SamplesTest(option);
234  break;
235  case kKS:
236  result = KolmogorovSmirnovTest(option);
237  break;
238  case kKS2s:
239  result = KolmogorovSmirnov2SamplesTest(option);
240  }
241  return result;
242  }
243 
244  void GoFTest::SetCDF() { // Setting parameter-free distributions
245  IGenFunction* cdf = 0;
246  switch (fDist) {
247  case kLogNormal:
248  LogSample();
249  /* fall through */
250  case kGaussian :
252  break;
253  case kExponential:
255  break;
256  case kUserDefined:
257  case kUndefined:
258  default:
259  break;
260  }
261  fCDF.reset(cdf);
262  }
263 
264  void GoFTest::SetDistributionFunction(const IGenFunction& f, Bool_t isPDF, Double_t xmin, Double_t xmax) {
265  if (fDist > kUserDefined) {
266  MATH_WARN_MSG("SetDistributionFunction","Distribution type is changed to user defined");
267  }
269  // function will be cloned inside the wrapper PDFIntegral of CDFWrapper classes
270  if (isPDF)
271  fCDF.reset(new PDFIntegral(f, xmin, xmax) );
272  else
273  fCDF.reset(new CDFWrapper(f, xmin, xmax) );
274  }
275 
276  void GoFTest::Instantiate(const Double_t* sample, UInt_t sampleSize) {
277  // initialization function for the template constructors
278  Bool_t badSampleArg = sample == 0 || sampleSize == 0;
279  if (badSampleArg) {
280  std::string msg = "'sample";
281  msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
282  MATH_ERROR_MSG("GoFTest", msg.c_str());
283  assert(!badSampleArg);
284  }
285  fCDF.reset((IGenFunction*)0);
287  fMean = 0;
288  fSigma = 0;
289  fSamples = std::vector<std::vector<Double_t> >(1);
290  fTestSampleFromH0 = kTRUE;
291  SetSamples(std::vector<const Double_t*>(1, sample), std::vector<UInt_t>(1, sampleSize));
292  }
293 
294  Double_t GoFTest::GaussianCDF(Double_t x) const {
295  return ROOT::Math::normal_cdf(x, fSigma, fMean);
296  }
297 
298  Double_t GoFTest::ExponentialCDF(Double_t x) const {
299  return ROOT::Math::exponential_cdf(x, 1.0 / fMean);
300  }
301 
303  transform(fSamples[0].begin(), fSamples[0].end(), fSamples[0].begin(),
304  std::function<Double_t(Double_t)>(TMath::Log));
305  SetParameters();
306  }
307 
308 /*
309  Taken from (1)
310 */
311  Double_t GoFTest::GetSigmaN(const std::vector<UInt_t> & ns, UInt_t N) {
312  // compute moments of AD distribution (from Scholz-Stephen paper, paragraph 3)
313 
314  Double_t sigmaN = 0.0, h = 0.0, H = 0.0, g = 0.0, a, b, c, d, k = ns.size();
315 
316  for (UInt_t i = 0; i < ns.size(); ++i) {
317  H += 1.0 / double( ns[i] );
318  }
319 
320  // use approximate formulas for large N
321  // cache Sum( 1 / i)
322  if (N < 2000) {
323  std::vector<double> invI(N);
324  for (UInt_t i = 1; i <= N - 1; ++i) {
325  invI[i] = 1.0 / i;
326  h += invI[i];
327  }
328  for (UInt_t i = 1; i <= N - 2; ++i) {
329  double tmp = invI[N-i];
330  for (UInt_t j = i + 1; j <= N - 1; ++j) {
331  g += tmp * invI[j];
332  }
333  }
334  }
335  else {
336  // for N larger than 2000 error difference in g is ~ 5 10^-3 while in h is at the level of 10^-5
337  const double emc = 0.5772156649015328606065120900824024; // Euler-Mascheroni constant
338  h = std::log(double(N-1) ) + emc;
339  g = (M_PI)*(M_PI)/6.0;
340  }
341  double k2 = std::pow(k,2);
342  a = (4 * g - 6) * k + (10 - 6 * g) * H - 4 * g + 6;
343  b = (2 * g - 4) * k2 + 8 * h * k + (2 * g - 14 * h - 4) * H - 8 * h + 4 * g - 6;
344  c = (6 * h + 2 * g - 2) * k2 + (4 * h - 4 *g + 6) * k + (2 * h - 6) * H + 4 * h;
345  d = (2 * h + 6) * k2 - 4 * h * k;
346  sigmaN += a * std::pow(double(N),3) + b * std::pow(double(N),2) + c * N + d;
347  sigmaN /= ( double(N - 1) * double(N - 2) * double(N - 3) );
348  sigmaN = TMath::Sqrt(sigmaN);
349  return sigmaN;
350  }
351 
352 
353  Double_t GoFTest::PValueADKSamples(UInt_t nsamples, Double_t tx) {
354 
355  /*
356  Computation of p-values according to
357  "K-Sample Anderson-Darling Tests" by F.W. Scholz
358  and M.A. Stephens (1987), Journal of the American Statistical Association,
359  Vol 82, No. 399, pp 918-924.
360  Code from kSamples package from R (author F. Scholtz)
361 
362  This function uses the upper T_m quantiles as obtained via simulation of
363  the Anderson-Darling test statistics (Nsim = 2*10^6) with sample sizes n=500
364  for each sample, and after standardization, in order to emulate the Table 1
365  values given in the above reference. However, here we estimate p-quantiles
366  for p = .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,
367  .1,.2,.3,.4,.5,.6,.7,.8,.9,.925,.95,.975,.99,.9925,.995,.9975,.999,
368  .99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999
369  First the appropriate p-quantiles are determined from those simulated
370  for ms = 1,2,3,4,6,8,10, Inf, interpolating to the given value of m.
371  Since we use only m=2 we avoid this interpolation.
372 
373  Next linear inetrpolation to find the observed p value given the observed test statistic value.
374  We use interpolation in the test statistic -> log((1-p)/p) domain
375  and we extrapolatelinearly) beyond p = .00001 and .99999.
376  */
377 
378  // sample values
379  //double ms[] = { 1, 2, 3, 4, 6, 8, 10, TMath::Infinity() };
380  //int ns = ms.size();
381  const int ns = 8;
382  double ts[ ] = { -1.1954, -1.5806, -1.8172,
383  -2.0032, -2.2526, -2.4204, -2.5283, -4.2649, -1.1786, -1.5394,
384  -1.7728, -1.9426, -2.1685, -2.3288, -2.4374, -3.8906, -1.166,
385  -1.5193, -1.7462, -1.9067, -2.126, -2.2818, -2.3926, -3.719,
386  -1.1407, -1.4659, -1.671, -1.8105, -2.0048, -2.1356, -2.2348,
387  -3.2905, -1.1253, -1.4371, -1.6314, -1.7619, -1.9396, -2.0637,
388  -2.1521, -3.0902, -1.0777, -1.3503, -1.5102, -1.6177, -1.761,
389  -1.8537, -1.9178, -2.5758, -1.0489, -1.2984, -1.4415, -1.5355,
390  -1.6625, -1.738, -1.7936, -2.3263, -0.9978, -1.2098, -1.3251,
391  -1.4007, -1.4977, -1.5555, -1.5941, -1.96, -0.9417, -1.1187,
392  -1.209, -1.2671, -1.3382, -1.379, -1.405, -1.6449, -0.8981, -1.0491,
393  -1.1235, -1.1692, -1.2249, -1.2552, -1.2755, -1.4395, -0.8598,
394  -0.9904, -1.0513, -1.0879, -1.1317, -1.155, -1.1694, -1.2816,
395  -0.7258, -0.7938, -0.8188, -0.8312, -0.8435, -0.8471, -0.8496,
396  -0.8416, -0.5966, -0.617, -0.6177, -0.6139, -0.6073, -0.5987,
397  -0.5941, -0.5244, -0.4572, -0.4383, -0.419, -0.4033, -0.3834,
398  -0.3676, -0.3587, -0.2533, -0.2966, -0.2428, -0.2078, -0.1844,
399  -0.1548, -0.1346, -0.1224, 0, -0.1009, -0.0169, 0.0304, 0.0596,
400  0.0933, 0.1156, 0.1294, 0.2533, 0.1571, 0.2635, 0.3169, 0.348,
401  0.3823, 0.4038, 0.4166, 0.5244, 0.5357, 0.6496, 0.6992, 0.7246,
402  0.7528, 0.7683, 0.7771, 0.8416, 1.2255, 1.2989, 1.3202, 1.3254,
403  1.3305, 1.3286, 1.3257, 1.2816, 1.5262, 1.5677, 1.5709, 1.5663,
404  1.5561, 1.5449, 1.5356, 1.4395, 1.9633, 1.943, 1.919, 1.8975,
405  1.8641, 1.8389, 1.8212, 1.6449, 2.7314, 2.5899, 2.5, 2.4451,
406  2.3664, 2.3155, 2.2823, 1.96, 3.7825, 3.4425, 3.2582, 3.1423,
407  3.0036, 2.9101, 2.8579, 2.3263, 4.1241, 3.716, 3.4984, 3.3651,
408  3.2003, 3.0928, 3.0311, 2.4324, 4.6044, 4.0847, 3.8348, 3.6714,
409  3.4721, 3.3453, 3.2777, 2.5758, 5.409, 4.7223, 4.4022, 4.1791,
410  3.9357, 3.7809, 3.6963, 2.807, 6.4954, 5.5823, 5.1456, 4.8657,
411  4.5506, 4.3275, 4.2228, 3.0902, 6.8279, 5.8282, 5.3658, 5.0749,
412  4.7318, 4.4923, 4.3642, 3.1747, 7.2755, 6.197, 5.6715, 5.3642,
413  4.9991, 4.7135, 4.5945, 3.2905, 8.1885, 6.8537, 6.2077, 5.8499,
414  5.4246, 5.1137, 4.9555, 3.4808, 9.3061, 7.6592, 6.85, 6.4806,
415  5.9919, 5.6122, 5.5136, 3.719, 9.6132, 7.9234, 7.1025, 6.6731,
416  6.1549, 5.8217, 5.7345, 3.7911, 10.0989, 8.2395, 7.4326, 6.9567,
417  6.3908, 6.011, 5.9566, 3.8906, 10.8825, 8.8994, 7.8934, 7.4501,
418  6.9009, 6.4538, 6.2705, 4.0556, 11.8537, 9.5482, 8.5568, 8.0283,
419  7.4418, 6.9524, 6.6195, 4.2649 };
420 
421 
422 
423 
424 
425  // p values bins
426  double p[] = { .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,.1,.2,.3,.4,.5,.6,.7,.8,.9,
427  .925,.95,.975,.99,.9925,.995,.9975,.999,.99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999 };
428 
429  //int nbins = p.size();
430  const int nbins = 35;
431  //assert ( nbins*ns == ts.size() );
432 
433  // get ts values for nsamples = 2
434  // corresponding value is for m=nsamples-1
435  int offset = 0; // for m = 1 (i.e. for nsamples = 2)
436  if (nsamples != 2) {
437  MATH_ERROR_MSG("InterpolatePValues", "Interpolation not implemented for nsamples not equal to 2");
438  return 0;
439  }
440  std::vector<double> ts2(nbins); // ts values for nsamples = 2
441  std::vector<double> lp(nbins); // log ( p / (1-p) )
442  for (int i = 0; i < nbins; ++i)
443  {
444  ts2[i] = ts[offset+ i * ns];
445  p[i] = 1.-p[i];
446  lp[i] = std::log( p[i]/(1.-p[i] ) );
447  }
448  // do linear interpolation to find right lp value for given observed test staistic value
449  //auto it = std::lower_bound(ts2.begin(), ts2.end(), tx );
450  int i1 = std::distance(ts2.begin(), std::lower_bound(ts2.begin(), ts2.end(), tx ) ) - 1;
451  int i2 = i1+1;
452  // if tx is before min of tabluated data
453  if (i1 < 0) {
454  i1 = 0;
455  i2 = 1;
456  }
457  // if tx is after max of tabulated data
458  if (i2 >= int(ts2.size()) ) {
459  i1 = ts2.size()-2;
460  i2 = ts2.size()-1;
461  }
462 
463  //std::cout << i1 << " , " << i2 << std::endl;
464  assert(i1 < (int) lp.size() && i2 < (int) lp.size() );
465  double lp1 = lp[i1];
466  double lp2 = lp[i2];
467  double tx1 = ts2[i1];
468  double tx2 = ts2[i2];
469 
470  //std::cout << " tx1,2 " << tx1 << " " << tx2 << std::endl;
471  /// find interpolated (or extrapolated value)(
472  double lp0 = (lp1-lp2) * (tx - tx2)/ ( tx1-tx2) + lp2;
473 
474 
475  double p0 = exp(lp0)/(1. + exp(lp0) );
476  return p0;
477 
478  }
479 
480 
481 /*
482  Taken from (2)
483 */ Double_t GoFTest::PValueAD1Sample(Double_t A2) const {
484  Double_t pvalue = 0.0;
485  if (A2 <= 0.0) {
486  return pvalue;
487  } else if (A2 < 2.) {
488  pvalue = std::pow(A2, -0.5) * std::exp(-1.2337141 / A2) * (2.00012 + (0.247105 - (0.0649821 - (0.0347962 - (0.011672 - 0.00168691 * A2) * A2) * A2) * A2) * A2);
489  } else {
490  pvalue = std::exp(-1. * std::exp(1.0776 - (2.30695 - (0.43424 - (.082433 - (0.008056 - 0.0003146 * A2) * A2) * A2) * A2) * A2));
491  }
492  return 1. - pvalue;
493  }
494 
495 
496 // code from kSamples (R) F. Scholz
497 
498 /* computes the k-sample Anderson-Darling test statistics in both original
499  and alternative versions for the nonparametric (rank) test described in
500  Scholz F.W. and Stephens M.A. (1987), K-sample Anderson-Darling Tests,
501  Journal of the American Statistical Association, Vol 82, No. 399,
502  pp. 918-924
503 
504  Arguments:
505  adk: double array with length 2, stores AkN2 and AakN2
506  k: integer, number of samples being compared
507  x: double array storing the concatenated samples in the same order as ns
508  ns: integer array storing the k sample sizes, corresponding to x
509  zstar: double array storing the l distinct ordered observations in the
510  pooled sample
511  l: integer, length of zstar
512 
513  Outputs:
514  when the computation ends, AkN2 and AakN2 are stored in the given memory
515  pointed by adk
516 */
517 
518 /* counts and returns the number of occurrence of a given number
519  in a double array */
520 int getCount(double z, const double *dat, int n) {
521  int i;
522  int count = 0;
523 
524  for (i = 0; i < n; i++) {
525  if (dat[i] == z) {
526  count++;
527  }
528  }
529 
530  return(count);
531 }
532 
533 /* computes and returns the sum of elements in a given integer array */
534 int getSum(const int *x, int n) {
535  int i;
536  int sum = 0;
537 
538  for (i = 0; i < n; i++) {
539  sum += x[i];
540  }
541 
542  return(sum);
543 }
544 
545 
546 void adkTestStat(double *adk, const std::vector<std::vector<double> > & samples, const std::vector<double> & zstar) {
547 
548  int i;
549  int j;
550 
551  int nsum; /* total sample size = n_1 + ... + n_k */
552  int k = samples.size();
553  int l = zstar.size();
554 
555  /* fij records the number of observations in the ith sample coinciding
556  with zstar[j], where i = 1, ..., k, and j = 1, ..., l */
557  std::vector<int> fij (k*l);
558  /* lvec is an integer vector with length l,
559  whose jth entry = \sum_{i=1}^{k} f_{ij}, i.e., the multiplicity
560  of zstar[j] */
561  std::vector<int> lvec(l);
562 
563  /* for computation */
564  double mij;
565  double maij;
566  double innerSum;
567  double aInnerSum;
568  double bj;
569  double baj;
570  double tmp;
571 
572  /* samples is a two-dimensional double array with length k;
573  it stores an array of k pointers to double arrays which are
574  the k samples beeing compared */
575 // double **samples;
576 
577  /* dynamically allocate memory */
578  //std::vector< std::vector<double> > samples(k);
579  std::vector<int> ns(k);
580  nsum = 0;
581  for (i = 0; i < k; i++) {
582  ns[i] = samples[i].size();
583  nsum += ns[i];
584  }
585 
586  /* fij: k*l integer matrix, where l is the length of zstar and
587  k is the number of samples being compared
588  lvec: integer vector of length l, records the multiplicity of
589  each element of zstar */
590  for (j = 0; j < l; j++) {
591  lvec[j] = 0;
592  for (i = 0; i < k; i++) {
593  fij[i + j*k] = getCount(zstar[j], &samples[i][0], ns[i]);
594  lvec[j] += fij[i + j*k];
595  }
596  }
597 
598  // loop on samples to compute the adk's
599  // Formula (6) and (7) of the paper
600  adk[0] = adk[1] = 0;
601  for (i = 0; i < k; i++) {
602  mij = 0;
603  maij = 0;
604  innerSum = 0;
605  aInnerSum = 0;
606 
607  for (j = 0; j < l; j++) {
608  mij += fij[i + j*k];
609  maij = mij - (double) fij[i + j*k] / 2.0;
610  bj = getSum(&lvec[0], j + 1);
611  baj = bj - (double) lvec[j] / 2.0;
612 
613  if (j < l - 1) {
614  tmp = (double) nsum * mij - (double) ns[i] * bj;
615  innerSum = innerSum + (double) lvec[j] * tmp * tmp /
616  (bj * ((double) nsum - bj));
617  }
618 
619  tmp = (double) nsum * maij - (double) ns[i] * baj;
620  aInnerSum = aInnerSum + (double) lvec[j] * tmp * tmp /
621  (baj * (nsum - baj) - nsum * (double) lvec[j] / 4.0);
622  }
623 
624  adk[0] = adk[0] + innerSum / ns[i]; /* AkN2*/
625  adk[1] = adk[1] + aInnerSum / ns[i]; /* AakN2 */
626  }
627 
628  /* k-sample Anderson-Darling test statistics in both original and
629  alternative versions, AkN2 and AakN2, are stored in the given
630  double array adk */
631  adk[0] = adk[0] / (double) nsum; /* AkN2*/
632  adk[1] = (nsum - 1) * adk[1] / ((double) nsum * (double) nsum); /* AakN2 */
633 
634  // /* free pointers */
635  // for (i = 0; i < k; i++) {
636  // free(samples[i]);
637  // }
638  // free(samples);
639 
640 }
641 
642 
643 /*
644  Taken from (1) -- Named for 2 samples but implemented for K. Restricted to K = 2 by the class's constructors
645 */
646 void GoFTest::AndersonDarling2SamplesTest(Double_t& pvalue, Double_t& testStat) const {
647  pvalue = -1;
648  testStat = -1;
649  if (fTestSampleFromH0) {
650  MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
651  return;
652  }
653  std::vector<Double_t> z(fCombinedSamples);
654  // unique removes all consecutives duplicates elements. This is exactly what we wants
655  // for example unique of v={1,2,2,3,1,2,3,3} results in {1,2,3,1,2,3} which is exactly what we wants
656  std::vector<Double_t>::iterator endUnique = std::unique(z.begin(), z.end()); //z_j's in (1)
657  z.erase(endUnique, z.end() );
658  std::vector<UInt_t> h; // h_j's in (1)
659  std::vector<Double_t> H; // H_j's in (1)
660  UInt_t N = fCombinedSamples.size();
661  Double_t A2 = 0.0; // Anderson-Darling A^2 Test Statistic
662 
663 #ifdef USE_OLDIMPL
664 
665  TStopwatch w; w.Start();
666 
667  unsigned int nSamples = fSamples.size();
668 
669  // old implementation
670  for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
671  UInt_t n = std::count(fCombinedSamples.begin(), fCombinedSamples.end(), *data);
672  h.push_back(n);
673  H.push_back(std::count_if(fCombinedSamples.begin(), fCombinedSamples.end(),
674  std::bind(std::less<Double_t>(), std::placeholders::_1, *data)) + n / 2.);
675  }
676  std::cout << "time for H";
677  w.Print();
678  w.Reset(); w.Start();
679  std::vector<std::vector<Double_t> > F(nSamples); // F_ij's in (1)
680  for (UInt_t i = 0; i < nSamples; ++i) {
681  for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
682  UInt_t n = std::count(fSamples[i].begin(), fSamples[i].end(), *data);
683  F[i].push_back(std::count_if(fSamples[i].begin(), fSamples[i].end(),
684  std::bind(std::less<Double_t>(), std::placeholders::_1, *data)) + n / 2.);
685  }
686  }
687  std::cout << "time for F";
688  w.Print();
689  for (UInt_t i = 0; i < nSamples; ++i) {
690  Double_t sum_result = 0.0;
691  UInt_t j = 0;
692  w.Reset(); w.Start();
693  for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
694  sum_result += h[j] * TMath::Power(N * F[i][j]- fSamples[i].size() * H[j], 2) / (H[j] * (N - H[j]) - N * h[j] / 4.0);
695  ++j;
696  }
697  std::cout << "time for sum_resut";
698  w.Print();
699  std::cout << "sum_result " << sum_result << std::endl;
700  A2 += 1.0 / fSamples[i].size() * sum_result;
701  }
702  A2 *= (N - 1) / (TMath::Power(N, 2)); // A2_akN in (1)
703 
704  std::cout << "A2 - old Bartolomeo code " << A2 << std::endl;
705 #endif
706  // w.Reset();
707  // w.Start();
708 
709  double adk[2] = {0,0};
710 
711  //debug
712  // std::cout << "combined samples\n";
713  // for (int i = 0; i < fCombinedSamples.size(); ++i)
714  // std::cout << fCombinedSamples[i] << " ,";
715  // std::cout << std::endl;
716  // std::cout << ns[0] << " " << ns[1] << std::endl;
717  // std::cout << "Z\n";
718  // for (int i = 0; i < z.size(); ++i)
719  // std::cout << z[i] << " ,";
720  // std::cout << std::endl;
721 
722  // use function from kSamples code
723  adkTestStat(adk, fSamples, z );
724  // w.Print();
725  // std::cout << "A2 - new kSamples code " << adk[0] << " " << adk[1] << std::endl;
726 
727  A2 = adk[0];
728 
729  // compute the normalized test statistic
730 
731  std::vector<UInt_t> ns(fSamples.size());
732  for (unsigned int k = 0; k < ns.size(); ++k) ns[k] = fSamples[k].size();
733  Double_t sigmaN = GetSigmaN(ns, N);
734  A2 -= fSamples.size() - 1;
735  A2 /= sigmaN; // standartized test statistic
736 
737  pvalue = PValueADKSamples(2,A2);
738  testStat = A2;
739  return;
740  }
741 
742 
743 /*
744  Compute Anderson Darling test for two binned data set.
745  A binned data set can be seen as many identical observation happening at the center of the bin
746  In this way it is trivial to apply the formula (6) in the paper of W. Scholz, M. Stephens, "K-Sample Anderson-Darling Tests"
747  to the case of histograms. See also http://arxiv.org/pdf/0804.0380v1.pdf paragraph 3.3.5
748  It is importat that empty bins are not present
749 */
750  void GoFTest::AndersonDarling2SamplesTest(const ROOT::Fit::BinData &data1, const ROOT::Fit::BinData & data2, Double_t& pvalue, Double_t& testStat) {
751  pvalue = -1;
752  testStat = -1;
753  //
754  // compute cumulative sum of bin counts
755  // std::vector<double> sum1(data1.Size() );
756  // std::vector<double> sum2(data2.Size() );
757  // std::vector<double> sumAll(data1.Size() + data2.Size() );
758 
759  if (data1.NDim() != 1 && data2.NDim() != 1) {
760  MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Bin Data set must be one-dimensional ");
761  return;
762  }
763  unsigned int n1 = data1.Size();
764  unsigned int n2 = data2.Size();
765  double ntot1 = 0;
766  double ntot2 = 0;
767 
768 
769  // make a combined data set and sort it
770  std::vector<double> xdata(n1+n2);
771  for (unsigned int i = 0; i < n1; ++i) {
772  double value = 0;
773  const double * x = data1.GetPoint(i, value);
774  xdata[i] = *x;
775  ntot1 += value;
776  }
777  for (unsigned int i = 0; i < n2; ++i) {
778  double value = 0;
779  const double * x = data2.GetPoint(i, value);
780  xdata[n1+i] = *x;
781  ntot2 += value;
782  }
783  double nall = ntot1+ntot2;
784  // sort the combined data
785  std::vector<unsigned int> index(n1+n2);
786  TMath::Sort(n1+n2, &xdata[0], &index[0], false );
787 
788  // now compute the sums for the tests
789  double sum1 = 0;
790  double sum2 = 0;
791  double sumAll = 0;
792  double adsum = 0;
793  unsigned int j = 0;
794 
795  while( j < n1+n2 ) {
796 // for (unsigned int j = 0; j < n1+n2; ++j) {
797  // skip equal observations
798  double x = xdata[ index[j] ];
799  unsigned int k = j;
800  // loop on the bins with the same center value
801  double t = 0;
802  do {
803  unsigned int i = index[k];
804  double value = 0;
805  if (i < n1 ) {
806  value = data1.Value(i);
807  sum1 += value;
808  }
809  else {
810  // from data2
811  i -= n1;
812  assert(i < n2);
813  value = data2.Value(i);
814  sum2 += value;
815  }
816  sumAll += value;
817  t += value;
818  //std::cout << "j " << j << " k " << k << " data " << x << " index " << index[k] << " value " << value << std::endl;
819  k++;
820  } while ( k < n1+n2 && xdata[ index[k] ] == x );
821 
822 
823  j = k;
824  // skip last point
825  if (j < n1+n2) {
826  double tmp1 = ( nall * sum1 - ntot1 * sumAll );
827  double tmp2 = ( nall * sum2 - ntot2 * sumAll );
828  adsum += t * (tmp1*tmp1/ntot1 + tmp2*tmp2/ntot2) / ( sumAll * (nall - sumAll) ) ;
829 
830  //std::cout << "comp sum " << adsum << " " << t << " " << sumAll << " s1 " << sum1 << " s2 " << sum2 << " tmp1 " << tmp1 << " tmp2 " << tmp2 << std::endl;
831  }
832  }
833  double A2 = adsum / nall;
834 
835  // compute the normalized test statistic
836  std::vector<unsigned int> ns(2);
837  ns[0] = ntot1;
838  ns[1] = ntot2;
839  //std::cout << " ad2 = " << A2 << " nall " << nall;
840 
841  Double_t sigmaN = GetSigmaN(ns,nall);
842  A2 -= 1;
843  A2 /= sigmaN; // standartized test statistic
844 
845  //std::cout << " sigmaN " << sigmaN << " new A2 " << A2;
846 
847  pvalue = PValueADKSamples(2,A2);
848  //std::cout << " pvalue = " << pvalue << std::endl;
849  testStat = A2;
850  return;
851  }
852 
853 
854  Double_t GoFTest::AndersonDarling2SamplesTest(const Char_t* option) const {
855  Double_t pvalue, testStat;
856  AndersonDarling2SamplesTest(pvalue, testStat);
857  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
858  }
859 
860 /*
861  Taken from (3)
862 */ void GoFTest::AndersonDarlingTest(Double_t& pvalue, Double_t& testStat) const {
863  pvalue = -1;
864  testStat = -1;
865  if (!fTestSampleFromH0) {
866  MATH_ERROR_MSG("AndersonDarlingTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
867  return;
868  }
869  if (fDist == kUndefined) {
870  MATH_ERROR_MSG("AndersonDarlingTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
871  return;
872  }
873  Double_t A2 = 0.0;
874  Int_t n = fSamples[0].size();
875  for (Int_t i = 0; i < n ; ++i) {
876  Double_t x1 = fSamples[0][i];
877  Double_t w1 = (*fCDF)(x1);
878  Double_t result = (2 * (i + 1) - 1) * TMath::Log(w1) + (2 * (n - (i + 1)) + 1) * TMath::Log(1 - w1);
879  A2 += result;
880  }
881  (A2 /= -n) -= n;
882  if (A2 != A2) {
883  MATH_ERROR_MSG("AndersonDarlingTest", "Cannot compute p-value: data below or above the distribution's thresholds. Check sample consistency.");
884  return;
885  }
886  pvalue = PValueAD1Sample(A2);
887  testStat = A2;
888  }
889 
890  Double_t GoFTest::AndersonDarlingTest(const Char_t* option) const {
891  Double_t pvalue, testStat;
892  AndersonDarlingTest(pvalue, testStat);
893  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
894  }
895 
896  void GoFTest::KolmogorovSmirnov2SamplesTest(Double_t& pvalue, Double_t& testStat) const {
897  pvalue = -1;
898  testStat = -1;
899  if (fTestSampleFromH0) {
900  MATH_ERROR_MSG("KolmogorovSmirnov2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
901  return;
902  }
903  const UInt_t na = fSamples[0].size();
904  const UInt_t nb = fSamples[1].size();
905  std::vector<Double_t> a(na);
906  std::vector<Double_t> b(nb);
907  std::copy(fSamples[0].begin(), fSamples[0].end(), a.begin());
908  std::copy(fSamples[1].begin(), fSamples[1].end(), b.begin());
909  pvalue = TMath::KolmogorovTest(na, a.data(), nb, b.data(), 0);
910  testStat = TMath::KolmogorovTest(na, a.data(), nb, b.data(), "M");
911  }
912 
913  Double_t GoFTest::KolmogorovSmirnov2SamplesTest(const Char_t* option) const {
914  Double_t pvalue, testStat;
915  KolmogorovSmirnov2SamplesTest(pvalue, testStat);
916  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
917  }
918 
919 /*
920  Algorithm taken from (3) in page 737
921 */ void GoFTest::KolmogorovSmirnovTest(Double_t& pvalue, Double_t& testStat) const {
922  pvalue = -1;
923  testStat = -1;
924  if (!fTestSampleFromH0) {
925  MATH_ERROR_MSG("KolmogorovSmirnovTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
926  return;
927  }
928  if (fDist == kUndefined) {
929  MATH_ERROR_MSG("KolmogorovSmirnovTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
930  return;
931  }
932  Double_t Fo = 0.0, Dn = 0.0;
933  UInt_t n = fSamples[0].size();
934  for (UInt_t i = 0; i < n; ++i) {
935  Double_t Fn = (i + 1.0) / n;
936  Double_t F = (*fCDF)(fSamples[0][i]);
937  Double_t result = std::max(TMath::Abs(Fn - F), TMath::Abs(Fo - Fn));
938  if (result > Dn) Dn = result;
939  Fo = Fn;
940  }
941  pvalue = TMath::KolmogorovProb(Dn * (TMath::Sqrt(n) + 0.12 + 0.11 / TMath::Sqrt(n)));
942  testStat = Dn;
943  }
944 
945  Double_t GoFTest::KolmogorovSmirnovTest(const Char_t* option) const {
946  Double_t pvalue, testStat;
947  KolmogorovSmirnovTest(pvalue, testStat);
948  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
949  }
950 
951 
952 
953 
954 
955 } // ROOT namespace
956 } // Math namespace
957 
constexpr Double_t H()
Planck&#39;s constant in .
Definition: TMath.h:189
void SetDistribution(EDistribution dist)
Definition: GoFTest.cxx:123
double dist(Rotation3D const &r1, Rotation3D const &r2)
Definition: 3DDistances.cxx:48
Bool_t fTestSampleFromH0
Definition: GoFTest.h:193
int getCount(double z, const double *dat, int n)
Definition: GoFTest.cxx:520
Double_t fMean
Definition: GoFTest.h:186
Interface (abstract class) for generic functions objects of one-dimension Provides a method to evalua...
Definition: IFunction.h:135
void operator()(ETestType test, Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:208
Namespace for new ROOT classes and functions.
Definition: TFoamSampler.h:19
Double_t Log(Double_t x)
Definition: TMath.h:759
void SetSamples(std::vector< const Double_t *> samples, const std::vector< UInt_t > samplesSizes)
Definition: GoFTest.cxx:180
Double_t KolmogorovProb(Double_t z)
Calculates the Kolmogorov distribution function,.
Definition: TMath.cxx:663
std::vector< Double_t > fCombinedSamples
Definition: GoFTest.h:189
void AndersonDarlingTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:862
double inner_product(const LAVector &, const LAVector &)
STL namespace.
#define MATH_WARN_MSG(loc, str)
Definition: Error.h:47
LongDouble_t Power(LongDouble_t x, LongDouble_t y)
Definition: TMath.h:734
const double * GetPoint(unsigned int ipoint, double &value) const
retrieve at the same time a pointer to the coordinate data and the fit value More efficient than call...
Definition: BinData.h:370
Double_t GaussianCDF(Double_t x) const
Definition: GoFTest.cxx:294
double normal_cdf(double x, double sigma=1, double x0=0)
Cumulative distribution function of the normal (Gaussian) distribution (lower tail).
static Double_t PValueADKSamples(UInt_t nsamples, Double_t A2)
Definition: GoFTest.cxx:353
EDistribution fDist
Definition: GoFTest.h:184
double pow(double, double)
#define N
void Sort(Index n, const Element *a, Index *index, Bool_t down=kTRUE)
Sort the n elements of the array a of generic templated type Element.
Definition: TMath.h:1282
static Double_t GetSigmaN(const std::vector< UInt_t > &ns, UInt_t N)
Definition: GoFTest.cxx:311
void KolmogorovSmirnovTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:921
unsigned int Size() const
return number of fit points
Definition: FitData.h:303
#define MATH_ERROR_MSG(loc, str)
Definition: Error.h:50
int getSum(const int *x, int n)
Definition: GoFTest.cxx:534
IBaseFunctionOneDim IGenFunction
Definition: IFunctionfwd.h:39
Double_t KolmogorovTest(Int_t na, const Double_t *a, Int_t nb, const Double_t *b, Option_t *option)
Statistical test whether two one-dimensional sets of points are compatible with coming from the same ...
Definition: TMath.cxx:789
#define M_PI
Definition: Math.h:38
virtual ~GoFTest()
Definition: GoFTest.cxx:178
Class describing the binned data sets : vectors of x coordinates, y values and optionally error on y ...
Definition: BinData.h:53
Double_t fSigma
Definition: GoFTest.h:187
* x
Deprecated and error prone model selection interface.
Definition: TRolke.cxx:630
std::vector< std::vector< Double_t > > fSamples
Definition: GoFTest.h:191
void Instantiate(const Double_t *sample, UInt_t sampleSize)
Definition: GoFTest.cxx:276
double Value(unsigned int ipoint) const
return the value for the given fit point
Definition: BinData.h:217
void KolmogorovSmirnov2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:896
Namespace for new Math classes and functions.
Double_t PValueAD1Sample(Double_t A2) const
Definition: GoFTest.cxx:483
you should not use this method at all Int_t Int_t z
Definition: TRolke.cxx:630
Double_t ExponentialCDF(Double_t x) const
Definition: GoFTest.cxx:298
you should not use this method at all Int_t Int_t Double_t Double_t Double_t Int_t Double_t Double_t Double_t Double_t b
Definition: TRolke.cxx:630
unsigned int NDim() const
return coordinate data dimension
Definition: FitData.h:311
Double_t Sqrt(Double_t x)
Definition: TMath.h:690
double exp(double)
void SetDistributionFunction(const IGenFunction &cdf, Bool_t isPDF, Double_t xmin, Double_t xmax)
Definition: GoFTest.cxx:264
Template class to wrap any member function of a class taking a double and returning a double in a 1D ...
double log(double)
void adkTestStat(double *adk, const std::vector< std::vector< double > > &samples, const std::vector< double > &zstar)
Definition: GoFTest.cxx:546
double exponential_cdf(double x, double lambda, double x0=0)
Cumulative distribution function of the exponential distribution (lower tail).
void AndersonDarling2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:646