Logo ROOT  
Reference Guide
SamplingDistribution.cxx
Go to the documentation of this file.
1// @(#)root/roostats:$Id$
2
3/*************************************************************************
4 * Project: RooStats *
5 * Package: RooFit/RooStats *
6 * Authors: *
7 * Kyle Cranmer, Lorenzo Moneta, Gregory Schott, Wouter Verkerke *
8 *************************************************************************
9 * Copyright (C) 1995-2008, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
16/** \class RooStats::SamplingDistribution
17 \ingroup Roostats
18
19This class simply holds a sampling distribution of some test statistic.
20The distribution can either be an empirical distribution (eg. the samples themselves) or
21a weighted set of points (eg. for the FFT method).
22The class supports merging.
23*/
24
25#include "RooMsgService.h"
26
28#include "RooNumber.h"
29#include "TMath.h"
30#include <algorithm>
31#include <iostream>
32#include <cmath>
33#include <limits>
34using namespace std ;
35
37
38using namespace RooStats;
39
40////////////////////////////////////////////////////////////////////////////////
41/// SamplingDistribution constructor
42
43SamplingDistribution::SamplingDistribution( const char *name, const char *title,
44 std::vector<Double_t>& samplingDist, const char * varName) :
45 TNamed(name,title)
46{
47 fSamplingDist = samplingDist;
48 // need to check STL stuff here. Will this = operator work as wanted, or do we need:
49 // std::copy(samplingDist.begin(), samplingDist.end(), fSamplingDist.begin());
50
51 // WVE must fill sampleWeights vector here otherwise append behavior potentially undefined
52 fSampleWeights.resize(fSamplingDist.size(),1.0) ;
53
54 fVarName = varName;
55}
56
57////////////////////////////////////////////////////////////////////////////////
58/// SamplingDistribution constructor
59
60SamplingDistribution::SamplingDistribution( const char *name, const char *title,
61 std::vector<Double_t>& samplingDist, std::vector<Double_t>& sampleWeights, const char * varName) :
62 TNamed(name,title)
63{
64 fSamplingDist = samplingDist;
65 fSampleWeights = sampleWeights;
66 // need to check STL stuff here. Will this = operator work as wanted, or do we need:
67 // std::copy(samplingDist.begin(), samplingDist.end(), fSamplingDist.begin());
68
69 fVarName = varName;
70}
71
72////////////////////////////////////////////////////////////////////////////////
73/// SamplingDistribution constructor (with name and title)
74
75SamplingDistribution::SamplingDistribution( const char *name, const char *title, const char * varName) :
76 TNamed(name,title)
77{
78 fVarName = varName;
79}
80
81////////////////////////////////////////////////////////////////////////////////
82/// Creates a SamplingDistribution from a RooDataSet for debugging
83/// purposes; e.g. if you need a Gaussian type SamplingDistribution
84/// you can generate it from a Gaussian pdf and use the resulting
85/// RooDataSet with this constructor.
86///
87/// The result is the projected distribution onto varName
88/// marginalizing the other variables.
89///
90/// If varName is not given, the first variable will be used.
91/// This is useful mostly for RooDataSets with only one observable.
92
94 const char *name,
95 const char *title,
96 RooDataSet& dataSet,
97 const char * _columnName,
98 const char * varName
99) : TNamed(name, title) {
100
101
102 // check there are any meaningful entries in the given dataset
103 if( dataSet.numEntries() == 0 || !dataSet.get()->first() ) {
104 if( varName ) fVarName = varName;
105 return;
106 }
107
108 TString columnName( _columnName );
109
110 if( !columnName.Length() ) {
111 columnName.Form( "%s_TS0", name );
112 if( !dataSet.get()->find(columnName) ) {
113 columnName = dataSet.get()->first()->GetName();
114 }
115 }
116
117 if( !varName ) {
118 // no leak. none of these transfers ownership.
119 fVarName = (*dataSet.get())[columnName].GetTitle();
120 }else{
121 fVarName = varName;
122 }
123
124 for(Int_t i=0; i < dataSet.numEntries(); i++) {
125 fSamplingDist.push_back(dataSet.get(i)->getRealValue(columnName));
126 fSampleWeights.push_back(dataSet.weight());
127 }
128}
129
130
131////////////////////////////////////////////////////////////////////////////////
132/// SamplingDistribution default constructor
133
135 TNamed("SamplingDistribution_DefaultName","SamplingDistribution")
136{
137}
138
139////////////////////////////////////////////////////////////////////////////////
140/// SamplingDistribution destructor
141
143{
144 fSamplingDist.clear();
145 fSampleWeights.clear();
146}
147
148////////////////////////////////////////////////////////////////////////////////
149/// Merge SamplingDistributions (does nothing if NULL is given).
150/// If variable name was not set before, it is copied from the added
151/// SamplingDistribution.
152
154{
155 if(!other) return;
156
157 std::vector<double> newSamplingDist = other->fSamplingDist;
158 std::vector<double> newSampleWeights = other->fSampleWeights;
159 // need to check STL stuff here. Will this = operator work as wanted, or do we need:
160 // std::copy(samplingDist.begin(), samplingDist.end(), fSamplingDist.begin());
161 // need to look into STL, do it the easy way for now
162
163 // reserve memory
164 fSamplingDist.reserve(fSamplingDist.size()+newSamplingDist.size());
165 fSampleWeights.reserve(fSampleWeights.size()+newSampleWeights.size());
166
167 // push back elements
168 for(unsigned int i=0; i<newSamplingDist.size(); ++i){
169 fSamplingDist.push_back(newSamplingDist[i]);
170 fSampleWeights.push_back(newSampleWeights[i]);
171 }
172
173
174 if(GetVarName().Length() == 0 && other->GetVarName().Length() > 0)
175 fVarName = other->GetVarName();
176
177 if(strlen(GetName()) == 0 && strlen(other->GetName()) > 0)
178 SetName(other->GetName());
179 if(strlen(GetTitle()) == 0 && strlen(other->GetTitle()) > 0)
180 SetTitle(other->GetTitle());
181
182}
183
184////////////////////////////////////////////////////////////////////////////////
185/// Returns the integral in the open/closed/mixed interval. Default is [low,high) interval.
186/// Normalization can be turned off.
187
189 highClosed) const
190{
191 double error = 0;
192 return IntegralAndError(error, low,high, normalize, lowClosed, highClosed);
193}
194
195////////////////////////////////////////////////////////////////////////////////
196/// first need to sort the values and then compute the
197/// running sum of the weights and of the weight square
198/// needed later for computing the integral
199
201
202 unsigned int n = fSamplingDist.size();
203 std::vector<unsigned int> index(n);
204 TMath::SortItr(fSamplingDist.begin(), fSamplingDist.end(), index.begin(), false );
205
206 // compute the empirical CDF and cache in a vector
207 fSumW = std::vector<double>( n );
208 fSumW2 = std::vector<double>( n );
209
210 std::vector<double> sortedDist( n);
211 std::vector<double> sortedWeights( n);
212
213 for(unsigned int i=0; i <n; i++) {
214 unsigned int j = index[i];
215 if (i > 0) {
216 fSumW[i] += fSumW[i-1];
217 fSumW2[i] += fSumW2[i-1];
218 }
219 fSumW[i] += fSampleWeights[j];
221 // sort also the sampling distribution and the weights
222 sortedDist[i] = fSamplingDist[ j] ;
223 sortedWeights[i] = fSampleWeights[ j] ;
224 }
225
226 // save the sorted distribution
227 fSamplingDist = sortedDist;
228 fSampleWeights = sortedWeights;
229
230
231}
232
233////////////////////////////////////////////////////////////////////////////////
234/// Returns the integral in the open/closed/mixed interval. Default is [low,high) interval.
235/// Normalization can be turned off.
236/// compute also the error on the integral
237
239 highClosed) const
240{
241 int n = fSamplingDist.size();
242 if( n == 0 ) {
243 error = numeric_limits<Double_t>::infinity();
244 return 0.0;
245 }
246
247 if (int(fSumW.size()) != n)
248 SortValues();
249
250
251 // use std::upper_bounds returns lower index value
252 int indexLow = -1;
253 int indexHigh = -1;
254 if (lowClosed) {
255 // case of closed intervals want to include lower part
256 indexLow = std::lower_bound( fSamplingDist.begin(), fSamplingDist.end() , low) - fSamplingDist.begin() -1;
257 }
258 else {
259 // case of open intervals
260 indexLow = std::upper_bound( fSamplingDist.begin(), fSamplingDist.end() , low) - fSamplingDist.begin() - 1;
261 }
262
263
264 if (highClosed) {
265 indexHigh = std::upper_bound( fSamplingDist.begin(), fSamplingDist.end() , high) - fSamplingDist.begin() -1;
266 }
267 else {
268 indexHigh = std::lower_bound( fSamplingDist.begin(), fSamplingDist.end() , high) - fSamplingDist.begin() -1;
269
270 }
271
272
273 assert(indexLow < n && indexHigh < n);
274
275 double sum = 0;
276 double sum2 = 0;
277
278 if (indexHigh >= 0) {
279 sum = fSumW[indexHigh];
280 sum2 = fSumW2[indexHigh];
281
282 if (indexLow >= 0) {
283 sum -= fSumW[indexLow];
284 sum2 -= fSumW2[indexLow];
285 }
286 }
287
288 if(normalize) {
289
290 double norm = fSumW.back();
291 double norm2 = fSumW2.back();
292
293 sum /= norm;
294
295 // use formula for binomial error in case of weighted events
296 // expression can be derived using a MLE for a weighted binomial likelihood
297 error = std::sqrt( sum2 * (1. - 2. * sum) + norm2 * sum * sum ) / norm;
298 }
299 else {
300 error = std::sqrt(sum2);
301 }
302
303
304 return sum;
305}
306
307////////////////////////////////////////////////////////////////////////////////
308/// returns the closed integral [-inf,x]
309
312}
313
314////////////////////////////////////////////////////////////////////////////////
315/// returns the inverse of the cumulative distribution function
316
318{
319 Double_t dummy=0;
320 return InverseCDF(pvalue,0,dummy);
321}
322
323////////////////////////////////////////////////////////////////////////////////
324/// returns the inverse of the cumulative distribution function, with variations depending on number of samples
325
327 Double_t sigmaVariation,
328 Double_t& inverseWithVariation)
329{
330 if (fSumW.size() != fSamplingDist.size())
331 SortValues();
332
333 if (!TMath::AreEqualRel(fSumW.back(), fSumW2.back(), 1.E-6) )
334 Warning("InverseCDF","Estimation of Quantiles (InverseCDF) for weighted events is not yet supported");
335
336
337 // Acceptance regions are meant to be inclusive of (1-\alpha) of the probability
338 // so the returned values of the CDF should make this easy.
339 // in particular:
340 // if finding the critical value for a lower bound
341 // when p_i < p < p_j, one should return the value associated with i
342 // if i=0, then one should return -infinity
343 // if finding the critical value for an upper bound
344 // when p_i < p < p_j, one should return the value associated with j
345 // if i = size-1, then one should return +infinity
346 // use pvalue < 0.5 to indicate a lower bound is requested
347
348 // casting will round down, eg. give i
349 int nominal = (unsigned int) (pvalue*fSamplingDist.size());
350
351 if(nominal <= 0) {
352 inverseWithVariation = -1.*RooNumber::infinity();
353 return -1.*RooNumber::infinity();
354 }
355 else if(nominal >= (Int_t)fSamplingDist.size()-1 ) {
356 inverseWithVariation = RooNumber::infinity();
357 return RooNumber::infinity();
358 }
359 else if(pvalue < 0.5){
360 int delta = (int)(sigmaVariation*sqrt(1.0*nominal)); // note sqrt(small fraction)
361 int variation = nominal+delta;
362
363 if(variation>=(Int_t)fSamplingDist.size()-1)
364 inverseWithVariation = RooNumber::infinity();
365 else if(variation<=0)
366 inverseWithVariation = -1.*RooNumber::infinity();
367 else
368 inverseWithVariation = fSamplingDist[ variation ];
369
370 return fSamplingDist[nominal];
371 }
372 else if(pvalue >= 0.5){
373 int delta = (int)(sigmaVariation*sqrt(1.0*fSamplingDist.size()- nominal)); // note sqrt(small fraction)
374 int variation = nominal+delta;
375
376
377 if(variation>=(Int_t)fSamplingDist.size()-1)
378 inverseWithVariation = RooNumber::infinity();
379
380 else if(variation<=0)
381 inverseWithVariation = -1.*RooNumber::infinity();
382 else
383 inverseWithVariation = fSamplingDist[ variation+1 ];
384
385
386 /*
387 std::cout << "dgb SamplingDistribution::InverseCDF. variation = " << variation
388 << " size = " << fSamplingDist.size()
389 << " value = " << inverseWithVariation << std::endl;
390 */
391
392 return fSamplingDist[nominal+1];
393 }
394 else{
395 std::cout << "problem in SamplingDistribution::InverseCDF" << std::endl;
396 }
397 inverseWithVariation = RooNumber::infinity();
398 return RooNumber::infinity();
399
400}
401
402
403////////////////////////////////////////////////////////////////////////////////
404/// returns the inverse of the cumulative distribution function
405
407{
408 if (fSumW.size() != fSamplingDist.size())
409 SortValues();
410
411 if (!TMath::AreEqualRel(fSumW.back(), fSumW2.back(), 1.E-6) )
412 Warning("InverseCDFInterpolate","Estimation of Quantiles (InverseCDF) for weighted events is not yet supported.");
413
414 // casting will round down, eg. give i
415 int nominal = (unsigned int) (pvalue*fSamplingDist.size());
416
417 if(nominal <= 0) {
418 return -1.*RooNumber::infinity();
419 }
420 if(nominal >= (Int_t)fSamplingDist.size()-1 ) {
421 return RooNumber::infinity();
422 }
423 Double_t upperX = fSamplingDist[nominal+1];
424 Double_t upperY = ((Double_t) (nominal+1))/fSamplingDist.size();
425 Double_t lowerX = fSamplingDist[nominal];
426 Double_t lowerY = ((Double_t) nominal)/fSamplingDist.size();
427
428 // std::cout << upperX << " " << upperY << " " << lowerX << " " << lowerY << std::endl;
429
430 return (upperX-lowerX)/(upperY-lowerY)*(pvalue-lowerY)+lowerX;
431
432}
static RooMathCoreReg dummy
double Double_t
Definition: RtypesCore.h:57
const Bool_t kTRUE
Definition: RtypesCore.h:89
#define ClassImp(name)
Definition: Rtypes.h:361
char name[80]
Definition: TGX11.cxx:109
double sqrt(double)
Double_t getRealValue(const char *name, Double_t defVal=0, Bool_t verbose=kFALSE) const
Get value of a RooAbsReal stored in set with given name.
RooAbsArg * first() const
RooAbsArg * find(const char *name) const
Find object with given name in list.
virtual Int_t numEntries() const
Definition: RooAbsData.cxx:306
RooDataSet is a container class to hold unbinned data.
Definition: RooDataSet.h:33
virtual const RooArgSet * get(Int_t index) const override
Return RooArgSet with coordinates of event 'index'.
virtual Double_t weight() const override
Return event weight of current event.
Definition: RooDataSet.cxx:973
static Double_t infinity()
Return internal infinity representation.
Definition: RooNumber.cxx:49
This class simply holds a sampling distribution of some test statistic.
virtual ~SamplingDistribution()
Destructor of SamplingDistribution.
Double_t InverseCDF(Double_t pvalue)
get the inverse of the Cumulative distribution function
Double_t Integral(Double_t low, Double_t high, Bool_t normalize=kTRUE, Bool_t lowClosed=kTRUE, Bool_t highClosed=kFALSE) const
numerical integral in these limits
Double_t IntegralAndError(Double_t &error, Double_t low, Double_t high, Bool_t normalize=kTRUE, Bool_t lowClosed=kTRUE, Bool_t highClosed=kFALSE) const
numerical integral in these limits including error estimation
SamplingDistribution()
Default constructor for SamplingDistribution.
Double_t CDF(Double_t x) const
calculate CDF as a special case of Integral(...) with lower limit equal to -inf
std::vector< Double_t > fSumW2
Cached vector with sum of the weight used to compute integral.
TString fVarName
vector of weights for the samples
Double_t InverseCDFInterpolate(Double_t pvalue)
get the inverse of the Cumulative distribution function
void Add(const SamplingDistribution *other)
merge two sampling distributions
std::vector< Double_t > fSampleWeights
vector of points for the sampling distribution
void SortValues() const
Cached vector with sum of the weight used to compute integral error.
std::vector< Double_t > fSamplingDist
The TNamed class is the base class for all named ROOT classes.
Definition: TNamed.h:29
virtual void SetTitle(const char *title="")
Set the title of the TNamed.
Definition: TNamed.cxx:164
virtual void SetName(const char *name)
Set the name of the TNamed.
Definition: TNamed.cxx:140
virtual const char * GetTitle() const
Returns title of object.
Definition: TNamed.h:48
virtual const char * GetName() const
Returns name of object.
Definition: TNamed.h:47
virtual void Warning(const char *method, const char *msgfmt,...) const
Issue warning message.
Definition: TObject.cxx:877
Basic string class.
Definition: TString.h:131
Ssiz_t Length() const
Definition: TString.h:405
void Form(const char *fmt,...)
Formats a string using a printf style format descriptor.
Definition: TString.cxx:2289
Double_t x[n]
Definition: legend1.C:17
const Int_t n
Definition: legend1.C:16
Namespace for the RooStats classes.
Definition: Asimov.h:19
void SortItr(Iterator first, Iterator last, IndexIterator index, Bool_t down=kTRUE)
Definition: TMathBase.h:337
Bool_t AreEqualRel(Double_t af, Double_t bf, Double_t relPrec)
Definition: TMath.h:418
static long int sum(long int i)
Definition: Factory.cxx:2275