Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
SamplingDistribution.cxx
Go to the documentation of this file.
1// @(#)root/roostats:$Id$
2
3/*************************************************************************
4 * Project: RooStats *
5 * Package: RooFit/RooStats *
6 * Authors: *
7 * Kyle Cranmer, Lorenzo Moneta, Gregory Schott, Wouter Verkerke *
8 *************************************************************************
9 * Copyright (C) 1995-2008, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
16/** \class RooStats::SamplingDistribution
17 \ingroup Roostats
18
19This class simply holds a sampling distribution of some test statistic.
20The distribution can either be an empirical distribution (eg. the samples themselves) or
21a weighted set of points (eg. for the FFT method).
22The class supports merging.
23*/
24
25#include "RooMsgService.h"
26
28#include "RooNumber.h"
29#include "TMath.h"
30#include <algorithm>
31#include <iostream>
32#include <cmath>
33#include <limits>
34using std::numeric_limits;
35
37
38using namespace RooStats;
39
40////////////////////////////////////////////////////////////////////////////////
41/// SamplingDistribution constructor
42
43SamplingDistribution::SamplingDistribution(const char *name, const char *title, std::vector<double> &samplingDist,
44 const char *varName)
45 : TNamed(name, title), fSamplingDist(samplingDist), fVarName(varName)
46{
47 // need to check STL stuff here. Will this = operator work as wanted, or do we need:
48 // std::copy(samplingDist.begin(), samplingDist.end(), fSamplingDist.begin());
49
50 // WVE must fill sampleWeights vector here otherwise append behavior potentially undefined
51 fSampleWeights.resize(fSamplingDist.size(),1.0) ;
52}
53
54////////////////////////////////////////////////////////////////////////////////
55/// SamplingDistribution constructor
56
57SamplingDistribution::SamplingDistribution(const char *name, const char *title, std::vector<double> &samplingDist,
58 std::vector<double> &sampleWeights, const char *varName)
59 : TNamed(name, title), fSamplingDist(samplingDist), fSampleWeights(sampleWeights), fVarName(varName)
60{
61 // need to check STL stuff here. Will this = operator work as wanted, or do we need:
62 // std::copy(samplingDist.begin(), samplingDist.end(), fSamplingDist.begin());
63}
64
65////////////////////////////////////////////////////////////////////////////////
66/// SamplingDistribution constructor (with name and title)
67
68SamplingDistribution::SamplingDistribution(const char *name, const char *title, const char *varName)
69 : TNamed(name, title), fVarName(varName)
70{
71}
72
73////////////////////////////////////////////////////////////////////////////////
74/// Creates a SamplingDistribution from a RooDataSet for debugging
75/// purposes; e.g. if you need a Gaussian type SamplingDistribution
76/// you can generate it from a Gaussian pdf and use the resulting
77/// RooDataSet with this constructor.
78///
79/// The result is the projected distribution onto varName
80/// marginalizing the other variables.
81///
82/// If varName is not given, the first variable will be used.
83/// This is useful mostly for RooDataSets with only one observable.
84
86 const char *name,
87 const char *title,
88 RooDataSet& dataSet,
89 const char * _columnName,
90 const char * varName
91) : TNamed(name, title) {
92
93
94 // check there are any meaningful entries in the given dataset
95 if( dataSet.numEntries() == 0 || !dataSet.get()->first() ) {
96 if( varName ) fVarName = varName;
97 return;
98 }
99
100 TString columnName( _columnName );
101
102 if( !columnName.Length() ) {
103 columnName.Form( "%s_TS0", name );
104 if( !dataSet.get()->find(columnName) ) {
105 columnName = dataSet.get()->first()->GetName();
106 }
107 }
108
109 if( !varName ) {
110 // no leak. none of these transfers ownership.
111 fVarName = (*dataSet.get())[columnName].GetTitle();
112 }else{
113 fVarName = varName;
114 }
115
116 for(Int_t i=0; i < dataSet.numEntries(); i++) {
117 fSamplingDist.push_back(dataSet.get(i)->getRealValue(columnName));
118 fSampleWeights.push_back(dataSet.weight());
119 }
120}
121
122
123////////////////////////////////////////////////////////////////////////////////
124/// SamplingDistribution default constructor
125
126SamplingDistribution::SamplingDistribution() : TNamed("SamplingDistribution_DefaultName", "SamplingDistribution") {}
127
128////////////////////////////////////////////////////////////////////////////////
129/// SamplingDistribution destructor
130
132{
133 fSamplingDist.clear();
134 fSampleWeights.clear();
135}
136
137////////////////////////////////////////////////////////////////////////////////
138/// Merge SamplingDistributions (does nothing if nullptr is given).
139/// If variable name was not set before, it is copied from the added
140/// SamplingDistribution.
141
143{
144 if(!other) return;
145
146 std::vector<double> newSamplingDist = other->fSamplingDist;
147 std::vector<double> newSampleWeights = other->fSampleWeights;
148 // need to check STL stuff here. Will this = operator work as wanted, or do we need:
149 // std::copy(samplingDist.begin(), samplingDist.end(), fSamplingDist.begin());
150 // need to look into STL, do it the easy way for now
151
152 // reserve memory
153 fSamplingDist.reserve(fSamplingDist.size()+newSamplingDist.size());
154 fSampleWeights.reserve(fSampleWeights.size()+newSampleWeights.size());
155
156 // push back elements
157 for(unsigned int i=0; i<newSamplingDist.size(); ++i){
158 fSamplingDist.push_back(newSamplingDist[i]);
159 fSampleWeights.push_back(newSampleWeights[i]);
160 }
161
162
163 if(GetVarName().Length() == 0 && other->GetVarName().Length() > 0)
164 fVarName = other->GetVarName();
165
166 if(strlen(GetName()) == 0 && strlen(other->GetName()) > 0)
167 SetName(other->GetName());
168 if(strlen(GetTitle()) == 0 && strlen(other->GetTitle()) > 0)
169 SetTitle(other->GetTitle());
170
171}
172
173////////////////////////////////////////////////////////////////////////////////
174/// Returns the integral in the open/closed/mixed interval. Default is [low,high) interval.
175/// Normalization can be turned off.
176
177double SamplingDistribution::Integral(double low, double high, bool normalize, bool lowClosed, bool
178 highClosed) const
179{
180 double error = 0;
181 return IntegralAndError(error, low,high, normalize, lowClosed, highClosed);
182}
183
184////////////////////////////////////////////////////////////////////////////////
185/// first need to sort the values and then compute the
186/// running sum of the weights and of the weight square
187/// needed later for computing the integral
188
190
191 unsigned int n = fSamplingDist.size();
192 std::vector<unsigned int> index(n);
193 TMath::SortItr(fSamplingDist.begin(), fSamplingDist.end(), index.begin(), false );
194
195 // compute the empirical CDF and cache in a vector
196 fSumW = std::vector<double>( n );
197 fSumW2 = std::vector<double>( n );
198
199 std::vector<double> sortedDist( n);
200 std::vector<double> sortedWeights( n);
201
202 for(unsigned int i=0; i <n; i++) {
203 unsigned int j = index[i];
204 if (i > 0) {
205 fSumW[i] += fSumW[i-1];
206 fSumW2[i] += fSumW2[i-1];
207 }
208 fSumW[i] += fSampleWeights[j];
210 // sort also the sampling distribution and the weights
211 sortedDist[i] = fSamplingDist[ j] ;
212 sortedWeights[i] = fSampleWeights[ j] ;
213 }
214
215 // save the sorted distribution
216 fSamplingDist = sortedDist;
217 fSampleWeights = sortedWeights;
218
219
220}
221
222////////////////////////////////////////////////////////////////////////////////
223/// Returns the integral in the open/closed/mixed interval. Default is [low,high) interval.
224/// Normalization can be turned off.
225/// compute also the error on the integral
226
227double SamplingDistribution::IntegralAndError(double & error, double low, double high, bool normalize, bool lowClosed, bool
228 highClosed) const
229{
230 int n = fSamplingDist.size();
231 if( n == 0 ) {
232 error = numeric_limits<double>::infinity();
233 return 0.0;
234 }
235
236 if (int(fSumW.size()) != n)
237 SortValues();
238
239
240 // use std::upper_bounds returns lower index value
241 int indexLow = -1;
242 int indexHigh = -1;
243 if (lowClosed) {
244 // case of closed intervals want to include lower part
245 indexLow = std::lower_bound( fSamplingDist.begin(), fSamplingDist.end() , low) - fSamplingDist.begin() -1;
246 }
247 else {
248 // case of open intervals
249 indexLow = std::upper_bound( fSamplingDist.begin(), fSamplingDist.end() , low) - fSamplingDist.begin() - 1;
250 }
251
252
253 if (highClosed) {
254 indexHigh = std::upper_bound( fSamplingDist.begin(), fSamplingDist.end() , high) - fSamplingDist.begin() -1;
255 }
256 else {
257 indexHigh = std::lower_bound( fSamplingDist.begin(), fSamplingDist.end() , high) - fSamplingDist.begin() -1;
258
259 }
260
261
262 assert(indexLow < n && indexHigh < n);
263
264 double sum = 0;
265 double sum2 = 0;
266
267 if (indexHigh >= 0) {
268 sum = fSumW[indexHigh];
269 sum2 = fSumW2[indexHigh];
270
271 if (indexLow >= 0) {
272 sum -= fSumW[indexLow];
273 sum2 -= fSumW2[indexLow];
274 }
275 }
276
277 if(normalize) {
278
279 double norm = fSumW.back();
280 double norm2 = fSumW2.back();
281
282 sum /= norm;
283
284 // use formula for binomial error in case of weighted events
285 // expression can be derived using a MLE for a weighted binomial likelihood
286 error = std::sqrt( sum2 * (1. - 2. * sum) + norm2 * sum * sum ) / norm;
287 }
288 else {
289 error = std::sqrt(sum2);
290 }
291
292
293 return sum;
294}
295
296////////////////////////////////////////////////////////////////////////////////
297/// returns the closed integral [-inf,x]
298
299double SamplingDistribution::CDF(double x) const {
300 return Integral(-RooNumber::infinity(), x, true, true, true);
301}
302
303////////////////////////////////////////////////////////////////////////////////
304/// returns the inverse of the cumulative distribution function
305
307{
308 double dummy=0;
309 return InverseCDF(pvalue,0,dummy);
310}
311
312////////////////////////////////////////////////////////////////////////////////
313/// returns the inverse of the cumulative distribution function, with variations depending on number of samples
314
316 double sigmaVariation,
317 double& inverseWithVariation)
318{
319 if (fSumW.size() != fSamplingDist.size())
320 SortValues();
321
322 if (!TMath::AreEqualRel(fSumW.back(), fSumW2.back(), 1.E-6) )
323 Warning("InverseCDF","Estimation of Quantiles (InverseCDF) for weighted events is not yet supported");
324
325
326 // Acceptance regions are meant to be inclusive of (1-\alpha) of the probability
327 // so the returned values of the CDF should make this easy.
328 // in particular:
329 // if finding the critical value for a lower bound
330 // when p_i < p < p_j, one should return the value associated with i
331 // if i=0, then one should return -infinity
332 // if finding the critical value for an upper bound
333 // when p_i < p < p_j, one should return the value associated with j
334 // if i = size-1, then one should return +infinity
335 // use pvalue < 0.5 to indicate a lower bound is requested
336
337 // casting will round down, eg. give i
338 int nominal = (unsigned int) (pvalue*fSamplingDist.size());
339
340 if(nominal <= 0) {
341 inverseWithVariation = -1.*RooNumber::infinity();
342 return -1.*RooNumber::infinity();
343 }
344 else if(nominal >= (Int_t)fSamplingDist.size()-1 ) {
345 inverseWithVariation = RooNumber::infinity();
346 return RooNumber::infinity();
347 }
348 else if(pvalue < 0.5){
349 int delta = (int)(sigmaVariation*sqrt(1.0*nominal)); // note sqrt(small fraction)
350 int variation = nominal+delta;
351
352 if (variation >= (Int_t)fSamplingDist.size() - 1) {
353 inverseWithVariation = RooNumber::infinity();
354 } else if (variation <= 0) {
355 inverseWithVariation = -1. * RooNumber::infinity();
356 } else {
357 inverseWithVariation = fSamplingDist[variation];
358 }
359
360 return fSamplingDist[nominal];
361 }
362 else if(pvalue >= 0.5){
363 int delta = (int)(sigmaVariation*sqrt(1.0*fSamplingDist.size()- nominal)); // note sqrt(small fraction)
364 int variation = nominal+delta;
365
366 if (variation >= (Int_t)fSamplingDist.size() - 1) {
367 inverseWithVariation = RooNumber::infinity();
368
369 } else if (variation <= 0) {
370 inverseWithVariation = -1. * RooNumber::infinity();
371 } else {
372 inverseWithVariation = fSamplingDist[variation + 1];
373 }
374
375 /*
376 std::cout << "dgb SamplingDistribution::InverseCDF. variation = " << variation
377 << " size = " << fSamplingDist.size()
378 << " value = " << inverseWithVariation << std::endl;
379 */
380
381 return fSamplingDist[nominal+1];
382 }
383 else{
384 std::cout << "problem in SamplingDistribution::InverseCDF" << std::endl;
385 }
386 inverseWithVariation = RooNumber::infinity();
387 return RooNumber::infinity();
388
389}
390
391
392////////////////////////////////////////////////////////////////////////////////
393/// returns the inverse of the cumulative distribution function
394
396{
397 if (fSumW.size() != fSamplingDist.size())
398 SortValues();
399
400 if (!TMath::AreEqualRel(fSumW.back(), fSumW2.back(), 1.E-6) )
401 Warning("InverseCDFInterpolate","Estimation of Quantiles (InverseCDF) for weighted events is not yet supported.");
402
403 // casting will round down, eg. give i
404 int nominal = (unsigned int) (pvalue*fSamplingDist.size());
405
406 if(nominal <= 0) {
407 return -1.*RooNumber::infinity();
408 }
409 if(nominal >= (Int_t)fSamplingDist.size()-1 ) {
410 return RooNumber::infinity();
411 }
412 double upperX = fSamplingDist[nominal+1];
413 double upperY = ((double) (nominal+1))/fSamplingDist.size();
414 double lowerX = fSamplingDist[nominal];
415 double lowerY = ((double) nominal)/fSamplingDist.size();
416
417 // std::cout << upperX << " " << upperY << " " << lowerX << " " << lowerY << std::endl;
418
419 return (upperX-lowerX)/(upperY-lowerY)*(pvalue-lowerY)+lowerX;
420
421}
#define ClassImp(name)
Definition Rtypes.h:382
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
char name[80]
Definition TGX11.cxx:110
Container class to hold unbinned data.
Definition RooDataSet.h:33
static constexpr double infinity()
Return internal infinity representation.
Definition RooNumber.h:25
This class simply holds a sampling distribution of some test statistic.
double Integral(double low, double high, bool normalize=true, bool lowClosed=true, bool highClosed=false) const
numerical integral in these limits
double CDF(double x) const
calculate CDF as a special case of Integral(...) with lower limit equal to -inf
~SamplingDistribution() override
Destructor of SamplingDistribution.
std::vector< double > fSumW2
! Cached vector with sum of the weight used to compute integral error
double InverseCDFInterpolate(double pvalue)
get the inverse of the Cumulative distribution function
std::vector< double > fSumW
! Cached vector with sum of the weight used to compute integral
SamplingDistribution()
Default constructor for SamplingDistribution.
std::vector< double > fSamplingDist
vector of points for the sampling distribution
double IntegralAndError(double &error, double low, double high, bool normalize=true, bool lowClosed=true, bool highClosed=false) const
numerical integral in these limits including error estimation
double InverseCDF(double pvalue)
get the inverse of the Cumulative distribution function
void Add(const SamplingDistribution *other)
merge two sampling distributions
void SortValues() const
internal function to sort values
std::vector< double > fSampleWeights
vector of weights for the samples
The TNamed class is the base class for all named ROOT classes.
Definition TNamed.h:29
virtual void SetTitle(const char *title="")
Set the title of the TNamed.
Definition TNamed.cxx:164
const char * GetName() const override
Returns name of object.
Definition TNamed.h:47
const char * GetTitle() const override
Returns title of object.
Definition TNamed.h:48
virtual void SetName(const char *name)
Set the name of the TNamed.
Definition TNamed.cxx:140
virtual void Warning(const char *method, const char *msgfmt,...) const
Issue warning message.
Definition TObject.cxx:991
Basic string class.
Definition TString.h:139
Ssiz_t Length() const
Definition TString.h:417
void Form(const char *fmt,...)
Formats a string using a printf style format descriptor.
Definition TString.cxx:2356
Double_t x[n]
Definition legend1.C:17
const Int_t n
Definition legend1.C:16
Namespace for the RooStats classes.
Definition CodegenImpl.h:58
void SortItr(Iterator first, Iterator last, IndexIterator index, Bool_t down=kTRUE)
Sort the n1 elements of the Short_t array defined by its iterators.
Definition TMathBase.h:406
Bool_t AreEqualRel(Double_t af, Double_t bf, Double_t relPrec)
Comparing floating points.
Definition TMath.h:426
static uint64_t sum(uint64_t i)
Definition Factory.cxx:2345