doc/v622/SamplingDistribution_8cxx_source.html

// @(#)root/roostats:$Id$


/*************************************************************************

 * Project: RooStats                                                     *

 * Package: RooFit/RooStats                                              *

 * Authors:                                                              *

 *   Kyle Cranmer, Lorenzo Moneta, Gregory Schott, Wouter Verkerke       *

 *************************************************************************

 * Copyright (C) 1995-2008, Rene Brun and Fons Rademakers.               *

 * All rights reserved.                                                  *

 *                                                                       *

 * For the licensing terms see $ROOTSYS/LICENSE.                         *

 * For the list of contributors see $ROOTSYS/README/CREDITS.             *

 *************************************************************************/


/** \class RooStats::SamplingDistribution

    \ingroup Roostats


This class simply holds a sampling distribution of some test statistic.

The distribution can either be an empirical distribution (eg. the samples themselves) or

a weighted set of points (eg. for the FFT method).

The class supports merging.

*/


#include "RooMsgService.h"


#include "RooStats/SamplingDistribution.h"

#include "RooNumber.h"

#include "TMath.h"

#include <algorithm>

#include <iostream>

#include <cmath>

#include <limits>

using namespace std ;


ClassImp(RooStats::SamplingDistribution);


using namespace RooStats;


////////////////////////////////////////////////////////////////////////////////

/// Construct an unweighted sampling distribution from a vector of values.
///
/// \param name          object name, forwarded to TNamed
/// \param title         object title, forwarded to TNamed
/// \param samplingDist  sample values; they are copied into the object
/// \param varName       stored as the variable name of the distribution
///
/// Every sample receives a weight of 1.

SamplingDistribution::SamplingDistribution( const char *name, const char *title,
                   std::vector<Double_t>& samplingDist, const char * varName) :
  TNamed(name,title),
  fSamplingDist(samplingDist),
  // one unit weight per sample: the weight vector must be filled here,
  // otherwise the behaviour of a later append is potentially undefined
  fSampleWeights(samplingDist.size(), 1.0),
  fVarName(varName)
{
}


////////////////////////////////////////////////////////////////////////////////

/// Construct a weighted sampling distribution.
///
/// \param name           object name, forwarded to TNamed
/// \param title          object title, forwarded to TNamed
/// \param samplingDist   sample values; they are copied into the object
/// \param sampleWeights  sample weights; they are copied into the object
/// \param varName        stored as the variable name of the distribution
///
/// The two vectors are copied as given; their sizes are not compared here.

SamplingDistribution::SamplingDistribution( const char *name, const char *title,
                   std::vector<Double_t>& samplingDist, std::vector<Double_t>& sampleWeights, const char * varName) :
  TNamed(name,title),
  fSamplingDist(samplingDist),
  fSampleWeights(sampleWeights),
  fVarName(varName)
{
}


////////////////////////////////////////////////////////////////////////////////

/// Construct an empty sampling distribution that only carries a name,
/// a title and a variable name.
///
/// \param name     object name, forwarded to TNamed
/// \param title    object title, forwarded to TNamed
/// \param varName  stored as the variable name of the distribution

SamplingDistribution::SamplingDistribution( const char *name, const char *title, const char * varName) :
  TNamed(name,title),
  fVarName(varName)
{
}


////////////////////////////////////////////////////////////////////////////////

/// Creates a SamplingDistribution from a RooDataSet for debugging
/// purposes; e.g. if you need a Gaussian type SamplingDistribution
/// you can generate it from a Gaussian pdf and use the resulting
/// RooDataSet with this constructor.
///
/// The result is the projected distribution onto the column selected by
/// `_columnName`, marginalizing the other variables. Each entry
/// contributes its value in that column together with the event weight of
/// the dataset.
///
/// \param name         object name, forwarded to TNamed
/// \param title        object title, forwarded to TNamed
/// \param dataSet      dataset the samples and weights are read from
/// \param _columnName  column to project on. If empty, the column
///                     "<name>_TS0" is used when the dataset contains it,
///                     otherwise the first variable of the dataset.
///                     This is useful mostly for RooDataSets with only one
///                     observable.
/// \param varName      variable name to store. If null, the title of the
///                     selected column is used instead.
///
/// If the dataset has no entries or no variables, no samples are stored and
/// only `varName` (when given) is recorded.

SamplingDistribution::SamplingDistribution(
   const char *name,
   const char *title,
   RooDataSet& dataSet,
   const char * _columnName,
   const char * varName
) : TNamed(name, title) {

   // the number of entries is needed several times; query it only once
   const Int_t nEntries = dataSet.numEntries();

   // check there are any meaningful entries in the given dataset
   if( nEntries == 0  ||  !dataSet.get()->first() ) {
      if( varName ) fVarName = varName;
      return;
   }

   TString columnName( _columnName );

   if( !columnName.Length() ) {
      // no column requested: prefer "<name>_TS0", fall back to the first variable
      columnName.Form( "%s_TS0", name );
      if( !dataSet.get()->find(columnName) ) {
         columnName = dataSet.get()->first()->GetName();
      }
   }

   if( !varName ) {
      // no leak. none of these transfers ownership.
      fVarName = (*dataSet.get())[columnName].GetTitle();
   }else{
      fVarName = varName;
   }

   // allocate once instead of growing the vectors entry by entry
   if( nEntries > 0 ) {
      fSamplingDist.reserve(nEntries);
      fSampleWeights.reserve(nEntries);
   }

   for(Int_t i=0; i < nEntries; i++) {
      // get(i) loads entry i; weight() then refers to that current entry
      fSamplingDist.push_back(dataSet.get(i)->getRealValue(columnName));
      fSampleWeights.push_back(dataSet.weight());
   }
}


////////////////////////////////////////////////////////////////////////////////

/// Default constructor: creates an empty distribution named
/// "SamplingDistribution_DefaultName" with title "SamplingDistribution".

SamplingDistribution::SamplingDistribution( )
  : TNamed("SamplingDistribution_DefaultName","SamplingDistribution")
{}


////////////////////////////////////////////////////////////////////////////////

/// SamplingDistribution destructor.
///
/// Nothing has to be released by hand: the sample and weight vectors are
/// class members and free their storage in their own destructors.

SamplingDistribution::~SamplingDistribution()
{
}


////////////////////////////////////////////////////////////////////////////////

/// Merge SamplingDistributions (does nothing if NULL is given).

/// If variable name was not set before, it is copied from the added

/// SamplingDistribution.


void SamplingDistribution::Add(const SamplingDistribution* other)

{

   if(!other) return;


  std::vector<double> newSamplingDist = other->fSamplingDist;

  std::vector<double> newSampleWeights = other->fSampleWeights;

  // need to check STL stuff here.  Will this = operator work as wanted, or do we need:

  //  std::copy(samplingDist.begin(), samplingDist.end(), fSamplingDist.begin());

  // need to look into STL, do it the easy way for now


  // reserve memory

  fSamplingDist.reserve(fSamplingDist.size()+newSamplingDist.size());

  fSampleWeights.reserve(fSampleWeights.size()+newSampleWeights.size());


  // push back elements

  for(unsigned int i=0; i<newSamplingDist.size(); ++i){

    fSamplingDist.push_back(newSamplingDist[i]);

    fSampleWeights.push_back(newSampleWeights[i]);

  }


  if(GetVarName().Length() == 0  &&  other->GetVarName().Length() > 0)

     fVarName = other->GetVarName();


  if(strlen(GetName()) == 0  &&  strlen(other->GetName()) > 0)

     SetName(other->GetName());

  if(strlen(GetTitle()) == 0  &&  strlen(other->GetTitle()) > 0)

     SetTitle(other->GetTitle());


}


////////////////////////////////////////////////////////////////////////////////

/// Returns the integral in the open/closed/mixed interval. Default is [low,high) interval.
/// Normalization can be turned off.
///
/// Forwards to IntegralAndError() and drops the error estimate.
///
/// \param low         lower edge of the interval
/// \param high        upper edge of the interval
/// \param normalize   forwarded to IntegralAndError()
/// \param lowClosed   whether samples equal to `low` are included
/// \param highClosed  whether samples equal to `high` are included

Double_t SamplingDistribution::Integral(Double_t low, Double_t high, Bool_t normalize, Bool_t lowClosed, Bool_t
                                        highClosed) const
{
   // the error is computed by the callee but not reported by this overload
   double unusedError = 0;
   const Double_t integral = IntegralAndError(unusedError, low, high, normalize, lowClosed, highClosed);
   return integral;
}


////////////////////////////////////////////////////////////////////////////////

/// Sort the samples (and their weights) in ascending order of the sample
/// value and cache the running sums of the weights and of the squared
/// weights. These cumulative sums are what the integral computation reads.

void SamplingDistribution::SortValues() const {

   const unsigned int nSamples = fSamplingDist.size();

   // order[k] is the position, in the still unsorted vectors, of the sample
   // that comes k-th in ascending order
   std::vector<unsigned int> order(nSamples);
   TMath::SortItr(fSamplingDist.begin(), fSamplingDist.end(), order.begin(), false );

   // the cumulative sums act as an empirical CDF cache
   fSumW.assign(nSamples, 0.);
   fSumW2.assign(nSamples, 0.);

   std::vector<double> valuesAscending(nSamples);
   std::vector<double> weightsAscending(nSamples);

   double runningW = 0.;
   double runningW2 = 0.;
   for (unsigned int k = 0; k < nSamples; ++k) {
      const unsigned int src = order[k];
      const double weight = fSampleWeights[src];

      runningW += weight;
      runningW2 += weight*weight;
      fSumW[k] = runningW;
      fSumW2[k] = runningW2;

      // reorder the samples and the weights along the way
      valuesAscending[k] = fSamplingDist[src];
      weightsAscending[k] = weight;
   }

   // keep the sorted distribution
   fSamplingDist.swap(valuesAscending);
   fSampleWeights.swap(weightsAscending);
}


////////////////////////////////////////////////////////////////////////////////

/// Returns the integral in the open/closed/mixed interval. Default is [low,high) interval.
/// Normalization can be turned off.
/// The error on the integral is computed as well.
///
/// \param[out] error   error on the returned integral; set to infinity when
///                     the distribution holds no samples
/// \param low          lower edge of the interval
/// \param high         upper edge of the interval
/// \param normalize    if true, the sum of weights in the interval is divided
///                     by the total sum of weights
/// \param lowClosed    whether samples equal to `low` are included
/// \param highClosed   whether samples equal to `high` are included
/// \return the (optionally normalized) sum of weights in the interval,
///         0 for an empty distribution

Double_t SamplingDistribution::IntegralAndError(Double_t & error, Double_t low, Double_t high, Bool_t normalize, Bool_t lowClosed, Bool_t
                                                highClosed) const
{
   const int nSamples = fSamplingDist.size();
   if (nSamples == 0) {
      error = std::numeric_limits<Double_t>::infinity();
      return 0.0;
   }

   // (re)build the sorted samples and cumulative sums when the cache size
   // does not match the number of samples
   if (int(fSumW.size()) != nSamples)
      SortValues();

   // iterators are taken only now, because SortValues() replaces the vector contents
   const std::vector<Double_t>::const_iterator first = fSamplingDist.begin();
   const std::vector<Double_t>::const_iterator last = fSamplingDist.end();

   // indexLow: last sample lying below the interval (-1 if there is none).
   // A closed lower edge keeps samples equal to `low` inside the interval,
   // so only samples strictly smaller than `low` lie below it.
   const int indexLow =
      (lowClosed ? std::lower_bound(first, last, low) : std::upper_bound(first, last, low)) - first - 1;

   // indexHigh: last sample lying inside the interval (-1 if there is none).
   // A closed upper edge includes samples equal to `high`.
   const int indexHigh =
      (highClosed ? std::upper_bound(first, last, high) : std::lower_bound(first, last, high)) - first - 1;

   assert(indexLow < nSamples && indexHigh < nSamples);

   // difference of the cumulative sums at the two indices
   double sum = (indexHigh >= 0) ? fSumW[indexHigh] : 0;
   double sum2 = (indexHigh >= 0) ? fSumW2[indexHigh] : 0;
   if (indexHigh >= 0 && indexLow >= 0) {
      sum -= fSumW[indexLow];
      sum2 -= fSumW2[indexLow];
   }

   if (!normalize) {
      error = std::sqrt(sum2);
      return sum;
   }

   const double norm = fSumW.back();
   const double norm2 = fSumW2.back();

   sum /= norm;

   // use formula for binomial error in case of weighted events
   // expression can be derived using a MLE for a weighted binomial likelihood
   error = std::sqrt( sum2 * (1. - 2. * sum) + norm2 * sum * sum ) / norm;

   return sum;
}


////////////////////////////////////////////////////////////////////////////////

/// Returns the normalized integral over the closed interval [-inf,x].
///
/// \param x  upper edge of the interval (included)

Double_t SamplingDistribution::CDF(Double_t x) const {
   const Double_t minusInfinity = -RooNumber::infinity();
   // arguments after x: normalize, lowClosed, highClosed
   return Integral(minusInfinity, x, kTRUE, kTRUE, kTRUE);
}


////////////////////////////////////////////////////////////////////////////////

/// Returns the inverse of the cumulative distribution function.
///
/// Forwards to the three-argument overload with a sigma variation of 0 and
/// discards the varied result.
///
/// \param pvalue  probability at which the inverse CDF is evaluated

Double_t SamplingDistribution::InverseCDF(Double_t pvalue)
{
  Double_t ignoredVariation = 0;
  const Double_t inverse = InverseCDF(pvalue, 0, ignoredVariation);
  return inverse;
}


////////////////////////////////////////////////////////////////////////////////

/// Returns the inverse of the cumulative distribution function, with
/// variations depending on number of samples.
///
/// \param pvalue                     probability at which the inverse CDF is
///                                   evaluated; values below 0.5 are treated
///                                   as a request for a lower bound, values
///                                   from 0.5 on as a request for an upper bound
/// \param sigmaVariation             shift of the sample index, in units of
///                                   the square root of the sample count on
///                                   the relevant side
/// \param[out] inverseWithVariation  inverse CDF evaluated at the shifted index
/// \return the sample value at the nominal index, or -/+ infinity when that
///         index falls on or beyond the first/last sample
///
/// An empty distribution yields -infinity for both results.
/// A warning is issued when the samples carry non-unit weights, since
/// quantile estimation for weighted events is not supported.

Double_t SamplingDistribution::InverseCDF(Double_t pvalue,
                 Double_t sigmaVariation,
                 Double_t& inverseWithVariation)
{
   if (fSumW.size() != fSamplingDist.size())
      SortValues();

   // The cumulative sums are empty for an empty distribution; calling back()
   // on an empty vector is undefined, so the weight check is skipped then.
   // The sum of weights equals the sum of squared weights for unit weights.
   if (!fSumW.empty() && !TMath::AreEqualRel(fSumW.back(), fSumW2.back(), 1.E-6) )
      Warning("InverseCDF","Estimation of Quantiles (InverseCDF) for weighted events is not yet supported");

  // Acceptance regions are meant to be inclusive of (1-\alpha) of the probability
  // so the returned values of the CDF should make this easy.
  // in particular:
  //   if finding the critical value for a lower bound
  //     when p_i < p < p_j, one should return the value associated with i
  //     if i=0, then one should return -infinity
  //   if finding the critical value for an upper bound
  //     when p_i < p < p_j, one should return the value associated with j
  //     if i = size-1, then one should return +infinity
  //   use pvalue < 0.5 to indicate a lower bound is requested

  // casting will round down, eg. give i
  int nominal = (unsigned int) (pvalue*fSamplingDist.size());

  // index of the last sample; -1 for an empty distribution
  const Int_t lastIndex = (Int_t)fSamplingDist.size()-1;

  if(nominal <= 0) {
    // also reached for an empty distribution, where nominal is 0
    inverseWithVariation = -1.*RooNumber::infinity();
    return -1.*RooNumber::infinity();
  }
  else if(nominal >= lastIndex ) {
    inverseWithVariation = RooNumber::infinity();
    return RooNumber::infinity();
  }
  else if(pvalue < 0.5){
    // lower bound: shift by a multiple of sqrt(number of samples below)
    int delta = (int)(sigmaVariation*sqrt(1.0*nominal)); // note sqrt(small fraction)
    int variation = nominal+delta;

    if(variation>=lastIndex)
      inverseWithVariation = RooNumber::infinity();
    else if(variation<=0)
      inverseWithVariation = -1.*RooNumber::infinity();
    else
      inverseWithVariation =  fSamplingDist[ variation ];

    return fSamplingDist[nominal];
  }
  else if(pvalue >= 0.5){
    // upper bound: shift by a multiple of sqrt(number of samples above)
    int delta = (int)(sigmaVariation*sqrt(1.0*fSamplingDist.size()- nominal)); // note sqrt(small fraction)
    int variation = nominal+delta;

    if(variation>=lastIndex)
      inverseWithVariation = RooNumber::infinity();
    else if(variation<=0)
      inverseWithVariation = -1.*RooNumber::infinity();
    else
      // variation < lastIndex here, so variation+1 is a valid index
      inverseWithVariation =  fSamplingDist[ variation+1 ];

    // nominal < lastIndex here, so nominal+1 is a valid index
    return fSamplingDist[nominal+1];
  }
  else{
    // only reachable when neither comparison with 0.5 holds
    std::cout << "problem in SamplingDistribution::InverseCDF" << std::endl;
  }

  inverseWithVariation = RooNumber::infinity();
  return RooNumber::infinity();
}


////////////////////////////////////////////////////////////////////////////////

/// Returns the inverse of the cumulative distribution function, linearly
/// interpolated between the two samples that bracket the requested
/// probability.
///
/// \param pvalue  probability at which the inverse CDF is evaluated
/// \return the interpolated sample value, or -/+ infinity when the sample
///         index computed from `pvalue` falls on or beyond the first/last
///         sample. An empty distribution yields -infinity.
///
/// A warning is issued when the samples carry non-unit weights, since
/// quantile estimation for weighted events is not supported.

Double_t SamplingDistribution::InverseCDFInterpolate(Double_t pvalue)
{
   if (fSumW.size() != fSamplingDist.size())
      SortValues();

   // The cumulative sums are empty for an empty distribution; calling back()
   // on an empty vector is undefined, so the weight check is skipped then.
   if (!fSumW.empty() && !TMath::AreEqualRel(fSumW.back(), fSumW2.back(), 1.E-6) )
      Warning("InverseCDFInterpolate","Estimation of Quantiles (InverseCDF) for weighted events is not yet supported.");

  // casting will round down, eg. give i
  int nominal = (unsigned int) (pvalue*fSamplingDist.size());

  if(nominal <= 0) {
    // also reached for an empty distribution, where nominal is 0
    return -1.*RooNumber::infinity();
  }
  if(nominal >= (Int_t)fSamplingDist.size()-1 ) {
    return RooNumber::infinity();
  }

  // bracketing points: x is the sample value, y the index divided by the sample count
  Double_t upperX = fSamplingDist[nominal+1];
  Double_t upperY = ((Double_t) (nominal+1))/fSamplingDist.size();
  Double_t lowerX =  fSamplingDist[nominal];
  Double_t lowerY = ((Double_t) nominal)/fSamplingDist.size();

  // straight line through (lowerY,lowerX) and (upperY,upperX), evaluated at pvalue
  return (upperX-lowerX)/(upperY-lowerY)*(pvalue-lowerY)+lowerX;
}

dummy
static RooMathCoreReg dummy
Definition: RooMathCoreReg.cxx:27

RooMsgService.h

RooNumber.h

Double_t
double Double_t
Definition: RtypesCore.h:57

kTRUE
const Bool_t kTRUE
Definition: RtypesCore.h:89

ClassImp
#define ClassImp(name)
Definition: Rtypes.h:361

SamplingDistribution.h

name
char name[80]
Definition: TGX11.cxx:109

TMath.h

sqrt
double sqrt(double)

Double_t

RooAbsCollection::getRealValue
Double_t getRealValue(const char *name, Double_t defVal=0, Bool_t verbose=kFALSE) const
Get value of a RooAbsReal stored in set with given name.
Definition: RooAbsCollection.cxx:854

RooAbsCollection::first
RooAbsArg * first() const
Definition: RooAbsCollection.h:185

RooAbsCollection::find
RooAbsArg * find(const char *name) const
Find object with given name in list.
Definition: RooAbsCollection.cxx:811

RooAbsData::numEntries
virtual Int_t numEntries() const
Definition: RooAbsData.cxx:306

RooDataSet
RooDataSet is a container class to hold unbinned data.
Definition: RooDataSet.h:33

RooDataSet::get
virtual const RooArgSet * get(Int_t index) const override
Return RooArgSet with coordinates of event 'index'.
Definition: RooDataSet.cxx:1017

RooDataSet::weight
virtual Double_t weight() const override
Return event weight of current event.
Definition: RooDataSet.cxx:973

RooNumber::infinity
static Double_t infinity()
Return internal infinity representation.
Definition: RooNumber.cxx:49

RooStats::SamplingDistribution
This class simply holds a sampling distribution of some test statistic.
Definition: SamplingDistribution.h:28

RooStats::SamplingDistribution::~SamplingDistribution
virtual ~SamplingDistribution()
Destructor of SamplingDistribution.
Definition: SamplingDistribution.cxx:142

RooStats::SamplingDistribution::InverseCDF
Double_t InverseCDF(Double_t pvalue)
get the inverse of the Cumulative distribution function
Definition: SamplingDistribution.cxx:317

RooStats::SamplingDistribution::Integral
Double_t Integral(Double_t low, Double_t high, Bool_t normalize=kTRUE, Bool_t lowClosed=kTRUE, Bool_t highClosed=kFALSE) const
numerical integral in these limits
Definition: SamplingDistribution.cxx:188

RooStats::SamplingDistribution::IntegralAndError
Double_t IntegralAndError(Double_t &error, Double_t low, Double_t high, Bool_t normalize=kTRUE, Bool_t lowClosed=kTRUE, Bool_t highClosed=kFALSE) const
numerical integral in these limits including error estimation
Definition: SamplingDistribution.cxx:238

RooStats::SamplingDistribution::SamplingDistribution
SamplingDistribution()
Default constructor for SamplingDistribution.
Definition: SamplingDistribution.cxx:134

RooStats::SamplingDistribution::CDF
Double_t CDF(Double_t x) const
calculate CDF as a special case of Integral(...) with lower limit equal to -inf
Definition: SamplingDistribution.cxx:310

RooStats::SamplingDistribution::fSumW
std::vector< Double_t > fSumW
Definition: SamplingDistribution.h:89

RooStats::SamplingDistribution::fSumW2
std::vector< Double_t > fSumW2
Cached vector with sum of the weight used to compute integral.
Definition: SamplingDistribution.h:90

RooStats::SamplingDistribution::fVarName
TString fVarName
vector of weights for the samples
Definition: SamplingDistribution.h:87

RooStats::SamplingDistribution::InverseCDFInterpolate
Double_t InverseCDFInterpolate(Double_t pvalue)
get the inverse of the Cumulative distribution function
Definition: SamplingDistribution.cxx:406

RooStats::SamplingDistribution::Add
void Add(const SamplingDistribution *other)
merge two sampling distributions
Definition: SamplingDistribution.cxx:153

RooStats::SamplingDistribution::fSampleWeights
std::vector< Double_t > fSampleWeights
vector of points for the sampling distribution
Definition: SamplingDistribution.h:84

RooStats::SamplingDistribution::GetVarName
const TString GetVarName() const
Definition: SamplingDistribution.h:69

RooStats::SamplingDistribution::SortValues
void SortValues() const
Cached vector with sum of the weight used to compute integral error.
Definition: SamplingDistribution.cxx:200

RooStats::SamplingDistribution::fSamplingDist
std::vector< Double_t > fSamplingDist
Definition: SamplingDistribution.h:83

TNamed
The TNamed class is the base class for all named ROOT classes.
Definition: TNamed.h:29

TNamed::SetTitle
virtual void SetTitle(const char *title="")
Set the title of the TNamed.
Definition: TNamed.cxx:164

TNamed::SetName
virtual void SetName(const char *name)
Set the name of the TNamed.
Definition: TNamed.cxx:140

TNamed::GetTitle
virtual const char * GetTitle() const
Returns title of object.
Definition: TNamed.h:48

TNamed::GetName
virtual const char * GetName() const
Returns name of object.
Definition: TNamed.h:47

TObject::Warning
virtual void Warning(const char *method, const char *msgfmt,...) const
Issue warning message.
Definition: TObject.cxx:877

TString
Basic string class.
Definition: TString.h:131

TString::Length
Ssiz_t Length() const
Definition: TString.h:405

TString::Form
void Form(const char *fmt,...)
Formats a string using a printf style format descriptor.
Definition: TString.cxx:2289

bool

int

x
Double_t x[n]
Definition: legend1.C:17

n
const Int_t n
Definition: legend1.C:16

RooStats
Namespace for the RooStats classes.
Definition: Asimov.h:19

TMath::SortItr
void SortItr(Iterator first, Iterator last, IndexIterator index, Bool_t down=kTRUE)
Definition: TMathBase.h:337

TMath::AreEqualRel
Bool_t AreEqualRel(Double_t af, Double_t bf, Double_t relPrec)
Definition: TMath.h:418

sum
static long int sum(long int i)
Definition: Factory.cxx:2275