//Begin_Html
/*
Multivariate optimisation of signal efficiency for given background
efficiency, applying rectangular minimum and maximum requirements.
<p>
Also implemented is a "decorrelated (or diagonalized) cuts" approach,
which improves on the uncorrelated-cuts ansatz by linearly transforming
the input variables into a decorrelated space, using the square root
of the covariance matrix.
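Schematically, for an input-variable vector x with covariance matrix C, the
transformation reads x &rarr; C<sup>-1/2</sup>&middot;x, where the square-root matrix is
obtained by diagonalising the symmetric covariance matrix; the rectangular
cuts are then applied to the transformed variables.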
<p>
<font size="-1">
Other optimisation criteria, such as maximising the squared signal
significance, S^2/(S+B), with S and B being the signal and background yields,
correspond to a particular point on the optimised background-rejection
versus signal-efficiency curve. This working point requires knowledge of
the expected yields, which is in general not available. Note also that
for rare signals, Poissonian statistics should be used, which modifies
the significance criterion.
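For illustration: with expected yields of S = 100 signal and B = 400 background
events, the above criterion gives S^2/(S+B) = 100^2/500 = 20; the corresponding
point on the optimised curve can only be identified once such yields are known.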
</font>
<p>
The rectangular volume cut in the variable space is performed using
a binary tree to sort the training events. This provides a significant
reduction in computing time (up to several orders of magnitude, depending
on the complexity of the problem at hand).
<p>
Technically, optimisation is achieved in TMVA by two methods:
<ol>
<li>Monte Carlo generation using uniform priors for the lower cut value,
and the cut width, thrown within the variable ranges.
<li>A Genetic Algorithm (GA) searches for the optimal ("fittest") cut sample.
The GA is configurable by many external settings through the option
string. For difficult cases (such as many variables), some tuning
may be necessary to achieve satisfactory results.
</ol>
<p>
<font size="-1">
Attempts to use Minuit fits (Simplex or Migrad) instead have not shown
superior results, and often failed due to convergence at local minima.
</font>
<p>
The tests performed so far have shown that in generic applications the
GA is superior to MC sampling, hence GA is the default method.
It is nevertheless worthwhile to try both.
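<p>
As a usage sketch (assuming a TMVA::Factory object "factory" has been created
elsewhere; the option values are illustrative only), the two optimisation
methods can be booked as follows:
<pre>
   // rectangular cut optimisation with the Genetic Algorithm (default fit method)
   factory->BookMethod( TMVA::Types::kCuts, "CutsGA",
                        "FitMethod=GA:EffMethod=EffSel:VarProp=FSmart" );

   // the same optimisation using Monte Carlo sampling, for comparison
   factory->BookMethod( TMVA::Types::kCuts, "CutsMC",
                        "FitMethod=MC:EffMethod=EffSel" );
</pre>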
<p>
<b>Decorrelated (or "diagonalized") cuts</b>
<p>
See the class description of MethodLikelihood for a detailed explanation.
*/
//End_Html
#include <stdio.h>
#include "time.h"
#include "Riostream.h"
#include "TH1F.h"
#include "TObjString.h"
#include "TDirectory.h"
#include "TMath.h"
#include "TMVA/MethodCuts.h"
#include "TMVA/GeneticFitter.h"
#include "TMVA/MinuitFitter.h"
#include "TMVA/MCFitter.h"
#include "TMVA/Tools.h"
#include "TMVA/Timer.h"
#include "TMVA/Interval.h"
ClassImp(TMVA::MethodCuts)
TMVA::MethodCuts::MethodCuts( TString jobName, TString methodTitle, DataSet& theData,
TString theOption, TDirectory* theTargetDir )
: MethodBase( jobName, methodTitle, theData, theOption, theTargetDir )
{
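// standard constructor: initialise the data members, then declare, parse and
// process the configuration options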
InitCuts();
DeclareOptions();
ParseOptions();
ProcessOptions();
}
TMVA::MethodCuts::MethodCuts( DataSet& theData,
TString theWeightFile,
TDirectory* theTargetDir )
: MethodBase( theData, theWeightFile, theTargetDir )
{
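// constructor used when reading the cuts from a weight file: only initialisation
// and option declaration are performed here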
InitCuts();
DeclareOptions();
}
void TMVA::MethodCuts::InitCuts( void )
{
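// default initialisation: reset all pointers, allocate the per-variable/per-bin
// cut arrays, and set the default fit method (Monte Carlo)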
SetMethodName( "Cuts" );
SetMethodType( Types::kCuts );
SetTestvarName();
fVarHistS = fVarHistB = 0;
fVarHistS_smooth = fVarHistB_smooth = 0;
fVarPdfS = fVarPdfB = 0;
fFitParams = 0;
fEffBvsSLocal = 0;
fBinaryTreeS = fBinaryTreeB = 0;
fEffSMin = 0;
fEffSMax = 0;
fTrainEffBvsS = 0;
fTrainRejBvsS = 0;
fNpar = 2*GetNvar();
fRangeSign = new vector<Int_t> ( GetNvar() );
for (Int_t ivar=0; ivar<GetNvar(); ivar++) (*fRangeSign)[ivar] = +1;
fMeanS = new vector<Double_t>( GetNvar() );
fMeanB = new vector<Double_t>( GetNvar() );
fRmsS = new vector<Double_t>( GetNvar() );
fRmsB = new vector<Double_t>( GetNvar() );
fFitParams = new vector<EFitParameters>( GetNvar() );
for (Int_t ivar=0; ivar<GetNvar(); ivar++) (*fFitParams)[ivar] = kNotEnforced;
fFitMethod = kUseMonteCarlo;
fTestSignalEff = -1;
fCutMin = new Double_t*[GetNvar()];
fCutMax = new Double_t*[GetNvar()];
for (Int_t i=0;i<GetNvar();i++) {
fCutMin[i] = new Double_t[fNbins];
fCutMax[i] = new Double_t[fNbins];
}
for (Int_t ivar=0; ivar<GetNvar(); ivar++) {
for (Int_t ibin=0; ibin<fNbins; ibin++) {
fCutMin[ivar][ibin] = 0;
fCutMax[ivar][ibin] = 0;
}
}
fTmpCutMin = new Double_t[GetNvar()];
fTmpCutMax = new Double_t[GetNvar()];
}
TMVA::MethodCuts::~MethodCuts( void )
{
delete fRangeSign;
delete fMeanS;
delete fMeanB;
delete fRmsS;
delete fRmsB;
for (Int_t i=0;i<GetNvar();i++) {
if (fCutMin[i] != NULL) delete [] fCutMin[i];
if (fCutMax[i] != NULL) delete [] fCutMax[i];
if (fCutRange[i] != NULL) delete fCutRange[i];
}
delete[] fCutMin;
delete[] fCutMax;
delete[] fTmpCutMin;
delete[] fTmpCutMax;
if (NULL != fBinaryTreeS) delete fBinaryTreeS;
if (NULL != fBinaryTreeB) delete fBinaryTreeB;
}
void TMVA::MethodCuts::DeclareOptions()
{
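// define the configuration options:
//   FitMethod: minimisation method (GA, SA, MC or MINUIT)
//   EffMethod: efficiency computation via event counting (EffSel) or PDFs (EffPDF)
//   CutRangeMin/CutRangeMax: allowed cut range per variable (if left at the default,
//   the full range found in the training sample is used)
//   VarProp: enforced cut type per variable (NotEnforced, FMax, FMin, FSmart, FVerySmart)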
DeclareOptionRef(fFitMethodS = "GA", "FitMethod", "Minimization Method");
AddPreDefVal(TString("GA"));
AddPreDefVal(TString("SA"));
AddPreDefVal(TString("MC"));
AddPreDefVal(TString("MINUIT"));
DeclareOptionRef(fEffMethodS = "EffSel", "EffMethod", "Selection Method");
AddPreDefVal(TString("EffSel"));
AddPreDefVal(TString("EffPDF"));
fCutRange.resize(GetNvar());
fCutRangeMin = new Double_t[GetNvar()];
fCutRangeMax = new Double_t[GetNvar()];
for (Int_t ivar=0; ivar<GetNvar(); ivar++) {
fCutRange[ivar] = 0;
fCutRangeMin[ivar] = fCutRangeMax[ivar] = -1;
}
DeclareOptionRef( fCutRangeMin, GetNvar(), "CutRangeMin", "Minimum of allowed cut range (set per variable)" );
DeclareOptionRef( fCutRangeMax, GetNvar(), "CutRangeMax", "Maximum of allowed cut range (set per variable)" );
fAllVarsI = new TString[GetNvar()];
for (int i=0; i<GetNvar(); i++) fAllVarsI[i] = "NotEnforced";
DeclareOptionRef(fAllVarsI, GetNvar(), "VarProp", "Categorisation of cuts");
AddPreDefVal(TString("NotEnforced"));
AddPreDefVal(TString("FMax"));
AddPreDefVal(TString("FMin"));
AddPreDefVal(TString("FSmart"));
AddPreDefVal(TString("FVerySmart"));
}
void TMVA::MethodCuts::ProcessOptions()
{
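// translate the option strings into the internal enumerators, create the cut-range
// intervals and report the chosen configuration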
MethodBase::ProcessOptions();
if (fFitMethodS == "MC" ) fFitMethod = kUseMonteCarlo;
else if (fFitMethodS == "GA" ) fFitMethod = kUseGeneticAlgorithm;
else if (fFitMethodS == "SA" ) fFitMethod = kUseSimulatedAnnealing;
else if (fFitMethodS == "MINUIT" ) {
fFitMethod = kUseMinuit;
fLogger << kWARNING << "poor performance of MINUIT in MethodCuts; preferred fit method: GA" << Endl;
}
else {
fLogger << kFATAL << "unknown minimization method: " << fFitMethodS << Endl;
}
if (fEffMethodS == "EFFSEL" ) fEffMethod = kUseEventSelection;
else if (fEffMethodS == "EFFPDF" ) fEffMethod = kUsePDFs;
else fEffMethod = kUseEventSelection;
fLogger << kINFO << Form("Use optimization method: '%s'\n",
(fFitMethod == kUseMonteCarlo) ? "Monte Carlo" :
(fFitMethod == kUseGeneticAlgorithm) ? "Genetic Algorithm" :
(fFitMethod == kUseSimulatedAnnealing) ? "Simulated Annealing" : "MINUIT" );
fLogger << kINFO << Form("Use efficiency computation method: '%s'\n",
(fEffMethod == kUseEventSelection) ? "Event Selection" : "PDF" );
for (Int_t ivar=0; ivar<GetNvar(); ivar++) {
fCutRange[ivar] = new Interval( fCutRangeMin[ivar], fCutRangeMax[ivar] );
}
int maxVar = GetNvar();
for (Int_t ivar=0; ivar<maxVar; ivar++) {
EFitParameters theFitP = kNotEnforced;
if (fAllVarsI[ivar] == "" || fAllVarsI[ivar] == "NotEnforced") theFitP = kNotEnforced;
else if (fAllVarsI[ivar] == "FMax" ) theFitP = kForceMax;
else if (fAllVarsI[ivar] == "FMin" ) theFitP = kForceMin;
else if (fAllVarsI[ivar] == "FSmart" ) theFitP = kForceSmart;
else if (fAllVarsI[ivar] == "FVerySmart" ) theFitP = kForceVerySmart;
else {
fLogger << kFATAL << "unknown value \'" << fAllVarsI[ivar]
<< "\' for fit parameter option " << Form("VarProp[%i]",ivar+1) << Endl;
}
(*fFitParams)[ivar] = theFitP;
if (theFitP != kNotEnforced)
fLogger << kINFO << "Use \"" << fAllVarsI[ivar]
<< "\" cuts for variable: " << "'" << (*fInputVars)[ivar] << "'" << Endl;
}
if (fFitMethod == kUseMonteCarlo) {
for (Int_t ivar=0; ivar<GetNvar(); ivar++) {
TString theFitOption = ( ((*fFitParams)[ivar] == kNotEnforced) ? "NotEnforced" :
((*fFitParams)[ivar] == kForceMin ) ? "ForceMin" :
((*fFitParams)[ivar] == kForceMax ) ? "ForceMax" :
((*fFitParams)[ivar] == kForceSmart ) ? "ForceSmart" :
((*fFitParams)[ivar] == kForceVerySmart ) ? "ForceVerySmart" : "other" );
fLogger << kINFO << Form("Option for variable: %s: '%s' (#: %i)\n",
(const char*)(*fInputVars)[ivar], (const char*)theFitOption,
(Int_t)(*fFitParams)[ivar] );
}
}
if (GetVariableTransform() == Types::kDecorrelated)
fLogger << kINFO << "Use decorrelated variable set" << Endl;
else if (GetVariableTransform() == Types::kPCA)
fLogger << kINFO << "Use principal component transformation" << Endl;
}
Double_t TMVA::MethodCuts::GetMvaValue()
{
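// cut evaluation: return 1 if the current event passes all min/max cuts of the
// signal-efficiency bin selected via fTestSignalEff, and 0 otherwise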
if (fCutMin == NULL || fCutMax == NULL || fNbins == 0) {
fLogger << kFATAL << "<GetMvaValue> fCutMin/fCutMax are zero pointers. "
<< "Did you book the Cuts method?" << Endl;
}
if (fTestSignalEff > 0) {
Int_t ibin = Int_t((fTestSignalEff - fEffSMin)/(fEffSMax - fEffSMin)*Double_t(fNbins));
if (ibin < 0 ) ibin = 0;
if (ibin >= fNbins) ibin = fNbins - 1;
Bool_t passed = kTRUE;
for (Int_t ivar=0; ivar<GetNvar(); ivar++)
passed &= ( (GetEventVal(ivar) >= fCutMin[ivar][ibin]) &&
(GetEventVal(ivar) <= fCutMax[ivar][ibin]) );
return passed ? 1. : 0. ;
}
else return 0;
}
void TMVA::MethodCuts::Train( void )
{
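// training: fill binary search trees with the signal and background training events,
// determine the allowed cut ranges per variable, and run the chosen fitter (GA, MC or
// MINUIT); the fitter maximises EstimatorFunction(), which caches the best cuts found
// for each signal-efficiency bin in fCutMin/fCutMax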
if (!SanityChecks()) fLogger << kFATAL << "Basic sanity checks failed" << Endl;
if (fEffMethod == kUsePDFs) CreateVariablePDFs();
if (fBinaryTreeS != 0) delete fBinaryTreeS;
if (fBinaryTreeB != 0) delete fBinaryTreeB;
fBinaryTreeS = new BinarySearchTree();
fBinaryTreeS->Fill( *this, Data().GetTrainingTree(), 1 );
fBinaryTreeB = new BinarySearchTree();
fBinaryTreeB->Fill( *this, Data().GetTrainingTree(), 0 );
for (UInt_t ivar =0; ivar < Data().GetNVariables(); ivar++) {
(*fMeanS)[ivar] = fBinaryTreeS->Mean(Types::kSignal, ivar);
(*fRmsS)[ivar] = fBinaryTreeS->RMS (Types::kSignal, ivar);
(*fMeanB)[ivar] = fBinaryTreeB->Mean(Types::kBackground, ivar);
(*fRmsB)[ivar] = fBinaryTreeB->RMS (Types::kBackground, ivar);
Double_t xmin = TMath::Min(fBinaryTreeS->Min(Types::kSignal, ivar), fBinaryTreeB->Min(Types::kBackground, ivar));
Double_t xmax = TMath::Max(fBinaryTreeS->Max(Types::kSignal, ivar), fBinaryTreeB->Max(Types::kBackground, ivar));
if (fCutRange[ivar]->GetMin() == fCutRange[ivar]->GetMax()) {
fCutRange[ivar]->SetMin( xmin );
fCutRange[ivar]->SetMax( xmax );
}
else if (xmin > fCutRange[ivar]->GetMin()) fCutRange[ivar]->SetMin( xmin );
else if (xmax < fCutRange[ivar]->GetMax()) fCutRange[ivar]->SetMax( xmax );
}
vector<TH1F*> signalDist, bkgDist;
Data().ResetCurrentTree();
fEffBvsSLocal = new TH1F( GetTestvarName() + "_effBvsSLocal",
TString(GetName()) + " efficiency of B vs S", fNbins, 0.0, 1.0 );
for (Int_t ibin=1; ibin<=fNbins; ibin++) fEffBvsSLocal->SetBinContent( ibin, -0.1 );
if (fFitMethod == kUseGeneticAlgorithm || fFitMethod == kUseMonteCarlo || fFitMethod == kUseMinuit) {
vector<Interval*> ranges;
for (Int_t ivar=0; ivar<GetNvar(); ivar++) {
Int_t nbins = 0;
if (Data().GetVarType(ivar) == 'I') {
nbins = Int_t(fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin()) + 1;
}
EFitParameters fitParam = (*fFitParams)[ivar];
if (fitParam == kForceSmart) {
if ((*fMeanS)[ivar] > (*fMeanB)[ivar]) fitParam = kForceMax;
else fitParam = kForceMin;
}
if (fitParam == kForceMin){
ranges.push_back( new Interval( fCutRange[ivar]->GetMin(), fCutRange[ivar]->GetMin(), nbins ) );
ranges.push_back( new Interval( 0, fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(), nbins ) );
}
else if (fitParam == kForceMax){
ranges.push_back( new Interval( fCutRange[ivar]->GetMin(), fCutRange[ivar]->GetMax(), nbins ) );
ranges.push_back( new Interval( fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(),
fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(), nbins ) );
}
else{
ranges.push_back( new Interval( fCutRange[ivar]->GetMin(), fCutRange[ivar]->GetMax(), nbins ) );
ranges.push_back( new Interval( 0, fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(), nbins ) );
}
}
FitterBase* fitter = NULL;
switch (fFitMethod) {
case kUseGeneticAlgorithm:
fitter = new GeneticFitter( *this, Form("%sFitter_GA", GetName()), ranges, GetOptions() );
break;
case kUseMonteCarlo:
fitter = new MCFitter ( *this, Form("%sFitter_MC", GetName()), ranges, GetOptions() );
break;
case kUseMinuit:
fitter = new MinuitFitter ( *this, Form("%sFitter_MINUIT", GetName()), ranges, GetOptions() );
break;
default:
fLogger << kFATAL << "Wrong fit method: " << fFitMethod << Endl;
}
fitter->CheckForUnusedOptions();
fitter->Run();
for (UInt_t ivar=0; ivar<ranges.size(); ivar++) delete ranges[ivar];
}
else fLogger << kFATAL << "unknown minimization method: " << fFitMethod << Endl;
if (fBinaryTreeS != 0) { delete fBinaryTreeS; fBinaryTreeS = 0; }
if (fBinaryTreeB != 0) { delete fBinaryTreeB; fBinaryTreeB = 0; }
}
void TMVA::MethodCuts::Test( TTree* )
{
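// nothing to be done here: the cut performance is evaluated in GetEfficiency()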
}
Double_t TMVA::MethodCuts::EstimatorFunction( std::vector<Double_t>& par )
{
return ComputeEstimator( par );
}
Double_t TMVA::MethodCuts::ComputeEstimator( std::vector<Double_t>& par )
{
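// translate the fit parameters into cut values and compute the corresponding signal
// and background efficiencies; the return value is the quantity maximised by the
// fitter, and whenever the background efficiency in the signal-efficiency bin reached
// improves (or the bin is still unfilled), the cuts are stored in fCutMin/fCutMax and
// the bin content of fEffBvsSLocal is updated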
Double_t effS = 0, effB = 0;
this->MatchParsToCuts( par, &fTmpCutMin[0], &fTmpCutMax[0] );
switch (fEffMethod) {
case kUsePDFs:
this->GetEffsfromPDFs( &fTmpCutMin[0], &fTmpCutMax[0], effS, effB );
break;
case kUseEventSelection:
this->GetEffsfromSelection( &fTmpCutMin[0], &fTmpCutMax[0], effS, effB);
break;
default:
this->GetEffsfromSelection( &fTmpCutMin[0], &fTmpCutMax[0], effS, effB);
}
Double_t eta = 0;
Int_t ibinS = (Int_t)(effS*Float_t(fNbins) + 1);
if (ibinS < 1 ) ibinS = 1;
if (ibinS > fNbins) ibinS = fNbins;
Double_t effBH = fEffBvsSLocal->GetBinContent( ibinS );
Double_t effBH_left = fEffBvsSLocal->GetBinContent( ibinS-1 );
Double_t effBH_right = fEffBvsSLocal->GetBinContent( ibinS+1 );
Double_t average = (effBH_left+effBH_right)/2.;
if (effBH < effB) average = effBH;
eta = ( -TMath::Abs(effBH-average) +( 1. - (effBH - effB) ) ) / (1+effS);
if (effBH < 0 || effBH > effB) {
fEffBvsSLocal->SetBinContent( ibinS, effB );
for (Int_t ivar=0; ivar<GetNvar(); ivar++) {
fCutMin[ivar][ibinS-1] = fTmpCutMin[ivar];
fCutMax[ivar][ibinS-1] = fTmpCutMax[ivar];
}
}
return eta;
}
void TMVA::MethodCuts::MatchParsToCuts( const std::vector<Double_t> & par,
Double_t* cutMin, Double_t* cutMax )
{
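// translate fitter parameters into cuts: each variable uses two parameters, a cut
// position par[2*ivar] and a cut width par[2*ivar+1]; for a positive range sign the
// position is the lower cut (upper cut = position + width), otherwise it is the upper cut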
for (Int_t ivar=0; ivar<GetNvar(); ivar++) {
Int_t ipar = 2*ivar;
cutMin[ivar] = ((*fRangeSign)[ivar] > 0) ? par[ipar] : par[ipar] - par[ipar+1];
cutMax[ivar] = ((*fRangeSign)[ivar] > 0) ? par[ipar] + par[ipar+1] : par[ipar];
}
}
void TMVA::MethodCuts::MatchCutsToPars( std::vector<Double_t>& par,
Double_t** cutMinAll, Double_t** cutMaxAll, Int_t ibin )
{
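// retrieve the cuts of signal-efficiency bin "ibin" and translate them into fitter parameters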
if (ibin < 1 || ibin > fNbins) fLogger << kFATAL << "::MatchCutsToPars: bin error: "
<< ibin << Endl;
const Int_t nvar = GetNvar();
Double_t *cutMin = new Double_t[nvar];
Double_t *cutMax = new Double_t[nvar];
for (Int_t ivar=0; ivar<nvar; ivar++) {
cutMin[ivar] = cutMinAll[ivar][ibin-1];
cutMax[ivar] = cutMaxAll[ivar][ibin-1];
}
MatchCutsToPars( par, cutMin, cutMax );
delete [] cutMin;
delete [] cutMax;
}
void TMVA::MethodCuts::MatchCutsToPars( std::vector<Double_t>& par,
Double_t* cutMin, Double_t* cutMax )
{
for (Int_t ivar=0; ivar<GetNvar(); ivar++) {
Int_t ipar = 2*ivar;
par[ipar] = ((*fRangeSign)[ivar] > 0) ? cutMin[ivar] : cutMax[ivar];
par[ipar+1] = cutMax[ivar] - cutMin[ivar];
}
}
void TMVA::MethodCuts::GetEffsfromPDFs( Double_t* cutMin, Double_t* cutMax,
Double_t& effS, Double_t& effB )
{
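// compute the efficiencies as the product of the per-variable PDF integrals between
// the cut boundaries (correlations between the input variables are neglected)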
effS = 1.0;
effB = 1.0;
for (Int_t ivar=0; ivar<GetNvar(); ivar++) {
effS *= (*fVarPdfS)[ivar]->GetIntegral( cutMin[ivar], cutMax[ivar] );
effB *= (*fVarPdfB)[ivar]->GetIntegral( cutMin[ivar], cutMax[ivar] );
}
}
void TMVA::MethodCuts::GetEffsfromSelection( Double_t* cutMin, Double_t* cutMax,
Double_t& effS, Double_t& effB)
{
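// compute the efficiencies by counting the training events inside the cut volume with
// the binary search trees, normalised to the total sums of weights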
Float_t nTotS = 0, nTotB = 0;
Float_t nSelS = 0, nSelB = 0;
Volume* volume = new Volume( cutMin, cutMax, GetNvar() );
nSelS = fBinaryTreeS->SearchVolume( volume );
nSelB = fBinaryTreeB->SearchVolume( volume );
delete volume;
nTotS = fBinaryTreeS->GetSumOfWeights();
nTotB = fBinaryTreeB->GetSumOfWeights();
if (nTotS == 0 && nTotB == 0) {
fLogger << kFATAL << "<GetEffsfromSelection> fatal error: zero total number of events:"
<< " nTotS, nTotB: " << nTotS << " " << nTotB << " ***" << Endl;
}
if (nTotS == 0 ) {
effS = 0;
effB = nSelB/nTotB;
fLogger << kWARNING << "<GetEffsfromSelection> zero number of signal events" << Endl;
}
else if (nTotB == 0) {
effB = 0;
effS = nSelS/nTotS;
fLogger << kWARNING << "<GetEffsfromSelection> zero number of background events" << Endl;
}
else {
effS = nSelS/nTotS;
effB = nSelB/nTotB;
}
}
void TMVA::MethodCuts::CreateVariablePDFs( void )
{
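// create signal and background histograms for each input variable from the training
// tree, clone and smooth them, and build spline-interpolated PDFs used by the
// "EffPDF" efficiency method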
fVarHistS = new vector<TH1*>( GetNvar() );
fVarHistB = new vector<TH1*>( GetNvar() );
fVarHistS_smooth = new vector<TH1*>( GetNvar() );
fVarHistB_smooth = new vector<TH1*>( GetNvar() );
fVarPdfS = new vector<PDF*>( GetNvar() );
fVarPdfB = new vector<PDF*>( GetNvar() );
Int_t nsmooth = 0;
for (Int_t ivar=0; ivar<GetNvar(); ivar++) {
TString histTitle = (*fInputVars)[ivar] + " signal training";
TString histName = (*fInputVars)[ivar] + "_sig";
TString drawOpt = (*fInputVars)[ivar] + ">>h(";
drawOpt += fNbins;
drawOpt += ")";
Data().GetTrainingTree()->Draw( drawOpt, "type==1", "goff" );
(*fVarHistS)[ivar] = (TH1F*)gDirectory->Get("h");
(*fVarHistS)[ivar]->SetName(histName);
(*fVarHistS)[ivar]->SetTitle(histTitle);
(*fVarHistS_smooth)[ivar] = (TH1F*)(*fVarHistS)[ivar]->Clone();
histTitle = (*fInputVars)[ivar] + " signal training smoothed ";
histTitle += nsmooth;
histTitle +=" times";
histName = (*fInputVars)[ivar] + "_sig_smooth";
(*fVarHistS_smooth)[ivar]->SetName(histName);
(*fVarHistS_smooth)[ivar]->SetTitle(histTitle);
(*fVarHistS_smooth)[ivar]->Smooth(nsmooth);
histTitle = (*fInputVars)[ivar] + " background training";
histName = (*fInputVars)[ivar] + "_bgd";
drawOpt = (*fInputVars)[ivar] + ">>h(";
drawOpt += fNbins;
drawOpt += ")";
Data().GetTrainingTree()->Draw( drawOpt, "type==0", "goff" );
(*fVarHistB)[ivar] = (TH1F*)gDirectory->Get("h");
(*fVarHistB)[ivar]->SetName(histName);
(*fVarHistB)[ivar]->SetTitle(histTitle);
(*fVarHistB_smooth)[ivar] = (TH1F*)(*fVarHistB)[ivar]->Clone();
histTitle = (*fInputVars)[ivar]+" background training smoothed ";
histTitle += nsmooth;
histTitle +=" times";
histName = (*fInputVars)[ivar]+"_bgd_smooth";
(*fVarHistB_smooth)[ivar]->SetName(histName);
(*fVarHistB_smooth)[ivar]->SetTitle(histTitle);
(*fVarHistB_smooth)[ivar]->Smooth(nsmooth);
(*fVarPdfS)[ivar] = new PDF( (*fVarHistS_smooth)[ivar], PDF::kSpline2 );
(*fVarPdfB)[ivar] = new PDF( (*fVarHistB_smooth)[ivar], PDF::kSpline2 );
}
}
Bool_t TMVA::MethodCuts::SanityChecks( void )
{
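// basic sanity check: the branches of the training tree (apart from type, weight and
// boostweight) must match the booked input variables in order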
Bool_t isOK = kTRUE;
TObjArrayIter branchIter( Data().GetTrainingTree()->GetListOfBranches(), kIterForward );
TBranch* branch = 0;
Int_t ivar = -1;
while ((branch = (TBranch*)branchIter.Next()) != 0) {
TString branchName = branch->GetName();
if (branchName != "type" && branchName != "weight" && branchName != "boostweight") {
ivar++;
if ((*fInputVars)[ivar] != branchName) {
fLogger << kWARNING << "<SanityChecks> mismatch in variables" << Endl;
isOK = kFALSE;
}
}
}
return isOK;
}
void TMVA::MethodCuts::WriteWeightsToStream( ostream & o ) const
{
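// write the optimised cuts to the stream: one line per signal-efficiency bin containing
// the bin index, the signal efficiency (bin centre), the background efficiency, and the
// min/max cuts for each input variable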
o << "OptimisationMethod " << "nbins:" << endl;
o << ((fEffMethod == kUseEventSelection) ? "Fit-EventSelection" :
(fEffMethod == kUsePDFs) ? "Fit-PDF" : "Monte-Carlo") << " " ;
o << fNbins << endl;
o << "Below are the optimised cuts for " << GetNvar() << " variables:" << endl;
o << "Format: ibin(hist) effS effB cutMin[ivar=0] cutMax[ivar=0]"
<< " ... cutMin[ivar=n-1] cutMax[ivar=n-1]" << endl;
for (Int_t ibin=0; ibin<fNbins; ibin++) {
o << setw(4) << ibin+1 << " "
<< setw(8)<< fEffBvsSLocal->GetBinCenter( ibin +1 ) << " "
<< setw(8)<< fEffBvsSLocal->GetBinContent( ibin +1 ) << " ";
for (Int_t ivar=0; ivar<GetNvar(); ivar++)
o <<setw(10)<< fCutMin[ivar][ibin] << " " << setw(10) << fCutMax[ivar][ibin] << " ";
o << endl;
}
}
void TMVA::MethodCuts::ReadWeightsFromStream( istream& istr )
{
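// read the cuts back from the stream: header (fit/efficiency method, number of bins
// and variables) followed by the per-bin background efficiencies and min/max cuts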
TString dummy;
UInt_t dummyInt;
istr >> dummy >> dummy;
istr >> dummy >> fNbins;
istr >> dummy >> dummy >> dummy >> dummy >> dummy >> dummy >> dummyInt >> dummy ;
if (dummyInt != Data().GetNVariables()) {
fLogger << kFATAL << "<ReadWeightsFromStream> fatal error: mismatch "
<< "in number of variables: " << dummyInt << " != " << Data().GetNVariables() << Endl;
}
SetNvar(dummyInt);
if (fFitMethod == kUseMonteCarlo) {
fLogger << kINFO << "Read cuts optimised using "<< fNRandCuts << " MC events" << Endl;
}
else if (fFitMethod == kUseGeneticAlgorithm) {
fLogger << kINFO << "Read cuts optimised using Genetic Algorithm" << Endl;
}
else if (fFitMethod == kUseSimulatedAnnealing) {
fLogger << kINFO << "Read cuts optimised using Simulated Annealing" << Endl;
}
else {
fLogger << kWARNING << "unknown method: " << fFitMethod << Endl;
}
fLogger << kINFO << "in " << fNbins << " signal efficiency bins and for " << GetNvar() << " variables" << Endl;
char buffer[200];
istr.getline(buffer,200);
istr.getline(buffer,200);
Int_t tmpbin;
Float_t tmpeffS, tmpeffB;
if (fEffBvsSLocal!=0) delete fEffBvsSLocal;
fEffBvsSLocal = new TH1F( GetTestvarName() + "_effBvsSLocal",
TString(GetName()) + " efficiency of B vs S", fNbins, 0.0, 1.0 );
for (Int_t ibin=0; ibin<fNbins; ibin++) {
istr >> tmpbin >> tmpeffS >> tmpeffB;
fEffBvsSLocal->SetBinContent( ibin+1, tmpeffB );
for (Int_t ivar=0; ivar<GetNvar(); ivar++) {
istr >> fCutMin[ivar][ibin] >> fCutMax[ivar][ibin];
}
}
fEffSMin = fEffBvsSLocal->GetBinCenter(1);
fEffSMax = fEffBvsSLocal->GetBinCenter(fNbins);
}
void TMVA::MethodCuts::WriteMonitoringHistosToFile( void ) const
{
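// write the background-vs-signal efficiency histogram and, for the PDF efficiency
// method, the variable histograms and PDFs to the monitoring output file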
fLogger << kINFO << "write monitoring histograms to file: " << BaseDir()->GetPath() << Endl;
fEffBvsSLocal->Write();
if (fEffMethod == kUsePDFs) {
for (Int_t ivar=0; ivar<GetNvar(); ivar++) {
(*fVarHistS)[ivar]->Write();
(*fVarHistB)[ivar]->Write();
(*fVarHistS_smooth)[ivar]->Write();
(*fVarHistB_smooth)[ivar]->Write();
(*fVarPdfS)[ivar]->GetPDFHist()->Write();
(*fVarPdfB)[ivar]->GetPDFHist()->Write();
}
}
}
Double_t TMVA::MethodCuts::GetTrainingEfficiency( TString theString)
{
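// return the signal efficiency on the training sample at the background efficiency
// given in theString (format "Efficiency:<effB>"); on the first call the efficiency
// curve is built from the stored cuts, using binary search trees filled with the
// training events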
TList* list = Tools::ParseFormatLine( theString );
if (list->GetSize() != 2) {
fLogger << kFATAL << "<GetTrainingEfficiency> wrong number of arguments"
<< " in string: " << theString
<< " | required format, e.g., Efficiency:0.05" << Endl;
return -1;
}
Float_t effBref = atof( ((TObjString*)list->At(1))->GetString() );
Bool_t firstPass = (NULL == fTrainEffBvsS || NULL == fTrainRejBvsS);
if (firstPass) {
if (fBinaryTreeS != 0) delete fBinaryTreeS;
if (fBinaryTreeB != 0) delete fBinaryTreeB;
fBinaryTreeS = new BinarySearchTree();
fBinaryTreeS->Fill( *this, Data().GetTrainingTree(), 1 );
fBinaryTreeB = new BinarySearchTree();
fBinaryTreeB->Fill( *this, Data().GetTrainingTree(), 0 );
if (NULL != fTrainEffBvsS) delete fTrainEffBvsS;
if (NULL != fTrainRejBvsS) delete fTrainRejBvsS;
fTrainEffBvsS = new TH1F( GetTestvarName() + "_trainingEffBvsS", GetTestvarName() + "", fNbins, 0, 1 );
fTrainRejBvsS = new TH1F( GetTestvarName() + "_trainingRejBvsS", GetTestvarName() + "", fNbins, 0, 1 );
Double_t* tmpCutMin = new Double_t[GetNvar()];
Double_t* tmpCutMax = new Double_t[GetNvar()];
for (Int_t bini=1; bini<=fNbins; bini++) {
for (Int_t ivar=0; ivar <GetNvar(); ivar++){
tmpCutMin[ivar] = fCutMin[ivar][bini-1];
tmpCutMax[ivar] = fCutMax[ivar][bini-1];
}
Double_t effS, effB;
this->GetEffsfromSelection( &tmpCutMin[0], &tmpCutMax[0], effS, effB);
fTrainEffBvsS->SetBinContent( bini, effB );
fTrainRejBvsS->SetBinContent( bini, 1.0-effB );
}
delete[] tmpCutMin;
delete[] tmpCutMax;
fGraphTrainEffBvsS = new TGraph( fTrainEffBvsS );
fSplTrainEffBvsS = new TSpline1( "trainEffBvsS", fGraphTrainEffBvsS );
}
if (NULL == fSplTrainEffBvsS) return 0.0;
Double_t effS, effB, effS_ = 0, effB_ = 0;
Int_t nbins_ = 1000;
for (Int_t bini=1; bini<=nbins_; bini++) {
effS = (bini - 0.5)/Float_t(nbins_);
effB = fSplTrainEffBvsS->Eval( effS );
if ((effB - effBref)*(effB_ - effBref) < 0) break;
effS_ = effS;
effB_ = effB;
}
return 0.5*(effS + effS_);
}
Double_t TMVA::MethodCuts::GetEfficiency( TString theString, TTree* theTree, Double_t& effSerr )
{
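// compute the efficiency on the test sample: without a reference value the integral
// of the background rejection versus signal efficiency is returned; given
// "Efficiency:<effB>", the signal efficiency at that background efficiency is
// returned, with its binomial error in effSerr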
if (theTree == 0) { /* theTree is not used here: efficiencies are computed from the internal test tree */ }
TList* list = TMVA::Tools::ParseFormatLine( theString, ":" );
Bool_t computeArea = kFALSE;
if (!list || list->GetSize() < 2) computeArea = kTRUE;
else if (list->GetSize() > 2) {
fLogger << kFATAL << "<GetEfficiency> wrong number of arguments"
<< " in string: " << theString
<< " | required format, e.g., Efficiency:0.05, or empty string" << Endl;
return -1;
}
if (fEffBvsS == NULL || fRejBvsS == NULL) {
if (fBinaryTreeS!=0) delete fBinaryTreeS;
if (fBinaryTreeB!=0) delete fBinaryTreeB;
fBinaryTreeS = new BinarySearchTree();
fBinaryTreeS->Fill( *this, Data().GetTestTree(), 1 );
fBinaryTreeB = new BinarySearchTree();
fBinaryTreeB->Fill( *this, Data().GetTestTree(), 0 );
if (NULL != fEffBvsS)delete fEffBvsS;
if (NULL != fRejBvsS)delete fRejBvsS;
fEffBvsS = new TH1F( GetTestvarName() + "_effBvsS", GetTestvarName() + "", fNbins, 0, 1 );
fRejBvsS = new TH1F( GetTestvarName() + "_rejBvsS", GetTestvarName() + "", fNbins, 0, 1 );
Double_t* tmpCutMin = new Double_t[GetNvar()];
Double_t* tmpCutMax = new Double_t[GetNvar()];
for (Int_t bini=1; bini<=fNbins; bini++) {
for (Int_t ivar=0; ivar <GetNvar(); ivar++){
tmpCutMin[ivar] = fCutMin[ivar][bini-1];
tmpCutMax[ivar] = fCutMax[ivar][bini-1];
}
Double_t effS, effB;
this->GetEffsfromSelection( &tmpCutMin[0], &tmpCutMax[0], effS, effB);
fEffBvsS->SetBinContent( bini, effB );
fRejBvsS->SetBinContent( bini, 1.0-effB );
}
delete[] tmpCutMin;
delete[] tmpCutMax;
fGrapheffBvsS = new TGraph( fEffBvsS );
fSpleffBvsS = new TSpline1( "effBvsS", fGrapheffBvsS );
}
if (NULL == fSpleffBvsS) return 0.0;
Double_t effS = 0, effB = 0, effS_ = 0, effB_ = 0;
Int_t nbins_ = 1000;
if (computeArea) {
Double_t integral = 0;
for (Int_t bini=1; bini<=nbins_; bini++) {
effS = (bini - 0.5)/Float_t(nbins_);
effB = fSpleffBvsS->Eval( effS );
integral += (1.0 - effB);
}
integral /= nbins_;
return integral;
}
else {
Float_t effBref = atof( ((TObjString*)list->At(1))->GetString() );
for (Int_t bini=1; bini<=nbins_; bini++) {
effS = (bini - 0.5)/Float_t(nbins_);
effB = fSpleffBvsS->Eval( effS );
if ((effB - effBref)*(effB_ - effBref) < 0) break;
effS_ = effS;
effB_ = effB;
}
effS = 0.5*(effS + effS_);
effSerr = 0;
if (Data().GetNEvtSigTest() > 0)
effSerr = TMath::Sqrt( effS*(1.0 - effS)/Double_t(Data().GetNEvtSigTest()) );
return effS;
}
return -1;
}
void TMVA::MethodCuts::MakeClassSpecific( std::ostream& fout, const TString& className ) const
{
fout << " // not implemented for class: \"" << className << "\"" << endl;
fout << "};" << endl;
}
void TMVA::MethodCuts::GetHelpMessage() const
{
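// assemble the help message printed by the framework (short description, performance
// hints and tuning advice)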
fLogger << Endl;
fLogger << Tools::Color("bold") << "--- Short description:" << Tools::Color("reset") << Endl;
fLogger << Endl;
fLogger << "The optimisation of rectangular cuts performed by TMVA maximises " << Endl;
fLogger << "the background rejection at given signal efficiency, and scans " << Endl;
fLogger << "over the full range of the latter quantity. Three optimisation" << Endl;
fLogger << "methods are available: Monte Carlo sampling (MC), a Genetic" << Endl;
fLogger << "Algorithm (GA), and Simulated Annealing (SA - deprecated at present)." << Endl;
fLogger << "GA is expected to perform best." << Endl;
fLogger << Endl;
fLogger << "The difficulty of finding the optimal cuts increases strongly with" << Endl;
fLogger << "the dimensionality (number of input variables) of the problem." << Endl;
fLogger << "This behavior is due to the non-uniqueness of the solution space."<< Endl;
fLogger << Endl;
fLogger << Tools::Color("bold") << "--- Performance optimisation:" << Tools::Color("reset") << Endl;
fLogger << Endl;
fLogger << "If the dimensionality exceeds, say, 4 input variables, it is " << Endl;
fLogger << "advisable to scrutinize the separation power of the variables," << Endl;
fLogger << "and to remove the weakest ones. If some among the input variables" << Endl;
fLogger << "can be described by a single cut (e.g., because signal tends to be" << Endl;
fLogger << "larger than background), this can be indicated to MethodCuts via" << Endl;
fLogger << "the \"FSmart\" option (see option string). Choosing this option" << Endl;
fLogger << "reduces the number of requirements for the variable from 2 (min/max)" << Endl;
fLogger << "to a single one (TMVA finds out whether it is to be interpreted as" << Endl;
fLogger << "min or max)." << Endl;
fLogger << Endl;
fLogger << Tools::Color("bold") << "--- Performance tuning via configuration options:" << Tools::Color("reset") << Endl;
fLogger << "" << Endl;
fLogger << "Monte Carlo sampling:" << Endl;
fLogger << "" << Endl;
fLogger << "Apart from the \"FSmart\" option for the variables, the only way" << Endl;
fLogger << "to improve the MC sampling is to increase the sampling rate. This" << Endl;
fLogger << "is done via the configuration option \"MC_NRandCuts\". The execution" << Endl;
fLogger << "time scales linearly with the sampling rate." << Endl;
fLogger << "" << Endl;
fLogger << "Genetic Algorithm:" << Endl;
fLogger << "" << Endl;
fLogger << "The algorithm terminates if no significant fitness increase has" << Endl;
fLogger << "been achieved within the last \"nsteps\" steps of the calculation." << Endl;
fLogger << "Wiggles in the ROC curve or a constant background rejection of 1" << Endl;
fLogger << "indicate that the GA did not always converge to the true maximum" << Endl;
fLogger << "fitness. In such a case, it is recommended to broaden the search " << Endl;
fLogger << "by increasing the population size (\"popSize\") and to give the GA " << Endl;
fLogger << "more time to find improvements by increasing the number of steps" << Endl;
fLogger << "(\"nsteps\"):" << Endl;
fLogger << " -> increase \"popSize\" (at least >10 * number of variables)" << Endl;
fLogger << " -> increase \"nsteps\"" << Endl;
fLogger << "" << Endl;
}