Logo ROOT   6.14/05
Reference Guide
DataLoader.h
Go to the documentation of this file.
1 // @(#)root/tmva $Id$
2 // Author: Andreas Hoecker, Peter Speckmayer, Joerg Stelzer, Helge Voss, Kai Voss, Eckhard von Toerne, Jan Therhaag, Omar Zapata, Lorenzo Moneta, Sergei Gleyzer
3 //NOTE: Based on TMVA::Factory
4 
5 /**********************************************************************************
6  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
7  * Package: TMVA *
8  * Class : DataLoader *
9  * Web : http://tmva.sourceforge.net *
10  * *
11  * Description: *
12  * This is a class to load datasets into every booked method *
13  * *
14  * Authors (alphabetical): *
15  * Lorenzo Moneta <Lorenzo.Moneta@cern.ch> - CERN, Switzerland *
16  * Omar Zapata <andresete.chaos@gmail.com> - ITM/UdeA, Colombia *
17  * Sergei Gleyzer <sergei.gleyzer@cern.ch> - CERN, Switzerland *
18  * *
19  * Copyright (c) 2005-2011: *
20  * CERN, Switzerland *
21  * ITM/UdeA, Colombia *
22  * *
23  * Redistribution and use in source and binary forms, with or without *
24  * modification, are permitted according to the terms listed in LICENSE *
25  * (http://tmva.sourceforge.net/LICENSE) *
26  **********************************************************************************/
27 
28 #ifndef ROOT_TMVA_DataLoader
29 #define ROOT_TMVA_DataLoader
30 
31 
32 #include <string>
33 #include <vector>
34 #include <map>
35 #include "TCut.h"
36 
37 #include "TMVA/Factory.h"
38 #include "TMVA/Types.h"
39 #include "TMVA/DataSet.h"
40 
41 class TFile;
42 class TTree;
43 class TDirectory;
44 class TH2;
45 
46 namespace TMVA {
47 
48  class CvSplit;
49  class DataInputHandler;
50  class DataSetInfo;
51  class DataSetManager;
52  class Envelope;
53  class MethodBase;
54  class IMethod;
55  class VariableTransformBase;
56  class VarTransformHandler;
57 
58  class DataLoader : public Configurable {
59  friend class Factory;
60  friend class Envelope;
61  public:
62 
63  DataLoader( TString thedlName="default");
64 
65  // default destructor
66  virtual ~DataLoader();
67 
68 
69  // add events to training and testing trees
70  void AddSignalTrainingEvent ( const std::vector<Double_t>& event, Double_t weight = 1.0 );
71  void AddBackgroundTrainingEvent( const std::vector<Double_t>& event, Double_t weight = 1.0 );
72  void AddSignalTestEvent ( const std::vector<Double_t>& event, Double_t weight = 1.0 );
73  void AddBackgroundTestEvent ( const std::vector<Double_t>& event, Double_t weight = 1.0 );
74  void AddTrainingEvent( const TString& className, const std::vector<Double_t>& event, Double_t weight );
75  void AddTestEvent ( const TString& className, const std::vector<Double_t>& event, Double_t weight );
76  void AddEvent ( const TString& className, Types::ETreeType tt, const std::vector<Double_t>& event, Double_t weight );
79 
81  DataSetInfo& AddDataSet( const TString& );
83  DataLoader* VarTransform(TString trafoDefinition);
84 
85  // special case: signal/background
86 
87  // Data input related
88  void SetInputTrees( const TString& signalFileName, const TString& backgroundFileName,
89  Double_t signalWeight=1.0, Double_t backgroundWeight=1.0 );
90  void SetInputTrees( TTree* inputTree, const TCut& SigCut, const TCut& BgCut );
91  // Set input trees at once
93  Double_t signalWeight=1.0, Double_t backgroundWeight=1.0) ;
94 
95  void AddSignalTree( TTree* signal, Double_t weight=1.0, Types::ETreeType treetype = Types::kMaxTreeType );
96  void AddSignalTree( TString datFileS, Double_t weight=1.0, Types::ETreeType treetype = Types::kMaxTreeType );
97  void AddSignalTree( TTree* signal, Double_t weight, const TString& treetype );
98 
99  // ... deprecated, kept for backwards compatibility
100  void SetSignalTree( TTree* signal, Double_t weight=1.0);
101 
103  void AddBackgroundTree( TString datFileB, Double_t weight=1.0, Types::ETreeType treetype = Types::kMaxTreeType );
104  void AddBackgroundTree( TTree* background, Double_t weight, const TString & treetype );
105 
106  // ... deprecated, kept for backwards compatibility
107  void SetBackgroundTree( TTree* background, Double_t weight=1.0 );
108 
109  void SetSignalWeightExpression( const TString& variable );
110  void SetBackgroundWeightExpression( const TString& variable );
111 
112  // special case: regression
113  void AddRegressionTree( TTree* tree, Double_t weight = 1.0,
114  Types::ETreeType treetype = Types::kMaxTreeType ) {
115  AddTree( tree, "Regression", weight, "", treetype );
116  }
117 
118  // general
119 
120  // Data input related
121  void SetTree( TTree* tree, const TString& className, Double_t weight ); // deprecated
122  void AddTree( TTree* tree, const TString& className, Double_t weight=1.0,
123  const TCut& cut = "",
125  void AddTree( TTree* tree, const TString& className, Double_t weight, const TCut& cut, const TString& treeType );
126 
127  // set input variable
128  void SetInputVariables ( std::vector<TString>* theVariables ); // deprecated
129  void AddVariable ( const TString& expression, const TString& title, const TString& unit,
130  char type='F', Double_t min = 0, Double_t max = 0 );
131  void AddVariable ( const TString& expression, char type='F',
132  Double_t min = 0, Double_t max = 0 );
133  void AddTarget ( const TString& expression, const TString& title = "", const TString& unit = "",
134  Double_t min = 0, Double_t max = 0 );
135  void AddRegressionTarget( const TString& expression, const TString& title = "", const TString& unit = "",
136  Double_t min = 0, Double_t max = 0 )
137  {
138  AddTarget( expression, title, unit, min, max );
139  }
140  void AddSpectator ( const TString& expression, const TString& title = "", const TString& unit = "",
141  Double_t min = 0, Double_t max = 0 );
142 
143  // set weight for class
144  void SetWeightExpression( const TString& variable, const TString& className = "" );
145 
146  // set cut for class
147  void SetCut( const TString& cut, const TString& className = "" );
148  void SetCut( const TCut& cut, const TString& className = "" );
149  void AddCut( const TString& cut, const TString& className = "" );
150  void AddCut( const TCut& cut, const TString& className = "" );
151 
152 
153  // prepare input tree for training
154  void PrepareTrainingAndTestTree( const TCut& cut, const TString& splitOpt );
155  void PrepareTrainingAndTestTree( TCut sigcut, TCut bkgcut, const TString& splitOpt );
156 
157  // ... deprecated, kept for backwards compatibility
158  void PrepareTrainingAndTestTree( const TCut& cut, Int_t Ntrain, Int_t Ntest = -1 );
159 
160  void PrepareTrainingAndTestTree( const TCut& cut, Int_t NsigTrain, Int_t NbkgTrain, Int_t NsigTest, Int_t NbkgTest,
161  const TString& otherOpt="SplitMode=Random:!V" );
162 
163  // Cross validation
164  void MakeKFoldDataSet(CvSplit & s);
167 
169 
170  TH2* GetCorrelationMatrix(const TString& className);
171 
172  // Copy method used in VI and CV. DEPRECATED: you can just call Clone instead, e.g. DataLoader *dl2=(DataLoader *)dl1->Clone("dl2")
174  friend void DataLoaderCopy(TMVA::DataLoader* des, TMVA::DataLoader* src);
176 
177  private:
178 
179 
182 
183 
184  private:
185 
186  // data members
187 
188 
190 
191 
193 
194  std::vector<TMVA::VariableTransformBase*> fDefaultTrfs; // list of transformations on default DataSet
195 
196  // cd to local directory
197  TString fOptions; // option string given by construction (presently only "V")
198  TString fTransformations; // List of transformations to test
199  Bool_t fVerbose; // verbose mode
200 
201  // flag determining the way training and test data are assigned to DataLoader
205  DataAssignType fDataAssignType; // flags for data assigning
206  std::vector<TTree*> fTrainAssignTree; // for each class: tmp tree if user wants to assign the events directly
207  std::vector<TTree*> fTestAssignTree; // for each class: tmp tree if user wants to assign the events directly
208 
209  Int_t fATreeType = 0; // type of event (=classIndex)
210  Float_t fATreeWeight = 0.0; // weight of the event
211  std::vector<Float_t> fATreeEvent; // event variables
212 
213  Types::EAnalysisType fAnalysisType; // the training type
214 
215  protected:
216 
217  ClassDef(DataLoader,4);
218  };
220 } // namespace TMVA
221 
222 #endif
223 
void AddBackgroundTree(TTree *background, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
add background tree with the given event weight and tree type
Definition: DataLoader.cxx:408
DataSetManager * fDataSetManager
Definition: DataLoader.h:189
virtual ~DataLoader()
Definition: DataLoader.cxx:103
auto * tt
Definition: textangle.C:16
void AddTrainingEvent(const TString &className, const std::vector< Double_t > &event, Double_t weight)
add training event for the class given by className
Definition: DataLoader.cxx:266
std::vector< TMVA::VariableTransformBase * > fDefaultTrfs
Definition: DataLoader.h:194
DataLoader(TString thedlName="default")
Definition: DataLoader.cxx:87
float Float_t
Definition: RtypesCore.h:53
void AddRegressionTarget(const TString &expression, const TString &title="", const TString &unit="", Double_t min=0, Double_t max=0)
Definition: DataLoader.h:135
DataSetInfo & GetDataSetInfo()
Definition: DataLoader.cxx:144
A ROOT file is a suite of consecutive data records (TKey instances) with a well defined format...
Definition: TFile.h:47
EAnalysisType
Definition: Types.h:127
TTree * CreateEventAssignTrees(const TString &name)
create the data assignment tree (for event-wise data assignment by user)
Definition: DataLoader.cxx:201
DataSetInfo & DefaultDataSetInfo()
default creation
Definition: DataLoader.cxx:530
DataLoader * VarTransform(TString trafoDefinition)
Transforms the variables and return a new DataLoader with the transformed variables.
Definition: DataLoader.cxx:153
Basic string class.
Definition: TString.h:131
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
void SetBackgroundTree(TTree *background, Double_t weight=1.0)
Definition: DataLoader.cxx:445
DataInputHandler * fDataInputHandler
Definition: DataLoader.h:192
Types::EAnalysisType fAnalysisType
Definition: DataLoader.h:213
void AddBackgroundTestEvent(const std::vector< Double_t > &event, Double_t weight=1.0)
add background testing event
Definition: DataLoader.cxx:258
TH2 * GetCorrelationMatrix(const TString &className)
returns the correlation matrix of datasets
Definition: DataLoader.cxx:714
void AddVariable(const TString &expression, const TString &title, const TString &unit, char type='F', Double_t min=0, Double_t max=0)
user inserts discriminating variable in data set info
Definition: DataLoader.cxx:491
void MakeKFoldDataSet(CvSplit &s)
Function required to split the training and testing datasets into a number of folds.
Definition: DataLoader.cxx:659
#define ClassDef(name, id)
Definition: Rtypes.h:320
void AddTestEvent(const TString &className, const std::vector< Double_t > &event, Double_t weight)
add test event for the class given by className
Definition: DataLoader.cxx:274
void SetInputTrees(const TString &signalFileName, const TString &backgroundFileName, Double_t signalWeight=1.0, Double_t backgroundWeight=1.0)
Definition: DataLoader.cxx:470
void SetTree(TTree *tree, const TString &className, Double_t weight)
set tree for the class given by className (deprecated)
Definition: DataLoader.cxx:453
Abstract base class for all high level ml algorithms, you can book ml methods like BDT...
Definition: Envelope.h:43
Class that contains all the data information.
Definition: DataSetInfo.h:60
void SetInputVariables(std::vector< TString > *theVariables)
fill input variables in data set
Definition: DataLoader.cxx:538
DataSetInfo & AddDataSet(DataSetInfo &)
Definition: DataLoader.cxx:126
void AddCut(const TString &cut, const TString &className="")
Definition: DataLoader.cxx:585
A specialized string object used for TTree selections.
Definition: TCut.h:25
void SetInputTreesFromEventAssignTrees()
assign event-wise local trees to data set
Definition: DataLoader.cxx:325
Float_t fATreeWeight
Definition: DataLoader.h:210
DataInputHandler & DataInput()
Definition: DataLoader.h:175
Service class for 2-Dim histogram classes.
Definition: TH2.h:30
Class that contains all the data information.
unsigned int UInt_t
Definition: RtypesCore.h:42
std::vector< TTree * > fTestAssignTree
Definition: DataLoader.h:207
Bool_t UserAssignEvents(UInt_t clIndex)
Definition: DataLoader.cxx:317
std::vector< Float_t > fATreeEvent
Definition: DataLoader.h:211
void AddRegressionTree(TTree *tree, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
Definition: DataLoader.h:113
This is the main MVA steering class.
Definition: Factory.h:81
DataLoader * MakeCopy(TString name)
Copy method used in VI and CV.
Definition: DataLoader.cxx:688
const DataSetInfo & GetDefaultDataSetInfo()
Definition: DataLoader.h:168
void AddTree(TTree *tree, const TString &className, Double_t weight=1.0, const TCut &cut="", Types::ETreeType tt=Types::kMaxTreeType)
Definition: DataLoader.cxx:357
void PrepareTrainingAndTestTree(const TCut &cut, const TString &splitOpt)
prepare the training and test trees -> same cuts for signal and background
Definition: DataLoader.cxx:629
DataAssignType fDataAssignType
Definition: DataLoader.h:205
TString fTransformations
Definition: DataLoader.h:198
double Double_t
Definition: RtypesCore.h:55
void AddEvent(const TString &className, Types::ETreeType tt, const std::vector< Double_t > &event, Double_t weight)
add event vector event : the order of values is: variables + targets + spectators ...
Definition: DataLoader.cxx:283
Class that contains all the data information.
Describe directory structure in memory.
Definition: TDirectory.h:34
void SetBackgroundWeightExpression(const TString &variable)
Definition: DataLoader.cxx:553
int type
Definition: TGX11.cxx:120
void AddTarget(const TString &expression, const TString &title="", const TString &unit="", Double_t min=0, Double_t max=0)
user inserts target in data set info
Definition: DataLoader.cxx:509
void SetWeightExpression(const TString &variable, const TString &className="")
Definition: DataLoader.cxx:560
void AddBackgroundTrainingEvent(const std::vector< Double_t > &event, Double_t weight=1.0)
add background training event
Definition: DataLoader.cxx:250
static constexpr double s
void PrepareFoldDataSet(CvSplit &s, UInt_t foldNumber, Types::ETreeType tt=Types::kTraining)
Function for assigning the correct folds to the testing or training set.
Definition: DataLoader.cxx:667
void SetSignalWeightExpression(const TString &variable)
Definition: DataLoader.cxx:546
Abstract ClassifierFactory template that handles arbitrary types.
std::vector< TTree * > fTrainAssignTree
Definition: DataLoader.h:206
void AddSignalTestEvent(const std::vector< Double_t > &event, Double_t weight=1.0)
add signal testing event
Definition: DataLoader.cxx:242
void AddSignalTrainingEvent(const std::vector< Double_t > &event, Double_t weight=1.0)
add signal training event
Definition: DataLoader.cxx:234
friend void DataLoaderCopy(TMVA::DataLoader *des, TMVA::DataLoader *src)
void RecombineKFoldDataSet(CvSplit &s, Types::ETreeType tt=Types::kTraining)
Recombines the dataset.
Definition: DataLoader.cxx:680
TString fOptions
Definition: DataLoader.h:197
void SetSignalTree(TTree *signal, Double_t weight=1.0)
Definition: DataLoader.cxx:438
Definition: tree.py:1
A TTree object has a header with a name and a title.
Definition: TTree.h:70
void AddSignalTree(TTree *signal, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
add signal tree with the given event weight and tree type
Definition: DataLoader.cxx:377
void SetCut(const TString &cut, const TString &className="")
Definition: DataLoader.cxx:572
char name[80]
Definition: TGX11.cxx:109
void AddSpectator(const TString &expression, const TString &title="", const TString &unit="", Double_t min=0, Double_t max=0)
user inserts spectator in data set info
Definition: DataLoader.cxx:521