DataSetFactory.cxx

// @(#)root/tmva $Id$
// Author: Andreas Hoecker, Peter Speckmayer, Joerg Stelzer, Eckhard von Toerne, Helge Voss

/*****************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
 * Package: TMVA                                                             *
 * Class  : DataSetFactory                                                   *
 * Web    : http://tmva.sourceforge.net                                      *
 *                                                                           *
 * Description:                                                              *
 *      Implementation (see header for description)                         *
 *                                                                           *
 * Authors (alphabetical):                                                   *
 *      Andreas Hoecker    <Andreas.Hocker@cern.ch>   - CERN, Switzerland    *
 *      Peter Speckmayer   <Peter.Speckmayer@cern.ch> - CERN, Switzerland    *
 *      Joerg Stelzer      <Joerg.Stelzer@cern.ch>    - MSU, USA             *
 *      Eckhard von Toerne <evt@physik.uni-bonn.de>   - U. of Bonn, Germany  *
 *      Helge Voss         <Helge.Voss@cern.ch>       - MPI-K Heidelberg, Germany *
 *                                                                           *
 * Copyright (c) 2009:                                                       *
 *      CERN, Switzerland                                                    *
 *      MPI-K Heidelberg, Germany                                            *
 *      U. of Bonn, Germany                                                  *
 * Redistribution and use in source and binary forms, with or without       *
 * modification, are permitted according to the terms listed in LICENSE     *
 * (http://tmva.sourceforge.net/LICENSE)                                     *
 *****************************************************************************/

/*! \class TMVA::DataSetFactory
\ingroup TMVA

Class that contains all the data information

*/

#include <assert.h>

#include <map>
#include <vector>
#include <iomanip>
#include <iostream>
#include <cfloat>   // FLT_MAX / FLT_MIN, used in CalcMinMax (editor's addition; may be pulled in transitively)

#include <algorithm>
#include <functional>
#include <numeric>
#include <random>

#include "TMVA/DataSetFactory.h"

#include "TEventList.h"
#include "TFile.h"
#include "TRandom3.h"
#include "TMatrixF.h"
#include "TVectorF.h"
#include "TMath.h"
#include "TTree.h"
#include "TBranch.h"
#include "TTreeFormula.h"   // editor's addition: TTreeFormula / TLeaf are used below
#include "TLeaf.h"          // (possibly already included transitively)

#include "TMVA/MsgLogger.h"
#include "TMVA/Configurable.h"
#include "TMVA/DataSet.h"
#include "TMVA/DataSetInfo.h"
#include "TMVA/DataInputHandler.h"
#include "TMVA/Event.h"

#include "TMVA/Tools.h"
#include "TMVA/Types.h"
#include "TMVA/VariableInfo.h"

using namespace std;

//TMVA::DataSetFactory* TMVA::DataSetFactory::fgInstance = 0;

namespace TMVA {
   // calculate the largest common divisor
   // this function is not happy if numbers are negative!
   Int_t LargestCommonDivider(Int_t a, Int_t b)
   {
      if (a<b) {Int_t tmp = a; a=b; b=tmp; } // achieve a>=b
      if (b==0) return a;
      Int_t fullFits = a/b;
      return LargestCommonDivider(b,a-b*fullFits);
   }
}
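
// [Editor's note, not part of the original source] The recursion above is
// Euclid's algorithm via repeated remainders; for example,
//    LargestCommonDivider(100, 60)
// recurses as LCD(60,40) -> LCD(40,20) -> LCD(20,0) and returns 20.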

////////////////////////////////////////////////////////////////////////////////
/// constructor

TMVA::DataSetFactory::DataSetFactory() :
   fVerbose(kFALSE),
   fVerboseLevel(TString("Info")),
   fScaleWithPreselEff(0),
   fCurrentTree(0),
   fCurrentEvtIdx(0),
   fInputFormulas(0),
   fLogger( new MsgLogger("DataSetFactory", kINFO) )
{
}

////////////////////////////////////////////////////////////////////////////////
/// destructor

TMVA::DataSetFactory::~DataSetFactory()
{
   std::vector<TTreeFormula*>::const_iterator formIt;

   for (formIt = fInputFormulas.begin()    ; formIt != fInputFormulas.end()    ; ++formIt) if (*formIt) delete *formIt;
   for (formIt = fTargetFormulas.begin()   ; formIt != fTargetFormulas.end()   ; ++formIt) if (*formIt) delete *formIt;
   for (formIt = fCutFormulas.begin()      ; formIt != fCutFormulas.end()      ; ++formIt) if (*formIt) delete *formIt;
   for (formIt = fWeightFormula.begin()    ; formIt != fWeightFormula.end()    ; ++formIt) if (*formIt) delete *formIt;
   for (formIt = fSpectatorFormulas.begin(); formIt != fSpectatorFormulas.end(); ++formIt) if (*formIt) delete *formIt;

   delete fLogger;
}

////////////////////////////////////////////////////////////////////////////////
/// steering the creation of a new dataset

TMVA::DataSet* TMVA::DataSetFactory::CreateDataSet( TMVA::DataSetInfo& dsi,
                                                    TMVA::DataInputHandler& dataInput )
{
   // build the first dataset from the data input
   DataSet * ds = BuildInitialDataSet( dsi, dataInput );

   if (ds->GetNEvents() > 1 && fComputeCorrelations ) {
      CalcMinMax(ds,dsi);

      // from the final dataset build the correlation matrix
      for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
         const TString className = dsi.GetClassInfo(cl)->GetName();
         dsi.SetCorrelationMatrix( className, CalcCorrelationMatrix( ds, cl ) );
         if (fCorrelations) {
            dsi.PrintCorrelationMatrix(className);
         }
      }
      //Log() << kHEADER << Endl;
      Log() << kHEADER << Form("[%s] : ",dsi.GetName()) << " " << Endl << Endl;
   }

   return ds;
}
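
// [Editor's sketch, not part of the original source] CreateDataSet() is
// normally reached indirectly through the public TMVA workflow; sigTree and
// bkgTree below are hypothetical user TTrees:
//
//    TMVA::DataLoader loader("dataset");
//    loader.AddVariable("var1", 'F');
//    loader.AddSignalTree(sigTree, 1.0);
//    loader.AddBackgroundTree(bkgTree, 1.0);
//    loader.PrepareTrainingAndTestTree("",   // no preselection cut
//       "nTrain_Signal=0:nTest_Signal=0:SplitMode=Random:NormMode=EqualNumEvents:!V");
//
// The option string of the last call is what InitOptions() below parses.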

////////////////////////////////////////////////////////////////////////////////

TMVA::DataSet* TMVA::DataSetFactory::BuildDynamicDataSet( TMVA::DataSetInfo& dsi )
{
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "Build DataSet consisting of one Event with dynamically changing variables" << Endl;
   DataSet* ds = new DataSet(dsi);

   // create a DataSet with one Event which uses dynamic variables
   // (pointers to variables)
   if(dsi.GetNClasses()==0){
      dsi.AddClass( "data" );
      dsi.GetClassInfo( "data" )->SetNumber(0);
   }

   std::vector<Float_t*>* evdyn = new std::vector<Float_t*>(0);

   std::vector<VariableInfo>& varinfos = dsi.GetVariableInfos();

   if (varinfos.empty())
      Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName()) << "Dynamic data set cannot be built, since no variable information is present. Apparently no variables have been set. This should not happen, please contact the TMVA authors." << Endl;

   std::vector<VariableInfo>::iterator it = varinfos.begin(), itEnd=varinfos.end();
   for (;it!=itEnd;++it) {
      Float_t* external=(Float_t*)(*it).GetExternalLink();
      if (external==0)
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "The link to the external variable is NULL while I am trying to build a dynamic data set. In this case fTmpEvent from MethodBase HAS TO BE USED in the method to get useful values in variables." << Endl;
      else evdyn->push_back (external);
   }

   std::vector<VariableInfo>& spectatorinfos = dsi.GetSpectatorInfos();
   it = spectatorinfos.begin();
   for (;it!=spectatorinfos.end();++it) evdyn->push_back( (Float_t*)(*it).GetExternalLink() );

   TMVA::Event * ev = new Event((const std::vector<Float_t*>*&)evdyn, varinfos.size());
   std::vector<Event*>* newEventVector = new std::vector<Event*>;
   newEventVector->push_back(ev);

   ds->SetEventCollection(newEventVector, Types::kTraining);
   ds->SetCurrentEvent( 0 );

   delete newEventVector;
   return ds;
}

////////////////////////////////////////////////////////////////////////////////
/// if no entries, then create a DataSet with one Event which uses
/// dynamic variables (pointers to variables)

TMVA::DataSet* TMVA::DataSetFactory::BuildInitialDataSet( DataSetInfo& dsi,
                                                          DataInputHandler& dataInput )
{
   if (dataInput.GetEntries()==0) return BuildDynamicDataSet( dsi );
   // -------------------------------------------------------------------------

   // register the classes in the datasetinfo-object
   // information comes from the trees in the dataInputHandler-object
   std::vector< TString >* classList = dataInput.GetClassList();
   for (std::vector<TString>::iterator it = classList->begin(); it < classList->end(); ++it) {
      dsi.AddClass( (*it) );
   }
   delete classList;

   EvtStatsPerClass eventCounts(dsi.GetNClasses());
   TString normMode;
   TString splitMode;
   TString mixMode;
   UInt_t  splitSeed;

   InitOptions( dsi, eventCounts, normMode, splitSeed, splitMode , mixMode );
   // ======= build event-vector from input, apply preselection ===============
   EventVectorOfClassesOfTreeType tmpEventVector;
   BuildEventVector( dsi, dataInput, tmpEventVector, eventCounts );

   DataSet* ds = MixEvents( dsi, tmpEventVector, eventCounts,
                            splitMode, mixMode, normMode, splitSeed );

   const Bool_t showCollectedOutput = kFALSE;
   if (showCollectedOutput) {
      Int_t maxL = dsi.GetClassNameMaxLength();
      Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Collected:" << Endl;
      for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
         Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << " training entries: " << ds->GetNClassEvents( 0, cl ) << Endl;
         Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << " testing entries: " << ds->GetNClassEvents( 1, cl ) << Endl;
      }
      Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " " << Endl;
   }

   return ds;
}

////////////////////////////////////////////////////////////////////////////////
/// checks a TTreeFormula for problems

Bool_t TMVA::DataSetFactory::CheckTTreeFormula( TTreeFormula* ttf,
                                                const TString& expression,
                                                Bool_t& hasDollar )
{
   Bool_t worked = kTRUE;

   if( ttf->GetNdim() <= 0 )
      Log() << kFATAL << "Expression " << expression.Data()
            << " could not be resolved to a valid formula. " << Endl;
   if( ttf->GetNdata() == 0 ){
      Log() << kWARNING << "Expression: " << expression.Data()
            << " does not provide data for this event. "
            << "This event is not taken into account. --> please check if you use as a variable "
            << "an entry of an array which is not filled for some events "
            << "(e.g. arr[4] when arr has only 3 elements)." << Endl;
      Log() << kWARNING << "If you want to take the event into account you can do something like: "
            << "\"Alt$(arr[4],0)\" where in cases where arr doesn't have a 4th element, "
            << " 0 is taken as an alternative." << Endl;
      worked = kFALSE;
   }
   if( expression.Contains("$") )
      hasDollar = kTRUE;
   else
   {
      for (int i = 0, iEnd = ttf->GetNcodes (); i < iEnd; ++i)
      {
         TLeaf* leaf = ttf->GetLeaf (i);
         if (!leaf->IsOnTerminalBranch())
            hasDollar = kTRUE;
      }
   }
   return worked;
}
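
// [Editor's example, not part of the original source] The Alt$ guard
// recommended in the warning above would be booked roughly as
//    dataloader->AddVariable("Alt$(arr[4],0)", 'F');
// (dataloader being a hypothetical TMVA::DataLoader*), so that events where
// arr has fewer than 5 elements contribute 0 instead of being skipped.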

////////////////////////////////////////////////////////////////////////////////
/// While the data gets copied into the local training and testing
/// trees, the input tree can change (for instance when changing from
/// signal to background tree, or using TChains as input). The
/// TTreeFormulas that hold the input expressions need to be
/// re-associated with the new tree, which is done here

void TMVA::DataSetFactory::ChangeToNewTree( TreeInfo& tinfo, const DataSetInfo & dsi )
{
   TTree *tr = tinfo.GetTree()->GetTree();

   //tr->SetBranchStatus("*",1); // not needed when using TTreeFormula
   tr->ResetBranchAddresses();

   Bool_t hasDollar = kTRUE; // set to false if one wants to enable only the branches that are actually used

   // 1) the input variable formulas
   Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " create input formulas for tree " << tr->GetName() << Endl;
   std::vector<TTreeFormula*>::const_iterator formIt, formItEnd;
   for (formIt = fInputFormulas.begin(), formItEnd=fInputFormulas.end(); formIt!=formItEnd; ++formIt) if (*formIt) delete *formIt;
   fInputFormulas.clear();
   TTreeFormula* ttf = 0;
   fInputTableFormulas.clear(); // this contains shallow pointer copies

   bool firstArrayVar = kTRUE;
   int firstArrayVarIndex = -1;
   int arraySize = -1;
   for (UInt_t i = 0; i < dsi.GetNVariables(); i++) {

      // create TTreeFormula
      if (! dsi.IsVariableFromArray(i) ) {
         ttf = new TTreeFormula(Form("Formula%s", dsi.GetVariableInfo(i).GetInternalName().Data()),
                                dsi.GetVariableInfo(i).GetExpression().Data(), tr);
         CheckTTreeFormula(ttf, dsi.GetVariableInfo(i).GetExpression(), hasDollar);
         fInputFormulas.emplace_back(ttf);
         fInputTableFormulas.emplace_back(std::make_pair(ttf, (Int_t) 0));
      } else {
         // it is a variable from an array
         if (firstArrayVar) {

            // create a new TTreeFormula
            ttf = new TTreeFormula(Form("Formula%s", dsi.GetVariableInfo(i).GetInternalName().Data()),
                                   dsi.GetVariableInfo(i).GetExpression().Data(), tr);
            CheckTTreeFormula(ttf, dsi.GetVariableInfo(i).GetExpression(), hasDollar);
            fInputFormulas.push_back(ttf);

            arraySize = dsi.GetVarArraySize(dsi.GetVariableInfo(i).GetExpression());
            firstArrayVar = kFALSE;
            firstArrayVarIndex = i;

            Log() << kINFO << "Using variable " << dsi.GetVariableInfo(i).GetInternalName() <<
               " from array expression " << dsi.GetVariableInfo(i).GetExpression() << " of size " << arraySize << Endl;
         }
         fInputTableFormulas.push_back(std::make_pair(ttf, (Int_t) i-firstArrayVarIndex));
         if (int(i)-firstArrayVarIndex == arraySize-1 ) {
            // I am the last element of the array
            firstArrayVar = kTRUE;
            firstArrayVarIndex = -1;
            Log() << kDEBUG << "Using last variable from array : " << dsi.GetVariableInfo(i).GetInternalName() << Endl;
         }
      }

   }

   //
   // targets
   //
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform regression targets" << Endl;
   for (formIt = fTargetFormulas.begin(), formItEnd = fTargetFormulas.end(); formIt!=formItEnd; ++formIt) if (*formIt) delete *formIt;
   fTargetFormulas.clear();
   for (UInt_t i=0; i<dsi.GetNTargets(); i++) {
      ttf = new TTreeFormula( Form( "Formula%s", dsi.GetTargetInfo(i).GetInternalName().Data() ),
                              dsi.GetTargetInfo(i).GetExpression().Data(), tr );
      CheckTTreeFormula( ttf, dsi.GetTargetInfo(i).GetExpression(), hasDollar );
      fTargetFormulas.push_back( ttf );
   }

   //
   // spectators
   //
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform spectator variables" << Endl;
   for (formIt = fSpectatorFormulas.begin(), formItEnd = fSpectatorFormulas.end(); formIt!=formItEnd; ++formIt) if (*formIt) delete *formIt;
   fSpectatorFormulas.clear();
   for (UInt_t i=0; i<dsi.GetNSpectators(); i++) {
      ttf = new TTreeFormula( Form( "Formula%s", dsi.GetSpectatorInfo(i).GetInternalName().Data() ),
                              dsi.GetSpectatorInfo(i).GetExpression().Data(), tr );
      CheckTTreeFormula( ttf, dsi.GetSpectatorInfo(i).GetExpression(), hasDollar );
      fSpectatorFormulas.push_back( ttf );
   }

   //
   // the cuts (one per class, if non-existent: formula pointer = 0)
   //
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform cuts" << Endl;
   for (formIt = fCutFormulas.begin(), formItEnd = fCutFormulas.end(); formIt!=formItEnd; ++formIt) if (*formIt) delete *formIt;
   fCutFormulas.clear();
   for (UInt_t clIdx=0; clIdx<dsi.GetNClasses(); clIdx++) {
      const TCut& tmpCut = dsi.GetClassInfo(clIdx)->GetCut();
      const TString tmpCutExp(tmpCut.GetTitle());
      ttf = 0;
      if (tmpCutExp!="") {
         ttf = new TTreeFormula( Form("CutClass%i",clIdx), tmpCutExp, tr );
         Bool_t worked = CheckTTreeFormula( ttf, tmpCutExp, hasDollar );
         if( !worked ){
            Log() << kWARNING << "Please check class \"" << dsi.GetClassInfo(clIdx)->GetName()
                  << "\" cut \"" << dsi.GetClassInfo(clIdx)->GetCut() << Endl;
         }
      }
      fCutFormulas.push_back( ttf );
   }

   //
   // the weights (one per class, if non-existent: formula pointer = 0)
   //
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform weights" << Endl;
   for (formIt = fWeightFormula.begin(), formItEnd = fWeightFormula.end(); formIt!=formItEnd; ++formIt) if (*formIt) delete *formIt;
   fWeightFormula.clear();
   for (UInt_t clIdx=0; clIdx<dsi.GetNClasses(); clIdx++) {
      const TString tmpWeight = dsi.GetClassInfo(clIdx)->GetWeight();

      if (dsi.GetClassInfo(clIdx)->GetName() != tinfo.GetClassName() ) { // if the tree is of another class
         fWeightFormula.push_back( 0 );
         continue;
      }

      ttf = 0;
      if (tmpWeight!="") {
         ttf = new TTreeFormula( "FormulaWeight", tmpWeight, tr );
         Bool_t worked = CheckTTreeFormula( ttf, tmpWeight, hasDollar );
         if( !worked ){
            Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName()) << "Please check class \"" << dsi.GetClassInfo(clIdx)->GetName()
                  << "\" weight \"" << dsi.GetClassInfo(clIdx)->GetWeight() << Endl;
         }
      }
      else {
         ttf = 0;
      }
      fWeightFormula.push_back( ttf );
   }
   return;
   // all the code below is not needed when using TTreeFormula

   Log() << kDEBUG << Form("Dataset[%s] : ", dsi.GetName()) << "enable branches" << Endl;
   // now enable only branches that are needed in any input formula, target, cut, weight

   if (!hasDollar) {
      tr->SetBranchStatus("*",0);
      Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: input variables" << Endl;
      // input vars
      for (formIt = fInputFormulas.begin(); formIt!=fInputFormulas.end(); ++formIt) {
         ttf = *formIt;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++) {
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
         }
      }
      // targets
      Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: targets" << Endl;
      for (formIt = fTargetFormulas.begin(); formIt!=fTargetFormulas.end(); ++formIt) {
         ttf = *formIt;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
      // spectators
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: spectators" << Endl;
      for (formIt = fSpectatorFormulas.begin(); formIt!=fSpectatorFormulas.end(); ++formIt) {
         ttf = *formIt;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
      // cuts
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: cuts" << Endl;
      for (formIt = fCutFormulas.begin(); formIt!=fCutFormulas.end(); ++formIt) {
         ttf = *formIt;
         if (!ttf) continue;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
      // weights
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: weights" << Endl;
      for (formIt = fWeightFormula.begin(); formIt!=fWeightFormula.end(); ++formIt) {
         ttf = *formIt;
         if (!ttf) continue;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
   }
   Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "tree initialized" << Endl;
   return;
}
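
// [Editor's sketch, not part of the original source] A minimal illustration
// of why the re-association above is necessary: a TChain presents a different
// underlying TTree per input file, so a formula bound to one tree goes stale.
//
//    TChain chain("tree");
//    chain.Add("file_*.root");                 // hypothetical input files
//    TTreeFormula f("f", "var1*var2", &chain);
//    chain.LoadTree(entry);                    // may switch to the next file...
//    f.UpdateFormulaLeaves();                  // ...so the leaves must be re-bound
//
// ChangeToNewTree() goes one step further and rebuilds the formulas outright.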

////////////////////////////////////////////////////////////////////////////////
/// compute min and max values of all variables, targets and spectators

void TMVA::DataSetFactory::CalcMinMax( DataSet* ds, TMVA::DataSetInfo& dsi )
{
   const UInt_t nvar  = ds->GetNVariables();
   const UInt_t ntgts = ds->GetNTargets();
   const UInt_t nvis  = ds->GetNSpectators();

   Float_t *min   = new Float_t[nvar];
   Float_t *max   = new Float_t[nvar];
   Float_t *tgmin = new Float_t[ntgts];
   Float_t *tgmax = new Float_t[ntgts];
   Float_t *vmin  = new Float_t[nvis];
   Float_t *vmax  = new Float_t[nvis];

   for (UInt_t ivar=0; ivar<nvar ; ivar++) { min[ivar]   = FLT_MAX; max[ivar]   = -FLT_MAX; }
   for (UInt_t ivar=0; ivar<ntgts; ivar++) { tgmin[ivar] = FLT_MAX; tgmax[ivar] = -FLT_MAX; }
   for (UInt_t ivar=0; ivar<nvis ; ivar++) { vmin[ivar]  = FLT_MAX; vmax[ivar]  = -FLT_MAX; }

   // perform event loop

   for (Int_t i=0; i<ds->GetNEvents(); i++) {
      const Event * ev = ds->GetEvent(i);
      for (UInt_t ivar=0; ivar<nvar; ivar++) {
         Double_t v = ev->GetValue(ivar);
         if (v<min[ivar]) min[ivar] = v;
         if (v>max[ivar]) max[ivar] = v;
      }
      for (UInt_t itgt=0; itgt<ntgts; itgt++) {
         Double_t v = ev->GetTarget(itgt);
         if (v<tgmin[itgt]) tgmin[itgt] = v;
         if (v>tgmax[itgt]) tgmax[itgt] = v;
      }
      for (UInt_t ivis=0; ivis<nvis; ivis++) {
         Double_t v = ev->GetSpectator(ivis);
         if (v<vmin[ivis]) vmin[ivis] = v;
         if (v>vmax[ivis]) vmax[ivis] = v;
      }
   }

   for (UInt_t ivar=0; ivar<nvar; ivar++) {
      dsi.GetVariableInfo(ivar).SetMin(min[ivar]);
      dsi.GetVariableInfo(ivar).SetMax(max[ivar]);
      if( TMath::Abs(max[ivar]-min[ivar]) <= FLT_MIN )
         Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName()) << "Variable " << dsi.GetVariableInfo(ivar).GetExpression().Data() << " is constant. Please remove the variable." << Endl;
   }
   for (UInt_t ivar=0; ivar<ntgts; ivar++) {
      dsi.GetTargetInfo(ivar).SetMin(tgmin[ivar]);
      dsi.GetTargetInfo(ivar).SetMax(tgmax[ivar]);
      if( TMath::Abs(tgmax[ivar]-tgmin[ivar]) <= FLT_MIN )
         Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName()) << "Target " << dsi.GetTargetInfo(ivar).GetExpression().Data() << " is constant. Please remove the variable." << Endl;
   }
   for (UInt_t ivar=0; ivar<nvis; ivar++) {
      dsi.GetSpectatorInfo(ivar).SetMin(vmin[ivar]);
      dsi.GetSpectatorInfo(ivar).SetMax(vmax[ivar]);
      // if( TMath::Abs(vmax[ivar]-vmin[ivar]) <= FLT_MIN )
      //    Log() << kWARNING << "Spectator variable " << dsi.GetSpectatorInfo(ivar).GetExpression().Data() << " is constant." << Endl;
   }
   delete [] min;
   delete [] max;
   delete [] tgmin;
   delete [] tgmax;
   delete [] vmin;
   delete [] vmax;
}

////////////////////////////////////////////////////////////////////////////////
/// computes correlation matrix for variables "theVars" in tree;
/// "theType" defines the required event "type"
/// ("type" variable must be present in tree)

TMatrixD* TMVA::DataSetFactory::CalcCorrelationMatrix( DataSet* ds, const UInt_t classNumber )
{
   // first compute variance-covariance
   TMatrixD* mat = CalcCovarianceMatrix( ds, classNumber );

   // now the correlation
   UInt_t nvar = ds->GetNVariables(), ivar, jvar;

   for (ivar=0; ivar<nvar; ivar++) {
      for (jvar=0; jvar<nvar; jvar++) {
         if (ivar != jvar) {
            Double_t d = (*mat)(ivar, ivar)*(*mat)(jvar, jvar);
            if (d > 0) (*mat)(ivar, jvar) /= sqrt(d);
            else {
               Log() << kWARNING << Form("Dataset[%s] : ",DataSetInfo().GetName())<< "<GetCorrelationMatrix> Zero variances for variables "
                     << "(" << ivar << ", " << jvar << ") = " << d
                     << Endl;
               (*mat)(ivar, jvar) = 0;
            }
         }
      }
   }

   for (ivar=0; ivar<nvar; ivar++) (*mat)(ivar, ivar) = 1.0;

   return mat;
}
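
// [Editor's note, not part of the original source] The loop above applies the
// standard normalisation that turns a covariance matrix C into a correlation
// matrix:
//    rho(i,j) = C(i,j) / sqrt( C(i,i) * C(j,j) ),   rho(i,i) = 1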

////////////////////////////////////////////////////////////////////////////////
/// compute covariance matrix

TMatrixD* TMVA::DataSetFactory::CalcCovarianceMatrix( DataSet* ds, const UInt_t classNumber )
{
   UInt_t nvar = ds->GetNVariables();
   UInt_t ivar = 0, jvar = 0;

   TMatrixD* mat = new TMatrixD( nvar, nvar );

   // init matrices
   TVectorD vec(nvar);
   TMatrixD mat2(nvar, nvar);
   for (ivar=0; ivar<nvar; ivar++) {
      vec(ivar) = 0;
      for (jvar=0; jvar<nvar; jvar++) mat2(ivar, jvar) = 0;
   }

   // perform event loop
   Double_t ic = 0;
   for (Int_t i=0; i<ds->GetNEvents(); i++) {

      const Event * ev = ds->GetEvent(i);
      if (ev->GetClass() != classNumber ) continue;

      Double_t weight = ev->GetWeight();
      ic += weight; // count used events

      for (ivar=0; ivar<nvar; ivar++) {

         Double_t xi = ev->GetValue(ivar);
         vec(ivar) += xi*weight;
         mat2(ivar, ivar) += (xi*xi*weight);

         for (jvar=ivar+1; jvar<nvar; jvar++) {
            Double_t xj = ev->GetValue(jvar);
            mat2(ivar, jvar) += (xi*xj*weight);
         }
      }
   }

   for (ivar=0; ivar<nvar; ivar++)
      for (jvar=ivar+1; jvar<nvar; jvar++)
         mat2(jvar, ivar) = mat2(ivar, jvar); // symmetric matrix


   // variance-covariance
   for (ivar=0; ivar<nvar; ivar++) {
      for (jvar=0; jvar<nvar; jvar++) {
         (*mat)(ivar, jvar) = mat2(ivar, jvar)/ic - vec(ivar)*vec(jvar)/(ic*ic);
      }
   }

   return mat;
}
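
// [Editor's note, not part of the original source] With event weights w_k and
// variable values x_ki, the estimator implemented above is
//    C(i,j) = (sum_k w_k x_ki x_kj) / W  -  (sum_k w_k x_ki)(sum_k w_k x_kj) / W^2
// where W = sum_k w_k is the weighted event count ('ic' in the code).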

// --------------------------------------- new versions

////////////////////////////////////////////////////////////////////////////////
/// the dataset splitting

void
TMVA::DataSetFactory::InitOptions( TMVA::DataSetInfo& dsi,
                                   EvtStatsPerClass& nEventRequests,
                                   TString& normMode,
                                   UInt_t& splitSeed,
                                   TString& splitMode,
                                   TString& mixMode)
{
   Configurable splitSpecs( dsi.GetSplitOptions() );
   splitSpecs.SetConfigName("DataSetFactory");
   splitSpecs.SetConfigDescription( "Configuration options given in the \"PrepareForTrainingAndTesting\" call; these options define the creation of the data sets used for training and expert validation by TMVA" );

   splitMode = "Random";    // the splitting mode
   splitSpecs.DeclareOptionRef( splitMode, "SplitMode",
                                "Method of picking training and testing events (default: random)" );
   splitSpecs.AddPreDefVal(TString("Random"));
   splitSpecs.AddPreDefVal(TString("Alternate"));
   splitSpecs.AddPreDefVal(TString("Block"));

   mixMode = "SameAsSplitMode";    // the mixing mode
   splitSpecs.DeclareOptionRef( mixMode, "MixMode",
                                "Method of mixing events of different classes into one dataset (default: SameAsSplitMode)" );
   splitSpecs.AddPreDefVal(TString("SameAsSplitMode"));
   splitSpecs.AddPreDefVal(TString("Random"));
   splitSpecs.AddPreDefVal(TString("Alternate"));
   splitSpecs.AddPreDefVal(TString("Block"));

   splitSeed = 100;
   splitSpecs.DeclareOptionRef( splitSeed, "SplitSeed",
                                "Seed for random event shuffling" );

   normMode = "EqualNumEvents";    // the weight normalisation modes
   splitSpecs.DeclareOptionRef( normMode, "NormMode",
                                "Overall renormalisation of event-by-event weights used in the training (NumEvents: average weight of 1 per event, independently for signal and background; EqualNumEvents: average weight of 1 per event for signal, and sum of weights for background equal to sum of weights for signal)" );
   splitSpecs.AddPreDefVal(TString("None"));
   splitSpecs.AddPreDefVal(TString("NumEvents"));
   splitSpecs.AddPreDefVal(TString("EqualNumEvents"));

   splitSpecs.DeclareOptionRef(fScaleWithPreselEff=kFALSE,"ScaleWithPreselEff","Scale the number of requested events by the eff. of the preselection cuts (or not)" );

   // the number of events

   // fill in the numbers
   for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
      TString clName = dsi.GetClassInfo(cl)->GetName();
      TString titleTrain = TString().Format("Number of training events of class %s (default: 0 = all)",clName.Data()).Data();
      TString titleTest  = TString().Format("Number of test events of class %s (default: 0 = all)",clName.Data()).Data();
      TString titleSplit = TString().Format("Split in training and test events of class %s (default: 0 = deactivated)",clName.Data()).Data();

      splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTrainingEventsRequested, TString("nTrain_")+clName, titleTrain );
      splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTestingEventsRequested , TString("nTest_")+clName , titleTest  );
      splitSpecs.DeclareOptionRef( nEventRequests.at(cl).TrainTestSplitRequested , TString("TrainTestSplit_")+clName , titleSplit );
   }

   splitSpecs.DeclareOptionRef( fVerbose, "V", "Verbosity (default: false)" );

   splitSpecs.DeclareOptionRef( fVerboseLevel=TString("Info"), "VerboseLevel", "VerboseLevel (Debug/Verbose/Info)" );
   splitSpecs.AddPreDefVal(TString("Debug"));
   splitSpecs.AddPreDefVal(TString("Verbose"));
   splitSpecs.AddPreDefVal(TString("Info"));

   fCorrelations = kTRUE;
   splitSpecs.DeclareOptionRef(fCorrelations, "Correlations", "Boolean to show correlation output (Default: true)");
   fComputeCorrelations = kTRUE;
   splitSpecs.DeclareOptionRef(fComputeCorrelations, "CalcCorrelations", "Compute correlations and also some variable statistics, e.g. min/max (Default: true )");

   splitSpecs.ParseOptions();
   splitSpecs.CheckForUnusedOptions();

   // output logging verbosity
   if (Verbose()) fLogger->SetMinType( kVERBOSE );
   if (fVerboseLevel.CompareTo("Debug")   == 0) fLogger->SetMinType( kDEBUG );
   if (fVerboseLevel.CompareTo("Verbose") == 0) fLogger->SetMinType( kVERBOSE );
   if (fVerboseLevel.CompareTo("Info")    == 0) fLogger->SetMinType( kINFO );

   // put all to upper case
   splitMode.ToUpper(); mixMode.ToUpper(); normMode.ToUpper();
   // adjust mixmode if same as splitmode option has been set
   Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
         << "\tSplitmode is: \"" << splitMode << "\" the mixmode is: \"" << mixMode << "\"" << Endl;
   if (mixMode=="SAMEASSPLITMODE") mixMode = splitMode;
   else if (mixMode!=splitMode)
      Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "DataSet splitmode="<<splitMode
            <<" differs from mixmode="<<mixMode<<Endl;
}
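
// [Editor's example, not part of the original source] A typical option string
// digested by InitOptions(), combining the options declared above (the class
// names Signal/Background are assumptions of this example):
//    "nTrain_Signal=4000:nTest_Signal=4000:nTrain_Background=4000:"
//    "nTest_Background=4000:SplitMode=Random:SplitSeed=100:NormMode=NumEvents:!V"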

////////////////////////////////////////////////////////////////////////////////
/// build empty event vectors
/// distributes events between kTraining/kTesting/kMaxTreeType

void
TMVA::DataSetFactory::BuildEventVector( TMVA::DataSetInfo& dsi,
                                        TMVA::DataInputHandler& dataInput,
                                        EventVectorOfClassesOfTreeType& eventsmap,
                                        EvtStatsPerClass& eventCounts)
{
   const UInt_t nclasses = dsi.GetNClasses();

   eventsmap[ Types::kTraining ]    = EventVectorOfClasses(nclasses);
   eventsmap[ Types::kTesting ]     = EventVectorOfClasses(nclasses);
   eventsmap[ Types::kMaxTreeType ] = EventVectorOfClasses(nclasses);
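
   // [Editor's note, not part of the original source] eventsmap is keyed by
   // tree type: events from trees the user registered explicitly as training
   // or testing land under kTraining / kTesting, while events whose
   // destination is still open are parked under kMaxTreeType and distributed
   // later by MixEvents().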

   // create the type, weight and boostweight branches
   const UInt_t nvars = dsi.GetNVariables();
   const UInt_t ntgts = dsi.GetNTargets();
   const UInt_t nvis  = dsi.GetNSpectators();

   for (size_t i=0; i<nclasses; i++) {
      eventCounts[i].varAvLength = new Float_t[nvars];
      for (UInt_t ivar=0; ivar<nvars; ivar++)
         eventCounts[i].varAvLength[ivar] = 0;
   }

   //Bool_t haveArrayVariable = kFALSE;
   //Bool_t *varIsArray = new Bool_t[nvars];

   // If there are NaNs in the tree:
   // => warn if used variables/cuts/weights contain nan (no problem if event is cut out)
   // => fatal if cut value is nan or (event not cut out and nans somewhere)
   // Count & collect all these warnings/errors and output them at the end.
   std::map<TString, int> nanInfWarnings;
   std::map<TString, int> nanInfErrors;

   // if we work with chains we need to remember the current tree: if
   // the chain jumps to a new tree we have to reset the formulas
   for (UInt_t cl=0; cl<nclasses; cl++) {

      //Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Create training and testing trees -- looping over class \"" << dsi.GetClassInfo(cl)->GetName() << "\" ..." << Endl;

      EventStats& classEventCounts = eventCounts[cl];

      // info output for weights
      Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
            << "\tWeight expression for class \'" << dsi.GetClassInfo(cl)->GetName() << "\': \""
            << dsi.GetClassInfo(cl)->GetWeight() << "\"" << Endl;

      // used for chains only
      TString currentFileName("");

      std::vector<TreeInfo>::const_iterator treeIt(dataInput.begin(dsi.GetClassInfo(cl)->GetName()));
      for (;treeIt!=dataInput.end(dsi.GetClassInfo(cl)->GetName()); ++treeIt) {

         // read first the variables
         std::vector<Float_t> vars(nvars);
         std::vector<Float_t> tgts(ntgts);
         std::vector<Float_t> vis(nvis);
         TreeInfo currentInfo = *treeIt;

         Log() << kINFO << "Building event vectors for type " << currentInfo.GetTreeType() << " " << currentInfo.GetClassName() << Endl;

         EventVector& event_v = eventsmap[currentInfo.GetTreeType()].at(cl);

         Bool_t isChain = (TString("TChain") == currentInfo.GetTree()->ClassName());
         currentInfo.GetTree()->LoadTree(0);
         // create the TTreeFormulas to evaluate later on each single event
         ChangeToNewTree( currentInfo, dsi );

         // count number of events in tree before cut
         classEventCounts.nInitialEvents += currentInfo.GetTree()->GetEntries();

         // loop over events in ntuple
         const UInt_t nEvts = currentInfo.GetTree()->GetEntries();
         for (Long64_t evtIdx = 0; evtIdx < nEvts; evtIdx++) {
            currentInfo.GetTree()->LoadTree(evtIdx);

            // may need to reload tree in case of chains
            if (isChain) {
               if (currentInfo.GetTree()->GetTree()->GetDirectory()->GetFile()->GetName() != currentFileName) {
                  currentFileName = currentInfo.GetTree()->GetTree()->GetDirectory()->GetFile()->GetName();
                  ChangeToNewTree( currentInfo, dsi );
               }
            }
            currentInfo.GetTree()->GetEntry(evtIdx);
            Int_t sizeOfArrays = 1;
            Int_t prevArrExpr = 0;
            Bool_t haveAllArrayData = kFALSE;

            // ======= evaluate all formulas =================

            // First check whether some of the formulas are arrays. This is the
            // case when all inputs (variables, targets and spectators) are
            // arrays, and a TMVA event is then not an event of the tree but an
            // event + array index; the flag haveAllArrayData is set to true.
            // Otherwise arrays of variables are supported where each element of
            // the array corresponds to a different variable, as in the case of
            // images: the VariableInfo then has a bit, IsVariableFromArray, that
            // is set, and a single formula serves the whole array.
            // fInputTableFormulas contains a map of the formula and the variable
            // index used to evaluate the formula.
            for (UInt_t ivar = 0; ivar < nvars; ivar++) {
               // distinguish case where variable is not from an array
               if (dsi.IsVariableFromArray(ivar)) continue;
               auto inputFormula = fInputTableFormulas[ivar].first;

               Int_t ndata = inputFormula->GetNdata();

               classEventCounts.varAvLength[ivar] += ndata;
               if (ndata == 1) continue;
               haveAllArrayData = kTRUE;
               //varIsArray[ivar] = kTRUE;
               //std::cout << "Found array !!!" << std::endl;
               if (sizeOfArrays == 1) {
                  sizeOfArrays = ndata;
                  prevArrExpr = ivar;
               }
               else if (sizeOfArrays!=ndata) {
                  Log() << kERROR << Form("Dataset[%s] : ",dsi.GetName())<< "ERROR while preparing training and testing trees:" << Endl;
                  Log() << Form("Dataset[%s] : ",dsi.GetName())<< "   multiple array-type expressions of different length were encountered" << Endl;
                  Log() << Form("Dataset[%s] : ",dsi.GetName())<< "   location of error: event " << evtIdx
                        << " in tree " << currentInfo.GetTree()->GetName()
                        << " of file " << currentInfo.GetTree()->GetCurrentFile()->GetName() << Endl;
                  Log() << Form("Dataset[%s] : ",dsi.GetName())<< "   expression " << inputFormula->GetTitle() << " has "
                        << Form("Dataset[%s] : ",dsi.GetName()) << ndata << " entries, while" << Endl;
                  Log() << Form("Dataset[%s] : ",dsi.GetName())<< "   expression " << fInputTableFormulas[prevArrExpr].first->GetTitle() << " has "
                        << Form("Dataset[%s] : ",dsi.GetName())<< fInputTableFormulas[prevArrExpr].first->GetNdata() << " entries" << Endl;
                  Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "Need to abort" << Endl;
               }
            }

            // now we read the information
            for (Int_t idata = 0; idata<sizeOfArrays; idata++) {
               Bool_t contains_NaN_or_inf = kFALSE;

               auto checkNanInf = [&](std::map<TString, int> &msgMap, Float_t value, const char *what, const char *formulaTitle) {
                  if (TMath::IsNaN(value)) {
                     contains_NaN_or_inf = kTRUE;
                     ++msgMap[TString::Format("Dataset[%s] : %s expression resolves to indeterminate value (NaN): %s", dsi.GetName(), what, formulaTitle)];
                  } else if (!TMath::Finite(value)) {
                     contains_NaN_or_inf = kTRUE;
                     ++msgMap[TString::Format("Dataset[%s] : %s expression resolves to infinite value (+inf or -inf): %s", dsi.GetName(), what, formulaTitle)];
                  }
               };

               TTreeFormula* formula = 0;

               // the cut expression
               Double_t cutVal = 1.;
               formula = fCutFormulas[cl];
               if (formula) {
                  Int_t ndata = formula->GetNdata();
                  cutVal = (ndata==1 ?
                            formula->EvalInstance(0) :
                            formula->EvalInstance(idata));
                  checkNanInf(nanInfErrors, cutVal, "Cut", formula->GetTitle());
               }

               // if event is cut out, add to warnings, else add to errors.
               auto &nanMessages = cutVal < 0.5 ? nanInfWarnings : nanInfErrors;

               // the input variables
               for (UInt_t ivar=0; ivar<nvars; ivar++) {
                  auto formulaMap = fInputTableFormulas[ivar];
                  formula = formulaMap.first;
                  int inputVarIndex = formulaMap.second;
                  formula->SetQuickLoad(true); // is this needed ???

                  vars[ivar] = ( !haveAllArrayData ?
                                 formula->EvalInstance(inputVarIndex) :
                                 formula->EvalInstance(idata));
                  checkNanInf(nanMessages, vars[ivar], "Input", formula->GetTitle());
               }

               // the targets
               for (UInt_t itrgt=0; itrgt<ntgts; itrgt++) {
                  formula = fTargetFormulas[itrgt];
                  Int_t ndata = formula->GetNdata();
                  tgts[itrgt] = (ndata == 1 ?
                                 formula->EvalInstance(0) :
                                 formula->EvalInstance(idata));
                  checkNanInf(nanMessages, tgts[itrgt], "Target", formula->GetTitle());
               }

               // the spectators
               for (UInt_t itVis=0; itVis<nvis; itVis++) {
                  formula = fSpectatorFormulas[itVis];
                  Int_t ndata = formula->GetNdata();
                  vis[itVis] = (ndata == 1 ?
                                formula->EvalInstance(0) :
                                formula->EvalInstance(idata));
                  checkNanInf(nanMessages, vis[itVis], "Spectator", formula->GetTitle());
               }


               // the weight
               Float_t weight = currentInfo.GetWeight(); // multiply by tree weight
               formula = fWeightFormula[cl];
               if (formula!=0) {
                  Int_t ndata = formula->GetNdata();
                  weight *= (ndata == 1 ?
                             formula->EvalInstance() :
                             formula->EvalInstance(idata));
                  checkNanInf(nanMessages, weight, "Weight", formula->GetTitle());
               }

               // Count the events before rejection due to cut or NaN
               // value (weighted and unweighted)
               classEventCounts.nEvBeforeCut++;
               if (!TMath::IsNaN(weight))
                  classEventCounts.nWeEvBeforeCut += weight;

               // apply the cut, skip rest if cut is not fulfilled
               if (cutVal<0.5) continue;

               // global flag if negative weights exist -> can be used
               // by classifiers who may require special data
               // treatment (also print warning)
               if (weight < 0) classEventCounts.nNegWeights++;

               // now read the event-values (variables and regression targets)

               if (contains_NaN_or_inf) {
                  Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "NaN or +-inf in Event " << evtIdx << Endl;
                  if (sizeOfArrays>1) Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< " rejected" << Endl;
                  continue;
               }

               // Count the events after rejection due to cut or NaN value
               // (weighted and unweighted)
               classEventCounts.nEvAfterCut++;
               classEventCounts.nWeEvAfterCut += weight;

               // event accepted, fill temporary ntuple
               event_v.push_back(new Event(vars, tgts , vis, cl , weight));
            }
         }
         currentInfo.GetTree()->ResetBranchAddresses();
      }
   }

   if (!nanInfWarnings.empty()) {
      Log() << kWARNING << "Found events with NaN and/or +-inf values" << Endl;
      for (const auto &warning : nanInfWarnings) {
         auto &log = Log() << kWARNING << warning.first;
         if (warning.second > 1) log << " (" << warning.second << " times)";
         log << Endl;
      }
      Log() << kWARNING << "These NaN and/or +-infs were all removed by the specified cut, continuing." << Endl;
      Log() << Endl;
   }

   if (!nanInfErrors.empty()) {
      Log() << kWARNING << "Found events with NaN and/or +-inf values (not removed by cut)" << Endl;
      for (const auto &error : nanInfErrors) {
         auto &log = Log() << kWARNING << error.first;
         if (error.second > 1) log << " (" << error.second << " times)";
         log << Endl;
      }
      Log() << kFATAL << "How am I supposed to train a NaN or +-inf?!" << Endl;
   }

   // for output format, get the maximum class name length
   Int_t maxL = dsi.GetClassNameMaxLength();

   Log() << kHEADER << Form("[%s] : ",dsi.GetName()) << "Number of events in input trees" << Endl;
   Log() << kDEBUG << "(after possible flattening of arrays):" << Endl;


   for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
      Log() << kDEBUG //<< Form("[%s] : ",dsi.GetName())
            << "    "
            << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
            << "      -- number of events       : "
            << std::setw(5) << eventCounts[cl].nEvBeforeCut
            << "  / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvBeforeCut << Endl;
   }

   for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
      Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
            << "    " << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
            <<" tree -- total number of entries: "
            << std::setw(5) << dataInput.GetEntries(dsi.GetClassInfo(cl)->GetName()) << Endl;
   }

   if (fScaleWithPreselEff)
      Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
            << "\tPreselection: (will affect number of requested training and testing events)" << Endl;
   else
      Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
            << "\tPreselection: (will NOT affect number of requested training and testing events)" << Endl;

   if (dsi.HasCuts()) {
      for (UInt_t cl = 0; cl< dsi.GetNClasses(); cl++) {
         Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "    " << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << " requirement: \"" << dsi.GetClassInfo(cl)->GetCut() << "\"" << Endl;
         Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << "      -- number of events passed: "
               << std::setw(5) << eventCounts[cl].nEvAfterCut
               << "  / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvAfterCut << Endl;
         Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << "      -- efficiency             : "
               << std::setw(6) << eventCounts[cl].nWeEvAfterCut/eventCounts[cl].nWeEvBeforeCut << Endl;
      }
   }
   else Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
              << " No preselection cuts applied on event classes" << Endl;

   //delete[] varIsArray;

}

////////////////////////////////////////////////////////////////////////////////
/// Select and distribute unassigned events to kTraining and kTesting

TMVA::DataSet*
TMVA::DataSetFactory::MixEvents( DataSetInfo& dsi,
                                 EventVectorOfClassesOfTreeType& tmpEventVector,
                                 EvtStatsPerClass& eventCounts,
                                 const TString& splitMode,
                                 const TString& mixMode,
                                 const TString& normMode,
                                 UInt_t splitSeed)
{
   TMVA::RandomGenerator<TRandom3> rndm(splitSeed);

   // ==== splitting of undefined events to kTraining and kTesting

   // if splitMode contains "RANDOM", then shuffle the undefined events
   if (splitMode.Contains( "RANDOM" ) /*&& !emptyUndefined*/ ) {
      // random shuffle the undefined events of each class
      for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
         EventVector& unspecifiedEvents = tmpEventVector[Types::kMaxTreeType].at(cls);
         if( ! unspecifiedEvents.empty() ) {
            Log() << kDEBUG << "randomly shuffling "
                  << unspecifiedEvents.size()
                  << " events of class " << cls
                  << " which are not yet associated to testing or training" << Endl;
            std::shuffle(unspecifiedEvents.begin(), unspecifiedEvents.end(), rndm);
         }
      }
   }

   // check for each class the number of training and testing events, the requested number and the available number
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "SPLITTING ========" << Endl;
   for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "---- class " << cls << Endl;
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "check number of training/testing events, requested and available number of events and for class " << cls << Endl;

      // check if enough or too many events are already in the training/testing eventvectors of the class cls
      EventVector& eventVectorTraining  = tmpEventVector[ Types::kTraining ].at(cls);
      EventVector& eventVectorTesting   = tmpEventVector[ Types::kTesting ].at(cls);
      EventVector& eventVectorUndefined = tmpEventVector[ Types::kMaxTreeType ].at(cls);

      Int_t availableTraining  = eventVectorTraining.size();
      Int_t availableTesting   = eventVectorTesting.size();
      Int_t availableUndefined = eventVectorUndefined.size();

      Float_t presel_scale;
      if (fScaleWithPreselEff) {
         presel_scale = eventCounts[cls].cutScaling();
         if (presel_scale < 1)
            Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " you have opted for scaling the number of requested training/testing events\n to be scaled by the preselection efficiency"<< Endl;
      }else{
         presel_scale = 1.; // this scaling was too confusing to most people, including me! Sorry... (Helge)
         if (eventCounts[cls].cutScaling() < 1)
            Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " you have opted for interpreting the requested number of training/testing events\n to be the number of events AFTER your preselection cuts" << Endl;

      }

      // If TrainTestSplit_<class> is set, set number of requested training events to split*num_all_events
      // Requested number of testing events is set to zero and therefore takes all other events
      // The option TrainTestSplit_<class> overrides nTrain_<class> or nTest_<class>
      if(eventCounts[cls].TrainTestSplitRequested < 1.0 && eventCounts[cls].TrainTestSplitRequested > 0.0){
         eventCounts[cls].nTrainingEventsRequested = Int_t(eventCounts[cls].TrainTestSplitRequested*(availableTraining+availableTesting+availableUndefined));
         eventCounts[cls].nTestingEventsRequested = Int_t(0);
      }
      else if(eventCounts[cls].TrainTestSplitRequested != 0.0) Log() << kFATAL << Form("The option TrainTestSplit_<class> has to be in range (0, 1) but is set to %f.",eventCounts[cls].TrainTestSplitRequested) << Endl;
      Int_t requestedTraining = Int_t(eventCounts[cls].nTrainingEventsRequested * presel_scale);
      Int_t requestedTesting  = Int_t(eventCounts[cls].nTestingEventsRequested  * presel_scale);

      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "events in training trees    : " << availableTraining << Endl;
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "events in testing trees     : " << availableTesting << Endl;
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "events in unspecified trees : " << availableUndefined << Endl;
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "requested for training      : " << requestedTraining;

      if(presel_scale<1)
         Log() << " ( " << eventCounts[cls].nTrainingEventsRequested
               << " * " << presel_scale << " preselection efficiency)" << Endl;
      else
         Log() << Endl;
      Log() << kDEBUG << "requested for testing       : " << requestedTesting;
      if(presel_scale<1)
         Log() << " ( " << eventCounts[cls].nTestingEventsRequested
               << " * " << presel_scale << " preselection efficiency)" << Endl;
      else
         Log() << Endl;

      // nomenclature r  = available training
      //              s  = available testing
      //              u  = available undefined
      //              R  = requested training
      //              S  = requested testing
      //              nR = to be used to select training events
      //              nS = to be used to select test events
      // we have the constraint: nR + nS < r+s+u,
      // since we can not use more events than we have
      // free events: Nfree = u-Thet(R-r)-Thet(S-s)
      // nomenclature: Thet(x) = x, if x>0 else 0
      // nR = max(R,r) + 0.5 * Nfree
      // nS = max(S,s) + 0.5 * Nfree
      // nR + nS = R+S + u-R+r-S+s = u+r+s = ok! for R>r
      // nR + nS = r+S + u-S+s     = u+r+s = ok! for r>R

      // three different cases might occur here
      //
      // Case a
      // requestedTraining and requestedTesting >0
      // free events: Nfree = u-Thet(R-r)-Thet(S-s)
      // nR = Max(R,r) + 0.5 * Nfree
      // nS = Max(S,s) + 0.5 * Nfree
      //
      // Case b
      // exactly one of requestedTraining or requestedTesting >0
      // assume training R >0
      // nR = max(R,r)
      // nS = s+u+r-nR
      // and s=nS
      //
      // Case c
      // requestedTraining=0, requestedTesting=0
      // Nfree = u-|r-s|
      // if NFree >=0
      //    R = Max(r,s) + 0.5 * Nfree = S
      // else if r>s
      //    R = r; S=s+u
      // else
      //    R = r+u; S=s
      //
      // Next steps:
      // Determination of Event numbers R,S, nR, nS
      // distribute undefined events according to nR, nS
      // finally determine actual sub samples from nR and nS to be used in training / testing
      //
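      // [Editor's worked example, not part of the original source] Case a with
      // r=200, s=100, u=700, R=500, S=300:
      //    Nfree = 700 - (500-200) - (300-100) = 200
      //    nR = max(500,200) + 200/2 = 600
      //    nS = (r+s+u) - nR = 1000 - 600 = 400  ( = max(300,100) + 200/2 )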

      Int_t useForTesting(0),useForTraining(0);
      Int_t allAvailable(availableUndefined + availableTraining + availableTesting);

      if( (requestedTraining == 0) && (requestedTesting == 0)){

         // Case C: balance the number of training and testing events

         if ( availableUndefined >= TMath::Abs(availableTraining - availableTesting) ) {
            // enough unspecified are available to equal training and testing
            useForTraining = useForTesting = allAvailable/2;
         } else {
            // all unspecified are assigned to the smaller of training / testing
            useForTraining = availableTraining;
            useForTesting  = availableTesting;
            if (availableTraining < availableTesting)
               useForTraining += availableUndefined;
            else
               useForTesting += availableUndefined;
         }
         requestedTraining = useForTraining;
         requestedTesting  = useForTesting;
      }

      else if (requestedTesting == 0){
         // case B
         useForTraining = TMath::Max(requestedTraining,availableTraining);
         if (allAvailable < useForTraining) {
            Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "More events requested for training ("
                  << requestedTraining << ") than available ("
                  << allAvailable << ")!" << Endl;
         }
         useForTesting    = allAvailable - useForTraining; // the rest
         requestedTesting = useForTesting;
      }

      else if (requestedTraining == 0){ // case B)
         useForTesting = TMath::Max(requestedTesting,availableTesting);
         if (allAvailable < useForTesting) {
            Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "More events requested for testing ("
                  << requestedTesting << ") than available ("
                  << allAvailable << ")!" << Endl;
         }
         useForTraining    = allAvailable - useForTesting; // the rest
         requestedTraining = useForTraining;
      }

      else {
         // Case A
         // requestedTraining R and requestedTesting S >0
         // free events: Nfree = u-Thet(R-r)-Thet(S-s)
         // nR = Max(R,r) + 0.5 * Nfree
         // nS = Max(S,s) + 0.5 * Nfree
         Int_t stillNeedForTraining = TMath::Max(requestedTraining-availableTraining,0);
         Int_t stillNeedForTesting  = TMath::Max(requestedTesting-availableTesting,0);

         int NFree = availableUndefined - stillNeedForTraining - stillNeedForTesting;
         if (NFree < 0) NFree = 0;
         useForTraining = TMath::Max(requestedTraining,availableTraining) + NFree/2;
         useForTesting  = allAvailable - useForTraining; // the rest
      }

      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "determined event sample size to select training sample from="<<useForTraining<<Endl;
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "determined event sample size to select test sample from="<<useForTesting<<Endl;


      // associate undefined events
      if( splitMode == "ALTERNATE" ){
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "split 'ALTERNATE'" << Endl;
         Int_t nTraining = availableTraining;
         Int_t nTesting  = availableTesting;
         for( EventVector::iterator it = eventVectorUndefined.begin(), itEnd = eventVectorUndefined.end(); it != itEnd; ){
            ++nTraining;
            if( nTraining <= requestedTraining ){
               eventVectorTraining.insert( eventVectorTraining.end(), (*it) );
               ++it;
            }
            if( it != itEnd ){
               ++nTesting;
               eventVectorTesting.insert( eventVectorTesting.end(), (*it) );
               ++it;
            }
         }
      } else {
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "split '" << splitMode << "'" << Endl;

         // test if enough events are available
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "availableundefined : " << availableUndefined << Endl;
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "useForTraining     : " << useForTraining << Endl;
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "useForTesting      : " << useForTesting << Endl;
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "availableTraining  : " << availableTraining << Endl;
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "availableTesting   : " << availableTesting << Endl;

         if( availableUndefined<(useForTraining-availableTraining) ||
             availableUndefined<(useForTesting -availableTesting ) ||
             availableUndefined<(useForTraining+useForTesting-availableTraining-availableTesting ) ){
            Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "More events requested than available!" << Endl;
         }

         // select the events
         if (useForTraining>availableTraining){
            eventVectorTraining.insert( eventVectorTraining.end() , eventVectorUndefined.begin(), eventVectorUndefined.begin()+ useForTraining- availableTraining );
            eventVectorUndefined.erase( eventVectorUndefined.begin(), eventVectorUndefined.begin() + useForTraining- availableTraining);
         }
         if (useForTesting>availableTesting){
            eventVectorTesting.insert( eventVectorTesting.end() , eventVectorUndefined.begin(), eventVectorUndefined.begin()+ useForTesting- availableTesting );
         }
      }
1272  eventVectorUndefined.clear();
1273 
1274  // finally shorten the event vectors to the requested size by removing random events
1275  if (splitMode.Contains( "RANDOM" )){
1276  UInt_t sizeTraining = eventVectorTraining.size();
1277  if( sizeTraining > UInt_t(requestedTraining) ){
1278  std::vector<UInt_t> indicesTraining( sizeTraining );
1279  // make indices
1280  std::generate( indicesTraining.begin(), indicesTraining.end(), TMVA::Increment<UInt_t>(0) );
1281  // shuffle indices
1282  std::shuffle(indicesTraining.begin(), indicesTraining.end(), rndm);
1283  // erase indices of not needed events
1284  indicesTraining.erase( indicesTraining.begin()+sizeTraining-UInt_t(requestedTraining), indicesTraining.end() );
1285  // delete all events with the given indices
1286  for( std::vector<UInt_t>::iterator it = indicesTraining.begin(), itEnd = indicesTraining.end(); it != itEnd; ++it ){
1287  delete eventVectorTraining.at( (*it) ); // delete event
1288  eventVectorTraining.at( (*it) ) = NULL; // set pointer to NULL
1289  }
1290  // now remove and erase all events with pointer==NULL
1291  eventVectorTraining.erase( std::remove( eventVectorTraining.begin(), eventVectorTraining.end(), (void*)NULL ), eventVectorTraining.end() );
1292  }
1293 
1294  UInt_t sizeTesting = eventVectorTesting.size();
1295  if( sizeTesting > UInt_t(requestedTesting) ){
1296  std::vector<UInt_t> indicesTesting( sizeTesting );
1297  // make indices
1298  std::generate( indicesTesting.begin(), indicesTesting.end(), TMVA::Increment<UInt_t>(0) );
1299  // shuffle indices
1300  std::shuffle(indicesTesting.begin(), indicesTesting.end(), rndm);
1301  // erase indices of not needed events
1302  indicesTesting.erase( indicesTesting.begin()+sizeTesting-UInt_t(requestedTesting), indicesTesting.end() );
1303  // delete all events with the given indices
1304  for( std::vector<UInt_t>::iterator it = indicesTesting.begin(), itEnd = indicesTesting.end(); it != itEnd; ++it ){
1305  delete eventVectorTesting.at( (*it) ); // delete event
1306  eventVectorTesting.at( (*it) ) = NULL; // set pointer to NULL
1307  }
1308  // now remove and erase all events with pointer==NULL
1309  eventVectorTesting.erase( std::remove( eventVectorTesting.begin(), eventVectorTesting.end(), (void*)NULL ), eventVectorTesting.end() );
1310  }
1311  }
1312  else { // erase at end
1313  if( eventVectorTraining.size() < UInt_t(requestedTraining) )
1314  Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "DataSetFactory/requested number of training samples larger than size of eventVectorTraining.\n"
1315  << "There is probably an issue. Please contact the TMVA developers." << Endl;
1316  std::for_each( eventVectorTraining.begin()+requestedTraining, eventVectorTraining.end(), DeleteFunctor<Event>() );
1317  eventVectorTraining.erase(eventVectorTraining.begin()+requestedTraining,eventVectorTraining.end());
1318 
1319  if( eventVectorTesting.size() < UInt_t(requestedTesting) )
1320  Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "DataSetFactory/requested number of testing samples larger than size of eventVectorTesting.\n"
1321  << "There is probably an issue. Please contact the TMVA developers." << Endl;
1322  std::for_each( eventVectorTesting.begin()+requestedTesting, eventVectorTesting.end(), DeleteFunctor<Event>() );
1323  eventVectorTesting.erase(eventVectorTesting.begin()+requestedTesting,eventVectorTesting.end());
1324  }
1325  }
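// The random trimming above, reduced to its core idiom (a sketch with
// illustrative names, not code from the original source):
//
//    std::vector<TMVA::Event*> events;                  // oversized event list
//    std::vector<UInt_t> indices;                       // shuffled, then cut down
//                                                       // to the surplus entries
//    for (UInt_t idx : indices) {
//       delete events.at(idx);                          // free the event
//       events.at(idx) = nullptr;                       // mark the slot
//    }
//    events.erase(std::remove(events.begin(), events.end(), nullptr),
//                 events.end());                        // compact in one pass
//
// Marking slots and compacting once via the erase-remove idiom avoids the
// quadratic cost of erasing single elements from the middle of the vector.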
1326 
1327  TMVA::DataSetFactory::RenormEvents( dsi, tmpEventVector, eventCounts, normMode );
1328 
1329  Int_t trainingSize = 0;
1330  Int_t testingSize = 0;
1331 
1332  // sum up number of training and testing events
1333  for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
1334  trainingSize += tmpEventVector[Types::kTraining].at(cls).size();
1335  testingSize += tmpEventVector[Types::kTesting].at(cls).size();
1336  }
1337 
1338  // --- collect all training (testing) events into the training (testing) event vector
1339 
1340  // create the event vectors and reserve enough space
1341  EventVector* trainingEventVector = new EventVector();
1342  EventVector* testingEventVector = new EventVector();
1343 
1344  trainingEventVector->reserve( trainingSize );
1345  testingEventVector->reserve( testingSize );
1346 
1347 
1348  // collect the events
1349 
1350  // mixing of kTraining and kTesting data sets
1351  Log() << kDEBUG << " MIXING ============= " << Endl;
1352 
1353  if( mixMode == "ALTERNATE" ){
1354  // Inform the user when alternate mix mode is requested for
1355  // event classes with different numbers of events: this works, but the alternation stops at the last event of the smaller class
1356  for( UInt_t cls = 1; cls < dsi.GetNClasses(); ++cls ){
1357  if (tmpEventVector[Types::kTraining].at(cls).size() != tmpEventVector[Types::kTraining].at(0).size()){
1358  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Training sample: You are trying to mix events in alternate mode although the classes have different event numbers. This works but the alternation stops at the last event of the smaller class."<<Endl;
1359  }
1360  if (tmpEventVector[Types::kTesting].at(cls).size() != tmpEventVector[Types::kTesting].at(0).size()){
1361  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Testing sample: You are trying to mix events in alternate mode although the classes have different event numbers. This works but the alternation stops at the last event of the smaller class."<<Endl;
1362  }
1363  }
1364  typedef EventVector::iterator EvtVecIt;
1365  EvtVecIt itEvent, itEventEnd;
1366 
1367  // insert first class
1368  Log() << kDEBUG << "insert class 0 into training and test vector" << Endl;
1369  trainingEventVector->insert( trainingEventVector->end(), tmpEventVector[Types::kTraining].at(0).begin(), tmpEventVector[Types::kTraining].at(0).end() );
1370  testingEventVector->insert( testingEventVector->end(), tmpEventVector[Types::kTesting].at(0).begin(), tmpEventVector[Types::kTesting].at(0).end() );
1371 
1372  // insert other classes
1373  EvtVecIt itTarget;
1374  for( UInt_t cls = 1; cls < dsi.GetNClasses(); ++cls ){
1375  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "insert class " << cls << Endl;
1376  // training vector
1377  itTarget = trainingEventVector->begin() - 1; // start one before begin (never dereferenced; advanced by cls+1 before first use)
1378  // loop over source
1379  for( itEvent = tmpEventVector[Types::kTraining].at(cls).begin(), itEventEnd = tmpEventVector[Types::kTraining].at(cls).end(); itEvent != itEventEnd; ++itEvent ){
1380  // if( std::distance( itTarget, trainingEventVector->end()) < Int_t(cls+1) ) {
1381  if( (trainingEventVector->end() - itTarget) < Int_t(cls+1) ) {
1382  itTarget = trainingEventVector->end();
1383  trainingEventVector->insert( itTarget, itEvent, itEventEnd ); // fill in the rest without mixing
1384  break;
1385  }else{
1386  itTarget += cls+1;
1387  trainingEventVector->insert( itTarget, (*itEvent) ); // fill event
1388  }
1389  }
1390  // testing vector
1391  itTarget = testingEventVector->begin() - 1;
1392  // loop over source
1393  for( itEvent = tmpEventVector[Types::kTesting].at(cls).begin(), itEventEnd = tmpEventVector[Types::kTesting].at(cls).end(); itEvent != itEventEnd; ++itEvent ){
1394  // if( std::distance( itTarget, testingEventVector->end()) < Int_t(cls+1) ) {
1395  if( ( testingEventVector->end() - itTarget ) < Int_t(cls+1) ) {
1396  itTarget = testingEventVector->end();
1397  testingEventVector->insert( itTarget, itEvent, itEventEnd ); // fill in the rest without mixing
1398  break;
1399  }else{
1400  itTarget += cls+1;
1401  testingEventVector->insert( itTarget, (*itEvent) ); // fill event
1402  }
1403  }
1404  }
1405  }else{
1406  for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
1407  trainingEventVector->insert( trainingEventVector->end(), tmpEventVector[Types::kTraining].at(cls).begin(), tmpEventVector[Types::kTraining].at(cls).end() );
1408  testingEventVector->insert ( testingEventVector->end(), tmpEventVector[Types::kTesting].at(cls).begin(), tmpEventVector[Types::kTesting].at(cls).end() );
1409  }
1410  }
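// The intended effect of the ALTERNATE branch, sketched on two classes
// (a worked example with made-up events, not code from this file):
//
//    class 0 : A0 A1 A2 A3        class 1 : B0 B1
//    result  : A0 B0 A1 B1 A2 A3
//
// Events alternate until the smaller class runs out; the remainder of the
// larger class is then appended unmixed, as warned above.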
1411  // delete the tmpEventVector (but not the events therein)
1412  tmpEventVector[Types::kTraining].clear();
1413  tmpEventVector[Types::kTesting].clear();
1414 
1415  tmpEventVector[Types::kMaxTreeType].clear();
1416 
1417  if (mixMode == "RANDOM") {
1418  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "shuffling events"<<Endl;
1419 
1420  std::shuffle(trainingEventVector->begin(), trainingEventVector->end(), rndm);
1421  std::shuffle(testingEventVector->begin(), testingEventVector->end(), rndm);
1422  }
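// std::shuffle requires a UniformRandomBitGenerator, which TRandom3 itself is
// not; 'rndm' above is therefore an adapter. A minimal sketch of how such a
// generator is typically set up (assuming the TMVA::RandomGenerator wrapper
// declared in TMVA/Tools.h):
//
//    TMVA::RandomGenerator<TRandom3> rng(splitSeed);    // seeded like the split
//    std::shuffle(vec.begin(), vec.end(), rng);
//
// Reusing one seeded generator keeps the training/testing shuffles reproducible
// for a fixed SplitSeed.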
1423 
1424  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "trainingEventVector " << trainingEventVector->size() << Endl;
1425  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "testingEventVector " << testingEventVector->size() << Endl;
1426 
1427  // create dataset
1428  DataSet* ds = new DataSet(dsi);
1429 
1430  // Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Create internal training tree" << Endl;
1431  ds->SetEventCollection(trainingEventVector, Types::kTraining );
1432  // Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Create internal testing tree" << Endl;
1433  ds->SetEventCollection(testingEventVector, Types::kTesting );
1434 
1435 
1436  if (ds->GetNTrainingEvents() < 1){
1437  Log() << kFATAL << "Dataset " << std::string(dsi.GetName()) << " does not have any training events, I'd better stop here and let you fix that one first " << Endl;
1438  }
1439 
1440  if (ds->GetNTestEvents() < 1) {
1441  Log() << kERROR << "Dataset " << std::string(dsi.GetName()) << " does not have any testing events, which will probably cause problems later... but for now, I continue " << Endl;
1442  }
1443 
1444  delete trainingEventVector;
1445  delete testingEventVector;
1446  return ds;
1447 
1448 }
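// Usage sketch (not part of this file; tree, variable and option values are
// illustrative): the splitMode/mixMode/normMode strings handled above normally
// arrive through the option string of PrepareTrainingAndTestTree, e.g.
//
//    TMVA::DataLoader loader("dataset");
//    loader.AddVariable("var1", 'F');
//    loader.AddSignalTree(sigTree);                     // sigTree, bkgTree: user TTrees
//    loader.AddBackgroundTree(bkgTree);
//    loader.PrepareTrainingAndTestTree("",
//        "nTrain_Signal=1000:nTrain_Background=1000:"
//        "SplitMode=Random:MixMode=Random:NormMode=NumEvents:SplitSeed=100");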
1449 
1450 ////////////////////////////////////////////////////////////////////////////////
1451 /// renormalisation of the TRAINING event weights
1452 /// - none: use the weights as supplied by the user
1453 ///   (the relative weight is nevertheless stored for later use)
1454 /// - numEvents: rescale each class so that its effective (weighted) number of training events equals its number of training entries
1455 /// - equalNumEvents: rescale the training events of all classes such that their effective (weighted)
1456 ///   numbers of events all equal the number of training entries of the first class (class 0)
1457 
1458  void
1459  TMVA::DataSetFactory::RenormEvents( DataSetInfo& dsi,
1460  EventVectorOfClassesOfTreeType& tmpEventVector,
1461  const EvtStatsPerClass& eventCounts,
1462  const TString& normMode )
1463 {
1464 
1465 
1466  // ---------------------------------
1467  // compute sizes and sums of weights
1468  // (the rescaling info is printed further below)
1469  Int_t trainingSize = 0;
1470  Int_t testingSize = 0;
1471 
1472  ValuePerClass trainingSumWeightsPerClass( dsi.GetNClasses() );
1473  ValuePerClass testingSumWeightsPerClass( dsi.GetNClasses() );
1474 
1475  NumberPerClass trainingSizePerClass( dsi.GetNClasses() );
1476  NumberPerClass testingSizePerClass( dsi.GetNClasses() );
1477 
1478  Double_t trainingSumSignalWeights = 0;
1479  Double_t trainingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1480  Double_t testingSumSignalWeights = 0;
1481  Double_t testingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1482 
1483 
1484 
1485  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1486  trainingSizePerClass.at(cls) = tmpEventVector[Types::kTraining].at(cls).size();
1487  testingSizePerClass.at(cls) = tmpEventVector[Types::kTesting].at(cls).size();
1488 
1489  trainingSize += trainingSizePerClass.at(cls); // .at(cls), not .back(): otherwise only the last class would be counted
1490  testingSize += testingSizePerClass.at(cls);
1491 
1492  // the functional solution
1493  // sum up the weights in Double_t although the individual weights are Float_t, to prevent rounding issues when adding many floating-point numbers
1494  //
1495  // accumulate --> does what the name says
1496  // begin() and end() denote the range of the vector to be accumulated
1497  // Double_t(0) sets the type and the starting value of the running sum
1498  // the lambda takes the running sum and an event pointer and returns
1499  // the sum increased by that event's original weight
1500  // (earlier versions used compose_binary with std::plus for the same purpose)
1501  //
1502  // all together this sums up the weights of all events in the vector and returns the total
1503  trainingSumWeightsPerClass.at(cls) =
1504  std::accumulate(tmpEventVector[Types::kTraining].at(cls).begin(),
1505  tmpEventVector[Types::kTraining].at(cls).end(),
1506  Double_t(0), [](Double_t w, const TMVA::Event *E) { return w + E->GetOriginalWeight(); });
1507 
1508  testingSumWeightsPerClass.at(cls) =
1509  std::accumulate(tmpEventVector[Types::kTesting].at(cls).begin(),
1510  tmpEventVector[Types::kTesting].at(cls).end(),
1511  Double_t(0), [](Double_t w, const TMVA::Event *E) { return w + E->GetOriginalWeight(); });
1512 
1513  if ( cls == dsi.GetSignalClassIndex()){
1514  trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
1515  testingSumSignalWeights += testingSumWeightsPerClass.at(cls);
1516  }else{
1517  trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
1518  testingSumBackgrWeights += testingSumWeightsPerClass.at(cls);
1519  }
1520  }
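// The accumulate call above in standalone form: many Float_t event weights are
// summed into a Double_t total (a sketch with illustrative names):
//
//    std::vector<const TMVA::Event*> evts /* = ... */;
//    Double_t sum = std::accumulate(evts.begin(), evts.end(), Double_t(0),
//        [](Double_t w, const TMVA::Event* e) { return w + e->GetOriginalWeight(); });
//
// Carrying the running sum in Double_t keeps the rounding error of repeated
// Float_t additions from accumulating in the total.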
1521 
1522  // ---------------------------------
1523  // compute renormalization factors
1524 
1525  ValuePerClass renormFactor( dsi.GetNClasses() );
1526 
1527 
1528  // for information purposes
1529  dsi.SetNormalization( normMode );
1530  // !! these will be overwritten later by the 'rescaled' ones if
1531  // NormMode != None !!!
1532  dsi.SetTrainingSumSignalWeights(trainingSumSignalWeights);
1533  dsi.SetTrainingSumBackgrWeights(trainingSumBackgrWeights);
1534  dsi.SetTestingSumSignalWeights(testingSumSignalWeights);
1535  dsi.SetTestingSumBackgrWeights(testingSumBackgrWeights);
1536 
1537 
1538  if (normMode == "NONE") {
1539  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "No weight renormalisation applied: use original global and event weights" << Endl;
1540  return;
1541  }
1542  //changed by Helge 27.5.2013: the idea is to have the SAME amount of effective TRAINING data
1543  //for signal and background.
1544  // Testing events are irrelevant for this and would actually skew the whole normalisation!
1545  else if (normMode == "NUMEVENTS") {
1546  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1547  << "\tWeight renormalisation mode: \"NumEvents\": renormalises all event classes " << Endl;
1548  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1549  << " such that the effective (weighted) number of events in each class equals the respective " << Endl;
1550  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1551  << " number of events (entries) that you demanded in PrepareTrainingAndTestTree(\"\",\"nTrain_Signal=.. )" << Endl;
1552  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1553  << " ... i.e. such that Sum[i=1..N_j]{w_i} = N_j, j=0,1,2..." << Endl;
1554  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1555  << " ... (note that N_j is the sum of TRAINING events (nTrain_j...with j=Signal,Background.." << Endl;
1556  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1557  << " ..... Testing events are not renormalised nor included in the renormalisation factor! )"<< Endl;
1558 
1559  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1560  // renormFactor.at(cls) = ( (trainingSizePerClass.at(cls) + testingSizePerClass.at(cls))/
1561  // (trainingSumWeightsPerClass.at(cls) + testingSumWeightsPerClass.at(cls)) );
1562  //changed by Helge 27.5.2013
1563  renormFactor.at(cls) = ((Float_t)trainingSizePerClass.at(cls) )/
1564  (trainingSumWeightsPerClass.at(cls)) ;
1565  }
1566  }
1567  else if (normMode == "EQUALNUMEVENTS") {
1568  //changed by Helge 27.5.2013: the idea is to have the SAME amount of effective TRAINING data
1569  //for signal and background.
1570  //The previous implementation normalised each data source to its own number of entries, and did so
1571  //for training and testing together, which defeats that purpose.
1572 
1573  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Weight renormalisation mode: \"EqualNumEvents\": renormalises all event classes ..." << Endl;
1574  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " such that the effective (weighted) number of events in each class is the same " << Endl;
1575  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " (and equals the number of events (entries) given for class=0 )" << Endl;
1576  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "... i.e. such that Sum[i=1..N_j]{w_i} = N_classA, j=classA, classB, ..." << Endl;
1577  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "... (note that N_j is the sum of TRAINING events" << Endl;
1578  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " ..... Testing events are not renormalised nor included in the renormalisation factor!)" << Endl;
1579 
1580  // normalize to size of first class
1581  UInt_t referenceClass = 0;
1582  for (UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ) {
1583  renormFactor.at(cls) = Float_t(trainingSizePerClass.at(referenceClass))/
1584  (trainingSumWeightsPerClass.at(cls));
1585  }
1586  }
1587  else {
1588  Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "<PrepareForTrainingAndTesting> Unknown NormMode: " << normMode << Endl;
1589  }
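// Worked example of the factors computed above (numbers are illustrative):
//
//    NumEvents:      a class with 1000 training entries and a weight sum of 250
//                    gets renormFactor = 1000/250 = 4, so its effective
//                    (weighted) size becomes its entry count, 1000.
//    EqualNumEvents: with 1000 training entries in class 0, a class with weight
//                    sum 500 gets renormFactor = 1000/500 = 2, so every class
//                    ends up with an effective size of 1000.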
1590 
1591  // ---------------------------------
1592  // now apply the normalization factors
1593  Int_t maxL = dsi.GetClassNameMaxLength();
1594  for (UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls<clsEnd; ++cls) {
1595  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1596  << "--> Rescale " << setiosflags(ios::left) << std::setw(maxL)
1597  << dsi.GetClassInfo(cls)->GetName() << " event weights by factor: " << renormFactor.at(cls) << Endl;
1598  for (EventVector::iterator it = tmpEventVector[Types::kTraining].at(cls).begin(),
1599  itEnd = tmpEventVector[Types::kTraining].at(cls).end(); it != itEnd; ++it){
1600  (*it)->SetWeight ((*it)->GetWeight() * renormFactor.at(cls));
1601  }
1602 
1603  }
1604 
1605 
1606  // print out the result
1607  // (this repeats the summation code from above and could be refactored)
1608  //
1609 
1610  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1611  << "Number of training and testing events" << Endl;
1612  Log() << kDEBUG << "\tafter rescaling:" << Endl;
1613  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1614  << "---------------------------------------------------------------------------" << Endl;
1615 
1616  trainingSumSignalWeights = 0;
1617  trainingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1618  testingSumSignalWeights = 0;
1619  testingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1620 
1621  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1622  trainingSumWeightsPerClass.at(cls) =
1623  std::accumulate(tmpEventVector[Types::kTraining].at(cls).begin(),
1624  tmpEventVector[Types::kTraining].at(cls).end(),
1625  Double_t(0), [](Double_t w, const TMVA::Event *E) { return w + E->GetOriginalWeight(); });
1626 
1627  testingSumWeightsPerClass.at(cls) =
1628  std::accumulate(tmpEventVector[Types::kTesting].at(cls).begin(),
1629  tmpEventVector[Types::kTesting].at(cls).end(),
1630  Double_t(0), [](Double_t w, const TMVA::Event *E) { return w + E->GetOriginalWeight(); });
1631 
1632  if ( cls == dsi.GetSignalClassIndex()){
1633  trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
1634  testingSumSignalWeights += testingSumWeightsPerClass.at(cls);
1635  }else{
1636  trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
1637  testingSumBackgrWeights += testingSumWeightsPerClass.at(cls);
1638  }
1639 
1640  // output statistics
1641 
1642  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1643  << setiosflags(ios::left) << std::setw(maxL)
1644  << dsi.GetClassInfo(cls)->GetName() << " -- "
1645  << "training events : " << trainingSizePerClass.at(cls) << Endl;
1646  Log() << kDEBUG << "\t(sum of weights: " << trainingSumWeightsPerClass.at(cls) << ")"
1647  << " - requested were " << eventCounts[cls].nTrainingEventsRequested << " events" << Endl;
1648  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1649  << setiosflags(ios::left) << std::setw(maxL)
1650  << dsi.GetClassInfo(cls)->GetName() << " -- "
1651  << "testing events : " << testingSizePerClass.at(cls) << Endl;
1652  Log() << kDEBUG << "\t(sum of weights: " << testingSumWeightsPerClass.at(cls) << ")"
1653  << " - requested were " << eventCounts[cls].nTestingEventsRequested << " events" << Endl;
1654  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1655  << setiosflags(ios::left) << std::setw(maxL)
1656  << dsi.GetClassInfo(cls)->GetName() << " -- "
1657  << "training and testing events: "
1658  << (trainingSizePerClass.at(cls)+testingSizePerClass.at(cls)) << Endl;
1659  Log() << kDEBUG << "\t(sum of weights: "
1660  << (trainingSumWeightsPerClass.at(cls)+testingSumWeightsPerClass.at(cls)) << ")" << Endl;
1661  if(eventCounts[cls].nEvAfterCut<eventCounts[cls].nEvBeforeCut) {
1662  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << setiosflags(ios::left) << std::setw(maxL)
1663  << dsi.GetClassInfo(cls)->GetName() << " -- "
1664  << "due to the preselection a scaling factor has been applied to the numbers of requested events: "
1665  << eventCounts[cls].cutScaling() << Endl;
1666  }
1667  }
1668  Log() << kINFO << Endl;
1669 
1670  // for information purposes
1671  dsi.SetTrainingSumSignalWeights(trainingSumSignalWeights);
1672  dsi.SetTrainingSumBackgrWeights(trainingSumBackgrWeights);
1673  dsi.SetTestingSumSignalWeights(testingSumSignalWeights);
1674  dsi.SetTestingSumBackgrWeights(testingSumBackgrWeights);
1675 
1676 
1677 }