Logo ROOT  
Reference Guide
DataSetFactory.cxx
Go to the documentation of this file.
1 // @(#)root/tmva $Id$
2 // Author: Andreas Hoecker, Peter Speckmayer, Joerg Stelzer, Eckhard von Toerne, Helge Voss
3 
4 /*****************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : DataSetFactory *
8  * Web : http://tmva.sourceforge.net *
9  * *
10  * Description: *
11  * Implementation (see header for description) *
12  * *
13  * Authors (alphabetical): *
14  * Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland *
15  * Peter Speckmayer <Peter.Speckmayer@cern.ch> - CERN, Switzerland *
16  * Joerg Stelzer <Joerg.Stelzer@cern.ch> - MSU, USA *
17  * Eckhard von Toerne <evt@physik.uni-bonn.de> - U. of Bonn, Germany *
18  * Helge Voss <Helge.Voss@cern.ch> - MPI-K Heidelberg, Germany *
19  * *
20  * Copyright (c) 2009: *
21  * CERN, Switzerland *
22  * MPI-K Heidelberg, Germany *
23  * U. of Bonn, Germany *
24  * Redistribution and use in source and binary forms, with or without *
25  * modification, are permitted according to the terms listed in LICENSE *
26  * (http://tmva.sourceforge.net/LICENSE) *
27  *****************************************************************************/
28 
29 /*! \class TMVA::DataSetFactory
30 \ingroup TMVA
31 
32 Class that contains all the data information
33 
34 */
35 
36 #include <assert.h>
37 
38 #include <map>
39 #include <vector>
40 #include <iomanip>
41 #include <iostream>
42 
43 #include <algorithm>
44 #include <functional>
45 #include <numeric>
46 #include <random>
47 
48 #include "TMVA/DataSetFactory.h"
49 
50 #include "TEventList.h"
51 #include "TFile.h"
52 #include "TRandom3.h"
53 #include "TMatrixF.h"
54 #include "TVectorF.h"
55 #include "TMath.h"
56 #include "TTree.h"
57 #include "TBranch.h"
58 
59 #include "TMVA/MsgLogger.h"
60 #include "TMVA/Configurable.h"
64 #include "TMVA/DataSet.h"
65 #include "TMVA/DataSetInfo.h"
66 #include "TMVA/DataInputHandler.h"
67 #include "TMVA/Event.h"
68 
69 #include "TMVA/Tools.h"
70 #include "TMVA/Types.h"
71 #include "TMVA/VariableInfo.h"
72 
73 using namespace std;
74 
75 //TMVA::DataSetFactory* TMVA::DataSetFactory::fgInstance = 0;
76 
77 namespace TMVA {
78  // calculate the largest common divider
79  // this function is not happy if numbers are negative!
81  {
82  if (a<b) {Int_t tmp = a; a=b; b=tmp; } // achieve a>=b
83  if (b==0) return a;
84  Int_t fullFits = a/b;
85  return LargestCommonDivider(b,a-b*fullFits);
86  }
87 }
88 
89 
90 ////////////////////////////////////////////////////////////////////////////////
91 /// constructor
92 
94  fVerbose(kFALSE),
95  fVerboseLevel(TString("Info")),
96  fScaleWithPreselEff(0),
97  fCurrentTree(0),
98  fCurrentEvtIdx(0),
99  fInputFormulas(0),
100  fLogger( new MsgLogger("DataSetFactory", kINFO) )
101 {
102 }
103 
104 ////////////////////////////////////////////////////////////////////////////////
105 /// destructor
106 
108 {
109  std::vector<TTreeFormula*>::const_iterator formIt;
110 
111  for (formIt = fInputFormulas.begin() ; formIt!=fInputFormulas.end() ; ++formIt) if (*formIt) delete *formIt;
112  for (formIt = fTargetFormulas.begin() ; formIt!=fTargetFormulas.end() ; ++formIt) if (*formIt) delete *formIt;
113  for (formIt = fCutFormulas.begin() ; formIt!=fCutFormulas.end() ; ++formIt) if (*formIt) delete *formIt;
114  for (formIt = fWeightFormula.begin() ; formIt!=fWeightFormula.end() ; ++formIt) if (*formIt) delete *formIt;
115  for (formIt = fSpectatorFormulas.begin(); formIt!=fSpectatorFormulas.end(); ++formIt) if (*formIt) delete *formIt;
116 
117  delete fLogger;
118 }
119 
120 ////////////////////////////////////////////////////////////////////////////////
121 /// steering the creation of a new dataset
122 
124  TMVA::DataInputHandler& dataInput )
125 {
126  // build the first dataset from the data input
127  DataSet * ds = BuildInitialDataSet( dsi, dataInput );
128 
129  if (ds->GetNEvents() > 1 && fComputeCorrelations ) {
130  CalcMinMax(ds,dsi);
131 
132  // from the the final dataset build the correlation matrix
133  for (UInt_t cl = 0; cl< dsi.GetNClasses(); cl++) {
134  const TString className = dsi.GetClassInfo(cl)->GetName();
135  dsi.SetCorrelationMatrix( className, CalcCorrelationMatrix( ds, cl ) );
136  if (fCorrelations) {
137  dsi.PrintCorrelationMatrix(className);
138  }
139  }
140  //Log() << kHEADER << Endl;
141  Log() << kHEADER << Form("[%s] : ",dsi.GetName()) << " " << Endl << Endl;
142  }
143 
144  return ds;
145 }
146 
147 ////////////////////////////////////////////////////////////////////////////////
148 
150 {
151  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "Build DataSet consisting of one Event with dynamically changing variables" << Endl;
152  DataSet* ds = new DataSet(dsi);
153 
154  // create a DataSet with one Event which uses dynamic variables
155  // (pointers to variables)
156  if(dsi.GetNClasses()==0){
157  dsi.AddClass( "data" );
158  dsi.GetClassInfo( "data" )->SetNumber(0);
159  }
160 
161  std::vector<Float_t*>* evdyn = new std::vector<Float_t*>(0);
162 
163  std::vector<VariableInfo>& varinfos = dsi.GetVariableInfos();
164 
165  if (varinfos.empty())
166  Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName()) << "Dynamic data set cannot be built, since no variable informations are present. Apparently no variables have been set. This should not happen, please contact the TMVA authors." << Endl;
167 
168  std::vector<VariableInfo>::iterator it = varinfos.begin(), itEnd=varinfos.end();
169  for (;it!=itEnd;++it) {
170  Float_t* external=(Float_t*)(*it).GetExternalLink();
171  if (external==0)
172  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "The link to the external variable is NULL while I am trying to build a dynamic data set. In this case fTmpEvent from MethodBase HAS TO BE USED in the method to get useful values in variables." << Endl;
173  else evdyn->push_back (external);
174  }
175 
176  std::vector<VariableInfo>& spectatorinfos = dsi.GetSpectatorInfos();
177  it = spectatorinfos.begin();
178  for (;it!=spectatorinfos.end();++it) evdyn->push_back( (Float_t*)(*it).GetExternalLink() );
179 
180  TMVA::Event * ev = new Event((const std::vector<Float_t*>*&)evdyn, varinfos.size());
181  std::vector<Event*>* newEventVector = new std::vector<Event*>;
182  newEventVector->push_back(ev);
183 
184  ds->SetEventCollection(newEventVector, Types::kTraining);
185  ds->SetCurrentType( Types::kTraining );
186  ds->SetCurrentEvent( 0 );
187 
188  delete newEventVector;
189  return ds;
190 }
191 
192 ////////////////////////////////////////////////////////////////////////////////
193 /// if no entries, than create a DataSet with one Event which uses
194 /// dynamic variables (pointers to variables)
195 
198  DataInputHandler& dataInput )
199 {
200  if (dataInput.GetEntries()==0) return BuildDynamicDataSet( dsi );
201  // -------------------------------------------------------------------------
202 
203  // register the classes in the datasetinfo-object
204  // information comes from the trees in the dataInputHandler-object
205  std::vector< TString >* classList = dataInput.GetClassList();
206  for (std::vector<TString>::iterator it = classList->begin(); it< classList->end(); ++it) {
207  dsi.AddClass( (*it) );
208  }
209  delete classList;
210 
211  EvtStatsPerClass eventCounts(dsi.GetNClasses());
212  TString normMode;
213  TString splitMode;
214  TString mixMode;
215  UInt_t splitSeed;
216 
217  InitOptions( dsi, eventCounts, normMode, splitSeed, splitMode , mixMode );
218  // ======= build event-vector from input, apply preselection ===============
219  EventVectorOfClassesOfTreeType tmpEventVector;
220  BuildEventVector( dsi, dataInput, tmpEventVector, eventCounts );
221 
222  DataSet* ds = MixEvents( dsi, tmpEventVector, eventCounts,
223  splitMode, mixMode, normMode, splitSeed );
224 
225  const Bool_t showCollectedOutput = kFALSE;
226  if (showCollectedOutput) {
227  Int_t maxL = dsi.GetClassNameMaxLength();
228  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Collected:" << Endl;
229  for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
230  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " "
231  << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
232  << " training entries: " << ds->GetNClassEvents( 0, cl ) << Endl;
233  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " "
234  << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
235  << " testing entries: " << ds->GetNClassEvents( 1, cl ) << Endl;
236  }
237  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " " << Endl;
238  }
239 
240  return ds;
241 }
242 
243 ////////////////////////////////////////////////////////////////////////////////
244 /// checks a TTreeFormula for problems
245 
247  const TString& expression,
248  Bool_t& hasDollar )
249 {
250  Bool_t worked = kTRUE;
251 
252  if( ttf->GetNdim() <= 0 )
253  Log() << kFATAL << "Expression " << expression.Data()
254  << " could not be resolved to a valid formula. " << Endl;
255  if( ttf->GetNdata() == 0 ){
256  Log() << kWARNING << "Expression: " << expression.Data()
257  << " does not provide data for this event. "
258  << "This event is not taken into account. --> please check if you use as a variable "
259  << "an entry of an array which is not filled for some events "
260  << "(e.g. arr[4] when arr has only 3 elements)." << Endl;
261  Log() << kWARNING << "If you want to take the event into account you can do something like: "
262  << "\"Alt$(arr[4],0)\" where in cases where arr doesn't have a 4th element, "
263  << " 0 is taken as an alternative." << Endl;
264  worked = kFALSE;
265  }
266  if( expression.Contains("$") )
267  hasDollar = kTRUE;
268  else
269  {
270  for (int i = 0, iEnd = ttf->GetNcodes (); i < iEnd; ++i)
271  {
272  TLeaf* leaf = ttf->GetLeaf (i);
273  if (!leaf->IsOnTerminalBranch())
274  hasDollar = kTRUE;
275  }
276  }
277  return worked;
278 }
279 
280 
281 ////////////////////////////////////////////////////////////////////////////////
282 /// While the data gets copied into the local training and testing
283 /// trees, the input tree can change (for instance when changing from
284 /// signal to background tree, or using TChains as input) The
285 /// TTreeFormulas, that hold the input expressions need to be
286 /// re-associated with the new tree, which is done here
287 
289 {
290  TTree *tr = tinfo.GetTree()->GetTree();
291 
292  //tr->SetBranchStatus("*",1); // nor needed when using TTReeFormula
293  tr->ResetBranchAddresses();
294 
295  Bool_t hasDollar = kTRUE; // Set to false if wants to enable only some branch in the tree
296 
297  // 1) the input variable formulas
298  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " create input formulas for tree " << tr->GetName() << Endl;
299  std::vector<TTreeFormula*>::const_iterator formIt, formItEnd;
300  for (formIt = fInputFormulas.begin(), formItEnd=fInputFormulas.end(); formIt!=formItEnd; ++formIt) if (*formIt) delete *formIt;
301  fInputFormulas.clear();
302  TTreeFormula* ttf = 0;
303  fInputTableFormulas.clear(); // this contains shallow pointer copies
304 
305  bool firstArrayVar = kTRUE;
306  int firstArrayVarIndex = -1;
307  int arraySize = -1;
308  for (UInt_t i = 0; i < dsi.GetNVariables(); i++) {
309 
310  // create TTreeformula
311  if (! dsi.IsVariableFromArray(i) ) {
312  ttf = new TTreeFormula(Form("Formula%s", dsi.GetVariableInfo(i).GetInternalName().Data()),
313  dsi.GetVariableInfo(i).GetExpression().Data(), tr);
314  CheckTTreeFormula(ttf, dsi.GetVariableInfo(i).GetExpression(), hasDollar);
315  fInputFormulas.emplace_back(ttf);
316  fInputTableFormulas.emplace_back(std::make_pair(ttf, (Int_t) 0));
317  } else {
318  // it is a variable from an array
319  if (firstArrayVar) {
320 
321  // create a new TFormula
322  ttf = new TTreeFormula(Form("Formula%s", dsi.GetVariableInfo(i).GetInternalName().Data()),
323  dsi.GetVariableInfo(i).GetExpression().Data(), tr);
324  CheckTTreeFormula(ttf, dsi.GetVariableInfo(i).GetExpression(), hasDollar);
325  fInputFormulas.push_back(ttf);
326 
327  arraySize = dsi.GetVarArraySize(dsi.GetVariableInfo(i).GetExpression());
328  firstArrayVar = kFALSE;
329  firstArrayVarIndex = i;
330 
331  Log() << kINFO << "Using variable " << dsi.GetVariableInfo(i).GetInternalName() <<
332  " from array expression " << dsi.GetVariableInfo(i).GetExpression() << " of size " << arraySize << Endl;
333  }
334  fInputTableFormulas.push_back(std::make_pair(ttf, (Int_t) i-firstArrayVarIndex));
335  if (int(i)-firstArrayVarIndex == arraySize-1 ) {
336  // I am the last element of the array
337  firstArrayVar = kTRUE;
338  firstArrayVarIndex = -1;
339  Log() << kDEBUG << "Using Last variable from array : " << dsi.GetVariableInfo(i).GetInternalName() << Endl;
340  }
341  }
342 
343  }
344 
345  //
346  // targets
347  //
348  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform regression targets" << Endl;
349  for (formIt = fTargetFormulas.begin(), formItEnd = fTargetFormulas.end(); formIt!=formItEnd; ++formIt) if (*formIt) delete *formIt;
350  fTargetFormulas.clear();
351  for (UInt_t i=0; i<dsi.GetNTargets(); i++) {
352  ttf = new TTreeFormula( Form( "Formula%s", dsi.GetTargetInfo(i).GetInternalName().Data() ),
353  dsi.GetTargetInfo(i).GetExpression().Data(), tr );
354  CheckTTreeFormula( ttf, dsi.GetTargetInfo(i).GetExpression(), hasDollar );
355  fTargetFormulas.push_back( ttf );
356  }
357 
358  //
359  // spectators
360  //
361  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform spectator variables" << Endl;
362  for (formIt = fSpectatorFormulas.begin(), formItEnd = fSpectatorFormulas.end(); formIt!=formItEnd; ++formIt) if (*formIt) delete *formIt;
363  fSpectatorFormulas.clear();
364  for (UInt_t i=0; i<dsi.GetNSpectators(); i++) {
365  ttf = new TTreeFormula( Form( "Formula%s", dsi.GetSpectatorInfo(i).GetInternalName().Data() ),
366  dsi.GetSpectatorInfo(i).GetExpression().Data(), tr );
367  CheckTTreeFormula( ttf, dsi.GetSpectatorInfo(i).GetExpression(), hasDollar );
368  fSpectatorFormulas.push_back( ttf );
369  }
370 
371  //
372  // the cuts (one per class, if non-existent: formula pointer = 0)
373  //
374  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform cuts" << Endl;
375  for (formIt = fCutFormulas.begin(), formItEnd = fCutFormulas.end(); formIt!=formItEnd; ++formIt) if (*formIt) delete *formIt;
376  fCutFormulas.clear();
377  for (UInt_t clIdx=0; clIdx<dsi.GetNClasses(); clIdx++) {
378  const TCut& tmpCut = dsi.GetClassInfo(clIdx)->GetCut();
379  const TString tmpCutExp(tmpCut.GetTitle());
380  ttf = 0;
381  if (tmpCutExp!="") {
382  ttf = new TTreeFormula( Form("CutClass%i",clIdx), tmpCutExp, tr );
383  Bool_t worked = CheckTTreeFormula( ttf, tmpCutExp, hasDollar );
384  if( !worked ){
385  Log() << kWARNING << "Please check class \"" << dsi.GetClassInfo(clIdx)->GetName()
386  << "\" cut \"" << dsi.GetClassInfo(clIdx)->GetCut() << Endl;
387  }
388  }
389  fCutFormulas.push_back( ttf );
390  }
391 
392  //
393  // the weights (one per class, if non-existent: formula pointer = 0)
394  //
395  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform weights" << Endl;
396  for (formIt = fWeightFormula.begin(), formItEnd = fWeightFormula.end(); formIt!=formItEnd; ++formIt) if (*formIt) delete *formIt;
397  fWeightFormula.clear();
398  for (UInt_t clIdx=0; clIdx<dsi.GetNClasses(); clIdx++) {
399  const TString tmpWeight = dsi.GetClassInfo(clIdx)->GetWeight();
400 
401  if (dsi.GetClassInfo(clIdx)->GetName() != tinfo.GetClassName() ) { // if the tree is of another class
402  fWeightFormula.push_back( 0 );
403  continue;
404  }
405 
406  ttf = 0;
407  if (tmpWeight!="") {
408  ttf = new TTreeFormula( "FormulaWeight", tmpWeight, tr );
409  Bool_t worked = CheckTTreeFormula( ttf, tmpWeight, hasDollar );
410  if( !worked ){
411  Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName()) << "Please check class \"" << dsi.GetClassInfo(clIdx)->GetName()
412  << "\" weight \"" << dsi.GetClassInfo(clIdx)->GetWeight() << Endl;
413  }
414  }
415  else {
416  ttf = 0;
417  }
418  fWeightFormula.push_back( ttf );
419  }
420  return;
421  // all this code below is not needed when using TTReeFormula
422 
423  Log() << kDEBUG << Form("Dataset[%s] : ", dsi.GetName()) << "enable branches" << Endl;
424  // now enable only branches that are needed in any input formula, target, cut, weight
425 
426  if (!hasDollar) {
427  tr->SetBranchStatus("*",0);
428  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: input variables" << Endl;
429  // input vars
430  for (formIt = fInputFormulas.begin(); formIt!=fInputFormulas.end(); ++formIt) {
431  ttf = *formIt;
432  for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++) {
433  tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
434  }
435  }
436  // targets
437  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: targets" << Endl;
438  for (formIt = fTargetFormulas.begin(); formIt!=fTargetFormulas.end(); ++formIt) {
439  ttf = *formIt;
440  for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
441  tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
442  }
443  // spectators
444  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: spectators" << Endl;
445  for (formIt = fSpectatorFormulas.begin(); formIt!=fSpectatorFormulas.end(); ++formIt) {
446  ttf = *formIt;
447  for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
448  tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
449  }
450  // cuts
451  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: cuts" << Endl;
452  for (formIt = fCutFormulas.begin(); formIt!=fCutFormulas.end(); ++formIt) {
453  ttf = *formIt;
454  if (!ttf) continue;
455  for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
456  tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
457  }
458  // weights
459  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: weights" << Endl;
460  for (formIt = fWeightFormula.begin(); formIt!=fWeightFormula.end(); ++formIt) {
461  ttf = *formIt;
462  if (!ttf) continue;
463  for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
464  tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
465  }
466  }
467  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "tree initialized" << Endl;
468  return;
469 }
470 
471 ////////////////////////////////////////////////////////////////////////////////
472 /// compute covariance matrix
473 
475 {
476  const UInt_t nvar = ds->GetNVariables();
477  const UInt_t ntgts = ds->GetNTargets();
478  const UInt_t nvis = ds->GetNSpectators();
479 
480  Float_t *min = new Float_t[nvar];
481  Float_t *max = new Float_t[nvar];
482  Float_t *tgmin = new Float_t[ntgts];
483  Float_t *tgmax = new Float_t[ntgts];
484  Float_t *vmin = new Float_t[nvis];
485  Float_t *vmax = new Float_t[nvis];
486 
487  for (UInt_t ivar=0; ivar<nvar ; ivar++) { min[ivar] = FLT_MAX; max[ivar] = -FLT_MAX; }
488  for (UInt_t ivar=0; ivar<ntgts; ivar++) { tgmin[ivar] = FLT_MAX; tgmax[ivar] = -FLT_MAX; }
489  for (UInt_t ivar=0; ivar<nvis; ivar++) { vmin[ivar] = FLT_MAX; vmax[ivar] = -FLT_MAX; }
490 
491  // perform event loop
492 
493  for (Int_t i=0; i<ds->GetNEvents(); i++) {
494  const Event * ev = ds->GetEvent(i);
495  for (UInt_t ivar=0; ivar<nvar; ivar++) {
496  Double_t v = ev->GetValue(ivar);
497  if (v<min[ivar]) min[ivar] = v;
498  if (v>max[ivar]) max[ivar] = v;
499  }
500  for (UInt_t itgt=0; itgt<ntgts; itgt++) {
501  Double_t v = ev->GetTarget(itgt);
502  if (v<tgmin[itgt]) tgmin[itgt] = v;
503  if (v>tgmax[itgt]) tgmax[itgt] = v;
504  }
505  for (UInt_t ivis=0; ivis<nvis; ivis++) {
506  Double_t v = ev->GetSpectator(ivis);
507  if (v<vmin[ivis]) vmin[ivis] = v;
508  if (v>vmax[ivis]) vmax[ivis] = v;
509  }
510  }
511 
512  for (UInt_t ivar=0; ivar<nvar; ivar++) {
513  dsi.GetVariableInfo(ivar).SetMin(min[ivar]);
514  dsi.GetVariableInfo(ivar).SetMax(max[ivar]);
515  if( TMath::Abs(max[ivar]-min[ivar]) <= FLT_MIN )
516  Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName()) << "Variable " << dsi.GetVariableInfo(ivar).GetExpression().Data() << " is constant. Please remove the variable." << Endl;
517  }
518  for (UInt_t ivar=0; ivar<ntgts; ivar++) {
519  dsi.GetTargetInfo(ivar).SetMin(tgmin[ivar]);
520  dsi.GetTargetInfo(ivar).SetMax(tgmax[ivar]);
521  if( TMath::Abs(tgmax[ivar]-tgmin[ivar]) <= FLT_MIN )
522  Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName()) << "Target " << dsi.GetTargetInfo(ivar).GetExpression().Data() << " is constant. Please remove the variable." << Endl;
523  }
524  for (UInt_t ivar=0; ivar<nvis; ivar++) {
525  dsi.GetSpectatorInfo(ivar).SetMin(vmin[ivar]);
526  dsi.GetSpectatorInfo(ivar).SetMax(vmax[ivar]);
527  // if( TMath::Abs(vmax[ivar]-vmin[ivar]) <= FLT_MIN )
528  // Log() << kWARNING << "Spectator variable " << dsi.GetSpectatorInfo(ivar).GetExpression().Data() << " is constant." << Endl;
529  }
530  delete [] min;
531  delete [] max;
532  delete [] tgmin;
533  delete [] tgmax;
534  delete [] vmin;
535  delete [] vmax;
536 }
537 
538 ////////////////////////////////////////////////////////////////////////////////
539 /// computes correlation matrix for variables "theVars" in tree;
540 /// "theType" defines the required event "type"
541 /// ("type" variable must be present in tree)
542 
544 {
545  // first compute variance-covariance
546  TMatrixD* mat = CalcCovarianceMatrix( ds, classNumber );
547 
548  // now the correlation
549  UInt_t nvar = ds->GetNVariables(), ivar, jvar;
550 
551  for (ivar=0; ivar<nvar; ivar++) {
552  for (jvar=0; jvar<nvar; jvar++) {
553  if (ivar != jvar) {
554  Double_t d = (*mat)(ivar, ivar)*(*mat)(jvar, jvar);
555  if (d > 0) (*mat)(ivar, jvar) /= sqrt(d);
556  else {
557  Log() << kWARNING << Form("Dataset[%s] : ",DataSetInfo().GetName())<< "<GetCorrelationMatrix> Zero variances for variables "
558  << "(" << ivar << ", " << jvar << ") = " << d
559  << Endl;
560  (*mat)(ivar, jvar) = 0;
561  }
562  }
563  }
564  }
565 
566  for (ivar=0; ivar<nvar; ivar++) (*mat)(ivar, ivar) = 1.0;
567 
568  return mat;
569 }
570 
571 ////////////////////////////////////////////////////////////////////////////////
572 /// compute covariance matrix
573 
575 {
576  UInt_t nvar = ds->GetNVariables();
577  UInt_t ivar = 0, jvar = 0;
578 
579  TMatrixD* mat = new TMatrixD( nvar, nvar );
580 
581  // init matrices
582  TVectorD vec(nvar);
583  TMatrixD mat2(nvar, nvar);
584  for (ivar=0; ivar<nvar; ivar++) {
585  vec(ivar) = 0;
586  for (jvar=0; jvar<nvar; jvar++) mat2(ivar, jvar) = 0;
587  }
588 
589  // perform event loop
590  Double_t ic = 0;
591  for (Int_t i=0; i<ds->GetNEvents(); i++) {
592 
593  const Event * ev = ds->GetEvent(i);
594  if (ev->GetClass() != classNumber ) continue;
595 
596  Double_t weight = ev->GetWeight();
597  ic += weight; // count used events
598 
599  for (ivar=0; ivar<nvar; ivar++) {
600 
601  Double_t xi = ev->GetValue(ivar);
602  vec(ivar) += xi*weight;
603  mat2(ivar, ivar) += (xi*xi*weight);
604 
605  for (jvar=ivar+1; jvar<nvar; jvar++) {
606  Double_t xj = ev->GetValue(jvar);
607  mat2(ivar, jvar) += (xi*xj*weight);
608  }
609  }
610  }
611 
612  for (ivar=0; ivar<nvar; ivar++)
613  for (jvar=ivar+1; jvar<nvar; jvar++)
614  mat2(jvar, ivar) = mat2(ivar, jvar); // symmetric matrix
615 
616 
617  // variance-covariance
618  for (ivar=0; ivar<nvar; ivar++) {
619  for (jvar=0; jvar<nvar; jvar++) {
620  (*mat)(ivar, jvar) = mat2(ivar, jvar)/ic - vec(ivar)*vec(jvar)/(ic*ic);
621  }
622  }
623 
624  return mat;
625 }
626 
627 // --------------------------------------- new versions
628 
629 ////////////////////////////////////////////////////////////////////////////////
630 /// the dataset splitting
631 
632 void
634  EvtStatsPerClass& nEventRequests,
635  TString& normMode,
636  UInt_t& splitSeed,
637  TString& splitMode,
638  TString& mixMode)
639 {
640  Configurable splitSpecs( dsi.GetSplitOptions() );
641  splitSpecs.SetConfigName("DataSetFactory");
642  splitSpecs.SetConfigDescription( "Configuration options given in the \"PrepareForTrainingAndTesting\" call; these options define the creation of the data sets used for training and expert validation by TMVA" );
643 
644  splitMode = "Random"; // the splitting mode
645  splitSpecs.DeclareOptionRef( splitMode, "SplitMode",
646  "Method of picking training and testing events (default: random)" );
647  splitSpecs.AddPreDefVal(TString("Random"));
648  splitSpecs.AddPreDefVal(TString("Alternate"));
649  splitSpecs.AddPreDefVal(TString("Block"));
650 
651  mixMode = "SameAsSplitMode"; // the splitting mode
652  splitSpecs.DeclareOptionRef( mixMode, "MixMode",
653  "Method of mixing events of different classes into one dataset (default: SameAsSplitMode)" );
654  splitSpecs.AddPreDefVal(TString("SameAsSplitMode"));
655  splitSpecs.AddPreDefVal(TString("Random"));
656  splitSpecs.AddPreDefVal(TString("Alternate"));
657  splitSpecs.AddPreDefVal(TString("Block"));
658 
659  splitSeed = 100;
660  splitSpecs.DeclareOptionRef( splitSeed, "SplitSeed",
661  "Seed for random event shuffling" );
662 
663  normMode = "EqualNumEvents"; // the weight normalisation modes
664  splitSpecs.DeclareOptionRef( normMode, "NormMode",
665  "Overall renormalisation of event-by-event weights used in the training (NumEvents: average weight of 1 per event, independently for signal and background; EqualNumEvents: average weight of 1 per event for signal, and sum of weights for background equal to sum of weights for signal)" );
666  splitSpecs.AddPreDefVal(TString("None"));
667  splitSpecs.AddPreDefVal(TString("NumEvents"));
668  splitSpecs.AddPreDefVal(TString("EqualNumEvents"));
669 
670  splitSpecs.DeclareOptionRef(fScaleWithPreselEff=kFALSE,"ScaleWithPreselEff","Scale the number of requested events by the eff. of the preselection cuts (or not)" );
671 
672  // the number of events
673 
674  // fill in the numbers
675  for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
676  TString clName = dsi.GetClassInfo(cl)->GetName();
677  TString titleTrain = TString().Format("Number of training events of class %s (default: 0 = all)",clName.Data()).Data();
678  TString titleTest = TString().Format("Number of test events of class %s (default: 0 = all)",clName.Data()).Data();
679  TString titleSplit = TString().Format("Split in training and test events of class %s (default: 0 = deactivated)",clName.Data()).Data();
680 
681  splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTrainingEventsRequested, TString("nTrain_")+clName, titleTrain );
682  splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTestingEventsRequested , TString("nTest_")+clName , titleTest );
683  splitSpecs.DeclareOptionRef( nEventRequests.at(cl).TrainTestSplitRequested , TString("TrainTestSplit_")+clName , titleTest );
684  }
685 
686  splitSpecs.DeclareOptionRef( fVerbose, "V", "Verbosity (default: true)" );
687 
688  splitSpecs.DeclareOptionRef( fVerboseLevel=TString("Info"), "VerboseLevel", "VerboseLevel (Debug/Verbose/Info)" );
689  splitSpecs.AddPreDefVal(TString("Debug"));
690  splitSpecs.AddPreDefVal(TString("Verbose"));
691  splitSpecs.AddPreDefVal(TString("Info"));
692 
693  fCorrelations = kTRUE;
694  splitSpecs.DeclareOptionRef(fCorrelations, "Correlations", "Boolean to show correlation output (Default: true)");
695  fComputeCorrelations = kTRUE;
696  splitSpecs.DeclareOptionRef(fComputeCorrelations, "CalcCorrelations", "Compute correlations and also some variable statistics, e.g. min/max (Default: true )");
697 
698  splitSpecs.ParseOptions();
699  splitSpecs.CheckForUnusedOptions();
700 
701  // output logging verbosity
702  if (Verbose()) fLogger->SetMinType( kVERBOSE );
703  if (fVerboseLevel.CompareTo("Debug") ==0) fLogger->SetMinType( kDEBUG );
704  if (fVerboseLevel.CompareTo("Verbose") ==0) fLogger->SetMinType( kVERBOSE );
705  if (fVerboseLevel.CompareTo("Info") ==0) fLogger->SetMinType( kINFO );
706 
707  // put all to upper case
708  splitMode.ToUpper(); mixMode.ToUpper(); normMode.ToUpper();
709  // adjust mixmode if same as splitmode option has been set
710  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
711  << "\tSplitmode is: \"" << splitMode << "\" the mixmode is: \"" << mixMode << "\"" << Endl;
712  if (mixMode=="SAMEASSPLITMODE") mixMode = splitMode;
713  else if (mixMode!=splitMode)
714  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "DataSet splitmode="<<splitMode
715  <<" differs from mixmode="<<mixMode<<Endl;
716 }
717 
718 ////////////////////////////////////////////////////////////////////////////////
719 /// build empty event vectors
720 /// distributes events between kTraining/kTesting/kMaxTreeType
721 
722 void
724  TMVA::DataInputHandler& dataInput,
726  EvtStatsPerClass& eventCounts)
727 {
728  const UInt_t nclasses = dsi.GetNClasses();
729 
730  eventsmap[ Types::kTraining ] = EventVectorOfClasses(nclasses);
731  eventsmap[ Types::kTesting ] = EventVectorOfClasses(nclasses);
732  eventsmap[ Types::kMaxTreeType ] = EventVectorOfClasses(nclasses);
733 
734  // create the type, weight and boostweight branches
735  const UInt_t nvars = dsi.GetNVariables();
736  const UInt_t ntgts = dsi.GetNTargets();
737  const UInt_t nvis = dsi.GetNSpectators();
738 
739  for (size_t i=0; i<nclasses; i++) {
740  eventCounts[i].varAvLength = new Float_t[nvars];
741  for (UInt_t ivar=0; ivar<nvars; ivar++)
742  eventCounts[i].varAvLength[ivar] = 0;
743  }
744 
745  //Bool_t haveArrayVariable = kFALSE;
746  //Bool_t *varIsArray = new Bool_t[nvars];
747 
748  // If there are NaNs in the tree:
749  // => warn if used variables/cuts/weights contain nan (no problem if event is cut out)
750  // => fatal if cut value is nan or (event not cut out and nans somewhere)
751  // Count & collect all these warnings/errors and output them at the end.
752  std::map<TString, int> nanInfWarnings;
753  std::map<TString, int> nanInfErrors;
754 
755  // if we work with chains we need to remember the current tree if
756  // the chain jumps to a new tree we have to reset the formulas
757  for (UInt_t cl=0; cl<nclasses; cl++) {
758 
759  //Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Create training and testing trees -- looping over class \"" << dsi.GetClassInfo(cl)->GetName() << "\" ..." << Endl;
760 
761  EventStats& classEventCounts = eventCounts[cl];
762 
763  // info output for weights
764  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
765  << "\tWeight expression for class \'" << dsi.GetClassInfo(cl)->GetName() << "\': \""
766  << dsi.GetClassInfo(cl)->GetWeight() << "\"" << Endl;
767 
768  // used for chains only
769  TString currentFileName("");
770 
771  std::vector<TreeInfo>::const_iterator treeIt(dataInput.begin(dsi.GetClassInfo(cl)->GetName()));
772  for (;treeIt!=dataInput.end(dsi.GetClassInfo(cl)->GetName()); ++treeIt) {
773 
774  // read first the variables
775  std::vector<Float_t> vars(nvars);
776  std::vector<Float_t> tgts(ntgts);
777  std::vector<Float_t> vis(nvis);
778  TreeInfo currentInfo = *treeIt;
779 
780  Log() << kINFO << "Building event vectors for type " << currentInfo.GetTreeType() << " " << currentInfo.GetClassName() << Endl;
781 
782  EventVector& event_v = eventsmap[currentInfo.GetTreeType()].at(cl);
783 
784  Bool_t isChain = (TString("TChain") == currentInfo.GetTree()->ClassName());
785  currentInfo.GetTree()->LoadTree(0);
786  // create the TTReeFormula to evalute later on on each single event
787  ChangeToNewTree( currentInfo, dsi );
788 
789  // count number of events in tree before cut
790  classEventCounts.nInitialEvents += currentInfo.GetTree()->GetEntries();
791 
792  // flag to control a warning message when size of array in disk are bigger than what requested
793  Bool_t foundLargerArraySize = kFALSE;
794 
795  // loop over events in ntuple
796  const UInt_t nEvts = currentInfo.GetTree()->GetEntries();
797  for (Long64_t evtIdx = 0; evtIdx < nEvts; evtIdx++) {
798  currentInfo.GetTree()->LoadTree(evtIdx);
799 
800  // may need to reload tree in case of chains
801  if (isChain) {
802  if (currentInfo.GetTree()->GetTree()->GetDirectory()->GetFile()->GetName() != currentFileName) {
803  currentFileName = currentInfo.GetTree()->GetTree()->GetDirectory()->GetFile()->GetName();
804  ChangeToNewTree( currentInfo, dsi );
805  }
806  }
807  currentInfo.GetTree()->GetEntry(evtIdx);
808  Int_t sizeOfArrays = 1;
809  Int_t prevArrExpr = 0;
810  Bool_t haveAllArrayData = kFALSE;
811 
812  // ======= evaluate all formulas =================
813 
814  // first we check if some of the formulas are arrays
815  // This is the case when all inputs (variables, targets and spectetors are array and a TMVA event is not
816  // an event of the tree but an event + array index). In this case we set the flag haveAllArrayData = true
817  // Otherwise we support for arrays of variables where each
818  // element of the array corresponds to a different variable like in the case of image
819  // In that case the VAriableInfo has a bit, IsVariableFromArray that is set and we have a single formula for the array
820  // fInputFormulaTable contains a map of the formula and the variable index to evaluate the formula
821  for (UInt_t ivar = 0; ivar < nvars; ivar++) {
822  // distinguish case where variable is not from an array
823  if (dsi.IsVariableFromArray(ivar)) continue;
824  auto inputFormula = fInputTableFormulas[ivar].first;
825 
826  Int_t ndata = inputFormula->GetNdata();
827 
828  classEventCounts.varAvLength[ivar] += ndata;
829  if (ndata == 1) continue;
830  haveAllArrayData = kTRUE;
831  //varIsArray[ivar] = kTRUE;
832  //std::cout << "Found array !!!" << std::endl;
833  if (sizeOfArrays == 1) {
834  sizeOfArrays = ndata;
835  prevArrExpr = ivar;
836  }
837  else if (sizeOfArrays!=ndata) {
838  Log() << kERROR << Form("Dataset[%s] : ",dsi.GetName())<< "ERROR while preparing training and testing trees:" << Endl;
839  Log() << Form("Dataset[%s] : ",dsi.GetName())<< " multiple array-type expressions of different length were encountered" << Endl;
840  Log() << Form("Dataset[%s] : ",dsi.GetName())<< " location of error: event " << evtIdx
841  << " in tree " << currentInfo.GetTree()->GetName()
842  << " of file " << currentInfo.GetTree()->GetCurrentFile()->GetName() << Endl;
843  Log() << Form("Dataset[%s] : ",dsi.GetName())<< " expression " << inputFormula->GetTitle() << " has "
844  << Form("Dataset[%s] : ",dsi.GetName()) << ndata << " entries, while" << Endl;
845  Log() << Form("Dataset[%s] : ",dsi.GetName())<< " expression " << fInputTableFormulas[prevArrExpr].first->GetTitle() << " has "
846  << Form("Dataset[%s] : ",dsi.GetName())<< fInputTableFormulas[prevArrExpr].first->GetNdata() << " entries" << Endl;
847  Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "Need to abort" << Endl;
848  }
849  }
850 
851  // now we read the information
852  for (Int_t idata = 0; idata<sizeOfArrays; idata++) {
853  Bool_t contains_NaN_or_inf = kFALSE;
854 
855  auto checkNanInf = [&](std::map<TString, int> &msgMap, Float_t value, const char *what, const char *formulaTitle) {
856  if (TMath::IsNaN(value)) {
857  contains_NaN_or_inf = kTRUE;
858  ++msgMap[TString::Format("Dataset[%s] : %s expression resolves to indeterminate value (NaN): %s", dsi.GetName(), what, formulaTitle)];
859  } else if (!TMath::Finite(value)) {
860  contains_NaN_or_inf = kTRUE;
861  ++msgMap[TString::Format("Dataset[%s] : %s expression resolves to infinite value (+inf or -inf): %s", dsi.GetName(), what, formulaTitle)];
862  }
863  };
864 
865  TTreeFormula* formula = 0;
866 
867  // the cut expression
868  Double_t cutVal = 1.;
869  formula = fCutFormulas[cl];
870  if (formula) {
871  Int_t ndata = formula->GetNdata();
872  cutVal = (ndata==1 ?
873  formula->EvalInstance(0) :
874  formula->EvalInstance(idata));
875  checkNanInf(nanInfErrors, cutVal, "Cut", formula->GetTitle());
876  }
877 
878  // if event is cut out, add to warnings, else add to errors.
879  auto &nanMessages = cutVal < 0.5 ? nanInfWarnings : nanInfErrors;
880 
881  // the input variable
882  for (UInt_t ivar=0; ivar<nvars; ivar++) {
883  auto formulaMap = fInputTableFormulas[ivar];
884  formula = formulaMap.first;
885  int inputVarIndex = formulaMap.second;
886  // check fomula ndata size (in case of arrays variable)
887  // enough to check for ivarindex = 0 then formula is the same
888  // this check might take some time. Maybe do only in debug mode
889  if (inputVarIndex == 0 && dsi.IsVariableFromArray(ivar)) {
890  Int_t ndata = formula->GetNdata();
891  Int_t arraySize = dsi.GetVarArraySize(dsi.GetVariableInfo(ivar).GetExpression());
892  if (ndata < arraySize) {
893  Log() << kFATAL << "Size of array " << dsi.GetVariableInfo(ivar).GetExpression()
894  << " in the current tree " << currentInfo.GetTree()->GetName() << " for the event " << evtIdx
895  << " is " << ndata << " instead of " << arraySize << Endl;
896  } else if (ndata > arraySize && !foundLargerArraySize) {
897  Log() << kWARNING << "Size of array " << dsi.GetVariableInfo(ivar).GetExpression()
898  << " in the current tree " << currentInfo.GetTree()->GetName() << " for the event "
899  << evtIdx << " is " << ndata << ", larger than " << arraySize << Endl;
900  Log() << kWARNING << "Some data will then be ignored. This WARNING is printed only once, "
901  << " check in case for the other variables and events " << Endl;
902  // note that following warnings will be suppressed
903  foundLargerArraySize = kTRUE;
904  }
905  }
906  formula->SetQuickLoad(true); // is this needed ???
907 
908  vars[ivar] = ( !haveAllArrayData ?
909  formula->EvalInstance(inputVarIndex) :
910  formula->EvalInstance(idata));
911  checkNanInf(nanMessages, vars[ivar], "Input", formula->GetTitle());
912  }
913 
914  // the targets
915  for (UInt_t itrgt=0; itrgt<ntgts; itrgt++) {
916  formula = fTargetFormulas[itrgt];
917  Int_t ndata = formula->GetNdata();
918  tgts[itrgt] = (ndata == 1 ?
919  formula->EvalInstance(0) :
920  formula->EvalInstance(idata));
921  checkNanInf(nanMessages, tgts[itrgt], "Target", formula->GetTitle());
922  }
923 
924  // the spectators
925  for (UInt_t itVis=0; itVis<nvis; itVis++) {
926  formula = fSpectatorFormulas[itVis];
927  Int_t ndata = formula->GetNdata();
928  vis[itVis] = (ndata == 1 ?
929  formula->EvalInstance(0) :
930  formula->EvalInstance(idata));
931  checkNanInf(nanMessages, vis[itVis], "Spectator", formula->GetTitle());
932  }
933 
934 
935  // the weight
936  Float_t weight = currentInfo.GetWeight(); // multiply by tree weight
937  formula = fWeightFormula[cl];
938  if (formula!=0) {
939  Int_t ndata = formula->GetNdata();
940  weight *= (ndata == 1 ?
941  formula->EvalInstance() :
942  formula->EvalInstance(idata));
943  checkNanInf(nanMessages, weight, "Weight", formula->GetTitle());
944  }
945 
946  // Count the events before rejection due to cut or NaN
947  // value (weighted and unweighted)
948  classEventCounts.nEvBeforeCut++;
949  if (!TMath::IsNaN(weight))
950  classEventCounts.nWeEvBeforeCut += weight;
951 
952  // apply the cut, skip rest if cut is not fulfilled
953  if (cutVal<0.5) continue;
954 
955  // global flag if negative weights exist -> can be used
956  // by classifiers who may require special data
957  // treatment (also print warning)
958  if (weight < 0) classEventCounts.nNegWeights++;
959 
960  // now read the event-values (variables and regression targets)
961 
962  if (contains_NaN_or_inf) {
963  Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "NaN or +-inf in Event " << evtIdx << Endl;
964  if (sizeOfArrays>1) Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< " rejected" << Endl;
965  continue;
966  }
967 
968  // Count the events after rejection due to cut or NaN value
969  // (weighted and unweighted)
970  classEventCounts.nEvAfterCut++;
971  classEventCounts.nWeEvAfterCut += weight;
972 
973  // event accepted, fill temporary ntuple
974  event_v.push_back(new Event(vars, tgts , vis, cl , weight));
975  }
976  }
977  currentInfo.GetTree()->ResetBranchAddresses();
978  }
979  }
980 
981  if (!nanInfWarnings.empty()) {
982  Log() << kWARNING << "Found events with NaN and/or +-inf values" << Endl;
983  for (const auto &warning : nanInfWarnings) {
984  auto &log = Log() << kWARNING << warning.first;
985  if (warning.second > 1) log << " (" << warning.second << " times)";
986  log << Endl;
987  }
988  Log() << kWARNING << "These NaN and/or +-infs were all removed by the specified cut, continuing." << Endl;
989  Log() << Endl;
990  }
991 
992  if (!nanInfErrors.empty()) {
993  Log() << kWARNING << "Found events with NaN and/or +-inf values (not removed by cut)" << Endl;
994  for (const auto &error : nanInfErrors) {
995  auto &log = Log() << kWARNING << error.first;
996  if (error.second > 1) log << " (" << error.second << " times)";
997  log << Endl;
998  }
999  Log() << kFATAL << "How am I supposed to train a NaN or +-inf?!" << Endl;
1000  }
1001 
1002  // for output format, get the maximum class name length
1003  Int_t maxL = dsi.GetClassNameMaxLength();
1004 
1005  Log() << kHEADER << Form("[%s] : ",dsi.GetName()) << "Number of events in input trees" << Endl;
1006  Log() << kDEBUG << "(after possible flattening of arrays):" << Endl;
1007 
1008 
1009  for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
1010  Log() << kDEBUG //<< Form("[%s] : ",dsi.GetName())
1011  << " "
1012  << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
1013  << " -- number of events : "
1014  << std::setw(5) << eventCounts[cl].nEvBeforeCut
1015  << " / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvBeforeCut << Endl;
1016  }
1017 
1018  for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
1019  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1020  << " " << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
1021  <<" tree -- total number of entries: "
1022  << std::setw(5) << dataInput.GetEntries(dsi.GetClassInfo(cl)->GetName()) << Endl;
1023  }
1024 
1025  if (fScaleWithPreselEff)
1026  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1027  << "\tPreselection: (will affect number of requested training and testing events)" << Endl;
1028  else
1029  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1030  << "\tPreselection: (will NOT affect number of requested training and testing events)" << Endl;
1031 
1032  if (dsi.HasCuts()) {
1033  for (UInt_t cl = 0; cl< dsi.GetNClasses(); cl++) {
1034  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " " << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
1035  << " requirement: \"" << dsi.GetClassInfo(cl)->GetCut() << "\"" << Endl;
1036  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " "
1037  << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
1038  << " -- number of events passed: "
1039  << std::setw(5) << eventCounts[cl].nEvAfterCut
1040  << " / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvAfterCut << Endl;
1041  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " "
1042  << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
1043  << " -- efficiency : "
1044  << std::setw(6) << eventCounts[cl].nWeEvAfterCut/eventCounts[cl].nWeEvBeforeCut << Endl;
1045  }
1046  }
1047  else Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1048  << " No preselection cuts applied on event classes" << Endl;
1049 
1050  //delete[] varIsArray;
1051 
1052 }
1053 
1054 ////////////////////////////////////////////////////////////////////////////////
1055 /// Select and distribute unassigned events to kTraining and kTesting
1056 
1059  EventVectorOfClassesOfTreeType& tmpEventVector,
1060  EvtStatsPerClass& eventCounts,
1061  const TString& splitMode,
1062  const TString& mixMode,
1063  const TString& normMode,
1064  UInt_t splitSeed)
1065 {
1066  TMVA::RandomGenerator<TRandom3> rndm(splitSeed);
1067 
1068  // ==== splitting of undefined events to kTraining and kTesting
1069 
1070  // if splitMode contains "RANDOM", then shuffle the undefined events
1071  if (splitMode.Contains( "RANDOM" ) /*&& !emptyUndefined*/ ) {
1072  // random shuffle the undefined events of each class
1073  for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
1074  EventVector& unspecifiedEvents = tmpEventVector[Types::kMaxTreeType].at(cls);
1075  if( ! unspecifiedEvents.empty() ) {
1076  Log() << kDEBUG << "randomly shuffling "
1077  << unspecifiedEvents.size()
1078  << " events of class " << cls
1079  << " which are not yet associated to testing or training" << Endl;
1080  std::shuffle(unspecifiedEvents.begin(), unspecifiedEvents.end(), rndm);
1081  }
1082  }
1083  }
1084 
1085  // check for each class the number of training and testing events, the requested number and the available number
1086  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "SPLITTING ========" << Endl;
1087  for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
1088  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "---- class " << cls << Endl;
1089  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "check number of training/testing events, requested and available number of events and for class " << cls << Endl;
1090 
1091  // check if enough or too many events are already in the training/testing eventvectors of the class cls
1092  EventVector& eventVectorTraining = tmpEventVector[ Types::kTraining ].at(cls);
1093  EventVector& eventVectorTesting = tmpEventVector[ Types::kTesting ].at(cls);
1094  EventVector& eventVectorUndefined = tmpEventVector[ Types::kMaxTreeType ].at(cls);
1095 
1096  Int_t availableTraining = eventVectorTraining.size();
1097  Int_t availableTesting = eventVectorTesting.size();
1098  Int_t availableUndefined = eventVectorUndefined.size();
1099 
1100  Float_t presel_scale;
1101  if (fScaleWithPreselEff) {
1102  presel_scale = eventCounts[cls].cutScaling();
1103  if (presel_scale < 1)
1104  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " you have opted for scaling the number of requested training/testing events\n to be scaled by the preselection efficiency"<< Endl;
1105  }else{
1106  presel_scale = 1.; // this scaling was too confusing to most people, including me! Sorry... (Helge)
1107  if (eventCounts[cls].cutScaling() < 1)
1108  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " you have opted for interpreting the requested number of training/testing events\n to be the number of events AFTER your preselection cuts" << Endl;
1109 
1110  }
1111 
1112  // If TrainTestSplit_<class> is set, set number of requested training events to split*num_all_events
1113  // Requested number of testing events is set to zero and therefore takes all other events
1114  // The option TrainTestSplit_<class> overrides nTrain_<class> or nTest_<class>
1115  if(eventCounts[cls].TrainTestSplitRequested < 1.0 && eventCounts[cls].TrainTestSplitRequested > 0.0){
1116  eventCounts[cls].nTrainingEventsRequested = Int_t(eventCounts[cls].TrainTestSplitRequested*(availableTraining+availableTesting+availableUndefined));
1117  eventCounts[cls].nTestingEventsRequested = Int_t(0);
1118  }
1119  else if(eventCounts[cls].TrainTestSplitRequested != 0.0) Log() << kFATAL << Form("The option TrainTestSplit_<class> has to be in range (0, 1] but is set to %f.",eventCounts[cls].TrainTestSplitRequested) << Endl;
1120  Int_t requestedTraining = Int_t(eventCounts[cls].nTrainingEventsRequested * presel_scale);
1121  Int_t requestedTesting = Int_t(eventCounts[cls].nTestingEventsRequested * presel_scale);
1122 
1123  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "events in training trees : " << availableTraining << Endl;
1124  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "events in testing trees : " << availableTesting << Endl;
1125  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "events in unspecified trees : " << availableUndefined << Endl;
1126  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "requested for training : " << requestedTraining << Endl;;
1127 
1128  if(presel_scale<1)
1129  Log() << " ( " << eventCounts[cls].nTrainingEventsRequested
1130  << " * " << presel_scale << " preselection efficiency)" << Endl;
1131  else
1132  Log() << Endl;
1133  Log() << kDEBUG << "requested for testing : " << requestedTesting;
1134  if(presel_scale<1)
1135  Log() << " ( " << eventCounts[cls].nTestingEventsRequested
1136  << " * " << presel_scale << " preselection efficiency)" << Endl;
1137  else
1138  Log() << Endl;
1139 
1140  // nomenclature r = available training
1141  // s = available testing
1142  // u = available undefined
1143  // R = requested training
1144  // S = requested testing
1145  // nR = to be used to select training events
1146  // nS = to be used to select test events
1147  // we have the constraint: nR + nS < r+s+u,
1148  // since we can not use more events than we have
1149  // free events: Nfree = u-Thet(R-r)-Thet(S-s)
1150  // nomenclature: Thet(x) = x, if x>0 else 0
1151  // nR = max(R,r) + 0.5 * Nfree
1152  // nS = max(S,s) + 0.5 * Nfree
1153  // nR +nS = R+S + u-R+r-S+s = u+r+s= ok! for R>r
1154  // nR +nS = r+S + u-S+s = u+r+s= ok! for r>R
1155 
1156  // three different cases might occur here
1157  //
1158  // Case a
1159  // requestedTraining and requestedTesting >0
1160  // free events: Nfree = u-Thet(R-r)-Thet(S-s)
1161  // nR = Max(R,r) + 0.5 * Nfree
1162  // nS = Max(S,s) + 0.5 * Nfree
1163  //
1164  // Case b
1165  // exactly one of requestedTraining or requestedTesting >0
1166  // assume training R >0
1167  // nR = max(R,r)
1168  // nS = s+u+r-nR
1169  // and s=nS
1170  //
1171  // Case c
1172  // requestedTraining=0, requestedTesting=0
1173  // Nfree = u-|r-s|
1174  // if NFree >=0
1175  // R = Max(r,s) + 0.5 * Nfree = S
1176  // else if r>s
1177  // R = r; S=s+u
1178  // else
1179  // R = r+u; S=s
1180  //
1181  // Next steps:
1182  // Determination of Event numbers R,S, nR, nS
1183  // distribute undefined events according to nR, nS
1184  // finally determine actual sub samples from nR and nS to be used in training / testing
1185  //
1186 
1187  Int_t useForTesting(0),useForTraining(0);
1188  Int_t allAvailable(availableUndefined + availableTraining + availableTesting);
1189 
1190  if( (requestedTraining == 0) && (requestedTesting == 0)){
1191 
1192  // Case C: balance the number of training and testing events
1193 
1194  if ( availableUndefined >= TMath::Abs(availableTraining - availableTesting) ) {
1195  // enough unspecified are available to equal training and testing
1196  useForTraining = useForTesting = allAvailable/2;
1197  } else {
1198  // all unspecified are assigned to the smaller of training / testing
1199  useForTraining = availableTraining;
1200  useForTesting = availableTesting;
1201  if (availableTraining < availableTesting)
1202  useForTraining += availableUndefined;
1203  else
1204  useForTesting += availableUndefined;
1205  }
1206  requestedTraining = useForTraining;
1207  requestedTesting = useForTesting;
1208  }
1209 
1210  else if (requestedTesting == 0){
1211  // case B
1212  useForTraining = TMath::Max(requestedTraining,availableTraining);
1213  if (allAvailable < useForTraining) {
1214  Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "More events requested for training ("
1215  << requestedTraining << ") than available ("
1216  << allAvailable << ")!" << Endl;
1217  }
1218  useForTesting = allAvailable - useForTraining; // the rest
1219  requestedTesting = useForTesting;
1220  }
1221 
1222  else if (requestedTraining == 0){ // case B)
1223  useForTesting = TMath::Max(requestedTesting,availableTesting);
1224  if (allAvailable < useForTesting) {
1225  Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "More events requested for testing ("
1226  << requestedTesting << ") than available ("
1227  << allAvailable << ")!" << Endl;
1228  }
1229  useForTraining= allAvailable - useForTesting; // the rest
1230  requestedTraining = useForTraining;
1231  }
1232 
1233  else {
1234  // Case A
1235  // requestedTraining R and requestedTesting S >0
1236  // free events: Nfree = u-Thet(R-r)-Thet(S-s)
1237  // nR = Max(R,r) + 0.5 * Nfree
1238  // nS = Max(S,s) + 0.5 * Nfree
1239  Int_t stillNeedForTraining = TMath::Max(requestedTraining-availableTraining,0);
1240  Int_t stillNeedForTesting = TMath::Max(requestedTesting-availableTesting,0);
1241 
1242  int NFree = availableUndefined - stillNeedForTraining - stillNeedForTesting;
1243  if (NFree <0) NFree = 0;
1244  useForTraining = TMath::Max(requestedTraining,availableTraining) + NFree/2;
1245  useForTesting= allAvailable - useForTraining; // the rest
1246  }
1247 
1248  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "determined event sample size to select training sample from="<<useForTraining<<Endl;
1249  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "determined event sample size to select test sample from="<<useForTesting<<Endl;
1250 
1251 
1252 
1253  // associate undefined events
1254  if( splitMode == "ALTERNATE" ){
1255  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "split 'ALTERNATE'" << Endl;
1256  Int_t nTraining = availableTraining;
1257  Int_t nTesting = availableTesting;
1258  for( EventVector::iterator it = eventVectorUndefined.begin(), itEnd = eventVectorUndefined.end(); it != itEnd; ){
1259  ++nTraining;
1260  if( nTraining <= requestedTraining ){
1261  eventVectorTraining.insert( eventVectorTraining.end(), (*it) );
1262  ++it;
1263  }
1264  if( it != itEnd ){
1265  ++nTesting;
1266  eventVectorTesting.insert( eventVectorTesting.end(), (*it) );
1267  ++it;
1268  }
1269  }
1270  } else {
1271  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "split '" << splitMode << "'" << Endl;
1272 
1273  // test if enough events are available
1274  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "availableundefined : " << availableUndefined << Endl;
1275  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "useForTraining : " << useForTraining << Endl;
1276  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "useForTesting : " << useForTesting << Endl;
1277  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "availableTraining : " << availableTraining << Endl;
1278  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "availableTesting : " << availableTesting << Endl;
1279 
1280  if( availableUndefined<(useForTraining-availableTraining) ||
1281  availableUndefined<(useForTesting -availableTesting ) ||
1282  availableUndefined<(useForTraining+useForTesting-availableTraining-availableTesting ) ){
1283  Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "More events requested than available!" << Endl;
1284  }
1285 
1286  // select the events
1287  if (useForTraining>availableTraining){
1288  eventVectorTraining.insert( eventVectorTraining.end() , eventVectorUndefined.begin(), eventVectorUndefined.begin()+ useForTraining- availableTraining );
1289  eventVectorUndefined.erase( eventVectorUndefined.begin(), eventVectorUndefined.begin() + useForTraining- availableTraining);
1290  }
1291  if (useForTesting>availableTesting){
1292  eventVectorTesting.insert( eventVectorTesting.end() , eventVectorUndefined.begin(), eventVectorUndefined.begin()+ useForTesting- availableTesting );
1293  }
1294  }
1295  eventVectorUndefined.clear();
1296 
1297  // finally shorten the event vectors to the requested size by removing random events
1298  if (splitMode.Contains( "RANDOM" )){
1299  UInt_t sizeTraining = eventVectorTraining.size();
1300  if( sizeTraining > UInt_t(requestedTraining) ){
1301  std::vector<UInt_t> indicesTraining( sizeTraining );
1302  // make indices
1303  std::generate( indicesTraining.begin(), indicesTraining.end(), TMVA::Increment<UInt_t>(0) );
1304  // shuffle indices
1305  std::shuffle(indicesTraining.begin(), indicesTraining.end(), rndm);
1306  // erase indices of not needed events
1307  indicesTraining.erase( indicesTraining.begin()+sizeTraining-UInt_t(requestedTraining), indicesTraining.end() );
1308  // delete all events with the given indices
1309  for( std::vector<UInt_t>::iterator it = indicesTraining.begin(), itEnd = indicesTraining.end(); it != itEnd; ++it ){
1310  delete eventVectorTraining.at( (*it) ); // delete event
1311  eventVectorTraining.at( (*it) ) = NULL; // set pointer to NULL
1312  }
1313  // now remove and erase all events with pointer==NULL
1314  eventVectorTraining.erase( std::remove( eventVectorTraining.begin(), eventVectorTraining.end(), (void*)NULL ), eventVectorTraining.end() );
1315  }
1316 
1317  UInt_t sizeTesting = eventVectorTesting.size();
1318  if( sizeTesting > UInt_t(requestedTesting) ){
1319  std::vector<UInt_t> indicesTesting( sizeTesting );
1320  // make indices
1321  std::generate( indicesTesting.begin(), indicesTesting.end(), TMVA::Increment<UInt_t>(0) );
1322  // shuffle indices
1323  std::shuffle(indicesTesting.begin(), indicesTesting.end(), rndm);
1324  // erase indices of not needed events
1325  indicesTesting.erase( indicesTesting.begin()+sizeTesting-UInt_t(requestedTesting), indicesTesting.end() );
1326  // delete all events with the given indices
1327  for( std::vector<UInt_t>::iterator it = indicesTesting.begin(), itEnd = indicesTesting.end(); it != itEnd; ++it ){
1328  delete eventVectorTesting.at( (*it) ); // delete event
1329  eventVectorTesting.at( (*it) ) = NULL; // set pointer to NULL
1330  }
1331  // now remove and erase all events with pointer==NULL
1332  eventVectorTesting.erase( std::remove( eventVectorTesting.begin(), eventVectorTesting.end(), (void*)NULL ), eventVectorTesting.end() );
1333  }
1334  }
1335  else { // erase at end
1336  if( eventVectorTraining.size() < UInt_t(requestedTraining) )
1337  Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "DataSetFactory/requested number of training samples larger than size of eventVectorTraining.\n"
1338  << "There is probably an issue. Please contact the TMVA developers." << Endl;
1339  std::for_each( eventVectorTraining.begin()+requestedTraining, eventVectorTraining.end(), DeleteFunctor<Event>() );
1340  eventVectorTraining.erase(eventVectorTraining.begin()+requestedTraining,eventVectorTraining.end());
1341 
1342  if( eventVectorTesting.size() < UInt_t(requestedTesting) )
1343  Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "DataSetFactory/requested number of testing samples larger than size of eventVectorTesting.\n"
1344  << "There is probably an issue. Please contact the TMVA developers." << Endl;
1345  std::for_each( eventVectorTesting.begin()+requestedTesting, eventVectorTesting.end(), DeleteFunctor<Event>() );
1346  eventVectorTesting.erase(eventVectorTesting.begin()+requestedTesting,eventVectorTesting.end());
1347  }
1348  }
1349 
1350  TMVA::DataSetFactory::RenormEvents( dsi, tmpEventVector, eventCounts, normMode );
1351 
1352  Int_t trainingSize = 0;
1353  Int_t testingSize = 0;
1354 
1355  // sum up number of training and testing events
1356  for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
1357  trainingSize += tmpEventVector[Types::kTraining].at(cls).size();
1358  testingSize += tmpEventVector[Types::kTesting].at(cls).size();
1359  }
1360 
1361  // --- collect all training (testing) events into the training (testing) eventvector
1362 
1363  // create event vectors reserve enough space
1364  EventVector* trainingEventVector = new EventVector();
1365  EventVector* testingEventVector = new EventVector();
1366 
1367  trainingEventVector->reserve( trainingSize );
1368  testingEventVector->reserve( testingSize );
1369 
1370 
1371  // collect the events
1372 
1373  // mixing of kTraining and kTesting data sets
1374  Log() << kDEBUG << " MIXING ============= " << Endl;
1375 
1376  if( mixMode == "ALTERNATE" ){
1377  // Inform user if he tries to use alternate mixmode for
1378  // event classes with different number of events, this works but the alternation stops at the last event of the smaller class
1379  for( UInt_t cls = 1; cls < dsi.GetNClasses(); ++cls ){
1380  if (tmpEventVector[Types::kTraining].at(cls).size() != tmpEventVector[Types::kTraining].at(0).size()){
1381  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Training sample: You are trying to mix events in alternate mode although the classes have different event numbers. This works but the alternation stops at the last event of the smaller class."<<Endl;
1382  }
1383  if (tmpEventVector[Types::kTesting].at(cls).size() != tmpEventVector[Types::kTesting].at(0).size()){
1384  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Testing sample: You are trying to mix events in alternate mode although the classes have different event numbers. This works but the alternation stops at the last event of the smaller class."<<Endl;
1385  }
1386  }
1387  typedef EventVector::iterator EvtVecIt;
1388  EvtVecIt itEvent, itEventEnd;
1389 
1390  // insert first class
1391  Log() << kDEBUG << "insert class 0 into training and test vector" << Endl;
1392  trainingEventVector->insert( trainingEventVector->end(), tmpEventVector[Types::kTraining].at(0).begin(), tmpEventVector[Types::kTraining].at(0).end() );
1393  testingEventVector->insert( testingEventVector->end(), tmpEventVector[Types::kTesting].at(0).begin(), tmpEventVector[Types::kTesting].at(0).end() );
1394 
1395  // insert other classes
1396  EvtVecIt itTarget;
1397  for( UInt_t cls = 1; cls < dsi.GetNClasses(); ++cls ){
1398  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "insert class " << cls << Endl;
1399  // training vector
1400  itTarget = trainingEventVector->begin() - 1; // start one before begin
1401  // loop over source
1402  for( itEvent = tmpEventVector[Types::kTraining].at(cls).begin(), itEventEnd = tmpEventVector[Types::kTraining].at(cls).end(); itEvent != itEventEnd; ++itEvent ){
1403  // if( std::distance( itTarget, trainingEventVector->end()) < Int_t(cls+1) ) {
1404  if( (trainingEventVector->end() - itTarget) < Int_t(cls+1) ) {
1405  itTarget = trainingEventVector->end();
1406  trainingEventVector->insert( itTarget, itEvent, itEventEnd ); // fill in the rest without mixing
1407  break;
1408  }else{
1409  itTarget += cls+1;
1410  trainingEventVector->insert( itTarget, (*itEvent) ); // fill event
1411  }
1412  }
1413  // testing vector
1414  itTarget = testingEventVector->begin() - 1;
1415  // loop over source
1416  for( itEvent = tmpEventVector[Types::kTesting].at(cls).begin(), itEventEnd = tmpEventVector[Types::kTesting].at(cls).end(); itEvent != itEventEnd; ++itEvent ){
1417  // if( std::distance( itTarget, testingEventVector->end()) < Int_t(cls+1) ) {
1418  if( ( testingEventVector->end() - itTarget ) < Int_t(cls+1) ) {
1419  itTarget = testingEventVector->end();
1420  testingEventVector->insert( itTarget, itEvent, itEventEnd ); // fill in the rest without mixing
1421  break;
1422  }else{
1423  itTarget += cls+1;
1424  testingEventVector->insert( itTarget, (*itEvent) ); // fill event
1425  }
1426  }
1427  }
1428  }else{
1429  for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
1430  trainingEventVector->insert( trainingEventVector->end(), tmpEventVector[Types::kTraining].at(cls).begin(), tmpEventVector[Types::kTraining].at(cls).end() );
1431  testingEventVector->insert ( testingEventVector->end(), tmpEventVector[Types::kTesting].at(cls).begin(), tmpEventVector[Types::kTesting].at(cls).end() );
1432  }
1433  }
1434  // delete the tmpEventVector (but not the events therein)
1435  tmpEventVector[Types::kTraining].clear();
1436  tmpEventVector[Types::kTesting].clear();
1437 
1438  tmpEventVector[Types::kMaxTreeType].clear();
1439 
1440  if (mixMode == "RANDOM") {
1441  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "shuffling events"<<Endl;
1442 
1443  std::shuffle(trainingEventVector->begin(), trainingEventVector->end(), rndm);
1444  std::shuffle(testingEventVector->begin(), testingEventVector->end(), rndm);
1445  }
1446 
1447  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "trainingEventVector " << trainingEventVector->size() << Endl;
1448  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "testingEventVector " << testingEventVector->size() << Endl;
1449 
1450  // create dataset
1451  DataSet* ds = new DataSet(dsi);
1452 
1453  // Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Create internal training tree" << Endl;
1454  ds->SetEventCollection(trainingEventVector, Types::kTraining );
1455  // Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Create internal testing tree" << Endl;
1456  ds->SetEventCollection(testingEventVector, Types::kTesting );
1457 
1458 
1459  if (ds->GetNTrainingEvents() < 1){
1460  Log() << kFATAL << "Dataset " << std::string(dsi.GetName()) << " does not have any training events, I better stop here and let you fix that one first " << Endl;
1461  }
1462 
1463  if (ds->GetNTestEvents() < 1) {
1464  Log() << kERROR << "Dataset " << std::string(dsi.GetName()) << " does not have any testing events, guess that will cause problems later..but for now, I continue " << Endl;
1465  }
1466 
1467  delete trainingEventVector;
1468  delete testingEventVector;
1469  return ds;
1470 
1471 }
1472 
1473 ////////////////////////////////////////////////////////////////////////////////
1474 /// renormalisation of the TRAINING event weights
1475 /// - none (kind of obvious) .. use the weights as supplied by the
1476 /// user.. (we store however the relative weight for later use)
1477 /// - numEvents
1478 /// - equalNumEvents reweight the training events such that the sum of all
1479 /// backgr. (class > 0) weights equal that of the signal (class 0)
1480 
1481 void
1483  EventVectorOfClassesOfTreeType& tmpEventVector,
1484  const EvtStatsPerClass& eventCounts,
1485  const TString& normMode )
1486 {
1487 
1488 
1489  // print rescaling info
1490  // ---------------------------------
1491  // compute sizes and sums of weights
1492  Int_t trainingSize = 0;
1493  Int_t testingSize = 0;
1494 
1495  ValuePerClass trainingSumWeightsPerClass( dsi.GetNClasses() );
1496  ValuePerClass testingSumWeightsPerClass( dsi.GetNClasses() );
1497 
1498  NumberPerClass trainingSizePerClass( dsi.GetNClasses() );
1499  NumberPerClass testingSizePerClass( dsi.GetNClasses() );
1500 
1501  Double_t trainingSumSignalWeights = 0;
1502  Double_t trainingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1503  Double_t testingSumSignalWeights = 0;
1504  Double_t testingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1505 
1506 
1507 
1508  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1509  trainingSizePerClass.at(cls) = tmpEventVector[Types::kTraining].at(cls).size();
1510  testingSizePerClass.at(cls) = tmpEventVector[Types::kTesting].at(cls).size();
1511 
1512  trainingSize += trainingSizePerClass.back();
1513  testingSize += testingSizePerClass.back();
1514 
1515  // the functional solution
1516  // sum up the weights in Double_t although the individual weights are Float_t to prevent rounding issues in addition of floating points
1517  //
1518  // accumulate --> does what the name says
1519  // begin() and end() denote the range of the vector to be accumulated
1520  // Double_t(0) tells accumulate the type and the starting value
1521  // compose_binary creates a BinaryFunction of ...
1522  // std::plus<Double_t>() knows how to sum up two doubles
1523  // null<Double_t>() leaves the first argument (the running sum) unchanged and returns it
1524  //
1525  // all together sums up all the event-weights of the events in the vector and returns it
1526  trainingSumWeightsPerClass.at(cls) =
1527  std::accumulate(tmpEventVector[Types::kTraining].at(cls).begin(),
1528  tmpEventVector[Types::kTraining].at(cls).end(),
1529  Double_t(0), [](Double_t w, const TMVA::Event *E) { return w + E->GetOriginalWeight(); });
1530 
1531  testingSumWeightsPerClass.at(cls) =
1532  std::accumulate(tmpEventVector[Types::kTesting].at(cls).begin(),
1533  tmpEventVector[Types::kTesting].at(cls).end(),
1534  Double_t(0), [](Double_t w, const TMVA::Event *E) { return w + E->GetOriginalWeight(); });
1535 
1536  if ( cls == dsi.GetSignalClassIndex()){
1537  trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
1538  testingSumSignalWeights += testingSumWeightsPerClass.at(cls);
1539  }else{
1540  trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
1541  testingSumBackgrWeights += testingSumWeightsPerClass.at(cls);
1542  }
1543  }
1544 
1545  // ---------------------------------
1546  // compute renormalization factors
1547 
1548  ValuePerClass renormFactor( dsi.GetNClasses() );
1549 
1550 
1551  // for information purposes
1552  dsi.SetNormalization( normMode );
1553  // !! these will be overwritten later by the 'rescaled' ones if
1554  // NormMode != None !!!
1555  dsi.SetTrainingSumSignalWeights(trainingSumSignalWeights);
1556  dsi.SetTrainingSumBackgrWeights(trainingSumBackgrWeights);
1557  dsi.SetTestingSumSignalWeights(testingSumSignalWeights);
1558  dsi.SetTestingSumBackgrWeights(testingSumBackgrWeights);
1559 
1560 
1561  if (normMode == "NONE") {
1562  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "No weight renormalisation applied: use original global and event weights" << Endl;
1563  return;
1564  }
1565  //changed by Helge 27.5.2013 What on earth was done here before? I still remember the idea behind this which apparently was
1566  //NOT understood by the 'programmer' :) .. the idea was to have SAME amount of effective TRAINING data for signal and background.
1567  // Testing events are totally irrelevant for this and might actually skew the whole normalisation!!
1568  else if (normMode == "NUMEVENTS") {
1569  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1570  << "\tWeight renormalisation mode: \"NumEvents\": renormalises all event classes " << Endl;
1571  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1572  << " such that the effective (weighted) number of events in each class equals the respective " << Endl;
1573  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1574  << " number of events (entries) that you demanded in PrepareTrainingAndTestTree(\"\",\"nTrain_Signal=.. )" << Endl;
1575  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1576  << " ... i.e. such that Sum[i=1..N_j]{w_i} = N_j, j=0,1,2..." << Endl;
1577  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1578  << " ... (note that N_j is the sum of TRAINING events (nTrain_j...with j=Signal,Background.." << Endl;
1579  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1580  << " ..... Testing events are not renormalised nor included in the renormalisation factor! )"<< Endl;
1581 
1582  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1583  // renormFactor.at(cls) = ( (trainingSizePerClass.at(cls) + testingSizePerClass.at(cls))/
1584  // (trainingSumWeightsPerClass.at(cls) + testingSumWeightsPerClass.at(cls)) );
1585  //changed by Helge 27.5.2013
1586  renormFactor.at(cls) = ((Float_t)trainingSizePerClass.at(cls) )/
1587  (trainingSumWeightsPerClass.at(cls)) ;
1588  }
1589  }
1590  else if (normMode == "EQUALNUMEVENTS") {
1591  //changed by Helge 27.5.2013 What on earth was done here before? I still remember the idea behind this which apparently was
1592  //NOT understood by the 'programmer' :) .. the idea was to have SAME amount of effective TRAINING data for signal and background.
1593  //done here was something like having each data source normalized to its number of entries and this even for training+testing together.
1594  // what should this have been good for ???
1595 
1596  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Weight renormalisation mode: \"EqualNumEvents\": renormalises all event classes ..." << Endl;
1597  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " such that the effective (weighted) number of events in each class is the same " << Endl;
1598  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " (and equals the number of events (entries) given for class=0 )" << Endl;
1599  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "... i.e. such that Sum[i=1..N_j]{w_i} = N_classA, j=classA, classB, ..." << Endl;
1600  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "... (note that N_j is the sum of TRAINING events" << Endl;
1601  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " ..... Testing events are not renormalised nor included in the renormalisation factor!)" << Endl;
1602 
1603  // normalize to size of first class
1604  UInt_t referenceClass = 0;
1605  for (UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ) {
1606  renormFactor.at(cls) = Float_t(trainingSizePerClass.at(referenceClass))/
1607  (trainingSumWeightsPerClass.at(cls));
1608  }
1609  }
1610  else {
1611  Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "<PrepareForTrainingAndTesting> Unknown NormMode: " << normMode << Endl;
1612  }
1613 
1614  // ---------------------------------
1615  // now apply the normalization factors
1616  Int_t maxL = dsi.GetClassNameMaxLength();
1617  for (UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls<clsEnd; ++cls) {
1618  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1619  << "--> Rescale " << setiosflags(ios::left) << std::setw(maxL)
1620  << dsi.GetClassInfo(cls)->GetName() << " event weights by factor: " << renormFactor.at(cls) << Endl;
1621  for (EventVector::iterator it = tmpEventVector[Types::kTraining].at(cls).begin(),
1622  itEnd = tmpEventVector[Types::kTraining].at(cls).end(); it != itEnd; ++it){
1623  (*it)->SetWeight ((*it)->GetWeight() * renormFactor.at(cls));
1624  }
1625 
1626  }
1627 
1628 
1629  // print out the result
1630  // (same code as before --> this can be done nicer )
1631  //
1632 
1633  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1634  << "Number of training and testing events" << Endl;
1635  Log() << kDEBUG << "\tafter rescaling:" << Endl;
1636  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1637  << "---------------------------------------------------------------------------" << Endl;
1638 
1639  trainingSumSignalWeights = 0;
1640  trainingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1641  testingSumSignalWeights = 0;
1642  testingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1643 
1644  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1645  trainingSumWeightsPerClass.at(cls) =
1646  std::accumulate(tmpEventVector[Types::kTraining].at(cls).begin(),
1647  tmpEventVector[Types::kTraining].at(cls).end(),
1648  Double_t(0), [](Double_t w, const TMVA::Event *E) { return w + E->GetOriginalWeight(); });
1649 
1650  testingSumWeightsPerClass.at(cls) =
1651  std::accumulate(tmpEventVector[Types::kTesting].at(cls).begin(),
1652  tmpEventVector[Types::kTesting].at(cls).end(),
1653  Double_t(0), [](Double_t w, const TMVA::Event *E) { return w + E->GetOriginalWeight(); });
1654 
1655  if ( cls == dsi.GetSignalClassIndex()){
1656  trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
1657  testingSumSignalWeights += testingSumWeightsPerClass.at(cls);
1658  }else{
1659  trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
1660  testingSumBackgrWeights += testingSumWeightsPerClass.at(cls);
1661  }
1662 
1663  // output statistics
1664 
1665  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1666  << setiosflags(ios::left) << std::setw(maxL)
1667  << dsi.GetClassInfo(cls)->GetName() << " -- "
1668  << "training events : " << trainingSizePerClass.at(cls) << Endl;
1669  Log() << kDEBUG << "\t(sum of weights: " << trainingSumWeightsPerClass.at(cls) << ")"
1670  << " - requested were " << eventCounts[cls].nTrainingEventsRequested << " events" << Endl;
1671  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1672  << setiosflags(ios::left) << std::setw(maxL)
1673  << dsi.GetClassInfo(cls)->GetName() << " -- "
1674  << "testing events : " << testingSizePerClass.at(cls) << Endl;
1675  Log() << kDEBUG << "\t(sum of weights: " << testingSumWeightsPerClass.at(cls) << ")"
1676  << " - requested were " << eventCounts[cls].nTestingEventsRequested << " events" << Endl;
1677  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1678  << setiosflags(ios::left) << std::setw(maxL)
1679  << dsi.GetClassInfo(cls)->GetName() << " -- "
1680  << "training and testing events: "
1681  << (trainingSizePerClass.at(cls)+testingSizePerClass.at(cls)) << Endl;
1682  Log() << kDEBUG << "\t(sum of weights: "
1683  << (trainingSumWeightsPerClass.at(cls)+testingSumWeightsPerClass.at(cls)) << ")" << Endl;
1684  if(eventCounts[cls].nEvAfterCut<eventCounts[cls].nEvBeforeCut) {
1685  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << setiosflags(ios::left) << std::setw(maxL)
1686  << dsi.GetClassInfo(cls)->GetName() << " -- "
1687  << "due to the preselection a scaling factor has been applied to the numbers of requested events: "
1688  << eventCounts[cls].cutScaling() << Endl;
1689  }
1690  }
1691  Log() << kINFO << Endl;
1692 
1693  // for information purposes
1694  dsi.SetTrainingSumSignalWeights(trainingSumSignalWeights);
1695  dsi.SetTrainingSumBackgrWeights(trainingSumBackgrWeights);
1696  dsi.SetTestingSumSignalWeights(testingSumSignalWeights);
1697  dsi.SetTestingSumBackgrWeights(testingSumBackgrWeights);
1698 
1699 
1700 }
TMVA::DataSetInfo::SetTestingSumSignalWeights
void SetTestingSumSignalWeights(Double_t testingSumSignalWeights)
Definition: DataSetInfo.h:138
TMVA::DataSetFactory::ChangeToNewTree
void ChangeToNewTree(TreeInfo &, const DataSetInfo &)
While the data gets copied into the local training and testing trees, the input tree can change (for ...
Definition: DataSetFactory.cxx:288
TMVA::DataSetFactory::CalcMinMax
void CalcMinMax(DataSet *, DataSetInfo &dsi)
compute covariance matrix
Definition: DataSetFactory.cxx:474
TCut
A specialized string object used for TTree selections.
Definition: TCut.h:25
TMVA::DataSetInfo::SetTrainingSumBackgrWeights
void SetTrainingSumBackgrWeights(Double_t trainingSumBackgrWeights)
Definition: DataSetInfo.h:137
TTreeFormula::SetQuickLoad
void SetQuickLoad(Bool_t quick)
Definition: TTreeFormula.h:207
TMVA::DataSetFactory::BuildEventVector
void BuildEventVector(DataSetInfo &dsi, DataInputHandler &dataInput, EventVectorOfClassesOfTreeType &eventsmap, EvtStatsPerClass &eventCounts)
build empty event vectors distributes events between kTraining/kTesting/kMaxTreeType
Definition: DataSetFactory.cxx:723
VariablePCATransform.h
kTRUE
const Bool_t kTRUE
Definition: RtypesCore.h:91
TMVA::DataSetFactory::BuildDynamicDataSet
DataSet * BuildDynamicDataSet(DataSetInfo &)
Definition: DataSetFactory.cxx:149
TMVA::DataSetFactory::EvtStatsPerClass
std::vector< EventStats > EvtStatsPerClass
Definition: DataSetFactory.h:213
TMVA::Configurable
Definition: Configurable.h:45
TMVA::DataSetFactory::EventStats::nWeEvBeforeCut
Float_t nWeEvBeforeCut
Definition: DataSetFactory.h:192
TMVA::DataSetFactory::ValuePerClass
std::vector< Double_t > ValuePerClass
Definition: DataSetFactory.h:181
TMath::Max
Short_t Max(Short_t a, Short_t b)
Definition: TMathBase.h:212
TMVA::Increment
Definition: DataSetFactory.h:85
TMVA::DataSetFactory::EventVectorOfClasses
std::vector< EventVector > EventVectorOfClasses
Definition: DataSetFactory.h:177
TMVA::DataSetFactory::InitOptions
void InitOptions(DataSetInfo &dsi, EvtStatsPerClass &eventsmap, TString &normMode, UInt_t &splitSeed, TString &splitMode, TString &mixMode)
the dataset splitting
Definition: DataSetFactory.cxx:633
TMVA::DataInputHandler::begin
std::vector< TreeInfo >::const_iterator begin(const TString &className) const
Definition: DataInputHandler.h:111
TMVA::DataSetInfo::SetNormalization
void SetNormalization(const TString &norm)
Definition: DataSetInfo.h:132
TMVA::DataInputHandler
Class that contains all the data information.
Definition: DataInputHandler.h:78
TString::Data
const char * Data() const
Definition: TString.h:369
DataSetInfo.h
Form
char * Form(const char *fmt,...)
TMVA::Configurable::AddPreDefVal
void AddPreDefVal(const T &)
Definition: Configurable.h:168
TNamed::GetTitle
virtual const char * GetTitle() const
Returns title of object.
Definition: TNamed.h:48
TMVA::Event::GetClass
UInt_t GetClass() const
Definition: Event.h:86
TBranch.h
Long64_t
long long Long64_t
Definition: RtypesCore.h:73
TMath::Log
Double_t Log(Double_t x)
Definition: TMath.h:760
TMVA::Types::kTesting
@ kTesting
Definition: Types.h:146
TMVA::RandomGenerator
Definition: Tools.h:305
TTree
A TTree represents a columnar dataset.
Definition: TTree.h:79
extract_docstrings.ds
ds
Definition: extract_docstrings.py:40
TMVA::DataSetInfo::GetNVariables
UInt_t GetNVariables() const
Definition: DataSetInfo.h:127
TMVA::DataSetFactory::CalcCovarianceMatrix
TMatrixD * CalcCovarianceMatrix(DataSet *, const UInt_t classNumber)
compute covariance matrix
Definition: DataSetFactory.cxx:574
TMVA::DataSetInfo::GetNSpectators
UInt_t GetNSpectators(bool all=kTRUE) const
Definition: DataSetInfo.cxx:496
Float_t
float Float_t
Definition: RtypesCore.h:57
log
double log(double)
VariableDecorrTransform.h
VariableInfo.h
TMVA::DataSetInfo::HasCuts
Bool_t HasCuts() const
Definition: DataSetInfo.cxx:186
Int_t
int Int_t
Definition: RtypesCore.h:45
TMVA::DataSetFactory::EventStats::varAvLength
Float_t * varAvLength
Definition: DataSetFactory.h:195
VariableIdentityTransform.h
TString::Contains
Bool_t Contains(const char *pat, ECaseCompare cmp=kExact) const
Definition: TString.h:624
TMVA::DataSetFactory::EventStats::nInitialEvents
Int_t nInitialEvents
Definition: DataSetFactory.h:189
TMVA::DataSetInfo::GetSpectatorInfos
std::vector< VariableInfo > & GetSpectatorInfos()
Definition: DataSetInfo.h:122
TMVA::Event::GetTarget
Float_t GetTarget(UInt_t itgt) const
Definition: Event.h:102
TMVA::DataInputHandler::GetClassList
std::vector< TString > * GetClassList() const
Definition: DataInputHandler.cxx:193
TMath::Abs
Short_t Abs(Short_t d)
Definition: TMathBase.h:120
TMVA::DataSetFactory::EventVectorOfClassesOfTreeType
std::map< Types::ETreeType, EventVectorOfClasses > EventVectorOfClassesOfTreeType
Definition: DataSetFactory.h:178
TString::Format
static TString Format(const char *fmt,...)
Static method which formats a string using a printf style format descriptor and return a TString.
Definition: TString.cxx:2311
TTree.h
TMVA::DataSetFactory::BuildInitialDataSet
DataSet * BuildInitialDataSet(DataSetInfo &, TMVA::DataInputHandler &)
if no entries, than create a DataSet with one Event which uses dynamic variables (pointers to variabl...
Definition: DataSetFactory.cxx:197
TString
Basic string class.
Definition: TString.h:136
TMVA::DataSetInfo::GetVarArraySize
Int_t GetVarArraySize(const TString &expression) const
Definition: DataSetInfo.h:108
TMVA::VariableInfo::SetMax
void SetMax(Double_t v)
Definition: VariableInfo.h:72
TMatrixT< Double_t >
v
@ v
Definition: rootcling_impl.cxx:3635
TMVA::DataSetFactory::EventStats
Definition: DataSetFactory.h:184
b
#define b(i)
Definition: RSha256.hxx:100
TTree::GetEntry
virtual Int_t GetEntry(Long64_t entry=0, Int_t getall=0)
Read all branches of entry and return total number of bytes read.
Definition: TTree.cxx:5548
TFile.h
TMVA::DataSetInfo::GetVariableInfo
VariableInfo & GetVariableInfo(Int_t i)
Definition: DataSetInfo.h:105
bool
TMVA::DataSetInfo::PrintCorrelationMatrix
void PrintCorrelationMatrix(const TString &className)
calculates the correlation matrices for signal and background, prints them to standard output,...
Definition: DataSetInfo.cxx:406
TString::ToUpper
void ToUpper()
Change string to upper case.
Definition: TString.cxx:1138
TMVA::DataSetInfo::GetNClasses
UInt_t GetNClasses() const
Definition: DataSetInfo.h:155
TTree::GetTree
virtual TTree * GetTree() const
Definition: TTree.h:512
TMVA::Event::GetValue
Float_t GetValue(UInt_t ivar) const
return value of i'th variable
Definition: Event.cxx:236
TTree::GetCurrentFile
TFile * GetCurrentFile() const
Return pointer to the current file.
Definition: TTree.cxx:5389
TMVA::DataSetFactory::EventStats::nNegWeights
Double_t nNegWeights
Definition: DataSetFactory.h:194
TMVA::LargestCommonDivider
Int_t LargestCommonDivider(Int_t a, Int_t b)
Definition: DataSetFactory.cxx:80
TMVA::DataSetInfo
Class that contains all the data information.
Definition: DataSetInfo.h:62
TMVA::DataSetInfo::IsVariableFromArray
Bool_t IsVariableFromArray(Int_t i) const
Definition: DataSetInfo.h:112
DataInputHandler.h
TTreeFormula::GetNdata
virtual Int_t GetNdata()
Return number of available instances in the formula.
Definition: TTreeFormula.cxx:4444
TMVA::DataSetFactory::NumberPerClass
std::vector< int > NumberPerClass
Definition: DataSetFactory.h:212
MsgLogger.h
TMVA::TreeInfo::GetWeight
Double_t GetWeight() const
Definition: DataInputHandler.h:62
TMVA::TreeInfo
Definition: DataInputHandler.h:52
TMVA::Configurable::CheckForUnusedOptions
void CheckForUnusedOptions() const
checks for unused options in option string
Definition: Configurable.cxx:270
TTreeFormula::GetLeaf
virtual TLeaf * GetLeaf(Int_t n) const
Return leaf corresponding to serial number n.
Definition: TTreeFormula.cxx:4424
TMVA::DataSetFactory::CheckTTreeFormula
Bool_t CheckTTreeFormula(TTreeFormula *ttf, const TString &expression, Bool_t &hasDollar)
checks a TTreeFormula for problems
Definition: DataSetFactory.cxx:246
TMVA::DataSetFactory::~DataSetFactory
~DataSetFactory()
destructor
Definition: DataSetFactory.cxx:107
TLeaf
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition: TLeaf.h:57
TMVA::VariableInfo::SetMin
void SetMin(Double_t v)
Definition: VariableInfo.h:71
TTree::LoadTree
virtual Long64_t LoadTree(Long64_t entry)
Set current entry.
Definition: TTree.cxx:6382
TMath::IsNaN
Bool_t IsNaN(Double_t x)
Definition: TMath.h:892
TMVA::DataSetFactory::CreateDataSet
DataSet * CreateDataSet(DataSetInfo &, DataInputHandler &)
steering the creation of a new dataset
Definition: DataSetFactory.cxx:123
a
auto * a
Definition: textangle.C:12
TMVA::DataSetInfo::GetSpectatorInfo
VariableInfo & GetSpectatorInfo(Int_t i)
Definition: DataSetInfo.h:124
TMVA::DataSetInfo::GetClassInfo
ClassInfo * GetClassInfo(Int_t clNum) const
Definition: DataSetInfo.cxx:146
RooFit::Verbose
RooCmdArg Verbose(Bool_t flag=kTRUE)
Definition: RooGlobalFunc.cxx:189
kFALSE
const Bool_t kFALSE
Definition: RtypesCore.h:92
TMVA::DataSetFactory::RenormEvents
void RenormEvents(DataSetInfo &dsi, EventVectorOfClassesOfTreeType &eventsmap, const EvtStatsPerClass &eventCounts, const TString &normMode)
renormalisation of the TRAINING event weights
Definition: DataSetFactory.cxx:1482
TMVA::DataSetFactory::MixEvents
DataSet * MixEvents(DataSetInfo &dsi, EventVectorOfClassesOfTreeType &eventsmap, EvtStatsPerClass &eventCounts, const TString &splitMode, const TString &mixMode, const TString &normMode, UInt_t splitSeed)
Select and distribute unassigned events to kTraining and kTesting.
Definition: DataSetFactory.cxx:1058
TTreeFormula::EvalInstance
T EvalInstance(Int_t i=0, const char *stringStack[]=0)
Evaluate this treeformula.
Definition: TTreeFormula.cxx:3936
TMVA::DataSet
Class that contains all the data information.
Definition: DataSet.h:58
TMVA::DataSetInfo::SetTrainingSumSignalWeights
void SetTrainingSumSignalWeights(Double_t trainingSumSignalWeights)
Definition: DataSetInfo.h:134
Event.h
TTreeFormula
Used to pass a selection expression to the Tree drawing routine.
Definition: TTreeFormula.h:58
TRandom3.h
what
static const char * what
Definition: stlLoader.cc:6
TTreeFormula::GetNcodes
virtual Int_t GetNcodes() const
Definition: TTreeFormula.h:193
UInt_t
unsigned int UInt_t
Definition: RtypesCore.h:46
sqrt
double sqrt(double)
Types.h
Configurable.h
TMVA::DataSetInfo::GetSplitOptions
const TString & GetSplitOptions() const
Definition: DataSetInfo.h:186
TMVA::TreeInfo::GetTreeType
Types::ETreeType GetTreeType() const
Definition: DataInputHandler.h:64
TMVA::VariableInfo::GetInternalName
const TString & GetInternalName() const
Definition: VariableInfo.h:58
TMVA::Types::kMaxTreeType
@ kMaxTreeType
Definition: Types.h:147
TMVA::Endl
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:158
unsigned int
TMVA::ClassInfo::GetCut
const TCut & GetCut() const
Definition: ClassInfo.h:64
TTree::GetDirectory
TDirectory * GetDirectory() const
Definition: TTree.h:457
TMVA::Types::kTraining
@ kTraining
Definition: Types.h:145
TMVA::DataSetInfo::SetCorrelationMatrix
void SetCorrelationMatrix(const TString &className, TMatrixD *matrix)
Definition: DataSetInfo.cxx:352
TMVA::DataSetInfo::GetNTargets
UInt_t GetNTargets() const
Definition: DataSetInfo.h:128
DataSetFactory.h
TVectorT< Double_t >
TLeaf::GetBranch
TBranch * GetBranch() const
Definition: TLeaf.h:115
TMVA::DataSetFactory::DataSetFactory
DataSetFactory()
constructor
Definition: DataSetFactory.cxx:93
TMVA::Event::GetSpectator
Float_t GetSpectator(UInt_t ivar) const
return spectator content
Definition: Event.cxx:261
Double_t
double Double_t
Definition: RtypesCore.h:59
TMVA::DataSetFactory::CalcCorrelationMatrix
TMatrixD * CalcCorrelationMatrix(DataSet *, const UInt_t classNumber)
computes correlation matrix for variables "theVars" in tree; "theType" defines the required event "ty...
Definition: DataSetFactory.cxx:543
TMVA::MsgLogger
ostringstream derivative to redirect and format output
Definition: MsgLogger.h:59
TMVA::DataSetInfo::GetVariableInfos
std::vector< VariableInfo > & GetVariableInfos()
Definition: DataSetInfo.h:103
TMVA::TreeInfo::GetClassName
const TString & GetClassName() const
Definition: DataInputHandler.h:65
TMVA::DataSetFactory::EventStats::nWeEvAfterCut
Float_t nWeEvAfterCut
Definition: DataSetFactory.h:193
TMVA::Event::GetWeight
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is or not.
Definition: Event.cxx:381
TMatrixD
TMatrixT< Double_t > TMatrixD
Definition: TMatrixDfwd.h:22
TMVA::DataSetInfo::AddClass
ClassInfo * AddClass(const TString &className)
Definition: DataSetInfo.cxx:113
TMVA::ClassInfo::SetNumber
void SetNumber(const UInt_t index)
Definition: ClassInfo.h:59
TMVA::DataSetInfo::GetName
virtual const char * GetName() const
Returns name of object.
Definition: DataSetInfo.h:71
TMVA::Event
Definition: Event.h:51
TMVA::ClassInfo::GetWeight
const TString & GetWeight() const
Definition: ClassInfo.h:63
TMVA::Configurable::SetConfigDescription
void SetConfigDescription(const char *d)
Definition: Configurable.h:64
TDirectory::GetFile
virtual TFile * GetFile() const
Definition: TDirectory.h:165
TMVA::DataSetFactory::EventStats::nEvBeforeCut
Int_t nEvBeforeCut
Definition: DataSetFactory.h:190
TMVA::DataSetInfo::SetTestingSumBackgrWeights
void SetTestingSumBackgrWeights(Double_t testingSumBackgrWeights)
Definition: DataSetInfo.h:139
TVectorF.h
TMVA::DataSetFactory::EventStats::nEvAfterCut
Int_t nEvAfterCut
Definition: DataSetFactory.h:191
d
#define d(i)
Definition: RSha256.hxx:102
TMVA::VariableInfo::GetExpression
const TString & GetExpression() const
Definition: VariableInfo.h:57
TMVA::DataSetInfo::GetSignalClassIndex
UInt_t GetSignalClassIndex()
Definition: DataSetInfo.h:158
TMVA::TreeInfo::GetTree
TTree * GetTree() const
Definition: DataInputHandler.h:61
ROOT::v5::TFormula::GetNdim
virtual Int_t GetNdim() const
Definition: TFormula.h:237
Tools.h
TNamed::GetName
virtual const char * GetName() const
Returns name of object.
Definition: TNamed.h:47
Float_t
TTree::ResetBranchAddresses
virtual void ResetBranchAddresses()
Tell all of our branches to drop their current objects and allocate new ones.
Definition: TTree.cxx:7953
TMVA::DataInputHandler::GetEntries
UInt_t GetEntries(const TString &name) const
Definition: DataInputHandler.h:100
TMVA::Configurable::DeclareOptionRef
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
TObject::ClassName
virtual const char * ClassName() const
Returns name of class to which the object belongs.
Definition: TObject.cxx:130
TTree::SetBranchStatus
virtual void SetBranchStatus(const char *bname, Bool_t status=1, UInt_t *found=0)
Set branch status to Process or DoNotProcess.
Definition: TTree.cxx:8370
TTree::GetEntries
virtual Long64_t GetEntries() const
Definition: TTree.h:458
TMVA::DataInputHandler::end
std::vector< TreeInfo >::const_iterator end(const TString &className) const
Definition: DataInputHandler.h:112
TMath::E
constexpr Double_t E()
Base of natural log:
Definition: TMath.h:96
TMVA::DataSetInfo::GetClassNameMaxLength
Int_t GetClassNameMaxLength() const
Definition: DataSetInfo.cxx:509
DataSet.h
TMatrixF.h
TMath::Finite
Int_t Finite(Double_t x)
Check if it is finite with a mask in order to be consistent in presence of fast math.
Definition: TMath.h:771
TMVA::Configurable::ParseOptions
virtual void ParseOptions()
options parser
Definition: Configurable.cxx:124
TMVA::DataSetInfo::GetTargetInfo
VariableInfo & GetTargetInfo(Int_t i)
Definition: DataSetInfo.h:119
TLeaf::IsOnTerminalBranch
virtual Bool_t IsOnTerminalBranch() const
Definition: TLeaf.h:147
TMVA::Configurable::SetConfigName
void SetConfigName(const char *n)
Definition: Configurable.h:63
TMath.h
TMVA
create variable transformations
Definition: GeneticMinimizer.h:22
int
TMVA::DataSetFactory::EventVector
std::vector< Event * > EventVector
Definition: DataSetFactory.h:176
TEventList.h