ROOT  6.07/01
Reference Guide
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
RuleEnsemble.cxx
Go to the documentation of this file.
1 // @(#)root/tmva $Id$
2 // Author: Andreas Hoecker, Joerg Stelzer, Fredrik Tegenfeldt, Helge Voss
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : RuleEnsemble *
8  * Web : http://tmva.sourceforge.net *
9  * *
10  * Description: *
11  * A class generating an ensemble of rules *
12  * Input: a forest of decision trees *
13  * Output: an ensemble of rules *
14  * *
15  * Authors (alphabetical): *
16  * Fredrik Tegenfeldt <Fredrik.Tegenfeldt@cern.ch> - Iowa State U., USA *
17  * Helge Voss <Helge.Voss@cern.ch> - MPI-KP Heidelberg, GER *
18  * *
19  * Copyright (c) 2005: *
20  * CERN, Switzerland *
21  * Iowa State U. *
22  * MPI-K Heidelberg, Germany *
23  * *
24  * Redistribution and use in source and binary forms, with or without *
25  * modification, are permitted according to the terms listed in LICENSE *
26  * (http://tmva.sourceforge.net/LICENSE) *
27  **********************************************************************************/
28 
29 #include "TMVA/RuleEnsemble.h"
30 
31 #include "TMVA/DataSetInfo.h"
32 #include "TMVA/DecisionTree.h"
33 #include "TMVA/Event.h"
34 #include "TMVA/MethodBase.h"
35 #include "TMVA/MethodRuleFit.h"
36 #include "TMVA/MsgLogger.h"
37 #include "TMVA/RuleFit.h"
38 #include "TMVA/Tools.h"
39 #include "TMVA/Types.h"
40 
41 #include "TRandom3.h"
42 #include "TH1F.h"
43 
44 #include <algorithm>
45 #include <cstdlib>
46 #include <iomanip>
47 #include <list>
48 
49 ////////////////////////////////////////////////////////////////////////////////
50 /// constructor
51 
53  : fLearningModel ( kFull )
54  , fImportanceCut ( 0 )
55  , fLinQuantile ( 0.025 ) // default quantile for killing outliers in linear terms
56  , fOffset ( 0 )
57  , fAverageSupport ( 0.8 )
58  , fAverageRuleSigma( 0.4 ) // default value - used if only linear model is chosen
59  , fRuleFSig ( 0 )
60  , fRuleNCave ( 0 )
61  , fRuleNCsig ( 0 )
62  , fRuleMinDist ( 1e-3 ) // closest allowed 'distance' between two rules
63  , fNRulesGenerated ( 0 )
64  , fEvent ( 0 )
65  , fEventCacheOK ( true )
66  , fRuleMapOK ( true )
67  , fRuleMapInd0 ( 0 )
68  , fRuleMapInd1 ( 0 )
69  , fRuleMapEvents ( 0 )
70  , fLogger( new MsgLogger("RuleFit") )
71 {
72  Initialize( rf );
73 }
74 
75 ////////////////////////////////////////////////////////////////////////////////
76 /// copy constructor
77 
79  : fAverageSupport ( 1 )
80  , fEvent(0)
81  , fRuleMapEvents(0)
82  , fRuleFit(0)
83  , fLogger( new MsgLogger("RuleFit") )
84 {
85  Copy( other );
86 }
87 
88 ////////////////////////////////////////////////////////////////////////////////
89 /// constructor
90 
92  : fLearningModel ( kFull )
93  , fImportanceCut ( 0 )
94  , fLinQuantile ( 0.025 ) // default quantile for killing outliers in linear terms
95  , fOffset ( 0 )
96  , fImportanceRef ( 1.0 )
97  , fAverageSupport ( 0.8 )
98  , fAverageRuleSigma( 0.4 ) // default value - used if only linear model is chosen
99  , fRuleFSig ( 0 )
100  , fRuleNCave ( 0 )
101  , fRuleNCsig ( 0 )
102  , fRuleMinDist ( 1e-3 ) // closest allowed 'distance' between two rules
103  , fNRulesGenerated ( 0 )
104  , fEvent ( 0 )
105  , fEventCacheOK ( true )
106  , fRuleMapOK ( true )
107  , fRuleMapInd0 ( 0 )
108  , fRuleMapInd1 ( 0 )
109  , fRuleMapEvents ( 0 )
110  , fRuleFit ( 0 )
111  , fLogger( new MsgLogger("RuleFit") )
112 {
113 }
114 
115 ////////////////////////////////////////////////////////////////////////////////
116 /// destructor
117 
119 {
120  for ( std::vector<Rule *>::iterator itrRule = fRules.begin(); itrRule != fRules.end(); itrRule++ ) {
121  delete *itrRule;
122  }
123  // NOTE: Should not delete the histos fLinPDFB/S since they are deleted elsewhere
124  delete fLogger;
125 }
126 
127 ////////////////////////////////////////////////////////////////////////////////
128 /// Initializes all member variables with default values
129 
131 {
132  SetAverageRuleSigma(0.4); // default value - used if only linear model is chosen
133  fRuleFit = rf;
134  UInt_t nvars = GetMethodBase()->GetNvar();
135  fVarImportance.clear();
136  fLinPDFB.clear();
137  fLinPDFS.clear();
138  //
139  fVarImportance.resize( nvars,0.0 );
140  fLinPDFB.resize( nvars,0 );
141  fLinPDFS.resize( nvars,0 );
142  fImportanceRef = 1.0;
143  for (UInt_t i=0; i<nvars; i++) { // a priori all linear terms are equally valid
144  fLinTermOK.push_back(kTRUE);
145  }
146 }
147 
148 ////////////////////////////////////////////////////////////////////////////////
149 
151  fLogger->SetMinType(t);
152 }
153 
154 
155 ////////////////////////////////////////////////////////////////////////////////
156 ///
157 /// Get a pointer to the original MethodRuleFit.
158 ///
159 
161 {
162  return ( fRuleFit==0 ? 0:fRuleFit->GetMethodRuleFit());
163 }
164 
165 ////////////////////////////////////////////////////////////////////////////////
166 ///
167 /// Get a pointer to the original MethodRuleFit.
168 ///
169 
171 {
172  return ( fRuleFit==0 ? 0:fRuleFit->GetMethodBase());
173 }
174 
175 ////////////////////////////////////////////////////////////////////////////////
176 /// create model
177 
179 {
180  MakeRules( fRuleFit->GetForest() );
181 
182  MakeLinearTerms();
183 
184  MakeRuleMap();
185 
186  CalcRuleSupport();
187 
188  RuleStatistics();
189 
190  PrintRuleGen();
191 }
192 
193 ////////////////////////////////////////////////////////////////////////////////
194 ///
195 /// Calculates sqrt(Sum(a_i^2)), i=1..N (NOTE do not include a0)
196 ///
197 
199 {
200  Int_t ncoeffs = fRules.size();
201  if (ncoeffs<1) return 0;
202  //
203  Double_t sum2=0;
204  Double_t val;
205  for (Int_t i=0; i<ncoeffs; i++) {
206  val = fRules[i]->GetCoefficient();
207  sum2 += val*val;
208  }
209  return sum2;
210 }
211 
212 ////////////////////////////////////////////////////////////////////////////////
213 /// reset all rule coefficients
214 
216 {
217  fOffset = 0.0;
218  UInt_t nrules = fRules.size();
219  for (UInt_t i=0; i<nrules; i++) {
220  fRules[i]->SetCoefficient(0.0);
221  }
222 }
223 
224 ////////////////////////////////////////////////////////////////////////////////
225 /// set all rule coefficients
226 
227 void TMVA::RuleEnsemble::SetCoefficients( const std::vector< Double_t > & v )
228 {
229  UInt_t nrules = fRules.size();
230  if (v.size()!=nrules) {
231  Log() << kFATAL << "<SetCoefficients> - BUG TRAP - input vector worng size! It is = " << v.size()
232  << " when it should be = " << nrules << Endl;
233  }
234  for (UInt_t i=0; i<nrules; i++) {
235  fRules[i]->SetCoefficient(v[i]);
236  }
237 }
238 
239 ////////////////////////////////////////////////////////////////////////////////
240 /// Retrieve all rule coefficients
241 
242 void TMVA::RuleEnsemble::GetCoefficients( std::vector< Double_t > & v )
243 {
244  UInt_t nrules = fRules.size();
245  v.resize(nrules);
246  if (nrules==0) return;
247  //
248  for (UInt_t i=0; i<nrules; i++) {
249  v[i] = (fRules[i]->GetCoefficient());
250  }
251 }
252 
253 ////////////////////////////////////////////////////////////////////////////////
254 /// get list of training events from the rule fitter
255 
256 const std::vector<const TMVA::Event*>* TMVA::RuleEnsemble::GetTrainingEvents() const
257 {
258  return &(fRuleFit->GetTrainingEvents());
259 }
260 
261 ////////////////////////////////////////////////////////////////////////////////
262 /// get the training event from the rule fitter
263 
265 {
266  return fRuleFit->GetTrainingEvent(i);
267 }
268 
269 ////////////////////////////////////////////////////////////////////////////////
270 /// remove rules that behave similar
271 
273 {
274  Log() << kVERBOSE << "Removing similar rules; distance = " << fRuleMinDist << Endl;
275 
276  UInt_t nrulesIn = fRules.size();
277  TMVA::Rule *first, *second;
278  std::vector< Char_t > removeMe( nrulesIn,false ); // <--- stores boolean
279 
280  Int_t nrem = 0;
281  Int_t remind=-1;
282  Double_t r;
283 
284  for (UInt_t i=0; i<nrulesIn; i++) {
285  if (!removeMe[i]) {
286  first = fRules[i];
287  for (UInt_t k=i+1; k<nrulesIn; k++) {
288  if (!removeMe[k]) {
289  second = fRules[k];
290  Bool_t equal = first->Equal(*second,kTRUE,fRuleMinDist);
291  if (equal) {
292  r = gRandom->Rndm();
293  remind = (r>0.5 ? k:i); // randomly select rule
294  }
295  else {
296  remind = -1;
297  }
298 
299  if (remind>-1) {
300  if (!removeMe[remind]) {
301  removeMe[remind] = true;
302  nrem++;
303  }
304  }
305  }
306  }
307  }
308  }
309  UInt_t ind = 0;
310  Rule *theRule;
311  for (UInt_t i=0; i<nrulesIn; i++) {
312  if (removeMe[i]) {
313  theRule = fRules[ind];
314 #if _MSC_VER >= 1400
315  fRules.erase( std::vector<Rule *>::iterator(&fRules[ind], &fRules) );
316 #else
317  fRules.erase( fRules.begin() + ind );
318 #endif
319  delete theRule;
320  ind--;
321  }
322  ind++;
323  }
324  UInt_t nrulesOut = fRules.size();
325  Log() << kVERBOSE << "Removed " << nrulesIn - nrulesOut << " out of " << nrulesIn << " rules" << Endl;
326 }
327 
328 ////////////////////////////////////////////////////////////////////////////////
329 /// cleanup rules
330 
332 {
333  UInt_t nrules = fRules.size();
334  if (nrules==0) return;
335  Log() << kVERBOSE << "Removing rules with relative importance < " << fImportanceCut << Endl;
336  if (fImportanceCut<=0) return;
337  //
338  // Mark rules to be removed
339  //
340  Rule *therule;
341  Int_t ind=0;
342  for (UInt_t i=0; i<nrules; i++) {
343  if (fRules[ind]->GetRelImportance()<fImportanceCut) {
344  therule = fRules[ind];
345 #if _MSC_VER >= 1400
346  fRules.erase( std::vector<Rule *>::iterator(&fRules[ind], &fRules) );
347 #else
348  fRules.erase( fRules.begin() + ind );
349 #endif
350  delete therule;
351  ind--;
352  }
353  ind++;
354  }
355  Log() << kINFO << "Removed " << nrules-ind << " out of a total of " << nrules
356  << " rules with importance < " << fImportanceCut << Endl;
357 }
358 
359 ////////////////////////////////////////////////////////////////////////////////
360 /// cleanup linear model
361 
363 {
364  UInt_t nlin = fLinNorm.size();
365  if (nlin==0) return;
366  Log() << kVERBOSE << "Removing linear terms with relative importance < " << fImportanceCut << Endl;
367  //
368  fLinTermOK.clear();
369  for (UInt_t i=0; i<nlin; i++) {
370  fLinTermOK.push_back( (fLinImportance[i]/fImportanceRef > fImportanceCut) );
371  }
372 }
373 
374 ////////////////////////////////////////////////////////////////////////////////
375 /// calculate the support for all rules
376 
378 {
379  Log() << kVERBOSE << "Evaluating Rule support" << Endl;
380  Double_t s,t,stot,ttot,ssb;
381  Double_t ssig, sbkg, ssum;
382  Int_t indrule=0;
383  stot = 0;
384  ttot = 0;
385  // reset to default values
386  SetAverageRuleSigma(0.4);
387  const std::vector<const Event *> *events = GetTrainingEvents();
388  Double_t nrules = static_cast<Double_t>(fRules.size());
389  Double_t ew;
390  //
391  if ((nrules>0) && (events->size()>0)) {
392  for ( std::vector< Rule * >::iterator itrRule=fRules.begin(); itrRule!=fRules.end(); itrRule++ ) {
393  s=0.0;
394  ssig=0.0;
395  sbkg=0.0;
396  for ( std::vector<const Event * >::const_iterator itrEvent=events->begin(); itrEvent!=events->end(); itrEvent++ ) {
397  if ((*itrRule)->EvalEvent( *(*itrEvent) )) {
398  ew = (*itrEvent)->GetWeight();
399  s += ew;
400  if (GetMethodRuleFit()->DataInfo().IsSignal(*itrEvent)) ssig += ew;
401  else sbkg += ew;
402  }
403  }
404  //
405  s = s/fRuleFit->GetNEveEff();
406  t = s*(1.0-s);
407  t = (t<0 ? 0:sqrt(t));
408  stot += s;
409  ttot += t;
410  ssum = ssig+sbkg;
411  ssb = (ssum>0 ? Double_t(ssig)/Double_t(ssig+sbkg) : 0.0 );
412  (*itrRule)->SetSupport(s);
413  (*itrRule)->SetNorm(t);
414  (*itrRule)->SetSSB( ssb );
415  (*itrRule)->SetSSBNeve(Double_t(ssig+sbkg));
416  indrule++;
417  }
418  fAverageSupport = stot/nrules;
419  fAverageRuleSigma = TMath::Sqrt(fAverageSupport*(1.0-fAverageSupport));
420  Log() << kVERBOSE << "Standard deviation of support = " << fAverageRuleSigma << Endl;
421  Log() << kVERBOSE << "Average rule support = " << fAverageSupport << Endl;
422  }
423 }
424 
425 ////////////////////////////////////////////////////////////////////////////////
426 /// calculate the importance of each rule
427 
429 {
430  Double_t maxRuleImp = CalcRuleImportance();
431  Double_t maxLinImp = CalcLinImportance();
432  Double_t maxImp = (maxRuleImp>maxLinImp ? maxRuleImp : maxLinImp);
433  SetImportanceRef( maxImp );
434 }
435 
436 ////////////////////////////////////////////////////////////////////////////////
437 /// set reference importance
438 
440 {
441  for ( UInt_t i=0; i<fRules.size(); i++ ) {
442  fRules[i]->SetImportanceRef(impref);
443  }
444  fImportanceRef = impref;
445 }
446 ////////////////////////////////////////////////////////////////////////////////
447 /// calculate importance of each rule
448 
450 {
451  Double_t maxImp=-1.0;
452  Double_t imp;
453  Int_t nrules = fRules.size();
454  for ( int i=0; i<nrules; i++ ) {
455  fRules[i]->CalcImportance();
456  imp = fRules[i]->GetImportance();
457  if (imp>maxImp) maxImp = imp;
458  }
459  for ( Int_t i=0; i<nrules; i++ ) {
460  fRules[i]->SetImportanceRef(maxImp);
461  }
462 
463  return maxImp;
464 }
465 
466 ////////////////////////////////////////////////////////////////////////////////
467 /// calculate the linear importance for each rule
468 
470 {
471  Double_t maxImp=-1.0;
472  UInt_t nvars = fLinCoefficients.size();
473  fLinImportance.resize(nvars,0.0);
474  if (!DoLinear()) return maxImp;
475  //
476  // The linear importance is:
477  // I = |b_x|*sigma(x)
478  // Note that the coefficients in fLinCoefficients are for the normalized x
479  // => b'_x * x' = b'_x * sigma(r)*x/sigma(x)
480  // => b_x = b'_x*sigma(r)/sigma(x)
481  // => I = |b'_x|*sigma(r)
482  //
483  Double_t imp;
484  for ( UInt_t i=0; i<nvars; i++ ) {
485  imp = fAverageRuleSigma*TMath::Abs(fLinCoefficients[i]);
486  fLinImportance[i] = imp;
487  if (imp>maxImp) maxImp = imp;
488  }
489  return maxImp;
490 }
491 
492 ////////////////////////////////////////////////////////////////////////////////
493 ///
494 /// Calculates variable importance using eq (35) in RuleFit paper by Friedman et.al
495 ///
496 
498 {
499  Log() << kVERBOSE << "Compute variable importance" << Endl;
500  Double_t rimp;
501  UInt_t nrules = fRules.size();
502  if (GetMethodBase()==0) Log() << kFATAL << "RuleEnsemble::CalcVarImportance() - should not be here!" << Endl;
503  UInt_t nvars = GetMethodBase()->GetNvar();
504  UInt_t nvarsUsed;
505  Double_t rimpN;
506  fVarImportance.resize(nvars,0);
507  // rules
508  if (DoRules()) {
509  for ( UInt_t ind=0; ind<nrules; ind++ ) {
510  rimp = fRules[ind]->GetImportance();
511  nvarsUsed = fRules[ind]->GetNumVarsUsed();
512  if (nvarsUsed<1)
513  Log() << kFATAL << "<CalcVarImportance> Variables for importance calc!!!??? A BUG!" << Endl;
514  rimpN = (nvarsUsed > 0 ? rimp/nvarsUsed:0.0);
515  for ( UInt_t iv=0; iv<nvars; iv++ ) {
516  if (fRules[ind]->ContainsVariable(iv)) {
517  fVarImportance[iv] += rimpN;
518  }
519  }
520  }
521  }
522  // linear terms
523  if (DoLinear()) {
524  for ( UInt_t iv=0; iv<fLinTermOK.size(); iv++ ) {
525  if (fLinTermOK[iv]) fVarImportance[iv] += fLinImportance[iv];
526  }
527  }
528  //
529  // Make variable importance relative the strongest variable
530  //
531  Double_t maximp = 0.0;
532  for ( UInt_t iv=0; iv<nvars; iv++ ) {
533  if ( fVarImportance[iv] > maximp ) maximp = fVarImportance[iv];
534  }
535  if (maximp>0) {
536  for ( UInt_t iv=0; iv<nvars; iv++ ) {
537  fVarImportance[iv] *= 1.0/maximp;
538  }
539  }
540 }
541 
542 ////////////////////////////////////////////////////////////////////////////////
543 /// set rules
544 ///
545 /// first clear all
546 
547 void TMVA::RuleEnsemble::SetRules( const std::vector<Rule *> & rules )
548 {
549  DeleteRules();
550  //
551  fRules.resize(rules.size());
552  for (UInt_t i=0; i<fRules.size(); i++) {
553  fRules[i] = rules[i];
554  }
555  fEventCacheOK = kFALSE;
556 }
557 
558 ////////////////////////////////////////////////////////////////////////////////
559 ///
560 /// Makes rules from the given decision tree.
561 /// First node in all rules is ALWAYS the root node.
562 ///
563 
564 void TMVA::RuleEnsemble::MakeRules( const std::vector< const DecisionTree *> & forest )
565 {
566  fRules.clear();
567  if (!DoRules()) return;
568  //
569  Int_t nrulesCheck=0;
570  Int_t nrules;
571  Int_t nendn;
572  Double_t sumnendn=0;
573  Double_t sumn2=0;
574  //
575  // UInt_t prevs;
576  UInt_t ntrees = forest.size();
577  for ( UInt_t ind=0; ind<ntrees; ind++ ) {
578  // prevs = fRules.size();
579  MakeRulesFromTree( forest[ind] );
580  nrules = CalcNRules( forest[ind] );
581  nendn = (nrules/2) + 1;
582  sumnendn += nendn;
583  sumn2 += nendn*nendn;
584  nrulesCheck += nrules;
585  }
586  Double_t nmean = (ntrees>0) ? sumnendn/ntrees : 0;
587  Double_t nsigm = TMath::Sqrt( gTools().ComputeVariance(sumn2,sumnendn,ntrees) );
588  Double_t ndev = 2.0*(nmean-2.0-nsigm)/(nmean-2.0+nsigm);
589  //
590  Log() << kVERBOSE << "Average number of end nodes per tree = " << nmean << Endl;
591  if (ntrees>1) Log() << kVERBOSE << "sigma of ditto ( ~= mean-2 ?) = "
592  << nsigm
593  << Endl;
594  Log() << kVERBOSE << "Deviation from exponential model = " << ndev << Endl;
595  Log() << kVERBOSE << "Corresponds to L (eq. 13, RuleFit ppr) = " << nmean << Endl;
596  // a BUG trap
597  if (nrulesCheck != static_cast<Int_t>(fRules.size())) {
598  Log() << kFATAL
599  << "BUG! number of generated and possible rules do not match! N(rules) = " << fRules.size()
600  << " != " << nrulesCheck << Endl;
601  }
602  Log() << kVERBOSE << "Number of generated rules: " << fRules.size() << Endl;
603 
604  // save initial number of rules
605  fNRulesGenerated = fRules.size();
606 
607  RemoveSimilarRules();
608 
609  ResetCoefficients();
610 
611 }
612 
613 ////////////////////////////////////////////////////////////////////////////////
614 ///
615 /// Make the linear terms as in eq 25, ref 2
616 /// For this the b and (1-b) quantiles are needed
617 ///
618 
620 {
621  if (!DoLinear()) return;
622 
623  const std::vector<const Event *> *events = GetTrainingEvents();
624  UInt_t neve = events->size();
625  UInt_t nvars = ((*events)[0])->GetNVariables(); // Event -> GetNVariables();
626  Double_t val,ew;
627  typedef std::pair< Double_t, Int_t> dataType;
628  typedef std::pair< Double_t, dataType > dataPoint;
629 
630  std::vector< std::vector<dataPoint> > vardata(nvars);
631  std::vector< Double_t > varsum(nvars,0.0);
632  std::vector< Double_t > varsum2(nvars,0.0);
633  // first find stats of all variables
634  // vardata[v][i].first -> value of var <v> in event <i>
635  // vardata[v][i].second.first -> the event weight
636  // vardata[v][i].second.second -> the event type
637  for (UInt_t i=0; i<neve; i++) {
638  ew = ((*events)[i])->GetWeight();
639  for (UInt_t v=0; v<nvars; v++) {
640  val = ((*events)[i])->GetValue(v);
641  vardata[v].push_back( dataPoint( val, dataType(ew,((*events)[i])->GetClass()) ) );
642  }
643  }
644  //
645  fLinDP.clear();
646  fLinDM.clear();
647  fLinCoefficients.clear();
648  fLinNorm.clear();
649  fLinDP.resize(nvars,0);
650  fLinDM.resize(nvars,0);
651  fLinCoefficients.resize(nvars,0);
652  fLinNorm.resize(nvars,0);
653 
654  Double_t averageWeight = neve ? fRuleFit->GetNEveEff()/static_cast<Double_t>(neve) : 0;
655  // sort and find limits
656  Double_t stdl;
657 
658  // find normalisation given in ref 2 after eq 26
659  Double_t lx;
660  Double_t nquant;
661  Double_t neff;
662  UInt_t indquantM;
663  UInt_t indquantP;
664 
665  for (UInt_t v=0; v<nvars; v++) {
666  varsum[v] = 0;
667  varsum2[v] = 0;
668  //
669  std::sort( vardata[v].begin(),vardata[v].end() );
670  nquant = fLinQuantile*fRuleFit->GetNEveEff(); // quantile = 0.025
671  neff=0;
672  UInt_t ie=0;
673  // first scan for lower quantile (including weights)
674  while ( (ie<neve) && (neff<nquant) ) {
675  neff += vardata[v][ie].second.first;
676  ie++;
677  }
678  indquantM = (ie==0 ? 0:ie-1);
679  // now for upper quantile
680  ie = neve;
681  neff=0;
682  while ( (ie>0) && (neff<nquant) ) {
683  ie--;
684  neff += vardata[v][ie].second.first;
685  }
686  indquantP = (ie==neve ? ie=neve-1:ie);
687  //
688  fLinDM[v] = vardata[v][indquantM].first; // delta-
689  fLinDP[v] = vardata[v][indquantP].first; // delta+
690  if (fLinPDFB[v]) delete fLinPDFB[v];
691  if (fLinPDFS[v]) delete fLinPDFS[v];
692  fLinPDFB[v] = new TH1F(Form("bkgvar%d",v),"bkg temphist",40,fLinDM[v],fLinDP[v]);
693  fLinPDFS[v] = new TH1F(Form("sigvar%d",v),"sig temphist",40,fLinDM[v],fLinDP[v]);
694  fLinPDFB[v]->Sumw2();
695  fLinPDFS[v]->Sumw2();
696  //
697  Int_t type;
698  const Double_t w = 1.0/fRuleFit->GetNEveEff();
699  for (ie=0; ie<neve; ie++) {
700  val = vardata[v][ie].first;
701  ew = vardata[v][ie].second.first;
702  type = vardata[v][ie].second.second;
703  lx = TMath::Min( fLinDP[v], TMath::Max( fLinDM[v], val ) );
704  varsum[v] += ew*lx;
705  varsum2[v] += ew*lx*lx;
706  if (type==1) fLinPDFS[v]->Fill(lx,w*ew);
707  else fLinPDFB[v]->Fill(lx,w*ew);
708  }
709  //
710  // Get normalization.
711  //
712  stdl = TMath::Sqrt( (varsum2[v] - (varsum[v]*varsum[v]/fRuleFit->GetNEveEff()))/(fRuleFit->GetNEveEff()-averageWeight) );
713  fLinNorm[v] = CalcLinNorm(stdl);
714  }
715  // Save PDFs - for debugging purpose
716  for (UInt_t v=0; v<nvars; v++) {
717  fLinPDFS[v]->Write();
718  fLinPDFB[v]->Write();
719  }
720 }
721 
722 
723 ////////////////////////////////////////////////////////////////////////////////
724 ///
725 /// This function returns Pr( y = 1 | x ) for the linear terms.
726 ///
727 
729 {
730  UInt_t nvars=fLinDP.size();
731 
732  Double_t fstot=0;
733  Double_t fbtot=0;
734  nsig = 0;
735  ntot = nvars;
736  for (UInt_t v=0; v<nvars; v++) {
737  Double_t val = fEventLinearVal[v];
738  Int_t bin = fLinPDFS[v]->FindBin(val);
739  fstot += fLinPDFS[v]->GetBinContent(bin);
740  fbtot += fLinPDFB[v]->GetBinContent(bin);
741  }
742  if (nvars<1) return 0;
743  ntot = (fstot+fbtot)/Double_t(nvars);
744  nsig = (fstot)/Double_t(nvars);
745  return fstot/(fstot+fbtot);
746 }
747 
748 ////////////////////////////////////////////////////////////////////////////////
749 ///
750 /// This function returns Pr( y = 1 | x ) for rules.
751 /// The probability returned is normalized against the number of rules which are actually passed
752 ///
753 
755 {
756  Double_t sump = 0;
757  Double_t sumok = 0;
758  Double_t sumz = 0;
759  Double_t ssb;
760  Double_t neve;
761  //
762  UInt_t nrules = fRules.size();
763  for (UInt_t ir=0; ir<nrules; ir++) {
764  if (fEventRuleVal[ir]>0) {
765  ssb = fEventRuleVal[ir]*GetRulesConst(ir)->GetSSB(); // S/(S+B) is evaluated in CalcRuleSupport() using ALL training events
766  neve = GetRulesConst(ir)->GetSSBNeve(); // number of events accepted by the rule
767  sump += ssb*neve; // number of signal events
768  sumok += neve; // total number of events passed
769  }
770  else sumz += 1.0; // all events
771  }
772 
773  nsig = sump;
774  ntot = sumok;
775  //
776  if (ntot>0) return nsig/ntot;
777  return 0.0;
778 }
779 
780 ////////////////////////////////////////////////////////////////////////////////
781 ///
782 /// We want to estimate F* = argmin Eyx( L(y,F(x) ), min wrt F(x)
783 /// F(x) = FL(x) + FR(x) , linear and rule part
784 ///
785 ///
786 
788 {
789  SetEvent(e);
790  UpdateEventVal();
791  return FStar();
792 }
793 
794 ////////////////////////////////////////////////////////////////////////////////
795 ///
796 /// We want to estimate F* = argmin Eyx( L(y,F(x) ), min wrt F(x)
797 /// F(x) = FL(x) + FR(x) , linear and rule part
798 ///
799 ///
800 
802 {
803  Double_t p=0;
804  Double_t nrs=0, nrt=0;
805  Double_t nls=0, nlt=0;
806  Double_t nt;
807  Double_t pr=0;
808  Double_t pl=0;
809 
810  // first calculate Pr(y=1|X) for rules and linear terms
811  if (DoLinear()) pl = PdfLinear(nls, nlt);
812  if (DoRules()) pr = PdfRule(nrs, nrt);
813  // nr(l)t=0 or 1
814  if ((nlt>0) && (nrt>0)) nt=2.0;
815  else nt=1.0;
816  p = (pl+pr)/nt;
817  return 2.0*p-1.0;
818 }
819 
820 ////////////////////////////////////////////////////////////////////////////////
821 /// calculate various statistics for this rule
822 
824 {
825  // TODO: NOT YET UPDATED FOR WEIGHTS
826  const std::vector<const Event *> *events = GetTrainingEvents();
827  const UInt_t neve = events->size();
828  const UInt_t nvars = GetMethodBase()->GetNvar();
829  const UInt_t nrules = fRules.size();
830  const Event *eveData;
831  // Flags
832  Bool_t sigRule;
833  Bool_t sigTag;
834  Bool_t bkgTag;
835  // Bool_t noTag;
836  Bool_t sigTrue;
837  Bool_t tagged;
838  // Counters
839  Int_t nsig=0;
840  Int_t nbkg=0;
841  Int_t ntag=0;
842  Int_t nss=0;
843  Int_t nsb=0;
844  Int_t nbb=0;
845  Int_t nbs=0;
846  std::vector<Int_t> varcnt;
847  // Clear vectors
848  fRulePSS.clear();
849  fRulePSB.clear();
850  fRulePBS.clear();
851  fRulePBB.clear();
852  fRulePTag.clear();
853  //
854  varcnt.resize(nvars,0);
855  fRuleVarFrac.clear();
856  fRuleVarFrac.resize(nvars,0);
857  //
858  for ( UInt_t i=0; i<nrules; i++ ) {
859  for ( UInt_t v=0; v<nvars; v++) {
860  if (fRules[i]->ContainsVariable(v)) varcnt[v]++; // count how often a variable occurs
861  }
862  sigRule = fRules[i]->IsSignalRule();
863  if (sigRule) { // rule is a signal rule (ie s/(s+b)>0.5)
864  nsig++;
865  }
866  else {
867  nbkg++;
868  }
869  // reset counters
870  nss=0;
871  nsb=0;
872  nbs=0;
873  nbb=0;
874  ntag=0;
875  // loop over all events
876  for (UInt_t e=0; e<neve; e++) {
877  eveData = (*events)[e];
878  tagged = fRules[i]->EvalEvent(*eveData);
879  sigTag = (tagged && sigRule); // it's tagged as a signal
880  bkgTag = (tagged && (!sigRule)); // ... as bkg
881  // noTag = !(sigTag || bkgTag); // ... not tagged
882  sigTrue = (eveData->GetClass() == 0); // true if event is true signal
883  if (tagged) {
884  ntag++;
885  if (sigTag && sigTrue) nss++;
886  if (sigTag && !sigTrue) nsb++;
887  if (bkgTag && sigTrue) nbs++;
888  if (bkgTag && !sigTrue) nbb++;
889  }
890  }
891  // Fill tagging probabilities
892  if (ntag>0 && neve > 0) { // should always be the case, but let's make sure and keep coverity quiet
893  fRulePTag.push_back(Double_t(ntag)/Double_t(neve));
894  fRulePSS.push_back(Double_t(nss)/Double_t(ntag));
895  fRulePSB.push_back(Double_t(nsb)/Double_t(ntag));
896  fRulePBS.push_back(Double_t(nbs)/Double_t(ntag));
897  fRulePBB.push_back(Double_t(nbb)/Double_t(ntag));
898  }
899  //
900  }
901  fRuleFSig = (nsig>0) ? static_cast<Double_t>(nsig)/static_cast<Double_t>(nsig+nbkg) : 0;
902  for ( UInt_t v=0; v<nvars; v++) {
903  fRuleVarFrac[v] = (nrules>0) ? Double_t(varcnt[v])/Double_t(nrules) : 0;
904  }
905 }
906 
907 ////////////////////////////////////////////////////////////////////////////////
908 /// calculate various statistics for this rule
909 
911 {
912  const UInt_t nrules = fRules.size();
913  Double_t nc;
914  Double_t sumNc =0;
915  Double_t sumNc2=0;
916  for ( UInt_t i=0; i<nrules; i++ ) {
917  nc = static_cast<Double_t>(fRules[i]->GetNcuts());
918  sumNc += nc;
919  sumNc2 += nc*nc;
920  }
921  fRuleNCave = 0.0;
922  fRuleNCsig = 0.0;
923  if (nrules>0) {
924  fRuleNCave = sumNc/nrules;
925  fRuleNCsig = TMath::Sqrt(gTools().ComputeVariance(sumNc2,sumNc,nrules));
926  }
927 }
928 
929 ////////////////////////////////////////////////////////////////////////////////
930 /// print rule generation info
931 
933 {
934  Log() << kINFO << "-------------------RULE ENSEMBLE SUMMARY------------------------" << Endl;
935  const MethodRuleFit *mrf = GetMethodRuleFit();
936  if (mrf) Log() << kINFO << "Tree training method : " << (mrf->UseBoost() ? "AdaBoost":"Random") << Endl;
937  Log() << kINFO << "Number of events per tree : " << fRuleFit->GetNTreeSample() << Endl;
938  Log() << kINFO << "Number of trees : " << fRuleFit->GetForest().size() << Endl;
939  Log() << kINFO << "Number of generated rules : " << fNRulesGenerated << Endl;
940  Log() << kINFO << "Idem, after cleanup : " << fRules.size() << Endl;
941  Log() << kINFO << "Average number of cuts per rule : " << Form("%8.2f",fRuleNCave) << Endl;
942  Log() << kINFO << "Spread in number of cuts per rules : " << Form("%8.2f",fRuleNCsig) << Endl;
943  Log() << kVERBOSE << "Complexity : " << Form("%8.2f",fRuleNCave*fRuleNCsig) << Endl;
944  Log() << kINFO << "----------------------------------------------------------------" << Endl;
945  Log() << kINFO << Endl;
946 }
947 
948 ////////////////////////////////////////////////////////////////////////////////
949 /// print function
950 
952 {
953  const EMsgType kmtype=kINFO;
954  const Bool_t isDebug = (fLogger->GetMinType()<=kDEBUG);
955  //
956  Log() << kmtype << Endl;
957  Log() << kmtype << "================================================================" << Endl;
958  Log() << kmtype << " M o d e l " << Endl;
959  Log() << kmtype << "================================================================" << Endl;
960 
961  Int_t ind;
962  const UInt_t nvars = GetMethodBase()->GetNvar();
963  const Int_t nrules = fRules.size();
964  const Int_t printN = TMath::Min(10,nrules); //nrules+1;
965  Int_t maxL = 0;
966  for (UInt_t iv = 0; iv<fVarImportance.size(); iv++) {
967  if (GetMethodBase()->GetInputLabel(iv).Length() > maxL) maxL = GetMethodBase()->GetInputLabel(iv).Length();
968  }
969  //
970  if (isDebug) {
971  Log() << kDEBUG << "Variable importance:" << Endl;
972  for (UInt_t iv = 0; iv<fVarImportance.size(); iv++) {
973  Log() << kDEBUG << std::setw(maxL) << GetMethodBase()->GetInputLabel(iv)
974  << std::resetiosflags(std::ios::right)
975  << " : " << Form(" %3.3f",fVarImportance[iv]) << Endl;
976  }
977  }
978  //
979  Log() << kmtype << "Offset (a0) = " << fOffset << Endl;
980  //
981  if (DoLinear()) {
982  if (fLinNorm.size() > 0) {
983  Log() << kmtype << "------------------------------------" << Endl;
984  Log() << kmtype << "Linear model (weights unnormalised)" << Endl;
985  Log() << kmtype << "------------------------------------" << Endl;
986  Log() << kmtype << std::setw(maxL) << "Variable"
987  << std::resetiosflags(std::ios::right) << " : "
988  << std::setw(11) << " Weights"
989  << std::resetiosflags(std::ios::right) << " : "
990  << "Importance"
991  << std::resetiosflags(std::ios::right)
992  << Endl;
993  Log() << kmtype << "------------------------------------" << Endl;
994  for ( UInt_t i=0; i<fLinNorm.size(); i++ ) {
995  Log() << kmtype << std::setw(std::max(maxL,8)) << GetMethodBase()->GetInputLabel(i);
996  if (fLinTermOK[i]) {
997  Log() << kmtype
998  << std::resetiosflags(std::ios::right)
999  << " : " << Form(" %10.3e",fLinCoefficients[i]*fLinNorm[i])
1000  << " : " << Form(" %3.3f",fLinImportance[i]/fImportanceRef) << Endl;
1001  }
1002  else {
1003  Log() << kmtype << "-> importance below threshhold = "
1004  << Form(" %3.3f",fLinImportance[i]/fImportanceRef) << Endl;
1005  }
1006  }
1007  Log() << kmtype << "------------------------------------" << Endl;
1008  }
1009  }
1010  else Log() << kmtype << "Linear terms were disabled" << Endl;
1011 
1012  if ((!DoRules()) || (nrules==0)) {
1013  if (!DoRules()) {
1014  Log() << kmtype << "Rule terms were disabled" << Endl;
1015  }
1016  else {
1017  Log() << kmtype << "Eventhough rules were included in the model, none passed! " << nrules << Endl;
1018  }
1019  }
1020  else {
1021  Log() << kmtype << "Number of rules = " << nrules << Endl;
1022  if (isDebug) {
1023  Log() << kmtype << "N(cuts) in rules, average = " << fRuleNCave << Endl;
1024  Log() << kmtype << " RMS = " << fRuleNCsig << Endl;
1025  Log() << kmtype << "Fraction of signal rules = " << fRuleFSig << Endl;
1026  Log() << kmtype << "Fraction of rules containing a variable (%):" << Endl;
1027  for ( UInt_t v=0; v<nvars; v++) {
1028  Log() << kmtype << " " << std::setw(maxL) << GetMethodBase()->GetInputLabel(v);
1029  Log() << kmtype << Form(" = %2.2f",fRuleVarFrac[v]*100.0) << " %" << Endl;
1030  }
1031  }
1032  //
1033  // Print out all rules sorted in importance
1034  //
1035  std::list< std::pair<double,int> > sortedImp;
1036  for (Int_t i=0; i<nrules; i++) {
1037  sortedImp.push_back( std::pair<double,int>( fRules[i]->GetImportance(),i ) );
1038  }
1039  sortedImp.sort();
1040  //
1041  Log() << kmtype << "Printing the first " << printN << " rules, ordered in importance." << Endl;
1042  int pind=0;
1043  for ( std::list< std::pair<double,int> >::reverse_iterator itpair = sortedImp.rbegin();
1044  itpair != sortedImp.rend(); itpair++ ) {
1045  ind = itpair->second;
1046  // if (pind==0) impref =
1047  // Log() << kmtype << "Rule #" <<
1048  // Log() << kmtype << *fRules[ind] << Endl;
1049  fRules[ind]->PrintLogger(Form("Rule %4d : ",pind+1));
1050  pind++;
1051  if (pind==printN) {
1052  if (nrules==printN) {
1053  Log() << kmtype << "All rules printed" << Endl;
1054  }
1055  else {
1056  Log() << kmtype << "Skipping the next " << nrules-printN << " rules" << Endl;
1057  }
1058  break;
1059  }
1060  }
1061  }
1062  Log() << kmtype << "================================================================" << Endl;
1063  Log() << kmtype << Endl;
1064 }
1065 
1066 ////////////////////////////////////////////////////////////////////////////////
1067 /// write rules to stream
1068 
1069 void TMVA::RuleEnsemble::PrintRaw( std::ostream & os ) const
1070 {
1071  Int_t dp = os.precision();
1072  UInt_t nrules = fRules.size();
1073  // std::sort(fRules.begin(),fRules.end());
1074  //
1075  os << "ImportanceCut= " << fImportanceCut << std::endl;
1076  os << "LinQuantile= " << fLinQuantile << std::endl;
1077  os << "AverageSupport= " << fAverageSupport << std::endl;
1078  os << "AverageRuleSigma= " << fAverageRuleSigma << std::endl;
1079  os << "Offset= " << fOffset << std::endl;
1080  os << "NRules= " << nrules << std::endl;
1081  for (UInt_t i=0; i<nrules; i++){
1082  os << "***Rule " << i << std::endl;
1083  (fRules[i])->PrintRaw(os);
1084  }
1085  UInt_t nlinear = fLinNorm.size();
1086  //
1087  os << "NLinear= " << fLinTermOK.size() << std::endl;
1088  for (UInt_t i=0; i<nlinear; i++) {
1089  os << "***Linear " << i << std::endl;
1090  os << std::setprecision(10) << (fLinTermOK[i] ? 1:0) << " "
1091  << fLinCoefficients[i] << " "
1092  << fLinNorm[i] << " "
1093  << fLinDM[i] << " "
1094  << fLinDP[i] << " "
1095  << fLinImportance[i] << " " << std::endl;
1096  }
1097  os << std::setprecision(dp);
1098 }
1099 
1100 ////////////////////////////////////////////////////////////////////////////////
1101 /// write rules to XML
1102 
1103 void* TMVA::RuleEnsemble::AddXMLTo(void* parent) const
1104 {
1105  void* re = gTools().AddChild( parent, "Weights" ); // this is the "RuleEnsemble"
1106 
1107  UInt_t nrules = fRules.size();
1108  UInt_t nlinear = fLinNorm.size();
1109  gTools().AddAttr( re, "NRules", nrules );
1110  gTools().AddAttr( re, "NLinear", nlinear );
1111  gTools().AddAttr( re, "LearningModel", (int)fLearningModel );
1112  gTools().AddAttr( re, "ImportanceCut", fImportanceCut );
1113  gTools().AddAttr( re, "LinQuantile", fLinQuantile );
1114  gTools().AddAttr( re, "AverageSupport", fAverageSupport );
1115  gTools().AddAttr( re, "AverageRuleSigma", fAverageRuleSigma );
1116  gTools().AddAttr( re, "Offset", fOffset );
1117  for (UInt_t i=0; i<nrules; i++) fRules[i]->AddXMLTo(re);
1118 
1119  for (UInt_t i=0; i<nlinear; i++) {
1120  void* lin = gTools().AddChild( re, "Linear" );
1121  gTools().AddAttr( lin, "OK", (fLinTermOK[i] ? 1:0) );
1122  gTools().AddAttr( lin, "Coeff", fLinCoefficients[i] );
1123  gTools().AddAttr( lin, "Norm", fLinNorm[i] );
1124  gTools().AddAttr( lin, "DM", fLinDM[i] );
1125  gTools().AddAttr( lin, "DP", fLinDP[i] );
1126  gTools().AddAttr( lin, "Importance", fLinImportance[i] );
1127  }
1128  return re;
1129 }
1130 
1131 ////////////////////////////////////////////////////////////////////////////////
1132 /// read rules from XML
1133 
1134 void TMVA::RuleEnsemble::ReadFromXML( void* wghtnode )
1135 {
1136  UInt_t nrules, nlinear;
1137  gTools().ReadAttr( wghtnode, "NRules", nrules );
1138  gTools().ReadAttr( wghtnode, "NLinear", nlinear );
1139  Int_t iLearningModel;
1140  gTools().ReadAttr( wghtnode, "LearningModel", iLearningModel );
1141  fLearningModel = (ELearningModel) iLearningModel;
1142  gTools().ReadAttr( wghtnode, "ImportanceCut", fImportanceCut );
1143  gTools().ReadAttr( wghtnode, "LinQuantile", fLinQuantile );
1144  gTools().ReadAttr( wghtnode, "AverageSupport", fAverageSupport );
1145  gTools().ReadAttr( wghtnode, "AverageRuleSigma", fAverageRuleSigma );
1146  gTools().ReadAttr( wghtnode, "Offset", fOffset );
1147 
1148  // read rules
1149  DeleteRules();
1150 
1151  UInt_t i = 0;
1152  fRules.resize( nrules );
1153  void* ch = gTools().GetChild( wghtnode );
1154  for (i=0; i<nrules; i++) {
1155  fRules[i] = new Rule();
1156  fRules[i]->SetRuleEnsemble( this );
1157  fRules[i]->ReadFromXML( ch );
1158 
1159  ch = gTools().GetNextChild(ch);
1160  }
1161 
1162  // read linear classifier (Fisher)
1163  fLinNorm .resize( nlinear );
1164  fLinTermOK .resize( nlinear );
1165  fLinCoefficients.resize( nlinear );
1166  fLinDP .resize( nlinear );
1167  fLinDM .resize( nlinear );
1168  fLinImportance .resize( nlinear );
1169 
1170  Int_t iok;
1171  i=0;
1172  while(ch) {
1173  gTools().ReadAttr( ch, "OK", iok );
1174  fLinTermOK[i] = (iok == 1);
1175  gTools().ReadAttr( ch, "Coeff", fLinCoefficients[i] );
1176  gTools().ReadAttr( ch, "Norm", fLinNorm[i] );
1177  gTools().ReadAttr( ch, "DM", fLinDM[i] );
1178  gTools().ReadAttr( ch, "DP", fLinDP[i] );
1179  gTools().ReadAttr( ch, "Importance", fLinImportance[i] );
1180 
1181  i++;
1182  ch = gTools().GetNextChild(ch);
1183  }
1184 }
1185 
1186 ////////////////////////////////////////////////////////////////////////////////
1187 /// read rule ensemble from stream
1188 
1189 void TMVA::RuleEnsemble::ReadRaw( std::istream & istr )
1190 {
1191  UInt_t nrules;
1192  //
1193  std::string dummy;
1194  Int_t idum;
1195  //
1196  // First block is general stuff
1197  //
1198  istr >> dummy >> fImportanceCut;
1199  istr >> dummy >> fLinQuantile;
1200  istr >> dummy >> fAverageSupport;
1201  istr >> dummy >> fAverageRuleSigma;
1202  istr >> dummy >> fOffset;
1203  istr >> dummy >> nrules;
1204  //
1205  // Now read in the rules
1206  //
1207  DeleteRules();
1208  //
1209  for (UInt_t i=0; i<nrules; i++){
1210  istr >> dummy >> idum; // read line "***Rule <ind>"
1211  fRules.push_back( new Rule() );
1212  (fRules.back())->SetRuleEnsemble( this );
1213  (fRules.back())->ReadRaw(istr);
1214  }
1215  //
1216  // and now the linear terms
1217  //
1218  UInt_t nlinear;
1219  //
1220  // coverity[tainted_data_argument]
1221  istr >> dummy >> nlinear;
1222  //
1223  fLinNorm .resize( nlinear );
1224  fLinTermOK .resize( nlinear );
1225  fLinCoefficients.resize( nlinear );
1226  fLinDP .resize( nlinear );
1227  fLinDM .resize( nlinear );
1228  fLinImportance .resize( nlinear );
1229  //
1230 
1231  Int_t iok;
1232  for (UInt_t i=0; i<nlinear; i++) {
1233  istr >> dummy >> idum;
1234  istr >> iok;
1235  fLinTermOK[i] = (iok==1);
1236  istr >> fLinCoefficients[i];
1237  istr >> fLinNorm[i];
1238  istr >> fLinDM[i];
1239  istr >> fLinDP[i];
1240  istr >> fLinImportance[i];
1241  }
1242 }
1243 
1244 ////////////////////////////////////////////////////////////////////////////////
1245 /// copy function
1246 
1248 {
1249  if(this != &other) {
1250  fRuleFit = other.GetRuleFit();
1251  fRuleMinDist = other.GetRuleMinDist();
1252  fOffset = other.GetOffset();
1253  fRules = other.GetRulesConst();
1254  fImportanceCut = other.GetImportanceCut();
1255  fVarImportance = other.GetVarImportance();
1256  fLearningModel = other.GetLearningModel();
1257  fLinQuantile = other.GetLinQuantile();
1258  fRuleNCsig = other.fRuleNCsig;
1259  fAverageRuleSigma = other.fAverageRuleSigma;
1260  fEventCacheOK = other.fEventCacheOK;
1261  fImportanceRef = other.fImportanceRef;
1262  fNRulesGenerated = other.fNRulesGenerated;
1263  fRuleFSig = other.fRuleFSig;
1264  fRuleMapInd0 = other.fRuleMapInd0;
1265  fRuleMapInd1 = other.fRuleMapInd1;
1266  fRuleMapOK = other.fRuleMapOK;
1267  fRuleNCave = other.fRuleNCave;
1268  }
1269 }
1270 
1271 ////////////////////////////////////////////////////////////////////////////////
1272 /// calculate the number of rules
1273 
1275 {
1276  if (dtree==0) return 0;
1277  Node *node = dtree->GetRoot();
1278  Int_t nendnodes = 0;
1279  FindNEndNodes( node, nendnodes );
1280  return 2*(nendnodes-1);
1281 }
1282 
1283 ////////////////////////////////////////////////////////////////////////////////
1284 /// find the number of leaf nodes
1285 
1286 void TMVA::RuleEnsemble::FindNEndNodes( const Node *node, Int_t & nendnodes )
1287 {
1288  if (node==0) return;
1289  if ((node->GetRight()==0) && (node->GetLeft()==0)) {
1290  ++nendnodes;
1291  return;
1292  }
1293  const Node *nodeR = node->GetRight();
1294  const Node *nodeL = node->GetLeft();
1295  FindNEndNodes( nodeR, nendnodes );
1296  FindNEndNodes( nodeL, nendnodes );
1297 }
1298 
1299 ////////////////////////////////////////////////////////////////////////////////
1300 /// create rules from the decsision tree structure
1301 
1303 {
1304  Node *node = dtree->GetRoot();
1305  AddRule( node );
1306 }
1307 
1308 ////////////////////////////////////////////////////////////////////////////////
1309 /// add a new rule to the tree
1310 
1312 {
1313  if (node==0) return;
1314  if (node->GetParent()==0) { // it's a root node, don't make a rule
1315  AddRule( node->GetRight() );
1316  AddRule( node->GetLeft() );
1317  }
1318  else {
1319  Rule *rule = MakeTheRule(node);
1320  if (rule) {
1321  fRules.push_back( rule );
1322  AddRule( node->GetRight() );
1323  AddRule( node->GetLeft() );
1324  }
1325  else {
1326  Log() << kFATAL << "<AddRule> - ERROR failed in creating a rule! BUG!" << Endl;
1327  }
1328  }
1329 }
1330 
1331 ////////////////////////////////////////////////////////////////////////////////
1332 ///
1333 /// Make a Rule from a given Node.
1334 /// The root node (ie no parent) does not generate a Rule.
1335 /// The first node in a rule is always the root node => fNodes.size()>=2
1336 /// Each node corresponds to a cut and the cut value is given by the parent node.
1337 ///
1338 ///
1339 
1341 {
1342  if (node==0) {
1343  Log() << kFATAL << "<MakeTheRule> Input node is NULL. Should not happen. BUG!" << Endl;
1344  return 0;
1345  }
1346 
1347  if (node->GetParent()==0) { // a root node - ignore
1348  return 0;
1349  }
1350  //
1351  std::vector< const Node * > nodeVec;
1352  const Node *parent = node;
1353  //
1354  // Make list with the input node at the end:
1355  // <root node> <node1> <node2> ... <node given as argument>
1356  //
1357  nodeVec.push_back( node );
1358  while (parent!=0) {
1359  parent = parent->GetParent();
1360  if (!parent) continue;
1361  const DecisionTreeNode* dtn = dynamic_cast<const DecisionTreeNode*>(parent);
1362  if (dtn && dtn->GetSelector()>=0)
1363  nodeVec.insert( nodeVec.begin(), parent );
1364 
1365  }
1366  if (nodeVec.size()<2) {
1367  Log() << kFATAL << "<MakeTheRule> BUG! Inconsistent Rule!" << Endl;
1368  return 0;
1369  }
1370  Rule *rule = new Rule( this, nodeVec );
1371  rule->SetMsgType( Log().GetMinType() );
1372  return rule;
1373 }
1374 
1375 ////////////////////////////////////////////////////////////////////////////////
1376 /// Makes rule map for all events
1377 
1378 void TMVA::RuleEnsemble::MakeRuleMap(const std::vector<const Event *> *events, UInt_t ifirst, UInt_t ilast)
1379 {
1380  Log() << kVERBOSE << "Making Rule map for all events" << Endl;
1381  // make rule response map
1382  if (events==0) events = GetTrainingEvents();
1383  if ((ifirst==0) || (ilast==0) || (ifirst>ilast)) {
1384  ifirst = 0;
1385  ilast = events->size()-1;
1386  }
1387  // check if identical to previous call
1388  if ((events!=fRuleMapEvents) ||
1389  (ifirst!=fRuleMapInd0) ||
1390  (ilast !=fRuleMapInd1)) {
1391  fRuleMapOK = kFALSE;
1392  }
1393  //
1394  if (fRuleMapOK) {
1395  Log() << kVERBOSE << "<MakeRuleMap> Map is already valid" << Endl;
1396  return; // already cached
1397  }
1398  fRuleMapEvents = events;
1399  fRuleMapInd0 = ifirst;
1400  fRuleMapInd1 = ilast;
1401  // check number of rules
1402  UInt_t nrules = GetNRules();
1403  if (nrules==0) {
1404  Log() << kVERBOSE << "No rules found in MakeRuleMap()" << Endl;
1405  fRuleMapOK = kTRUE;
1406  return;
1407  }
1408  //
1409  // init map
1410  //
1411  std::vector<UInt_t> ruleind;
1412  fRuleMap.clear();
1413  for (UInt_t i=ifirst; i<=ilast; i++) {
1414  ruleind.clear();
1415  fRuleMap.push_back( ruleind );
1416  for (UInt_t r=0; r<nrules; r++) {
1417  if (fRules[r]->EvalEvent(*((*events)[i]))) {
1418  fRuleMap.back().push_back(r); // save only rules that are accepted
1419  }
1420  }
1421  }
1422  fRuleMapOK = kTRUE;
1423  Log() << kVERBOSE << "Made rule map for event# " << ifirst << " : " << ilast << Endl;
1424 }
1425 
////////////////////////////////////////////////////////////////////////////////
/// std::ostream operator

std::ostream& TMVA::operator<< ( std::ostream& os, const RuleEnsemble & rules )
{
   // Deprecated: only the warning string goes to `os`; the actual printout
   // is produced by Print(), which writes through the logger, not the stream.
   os << "DON'T USE THIS - TO BE REMOVED" << std::endl;
   rules.Print();
   return os;
}
const std::vector< const TMVA::Event * > * GetTrainingEvents() const
get list of training events from the rule fitter
Double_t GetImportanceCut() const
Definition: RuleEnsemble.h:273
Double_t PdfLinear(Double_t &nsig, Double_t &ntot) const
This function returns Pr( y = 1 | x ) for the linear terms.
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:162
ELearningModel GetLearningModel() const
Definition: RuleEnsemble.h:272
const std::vector< Double_t > & GetVarImportance() const
Definition: RuleEnsemble.h:282
virtual Double_t Rndm(Int_t i=0)
Machine independent random number generator.
Definition: TRandom.cxx:512
bool equal(double d1, double d2, double stol=10000)
RuleEnsemble()
constructor
Rule * MakeTheRule(const Node *node)
Make a Rule from a given Node.
Int_t CalcNRules(const TMVA::DecisionTree *dtree)
calculate the number of rules
virtual ~RuleEnsemble()
destructor
1-D histogram with a float per channel (see TH1 documentation)}
Definition: TH1.h:570
Short_t Min(Short_t a, Short_t b)
Definition: TMathBase.h:170
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
Double_t GetLinQuantile() const
Definition: RuleEnsemble.h:284
const Bool_t kFALSE
Definition: Rtypes.h:92
void Print() const
print function
void PrintRuleGen() const
print rule generation info
void CleanupLinear()
cleanup linear model
void AddAttr(void *node, const char *, const T &value, Int_t precision=16)
Definition: Tools.h:308
void MakeRuleMap(const std::vector< const TMVA::Event * > *events=0, UInt_t ifirst=0, UInt_t ilast=0)
Makes rule map for all events.
void * AddChild(void *parent, const char *childname, const char *content=0, bool isRootNode=false)
add child node
Definition: Tools.cxx:1134
Short_t Abs(Short_t d)
Definition: TMathBase.h:110
virtual DecisionTreeNode * GetRoot() const
Definition: DecisionTree.h:102
Bool_t Equal(const Rule &other, Bool_t useCutValue, Double_t maxdist) const
Compare two rules.
Definition: Rule.cxx:168
void SetMsgType(EMsgType t)
const Event * GetTrainingEvent(UInt_t i) const
get the training event from the rule fitter
void SetImportanceRef(Double_t impref)
set reference importance
void RuleResponseStats()
calculate various statistics for this rule
double sqrt(double)
TClass * GetClass(T *)
Definition: TClass.h:554
Tools & gTools()
Definition: Tools.cxx:79
Double_t PdfRule(Double_t &nsig, Double_t &ntot) const
This function returns Pr( y = 1 | x ) for rules.
void RemoveSimilarRules()
remove rules that behave similar
void Copy(RuleEnsemble const &other)
copy function
void PrintRaw(std::ostream &os) const
write rules to stream
void * GetChild(void *parent, const char *childname=0)
get child node
Definition: Tools.cxx:1158
const MethodRuleFit * GetMethodRuleFit() const
Get a pointer to the original MethodRuleFit.
void CalcImportance()
calculate the importance of each rule
void CleanupRules()
cleanup rules
TThread * t[5]
Definition: threadsh1.C:13
Double_t FStar() const
We want to estimate F* = argmin Eyx( L(y,F(x) ), min wrt F(x) F(x) = FL(x) + FR(x) ...
void CalcVarImportance()
Calculates variable importance using eq (35) in RuleFit paper by Friedman et.al.
ROOT::R::TRInterface & r
Definition: Object.C:4
SVector< double, 2 > v
Definition: Dict.h:5
const RuleFit * GetRuleFit() const
Definition: RuleEnsemble.h:261
Double_t CalcRuleImportance()
calculate importance of each rule
EMsgType
Definition: Types.h:61
void MakeRulesFromTree(const DecisionTree *dtree)
create rules from the decision tree structure
Double_t CoefficientRadius()
Calculates sqrt(Sum(a_i^2)), i=1..N (NOTE do not include a0)
void AddRule(const Node *node)
add a new rule to the tree
unsigned int UInt_t
Definition: RtypesCore.h:42
char * Form(const char *fmt,...)
bool first
Definition: line3Dfit.C:48
tuple w
Definition: qtexample.py:51
void RuleStatistics()
calculate various statistics for this rule
void MakeRules(const std::vector< const TMVA::DecisionTree * > &forest)
Makes rules from the given decision tree.
Double_t GetWeight(Double_t x) const
tuple pl
Definition: first.py:10
void ReadAttr(void *node, const char *, T &value)
Definition: Tools.h:295
const std::vector< TMVA::Rule * > & GetRulesConst() const
Definition: RuleEnsemble.h:277
R__EXTERN TRandom * gRandom
Definition: TRandom.h:62
std::ostream & operator<<(std::ostream &os, const BinaryTree &tree)
print the tree recursively using the << operator
Definition: BinaryTree.cxx:157
Double_t fImportanceRef
Definition: RuleEnsemble.h:365
void ReadFromXML(void *wghtnode)
read rules from XML
void FindNEndNodes(const TMVA::Node *node, Int_t &nendnodes)
find the number of leaf nodes
void Initialize(const RuleFit *rf)
Initializes all member variables with default values.
void SetCoefficients(const std::vector< Double_t > &v)
set all rule coefficients
double Double_t
Definition: RtypesCore.h:55
int type
Definition: TGX11.cxx:120
static RooMathCoreReg dummy
void * GetNextChild(void *prevchild, const char *childname=0)
XML helpers.
Definition: Tools.cxx:1170
virtual Node * GetParent() const
Definition: Node.h:93
void MakeLinearTerms()
Make the linear terms as in eq 25, ref 2. For this the b and (1-b) quantiles are needed.
virtual Node * GetRight() const
Definition: Node.h:92
Double_t fAverageRuleSigma
Definition: RuleEnsemble.h:367
UInt_t GetClass() const
Definition: Event.h:86
void CalcRuleSupport()
calculate the support for all rules
static Vc_ALWAYS_INLINE int_v max(const int_v &x, const int_v &y)
Definition: vector.h:440
Double_t GetRuleMinDist() const
Definition: RuleEnsemble.h:290
void MakeModel()
create model
Short_t GetSelector() const
void SetRules(const std::vector< TMVA::Rule * > &rules)
set rules
Bool_t UseBoost() const
Definition: MethodRuleFit.h:96
void SetMsgType(EMsgType t)
Definition: Rule.cxx:152
Short_t Max(Short_t a, Short_t b)
Definition: TMathBase.h:202
void ResetCoefficients()
reset all rule coefficients
void * AddXMLTo(void *parent) const
write rules to XML
void GetCoefficients(std::vector< Double_t > &v)
Retrieve all rule coefficients.
Double_t Sqrt(Double_t x)
Definition: TMath.h:464
Double_t CalcLinImportance()
calculate the linear importance for each rule
const Bool_t kTRUE
Definition: Rtypes.h:91
const MethodBase * GetMethodBase() const
Get a pointer to the original MethodRuleFit.
Double_t GetOffset() const
Definition: RuleEnsemble.h:275
Definition: math.cpp:60
virtual Node * GetLeft() const
Definition: Node.h:91
void ReadRaw(std::istream &istr)
read rule ensemble from stream