// ULP-based floating point comparisons. The machine epsilon is scaled to the
// magnitude of the operands and to the desired precision in ULPs (units in
// the last place); the epsilon term is restored here from the standard
// implementation, only the final || clause was visible in the fragment.
bool almost_equal_float(float x, float y, int ulp = 4){
   return std::abs(x - y) < std::numeric_limits<float>::epsilon() * std::abs(x + y) * ulp
      || std::abs(x - y) < std::numeric_limits<float>::min();
}

bool almost_equal_double(double x, double y, int ulp = 4){
   return std::abs(x - y) < std::numeric_limits<double>::epsilon() * std::abs(x + y) * ulp
      || std::abs(x - y) < std::numeric_limits<double>::min();
}
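// Illustration (not part of the TMVA source): why exact '==' is avoided in
// favour of the ULP-scaled test above. 0.1 + 0.2 != 0.3 in binary floating
// point, but the two values differ only by a few ULPs.
inline bool demo_almost_equal()
{
   double a = 0.1 + 0.2;                      // 0.30000000000000004...
   bool exact  = (a == 0.3);                  // false
   bool almost = almost_equal_double(a, 0.3); // true
   return !exact && almost;
}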
   fMinLinCorrForFisher (1),
   fUseExclusiveVars (kTRUE),
   // ...
   fPruneMethod (kNoPruning),
   fNNodesBeforePruning(0),
   fNodePurityLimit(0.5),
   // ...
   fAnalysisType (Types::kClassification),
   fMinLinCorrForFisher (1),
   fUseExclusiveVars (kTRUE),
   // ...
   fMinNodeSize (minSize),
   // ...
   fPruneMethod (kNoPruning),
   fNNodesBeforePruning(0),
   fNodePurityLimit(purityLimit),
   fRandomisedTree (randomisedTree),
   fUseNvars (useNvars),
   fUsePoissonNvars(usePoissonNvars),
   // ...
   fMaxDepth (nMaxDepth),
   // ...
   fAnalysisType (Types::kClassification),
   fDataSetInfo (dataInfo)
{
   if (sepType == NULL) {
      // ...
      Log() << kWARNING << " You had chosen the training mode using optimal cuts, not\n"
            << " based on a grid of " << fNCuts << " by setting the option NCuts < 0\n"
            << " as this doesn't exist yet, I set it to " << fNCuts << " and use the grid"
            // ...
   fUseFisherCuts (d.fUseFisherCuts),
   fMinLinCorrForFisher (d.fMinLinCorrForFisher),
   fUseExclusiveVars (d.fUseExclusiveVars),
   fSepType (d.fSepType),
   fRegType (d.fRegType),
   fMinSize (d.fMinSize),
   fMinNodeSize(d.fMinNodeSize),
   fMinSepGain (d.fMinSepGain),
   fUseSearchTree (d.fUseSearchTree),
   fPruneStrength (d.fPruneStrength),
   fPruneMethod (d.fPruneMethod),
   fNodePurityLimit(d.fNodePurityLimit),
   fRandomisedTree (d.fRandomisedTree),
   fUseNvars (d.fUseNvars),
   fUsePoissonNvars(d.fUsePoissonNvars),
   fMyTrandom (new TRandom3(fgRandomSeed)),
   fMaxDepth (d.fMaxDepth),
   fSigClass (d.fSigClass),
   // ...
   fAnalysisType(d.fAnalysisType),
   fDataSetInfo (d.fDataSetInfo)
   if (fMyTrandom) delete fMyTrandom;
   if (fRegType)   delete fRegType;
      Log() << kFATAL << "SetParentTreeNodes: started with undefined ROOT node" << Endl;
   // ...
   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      // ...
   }
   else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      // ...
   }
   // ...
   if (this->GetLeftDaughter(n) != NULL) {
      this->SetParentTreeInNodes( this->GetLeftDaughter(n) );
   }
   if (this->GetRightDaughter(n) != NULL) {
      this->SetParentTreeInNodes( this->GetRightDaughter(n) );
   }
   n->SetParentTree(this);
   if (n->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(n->GetDepth());
   std::string type("");
   // ...
   dt->ReadXML( node, tmva_Version_Code );
      xmin = std::vector<Float_t>(nvars);
      xmax = std::vector<Float_t>(nvars);
      // ...
      for (Int_t ivar=0; ivar<fNvars; ivar++) {
         // ...
      }
   // ...
   BuildNodeInfo(Int_t fNvars, std::vector<Float_t>& inxmin, std::vector<Float_t>& inxmax){
      // ...
      xmin = std::vector<Float_t>(nvars);
      xmax = std::vector<Float_t>(nvars);
      // ...
      for (Int_t ivar=0; ivar<fNvars; ivar++) {
         xmin[ivar]=inxmin[ivar];
         xmax[ivar]=inxmax[ivar];
      }
   }
   // ...
   std::vector<Float_t> xmin;
   std::vector<Float_t> xmax;
   // ...
   BuildNodeInfo operator+(const BuildNodeInfo& other)
   {
      BuildNodeInfo ret(nvars, xmin, xmax);
      if(nvars != other.nvars)
      {
         std::cout << "!!! ERROR BuildNodeInfo1+BuildNodeInfo2 failure. Nvars1 != Nvars2." << std::endl;
         // ...
      }
      // ...
      ret.suw = suw + other.suw;
      ret.sub = sub + other.sub;
      // ...
      ret.buw = buw + other.buw;
      ret.bub = bub + other.bub;
      ret.target  = target  + other.target;
      ret.target2 = target2 + other.target2;
      // ...
      for(Int_t i=0; i<nvars; i++)
      {
         ret.xmin[i] = xmin[i]<other.xmin[i] ? xmin[i] : other.xmin[i];
         ret.xmax[i] = xmax[i]>other.xmax[i] ? xmax[i] : other.xmax[i];
      }
   // ...
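// Illustration (not part of the TMVA source): because BuildNodeInfo defines
// operator+, per-thread partial sums can be merged with std::accumulate,
// which is how the parallel reduction ("redfunc") below combines partitions.
// PartialSums is a hypothetical two-field stand-in for BuildNodeInfo.
#include <numeric>
#include <vector>

struct PartialSums {
   double s = 0, b = 0; // signal / background weight sums
   PartialSums operator+(const PartialSums& other) const
   {
      PartialSums ret;
      ret.s = s + other.s;
      ret.b = b + other.b;
      return ret;
   }
};

inline double demo_reduce()
{
   std::vector<PartialSums> perThread{{1.0, 0.5}, {2.0, 0.25}, {0.5, 0.25}};
   PartialSums total = std::accumulate(perThread.begin(), perThread.end(), PartialSums{});
   return total.s + total.b; // 4.5
}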
   this->GetRoot()->SetPos('s');
   this->GetRoot()->SetDepth(0);
   this->GetRoot()->SetParentTree(this);
   fMinSize = fMinNodeSize/100. * eventSample.size();
   // ...
   Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize
         << " fMinNodeSize=" << fMinNodeSize
         << "% is translated to an actual number of events = " << fMinSize
         << " for the training sample size of " << eventSample.size() << Endl;
   Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
   Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
   // ...
   UInt_t nevents = eventSample.size();
   // ...
   if (fNvars==0) fNvars = eventSample[0]->GetNVariables();
   fVariableImportance.resize(fNvars);
   // ...
   else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;
   auto f = [this, &eventSample, &nPartitions](UInt_t partition = 0){
      // ...
      Int_t start = 1.0*partition/nPartitions*eventSample.size();
      Int_t end   = (partition+1.0)/nPartitions*eventSample.size();
      // ...
      BuildNodeInfo nodeInfof(fNvars, eventSample[0]);
      // ...
      for(Int_t iev=start; iev<end; iev++){
         // ...
         nodeInfof.s += weight;
         // ...
         nodeInfof.sub += orgWeight;
         // ...
         nodeInfof.b += weight;
         // ...
         nodeInfof.bub += orgWeight;
         // ...
         if ( DoRegression() ) {
            // ...
            nodeInfof.target +=weight*tgt;
            nodeInfof.target2+=weight*tgt*tgt;
         }
         // ...
         for (UInt_t ivar=0; ivar<fNvars; ivar++) {
            // ...
            nodeInfof.xmin[ivar]=val;
            nodeInfof.xmax[ivar]=val;
            // ...
            if (val < nodeInfof.xmin[ivar]) nodeInfof.xmin[ivar]=val;
            if (val > nodeInfof.xmax[ivar]) nodeInfof.xmax[ivar]=val;
         }
      }
   };
   // ...
   BuildNodeInfo nodeInfoInit(fNvars, eventSample[0]);
   // ...
   auto redfunc = [nodeInfoInit](std::vector<BuildNodeInfo> v) -> BuildNodeInfo {
      return std::accumulate(v.begin(), v.end(), nodeInfoInit);
   };
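   // The elided dispatch (a reconstruction, based on TMVA's Executor API) runs
   // f over the partitions and folds the per-partition results with redfunc,
   // roughly:
   //   BuildNodeInfo nodeInfo = TMVA::Config::Instance().GetThreadExecutor()
   //                               .MapReduce(f, ROOT::TSeqU(nPartitions), redfunc);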
   if (nodeInfo.s+nodeInfo.b < 0) {
      Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
            << "(Nsig=" << nodeInfo.s << " Nbkg=" << nodeInfo.b
            << " Probably you use a Monte Carlo with negative weights. That should in principle "
            << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
            << "minimal number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize=" << fMinNodeSize
            << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
            << "to allow for reasonable averaging!!!" << Endl
            << " If this does not help.. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
            << "with negative weight in the training." << Endl;
      // ...
      for (UInt_t i=0; i<eventSample.size(); i++) {
         if (eventSample[i]->GetClass() != fSigClass) {
            nBkg += eventSample[i]->GetWeight();
            Log() << kDEBUG << "Event " << i
                  << " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
                  << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
         }
      }
      Log() << kDEBUG << " that gives in total: " << nBkg << Endl;
   if (node == this->GetRoot()) {
      // ...
   }
   // ...
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      // ...
   }
   // ...
   if ((eventSample.size() >= 2*fMinSize && nodeInfo.s+nodeInfo.b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
       && ( ( nodeInfo.s!=0 && nodeInfo.b !=0 && !DoRegression()) || ( (nodeInfo.s+nodeInfo.b)!=0 && DoRegression()) ) ) {
      // ...
      separationGain = this->TrainNodeFast(eventSample, node);
      // ...
      separationGain = this->TrainNodeFull(eventSample, node);
      // ...
      if (DoRegression()) {
         node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
         node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
         if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b),
                                 nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ){
            // ...
         }
         // ...
         node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b)
                                  - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
      }
      // ...
      if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
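// Illustration (not part of the TMVA source): the regression response and RMS
// set above follow the weighted-moment identity Var[t] = E[t^2] - (E[t])^2,
// where s+b is the total weight, 'target' the weighted sum of t and 'target2'
// of t^2. std::sqrt stands in for TMath::Sqrt here.
#include <cmath>

inline double demo_weighted_rms(double sumw, double target, double target2)
{
   double mean = target / sumw;                // node response, E[t]
   double var  = target2 / sumw - mean * mean; // E[t^2] - E[t]^2
   return var > 0 ? std::sqrt(var) : 0.0;      // guard rounding, cf. almost_equal_double
}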
   std::vector<const TMVA::Event*> leftSample;  leftSample.reserve(nevents);
   std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);
   // ...
   Double_t nRightUnBoosted=0, nLeftUnBoosted=0;
   // ...
   for (UInt_t ie=0; ie< nevents ; ie++) {
      // ...
      rightSample.push_back(eventSample[ie]);
      nRight += eventSample[ie]->GetWeight();
      nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
      // ...
      leftSample.push_back(eventSample[ie]);
      nLeft += eventSample[ie]->GetWeight();
      nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
      // ...
   }
   // ...
   if (leftSample.empty() || rightSample.empty()) {
      // ...
      Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
            << "--- Hence new node == old node ... check" << Endl
            << "--- left:" << leftSample.size()
            << " right:" << rightSample.size() << Endl
            << " while the separation is thought to be " << separationGain
            << "\n when cutting on variable " << node->GetSelector()
            // ...
            << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
   }
   // ...
   this->BuildTree(rightSample, rightNode);
   this->BuildTree(leftSample,  leftNode );
   if (DoRegression()) {
      node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
      node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
      if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b),
                              nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ) {
         // ...
      }
      // ...
      node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b)
                               - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
   }
   // ...
   if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
   this->GetRoot()->SetPos('s');
   this->GetRoot()->SetDepth(0);
   this->GetRoot()->SetParentTree(this);
   fMinSize = fMinNodeSize/100. * eventSample.size();
   // ...
   Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize
         << " fMinNodeSize=" << fMinNodeSize
         << "% is translated to an actual number of events = " << fMinSize
         << " for the training sample size of " << eventSample.size() << Endl;
   Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
   Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
   // ...
   UInt_t nevents = eventSample.size();
   // ...
   if (fNvars==0) fNvars = eventSample[0]->GetNVariables();
   fVariableImportance.resize(fNvars);
   // ...
   else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      // ...
   }
   // ...
   for (UInt_t iev=0; iev<eventSample.size(); iev++) {
      // ...
      if ( DoRegression() ) {
         // ...
         target2+=weight*tgt*tgt;
      }
      // ...
      for (UInt_t ivar=0; ivar<fNvars; ivar++) {
         // ...
         if (iev==0) xmin[ivar]=xmax[ivar]=val;
         if (val < xmin[ivar]) xmin[ivar]=val;
         if (val > xmax[ivar]) xmax[ivar]=val;
      }
   }
      Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
            << "(Nsig=" << s << " Nbkg=" << b
            << " Probably you use a Monte Carlo with negative weights. That should in principle "
            << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
            << "minimal number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize=" << fMinNodeSize
            << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
            << "to allow for reasonable averaging!!!" << Endl
            << " If this does not help.. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
            << "with negative weight in the training." << Endl;
      // ...
      for (UInt_t i=0; i<eventSample.size(); i++) {
         if (eventSample[i]->GetClass() != fSigClass) {
            nBkg += eventSample[i]->GetWeight();
            Log() << kDEBUG << "Event " << i
                  << " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
                  << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
         }
      }
      Log() << kDEBUG << " that gives in total: " << nBkg << Endl;
   if (node == this->GetRoot()) {
      // ...
   }
   // ...
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      // ...
   }
   // ...
   if ((eventSample.size() >= 2*fMinSize && s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
       && ( ( s!=0 && b !=0 && !DoRegression()) || ( (s+b)!=0 && DoRegression()) ) ) {
      // ...
      separationGain = this->TrainNodeFast(eventSample, node);
      // ...
      separationGain = this->TrainNodeFull(eventSample, node);
      // ...
      if (DoRegression()) {
         // ...
      }
      // ...
      if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
   std::vector<const TMVA::Event*> leftSample;  leftSample.reserve(nevents);
   std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);
   // ...
   Double_t nRightUnBoosted=0, nLeftUnBoosted=0;
   // ...
   for (UInt_t ie=0; ie< nevents ; ie++) {
      // ...
      rightSample.push_back(eventSample[ie]);
      nRight += eventSample[ie]->GetWeight();
      nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
      // ...
      leftSample.push_back(eventSample[ie]);
      nLeft += eventSample[ie]->GetWeight();
      nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
      // ...
   }
   // ...
   if (leftSample.empty() || rightSample.empty()) {
      // ...
      Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
            << "--- Hence new node == old node ... check" << Endl
            << "--- left:" << leftSample.size()
            << " right:" << rightSample.size() << Endl
            << " while the separation is thought to be " << separationGain
            << "\n when cutting on variable " << node->GetSelector()
            // ...
            << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
   }
   // ...
   this->BuildTree(rightSample, rightNode);
   this->BuildTree(leftSample,  leftNode );
   // ...
   if (DoRegression()) {
      // ...
   }
   // ...
   if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
   for (UInt_t i=0; i<eventSample.size(); i++) {
      this->FillEvent(*(eventSample[i]),NULL);
   }
   // ...
   node = this->GetRoot();
   // ...
   if (event.GetClass() == fSigClass) {
      // ...
   }
   // ...
   this->FillEvent(event, node->GetRight());
   // ...
   this->FillEvent(event, node->GetLeft());
   // ...
   if (this->GetRoot()!=NULL) this->GetRoot()->ClearNodeAndAllDaughters();
   node = this->GetRoot();
   // ...
   if (l->GetNodeType() * r->GetNodeType() > 0) {
      // ...
      this->PruneNode(node);
   }
   // ...
   return this->CountNodes();
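// Note on the test above: GetNodeType() is +1 for signal leaves and -1 for
// background leaves, so a positive product l->GetNodeType()*r->GetNodeType()
// means both daughters classify identically and the split is redundant --
// PruneNode collapses it into a single leaf.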
   if( fPruneMethod == kNoPruning ) return 0.0;
   // ...
   if      (fPruneMethod == kExpectedErrorPruning)
      // ...
   else if (fPruneMethod == kCostComplexityPruning)
      // ...
   else {
      Log() << kFATAL << "Selected pruning method not yet implemented " << Endl;
   }
   // ...
   if(!tool) return 0.0;
   // ...
   if(validationSample == NULL){
      Log() << kFATAL << "Cannot automate the pruning algorithm without an "
            << "independent validation sample!" << Endl;
   }
   else if(validationSample->size() == 0) {
      Log() << kFATAL << "Cannot automate the pruning algorithm with "
            << "independent validation sample of ZERO events!" << Endl;
   }
   // ...
   Log() << kFATAL << "Error pruning tree! Check prune.log for more information." << Endl;
   // ...
   return pruneStrength;
   GetRoot()->ResetValidationData();
   for (UInt_t ievt=0; ievt < validationSample->size(); ievt++) {
      CheckEventWithPrunedTree((*validationSample)[ievt]);
   }
      n = this->GetRoot();
      // ...
      Log() << kFATAL << "TestPrunedTreeQuality: started with undefined ROOT node" << Endl;
   // ...
   if( n->GetLeft() != NULL && n->GetRight() != NULL && !n->IsTerminal() ) {
      return (TestPrunedTreeQuality( n->GetLeft(),  mode ) +
              TestPrunedTreeQuality( n->GetRight(), mode ));
   }
   // ...
   if (DoRegression()) {
      Double_t sumw = n->GetNSValidation() + n->GetNBValidation();
      return n->GetSumTarget2() - 2*n->GetSumTarget()*n->GetResponse() + sumw*n->GetResponse()*n->GetResponse();
   }
   // ...
   if (mode == 0) {
      if (n->GetPurity() > this->GetNodePurityLimit())
         return n->GetNBValidation();
      else
         return n->GetNSValidation();
   }
   else if ( mode == 1 ) {
      // ...
      return (n->GetPurity() * n->GetNBValidation() + (1.0 - n->GetPurity()) * n->GetNSValidation());
   }
   // ...
   throw std::string("Unknown ValidationQualityMode");
   if (current == NULL) {
      Log() << kFATAL << "CheckEventWithPrunedTree: started with undefined ROOT node" << Endl;
   }
   // ...
   while(current != NULL) {
      if(e->GetClass() == fSigClass)
         // ...
      // ...
      if (e->GetNTargets() > 0) {
         // ...
   for( EventConstList::const_iterator it = validationSample->begin();
        it != validationSample->end(); ++it ) {
      sumWeights += (*it)->GetWeight();
   }
      n = this->GetRoot();
      // ...
      Log() << kFATAL << "CountLeafNodes: started with undefined ROOT node" << Endl;
   // ...
   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
      // ...
   }
   // ...
   if (this->GetLeftDaughter(n) != NULL) {
      countLeafs += this->CountLeafNodes( this->GetLeftDaughter(n) );
   }
   if (this->GetRightDaughter(n) != NULL) {
      countLeafs += this->CountLeafNodes( this->GetRightDaughter(n) );
   }
      n = this->GetRoot();
      // ...
      Log() << kFATAL << "DescendTree: started with undefined ROOT node" << Endl;
   // ...
   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
      // ...
   }
   else if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      // ...
   }
   else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      // ...
   }
   // ...
   if (this->GetLeftDaughter(n) != NULL) {
      this->DescendTree( this->GetLeftDaughter(n) );
   }
   if (this->GetRightDaughter(n) != NULL) {
      this->DescendTree( this->GetRightDaughter(n) );
   }
   this->DeleteNode(l);
   this->DeleteNode(r);
   // ...
   if(node == NULL) return;
   // ...
   node->SetAlpha( std::numeric_limits<double>::infinity( ) );
   Node* current = this->GetRoot();
   // ...
   for (UInt_t i =0; i < depth; i++) {
      // ...
      if ( tmp & sequence) current = this->GetRightDaughter(current);
      else                 current = this->GetLeftDaughter(current);
   }
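// Illustration (not part of the TMVA source): how (sequence, depth) addresses
// a node in the loop above -- bit i of 'sequence' picks right (1) or left (0)
// at depth i, where the elided line presumably computes tmp = 1 << i.
// DemoNode is a hypothetical stand-in for TMVA::Node.
struct DemoNode { DemoNode *left = nullptr, *right = nullptr; };

inline DemoNode* demo_get_node(DemoNode* root, unsigned long sequence, unsigned depth)
{
   DemoNode* current = root;
   for (unsigned i = 0; i < depth && current != nullptr; i++) {
      unsigned long tmp = 1ul << i; // the bit tested at this depth
      current = (tmp & sequence) ? current->right : current->left;
   }
   return current; // e.g. sequence 0b101, depth 3: right, left, right
}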
   for (UInt_t ivar=0; ivar<fNvars; ivar++) useVariable[ivar]=kFALSE;
   // ...
   else useNvars = fUseNvars;
   // ...
   UInt_t nSelectedVars = 0;
   while (nSelectedVars < useNvars) {
      Double_t bla = fMyTrandom->Rndm()*fNvars;
      // ...
      nSelectedVars = 0;
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         if (useVariable[ivar] == kTRUE) {
            mapVariable[nSelectedVars] = ivar;
            nSelectedVars++;
         }
      }
   }
   if (nSelectedVars != useNvars) { std::cout << "Bug in TrainNode - GetRandomisedVariables()... sorry" << std::endl; std::exit(1); }
struct TrainNodeInfo{
   // ...
      nSelS            = std::vector< std::vector<Double_t> >(cNvars);
      nSelB            = std::vector< std::vector<Double_t> >(cNvars);
      nSelS_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
      nSelB_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
      target           = std::vector< std::vector<Double_t> >(cNvars);
      target2          = std::vector< std::vector<Double_t> >(cNvars);
      // ...
      for(Int_t ivar=0; ivar<cNvars; ivar++){
         nSelS[ivar]            = std::vector<Double_t>(nBins[ivar], 0);
         nSelB[ivar]            = std::vector<Double_t>(nBins[ivar], 0);
         nSelS_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         nSelB_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
         target[ivar]           = std::vector<Double_t>(nBins[ivar], 0);
         target2[ivar]          = std::vector<Double_t>(nBins[ivar], 0);
      }
   // ...
   std::vector< std::vector<Double_t> > nSelS;
   std::vector< std::vector<Double_t> > nSelB;
   std::vector< std::vector<Double_t> > nSelS_unWeighted;
   std::vector< std::vector<Double_t> > nSelB_unWeighted;
   std::vector< std::vector<Double_t> > target;
   std::vector< std::vector<Double_t> > target2;
   // ...
   TrainNodeInfo operator+(const TrainNodeInfo& other)
   {
      TrainNodeInfo ret(cNvars, nBins);
      // ...
      if(cNvars != other.cNvars)
      {
         std::cout << "!!! ERROR TrainNodeInfo1+TrainNodeInfo2 failure. cNvars1 != cNvars2." << std::endl;
         // ...
      }
      // ...
      for (Int_t ivar=0; ivar<cNvars; ivar++) {
         for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
            ret.nSelS[ivar][ibin]            = nSelS[ivar][ibin] + other.nSelS[ivar][ibin];
            ret.nSelB[ivar][ibin]            = nSelB[ivar][ibin] + other.nSelB[ivar][ibin];
            ret.nSelS_unWeighted[ivar][ibin] = nSelS_unWeighted[ivar][ibin] + other.nSelS_unWeighted[ivar][ibin];
            ret.nSelB_unWeighted[ivar][ibin] = nSelB_unWeighted[ivar][ibin] + other.nSelB_unWeighted[ivar][ibin];
            ret.target[ivar][ibin]           = target[ivar][ibin] + other.target[ivar][ibin];
            ret.target2[ivar][ibin]          = target2[ivar][ibin] + other.target2[ivar][ibin];
         }
      }
      // ...
      ret.nTotS            = nTotS + other.nTotS;
      ret.nTotS_unWeighted = nTotS_unWeighted + other.nTotS_unWeighted;
      ret.nTotB            = nTotB + other.nTotB;
      ret.nTotB_unWeighted = nTotB_unWeighted + other.nTotB_unWeighted;
   for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
      separationGain[ivar]=-1;
      // ...
   }
   // ...
   UInt_t nevents = eventSample.size();
   // ...
   std::vector<Double_t> fisherCoeff;
   // ...
   if (fRandomisedTree) {
      // ...
      GetRandomisedVariables(useVariable,mapVariable,tmp);
   }
   // ...
   for (UInt_t ivar=0; ivar < fNvars; ivar++) {
      useVariable[ivar] = kTRUE;
      mapVariable[ivar] = ivar;
   }
   // ...
   useVariable[fNvars] = kFALSE;
   if (fUseFisherCuts) {
      useVariable[fNvars] = kTRUE;
      // ...
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         useVarInFisher[ivar] = kFALSE;
         mapVarInFisher[ivar] = ivar;
      }
      // ...
      std::vector<TMatrixDSym*>* covMatrices;
      // ...
      Log() << kWARNING << " in TrainNodeFast, the covariance Matrices needed for the Fisher-Cuts returned error --> revert to just normal cuts for this node" << Endl;
      // ...
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
            if ( (TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
                 (TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
               useVarInFisher[ivar] = kTRUE;
               useVarInFisher[jvar] = kTRUE;
            }
         }
      }
      // ...
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         // ...
         if (useVarInFisher[ivar] && useVariable[ivar]) {
            mapVarInFisher[nFisherVars++]=ivar;
            // ...
            if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
         }
      }
      // ...
      fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
      // ...
      delete [] useVarInFisher;
      delete [] mapVarInFisher;
   }
   // ...
   if (fUseFisherCuts && fisherOK) cNvars++;
   for (UInt_t ivar=0; ivar<cNvars; ivar++) {
      // ...
      nBins[ivar] = fNCuts+1;
      if (ivar < fNvars) {
         if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
            // ...
         }
      }
      // ...
      cutValues[ivar] = new Double_t [nBins[ivar]];
   }
   // ...
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      // ...
      useVariable[ivar]=kFALSE;
      // ...
      for (UInt_t iev=0; iev<nevents; iev++) {
         // ...
         Double_t result = fisherCoeff[fNvars];
         for (UInt_t jvar=0; jvar<fNvars; jvar++)
            result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
         if (result > xmax[ivar]) xmax[ivar]=result;
         if (result < xmin[ivar]) xmin[ivar]=result;
      }
      // ...
      for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
         cutValues[ivar][ibin]=0;
      }
   }
   // ...
   auto fvarInitCuts = [this, &useVariable, &cutValues, &invBinWidth, &binWidth, &nBins, &xmin, &xmax](UInt_t ivar = 0){
      // ...
      if ( useVariable[ivar] ) {
         // ...
         invBinWidth[ivar] = 1./binWidth[ivar];
         if (ivar < fNvars) {
            if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
         }
         // ...
         for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
            cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
         }
      }
   };
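// Illustration (not part of the TMVA source): the bin lookup implied by the
// grid above. With fNCuts cuts there are nBins = fNCuts+1 equal-width bins on
// [xmin, xmax]; invBinWidth = 1/binWidth turns a value into a bin index.
inline int demo_bin_index(double val, double xmin, double invBinWidth, int nBins)
{
   int iBin = static_cast<int>((val - xmin) * invBinWidth); // truncate toward xmin
   if (iBin < 0)         iBin = 0;         // clamp underflow
   if (iBin > nBins - 1) iBin = nBins - 1; // clamp overflow
   return iBin;
}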
   TrainNodeInfo nodeInfo(cNvars, nBins);
   // ...
   if(eventSample.size() >= cNvars*fNCuts*nPartitions*2)
   {
      // ...
      auto f = [this, &eventSample, &fisherCoeff, &useVariable, &invBinWidth,
                &nBins, &xmin, &cNvars, &nPartitions](UInt_t partition = 0){
         // ...
         UInt_t start = 1.0*partition/nPartitions*eventSample.size();
         UInt_t end   = (partition+1.0)/nPartitions*eventSample.size();
         // ...
         TrainNodeInfo nodeInfof(cNvars, nBins);
         // ...
         for(UInt_t iev=start; iev<end; iev++) {
            // ...
            Double_t eventWeight = eventSample[iev]->GetWeight();
            if (eventSample[iev]->GetClass() == fSigClass) {
               nodeInfof.nTotS+=eventWeight;
               nodeInfof.nTotS_unWeighted++;
            }
            else {
               nodeInfof.nTotB+=eventWeight;
               nodeInfof.nTotB_unWeighted++;
            }
            // ...
            for (UInt_t ivar=0; ivar < cNvars; ivar++) {
               // ...
               if ( useVariable[ivar] ) {
                  // ...
                  if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
                  else {
                     eventData = fisherCoeff[fNvars];
                     for (UInt_t jvar=0; jvar<fNvars; jvar++)
                        eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
                  }
                  // ...
                  if (eventSample[iev]->GetClass() == fSigClass) {
                     nodeInfof.nSelS[ivar][iBin]+=eventWeight;
                     nodeInfof.nSelS_unWeighted[ivar][iBin]++;
                  }
                  else {
                     nodeInfof.nSelB[ivar][iBin]+=eventWeight;
                     nodeInfof.nSelB_unWeighted[ivar][iBin]++;
                  }
                  if (DoRegression()) {
                     nodeInfof.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
                     nodeInfof.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
                  }
               }
            }
         }
         // ...
      };
      // ...
      TrainNodeInfo nodeInfoInit(cNvars, nBins);
      // ...
      auto redfunc = [nodeInfoInit](std::vector<TrainNodeInfo> v) -> TrainNodeInfo {
         return std::accumulate(v.begin(), v.end(), nodeInfoInit);
      };
   auto fvarFillNodeInfo = [this, &nodeInfo, &eventSample, &fisherCoeff, &useVariable, &invBinWidth, &nBins, &xmin](UInt_t ivar = 0){
      for(UInt_t iev=0; iev<eventSample.size(); iev++) {
         // ...
         Double_t eventWeight = eventSample[iev]->GetWeight();
         // ...
         if (eventSample[iev]->GetClass() == fSigClass) {
            nodeInfo.nTotS+=eventWeight;
            nodeInfo.nTotS_unWeighted++;
         }
         else {
            nodeInfo.nTotB+=eventWeight;
            nodeInfo.nTotB_unWeighted++;
         }
         // ...
         if ( useVariable[ivar] ) {
            // ...
            if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
            else {
               eventData = fisherCoeff[fNvars];
               for (UInt_t jvar=0; jvar<fNvars; jvar++)
                  eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
            }
            // ...
            if (eventSample[iev]->GetClass() == fSigClass) {
               nodeInfo.nSelS[ivar][iBin]+=eventWeight;
               nodeInfo.nSelS_unWeighted[ivar][iBin]++;
            }
            else {
               nodeInfo.nSelB[ivar][iBin]+=eventWeight;
               nodeInfo.nSelB_unWeighted[ivar][iBin]++;
            }
            // ...
            if (DoRegression()) {
               nodeInfo.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
               nodeInfo.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
            }
         }
      }
   };
   auto fvarCumulative = [&nodeInfo, &useVariable, &nBins, this, &eventSample](UInt_t ivar = 0){
      if (useVariable[ivar]) {
         for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
            nodeInfo.nSelS[ivar][ibin]+=nodeInfo.nSelS[ivar][ibin-1];
            nodeInfo.nSelS_unWeighted[ivar][ibin]+=nodeInfo.nSelS_unWeighted[ivar][ibin-1];
            nodeInfo.nSelB[ivar][ibin]+=nodeInfo.nSelB[ivar][ibin-1];
            nodeInfo.nSelB_unWeighted[ivar][ibin]+=nodeInfo.nSelB_unWeighted[ivar][ibin-1];
            if (DoRegression()) {
               nodeInfo.target[ivar][ibin] +=nodeInfo.target[ivar][ibin-1] ;
               nodeInfo.target2[ivar][ibin]+=nodeInfo.target2[ivar][ibin-1];
            }
         }
         if (nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
            Log() << kFATAL << "Helge, you have a bug ....nodeInfo.nSelS_unw..+nodeInfo.nSelB_unw..= "
                  << nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1]
                  << " while eventsample size = " << eventSample.size()
                  << Endl;
         }
         double lastBins=nodeInfo.nSelS[ivar][nBins[ivar]-1] +nodeInfo.nSelB[ivar][nBins[ivar]-1];
         double totalSum=nodeInfo.nTotS+nodeInfo.nTotB;
         if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
            Log() << kFATAL << "Helge, you have another bug ....nodeInfo.nSelS+nodeInfo.nSelB= "
                  << lastBins
                  << " while total number of events = " << totalSum
                  << Endl;
         }
      }
   };
   auto fvarMaxSep = [&nodeInfo, &useVariable, this, &separationGain, &cutIndex, &nBins] (UInt_t ivar = 0){
      if (useVariable[ivar]) {
         // ...
         for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) {
            // ...
            Double_t sl = nodeInfo.nSelS_unWeighted[ivar][iBin];
            Double_t bl = nodeInfo.nSelB_unWeighted[ivar][iBin];
            // ...
            Double_t slW = nodeInfo.nSelS[ivar][iBin];
            Double_t blW = nodeInfo.nSelB[ivar][iBin];
            // ...
            if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
                 && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
                 ) {
               // ...
               if (DoRegression()) {
                  sepTmp = fRegType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin]+nodeInfo.nSelB[ivar][iBin],
                                                       nodeInfo.target[ivar][iBin],nodeInfo.target2[ivar][iBin],
                                                       nodeInfo.nTotS+nodeInfo.nTotB,
                                                       nodeInfo.target[ivar][nBins[ivar]-1],nodeInfo.target2[ivar][nBins[ivar]-1]);
               } else {
                  sepTmp = fSepType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin], nodeInfo.nSelB[ivar][iBin], nodeInfo.nTotS, nodeInfo.nTotB);
               }
               if (separationGain[ivar] < sepTmp) {
                  separationGain[ivar] = sepTmp;
                  cutIndex[ivar]       = iBin;
               }
            }
         }
      }
   };
   // ...
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      if (useVariable[ivar] ) {
         if (separationGainTotal < separationGain[ivar]) {
            separationGainTotal = separationGain[ivar];
            // ...
         }
      }
   }
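// Illustration (not part of the TMVA source): what GetSeparationGain measures
// for classification. With the Gini index p(1-p) (TMVA's default separation
// criterion), the gain of a cut is the parent impurity minus the
// weight-averaged impurity of the two daughters. Exact normalization in
// TMVA's SeparationBase may differ slightly.
inline double demo_gini(double s, double b)
{
   double p = s / (s + b);
   return p * (1 - p);
}

inline double demo_separation_gain(double sLeft, double bLeft, double sTot, double bTot)
{
   double sRight = sTot - sLeft, bRight = bTot - bLeft;
   double nTot = sTot + bTot, nL = sLeft + bLeft, nR = sRight + bRight;
   return demo_gini(sTot, bTot)
        - (nL / nTot) * demo_gini(sLeft, bLeft)
        - (nR / nTot) * demo_gini(sRight, bRight);
}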
   if (DoRegression()) {
      node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.nTotS+nodeInfo.nTotB,
                                                            nodeInfo.target[0][nBins[mxVar]-1],nodeInfo.target2[0][nBins[mxVar]-1]));
      node->SetResponse(nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB));
      if ( almost_equal_double(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB),
                               nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB))) {
         // ...
      }
      // ...
      node->SetRMS(TMath::Sqrt(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)
                               - nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)));
   }
   else {
      // ...
      node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.nTotS,nodeInfo.nTotB));
      // ...
      if (nodeInfo.nSelS[mxVar][cutIndex[mxVar]]/nodeInfo.nTotS > nodeInfo.nSelB[mxVar][cutIndex[mxVar]]/nodeInfo.nTotB) cutType=kTRUE;
   }
   // ...
   node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
   // ...
   if (mxVar < (Int_t) fNvars){
      // ...
      fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB);
   }
   else {
      // ...
      for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
         // ...
         fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB);
      }
   }
   // ...
   separationGainTotal = 0;
   // ...
   for (UInt_t i=0; i<cNvars; i++) {
      // ...
      delete [] cutValues[i];
   }
   // ...
   delete [] cutValues;
   // ...
   delete [] useVariable;
   delete [] mapVariable;
   // ...
   delete [] separationGain;
   // ...
   delete [] invBinWidth;
   // ...
   return separationGainTotal;
   Double_t separationGainTotal = -1, sepTmp;
   // ...
   for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
      separationGain[ivar]=-1;
      // ...
   }
   // ...
   Int_t nTotS_unWeighted, nTotB_unWeighted;
   UInt_t nevents = eventSample.size();
   // ...
   std::vector<Double_t> fisherCoeff;
   // ...
   if (fRandomisedTree) {
      // ...
      GetRandomisedVariables(useVariable,mapVariable,tmp);
   }
   // ...
   for (UInt_t ivar=0; ivar < fNvars; ivar++) {
      useVariable[ivar] = kTRUE;
      mapVariable[ivar] = ivar;
   }
   // ...
   useVariable[fNvars] = kFALSE;
   if (fUseFisherCuts) {
      useVariable[fNvars] = kTRUE;
      // ...
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         useVarInFisher[ivar] = kFALSE;
         mapVarInFisher[ivar] = ivar;
      }
      // ...
      std::vector<TMatrixDSym*>* covMatrices;
      // ...
      Log() << kWARNING << " in TrainNodeFast, the covariance Matrices needed for the Fisher-Cuts returned error --> revert to just normal cuts for this node" << Endl;
      // ...
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
            if ( (TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
                 (TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
               useVarInFisher[ivar] = kTRUE;
               useVarInFisher[jvar] = kTRUE;
            }
         }
      }
      // ...
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         // ...
         if (useVarInFisher[ivar] && useVariable[ivar]) {
            mapVarInFisher[nFisherVars++]=ivar;
            // ...
            if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
         }
      }
      // ...
      fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
      // ...
      delete [] useVarInFisher;
      delete [] mapVarInFisher;
   }
   // ...
   if (fUseFisherCuts && fisherOK) cNvars++;
   for (UInt_t ivar=0; ivar<cNvars; ivar++) {
      // ...
      nBins[ivar] = fNCuts+1;
      if (ivar < fNvars) {
         if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
            // ...
         }
      }
      // ...
      nSelS[ivar]            = new Double_t [nBins[ivar]];
      nSelB[ivar]            = new Double_t [nBins[ivar]];
      nSelS_unWeighted[ivar] = new Double_t [nBins[ivar]];
      nSelB_unWeighted[ivar] = new Double_t [nBins[ivar]];
      target[ivar]           = new Double_t [nBins[ivar]];
      target2[ivar]          = new Double_t [nBins[ivar]];
      cutValues[ivar]        = new Double_t [nBins[ivar]];
   }
   // ...
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      // ...
      useVariable[ivar]=kFALSE;
      // ...
      for (UInt_t iev=0; iev<nevents; iev++) {
         // ...
         Double_t result = fisherCoeff[fNvars];
         for (UInt_t jvar=0; jvar<fNvars; jvar++)
            result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
         if (result > xmax[ivar]) xmax[ivar]=result;
         if (result < xmin[ivar]) xmin[ivar]=result;
      }
      // ...
      for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
         nSelS[ivar][ibin]=0;
         nSelB[ivar][ibin]=0;
         nSelS_unWeighted[ivar][ibin]=0;
         nSelB_unWeighted[ivar][ibin]=0;
         target[ivar][ibin]=0;
         target2[ivar][ibin]=0;
         cutValues[ivar][ibin]=0;
      }
   }
   // ...
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      // ...
      if ( useVariable[ivar] ) {
         // ...
         invBinWidth[ivar] = 1./binWidth[ivar];
         if (ivar < fNvars) {
            if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
         }
         // ...
         for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
            cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
         }
      }
   }
   nTotS_unWeighted=0; nTotB_unWeighted=0;
   for (UInt_t iev=0; iev<nevents; iev++) {
      // ...
      Double_t eventWeight = eventSample[iev]->GetWeight();
      if (eventSample[iev]->GetClass() == fSigClass) {
         // ...
         nTotS_unWeighted++;
      }
      // ...
      for (UInt_t ivar=0; ivar < cNvars; ivar++) {
         // ...
         if ( useVariable[ivar] ) {
            // ...
            if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
            else {
               eventData = fisherCoeff[fNvars];
               for (UInt_t jvar=0; jvar<fNvars; jvar++)
                  eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
            }
            // ...
            if (eventSample[iev]->GetClass() == fSigClass) {
               nSelS[ivar][iBin]+=eventWeight;
               nSelS_unWeighted[ivar][iBin]++;
            }
            else {
               nSelB[ivar][iBin]+=eventWeight;
               nSelB_unWeighted[ivar][iBin]++;
            }
            // ...
            if (DoRegression()) {
               target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
               target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
            }
         }
      }
   }
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      if (useVariable[ivar]) {
         for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
            nSelS[ivar][ibin]+=nSelS[ivar][ibin-1];
            nSelS_unWeighted[ivar][ibin]+=nSelS_unWeighted[ivar][ibin-1];
            nSelB[ivar][ibin]+=nSelB[ivar][ibin-1];
            nSelB_unWeighted[ivar][ibin]+=nSelB_unWeighted[ivar][ibin-1];
            if (DoRegression()) {
               target[ivar][ibin] +=target[ivar][ibin-1] ;
               target2[ivar][ibin]+=target2[ivar][ibin-1];
            }
         }
         if (nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
            Log() << kFATAL << "Helge, you have a bug ....nSelS_unw..+nSelB_unw..= "
                  << nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1]
                  << " while eventsample size = " << eventSample.size()
                  << Endl;
         }
         double lastBins=nSelS[ivar][nBins[ivar]-1] +nSelB[ivar][nBins[ivar]-1];
         double totalSum=nTotS+nTotB;
         if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
            Log() << kFATAL << "Helge, you have another bug ....nSelS+nSelB= "
                  << lastBins
                  << " while total number of events = " << totalSum
                  << Endl;
         }
      }
   }
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      if (useVariable[ivar]) {
         for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) {
            // ...
            Double_t sl = nSelS_unWeighted[ivar][iBin];
            Double_t bl = nSelB_unWeighted[ivar][iBin];
            // ...
            if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
                 && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
                 ) {
               // ...
               if (DoRegression()) {
                  sepTmp = fRegType->GetSeparationGain(nSelS[ivar][iBin]+nSelB[ivar][iBin],
                                                       target[ivar][iBin],target2[ivar][iBin],
                                                       // ...
                                                       target[ivar][nBins[ivar]-1],target2[ivar][nBins[ivar]-1]);
               } else {
                  sepTmp = fSepType->GetSeparationGain(nSelS[ivar][iBin], nSelB[ivar][iBin], nTotS, nTotB);
               }
               if (separationGain[ivar] < sepTmp) {
                  separationGain[ivar] = sepTmp;
                  cutIndex[ivar]       = iBin;
               }
            }
         }
      }
   }
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      if (useVariable[ivar] ) {
         if (separationGainTotal < separationGain[ivar]) {
            separationGainTotal = separationGain[ivar];
            // ...
         }
      }
   }
   // ...
   if (DoRegression()) {
      node->SetSeparationIndex(fRegType->GetSeparationIndex(nTotS+nTotB,target[0][nBins[mxVar]-1],target2[0][nBins[mxVar]-1]));
      node->SetResponse(target[0][nBins[mxVar]-1]/(nTotS+nTotB));
      if ( almost_equal_double(target2[0][nBins[mxVar]-1]/(nTotS+nTotB),
                               target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB))) {
         // ...
      }
      // ...
      node->SetRMS(TMath::Sqrt(target2[0][nBins[mxVar]-1]/(nTotS+nTotB)
                               - target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB)));
   }
   else {
      // ...
      if (nSelS[mxVar][cutIndex[mxVar]]/nTotS > nSelB[mxVar][cutIndex[mxVar]]/nTotB) cutType=kTRUE;
   }
   // ...
   node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
   // ...
   if (mxVar < (Int_t) fNvars){
      // ...
      fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB);
   }
   else {
      // ...
      for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
         // ...
         fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB);
      }
   }
   // ...
   separationGainTotal = 0;
   for (UInt_t i=0; i<cNvars; i++) {
      // ...
      delete [] nSelS_unWeighted[i];
      delete [] nSelB_unWeighted[i];
      delete [] target[i];
      delete [] target2[i];
      delete [] cutValues[i];
   }
   // ...
   delete [] nSelS_unWeighted;
   delete [] nSelB_unWeighted;
   // ...
   delete [] cutValues;
   // ...
   delete [] useVariable;
   delete [] mapVariable;
   // ...
   delete [] separationGain;
   // ...
   delete [] invBinWidth;
   // ...
   return separationGainTotal;
   std::vector<Double_t> fisherCoeff(fNvars+1);
   // ...
   for (UInt_t ivar=0; ivar<nFisherVars; ivar++) { sumS[ivar] = sumB[ivar] = 0; }
   // ...
   UInt_t nevents = eventSample.size();
   // ...
   for (UInt_t ievt=0; ievt<nevents; ievt++) {
      // ...
      const Event * ev = eventSample[ievt];
      // ...
      if (ev->GetClass() == fSigClass) sumOfWeightsS += weight;
      else                             sumOfWeightsB += weight;
      // ...
      for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
         // ...
      }
   }
   for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
      (*meanMatx)( ivar, 2 ) = sumS[ivar];
      (*meanMatx)( ivar, 0 ) = sumS[ivar]/sumOfWeightsS;
      // ...
      (*meanMatx)( ivar, 2 ) += sumB[ivar];
      (*meanMatx)( ivar, 1 ) = sumB[ivar]/sumOfWeightsB;
      // ...
      (*meanMatx)( ivar, 2 ) /= (sumOfWeightsS + sumOfWeightsB);
   }
   // ...
   assert( sumOfWeightsS > 0 && sumOfWeightsB > 0 );
   // ...
   const Int_t nFisherVars2 = nFisherVars*nFisherVars;
   // ...
   memset(sum2Sig, 0, nFisherVars2*sizeof(Double_t));
   memset(sum2Bgd, 0, nFisherVars2*sizeof(Double_t));
   // ...
   for (UInt_t ievt=0; ievt<nevents; ievt++) {
      // ...
      const Event* ev = eventSample.at(ievt);
      // ...
      if ( ev->GetClass() == fSigClass ) sum2Sig[k] += ( (xval[x] - (*meanMatx)(x, 0))*(xval[y] - (*meanMatx)(y, 0)) )*weight;
      else                               sum2Bgd[k] += ( (xval[x] - (*meanMatx)(x, 1))*(xval[y] - (*meanMatx)(y, 1)) )*weight;
      // ...
   }
   // ...
   (*with)(x, y) = sum2Sig[k]/sumOfWeightsS + sum2Bgd[k]/sumOfWeightsB;
   // ...
   prodSig = ( ((*meanMatx)(x, 0) - (*meanMatx)(x, 2)) *
               ((*meanMatx)(y, 0) - (*meanMatx)(y, 2)) );
   prodBgd = ( ((*meanMatx)(x, 1) - (*meanMatx)(x, 2)) *
               ((*meanMatx)(y, 1) - (*meanMatx)(y, 2)) );
   // ...
   (*betw)(x, y) = (sumOfWeightsS*prodSig + sumOfWeightsB*prodBgd) / (sumOfWeightsS + sumOfWeightsB);
   // ...
   (*cov)(x, y) = (*with)(x, y) + (*betw)(x, y);
   // ...
   Log() << kWARNING << "FisherCoeff matrix is almost singular with determinant="
         // ...
         << " did you use variables that are linear combinations or highly correlated?"
         << Endl;
   // ...
   Log() << kFATAL << "FisherCoeff matrix is singular with determinant="
         // ...
         << " did you use variables that are linear combinations?"
         << Endl;
   // ...
   Double_t xfact = TMath::Sqrt( sumOfWeightsS*sumOfWeightsB ) / (sumOfWeightsS + sumOfWeightsB);
   // ...
   std::vector<Double_t> diffMeans( nFisherVars );
   // ...
   for (UInt_t ivar=0; ivar<=fNvars; ivar++) fisherCoeff[ivar] = 0;
   for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
      for (UInt_t jvar=0; jvar<nFisherVars; jvar++) {
         Double_t d = (*meanMatx)(jvar, 0) - (*meanMatx)(jvar, 1);
         fisherCoeff[mapVarInFisher[ivar]] += invCov(ivar, jvar)*d;
      }
      // ...
      fisherCoeff[mapVarInFisher[ivar]] *= xfact;
   }
   // ...
   for (UInt_t ivar=0; ivar<nFisherVars; ivar++){
      f0 += fisherCoeff[mapVarInFisher[ivar]]*((*meanMatx)(ivar, 0) + (*meanMatx)(ivar, 1));
   }
   // ...
   fisherCoeff[fNvars] = f0;
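// Illustration (not part of the TMVA source): the linear algebra above in one
// line. With within-class covariance W and class means muS, muB, the Fisher
// axis is w ~ W^{-1}(muS - muB); the code additionally scales by
// xfact = sqrt(wS*wB)/(wS+wB) and centres via the offset f0. A 2-variable
// stand-in with a hand-inverted 2x2 matrix (assumes det != 0):
#include <array>

inline std::array<double,2> demo_fisher_axis(const std::array<std::array<double,2>,2>& W,
                                             const std::array<double,2>& muS,
                                             const std::array<double,2>& muB)
{
   double det = W[0][0]*W[1][1] - W[0][1]*W[1][0];
   std::array<std::array<double,2>,2> inv{{{ W[1][1]/det, -W[0][1]/det},
                                           {-W[1][0]/det,  W[0][0]/det}}};
   std::array<double,2> d{muS[0]-muB[0], muS[1]-muB[1]};
   return { inv[0][0]*d[0] + inv[0][1]*d[1],
            inv[1][0]*d[0] + inv[1][1]*d[1] };
}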
   Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;
   // ...
   std::vector<TMVA::BDTEventWrapper> bdtEventSample;
   // ...
   std::vector<Double_t> lCutValue( fNvars, 0.0 );
   std::vector<Double_t> lSepGain( fNvars, -1.0e6 );
   std::vector<Char_t> lCutType( fNvars );
   // ...
   for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
      if((*it)->GetClass() == fSigClass) {
         nTotS += (*it)->GetWeight();
         // ...
      }
      else {
         nTotB += (*it)->GetWeight();
         // ...
      }
      // ...
   }
   // ...
   std::vector<Char_t> useVariable(fNvars);
   // ...
   if (fRandomisedTree) {
      if (fUseNvars ==0 ) {
         // ...
      }
      // ...
      Int_t nSelectedVars = 0;
      while (nSelectedVars < fUseNvars) {
         Double_t bla = fMyTrandom->Rndm()*fNvars;
         // ...
         nSelectedVars = 0;
         for (UInt_t ivar=0; ivar < fNvars; ivar++) {
            if(useVariable[ivar] == Char_t(kTRUE)) nSelectedVars++;
         }
      }
   }
   // ...
   for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) {
      if(!useVariable[ivar]) continue;
      // ...
      std::sort( bdtEventSample.begin(),bdtEventSample.end() );
      // ...
      Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
      // ...
      std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
      for( ; it != it_end; ++it ) {
         if((**it)->GetClass() == fSigClass )
            sigWeightCtr += (**it)->GetWeight();
         else
            bkgWeightCtr += (**it)->GetWeight();
         // ...
         it->SetCumulativeWeight(false,bkgWeightCtr);
         it->SetCumulativeWeight(true,sigWeightCtr);
      }
      // ...
      Double_t separationGain = -1.0, sepTmp = 0.0, cutValue = 0.0, dVal = 0.0, norm = 0.0;
      // ...
      for( it = bdtEventSample.begin(); it != it_end; ++it ) {
         if( index == 0 ) { ++index; continue; }
         if( *(*it) == NULL ) {
            Log() << kFATAL << "In TrainNodeFull(): have a null event! Where index="
                  << index << ", and parent node=" << node->GetParent() << Endl;
            // ...
         }
         dVal = bdtEventSample[index].GetVal() - bdtEventSample[index-1].GetVal();
         norm = TMath::Abs(bdtEventSample[index].GetVal() + bdtEventSample[index-1].GetVal());
         // ...
         if( index >= fMinSize && (nTotS_unWeighted + nTotB_unWeighted) - index >= fMinSize && TMath::Abs(dVal/(0.5*norm + 1)) > fPMin ) {
            // ...
            sepTmp = fSepType->GetSeparationGain( it->GetCumulativeWeight(true), it->GetCumulativeWeight(false), sigWeightCtr, bkgWeightCtr );
            if( sepTmp > separationGain ) {
               separationGain = sepTmp;
               cutValue = it->GetVal() - 0.5*dVal;
               Double_t nSelS = it->GetCumulativeWeight(true);
               Double_t nSelB = it->GetCumulativeWeight(false);
               // ...
               if( nSelS/sigWeightCtr > nSelB/bkgWeightCtr ) cutType = kTRUE;
               // ...
            }
         }
         // ...
      }
      lCutType[ivar]  = Char_t(cutType);
      lCutValue[ivar] = cutValue;
      lSepGain[ivar]  = separationGain;
   }
   // ...
   Int_t iVarIndex = -1;
   for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) {
      if( lSepGain[ivar] > separationGain ) {
         // ...
         separationGain = lSepGain[ivar];
         // ...
      }
   }
   // ...
   if(iVarIndex >= 0) {
      // ...
      fVariableImportance[iVarIndex] += separationGain*separationGain * (nTotS+nTotB) * (nTotS+nTotB);
   }
   else {
      separationGain = 0.0;
   }
   // ...
   return separationGain;
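// Illustration (not part of the TMVA source): TrainNodeFull's strategy in
// miniature -- sort by the variable, keep cumulative signal/background
// weights (cf. SetCumulativeWeight), and test a cut between each pair of
// adjacent values. DemoEv and demo_best_cut are hypothetical stand-ins;
// assumes positive weights.
#include <algorithm>
#include <cstddef>
#include <vector>

struct DemoEv { double val, weight; bool isSig; };

inline double demo_best_cut(std::vector<DemoEv> evs)
{
   auto gini = [](double s, double b){ double p = s/(s+b); return p*(1-p); };
   std::sort(evs.begin(), evs.end(),
             [](const DemoEv& a, const DemoEv& b){ return a.val < b.val; });
   double sTot = 0, bTot = 0;
   for (const auto& e : evs) (e.isSig ? sTot : bTot) += e.weight;
   double sCum = 0, bCum = 0, bestGain = -1, bestCut = evs.front().val;
   for (std::size_t i = 0; i + 1 < evs.size(); ++i) {
      (evs[i].isSig ? sCum : bCum) += evs[i].weight; // cumulative weights up to event i
      double nL = sCum + bCum, nTot = sTot + bTot;
      double gain = gini(sTot, bTot)
                  - (nL/nTot) * gini(sCum, bCum)
                  - ((nTot-nL)/nTot) * gini(sTot-sCum, bTot-bCum);
      if (gain > bestGain) { bestGain = gain; bestCut = 0.5*(evs[i].val + evs[i+1].val); }
   }
   return bestCut; // cut placed mid-way between adjacent values, like "GetVal() - 0.5*dVal"
}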
      Log() << kFATAL << "CheckEvent: started with undefined ROOT node" << Endl;
   // ...
      Log() << kFATAL << "DT::CheckEvent: inconsistent tree structure" << Endl;
   // ...
   if (DoRegression()) {
      // ...
2710 if (DoRegression()) {
2726 Double_t sumsig=0, sumbkg=0, sumtot=0;
2727 for (
UInt_t ievt=0; ievt<eventSample.size(); ievt++) {
2728 if (eventSample[ievt]->
GetClass() != fSigClass) sumbkg+=eventSample[ievt]->GetWeight();
2729 else sumsig+=eventSample[ievt]->GetWeight();
2730 sumtot+=eventSample[ievt]->GetWeight();
2733 if (sumtot!= (sumsig+sumbkg)){
2734 Log() << kFATAL <<
"<SamplePurity> sumtot != sumsig+sumbkg"
2735 << sumtot <<
" " << sumsig <<
" " << sumbkg <<
Endl;
2737 if (sumtot>0)
return sumsig/(sumsig + sumbkg);
   std::vector<Double_t> relativeImportance(fNvars);
   // ...
   for (UInt_t i=0; i< fNvars; i++) {
      sum += fVariableImportance[i];
      relativeImportance[i] = fVariableImportance[i];
   }
   // ...
   for (UInt_t i=0; i< fNvars; i++) {
      // ...
      relativeImportance[i] /= sum;
      // ...
      relativeImportance[i] = 0;
   }
   // ...
   return relativeImportance;
// ...
   std::vector<Double_t> relativeImportance = this->GetVariableImportance();
   if (ivar < fNvars) return relativeImportance[ivar];
   // ...
   Log() << kFATAL << "<GetVariableImportance>" << Endl
         << "--- ivar = " << ivar << " is out of range " << Endl;