#include <algorithm>
#include "TROOT.h"
#include "TSystem.h"
#include "TMath.h"
#include "TMVA/MethodRuleFit.h"
#include "TMVA/RuleFitAPI.h"
#include "TMVA/RuleFit.h"
#include "TMVA/Tools.h"
#include "TMVA/Timer.h"
// ROOT ClassImp macro invocation for TMVA::RuleFitAPI.
// NOTE(review): presumably ties this implementation file to the class in
// ROOT's dictionary/documentation system - confirm against the ROOT version in use.
ClassImp(TMVA::RuleFitAPI)
// Constructor.
//
// rfbase  : method object the configuration is taken from; may be null
// rulefit : RuleFit object that will receive the model read back later
// minType : minimum message level handed to the logger
//
// Selects the work directory (from rfbase when available, otherwise the
// fixed fallback "./rulefit") and then runs InitRuleFit().
TMVA::RuleFitAPI::RuleFitAPI( const MethodRuleFit *rfbase,
                              RuleFit *rulefit,
                              EMsgType minType = kINFO )
   : fMethodRuleFit(rfbase),
     fRuleFit(rulefit),
     fRFProgram(kRfTrain),
     fLogger("RuleFitAPI",minType)
{
   if (!rfbase) {
      // no method object to ask, use the built-in default
      SetRFWorkDir("./rulefit");
   }
   else {
      SetRFWorkDir(rfbase->GetRFWorkDir());
   }
   InitRuleFit();
}
// Destructor. The body is empty: no explicit cleanup is performed here.
TMVA::RuleFitAPI::~RuleFitAPI()
{
   // intentionally empty
}
// Print the welcome banner as a single kINFO message.
void TMVA::RuleFitAPI::WelcomeMessage()
{
   // Adjacent string literals are joined by the compiler, so the logger
   // still receives exactly the same character sequence in one message.
   fLogger << kINFO
           << "\n"
              "---------------------------------------------------------------------------\n"
              "- You are running the interface to Jerome Friedmans RuleFit(tm) code. -\n"
              "- For a full manual see the following web page: -\n"
              "- -\n"
              "- http://www-stat.stanford.edu/~jhf/R-RuleFit.html -\n"
              "- -\n"
              "---------------------------------------------------------------------------"
           << Endl;
}
// Print the setup instructions for the external rf_go.exe program as a
// single kINFO message. The currently configured work directory
// (fRFWorkDir) is inserted into the suggested mkdir command.
void TMVA::RuleFitAPI::HowtoSetupRF()
{
   // The text is emitted in two literal runs around fRFWorkDir; adjacent
   // literals are concatenated at compile time, so the output is unchanged.
   fLogger << kINFO
           << "\n"
              "------------------------ RULEFIT-JF INTERFACE SETUP -----------------------\n"
              "\n"
              "1. Create a rulefit directory in your current work directory:\n"
              " mkdir "
           << fRFWorkDir
           << "\n\n"
              " the directory may be set using the option RuleFitDir\n"
              "\n"
              "2. Copy (or make a link) the file rf_go.exe into this directory\n"
              "\n"
              "The file can be obtained from Jerome Friedmans homepage (linux):\n"
              " wget http://www-stat.stanford.edu/~jhf/r-rulefit/linux/rf_go.exe\n"
              "\n"
              "Don't forget to do:\n"
              " chmod +x rf_go.exe\n"
              "\n"
              "For Windows download:\n"
              " http://www-stat.stanford.edu/~jhf/r-rulefit/windows/rf_go.exe\n"
              "\n"
              "NOTE: other platforms are not supported (see Friedmans homepage)\n"
              "\n"
              "---------------------------------------------------------------------------\n"
           << Endl;
}
// Common initialisation: verify the work directory first, then load the
// default integer and real parameter sets.
void TMVA::RuleFitAPI::InitRuleFit()
{
   CheckRFWorkDir();    // must come first: reports a broken setup

   FillIntParmsDef();   // defaults for fRFIntParms
   FillRealParmsDef();  // defaults for fRFRealParms
}
void TMVA::RuleFitAPI::ImportSetup()
{
fRFIntParms.p = fMethodRuleFit->DataInfo().GetNVariables();
fRFIntParms.max_rules = fMethodRuleFit->GetRFNrules();
fRFIntParms.tree_size = fMethodRuleFit->GetRFNendnodes();
fRFIntParms.path_steps = fMethodRuleFit->GetGDNPathSteps();
fRFRealParms.path_inc = fMethodRuleFit->GetGDPathStep();
fRFRealParms.samp_fract = fMethodRuleFit->GetTreeEveFrac();
fRFRealParms.trim_qntl = fMethodRuleFit->GetLinQuantile();
fRFRealParms.conv_fac = fMethodRuleFit->GetGDErrScale();
if (fRuleFit->GetRuleEnsemblePtr()->DoOnlyLinear() )
fRFIntParms.lmode = kRfLinear;
else if (fRuleFit->GetRuleEnsemblePtr()->DoOnlyRules() )
fRFIntParms.lmode = kRfRules;
else
fRFIntParms.lmode = kRfBoth;
}
// Store the directory in which the RuleFit program and its files live.
// The path is only recorded here; it is not validated by this function.
void TMVA::RuleFitAPI::SetRFWorkDir(const char * wdir)
{
   fRFWorkDir = wdir;
}
void TMVA::RuleFitAPI::CheckRFWorkDir()
{
TString oldDir = gSystem->pwd();
if (!gSystem->cd(fRFWorkDir)) {
fLogger << kWARNING << "Must create a rulefit directory named : " << fRFWorkDir << Endl;
HowtoSetupRF();
fLogger << kFATAL << "Setup failed - aborting!" << Endl;
}
FILE *f = fopen("rf_go.exe","r");
if (f==0) {
fLogger << kWARNING << "No rf_go.exe file in directory : " << fRFWorkDir << Endl;
HowtoSetupRF();
fLogger << kFATAL << "Setup failed - aborting!" << Endl;
}
fclose(f);
gSystem->cd(oldDir.Data());
}
void TMVA::RuleFitAPI::SetTrainParms()
{
ImportSetup();
Int_t n = fMethodRuleFit->Data()->GetNTrainingEvents();
fRFIntParms.n = n;
fRFProgram = kRfTrain;
}
void TMVA::RuleFitAPI::SetTestParms()
{
ImportSetup();
Int_t n = fMethodRuleFit->Data()->GetNTestEvents();
fRFIntParms.n = n;
fRFProgram = kRfPredict;
}
// Load the default values of the real-valued parameter block.
void TMVA::RuleFitAPI::FillRealParmsDef()
{
   // values that ImportSetup() later replaces with the configured ones
   fRFRealParms.trim_qntl  = 0.025;
   fRFRealParms.samp_fract = 0.5;
   fRFRealParms.path_inc   = 0.01;
   fRFRealParms.conv_fac   = 1.1;

   // values that keep their default in this file
   fRFRealParms.xmiss      = 9.0e30;
   fRFRealParms.huber      = 0.8;
   fRFRealParms.inter_supp = 3.0;
   fRFRealParms.memory_par = 0.01;
}
// Load the default values of the integer parameter block.
// The fields p and n are not touched here; they are filled by
// ImportSetup() and SetTrainParms()/SetTestParms() respectively.
void TMVA::RuleFitAPI::FillIntParmsDef()
{
   // run/model mode
   fRFIntParms.mode  = static_cast<int>(kRfClass);
   fRFIntParms.lmode = static_cast<int>(kRfBoth);

   // rule and tree sizes
   fRFIntParms.max_rules = 2000;
   fRFIntParms.tree_size = 4;

   // path settings
   fRFIntParms.path_speed    = 2;
   fRFIntParms.path_xval     = 3;
   fRFIntParms.path_steps    = 50000;
   fRFIntParms.path_testfreq = 100;

   // storage sizes
   fRFIntParms.tree_store = 10000000;
   fRFIntParms.cat_store  = 1000000;
}
// Write every input file needed for the currently selected program.
//
// All writers are always attempted, in the same order as before, so the
// set of files produced is unchanged. The return value is kTRUE only if
// every attempted writer reported success; previously failures of the
// individual writers were silently discarded and kTRUE was returned
// unconditionally.
Bool_t TMVA::RuleFitAPI::WriteAll()
{
   Bool_t ok = kTRUE;

   // The call is placed on the left of && so that it is evaluated even
   // when an earlier writer has already failed.
   ok = WriteIntParms()  && ok;
   ok = WriteRealParms() && ok;
   ok = WriteLx()        && ok;
   // WriteProgram() may reset fRFProgram to kRfTrain for an unknown
   // program, so it has to run before the program-specific writers below.
   ok = WriteProgram()   && ok;
   ok = WriteVarNames()  && ok;

   if (fRFProgram==kRfTrain)   ok = WriteTrain()      && ok;
   if (fRFProgram==kRfPredict) ok = WriteTest()       && ok;
   if (fRFProgram==kRfVarimp)  ok = WriteRealVarImp() && ok;

   return ok;
}
// Write the integer parameter block to the file "intparms".
// The block is written as a run of Int_t values starting at the member
// 'mode'. Returns kFALSE if the file cannot be opened, kTRUE otherwise.
Bool_t TMVA::RuleFitAPI::WriteIntParms()
{
   std::ofstream out;
   if (!OpenRFile("intparms",out)) {
      return kFALSE;
   }
   // number of Int_t sized slots in the whole parameter struct
   WriteInt(out, &fRFIntParms.mode, sizeof(fRFIntParms)/sizeof(Int_t));
   return kTRUE;
}
// Write the real parameter block to the file "realparms".
// The block is written as a run of Float_t values starting at the member
// 'xmiss'. Returns kFALSE if the file cannot be opened, kTRUE otherwise.
Bool_t TMVA::RuleFitAPI::WriteRealParms()
{
   std::ofstream out;
   if (!OpenRFile("realparms",out)) {
      return kFALSE;
   }
   // number of Float_t sized slots in the whole parameter struct
   WriteFloat(out, &fRFRealParms.xmiss, sizeof(fRFRealParms)/sizeof(Float_t));
   return kTRUE;
}
// Write the file "lx": one integer per input variable, every entry set to 1.
// The member fRFLx is rebuilt even when the file cannot be opened.
// Returns kFALSE if the file cannot be opened, kTRUE otherwise.
Bool_t TMVA::RuleFitAPI::WriteLx()
{
   // one flag per variable, all equal to 1
   fRFLx.assign(fMethodRuleFit->DataInfo().GetNVariables(), 1);

   std::ofstream out;
   if (!OpenRFile("lx",out)) {
      return kFALSE;
   }
   WriteInt(out, &fRFLx[0], fRFLx.size());
   return kTRUE;
}
// Write the name of the program to run into the file "program".
// An unrecognised fRFProgram value is reset to kRfTrain and treated as a
// training run. Returns kFALSE if the file cannot be opened.
Bool_t TMVA::RuleFitAPI::WriteProgram()
{
   std::ofstream out;
   if (!OpenRFile("program",out)) {
      return kFALSE;
   }

   TString program;
   if (fRFProgram==kRfPredict) {
      program = "rulefit_pred";
   }
   else if (fRFProgram==kRfVarimp) {
      program = "varimp";
   }
   else {
      // kRfTrain itself, or any unknown value which falls back to training
      fRFProgram = kRfTrain;
      program = "rulefit";
   }

   out << program;
   return kTRUE;
}
// Write the file "realvarimp": two float values, both zero.
// Returns kFALSE if the file cannot be opened, kTRUE otherwise.
Bool_t TMVA::RuleFitAPI::WriteRealVarImp()
{
   std::ofstream out;
   if (!OpenRFile("realvarimp",out)) {
      return kFALSE;
   }
   Float_t zeros[2] = { 0.0f, 0.0f };
   WriteFloat(out, zeros, 2);
   return kTRUE;
}
// Placeholder: nothing is written. Logs a warning and reports kTRUE.
Bool_t TMVA::RuleFitAPI::WriteRfOut()
{
   fLogger << kWARNING
           << "WriteRfOut is not yet implemented"
           << Endl;
   return kTRUE;
}
// Placeholder: nothing is written. Logs a warning and reports kTRUE.
Bool_t TMVA::RuleFitAPI::WriteRfStatus()
{
   fLogger << kWARNING
           << "WriteRfStatus is not yet implemented"
           << Endl;
   return kTRUE;
}
// Placeholder: nothing is written. Logs a warning and reports kTRUE.
Bool_t TMVA::RuleFitAPI::WriteRuleFitMod()
{
   fLogger << kWARNING
           << "WriteRuleFitMod is not yet implemented"
           << Endl;
   return kTRUE;
}
// Placeholder: nothing is written. Logs a warning and reports kTRUE.
Bool_t TMVA::RuleFitAPI::WriteRuleFitSum()
{
   fLogger << kWARNING
           << "WriteRuleFitSum is not yet implemented"
           << Endl;
   return kTRUE;
}
// Write the training sample to the files "train.x", "train.y" and "train.w".
//
// train.x : the variable values, written variable by variable (all events
//           of variable 0, then all events of variable 1, ...)
// train.y : one value per event, +1 for signal and -1 otherwise
// train.w : one event weight per event
//
// Returns kFALSE as soon as one of the files cannot be opened.
Bool_t TMVA::RuleFitAPI::WriteTrain()
{
   std::ofstream xOut;
   std::ofstream yOut;
   std::ofstream wOut;
   if (!OpenRFile("train.x",xOut)) return kFALSE;
   if (!OpenRFile("train.y",yOut)) return kFALSE;
   if (!OpenRFile("train.w",wOut)) return kFALSE;

   for (UInt_t ivar=0; ivar<fMethodRuleFit->DataInfo().GetNVariables(); ++ivar) {
      // target and weight are per event, so they are written only once,
      // during the pass over the first variable
      const Bool_t firstPass = (ivar==0);

      for (Int_t ievt=0; ievt<fMethodRuleFit->Data()->GetNTrainingEvents(); ++ievt) {
         const Event *event = fMethodRuleFit->GetTrainingEvent(ievt);

         Float_t value = event->GetValue(ivar);
         WriteFloat(xOut,&value,1);

         if (!firstPass) continue;

         Float_t weight = event->GetWeight();
         Float_t target = fMethodRuleFit->DataInfo().IsSignal(event)? 1.0 : -1.0;
         WriteFloat(yOut,&target,1);
         WriteFloat(wOut,&weight,1);
      }
   }

   fLogger << kINFO << "Number of training data written: " << fMethodRuleFit->Data()->GetNTrainingEvents() << Endl;
   return kTRUE;
}
// Write the test sample to the file "test.x".
//
// The data set is first switched to the testing sample. The file starts
// with the number of events stored as a float, followed by the variable
// values written variable by variable.
//
// Returns kFALSE if the file cannot be opened, kTRUE otherwise.
Bool_t TMVA::RuleFitAPI::WriteTest()
{
   fMethodRuleFit->Data()->SetCurrentType(Types::kTesting);

   std::ofstream out;
   if (!OpenRFile("test.x",out)) {
      return kFALSE;
   }

   // leading record: event count, stored as float
   Float_t nEvents = static_cast<Float_t>(fMethodRuleFit->Data()->GetNEvents());
   WriteFloat(out,&nEvents,1);

   for (UInt_t ivar=0; ivar<fMethodRuleFit->DataInfo().GetNVariables(); ++ivar) {
      for (Int_t ievt=0; ievt<fMethodRuleFit->Data()->GetNEvents(); ++ievt) {
         Float_t value = fMethodRuleFit->GetEvent(ievt)->GetValue(ivar);
         WriteFloat(out,&value,1);
      }
   }

   fLogger << kINFO << "Number of test data written: " << fMethodRuleFit->Data()->GetNEvents() << Endl;
   return kTRUE;
}
// Write the file "varnames": the expression of every input variable,
// one per line. Returns kFALSE if the file cannot be opened.
Bool_t TMVA::RuleFitAPI::WriteVarNames()
{
   std::ofstream out;
   if (!OpenRFile("varnames",out)) {
      return kFALSE;
   }

   UInt_t ivar = 0;
   while (ivar<fMethodRuleFit->DataInfo().GetNVariables()) {
      out << fMethodRuleFit->DataInfo().GetVariableInfo(ivar).GetExpression() << '\n';
      ++ivar;
   }
   return kTRUE;
}
// Placeholder: nothing is written. Logs a warning and reports kTRUE.
Bool_t TMVA::RuleFitAPI::WriteVarImp()
{
   fLogger << kWARNING
           << "WriteVarImp is not yet implemented"
           << Endl;
   return kTRUE;
}
// Placeholder: nothing is written. Logs a warning and reports kTRUE.
Bool_t TMVA::RuleFitAPI::WriteYhat()
{
   fLogger << kWARNING
           << "WriteYhat is not yet implemented"
           << Endl;
   return kTRUE;
}
// Read the file "yhat" into fRFYhat.
//
// The first value in the file is the number of entries, stored as a float.
// It has to agree with the number of test events; otherwise two warnings
// are logged and kFALSE is returned with fRFYhat left empty.
// Returns kFALSE as well if the file cannot be opened.
Bool_t TMVA::RuleFitAPI::ReadYhat()
{
   fRFYhat.clear();

   std::ifstream in;
   if (!OpenRFile("yhat",in)) {
      return kFALSE;
   }

   Float_t value;
   ReadFloat(in,&value,1);
   const Int_t nInFile = static_cast<Int_t>(value);

   if (nInFile!=fMethodRuleFit->Data()->GetNTestEvents()) {
      fLogger << kWARNING << "Inconsistent size of yhat file and test tree!" << Endl;
      fLogger << kWARNING << "neve = " << nInFile << " , tree = " << fMethodRuleFit->Data()->GetNTestEvents() << Endl;
      return kFALSE;
   }

   Int_t ievt = 0;
   while (ievt<fMethodRuleFit->Data()->GetNTestEvents()) {
      ReadFloat(in,&value,1);
      fRFYhat.push_back(value);
      ++ievt;
   }
   return kTRUE;
}
// Read the file "varimp" into fRFVarImp and fRFVarImpInd.
//
// The file holds one importance per variable followed by one index per
// variable. The indices are read as floats and are converted here to
// zero-based integers. The importances are scaled so that the largest one
// becomes 1.
//
// Both result vectors are reset at the start, so calling this function
// repeatedly no longer appends stale indices to fRFVarImpInd. If the
// largest importance is exactly zero the values are left unscaled instead
// of being divided by zero.
//
// Returns kFALSE if the file cannot be opened, kTRUE otherwise.
Bool_t TMVA::RuleFitAPI::ReadVarImp()
{
   fRFVarImp.clear();
   fRFVarImpInd.clear();

   std::ifstream f;
   if (!OpenRFile("varimp",f)) return kFALSE;

   const UInt_t nvars = fMethodRuleFit->DataInfo().GetNVariables();
   Float_t xval = 0.0; // defined value even if a read fails
   Float_t xmax = 1.0;

   // First pass: the importances, tracking the largest one.
   for (UInt_t ivar=0; ivar<nvars; ivar++) {
      ReadFloat(f,&xval,1);
      if (ivar==0 || xval>xmax) xmax = xval;
      fRFVarImp.push_back(xval);
   }

   // Exact comparison is intended: only a true zero makes the division
   // below meaningless.
   const Bool_t normalise = (xmax!=0);

   // Second pass: normalise and read the (one-based, float) indices.
   for (UInt_t ivar=0; ivar<nvars; ivar++) {
      if (normalise) fRFVarImp[ivar] = fRFVarImp[ivar]/xmax;
      ReadFloat(f,&xval,1);
      fRFVarImpInd.push_back(Int_t(xval)-1);
   }
   return kTRUE;
}
// Read the model summary file "rulefit.sum" and load its content into the
// rule ensemble owned by fRuleFit.
//
// The file is consumed strictly sequentially in this order:
//   1. header: four integers (the first is the number of rules), then the
//      number of variables twice, one float logged as "xmiss", the offset
//   2. per rule: five floats (support, coefficient and number of cuts are
//      the 2nd, 3rd and 4th; the 1st and 5th are discarded)
//   3. per rule, per cut: variable index, lower bound, upper bound
//   4. per linear term: index, min, max, average, stdev, coefficient
//
// Returns kFALSE if the file cannot be opened, kTRUE otherwise. The return
// values of the individual reads are only summed into 'lines', which is
// not inspected afterwards in this function.
Bool_t TMVA::RuleFitAPI::ReadModelSum()
{
// NOTE(review): this clears the variable importance vector although this
// function does not refill it - confirm that this is intended.
fRFVarImp.clear();
fLogger << kVERBOSE << "Reading RuleFit summary file" << Endl;
std::ifstream f;
if (!OpenRFile("rulefit.sum",f)) return kFALSE;
Int_t lines=0;
Int_t nrules=0;
Int_t nvars=0;
Int_t nvarsOpt=0;
Int_t dumI;
Float_t dumF;
Float_t offset;
// largest importance seen so far (rules and linear terms); -1 = none yet
Double_t impref=-1.0;
Double_t imp;
fRuleFit->GetRuleEnsemblePtr()->SetAverageRuleSigma(0.4);
//
// Header. The model is treated as having no rules when the first four
// integers are exactly 1,1,1,0 or when the rule count is 0.
//
Bool_t norules;
lines += ReadInt(f,&nrules);
norules = (nrules==1);
lines += ReadInt(f,&dumI);
norules = norules && (dumI==1);
lines += ReadInt(f,&dumI);
norules = norules && (dumI==1);
lines += ReadInt(f,&dumI);
norules = norules && (dumI==0);
if (nrules==0) norules=kTRUE;
if (norules) nrules = 0;
lines += ReadInt(f,&nvars);
lines += ReadInt(f,&nvarsOpt);
lines += ReadFloat(f,&dumF);
lines += ReadFloat(f,&offset);
fLogger << kDEBUG << "N(rules) = " << nrules << Endl;
fLogger << kDEBUG << "N(vars) = " << nvars << Endl;
fLogger << kDEBUG << "N(varsO) = " << nvarsOpt << Endl;
fLogger << kDEBUG << "xmiss = " << dumF << Endl;
fLogger << kDEBUG << "offset = " << offset << Endl;
// The two variable counts are expected to agree; a mismatch is only warned about.
if (nvars!=nvarsOpt) {
fLogger << kWARNING << "Format of rulefit.sum is ... weird?? Continuing but who knows how it will end...?" << Endl;
}
std::vector<Double_t> rfSupp;
std::vector<Double_t> rfCoef;
std::vector<Int_t> rfNcut;
std::vector<Rule *> rfRules;
// In the no-rules case eight floats are read and discarded.
// NOTE(review): presumably a placeholder rule record in the file - confirm
// against the rf_go.exe output format.
if (norules) {
for (Int_t t=0; t<8; t++) {
lines += ReadFloat(f,&dumF);
}
}
//
// Rule summary: five floats per rule.
//
for (Int_t r=0; r<nrules; r++) {
lines += ReadFloat(f,&dumF);
lines += ReadFloat(f,&dumF);
rfSupp.push_back(dumF);
lines += ReadFloat(f,&dumF);
rfCoef.push_back(dumF);
lines += ReadFloat(f,&dumF);
// the cut count is stored as a float; round to the nearest integer
rfNcut.push_back(static_cast<int>(dumF+0.5));
lines += ReadFloat(f,&dumF);
}
//
// Build one Rule (with its RuleCut) per entry and read its cuts.
//
for (Int_t r=0; r<nrules; r++) {
Int_t varind;
Double_t xmin;
Double_t xmax;
// The raw pointers are collected in rfRules and handed to the ensemble
// via SetRules() below; they are not deleted in this function.
Rule *rule = new Rule(fRuleFit->GetRuleEnsemblePtr());
rfRules.push_back( rule );
RuleCut *rfcut = new RuleCut();
rfcut->SetNvars(rfNcut[r]);
rule->SetRuleCut( rfcut );
rule->SetNorm(1.0);
rule->SetSupport(0);
rule->SetSSB(0.0);
rule->SetSSBNeve(0.0);
rule->SetImportanceRef(1.0);
// SetSSB/SetSSBNeve are repeated here with the same values; the
// support is then replaced by the value read from the file
rule->SetSSB(0.0);
rule->SetSSBNeve(0.0);
rule->SetSupport(rfSupp[r]);
rule->SetCoefficient(rfCoef[r]);
rule->CalcImportance();
imp = rule->GetImportance();
if (imp>impref) impref = imp;
// note: the first debug line prints nvars, not a per-rule quantity
fLogger << kDEBUG << "Rule #" << r << " : " << nvars << Endl;
fLogger << kDEBUG << " support = " << rfSupp[r] << Endl;
fLogger << kDEBUG << " sigma = " << rule->GetSigma() << Endl;
fLogger << kDEBUG << " coeff = " << rfCoef[r] << Endl;
fLogger << kDEBUG << " N(cut) = " << rfNcut[r] << Endl;
// three floats per cut: variable index, lower bound, upper bound
for (Int_t c=0; c<rfNcut[r]; c++) {
lines += ReadFloat(f,&dumF);
// index is stored as a float and is one-based in the file
varind = static_cast<Int_t>(dumF+0.5)-1;
lines += ReadFloat(f,&dumF);
xmin = static_cast<Double_t>(dumF);
lines += ReadFloat(f,&dumF);
xmax = static_cast<Double_t>(dumF);
rfcut->SetSelector(c,varind);
rfcut->SetCutMin(c,xmin);
rfcut->SetCutMax(c,xmax);
// a bound beyond -/+8.99e35 switches the corresponding side of the cut off
rfcut->SetCutDoMin(c,(xmin<-8.99e35 ? kFALSE:kTRUE));
rfcut->SetCutDoMax(c,(xmax> 8.99e35 ? kFALSE:kTRUE));
}
}
fRuleFit->GetRuleEnsemblePtr()->SetRules( rfRules );
fRuleFit->GetRuleEnsemblePtr()->SetOffset( offset );
//
// Linear terms: six floats per variable.
//
std::vector<Int_t> varind;
std::vector<Double_t> xmin;
std::vector<Double_t> xmax;
std::vector<Double_t> average;
std::vector<Double_t> stdev;
std::vector<Double_t> norm;
std::vector<Double_t> coeff;
for (Int_t c=0; c<nvars; c++) {
lines += ReadFloat(f,&dumF);
varind.push_back(static_cast<Int_t>(dumF+0.5)-1);
lines += ReadFloat(f,&dumF);
xmin.push_back(static_cast<Double_t>(dumF));
lines += ReadFloat(f,&dumF);
xmax.push_back(static_cast<Double_t>(dumF));
lines += ReadFloat(f,&dumF);
average.push_back(static_cast<Double_t>(dumF));
lines += ReadFloat(f,&dumF);
stdev.push_back(static_cast<Double_t>(dumF));
// the norm is derived from the standard deviation just read, and the
// coefficient from the file is divided by it before being stored
Double_t nv = fRuleFit->GetRuleEnsemblePtr()->CalcLinNorm(stdev.back());
norm.push_back(nv);
lines += ReadFloat(f,&dumF);
coeff.push_back(dumF/nv);
fLogger << kDEBUG << "Linear #" << c << Endl;
fLogger << kDEBUG << " varind = " << varind.back() << Endl;
fLogger << kDEBUG << " xmin = " << xmin.back() << Endl;
fLogger << kDEBUG << " xmax = " << xmax.back() << Endl;
fLogger << kDEBUG << " average = " << average.back() << Endl;
fLogger << kDEBUG << " stdev = " << stdev.back() << Endl;
fLogger << kDEBUG << " coeff = " << coeff.back() << Endl;
}
// hand the linear model over only if at least one term was read
if (xmin.size()>0) {
fRuleFit->GetRuleEnsemblePtr()->SetLinCoefficients(coeff);
fRuleFit->GetRuleEnsemblePtr()->SetLinDM(xmin);
fRuleFit->GetRuleEnsemblePtr()->SetLinDP(xmax);
fRuleFit->GetRuleEnsemblePtr()->SetLinNorm(norm);
}
// the importance reference is the maximum over rules and linear terms
imp = fRuleFit->GetRuleEnsemblePtr()->CalcLinImportance();
if (imp>impref) impref=imp;
fRuleFit->GetRuleEnsemblePtr()->SetImportanceRef(impref);
fRuleFit->GetRuleEnsemblePtr()->CleanupLinear();
fRuleFit->GetRuleEnsemblePtr()->CalcVarImportance();
fLogger << kDEBUG << "Reading model done" << Endl;
return kTRUE;
}
// Run rf_go.exe from inside the work directory and return the value
// reported by gSystem->Exec(). The previous working directory is restored
// afterwards.
Int_t TMVA::RuleFitAPI::RunRuleFit()
{
   const TString startDir = gSystem->pwd();
   const TString command  = "./rf_go.exe";

   gSystem->cd(fRFWorkDir.Data());
   const int status = gSystem->Exec(command.Data());
   gSystem->cd(startDir.Data());

   return status;
}