Nonsense integral error values with RooPolynomial

amauri · September 30, 2015, 3:12pm

Dear experts,
I am performing a fit to the sidebands of a linear function, I’m using RooPolynomial.
The problem is that I’m getting some non-sense values when computing the error on the integral over the “signal” region. The error is huge, orders of magnitude larger than the integral itself.
What is even more strange is that this problem doesn’t appear when the slope of the linear function is negative.

I report here the output of the macro
positive_slope_fit.pdf (20.7 KB)
Extrapolation in signal region with RooFit (positive slope), integral = 335.416 ± 4258.17

negative_slope_fit.pdf (20.5 KB)
Extrapolation in signal region with RooFit (negative slope), integral =289.03 ± 14.2727

Do you have any idea of what is happening here?
I’m using ROOT 5.34.18

Thanks for your help,
Andrea

Below is the code

#ifndef __CINT__
#include "RooGlobalFunc.h"
#endif

#include "RooRealVar.h"
#include "RooPolynomial.h"
#include "RooPlot.h"
#include "RooNLLVar.h"
#include "RooAbsData.h"
#include "RooAbsPdf.h"
#include "RooAbsArg.h"
#include "RooAbsReal.h"
#include "RooDataSet.h"
#include "RooFitResult.h"
#include "RooAddPdf.h"
#include "RooArgList.h"
#include "RooMinuit.h"

using namespace RooFit;

RooDataSet* PolyData(double coeff){
  RooRealVar mass("mass", "mass", 0., 1); 
  RooRealVar c0("c0","c0", coeff);
  RooPolynomial poly("poly","poly",mass,RooArgList(c0));
  RooDataSet* data = poly.generate(mass, 3e+03);
  return data;
}

////////////////////////////////////////////////////////////////////////////////
//                      this function do the fit                              // 
////////////////////////////////////////////////////////////////////////////////
RooPlot* plot_fit(double coeff){

  double min_signal = 0.5;
  double max_signal = 0.6;
  double low_sb_min = 0.3;
  double low_sb_max = min_signal;
  double high_sb_min = max_signal;
  double high_sb_max = 0.8;

  TString str_low_sb, str_high_sb, str_sig;
  TCut cut_low_sb, cut_high_sb, cut_sig;
 
  // Declare observables
  RooRealVar mass("mass","mass", 0., 1) ;

  // Define signal, sideband range    
  mass.setRange("signal", min_signal, max_signal) ; 
  mass.setRange("low_sb", low_sb_min, low_sb_max) ;
  mass.setRange("high_sb", high_sb_min, high_sb_max) ;   
  mass.setRange("fullRange", low_sb_min, high_sb_max) ; 

  // Define coefficient
  RooRealVar c0("c0","c0", 0.01, -1, 1e6); 

  RooPolynomial poly("poly","poly",mass,RooArgList(c0));
  RooRealVar norm("norm","number of signal events",2000,1.,1000000) ;
  RooExtendPdf extpoly("extpoly","extended signal p.d.f",poly,norm,"fullRange") ;
      
  RooDataSet* data = PolyData(coeff); 
  
  // Perform unbinned extended ML fit to data
  RooFitResult * rf = extpoly.fitTo(*data,Extended(kTRUE), Range("low_sb,high_sb"), Save()) ;
 
  // LM: you need to use the parameters of the fitted function
  RooArgList pars(* extpoly.getParameters(RooArgSet(mass) ) );

  //LM: make fitted function (need to add normalized term to pdf)
  RooArgSet prodSet(extpoly); //prodSet.add(norm);
  RooProduct unNormPdf("fitted Function", "fitted Function", prodSet);
  TF1 * f2 = unNormPdf.asTF(RooArgList(mass), pars);

  float norm1 = ((RooRealVar*) pars.find("norm"))->getVal();
  float dnorm1 = ((RooRealVar*) pars.find("norm"))->getError();
  float coef0 = ((RooRealVar*) pars.find("c0"))->getVal();
  float dcoef0 = ((RooRealVar*) pars.find("c0"))->getError();

  Double_t integ2_full = f2->Integral(low_sb_min, high_sb_max);
  Double_t integ2 = norm1*f2->Integral(min_signal, max_signal, 0)/integ2_full;
  Double_t dinteg2 = norm1*f2->IntegralError(min_signal, max_signal, 0, rf->covarianceMatrix().GetMatrixArray())/integ2_full; //

  RooPlot * xframe = mass.frame(Title("Roofit fit"));
  data->plotOn(xframe, Binning(50));

  extpoly.plotOn(xframe,Range("low_sb,high_sb"));
  extpoly.plotOn(xframe,Range("signal"),LineColor(kRed),LineStyle(kDashed));
  extpoly.paramOn(xframe,Layout(0.55));

  if(coeff > 0)
    cout << endl << "---------------- Positive slope ----------------" << endl << endl;
  else
    cout << endl << "---------------- Negative slope ----------------" << endl << endl;
  cout << " norm = " << norm1 << " +- " << dnorm1 << endl;
  cout << " c0 = " << coef0 << " +- " << dcoef0 << endl;

  cout << "\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << endl;
  cout << " Total # of entries in dataset = " << data->sumEntries() << endl;
  str_low_sb = TString(Form("mass>%f && mass<%f",low_sb_min, low_sb_max));
  cut_low_sb = str_low_sb;
  cout << " Actual # of entries in low SB [" << low_sb_min << ", " << low_sb_max << "] = " << data->sumEntries(cut_low_sb) << endl;
  str_high_sb = TString(Form("mass>%f && mass<%f",high_sb_min, high_sb_max));
  cut_high_sb = str_high_sb;
  cout << " Actual # of entries in high SB [" << high_sb_min << ", " << high_sb_max << "] = " << data->sumEntries(cut_high_sb) << endl;
  str_sig = TString(Form("mass>%f && mass<%f",min_signal, max_signal));
  cut_sig = str_sig;
  cout << " Actual # of entries between [" << min_signal << ", " << max_signal << "] = " << data->sumEntries(cut_sig) << endl;
  cout << " Extrapolation between [" << min_signal << ", " << max_signal << "] with RooFit, integral = " << integ2 << " +- " << dinteg2 << endl;
  cout << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << endl << endl;

  return xframe;
}


////////////////////////////////////////////////////////////////////////////////
//                                  main                                      // 
////////////////////////////////////////////////////////////////////////////////

void fit(){

  RooPlot* xframe_plus = plot_fit(60);

  TCanvas* c1 = new TCanvas("positive slope fit","positive slope fit",900,600) ;
  xframe_plus->Draw();      
  c1->Update();

  RooPlot* xframe_minus = plot_fit(-0.8);

  TCanvas* c2 = new TCanvas("negative slope fit","negative slope fit",900,600) ;
  xframe_minus->Draw();      
  c2->Update();
}

moneta · October 2, 2015, 1:09pm

Hi,

I can reproduce this problem even with the latest ROOT version. It is due to a totally crazy coefficient value (and error) obtained from the fit. I will investigate this.
I recommend you to use for polynomial the RooChebychev class.
You could also use a RooBernstein, but I noticed that also in this case there is a similar problem.
Your problem could be similar to sft.its.cern.ch/jira/browse/ROOT-6664, which has been fixed in case of RooChebychev

Best Regards

Lorenzo

amauri · October 6, 2015, 2:10pm

Hi Lorenzo,
thanks for your answer and I’m sorry to bother you again.
I’m now using RooChebychev but I’m experiencing the opposite problem: the error on the integral is too small.

In the example that you mentioned the number extracted from the fit was the total number of events in the full range, and for this it is enough to use the normalization of the extPdf, but if I want the number of events expected in the blinded region I need to compute the integral of the curve.
As in my first post the absolute value of the integral seems correct, but the associated error is not reliable.

Thanks again,
Andrea

I report the output of the fit:
Actual # of entries between [0.4, 0.5] = 996
Extrapolation between [0.4, 0.5] with RooFit, integral = 918.167 ± 1.37818 (>50 sigma deviation)

fit.pdf (19.8 KB)

#ifndef __CINT__
#include "RooGlobalFunc.h"
#endif

#include "RooRealVar.h"
#include "RooPolynomial.h"
#include "RooPlot.h"
#include "RooNLLVar.h"
#include "RooAbsData.h"
#include "RooAbsPdf.h"
#include "RooAbsArg.h"
#include "RooAbsReal.h"
#include "RooDataSet.h"
#include "RooFitResult.h"
#include "RooAddPdf.h"
#include "RooArgList.h"
#include "RooMinuit.h"

using namespace RooFit;


RooDataSet* PolyData(){
  
  RooRealVar dimuon_mass("dimuon_mass", "dimuon_mass", 0., 1.); 
  RooRealVar c0("c0","c0", 6.); 
  RooPolynomial poly("poly","poly",dimuon_mass,RooArgList(c0));
  RooDataSet* data = poly.generate(dimuon_mass, 1e+04);
  return data;
}


void fit(){
  double min_signal = 0.4;
  double max_signal = 0.5;
  double low_sb_min = 0.;
  double low_sb_max = min_signal;
  double high_sb_min = max_signal;
  double high_sb_max = 1.;

  TString str_low_sb, str_high_sb, str_sig;
  TCut cut_low_sb, cut_high_sb, cut_sig;
 
  // Declare observables
  RooRealVar dimuon_mass("dimuon_mass","dimuon_mass", 0., 1.) ;

  // Define signal, sideband range    
  dimuon_mass.setRange("signal", min_signal, max_signal) ; 
  dimuon_mass.setRange("low_sb", low_sb_min, low_sb_max) ;
  dimuon_mass.setRange("high_sb", high_sb_min, high_sb_max) ;   
  dimuon_mass.setRange("fullRange", low_sb_min, high_sb_max) ; 

  // ------ Define PDF ------
 // ** RooPolynomial **
  // RooRealVar c0("c0","c0", 1., 0.1, 100);
  // RooPolynomial poly("poly","poly",dimuon_mass,RooArgList(c0));
  // RooRealVar norm("norm","number of signal events",2000,1.,1000000) ;
  // RooExtendPdf ext_poly("ext_poly","extended p.d.f",poly,norm,"fullRange") ;
  // ** CHEBYCHEV **
  RooRealVar   slope ("slope", "slope" ,0.3, 0., 1.,""); // -0.05
  RooChebychev poly  ("poly" , "poly"  ,dimuon_mass,slope);
  RooRealVar norm("norm","number of signal events",2000,1.,1000000) ;
  RooExtendPdf ext_poly ("ext_poly", "ext_poly", poly ,norm,"fullRange");

  // RooDataSet data = LoadData();
  RooDataSet* data = PolyData(); 
  
  // Perform unbinned extended ML fit to data
  RooFitResult * rf = ext_poly.fitTo(*data,Extended(kTRUE), Range("low_sb,high_sb"), Save()) ; //Range("low_sb,high_sb"),
 
  //RooArgList pars = rf->floatParsFinal();
  // LM: you need to use the parameters of the fitted function (the one from floatParsFinal is probably a copy)
  RooArgList pars(* ext_poly.getParameters(RooArgSet(dimuon_mass) ) );

  //LM: make fitted function (need to add normalized term to pdf)
  RooArgSet prodSet(ext_poly); //prodSet.add(norm);
  RooProduct unNormPdf("fitted Function", "fitted Function", prodSet);
  TF1 * f2 = unNormPdf.asTF(RooArgList(dimuon_mass), pars);

  float norm1 = ((RooRealVar*) pars.find("norm"))->getVal();
  float dnorm1 = ((RooRealVar*) pars.find("norm"))->getError();
  float coef0 = ((RooRealVar*) pars.find("slope"))->getVal();
  float dcoef0 = ((RooRealVar*) pars.find("slope"))->getError();
  cout << endl;
  cout << " norm = " << norm1 << " +- " << dnorm1 << endl;
  cout << " slope = " << coef0 << " +- " << dcoef0 << endl << endl;


  Double_t integ2_full = f2->Integral(low_sb_min, high_sb_max);
  Double_t integ2 = norm1*f2->Integral(min_signal, max_signal, 0)/integ2_full;
  Double_t dinteg2 = norm1*f2->IntegralError(min_signal, max_signal, 0, rf->covarianceMatrix().GetMatrixArray())/integ2_full; 

  RooPlot * xframe = dimuon_mass.frame(Title("RooFit fit"));
  data->plotOn(xframe, Binning(50));

  // ext_poly.plotOn(xframe,Range("low_sb,high_sb"));
  ext_poly.plotOn(xframe,Range("fullRange"));
  ext_poly.plotOn(xframe,Range("signal"),LineColor(kRed),LineStyle(kDashed));
  ext_poly.paramOn(xframe,Layout(0.55));

  TCanvas* c = new TCanvas("fit","fit",900,600) ;
  xframe->Draw();      
  c->Update();

  cout << "\n\n\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << endl;
  cout << " Total # of entries in dataset = " << data->sumEntries() << endl;
  str_low_sb = TString(Form("dimuon_mass>%f && dimuon_mass<%f",low_sb_min, low_sb_max));
  cut_low_sb = str_low_sb;
  cout << " Actual # of entries in low SB [" << low_sb_min << ", " << low_sb_max << "] = " << data->sumEntries(cut_low_sb) << endl;
  str_high_sb = TString(Form("dimuon_mass>%f && dimuon_mass<%f",high_sb_min, high_sb_max));
  cut_high_sb = str_high_sb;
  cout << " Actual # of entries in high SB [" << high_sb_min << ", " << high_sb_max << "] = " << data->sumEntries(cut_high_sb) << endl;
  str_sig = TString(Form("dimuon_mass>%f && dimuon_mass<%f",min_signal, max_signal));
  cut_sig = str_sig;
  cout << " Actual # of entries between [" << min_signal << ", " << max_signal << "] = " << data->sumEntries(cut_sig) << endl;
  cout << " Extrapolation between [" << min_signal << ", " << max_signal << "] with RooFit, integral = " << integ2 << " +- " << dinteg2 << endl;
  cout << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << endl;
  
}

moneta · October 11, 2015, 1:45pm

Hi,

The fit looks fine to me, the parameter errors on the normalisation and slope seems to be what you expect. I think you have an error in extrapolating. You need to consider also the normalisation of the function and its error when computing the error on the integral.
You need to uncomment the line

 prodSet.add(norm);

when you defined the function to transform as a TF1.
Also since later you multiply by the fitted normalisation and dividing by the total one, there are some uncertainty in the normalisation that you would need to take into account.

Best Regards

Lorenzo