TRobustEstimator - source file

// @(#)root/physics:$Name:  $:$Id: TRobustEstimator.cxx,v 1.9 2004/10/21 14:44:02 rdm Exp $
// Author: Anna Kreshuk  08/10/2004

/*************************************************************************
 * Copyright (C) 1995-2004, Rene Brun and Fons Rademakers.               *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

//////////////////////////////////////////////////////////////////////////////
//
//  TRobustEstimator
//
// Minimum Covariance Determinant Estimator - a Fast Algorithm
// invented by Peter J.Rousseeuw and Katrien Van Driessen
// "A Fast Algorithm for the Minimum covariance Determinant Estimator"
// Technometrics, August 1999, Vol.41, NO.3
//
// What are robust estimators?
// "An important property of an estimator is its robustness. An estimator
// is called robust if it is insensitive to measurements that deviate
// from the expected behaviour. There are 2 ways to treat such deviating
// measurements: one may either try to recognize them and then remove
// them from the data sample; or one may leave them in the sample, taking
// care that they do not influence the estimate unduly. In both cases robust
// estimators are needed...Robust procedures compensate for systematic errors
// as much as possible, and indicate any situation in which a danger of not being
// able to operate reliably is detected."
// R.Fruhwirth, M.Regler, R.K.Bock, H.Grote, D.Notz
// "Data Analysis Techniques for High-Energy Physics", 2nd edition
//
// What does this algorithm do?
// It computes a highly robust estimator of multivariate location and scatter.
// Then, it takes those estimates to compute robust distances of all the
// data vectors. Those with large robust distances are considered outliers.
// Robust distances can then be plotted for better visualization of the data.
//
// How does this algorithm do it?
// The MCD objective is to find h observations(out of n) whose classical
// covariance matrix has the lowest determinant. The MCD estimator of location
// is then the average of those h points and the MCD estimate of scatter
// is their covariance matrix. The minimum(and default) h = (n+nvariables+1)/2
// so the algorithm is effective when less than (n+nvar+1)/2 variables are outliers.
// The algorithm also allows for exact fit situations - that is, when h or more
// observations lie on a hyperplane. Then the algorithm still yields the MCD location T
// and scatter matrix S, the latter being singular as it should be. From (T,S) the
// program then computes the equation of the hyperplane.
//
// How can this algorithm be used?
// In any case, when contamination of data is suspected, that might influence
// the classical estimates.
// Also, robust estimation of location and scatter is a tool to robustify
// other multivariate techniques such as, for example, principal-component analysis
// and discriminant analysis.
//
//
//
//
// Technical details of the algorithm:
// 0.The default h = (n+nvariables+1)/2, but the user may choose any integer h with
//   (n+nvariables+1)/2<=h<=n. The program then reports the MCD's breakdown value
//   (n-h+1)/n. If you are sure that the dataset contains less than 25% contamination
//   which is usually the case, a good compromise between breakdown value and
//  efficiency is obtained by putting h=[.75*n].
// 1.If h=n,the MCD location estimate is the average of the whole dataset, and
//   the MCD scatter estimate is its covariance matrix. Report this and stop
// 2.If nvariables=1 (univariate data), compute the MCD estimate by the exact
//   algorithm of Rousseeuw and Leroy (1987, pp.171-172) in O(nlogn)time and stop
// 3.From here on, h<n and nvariables>=2.
//   3a.If n is small:
//    - repeat (say) 500 times:
//    -- construct an initial h-subset, starting from a random (nvar+1)-subset
//    -- carry out 2 C-steps (described in the comments of the CStep function)
//    - for the 10 results with lowest det(S):
//    -- carry out C-steps until convergence
//    - report the solution (T, S) with the lowest det(S)
//   3b.If n is larger (say, n>600), then
//    - construct up to 5 disjoint random subsets of size nsub (say, nsub=300)
//    - inside each subset repeat 500/5 times:
//    -- construct an initial subset of size hsub=[nsub*h/n]
//    -- carry out 2 C-steps
//    -- keep the best 10 results (Tsub, Ssub)
//    - pool the subsets, yielding the merged set (say, of size nmerged=1500)
//    - in the merged set, repeat for each of the 50 solutions (Tsub, Ssub)
//    -- carry out 2 C-steps
//    -- keep the 10 best results
//    - in the full dataset, repeat for those best results:
//    -- take several C-steps, using n and h
//    -- report the best final result (T, S)
// 4.To obtain consistency when the data comes from a multivariate normal
//   distribution, covariance matrix is multiplied by a correction factor
// 5.Robust distances for all elements, using the final (T, S) are calculated
//   Then the very final mean and covariance estimates are calculated only for
//   values, whose robust distances are less than a cutoff value (0.975 quantile
//   of chi2 distribution with nvariables degrees of freedom)
//
//////////////////////////////////////////////////////////////////////////////

#include "TRobustEstimator.h"
#include "TRandom.h"
#include "TMath.h"
#include "TH1D.h"
#include "TPaveLabel.h"
#include "TDecompChol.h"

ClassImp(TRobustEstimator)

// Table of 50 chi2-related constants, one entry per number of variables (1..50).
// NOTE(review): judging by the name these are the medians (0.5 quantiles) of the
// chi2 distribution with 1..50 degrees of freedom - the table is not used in the
// part of the file visible here, confirm against its users.
const Double_t kChiMedian[50]= {
         0.454937, 1.38629, 2.36597, 3.35670, 4.35146, 5.34812, 6.34581, 7.34412, 8.34283,
	 9.34182, 10.34, 11.34, 12.34, 13.34, 14.34, 15.34, 16.34, 17.34, 18.34, 19.34,
	20.34, 21.34, 22.34, 23.34, 24.34, 25.34, 26.34, 27.34, 28.34, 29.34, 30.34,
	31.34, 32.34, 33.34, 34.34, 35.34, 36.34, 37.34, 38.34, 39.34, 40.34,
	41.34, 42.34, 43.34, 44.34, 45.34, 46.34, 47.34, 48.34, 49.33};

// Outlier cutoff values: Evaluate() reads kChiQuant[fNvar-1] and flags every
// observation whose robust distance exceeds it. According to the description at
// the top of this file the cutoff is the 0.975 quantile of the chi2 distribution
// with nvariables degrees of freedom; the table covers 1..50 variables only.
const Double_t kChiQuant[50]={
         5.02389, 7.3776,9.34840,11.1433,12.8325,
        14.4494,16.0128,17.5346,19.0228,20.4831,21.920,23.337,
        24.736,26.119,27.488,28.845,30.191,31.526,32.852,34.170,
        35.479,36.781,38.076,39.364,40.646,41.923,43.194,44.461,
        45.722,46.979,48.232,49.481,50.725,51.966,53.203,54.437,
        55.668,56.896,58.120,59.342,60.561,61.777,62.990,64.201,
        65.410,66.617,67.821,69.022,70.222,71.420};

//_____________________________________________________________________________
 TRobustEstimator::TRobustEstimator(){
  //this constructor should be used in a univariate case:
  //first call this constructor, then - the EvaluateUni(..) function
  //
  //The scalar bookkeeping members are zeroed so that accessors called on a
  //default-constructed object read defined values instead of garbage.

  fN=0;
  fNvar=0;
  fH=0;
  fVarTemp=0;
  fVecTemp=0;
  fExact=0;
}

//______________________________________________________________________________
 TRobustEstimator::TRobustEstimator(Int_t nvectors, Int_t nvariables, Int_t hh)
  :fMean(nvariables),
   fCovariance(nvariables),
   fInvcovariance(nvariables),
   fCorrelation(nvariables),
   fRd(nvectors),
   fSd(nvectors),
   fOut(1),
   fHyperplane(nvariables),
   fData(nvectors, nvariables)
{
  //constructor for the multivariate case
  //nvectors   - number of observations (must be > 1)
  //nvariables - number of variables per observation (must be > 1; for a single
  //             variable use the default constructor and EvaluateUni())
  //hh         - size of the subsets whose covariance determinant is minimized;
  //             values below (nvectors+nvariables+1)/2 are replaced by that default
  //             (silently when hh<=0, with a warning otherwise)

  //give every scalar member a defined value first: the error returns below
  //would otherwise leave them uninitialized
  fN=0;
  fNvar=0;
  fH=0;
  fVarTemp=0;
  fVecTemp=0;
  fExact=0;

  if ((nvectors<=1)||(nvariables<=0)){
     Error("TRobustEstimator","Not enough vectors or variables");
     return;
  }
  if (nvariables==1){
     Error("TRobustEstimator","For the univariate case, use the default constructor and EvaluateUni() function");
     return;
  }

  fN=nvectors;
  fNvar=nvariables;
  if (hh<(fN+fNvar+1)/2){
     if (hh>0)
        Warning("TRobustEstimator","chosen h is too small, default h is taken instead");
     fH=(fN+fNvar+1)/2;
  } else
     fH=hh;
}

//_____________________________________________________________________________
 void TRobustEstimator::AddColumn(Double_t *col)
{
  //Stores col as the next column (one variable for all observations) of the
  //data matrix. col is read at positions 0..fN-1.
  //fVarTemp counts the columns filled so far; when all fNvar columns are
  //already in use, fNvar is increased and every member sized by the number
  //of variables is enlarged before the new column is written.
  const Bool_t allColumnsUsed = (fVarTemp==fNvar);
  if (allColumnsUsed) {
    ++fNvar;
    fCovariance.ResizeTo(fNvar, fNvar);
    fInvcovariance.ResizeTo(fNvar, fNvar);
    fCorrelation.ResizeTo(fNvar, fNvar);
    fMean.ResizeTo(fNvar);
    fHyperplane.ResizeTo(fNvar);
    fData.ResizeTo(fN, fNvar);
  }

  Int_t row=0;
  while (row<fN) {
     fData(row, fVarTemp)=col[row];
     ++row;
  }
  ++fVarTemp;
}

//_______________________________________________________________________________
 void TRobustEstimator::AddRow(Double_t *row)
{
  //Stores row as the next observation of the data matrix.
  //row is read at positions 0..fNvar-1.
  //fVecTemp counts the rows filled so far; when all fN rows are already in
  //use, fN is increased and the distance vectors and the data matrix are
  //enlarged before the new row is written.
  const Bool_t allRowsUsed = (fVecTemp==fN);
  if (allRowsUsed) {
    ++fN;
    fRd.ResizeTo(fN);
    fSd.ResizeTo(fN);
    fData.ResizeTo(fN, fNvar);
  }

  Int_t column=0;
  while (column<fNvar) {
    fData(fVecTemp, column)=row[column];
    ++column;
  }

  ++fVecTemp;
}

//_______________________________________________________________________________
 void TRobustEstimator::Evaluate()
{
  Double_t kEps=1e-14;

  if (fH==fN){
     Warning("Evaluate","Chosen h = #observations, so classic estimates of location and scatter will be calculated");
     Classic();
     return;
  }

  Int_t i, j, k;
  Int_t ii, jj;
  Int_t nmini = 300;
  Int_t k1=500;
  Int_t nbest=10;
  TMatrixD sscp(fNvar+1, fNvar+1);
  TVectorD vec(fNvar);

  Int_t *index = new Int_t[fN];
  Double_t *ndist = new Double_t[fN];
  Double_t det;
  Double_t *deti=new Double_t[nbest];
  for (i=0; i<nbest; i++)
    deti[i]=1e16;

  for (i=0; i<fN; i++)
    fRd(i)=0;
  ////////////////////////////
  //for small n
  ////////////////////////////
  if (fN<nmini*2) {
    //for storing the best fMeans and covariances

    TMatrixD mstock(nbest, fNvar);
    TMatrixD cstock(fNvar, fNvar*nbest);

    for (k=0; k<k1; k++) {
      CreateSubset(fN, fH, fNvar, index, fData, sscp, ndist);
      //calculate the mean and covariance of the created subset
      ClearSscp(sscp);
      for (i=0; i<fH; i++) {
	for(j=0; j<fNvar; j++)
	  vec(j)=fData[index[i]][j];
	AddToSscp(sscp, vec);
      }
      Covar(sscp, fMean, fCovariance, fSd, fH);
      det = fCovariance.Determinant();
      if (det < kEps) {
	fExact = Exact(ndist);
	delete [] index;
	delete [] ndist;
	delete [] deti;
	return;
      }
      //make 2 CSteps
      det = CStep(fN, fH, index, fData, sscp, ndist);
      if (det < kEps) {
	fExact = Exact(ndist);
	delete [] index;
	delete [] ndist;
	delete [] deti;
	return;
      }
      det = CStep(fN, fH, index, fData, sscp, ndist);
      if (det < kEps) {
	fExact = Exact(ndist);
	delete [] index;
	delete [] ndist;
	delete [] deti;
	return;
      } else {

	Int_t maxind=TMath::LocMax(nbest, deti);
	if(det<deti[maxind]) {
	  deti[maxind]=det;
	  for(ii=0; ii<fNvar; ii++) {
	    mstock(maxind, ii)=fMean(ii);
	    for(jj=0; jj<fNvar; jj++)
	      cstock(ii, jj+maxind*fNvar)=fCovariance(ii, jj);
	  }
	}
      }
    }

    //now for nbest best results perform CSteps until convergence

    for (i=0; i<nbest; i++) {
      for(ii=0; ii<fNvar; ii++) {
	fMean(ii)=mstock(i, ii);
	for (jj=0; jj<fNvar; jj++)
	  fCovariance(ii, jj)=cstock(ii, jj+i*fNvar);
      }

      det=1;
      while (det>kEps) {
	  det=CStep(fN, fH, index, fData, sscp, ndist);
	  if(TMath::Abs(det-deti[i])<kEps)
	    break;
	  else
	    deti[i]=det;
      }
      for(ii=0; ii<fNvar; ii++) {
	mstock(i,ii)=fMean(ii);
	for (jj=0; jj<fNvar; jj++)
	  cstock(ii,jj+i*fNvar)=fCovariance(ii, jj);
      }
    }

    Int_t detind=TMath::LocMin(nbest, deti);
    for(ii=0; ii<fNvar; ii++) {
      fMean(ii)=mstock(detind,ii);

      for(jj=0; jj<fNvar; jj++)
	fCovariance(ii, jj)=cstock(ii,jj+detind*fNvar);
    }

    if (deti[detind]!=0) {
      //calculate robust distances and throw out the bad points
      Int_t nout = RDist(sscp);
      Double_t cutoff=kChiQuant[fNvar-1];

      fOut.Set(nout);

      j=0;
      for (i=0; i<fN; i++) {
	if(fRd(i)>cutoff) {
	   fOut[j]=i;
	   j++;
	}
      }

    } else {
      fExact=Exact(ndist);
    }
    delete [] index;
    delete [] ndist;
    delete [] deti;
    return;

  }
  /////////////////////////////////////////////////
  //if n>nmini, the dataset should be partitioned
  //partitioning
  ////////////////////////////////////////////////
  Int_t indsubdat[5];
  Int_t nsub;
  for (ii=0; ii<5; ii++)
     indsubdat[ii]=0;

  nsub = Partition(nmini, indsubdat);

  Int_t sum=0;
  for (ii=0; ii<5; ii++)
    sum+=indsubdat[ii];
  Int_t *subdat=new Int_t[sum];
  RDraw(subdat, nsub, indsubdat);
  //now the indexes of selected cases are in the array subdat
  //matrices to store best means and covariances
  Int_t nbestsub=nbest*nsub;
  TMatrixD mstockbig(nbestsub, fNvar);
  TMatrixD cstockbig(fNvar, fNvar*nbestsub);
  TMatrixD hyperplane(nbestsub, fNvar);
  for (i=0; i<nbestsub; i++) {
    for(j=0; j<fNvar; j++)
      hyperplane(i,j)=0;
  }
  Double_t *detibig = new Double_t[nbestsub];
  Int_t maxind;
  maxind=TMath::LocMax(5, indsubdat);
  TMatrixD dattemp(indsubdat[maxind], fNvar);

  Int_t k2=Int_t(k1/nsub);
  //construct h-subsets and perform 2 CSteps in subgroups

  for (Int_t kgroup=0; kgroup<nsub; kgroup++) {
    //printf("group #%d\n", kgroup);
    Int_t ntemp=indsubdat[kgroup];
    Int_t temp=0;
    for (i=0; i<kgroup; i++)
       temp+=indsubdat[i];
    Int_t par;

    for(i=0; i<ntemp; i++) {
      for (j=0; j<fNvar; j++) {
	 dattemp(i,j)=fData[subdat[temp+i]][j];
      }
    }
    Int_t htemp=Int_t(fH*ntemp/fN);

    for (i=0; i<nbest; i++)
      deti[i]=1e16;

    for(k=0; k<k2; k++) {
      CreateSubset(ntemp, htemp, fNvar, index, dattemp, sscp, ndist);
      ClearSscp(sscp);
      for (i=0; i<htemp; i++) {
	for(j=0; j<fNvar; j++) {
	   vec(j)=dattemp(index[i],j);
	}
	AddToSscp(sscp, vec);
      }
      Covar(sscp, fMean, fCovariance, fSd, htemp);
      det = fCovariance.Determinant();
      if (det<kEps) {
	par =Exact2(mstockbig, cstockbig, hyperplane, deti, nbest, kgroup, sscp,ndist);
	if(par==nbest+1) {

	  delete [] detibig;
	  delete [] deti;
	  delete [] subdat;
	  delete [] ndist;
	  delete [] index;
	  return;
	} else
	  deti[par]=det;
      } else {
	det = CStep(ntemp, htemp, index, dattemp, sscp, ndist);
	if (det<kEps) {
	  par=Exact2(mstockbig, cstockbig, hyperplane, deti, nbest, kgroup, sscp, ndist);
	  if(par==nbest+1) {

	    delete [] detibig;
	    delete [] deti;
	    delete [] subdat;
	    delete [] ndist;
	    delete [] index;
	    return;
	  } else
	    deti[par]=det;
	} else {
	  det=CStep(ntemp,htemp, index, dattemp, sscp, ndist);
	  if(det<kEps){
	    par=Exact2(mstockbig, cstockbig, hyperplane, deti, nbest, kgroup, sscp,ndist);
	    if(par==nbest+1) {

	      delete [] detibig;
	      delete [] deti;
	      delete [] subdat;
	      delete [] ndist;
	      delete [] index;
	      return;
	    } else {
	      deti[par]=det;
	    }
	  } else {
	    maxind=TMath::LocMax(nbest, deti);
	    if(det<deti[maxind]) {
	      deti[maxind]=det;
	      for(i=0; i<fNvar; i++) {
		mstockbig(nbest*kgroup+maxind,i)=fMean(i);
		for(j=0; j<fNvar; j++) {
		  cstockbig(i,nbest*kgroup*fNvar+maxind*fNvar+j)=fCovariance(i,j);

		}
	      }
	    }

	  }
	}
      }

      maxind=TMath::LocMax(nbest, deti);
      if (deti[maxind]<kEps)
	break;
    }


    for(i=0; i<nbest; i++) {
      detibig[kgroup*nbest + i]=deti[i];

    }

  }

  //now the arrays mstockbig and cstockbig store nbest*nsub best means and covariances
  //detibig stores nbest*nsub their determinants
  //merge the subsets and carry out 2 CSteps on the merged set for all 50 best solutions

  TMatrixD datmerged(sum, fNvar);
  for(i=0; i<sum; i++) {
    for (j=0; j<fNvar; j++)
      datmerged(i,j)=fData[subdat[i]][j];
  }
  //  printf("performing calculations for merged set\n");
  Int_t hmerged=Int_t(sum*fH/fN);

  Int_t nh;
  for(k=0; k<nbestsub; k++) {
    //for all best solutions perform 2 CSteps and then choose the very best
      for(ii=0; ii<fNvar; ii++) {
	fMean(ii)=mstockbig(k,ii);
	for(jj=0; jj<fNvar; jj++)
	  fCovariance(ii, jj)=cstockbig(ii,k*fNvar+jj);
      }
    if(detibig[k]==0) {
       for(i=0; i<fNvar; i++)
	fHyperplane(i)=hyperplane(k,i);
      CreateOrtSubset(datmerged,index, hmerged, sum, sscp, ndist);

    }
    det=CStep(sum, hmerged, index, datmerged, sscp, ndist);
    if (det<kEps) {
      nh= Exact(ndist);
      if (nh>=fH) {
	fExact = nh;

	delete [] detibig;
	delete [] deti;
	delete [] subdat;
	delete [] ndist;
	delete [] index;
	return;
      } else {
	CreateOrtSubset(datmerged, index, hmerged, sum, sscp, ndist);
      }
    }

    det=CStep(sum, hmerged, index, datmerged, sscp, ndist);
    if (det<kEps) {
      nh=Exact(ndist);
	if (nh>=fH) {
	  fExact = nh;
	  delete [] detibig;
	  delete [] deti;
	  delete [] subdat;
	  delete [] ndist;
	  delete [] index;
	  return;
	}
    }
    detibig[k]=det;
    for(i=0; i<fNvar; i++) {
      mstockbig(k,i)=fMean(i);
      for(j=0; j<fNvar; j++) {
	cstockbig(i,k*fNvar+j)=fCovariance(i, j);
      }
    }


  }
  //now for the subset with the smallest determinant
  //repeat CSteps until convergence
  Int_t minind=TMath::LocMin(nbestsub, detibig);
  det=detibig[minind];
  for(i=0; i<fNvar; i++) {
    fMean(i)=mstockbig(minind,i);
    fHyperplane(i)=hyperplane(minind,i);
    for(j=0; j<fNvar; j++)
      fCovariance(i, j)=cstockbig(i,minind*fNvar + j);
  }
  if(det<kEps)
    CreateOrtSubset(fData, index, fH, fN, sscp, ndist);
  det=1;
  while (det>kEps) {
    det=CStep(fN, fH, index, fData, sscp, ndist);
    if(TMath::Abs(det-detibig[minind])<kEps) {
       break;
    } else {
       detibig[minind]=det;
    }
  }
  if(det<kEps) {
    Exact(ndist);
    fExact=kTRUE;
  }
  Int_t nout = RDist(sscp);
  Double_t cutoff=kChiQuant[fNvar-1];

  fOut.Set(nout);

  j=0;
  for (i=0; i<fN; i++) {
    if(fRd(i)>cutoff) {
      fOut[j]=i;
      j++;
    }
  }

  delete [] detibig;
  delete [] deti;
  delete [] subdat;
  delete [] ndist;
  delete [] index;
  return;
}

//___________________________________________________________________________________________
 void TRobustEstimator::EvaluateUni(Int_t nvectors, Double_t *data, Double_t &mean, Double_t &sigma, Int_t hh)
{
  //for the univariate case
  //estimates of location and scatter are returned in mean and sigma parameters
  //the algorithm works on the same principle as in multivariate case -
  //it finds a subset of size hh with smallest sigma, and then returns mean and
  //sigma of this subset
  //
  //nvectors - number of values in data (must be > 0)
  //data     - the values; the array itself is not modified
  //hh       - subset size; hh<=0 selects the default (nvectors+2)/2,
  //           values above nvectors are reduced to nvectors
  //
  //The data are sorted and every window of hh consecutive sorted values is
  //examined; the window with the smallest sum of squared deviations wins.
  //If several windows tie, the middle one of them is used for the mean.
  //mean and sigma are left untouched when no data are provided.

  if (nvectors<=0 || data==0) {
    Error("EvaluateUni","no data provided");
    return;
  }
  if (hh<=0)
    hh=(nvectors+2)/2;
  if (hh>nvectors)
    hh=nvectors;

  //correction factors for sigma, tabulated for hh/nvectors = 0.5, 0.525, ..., 0.75 and above
  Double_t faclts[]={2.6477,2.5092,2.3826,2.2662,2.1587,2.0589,1.9660,1.879,1.7973,1.7203,1.6473};
  Int_t *index=new Int_t[nvectors];
  TMath::Sort(nvectors, data, index, kFALSE);

  //table position, kept inside 1..11 so that faclts is never read out of range
  Int_t nquant=Int_t(Double_t(((hh*1./nvectors)-0.5)*40))+1;
  nquant=TMath::Max(1, TMath::Min(nquant, 11));
  Double_t factor=faclts[nquant-1];

  //there are nvectors-hh+1 windows of hh consecutive sorted values
  Int_t nwin=nvectors-hh+1;
  Double_t *aw=new Double_t[nwin];     //sum of the values of each window
  Double_t *aw2=new Double_t[nwin];    //aw*aw/hh of each window
  Double_t *slutn=new Double_t[nwin];  //window sums of the (tied) best windows
  Double_t sq=0;
  Double_t sqmin=0;
  Int_t ndup=0;
  for(Int_t i=0; i<nwin; i++)
    slutn[i]=0;

  for(Int_t jint=0; jint<nwin; jint++) {
    aw[jint]=0;
    for (Int_t j=0; j<hh; j++) {
      aw[jint]+=data[index[j+jint]];
      if(jint==0)
        sq+=data[index[j]]*data[index[j]];
    }
    aw2[jint]=aw[jint]*aw[jint]/hh;

    if(jint==0) {
      sq=sq-aw2[jint];
      sqmin=sq;
      slutn[ndup]=aw[jint];

    } else {
      //slide the window: the value at sorted position jint-1 leaves and the
      //value at position jint+hh-1 (the new last element) enters
      sq=sq - data[index[jint-1]]*data[index[jint-1]]+
        data[index[jint+hh-1]]*data[index[jint+hh-1]]-
        aw2[jint]+aw2[jint-1];
      if(sq<sqmin) {
        ndup=0;
        sqmin=sq;
        slutn[ndup]=aw[jint];

      } else {
        if(sq==sqmin) {
          ndup++;
          slutn[ndup]=aw[jint];
        }
      }
    }
  }

  mean=slutn[Int_t((ndup)/2)]/hh;
  //rounding can push an (almost) zero sum of squares slightly below zero
  if (sqmin<0)
    sqmin=0;
  sigma=factor*TMath::Sqrt(sqmin/hh);

  delete [] index;
  delete [] aw;
  delete [] aw2;
  delete [] slutn;

}


//_______________________________________________________________________
 Int_t TRobustEstimator::GetBDPoint()
{
  //returns the breakdown point of the algorithm, (fN-fH+1)/fN
  //
  //NOTE(review): the quotient is formed and returned as an integer, so any
  //fractional breakdown value is truncated (0 whenever 1<fH<=fN). Returning
  //the fraction would need a floating-point return type, i.e. an interface change.
  //0 is returned when no observations are defined, instead of dividing by zero.
  if (fN<=0)
    return 0;
  return (fN-fH+1)/fN;
}

//_______________________________________________________________________
 Double_t TRobustEstimator::GetChiQuant(Int_t i) const
{
   //returns entry i of the kChiQuant table; 0 when i is outside 0..49
   const Bool_t insideTable = (i >= 0) && (i < 50);
   if (insideTable)
      return kChiQuant[i];
   return 0;
}

//_______________________________________________________________________
 void TRobustEstimator::GetCovariance(TMatrixDSym &matr)
{
  //copies the covariance matrix into matr;
  //matr is resized (with a warning) when it is not fNvar x fNvar
  const Bool_t sizeMatches = (matr.GetNrows()==fNvar) && (matr.GetNcols()==fNvar);
  if (!sizeMatches){
     Warning("GetCovariance","provided matrix is of the wrong size, it will be resized");
     matr.ResizeTo(fNvar, fNvar);
  }
  matr=fCovariance;
}

//_______________________________________________________________________
 void TRobustEstimator::GetCorrelation(TMatrixDSym &matr)
{
  //copies the correlation matrix into matr;
  //matr is resized (with a warning) when it is not fNvar x fNvar
  const Bool_t sizeMatches = (matr.GetNrows()==fNvar) && (matr.GetNcols()==fNvar);
  if (!sizeMatches) {
     Warning("GetCorrelation","provided matrix is of the wrong size, it will be resized");
     matr.ResizeTo(fNvar, fNvar);
  }
  matr=fCorrelation;
}

//____________________________________________________________________
 const TVectorD* TRobustEstimator::GetHyperplane() const
{
  //returns a pointer to the stored hyperplane coefficients,
  //or 0 (after reporting an error) when fExact is 0
  if (fExact!=0)
     return &fHyperplane;

  Error("GetHyperplane","the data doesn't lie on a hyperplane!\n");
  return 0;
}

//______________________________________________________________________
 void TRobustEstimator::GetHyperplane(TVectorD &vec)
{
  //copies the stored hyperplane coefficients into vec;
  //when fExact is 0 an error is reported and vec is left untouched,
  //otherwise vec is resized (with a warning) if it does not have fNvar elements
  if (fExact!=0){
     const Bool_t wrongSize = (vec.GetNoElements()!=fNvar);
     if (wrongSize) {
        Warning("GetHyperPlane","provided vector is of the wrong size, it will be resized");
        vec.ResizeTo(fNvar);
     }
     vec=fHyperplane;
     return;
  }
  Error("GetHyperplane","the data doesn't lie on a hyperplane!\n");
}

//________________________________________________________________________
 void TRobustEstimator::GetMean(TVectorD &means)
{
  //copies the vector of means into means;
  //means is resized (with a warning) when it does not have fNvar elements
  const Bool_t wrongSize = (means.GetNoElements()!=fNvar);
  if (wrongSize) {
     Warning("GetMean","provided vector is of the wrong size, it will be resized");
     means.ResizeTo(fNvar);
  }
  means=fMean;
}

//_________________________________________________________________________
 void TRobustEstimator::GetRDistances(TVectorD &rdist)
{
  //copies the robust distances of all observations into rdist;
  //rdist is resized (with a warning) when it does not have fN elements
  const Bool_t wrongSize = (rdist.GetNoElements()!=fN);
  if (wrongSize) {
     Warning("GetRDistances","provided vector is of the wrong size, it will be resized");
     rdist.ResizeTo(fN);
  }
  rdist=fRd;
}

//__________________________________________________________________________
 Int_t TRobustEstimator::GetNOut()
{
  //returns the current size of the outlier index array fOut
  const Int_t nOutliers = fOut.GetSize();
  return nOutliers;
}

//_________________________________________________________________________
 void TRobustEstimator::AddToSscp(TMatrixD &sscp, TVectorD &vec)
{
  //Accumulates one observation vec into the sscp matrix:
  //row 0 and column 0 (from index 1 on) receive the running sums of the
  //components, the block starting at (1,1) the running sums of their products.
  //Element (0,0) is not touched.
  for (Int_t row=1; row<=fNvar; row++) {
     const Double_t rowValue = vec(row-1);
     sscp(0, row) += rowValue;
     sscp(row, 0) = sscp(0, row);
     for (Int_t col=1; col<=fNvar; col++)
        sscp(row, col) += rowValue*vec(col-1);
  }
}

//__________________________________________________________________________
 void TRobustEstimator::ClearSscp(TMatrixD &sscp)
{
  //sets the leading (fNvar+1)x(fNvar+1) elements of sscp to zero,
  //so that a new mean/covariance accumulation can start
  const Int_t size = fNvar+1;
  Int_t row = 0;
  while (row<size) {
    Int_t col = 0;
    while (col<size) {
      sscp(row, col)=0;
      ++col;
    }
    ++row;
  }
}

//_______________________________________________________________
 void TRobustEstimator::Classic()
{
  //called when h=n. Returns classic covariance matrix
  //and mean
  TMatrixD sscp(fNvar+1, fNvar+1);
  TVectorD temp(fNvar);
  ClearSscp(sscp);
  for (Int_t i=0; i<fN; i++) {
    for (Int_t j=0; j<fNvar; j++)
      temp(j)=fData(i, j);
    AddToSscp(sscp, temp);
  }
  Covar(sscp, fMean, fCovariance, fSd, fN);
  Correl();

}

//____________________________________________________________________
 void TRobustEstimator::Covar(TMatrixD &sscp, TVectorD &m, TMatrixDSym &cov, TVectorD &sd, Int_t nvec)
{
  //Turns the sums accumulated in sscp (see AddToSscp) into estimates
  //for nvec observations:
  //m   - means of the fNvar variables
  //sd  - standard deviations (set to 0 when the variance is not above 1e-14)
  //cov - covariance matrix, normalized by nvec-1
  Int_t row, col;

  for (row=0; row<fNvar; row++) {
    const Double_t total=sscp(0, row+1);        //sum of the values
    const Double_t totalSq=sscp(row+1, row+1);  //sum of the squared values
    const Double_t variance=(totalSq-total*total/nvec)/(nvec-1);
    if (variance>1e-14)
      sd[row]=TMath::Sqrt(variance);
    else
      sd[row]=0;
    m(row)=total/nvec;
  }

  for (row=0; row<fNvar; row++) {
    for (col=0; col<fNvar; col++) {
      Double_t entry=sscp(row+1, col+1)-nvec*m(row)*m(col);
      entry/=nvec-1;
      cov(row, col)=entry;
    }
  }
}

//____________________________________________________________________
 void TRobustEstimator::Correl()
{
  //transforms covariance matrix into correlation matrix
  Int_t i, j;
  Double_t *sd=new Double_t[fNvar];
  for(j=0; j<fNvar; j++)
    sd[j]=1./TMath::Sqrt(fCovariance(j, j));
  for(i=0; i<fNvar; i++) {
    for (j=0; j<fNvar; j++) {
      if (i==j)
	fCorrelation(i, j)=1.;
      else
	fCorrelation(i, j)=fCovariance(i, j)*sd[i]*sd[j];
    }
  }
  delete [] sd;
}

//____________________________________________________________________
 void TRobustEstimator::CreateSubset(Int_t ntotal, Int_t htotal, Int_t p, Int_t *index, TMatrixD &data, TMatrixD &sscp, Double_t *ndist)
{
  //creates a subset of htotal elements from ntotal elements
  //first, p+1 elements are drawn randomly(without repetitions)
  //if their covariance matrix is singular, more elements are
  //added one by one, until their covariance matrix becomes regular
  //or it becomes clear that htotal observations lie on a hyperplane
  //If covariance matrix determinant!=0, distances of all ntotal elements
  //are calculated, using formula d_i=(x_i-M)*S_inv*(x_i-M), where
  //M is mean and S_inv is the inverse of the covariance matrix
  //htotal points with smallest distances are included in the returned subset.
  //
  //ntotal - number of rows of data to choose from
  //htotal - requested subset size
  //p      - number of variables (p+1 points form the starting subset)
  //index  - output, row numbers of the selected observations
  //data   - the observations, one per row
  //sscp   - work matrix for the sums, overwritten
  //ndist  - output, the distances of all ntotal rows (when they are computed)
  //fMean, fCovariance and fInvcovariance are overwritten as a side effect.
  //
  //Random rows are drawn from the full range 0..ntotal-1. (Scaling the uniform
  //number by ntotal-1 would never produce the last row.)

  Double_t kEps = 1e-14;
  Int_t i, j;
  Bool_t repeat=kFALSE;
  Int_t nindex=0;
  Int_t num;
  //ntotal+1 is not a valid row number, it marks unused entries
  for(i=0; i<ntotal; i++)
    index[i]=ntotal+1;

  //draw p+1 different rows
  while (nindex<p+1) {
    num=Int_t(gRandom->Uniform(0, 1)*ntotal);
    if (num>=ntotal) num=ntotal-1;   //guard against a generator returning exactly 1
    repeat=kFALSE;
    for(j=0; j<nindex; j++) {
      if(index[j]==num) {
        repeat=kTRUE;
        break;
      }
    }
    if(repeat==kFALSE) {
      index[nindex]=num;
      nindex++;
    }
  }

  ClearSscp(sscp);

  TVectorD vec(fNvar);
  Double_t det;
  for (i=0; i<p+1; i++) {
    for (j=0; j<fNvar; j++) {
      vec[j]=data[index[i]][j];

    }
    AddToSscp(sscp, vec);
  }

  Covar(sscp, fMean, fCovariance, fSd, p+1);
  det=fCovariance.Determinant();
  while((det<kEps)&&(nindex < htotal)) {
    //if covariance matrix is singular,another vector is added until
    //the matrix becomes regular or it becomes clear that all
    //vectors of the group lie on a hyperplane
    do{
      num=Int_t(gRandom->Uniform(0,1)*ntotal);
      if (num>=ntotal) num=ntotal-1;
      repeat=kFALSE;
      for(i=0; i<nindex; i++) {
        if(index[i]==num) {
          repeat=kTRUE;
          break;
        }
      }
    }while(repeat==kTRUE);

    index[nindex]=num;
    nindex++;
    //check if covariance matrix is singular
    for(j=0; j<fNvar; j++)
      vec[j]=data[index[nindex-1]][j];
    AddToSscp(sscp, vec);
    Covar(sscp, fMean, fCovariance, fSd, nindex);
    det=fCovariance.Determinant();
  }

  if(nindex!=htotal) {
    //the starting subset is smaller than htotal: rank all rows by their
    //distance to it and let KOrdStat select the subset
    TDecompChol chol(fCovariance);
    fInvcovariance = chol.Invert();

    TVectorD temp(fNvar);
    for(j=0; j<ntotal; j++) {
      ndist[j]=0;
      for(i=0; i<fNvar; i++)
        temp[i]=data[j][i] - fMean(i);
      temp*=fInvcovariance;
      for(i=0; i<fNvar; i++)
        ndist[j]+=(data[j][i]-fMean(i))*temp[i];
    }
    KOrdStat(ntotal, ndist, htotal-1,index);
  }

}

//_____________________________________________________________________________
 void TRobustEstimator::CreateOrtSubset(TMatrixD &dat,Int_t *index, Int_t hmerged, Int_t nmerged, TMatrixD &sscp, Double_t *ndist)
{
  //creates a subset of hmerged vectors with smallest orthogonal distances to the hyperplane
  //hyp[1]*(x1-mean[1])+...+hyp[nvar]*(xnvar-mean[nvar])=0
  //This function is called in case when less than fH samples lie on a hyperplane.
  //
  //dat     - the observations, one per row
  //index   - output, row numbers selected by KOrdStat
  //hmerged - size of the subset to create
  //nmerged - number of rows of dat
  //sscp    - work matrix for the sums, overwritten
  //ndist   - output, distance of every row to the hyperplane
  //fMean and fCovariance are replaced by the estimates of the new subset.
  Int_t i, j;

  TVectorD vec(fNvar);
  for (i=0; i<nmerged; i++) {
    //the distance is the absolute value of the complete dot product, so
    //the absolute value must be taken only after all terms are summed
    Double_t proj=0;
    for(j=0; j<fNvar; j++)
      proj+=fHyperplane[j]*(dat[i][j]-fMean[j]);
    ndist[i]=TMath::Abs(proj);
  }
  KOrdStat(nmerged, ndist, hmerged-1, index);
  ClearSscp(sscp);
  for (i=0; i<hmerged; i++) {
    for(j=0; j<fNvar; j++)
      vec[j]=dat[index[i]][j];
    AddToSscp(sscp, vec);
  }
  Covar(sscp, fMean, fCovariance, fSd, hmerged);
}

//__________________________________________________________________________
 Double_t TRobustEstimator::CStep(Int_t ntotal, Int_t htotal, Int_t *index, TMatrixD &data, TMatrixD &sscp, Double_t *ndist)
{
  //One concentration step: starting from the current fMean and fCovariance,
  //builds a new htotal-subset whose covariance determinant is not larger.
  //
  //Following Peter J.Rousseeuw and Katrien Van Driessen: compute for every
  //element d_i=(x_i-M)*S_inv*(x_i-M), with M the mean of the current subset
  //and S_inv the inverse of its covariance matrix, and take the htotal
  //elements with the smallest d_i.
  //
  //ntotal - number of rows of data
  //htotal - subset size
  //index  - output, row numbers selected by KOrdStat
  //data   - the observations, one per row
  //sscp   - work matrix for the sums, overwritten
  //ndist  - output, the distances of all ntotal rows
  //Returns the determinant of the new subset's covariance matrix;
  //fMean, fCovariance and fInvcovariance are updated.

  TDecompChol chol(fCovariance);
  fInvcovariance = chol.Invert();

  TVectorD work(fNvar);
  Int_t row, col;

  //distances of all rows with respect to the current estimates
  for (row=0; row<ntotal; row++) {
    for (col=0; col<fNvar; col++)
      work[col]=data[row][col]-fMean[col];
    work*=fInvcovariance;
    ndist[row]=0;
    for (col=0; col<fNvar; col++)
      ndist[row]+=(data[row][col]-fMean[col])*work[col];
  }

  //taking h smallest
  KOrdStat(ntotal, ndist, htotal-1, index);

  //writing their mean and covariance
  ClearSscp(sscp);
  for (row=0; row<htotal; row++) {
    for (col=0; col<fNvar; col++)
      work[col]=data[index[row]][col];
    AddToSscp(sscp, work);
  }
  Covar(sscp, fMean, fCovariance, fSd, htotal);

  return fCovariance.Determinant();
}

//_______________________________________________________________
 Int_t TRobustEstimator::Exact(Double_t *ndist)
{
  //for the exact fit situations
  //Takes the last eigenvector of fCovariance as the hyperplane coefficients
  //(stored in fHyperplane), writes the distance of every observation of
  //fData to that hyperplane into ndist (fN entries) and
  //returns number of observations on the hyperplane (distance below 1e-14)
  //
  //NOTE(review): this relies on the eigenvectors being ordered so that the
  //last column belongs to the smallest eigenvalue - confirm against the
  //TMatrixDSymEigen documentation.
  Int_t i, j;

  TMatrixDSymEigen eigen(fCovariance);
  TMatrixD eigenMatrix=eigen.GetEigenVectors();

  for (j=0; j<fNvar; j++) {
    fHyperplane[j]=eigenMatrix(j,fNvar-1);
  }
  //calculate and return how many observations lie on the hyperplane
  Int_t nhyp=0;
  for (i=0; i<fN; i++) {
    //the distance is the absolute value of the complete dot product, so
    //the absolute value must be taken only after all terms are summed
    Double_t proj=0;
    for(j=0; j<fNvar; j++)
      proj+=fHyperplane[j]*(fData[i][j]-fMean[j]);
    ndist[i]=TMath::Abs(proj);
    if(ndist[i] < 1e-14) nhyp++;
  }
  return nhyp;

}

//____________________________________________________________________________
 Int_t TRobustEstimator::Exact2(TMatrixD &mstockbig, TMatrixD &cstockbig, TMatrixD &hyperplane,
			     Double_t *deti, Int_t nbest, Int_t kgroup,
			     TMatrixD &sscp, Double_t *ndist)
{
  //Called when the covariance matrix of a subset has a (near) zero determinant.
  //
  //Exact() determines the hyperplane and how many observations lie on it.
  //- at least fH observations on it: mean and covariance of exactly those
  //  observations are computed, fExact is set to their number and nbest+1
  //  is returned as a "stop" signal
  //- fewer: the current mean, covariance and hyperplane are stored in
  //  mstockbig, cstockbig and hyperplane, in the slot of subgroup kgroup that
  //  so far held the largest determinant; the slot number is returned

  Int_t row, col;

  TVectorD point(fNvar);
  Int_t worst = TMath::LocMax(nbest, deti);
  Int_t nOnPlane=Exact(ndist);
  //now nOnPlane is the number of observations on the hyperplane
  //ndist stores distances of observations from this hyperplane

  if (nOnPlane<fH) {
    const Int_t slot = nbest*kgroup+worst;
    for (row=0; row<fNvar; row++) {
      mstockbig(slot,row)=fMean(row);
      hyperplane(slot,row)=fHyperplane(row);
      for (col=0; col<fNvar; col++)
        cstockbig(row,slot*fNvar+col)=fCovariance(row,col);
    }
    return worst;
  }

  //enough observations on the hyperplane: estimate from them alone
  ClearSscp(sscp);
  for (row=0; row<fN; row++) {
    if (ndist[row]>=1e-14)
      continue;
    for (col=0; col<fNvar; col++)
      point[col]=fData[row][col];
    AddToSscp(sscp, point);
  }
  Covar(sscp, fMean, fCovariance, fSd, nOnPlane);

  fExact=nOnPlane;
  return nbest+1;
}


//_____________________________________________________________________________
 Int_t TRobustEstimator::Partition(Int_t nmini, Int_t *indsubdat)
{
  //Divides the fN observations into approximately equal subgroups.
  //
  //  nmini     - nominal subgroup size
  //  indsubdat - output; receives the number of elements of each subgroup
  //              (at least 5 entries must be available)
  //
  //Returns the number of subgroups:
  //  2*nmini <= fN <= 3*nmini-1  ->  2 subgroups
  //  3*nmini <= fN <= 4*nmini-1  ->  3 subgroups
  //  4*nmini <= fN <= 5*nmini-1  ->  4 subgroups
  //  any other fN                ->  5 subgroups of exactly nmini elements
  //
  //For 2 to 4 subgroups the sizes add up to fN: every subgroup gets
  //fN/nsub elements and the fN%nsub remaining ones are handed out, one
  //each, to subgroups 1, 2, ... (subgroup 0 never gets an extra element).
  //
  //Note: the upper bound of the 3-subgroup range is inclusive. Testing it
  //with '<' sent fN == 4*nmini-1 to the 5-subgroup branch, which requests
  //5*nmini > fN observations.

  Int_t nsub;
  if ((fN>=2*nmini) && (fN<=(3*nmini-1))) {
    nsub=2;
  } else if ((fN>=3*nmini) && (fN<=(4*nmini-1))) {
    nsub=3;
  } else if ((fN>=4*nmini) && (fN<=(5*nmini-1))) {
    nsub=4;
  } else {
    for (Int_t i=0; i<5; i++)
      indsubdat[i]=nmini;
    return 5;
  }

  const Int_t base  = fN/nsub;   //size every subgroup gets
  const Int_t extra = fN%nsub;   //leftover elements, always < nsub
  for (Int_t g=0; g<nsub; g++) {
    indsubdat[g] = base;
    if (g>=1 && g<=extra) indsubdat[g]++;
  }
  return nsub;
}

//___________________________________________________________________________
 Int_t TRobustEstimator::RDist(TMatrixD &sscp)
{
  //Computes robust distances and the reweighted location/scatter estimates.
  //
  //Steps:
  // 1. fRd[i] = (x_i - fMean)^T * fCovariance^-1 * (x_i - fMean) for all i.
  // 2. fCovariance is multiplied by median(fRd)/kChiMedian[fNvar-1], its
  //    inverse is recomputed and the distances fRd are evaluated again.
  // 3. Observations with fRd[i] above the cutoff kChiQuant[fNvar-1] get
  //    weight 0; all others are accumulated into sscp, from which the new
  //    fMean, fCovariance and fSd are obtained via Covar().
  //
  //sscp is used as scratch space and is overwritten.
  //The function returns the number of outliers (weight-0 observations).

  Int_t obs, var;
  Int_t nout=0;
  TVectorD temp(fNvar);

  //first pass: distances with respect to the raw covariance estimate
  TDecompChol cholRaw(fCovariance);
  fInvcovariance = cholRaw.Invert();
  for (obs=0; obs<fN; obs++) {
    Double_t &dist = fRd[obs];
    dist=0;
    for (var=0; var<fNvar; var++)
      temp[var]=fData[obs][var]-fMean[var];
    temp*=fInvcovariance;
    for (var=0; var<fNvar; var++)
      dist+=(fData[obs][var]-fMean[var])*temp[var];
  }

  //rescale the covariance by the ratio of the median distance to the
  //tabulated value kChiMedian[fNvar-1]
  Double_t chi = kChiMedian[fNvar-1];
  Double_t med = TMath::Median(fN, fRd.GetMatrixArray());
  med/=chi;
  fCovariance*=med;

  //second pass: distances with respect to the rescaled covariance
  TDecompChol cholScaled(fCovariance);
  fInvcovariance = cholScaled.Invert();
  for (obs=0; obs<fN; obs++) {
    Double_t &dist = fRd[obs];
    dist=0;
    for (var=0; var<fNvar; var++)
      temp[var]=fData[obs][var]-fMean[var];
    temp*=fInvcovariance;
    for (var=0; var<fNvar; var++)
      dist+=(fData[obs][var]-fMean[var])*temp[var];
  }

  //reweighting: only observations within the cutoff enter the estimates
  const Double_t cutoff = kChiQuant[fNvar-1];
  ClearSscp(sscp);
  for (obs=0; obs<fN; obs++) {
    if (fRd[obs]<=cutoff) {
      for (var=0; var<fNvar; var++)
        temp[var]=fData[obs][var];
      AddToSscp(sscp,temp);
    } else {
      nout++;
    }
  }

  Covar(sscp, fMean, fCovariance, fSd, fN-nout);
  return nout;
}

//____________________________________________________________________________
 void TRobustEstimator::RDraw(Int_t *subdat, Int_t ngroup, Int_t *indsubdat)
{
  //Draws, without replacement, the case numbers for ngroup nonoverlapping
  //subdatasets out of a dataset of size fN.
  //
  //  subdat    - output; receives indsubdat[0]+...+indsubdat[ngroup-1]
  //              distinct case numbers in the range 0..fN-1, stored in
  //              ascending order
  //  ngroup    - number of subdatasets
  //  indsubdat - size of each subdataset
  //
  //The total number of requested cases must not exceed fN.
  //Group membership is not recorded here: the result is one sorted list.
  //
  //Every draw picks a uniformly distributed rank among the cases that have
  //not been selected yet and converts it to a case number while inserting
  //it into the sorted list. All draws, including the first one, yield
  //0-based case numbers. Storing the very first draw 1-based (1..fN) while
  //all later ones were 0-based allowed the out-of-range value fN to appear.

  Int_t ndrawn = 0;   //number of case numbers already stored in subdat
  for (Int_t k=0; k<ngroup; k++) {
    for (Int_t m=0; m<indsubdat[k]; m++) {
      //0-based rank of the new case among the fN-ndrawn unselected ones
      const Int_t rank = Int_t(gRandom->Uniform(0, 1) * (fN-ndrawn));

      //every selected case number <= rank+pos shifts the new one up by
      //one, so stop at the first stored value larger than rank+pos
      Int_t pos = 0;
      while (pos<ndrawn && subdat[pos]<=rank+pos) pos++;

      //make room at pos and insert, keeping subdat sorted
      for (Int_t j=ndrawn; j>pos; j--) {
        subdat[j]=subdat[j-1];
      }
      subdat[pos]=rank+pos;
      ndrawn++;
    }
  }

}

//_____________________________________________________________________________
 Double_t TRobustEstimator::KOrdStat(Int_t ntotal, Double_t *a, Int_t k, Int_t *work){
   //Returns the k-th smallest (k counted from 0) of the first ntotal
   //elements of a. The array a itself is only read: the selection is done
   //by permuting an array of indices into a.
   //
   //  work - optional Int_t scratch array with at least ntotal entries that
   //         is used for the indices. If it is 0, a local buffer is used
   //         for ntotal <= 100 and a temporary heap array otherwise.
   //
   //This private version exists because an Int_t work array is needed.

   const Int_t kWorkMax=100;
   Int_t workLocal[kWorkMax];
   Int_t *heapBuf = 0;    //non-null only if the index array was allocated here
   Int_t *ind;

   if (work) {
      ind = work;
   } else if (ntotal > kWorkMax) {
      heapBuf = new Int_t[ntotal];
      ind = heapBuf;
   } else {
      ind = workLocal;
   }

   for (Int_t n=0; n<ntotal; n++) {
      ind[n]=n;
   }

   Int_t swp;
   Int_t lo = 0;
   Int_t hi = ntotal-1;

   //shrink the active partition [lo,hi] until it holds at most 2 elements
   while (hi > lo+1) {
      //median of left, center and right element becomes the partitioning
      //element; afterwards a[ind[lo]] <= a[ind[lo+1]] <= a[ind[hi]]
      const Int_t mid = (lo+hi) >> 1;
      swp = ind[mid]; ind[mid] = ind[lo+1]; ind[lo+1] = swp;

      if (a[ind[lo]]>a[ind[hi]]) {
         swp = ind[lo]; ind[lo] = ind[hi]; ind[hi] = swp;
      }
      if (a[ind[lo+1]]>a[ind[hi]]) {
         swp = ind[lo+1]; ind[lo+1] = ind[hi]; ind[hi] = swp;
      }
      if (a[ind[lo]]>a[ind[lo+1]]) {
         swp = ind[lo]; ind[lo] = ind[lo+1]; ind[lo+1] = swp;
      }

      //partitioning: scan inwards from both ends
      Int_t up = lo+1;
      Int_t down = hi;
      const Int_t pivot = ind[lo+1];
      for (;;) {
         do up++; while (a[ind[up]]<a[pivot]);
         do down--; while (a[ind[down]]>a[pivot]);
         if (down<up) break;  //scans crossed, partitioning complete
         swp = ind[up]; ind[up] = ind[down]; ind[down] = swp;
      }
      ind[lo+1] = ind[down];
      ind[down] = pivot;

      //keep active the partition that contains the k-th element
      if (down>=k) hi = down-1;
      if (down<=k) lo = up;
   }

   //1 or 2 elements left: order the pair if necessary
   if (hi == lo+1 && a[ind[hi]]<a[ind[lo]]) {
      swp = ind[lo]; ind[lo] = ind[hi]; ind[hi] = swp;
   }

   const Double_t result = a[ind[k]];
   delete [] heapBuf;   //no-op when no heap array was allocated
   return result;
}
ROOT page - Class index - Class Hierarchy - Top of the page
This page has been automatically generated. If you have any comments or suggestions about the page layout send a mail to ROOT support, or contact the developers with any questions or problems regarding ROOT.