11#ifndef TMVA_DNN_MINIMIZERS 
   12#define TMVA_DNN_MINIMIZERS 
   53template<
typename Architecture_t>
 
   57   using Scalar_t = 
typename Architecture_t::Scalar_t;
 
   58   using Matrix_t = 
typename Architecture_t::Matrix_t;
 
   77                    size_t   convergenceSteps,
 
   90   template <
typename Data_t, 
typename Net_t>
 
   92                  const Data_t & TestDataIn, 
size_t nTestSamples,
 
   93                  Net_t & net, 
size_t nThreads = 1);
 
   96   template <
typename Data_t, 
typename Net_t>
 
   98                          const Data_t & TestDataIn, 
size_t nTestSamples,
 
   99                          Net_t & net, 
Scalar_t momentum, 
size_t nThreads = 1);
 
  106   template <
typename Net_t>
 
  111   template <
typename Net_t>
 
  120   template <
typename Net_t>
 
  121   void Step(Net_t &master,
 
  122             std::vector<Net_t> &nets,
 
  126   template <
typename Net_t>
 
  128                     std::vector<Net_t> &nets,
 
  131   template <
typename Net_t>
 
  136                     std::vector<Net_t> &nets,
 
  143   template <
typename Net_t>
 
  148   template <
typename Net_t>
 
  174template <
typename Architecture_t>
 
  176   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0), fTestInterval(0), fLearningRate(0),
 
  177     fMinimumError(std::numeric_limits<
Scalar_t>::infinity())
 
  183template <
typename Architecture_t>
 
  185   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
 
  186     fTestInterval(testInterval), fLearningRate(learningRate), fMinimumError(std::numeric_limits<
Scalar_t>::infinity())
 
  192template<
typename Architecture_t>
 
  193template <
typename Data_t, 
typename Net_t>
 
  195                                                 size_t nTrainingSamples,
 
  196                                                 const Data_t & testData,
 
  208                                                   net.GetOutputWidth(), nThreads);
 
  209   auto testNet = net.CreateClone(nTestSamples);
 
  211                                                  testNet.GetBatchSize(),
 
  212                                                  testNet.GetInputWidth(),
 
  213                                                  net.GetOutputWidth());
 
  214   std::vector<Net_t> nets{};
 
  215   nets.reserve(nThreads);
 
  216   for (
size_t i = 0; i < nThreads; i++) {
 
  218       for (
size_t j = 0; j < net.GetDepth(); j++)
 
  220           auto &masterLayer = net.GetLayer(j);
 
  221           auto &layer = nets.back().GetLayer(j);
 
  223                                masterLayer.GetWeights());
 
  225                                masterLayer.GetBiases());
 
  229   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
 
  230   std::vector<TBatch<Architecture_t>> batches{};
 
  231   batches.reserve(nThreads);
 
  234      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
 
  236         for (
size_t i = 0; i < batchesInEpoch; i += nThreads) {
 
  238            for (
size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.
GetBatch());
 
  239            Step(net, nets, batches);
 
  243      auto b = *testLoader.
begin();
 
  244      auto inputMatrix = 
b.GetInput();
 
  245      auto outputMatrix = 
b.GetOutput();
 
  246      auto weightMatrix = 
b.GetWeights();
 
  247      fTestError = testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
 
  249   } 
while (!HasConverged());
 
  251   return fMinimumError;
 
  255template<
typename Architecture_t>
 
  256template <
typename Data_t, 
typename Net_t>
 
  258                                                     size_t nTrainingSamples,
 
  259                                                     const Data_t & testData,
 
  272                                                   net.GetOutputWidth(), nThreads);
 
  273   auto testNet = net.CreateClone(net.GetBatchSize());
 
  275                                                  testNet.GetBatchSize(),
 
  276                                                  testNet.GetInputWidth(),
 
  277                                                  net.GetOutputWidth());
 
  279   net.InitializeGradients();
 
  280   std::vector<Net_t> nets{};
 
  281   nets.reserve(nThreads);
 
  282   for (
size_t i = 0; i < nThreads; i++) {
 
  284       for (
size_t j = 0; j < net.GetDepth(); j++)
 
  286           auto &masterLayer = net.GetLayer(j);
 
  287           auto &layer = nets.back().GetLayer(j);
 
  289                                masterLayer.GetWeights());
 
  291                                masterLayer.GetBiases());
 
  295   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
 
  296   std::vector<TBatch<Architecture_t>> batches{};
 
  297   batches.reserve(nThreads);
 
  300      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
 
  302         for (
size_t i = 0; i < batchesInEpoch; i += nThreads) {
 
  304            for (
size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.
GetBatch());
 
  305            if (momentum != 0.0) {
 
  306               StepMomentum(net, nets, batches, momentum);
 
  308               Step(net, nets, batches);
 
  314      for (
size_t i = 0; i < batchesInEpoch; i++) {
 
  316         auto inputMatrix = 
b.GetInput();
 
  317         auto outputMatrix = 
b.GetOutput();
 
  318         auto weightMatrix = 
b.GetWeights();
 
  319         fTestError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
 
  321      fTestError /= (
Double_t)batchesInEpoch;
 
  322   } 
while (!HasConverged());
 
  323   return fMinimumError;
 
  327template <
typename Architecture_t>
 
  328template <
typename Net_t>
 
  332   net.Forward(input, 
true);
 
  333   net.Backward(input, 
output, weights);
 
  335   for (
size_t i = 0; i < net.GetDepth(); i++)
 
  337      auto &layer = net.GetLayer(i);
 
  338      Architecture_t::ScaleAdd(layer.GetWeights(),
 
  339                               layer.GetWeightGradients(),
 
  341      Architecture_t::ScaleAdd(layer.GetBiases(),
 
  342                               layer.GetBiasGradients(),
 
  348template <
typename Architecture_t>
 
  349template <
typename Net_t>
 
  354   net.Backward(input, 
output);
 
  356   for (
size_t i = 0; i < net.GetDepth(); i++)
 
  358      auto &layer = net.GetLayer(i);
 
  359      Architecture_t::ScaleAdd(layer.GetWeights(),
 
  360                               layer.GetWeightGradients(),
 
  362      Architecture_t::ScaleAdd(layer.GetBiases(),
 
  363                               layer.GetBiasGradients(),
 
  370template<
typename Architecture_t>
 
  371    template <
typename Net_t>
 
  374        std::vector<Net_t> & nets,
 
  377   typename Architecture_t::Matrix_t 
dummy(0,0);
 
  378   size_t depth = master.GetDepth();
 
  381   for (
size_t j = 0; j < nets.size(); j++) {
 
  382      nets[j].GetLayer(0).Forward(batches[j].GetInput(), 
true);
 
  385   for (
size_t i = 1; i < depth; i++)
 
  387      for (
size_t j = 0; j < nets.size(); j++) {
 
  388         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), 
true);
 
  392   for (
size_t j = 0; j < nets.size(); j++) {
 
  393      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
 
  394                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
 
  395                                        batches[j].GetWeights());
 
  398   for (
size_t i = depth - 1; i > 0; i--)
 
  400      for (
size_t j = 0; j < nets.size(); j++) {
 
  401         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
 
  402                                      nets[j].GetLayer(i-1).GetOutput(),
 
  403                                      nets[j].GetRegularization(),
 
  404                                      nets[j].GetWeightDecay());
 
  407   for (
size_t j = 0; j < nets.size(); j++) {
 
  408      nets[j].GetLayer(0).Backward(
dummy,
 
  409                                   batches[j].GetInput(),
 
  410                                   nets[j].GetRegularization(),
 
  411                                   nets[j].GetWeightDecay());
 
  414   for (
size_t j = 0; j < nets.size(); j++) {
 
  415      for (
size_t i = 0; i < depth; i++)
 
  417         auto &masterLayer = master.GetLayer(i);
 
  418         auto &layer       = nets[j].GetLayer(i);
 
  419         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
 
  420                                  layer.GetWeightGradients(),
 
  423                              masterLayer.GetWeights());
 
  424         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
 
  425                                  layer.GetBiasGradients(),
 
  428                              masterLayer.GetBiases());
 
  434template<
typename Architecture_t>
 
  435template <
typename Net_t>
 
  438        std::vector<Net_t> & nets,
 
  442   typename Architecture_t::Matrix_t 
dummy(0,0);
 
  443   size_t depth = master.GetDepth();
 
  446   for (
size_t j = 0; j < nets.size(); j++) {
 
  447      nets[j].GetLayer(0).Forward(batches[j].GetInput(), 
true);
 
  450   for (
size_t i = 1; i < depth; i++)
 
  452      for (
size_t j = 0; j < nets.size(); j++) {
 
  453         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), 
true);
 
  457   for (
size_t j = 0; j < nets.size(); j++) {
 
  458      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
 
  459                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
 
  460                                        batches[j].GetWeights());
 
  463   for (
size_t i = depth - 1; i > 0; i--)
 
  465      for (
size_t j = 0; j < nets.size(); j++) {
 
  466         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
 
  467                                      nets[j].GetLayer(i-1).GetOutput(),
 
  468                                      nets[j].GetRegularization(),
 
  469                                      nets[j].GetWeightDecay());
 
  470         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
 
  471                                  nets[j].GetLayer(i).GetWeightGradients(),
 
  472                                  - fLearningRate / momentum);
 
  473         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
 
  474                                  nets[j].GetLayer(i).GetBiasGradients(),
 
  475                                  - fLearningRate / momentum);
 
  477      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
 
  478                               master.GetLayer(i).GetWeightGradients(),
 
  480      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
 
  481                               master.GetLayer(i).GetBiasGradients(),
 
  484   for (
size_t j = 0; j < nets.size(); j++) {
 
  485      nets[j].GetLayer(0).Backward(
dummy,
 
  486                                   batches[j].GetInput(),
 
  487                                   nets[j].GetRegularization(),
 
  488                                   nets[j].GetWeightDecay());
 
  489      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
 
  490                               nets[j].GetLayer(0).GetWeightGradients(),
 
  491                               - fLearningRate / momentum);
 
  492      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
 
  493                               nets[j].GetLayer(0).GetBiasGradients(),
 
  494                               - fLearningRate / momentum);
 
  497   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
 
  498                            master.GetLayer(0).GetWeightGradients(),
 
  500   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
 
  501                            master.GetLayer(0).GetBiasGradients(),
 
  504   for (
size_t i = 0; i < depth; i++)
 
  506       auto &masterLayer = master.GetLayer(i);
 
  507       Architecture_t::ScaleAdd(masterLayer.GetWeights(),
 
  508                                masterLayer.GetWeightGradients(),
 
  510       Architecture_t::ScaleAdd(masterLayer.GetBiases(),
 
  511                                masterLayer.GetBiasGradients(),
 
  513       for (
size_t j = 0; j < nets.size(); j++) {
 
  514         auto &layer       = nets[j].GetLayer(i);
 
  516                              masterLayer.GetWeights());
 
  518                              masterLayer.GetBiases());
 
  524template<
typename Architecture_t>
 
  525template <
typename Net_t>
 
  528        std::vector<Net_t> & nets,
 
  532   typename Architecture_t::Matrix_t 
dummy(0,0);
 
  533   size_t depth = master.GetDepth();
 
  536   for (
size_t j = 0; j < nets.size(); j++) {
 
  537      nets[j].GetLayer(0).Forward(batches[j].GetInput(), 
true);
 
  540   for (
size_t i = 1; i < depth; i++)
 
  542      for (
size_t j = 0; j < nets.size(); j++) {
 
  543         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), 
true);
 
  548   for (
size_t j = 0; j < nets.size(); j++) {
 
  549      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
 
  550                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
 
  551                                        batches[j].GetWeights());
 
  555   for (
size_t i = depth - 1; i > 0; i--)
 
  557      for (
size_t j = 0; j < nets.size(); j++) {
 
  558         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
 
  559                                      nets[j].GetLayer(i-1).GetOutput(),
 
  560                                      nets[j].GetRegularization(),
 
  561                                      nets[j].GetWeightDecay());
 
  565   for (
size_t j = 0; j < nets.size(); j++) {
 
  566      nets[j].GetLayer(0).Backward(
dummy,
 
  567                                   batches[j].GetInput(),
 
  568                                   nets[j].GetRegularization(),
 
  569                                   nets[j].GetWeightDecay());
 
  572   for (
size_t i = 0; i < depth; i++)
 
  574      auto &masterLayer = master.GetLayer(i);
 
  575      for (
size_t j = 0; j < nets.size(); j++) {
 
  576         auto &layer       = nets[j].GetLayer(i);
 
  578                              masterLayer.GetWeights());
 
  580                              masterLayer.GetBiases());
 
  581         Architecture_t::ScaleAdd(layer.GetWeights(),
 
  582                                  masterLayer.GetWeightGradients(),
 
  584         Architecture_t::ScaleAdd(layer.GetBiases(),
 
  585                                  masterLayer.GetBiasGradients(),
 
  588      for (
size_t j = 0; j < nets.size(); j++) {
 
  589         auto &layer       = nets[j].GetLayer(i);
 
  590         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
 
  591                                  layer.GetWeightGradients(),
 
  592                                  - fLearningRate / momentum);
 
  593         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
 
  594                                  layer.GetBiasGradients(),
 
  595                                  - fLearningRate / momentum);
 
  597      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
 
  598                               masterLayer.GetWeightGradients(),
 
  600      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
 
  601                               masterLayer.GetBiasGradients(),
 
  603      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
 
  604                               masterLayer.GetWeightGradients(),
 
  606      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
 
  607                               masterLayer.GetBiasGradients(),
 
  613template<
typename Architecture_t>
 
  614template <
typename Net_t>
 
  620   net.Forward(input, 
true);
 
  621   net.Backward(input, 
output);
 
  623   for (
size_t i = 0; i < net.GetDepth(); i++)
 
  625      auto &layer = net.GetLayer(i);
 
  626      Architecture_t::ScaleAdd(layer.GetWeights(),
 
  627                               layer.GetWeightGradients(),
 
  630         Architecture_t::ScaleAdd(layer.GetBiases(),
 
  631                                  layer.GetBiasGradients(),
 
  638template <
typename Architecture_t>
 
  639template <
typename Net_t>
 
  645   fTrainingError = loss;
 
  646   net.Backward(input, 
output, weights);
 
  648   for (
size_t i = 0; i < net.GetDepth(); i++)
 
  650      auto &layer = net.GetLayer(i);
 
  651      Architecture_t::ScaleAdd(layer.GetWeights(),
 
  652                               layer.GetWeightGradients(),
 
  655         Architecture_t::ScaleAdd(layer.GetBiases(),
 
  656                                  layer.GetBiasGradients(),
 
  664template<
typename Architecture_t>
 
  667   if (fTestError < fMinimumError * 0.999) {
 
  668      fConvergenceCount = 0;
 
  669      fMinimumError     = fTestError;
 
  674   return (fConvergenceCount >= fConvergenceSteps);
 
  678template<
typename Architecture_t>
 
  681   fTestError = testError;
 
  682   if (fTestError < fMinimumError * 0.999) {
 
  683      fConvergenceCount = 0;
 
  684      fMinimumError     = fTestError;
 
  686      fConvergenceCount += fTestInterval;
 
  688   return (fConvergenceCount >= fConvergenceSteps);
 
static RooMathCoreReg dummy
TBatch< AArchitecture > GetBatch()
Return the next batch from the training set.
void Shuffle()
Shuffle the order of the samples in the batch.
size_t fConvergenceCount
Current number of training epochs without considerable decrease of the test error.
bool HasConverged()
Increases the minimization step counter by the test error evaluation period and uses the current internal value of the test error to check for convergence.
Scalar_t fTrainingError
Holds the most recently computed training loss.
void Step(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights)
Perform a single optimization step on a given batch.
size_t fConvergenceSteps
Number of training epochs without considerable decrease of the test error required before the minimization is considered converged.
Scalar_t StepReducedWeightsLoss(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights)
Similar to StepReducedWeights(...) but also evaluates the loss.
Scalar_t StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights)
Same as Step(...) but also evaluate the loss on the given training data.
void Reset()
Reset minimizer object to default state.
Scalar_t GetTrainingError() const
size_t fTestInterval
Interval for the computation of the test error.
void StepNesterov(Net_t &master, std::vector< Net_t > &nets, std::vector< TBatch< Architecture_t > > &batches, Scalar_t momentum)
Same as the Step(...) method for multiple batches but uses Nesterov momentum.
void SetBatchSize(Scalar_t rate)
Scalar_t fTestError
Holds the most recently computed test loss.
typename Architecture_t::Matrix_t Matrix_t
size_t GetTestInterval() const
size_t fStepCount
Number of steps performed in the current training session.
void StepMomentum(Net_t &master, std::vector< Net_t > &nets, std::vector< TBatch< Architecture_t > > &batches, Scalar_t momentum)
Same as the Step(...) method for multiple batches but uses momentum.
void SetConvergenceSteps(size_t steps)
Scalar_t TrainMomentum(const Data_t &TrainingDataIn, size_t nTrainingSamples, const Data_t &TestDataIn, size_t nTestSamples, Net_t &net, Scalar_t momentum, size_t nThreads=1)
Same as Train(...) but uses the given momentum.
size_t GetConvergenceCount() const
Scalar_t fMinimumError
The minimum loss achieved on the test set so far (updated by HasConverged() whenever fTestError improves on it).
void SetLearningRate(Scalar_t rate)
Scalar_t fLearningRate
Learning rate used for the gradient-descent weight updates.
size_t fBatchSize
Batch size to use for the training.
size_t GetConvergenceSteps() const
Scalar_t Train(const Data_t &TrainingDataIn, size_t nTrainingSamples, const Data_t &TestDataIn, size_t nTestSamples, Net_t &net, size_t nThreads=1)
Train the given net using the given training input data (events), training output data (labels),...
void SetTestInterval(size_t interval)
void StepReducedWeights(Net_t &net, Matrix_t &input, const Matrix_t &output)
Does not evaluate the loss and therefore does not trigger a possible synchronization with the device.
Scalar_t GetTestError() const
typename Architecture_t::Scalar_t Scalar_t
void Copy(void *source, void *dest)
void Step(const gsl_rng *r, void *xp, double step_size)
create variable transformations
static void output(int code)