// @(#)root/tmva $Id$
// Author: Simon Pfreundschuh 21/06/16

/*************************************************************************
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

#ifndef TMVA_DNN_MINIMIZERS
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h"
#include "Functions.h"

#include <limits>

namespace TMVA {
namespace DNN {

//______________________________________________________________________________
//
// Generic Gradient Descent Class
//______________________________________________________________________________
//
/** \class TGradientDescent
 *
 * Generic implementation of gradient descent minimization.
 *
 * The TGradientDescent class implements an architecture- and input-data-
 * independent implementation of the gradient descent minimization algorithm.
 *
 * It provides Train(...) and TrainMomentum(...) functions that perform a
 * complete training of a neural network. These are mainly used for testing,
 * since for production use a more fine-grained control of the training
 * process is desirable. That control is provided by the Step(...),
 * StepMomentum(...) and StepNesterov(...) functions, which perform a single
 * minimization step each.
 *
 * The main training characteristics are defined by the provided learning
 * rate, the test interval, and the number of convergence steps required for
 * convergence. The test interval defines how often the error on the
 * validation set is computed, and thus the value by which the step counter
 * is increased each time the HasConverged() member function is called. A
 * convergence step is defined as a step in which the test error is NOT less
 * than 0.999 times the current minimal test error that has been reached. If
 * between two subsequent calls to HasConverged(Double_t) the test error has
 * not been sufficiently reduced, it is assumed that a number of convergence
 * steps equal to the test interval has been performed.
 */
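
// A minimal usage sketch (illustrative only; `MyArchitecture_t`, `net`, and
// the data objects stand in for a concrete TMVA::DNN architecture, network,
// and data set):
//
//    TGradientDescent<MyArchitecture_t> minimizer(0.001, // learning rate
//                                                 10,    // convergence steps
//                                                 5);    // test interval
//    auto minimumTestError = minimizer.Train(trainingData, nTrainingSamples,
//                                            testData, nTestSamples, net);
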
template <typename Architecture_t>
class TGradientDescent
{
public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;

private:
   size_t   fBatchSize;        ///< Batch size to use for the training.
   size_t   fStepCount;        ///< Number of steps performed in the current training session.
   size_t   fConvergenceSteps; ///< Number of training epochs without considerable
                               ///< decrease in the test error for convergence.
   size_t   fConvergenceCount; ///< Current number of training epochs without
                               ///< considerable decrease in the test error.
   size_t   fTestInterval;     ///< Interval for the computation of the test error.
   Scalar_t fTrainingError;    ///< Holds the most recently computed training loss.
   Scalar_t fTestError;        ///< Holds the most recently computed test loss.
   Scalar_t fLearningRate;     ///< Learning rate \f$\alpha\f$
   Scalar_t fMinimumError;     ///< The minimum test error achieved during the
                               ///< current training session.

public:
   TGradientDescent();
   TGradientDescent(Scalar_t learningRate,
                    size_t convergenceSteps,
                    size_t testInterval);

   /** Reset minimizer object to default state. */
   void Reset()
   {
      fMinimumError     = std::numeric_limits<Scalar_t>::infinity();
      fConvergenceCount = 0;
      fStepCount        = 0;
   }

   /** Train the given net using the given training input data (events),
    *  training output data (labels), test input data (events), and test
    *  output data (labels). */
   template <typename Data_t, typename Net_t>
   Scalar_t Train(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                  const Data_t & TestDataIn, size_t nTestSamples,
                  Net_t & net, size_t nThreads = 1);

   /** Same as Train(...) but uses the given momentum. */
   template <typename Data_t, typename Net_t>
   Scalar_t TrainMomentum(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                          const Data_t & TestDataIn, size_t nTestSamples,
                          Net_t & net, Scalar_t momentum, size_t nThreads = 1);

   /** Perform a single optimization step on a given batch. Propagates the
    *  input matrix forward through the net, evaluates the loss and
    *  propagates the gradients backward through the net. The computed
    *  gradients are scaled by the learning rate \f$\alpha\f$ and subtracted
    *  from the weights and bias values of each layer. */
   template <typename Net_t>
   void Step(Net_t &net, Matrix_t &input, const Matrix_t &output);
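
   // In formulas, a single step on a batch (X, Y) updates each layer l as
   //
   //    W_l <- W_l - alpha * dJ(X, Y)/dW_l
   //    b_l <- b_l - alpha * dJ(X, Y)/db_l
   //
   // where J is the loss function of the net, W_l and b_l are the weights
   // and biases of layer l, and alpha is the learning rate.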

   /** Same as Step(...) but also evaluate the loss on the given training
    *  data. Note that this requires synchronization between host and
    *  device. */
   template <typename Net_t>
   Scalar_t StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /** Perform multiple optimization steps simultaneously. Performs the
    *  backprop algorithm on the input batches given in \p batches on the
    *  neural networks given in \p nets. The forward and backward propagation
    *  steps are executed in an interleaving manner in order to exploit
    *  potential batch-level parallelism for asynchronous device calls. */
   template <typename Net_t>
   void Step(Net_t &master,
             std::vector<Net_t> &nets,
             std::vector<TBatch<Architecture_t>> &batches);
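
   // The interleaving means all nets advance through layer i before any net
   // starts layer i+1, so independent per-batch kernel calls can overlap on
   // asynchronous devices instead of one whole net running at a time (see
   // the implementation below).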

   /** Same as the Step(...) method for multiple batches but uses momentum. */
   template <typename Net_t>
   void StepMomentum(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);
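
   // With momentum mu, the gradient matrices of the master net act as a
   // velocity term G that is carried over between steps; one call performs
   // (see the implementation below)
   //
   //    G <- mu * G - alpha * sum_j dJ(X_j, Y_j)/dW
   //    W <- W + G
   //
   // where the sum runs over the batches (X_j, Y_j) in \p batches.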

   /** Same as the Step(...) method for multiple batches but uses Nesterov
    *  momentum. */
   template <typename Net_t>
   void StepNesterov(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);
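
   // The Nesterov variant uses the same velocity-style update of the master
   // net, but repositions the worker nets at the look-ahead point W + G
   // after each step, so that the next forward/backward pass evaluates the
   // gradients there rather than at W (see the implementation below).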

   /** Does not evaluate the loss and therefore does not trigger a possible
    *  synchronization with the device. Updates the weights of each layer,
    *  but only the bias terms of the first layer, for compatibility with
    *  the previous implementation. */
   template <typename Net_t>
   void StepReducedWeights(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /** Similar to StepReducedWeights(...) but also evaluates the loss. May
    *  trigger synchronization with the device. */
   template <typename Net_t>
   Scalar_t StepReducedWeightsLoss(Net_t &net,
                                   Matrix_t &input,
                                   const Matrix_t &output);

   /** Increases the minimization step counter by the test error evaluation
    *  period and uses the current internal value of the test error to
    *  determine if the minimization has converged. */
   bool HasConverged();

   /** Increases the minimization step counter by the test error evaluation
    *  period and uses the provided test error value to determine if the
    *  minimization has converged. */
   bool HasConverged(Scalar_t testError);
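
   // For example, with a test interval of 5 and 10 required convergence
   // steps, HasConverged(testError) declares convergence after two
   // consecutive evaluations in which the test error fails to drop below
   // 0.999 times the best value seen so far, since each such evaluation
   // adds the test interval of 5 to the convergence counter.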

   size_t   GetConvergenceCount() const {return fConvergenceCount;}
   size_t   GetConvergenceSteps() const {return fConvergenceSteps;}
   Scalar_t GetTrainingError()    const {return fTrainingError;}
   Scalar_t GetTestError()        const {return fTestError;}
   size_t   GetTestInterval()     const {return fTestInterval;}

   void SetConvergenceSteps(size_t steps) {fConvergenceSteps = steps;}
   void SetTestInterval(size_t interval)  {fTestInterval = interval;}
   void SetLearningRate(Scalar_t rate)    {fLearningRate = rate;}
   void SetBatchSize(size_t batchSize)    {fBatchSize = batchSize;}
};

//
// Implementation
//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent()
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0),
     fTestInterval(0), fLearningRate(0),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate, size_t convergenceSteps, size_t testInterval)
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
     fTestInterval(testInterval), fLearningRate(learningRate), fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData,
                                             size_t nTrainingSamples,
                                             const Data_t & testData,
                                             size_t nTestSamples,
                                             Net_t & net,
                                             size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(nTestSamples);
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   // One copy of the net per thread, with weights and biases synchronized
   // to the master net.
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++) {
         auto &masterLayer = net.GetLayer(j);
         auto &layer       = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),  masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      // Train for fTestInterval epochs, then evaluate the test error.
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            Step(net, nets, batches);
         }
      }

      auto b            = *testLoader.begin();
      auto inputMatrix  = b.GetInput();
      auto outputMatrix = b.GetOutput();
      fTestError = testNet.Loss(inputMatrix, outputMatrix);

   } while (!HasConverged());

   return fMinimumError;
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t & trainingData,
                                                     size_t nTrainingSamples,
                                                     const Data_t & testData,
                                                     size_t nTestSamples,
                                                     Net_t & net,
                                                     Scalar_t momentum,
                                                     size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(net.GetBatchSize());
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   net.InitializeGradients();
   // One copy of the net per thread, with weights and biases synchronized
   // to the master net.
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++) {
         auto &masterLayer = net.GetLayer(j);
         auto &layer       = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),  masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      // Train for fTestInterval epochs, then evaluate the test error.
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            if (momentum != 0.0) {
               StepMomentum(net, nets, batches, momentum);
            } else {
               Step(net, nets, batches);
            }
         }
      }

      fTestError = 0.0;
      for (size_t i = 0; i < batchesInEpoch; i++) {
         auto b            = testLoader.GetBatch();
         auto inputMatrix  = b.GetInput();
         auto outputMatrix = b.GetOutput();
         fTestError += testNet.Loss(inputMatrix, outputMatrix);
      }
      fTestError /= (Double_t) batchesInEpoch;
   } while (!HasConverged());
   return fMinimumError;
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t & net,
                                                   Matrix_t &input,
                                                   const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++) {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t & net,
                                                       Matrix_t &input,
                                                       const Matrix_t &output)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++) {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
   return loss;
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(
   Net_t & master,
   std::vector<Net_t> & nets,
   std::vector<TBatch<Architecture_t>> & batches)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(
         nets[j].GetLayer(depth-1).GetActivationGradients(),
         nets[j].GetLossFunction(),
         batches[j].GetOutput(),
         nets[j].GetLayer(depth-1).GetOutput());
   }
   // Backward
   for (size_t i = depth - 1; i > 0; i--) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   // Apply each worker's gradients to the master net and propagate the
   // updated weights back to the workers.
   for (size_t j = 0; j < nets.size(); j++) {
      for (size_t i = 0; i < depth; i++) {
         auto &masterLayer = master.GetLayer(i);
         auto &layer       = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                                  layer.GetWeightGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepMomentum(
   Net_t & master,
   std::vector<Net_t> & nets,
   std::vector<TBatch<Architecture_t>> & batches,
   Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(
         nets[j].GetLayer(depth-1).GetActivationGradients(),
         nets[j].GetLossFunction(),
         batches[j].GetOutput(),
         nets[j].GetLayer(depth-1).GetOutput());
   }
   // Backward; accumulate the velocity-style gradients in the master net.
   for (size_t i = depth - 1; i > 0; i--) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                                  nets[j].GetLayer(i).GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                                  nets[j].GetLayer(i).GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                               master.GetLayer(i).GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                               master.GetLayer(i).GetBiasGradients(),
                               momentum - 1.0);
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                               nets[j].GetLayer(0).GetWeightGradients(),
                               - fLearningRate / momentum);
      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                               nets[j].GetLayer(0).GetBiasGradients(),
                               - fLearningRate / momentum);
   }

   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                            master.GetLayer(0).GetWeightGradients(),
                            momentum - 1.0);
   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                            master.GetLayer(0).GetBiasGradients(),
                            momentum - 1.0);

   // Apply the accumulated gradients to the master net and copy the new
   // weights back into the worker nets.
   for (size_t i = 0; i < depth; i++) {
      auto &masterLayer = master.GetLayer(i);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),  masterLayer.GetBiases());
      }
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepNesterov(
   Net_t & master,
   std::vector<Net_t> & nets,
   std::vector<TBatch<Architecture_t>> & batches,
   Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(
         nets[j].GetLayer(depth-1).GetActivationGradients(),
         nets[j].GetLossFunction(),
         batches[j].GetOutput(),
         nets[j].GetLayer(depth-1).GetOutput());
   }

   // Backward
   for (size_t i = depth - 1; i > 0; i--) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }

   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   for (size_t i = 0; i < depth; i++) {
      auto &masterLayer = master.GetLayer(i);
      // Reposition the worker nets at the look-ahead point: master weights
      // plus the accumulated gradients.
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),  masterLayer.GetBiases());
         Architecture_t::ScaleAdd(layer.GetWeights(),
                                  masterLayer.GetWeightGradients(),
                                  1.0);
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  masterLayer.GetBiasGradients(),
                                  1.0);
      }
      // Update the accumulated gradients of the master net ...
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                                  layer.GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                                  layer.GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                               masterLayer.GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                               masterLayer.GetBiasGradients(),
                               momentum - 1.0);
      // ... and apply them to the master weights.
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepReducedWeights(
   Net_t & net,
   Matrix_t &input,
   const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++) {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepReducedWeightsLoss(
   Net_t & net,
   Matrix_t &input,
   const Matrix_t &output)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output);
   fTrainingError = loss;
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++) {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
   return loss;
}

//______________________________________________________________________________
template <typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged()
{
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError     = fTestError;
   } else {
      fConvergenceCount++;
   }

   return (fConvergenceCount >= fConvergenceSteps);
}

//______________________________________________________________________________
template <typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
{
   fTestError = testError;
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError     = fTestError;
   } else {
      fConvergenceCount += fTestInterval;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}

} // namespace DNN
} // namespace TMVA

#endif