// @(#)root/tmva $Id$
// Author: Simon Pfreundschuh 21/06/16

/*************************************************************************
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

#ifndef TMVA_DNN_MINIMIZERS
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h"
#include "Functions.h"
#include <chrono>
#include <iostream>

namespace TMVA {
namespace DNN {

//______________________________________________________________________________
//
// Generic Gradient Descent Class
//______________________________________________________________________________
//
/** \class TGradientDescent
*
* Generic implementation of gradient descent minimization.
*
* The TGradientDescent class provides an architecture- and input-data-
* independent implementation of the gradient descent minimization algorithm.
*
* It provides Train(...) and TrainMomentum(...) functions that perform a
* complete training of a neural network. Those are mainly used for testing,
* since for production use a more fine-grained control of the training process
* is desirable. This is provided by the Step(...), StepMomentum(...) and
* StepNesterov(...) functions that perform a single minimization step.
*
* The main training characteristics are defined by the provided learning rate,
* the test interval, and the number of convergence steps required for
* convergence. The test interval defines how often the error on the validation
* set is computed, and is the value by which the step counter is increased
* each time the HasConverged() member function is called. A convergence step
* is defined as a step in which the test error is NOT less than 0.999 times
* the current minimal test error that has been reached. If between two
* subsequent calls to HasConverged(Double_t) the test error has not been
* sufficiently reduced, it is assumed that a number of convergence steps equal
* to the test interval has been performed.
*
*/
template<typename Architecture_t>
class TGradientDescent
{
public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;

private:
   size_t   fBatchSize;        ///< Batch size to use for the training.
   size_t   fStepCount;        ///< Number of steps performed in the current
                               ///< training session.
   size_t   fConvergenceSteps; ///< Number of training epochs without considerable
                               ///< decrease in the test error for convergence.
   size_t   fConvergenceCount; ///< Current number of training epochs without
                               ///< considerable decrease in the test error.
   size_t   fTestInterval;     ///< Interval for the computation of the test error.
   Scalar_t fTrainingError;    ///< Holds the most recently computed training loss.
   Scalar_t fTestError;        ///< Holds the most recently computed test loss.
   Scalar_t fLearningRate;     ///< Learning rate \f$\alpha\f$
   Scalar_t fMinimumError;     ///< The minimum loss achieved on the test set
                               ///< during the current training session.

public:
   TGradientDescent();
   TGradientDescent(Scalar_t learningRate,
                    size_t convergenceSteps,
                    size_t testInterval);
   /** Reset minimizer object to initial state. Does nothing for this minimizer. */
   void Reset() {};

   /** Train the given net using the given training input data (events), training
       output data (labels), test input data (events), test output data (labels). */
   template <typename Data_t, typename Net_t>
   Scalar_t Train(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                  const Data_t & TestDataIn, size_t nTestSamples,
                  Net_t & net, size_t nThreads = 1);

   /** Same as Train(...) but uses the given momentum.*/
   template <typename Data_t, typename Net_t>
   Scalar_t TrainMomentum(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                          const Data_t & TestDataIn, size_t nTestSamples,
                          Net_t & net, Scalar_t momentum, size_t nThreads = 1);

   /** Perform a single optimization step on a given batch. Propagates the input
       matrix forward through the net, evaluates the loss and propagates the
       gradients backward through the net. The computed gradients are scaled by
       the learning rate \f$\alpha\f$ and subtracted from the weights and bias
       values of each layer. */
   template <typename Net_t>
   void Step(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /** Same as Step(...) but also evaluates the loss on the given training data.
    * Note that this requires synchronization between host and device. */
   template <typename Net_t>
   Scalar_t StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /** Perform multiple optimization steps simultaneously. Performs the
    * backprop algorithm on the input batches given in \p batches on
    * the neural networks given in \p nets. The forward and backward propagation
    * steps are executed in an interleaving manner in order to exploit potential
    * batch-level parallelism for asynchronous device calls.
    */
   template <typename Net_t>
   void Step(Net_t &master,
             std::vector<Net_t> &nets,
             std::vector<TBatch<Architecture_t>> &batches);

   /** Same as the Step(...) method for multiple batches but uses momentum. */
   template <typename Net_t>
   void StepMomentum(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);

   /** Same as the Step(...) method for multiple batches but uses Nesterov
    * momentum. */
   template <typename Net_t>
   void StepNesterov(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);

   /** Does not evaluate the loss and therefore does not trigger a possible
    * synchronization with the device. Trains the weights of each layer, but
    * only the bias terms of the first layer, for compatibility with the
    * previous implementation. */
   template <typename Net_t>
   void StepReducedWeights(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /** Similar to StepReducedWeights(...) but also evaluates the loss. May trigger
    * synchronization with the device. */
   template <typename Net_t>
   Scalar_t StepReducedWeightsLoss(Net_t &net,
                                   Matrix_t &input,
                                   const Matrix_t &output);

   /** Increases the minimization step counter by the test error evaluation
    * period and uses the current internal value of the test error to
    * determine if the minimization has converged. */
   bool HasConverged();

   /** Increases the minimization step counter by the test error evaluation
    * period and uses the provided test error value to determine if
    * the minimization has converged. */
   bool HasConverged(Scalar_t testError);

   size_t   GetConvergenceCount() const {return fConvergenceCount;}
   size_t   GetConvergenceSteps() const {return fConvergenceSteps;}
   Scalar_t GetTrainingError() const {return fTrainingError;}
   Scalar_t GetTestError() const {return fTestError;}
   size_t   GetTestInterval() const {return fTestInterval;}

   void SetConvergenceSteps(size_t steps) {fConvergenceSteps = steps;}
   void SetTestInterval(size_t interval) {fTestInterval = interval;}
   void SetLearningRate(Scalar_t rate) {fLearningRate = rate;}
   void SetBatchSize(size_t batchSize) {fBatchSize = batchSize;}
};
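
// Usage sketch (illustrative only): the snippet below shows how the minimizer
// is typically driven. TReference is the reference architecture backend from
// the TMVA/DNN architecture headers; the network type `Net_t` and its setup
// are assumptions standing in for any class providing the interface used by
// Train(...).
//
//    using Architecture_t = TReference<Double_t>;
//
//    TGradientDescent<Architecture_t> minimizer(0.001, // learning rate
//                                               10,    // convergence steps
//                                               5);    // test interval
//    Net_t net = ...;  // configured and initialized network
//    minimizer.Train(trainingData, nTrainingSamples,
//                    testData, nTestSamples, net, /*nThreads=*/1);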

//
// Implementation
//______________________________________________________________________________
template<typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent()
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0),
     fConvergenceCount(0), fTestInterval(0), fLearningRate(0),
     fMinimumError(1e100)
{
   // Nothing to do here.
}

//______________________________________________________________________________
template<typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate,
                                                   size_t convergenceSteps,
                                                   size_t testInterval)
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps),
     fConvergenceCount(0), fTestInterval(testInterval), fLearningRate(learningRate),
     fMinimumError(1e100)
{
   // Nothing to do here.
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData,
                                             size_t nTrainingSamples,
                                             const Data_t & testData,
                                             size_t nTestSamples,
                                             Net_t & net,
                                             size_t nThreads)
    -> Scalar_t
{
   // Reset iteration state.
   fMinimumError = 1e100;
   fConvergenceCount = 0;
   fStepCount = 0;

   // Prepare training data.
   bool converged = false;

   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(nTestSamples);
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   std::chrono::time_point<std::chrono::system_clock> start, end;
   start = std::chrono::system_clock::now();

   while (!converged)
   {
      fStepCount++;

      trainLoader.Shuffle();
      std::vector<TBatch<Architecture_t>> batches{};
      batches.reserve(nThreads);
      for (size_t i = 0; i < nTrainingSamples / net.GetBatchSize(); i += nThreads) {
         batches.clear();
         for (size_t j = 0; j < nThreads; j++) {
            batches.push_back(trainLoader.GetBatch());
         }
         Step(net, nets, batches);
      }

      // Compute test error.
      if ((fStepCount % fTestInterval) == 0) {

         end = std::chrono::system_clock::now();
         std::chrono::duration<double> elapsed_seconds = end - start;
         start = std::chrono::system_clock::now();
         double seconds = elapsed_seconds.count();
         double batchesInEpoch = (double) (nTrainingSamples / net.GetBatchSize());
         double nFlops = batchesInEpoch * fTestInterval;
         nFlops *= net.GetNFlops();
         std::cout << "Elapsed time for " << fTestInterval << " Epochs: "
                   << seconds << " [s] => " << nFlops * 1e-9 / seconds
                   << " GFlop/s" << std::endl;

         auto b = *testLoader.begin();
         auto inputMatrix = b.GetInput();
         auto outputMatrix = b.GetOutput();
         fTestError = testNet.Loss(inputMatrix, outputMatrix);

         std::cout << "Step " << fStepCount << ": Test Error = "
                   << fTestError << std::endl;
         converged = HasConverged();
      }
   }
   return fMinimumError;
}
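
// Note on the throughput report in Train(...): nFlops estimates the number of
// floating point operations executed since the last report, i.e. (batches per
// epoch) x (test interval) x net.GetNFlops(), so nFlops * 1e-9 / seconds is
// GFlop/s. With purely illustrative numbers: 100 batches per epoch, a test
// interval of 5 and 2e6 flops per step give 1e9 flops; if those epochs took
// 0.5 s, the report reads 2 GFlop/s.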

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t & trainingData,
                                                     size_t nTrainingSamples,
                                                     const Data_t & testData,
                                                     size_t nTestSamples,
                                                     Net_t & net,
                                                     Scalar_t momentum,
                                                     size_t nThreads)
    -> Scalar_t
{
   // Reset iteration state.
   fMinimumError = 1e100;
   fConvergenceCount = 0;
   fStepCount = 0;

   // Prepare training data.
   bool converged = false;

   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(net.GetBatchSize());
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   net.InitializeGradients();
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   std::chrono::time_point<std::chrono::system_clock> start, end;
   start = std::chrono::system_clock::now();

   while (!converged)
   {
      fStepCount++;

      trainLoader.Shuffle();
      // Iterate over epoch.
      std::vector<TBatch<Architecture_t>> batches{};
      batches.reserve(nThreads);
      for (size_t i = 0; i < nTrainingSamples / net.GetBatchSize(); i += nThreads) {
         batches.clear();
         for (size_t j = 0; j < nThreads; j++) {
            batches.push_back(trainLoader.GetBatch());
         }
         if (momentum != 0.0) {
            StepMomentum(net, nets, batches, momentum);
         } else {
            Step(net, nets, batches);
         }
      }

      // Compute test error.
      if ((fStepCount % fTestInterval) == 0) {
         fTestError = 0.0;
         for (size_t i = 0; i < nTestSamples / net.GetBatchSize(); i += nThreads) {
            auto b = testLoader.GetBatch();
            auto inputMatrix = b.GetInput();
            auto outputMatrix = b.GetOutput();
            fTestError += testNet.Loss(inputMatrix, outputMatrix);
         }
         fTestError /= (Double_t) nTestSamples / net.GetBatchSize();
         converged = HasConverged();
      }
   }
   return fMinimumError;
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t & net,
                                                   Matrix_t &input,
                                                   const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
}
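
// Fine-grained training control, as mentioned in the class description, can
// be built from Step(...) and HasConverged(...) directly. A minimal sketch,
// assuming `loader` is a TDataLoader over the training data and
// `testInput`/`testOutput` hold the test sample:
//
//    minimizer.Reset();
//    bool converged = false;
//    while (!converged) {
//       for (auto batch : loader) {   // one epoch over the training data
//          minimizer.Step(net, batch.GetInput(), batch.GetOutput());
//       }
//       converged = minimizer.HasConverged(net.Loss(testInput, testOutput));
//    }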

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t & net,
                                                       Matrix_t &input,
                                                       const Matrix_t &output)
    -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output);
   fTrainingError = loss;
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
   return loss;
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(
          nets[j].GetLayer(depth-1).GetActivationGradients(),
          nets[j].GetLossFunction(),
          batches[j].GetOutput(),
          nets[j].GetLayer(depth-1).GetOutput());
   }
   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   for (size_t j = 0; j < nets.size(); j++) {
      for (size_t i = 0; i < depth; i++)
      {
         auto &masterLayer = master.GetLayer(i);
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                                  layer.GetWeightGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepMomentum(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches,
    Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(
          nets[j].GetLayer(depth-1).GetActivationGradients(),
          nets[j].GetLossFunction(),
          batches[j].GetOutput(),
          nets[j].GetLayer(depth-1).GetOutput());
   }
   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                                  nets[j].GetLayer(i).GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                                  nets[j].GetLayer(i).GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                               master.GetLayer(i).GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                               master.GetLayer(i).GetBiasGradients(),
                               momentum - 1.0);
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                               nets[j].GetLayer(0).GetWeightGradients(),
                               - fLearningRate / momentum);
      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                               nets[j].GetLayer(0).GetBiasGradients(),
                               - fLearningRate / momentum);
   }

   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                            master.GetLayer(0).GetWeightGradients(),
                            momentum - 1.0);
   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                            master.GetLayer(0).GetBiasGradients(),
                            momentum - 1.0);

   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}
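
// Net effect of the gradient bookkeeping in StepMomentum(...), per layer:
// with G the gradient buffer of the master net, g_j the fresh gradients of
// worker net j and alpha the learning rate, the first ScaleAdd pass forms
// G - (alpha / momentum) * sum_j g_j and the second scales the result by
// momentum (adding the buffer to itself with factor momentum - 1), giving
//
//    G <- momentum * G - alpha * sum_j g_j
//
// i.e. the usual momentum accumulation. The final loop applies W <- W + G to
// the master net and copies the updated weights back into the workers.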

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepNesterov(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches,
    Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(
          nets[j].GetLayer(depth-1).GetActivationGradients(),
          nets[j].GetLossFunction(),
          batches[j].GetOutput(),
          nets[j].GetLayer(depth-1).GetOutput());
   }

   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }

   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
         Architecture_t::ScaleAdd(layer.GetWeights(),
                                  masterLayer.GetWeightGradients(),
                                  1.0);
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  masterLayer.GetBiasGradients(),
                                  1.0);
      }
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                                  layer.GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                                  layer.GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                               masterLayer.GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                               masterLayer.GetBiasGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
   }
}
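
// StepNesterov(...) differs from StepMomentum(...) in the order of updates:
// the worker nets are first moved to the look-ahead point (master weights
// plus the old gradient buffer), so the next forward/backward pass evaluates
// gradients there; only then is the buffer refreshed to
// G <- momentum * G - alpha * sum_j g_j and added to the master weights.
// Evaluating gradients at the look-ahead point is what distinguishes Nesterov
// from plain momentum.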

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepReducedWeights(
    Net_t & net,
    Matrix_t &input,
    const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
auto TGradientDescent<Architecture_t>::StepReducedWeightsLoss(
    Net_t & net,
    Matrix_t &input,
    const Matrix_t &output)
    -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output);
   fTrainingError = loss;
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
   return loss;
}

//______________________________________________________________________________
template<typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged()
{
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount++;
   }

   return (fConvergenceCount >= fConvergenceSteps);
}

//______________________________________________________________________________
template<typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
{
   fTestError = testError;
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount += fTestInterval;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}
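
// Convergence accounting, illustrated: with a test interval of 5 and 10
// required convergence steps, every call to HasConverged(testError) that does
// not improve on 0.999 * fMinimumError adds 5 to fConvergenceCount, so two
// consecutive evaluations without sufficient improvement (2 x 5 >= 10) report
// convergence, while any sufficient improvement resets the count to zero.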
} // namespace DNN
} // namespace TMVA

#endif