Logo ROOT   6.18/05
Reference Guide
DLMinimizers.h
Go to the documentation of this file.
1// @(#)root/tmva/tmva/cnn:$Id$
2// Author: Vladimir Ilievski
3
4/**********************************************************************************
5 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6 * Package: TMVA *
7 * Class : TDLGradientDescent *
8 * Web : http://tmva.sourceforge.net *
9 * *
10 * Description: *
11 * Deep Learning Minimizers *
12 * *
13 * Authors (alphabetical): *
14 * Vladimir Ilievski <ilievski.vladimir@live.com> - CERN, Switzerland *
15 * *
16 * Copyright (c) 2005-2015: *
17 * CERN, Switzerland *
18 * U. of Victoria, Canada *
19 * MPI-K Heidelberg, Germany *
20 * U. of Bonn, Germany *
21 * *
22 * Redistribution and use in source and binary forms, with or without *
23 * modification, are permitted according to the terms listed in LICENSE *
24 * (http://tmva.sourceforge.net/LICENSE) *
25 **********************************************************************************/
26
27#ifndef TMVA_DNN_DLMINIMIZERS
28#define TMVA_DNN_DLMINIMIZERS
29
31#include "TMVA/DNN/Functions.h"
32#include "TMVA/DNN/DeepNet.h"
33
34#include <limits>
35#include <iostream>
36
37namespace TMVA {
38namespace DNN {
39
40/*** \class TDLGradientDescent
41 *
42 * Generic implementation of gradient descent minimization for the
43 * deep learning neural nets.
44 *
45 * The TDLGradientDescent class implements an architecture, input data and
46 * deep learning neural network type independent implementation of the gradient
47 * descent minimization algorithm.
48 *
49* This is provided by the Step(...), StepMomentum(...) and
50 * StepNesterov(...) functions that perform a single minimization step.
51 *
52 * The main training characteristics are defined by the provided learning rate,
53 * the test interval, and the convergence steps required for convergence. The
54 * test interval defines how often the error on the validation set is computed,
55 * and the values with which the step counter is increased each time the
56 * HasConverged() member function is called. A convergence step is defined as
57 * a step in which the test error is NOT less than 0.999 times the current
58 * minimal test error that has been reached. If between two subsequent calls
59 * to HasConverged(Double_t) the test error has not been sufficiently reduced
60 * it is assumed that a number of convergence steps equal to the test interval
61 * has been performed.
62 */
63
64template <typename Architecture_t>
66public:
68 using Scalar_t = typename Architecture_t::Scalar_t;
69 using Matrix_t = typename Architecture_t::Matrix_t;
70
71private:
72 size_t fBatchSize; ///< Batch size to use for the training.
73 size_t fStepCount; ///< Number of steps performed in the current training session
74 size_t fConvergenceSteps; ///< Number of training epochs without considerable
75 ///< decrease in the test error for convergence.
76 size_t fConvergenceCount; ///< Current number of training epochs without
77 ///< considerable decrease in the test error.
78 size_t fTestInterval; ///< Interval for the computation of the test error.
79 Scalar_t fTrainingError; ///< Holds the most recently computed training loss.
80 Scalar_t fTestError; ///< Holds the most recently computed test loss.
81 Scalar_t fLearningRate; ///< Learning rate \f$\alpha\f$
82 Scalar_t fMinimumError; ///< The minimum loss achieved on the training set
83 ///< during the current traning session.
84
85public:
87 TDLGradientDescent(Scalar_t learningRate, size_t convergenceSteps, size_t testInterval);
88
89 /** Reset minimizer object to default state. */
90 void Reset()
91 {
92 fMinimumError = std::numeric_limits<Scalar_t>::infinity();
94 fStepCount = 0;
95 };
96
97 /** Perform a single optimization step on a given batch. Propagates the input
98 matrix foward through the net, evaluates the loss and propagates the gradients
99 backward through the net. The computed gradients are scaled by the learning
100 rate \f$\alpha\f$ and subtracted from the weights and bias values of each
101 layer. */
102 void Step(DeepNet_t &deepNet, std::vector<Matrix_t> &input, const Matrix_t &output, const Matrix_t &weights);
103
104 /** Does not evaluate the loss and therefore not trigger a possible synchronization
105 * with the device. Trains the weights of each layer, but only the bias terms of
106 * the first layer for compatibility with the previous implementation. */
107 void StepReducedWeights(DeepNet_t &deepNet, std::vector<Matrix_t> &input, const Matrix_t &output,
108 const Matrix_t &weights);
109
110 /** Same as Step(...) but also evaluate the loss on the given training data.
111 * Note that this requires synchronization between host and device. */
112 Scalar_t StepLoss(DeepNet_t &deepNet, std::vector<Matrix_t> &input, const Matrix_t &output, const Matrix_t &weights);
113
114 /** Similar to StepReducedWeights(...) but also evaluates the loss. May trigger
115 * synchronization with the device. */
116 Scalar_t StepReducedWeightsLoss(DeepNet_t &deepNet, std::vector<Matrix_t> &input, const Matrix_t &output,
117 const Matrix_t &weights);
118
119 /** Perform multiple optimization steps simultaneously. Performs the
120 * backprop algorithm on the input batches given in \p batches on
121 * the neural networks given in \p nets. The forward and backward propagation
122 * steps are executed in an interleaving manner in order to exploit potential
123 * batch-level parallelism for asynchronous device calls.
124 */
125 void Step(DeepNet_t &master, std::vector<DeepNet_t> &nets, std::vector<TTensorBatch<Architecture_t>> &batches);
126
127 /** Same as the Step(...) method for multiple batches but uses momentum. */
128 void StepMomentum(DeepNet_t &master, std::vector<DeepNet_t> &nets,
129 std::vector<TTensorBatch<Architecture_t>> &batches, Scalar_t momentum);
130
131 /** Same as the Step(...) method for multiple batches but uses Nesterov
132 * momentum. */
133 void StepNesterov(DeepNet_t &master, std::vector<DeepNet_t> &nets,
134 std::vector<TTensorBatch<Architecture_t>> &batches, Scalar_t momentum);
135
136 /** Increases the minimization step counter by the test error evaluation
137 * period and uses the current internal value of the test error to
138 * determine if the minimization has converged. */
139 bool HasConverged();
140
141 /** Increases the minimization step counter by the test error evaluation
142 * period and uses the provided test error value to determine if the
143 * minimization has converged. */
144 bool HasConverged(Scalar_t testError);
145
146 /** Getters */
147 size_t GetConvergenceCount() const { return fConvergenceCount; }
148 size_t GetConvergenceSteps() const { return fConvergenceSteps; }
150 Scalar_t GetTestError() const { return fTestError; }
151 size_t GetTestInterval() const { return fTestInterval; }
152
153 /** Setters */
154 void SetConvergenceSteps(size_t steps) { fConvergenceSteps = steps; }
155 void SetTestInterval(size_t interval) { fTestInterval = interval; }
157 void SetBatchSize(Scalar_t rate) { fBatchSize = rate; }
158};
159
160//
161// Implementation
162//______________________________________________________________________________
163template <typename Architecture_t>
165 : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0), fTestInterval(0), fLearningRate(0),
166 fMinimumError(std::numeric_limits<Scalar_t>::infinity())
167{
168 // Nothing to do here.
169}
170
171//______________________________________________________________________________
172template <typename Architecture_t>
174 size_t testInterval)
175 : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
176 fTestInterval(testInterval), fLearningRate(learningRate), fMinimumError(std::numeric_limits<Scalar_t>::infinity())
177{
178 // Nothing to do here.
179}
180
181//______________________________________________________________________________
182template <typename Architecture_t>
183void TDLGradientDescent<Architecture_t>::Step(DeepNet_t &deepNet, std::vector<Matrix_t> &input, const Matrix_t &output,
184 const Matrix_t &weights)
185{
186 // Make forward and backward pass and update the net afterwards
187 deepNet.Forward(input, true);
188 deepNet.Backward(input, output, weights);
189 deepNet.Update(fLearningRate);
190}
191
192//______________________________________________________________________________
193template <typename Architecture_t>
194void TDLGradientDescent<Architecture_t>::StepReducedWeights(DeepNet_t &deepNet, std::vector<Matrix_t> &input,
195 const Matrix_t &output, const Matrix_t &weights)
196{
197 // Make forward and backward pass and update the net afterwards
198 deepNet.Forward(input, true);
199 deepNet.Backward(input, output, weights);
200
201 for (size_t i = 0; i < deepNet.GetDepth(); i++) {
202 auto *layer = deepNet.GetLayerAt(i);
203
204 layer->UpdateWeights(layer->GetWeightGradients(), fLearningRate);
205 if (i == 0) {
206 layer->UpdateBiases(layer->GetBiasGradients(), fLearningRate);
207 }
208 }
209}
210
211//______________________________________________________________________________
212template <typename Architecture_t>
213auto TDLGradientDescent<Architecture_t>::StepLoss(DeepNet_t &deepNet, std::vector<Matrix_t> &input,
214 const Matrix_t &output, const Matrix_t &weights) -> Scalar_t
215{
216 Scalar_t loss = deepNet.Loss(input, output);
217 deepNet.Backward(input, output, weights);
218 deepNet.Update(fLearningRate);
219
220 return loss;
221}
222
223//______________________________________________________________________________
224template <typename Architecture_t>
226 const Matrix_t &output, const Matrix_t &weights)
227 -> Scalar_t
228{
229 Scalar_t loss = deepNet.Loss(input, output);
230 fTrainingError = loss;
231 deepNet.Backward(input, output, weights);
232
233 for (size_t i = 0; i < deepNet.GetDepth(); i++) {
234 auto *layer = deepNet.GetLayerAt(i);
235
236 layer->UpdateWeights(layer->GetWeightGradients(), fLearningRate);
237 if (i == 0) {
238 layer->UpdateBiases(layer->GetBiasGradients(), fLearningRate);
239 }
240 }
241
242 return loss;
243}
244
245//______________________________________________________________________________
246template <typename Architecture_t>
247void TDLGradientDescent<Architecture_t>::Step(DeepNet_t &master, std::vector<DeepNet_t> &nets,
248 std::vector<TTensorBatch<Architecture_t>> &batches)
249{
250
251 master.ParallelForward(nets, batches);
252 master.ParallelBackward(nets, batches, fLearningRate);
253}
254
255//______________________________________________________________________________
256template <typename Architecture_t>
257void TDLGradientDescent<Architecture_t>::StepMomentum(DeepNet_t &master, std::vector<DeepNet_t> &nets,
258 std::vector<TTensorBatch<Architecture_t>> &batches,
259 Scalar_t momentum)
260{
261 master.ParallelForward(nets, batches);
262 master.ParallelBackwardMomentum(nets, batches, fLearningRate, momentum);
263}
264
265//______________________________________________________________________________
266template <typename Architecture_t>
267void TDLGradientDescent<Architecture_t>::StepNesterov(DeepNet_t &master, std::vector<DeepNet_t> &nets,
268 std::vector<TTensorBatch<Architecture_t>> &batches,
269 Scalar_t momentum)
270{
271 master.ParallelForward(nets, batches);
272 master.ParallelBackwardNestorov(nets, batches, fLearningRate, momentum);
273}
274
275//______________________________________________________________________________
276template <typename Architecture_t>
278{
279 if (fTestError < fMinimumError * 0.999) {
280 fConvergenceCount = 0;
281 fMinimumError = fTestError;
282 } else {
283 fConvergenceCount++;
284 }
285
286 return (fConvergenceCount >= fConvergenceSteps);
287}
288
289//______________________________________________________________________________
290template <typename Architecture_t>
292{
293 fTestError = testError;
294 if (fTestError < fMinimumError * 0.999) {
295 fConvergenceCount = 0;
296 fMinimumError = fTestError;
297 } else {
298 fConvergenceCount += fTestInterval;
299 }
300 return (fConvergenceCount >= fConvergenceSteps);
301}
302
303} // namespace DNN
304} // namespace TMVA
305
306#endif
Scalar_t fMinimumError
The minimum loss achieved on the training set.
Definition: DLMinimizers.h:82
void SetBatchSize(Scalar_t rate)
Definition: DLMinimizers.h:157
typename Architecture_t::Scalar_t Scalar_t
Definition: DLMinimizers.h:68
Scalar_t fTestError
Holds the most recently computed test loss.
Definition: DLMinimizers.h:80
void StepNesterov(DeepNet_t &master, std::vector< DeepNet_t > &nets, std::vector< TTensorBatch< Architecture_t > > &batches, Scalar_t momentum)
Same as the Step(...) method for multiple batches but uses Nesterov momentum.
Definition: DLMinimizers.h:267
size_t GetConvergenceSteps() const
Definition: DLMinimizers.h:148
bool HasConverged()
Increases the minimization step counter by the test error evaluation period and uses the current inte...
Definition: DLMinimizers.h:277
void SetTestInterval(size_t interval)
Definition: DLMinimizers.h:155
Scalar_t StepReducedWeightsLoss(DeepNet_t &deepNet, std::vector< Matrix_t > &input, const Matrix_t &output, const Matrix_t &weights)
Similar to StepReducedWeights(...) but also evaluates the loss.
Definition: DLMinimizers.h:225
size_t fStepCount
Number of steps performed in the current training session.
Definition: DLMinimizers.h:73
void Reset()
Reset minimizer object to default state.
Definition: DLMinimizers.h:90
size_t fBatchSize
Batch size to use for the training.
Definition: DLMinimizers.h:72
void StepReducedWeights(DeepNet_t &deepNet, std::vector< Matrix_t > &input, const Matrix_t &output, const Matrix_t &weights)
Does not evaluate the loss and therefore not trigger a possible synchronization with the device.
Definition: DLMinimizers.h:194
void SetConvergenceSteps(size_t steps)
Setters.
Definition: DLMinimizers.h:154
size_t fConvergenceCount
Current number of training epochs without.
Definition: DLMinimizers.h:76
void SetLearningRate(Scalar_t rate)
Definition: DLMinimizers.h:156
Scalar_t StepLoss(DeepNet_t &deepNet, std::vector< Matrix_t > &input, const Matrix_t &output, const Matrix_t &weights)
Same as Step(...) but also evaluate the loss on the given training data.
Definition: DLMinimizers.h:213
size_t fConvergenceSteps
Number of training epochs without considerable.
Definition: DLMinimizers.h:74
TDeepNet< Architecture_t > DeepNet_t
Definition: DLMinimizers.h:67
size_t GetConvergenceCount() const
Getters.
Definition: DLMinimizers.h:147
size_t fTestInterval
Interval for the computation of the test error.
Definition: DLMinimizers.h:78
typename Architecture_t::Matrix_t Matrix_t
Definition: DLMinimizers.h:69
Scalar_t GetTrainingError() const
Definition: DLMinimizers.h:149
Scalar_t fLearningRate
Learning rate .
Definition: DLMinimizers.h:81
Scalar_t fTrainingError
Holds the most recently computed training loss.
Definition: DLMinimizers.h:79
void Step(DeepNet_t &deepNet, std::vector< Matrix_t > &input, const Matrix_t &output, const Matrix_t &weights)
Perform a single optimization step on a given batch.
Definition: DLMinimizers.h:183
Scalar_t GetTestError() const
Definition: DLMinimizers.h:150
void StepMomentum(DeepNet_t &master, std::vector< DeepNet_t > &nets, std::vector< TTensorBatch< Architecture_t > > &batches, Scalar_t momentum)
Same as the Step(...) method for multiple batches but uses momentum.
Definition: DLMinimizers.h:257
Generic Deep Neural Network class.
Definition: DeepNet.h:74
void ParallelBackward(std::vector< TDeepNet< Architecture_t, Layer_t > > &nets, std::vector< TTensorBatch< Architecture_t > > &batches, Scalar_t learningRate)
Function for parallel backward in the vector of deep nets, where the master net is the net calling th...
Definition: DeepNet.h:911
size_t GetDepth() const
Definition: DeepNet.h:293
Layer_t * GetLayerAt(size_t i)
Get the layer in the vector of layers at poistion i.
Definition: DeepNet.h:289
void ParallelForward(std::vector< TDeepNet< Architecture_t, Layer_t > > &nets, std::vector< TTensorBatch< Architecture_t > > &batches, bool applyDropout=false)
Function for parallel forward in the vector of deep nets, where the master net is the net calling thi...
Definition: DeepNet.h:745
void ParallelBackwardMomentum(std::vector< TDeepNet< Architecture_t, Layer_t > > &nets, std::vector< TTensorBatch< Architecture_t > > &batches, Scalar_t learningRate, Scalar_t momentum)
Function for parallel backward in the vector of deep nets, where the master net is the net calling th...
Definition: DeepNet.h:958
void Backward(std::vector< Matrix_t > &input, const Matrix_t &groundTruth, const Matrix_t &weights)
Function that executes the entire backward pass in the network.
Definition: DeepNet.h:889
void Forward(std::vector< Matrix_t > &input, bool applyDropout=false)
Function that executes the entire forward pass in the network.
Definition: DeepNet.h:734
void Update(Scalar_t learningRate)
Function that will update the weights and biases in the layers that contain weights and biases.
Definition: DeepNet.h:1083
void ParallelBackwardNestorov(std::vector< TDeepNet< Architecture_t, Layer_t > > &nets, std::vector< TTensorBatch< Architecture_t > > &batches, Scalar_t learningRate, Scalar_t momentum)
Function for parallel backward in the vector of deep nets, where the master net is the net calling th...
Definition: DeepNet.h:1021
create variable transformations
static void output(int code)
Definition: gifencode.c:226