RMSProp.h
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : TRMSProp                                                              *
 * Web    : http://tmva.sourceforge.net                                           *
 *                                                                                *
 * Description:                                                                   *
 *      RMSProp Optimizer Class                                                   *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Ravi Kiran S      <sravikiran0606@gmail.com>  - CERN, Switzerland         *
 *                                                                                *
 * Copyright (c) 2005-2018:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (http://tmva.sourceforge.net/LICENSE)                                          *
 **********************************************************************************/

#ifndef TMVA_DNN_RMSPROP
#define TMVA_DNN_RMSPROP

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"

namespace TMVA {
namespace DNN {

/** \class TRMSProp
 * RMSProp Optimizer class
 *
 * This class implements the RMSProp optimizer, with an optional momentum term.
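 *
 * In the notation of the implementation below, one optimizer step first updates
 * the running average of the squared gradients and then the parameters:
 * \f$ V_t = \rho V_{t-1} + (1 - \rho) g_t^2 \f$,
 * \f$ W_t = \mu W_{t-1} + \alpha \, g_t / \sqrt{V_t + \epsilon} \f$ and
 * \f$ \theta_t = \theta_{t-1} - W_t \f$,
 * where \f$ g_t \f$ is the current gradient, \f$ \rho \f$ the decay constant,
 * \f$ \mu \f$ the momentum and \f$ \alpha \f$ the learning rate.
 *
 * A minimal usage sketch (assuming a fully constructed TDeepNet `net` whose
 * gradients were filled by a backward pass, with `Architecture_t` standing for
 * a concrete backend such as TCpu<Double_t>; Step() is the per-batch update
 * entry point inherited from VOptimizer):
 * \code
 * TRMSProp<Architecture_t> optimizer(net); // defaults: lr = 0.001, momentum = 0, rho = 0.9, eps = 1e-7
 * optimizer.Step();                        // one RMSProp update of all layer weights and biases
 * \endcode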
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TRMSProp : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fMomentum; ///< The momentum used for training.
   Scalar_t fRho;      ///< The Rho constant used by the optimizer.
   Scalar_t fEpsilon;  ///< The smoothing term used to avoid division by zero.
   std::vector<std::vector<Matrix_t>>
      fPastSquaredWeightGradients; ///< The decaying average of the squared past weight gradients of the deep net.
   std::vector<std::vector<Matrix_t>>
      fPastSquaredBiasGradients; ///< The decaying average of the squared past bias gradients of the deep net.

   std::vector<std::vector<Matrix_t>> fWeightUpdates; ///< The accumulated past weight updates (momentum term).
   std::vector<std::vector<Matrix_t>> fBiasUpdates;   ///< The accumulated past bias updates (momentum term).

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate = 0.001, Scalar_t momentum = 0.0, Scalar_t rho = 0.9,
            Scalar_t epsilon = 1e-7);

   /*! Destructor. */
   ~TRMSProp() = default;

   /*! Getters */
   Scalar_t GetMomentum() const { return fMomentum; }
   Scalar_t GetRho() const { return fRho; }
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
   std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
   std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetWeightUpdates() { return fWeightUpdates; }
   std::vector<Matrix_t> &GetWeightUpdatesAt(size_t i) { return fWeightUpdates[i]; }

   std::vector<std::vector<Matrix_t>> &GetBiasUpdates() { return fBiasUpdates; }
   std::vector<Matrix_t> &GetBiasUpdatesAt(size_t i) { return fBiasUpdates[i]; }
};

//
//
// The RMSProp Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TRMSProp<Architecture_t, Layer_t, DeepNet_t>::TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t momentum,
                                                       Scalar_t rho, Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fMomentum(momentum), fRho(rho),
     fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();
   fPastSquaredWeightGradients.resize(layersNSlices);
   fPastSquaredBiasGradients.resize(layersNSlices);
   fWeightUpdates.resize(layersNSlices);
   fBiasUpdates.resize(layersNSlices);

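   // For every layer, allocate one zero-initialized accumulator matrix per weight
   // and bias matrix, matching the shape of the corresponding parameter matrix.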
   for (size_t i = 0; i < layersNSlices; i++) {
      const size_t weightsNSlices = (layers[i]->GetWeights()).size();

      for (size_t j = 0; j < weightsNSlices; j++) {
         Matrix_t &currentWeights = layers[i]->GetWeightsAt(j);
         const size_t weightsNRows = currentWeights.GetNrows();
         const size_t weightsNCols = currentWeights.GetNcols();

         fPastSquaredWeightGradients[i].emplace_back(weightsNRows, weightsNCols);
         fWeightUpdates[i].emplace_back(weightsNRows, weightsNCols);
         initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fWeightUpdates[i][j], EInitialization::kZero);
      }

      const size_t biasesNSlices = (layers[i]->GetBiases()).size();

      for (size_t j = 0; j < biasesNSlices; j++) {
         Matrix_t &currentBiases = layers[i]->GetBiasesAt(j);
         const size_t biasesNRows = currentBiases.GetNrows();
         const size_t biasesNCols = currentBiases.GetNcols();

         fPastSquaredBiasGradients[i].emplace_back(biasesNRows, biasesNCols);
         fBiasUpdates[i].emplace_back(biasesNRows, biasesNCols);
         initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fBiasUpdates[i][j], EInitialization::kZero);
      }
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                                 const std::vector<Matrix_t> &weightGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerWeightUpdates = this->GetWeightUpdatesAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastSquaredWeightGradients.size(); k++) {

      // accumulation matrix used for temporarily storing the current accumulation
      Matrix_t accumulation(currentLayerPastSquaredWeightGradients[k].GetNrows(),
                            currentLayerPastSquaredWeightGradients[k].GetNcols());

      // Vt = rho * Vt-1 + (1 - rho) * currentSquaredWeightGradients
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      Matrix_t currentSquaredWeightGradients(weightGradients[k].GetNrows(), weightGradients[k].GetNcols());
      Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[k]);
      Architecture_t::SquareElementWise(currentSquaredWeightGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[k], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredWeightGradients[k], accumulation);

      // Wt = momentum * Wt-1 + (learningRate * currentWeightGradients) / (sqrt(Vt + epsilon))
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      Matrix_t dummy(currentLayerPastSquaredWeightGradients[k].GetNrows(),
                     currentLayerPastSquaredWeightGradients[k].GetNcols());
      Architecture_t::Copy(dummy, currentLayerPastSquaredWeightGradients[k]);
      Architecture_t::ConstAdd(dummy, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy);
      Architecture_t::ReciprocalElementWise(dummy);
      Architecture_t::Hadamard(dummy, weightGradients[k]);
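      // dummy now holds currentWeightGradients / sqrt(Vt + epsilon), element-wise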

      Architecture_t::ScaleAdd(accumulation, currentLayerWeightUpdates[k], this->GetMomentum());
      Architecture_t::ScaleAdd(accumulation, dummy, this->GetLearningRate());
      Architecture_t::Copy(currentLayerWeightUpdates[k], accumulation);
   }

   // updating the weights.
   // theta = theta - Wt
   for (size_t i = 0; i < weights.size(); i++) {
      Architecture_t::ScaleAdd(weights[i], currentLayerWeightUpdates[i], -1.0);
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                                const std::vector<Matrix_t> &biasGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerBiasUpdates = this->GetBiasUpdatesAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastSquaredBiasGradients.size(); k++) {

      // accumulation matrix used for temporarily storing the current accumulation
      Matrix_t accumulation(currentLayerPastSquaredBiasGradients[k].GetNrows(),
                            currentLayerPastSquaredBiasGradients[k].GetNcols());

      // Vt = rho * Vt-1 + (1 - rho) * currentSquaredBiasGradients
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      Matrix_t currentSquaredBiasGradients(biasGradients[k].GetNrows(), biasGradients[k].GetNcols());
      Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[k]);
      Architecture_t::SquareElementWise(currentSquaredBiasGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[k], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredBiasGradients[k], accumulation);

      // Wt = momentum * Wt-1 + (learningRate * currentBiasGradients) / (sqrt(Vt + epsilon))
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      Matrix_t dummy(currentLayerPastSquaredBiasGradients[k].GetNrows(),
                     currentLayerPastSquaredBiasGradients[k].GetNcols());
      Architecture_t::Copy(dummy, currentLayerPastSquaredBiasGradients[k]);
      Architecture_t::ConstAdd(dummy, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy);
      Architecture_t::ReciprocalElementWise(dummy);
      Architecture_t::Hadamard(dummy, biasGradients[k]);
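      // dummy now holds currentBiasGradients / sqrt(Vt + epsilon), element-wise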

      Architecture_t::ScaleAdd(accumulation, currentLayerBiasUpdates[k], this->GetMomentum());
      Architecture_t::ScaleAdd(accumulation, dummy, this->GetLearningRate());
      Architecture_t::Copy(currentLayerBiasUpdates[k], accumulation);
   }

   // updating the biases.
   // theta = theta - Wt
   for (size_t i = 0; i < biases.size(); i++) {
      Architecture_t::ScaleAdd(biases[i], currentLayerBiasUpdates[i], -1.0);
   }
}

} // namespace DNN
} // namespace TMVA

#endif