Logo ROOT   6.18/05
Reference Guide
Adagrad.h
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Ravi Kiran S
3
4/**********************************************************************************
5 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6 * Package: TMVA *
7 * Class : TAdagrad *
8 * Web : http://tmva.sourceforge.net *
9 * *
10 * Description: *
11 * Adagrad Optimizer Class *
12 * *
13 * Authors (alphabetical): *
14 * Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland *
15 * *
16 * Copyright (c) 2005-2018: *
17 * CERN, Switzerland *
18 * U. of Victoria, Canada *
19 * MPI-K Heidelberg, Germany *
20 * U. of Bonn, Germany *
21 * *
22 * Redistribution and use in source and binary forms, with or without *
23 * modification, are permitted according to the terms listed in LICENSE *
24 * (http://tmva.sourceforge.net/LICENSE) *
25 **********************************************************************************/
26
27#ifndef TMVA_DNN_ADAGRAD
28#define TMVA_DNN_ADAGRAD
29
30#include "TMatrix.h"
31#include "TMVA/DNN/Optimizer.h"
32#include "TMVA/DNN/Functions.h"
33
34namespace TMVA {
35namespace DNN {
36
/** \class TAdagrad
 * Adagrad Optimizer class
 *
 * This class represents the Adagrad Optimizer, which adapts the step size per
 * parameter: each update divides the raw gradient by the square root of the
 * accumulated sum of squared past gradients (plus the smoothing term fEpsilon).
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdagrad : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fEpsilon; ///< The Smoothing term used to avoid division by zero.

   std::vector<std::vector<Matrix_t>>
      fPastSquaredWeightGradients; ///< The sum of the square of the past weight gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>>
      fPastSquaredBiasGradients; ///< The sum of the square of the past bias gradients associated with the deep net.

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. Registers the optimizer with the deep net and allocates one
    *  zero-initialized accumulator matrix per weight/bias matrix of each layer. */
   TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate = 0.01, Scalar_t epsilon = 1e-8);

   /*! Destructor. */
   ~TAdagrad() = default;

   /*! Getters */
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
   std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
   std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }
};
79
80//
81//
82// The Adagrad Optimizer Class - Implementation
83//_________________________________________________________________________________________________
84template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
86 : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fEpsilon(epsilon)
87{
88 std::vector<Layer_t *> &layers = deepNet.GetLayers();
89 const size_t layersNSlices = layers.size();
90 fPastSquaredWeightGradients.resize(layersNSlices);
91 fPastSquaredBiasGradients.resize(layersNSlices);
92
93 for (size_t i = 0; i < layersNSlices; i++) {
94 const size_t weightsNSlices = (layers[i]->GetWeights()).size();
95
96 for (size_t j = 0; j < weightsNSlices; j++) {
97 Matrix_t &currentWeights = layers[i]->GetWeightsAt(j);
98 const size_t weightsNRows = currentWeights.GetNrows();
99 const size_t weightsNCols = currentWeights.GetNcols();
100
101 fPastSquaredWeightGradients[i].emplace_back(weightsNRows, weightsNCols);
102 initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
103 }
104
105 const size_t biasesNSlices = (layers[i]->GetBiases()).size();
106
107 for (size_t j = 0; j < biasesNSlices; j++) {
108 Matrix_t &currentBiases = layers[i]->GetBiasesAt(j);
109 const size_t biasesNRows = currentBiases.GetNrows();
110 const size_t biasesNCols = currentBiases.GetNcols();
111
112 fPastSquaredBiasGradients[i].emplace_back(biasesNRows, biasesNCols);
113 initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
114 }
115 }
116}
117
118//_________________________________________________________________________________________________
119template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
120auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
121 const std::vector<Matrix_t> &weightGradients) -> void
122{
123 std::vector<Matrix_t> &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
124
125 for (size_t k = 0; k < currentLayerPastSquaredWeightGradients.size(); k++) {
126
127 // Vt = Vt-1 + currentSquaredWeightGradients
128 Matrix_t currentSquaredWeightGradients(weightGradients[k].GetNrows(), weightGradients[k].GetNcols());
129 Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[k]);
130 Architecture_t::SquareElementWise(currentSquaredWeightGradients);
131 Architecture_t::ScaleAdd(currentLayerPastSquaredWeightGradients[k], currentSquaredWeightGradients, 1.0);
132 }
133
134 // updating the weights.
135 // theta = theta - learningRate * currentWeightGradients / (sqrt(Vt + epsilon))
136 for (size_t i = 0; i < weights.size(); i++) {
137 Matrix_t currentWeightUpdates(weights[i].GetNrows(), weights[i].GetNcols());
138 Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
139 Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
140 Architecture_t::SqrtElementWise(currentWeightUpdates);
141 Architecture_t::ReciprocalElementWise(currentWeightUpdates);
142 Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
143 Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());
144 }
145}
146
147//_________________________________________________________________________________________________
148template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
149auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
150 const std::vector<Matrix_t> &biasGradients) -> void
151{
152 std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
153
154 for (size_t k = 0; k < currentLayerPastSquaredBiasGradients.size(); k++) {
155
156 // Vt = Vt-1 + currentSquaredBiasGradients
157 Matrix_t currentSquaredBiasGradients(biasGradients[k].GetNrows(), biasGradients[k].GetNcols());
158 Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[k]);
159 Architecture_t::SquareElementWise(currentSquaredBiasGradients);
160 Architecture_t::ScaleAdd(currentLayerPastSquaredBiasGradients[k], currentSquaredBiasGradients, 1.0);
161 }
162
163 // updating the biases.
164 // theta = theta - learningRate * currentBiasGradients / (sqrt(Vt + epsilon))
165 for (size_t i = 0; i < biases.size(); i++) {
166 Matrix_t currentBiasUpdates(biases[i].GetNrows(), biases[i].GetNcols());
167 Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
168 Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
169 Architecture_t::SqrtElementWise(currentBiasUpdates);
170 Architecture_t::ReciprocalElementWise(currentBiasUpdates);
171 Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
172 Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());
173 }
174}
175
176} // namespace DNN
177} // namespace TMVA
178
179#endif
#define e(i)
Definition: RSha256.hxx:103
Adagrad Optimizer class.
Definition: Adagrad.h:44
void UpdateWeights(size_t layerIndex, std::vector< Matrix_t > &weights, const std::vector< Matrix_t > &weightGradients)
Update the weights, given the current weight gradients.
Definition: Adagrad.h:120
void UpdateBiases(size_t layerIndex, std::vector< Matrix_t > &biases, const std::vector< Matrix_t > &biasGradients)
Update the biases, given the current bias gradients.
Definition: Adagrad.h:149
std::vector< std::vector< Matrix_t > > & GetPastSquaredBiasGradients()
Definition: Adagrad.h:76
std::vector< std::vector< Matrix_t > > fPastSquaredBiasGradients
The sum of the square of the past bias gradients associated with the deep net.
Definition: Adagrad.h:55
Scalar_t GetEpsilon() const
Getters.
Definition: Adagrad.h:71
TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate=0.01, Scalar_t epsilon=1e-8)
Constructor.
Definition: Adagrad.h:85
std::vector< std::vector< Matrix_t > > fPastSquaredWeightGradients
The sum of the square of the past weight gradients associated with the deep net.
Definition: Adagrad.h:53
typename Architecture_t::Matrix_t Matrix_t
Definition: Adagrad.h:46
typename Architecture_t::Scalar_t Scalar_t
Definition: Adagrad.h:47
Scalar_t fEpsilon
The Smoothing term used to avoid division by zero.
Definition: Adagrad.h:50
std::vector< std::vector< Matrix_t > > & GetPastSquaredWeightGradients()
Definition: Adagrad.h:73
std::vector< Matrix_t > & GetPastSquaredBiasGradientsAt(size_t i)
Definition: Adagrad.h:77
~TAdagrad()=default
Destructor.
std::vector< Matrix_t > & GetPastSquaredWeightGradientsAt(size_t i)
Definition: Adagrad.h:74
Generic Optimizer class.
Definition: Optimizer.h:44
std::vector< Layer_t * > & GetLayers()
Definition: Optimizer.h:78
void Copy(void *source, void *dest)
create variable transformations
REAL epsilon
Definition: triangle.c:617