Logo ROOT  
Reference Guide
SGD.h
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Ravi Kiran S
3
4/**********************************************************************************
5 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6 * Package: TMVA *
7 * Class : TSGD *
8 * Web : http://tmva.sourceforge.net *
9 * *
10 * Description: *
11 * Stochastic Batch Gradient Descent Optimizer Class *
12 * *
13 * Authors (alphabetical): *
14 * Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland *
15 * *
16 * Copyright (c) 2005-2018: *
17 * CERN, Switzerland *
18 * U. of Victoria, Canada *
19 * MPI-K Heidelberg, Germany *
20 * U. of Bonn, Germany *
21 * *
22 * Redistribution and use in source and binary forms, with or without *
23 * modification, are permitted according to the terms listed in LICENSE *
24 * (http://tmva.sourceforge.net/LICENSE) *
25 **********************************************************************************/
26
27#ifndef TMVA_DNN_SGD
28#define TMVA_DNN_SGD
29
30#include "TMatrix.h"
31#include "TMVA/DNN/Optimizer.h"
32#include "TMVA/DNN/Functions.h"
33
34namespace TMVA {
35namespace DNN {
36
37/** \class TSGD
38 * Stochastic Batch Gradient Descent Optimizer class
39 *
40 * This class represents the Stochastic Batch Gradient Descent Optimizer with options for applying momentum
41 * and nesterov momentum.
42 */
43template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
44 typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
45class TSGD : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
46public:
47 using Matrix_t = typename Architecture_t::Matrix_t;
48 using Scalar_t = typename Architecture_t::Scalar_t;
49
50protected:
51 Scalar_t fMomentum; ///< The momentum used for training.
52 std::vector<std::vector<Matrix_t>>
53 fPastWeightGradients; ///< The sum of the past weight gradients associated with the deep net.
54 std::vector<std::vector<Matrix_t>>
55 fPastBiasGradients; ///< The sum of the past bias gradients associated with the deep net.
56
57 /*! Update the weights, given the current weight gradients. */
58 void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);
59
60 /*! Update the biases, given the current bias gradients. */
61 void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);
62
63public:
64 /*! Constructor. */
65 TSGD(Scalar_t learningRate, DeepNet_t &deepNet, Scalar_t momentum);
66
67 /*! Destructor. */
68 ~TSGD() = default;
69
70 /*! Getters */
71 Scalar_t GetMomentum() const { return fMomentum; }
72
73 std::vector<std::vector<Matrix_t>> &GetPastWeightGradients() { return fPastWeightGradients; }
74 std::vector<Matrix_t> &GetPastWeightGradientsAt(size_t i) { return fPastWeightGradients[i]; }
75
76 std::vector<std::vector<Matrix_t>> &GetPastBiasGradients() { return fPastBiasGradients; }
77 std::vector<Matrix_t> &GetPastBiasGradientsAt(size_t i) { return fPastBiasGradients[i]; }
78};
79
80//
81//
82// The Stochastic Gradient Descent Optimizer Class - Implementation
83//_________________________________________________________________________________________________
84template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
85TSGD<Architecture_t, Layer_t, DeepNet_t>::TSGD(Scalar_t learningRate, DeepNet_t &deepNet, Scalar_t momentum)
86 : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fMomentum(momentum)
87{
88 std::vector<Layer_t *> &layers = deepNet.GetLayers();
89 size_t layersNSlices = layers.size();
90 fPastWeightGradients.resize(layersNSlices);
91 fPastBiasGradients.resize(layersNSlices);
92
93 for (size_t i = 0; i < layersNSlices; i++) {
94
95 Architecture_t::CreateWeightTensors( fPastWeightGradients[i], layers[i]->GetWeights());
96 size_t weightsNSlices = fPastWeightGradients[i].size();
97 for (size_t j = 0; j < weightsNSlices; j++) {
98 initialize<Architecture_t>(fPastWeightGradients[i][j], EInitialization::kZero);
99 }
100
101 Architecture_t::CreateWeightTensors( fPastBiasGradients[i], layers[i]->GetBiases());
102 size_t biasesNSlices = fPastBiasGradients[i].size();
103 for (size_t j = 0; j < biasesNSlices; j++) {
104 initialize<Architecture_t>(fPastBiasGradients[i][j], EInitialization::kZero);
105 }
106 }
107}
108
109
110
111//_________________________________________________________________________________________________
112template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
113auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
114 const std::vector<Matrix_t> &weightGradients) -> void
115{
116 // accumulating the current layer past weight gradients to include the current weight gradients.
117 // Vt = momentum * Vt-1 + currentGradients
118
119 std::vector<Matrix_t> &currentLayerPastWeightGradients = this->GetPastWeightGradientsAt(layerIndex);
120
121 for (size_t k = 0; k < currentLayerPastWeightGradients.size(); k++) {
122 Architecture_t::ConstMult(currentLayerPastWeightGradients[k], this->GetMomentum());
123 Architecture_t::ScaleAdd(currentLayerPastWeightGradients[k], weightGradients[k], 1.0);
124 }
125
126 // updating the weights.
127 // theta = theta - learningRate * Vt
128 for (size_t i = 0; i < weights.size(); i++) {
129 Architecture_t::ScaleAdd(weights[i], currentLayerPastWeightGradients[i], -this->GetLearningRate());
130 }
131}
132
133//_________________________________________________________________________________________________
134template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
135auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
136 const std::vector<Matrix_t> &biasGradients) -> void
137{
138 // accumulating the current layer past bias gradients to include the current bias gradients.
139 // Vt = momentum * Vt-1 + currentGradients
140
141 std::vector<Matrix_t> &currentLayerPastBiasGradients = this->GetPastBiasGradientsAt(layerIndex);
142
143 for (size_t k = 0; k < currentLayerPastBiasGradients.size(); k++) {
144 Architecture_t::ConstMult(currentLayerPastBiasGradients[k], this->GetMomentum());
145 Architecture_t::ScaleAdd(currentLayerPastBiasGradients[k], biasGradients[k], 1.0);
146 }
147
148 // updating the biases
149 // theta = theta - learningRate * Vt
150 for (size_t i = 0; i < biases.size(); i++) {
151 Architecture_t::ScaleAdd(biases[i], currentLayerPastBiasGradients[i], -this->GetLearningRate());
152 }
153}
154
155} // namespace DNN
156} // namespace TMVA
157
158#endif
Stochastic Batch Gradient Descent Optimizer class.
Definition: SGD.h:45
void UpdateWeights(size_t layerIndex, std::vector< Matrix_t > &weights, const std::vector< Matrix_t > &weightGradients)
Update the weights, given the current weight gradients.
Definition: SGD.h:113
~TSGD()=default
Destructor.
Scalar_t fMomentum
The momentum used for training.
Definition: SGD.h:51
typename Architecture_t::Scalar_t Scalar_t
Definition: SGD.h:48
TSGD(Scalar_t learningRate, DeepNet_t &deepNet, Scalar_t momentum)
Constructor.
Definition: SGD.h:85
std::vector< std::vector< Matrix_t > > & GetPastBiasGradients()
Definition: SGD.h:76
std::vector< std::vector< Matrix_t > > fPastBiasGradients
The sum of the past bias gradients associated with the deep net.
Definition: SGD.h:55
std::vector< std::vector< Matrix_t > > & GetPastWeightGradients()
Definition: SGD.h:73
void UpdateBiases(size_t layerIndex, std::vector< Matrix_t > &biases, const std::vector< Matrix_t > &biasGradients)
Update the biases, given the current bias gradients.
Definition: SGD.h:135
std::vector< Matrix_t > & GetPastWeightGradientsAt(size_t i)
Definition: SGD.h:74
std::vector< Matrix_t > & GetPastBiasGradientsAt(size_t i)
Definition: SGD.h:77
std::vector< std::vector< Matrix_t > > fPastWeightGradients
The sum of the past weight gradients associated with the deep net.
Definition: SGD.h:53
typename Architecture_t::Matrix_t Matrix_t
Definition: SGD.h:47
Scalar_t GetMomentum() const
Getters.
Definition: SGD.h:71
Generic Optimizer class.
Definition: Optimizer.h:44
std::vector< Layer_t * > & GetLayers()
Definition: Optimizer.h:78
create variable transformations