27#ifndef TMVA_DNN_ADADELTA
28#define TMVA_DNN_ADADELTA
43template <
typename Architecture_t,
typename Layer_t = VGeneralLayer<Architecture_t>,
44 typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
47 using Matrix_t =
typename Architecture_t::Matrix_t;
48 using Scalar_t =
typename Architecture_t::Scalar_t;
68 void UpdateWeights(
size_t layerIndex, std::vector<Matrix_t> &weights,
const std::vector<Matrix_t> &weightGradients);
71 void UpdateBiases(
size_t layerIndex, std::vector<Matrix_t> &biases,
const std::vector<Matrix_t> &biasGradients);
101template <
typename Architecture_t,
typename Layer_t,
typename DeepNet_t>
104 :
VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fRho(rho), fEpsilon(
epsilon)
106 std::vector<Layer_t *> &layers = deepNet.
GetLayers();
107 const size_t layersNSlices = layers.size();
117 for (
size_t i = 0; i < layersNSlices; i++) {
118 const size_t weightsNSlices = (layers[i]->GetWeights()).size();
123 for (
size_t j = 0; j < weightsNSlices; j++) {
128 const size_t biasesNSlices = (layers[i]->GetBiases()).size();
133 for (
size_t j = 0; j < biasesNSlices; j++) {
139 Architecture_t::CreateWeightTensors(
fWorkBiasTensor1[i], layers[i]->GetBiases());
141 Architecture_t::CreateWeightTensors(
fWorkBiasTensor2[i], layers[i]->GetBiases());
146template <
typename Architecture_t,
typename Layer_t,
typename DeepNet_t>
148 const std::vector<Matrix_t> &weightGradients) ->
void
150 std::vector<Matrix_t> ¤tLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
151 std::vector<Matrix_t> ¤tLayerPastSquaredWeightUpdates = this->GetPastSquaredWeightUpdatesAt(layerIndex);
153 const size_t weightsNSlices = weights.size();
154 assert(currentLayerPastSquaredWeightGradients.size() == weightsNSlices);
156 for (
size_t i = 0; i < weightsNSlices; i++) {
158 auto &accumulation = fWorkWeightTensor1[layerIndex][i];
159 auto ¤tSquaredWeightGradients = fWorkWeightTensor2[layerIndex][i];
164 Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[i]);
165 Architecture_t::SquareElementWise(currentSquaredWeightGradients);
166 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[i], this->GetRho());
167 Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
168 Architecture_t::Copy(currentLayerPastSquaredWeightGradients[i], accumulation);
175 auto &dummy1 = fWorkWeightTensor1[layerIndex][i];
176 Architecture_t::Copy(dummy1, currentLayerPastSquaredWeightUpdates[i]);
177 Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
178 Architecture_t::SqrtElementWise(dummy1);
180 auto ¤tWeightUpdates = fWorkWeightTensor2[layerIndex][i];
182 Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
183 Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
184 Architecture_t::SqrtElementWise(currentWeightUpdates);
185 Architecture_t::ReciprocalElementWise(currentWeightUpdates);
186 Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
187 Architecture_t::Hadamard(currentWeightUpdates, dummy1);
190 Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());
195 auto ¤tSquaredWeightUpdates = fWorkWeightTensor2[layerIndex][i];
196 Architecture_t::Copy(currentSquaredWeightUpdates, currentWeightUpdates);
197 Architecture_t::SquareElementWise(currentSquaredWeightUpdates);
198 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightUpdates[i], this->GetRho());
199 Architecture_t::ScaleAdd(accumulation, currentSquaredWeightUpdates, 1 - (this->GetRho()));
200 Architecture_t::Copy(currentLayerPastSquaredWeightUpdates[i], accumulation);
205template <
typename Architecture_t,
typename Layer_t,
typename DeepNet_t>
207 const std::vector<Matrix_t> &biasGradients) ->
void
209 std::vector<Matrix_t> ¤tLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
210 std::vector<Matrix_t> ¤tLayerPastSquaredBiasUpdates = this->GetPastSquaredBiasUpdatesAt(layerIndex);
212 const size_t biasesNSlices = biases.size();
213 assert(currentLayerPastSquaredBiasGradients.size() == biasesNSlices);
214 for (
size_t i = 0; i < biasesNSlices; i++) {
217 auto &accumulation = fWorkBiasTensor1[layerIndex][i];
222 auto ¤tSquaredBiasGradients = fWorkBiasTensor2[layerIndex][i];
223 Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[i]);
224 Architecture_t::SquareElementWise(currentSquaredBiasGradients);
225 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[i], this->GetRho());
226 Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
227 Architecture_t::Copy(currentLayerPastSquaredBiasGradients[i], accumulation);
233 auto &dummy1 = fWorkBiasTensor1[layerIndex][i];
234 Architecture_t::Copy(dummy1, currentLayerPastSquaredBiasUpdates[i]);
235 Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
236 Architecture_t::SqrtElementWise(dummy1);
238 auto ¤tBiasUpdates = fWorkBiasTensor2[layerIndex][i];
239 Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
240 Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
241 Architecture_t::SqrtElementWise(currentBiasUpdates);
242 Architecture_t::ReciprocalElementWise(currentBiasUpdates);
243 Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
244 Architecture_t::Hadamard(currentBiasUpdates, dummy1);
247 Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());
253 auto ¤tSquaredBiasUpdates = fWorkBiasTensor2[layerIndex][i];
254 Architecture_t::Copy(currentSquaredBiasUpdates, currentBiasUpdates);
255 Architecture_t::SquareElementWise(currentSquaredBiasUpdates);
256 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasUpdates[i], this->GetRho());
257 Architecture_t::ScaleAdd(accumulation, currentSquaredBiasUpdates, 1 - (this->GetRho()));
258 Architecture_t::Copy(currentLayerPastSquaredBiasUpdates[i], accumulation);
Adadelta Optimizer class.
std::vector< std::vector< Matrix_t > > fWorkBiasTensor2
working tensor used to keep a temporary copy of biases or bias gradients
Scalar_t GetRho() const
Getters.
Scalar_t fEpsilon
The smoothing term used to avoid division by zero.
std::vector< std::vector< Matrix_t > > & GetPastSquaredWeightUpdates()
void UpdateWeights(size_t layerIndex, std::vector< Matrix_t > &weights, const std::vector< Matrix_t > &weightGradients)
Update the weights, given the current weight gradients.
std::vector< Matrix_t > & GetPastSquaredBiasGradientsAt(size_t i)
std::vector< Matrix_t > & GetPastSquaredWeightGradientsAt(size_t i)
Scalar_t fRho
The Rho constant used by the optimizer.
std::vector< std::vector< Matrix_t > > & GetPastSquaredBiasUpdates()
std::vector< Matrix_t > & GetPastSquaredWeightUpdatesAt(size_t i)
std::vector< std::vector< Matrix_t > > fWorkBiasTensor1
working tensor used to keep a temporary copy of biases or bias gradients
std::vector< std::vector< Matrix_t > > fPastSquaredBiasUpdates
The accumulation of the square of the past bias updates associated with the deep net.
void UpdateBiases(size_t layerIndex, std::vector< Matrix_t > &biases, const std::vector< Matrix_t > &biasGradients)
Update the biases, given the current bias gradients.
TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate=1.0, Scalar_t rho=0.95, Scalar_t epsilon=1e-8)
Constructor.
Scalar_t GetEpsilon() const
std::vector< std::vector< Matrix_t > > & GetPastSquaredBiasGradients()
std::vector< std::vector< Matrix_t > > fWorkWeightTensor2
working tensor used to keep a temporary copy of weights or weight gradients
~TAdadelta()=default
Destructor.
std::vector< std::vector< Matrix_t > > fPastSquaredWeightUpdates
The accumulation of the square of the past weight updates associated with the deep net.
std::vector< std::vector< Matrix_t > > & GetPastSquaredWeightGradients()
typename Architecture_t::Scalar_t Scalar_t
typename Architecture_t::Matrix_t Matrix_t
std::vector< std::vector< Matrix_t > > fPastSquaredWeightGradients
The accumulation of the square of the past weight gradients associated with the deep net.
std::vector< std::vector< Matrix_t > > fWorkWeightTensor1
working tensor used to keep a temporary copy of weights or weight gradients
std::vector< std::vector< Matrix_t > > fPastSquaredBiasGradients
The accumulation of the square of the past bias gradients associated with the deep net.
std::vector< Matrix_t > & GetPastSquaredBiasUpdatesAt(size_t i)
std::vector< Layer_t * > & GetLayers()
create variable transformations