template <typename Scalar_t>
auto RecurrentLayerBackward(/* parameters as in the declaration at the end of this listing */) -> Matrix_t &
{
   // Apply the activation derivative: df <- f'(a_t) * dL/dh_t, element by element.
   for (size_t i = 0; i < (size_t) df.GetNrows(); i++) {
      for (size_t j = 0; j < (size_t) df.GetNcols(); j++) {
         df(i, j) *= state_gradients_backward(i, j);
      }
   }

   // Gradient with respect to the layer input.
   if (input_gradient.GetNoElements() > 0) {
      input_gradient.Mult(df, weights_input);
   }

   // Gradient flowing back to the previous state.
   if (state_gradients_backward.GetNoElements() > 0) {
      state_gradients_backward.Mult(df, weights_state);
   }

   // Accumulate the input-weight gradients.
   if (input_weight_gradients.GetNoElements() > 0) {
      TMatrixT<Scalar_t> tmp(input_weight_gradients);
      input_weight_gradients.TMult(df, input);
      input_weight_gradients += tmp;
   }

   // Accumulate the state-weight gradients.
   if (state_weight_gradients.GetNoElements() > 0) {
      TMatrixT<Scalar_t> tmp(state_weight_gradients);
      state_weight_gradients.TMult(df, state);
      state_weight_gradients += tmp;
   }

   // Accumulate the bias gradients: sum df over the batch dimension.
   if (bias_gradients.GetNoElements() > 0) {
      for (size_t j = 0; j < (size_t) df.GetNcols(); j++) {
         Scalar_t sum = 0.0;
         for (size_t i = 0; i < (size_t) df.GetNrows(); i++) {
            sum += df(i, j);
         }
         bias_gradients(j, 0) += sum;
      }
   }

   return input_gradient;
}
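Read as matrix algebra, and assuming the forward step of this simple recurrent layer is $h_t = f(a_t)$ with $a_t = x_t W_{\text{input}}^{\top} + h_{t-1} W_{\text{state}}^{\top} + b$ for a batch of row vectors (a convention inferred from the Mult/TMult calls above, not stated explicitly in this listing), the routine accumulates, with $\delta_t$ the content of df after the element-wise loop:

$$\delta_t = f'(a_t) \odot \frac{\partial L}{\partial h_t}, \qquad
\frac{\partial L}{\partial x_t} = \delta_t\, W_{\text{input}}, \qquad
\frac{\partial L}{\partial h_{t-1}} = \delta_t\, W_{\text{state}},$$
$$\frac{\partial L}{\partial W_{\text{input}}} \mathrel{+}= \delta_t^{\top} x_t, \qquad
\frac{\partial L}{\partial W_{\text{state}}} \mathrel{+}= \delta_t^{\top} h_{t-1}, \qquad
\frac{\partial L}{\partial b} \mathrel{+}= \textstyle\sum_{\text{batch}} \delta_t .$$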
template <typename Scalar_t>
auto LSTMLayerBackward(/* parameters as in the declaration at the end of this listing */) -> Matrix_t &
{
   // The four gate-gradient matrices and the temporaries tmpInp, tmpState, tmp and tmp1
   // used below are declared in lines omitted from this listing.

   // Cell-state gradient dC_t, and the part carried back to C_{t-1}.
   Hadamard(cell_gradient, fOutput);
   Hadamard(cell_gradient, state_gradients_backward);
   cell_gradient += cell_gradients_backward;
   cell_gradients_backward = cell_gradient;
   Hadamard(cell_gradients_backward, fForget);

   // Candidate gradient: dC_t * g-path, element-wise with i_t and dc.
   candidate_gradient = cell_gradient;
   Hadamard(candidate_gradient, fInput);
   Hadamard(candidate_gradient, dc);

   // Input-gate gradient: dC_t, element-wise with the candidate g_t and di.
   input_gate_gradient = cell_gradient;
   Hadamard(input_gate_gradient, fCandidate);
   Hadamard(input_gate_gradient, di);

   // Forget-gate gradient: dC_t, element-wise with C_{t-1} and df.
   forget_gradient = cell_gradient;
   Hadamard(forget_gradient, precCellActivations);
   Hadamard(forget_gradient, df);

   // Output-gate gradient: tanh(C_t), element-wise with dL/dh_t and dout.
   output_gradient = cell_tanh;
   Hadamard(output_gradient, state_gradients_backward);
   Hadamard(output_gradient, dout);

   // Gradient with respect to the layer input.
   tmpInp.Mult(input_gate_gradient, weights_input);
   input_gradient = tmpInp;
   tmpInp.Mult(forget_gradient, weights_forget);
   input_gradient += tmpInp;
   tmpInp.Mult(candidate_gradient, weights_candidate);
   input_gradient += tmpInp;
   tmpInp.Mult(output_gradient, weights_output);
   input_gradient += tmpInp;

   // Gradient flowing back to the previous hidden state.
   tmpState.Mult(input_gate_gradient, weights_input_state);
   state_gradients_backward = tmpState;
   tmpState.Mult(forget_gradient, weights_forget_state);
   state_gradients_backward += tmpState;
   tmpState.Mult(candidate_gradient, weights_candidate_state);
   state_gradients_backward += tmpState;
   tmpState.Mult(output_gradient, weights_output_state);
   state_gradients_backward += tmpState;

   // Accumulate the input-weight gradients (gate gradient transposed times input).
   tmp = input_weight_gradients;
   input_weight_gradients.TMult(input_gate_gradient, input);
   input_weight_gradients += tmp;
   tmp = forget_weight_gradients;
   forget_weight_gradients.TMult(forget_gradient, input);
   forget_weight_gradients += tmp;
   tmp = candidate_weight_gradients;
   candidate_weight_gradients.TMult(candidate_gradient, input);
   candidate_weight_gradients += tmp;
   tmp = output_weight_gradients;
   output_weight_gradients.TMult(output_gradient, input);
   output_weight_gradients += tmp;

   // Accumulate the state-weight gradients (gate gradient transposed times h_{t-1}).
   tmp1 = input_state_weight_gradients;
   input_state_weight_gradients.TMult(input_gate_gradient, precStateActivations);
   input_state_weight_gradients += tmp1;
   tmp1 = forget_state_weight_gradients;
   forget_state_weight_gradients.TMult(forget_gradient, precStateActivations);
   forget_state_weight_gradients += tmp1;
   tmp1 = candidate_state_weight_gradients;
   candidate_state_weight_gradients.TMult(candidate_gradient, precStateActivations);
   candidate_state_weight_gradients += tmp1;
   tmp1 = output_state_weight_gradients;
   output_state_weight_gradients.TMult(output_gradient, precStateActivations);
   output_state_weight_gradients += tmp1;

   // Accumulate the bias gradients: sum each gate gradient over the batch dimension.
   for (size_t j = 0; j < (size_t) df.GetNcols(); j++) {
      Scalar_t sum_inp = 0.0, sum_forget = 0.0, sum_candidate = 0.0, sum_out = 0.0;
      for (size_t i = 0; i < (size_t) df.GetNrows(); i++) {
         sum_inp += input_gate_gradient(i, j);
         sum_forget += forget_gradient(i, j);
         sum_candidate += candidate_gradient(i, j);
         sum_out += output_gradient(i, j);
      }
      input_bias_gradients(j, 0) += sum_inp;
      forget_bias_gradients(j, 0) += sum_forget;
      candidate_bias_gradients(j, 0) += sum_candidate;
      output_bias_gradients(j, 0) += sum_out;
   }

   return input_gradient;
}
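A hedged summary of what this routine accumulates. It assumes the usual LSTM recurrences (written out after the LSTMLayerBackward declaration further below), that on entry cell_gradient holds the derivative of the cell activation evaluated at $C_t$ and cell_tanh holds $\tanh(C_t)$, and that di, df, dc, dout hold the gate activation derivatives; those entry conditions are the caller's responsibility and are not shown in this listing. Writing $\delta h_t$ for the incoming state gradient:

$$\delta C_t = \delta h_t \odot o_t \odot \tanh'(C_t) + \delta C_t^{\,\text{(from } t+1)}, \qquad
\delta C_{t-1} = \delta C_t \odot f_t,$$
$$\delta_i = \delta C_t \odot g_t \odot di, \quad
\delta_f = \delta C_t \odot C_{t-1} \odot df, \quad
\delta_g = \delta C_t \odot i_t \odot dc, \quad
\delta_o = \delta h_t \odot \tanh(C_t) \odot dout .$$

Each gate gradient $\delta$ then contributes $\delta\, W$ to the input gradient, $\delta\, W_{\text{state}}$ to the state gradient passed back, $\delta^{\top} x_t$ and $\delta^{\top} h_{t-1}$ to the corresponding weight gradients, and its column-wise batch sum to the bias gradient, exactly as in the four blocks above.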
template <typename Scalar_t>
auto GRULayerBackward(/* parameters as in the declaration at the end of this listing */) -> Matrix_t &
{
   // The temporaries used below (tmpMul, temp, term, var, tmpInp, tmp, tmp1, tmp2) and the
   // initial values of the three gate-gradient matrices are set in lines omitted from this listing.

   // Reset-gate gradient. reset_gradient enters this loop holding a copy of the update
   // gate u_t, so the loop forms (1 - u_t).
   for (size_t j = 0; j < (size_t) reset_gradient.GetNcols(); j++) {
      for (size_t i = 0; i < (size_t) reset_gradient.GetNrows(); i++) {
         reset_gradient(i, j) = 1 - reset_gradient(i, j);
      }
   }
   Hadamard(reset_gradient, dc);
   Hadamard(reset_gradient, state_gradients_backward);
   tmpMul.Mult(reset_gradient, weights_candidate_state);
   Hadamard(tmpMul, precStateActivations);
   Hadamard(tmpMul, dr);
   reset_gradient = tmpMul;

   // Update-gate gradient. update_gradient enters this loop holding a copy of the previous
   // state h_{t-1}, so the loop forms (h_{t-1} - c_t).
   for (size_t j = 0; j < (size_t) update_gradient.GetNcols(); j++) {
      for (size_t i = 0; i < (size_t) update_gradient.GetNrows(); i++) {
         update_gradient(i, j) = update_gradient(i, j) - fCandidate(i, j);
      }
   }
   Hadamard(update_gradient, du);
   Hadamard(update_gradient, state_gradients_backward);

   // Candidate gradient. candidate_gradient enters this loop holding a copy of the update
   // gate u_t, so the loop forms (1 - u_t).
   for (size_t j = 0; j < (size_t) candidate_gradient.GetNcols(); j++) {
      for (size_t i = 0; i < (size_t) candidate_gradient.GetNrows(); i++) {
         candidate_gradient(i, j) = 1 - candidate_gradient(i, j);
      }
   }
   Hadamard(candidate_gradient, dc);
   Hadamard(candidate_gradient, state_gradients_backward);
   // Gradient flowing back to the previous hidden state, assembled term by term.
   // temp holds a copy of the incoming dL/dh_t; term is re-initialized from the relevant
   // gate activation, and multiplied by the matching activation derivative (du, dc or dr),
   // in lines omitted from this listing.

   // Term 1: u_t * dL/dh_t (element-wise); term holds u_t here.
   Hadamard(term, temp);
   state_gradients_backward = term;

   // Term 2: update-gate path, h_{t-1} part.
   term = precStateActivations;
   Hadamard(term, temp);
   var.Mult(term, weights_update_state);
   term = var;
   state_gradients_backward += term;

   // Term 3: update-gate path, -c_t part; term holds c_t here.
   for (size_t j = 0; j < (size_t) term.GetNcols(); j++) {
      for (size_t i = 0; i < (size_t) term.GetNrows(); i++) {
         term(i, j) = -term(i, j);
      }
   }
   Hadamard(term, temp);
   var.Mult(term, weights_update_state);
   term = var;
   state_gradients_backward += term;

   // Term 4: candidate path, weighted by the reset gate; term holds u_t here.
   for (size_t j = 0; j < (size_t) term.GetNcols(); j++) {
      for (size_t i = 0; i < (size_t) term.GetNrows(); i++) {
         term(i, j) = 1 - term(i, j);
      }
   }
   Hadamard(term, temp);
   var.Mult(term, weights_candidate_state);
   Hadamard(var, fReset);
   term = var;
   state_gradients_backward += term;

   // Term 5: candidate path routed back through the reset gate and weights_reset_state;
   // term holds u_t here.
   for (size_t j = 0; j < (size_t) term.GetNcols(); j++) {
      for (size_t i = 0; i < (size_t) term.GetNrows(); i++) {
         term(i, j) = 1 - term(i, j);
      }
   }
   Hadamard(term, temp);
   var.Mult(term, weights_candidate_state);
   Hadamard(var, precStateActivations);
   term.Mult(var, weights_reset_state);
   state_gradients_backward += term;
   // Gradient with respect to the layer input.
   tmpInp.Mult(reset_gradient, weights_reset);
   input_gradient = tmpInp;
   tmpInp.Mult(update_gradient, weights_update);
   input_gradient += tmpInp;
   tmpInp.Mult(candidate_gradient, weights_candidate);
   input_gradient += tmpInp;

   // Accumulate the input-weight gradients (gate gradient transposed times input).
   tmp = reset_weight_gradients;
   reset_weight_gradients.TMult(reset_gradient, input);
   reset_weight_gradients += tmp;
   tmp = update_weight_gradients;
   update_weight_gradients.TMult(update_gradient, input);
   update_weight_gradients += tmp;
   tmp = candidate_weight_gradients;
   candidate_weight_gradients.TMult(candidate_gradient, input);
   candidate_weight_gradients += tmp;

   // Accumulate the state-weight gradients; the candidate pre-activation sees
   // r_t * h_{t-1} (element-wise), so its state-weight gradient is taken against that product.
   tmp1 = reset_state_weight_gradients;
   reset_state_weight_gradients.TMult(reset_gradient, precStateActivations);
   reset_state_weight_gradients += tmp1;
   tmp1 = update_state_weight_gradients;
   update_state_weight_gradients.TMult(update_gradient, precStateActivations);
   update_state_weight_gradients += tmp1;
   tmp1 = candidate_state_weight_gradients;
   Hadamard(tmp2, precStateActivations);   // tmp2 enters holding a copy of fReset
   candidate_state_weight_gradients.TMult(candidate_gradient, tmp2);
   candidate_state_weight_gradients += tmp1;

   // Accumulate the bias gradients: sum each gate gradient over the batch dimension.
   for (size_t j = 0; j < (size_t) du.GetNcols(); j++) {
      Scalar_t sum_reset = 0.0, sum_update = 0.0, sum_candidate = 0.0;
      for (size_t i = 0; i < (size_t) du.GetNrows(); i++) {
         sum_reset += reset_gradient(i, j);
         sum_update += update_gradient(i, j);
         sum_candidate += candidate_gradient(i, j);
      }
      reset_bias_gradients(j, 0) += sum_reset;
      update_bias_gradients(j, 0) += sum_update;
      candidate_bias_gradients(j, 0) += sum_candidate;
   }

   return input_gradient;
}
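A hedged summary of the gate gradients assembled above, using the mixing convention implied by this code (spelled out after the GRULayerBackward declaration below), with dr, du, dc the gate activation derivatives supplied by the caller and $\delta h_t$ the incoming state gradient:

$$\delta_c = (1-u_t) \odot dc \odot \delta h_t, \qquad
\delta_u = (h_{t-1} - c_t) \odot du \odot \delta h_t, \qquad
\delta_r = h_{t-1} \odot dr \odot \big(\delta_c\, W_{\text{candidate\_state}}\big),$$
$$\frac{\partial L}{\partial h_{t-1}} = u_t \odot \delta h_t
 + \delta_u\, W_{\text{update\_state}}
 + r_t \odot \big(\delta_c\, W_{\text{candidate\_state}}\big)
 + \delta_r\, W_{\text{reset\_state}} .$$

The input gradient is $\delta_r W_{\text{reset}} + \delta_u W_{\text{update}} + \delta_c W_{\text{candidate}}$; the weight gradients are $\delta^{\top} x_t$ per gate, and $\delta_r^{\top} h_{t-1}$, $\delta_u^{\top} h_{t-1}$, $\delta_c^{\top} (r_t \odot h_{t-1})$ for the state weights; the bias gradients are the column-wise batch sums.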
static Matrix_t & GRULayerBackward(TMatrixT< Scalar_t > &state_gradients_backward, TMatrixT< Scalar_t > &reset_weight_gradients, TMatrixT< Scalar_t > &update_weight_gradients, TMatrixT< Scalar_t > &candidate_weight_gradients, TMatrixT< Scalar_t > &reset_state_weight_gradients, TMatrixT< Scalar_t > &update_state_weight_gradients, TMatrixT< Scalar_t > &candidate_state_weight_gradients, TMatrixT< Scalar_t > &reset_bias_gradients, TMatrixT< Scalar_t > &update_bias_gradients, TMatrixT< Scalar_t > &candidate_bias_gradients, TMatrixT< Scalar_t > &dr, TMatrixT< Scalar_t > &du, TMatrixT< Scalar_t > &dc, const TMatrixT< Scalar_t > &precStateActivations, const TMatrixT< Scalar_t > &fReset, const TMatrixT< Scalar_t > &fUpdate, const TMatrixT< Scalar_t > &fCandidate, const TMatrixT< Scalar_t > &weights_reset, const TMatrixT< Scalar_t > &weights_update, const TMatrixT< Scalar_t > &weights_candidate, const TMatrixT< Scalar_t > &weights_reset_state, const TMatrixT< Scalar_t > &weights_update_state, const TMatrixT< Scalar_t > &weights_candidate_state, const TMatrixT< Scalar_t > &input, TMatrixT< Scalar_t > &input_gradient)
Backward pass for a GRU network.
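For orientation, the forward recurrences this backward pass corresponds to, written with the parameter names above. These are the standard GRU equations; sigma and tanh are the typical activation choices (in the code their derivatives arrive pre-computed as dr, du, dc), the row-vector/transpose convention is inferred from the Mult/TMult usage, and the mixing convention in the last line is the one implied by the gradient code earlier in this listing:

$$r_t = \sigma\big(x_t W_{\text{reset}}^{\top} + h_{t-1} W_{\text{reset\_state}}^{\top} + b_r\big), \qquad
u_t = \sigma\big(x_t W_{\text{update}}^{\top} + h_{t-1} W_{\text{update\_state}}^{\top} + b_u\big),$$
$$c_t = \tanh\big(x_t W_{\text{candidate}}^{\top} + (r_t \odot h_{t-1})\, W_{\text{candidate\_state}}^{\top} + b_c\big), \qquad
h_t = u_t \odot h_{t-1} + (1-u_t) \odot c_t .$$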
static Matrix_t & LSTMLayerBackward(TMatrixT< Scalar_t > &state_gradients_backward, TMatrixT< Scalar_t > &cell_gradients_backward, TMatrixT< Scalar_t > &input_weight_gradients, TMatrixT< Scalar_t > &forget_weight_gradients, TMatrixT< Scalar_t > &candidate_weight_gradients, TMatrixT< Scalar_t > &output_weight_gradients, TMatrixT< Scalar_t > &input_state_weight_gradients, TMatrixT< Scalar_t > &forget_state_weight_gradients, TMatrixT< Scalar_t > &candidate_state_weight_gradients, TMatrixT< Scalar_t > &output_state_weight_gradients, TMatrixT< Scalar_t > &input_bias_gradients, TMatrixT< Scalar_t > &forget_bias_gradients, TMatrixT< Scalar_t > &candidate_bias_gradients, TMatrixT< Scalar_t > &output_bias_gradients, TMatrixT< Scalar_t > &di, TMatrixT< Scalar_t > &df, TMatrixT< Scalar_t > &dc, TMatrixT< Scalar_t > &dout, const TMatrixT< Scalar_t > &precStateActivations, const TMatrixT< Scalar_t > &precCellActivations, const TMatrixT< Scalar_t > &fInput, const TMatrixT< Scalar_t > &fForget, const TMatrixT< Scalar_t > &fCandidate, const TMatrixT< Scalar_t > &fOutput, const TMatrixT< Scalar_t > &weights_input, const TMatrixT< Scalar_t > &weights_forget, const TMatrixT< Scalar_t > &weights_candidate, const TMatrixT< Scalar_t > &weights_output, const TMatrixT< Scalar_t > &weights_input_state, const TMatrixT< Scalar_t > &weights_forget_state, const TMatrixT< Scalar_t > &weights_candidate_state, const TMatrixT< Scalar_t > &weights_output_state, const TMatrixT< Scalar_t > &input, TMatrixT< Scalar_t > &input_gradient, TMatrixT< Scalar_t > &cell_gradient, TMatrixT< Scalar_t > &cell_tanh)
Backward pass for an LSTM network.
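For orientation, the forward recurrences this backward pass corresponds to, written with the parameter names above (standard LSTM equations; sigma and tanh are the typical activation choices, with the derivatives arriving pre-computed as di, df, dc, dout):

$$i_t = \sigma\big(x_t W_{\text{input}}^{\top} + h_{t-1} W_{\text{input\_state}}^{\top} + b_i\big), \qquad
f_t = \sigma\big(x_t W_{\text{forget}}^{\top} + h_{t-1} W_{\text{forget\_state}}^{\top} + b_f\big),$$
$$g_t = \tanh\big(x_t W_{\text{candidate}}^{\top} + h_{t-1} W_{\text{candidate\_state}}^{\top} + b_g\big), \qquad
o_t = \sigma\big(x_t W_{\text{output}}^{\top} + h_{t-1} W_{\text{output\_state}}^{\top} + b_o\big),$$
$$C_t = f_t \odot C_{t-1} + i_t \odot g_t, \qquad h_t = o_t \odot \tanh(C_t).$$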
static Matrix_t & RecurrentLayerBackward(TMatrixT< Scalar_t > &state_gradients_backward, TMatrixT< Scalar_t > &input_weight_gradients, TMatrixT< Scalar_t > &state_weight_gradients, TMatrixT< Scalar_t > &bias_gradients, TMatrixT< Scalar_t > &df, const TMatrixT< Scalar_t > &state, const TMatrixT< Scalar_t > &weights_input, const TMatrixT< Scalar_t > &weights_state, const TMatrixT< Scalar_t > &input, TMatrixT< Scalar_t > &input_gradient)
Backpropagation step for a Recurrent Neural Network.
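A minimal usage sketch for the simplest of the three routines, showing only the expected matrix shapes. The class qualification TMVA::DNN::TReference<double> and the header path are assumptions about where the declaration above lives, not taken from this listing; adapt them to your build.

#include "TMatrixT.h"
#include "TMVA/DNN/Architectures/Reference.h"   // assumed header for the declarations above

void RecurrentBackwardSketch()
{
   const int B = 4, H = 8, D = 16;    // example batch, state and input sizes

   TMatrixT<double> dh(B, H);         // incoming dL/dh_t; overwritten with dL/dh_{t-1}
   TMatrixT<double> df(B, H);         // f'(a_t), filled during the forward pass
   TMatrixT<double> hPrev(B, H);      // h_{t-1}
   TMatrixT<double> x(B, D);          // x_t
   TMatrixT<double> wIn(H, D);        // weights_input
   TMatrixT<double> wState(H, H);     // weights_state
   TMatrixT<double> dwIn(H, D), dwState(H, H), db(H, 1);   // gradient accumulators
   TMatrixT<double> dx(B, D);         // receives dL/dx_t

   // Accumulates the weight and bias gradients, overwrites dh with the state gradient
   // for the previous time step, and returns dx (the input_gradient argument).
   TMVA::DNN::TReference<double>::RecurrentLayerBackward(dh, dwIn, dwState, db, df,
                                                         hPrev, wIn, wState, x, dx);
}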
void TMult(const TMatrixT< Element > &a, const TMatrixT< Element > &b)
Create a matrix C such that C = A' * B.
void Mult(const TMatrixT< Element > &a, const TMatrixT< Element > &b)
General matrix multiplication. Create a matrix C such that C = A * B.
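A small, self-contained illustration of these two TMatrixT products, which appear throughout the listing above (shapes chosen arbitrarily for the example):

#include "TMatrixT.h"

void MatrixProductSketch()
{
   TMatrixT<double> A(2, 3), B(2, 4);   // A is 2x3, B is 2x4
   TMatrixT<double> C(3, 4);
   C.TMult(A, B);                       // C = A' * B, i.e. (3x2)*(2x4) = 3x4

   TMatrixT<double> D(2, 4), E(4, 5), F(2, 5);
   F.Mult(D, E);                        // F = D * E, i.e. (2x4)*(4x5) = 2x5
}

The target matrix must already have the shape of the product, which is why the temporary matrices in the listing above are created with the appropriate dimensions before the Mult and TMult calls.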