// NOTE(review): extraction artifact -- the original file's line numbers are
// fused into each statement (e.g. "43 if (...)") and interior lines are
// missing (25 -> 43), so this is a fragment, not compilable C++.  The code
// tokens below are preserved byte-for-byte.
25template<
typename AFloat>
// Fragment of RecurrentLayerBackward (vanilla RNN backward pass; the full
// parameter list appears in the signature text near the end of this file).
// df holds the elementwise derivative of the state activation function.
// Input gradient: presumably dx = df x W_input (confirm Multiply operand
// order against TCpu/TCpuMatrix).  Guarded so empty outputs are skipped.
43 if (input_gradient.GetNoElements() > 0) {
44 Multiply(input_gradient, df, weights_input);
// Previous-state gradient: df x W_state.
48 if (state_gradients_backward.GetNoElements() > 0) {
49 Multiply(state_gradients_backward, df, weights_state);
// Input-weight gradient: presumably accumulates df^T x input with
// alpha = beta = 1 -- TODO confirm TransposeMultiply's alpha/beta semantics.
57 if (input_weight_gradients.GetNoElements() > 0) {
58 TransposeMultiply(input_weight_gradients, df,
input, 1. , 1.);
// State-weight gradient: accumulates df^T x state.
61 if (state_weight_gradients.GetNoElements() > 0) {
62 TransposeMultiply(state_weight_gradients, df, state, 1. , 1. );
// Bias gradient: accumulates the column sums of df.
66 if (bias_gradients.GetNoElements() > 0) {
67 SumColumns(bias_gradients, df, 1., 1.);
70 return input_gradient;
74template <
typename Scalar_t>
// Fragment of LSTMLayerBackward (full parameter list appears in the signature
// text near the end of this file).  NOTE(review): extraction artifact -- the
// original line numbers are fused into each statement and several declaration
// lines (e.g. tmpInp and the gate-gradient matrices) are missing from this
// view.  di/df/dc/dout are the elementwise gate-activation derivatives;
// fInput/fForget/fCandidate/fOutput are the cached forward gate values.
// Scratch matrix for accumulating the recurrent (state) gradient terms.
115 TCpuMatrix<Scalar_t> tmpState(state_gradients_backward.GetNrows(), state_gradients_backward.GetNcols());
// Cell gradient: dC = o_t . dh (Hadamard = elementwise product), plus the
// cell gradient flowing back from the later time step; the gradient passed
// to the previous cell state is dC . f_t.
123 Hadamard(cell_gradient, fOutput);
124 Hadamard(cell_gradient, state_gradients_backward);
125 ScaleAdd(cell_gradient, cell_gradients_backward);
126 Copy(cell_gradients_backward, cell_gradient);
127 Hadamard(cell_gradients_backward, fForget);
// Candidate-gate gradient: dC . i_t . dc.
130 Copy(candidate_gradient, cell_gradient);
131 Hadamard(candidate_gradient, fInput);
132 Hadamard(candidate_gradient, dc);
// Input-gate gradient: dC . g_t . di.
135 Copy(input_gate_gradient, cell_gradient);
136 Hadamard(input_gate_gradient, fCandidate);
137 Hadamard(input_gate_gradient, di);
// Forget-gate gradient: dC . C_{t-1} . df.
140 Copy(forget_gradient, cell_gradient);
141 Hadamard(forget_gradient, precCellActivations);
142 Hadamard(forget_gradient, df);
// Output-gate gradient: tanh(C_t) . dh . dout.
145 Copy(output_gradient, cell_tanh);
146 Hadamard(output_gradient, state_gradients_backward);
147 Hadamard(output_gradient, dout);
// Input gradient: sum over the four gates of (gate gradient x input weights).
// tmpInp is declared on a line missing from this fragment.
150 Multiply(tmpInp, input_gate_gradient, weights_input);
151 Copy(input_gradient, tmpInp);
152 Multiply(tmpInp, forget_gradient, weights_forget);
153 ScaleAdd(input_gradient, tmpInp);
154 Multiply(tmpInp, candidate_gradient, weights_candidate);
155 ScaleAdd(input_gradient, tmpInp);
156 Multiply(tmpInp, output_gradient, weights_output);
157 ScaleAdd(input_gradient, tmpInp);
// Previous-hidden-state gradient: sum over the four gates of
// (gate gradient x recurrent state weights).
160 Multiply(tmpState, input_gate_gradient, weights_input_state);
161 Copy(state_gradients_backward, tmpState);
162 Multiply(tmpState, forget_gradient, weights_forget_state);
163 ScaleAdd(state_gradients_backward, tmpState);
164 Multiply(tmpState, candidate_gradient, weights_candidate_state);
165 ScaleAdd(state_gradients_backward, tmpState);
166 Multiply(tmpState, output_gradient, weights_output_state);
167 ScaleAdd(state_gradients_backward, tmpState);
// Input-weight gradients: presumably accumulate gate_gradient^T x input with
// alpha = beta = 1 -- TODO confirm TransposeMultiply's alpha/beta semantics.
170 TransposeMultiply(input_weight_gradients, input_gate_gradient,
input, 1. , 1.);
171 TransposeMultiply(forget_weight_gradients, forget_gradient,
input, 1. , 1.);
172 TransposeMultiply(candidate_weight_gradients, candidate_gradient,
input, 1. , 1.);
173 TransposeMultiply(output_weight_gradients, output_gradient,
input, 1. , 1.);
// Recurrent-weight gradients: accumulate against the previous hidden state.
176 TransposeMultiply(input_state_weight_gradients, input_gate_gradient, precStateActivations, 1. , 1. );
177 TransposeMultiply(forget_state_weight_gradients, forget_gradient, precStateActivations, 1. , 1. );
178 TransposeMultiply(candidate_state_weight_gradients, candidate_gradient, precStateActivations, 1. , 1. );
179 TransposeMultiply(output_state_weight_gradients, output_gradient, precStateActivations, 1. , 1. );
// Bias gradients: accumulate the column sums of each gate gradient.
182 SumColumns(input_bias_gradients, input_gate_gradient, 1., 1.);
183 SumColumns(forget_bias_gradients, forget_gradient, 1., 1.);
184 SumColumns(candidate_bias_gradients, candidate_gradient, 1., 1.);
185 SumColumns(output_bias_gradients, output_gradient, 1., 1.);
187 return input_gradient;
192template <
typename Scalar_t>
// Fragment of GRULayerBackward (full parameter list appears in the signature
// text near the end of this file).  NOTE(review): extraction artifact -- the
// original line numbers are fused into each statement and several declaration
// lines (tmpMul, tmpInp, temp, term, var, tempvar, the gate-gradient
// matrices) are missing from this view.  dr/du/dc are the gate-activation
// derivatives; fReset/fUpdate/fCandidate the cached forward gate values;
// resetGateAfter selects the variant where the reset gate is applied after
// the recurrent matrix product (cuDNN-style) instead of before (vanilla GRU).
// r/c are not used anywhere in the visible fragment -- possibly consumed on
// missing lines; verify against the full source.
222 int r = fUpdate.GetNrows(),
c = fUpdate.GetNcols();
// Reset-gate gradient, step 1: (1 - z_t) . dc . dh; the explicit loops
// compute 1 - z_t in place (loop closing braces are on missing lines).
224 Copy(reset_gradient, fUpdate);
225 for (
size_t j = 0; j < (size_t)reset_gradient.
GetNcols(); j++) {
226 for (
size_t i = 0; i < (size_t)reset_gradient.
GetNrows(); i++) {
227 reset_gradient(i, j) = 1 - reset_gradient(i, j);
230 Hadamard(reset_gradient, dc);
231 Hadamard(reset_gradient, state_gradients_backward);
// Step 2: route through the candidate recurrent weights.  The branches
// differ in where the reset gate acts: on h_{t-1} directly (vanilla) or on
// the h_{t-1} x W product (resetGateAfter).
234 if (!resetGateAfter) {
237 Multiply(tmpMul, reset_gradient, weights_candidate_state);
238 Hadamard(tmpMul, precStateActivations);
241 MultiplyTranspose(tmpMul, precStateActivations, weights_candidate_state);
242 Hadamard(tmpMul, reset_gradient);
244 Hadamard(tmpMul, dr);
245 Copy(reset_gradient, tmpMul);
// Update-gate gradient: (h_{t-1} - g_t) . du . dh.
249 Copy(update_gradient, precStateActivations);
250 for (
size_t j = 0; j < (size_t)update_gradient.
GetNcols(); j++) {
251 for (
size_t i = 0; i < (size_t)update_gradient.
GetNrows(); i++) {
252 update_gradient(i, j) = update_gradient(i, j) - fCandidate(i, j);
255 Hadamard(update_gradient, du);
256 Hadamard(update_gradient, state_gradients_backward);
// Candidate-gate gradient: (1 - z_t) . dc . dh.
260 Copy(candidate_gradient, fUpdate);
261 for (
size_t j = 0; j < (size_t)candidate_gradient.
GetNcols(); j++) {
262 for (
size_t i = 0; i < (size_t)candidate_gradient.
GetNrows(); i++) {
263 candidate_gradient(i, j) = 1 - candidate_gradient(i, j);
266 Hadamard(candidate_gradient, dc);
267 Hadamard(candidate_gradient, state_gradients_backward);
// Gradient wrt the previous hidden state, built term by term.  temp caches
// the incoming dh.  `term` is (re)initialized on lines missing from this
// fragment, so the exact contents of each term cannot be confirmed here.
272 Copy(temp, state_gradients_backward);
275 Hadamard(term, temp);
276 Copy(state_gradients_backward, term);
// Term through the update gate from h_{t-1}.  NOTE(review): `var` from the
// Multiply is not visibly consumed before the ScaleAdd of `term` -- lines
// are missing between 283 and 285, so the real data flow is not visible.
279 Copy(term, precStateActivations);
281 Hadamard(term, temp);
283 Multiply(var, term, weights_update_state);
285 ScaleAdd(state_gradients_backward, term);
// Term through the update gate from the candidate: starts from -g_t.
288 Copy(term, fCandidate);
289 for (
size_t j = 0; j < (size_t)term.
GetNcols(); j++) {
290 for (
size_t i = 0; i < (size_t)term.
GetNrows(); i++) {
291 term(i, j) = -term(i, j);
295 Hadamard(term, temp);
296 Multiply(var, term, weights_update_state);
298 ScaleAdd(state_gradients_backward, term);
// Term through the candidate gate: 1 - term elementwise (term's source is
// on missing lines), then routed through the candidate recurrent weights
// with the reset gate applied per the resetGateAfter variant.
302 for (
size_t j = 0; j < (size_t)term.
GetNcols(); j++) {
303 for (
size_t i = 0; i < (size_t)term.
GetNrows(); i++) {
304 term(i, j) = 1 - term(i, j);
308 Hadamard(term, temp);
310 if (!resetGateAfter) {
313 Multiply(var, term, weights_candidate_state);
314 Hadamard(var, fReset);
318 Hadamard(term, fReset);
319 Multiply(var, term, weights_candidate_state);
323 ScaleAdd(state_gradients_backward, term);
// Term through the reset gate, finally routed through W_reset_state.
327 for (
size_t j = 0; j < (size_t)term.
GetNcols(); j++) {
328 for (
size_t i = 0; i < (size_t)term.
GetNrows(); i++) {
329 term(i, j) = 1 - term(i, j);
334 Hadamard(term, temp);
335 if (!resetGateAfter) {
338 Multiply(var, term, weights_candidate_state);
339 Hadamard(var, precStateActivations);
342 MultiplyTranspose(var, precStateActivations, weights_candidate_state);
346 Multiply(term, var, weights_reset_state);
347 ScaleAdd(state_gradients_backward, term);
// Input gradient: sum over the three gates of (gate gradient x input weights).
351 Multiply(tmpInp, reset_gradient, weights_reset);
352 Copy(input_gradient, tmpInp);
353 Multiply(tmpInp, update_gradient, weights_update);
354 ScaleAdd(input_gradient, tmpInp);
355 Multiply(tmpInp, candidate_gradient, weights_candidate);
356 ScaleAdd(input_gradient, tmpInp);
// Input-weight gradients: presumably accumulate gate_gradient^T x input --
// TODO confirm TransposeMultiply's alpha/beta semantics.
359 TransposeMultiply(reset_weight_gradients, reset_gradient,
input, 1., 1.);
360 TransposeMultiply(update_weight_gradients, update_gradient,
input, 1., 1.);
361 TransposeMultiply(candidate_weight_gradients, candidate_gradient,
input, 1., 1.);
// Recurrent-weight gradients for the reset and update gates.
364 TransposeMultiply(reset_state_weight_gradients, reset_gradient, precStateActivations, 1., 1.);
365 TransposeMultiply(update_state_weight_gradients, update_gradient, precStateActivations, 1., 1.);
// Candidate recurrent-weight gradient: the reset gate multiplies the
// previous state (vanilla) or the candidate gradient (resetGateAfter).
370 if (!resetGateAfter) {
372 Copy(tempvar, precStateActivations);
373 Hadamard(tempvar, fReset);
374 TransposeMultiply(candidate_state_weight_gradients, candidate_gradient, tempvar, 1., 1.);
378 Copy(tempvar, candidate_gradient);
379 Hadamard(tempvar, fReset);
380 TransposeMultiply(candidate_state_weight_gradients, tempvar, precStateActivations, 1., 1.);
// Bias gradients: column sums of each gate gradient.
384 SumColumns(reset_bias_gradients, reset_gradient, 1., 1.);
385 SumColumns(update_bias_gradients, update_gradient, 1., 1.);
386 SumColumns(candidate_bias_gradients, candidate_gradient, 1., 1.);
388 return input_gradient;
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t r
static Matrix_t & LSTMLayerBackward(TCpuMatrix< Scalar_t > &state_gradients_backward, TCpuMatrix< Scalar_t > &cell_gradients_backward, TCpuMatrix< Scalar_t > &input_weight_gradients, TCpuMatrix< Scalar_t > &forget_weight_gradients, TCpuMatrix< Scalar_t > &candidate_weight_gradients, TCpuMatrix< Scalar_t > &output_weight_gradients, TCpuMatrix< Scalar_t > &input_state_weight_gradients, TCpuMatrix< Scalar_t > &forget_state_weight_gradients, TCpuMatrix< Scalar_t > &candidate_state_weight_gradients, TCpuMatrix< Scalar_t > &output_state_weight_gradients, TCpuMatrix< Scalar_t > &input_bias_gradients, TCpuMatrix< Scalar_t > &forget_bias_gradients, TCpuMatrix< Scalar_t > &candidate_bias_gradients, TCpuMatrix< Scalar_t > &output_bias_gradients, TCpuMatrix< Scalar_t > &di, TCpuMatrix< Scalar_t > &df, TCpuMatrix< Scalar_t > &dc, TCpuMatrix< Scalar_t > &dout, const TCpuMatrix< Scalar_t > &precStateActivations, const TCpuMatrix< Scalar_t > &precCellActivations, const TCpuMatrix< Scalar_t > &fInput, const TCpuMatrix< Scalar_t > &fForget, const TCpuMatrix< Scalar_t > &fCandidate, const TCpuMatrix< Scalar_t > &fOutput, const TCpuMatrix< Scalar_t > &weights_input, const TCpuMatrix< Scalar_t > &weights_forget, const TCpuMatrix< Scalar_t > &weights_candidate, const TCpuMatrix< Scalar_t > &weights_output, const TCpuMatrix< Scalar_t > &weights_input_state, const TCpuMatrix< Scalar_t > &weights_forget_state, const TCpuMatrix< Scalar_t > &weights_candidate_state, const TCpuMatrix< Scalar_t > &weights_output_state, const TCpuMatrix< Scalar_t > &input, TCpuMatrix< Scalar_t > &input_gradient, TCpuMatrix< Scalar_t > &cell_gradient, TCpuMatrix< Scalar_t > &cell_tanh)
Backward pass for an LSTM network.
static Matrix_t & RecurrentLayerBackward(Matrix_t &state_gradients_backward, Matrix_t &input_weight_gradients, Matrix_t &state_weight_gradients, Matrix_t &bias_gradients, Matrix_t &df, const Matrix_t &state, const Matrix_t &weights_input, const Matrix_t &weights_state, const Matrix_t &input, Matrix_t &input_gradient)
Backward pass for recurrent networks.
static Matrix_t & GRULayerBackward(TCpuMatrix< Scalar_t > &state_gradients_backward, TCpuMatrix< Scalar_t > &reset_weight_gradients, TCpuMatrix< Scalar_t > &update_weight_gradients, TCpuMatrix< Scalar_t > &candidate_weight_gradients, TCpuMatrix< Scalar_t > &reset_state_weight_gradients, TCpuMatrix< Scalar_t > &update_state_weight_gradients, TCpuMatrix< Scalar_t > &candidate_state_weight_gradients, TCpuMatrix< Scalar_t > &reset_bias_gradients, TCpuMatrix< Scalar_t > &update_bias_gradients, TCpuMatrix< Scalar_t > &candidate_bias_gradients, TCpuMatrix< Scalar_t > &dr, TCpuMatrix< Scalar_t > &du, TCpuMatrix< Scalar_t > &dc, const TCpuMatrix< Scalar_t > &precStateActivations, const TCpuMatrix< Scalar_t > &fReset, const TCpuMatrix< Scalar_t > &fUpdate, const TCpuMatrix< Scalar_t > &fCandidate, const TCpuMatrix< Scalar_t > &weights_reset, const TCpuMatrix< Scalar_t > &weights_update, const TCpuMatrix< Scalar_t > &weights_candidate, const TCpuMatrix< Scalar_t > &weights_reset_state, const TCpuMatrix< Scalar_t > &weights_update_state, const TCpuMatrix< Scalar_t > &weights_candidate_state, const TCpuMatrix< Scalar_t > &input, TCpuMatrix< Scalar_t > &input_gradient, bool resetGateAfter)
Backward pass for a GRU network.
create variable transformations