Logo ROOT  
Reference Guide
Cuda.h
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Simon Pfreundschuh 05/07/16
3
4/*************************************************************************
5 * Copyright (C) 2016, Simon Pfreundschuh *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12///////////////////////////////////////////////////////////////////
13// Definition of the TCuda architecture class, which provides an //
14// implementation of the low-level functionality for neural //
15// networks for the CUDA computing architectures. //
16///////////////////////////////////////////////////////////////////
17
18#ifndef TMVA_DNN_ARCHITECTURES_CUDA
19#define TMVA_DNN_ARCHITECTURES_CUDA
20
21#include "TMVA/DNN/Functions.h"
26
27
28#include "cuda.h"
29#include "Cuda/CudaBuffers.h"
30#include "Cuda/CudaMatrix.h"
31#include "Cuda/CudaTensor.h"
32#include "TMVA/DNN/DataLoader.h"
33#include <utility>
34#include <vector>
35
36class TRandom;
37
38namespace TMVA
39{
40namespace DNN
41{
/** Empty tag type: it declares no members. */
struct CudaDataType {
};
/** Empty placeholder type: it declares no members. */
struct DummyType {
};
52
54
55/** The TCuda architecture class.
56 *
57 * Low-level interface class for CUDA computing architectures. Contains as
58 * public types the declaration of the scalar, matrix and buffer types
59 * for this architecture as well as the remaining functions in the low-level
60 * interface in the form of static members.
61 */
62template<typename AReal = Float_t>
63class TCuda
64{
65private:
67public:
68
69 using AFloat = AReal;
71
76
81 //using OpTensorDescriptor_t = CudaOpTensorDescriptor;
84 //using ReductionDescriptor_t = CudaReduceTensorDescriptor;
90
91 using EmptyDescriptor_t = CudaEmptyDescriptor; // Used if a descriptor is not needed in a class
92
95
102
104
105 static Tensor_t CreateTensor(size_t n, size_t c, size_t h, size_t w) {
106 return Tensor_t( {c,h*w,n}, GetTensorLayout());
107 }
108 static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t h, size_t w) {
109 return Tensor_t( buffer, {c,h*w, n}, GetTensorLayout(), 0, 0);
110 }
111
112 // create a weight tensor/matrix from another tensor using its shape
113 // static Matrix_t CreateWeightTensor( Matrix_t & A) {
114 // return Matrix_t( A.GetNrows(), A.GetNcols());
115 // }
116 // create a weight tensor/matrix vector from another tensor/weight vector using the given tensor shapes
117 // this function is used by the optimizers to store intermediate weight representations
118 static void CreateWeightTensors( std::vector<Matrix_t> & newWeights, const std::vector<Matrix_t> & weights) {
119 if (!newWeights.empty()) newWeights.clear();
120 size_t n = weights.size();
121 for (size_t i = 0; i < n; ++i)
122 newWeights.emplace_back( weights[i].GetNrows(), weights[i].GetNcols());
123 }
124
125 //____________________________________________________________________________
126 //
127 // Architecture Initialization
128 //____________________________________________________________________________
129
130 /** Initialize CNN data/operator descriptors. Not used at the moment.*/
131
132 static void InitializeBNormDescriptors(TDescriptors * & /*descriptors*/,
133 BNormLayer_t */*L = nullptr*/) {
134 Error("InitializeBNormDescriptrs", "Batch normalization on GPU is supported only with Cudnn");
135 }
136
137 static void InitializeConvDescriptors(TDescriptors *& /*descriptors*/, ConvLayer_t * /*L = nullptr*/) {}
138
139 static void InitializePoolDescriptors(TDescriptors *& /*descriptors*/, PoolingLayer_t * /*L = nullptr*/) {}
140
141 static void InitializeActivationDescriptor(ActivationDescriptor_t &/*descriptors*/, EActivationFunction /*activFunc */ , double /*coef*/ = 0.0) {}
142
143 /** Release CNN data/operator descriptors. Not used at the moment.*/
144 static void ReleaseConvDescriptors(TDescriptors * & /*descriptors*/) {}
145 static void ReleasePoolDescriptors(TDescriptors * & /*descriptors*/) {}
146 static void ReleaseBNormDescriptors(TDescriptors *& /*descriptors*/) {}
147
148 static void InitializeConvWorkspace(TWorkspace * & /*workspace*/,
149 TDescriptors * & /*descriptors*/,
150 const DNN::CNN::TConvParams & /*params*/,
151 ConvLayer_t */*L = nullptr*/) {}
152 static void InitializePoolDropoutWorkspace(TWorkspace * & /*workspace*/,
153 TDescriptors * & /*descriptors*/,
154 const DNN::CNN::TConvParams & /*params*/,
155 PoolingLayer_t */*L = nullptr*/) {}
156
157 static void ReleaseDescriptor(ActivationDescriptor_t & /*activationDescr*/) {}
158
159 static void FreeConvWorkspace(TWorkspace * & /*workspace*/, ConvLayer_t */*L = nullptr*/) {} ///< Only used for certain cudnn on-device memory
160 static void FreePoolDropoutWorkspace(TWorkspace * & /*workspace*/, PoolingLayer_t */*L = nullptr*/) {}
161
162
163 //____________________________________________________________________________
164 //
165 // Propagation
166 //____________________________________________________________________________
167
168 /** @name Forward Propagation
169 * Low-level functions required for the forward propagation of activations
170 * through the network.
171 */
172 ///@{
173 /** Matrix-multiply \p input with the transpose of \p weights and
174 * write the results into \p output. */
175 static void MultiplyTranspose(Matrix_t &output, const Matrix_t &input, const Matrix_t &weights);
176
177 static void MultiplyTranspose(Tensor_t &output, const Tensor_t &input, const Matrix_t &weights) {
178 Matrix_t output_matrix = output.GetMatrix();
179 MultiplyTranspose( output_matrix, input.GetMatrix(), weights);
180 //ensor_t::MatrixToTensor(output_matrix, output); // this maybe is not needed
181 }
182
183 /** Add the vectors biases row-wise to the matrix output */
184 static void AddRowWise(Matrix_t &output,const Matrix_t &biases);
185
186 static void AddRowWise(Tensor_t &output, const Matrix_t &biases) {
187 Matrix_t output_matrix = output.GetMatrix();
188 AddRowWise(output_matrix, biases);
189 //Tensor_t::MatrixToTensor(output_matrix, output); // this maybe is not needed
190 }
191
192 /** @name Backward Propagation (Dense Layers)
193 * Low-level functions required for the backward propagation of gradients
194 * through the network.
195 */
196 ///@{
197 /** Perform the complete backward propagation step. If the provided
198 * \p activationGradientsBackward matrix is not empty, compute the
199 * gradients of the objective function with respect to the activations
200 * of the previous layer (backward direction).
201 * Also compute the weight and the bias gradients. Modifies the values
202 * in \p df and thus produces only a valid result, if it is applied the
203 * first time after the corresponding forward propagation has been per-
204 * formed. */
205 static void Backward(Tensor_t & activationGradientsBackward,
206 Matrix_t & weightGradients,
207 Matrix_t & biasGradients,
208 const Tensor_t & df,
209 const Tensor_t & activationGradients,
210 const Matrix_t & weights,
211 const Tensor_t & activationBackward);
212
213 /** Adds the elements in matrix B, scaled by \p beta, to the elements in
214 * the matrix A. This is required for the weight update in the gradient
215 * descent step.*/
216 static void ScaleAdd(Matrix_t & A,
217 const Matrix_t & B,
218 Scalar_t beta = 1.0);
219
220 static void Copy(Matrix_t & B,
221 const Matrix_t & A);
222
223 // copy from another type of matrix
224 template<typename AMatrix_t>
225 static void CopyDiffArch(Matrix_t & B, const AMatrix_t & A);
226
227
228 /** Above functions extended to vectors */
229 static void ScaleAdd(Tensor_t & A,
230 const Tensor_t & B,
231 Scalar_t beta = 1.0);
232
233 static void Copy(Tensor_t & A,
234 const Tensor_t & B);
235
236 // copy from another tensor
237 template<typename ATensor_t>
238 static void CopyDiffArch(Tensor_t & A,
239 const ATensor_t & B);
240
241 // copy from vector of matrices of different types
242 template<typename AMatrix_t>
243 static void CopyDiffArch(std::vector<Matrix_t> & A,
244 const std::vector<AMatrix_t> & B);
245
246 ///@}
247
248 //____________________________________________________________________________
249 //
250 // Activation Functions
251 //____________________________________________________________________________
252
253 /** @name Activation Functions
254 * For each activation function, the low-level interface contains two routines.
255 * One that applies the activation function to a matrix and one that evaluates
256 * the derivatives of the activation function at the elements of a given matrix
257 * and writes the results into the result matrix.
258 */
259 ///@{
260 /* impl using Matrix */
261 /*inline void evaluate(Matrix_t &A, EActivationFunction f)
262 {
263 Tensor_t tA(A);
264 evaluate<TCuda<AReal>>(tA,f);
265 }*/
267 const ActivationDescriptor_t activationDescr,
268 const double coef = 0.0, const AFloat alpha = 1,
269 const AFloat beta = 0);
270
271 /** Computes the gradient of the activation function */
272 static void ActivationFunctionBackward(Tensor_t & dX, const Tensor_t & Y,
273 const Tensor_t & dY, const Tensor_t & X,
274 EActivationFunction activFunct,
275 const ActivationDescriptor_t activationDescr,
276 const AFloat alpha = 1,
277 const AFloat beta = 0);
278
280 const Tensor_t &A);
281
282 static void Relu(Tensor_t & B);
283 static void ReluDerivative(Tensor_t & B,
284 const Tensor_t & A);
285
286 static void Sigmoid(Tensor_t & B);
288 const Tensor_t & A);
289
290 static void Tanh(Tensor_t & B);
291 static void TanhDerivative(Tensor_t & B,
292 const Tensor_t & A);
293
294 static void SymmetricRelu(Tensor_t & B);
296 const Tensor_t & A);
297
298 static void SoftSign(Tensor_t & B);
300 const Tensor_t & A);
301
302 static void Gauss(Tensor_t & B);
304 const Tensor_t & A);
305 ///@}
306
307 //____________________________________________________________________________
308 //
309 // Loss Functions
310 //____________________________________________________________________________
311
312 /** @name Loss Functions
313 * Loss functions compute a scalar value given the \p output of the network
314 * for a given training input and the expected network prediction \p Y that
315 * quantifies the quality of the prediction. For each function also a routine
316 * that computes the gradients (suffixed by Gradients) must be provided for
317 * the starting of the backpropagation algorithm.
318 */
319 ///@{
320
322 const Matrix_t &weights);
323 static void MeanSquaredErrorGradients(Matrix_t &dY, const Matrix_t &Y,
324 const Matrix_t &output, const Matrix_t &weights);
325
326 /** Sigmoid transformation is implicitly applied, thus \p output should
327 * hold the linear activations of the last layer in the net. */
329 const Matrix_t &weights);
330
331 static void CrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
332 const Matrix_t &output, const Matrix_t &weights);
333
334 /** Softmax transformation is implicitly applied, thus \p output should
335 * hold the linear activations of the last layer in the net. */
337 const Matrix_t &weights);
339 const Matrix_t &output, const Matrix_t &weights);
340 ///@}
341
342 //____________________________________________________________________________
343 //
344 // Output Functions
345 //____________________________________________________________________________
346
347 /** @name Output Functions
348 * Output functions transform the activations \p output of the
349 * output layer in the network to a valid prediction \p YHat for
350 * the desired usage of the network, e.g. the identity function
351 * for regression or the sigmoid transformation for two-class
352 * classification.
353 */
354 ///@{
355 static void Sigmoid(Matrix_t &YHat,
356 const Matrix_t & );
357 static void Softmax(Matrix_t &YHat,
358 const Matrix_t & );
359 ///@}
360
361 //____________________________________________________________________________
362 //
363 // Regularization
364 //____________________________________________________________________________
365
366 /** @name Regularization
367 * For each regularization type two functions are required, one named
368 * <tt><Type>Regularization</tt> that evaluates the corresponding
369 * regularization functional for a given weight matrix and the
370 * <tt>Add<Type>RegularizationGradients</tt>, that adds the regularization
371 * component in the gradients to the provided matrix.
372 */
373 ///@{
374
377 const Matrix_t & W,
379
382 const Matrix_t & W,
384 ///@}
385
386 //____________________________________________________________________________
387 //
388 // Initialization
389 //____________________________________________________________________________
390
391 /** @name Initialization
392 * For each initialization method, one function in the low-level interface
393 * is provided. The naming scheme is <p>Initialize<Type></p> for a given
394 * initialization method Type.
395 */
396 ///@{
397
398 static void InitializeGauss(Matrix_t & A);
401 static void InitializeZero(Matrix_t & A);
404
405 // return static instance of random generator used for initialization
406 // if generator does not exist it is created the first time with a random seed (e.g. seed = 0)
408 // set random seed for the static generator
409 // if the static generator does not exist it is created
410 static void SetRandomSeed(size_t seed);
411 ///@}
412
413 //____________________________________________________________________________
414 //
415 // Dropout
416 //____________________________________________________________________________
417
418 /** @name Dropout
419 */
420 ///@{
421
422 /** Apply dropout with activation probability \p p to the given
423 * tensor \p A and scale the result by reciprocal of \p p. */
424 static void DropoutForward(Tensor_t & A,
425 TDescriptors * descriptors,
426 TWorkspace * workspace,
427 Scalar_t p);
428
429 static void DropoutForward(Matrix_t & A, Scalar_t p) {
430 Tensor_t tA(A);
431 DropoutForward( tA, static_cast<TDescriptors *> (nullptr), static_cast<TWorkspace *> (nullptr), p );
432 }
433
434 static void DropoutBackward(Tensor_t & /* A */,
435 TDescriptors * /*descriptors */,
436 TWorkspace * /* workspace */ ) {}
437 ///@}
438
439 //____________________________________________________________________________
440 //
441 // Batch Normalization
442 //____________________________________________________________________________
443
444 /** @name Batch Normalization Layer Propagation
445 */
446 ///@{
447
448 /** The input from each batch are normalized during training to have zero mean and unit variance
449 * and they are then scaled by two parameter, different for each input variable:
450 * - a scale factor \gamma gamma
451 * - an offset \beta beta */
452
454 Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans,
455 Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum,
456 Scalar_t epsilon, const TensorDescriptor_t &bnParDescriptor);
457
458 /** During inference the inputs are not normalized using the batch mean but using the previously
459 * computed running mean and variance */
460
462 const Matrix_t &runningMeans, const Matrix_t &runningVars,
464
465 static void BatchNormLayerBackward(int axis, const Tensor_t &x, const Tensor_t &dy, Tensor_t &dx,
466 Matrix_t &gamma, // Matrix_t &beta, (not needed)
467 Matrix_t &dgamma, Matrix_t &dbeta, const Matrix_t &mean, const Matrix_t &variance,
468 const Matrix_t &iVariance, Scalar_t epsilon, const TensorDescriptor_t &);
469
470 //____________________________________________________________________________
471 //
472 // Convolutional Layer Propagation
473 //____________________________________________________________________________
474
475 /** @name Forward Propagation in Convolutional Layer
476 */
477 ///@{
478
479 /** Calculate how many neurons "fit" in the output layer, given the input as well as the layer's hyperparameters. */
480 static size_t calculateDimension(size_t imgDim, size_t fltDim, size_t padding, size_t stride);
481
482 /** Transform the matrix B in local view format, suitable for
483 * convolution, and store it in matrix A */
484 static void Im2col(Matrix_t &A,
485 const Matrix_t &B,
486 size_t imgHeight,
487 size_t imgWidth,
488 size_t fltHeight,
489 size_t fltWidth,
490 size_t strideRows,
491 size_t strideCols,
492 size_t zeroPaddingHeight,
493 size_t zeroPaddingWidth);
494
495 static void Im2colIndices(std::vector<int> &V, const Matrix_t &B, size_t nLocalViews, size_t imgHeight, size_t imgWidth, size_t fltHeight,
496 size_t fltWidth, size_t strideRows, size_t strideCols, size_t zeroPaddingHeight,
497 size_t zeroPaddingWidth);
498 static void Im2colFast(Matrix_t &A, const Matrix_t &B, const std::vector<int> & V);
499
500 /** Rotates the matrix \p B, which represents the weights,
501 * and stores them in the matrix \p A. */
502 static void RotateWeights(Matrix_t &A, const Matrix_t &B, size_t filterDepth, size_t filterHeight,
503 size_t filterWidth, size_t numFilters);
504
505 /** Add the biases in the Convolutional Layer. */
506 static void AddConvBiases(Matrix_t &output, const Matrix_t &biases);
507 ///@}
508
509 /** Dummy placeholder - preparation is currently only required for the CUDA architecture. */
510 static void PrepareInternals(Tensor_t &) {}
511
512 /** Forward propagation in the Convolutional layer */
514 Tensor_t & inputActivationFunc,
515 const Tensor_t &input,
516 const Matrix_t &weights, const Matrix_t & biases,
517 const DNN::CNN::TConvParams & params, EActivationFunction activFunc,
518 Tensor_t & /* inputPrime */,
519 const ConvDescriptors_t & /*descriptors*/, // Empty struct for cuda architecture
520 ConvWorkspace_t & /*workspace*/); // Empty struct for cuda architecture
521 //void * cudnnWorkspace = nullptr); // Remains nullptr for cuda architecture
522 /** @name Backward Propagation in Convolutional Layer
523 */
524 ///@{
525
526 /** Perform the complete backward propagation step in a Convolutional Layer.
527 * If the provided \p activationGradientsBackward matrix is not empty, compute the
528 * gradients of the objective function with respect to the activations
529 * of the previous layer (backward direction).
530 * Also compute the weight and the bias gradients. Modifies the values
531 * in \p df and thus produces only a valid result, if it is applied the
532 * first time after the corresponding forward propagation has been per-
533 * formed. */
534 static void ConvLayerBackward(Tensor_t &activationGradientsBackward,
535 Matrix_t &weightGradients, Matrix_t &biasGradients,
536 Tensor_t &df,
537 Tensor_t &activationGradients,
538 const Matrix_t &weights,
539 const Tensor_t &activationBackward,
540 const Tensor_t & outputTensor,
541 EActivationFunction activFunc,
542 const ConvDescriptors_t & /*descriptors*/,
543 ConvWorkspace_t & /*workspace*/,
544 size_t batchSize, size_t inputHeight,
545 size_t inputWidth, size_t depth,
546 size_t height, size_t width,
547 size_t filterDepth, size_t filterHeight,
548 size_t filterWidth, size_t nLocalViews );
549
550 /** Utility function for calculating the activation gradients of the layer
551 * before the convolutional layer. */
552 static void CalculateConvActivationGradients(Tensor_t &activationGradientsBackward,
553 const Tensor_t &df,
554 const Matrix_t &weights, size_t batchSize,
555 size_t inputHeight, size_t inputWidth, size_t depth, size_t height,
556 size_t width, size_t filterDepth, size_t filterHeight,
557 size_t filterWidth);
558
559 /** Utility function for calculating the weight gradients of the convolutional
560 * layer. */
561 static void CalculateConvWeightGradients(Matrix_t &weightGradients,
562 const Tensor_t &df,
563 const Tensor_t &activations_backward,
564 size_t batchSize, size_t inputHeight, size_t inputWidth, size_t depth,
565 size_t height, size_t width, size_t filterDepth, size_t filterHeight,
566 size_t filterWidth, size_t nLocalViews);
567
568 /** Utility function for calculating the bias gradients of the convolutional
569 * layer */
570 static void CalculateConvBiasGradients(Matrix_t &biasGradients, const Tensor_t &df,
571 size_t batchSize, size_t depth, size_t nLocalViews);
572 ///@}
573
574 //____________________________________________________________________________
575 //
576 // Max Pooling Layer Propagation
577 //____________________________________________________________________________
578 /** @name Forward Propagation in Max Pooling Layer
579 */
580 ///@{
581
582 /** Downsample the matrix \p C to the matrix \p A, using max
583 * operation, such that the winning indices are stored in matrix
584 * \p B. */
585 static void Downsample(Tensor_t &A, Tensor_t &B, const Tensor_t &C,
586 const PoolingDescriptors_t & /*descriptors*/,
587 PoolingWorkspace_t & /*workspace*/,
588 size_t imgHeight, size_t imgWidth, size_t fltHeight,
589 size_t fltWidth, size_t strideRows, size_t strideCols);
590
591 ///@}
592
593 /** @name Backward Propagation in Max Pooling Layer
594 */
595 ///@{
596 /** Perform the complete backward propagation step in a Pooling Layer. Based on the
597 * winning indices stored in the index matrix, it just forwards the activation
598 * gradients to the previous layer. */
599 static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward,
600 const Tensor_t &activationGradients,
601 const Tensor_t &indexMatrix,
602 const Tensor_t & /*inputActivation*/,
603 const Tensor_t & /*outputTensor*/,
604 const PoolingDescriptors_t & /*descriptors*/,
605 PoolingWorkspace_t & /*workspace*/,
606 size_t imgHeight,
607 size_t imgWidth,
608 size_t fltHeight,
609 size_t fltWidth,
610 size_t strideRows,
611 size_t strideCols,
612 size_t nLocalViews);
613
614 ///@}
615
616 //____________________________________________________________________________
617 //
618 // Reshape Layer Propagation
619 //____________________________________________________________________________
620 /** @name Forward and Backward Propagation in Reshape Layer
621 */
622 ///@{
623
624 /** Transform the matrix \p B to a matrix with different dimensions \p A */
625 static void Reshape(Matrix_t &A, const Matrix_t &B);
626
627 /** Flattens the tensor \p B, such that each matrix, is stretched in
628 * one row, resulting with a matrix \p A. */
629 static void Flatten(Tensor_t &A, const Tensor_t &B); // size_t size, size_t nRows, size_t nCols);
630
631 /** Transforms each row of \p B to a matrix and stores it in the
632 * tensor \p A. */
633 static void Deflatten(Tensor_t &A, const Tensor_t &B); // size_t index, size_t nRows,size_t nCols);
634
635 /** Rearrange data according to time: fill the B x T x D tensor \p out from the T x B x D tensor \p in */
636 static void Rearrange(Tensor_t &out, const Tensor_t &in);
637
638
639 /** Backward pass for Recurrent Networks */
640 static Matrix_t & RecurrentLayerBackward(Matrix_t & state_gradients_backward, // BxH
641 Matrix_t & input_weight_gradients,
642 Matrix_t & state_weight_gradients,
643 Matrix_t & bias_gradients,
644 Matrix_t & df, //DxH
645 const Matrix_t & state, // BxH
646 const Matrix_t & weights_input, // HxD
647 const Matrix_t & weights_state, // HxH
648 const Matrix_t & input, // BxD
649 Matrix_t & input_gradient);
650
651
652 ///@}
653
654 //____________________________________________________________________________
655 //
656 // Additional Arithmetic Functions
657 //____________________________________________________________________________
658
659 /** @name Additional Arithmetic Functions
660 *
661 * Additional arithmetic on CUDA matrices used to implement the low-level
662 * interface.
663 */
664 ///@{
665
666 /** Standard multiplication of two matrices \p A and \p B with the result being
667 * written into C.
668 */
669 static void Multiply(Matrix_t &C,
670 const Matrix_t &A,
671 const Matrix_t &B);
672 /** Matrix multiplication of two matrices \p A and \p B^T (transposed) with the
673 * result being written into C.
674 */
676 const Matrix_t &input,
677 const Matrix_t &Weights,
678 Scalar_t alpha = 1.0, Scalar_t beta = 0.);
679 /** In-place Hadamard (element-wise) product of matrices \p A and \p B
680 * with the result being written into \p A.
681 */
682 static void Hadamard(Tensor_t &A,
683 const Tensor_t &B);
684 static void Hadamard(Matrix_t &A,
685 const Matrix_t &B);
686 // {
687 // Tensor_t tA(A);
688 // Hadamard( tA, Tensor_t(B));
689 // }
690
691 /** Sum columns of (m x n) matrix \p A and write the results into the first
692 * m elements in \p B.
693 */
694 static void SumColumns(Matrix_t &B,
695 const Matrix_t &A,
696 Scalar_t alpha = 1.0, Scalar_t beta = 0.);
697
698 /** Compute the sum of all elements in \p A */
699 static Scalar_t Sum(const Matrix_t &A);
700
701 /** Check two matrices for equality, taking floating point arithmetic errors into account. */
702 static bool AlmostEquals(const Matrix_t &A, const Matrix_t &B, double epsilon = 0.1);
703
704 /** Add the constant \p beta to all the elements of matrix \p A and write the
705 * result into \p A.
706 */
708
709 /** Multiply the constant \p beta to all the elements of matrix \p A and write the
710 * result into \p A.
711 */
713
714 /** Reciprocal each element of the matrix \p A and write the result into
715 * \p A
716 */
718
719 /** Square each element of the matrix \p A and write the result into
720 * \p A
721 */
723
724 /** Square root each element of the matrix \p A and write the result into
725 * \p A
726 */
728
729 // optimizer functions
730 static void AdamUpdate(Matrix_t & A, const Matrix_t & M, const Matrix_t & V, Scalar_t alpha, Scalar_t eps);
733
734 // printing of tensor
735 static void PrintTensor( const Tensor_t & A, const std::string name = "Cuda-tensor", bool = false);
736
737 ///////////////////////////////////////////////////////////////////////////////
738 /// extra functions defined only for CPU architecture !!!
739 //////////////////////////////////////////////////////////////////////////////
740
741 /** Sum rows of (m x n) matrix \p A and write the results into the first
742 * m elements in \p B.
743 */
744 static void SumRows(Matrix_t & B, const Matrix_t & A);
745
746
747};
748
749//____________________________________________________________________________
750template <typename AFloat>
751template <typename AMatrix_t>
753 const AMatrix_t &A)
754{
755 // copy from another architecture using the reference one
756 // this is not very efficient since creates temporary objects
757 TMatrixT<AFloat> tmp = A;
758 Copy(B, TCudaMatrix<AFloat>(tmp) );
759}
760
761//____________________________________________________________________________
762template <typename AFloat>
763template <typename AMatrix_t>
765 const std::vector<AMatrix_t> &A)
766{
767 for (size_t i = 0; i < B.size(); ++i) {
768 CopyDiffArch(B[i], A[i]);
769 }
770}
771
772template <typename AFloat>
773void TCuda<AFloat>::PrintTensor(const typename TCuda<AFloat>::Tensor_t & A, const std::string name, bool )
774{
775 std::cout << name << " size = " << A.GetSize() << " shape = { ";
776 auto shape = A.GetShape();
777 for (size_t k = 0; k < shape.size()-1; ++k)
778 std::cout << shape[k] << " , ";
779 std::cout << shape.back() << " } ";
780 std::cout << " strides = { ";
781 auto strides = A.GetStrides();
782 for (size_t k = 0; k < strides.size()-1; ++k)
783 std::cout << strides[k] << " , ";
784 std::cout << strides.back() << " }\n ";
785
786 if (A.GetShape().size() == 2 ) {
787 for (size_t i = 0; i < A.GetShape()[0]; ++i) {
788 std::cout << "{ ";
789 for (size_t j = 0; j < A.GetShape()[1]; ++j) {
790 std::cout << A(i,j) << " ";
791 }
792 std::cout << " } " << std::endl;
793 }
794 } else if (A.GetShape().size() == 3 ) {
795 for (size_t i = 0; i < A.GetFirstSize(); ++i) {
796 std::cout << "{ ";
797 for (size_t j = 0; j < A.GetHSize(); ++j) {
798 std::cout << "{ ";
799 for (size_t k = 0; k < A.GetWSize(); ++k) {
800 std::cout << A(i,j,k) << " ";
801 }
802 std::cout << " } " << std::endl;
803 }
804 std::cout << " } " << std::endl;
805 }
806 }
807 else {
808 for (size_t l = 0; l < A.GetSize(); ++l) {
809 std::cout << A.GetData()[l] << " ";
810 }
811 std::cout << "\n";
812 }
813}
814
815
816} // namespace DNN
817} // namespace TMVA
818
819#endif
#define c(i)
Definition: RSha256.hxx:101
#define h(i)
Definition: RSha256.hxx:106
include TDocParser_001 C image html pict1_TDocParser_001 png width
Definition: TDocParser.cxx:121
void Error(const char *location, const char *msgfmt,...)
char name[80]
Definition: TGX11.cxx:109
Generic Max Pooling Layer class.
Definition: MaxPoolLayer.h:59
Layer implementing Batch Normalization.
TCudaDeviceBuffer.
Definition: CudaBuffers.h:100
TCudaHostBuffer.
Definition: CudaBuffers.h:43
TCudaMatrix Class.
Definition: CudaMatrix.h:106
TCudaTensor Class.
Definition: CudaTensor.h:84
TCudaMatrix< AFloat > GetMatrix() const
Definition: CudaTensor.h:300
The TCuda architecture class.
Definition: Cuda.h:64
static void Deflatten(Tensor_t &A, const Tensor_t &B)
Transforms each row of B to a matrix and stores it in the tensor B.
static void AdamUpdate(Matrix_t &A, const Matrix_t &M, const Matrix_t &V, Scalar_t alpha, Scalar_t eps)
static void AddL2RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
static void CalculateConvActivationGradients(Tensor_t &activationGradientsBackward, const Tensor_t &df, const Matrix_t &weights, size_t batchSize, size_t inputHeight, size_t inputWidth, size_t depth, size_t height, size_t width, size_t filterDepth, size_t filterHeight, size_t filterWidth)
Utility function for calculating the activation gradients of the layer before the convolutional layer...
static void SymmetricRelu(Tensor_t &B)
static void FreeConvWorkspace(TWorkspace *&, ConvLayer_t *)
Only used for certain cudnn on-device memory.
Definition: Cuda.h:159
static void InitializeUniform(Matrix_t &A)
static void ReciprocalElementWise(Matrix_t &A)
Reciprocal each element of the matrix A and write the result into A.
static void Softmax(Matrix_t &YHat, const Matrix_t &)
static void Im2colFast(Matrix_t &A, const Matrix_t &B, const std::vector< int > &V)
static void InitializeIdentity(Matrix_t &A)
static void AddConvBiases(Matrix_t &output, const Matrix_t &biases)
Add the biases in the Convolutional Layer.
static void InitializeGlorotUniform(Matrix_t &A)
static void ConstAdd(Matrix_t &A, Scalar_t beta)
Add the constant beta to all the elements of matrix A and write the result into A.
static void Downsample(Tensor_t &A, Tensor_t &B, const Tensor_t &C, const PoolingDescriptors_t &, PoolingWorkspace_t &, size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols)
Downsample the matrix C to the matrix A, using max operation, such that the winning indices are store...
static Scalar_t MeanSquaredError(const Matrix_t &Y, const Matrix_t &output, const Matrix_t &weights)
static void CrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y, const Matrix_t &output, const Matrix_t &weights)
static void RotateWeights(Matrix_t &A, const Matrix_t &B, size_t filterDepth, size_t filterHeight, size_t filterWidth, size_t numFilters)
Rotates the matrix B, which is representing a weights, and stores them in the matrix A.
static void CopyDiffArch(Matrix_t &B, const AMatrix_t &A)
static void DropoutForward(Tensor_t &A, TDescriptors *descriptors, TWorkspace *workspace, Scalar_t p)
Apply dropout with activation probability p to the given tensor A and scale the result by reciprocal ...
static void CalculateConvBiasGradients(Matrix_t &biasGradients, const Tensor_t &df, size_t batchSize, size_t depth, size_t nLocalViews)
Utility function for calculating the bias gradients of the convolutional layer.
static void BatchNormLayerForwardInference(int axis, const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta, Tensor_t &y, const Matrix_t &runningMeans, const Matrix_t &runningVars, Scalar_t epsilon, const TensorDescriptor_t &)
During inference the inputs are not normalized using the batch mean but the previously computed at ru...
static void Sigmoid(Matrix_t &YHat, const Matrix_t &)
static bool AlmostEquals(const Matrix_t &A, const Matrix_t &B, double epsilon=0.1)
Check two matrices for equality, taking floating point arithmetic errors into account.
static void InitializeBNormDescriptors(TDescriptors *&, BNormLayer_t *)
Initialize CNN data/operator descriptors.
Definition: Cuda.h:132
static size_t calculateDimension(size_t imgDim, size_t fltDim, size_t padding, size_t stride)
Calculate how many neurons "fit" in the output layer, given the input as well as the layer's hyperpar...
static void AdamUpdateFirstMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta)
AFloat Scalar_t
Definition: Cuda.h:70
static void AddL1RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
static void SumRows(Matrix_t &B, const Matrix_t &A)
extra functions defined only for CPU architecture !!!
static void ConvLayerForward(Tensor_t &output, Tensor_t &inputActivationFunc, const Tensor_t &input, const Matrix_t &weights, const Matrix_t &biases, const DNN::CNN::TConvParams &params, EActivationFunction activFunc, Tensor_t &, const ConvDescriptors_t &, ConvWorkspace_t &)
Forward propagation in the Convolutional layer.
static void Sigmoid(Tensor_t &B)
static void DropoutBackward(Tensor_t &, TDescriptors *, TWorkspace *)
Definition: Cuda.h:434
static void SoftSignDerivative(Tensor_t &B, const Tensor_t &A)
static void AdamUpdateSecondMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta)
static void SymmetricReluDerivative(Tensor_t &B, const Tensor_t &A)
static void Backward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients, Matrix_t &biasGradients, const Tensor_t &df, const Tensor_t &activationGradients, const Matrix_t &weights, const Tensor_t &activationBackward)
Perform the complete backward propagation step.
static void PrintTensor(const Tensor_t &A, const std::string name="Cuda-tensor", bool=false)
Definition: Cuda.h:773
static void Im2colIndices(std::vector< int > &V, const Matrix_t &B, size_t nLocalViews, size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols, size_t zeroPaddingHeight, size_t zeroPaddingWidth)
static void Copy(Tensor_t &A, const Tensor_t &B)
static void InitializeConvDescriptors(TDescriptors *&, ConvLayer_t *)
Definition: Cuda.h:137
static void Tanh(Tensor_t &B)
static void SigmoidDerivative(Tensor_t &B, const Tensor_t &A)
static TRandom * fgRandomGen
Definition: Cuda.h:66
static void InitializePoolDropoutWorkspace(TWorkspace *&, TDescriptors *&, const DNN::CNN::TConvParams &, PoolingLayer_t *)
Definition: Cuda.h:152
static void AddRowWise(Matrix_t &output, const Matrix_t &biases)
Add the vectors biases row-wise to the matrix output.
static void ScaleAdd(Tensor_t &A, const Tensor_t &B, Scalar_t beta=1.0)
Above functions extended to vectors.
static TMVA::Experimental::MemoryLayout GetTensorLayout()
Definition: Cuda.h:103
static void Multiply(Matrix_t &C, const Matrix_t &A, const Matrix_t &B)
Standard multiplication of two matrices A and B with the result being written into C.
static void BatchNormLayerForwardTraining(int axis, const Tensor_t &x, Tensor_t &y, Matrix_t &gamma, Matrix_t &beta, Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans, Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum, Scalar_t epsilon, const TensorDescriptor_t &bnParDescriptor)
The inputs from each batch are normalized during training to have zero mean and unit variance and they...
static Scalar_t CrossEntropy(const Matrix_t &Y, const Matrix_t &output, const Matrix_t &weights)
Sigmoid transformation is implicitly applied, thus output should hold the linear activations of the l...
static void Gauss(Tensor_t &B)
static void ActivationFunctionForward(Tensor_t &X, EActivationFunction activFunct, const ActivationDescriptor_t activationDescr, const double coef=0.0, const AFloat alpha=1, const AFloat beta=0)
static void SoftmaxCrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y, const Matrix_t &output, const Matrix_t &weights)
static Matrix_t & RecurrentLayerBackward(Matrix_t &state_gradients_backward, Matrix_t &input_weight_gradients, Matrix_t &state_weight_gradients, Matrix_t &bias_gradients, Matrix_t &df, const Matrix_t &state, const Matrix_t &weights_input, const Matrix_t &weights_state, const Matrix_t &input, Matrix_t &input_gradient)
Backward pass for Recurrent Networks.
static Scalar_t Sum(const Matrix_t &A)
Compute the sum of all elements in A.
static void InitializePoolDescriptors(TDescriptors *&, PoolingLayer_t *)
Definition: Cuda.h:139
static void Hadamard(Matrix_t &A, const Matrix_t &B)
static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t h, size_t w)
Definition: Cuda.h:108
static void SquareElementWise(Matrix_t &A)
Square each element of the matrix A and write the result into A.
TCudaTensor< AFloat > Tensor_t
Definition: Cuda.h:73
static void InitializeGlorotNormal(Matrix_t &A)
static void SumColumns(Matrix_t &B, const Matrix_t &A, Scalar_t alpha=1.0, Scalar_t beta=0.)
Sum columns of (m x n) matrix A and write the results into the first m elements in B.
static void ReluDerivative(Tensor_t &B, const Tensor_t &A)
AReal AFloat
Definition: Cuda.h:69
static Scalar_t L2Regularization(const Matrix_t &W)
static void FreePoolDropoutWorkspace(TWorkspace *&, PoolingLayer_t *)
Definition: Cuda.h:160
static void IdentityDerivative(Tensor_t &B, const Tensor_t &A)
static TRandom & GetRandomGenerator()
static void Hadamard(Tensor_t &A, const Tensor_t &B)
In-place Hadamard (element-wise) product of matrices A and B with the result being written into A.
static void CreateWeightTensors(std::vector< Matrix_t > &newWeights, const std::vector< Matrix_t > &weights)
Definition: Cuda.h:118
static void SqrtElementWise(Matrix_t &A)
Square root each element of the matrix A and write the result into A.
static void InitializeGauss(Matrix_t &A)
static void InitializeActivationDescriptor(ActivationDescriptor_t &, EActivationFunction, double=0.0)
Definition: Cuda.h:141
static void GaussDerivative(Tensor_t &B, const Tensor_t &A)
static void CalculateConvWeightGradients(Matrix_t &weightGradients, const Tensor_t &df, const Tensor_t &activations_backward, size_t batchSize, size_t inputHeight, size_t inputWidth, size_t depth, size_t height, size_t width, size_t filterDepth, size_t filterHeight, size_t filterWidth, size_t nLocalViews)
Utility function for calculating the weight gradients of the convolutional layer.
static void InitializeConvWorkspace(TWorkspace *&, TDescriptors *&, const DNN::CNN::TConvParams &, ConvLayer_t *)
Definition: Cuda.h:148
static void ReleaseBNormDescriptors(TDescriptors *&)
Definition: Cuda.h:146
static void ConstMult(Matrix_t &A, Scalar_t beta)
Multiply the constant beta to all the elements of matrix A and write the result into A.
static void PrepareInternals(Tensor_t &)
Dummy placeholder - preparation is currently only required for the CUDA architecture.
Definition: Cuda.h:510
static void Rearrange(Tensor_t &out, const Tensor_t &in)
Rearrange data according to time: fill B x T x D out with T x B x D matrix in.
static void AddRowWise(Tensor_t &output, const Matrix_t &biases)
Definition: Cuda.h:186
static void SoftSign(Tensor_t &B)
static void BatchNormLayerBackward(int axis, const Tensor_t &x, const Tensor_t &dy, Tensor_t &dx, Matrix_t &gamma, Matrix_t &dgamma, Matrix_t &dbeta, const Matrix_t &mean, const Matrix_t &variance, const Matrix_t &iVariance, Scalar_t epsilon, const TensorDescriptor_t &)
static void ConvLayerBackward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients, Matrix_t &biasGradients, Tensor_t &df, Tensor_t &activationGradients, const Matrix_t &weights, const Tensor_t &activationBackward, const Tensor_t &outputTensor, EActivationFunction activFunc, const ConvDescriptors_t &, ConvWorkspace_t &, size_t batchSize, size_t inputHeight, size_t inputWidth, size_t depth, size_t height, size_t width, size_t filterDepth, size_t filterHeight, size_t filterWidth, size_t nLocalViews)
Perform the complete backward propagation step in a Convolutional Layer.
static void MultiplyTranspose(Tensor_t &output, const Tensor_t &input, const Matrix_t &weights)
Definition: Cuda.h:177
static void MultiplyTranspose(Matrix_t &output, const Matrix_t &input, const Matrix_t &weights)
Matrix-multiply input with the transpose of weights and write the results into output.
static void ReleaseConvDescriptors(TDescriptors *&)
Release CNN data/operator descriptors.
Definition: Cuda.h:144
static void SetRandomSeed(size_t seed)
static void Copy(Matrix_t &B, const Matrix_t &A)
static void TanhDerivative(Tensor_t &B, const Tensor_t &A)
static void ReleaseDescriptor(ActivationDescriptor_t &)
Definition: Cuda.h:157
static void CopyDiffArch(std::vector< Matrix_t > &A, const std::vector< AMatrix_t > &B)
static void ActivationFunctionBackward(Tensor_t &dX, const Tensor_t &Y, const Tensor_t &dY, const Tensor_t &X, EActivationFunction activFunct, const ActivationDescriptor_t activationDescr, const AFloat alpha=1, const AFloat beta=0)
Computes the gradient of the activation function.
static void Relu(Tensor_t &B)
static Scalar_t SoftmaxCrossEntropy(const Matrix_t &Y, const Matrix_t &output, const Matrix_t &weights)
Softmax transformation is implicitly applied, thus output should hold the linear activations of the l...
static Scalar_t L1Regularization(const Matrix_t &W)
static void InitializeZero(Matrix_t &A)
static void Reshape(Matrix_t &A, const Matrix_t &B)
Transform the matrix B to a matrix with different dimensions A.
static void Im2col(Matrix_t &A, const Matrix_t &B, size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols, size_t zeroPaddingHeight, size_t zeroPaddingWidth)
Transform the matrix B in local view format, suitable for convolution, and store it in matrix A.
static void DropoutForward(Matrix_t &A, Scalar_t p)
Definition: Cuda.h:429
static void TransposeMultiply(Matrix_t &output, const Matrix_t &input, const Matrix_t &Weights, Scalar_t alpha=1.0, Scalar_t beta=0.)
Matrix multiplication of two matrices A and B^T (transposed) with the result being written into C.
static void ScaleAdd(Matrix_t &A, const Matrix_t &B, Scalar_t beta=1.0)
Adds the elements in matrix B scaled by beta to the elements in the matrix A.
static void Flatten(Tensor_t &A, const Tensor_t &B)
Flattens the tensor B, such that each matrix is stretched into one row, resulting in a matrix A.
static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward, const Tensor_t &activationGradients, const Tensor_t &indexMatrix, const Tensor_t &, const Tensor_t &, const PoolingDescriptors_t &, PoolingWorkspace_t &, size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols, size_t nLocalViews)
Perform the complete backward propagation step in a Pooling Layer.
static void CopyDiffArch(Tensor_t &A, const ATensor_t &B)
static void MeanSquaredErrorGradients(Matrix_t &dY, const Matrix_t &Y, const Matrix_t &output, const Matrix_t &weights)
static void ReleasePoolDescriptors(TDescriptors *&)
Definition: Cuda.h:145
static Tensor_t CreateTensor(size_t n, size_t c, size_t h, size_t w)
Definition: Cuda.h:105
TMatrixT.
Definition: TMatrixT.h:39
This is the base class for the ROOT Random number generators.
Definition: TRandom.h:27
double beta(double x, double y)
Calculates the beta function.
Double_t y[n]
Definition: legend1.C:17
Double_t x[n]
Definition: legend1.C:17
const Int_t n
Definition: legend1.C:16
static double B[]
static double A[]
static double C[]
double gamma(double x)
void Copy(void *source, void *dest)
double weightDecay(double error, ItWeight itWeight, ItWeight itWeightEnd, double factorWeightDecay, EnumRegularization eRegularization)
compute the weight decay for regularization (L1 or L2)
Definition: NeuralNet.icc:498
EActivationFunction
Enum that represents layer activation functions.
Definition: Functions.h:32
MemoryLayout
Memory layout type (copy from RTensor.hxx)
Definition: CudaTensor.h:47
create variable transformations
auto * l
Definition: textangle.C:4
REAL epsilon
Definition: triangle.c:617
static void output(int code)
Definition: gifencode.c:226