TCudnn.h
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Joana Niermann 23/07/19
3
4/*************************************************************************
5 * Copyright (C) 2019, Joana Niermann *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12///////////////////////////////////////////////////////////////////
13// Definition of the TCudnn architecture class, which provides //
14// a wrapping of the low-level functionality for neural networks //
15// in the cuDNN library. //
16///////////////////////////////////////////////////////////////////
17
18#ifndef TMVA_DNN_ARCHITECTURES_CUDNN
19#define TMVA_DNN_ARCHITECTURES_CUDNN
20
21#include "RConfigure.h" // for definition of R__HAS_CUDNN
22
23#ifndef R__HAS_CUDNN
24#error This file can be compiled only when cudnn is available in ROOT
25#else
26
27#include "TMVA/DNN/Functions.h"
29//#include "TMVA/DNN/CNN/Descriptors.h"
33
34#include "cudnn.h"
35#include "Cuda/CudaBuffers.h"
36#include "Cuda/CudaTensor.h"
38#include <utility>
39#include <vector>
40
42
43class TRandom;
44
45namespace TMVA
46{
47namespace DNN
48{
49
50struct TCudnnEmptyDescriptor {};
51
52
53/** The TCudnn architecture class.
54 *
55 * Low-level interface class for CUDA computing architectures using the cuDNN
56 * library as backend. Contains as public types the declaration of the scalar,
57 * matrix and buffer types for this architecture, as well as the remaining
58 * functions in the low-level interface in the form of static members.
59 */
60template<typename AFloat = Float_t>
61class TCudnn
62{
63private:
64 static TRandom * fgRandomGen;
65public:
66
67 using Scalar_t = AFloat;
68 using Matrix_t = TCudaTensor<AFloat>;
69 using Tensor_t = TCudaTensor<AFloat>;
70 using DeviceBuffer_t = TCudaDeviceBuffer<AFloat>;
71 using HostBuffer_t = TCudaHostBuffer<AFloat>;
72
73 // The descriptors for the (tensor) data are held by the data classes (CudaTensor)
74 using ActivationDescriptor_t = cudnnActivationDescriptor_t;
75 using ConvolutionDescriptor_t = cudnnConvolutionDescriptor_t;
76 using DropoutDescriptor_t = cudnnDropoutDescriptor_t;
77 using FilterDescriptor_t = cudnnFilterDescriptor_t;
78 //using OpTensorDescriptor_t = cudnnOpTensorDescriptor_t;
79 using PoolingDescriptor_t = cudnnPoolingDescriptor_t;
80 //using ReductionDescriptor_t = cudnnReduceTensorDescriptor_t;
81 using AlgorithmForward_t = cudnnConvolutionFwdAlgo_t;
82 using AlgorithmBackward_t = cudnnConvolutionBwdDataAlgo_t;
83 using AlgorithmHelper_t = cudnnConvolutionBwdFilterAlgo_t;
84 using AlgorithmDataType_t = cudnnDataType_t;
85 using ReduceTensorDescriptor_t = cudnnReduceTensorDescriptor_t;
86 using TensorDescriptor_t = cudnnTensorDescriptor_t;
87
88 using EmptyDescriptor_t = TCudnnEmptyDescriptor; // Used if a descriptor is not needed in a class
89
90 using BNormLayer_t = TBatchNormLayer<TCudnn<AFloat>>;
91 using BNormDescriptors_t = TDNNGenDescriptors<BNormLayer_t>;
92 //using BNormWorkspace_t = CNN::TCNNWorkspace<BNormLayer_t>;
93 using ConvLayer_t = CNN::TConvLayer<TCudnn<AFloat>>;
94 using ConvDescriptors_t = CNN::TCNNDescriptors<ConvLayer_t>;
95 using ConvWorkspace_t = CNN::TCNNWorkspace<ConvLayer_t>;
96 using PoolingLayer_t = CNN::TMaxPoolLayer<TCudnn<AFloat>>;
97 using PoolingDescriptors_t = CNN::TCNNDescriptors<PoolingLayer_t>;
98 using PoolingWorkspace_t = CNN::TCNNWorkspace<PoolingLayer_t>;
99
100 // template <typename AFloat>
101 // using ConvDescriptors_t = CNN::TCNNDescriptors<CNN::TConvLayer<TCudnn<AFloat>>>;
102
103 // convolution options
104 // default is -1 (left to cudnn)
105 struct CNNOptions {
106
107 static int ConvFwdAlgorithm;
108 static int ConvBwdDataAlgorithm;
109 static int ConvBwdFilterAlgorithm;
110 // default is 0 (left to cudnn : a value -1 will indicate to not use any space)
111 static Long_t ConvMaxWorkspaceSize;
112 }; // struct CNNOptions
113
115
116
117 static Tensor_t CreateTensor(size_t n, size_t c, size_t h, size_t w) {
118 return Tensor_t( {n,c,h,w}, GetTensorLayout(), 0, 0);
119 }
120
121 static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t h, size_t w) {
122 return Tensor_t( buffer, {n,c,h,w}, GetTensorLayout(), 0, 0);
123 }
124
125 // create a weight tensor/matrix vector from another tensor/weight vector using the given tensor shapes
126 // this function is used by the optimizers to store intermediate weight representations
127 static void CreateWeightTensors( std::vector<Matrix_t> & newWeights, const std::vector<Matrix_t> & weights) {
128 if (!newWeights.empty()) newWeights.clear();
129 size_t n = weights.size();
130 for (size_t i = 0; i < n; ++i)
131 newWeights.emplace_back( weights[i].GetShape(), weights[i].GetLayout(), 0, 0);
132 }
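 // Example usage (illustrative sketch, not part of the interface): create an NCHW
 // image tensor and clone the shapes of existing weight matrices. The variables
 // batchSize and layerWeights are assumed to exist in the calling code.
 //
 //   using Arch = TCudnn<float>;
 //   Arch::Tensor_t images = Arch::CreateTensor(batchSize, /*c*/ 3, /*h*/ 32, /*w*/ 32);
 //   std::vector<Arch::Matrix_t> optimizerBuffers;
 //   Arch::CreateWeightTensors(optimizerBuffers, layerWeights); // same shapes and layout as layerWeights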
133 //____________________________________________________________________________
134 //
135 // Architecture Initialization
136 //____________________________________________________________________________
137
138 static void InitializeBNormDescriptors(TDescriptors * & descriptors,
139 BNormLayer_t *L = nullptr);
140
141 static void InitializeConvDescriptors(TDescriptors * & descriptors,
142 ConvLayer_t *L = nullptr);
143
144 static void InitializePoolDescriptors(TDescriptors * & descriptors,
145 PoolingLayer_t *L = nullptr);
146
147 static void InitializeActivationDescriptor(ActivationDescriptor_t & descriptors, EActivationFunction activFunc, double coef = 0.0);
148
149 static void ReleaseConvDescriptors(TDescriptors * descriptors );
150 static void ReleasePoolDescriptors(TDescriptors * descriptors );
151 static void ReleaseBNormDescriptors(TDescriptors * descriptors );
152 static void ReleaseDescriptor(EmptyDescriptor_t & emptyDescr) {} // Does nothing
153 static void ReleaseDescriptor(ActivationDescriptor_t & activationDescr);
154 static void ReleaseDescriptor(ConvolutionDescriptor_t & convolutionDescr);
155 static void ReleaseDescriptor(DropoutDescriptor_t & dropoutDescr);
156 static void ReleaseDescriptor(FilterDescriptor_t & filterDescr);
157 static void ReleaseDescriptor(PoolingDescriptor_t & poolingDescr);
158 static void ReleaseDescriptor(TensorDescriptor_t & tensorDescr);
159
160
161 static void InitializeConvWorkspace(TWorkspace * & workspace,
162 TDescriptors * & descriptors,
163 const DNN::CNN::TConvParams & params,
164 ConvLayer_t *L = nullptr);
165 static void InitializePoolDropoutWorkspace(TWorkspace * & workspace,
166 TDescriptors * & descriptors,
167 const DNN::CNN::TConvParams & params,
168 PoolingLayer_t *L = nullptr);
169
170 static void FreeConvWorkspace(TWorkspace * workspace, ConvLayer_t *L = nullptr);
171 static void FreePoolDropoutWorkspace(TWorkspace * workspace, PoolingLayer_t *L = nullptr);
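 // Example (illustrative sketch): the descriptor / workspace life cycle for a
 // convolutional layer. The variables convLayer (a CNN::TConvLayer<TCudnn<float>>*)
 // and params (its DNN::CNN::TConvParams) are assumed to exist.
 //
 //   TDescriptors *descriptors = nullptr;
 //   TWorkspace *workspace = nullptr;
 //   TCudnn<float>::InitializeConvDescriptors(descriptors, convLayer);
 //   TCudnn<float>::InitializeConvWorkspace(workspace, descriptors, params, convLayer);
 //   // ... forward and backward passes ...
 //   TCudnn<float>::FreeConvWorkspace(workspace);
 //   TCudnn<float>::ReleaseConvDescriptors(descriptors);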
172 //____________________________________________________________________________
173 //
174 // Propagation
175 //____________________________________________________________________________
176
177 /** @name Forward Propagation
178 * Low-level functions required for the forward propagation of activations
179 * through the network.
180 */
181 ///@{
182 /** Matrix-multiply \p input with the transpose of \p weights and
183 * write the results into \p output. */
184 static void MultiplyTranspose(Tensor_t &output, const Tensor_t &input, const Matrix_t &weights);
185
186 /** Add the vector \p biases row-wise to the matrix \p output. */
187 static void AddRowWise(Tensor_t &output,const Matrix_t &biases);
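 // Example (illustrative sketch): a fully-connected forward pass composed from the two
 // calls above; output, input, weights and biases are assumed to have matching shapes.
 //
 //   TCudnn<float>::MultiplyTranspose(output, input, weights); // output = input * weights^T
 //   TCudnn<float>::AddRowWise(output, biases);                // add the bias vector to every row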
188
189 /** @name Backward Propagation (Dense Layers)
190 * Low-level functions required for the backward propagation of activations
191 * through the network.
192 */
193 ///@{
194 /** Perform the complete backward propagation step. If the provided
195 * \p activationGradientsBackward matrix is not empty, compute the
196 * gradients of the objective function with respect to the activations
197 * of the previous layer (backward direction).
198 * Also compute the weight and the bias gradients. Modifies the values
199 * in \p df and thus produces a valid result only the first time it is
200 * applied after the corresponding forward propagation has been
201 * performed. */
202 static void Backward(Tensor_t & activationGradientsBackward,
203 Matrix_t & weightGradients,
204 Matrix_t & biasGradients,
205 Tensor_t & df,
206 const Tensor_t & activationGradients,
207 const Matrix_t & weights,
208 const Tensor_t & activationBackward);
209
210 /** Element-wise scaled addition of the tensor \p B to the tensor \p A. */
211 static void ScaleAdd(Tensor_t & A, const Tensor_t & B,
212 Scalar_t alpha = 1.0,
213 Scalar_t beta = 1.0);
214
215 /** Deep copy from B to A. */
216 static void Copy(Tensor_t & A, const Tensor_t & B);
217
218 // copy from another tensor
219 template<typename ATensor_t>
220 static void CopyDiffArch(Tensor_t & A,
221 const ATensor_t & B);
222
223 template <typename ATensor_t>
224 static void CopyWeightsDiffArch(Tensor_t &A, const ATensor_t &B);
225
226 //template<>
227 static void CopyDiffArch(Tensor_t A, const Tensor_t & B ) { Copy(A,B); }
228
229 // copy from vector of matrices of different types
230 template<typename AMatrix_t>
231 static void CopyDiffArch(std::vector<Tensor_t> & A,
232 const std::vector<AMatrix_t> & B);
233
234
235 //____________________________________________________________________________
236 //
237 // Activation Functions
238 //____________________________________________________________________________
239
240 /** @name Activation Functions
241 * For each activation function, the low-level interface contains two routines.
242 * One that applies the activation function to a matrix and one that evaluates
243 * the derivatives of the activation function at the elements of a given matrix
244 * and writes the results into the result matrix.
245 */
246 ///@{
247 static void Identity(Tensor_t & X) {}
248 static void IdentityDerivative(Tensor_t & dX, Tensor_t& X,
249 Tensor_t & Y, Tensor_t & dY,
250 ActivationDescriptor_t activationDescr,
251 const AFloat alpha = 1,
252 const AFloat beta = 1) {}
253
254 static void ActivationFunctionForward(Tensor_t & X, EActivationFunction activFunct,
255 const ActivationDescriptor_t activationDescr,
256 const double coef = 0.0, const AFloat alpha = 1,
257 const AFloat beta = 0);
258
259 // same as above but using different input/output tensors
260 static void ActivationFunctionForward(Tensor_t &Y, const Tensor_t & X, EActivationFunction activFunct,
261 const ActivationDescriptor_t activationDescr, const double coef = 0.0,
262 const AFloat alpha = 1, const AFloat beta = 0);
263
264 /** Computes the gradient of the activation function */
265 static void ActivationFunctionBackward(Tensor_t & dX, const Tensor_t & Y,
266 const Tensor_t & dY, const Tensor_t & X,
267 EActivationFunction activFunct,
268 const ActivationDescriptor_t activationDescr,
269 const AFloat alpha = 1,
270 const AFloat beta = 0);
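 // Example (illustrative sketch): applying a ReLU activation in place with the helpers
 // above; assumes the EActivationFunction::kRelu enumerator from TMVA/DNN/Functions.h
 // and an existing tensor X.
 //
 //   TCudnn<float>::ActivationDescriptor_t reluDescr;
 //   TCudnn<float>::InitializeActivationDescriptor(reluDescr, EActivationFunction::kRelu);
 //   TCudnn<float>::ActivationFunctionForward(X, EActivationFunction::kRelu, reluDescr);
 //   TCudnn<float>::ReleaseDescriptor(reluDescr); // release the cuDNN descriptor when done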
271
272 //
273 // No cudnn implementation for the following activation functions
274 //
275 //static void SymmetricRelu(Tensor_t & B);
276 static void SymmetricReluDerivative(Tensor_t & B,
277 const Tensor_t & A) {}
278
279 //static void SoftSign(Tensor_t & B);
280 static void SoftSignDerivative(Tensor_t & B,
281 const Tensor_t & A) {}
282
283 //static void Gauss(Tensor_t & B);
284 static void GaussDerivative(Tensor_t & B,
285 const Tensor_t & A) {}
286 ///@}
287
288 //____________________________________________________________________________
289 //
290 // Loss Functions
291 //____________________________________________________________________________
292
293 /** @name Loss Functions
294 * Loss functions compute a scalar value given the \p output of the network
295 * for a given training input and the expected network prediction \p Y that
296 * quantifies the quality of the prediction. For each function a routine
297 * that computes the gradients (suffixed by Gradients) must also be provided
298 * to start the backpropagation algorithm.
299 */
300 ///@{
301
302 static Scalar_t MeanSquaredError(const Matrix_t &Y, const Matrix_t &output,
303 const Matrix_t &weights);
304 static void MeanSquaredErrorGradients(Matrix_t &dY, const Matrix_t &Y,
305 const Matrix_t &output, const Matrix_t &weights);
306
307 /** Sigmoid transformation is implicitly applied, thus \p output should
308 * hold the linear activations of the last layer in the net. */
309 static Scalar_t CrossEntropy(const Matrix_t &Y, const Matrix_t &output,
310 const Matrix_t &weights);
311
312 static void CrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
313 const Matrix_t &output, const Matrix_t &weights);
314
315 /** Softmax transformation is implicitly applied, thus \p output should
316 * hold the linear activations of the last layer in the net. */
317 static Scalar_t SoftmaxCrossEntropy(const Matrix_t &Y, const Matrix_t &output,
318 const Matrix_t &weights);
319 static void SoftmaxCrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
320 const Matrix_t &output, const Matrix_t &weights);
321 ///@}
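 // Example (illustrative sketch): evaluating the mean squared error and its gradient for
 // one batch; Y (targets), output (predictions), weights (event weights) and dY are
 // assumed to be matrices of matching shape.
 //
 //   auto loss = TCudnn<float>::MeanSquaredError(Y, output, weights);
 //   TCudnn<float>::MeanSquaredErrorGradients(dY, Y, output, weights); // seeds the backward pass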
322
323 //____________________________________________________________________________
324 //
325 // Output Functions
326 //____________________________________________________________________________
327
328 /** @name Output Functions
329 * Output functions transform the activations \p output of the
330 * output layer in the network to a valid prediction \p YHat for
331 * the desired usage of the network, e.g. the identity function
332 * for regression or the sigmoid transformation for two-class
333 * classification.
334 */
335 ///@{
336 static void Sigmoid(Matrix_t &YHat,
337 const Matrix_t & );
338 static void Softmax(Matrix_t &YHat,
339 const Matrix_t & );
340 ///@}
341
342
343
344 //____________________________________________________________________________
345 //
346 // Dropout
347 //____________________________________________________________________________
348
349 /** @name Dropout
350 */
351 ///@{
352
353 /** Apply dropout with activation probability \p p to the given
354 * tensor \p A and scale the result by the reciprocal of \p p. */
355 static void DropoutForward(Tensor_t & A,
356 TDescriptors * descriptors,
357 TWorkspace * workspace,
358 Scalar_t p);
359
360 static void DropoutBackward(Tensor_t & A,
361 TDescriptors * descriptors,
362 TWorkspace * workspace);
363
364 ///@}
365
366 //____________________________________________________________________________
367 //
368 // Batch Normalization
369 //____________________________________________________________________________
370
371 /** @name Batch Normalization Layer Propagation
372 */
373 ///@{
374
375 /** The inputs from each batch are normalized during training to have zero mean and unit variance,
376 * and they are then scaled by two parameters, different for each input variable:
377 * - a scale factor \p gamma
378 * - an offset \p beta, i.e. y = gamma * (x - mean) / sqrt(variance + epsilon) + beta */
379
380 static void BatchNormLayerForwardTraining(int axis, const Tensor_t &x, Tensor_t &y, Matrix_t &gamma, Matrix_t &beta,
381 Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans,
382 Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum,
383 Scalar_t epsilon, const TensorDescriptor_t &bnParDescriptor);
384
385 /** During inference the inputs are not normalized using the batch mean and variance but the
386 * previously computed running mean and variance */
387
388 static void BatchNormLayerForwardInference(int axis, const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta,
389 Tensor_t &y, const Matrix_t &runningMeans,
390 const Matrix_t &runningVars, Scalar_t epsilon,
391 const TensorDescriptor_t &);
392
393 static void BatchNormLayerBackward(int axis, const Tensor_t &x, const Tensor_t &dy, Tensor_t &dx,
394 Matrix_t &gamma, // Matrix_t &beta, (not needed)
395 Matrix_t &dgamma, Matrix_t &dbeta, const Matrix_t &mean, const Matrix_t &variance,
396 const Matrix_t &iVariance, Scalar_t epsilon, const TensorDescriptor_t &);
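 // For reference, the per-element transformation performed by the calls above is the
 // standard batch normalization (a sketch of the math, not the cuDNN code path):
 //
 //   training : y = gamma * (x - batchMean) / sqrt(batchVariance + epsilon) + beta
 //   inference: y = gamma * (x - runningMean) / sqrt(runningVariance + epsilon) + beta
 //
 // with the running statistics maintained as exponential moving averages controlled by
 // the \p momentum parameter.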
397
398 //____________________________________________________________________________
399 //
400 // Regularization
401 //____________________________________________________________________________
402
403 /** @name Regularization
404 * For each regularization type two functions are required, one named
405 * <tt><Type>Regularization</tt> that evaluates the corresponding
406 * regularization functional for a given weight matrix, and one named
407 * <tt>Add<Type>RegularizationGradients</tt> that adds the regularization
408 * component of the gradients to the provided matrix.
409 */
410
411 static Scalar_t L1Regularization(const Matrix_t &W)
412 {
413 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
414 return TCuda<AFloat>::L1Regularization(mW); // delegate to the generic TCuda implementation
415 }
416 static void AddL1RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
417 {
418 TCudaMatrix<AFloat> mA(A.GetDeviceBuffer(), A.GetSize(), 1);
419 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
420 TCuda<AFloat>::AddL1RegularizationGradients(mA, mW, weightDecay);
421 }
422
423 static Scalar_t L2Regularization(const Matrix_t &W)
424 {
425 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
426 return TCuda<AFloat>::L2Regularization(mW);
427 }
428 static void AddL2RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
429 {
430 TCudaMatrix<AFloat> mA(A.GetDeviceBuffer(), A.GetSize(), 1);
431 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
432 TCuda<AFloat>::AddL2RegularizationGradients(mA, mW, weightDecay);
433 }
434 ///@}
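 // Example (illustrative sketch): adding an L2 weight-decay term to the loss and to the
 // weight gradients; W, weightGradients and lambda (the decay strength) are assumed to exist.
 //
 //   auto penalty = lambda * TCudnn<float>::L2Regularization(W);              // add to the loss value
 //   TCudnn<float>::AddL2RegularizationGradients(weightGradients, W, lambda); // add to the gradients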
435
436 //____________________________________________________________________________
437 //
438 // Initialization
439 //____________________________________________________________________________
440
441 /** @name Initialization
442 * For each initialization method, one function in the low-level interface
443 * is provided. The naming scheme is <tt>Initialize<Type></tt> for a given
444 * initialization method Type.
445 */
446 ///@{
447
448 static void InitializeGauss(Matrix_t &A);
449 static void InitializeUniform(Matrix_t &A);
450 static void InitializeIdentity(Matrix_t &A);
451 static void InitializeZero(Matrix_t &A);
452 static void InitializeGlorotNormal(Matrix_t &A);
453 static void InitializeGlorotUniform(Matrix_t &A);
454
455 // return static instance of random generator used for initialization
456 // if generator does not exist it is created the first time with a random seed (e.g. seed = 0)
457 static TRandom &GetRandomGenerator();
458 // set random seed for the static generator
459 // if the static generator does not exist it is created
460 static void SetRandomSeed(size_t seed);
461 ///@}
462
463 //____________________________________________________________________________
464 //
465 // Dropout
466 //____________________________________________________________________________
467
468 /** @name Dropout
469 */
470 ///@{
471
472 /** Apply dropout with activation probability \p p to the given
473 * tensor \p A and scale the result by the reciprocal of \p p. */
474 static void Dropout(Tensor_t &A, Scalar_t p) {}
475
476 ///@}
477
478 //____________________________________________________________________________
479 //
480 // Convolutional Layer Propagation
481 //____________________________________________________________________________
482
483 /** @name Forward Propagation in Convolutional Layer
484 */
485 ///@{
486
487 /** Add the biases in the Convolutional Layer. */
488 static void AddConvBiases(Matrix_t &output, const Matrix_t &biases);
489 ///@}
490
491 /** Dummy placeholder - preparation is currently only required for the CUDA architecture. */
492 static void PrepareInternals(Tensor_t &) {}
493
494 /** Forward propagation in the Convolutional layer */
495 static void ConvLayerForward(Tensor_t &output,
496 Tensor_t &inputActivationFunc, // this is output conv w/o activ func.
497 const Tensor_t &input, const Matrix_t &weights, const Matrix_t &biases,
498 const DNN::CNN::TConvParams &params, EActivationFunction activFunc,
499 Tensor_t & /* inputPrime */, const ConvDescriptors_t &descriptors,
500 ConvWorkspace_t &workspace);
501 // const AFloat alpha = 1,
502 // const AFloat beta = 1);
503
504 /** @name Backward Propagation in Convolutional Layer
505 */
506 ///@{
507
508 /** Perform the complete backward propagation step in a Convolutional Layer.
509 * If the provided \p activationGradientsBackward matrix is not empty, compute the
510 * gradients of the objective function with respect to the activations
511 * of the previous layer (backward direction).
512 * Also compute the weight and the bias gradients. Modifies the values
513 * in \p df and thus produces a valid result only the first time it is
514 * applied after the corresponding forward propagation has been
515 * performed. */
516 static void ConvLayerBackward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients,
517 Matrix_t &biasGradients, Tensor_t &inputActivation, Tensor_t &activationGradients,
518 const Matrix_t &weights, const Tensor_t &activationBackward,
519 const Tensor_t &outputTensor, EActivationFunction activFunc,
520 const ConvDescriptors_t &descriptors, ConvWorkspace_t &workspace, size_t /*batchSize*/,
521 size_t /*inputHeight*/, size_t /*inputWidth*/, size_t /*depth*/, size_t /*height*/,
522 size_t /*width*/, size_t /*filterDepth*/, size_t /*filterHeight*/,
523 size_t /*filterWidth*/, size_t /*nLocalViews*/);
524
525 ///@}
526
527 //____________________________________________________________________________
528 //
529 // Max Pooling Layer Propagation
530 //____________________________________________________________________________
531 /** @name Forward Propagation in Max Pooling Layer
532 */
533 ///@{
534
535 /** Downsample the matrix \p C to the matrix \p A, using the max
536 * operation, such that the winning indices are stored in matrix
537 * \p B. No winning indices are needed for cuDNN. */
538 static void Downsample(Tensor_t &A, Tensor_t & /*B*/, const Tensor_t &C, const PoolingDescriptors_t &descriptors,
539 PoolingWorkspace_t &workspace, size_t imgHeight, size_t imgWidth, size_t fltHeight,
540 size_t fltWidth, size_t strideRows, size_t strideCols);
541
542 ///@}
543
544 /** @name Backward Propagation in Max Pooling Layer
545 */
546 ///@{
547 /** Perform the complete backward propagation step in a Pooling Layer. Based on the
548 * input to and output from the MaxPoolLayer, the gradients for the winning pixels
549 * are computed. */
550 static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward, const Tensor_t &activationGradients,
551 const Tensor_t & /*indexMatrix*/, const Tensor_t &inputActivation,
552 const Tensor_t &outputTensor, const PoolingDescriptors_t &descriptors,
553 PoolingWorkspace_t &workspace, size_t imgHeight, size_t imgWidth, size_t fltHeight,
554 size_t fltWidth, size_t strideRows, size_t strideCols, size_t nLocalViews);
555
556 ///@}
557
558 //____________________________________________________________________________
559 //
560 // Reshape Layer Propagation
561 //____________________________________________________________________________
562 /** @name Forward and Backward Propagation in Reshape Layer
563 */
564 ///@{
565
566 /** Transform the matrix \p B to a matrix with different dimensions \p A */
567 // static void Reshape(Matrix_t &A, const Matrix_t &B);
568
569 /** Flattens the tensor \p B, such that each matrix is stretched into
570 * one row, resulting in the matrix \p A. */
571 static void Flatten(Tensor_t &A, const Tensor_t &B);
572
573 /** Transforms each row of \p B to a matrix and stores it in the
574 * tensor \p A. */
575 static void Deflatten(Tensor_t &A, const Tensor_t &B); // size_t index, size_t nRows,size_t nCols);
576
577 /** Rearrange data according to time: fill the B x T x D tensor \p out with the T x B x D tensor \p in. */
578 static void Rearrange(Tensor_t &out, const Tensor_t &in) { TCuda<AFloat>::Rearrange(out, in); }
579
580 /** Backward pass for Recurrent Networks */
581 static Matrix_t &RecurrentLayerBackward(Matrix_t &state_gradients_backward, // BxH
582 Matrix_t & /* input_weight_gradients */, Matrix_t &/* state_weight_gradients */,
583 Matrix_t &/* bias_gradients */,
584 Matrix_t &/* df */, // DxH
585 const Matrix_t &/* state */, // BxH
586 const Matrix_t &/* weights_input */, // HxD
587 const Matrix_t &/* weights_state */, // HxH
588 const Matrix_t &/* input */, // BxD
589 Matrix_t &/* input_gradient */)
590 {
591 return state_gradients_backward;
592 }
593
594 ///@}
595
596 //____________________________________________________________________________
597 //
598 // Additional Arithmetic Functions
599 //____________________________________________________________________________
600
601 /** @name Additional Arithmetic Functions
602 *
603 * Additional arithmetic on CUDA matrices used to implement the low-level
604 * interface.
605 */
606
607 /** In-place Hadamard (element-wise) product of matrices \p A and \p B
608 * with the result being written into \p A.
609 */
610 static void Hadamard(Tensor_t &A, const Tensor_t &B)
611 {
612 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), 1, A.GetSize());
613 TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), 1, B.GetSize());
614 assert(A.GetSize() == B.GetSize());
615 TCuda<AFloat>::Hadamard(tmpA, tmpB);
616 }
617 // static void Hadamard(Matrix_t &A,
618 // const Matrix_t &B);*/
619 // {
620 // Tensor_t tA(A);
621 // Hadamard( tA, Tensor_t(B));
622 // }
623
624
625 /** Compute the sum of all elements in \p A */
626 static Scalar_t Sum(const Matrix_t &A, Scalar_t alpha = 1.0, Scalar_t beta = 0.0);
627
628 /** Check two matrices for equality, taking floating point arithmetic errors into account. */
629 //static bool AlmostEquals(const Matrix_t &A, const Matrix_t &B, double epsilon = 0.1);
630
631 /** Add the constant \p beta to all the elements of matrix \p A and write the
632 * result into \p A.
633 */
634 static void ConstAdd(Matrix_t &A, Scalar_t beta) {
635 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
636 TCuda<AFloat>::ConstAdd(tmp, beta);
637 }
638
639 /** Multiply all the elements of matrix \p A by the constant \p beta and write the
640 * result into \p A.
641 */
642 static void ConstMult(Matrix_t &A, Scalar_t beta) {
643 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
644 TCuda<AFloat>::ConstMult(tmp, beta);
645 }
646
647 /** Reciprocal each element of the matrix \p A and write the result into
648 * \p A
649 */
650 static void ReciprocalElementWise(Matrix_t &A) {
651 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
652 TCuda<AFloat>::ReciprocalElementWise(tmp);
653 }
654
655 /** Square each element of the matrix \p A and write the result into
656 * \p A
657 */
658 static void SquareElementWise(Matrix_t &A) {
659 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
660 TCuda<AFloat>::SquareElementWise(tmp);
661 }
662
663 /** Square root each element of the matrix \p A and write the result into
664 * \p A
665 */
666 //static void SqrtElementWise(Matrix_t &A, Scalar_t alpha = 1, Scalar_t beta = 0, Scalar_t gamma = 0) {
667 static void SqrtElementWise(Matrix_t &A) {
668 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
669 TCuda<AFloat>::SqrtElementWise(tmp);
670 }
671
672 // optimizer functions
673 static void AdamUpdate(Matrix_t & A, const Matrix_t & M, const Matrix_t & V, Scalar_t alpha, Scalar_t eps) {
674 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(),1);
675 TCudaMatrix<AFloat> tmpM(M.GetDeviceBuffer(), M.GetSize(),1);
676 TCudaMatrix<AFloat> tmpV(V.GetDeviceBuffer(), V.GetSize(),1);
677 TCuda<AFloat>::AdamUpdate(tmpA, tmpM, tmpV,alpha, eps);
678 }
679 static void AdamUpdateFirstMom(Matrix_t & A, const Matrix_t & B, Scalar_t beta) {
680 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(),1);
681 TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), B.GetSize(),1);
682 TCuda<AFloat>::AdamUpdateFirstMom(tmpA, tmpB, beta);
683 }
684 static void AdamUpdateSecondMom(Matrix_t & A, const Matrix_t & B, Scalar_t beta) {
685 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(),1);
686 TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), B.GetSize(),1);
687 TCuda<AFloat>::AdamUpdateSecondMom(tmpA, tmpB, beta);
688 }
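 // Example (illustrative sketch): one Adam step built from the three helpers above, with
 // hypothetical gradient matrix G and hyper-parameters lr, beta1, beta2 and eps. The
 // comments sketch the usual Adam update formulas.
 //
 //   TCudnn<float>::AdamUpdateFirstMom(M, G, beta1);  // M <- beta1 * M + (1 - beta1) * G
 //   TCudnn<float>::AdamUpdateSecondMom(V, G, beta2); // V <- beta2 * V + (1 - beta2) * G * G
 //   TCudnn<float>::AdamUpdate(W, M, V, lr, eps);     // W <- W - lr * M / (sqrt(V) + eps)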
689
690 // printing of tensor
691 static void PrintTensor( const Tensor_t & A, const std::string name = "tensor", bool = false);
692
693
694
695 ///////////////////////////////////////////////////////////////////////////////
696 /// extra functions defined only for CPU architecture !!!
697 //////////////////////////////////////////////////////////////////////////////
698
699 /** Sum rows of (m x n) matrix \p A and write the results into the first
700 * m elements in \p B.
701 */
702 static void SumRows(Matrix_t & B, const Matrix_t & A);
703
704
705};
706
707
708//____________________________________________________________________________
709template <typename AFloat>
710template <typename ATensor>
711void TCudnn<AFloat>::CopyDiffArch(TCudaTensor<AFloat> &B,
712 const ATensor &A)
713{
714
715 // should add a static assert that A is not the same type as B
716
717 // this copies tensors from different architectures
718 if (B.GetLayout() == GetTensorLayout() ) {
719 assert(B.GetShape().size() == 4);
720 for (size_t i = 0; i < A.GetFirstSize(); ++i) {
721 TMatrixT<AFloat> matIn = A.At(i).GetMatrix(); // this converts the tensor (B,D,HW) into (D,HW)_i
722 // TMatrix has the correct layout (row-major), no need to transpose in this case
723 TCudaTensor<AFloat> tmpOut = B.At(i); // matrix (D,HW)
724 // copy will copy the buffer
725 TCudaTensor<AFloat> tmpIn(matIn.GetMatrixArray(), tmpOut.GetShape(), tmpOut.GetLayout());
726 Copy(tmpOut, tmpIn);
727 }
728 } else {
729 // case of column-major layout
730 TMatrixT<AFloat> tmp = A;
731 TCudaMatrix<AFloat> tmp2(tmp);
732 TCudaTensor<AFloat> tA(tmp2);
733 Copy(B, tA);
734 }
735}
736
737//____________________________________________________________________________
738template <typename AFloat>
739template <typename AMatrix>
740void TCudnn<AFloat>::CopyWeightsDiffArch(TCudaTensor<AFloat> &B, const AMatrix &A)
741{
742 // copy from another architecture using the reference one
743 // this is not very efficient since it creates temporary objects
744 TMatrixT<AFloat> tmp = A; // .GetMatrix();
745 // we need to transpose for a different layout
746 if (B.GetLayout() == GetTensorLayout() ) {
747 // this is for CNN weights that are in row-major format
748 assert(B.GetShape().size() == 4); // weights shape should be 4
749 tmp.T();
750 }
751 TCudaMatrix<AFloat> tmp2(tmp);
752 TCudaTensor<AFloat> tA(tmp2);
753 Copy(B, tA);
754}
755
756//____________________________________________________________________________
757template <typename AFloat>
758template <typename AMatrix_t>
759void TCudnn<AFloat>::CopyDiffArch(std::vector<Tensor_t> &B,
760 const std::vector<AMatrix_t> &A)
761{
762 for (size_t i = 0; i < B.size(); ++i) {
763 CopyWeightsDiffArch(B[i], A[i]);
764 }
765}
766
767template <typename AFloat>
768void TCudnn<AFloat>::PrintTensor(const typename TCudnn<AFloat>::Tensor_t & A, const std::string name, bool truncate )
769{
770 std::cout << name << " size = " << A.GetSize() << " shape = { ";
771 auto shape = A.GetShape();
772 for (size_t k = 0; k < shape.size()-1; ++k)
773 std::cout << shape[k] << " , ";
774 std::cout << shape.back() << " } ";
775 std::cout << " strides = { ";
776 auto strides = A.GetStrides();
777 for (size_t k = 0; k < strides.size()-1; ++k)
778 std::cout << strides[k] << " , ";
779 std::cout << strides.back() << " }\n ";
780
781 if (A.GetShape().size() == 2 ) {
782 for (size_t i = 0; i < A.GetShape()[0]; ++i) {
783 std::cout << "{ ";
784 size_t n = A.GetShape()[1];
785 if (truncate) n = std::min(n,size_t(10));
786 for (size_t j = 0; j < n; ++j) {
787 std::cout << A(i,j) << " ";
788
789 }
790 if (truncate && n < A.GetShape()[1]) std::cout << " ...... ";
791 std::cout << " } " << std::endl;
792 }
793 } else if (A.GetShape().size() == 3 ) {
794 for (size_t i = 0; i < A.GetFirstSize(); ++i) {
795 std::cout << "{ ";
796 for (size_t j = 0; j < A.GetHSize(); ++j) {
797 std::cout << "{ ";
798 size_t n = A.GetWSize();
799 if (truncate) n = std::min(n,size_t(10));
800 for (size_t k = 0; k < n; ++k) {
801 std::cout << A(i,j,k) << " ";
802 }
803 if (truncate && n < A.GetWSize()) std::cout << " ...... ";
804 std::cout << " } " << std::endl;
805 }
806 std::cout << " } " << std::endl;
807 }
808 } else if (A.GetShape().size() == 4 ) {
809 for (size_t i = 0; i < A.GetShape()[0]; ++i) {
810 std::cout << "{ ";
811 for (size_t j = 0; j < A.GetShape()[1]; ++j) {
812 std::cout << "{ ";
813 for (size_t k = 0; k < A.GetShape()[2]; ++k) {
814 size_t n = A.GetShape()[3];
815 if (truncate) n = std::min(n,size_t(10));
816 for (size_t l = 0; l < n; ++l) {
817 std::cout << A(i,j,k,l) << " ";
818 }
819 if (truncate && n < A.GetShape()[3]) std::cout << " ...... ";
820 std::cout << " } " << std::endl;
821 }
822 std::cout << " } " << std::endl;
823 }
824 std::cout << " } " << std::endl;
825 }
826 }
827 else {
828 for (size_t l = 0; l < A.GetSize(); ++l) {
829 std::cout << A.GetData()[l] << " ";
830 }
831 std::cout << "\n";
832 }
833}
834
835// initialize the CNN options
836// possible options for forward (from 0 to 7)
837//
838// 0 : CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
839// 1 : CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
840// 6 : CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
841// 7 : CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; (lots of memory)
842
843// for backward data (from 0 to 5)
844// 1 : CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
845// 5 : CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
846
847template <typename AFloat>
848int TCudnn<AFloat>::CNNOptions::ConvFwdAlgorithm = -1;
849template <typename AFloat>
850int TCudnn<AFloat>::CNNOptions::ConvBwdDataAlgorithm = -1;
851template <typename AFloat>
852int TCudnn<AFloat>::CNNOptions::ConvBwdFilterAlgorithm = -1;
853template <typename AFloat>
854Long_t TCudnn<AFloat>::CNNOptions::ConvMaxWorkspaceSize = -1; // -1 means let cuDNN use its default
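// Example (illustrative sketch): forcing a specific cuDNN convolution algorithm and limiting
// the workspace size before the network is built; the numeric values follow the enumeration
// in the comments above, and the workspace size is assumed to be in bytes.
//
//   TMVA::DNN::TCudnn<float>::CNNOptions::ConvFwdAlgorithm = 1;            // IMPLICIT_PRECOMP_GEMM
//   TMVA::DNN::TCudnn<float>::CNNOptions::ConvMaxWorkspaceSize = 1 << 30;  // at most ~1 GB of workspace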
855
856} // namespace DNN
857} // namespace TMVA
858
859#endif
860#endif