Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
TCudnn.h
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Joana Niermann 23/07/19
3
4/*************************************************************************
5 * Copyright (C) 2019, Joana Niermann *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12///////////////////////////////////////////////////////////////////
13// Definition of the TCudnn architecture class, which provides //
14// a wrapping of the low-level functionality for neural networks //
15// in the cuDNN library. //
16///////////////////////////////////////////////////////////////////
17
18#ifndef TMVA_DNN_ARCHITECTURES_CUDNN
19#define TMVA_DNN_ARCHITECTURES_CUDNN
20
21#include "RConfigure.h" // for definition of R__HAS_CUDNN
22
23#ifndef R__HAS_CUDNN
24#error This file can be compiled only when cudnn is available in ROOT
25#else
26
27#include "cudnn.h"
28
29#include "TMVA/DNN/Functions.h"
31//#include "TMVA/DNN/CNN/Descriptors.h"
38
39
40#include "Cuda/CudaBuffers.h"
41#include "Cuda/CudaTensor.h"
43#include <utility>
44#include <vector>
45#include <string>
46
48
49class TRandom;
50
51namespace TMVA
52{
53namespace DNN
54{
55
56struct TCudnnEmptyDescriptor {};
57
58
59/** The TCudnn architecture class.
60 *
61 * Low-level interface class for CUDA computing architectures using the cuDNN
62 * library as backend. Contains as public types the declaration of the scalar,
63 * matrix and buffer types for this architecture, as well as the remaining
64 * functions in the low-level interface in the form of static members.
65 */
66template<typename AFloat = Float_t>
67class TCudnn
68{
69private:
70 static TRandom * fgRandomGen;
71public:
72
73 using Scalar_t = AFloat;
74 using Matrix_t = TCudaTensor<AFloat>;
75 using Tensor_t = TCudaTensor<AFloat>;
76 using DeviceBuffer_t = TCudaDeviceBuffer<AFloat>;
77 using HostBuffer_t = TCudaHostBuffer<AFloat>;
78
79 // The descriptors for the (tensor) data are held by the data classes (CudaTensor)
80 using ActivationDescriptor_t = cudnnActivationDescriptor_t;
81 using ConvolutionDescriptor_t = cudnnConvolutionDescriptor_t;
82 using DropoutDescriptor_t = cudnnDropoutDescriptor_t;
83 using FilterDescriptor_t = cudnnFilterDescriptor_t;
84 //using OpTensorDescriptor_t = cudnnOpTensorDescriptor_t;
85 using PoolingDescriptor_t = cudnnPoolingDescriptor_t;
86 //using ReductionDescriptor_t = cudnnReduceTensorDescriptor_t;
87 using AlgorithmForward_t = cudnnConvolutionFwdAlgo_t;
88 using AlgorithmBackward_t = cudnnConvolutionBwdDataAlgo_t;
89 using AlgorithmHelper_t = cudnnConvolutionBwdFilterAlgo_t;
90 using AlgorithmDataType_t = cudnnDataType_t;
91 using ReduceTensorDescriptor_t = cudnnReduceTensorDescriptor_t;
92 using TensorDescriptor_t = cudnnTensorDescriptor_t;
93 using RecurrentDescriptor_t = cudnnRNNDescriptor_t;
94#if (CUDNN_VERSION >= 8000)
96#else
98#endif
99 using EmptyDescriptor_t = TCudnnEmptyDescriptor; // Used if a descriptor is not needed in a class
100
101 using BNormLayer_t = TBatchNormLayer<TCudnn<AFloat>>;
102 using BNormDescriptors_t = TDNNGenDescriptors<BNormLayer_t>;
103 //using BNormWorkspace_t = CNN::TCNNWorkspace<BNormLayer_t>;*/
104 using ConvLayer_t = CNN::TConvLayer<TCudnn<AFloat>>;
105 using ConvDescriptors_t = CNN::TCNNDescriptors<ConvLayer_t>;
106 using ConvWorkspace_t = CNN::TCNNWorkspace<ConvLayer_t>;
107 using PoolingLayer_t = CNN::TMaxPoolLayer<TCudnn<AFloat>>;
108 using PoolingDescriptors_t = CNN::TCNNDescriptors<PoolingLayer_t>;
109 using PoolingWorkspace_t = CNN::TCNNWorkspace<PoolingLayer_t>;
110
111 using RNNLayer_t = RNN::TBasicRNNLayer<TCudnn<AFloat>>;
112 using RNNDescriptors_t = RNN::TRNNDescriptors<TCudnn<AFloat>>;
113 using RNNWorkspace_t = RNN::TRNNWorkspace<TCudnn<AFloat>>;
114
115 using LSTMLayer_t = RNN::TBasicLSTMLayer<TCudnn<AFloat>>;
116 // using LSTMDescriptors_t = RNN::TRNNDescriptors<LSTMLayer_t>;
117 // using LSTMWorkspace_t = RNN::TRNNWorkspace<LSTMLayer_t>;
118
119 using GRULayer_t = RNN::TBasicGRULayer<TCudnn<AFloat>>;
120 // using GRUDescriptors_t = RNN::TRNNDescriptors<GRULayer_t>;
121 // using GRUWorkspace_t = RNN::TRNNWorkspace<GRULayer_t>;
122
123 // template <typename AFloat>
124 // using ConvDescriptors_t = CNN::TCNNDescriptors<CNN::TConvLayer<TCudnn<AFloat>>>;
125
126 // convolution options
127 // default is -1 (left to cudnn)
128 struct CNNOptions {
129
130 static int ConvFwdAlgorithm;
131 static int ConvBwdDataAlgorithm;
132 static int ConvBwdFilterAlgorithm;
133 // default is 0 (left to cudnn : a value -1 will indicate to not use any space)
135 }; // namespace DNN
136
137 static TMVA::Experimental::MemoryLayout GetTensorLayout() { return TMVA::Experimental::MemoryLayout::RowMajor; }
138
139
140 static Tensor_t CreateTensor(size_t n, size_t c, size_t h, size_t w) {
141 return Tensor_t( {n,c,h,w}, GetTensorLayout(), 0, 0);
142 }
143
144 static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t h, size_t w) {
145 return Tensor_t( buffer, {n,c,h,w}, GetTensorLayout(), 0, 0);
146 }
147
148 static Tensor_t CreateTensor(size_t n, size_t c, size_t w)
149 {
150 return Tensor_t({n, c, w}, GetTensorLayout(), 0, 0);
151 }
152
153 static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t w)
154 {
155 return Tensor_t(buffer, {n, c, w}, GetTensorLayout(), 0, 0);
156 }
157
158 static bool IsCudnn() { return true; }
159
160 // create a weight tensor/matrix vector from another tensor/weight vector using the given tensor shapes
161 // this function is used by the optimizers to store intermediate weights representations
162 static void CreateWeightTensors( std::vector<Matrix_t> & newWeights, const std::vector<Matrix_t> & weights) {
163 if (!newWeights.empty()) newWeights.clear();
164 size_t n = weights.size();
165 for (size_t i = 0; i < n; ++i)
166 newWeights.emplace_back( weights[i].GetShape(), weights[i].GetLayout(), 0, 0);
167 }
168 //____________________________________________________________________________
169 //
170 // Architecture Initialization
171 //____________________________________________________________________________
172
173 static void InitializeBNormDescriptors(TDescriptors * & descriptors,
174 BNormLayer_t *L = nullptr);
175
176 static void InitializeConvDescriptors(TDescriptors * & descriptors,
177 ConvLayer_t *L = nullptr);
178
179 static void InitializePoolDescriptors(TDescriptors * & descriptors,
180 PoolingLayer_t *L = nullptr);
181
182 static void InitializeRNNDescriptors(TDescriptors *&descriptors, RNNLayer_t *layer)
183 {
185 }
186 static void InitializeLSTMDescriptors(TDescriptors *&descriptors, LSTMLayer_t *layer) {
188 }
189 static void InitializeGRUDescriptors(TDescriptors *&descriptors, GRULayer_t *layer) {
191 }
192 template<typename RNNLayer>
193 static void InitializeRecurrentDescriptors(TDescriptors *&descriptors, RNNLayer *L);
194 // static void InitializeRNNDescriptors(TDescriptors *&descriptors, LSTMLayer_t *L = nullptr);
195 // static void InitializeRNNDescriptors(TDescriptors *&descriptors, GRULayer_t *L = nullptr);
196
197 static void InitializeActivationDescriptor(ActivationDescriptor_t & descriptors, EActivationFunction activFunc, double coef = 0.0);
198
199 static void ReleaseConvDescriptors(TDescriptors * descriptors );
200 static void ReleasePoolDescriptors(TDescriptors * descriptors );
201 static void ReleaseRNNDescriptors(TDescriptors *descriptors);
202 static void ReleaseBNormDescriptors(TDescriptors * descriptors );
203 static void ReleaseDescriptor(EmptyDescriptor_t & emptyDescr) {} // Does nothing
204 static void ReleaseDescriptor(ActivationDescriptor_t & activationDescr);
205 static void ReleaseDescriptor(ConvolutionDescriptor_t & convolutionDescr);
206 static void ReleaseDescriptor(DropoutDescriptor_t & dropoutDescr);
207 static void ReleaseDescriptor(FilterDescriptor_t & filterDescr);
208 static void ReleaseDescriptor(PoolingDescriptor_t & poolingDescr);
209 static void ReleaseDescriptor(TensorDescriptor_t & tensorDescr);
210
211
212 static void InitializeConvWorkspace(TWorkspace * & workspace,
213 TDescriptors * & descriptors,
214 const DNN::CNN::TConvParams & params,
215 ConvLayer_t *L = nullptr);
216 static void InitializePoolDropoutWorkspace(TWorkspace * & workspace,
217 TDescriptors * & descriptors,
218 const DNN::CNN::TConvParams & params,
219 PoolingLayer_t *L = nullptr);
220
221 static void InitializeRNNWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, RNNLayer_t *layer)
222 {
224 }
225 static void InitializeLSTMWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, LSTMLayer_t *layer)
226 {
228 }
229 static void InitializeGRUWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, GRULayer_t *layer)
230 {
232 }
233 template<typename RNNLayer>
234 static void InitializeRecurrentWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors,
235 RNNLayer *layer);
236
237 static void FreeConvWorkspace(TWorkspace * workspace);
238 static void FreePoolDropoutWorkspace(TWorkspace * workspace);
239 static void FreeRNNWorkspace(TWorkspace *workspace);
240
241 // tensor inizialization for recurrent networks
242 static void InitializeRNNTensors(RNNLayer_t *layer) { InitializeRecurrentTensors<RNNLayer_t>(layer); }
243 static void InitializeLSTMTensors(LSTMLayer_t *layer) { InitializeRecurrentTensors<LSTMLayer_t>(layer); }
244 static void InitializeGRUTensors(GRULayer_t *layer) { InitializeRecurrentTensors<GRULayer_t>(layer); }
245 template <typename RNNLayer>
247
248 //____________________________________________________________________________
249 //
250 // Propagation
251 //____________________________________________________________________________
252
253 /** @name Forward Propagation
254 * Low-level functions required for the forward propagation of activations
255 * through the network.
256 */
257 ///@{
258 /** Matrix-multiply \p input with the transpose of \pweights and
259 * write the results into \p output. */
260 static void MultiplyTranspose(Tensor_t &output, const Tensor_t &input, const Matrix_t &weights);
261
262 /** Add the vectors biases row-wise to the matrix output */
263 static void AddRowWise(Tensor_t &output,const Matrix_t &biases);
264
265 /** @name Backward Propagation (Dense Layers)
266 * Low-level functions required for the forward propagation of activations
267 * through the network.
268 */
269 ///@{
270 /** Perform the complete backward propagation step. If the provided
271 * \p activationGradientsBackward matrix is not empty, compute the
272 * gradients of the objective function with respect to the activations
273 * of the previous layer (backward direction).
274 * Also compute the weight and the bias gradients. Modifies the values
275 * in \p df and thus produces only a valid result, if it is applied the
276 * first time after the corresponding forward propagation has been per-
277 * formed. */
278 static void Backward(Tensor_t & activationGradientsBackward,
279 Matrix_t & weightGradients,
280 Matrix_t & biasGradients,
281 Tensor_t & df,
282 const Tensor_t & activationGradients,
283 const Matrix_t & weights,
284 const Tensor_t & activationBackward);
285
286 /** Above functions extended to vectors */
287 static void ScaleAdd(Tensor_t & A, const Tensor_t & B,
288 Scalar_t alpha = 1.0,
289 Scalar_t beta = 1.0);
290
291 /** Deep copy from B to A. */
292 static void Copy(Tensor_t & A, const Tensor_t & B);
293
294 // copy from another tensor
295 template<typename ATensor_t>
296 static void CopyDiffArch(Tensor_t & A,
297 const ATensor_t & B);
298
299 template <typename ATensor_t>
300 static void CopyWeightsDiffArch(Tensor_t &A, const ATensor_t &B);
301
302 //template<>
303 static void CopyDiffArch(Tensor_t A, const Tensor_t & B ) { Copy(A,B); }
304
305 // copy from vector of matrices of different types
306 template<typename AMatrix_t>
307 static void CopyDiffArch(std::vector<Tensor_t> & A,
308 const std::vector<AMatrix_t> & B);
309
310
311 //____________________________________________________________________________
312 //
313 // Activation Functions
314 //____________________________________________________________________________
315
316 /** @name Activation Functions
317 * For each activation function, the low-level interface contains two routines.
318 * One that applies the activation function to a matrix and one that evaluate
319 * the derivatives of the activation function at the elements of a given matrix
320 * and writes the results into the result matrix.
321 */
322 ///@{
323 static void Identity(Tensor_t & X) {}
324 static void IdentityDerivative(Tensor_t & dX, Tensor_t& X,
325 Tensor_t & Y, Tensor_t & dY,
326 ActivationDescriptor_t activationDescr,
327 const AFloat alpha = 1,
328 const AFloat beta = 1) {}
329
330 static void ActivationFunctionForward(Tensor_t & X, EActivationFunction activFunct,
331 const ActivationDescriptor_t activationDescr,
332 const double coef = 0.0, const AFloat alpha = 1,
333 const AFloat beta = 0);
334
335 // same as above but using different input/output tensors
336 static void ActivationFunctionForward(Tensor_t &Y, const Tensor_t & X, EActivationFunction activFunct,
337 const ActivationDescriptor_t activationDescr, const double coef = 0.0,
338 const AFloat alpha = 1, const AFloat beta = 0);
339
340 /** Computes the gradient of the activation function */
341 static void ActivationFunctionBackward(Tensor_t & dX, const Tensor_t & Y,
342 const Tensor_t & dY, const Tensor_t & X,
343 EActivationFunction activFunct,
344 const ActivationDescriptor_t activationDescr,
345 const AFloat alpha = 1,
346 const AFloat beta = 0);
347
348 //
349 // No cudnn implementation for the following activation functions
350 //
351 //static void SymmetricRelu(Tensor_t & B);
352
353 // implementations not used by Cudnn
354 static void Relu(Tensor_t &) {}
355 static void Sigmoid(Tensor_t &) {}
356 static void Tanh(Tensor_t &) {}
357 static void FastTanh(Tensor_t &) {}
358 static void SymmetricRelu(Tensor_t &) {}
359 static void SoftSign(Tensor_t &) {}
360 static void Gauss(Tensor_t &) {}
361
362 static void IdentityDerivative(Tensor_t &, const Tensor_t &) {}
363 static void ReluDerivative(Tensor_t &, const Tensor_t &) {}
364 static void SigmoidDerivative(Tensor_t &, const Tensor_t &) {}
365 static void TanhDerivative(Tensor_t &, const Tensor_t &) {}
366 static void FastTanhDerivative(Tensor_t &, const Tensor_t &) {}
367 static void SymmetricReluDerivative(Tensor_t & , const Tensor_t & ) {}
368 static void SoftSignDerivative(Tensor_t & , const Tensor_t & ) {}
369 static void GaussDerivative(Tensor_t & , const Tensor_t & ) {}
370 ///@}
371
372 //____________________________________________________________________________
373 //
374 // Loss Functions
375 //____________________________________________________________________________
376
377 /** @name Loss Functions
378 * Loss functions compute a scalar value given the \p output of the network
379 * for a given training input and the expected network prediction \p Y that
380 * quantifies the quality of the prediction. For each function also a routing
381 * that computes the gradients (suffixed by Gradients) must be provided for
382 * the starting of the backpropagation algorithm.
383 */
384 ///@{
385
386 static Scalar_t MeanSquaredError(const Matrix_t &Y, const Matrix_t &output,
387 const Matrix_t &weights);
388 static void MeanSquaredErrorGradients(Matrix_t &dY, const Matrix_t &Y,
389 const Matrix_t &output, const Matrix_t &weights);
390
391 /** Sigmoid transformation is implicitly applied, thus \p output should
392 * hold the linear activations of the last layer in the net. */
393 static Scalar_t CrossEntropy(const Matrix_t &Y, const Matrix_t &output,
394 const Matrix_t &weights);
395
396 static void CrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
397 const Matrix_t &output, const Matrix_t &weights);
398
399 /** Softmax transformation is implicitly applied, thus \p output should
400 * hold the linear activations of the last layer in the net. */
401 static Scalar_t SoftmaxCrossEntropy(const Matrix_t &Y, const Matrix_t &output,
402 const Matrix_t &weights);
403 static void SoftmaxCrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
404 const Matrix_t &output, const Matrix_t &weights);
405 ///@}
406
407 //____________________________________________________________________________
408 //
409 // Output Functions
410 //____________________________________________________________________________
411
412 /** @name Output Functions
413 * Output functions transform the activations \p output of the
414 * output layer in the network to a valid prediction \p YHat for
415 * the desired usage of the network, e.g. the identity function
416 * for regression or the sigmoid transformation for two-class
417 * classification.
418 */
419 ///@{
420 static void Sigmoid(Matrix_t &YHat,
421 const Matrix_t & );
422 static void Softmax(Matrix_t &YHat,
423 const Matrix_t & );
424 ///@}
425
426
427
428 //____________________________________________________________________________
429 //
430 // Dropout
431 //____________________________________________________________________________
432
433 /** @name Dropout
434 */
435 ///@{
436
437 /** Apply dropout with activation probability \p p to the given
438 * tensor \p A and scale the result by reciprocal of \p p. */
439 static void DropoutForward(Tensor_t & A,
440 TDescriptors * descriptors,
441 TWorkspace * workspace,
442 Scalar_t p);
443
444 static void DropoutBackward(Tensor_t & A,
445 TDescriptors * descriptors,
446 TWorkspace * workspace);
447
448 ///@}
449
450 //____________________________________________________________________________
451 //
452 // Batch Normalization
453 //____________________________________________________________________________
454
455 /** @name Batch Normalization Layer Propagation
456 */
457 ///@{
458
459 /** The input from each batch are normalized during training to have zero mean and unit variance
460 * and they are then scaled by two parameter, different for each input variable:
461 * - a scale factor \gamma gamma
462 * - an offset \beta beta */
463
464 static void BatchNormLayerForwardTraining(int axis, const Tensor_t &x, Tensor_t &y, Matrix_t &gamma, Matrix_t &beta,
465 Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans,
466 Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum,
467 Scalar_t epsilon, const TensorDescriptor_t &bnParDescriptor);
468
469 /** During inference the inputs are not normalized using the batch mean but the previously computed
470 * at running mean and variance */
471
472 static void BatchNormLayerForwardInference(int axis, const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta,
473 Tensor_t &y, const Matrix_t &runningMeans,
474 const Matrix_t &runningVars, Scalar_t epsilon,
475 const TensorDescriptor_t &);
476
477 static void BatchNormLayerBackward(int axis, const Tensor_t &x, const Tensor_t &dy, Tensor_t &dx,
478 Matrix_t &gamma, // Matrix_t &beta, (not needed)
479 Matrix_t &dgamma, Matrix_t &dbeta, const Matrix_t &mean, const Matrix_t &variance,
480 const Matrix_t &iVariance, Scalar_t epsilon, const TensorDescriptor_t &);
481
482 //____________________________________________________________________________
483 //
484 // Regularization
485 //____________________________________________________________________________
486
487 /** @name Regularization
488 * For each regularization type two functions are required, one named
489 * <tt><Type>Regularization</tt> that evaluates the corresponding
490 * regularization functional for a given weight matrix and the
491 * <tt>Add<Type>RegularizationGradients</tt>, that adds the regularization
492 * component in the gradients to the provided matrix.
493 */
494
495 static Scalar_t L1Regularization(const Matrix_t &W)
496 {
497 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
498 return TCuda<AFloat>::L1Regularization(mW);
499 }
500 static void AddL1RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
501 {
502 TCudaMatrix<AFloat> mA(A.GetDeviceBuffer(), A.GetSize(), 1);
503 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
504 return TCuda<AFloat>::AddL1RegularizationGradients(mA, mW, weightDecay);
505 }
506
507 static Scalar_t L2Regularization(const Matrix_t &W)
508 {
509 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
510 return TCuda<AFloat>::L2Regularization(mW);
511 }
512 static void AddL2RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
513 {
514 TCudaMatrix<AFloat> mA(A.GetDeviceBuffer(), A.GetSize(), 1);
515 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
516 return TCuda<AFloat>::AddL1RegularizationGradients(mA, mW, weightDecay);
517 }
518 ///@}
519
520 //____________________________________________________________________________
521 //
522 // Initialization
523 //____________________________________________________________________________
524
525 /** @name Initialization
526 * For each initialization method, one function in the low-level interface
527 * is provided. The naming scheme is <p>Initialize<Type></p> for a given
528 * initialization method Type.
529 */
530 ///@{
531
532 static void InitializeGauss(Matrix_t &A);
533 static void InitializeUniform(Matrix_t &A);
534 static void InitializeIdentity(Matrix_t &A);
535 static void InitializeZero(Matrix_t &A);
536 static void InitializeGlorotNormal(Matrix_t &A);
537 static void InitializeGlorotUniform(Matrix_t &A);
538
539 // return static instance of random generator used for initialization
540 // if generator does not exist it is created the first time with a random seed (e.g. seed = 0)
541 static TRandom &GetRandomGenerator();
542 // set random seed for the static generator
543 // if the static generator does not exists it is created
544 static void SetRandomSeed(size_t seed);
545 ///@}
546
547 //____________________________________________________________________________
548 //
549 // Dropout
550 //____________________________________________________________________________
551
552 /** @name Dropout
553 */
554 ///@{
555
556 /** Apply dropout with activation probability \p p to the given
557 * tensor \p A and scale the result by reciprocal of \p p. */
558 static void Dropout(Tensor_t &A, Scalar_t p) {}
559
560 ///@}
561
562 //____________________________________________________________________________
563 //
564 // Convolutional Layer Propagation
565 //____________________________________________________________________________
566
567 /** @name Forward Propagation in Convolutional Layer
568 */
569 ///@{
570
571 /** Add the biases in the Convolutional Layer. */
572 static void AddConvBiases(Matrix_t &output, const Matrix_t &biases);
573 ///@}
574
575 /** Dummy placeholder - preparation is currently only required for the CUDA architecture. */
576 static void PrepareInternals(Tensor_t &) {}
577
578 /** Forward propagation in the Convolutional layer */
579 static void ConvLayerForward(Tensor_t &output,
580 Tensor_t &inputActivationFunc, // this is output conv w/o activ func.
581 const Tensor_t &input, const Matrix_t &weights, const Matrix_t &biases,
582 const DNN::CNN::TConvParams &params, EActivationFunction activFunc,
583 Tensor_t & /* inputPrime */, const ConvDescriptors_t &descriptors,
584 ConvWorkspace_t &workspace);
585 // const AFloat alpha = 1,
586 // const AFloat beta = 1);
587
588 /** @name Backward Propagation in Convolutional Layer
589 */
590 ///@{
591
592 /** Perform the complete backward propagation step in a Convolutional Layer.
593 * If the provided \p activationGradientsBackward matrix is not empty, compute the
594 * gradients of the objective function with respect to the activations
595 * of the previous layer (backward direction).
596 * Also compute the weight and the bias gradients. Modifies the values
597 * in \p df and thus produces only a valid result, if it is applied the
598 * first time after the corresponding forward propagation has been per-
599 * formed. */
600 static void ConvLayerBackward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients,
601 Matrix_t &biasGradients, Tensor_t &inputActivation, Tensor_t &activationGradients,
602 const Matrix_t &weights, const Tensor_t &activationBackward,
603 const Tensor_t &outputTensor, EActivationFunction activFunc,
604 const ConvDescriptors_t &descriptors, ConvWorkspace_t &workspace, size_t /*batchSize*/,
605 size_t /*inputHeight*/, size_t /*inputWidth*/, size_t /*depth*/, size_t /*height*/,
606 size_t /*width*/, size_t /*filterDepth*/, size_t /*filterHeight*/,
607 size_t /*filterWidth*/, size_t /*nLocalViews*/);
608
609 ///@}
610
611 //____________________________________________________________________________
612 //
613 // Max Pooling Layer Propagation
614 //____________________________________________________________________________
615 /** @name Forward Propagation in Max Pooling Layer
616 */
617 ///@{
618
619 /** Downsample the matrix \p C to the matrix \p A, using max
620 * operation, such that the winning indices are stored in matrix
621 * \p B. No winning indices needed for cuDNN. */
622 static void Downsample(Tensor_t &A, Tensor_t & /*B*/, const Tensor_t &C, const PoolingDescriptors_t &descriptors,
623 PoolingWorkspace_t &workspace, size_t imgHeight, size_t imgWidth, size_t fltHeight,
624 size_t fltWidth, size_t strideRows, size_t strideCols);
625
626 ///@}
627
628 /** @name Backward Propagation in Max Pooling Layer
629 */
630 ///@{
631 /** Perform the complete backward propagation step in a Pooling Layer. Based on the
632 * input to and output from the MaxPoolLayer, the gradients for the winning pixels
633 * are computed. */
634 static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward, const Tensor_t &activationGradients,
635 const Tensor_t & /*indexMatrix*/, const Tensor_t &inputActivation,
636 const Tensor_t &outputTensor, const PoolingDescriptors_t &descriptors,
637 PoolingWorkspace_t &workspace, size_t imgHeight, size_t imgWidth, size_t fltHeight,
638 size_t fltWidth, size_t strideRows, size_t strideCols, size_t nLocalViews);
639
640 ///@}
641
642 //____________________________________________________________________________
643 //
644 // Reshape Layer Propagation
645 //____________________________________________________________________________
646 /** @name Forward and Backward Propagation in Reshape Layer
647 */
648 ///@{
649
650 /** Transform the matrix \p B to a matrix with different dimensions \p A */
651 // static void Reshape(Matrix_t &A, const Matrix_t &B);
652
653 /** Flattens the tensor \p B, such that each matrix, is stretched in
654 * one row, resulting with a matrix \p A. */
655 static void Flatten(Tensor_t &A, const Tensor_t &B);
656
657 /** Transforms each row of \p B to a matrix and stores it in the
658 * tensor \p B. */
659 static void Deflatten(Tensor_t &A, const Tensor_t &B); // size_t index, size_t nRows,size_t nCols);
660
661 /** Rearrage data according to time fill B x T x D out with T x B x D matrix in*/
662 static void Rearrange(Tensor_t &out, const Tensor_t &in);
663
664 // RNN functions
665 static void RNNForward(const Tensor_t &x, const Tensor_t &hx, const Tensor_t &cx, const Tensor_t &weights,
666 Tensor_t &y, Tensor_t &hy, Tensor_t &cy, const RNNDescriptors_t &descr,
667 RNNWorkspace_t &workspace, bool isTraining);
668
669 static void RNNBackward(const Tensor_t &x, const Tensor_t &hx, const Tensor_t &cx, const Tensor_t &y, const Tensor_t &dy,
670 const Tensor_t &dhy, const Tensor_t &dcy, const Tensor_t &weights, Tensor_t &dx, Tensor_t &dhx,
671 Tensor_t &dcx, Tensor_t &dw, const RNNDescriptors_t &desc, RNNWorkspace_t &workspace);
672
673
674 // Backward pass for Recurrent Networks functions used by another architectures
675 //******************************************************************************************
676 static Matrix_t &RecurrentLayerBackward(Matrix_t &state_gradients_backward, // BxH
677 Matrix_t & /* input_weight_gradients */,
678 Matrix_t & /* state_weight_gradients */, Matrix_t & /* bias_gradients */,
679 Matrix_t & /* df */, // DxH
680 const Matrix_t & /* state */, // BxH
681 const Matrix_t & /* weights_input */, // HxD
682 const Matrix_t & /* weights_state */, // HxH
683 const Matrix_t & /* input */, // BxD
684 Matrix_t & /* input_gradient */)
685 {
687 }
688 static Matrix_t &LSTMLayerBackward(
689 Matrix_t & state_gradients_backward , Matrix_t & /*cell_gradients_backward*/,
690 Matrix_t & /*input_weight_gradients*/, Matrix_t & /*forget_weight_gradients*/,
691 Matrix_t & /*candidate_weight_gradients*/, Matrix_t & /*output_weight_gradients*/,
692 Matrix_t & /*input_state_weight_gradients*/, Matrix_t & /*forget_state_weight_gradients*/,
693 Matrix_t & /*candidate_state_weight_gradients*/,
694 Matrix_t & /*output_state_weight_gradients*/, Matrix_t & /*input_bias_gradients*/,
695 Matrix_t & /*forget_bias_gradients*/, Matrix_t & /*candidate_bias_gradients*/,
696 Matrix_t & /*output_bias_gradients*/, Matrix_t & /*di*/, Matrix_t & /*df*/,
697 Matrix_t & /*dc*/, Matrix_t & /*dout*/,
698 const Matrix_t & /*precStateActivations*/, const Matrix_t & /*precCellActivations*/,
699 const Matrix_t & /*fInput*/, const Matrix_t & /*fForget*/,
700 const Matrix_t & /*fCandidate*/, const Matrix_t & /*fOutput*/,
701 const Matrix_t & /*weights_input*/, const Matrix_t & /*weights_forget*/,
702 const Matrix_t & /*weights_candidate*/, const Matrix_t & /*weights_output*/,
703 const Matrix_t & /*weights_input_state*/, const Matrix_t & /*weights_forget_state*/,
704 const Matrix_t & /*weights_candidate_state*/, const Matrix_t & /*weights_output_state*/,
705 const Matrix_t & /*input*/, Matrix_t & /*input_gradient*/,
706 Matrix_t & /*cell_gradient*/, Matrix_t & /*cell_tanh*/)
707 {
709 }
710
711 /** Backward pass for GRU Network */
712 static Matrix_t &GRULayerBackward(
713 Matrix_t & state_gradients_backward, Matrix_t & /*reset_weight_gradients*/,
714 Matrix_t & /*update_weight_gradients*/, Matrix_t & /*candidate_weight_gradients*/,
715 Matrix_t & /*reset_state_weight_gradients*/, Matrix_t & /*update_state_weight_gradients*/,
716 Matrix_t & /*candidate_state_weight_gradients*/, Matrix_t & /*reset_bias_gradients*/,
717 Matrix_t & /*update_bias_gradients*/, Matrix_t & /*candidate_bias_gradients*/,
718 Matrix_t & /*dr*/, Matrix_t & /*du*/, Matrix_t & /*dc*/,
719 const Matrix_t & /*precStateActivations*/, const Matrix_t & /*fReset*/,
720 const Matrix_t & /*fUpdate*/, const Matrix_t & /*fCandidate*/,
721 const Matrix_t & /*weights_reset*/, const Matrix_t & /*weights_update*/,
722 const Matrix_t & /*weights_candidate*/, const Matrix_t & /*weights_reset_state*/,
723 const Matrix_t & /*weights_update_state*/, const Matrix_t & /*weights_candidate_state*/,
724 const Matrix_t & /*input*/, Matrix_t & /*input_gradient*/, bool)
725 {
727 }
728
729 ///@}
730
731 //____________________________________________________________________________
732 //
733 // Additional Arithmetic Functions
734 //____________________________________________________________________________
735
736 /** @name Additional Arithmetic Functions
737 *
738 * Additional arithmetic on CUDA matrices used to implement the low-level
739 * interface.
740 */
741
742 /** In-place Hadamard (element-wise) product of matrices \p A and \p B
743 * with the result being written into \p A.
744 */
745 static void Hadamard(Tensor_t &A, const Tensor_t &B)
746 {
747 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), 1, A.GetSize());
748 TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), 1, B.GetSize());
749 assert(A.GetSize() == B.GetSize());
750 TCuda<AFloat>::Hadamard(tmpA, tmpB);
751 }
752 // static void Hadamard(Matrix_t &A,
753 // const Matrix_t &B);*/
754 // {
755 // Tensor_t tA(A);
756 // Hadamard( tA, Tensor_t(B));
757 // }
758
759
760 /** Compute the sum of all elements in \p A */
761 static Scalar_t Sum(const Matrix_t &A, Scalar_t alpha = 1.0, Scalar_t beta = 0.0);
762
763 /** Check two matrices for equality, taking floating point arithmetic errors into account. */
764 //static bool AlmostEquals(const Matrix_t &A, const Matrix_t &B, double epsilon = 0.1);
765
766 /** Add the constant \p beta to all the elements of matrix \p A and write the
767 * result into \p A.
768 */
769 static void ConstAdd(Matrix_t &A, Scalar_t beta) {
770 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
771 TCuda<AFloat>::ConstAdd(tmp,beta);
772 }
773
774 /** Multiply the constant \p beta to all the elements of matrix \p A and write the
775 * result into \p A.
776 */
777 static void ConstMult(Matrix_t &A, Scalar_t beta) {
778 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
779 TCuda<AFloat>::ConstMult(tmp,beta);
780 }
781
782 /** Reciprocal each element of the matrix \p A and write the result into
783 * \p A
784 */
785 static void ReciprocalElementWise(Matrix_t &A) {
786 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
787 TCuda<AFloat>::ReciprocalElementWise(tmp);
788 }
789
790 /** Square each element of the matrix \p A and write the result into
791 * \p A
792 */
793 static void SquareElementWise(Matrix_t &A) {
794 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
795 TCuda<AFloat>::SquareElementWise(tmp);
796 }
797
798 /** Square root each element of the matrix \p A and write the result into
799 * \p A
800 */
801 //static void SqrtElementWise(Matrix_t &A, Scalar_t alpha = 1, Scalar_t beta = 0, Scalar_t gamma = 0) {
802 static void SqrtElementWise(Matrix_t &A) {
803 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
804 TCuda<AFloat>::SqrtElementWise(tmp);
805 }
806
807 // optimizer functions
808 static void AdamUpdate(Matrix_t & A, const Matrix_t & M, const Matrix_t & V, Scalar_t alpha, Scalar_t eps) {
809 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(),1);
810 TCudaMatrix<AFloat> tmpM(M.GetDeviceBuffer(), M.GetSize(),1);
811 TCudaMatrix<AFloat> tmpV(V.GetDeviceBuffer(), V.GetSize(),1);
812 TCuda<AFloat>::AdamUpdate(tmpA, tmpM, tmpV,alpha, eps);
813 }
814 static void AdamUpdateFirstMom(Matrix_t & A, const Matrix_t & B, Scalar_t beta) {
815 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(),1);
816 TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), B.GetSize(),1);
817 TCuda<AFloat>::AdamUpdateFirstMom(tmpA, tmpB, beta);
818 }
819 static void AdamUpdateSecondMom(Matrix_t & A, const Matrix_t & B, Scalar_t beta) {
820 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(),1);
821 TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), B.GetSize(),1);
822 TCuda<AFloat>::AdamUpdateSecondMom(tmpA, tmpB, beta);
823 }
824
825 // printing of tensor
826 static void PrintTensor( const Tensor_t & A, const std::string name = "tensor", bool = true);
827
828 static void PrintTensor4dDescriptor(TensorDescriptor_t descriptor);
829 static void PrintTensorNdDescriptor(TensorDescriptor_t descriptor, int n = 10);
830
831 ///////////////////////////////////////////////////////////////////////////////
832 /// extra functions defined only for CPU architecture !!!
833 //////////////////////////////////////////////////////////////////////////////
834
835 /** Sum rows of (m x n) matrix \p A and write the results into the first
836 * m elements in \p B.
837 */
838 static void SumRows(Matrix_t &B, const Matrix_t &A);
839};
840
841
842//____________________________________________________________________________
843template <typename AFloat>
844template <typename ATensor>
846 const ATensor &A)
847{
848
849 // should add static assert that A has not to be same type as B
850
851 // this copying tensors from different architectures
852 if (B.GetLayout() == GetTensorLayout()) {
853 if ( B.GetShape().size() == 4) {
854 assert(B.GetShape().size() == 4);
855 size_t firstSize = (A.GetLayout() == GetTensorLayout()) ? A.GetShape()[0] : A.GetShape().back();
856 for (size_t i = 0; i < firstSize; ++i) {
857 TMatrixT<AFloat> matIn = A.At(i).GetMatrix(); // this convert tensor (B,D,HW) in (D,HW)i -> (D,HW)i
858 // TMAtrix has the correct layout (row-wise) no need to traspose in this case
859 TCudaTensor<AFloat> tmpOut = B.At(i); // matrix (D,HW)
860 // copy will copy the buffer
861 TCudaTensor<AFloat> tmpIn(matIn.GetMatrixArray(), tmpOut.GetShape(), tmpOut.GetLayout());
862 Copy(tmpOut, tmpIn);
863 }
864 }
865 else {
866 // for RNN weights
870 Copy(B, tA);
871 }
872 } else {
873 // case of same layout (column major)
877 Copy(B, tA);
878 }
879}
880
881//____________________________________________________________________________
882template <typename AFloat>
883template <typename AMatrix>
885{
886 // copy from another architecture using the reference one
887 // this is not very efficient since creates temporary objects
888 TMatrixT<AFloat> tmp = A; // .GetMatrix();
889 // we need to traspose for different layout
890 if (B.GetLayout() == GetTensorLayout() ) {
891 // this is for CNN weights that are in row-major formats
892 //assert(B.GetShape().size() == 4); // weights shape should be 4
893 tmp.T();
894 }
897 Copy(B, tA);
898}
899
900//____________________________________________________________________________
901template <typename AFloat>
902template <typename AMatrix_t>
903void TCudnn<AFloat>::CopyDiffArch(std::vector<Tensor_t> &B,
904 const std::vector<AMatrix_t> &A)
905{
906 for (size_t i = 0; i < B.size(); ++i) {
907 CopyWeightsDiffArch(B[i], A[i]);
908 }
909}
910
911template <typename AFloat>
912void TCudnn<AFloat>::PrintTensor(const typename TCudnn<AFloat>::Tensor_t & A, const std::string name, bool truncate )
913{
914 std::cout << name << " size = " << A.GetSize() << " shape = { ";
915 auto shape = A.GetShape();
916 for (size_t k = 0; k < shape.size()-1; ++k)
917 std::cout << shape[k] << " , ";
918 std::cout << shape.back() << " } ";
919 std::cout << " strides = { ";
920 auto strides = A.GetStrides();
921 for (size_t k = 0; k < strides.size()-1; ++k)
922 std::cout << strides[k] << " , ";
923 std::cout << strides.back() << " }\n ";
924 if (A.GetShape().size() == 1 ) {
925 size_t n = A.GetShape()[0];
926 if (truncate) n = std::min(n,size_t(10));
927 for (size_t j = 0; j < n; ++j) {
928 std::cout << A(0,j) << " ";
929 }
930 if (truncate && n < A.GetShape()[0]) std::cout << " ...... ";
931 std::cout << " } " << std::endl;
932 } else if (A.GetShape().size() == 2 ) {
933 size_t n1 = A.GetShape()[0];
934 size_t n2 = A.GetShape()[1];
935 if (truncate) n1 = std::min(n1,size_t(10));
936 for (size_t i = 0; i < n1; ++i) {
937 std::cout << "{ ";
938 if (truncate) n2 = std::min(n2,size_t(10));
939 for (size_t j = 0; j < n2; ++j) {
940 std::cout << A(i,j) << " ";
941 }
942 if (truncate && n2 < A.GetShape()[1]) std::cout << " ...... ";
943 std::cout << " } " << std::endl;
944 }
945 if (truncate && n1 < A.GetShape()[0]) std::cout << " ...............\n";
946 } else if (A.GetShape().size() == 3 ) {
947 size_t n1 = A.GetFirstSize();
948 size_t n2 = A.GetHSize();
949 size_t n3 = A.GetWSize();
950 if (truncate) n1 = std::min(n1,size_t(10));
951 if (truncate) n2 = std::min(n2,size_t(10));
952 if (truncate) n3 = std::min(n3,size_t(10));
953 for (size_t i = 0; i < n1; ++i) {
954 std::cout << "{ ";
955 for (size_t j = 0; j < n2; ++j) {
956 std::cout << "{ ";
957 for (size_t k = 0; k < n3; ++k) {
958 std::cout << A(i,j,k) << " ";
959 }
960 if (truncate && n3 < A.GetWSize()) std::cout << " ...... ";
961 std::cout << " } " << std::endl;
962 }
963 if (truncate && n2 < A.GetHSize()) std::cout << ".................\n";
964 std::cout << " } " << std::endl;
965 }
966 if (truncate && n1 < A.GetFirstSize()) std::cout << "...................\n";
967 } else if (A.GetShape().size() == 4 ) {
968 for (size_t i = 0; i < A.GetShape()[0]; ++i) {
969 std::cout << "{ ";
970 for (size_t j = 0; j < A.GetShape()[1]; ++j) {
971 std::cout << "{ ";
972 for (size_t k = 0; k < A.GetShape()[2]; ++k) {
973 size_t n = A.GetShape()[3];
974 if (truncate) n = std::min(n,size_t(10));
975 for (size_t l = 0; l < n; ++l) {
976 std::cout << A(i,j,k,l) << " ";
977 }
978 if (truncate && n < A.GetShape()[3]) std::cout << " ...... ";
979 std::cout << " } " << std::endl;
980 }
981 std::cout << " } " << std::endl;
982 }
983 std::cout << " } " << std::endl;
984 }
985 }
986 else {
987 for (size_t l = 0; l < A.GetSize(); ++l) {
988 std::cout << A.GetData()[l] << " ";
989 }
990 std::cout << "\n";
991 }
992}
993
994template <typename AFloat>
996 int n, c, h, w = 0;
997 int s1, s2, s3, s4 = 0;
1000 std::cout << "Descriptor for 4d tensor of shape { " << n << " , " << c << " , " << h << " , " << w << " }"
1001 << " and strides { " << s1 << " , " << s2 << " , " << s3 << " , " << s4 << " }" << std::endl;
1002}
1003template <typename AFloat>
1004void TCudnn<AFloat>::PrintTensorNdDescriptor(TensorDescriptor_t descriptor, int ndim)
1005{
1006 int n = 0;
1007 std::vector<int> dims(ndim);
1008 std::vector<int> strides(ndim);
1010 cudnnGetTensorNdDescriptor(descriptor, ndim, &dataType, &n, dims.data(), strides.data());
1011 dims.resize(n);
1012 strides.resize(n);
1013 std::cout << "Descriptor for Nd tensor of dim = " << n << " shape { ";
1014 for (auto d : dims)
1015 std::cout << d << " , ";
1016 std::cout << "} and strides { ";
1017 for (auto s : strides)
1018 std::cout << s << " , ";
1019 std::cout << " }" << std::endl;
1020}
1021
1022// initialize the CNN options
1023// possible options for forward (from 0 to 7)
1024//
1025// 0 : CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
1026// 1 : CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
1027// 6 : CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
1028// 7 : CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; (lots of memory)
1029
1030// for backward data (from 0 to 5)
1031// 1 : CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
1032// 5 CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
1033
1034template <typename AFloat>
1036template <typename AFloat>
1038template <typename AFloat>
1040template <typename AFloat>
1041Long_t TCudnn<AFloat>::CNNOptions::ConvMaxWorkspaceSize = -1; // -1 let use Cudnn defaults
1042
1043} // namespace DNN
1044} // namespace TMVA
1045
1046#endif
1047#endif
#define d(i)
Definition RSha256.hxx:102
#define c(i)
Definition RSha256.hxx:101
#define s1(x)
Definition RSha256.hxx:91
#define h(i)
Definition RSha256.hxx:106
long Long_t
Definition RtypesCore.h:54
#define X(type, name)
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
char name[80]
Definition TGX11.cxx:110
void PrintTensor(RTensor< T > &t)
This is the base class for the ROOT Random number generators.
Definition TRandom.h:27
T Sum(const RVec< T > &v, const T zero=T(0))
Sum elements of an RVec.
Definition RVec.hxx:1954
Double_t y[n]
Definition legend1.C:17
Double_t x[n]
Definition legend1.C:17
const Int_t n
Definition legend1.C:16
void Copy(void *source, void *dest)
__global__ void SymmetricRelu(AFloat *A, int m, int n)
Definition Kernels.cuh:590
__global__ void SigmoidDerivative(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:524
__global__ void Dropout(AFloat *A, int m, int n, AFloat dropoutProbability, curandState_t *state)
Definition Kernels.cuh:964
__global__ void SoftmaxCrossEntropyGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
Definition Kernels.cuh:882
__global__ void IdentityDerivative(AFloat *A, int m, int n)
Definition Kernels.cuh:450
__global__ void SqrtElementWise(AFloat *A, int m, int n)
Definition Kernels.cuh:391
__global__ void AdamUpdate(AFloat *A, const AFloat *M, const AFloat *V, int m, int n, AFloat alpha, AFloat eps)
optimizer kernel functions
Definition Kernels.cuh:408
__global__ void SoftmaxCrossEntropy(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
Definition Kernels.cuh:851
__global__ void AddL1RegularizationGradients(AFloat *A, const AFloat *B, AFloat weightDecay, int m, int n)
Definition Kernels.cuh:767
__global__ void MeanSquaredErrorGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
Definition Kernels.cuh:750
__global__ void Relu(AFloat *A, int m, int n)
Definition Kernels.cuh:463
__global__ void ReluDerivative(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:478
__global__ void AddL2RegularizationGradients(AFloat *A, const AFloat *B, AFloat weightDecay, int m, int n)
Definition Kernels.cuh:784
__global__ void AddRowWise(AFloat *W, const AFloat *theta, int m, int n)
Definition Kernels.cuh:307
__global__ void ConstMult(AFloat *A, AFloat beta, int m, int n)
Definition Kernels.cuh:349
__global__ void GaussDerivative(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:665
__global__ void Deflatten(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
Deflatten a 2D-array into an array of 2D-arrays.
Definition Kernels.cuh:1225
__global__ void CrossEntropy(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
Definition Kernels.cuh:800
__global__ void Softmax(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:540
__global__ void TanhDerivative(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:574
__global__ void CrossEntropyGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
Definition Kernels.cuh:831
__global__ void ConstAdd(AFloat *A, AFloat beta, int m, int n)
Definition Kernels.cuh:335
__global__ void SymmetricReluDerivative(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:604
__global__ void MeanSquaredError(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
Definition Kernels.cuh:681
__global__ void SquareElementWise(AFloat *A, int m, int n)
Definition Kernels.cuh:377
__global__ void SoftSignDerivative(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:634
__global__ void Hadamard(AFloat *B, const AFloat *A, int m, int n)
Definition Kernels.cuh:321
__global__ void AdamUpdateFirstMom(AFloat *A, const AFloat *B, int m, int n, AFloat beta)
Definition Kernels.cuh:422
__global__ void ReciprocalElementWise(AFloat *A, int m, int n)
Definition Kernels.cuh:363
__global__ void Downsample(AFloat *output, AFloat *indexMatrix, const AFloat *input, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols)
Downsampling kernel used as the forward propagation step of a Max-Pooling layer.
Definition Kernels.cuh:1002
__global__ void AdamUpdateSecondMom(AFloat *A, const AFloat *B, int m, int n, AFloat beta)
Definition Kernels.cuh:436
std::shared_ptr< std::function< double(double)> > Tanh
Definition NeuralNet.cxx:29
std::shared_ptr< std::function< double(double)> > Gauss
Definition NeuralNet.cxx:12
std::shared_ptr< std::function< double(double)> > Sigmoid
Definition NeuralNet.cxx:26
std::shared_ptr< std::function< double(double)> > SoftSign
Definition NeuralNet.cxx:32
MemoryLayout
Memory layout type (copy from RTensor.hxx)
Definition CudaTensor.h:47
create variable transformations
TLine l
Definition textangle.C:4
static void output()