17#ifndef TMVA_DNN_ARCHITECTURES_CUDA_KERNELS 
   18#define TMVA_DNN_ARCHITECTURES_CUDA_KERNELS 
   30template<
typename AFloat>
 
   36   unsigned long long int* 
address_as_ull = (
unsigned long long int*)address;
 
 
   54template<
typename AFloat>
 
   65   if ((
blockDim.y > 512) && (i < 512)) {
 
   72   if ((
blockDim.y > 256) && (i < 256)) {
 
   78   if ((
blockDim.y > 128) && (i < 128)) {
 
 
  133template<
typename AFloat>
 
  206template<
typename AFloat>
 
  255template<
typename AFloat>
 
  306template<
typename AFloat>
 
  308                           const AFloat * theta,
 
  315   if ((i < 
m) && (
j < 
n))
 
 
  320template<
typename AFloat>
 
  329   if ((i < 
m) && (
j < 
n))
 
 
  334template<
typename AFloat>
 
  342   if ((i < 
m) && (
j < 
n)) {
 
 
  348template<
typename AFloat>
 
  356   if ((i < 
m) && (
j < 
n)) {
 
 
  362template<
typename AFloat>
 
  370   if ((i < 
m) && (
j < 
n)) {
 
 
  376template<
typename AFloat>
 
  384   if ((i < 
m) && (
j < 
n)) {
 
 
  390template<
typename AFloat>
 
  398   if ((i < 
m) && (
j < 
n)) {
 
 
  407template<
typename AFloat>
 
  409                           int m, 
int n, AFloat alpha, AFloat eps)
 
  415   if ((i < 
m) && (
j < 
n)) {
 
 
  421template<
typename AFloat>
 
  423                           int m, 
int n, AFloat beta)
 
  429   if ((i < 
m) && (
j < 
n)) {
 
 
  435template<
typename AFloat>
 
  437                           int m, 
int n, AFloat beta)
 
  443   if ((i < 
m) && (
j < 
n)) {
 
 
  449template<
typename AFloat>
 
  457   if ((i < 
m) && (
j < 
n))
 
 
  462template<
typename AFloat>
 
  470   if ((i < 
m) && (
j < 
n)) {
 
  472      A[
index] = (
x < 0.0) ? 0.0 : 
x;
 
 
  477template<
typename AFloat>
 
  479                               const AFloat * A, 
int m, 
int n)
 
  485   if ((i < 
m) && (
j < 
n)) {
 
  487      B[
index] = (
x < 0.0) ? 0.0 : 1.0;
 
 
  492template<
typename AFloat>
 
  500   if ((i < 
m) && (
j < 
n)) {
 
  501      AFloat sig = 1.0 / (1.0 + exp(-A[
index]));
 
 
  507template<
typename AFloat>
 
  516   if ((i < 
m) && (
j < 
n)) {
 
  517      AFloat sig = 1.0 / (1.0 + exp(-A[
index]));
 
 
  523template<
typename AFloat>
 
  532   if ((i < 
m) && (
j < 
n)) {
 
  533      AFloat sig = 1.0 / (1.0 + exp(-A[
index]));
 
  534      B[
index] = sig * (1.0 - sig);
 
 
  539template<
typename AFloat>
 
  548      for (
int j = 0; 
j < 
n; 
j++) {
 
  549         sum += exp(A[i + 
j * 
n]);
 
  551      for (
int j = 0; 
j < 
n; 
j++) {
 
  552         B[i + 
j * 
n] = exp(A[i * 
n + 
j]) / 
sum;
 
 
  558template<
typename AFloat>
 
  566   if ((i < 
m) && (
j < 
n)) {
 
  567      AFloat t = ::tanh(A[
index]);
 
 
  573template<
typename AFloat>
 
  582   if ((i < 
m) && (
j < 
n)) {
 
  583      AFloat t = ::tanh(A[
index]);
 
 
  589template<
typename AFloat>
 
  597   if ((i < 
m) && (
j < 
n)) {
 
 
  603template<
typename AFloat>
 
  612   if ((i < 
m) && (
j < 
n)) {
 
 
  618template<
typename AFloat>
 
  626   if ((i < 
m) && (
j < 
n)) {
 
 
  633template<
typename AFloat>
 
  642   if ((i < 
m) && (
j < 
n)) {
 
  643      AFloat 
x = 1.0 + fabs(A[
index]);
 
 
  649template<
typename AFloat>
 
  657   if ((i < 
m) && (
j < 
n)) {
 
 
  664template<
typename AFloat>
 
  673   if ((i < 
m) && (
j < 
n)) {
 
  675      B[
index] = - 2.0 * 
x * exp(- 
x * 
x);
 
 
  680template<
typename AFloat>
 
  684                                 const AFloat * weights,
 
  694   if ((i < 
m) && (
j < 
n)) {
 
  695       AFloat 
w = weights[i];
 
  696       AFloat 
norm = 1 / ((AFloat) (
m * 
n));
 
 
  706template<
typename AFloat>
 
  718   if ((i < 
m) && (
j < 
n)) {
 
 
  728template<
typename AFloat>
 
  740   if ((i < 
m) && (
j < 
n)) {
 
 
  749template<
typename AFloat>
 
  753                                          const AFloat * weights,
 
  760   if ((i < 
m) && (
j < 
n)) {
 
 
  766template<
typename AFloat>
 
  776   if ((i < 
m) && (
j < 
n)) {
 
  777       AFloat 
sign = (B[
index] < 0.0) ? -1.0 : 1.0;
 
 
  783template<
typename AFloat>
 
  793   if ((i < 
m) && (
j < 
n)) {
 
 
  799template<
typename AFloat>
 
  803                             const AFloat * weights,
 
  813   if ((i < 
m) && (
j < 
n)) {
 
  814      AFloat 
norm = 1 / ((AFloat) (
m * 
n));
 
  816      AFloat 
lr = std::log(1. + exp(-
x));
 
  817      if (
x < -75.) 
lr = -
x;
 
  818      else if (
x > 75.) 
lr = exp(-
x);
 
 
  830template<
typename AFloat>
 
  834                                      const AFloat * weights,
 
  841   if ((i < 
m) && (
j < 
n)) {
 
  842      AFloat 
norm = 1 / ((AFloat) (
m * 
n));
 
 
  850template<
typename AFloat>
 
  854                                    const AFloat * weights,
 
  861   AFloat 
norm = 1.0 / ((AFloat) 
m);
 
  866      for (
int j = 0; 
j < 
n; 
j++) {
 
  869      for (
int j = 0; 
j < 
n; 
j++) {
 
 
  881template<
typename AFloat>
 
  885                                             const AFloat * weights,
 
  889   AFloat 
norm = 1.0 / ((AFloat) 
m);
 
  894      for (
int j = 0; 
j < 
n; 
j++) {
 
  898      for (
int j = 0; 
j < 
n; 
j++) {
 
 
  906template<
typename AFloat>
 
  917   if ((i < 
m) && (
j < 
n))
 
 
  926template<
typename AFloat>
 
  939   if ((i < 
m) && (
j < 
n)) {
 
 
  948template<
typename AFloat>
 
  954   if (i >= 
m || 
j >= 
n) 
return;
 
 
  963template<
typename AFloat>
 
  972   if ((i < 
m) && (
j < 
n)) {
 
 
 1001template<
typename AFloat>
 
 1058template<
typename AFloat>
 
 1063                                int strideRows, 
int strideCols)
 
 1088            if (row >= 
height || col >= 
width || col < 0 || row < 0) 
continue;
 
 
 1101template<
typename AFloat>
 
 1110   int jump = filterHeight * filterWidth;
 
 
 1117template<
typename AFloat>
 
 1124   A[i + 
j * 
nRows] += B[i];
 
 
 1127template<
typename AFloat>
 
 1135    for (
size_t event = 0; 
event < batchSize; 
event++) {
 
 
 1141template<
typename AFloat>
 
 1170template<
typename AFloat>
 
 1190template<
typename AFloat>
 
 1224template<
typename AFloat>
 
 1242template<
typename AFloat>
 
 
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
 
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
 
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
 
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t r
 
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
 
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
 
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
 
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t height
 
static constexpr int BlockSize
 
__global__ void SymmetricRelu(AFloat *A, int m, int n)
 
__global__ void UpdateWeights(AFloat *A, const AFloat **B, int batchSize, int nRows, int nCols)
 
__device__ int calculateDimension(int imgDim, int fltDim, int padding, int stride)
Calculate the dimension of an output volume, given the sliding parameters and the input shape.
 
__global__ void SigmoidDerivative(AFloat *B, const AFloat *A, int m, int n)
 
__device__ AFloat AtomicAdd(AFloat *address, AFloat val)
 
__global__ void Dropout(AFloat *A, int m, int n, AFloat dropoutProbability, curandState_t *state)
 
__global__ void SoftmaxCrossEntropyGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
 
__global__ void SumColumns(AFloat *B, const AFloat *A, int m, int n)
 
__global__ void IdentityDerivative(AFloat *A, int m, int n)
 
__global__ void SqrtElementWise(AFloat *A, int m, int n)
 
__global__ void AdamUpdate(AFloat *A, const AFloat *M, const AFloat *V, int m, int n, AFloat alpha, AFloat eps)
optimizer kernel functions
 
__global__ void SoftmaxCrossEntropy(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
 
__global__ void AddL1RegularizationGradients(AFloat *A, const AFloat *B, AFloat weightDecay, int m, int n)
 
__device__ void ReduceSumVertical(AFloat *result, AFloat *sdata, int n)
 
__global__ void MeanSquaredErrorGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
 
__global__ void Relu(AFloat *A, int m, int n)
 
__global__ void ReluDerivative(AFloat *B, const AFloat *A, int m, int n)
 
__global__ void AbsoluteSum(AFloat *result, const AFloat *A, int m, int n)
 
__global__ void AddL2RegularizationGradients(AFloat *A, const AFloat *B, AFloat weightDecay, int m, int n)
 
__device__ AFloat max(AFloat x, AFloat y)
 
__global__ void AddRowWise(AFloat *W, const AFloat *theta, int m, int n)
 
__global__ void ConstMult(AFloat *A, AFloat beta, int m, int n)
 
__global__ void GaussDerivative(AFloat *B, const AFloat *A, int m, int n)
 
__global__ void Deflatten(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
Deflatten a 2D-array into an array of 2D-arrays.
 
__global__ void Flatten(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
Flatten an array of 2D-arrays into a single 2D-array.
 
__global__ void CrossEntropy(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
 
__global__ void Softmax(AFloat *B, const AFloat *A, int m, int n)
 
__global__ void RotateWeights(AFloat *A, const AFloat *B, int filterDepth, int filterHeight, int filterWidth, int numFilters)
 
__global__ void TanhDerivative(AFloat *B, const AFloat *A, int m, int n)
 
__global__ void CrossEntropyGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
 
__global__ void ReduceMatrix(AFloat *result, const AFloat *A, int m, int n)
 
__global__ void Im2Col(AFloat *A, const AFloat *B, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols, int zeroPaddingHeight, int zeroPaddingWidth)
A kernel that re-arranges image regions of the input matrix \B, into column vectors in matrix \A.
 
__global__ void DeflattenRM(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
 
__global__ void ConstAdd(AFloat *A, AFloat beta, int m, int n)
 
__global__ void SymmetricReluDerivative(AFloat *B, const AFloat *A, int m, int n)
 
__global__ void MeanSquaredError(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
 
__global__ void SquareElementWise(AFloat *A, int m, int n)
 
__global__ void SoftSignDerivative(AFloat *B, const AFloat *A, int m, int n)
 
__global__ void Reshape(AFloat *A, const AFloat *B, int nRowsA, int nColsA, int nRowsB, int nColsB)
 
__global__ void Hadamard(AFloat *B, const AFloat *A, int m, int n)
 
__global__ void AlmostEquals(bool *result, const AFloat *A, const AFloat *B, double epsilon, int m, int n)
 
__global__ void FlattenRM(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
 
__global__ void SquaredSum(AFloat *result, const AFloat *A, int m, int n)
 
__global__ void AdamUpdateFirstMom(AFloat *A, const AFloat *B, int m, int n, AFloat beta)
 
__global__ void ReciprocalElementWise(AFloat *A, int m, int n)
 
__device__ void ReduceSum(AFloat *result, AFloat *sdata)
 
__global__ void MaxPoolBackward(AFloat *activationGradientsBackward, const AFloat *activationGradients, const AFloat *indexMatrix, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols)
Back-propagate the gradients through a max-pooling layer.
 
__global__ void Downsample(AFloat *output, AFloat *indexMatrix, const AFloat *input, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols)
Downsampling kernel used as the forward propagation step of a Max-Pooling layer.
 
__global__ void AdamUpdateSecondMom(AFloat *A, const AFloat *B, int m, int n, AFloat beta)
 
__global__ void AddBiases(AFloat *A, const AFloat *B, int nRows, int nCols)
 
std::shared_ptr< std::function< double(double)> > Tanh
 
double weightDecay(double error, ItWeight itWeight, ItWeight itWeightEnd, double factorWeightDecay, EnumRegularization eRegularization)
compute the weight decay for regularization (L1 or L2)
 
std::shared_ptr< std::function< double(double)> > Gauss
 
std::shared_ptr< std::function< double(double)> > Sigmoid
 
std::shared_ptr< std::function< double(double)> > SoftSign
 
create variable transformations
 
static uint64_t sum(uint64_t i)