17#ifndef TMVA_DNN_ARCHITECTURES_CUDA_KERNELS
18#define TMVA_DNN_ARCHITECTURES_CUDA_KERNELS
30template<
typename AFloat>
31__device__ AFloat
AtomicAdd(AFloat* address, AFloat val);
/// Atomically add `val` to the double stored at `address`.
///
/// Devices of compute capability 6.0 and newer provide a native
/// double-precision atomicAdd, which is used directly. Older devices fall
/// back to a compare-and-swap loop operating on the 64-bit pattern of the
/// stored value.
///
/// \param address  Location of the accumulator to update.
/// \param val      Value to add to the accumulator.
/// \return The value held at `address` immediately before this addition.
__device__
double AtomicAdd(double* address, double val)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
   // Hardware double-precision atomic add (requires SM60+).
   return atomicAdd(address, val);
#else
   unsigned long long int* address_as_ull = (unsigned long long int*)address;
   unsigned long long int old = *address_as_ull;
   unsigned long long int assumed;
   do {
      // Remember the value the sum is based on; atomicCAS only stores the
      // new sum if the memory still holds exactly this bit pattern.
      assumed = old;
      old = atomicCAS(address_as_ull, assumed,
                      __double_as_longlong(val + __longlong_as_double(assumed)));
      // The comparison is done on the integer bit patterns, so a stored NaN
      // (which never compares equal as a double) cannot spin forever.
   } while (assumed != old);
   return __longlong_as_double(old);
#endif
}
/// Atomically add `val` to the float stored at `address`.
///
/// Thin forwarding wrapper around the built-in single-precision atomicAdd so
/// that float and double accumulators can be updated through the same name.
///
/// \param address  Location of the accumulator to update.
/// \param val      Value to add to the accumulator.
/// \return Whatever the built-in atomicAdd returns for this update.
__device__
float AtomicAdd(float* address, float val)
{
   const float previous = atomicAdd(address, val);
   return previous;
}
54template<
typename AFloat>
62 int index = i * blockDim.x + j;
65 if ((blockDim.y > 512) && (i < 512)) {
66 if ((i + 512) < blockDim.y) {
67 sdata[
index] += sdata[
index + 512 * blockDim.x];
72 if ((blockDim.y > 256) && (i < 256)) {
73 if ((i + 256) < blockDim.y) {
74 sdata[
index] += sdata[
index + 256 * blockDim.x];
78 if ((blockDim.y > 128) && (i < 128)) {
79 if ((i + 128) < blockDim.y) {
80 sdata[
index] += sdata[
index + 128 * blockDim.x];
84 if ((blockDim.y > 64) && (i < 64)) {
85 if ((i + 64) < blockDim.y) {
86 sdata[
index] += sdata[
index + 64 * blockDim.x];
90 if ((blockDim.y > 32) && (i < 32)) {
91 if ((i + 32) < blockDim.y) {
92 sdata[
index] += sdata[
index + 32 * blockDim.x];
96 if ((blockDim.y > 16) && (i < 16)) {
97 if ((i + 16) < blockDim.y) {
98 sdata[
index] += sdata[
index + 16 * blockDim.x];
102 if ((blockDim.y > 8) && (i < 8)) {
103 if ((i + 8) < blockDim.y) {
104 sdata[
index] += sdata[
index + 8 * blockDim.x];
108 if ((blockDim.y > 4) && (i < 4)) {
109 if ((i + 4) < blockDim.y) {
110 sdata[
index] += sdata[
index + 4 * blockDim.x];
114 if ((blockDim.y > 2) && (i < 2)) {
115 if ((i + 2) < blockDim.y) {
116 sdata[
index] += sdata[
index + 2 * blockDim.x];
120 if ((blockDim.y > 1) && (i < 1)) {
121 if ((i + 1) < blockDim.y) {
122 sdata[
index] += sdata[
index + 1 * blockDim.x];
126 if ((i == 0) && ((blockIdx.x * blockDim.x + threadIdx.x) <
n)) {
133template<
typename AFloat>
136 int tid = threadIdx.x + threadIdx.y * blockDim.x;
141 sdata[tid] += sdata[tid + 512];
148 sdata[tid] += sdata[tid + 256];
154 sdata[tid] += sdata[tid + 128];
160 sdata[tid] += sdata[tid + 64];
166 sdata[tid] += sdata[tid + 32];
172 sdata[tid] += sdata[tid + 16];
178 sdata[tid] += sdata[tid + 8];
184 sdata[tid] += sdata[tid + 4];
190 sdata[tid] += sdata[tid + 2];
196 sdata[tid] += sdata[tid + 1];
206template<
typename AFloat>
207__device__ AFloat
max(AFloat
x, AFloat
y)
229 return ((imgDim - fltDim + 2 * padding) / stride) + 1;
255template<
typename AFloat>
265 int zeroPaddingHeight,
266 int zeroPaddingWidth)
269 int i = blockDim.y * blockIdx.y + threadIdx.y;
272 int j = blockDim.x * blockIdx.x + threadIdx.x;
275 int NLocalViewPixels = fltHeight * fltWidth * depth;
278 int NLocalViews =
calculateDimension(imgWidth, fltWidth, zeroPaddingWidth, strideCols) *
281 if (i >= NLocalViews || j >= NLocalViewPixels)
return;
283 int index = j * NLocalViews + i;
285 int numSlidesPerRow =
calculateDimension(imgWidth, fltWidth, zeroPaddingWidth, strideCols);
288 int bz = j / (fltHeight * fltWidth);
291 int by = (i / numSlidesPerRow) * strideRows - zeroPaddingHeight + (j - bz * fltHeight * fltWidth) / fltWidth;
294 int bx = (i % numSlidesPerRow) * strideCols - zeroPaddingWidth + (j - bz * fltHeight * fltWidth) % fltWidth;
296 if (bx < 0 || by < 0 || bx >= imgWidth || by >= imgHeight) {
301 A[
index] = B[(bx + by * imgWidth) * depth + bz];
306template<
typename AFloat>
308 const AFloat * theta,
311 int i = blockDim.y * blockIdx.y + threadIdx.y;
312 int j = blockDim.x * blockIdx.x + threadIdx.x;
315 if ((i <
m) && (j <
n))
316 W[
index] += theta[j];
320template<
typename AFloat>
325 int i = blockDim.y * blockIdx.y + threadIdx.y;
326 int j = blockDim.x * blockIdx.x + threadIdx.x;
329 if ((i <
m) && (j <
n))
334template<
typename AFloat>
338 int i = blockDim.y * blockIdx.y + threadIdx.y;
339 int j = blockDim.x * blockIdx.x + threadIdx.x;
342 if ((i <
m) && (j <
n)) {
348template<
typename AFloat>
352 int i = blockDim.y * blockIdx.y + threadIdx.y;
353 int j = blockDim.x * blockIdx.x + threadIdx.x;
356 if ((i <
m) && (j <
n)) {
362template<
typename AFloat>
366 int i = blockDim.y * blockIdx.y + threadIdx.y;
367 int j = blockDim.x * blockIdx.x + threadIdx.x;
370 if ((i <
m) && (j <
n)) {
376template<
typename AFloat>
380 int i = blockDim.y * blockIdx.y + threadIdx.y;
381 int j = blockDim.x * blockIdx.x + threadIdx.x;
384 if ((i <
m) && (j <
n)) {
390template<
typename AFloat>
394 int i = blockDim.y * blockIdx.y + threadIdx.y;
395 int j = blockDim.x * blockIdx.x + threadIdx.x;
398 if ((i <
m) && (j <
n)) {
407template<
typename AFloat>
408__global__
void AdamUpdate(AFloat * A,
const AFloat * M,
const AFloat * V,
409 int m,
int n, AFloat alpha, AFloat eps)
411 int i = blockDim.y * blockIdx.y + threadIdx.y;
412 int j = blockDim.x * blockIdx.x + threadIdx.x;
415 if ((i <
m) && (j <
n)) {
421template<
typename AFloat>
423 int m,
int n, AFloat beta)
425 int i = blockDim.y * blockIdx.y + threadIdx.y;
426 int j = blockDim.x * blockIdx.x + threadIdx.x;
429 if ((i <
m) && (j <
n)) {
435template<
typename AFloat>
437 int m,
int n, AFloat beta)
439 int i = blockDim.y * blockIdx.y + threadIdx.y;
440 int j = blockDim.x * blockIdx.x + threadIdx.x;
443 if ((i <
m) && (j <
n)) {
449template<
typename AFloat>
453 int i = blockDim.y * blockIdx.y + threadIdx.y;
454 int j = blockDim.x * blockIdx.x + threadIdx.x;
457 if ((i <
m) && (j <
n))
462template<
typename AFloat>
463__global__
void Relu(AFloat * A,
466 int i = blockDim.y * blockIdx.y + threadIdx.y;
467 int j = blockDim.x * blockIdx.x + threadIdx.x;
470 if ((i <
m) && (j <
n)) {
472 A[
index] = (
x < 0.0) ? 0.0 :
x;
477template<
typename AFloat>
479 const AFloat * A,
int m,
int n)
481 int i = blockDim.y * blockIdx.y + threadIdx.y;
482 int j = blockDim.x * blockIdx.x + threadIdx.x;
485 if ((i <
m) && (j <
n)) {
487 B[
index] = (
x < 0.0) ? 0.0 : 1.0;
492template<
typename AFloat>
496 int i = blockDim.y * blockIdx.y + threadIdx.y;
497 int j = blockDim.x * blockIdx.x + threadIdx.x;
500 if ((i <
m) && (j <
n)) {
501 AFloat sig = 1.0 / (1.0 + exp(-A[
index]));
507template<
typename AFloat>
512 int i = blockDim.y * blockIdx.y + threadIdx.y;
513 int j = blockDim.x * blockIdx.x + threadIdx.x;
516 if ((i <
m) && (j <
n)) {
517 AFloat sig = 1.0 / (1.0 + exp(-A[
index]));
523template<
typename AFloat>
528 int i = blockDim.y * blockIdx.y + threadIdx.y;
529 int j = blockDim.x * blockIdx.x + threadIdx.x;
532 if ((i <
m) && (j <
n)) {
533 AFloat sig = 1.0 / (1.0 + exp(-A[
index]));
534 B[
index] = sig * (1.0 - sig);
/// Row-wise softmax: B(i, j) = exp(A(i, j)) / sum_k exp(A(i, k)).
///
/// One thread handles one row; the row index is derived from the x dimension
/// of the launch configuration only. Element (i, j) is addressed as
/// `i + j * m`, i.e. with the row count as the stride between columns
/// (assumes the same column-major m x n layout that the other kernels in
/// this file index with `j * m + i` -- TODO confirm against the callers).
///
/// \param B  Output matrix, m x n.
/// \param A  Input matrix, m x n.
/// \param m  Number of rows.
/// \param n  Number of columns (entries normalised per row).
template<typename AFloat>
__global__ void Softmax(AFloat * B,
                        const AFloat * A,
                        int m, int n)
{
   int i = blockDim.x * blockIdx.x + threadIdx.x;

   // Threads past the last row have nothing to do.
   if (i >= m) return;

   // Normalisation constant of row i.
   AFloat sum = 0.0;
   for (int j = 0; j < n; j++) {
      sum += exp(A[i + j * m]);
   }

   // Read and write the same (i, j) element; the column stride is the
   // number of rows m, not the number of columns n.
   for (int j = 0; j < n; j++) {
      B[i + j * m] = exp(A[i + j * m]) / sum;
   }
}
558template<
typename AFloat>
559__global__
void Tanh(AFloat * A,
562 int i = blockDim.y * blockIdx.y + threadIdx.y;
563 int j = blockDim.x * blockIdx.x + threadIdx.x;
566 if ((i <
m) && (j <
n)) {
567 AFloat t = ::tanh(A[
index]);
573template<
typename AFloat>
578 int i = blockDim.y * blockIdx.y + threadIdx.y;
579 int j = blockDim.x * blockIdx.x + threadIdx.x;
582 if ((i <
m) && (j <
n)) {
583 AFloat t = ::tanh(A[
index]);
589template<
typename AFloat>
593 int i = blockDim.y * blockIdx.y + threadIdx.y;
594 int j = blockDim.x * blockIdx.x + threadIdx.x;
597 if ((i <
m) && (j <
n)) {
603template<
typename AFloat>
608 int i = blockDim.y * blockIdx.y + threadIdx.y;
609 int j = blockDim.x * blockIdx.x + threadIdx.x;
612 if ((i <
m) && (j <
n)) {
618template<
typename AFloat>
622 int i = blockDim.y * blockIdx.y + threadIdx.y;
623 int j = blockDim.x * blockIdx.x + threadIdx.x;
626 if ((i <
m) && (j <
n)) {
633template<
typename AFloat>
638 int i = blockDim.y * blockIdx.y + threadIdx.y;
639 int j = blockDim.x * blockIdx.x + threadIdx.x;
642 if ((i <
m) && (j <
n)) {
643 AFloat
x = 1.0 + fabs(A[
index]);
649template<
typename AFloat>
653 int i = blockDim.y * blockIdx.y + threadIdx.y;
654 int j = blockDim.x * blockIdx.x + threadIdx.x;
657 if ((i <
m) && (j <
n)) {
664template<
typename AFloat>
669 int i = blockDim.y * blockIdx.y + threadIdx.y;
670 int j = blockDim.x * blockIdx.x + threadIdx.x;
673 if ((i <
m) && (j <
n)) {
675 B[
index] = - 2.0 *
x * exp(-
x *
x);
680template<
typename AFloat>
684 const AFloat * weights,
687 int i = blockDim.y * blockIdx.y + threadIdx.y;
688 int j = blockDim.x * blockIdx.x + threadIdx.x;
689 int tid = blockDim.x * threadIdx.y + threadIdx.x;
694 if ((i <
m) && (j <
n)) {
695 AFloat
w = weights[i];
696 AFloat norm = 1 / ((AFloat) (
m *
n));
698 sdata[tid] =
w * norm *
e *
e;
706template<
typename AFloat>
711 int i = blockDim.y * blockIdx.y + threadIdx.y;
712 int j = blockDim.x * blockIdx.x + threadIdx.x;
713 int tid = blockDim.x * threadIdx.y + threadIdx.x;
718 if ((i <
m) && (j <
n)) {
728template<
typename AFloat>
733 int i = blockDim.y * blockIdx.y + threadIdx.y;
734 int j = blockDim.x * blockIdx.x + threadIdx.x;
735 int tid = blockDim.x * threadIdx.y + threadIdx.x;
740 if ((i <
m) && (j <
n)) {
741 sdata[tid] = abs(A[
index]);
749template<
typename AFloat>
753 const AFloat * weights,
756 int i = blockDim.y * blockIdx.y + threadIdx.y;
757 int j = blockDim.x * blockIdx.x + threadIdx.x;
760 if ((i <
m) && (j <
n)) {
766template<
typename AFloat>
772 int i = blockDim.y * blockIdx.y + threadIdx.y;
773 int j = blockDim.x * blockIdx.x + threadIdx.x;
776 if ((i <
m) && (j <
n)) {
777 AFloat sign = (B[
index] < 0.0) ? -1.0 : 1.0;
783template<
typename AFloat>
789 int i = blockDim.y * blockIdx.y + threadIdx.y;
790 int j = blockDim.x * blockIdx.x + threadIdx.x;
793 if ((i <
m) && (j <
n)) {
799template<
typename AFloat>
803 const AFloat * weights,
806 int i = blockDim.y * blockIdx.y + threadIdx.y;
807 int j = blockDim.x * blockIdx.x + threadIdx.x;
808 int tid = blockDim.x * threadIdx.y + threadIdx.x;
813 if ((i <
m) && (j <
n)) {
814 AFloat norm = 1 / ((AFloat) (
m *
n));
816 AFloat lr = std::log(1. + exp(-
x));
817 if (
x < -75.) lr = -
x;
818 else if (
x > 75.) lr = exp(-
x);
820 AFloat ce = Y[
index] * lr + (1.0 - Y[
index]) * (
x + lr);
821 sdata[tid] = weights[i] * norm * ce;
830template<
typename AFloat>
834 const AFloat * weights,
837 int i = blockDim.y * blockIdx.y + threadIdx.y;
838 int j = blockDim.x * blockIdx.x + threadIdx.x;
841 if ((i <
m) && (j <
n)) {
842 AFloat norm = 1 / ((AFloat) (
m *
n));
845 dY[
index] = weights[i] * norm * (sig -
y);
850template<
typename AFloat>
854 const AFloat * weights,
857 int i = blockDim.y * blockIdx.y + threadIdx.y;
858 int tid = threadIdx.y;
861 AFloat norm = 1.0 / ((AFloat)
m);
866 for (
int j = 0; j <
n; j++) {
869 for (
int j = 0; j <
n; j++) {
870 sdata[tid] += Y[i + j *
m] * log(exp(
output[i + j *
m]) /
sum);
872 sdata[tid] *= -weights[i] * norm;
881template<
typename AFloat>
885 const AFloat * weights,
888 int i = blockDim.y * blockIdx.y + threadIdx.y;
889 AFloat norm = 1.0 / ((AFloat)
m);
894 for (
int j = 0; j <
n; j++) {
896 sumY += Y[i + j *
m];
898 for (
int j = 0; j <
n; j++) {
899 dY[i + j *
m] = sumY * exp(
output[i + j *
m]) /
sum - Y[i + j *
m];
900 dY[i + j *
m] *= weights[i] * norm;
906template<
typename AFloat>
911 int i = blockDim.y * blockIdx.y + threadIdx.y;
912 int j = blockDim.x * blockIdx.x + threadIdx.x;
913 int tid = threadIdx.y * blockDim.x + threadIdx.x;
917 if ((i <
m) && (j <
n))
918 smem[tid] = A[
index];
926template<
typename AFloat>
931 int i = blockDim.y * blockIdx.y + threadIdx.y;
932 int j = blockDim.x * blockIdx.x + threadIdx.x;
933 int matrixIndex = j *
m + i;
934 int blockIndex = blockDim.x * threadIdx.y + threadIdx.x;
939 if ((i <
m) && (j <
n)) {
940 smem[blockIndex] = A[matrixIndex];
942 smem[blockIndex] = 0.0;
/// Element-wise comparison of two m x n matrices with an absolute tolerance.
///
/// Each thread inspects one element (addressed as `j * m + i`) and clears
/// `result[0]` when the two matrices differ there by more than `epsilon`.
/// The kernel only ever writes `false`, so the caller has to initialise
/// `result[0]` before the launch.
///
/// \param result   Flag that is set to false on the first mismatch found.
/// \param A        First matrix.
/// \param B        Second matrix.
/// \param epsilon  Largest absolute difference still treated as equal.
/// \param m        Number of rows.
/// \param n        Number of columns.
template<typename AFloat>
__global__ void AlmostEquals(bool * result, const AFloat * A, const AFloat * B, double epsilon, int m, int n)
{
   const int row = blockDim.y * blockIdx.y + threadIdx.y;
   const int col = blockDim.x * blockIdx.x + threadIdx.x;

   if (row < m && col < n) {
      const int pos = col * m + row;
      const bool differs = fabs(A[pos] - B[pos]) > epsilon;
      if (differs) {
         // Several threads may store here concurrently, but every one of
         // them stores the same value, so the outcome does not depend on
         // which store lands last.
         result[0] = false;
      }
   }
}
963template<
typename AFloat>
966 AFloat dropoutProbability,
967 curandState_t *state)
969 int i = blockDim.y * blockIdx.y + threadIdx.y;
970 int j = blockDim.x * blockIdx.x + threadIdx.x;
971 int tid = i * gridDim.x + j;
972 if ((i <
m) && (j <
n)) {
973 float r = curand_uniform(state + tid);
974 if (
r > dropoutProbability) {
977 A[j *
m + i] /= dropoutProbability;
1001template<
typename AFloat>
1003 int imgWidth,
int fltHeight,
int fltWidth,
int strideRows,
int strideCols)
1006 int i = blockDim.y * blockIdx.y + threadIdx.y;
1009 int j = blockDim.x * blockIdx.x + threadIdx.x;
1015 if (i >= depth || j >= NLocalViews)
return;
1017 int outputIndex = j * depth + i;
1021 int rowMin = (j / numSlidesPerRow) * strideRows;
1022 int colMin = (j % numSlidesPerRow) * strideCols;
1026 AFloat maxIndex = 0;
1029 for (
size_t by = rowMin; by < rowMin + fltHeight; by++) {
1030 for (
size_t bx = colMin; bx < colMin + fltWidth; bx++) {
1031 int inputIndex = (bx + by * imgWidth) * depth + bz;
1034 maxIndex = bx + by * imgWidth;
1039 indexMatrix[outputIndex] = maxIndex;
1058template<
typename AFloat>
1060 const AFloat * activationGradients,
1061 const AFloat * indexMatrix,
1062 int depth,
int imgHeight,
int imgWidth,
int fltHeight,
int fltWidth,
1063 int strideRows,
int strideCols)
1065 int slice = blockDim.y * blockIdx.y + threadIdx.y;
1066 int j = blockDim.x * blockIdx.x + threadIdx.x;
1068 if (slice >= depth || j >= imgHeight * imgWidth)
return;
1074 int backRow = j % imgHeight;
1075 int backCol = j / imgHeight;
1078 int nextRowMin = floor((backRow - fltHeight) / (AFloat) strideRows) + 1;
1079 int nextColMin = floor((backCol - fltWidth) / (AFloat) strideCols) + 1;
1081 int outputIndex = 0;
1085 for (
int row = nextRowMin; row <= nextRowMin + fltHeight - strideRows; row++) {
1086 for (
int col = nextColMin; col <= nextColMin + fltWidth - strideCols; col++) {
1088 if (row >=
height || col >=
width || col < 0 || row < 0)
continue;
1090 outputIndex = (row *
width + col) * depth + slice;
1093 if (indexMatrix[outputIndex] == backCol + backRow * imgWidth) {
1094 grad += activationGradients[outputIndex];
1098 activationGradientsBackward[(backCol + backRow * imgWidth) * depth + slice] = grad;
/// Re-arrange a filter weight matrix B into A, reversing the position of
/// every weight inside its filterHeight x filterWidth window.
///
/// Thread (i, j) copies B[j * numFilters + i] to A[col * filterDepth + row],
/// where `col` places the weight in the window of filter `i` at the mirrored
/// in-window offset (`jump - j % jump - 1`).
///
/// \param A             Destination buffer.
/// \param B             Source buffer.
/// \param filterDepth   Number of depth slices per filter.
/// \param filterHeight  Height of the filter window.
/// \param filterWidth   Width of the filter window.
/// \param numFilters    Number of filters.
template<typename AFloat>
__global__ void RotateWeights(AFloat * A, const AFloat * B, int filterDepth, int filterHeight, int filterWidth,
                              int numFilters)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;

   // j enumerates the filterDepth * filterHeight * filterWidth weights of a
   // filter, so the valid range is [0, total). The bound must be exclusive:
   // letting j == total through reads and writes one slice past both buffers.
   if (i >= numFilters || j >= filterDepth * filterHeight * filterWidth) return;

   // Number of weights in one depth slice of a filter.
   int jump = filterHeight * filterWidth;
   // NOTE(review): depth slice that weight j belongs to; confirm against the
   // upstream definition of `row`.
   int row = j / jump;
   // Mirrored in-window position, offset to the window of filter i.
   int col = i * jump + jump - j % jump - 1;

   A[col * filterDepth + row] = B[j * numFilters + i];
}
/// Add a per-row bias to every column of A: A(i, j) += B[i].
///
/// Thread (i, j) updates the element stored at `i + j * nRows`; threads
/// outside the nRows x nCols range do nothing.
///
/// \param A      Matrix updated in place.
/// \param B      Bias values, one per row of A.
/// \param nRows  Number of rows of A.
/// \param nCols  Number of columns of A.
template<typename AFloat>
__global__ void AddBiases(AFloat * A, const AFloat * B, int nRows, int nCols)
{
   const int row = blockDim.y * blockIdx.y + threadIdx.y;
   const int col = blockDim.x * blockIdx.x + threadIdx.x;

   if (row < nRows && col < nCols) {
      const AFloat bias = B[row];
      A[row + col * nRows] += bias;
   }
}
1127template<
typename AFloat>
1128__global__
void UpdateWeights(AFloat * A,
const AFloat ** B,
int batchSize,
int nRows,
int nCols)
1130 int i = blockDim.y * blockIdx.y + threadIdx.y;
1131 int j = blockDim.x * blockIdx.x + threadIdx.x;
1133 if (i >= nRows || j >= nCols)
return;
1135 for (
size_t event = 0;
event < batchSize;
event++) {
1136 size_t index = i * nCols + j;
/// Copy B into A while changing the matrix shape.
///
/// Thread (i, j) fills A at `j * nRowsA + i`. The source element is found by
/// numbering the elements of A row by row (`i * nColsA + j`) and locating
/// that running number in B, again counted row by row over nColsB columns,
/// then addressing B with nRowsB as the stride between columns.
///
/// \param A       Destination matrix.
/// \param B       Source matrix.
/// \param nRowsA  Number of rows of A.
/// \param nColsA  Number of columns of A.
/// \param nRowsB  Number of rows of B.
/// \param nColsB  Number of columns of B.
template<typename AFloat>
__global__ void Reshape(AFloat * A, const AFloat * B, int nRowsA, int nColsA, int nRowsB, int nColsB)
{
   const int row = blockDim.y * blockIdx.y + threadIdx.y;
   const int col = blockDim.x * blockIdx.x + threadIdx.x;

   if (row < nRowsA && col < nColsA) {
      // Storage position of (row, col) in A.
      const size_t dst = col * nRowsA + row;

      // Running element number of (row, col), and its coordinates in B.
      const size_t linear = row * nColsA + col;
      const size_t srcCol = linear % nColsB;
      const size_t srcRow = linear / nColsB;

      A[dst] = B[srcCol * nRowsB + srcRow];
   }
}
1170template<
typename AFloat>
1171__global__
void Flatten(AFloat * A,
const AFloat *B,
int size,
int nRows,
int nCols)
1173 int i = blockDim.y * blockIdx.y + threadIdx.y;
1174 int j = blockDim.x * blockIdx.x + threadIdx.x;
1176 int nColsA = nRows * nCols;
1177 if (i >=
size || j >= nColsA)
return;
1180 int row = j / nCols;
1181 int col = j % nCols;
1183 AFloat element = B[ i * nColsA + col * nRows + row ];
1190template<
typename AFloat>
1191__global__
void FlattenRM(AFloat * A,
const AFloat *B,
int size,
int nRows,
int nCols)
1193 int i = blockDim.y * blockIdx.y + threadIdx.y;
1194 int j = blockDim.x * blockIdx.x + threadIdx.x;
1196 int nColsA = nRows * nCols;
1197 if (i >=
size || j >= nColsA)
return;
1200 int row = j / nCols;
1201 int col = j % nCols;
1203 AFloat element = B[ i * nColsA + row * nCols + col ];
/// Scatter the flattened matrix B back into `size` consecutive
/// nRows x nCols slices of A.
///
/// Thread (i, j) reads B at `j * size + i` and stores the value in slice i
/// of A (slices are nRows * nCols elements apart). Inside the slice the
/// flat position j is split into row = j / nCols and col = j % nCols and
/// written at `col * nRows + row`.
///
/// \param A      Destination buffer holding `size` slices back to back.
/// \param B      Flattened source matrix.
/// \param size   Number of slices.
/// \param nRows  Rows per slice.
/// \param nCols  Columns per slice.
template<typename AFloat>
__global__ void Deflatten(AFloat * A, const AFloat * B, int size, int nRows, int nCols)
{
   const int slice = blockDim.y * blockIdx.y + threadIdx.y;
   const int flat  = blockDim.x * blockIdx.x + threadIdx.x;

   const int sliceLength = nRows * nCols;
   if (slice < size && flat < sliceLength) {
      // Fetch the value first, then work out where it belongs in the slice.
      const AFloat value = B[flat * size + slice];

      const int r = flat / nCols;
      const int c = flat % nCols;
      A[slice * sliceLength + c * nRows + r] = value;
   }
}
/// Scatter the flattened matrix B back into `size` consecutive
/// nRows x nCols slices of A, writing each slice row after row.
///
/// Thread (i, j) reads B at `j * size + i` and stores the value in slice i
/// of A (slices are nRows * nCols elements apart). Inside the slice the
/// flat position j is split into row = j / nCols and col = j % nCols and
/// written at `row * nCols + col`.
///
/// \param A      Destination buffer holding `size` slices back to back.
/// \param B      Flattened source matrix.
/// \param size   Number of slices.
/// \param nRows  Rows per slice.
/// \param nCols  Columns per slice.
template<typename AFloat>
__global__ void DeflattenRM(AFloat * A, const AFloat * B, int size, int nRows, int nCols)
{
   const int slice = blockDim.y * blockIdx.y + threadIdx.y;
   const int flat  = blockDim.x * blockIdx.x + threadIdx.x;

   const int sliceLength = nRows * nCols;
   if (slice < size && flat < sliceLength) {
      // Fetch the value first, then work out where it belongs in the slice.
      const AFloat value = B[flat * size + slice];

      const int r = flat / nCols;
      const int c = flat % nCols;
      A[slice * sliceLength + r * nCols + c] = value;
   }
}
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t r
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t height
Implementation of the CrossEntropy as separation criterion.
static constexpr int BlockSize
__global__ void SymmetricRelu(AFloat *A, int m, int n)
__global__ void UpdateWeights(AFloat *A, const AFloat **B, int batchSize, int nRows, int nCols)
__device__ int calculateDimension(int imgDim, int fltDim, int padding, int stride)
Calculate the dimension of an output volume, given the sliding parameters and the input shape.
__global__ void SigmoidDerivative(AFloat *B, const AFloat *A, int m, int n)
__device__ AFloat AtomicAdd(AFloat *address, AFloat val)
__global__ void Dropout(AFloat *A, int m, int n, AFloat dropoutProbability, curandState_t *state)
__global__ void SoftmaxCrossEntropyGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void SumColumns(AFloat *B, const AFloat *A, int m, int n)
__global__ void IdentityDerivative(AFloat *A, int m, int n)
__global__ void SqrtElementWise(AFloat *A, int m, int n)
__global__ void AdamUpdate(AFloat *A, const AFloat *M, const AFloat *V, int m, int n, AFloat alpha, AFloat eps)
optimizer kernel functions
__global__ void SoftmaxCrossEntropy(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void AddL1RegularizationGradients(AFloat *A, const AFloat *B, AFloat weightDecay, int m, int n)
__device__ void ReduceSumVertical(AFloat *result, AFloat *sdata, int n)
__global__ void MeanSquaredErrorGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void Relu(AFloat *A, int m, int n)
__global__ void ReluDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void AbsoluteSum(AFloat *result, const AFloat *A, int m, int n)
__global__ void AddL2RegularizationGradients(AFloat *A, const AFloat *B, AFloat weightDecay, int m, int n)
__device__ AFloat max(AFloat x, AFloat y)
__global__ void AddRowWise(AFloat *W, const AFloat *theta, int m, int n)
__global__ void ConstMult(AFloat *A, AFloat beta, int m, int n)
__global__ void GaussDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void Deflatten(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
Deflatten a 2D-array into an array of 2D-arrays.
__global__ void Flatten(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
Flatten an array of 2D-arrays into a single 2D-array.
__global__ void Softmax(AFloat *B, const AFloat *A, int m, int n)
__global__ void RotateWeights(AFloat *A, const AFloat *B, int filterDepth, int filterHeight, int filterWidth, int numFilters)
__global__ void TanhDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void CrossEntropyGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void ReduceMatrix(AFloat *result, const AFloat *A, int m, int n)
__global__ void Im2Col(AFloat *A, const AFloat *B, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols, int zeroPaddingHeight, int zeroPaddingWidth)
A kernel that re-arranges image regions of the input matrix \B, into column vectors in matrix \A.
__global__ void DeflattenRM(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
__global__ void ConstAdd(AFloat *A, AFloat beta, int m, int n)
__global__ void SymmetricReluDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void MeanSquaredError(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void SquareElementWise(AFloat *A, int m, int n)
__global__ void SoftSignDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void Reshape(AFloat *A, const AFloat *B, int nRowsA, int nColsA, int nRowsB, int nColsB)
__global__ void Hadamard(AFloat *B, const AFloat *A, int m, int n)
__global__ void AlmostEquals(bool *result, const AFloat *A, const AFloat *B, double epsilon, int m, int n)
__global__ void FlattenRM(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
__global__ void SquaredSum(AFloat *result, const AFloat *A, int m, int n)
__global__ void AdamUpdateFirstMom(AFloat *A, const AFloat *B, int m, int n, AFloat beta)
__global__ void ReciprocalElementWise(AFloat *A, int m, int n)
__device__ void ReduceSum(AFloat *result, AFloat *sdata)
__global__ void MaxPoolBackward(AFloat *activationGradientsBackward, const AFloat *activationGradients, const AFloat *indexMatrix, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols)
Back-propagate the gradients through a max-pooling layer.
__global__ void Downsample(AFloat *output, AFloat *indexMatrix, const AFloat *input, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols)
Downsampling kernel used as the forward propagation step of a Max-Pooling layer.
__global__ void AdamUpdateSecondMom(AFloat *A, const AFloat *B, int m, int n, AFloat beta)
__global__ void AddBiases(AFloat *A, const AFloat *B, int nRows, int nCols)
std::shared_ptr< std::function< double(double)> > Tanh
double weightDecay(double error, ItWeight itWeight, ItWeight itWeightEnd, double factorWeightDecay, EnumRegularization eRegularization)
compute the weight decay for regularization (L1 or L2)
std::shared_ptr< std::function< double(double)> > Gauss
std::shared_ptr< std::function< double(double)> > Sigmoid
std::shared_ptr< std::function< double(double)> > SoftSign
create variable transformations
static uint64_t sum(uint64_t i)