summaryrefslogtreecommitdiff
path: root/ot/gpu/cudamat/cudamat/cudamat_kernels.cuh
diff options
context:
space:
mode:
Diffstat (limited to 'ot/gpu/cudamat/cudamat/cudamat_kernels.cuh')
-rw-r--r--ot/gpu/cudamat/cudamat/cudamat_kernels.cuh92
1 files changed, 92 insertions, 0 deletions
diff --git a/ot/gpu/cudamat/cudamat/cudamat_kernels.cuh b/ot/gpu/cudamat/cudamat/cudamat_kernels.cuh
new file mode 100644
index 0000000..25e2858
--- /dev/null
+++ b/ot/gpu/cudamat/cudamat/cudamat_kernels.cuh
@@ -0,0 +1,92 @@
+#ifndef NVMATRIX_KERNEL_H_
+#define NVMATRIX_KERNEL_H_
+
+#define NUM_RND_BLOCKS 96 // NOTE(review): presumably the block count used to launch the random-number kernels - confirm
+#define NUM_RND_THREADS_PER_BLOCK 128 // NOTE(review): presumably the threads per block for those launches - confirm
+#define NUM_RND_STREAMS (NUM_RND_BLOCKS * NUM_RND_THREADS_PER_BLOCK) // product of the two values above: 96 * 128 = 12288
+
+/*
+ * Macros that extract the lower and the upper 32 bits of a 64-bit number.
+ * HIGH_BITS shifts right by 32, so its argument must be wider than 32 bits.
+ */
+#define LOW_BITS(x) ((x) & 0xffffffff)
+#define HIGH_BITS(x) ((x) >> 32)
+
+/*
+ * Number of iterations to run the random number generator upon initialization.
+ */
+#define NUM_RND_BURNIN 100
+
+/*
+ * CUDA grid dimensions for different types of kernels
+ */
+#define COPY_BLOCK_SIZE 16 // NOTE(review): presumably the block edge length for the copy/transpose-style kernels - confirm
+
+// element-wise kernels use min(ceil(N / 512), 4096) blocks of min(N, 512) threads; min() itself is not defined in this header
+#define MAX_VECTOR_OP_BLOCKS 4096 // upper bound applied by NUM_VECTOR_OP_BLOCKS
+#define MAX_VECTOR_OP_THREADS_PER_BLOCK 512 // upper bound applied by NUM_VECTOR_OP_THREADS_PER_BLOCK
+#define NUM_VECTOR_OP_BLOCKS(N) (min(((N) + MAX_VECTOR_OP_THREADS_PER_BLOCK - 1)/MAX_VECTOR_OP_THREADS_PER_BLOCK, MAX_VECTOR_OP_BLOCKS))
+#define NUM_VECTOR_OP_THREADS_PER_BLOCK(N) (min((N), MAX_VECTOR_OP_THREADS_PER_BLOCK))
+
+#define PI 3.141592653589793 // double literal: the kernels declared in this header operate on double data, and an f-suffixed constant would be rounded to float precision
+// Random-number kernels, going by their names (declarations only; the definitions are not part of this header).
+__global__ void kSeedRandom(unsigned int* randMults, unsigned long long* randWords, unsigned int seed);
+__global__ void kRandomUniform(unsigned int* randMults, unsigned long long* randWords, double* gData, unsigned int numElements);
+__global__ void kRandomGaussian(unsigned int* rndMults, unsigned long long* rndWords, double* gData, unsigned int numElements); // same parameter types as kRandomUniform; names are spelled rndMults/rndWords here
+// Row-slice and transpose kernels (declarations only). NOTE(review): kTranspose lists (odata, idata), presumably output first, while the slice kernels list (source, target) - confirm
+__global__ void kGetRowSlice(double* source, double* target, int start, int end, int width, int height);
+__global__ void kTranspose(double *odata, double *idata, int width, int height);
+__global__ void kSetRowSlice(double* source, double* target, int start, int end, int width, int height);
+// Comparison, minimum/maximum and column-/row-wise (arg)min/(arg)max kernels, going by their names (declarations only).
+__global__ void kLessThan(double* mat1, double* mat2, double* target, unsigned int len);
+__global__ void kLessThanScalar(double* mat, double val, double* target, unsigned int len);
+__global__ void kGreaterThan(double* mat1, double* mat2, double* target, unsigned int len);
+__global__ void kGreaterThanScalar(double* mat, double val, double* target, unsigned int len);
+__global__ void kEquals(double* mat1, double* mat2, double* target, unsigned int len);
+__global__ void kEqualsScalar(double* mat, double val, double* target, unsigned int len);
+__global__ void kMinimum(double* mat1, double* mat2, double* target, unsigned int len);
+__global__ void kMinimumScalar(double* mat, double val, double* target, unsigned int len);
+__global__ void kMaximum(double* mat1, double* mat2, double* target, unsigned int len);
+__global__ void kMaximumScalar(double* mat, double val, double* target, unsigned int len);
+__global__ void kMinColumnwise(double* mat, double* target, unsigned int width, unsigned int height);
+__global__ void kMinRowwise(double* mat, double* target, unsigned int width, unsigned int height);
+__global__ void kMaxColumnwise(double* mat, double* target, unsigned int width, unsigned int height);
+__global__ void kMaxRowwise(double* mat, double* target, unsigned int width, unsigned int height);
+__global__ void kArgMinColumnwise(double* mat, double* target, unsigned int width, unsigned int height); // NOTE(review): target is double* for the kArg* kernels too, so positions are presumably stored as doubles - confirm
+__global__ void kArgMinRowwise(double* mat, double* target, unsigned int width, unsigned int height);
+__global__ void kArgMaxColumnwise(double* mat, double* target, unsigned int width, unsigned int height);
+__global__ void kArgMaxRowwise(double* mat, double* target, unsigned int width, unsigned int height);
+__global__ void kSign(double* mat, double* target, unsigned int len); // kSign .. kReciprocal: declarations only, each taking (mat, [extra operand,] target, len)
+__global__ void kApplySigmoid(double* mat, double* target, unsigned int len);
+__global__ void kApplyTanh(double* mat, double* target, unsigned int len);
+__global__ void kApplySoftThreshold(double* mat, double alpha, double* target, unsigned int len);
+__global__ void kApplyAbs(double* mat, double* target, unsigned int len);
+__global__ void kApplyLog1PlusExp(double* mat, double* target, unsigned int len);
+__global__ void kLog(double* mat, double* target, unsigned int len);
+__global__ void kExp(double* mat, double* target, unsigned int len);
+__global__ void kGamma(double* mat, double* target, unsigned int len);
+__global__ void kLogGamma(double* mat, double* target, unsigned int len);
+__global__ void kSqrt(double* mat, double* target, unsigned int len);
+__global__ void kPow(double* mat, double pow, double* target, unsigned int len); // the parameter is named like the math function pow; the name is local to this declaration
+__global__ void kPowMatrix(double* mat, double* pow, double* target, unsigned int len); // here the exponent argument is a pointer rather than a scalar
+__global__ void kReciprocal(double* mat, double* target, unsigned int len);
+__global__ void kAddColVector(double* mat, double* vec, double* tgtMat, unsigned int width, unsigned int height); // matrix/vector kernels: (mat, vec, tgtMat, [mult,] width, height); declarations only
+__global__ void kAddRowVector(double* mat, double* vec, double* tgtMat, unsigned int width, unsigned int height);
+__global__ void kAddColMult(double* mat, double* vec, double* tgtMat, double mult, unsigned int width, unsigned int height);
+__global__ void kMultByColVector(double* mat, double* vec, double* tgtMat, unsigned int width, unsigned int height);
+__global__ void kMultByRowVector(double* mat, double* vec, double* tgtMat, unsigned int width, unsigned int height);
+__global__ void kDivByColVector(double* mat, double* vec, double* tgtMat, unsigned int width, unsigned int height);
+__global__ void kDivByRowVector(double* mat, double* vec, double* tgtMat, unsigned int width, unsigned int height);
+__global__ void kAdd(double* a, double* b, double* dest, unsigned int numEls); // two-operand kernels: (a, b, dest, numEls); declarations only
+__global__ void kSubtract(double* a, double* b, double* dest, unsigned int numEls);
+__global__ void kMult(double* a, double* b, double* dest, unsigned int numEls);
+__global__ void kDivide(double* a, double* b, double* dest, unsigned int numEls);
+__global__ void kMultScalar(double* mat, double alpha, double* dest, unsigned int len); // scalar-operand kernels take the scalar as alpha
+__global__ void kAssignScalar(double* dest, double alpha, unsigned int len); // no input matrix: only dest, alpha and len
+__global__ void kDivideScalar(double* mat, double alpha, double* dest, unsigned int len);
+__global__ void kAddScalar(double* a, double alpha, double* dest, unsigned int numEls);
+__global__ void kSelectRows(double* source, double* target, double* indices, int nRowIs, int nCols, int nSourceRows); // note: indices is passed as double*, not an integer type
+__global__ void kSetSelectedRows(double* target, double* source, double* indices, int nRowIs, int nCols, int nTargetRows); // argument order is (target, source), the reverse of kSelectRows
+__global__ void kWhere(double* condition_mat, double* if_mat, double* else_mat, double* target, unsigned int len); // the condition is passed as double* as well
+__global__ void kCorrelate(double* source, double* kernel, double* dest, int width, int height, int fwidth, int fheight); // NOTE(review): fwidth/fheight are presumably the dimensions of `kernel` - confirm
+#endif // NVMATRIX_KERNEL_H_