path: root/ot/gpu/cudamat/cudamat/cudamat.cu
Diffstat (limited to 'ot/gpu/cudamat/cudamat/cudamat.cu')
-rw-r--r--  ot/gpu/cudamat/cudamat/cudamat.cu  1633
1 file changed, 1633 insertions, 0 deletions
diff --git a/ot/gpu/cudamat/cudamat/cudamat.cu b/ot/gpu/cudamat/cudamat/cudamat.cu
new file mode 100644
index 0000000..522f9cc
--- /dev/null
+++ b/ot/gpu/cudamat/cudamat/cudamat.cu
@@ -0,0 +1,1633 @@
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+#include <cublas.h>
+#include "cudamat_kernels.cuh"
+#include "cudamat.cuh"
+
+extern "C" {
+
+/* ------------------------------ CUBLAS init/shutdown ------------------------------ */
+
+inline bool check_cublas_error() {
+ cublasStatus status = cublasGetError();
+
+ return status != CUBLAS_STATUS_SUCCESS;
+}
+
+inline bool checkCUDAError() {
+ cudaError_t err = cudaGetLastError();
+
+ if (cudaSuccess != err)
+ printf("%s\n", cudaGetErrorString( err));
+ return cudaSuccess != err;
+}
+
+EXPORT const char* get_last_cuda_error() {
+ cudaError_t err = cudaGetLastError();
+
+ return cudaGetErrorString( err);
+}
+
+EXPORT const char* get_last_clib_error() {
+ return strerror(errno);
+}
+
+EXPORT int cublas_init() {
+ cublasInit();
+ if (check_cublas_error())
+ return CUBLAS_ERROR;
+ else
+ return 0;
+}
+
+EXPORT int cublas_shutdown() {
+ cublasShutdown();
+ cudaThreadExit();
+
+ return 0;
+}
+
+
+EXPORT int cuda_set_device(int deviceId) {
+ cudaSetDevice(deviceId);
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+ else
+ return 0;
+}
+
+EXPORT int init_random(rnd_struct* rnd_state, int seed, char* cudamatpath) {
+ unsigned int * host_mults;
+ host_mults = (unsigned int*)malloc(NUM_RND_STREAMS * sizeof(unsigned int));
+ FILE * pFile;
+
+ pFile = fopen (cudamatpath,"r");
+    if (pFile == NULL) {
+        free(host_mults);
+        return ERROR_FILE_OPEN;
+    }
+
+    for (int i = 0; i < NUM_RND_STREAMS; i++) {
+        if (fscanf (pFile, "%u", &host_mults[i]) != 1) {
+            fclose(pFile);
+            free(host_mults);
+            return ERROR_FILE_SCAN;
+        }
+ }
+ fclose (pFile);
+
+ cublasAlloc(NUM_RND_STREAMS, sizeof(unsigned int), (void**)&rnd_state->dev_mults);
+ cublasAlloc(NUM_RND_STREAMS, sizeof(unsigned long long), (void**)&rnd_state->dev_words);
+ cublasSetVector(NUM_RND_STREAMS, sizeof(unsigned int), host_mults, 1, rnd_state->dev_mults, 1);
+ //cudaMalloc((void **)&rnd_state->dev_mults, NUM_RND_STREAMS * sizeof(unsigned int));
+ //cudaMalloc((void **)&rnd_state->dev_words, NUM_RND_STREAMS * sizeof(unsigned long long));
+ //cudaMemcpy(rnd_state->dev_mults, host_mults, NUM_RND_STREAMS * sizeof(unsigned int), cudaMemcpyHostToDevice);
+    free(host_mults);
+
+    cudaThreadSynchronize();
+
+ kSeedRandom<<<NUM_RND_BLOCKS, NUM_RND_THREADS_PER_BLOCK>>>(rnd_state->dev_mults, rnd_state->dev_words, seed);
+
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+ else
+ return 0;
+}
+
+/* ------------------------------ Utility routines ------------------------------ */
+
+EXPORT int get_leading_dimension(cudamat* mat) {
+ return mat->is_trans ? mat->size[1] : mat->size[0];
+}
+
+EXPORT int get_nonleading_dimension(cudamat* mat) {
+ return mat->is_trans ? mat->size[0] : mat->size[1];
+}
+
+EXPORT void set_transpose(cudamat* mat, int is_trans) {
+ mat->is_trans = is_trans;
+}
+
+inline char get_transpose_char(cudamat* mat) {
+ return mat->is_trans ? 't' : 'n';
+}
+
+EXPORT void cuda_sync_threads() {
+ cudaThreadSynchronize();
+}
+
+/* ------------------------------ Allocating/moving data ------------------------------ */
+
+EXPORT int allocate_device_memory(cudamat* mat) {
+ int len = mat->size[0]*mat->size[1];
+
+ cublasStatus stat;
+
+ stat = cublasAlloc(len, sizeof(mat->data_device[0]), (void**)&mat->data_device);
+
+ if (stat != CUBLAS_STATUS_SUCCESS || check_cublas_error()) {
+ checkCUDAError();
+ return CUBLAS_ERROR;
+ }
+
+ mat->on_device = 1;
+ return 0;
+}
+
+EXPORT int copy_to_host(cudamat* mat) {
+ int len = mat->size[0]*mat->size[1];
+
+ if (mat->on_device) {
+ cublasGetVector(len, sizeof(mat->data_host[0]), mat->data_device, 1, mat->data_host, 1);
+
+ if (check_cublas_error())
+ return CUBLAS_ERROR;
+ } else
+ return ERROR_NOT_ON_DEVICE;
+
+ return 0;
+}
+
+EXPORT int copy_to_device(cudamat* mat) {
+ int len = mat->size[0]*mat->size[1];
+ int err_code = 0;
+
+ //if (!mat->owns_data)
+ // return VIEW_ERROR;
+
+ if (!mat->on_device) {
+ err_code = allocate_device_memory(mat);
+ if (err_code)
+ return err_code;
+ }
+
+ cublasSetVector(len, sizeof(mat->data_host[0]), mat->data_host, 1, mat->data_device, 1);
+
+ if (check_cublas_error())
+ return CUBLAS_ERROR;
+
+ return 0;
+}
+
+EXPORT int copy_on_device(cudamat* mat1, cudamat* mat2) {
+ int len = mat1->size[0]*mat1->size[1];
+
+ if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ cublasDcopy(len, mat1->data_device, 1, mat2->data_device, 1);
+
+ if (check_cublas_error())
+ return CUBLAS_ERROR;
+ else
+ return 0;
+}
+
+EXPORT int get_row_slice(cudamat* source, cudamat* target, unsigned int start, unsigned int end) {
+ int height = source->size[0];
+ int width = source->size[1];
+
+ if ((end - start) != target->size[0] || source->size[1] != target->size[1] || start >= end || end > height)
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ dim3 kernelBlockGrid((int)ceil((end - start)/32.), (int)ceil(width/32.), 1);
+ dim3 kernelBlockDim(32, 1, 1);
+
+ kGetRowSlice<<<kernelBlockGrid,kernelBlockDim>>>(source->data_device, target->data_device, start, end, width, height);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+ else
+ return 0;
+}
+
+EXPORT int set_row_slice(cudamat* source, cudamat* target, unsigned int start, unsigned int end) {
+ int height = target->size[0];
+ int width = target->size[1];
+
+ if ((end - start) != source->size[0] || source->size[1] != target->size[1] || start >= end || end > height)
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ dim3 kernelBlockGrid((int)ceil((end - start)/32.), (int)ceil(width/32.), 1);
+ dim3 kernelBlockDim(32, 1, 1);
+
+ kSetRowSlice<<<kernelBlockGrid,kernelBlockDim>>>(source->data_device, target->data_device, start, end, width, height);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+ else
+ return 0;
+}
+
+EXPORT int copy_transpose(cudamat* source, cudamat* target) {
+ unsigned int height = source->size[0];
+ unsigned int width = source->size[1];
+
+ if (source->size[0] != target->size[1] || source->size[1] != target->size[0])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ // setup execution parameters
+ unsigned int grid_x = height / COPY_BLOCK_SIZE;
+ if (height % COPY_BLOCK_SIZE)
+ grid_x++;
+
+ unsigned int grid_y = width / COPY_BLOCK_SIZE;
+ if (width % COPY_BLOCK_SIZE)
+ grid_y++;
+
+ dim3 grid(grid_x, grid_y, 1);
+ dim3 threads(COPY_BLOCK_SIZE, COPY_BLOCK_SIZE, 1);
+
+ kTranspose<<< grid, threads >>>(target->data_device, source->data_device, height, width);
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+ else
+ return 0;
+}
+
+EXPORT int free_device_memory(cudamat* mat) {
+ if (mat->owns_data && mat->on_device) {
+ cublasStatus stat;
+
+ stat = cublasFree(mat->data_device);
+ mat->on_device = 0;
+
+ if (stat != CUBLAS_STATUS_SUCCESS || check_cublas_error())
+ return CUBLAS_ERROR;
+ }
+
+ return 0;
+}
+
+EXPORT int reshape(cudamat* mat, unsigned int m, unsigned int n) {
+ if (mat->size[0] * mat->size[1] != m * n)
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ mat->size[0] = m;
+ mat->size[1] = n;
+
+ return 0;
+}
+
+EXPORT int get_slice(cudamat* source, cudamat* target, unsigned int first_col, unsigned int last_col) {
+ if (source->is_trans)
+ return ERROR_TRANSPOSED;
+
+ if (!source->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (last_col > source->size[1] || (first_col >= last_col))
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ int num_rows = source->size[0];
+
+ target->data_host = 0;
+ target->data_device = source->data_device + first_col * num_rows;
+ target->on_device = 1;
+ target->on_host = 0;
+ target->size[0] = source->size[0];
+ target->size[1] = last_col - first_col;
+ target->is_trans = 0;
+ target->owns_data = 0;
+
+ return 0;
+}
+
+EXPORT int get_vector_slice(cudamat* source, cudamat* target, unsigned int first_ind, unsigned int last_ind) {
+ // source must be a vector
+ if (source->size[0] > 1 && source->size[1] > 1)
+ return ERROR_GENERIC;
+
+ if (source->is_trans)
+ return ERROR_TRANSPOSED;
+
+ if (!source->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (first_ind >= last_ind)
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ int num_rows = source->size[0];
+
+ target->data_host = 0;
+ target->data_device = source->data_device + first_ind * num_rows;
+ target->on_device = 1;
+ target->on_host = 0;
+ target->is_trans = 0;
+ target->owns_data = 0;
+
+ if (source->size[0] > 1) {
+ if (last_ind > source->size[0])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ target->size[0] = last_ind - first_ind;
+ target->size[1] = 1;
+ } else {
+ if (last_ind > source->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ target->size[0] = 1;
+ target->size[1] = last_ind - first_ind;
+ }
+
+ return 0;
+}
+
+/* ------------------------------ Initialization routines ------------------------------ */
+
+EXPORT void init_from_array(cudamat* mat, double* data, int m, int n) {
+ mat->data_host = data;
+ mat->size[0] = m;
+ mat->size[1] = n;
+ mat->on_device = 0;
+ mat->on_host = 1;
+ mat->is_trans = 0;
+ mat->owns_data = 1;
+}
+
+EXPORT int init_empty(cudamat* mat, int m, int n) {
+ mat->size[0] = m;
+ mat->size[1] = n;
+ mat->on_device = 0;
+ mat->on_host = 0;
+ mat->is_trans = 0;
+ mat->owns_data = 1;
+
+ return allocate_device_memory(mat);
+}
+
+/* ------------------------------ Random number generation ------------------------------ */
+EXPORT int fill_with_rand(rnd_struct* rnd_state, cudamat* mat) {
+ int len = mat->size[0] * mat->size[1];
+
+ if (!mat->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ kRandomUniform<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK>>>(rnd_state->dev_mults, rnd_state->dev_words, mat->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+ else
+ return 0;
+}
+
+EXPORT int fill_with_randn(rnd_struct* rnd_state, cudamat* mat) {
+ int len = mat->size[0] * mat->size[1];
+
+ if (!mat->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ kRandomGaussian<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK>>>(rnd_state->dev_mults, rnd_state->dev_words, mat->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+ else
+ return 0;
+}
+/* ------------------------------ Algebraic operations ------------------------------ */
+
+EXPORT int add_col_vec(cudamat* mat, cudamat* vec, cudamat* target) {
+ unsigned int h = mat->size[0],
+ w = mat->size[1];
+
+ if (!mat->on_device || !vec->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans)
+ return ERROR_TRANSPOSED;
+
+ if (mat->size[0] != vec->size[0] || vec->size[1] != 1 ||
+ mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kAddColVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError()) {
+ return CUDA_ERROR;
+ }
+
+ return 0;
+}
+
+EXPORT int add_col_mult(cudamat* mat, cudamat* vec, cudamat* target, double mult) {
+ unsigned int h = mat->size[0],
+ w = mat->size[1];
+
+ if (!mat->on_device || !vec->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans)
+ return ERROR_TRANSPOSED;
+
+ if (mat->size[0] != vec->size[0] || vec->size[1] != 1 ||
+ mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kAddColMult<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, mult, w, h);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int add_row_vec(cudamat* mat, cudamat* vec, cudamat* target) {
+ unsigned int h = mat->size[0],
+ w = mat->size[1];
+
+ if (!mat->on_device || !vec->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans)
+ return ERROR_TRANSPOSED;
+
+ if (mat->size[1] != vec->size[1] || vec->size[0] != 1 ||
+ mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kAddRowVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int mult_by_col_vec(cudamat* mat, cudamat* vec, cudamat* target) {
+ unsigned int h = mat->size[0],
+ w = mat->size[1];
+
+ if (!mat->on_device || !vec->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans)
+ return ERROR_TRANSPOSED;
+
+ if (mat->size[0] != vec->size[0] || vec->size[1] != 1 ||
+ mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kMultByColVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int mult_by_row_vec(cudamat* mat, cudamat* vec, cudamat* target) {
+ unsigned int h = mat->size[0],
+ w = mat->size[1];
+
+ if (!mat->on_device || !vec->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans)
+ return ERROR_TRANSPOSED;
+
+ if (mat->size[1] != vec->size[1] || vec->size[0] != 1 ||
+ mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kMultByRowVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int divide_by_col_vec(cudamat* mat, cudamat* vec, cudamat* target) {
+ unsigned int h = mat->size[0],
+ w = mat->size[1];
+
+ if (!mat->on_device || !vec->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans)
+ return ERROR_TRANSPOSED;
+
+ if (mat->size[0] != vec->size[0] || vec->size[1] != 1 ||
+ mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kDivByColVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int divide_by_row_vec(cudamat* mat, cudamat* vec, cudamat* target) {
+ unsigned int h = mat->size[0],
+ w = mat->size[1];
+
+ if (!mat->on_device || !vec->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans)
+ return ERROR_TRANSPOSED;
+
+ if (mat->size[1] != vec->size[1] || vec->size[0] != 1 ||
+ mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kDivByRowVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int less_than(cudamat* mat1, cudamat* mat2, cudamat* target) {
+ int len = mat1->size[0]*mat1->size[1];
+
+ if (!mat1->on_device || !mat2->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat1->is_trans != mat2->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
+ mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kLessThan<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int less_than_scalar(cudamat* mat, double val, cudamat* target) {
+ int len = mat->size[0]*mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans != target->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kLessThanScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int greater_than(cudamat* mat1, cudamat* mat2, cudamat* target) {
+ int len = mat1->size[0]*mat1->size[1];
+
+ if (!mat1->on_device || !mat2->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat1->is_trans != mat2->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
+ mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kGreaterThan<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int greater_than_scalar(cudamat* mat, double val, cudamat* target) {
+ int len = mat->size[0]*mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans != target->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kGreaterThanScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int equals(cudamat* mat1, cudamat* mat2, cudamat* target) {
+ int len = mat1->size[0]*mat1->size[1];
+
+ if (!mat1->on_device || !mat2->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat1->is_trans != mat2->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
+ mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kEquals<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int equals_scalar(cudamat* mat, double val, cudamat* target) {
+ int len = mat->size[0]*mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans != target->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kEqualsScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int minimum(cudamat* mat1, cudamat* mat2, cudamat* target) {
+ int len = mat1->size[0]*mat1->size[1];
+
+ if (!mat1->on_device || !mat2->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat1->is_trans != mat2->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
+ mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kMinimum<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len);
+
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int minimum_scalar(cudamat* mat, double val, cudamat* target) {
+ int len = mat->size[0]*mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans != target->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kMinimumScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len);
+
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int maximum(cudamat* mat1, cudamat* mat2, cudamat* target) {
+ int len = mat1->size[0]*mat1->size[1];
+
+ if (!mat1->on_device || !mat2->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat1->is_trans != mat2->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
+ mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kMaximum<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len);
+
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int maximum_scalar(cudamat* mat, double val, cudamat* target) {
+ int len = mat->size[0]*mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans != target->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kMaximumScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len);
+
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int min_by_axis(cudamat* mat, cudamat* target, int axis) {
+ unsigned int h = mat->size[0],
+ w = mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans)
+ return ERROR_TRANSPOSED;
+
+ if (axis == 0) {
+ if (target->size[0] != 1 || target->size[1] != mat->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kMinColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h);
+ } else {
+ if (target->size[1] != 1 || target->size[0] != mat->size[0])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kMinRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h);
+ }
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int max_by_axis(cudamat* mat, cudamat* target, int axis) {
+ unsigned int h = mat->size[0],
+ w = mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans)
+ return ERROR_TRANSPOSED;
+
+ if (axis == 0) {
+ if (target->size[0] != 1 || target->size[1] != mat->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kMaxColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h);
+ } else {
+ if (target->size[1] != 1 || target->size[0] != mat->size[0])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kMaxRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h);
+ }
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int argmin_by_axis(cudamat* mat, cudamat* target, int axis) {
+ unsigned int h = mat->size[0],
+ w = mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans)
+ return ERROR_TRANSPOSED;
+
+ if (axis == 0) {
+ if (target->size[0] != 1 || target->size[1] != mat->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kArgMinColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h);
+ } else {
+ if (target->size[1] != 1 || target->size[0] != mat->size[0])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kArgMinRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h);
+ }
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int argmax_by_axis(cudamat* mat, cudamat* target, int axis) {
+ unsigned int h = mat->size[0],
+ w = mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans)
+ return ERROR_TRANSPOSED;
+
+ if (axis == 0) {
+ if (target->size[0] != 1 || target->size[1] != mat->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kArgMaxColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h);
+ } else {
+ if (target->size[1] != 1 || target->size[0] != mat->size[0])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kArgMaxRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h);
+ }
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int sign(cudamat* mat, cudamat* target) {
+ int len = mat->size[0]*mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->is_trans != target->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kSign<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int apply_sigmoid(cudamat* mat, cudamat* target) {
+ unsigned int len = mat->size[0] * mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kApplySigmoid<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int apply_tanh(cudamat* mat, cudamat* target) {
+ unsigned int len = mat->size[0] * mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kApplyTanh<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int apply_soft_threshold(cudamat* mat, double alpha, cudamat* target) {
+ unsigned int len = mat->size[0] * mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kApplySoftThreshold<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, target->data_device, len);
+
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int apply_abs(cudamat* mat, cudamat* target) {
+ unsigned int len = mat->size[0] * mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kApplyAbs<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int apply_log_1_plus_exp(cudamat* mat, cudamat* target) {
+ unsigned int len = mat->size[0] * mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kApplyLog1PlusExp<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int apply_log(cudamat* mat, cudamat* target) {
+ unsigned int len = mat->size[0] * mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kLog<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int apply_exp(cudamat* mat, cudamat* target) {
+ unsigned int len = mat->size[0] * mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kExp<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int apply_gamma(cudamat* mat, cudamat* target) {
+ unsigned int len = mat->size[0] * mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kGamma<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int apply_lgamma(cudamat* mat, cudamat* target) {
+ unsigned int len = mat->size[0] * mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kLogGamma<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int apply_sqrt(cudamat* mat, cudamat* target) {
+ unsigned int len = mat->size[0] * mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kSqrt<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int apply_pow(cudamat* mat, double pow, cudamat* target) {
+ unsigned int len = mat->size[0] * mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kPow<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, pow, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int apply_pow_matrix(cudamat* mat, cudamat* pow, cudamat* target) {
+ unsigned int len = mat->size[0] * mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ if (mat->size[0] != pow->size[0] || mat->size[1] != pow->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kPowMatrix<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, pow->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int reciprocal(cudamat* mat, cudamat* target) {
+ unsigned int len = mat->size[0] * mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kReciprocal<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int dot(cudamat* mat1, cudamat* mat2, cudamat* target, double beta, double alpha) {
+ if (!mat1->on_device || !mat2->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (get_leading_dimension(mat1) != get_leading_dimension(target) ||
+ get_nonleading_dimension(mat2) != get_nonleading_dimension(target) ||
+ get_nonleading_dimension(mat1) != get_leading_dimension(mat2)) {
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+ }
+ int m = get_leading_dimension(mat1),
+ k = get_leading_dimension(mat2),
+ n = get_nonleading_dimension(mat2);
+
+ // gemv if second matrix is a (column) vector
+ if (n == 1) {
+ cublasDgemv(get_transpose_char(mat1), mat1->size[0], mat1->size[1],
+ alpha, mat1->data_device, mat1->size[0],
+ mat2->data_device, 1,
+ beta, target->data_device, 1);
+ }
+ // gemv if first matrix is a (row) vector
+ else if (m == 1) {
+ cublasDgemv(mat2->is_trans ? 'n' : 't', mat2->size[0], mat2->size[1],
+ alpha, mat2->data_device, mat2->size[0],
+ mat1->data_device, 1,
+ beta, target->data_device, 1);
+ }
+ // gemm otherwise
+ else {
+ cublasDgemm(get_transpose_char(mat1), get_transpose_char(mat2),
+ m, n, k,
+ alpha, mat1->data_device, mat1->size[0],
+ mat2->data_device, mat2->size[0],
+ beta, target->data_device, target->size[0]);
+ }
+
+ if (check_cublas_error())
+ return CUBLAS_ERROR;
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ return 0;
+}
+
+EXPORT double vdot(cudamat* mat1, cudamat* mat2, int* err_code) {
+ int len = mat1->size[0]*mat1->size[1];
+ double res;
+
+    if (!mat1->on_device || !mat2->on_device) {
+        *err_code = ERROR_NOT_ON_DEVICE;
+        return 0;
+    }
+
+ if (mat1->is_trans != mat2->is_trans) {
+ *err_code = ERROR_TRANSPOSEDNESS;
+ return 0;
+ }
+
+ if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1]) {
+ *err_code = ERROR_INCOMPATIBLE_DIMENSIONS;
+ return 0;
+ }
+
+ res = cublasDdot(len, mat1->data_device, 1, mat2->data_device, 1);
+
+ if (check_cublas_error()) {
+ *err_code = CUBLAS_ERROR;
+ return -1.;
+ } else {
+ *err_code = 0;
+ return res;
+ }
+}
+
+/* Perform the operation mat1 = mat1 + alpha * mat2. mat1 and mat2 must
+ have the same transposedness. */
+EXPORT int add_mult(cudamat* mat1, cudamat* mat2, double alpha) {
+ int len = mat1->size[0]*mat1->size[1];
+
+ if (!mat1->on_device || !mat2->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat1->is_trans != mat2->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ cublasDaxpy(len, alpha, mat2->data_device, 1, mat1->data_device, 1);
+
+ if (check_cublas_error())
+ return CUBLAS_ERROR;
+
+ return 0;
+}
+
+EXPORT int add_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) {
+ int len = mat1->size[0]*mat1->size[1];
+
+ if (!mat1->on_device || !mat2->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat1->is_trans != mat2->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
+ mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ if (mat1 == target) {
+ cublasDaxpy(len, 1, mat2->data_device, 1, mat1->data_device, 1);
+
+ if (check_cublas_error())
+ return CUBLAS_ERROR;
+
+ } else {
+ kAdd<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+ }
+
+ return 0;
+}
+
+EXPORT int subtract_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) {
+ int len = mat1->size[0]*mat1->size[1];
+
+ if (!mat1->on_device || !mat2->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat1->is_trans != mat2->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
+ mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kSubtract<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int divide_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) {
+ int len = mat1->size[0]*mat1->size[1];
+
+ if (!mat1->on_device || !mat2->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat1->is_trans != mat2->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
+ mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kDivide<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+/* Elementwise multiplication of 2 matrices */
+EXPORT int mult_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) {
+ int len = mat1->size[0]*mat1->size[1];
+
+ if (!mat1->on_device || !mat2->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat1->is_trans != mat2->is_trans)
+ return ERROR_TRANSPOSEDNESS;
+
+ if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
+ mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kMult<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int assign_scalar(cudamat* mat, double alpha) {
+ int len = mat->size[0]*mat->size[1];
+
+ if (!mat->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ kAssignScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int mult_by_scalar(cudamat* mat, double alpha, cudamat* target) {
+ int len = mat->size[0]*mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ if (mat == target) {
+ cublasDscal(len, alpha, mat->data_device, 1);
+
+ if (check_cublas_error())
+ return CUBLAS_ERROR;
+
+ } else {
+ kMultScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+ }
+
+ return 0;
+}
+
+EXPORT int divide_by_scalar(cudamat* mat, double alpha, cudamat* target) {
+ int len = mat->size[0]*mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kDivideScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int add_scalar(cudamat* mat, double alpha, cudamat* target) {
+ int len = mat->size[0]*mat->size[1];
+
+ if (!mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kAddScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT double euclid_norm(cudamat* mat, int* err_code) {
+ int len = mat->size[0]*mat->size[1];
+
+ if (!mat->on_device) {
+ *err_code = ERROR_NOT_ON_DEVICE;
+ return -1.;
+ }
+
+ double res = cublasDnrm2(len, mat->data_device, 1);
+
+ if (check_cublas_error()) {
+ *err_code = CUBLAS_ERROR;
+ return -1.;
+ } else {
+ *err_code = 0;
+ return res;
+ }
+}
+
+EXPORT double manhattan_norm(cudamat* mat, int* err_code) {
+ int len = mat->size[0]*mat->size[1];
+
+ if (!mat->on_device) {
+ *err_code = ERROR_NOT_ON_DEVICE;
+ return -1.;
+ }
+
+ double res = cublasDasum(len, mat->data_device, 1);
+
+ if (check_cublas_error()) {
+ *err_code = CUBLAS_ERROR;
+ return -1.;
+ } else {
+ *err_code = 0;
+ return res;
+ }
+}
+
+EXPORT int selectRows(cudamat* source, cudamat* target, cudamat* indices){
+ const int nRetRows = indices->size[1];
+
+ if (nRetRows==0) return 0;
+
+ dim3 gridDim((nRetRows+31)/32);
+ dim3 blockDim(32);
+
+ kSelectRows<<<gridDim, blockDim>>>(source->data_device, target->data_device, indices->data_device, nRetRows, source->size[0], source->size[1]);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+ else
+ return 0;
+}
+
+EXPORT int setSelectedRows(cudamat* target, cudamat* source, cudamat* indices){
+ const int nSetRows = indices->size[1];
+
+ if (nSetRows==0)
+ return 0;
+
+ dim3 gridDim((nSetRows+31)/32);
+ dim3 blockDim(32);
+
+ kSetSelectedRows<<<gridDim, blockDim>>>(target->data_device, source->data_device, indices->data_device, nSetRows, target->size[0], target->size[1]);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+ else
+ return 0;
+}
+
+EXPORT int where(cudamat* condition_mat, cudamat* if_mat, cudamat* else_mat, cudamat* target) {
+ unsigned int len = condition_mat->size[0] * condition_mat->size[1];
+
+ if (!condition_mat->on_device || !target->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (condition_mat->size[0] != target->size[0] || condition_mat->size[1] != target->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ if (condition_mat->size[0] != if_mat->size[0] || condition_mat->size[1] != if_mat->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ if (condition_mat->size[0] != else_mat->size[0] || condition_mat->size[1] != else_mat->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kWhere<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(condition_mat->data_device,
+ if_mat->data_device, else_mat->data_device, target->data_device, len);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+EXPORT int correlate(cudamat* source, cudamat* kernel, cudamat* dest) {
+ int len = source->size[0] * source->size[1];
+
+ if (!source->on_device || !kernel->on_device || !dest->on_device)
+ return ERROR_NOT_ON_DEVICE;
+
+ if (source->size[0] != dest->size[0] || source->size[1] != dest->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ if (kernel->size[0] % 2 == 0 || kernel->size[1] % 2 == 0 ||
+ kernel->size[0] > source->size[0] || kernel->size[1] > source->size[1])
+ return ERROR_INCOMPATIBLE_DIMENSIONS;
+
+ kCorrelate<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(source->data_device,
+ kernel->data_device, dest->data_device, source->size[1], source->size[0],
+ kernel->size[1], kernel->size[0]);
+
+ if (SYNC_THREADS)
+ cudaThreadSynchronize();
+
+ if (checkCUDAError())
+ return CUDA_ERROR;
+
+ return 0;
+}
+
+}
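
For reference, here is a minimal host-side sketch of how the API exported above is typically driven. It assumes the cudamat struct and the exported function declarations come from cudamat.cuh, and that this file is compiled and linked together with cuBLAS (e.g. via nvcc); it is an illustrative example under those assumptions, not part of the committed file.

/* Usage sketch (illustrative; assumes the declarations in cudamat.cuh). */
#include <stdio.h>
#include "cudamat.cuh"

int main() {
    /* Host-side buffers; cudamat stores matrices column-major. */
    double a_host[6] = {1, 2, 3, 4, 5, 6};
    double b_host[6] = {6, 5, 4, 3, 2, 1};
    double c_host[6] = {0};

    cudamat A, B, C;

    if (cublas_init() != 0)
        return 1;

    /* Wrap the host buffers as 3x2 matrices. */
    init_from_array(&A, a_host, 3, 2);
    init_from_array(&B, b_host, 3, 2);
    init_from_array(&C, c_host, 3, 2);

    /* Allocate device memory and upload. */
    if (copy_to_device(&A) || copy_to_device(&B) || copy_to_device(&C))
        return 1;

    /* C = A + B on the GPU, then download the result into c_host. */
    if (add_elementwise(&A, &B, &C) != 0 || copy_to_host(&C) != 0)
        return 1;

    for (int i = 0; i < 6; i++)
        printf("%g ", c_host[i]);
    printf("\n");

    free_device_memory(&A);
    free_device_memory(&B);
    free_device_memory(&C);
    cublas_shutdown();
    return 0;
}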