diff options
Diffstat (limited to 'ot/gpu/cudamat/cudamat/cudamat.cu')
-rw-r--r-- | ot/gpu/cudamat/cudamat/cudamat.cu | 1633 |
1 files changed, 1633 insertions, 0 deletions
diff --git a/ot/gpu/cudamat/cudamat/cudamat.cu b/ot/gpu/cudamat/cudamat/cudamat.cu new file mode 100644 index 0000000..522f9cc --- /dev/null +++ b/ot/gpu/cudamat/cudamat/cudamat.cu @@ -0,0 +1,1633 @@ +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <stdlib.h> +#include <cublas.h> +#include "cudamat_kernels.cuh" +#include "cudamat.cuh" + +extern "C" { + +/* ------------------------------ CUBLAS init/shutdown ------------------------------ */ + +inline bool check_cublas_error() { + cublasStatus status = cublasGetError(); + + return status != CUBLAS_STATUS_SUCCESS; +} + +inline bool checkCUDAError() { + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) + printf("%s\n", cudaGetErrorString( err)); + return cudaSuccess != err; +} + +EXPORT const char* get_last_cuda_error() { + cudaError_t err = cudaGetLastError(); + + return cudaGetErrorString( err); +} + +EXPORT const char* get_last_clib_error() { + return strerror(errno); +} + +EXPORT int cublas_init() { + cublasInit(); + if (check_cublas_error()) + return CUBLAS_ERROR; + else + return 0; +} + +EXPORT int cublas_shutdown() { + cublasShutdown(); + cudaThreadExit(); + + return 0; +} + + +EXPORT int cuda_set_device(int deviceId) { + cudaSetDevice(deviceId); + + if (checkCUDAError()) + return CUDA_ERROR; + else + return 0; +} + +EXPORT int init_random(rnd_struct* rnd_state, int seed, char* cudamatpath) { + unsigned int * host_mults; + host_mults = (unsigned int*)malloc(NUM_RND_STREAMS * sizeof(unsigned int)); + FILE * pFile; + + pFile = fopen (cudamatpath,"r"); + if (pFile == NULL) { + return ERROR_FILE_OPEN; + } + + for (int i = 0; i < NUM_RND_STREAMS; i++) { + if (fscanf (pFile, "%u", &host_mults[i]) != 1) { + return ERROR_FILE_SCAN; + } + } + fclose (pFile); + + cublasAlloc(NUM_RND_STREAMS, sizeof(unsigned int), (void**)&rnd_state->dev_mults); + cublasAlloc(NUM_RND_STREAMS, sizeof(unsigned long long), (void**)&rnd_state->dev_words); + cublasSetVector(NUM_RND_STREAMS, sizeof(unsigned int), host_mults, 1, rnd_state->dev_mults, 1); + //cudaMalloc((void **)&rnd_state->dev_mults, NUM_RND_STREAMS * sizeof(unsigned int)); + //cudaMalloc((void **)&rnd_state->dev_words, NUM_RND_STREAMS * sizeof(unsigned long long)); + //cudaMemcpy(rnd_state->dev_mults, host_mults, NUM_RND_STREAMS * sizeof(unsigned int), cudaMemcpyHostToDevice); + cudaThreadSynchronize(); + + kSeedRandom<<<NUM_RND_BLOCKS, NUM_RND_THREADS_PER_BLOCK>>>(rnd_state->dev_mults, rnd_state->dev_words, seed); + + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + else + return 0; +} + +/* ------------------------------ Utility routines ------------------------------ */ + +EXPORT int get_leading_dimension(cudamat* mat) { + return mat->is_trans ? mat->size[1] : mat->size[0]; +} + +EXPORT int get_nonleading_dimension(cudamat* mat) { + return mat->is_trans ? mat->size[0] : mat->size[1]; +} + +EXPORT void set_transpose(cudamat* mat, int is_trans) { + mat->is_trans = is_trans; +} + +inline char get_transpose_char(cudamat* mat) { + return mat->is_trans ? 't' : 'n'; +} + +EXPORT void cuda_sync_threads() { + cudaThreadSynchronize(); +} + +/* ------------------------------ Allocating/moving data ------------------------------ */ + +EXPORT int allocate_device_memory(cudamat* mat) { + int len = mat->size[0]*mat->size[1]; + + cublasStatus stat; + + stat = cublasAlloc(len, sizeof(mat->data_device[0]), (void**)&mat->data_device); + + if (stat != CUBLAS_STATUS_SUCCESS || check_cublas_error()) { + checkCUDAError(); + return CUBLAS_ERROR; + } + + mat->on_device = 1; + return 0; +} + +EXPORT int copy_to_host(cudamat* mat) { + int len = mat->size[0]*mat->size[1]; + + if (mat->on_device) { + cublasGetVector(len, sizeof(mat->data_host[0]), mat->data_device, 1, mat->data_host, 1); + + if (check_cublas_error()) + return CUBLAS_ERROR; + } else + return ERROR_NOT_ON_DEVICE; + + return 0; +} + +EXPORT int copy_to_device(cudamat* mat) { + int len = mat->size[0]*mat->size[1]; + int err_code = 0; + + //if (!mat->owns_data) + // return VIEW_ERROR; + + if (!mat->on_device) { + err_code = allocate_device_memory(mat); + if (err_code) + return err_code; + } + + cublasSetVector(len, sizeof(mat->data_host[0]), mat->data_host, 1, mat->data_device, 1); + + if (check_cublas_error()) + return CUBLAS_ERROR; + + return 0; +} + +EXPORT int copy_on_device(cudamat* mat1, cudamat* mat2) { + int len = mat1->size[0]*mat1->size[1]; + + if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + cublasDcopy(len, mat1->data_device, 1, mat2->data_device, 1); + + if (check_cublas_error()) + return CUBLAS_ERROR; + else + return 0; +} + +EXPORT int get_row_slice(cudamat* source, cudamat* target, unsigned int start, unsigned int end) { + int height = source->size[0]; + int width = source->size[1]; + + if ((end - start) != target->size[0] || source->size[1] != target->size[1] || start >= end || end > height) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + dim3 kernelBlockGrid((int)ceil((end - start)/32.), (int)ceil(width/32.), 1); + dim3 kernelBlockDim(32, 1, 1); + + kGetRowSlice<<<kernelBlockGrid,kernelBlockDim>>>(source->data_device, target->data_device, start, end, width, height); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + else + return 0; +} + +EXPORT int set_row_slice(cudamat* source, cudamat* target, unsigned int start, unsigned int end) { + int height = target->size[0]; + int width = target->size[1]; + + if ((end - start) != source->size[0] || source->size[1] != target->size[1] || start >= end || end > height) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + dim3 kernelBlockGrid((int)ceil((end - start)/32.), (int)ceil(width/32.), 1); + dim3 kernelBlockDim(32, 1, 1); + + kSetRowSlice<<<kernelBlockGrid,kernelBlockDim>>>(source->data_device, target->data_device, start, end, width, height); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + else + return 0; +} + +EXPORT int copy_transpose(cudamat* source, cudamat* target) { + unsigned int height = source->size[0]; + unsigned int width = source->size[1]; + + if (source->size[0] != target->size[1] || source->size[1] != target->size[0]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + // setup execution parameters + unsigned int grid_x = height / COPY_BLOCK_SIZE; + if (height % COPY_BLOCK_SIZE) + grid_x++; + + unsigned int grid_y = width / COPY_BLOCK_SIZE; + if (width % COPY_BLOCK_SIZE) + grid_y++; + + dim3 grid(grid_x, grid_y, 1); + dim3 threads(COPY_BLOCK_SIZE, COPY_BLOCK_SIZE, 1); + + kTranspose<<< grid, threads >>>(target->data_device, source->data_device, height, width); + + if (checkCUDAError()) + return CUDA_ERROR; + else + return 0; +} + +EXPORT int free_device_memory(cudamat* mat) { + if (mat->owns_data && mat->on_device) { + cublasStatus stat; + + stat = cublasFree(mat->data_device); + mat->on_device = 0; + + if (stat != CUBLAS_STATUS_SUCCESS || check_cublas_error()) + return CUBLAS_ERROR; + } + + return 0; +} + +EXPORT int reshape(cudamat* mat, unsigned int m, unsigned int n) { + if (mat->size[0] * mat->size[1] != m * n) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + mat->size[0] = m; + mat->size[1] = n; + + return 0; +} + +EXPORT int get_slice(cudamat* source, cudamat* target, unsigned int first_col, unsigned int last_col) { + if (source->is_trans) + return ERROR_TRANSPOSED; + + if (!source->on_device) + return ERROR_NOT_ON_DEVICE; + + if (last_col > source->size[1] || (first_col >= last_col)) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + int num_rows = source->size[0]; + + target->data_host = 0; + target->data_device = source->data_device + first_col * num_rows; + target->on_device = 1; + target->on_host = 0; + target->size[0] = source->size[0]; + target->size[1] = last_col - first_col; + target->is_trans = 0; + target->owns_data = 0; + + return 0; +} + +EXPORT int get_vector_slice(cudamat* source, cudamat* target, unsigned int first_ind, unsigned int last_ind) { + // source must be a vector + if (source->size[0] > 1 && source->size[1] > 1) + return ERROR_GENERIC; + + if (source->is_trans) + return ERROR_TRANSPOSED; + + if (!source->on_device) + return ERROR_NOT_ON_DEVICE; + + if (first_ind >= last_ind) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + int num_rows = source->size[0]; + + target->data_host = 0; + target->data_device = source->data_device + first_ind * num_rows; + target->on_device = 1; + target->on_host = 0; + target->is_trans = 0; + target->owns_data = 0; + + if (source->size[0] > 1) { + if (last_ind > source->size[0]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + target->size[0] = last_ind - first_ind; + target->size[1] = 1; + } else { + if (last_ind > source->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + target->size[0] = 1; + target->size[1] = last_ind - first_ind; + } + + return 0; +} + +/* ------------------------------ Initialization routines ------------------------------ */ + +EXPORT void init_from_array(cudamat* mat, double* data, int m, int n) { + mat->data_host = data; + mat->size[0] = m; + mat->size[1] = n; + mat->on_device = 0; + mat->on_host = 1; + mat->is_trans = 0; + mat->owns_data = 1; +} + +EXPORT int init_empty(cudamat* mat, int m, int n) { + mat->size[0] = m; + mat->size[1] = n; + mat->on_device = 0; + mat->on_host = 0; + mat->is_trans = 0; + mat->owns_data = 1; + + return allocate_device_memory(mat); +} + +/* ------------------------------ Random number generation ------------------------------ */ +EXPORT int fill_with_rand(rnd_struct* rnd_state, cudamat* mat) { + int len = mat->size[0] * mat->size[1]; + + if (!mat->on_device) + return ERROR_NOT_ON_DEVICE; + + kRandomUniform<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK>>>(rnd_state->dev_mults, rnd_state->dev_words, mat->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + else + return 0; +} + +EXPORT int fill_with_randn(rnd_struct* rnd_state, cudamat* mat) { + int len = mat->size[0] * mat->size[1]; + + if (!mat->on_device) + return ERROR_NOT_ON_DEVICE; + + kRandomGaussian<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK>>>(rnd_state->dev_mults, rnd_state->dev_words, mat->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + else + return 0; +} +/* ------------------------------ Algebraic operations ------------------------------ */ + +EXPORT int add_col_vec(cudamat* mat, cudamat* vec, cudamat* target) { + unsigned int h = mat->size[0], + w = mat->size[1]; + + if (!mat->on_device || !vec->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans) + return ERROR_TRANSPOSED; + + if (mat->size[0] != vec->size[0] || vec->size[1] != 1 || + mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kAddColVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) { + return CUDA_ERROR; + } + + return 0; +} + +EXPORT int add_col_mult(cudamat* mat, cudamat* vec, cudamat* target, double mult) { + unsigned int h = mat->size[0], + w = mat->size[1]; + + if (!mat->on_device || !vec->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans) + return ERROR_TRANSPOSED; + + if (mat->size[0] != vec->size[0] || vec->size[1] != 1 || + mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kAddColMult<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, mult, w, h); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int add_row_vec(cudamat* mat, cudamat* vec, cudamat* target) { + unsigned int h = mat->size[0], + w = mat->size[1]; + + if (!mat->on_device || !vec->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans) + return ERROR_TRANSPOSED; + + if (mat->size[1] != vec->size[1] || vec->size[0] != 1 || + mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kAddRowVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int mult_by_col_vec(cudamat* mat, cudamat* vec, cudamat* target) { + unsigned int h = mat->size[0], + w = mat->size[1]; + + if (!mat->on_device || !vec->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans) + return ERROR_TRANSPOSED; + + if (mat->size[0] != vec->size[0] || vec->size[1] != 1 || + mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kMultByColVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int mult_by_row_vec(cudamat* mat, cudamat* vec, cudamat* target) { + unsigned int h = mat->size[0], + w = mat->size[1]; + + if (!mat->on_device || !vec->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans) + return ERROR_TRANSPOSED; + + if (mat->size[1] != vec->size[1] || vec->size[0] != 1 || + mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kMultByRowVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int divide_by_col_vec(cudamat* mat, cudamat* vec, cudamat* target) { + unsigned int h = mat->size[0], + w = mat->size[1]; + + if (!mat->on_device || !vec->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans) + return ERROR_TRANSPOSED; + + if (mat->size[0] != vec->size[0] || vec->size[1] != 1 || + mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kDivByColVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int divide_by_row_vec(cudamat* mat, cudamat* vec, cudamat* target) { + unsigned int h = mat->size[0], + w = mat->size[1]; + + if (!mat->on_device || !vec->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans) + return ERROR_TRANSPOSED; + + if (mat->size[1] != vec->size[1] || vec->size[0] != 1 || + mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kDivByRowVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int less_than(cudamat* mat1, cudamat* mat2, cudamat* target) { + int len = mat1->size[0]*mat1->size[1]; + + if (!mat1->on_device || !mat2->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat1->is_trans != mat2->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || + mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kLessThan<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int less_than_scalar(cudamat* mat, double val, cudamat* target) { + int len = mat->size[0]*mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans != target->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kLessThanScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int greater_than(cudamat* mat1, cudamat* mat2, cudamat* target) { + int len = mat1->size[0]*mat1->size[1]; + + if (!mat1->on_device || !mat2->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat1->is_trans != mat2->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || + mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kGreaterThan<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int greater_than_scalar(cudamat* mat, double val, cudamat* target) { + int len = mat->size[0]*mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans != target->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kGreaterThanScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int equals(cudamat* mat1, cudamat* mat2, cudamat* target) { + int len = mat1->size[0]*mat1->size[1]; + + if (!mat1->on_device || !mat2->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat1->is_trans != mat2->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || + mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kEquals<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int equals_scalar(cudamat* mat, double val, cudamat* target) { + int len = mat->size[0]*mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans != target->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kEqualsScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int minimum(cudamat* mat1, cudamat* mat2, cudamat* target) { + int len = mat1->size[0]*mat1->size[1]; + + if (!mat1->on_device || !mat2->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat1->is_trans != mat2->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || + mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kMinimum<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); + + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int minimum_scalar(cudamat* mat, double val, cudamat* target) { + int len = mat->size[0]*mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans != target->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kMinimumScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len); + + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int maximum(cudamat* mat1, cudamat* mat2, cudamat* target) { + int len = mat1->size[0]*mat1->size[1]; + + if (!mat1->on_device || !mat2->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat1->is_trans != mat2->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || + mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kMaximum<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); + + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int maximum_scalar(cudamat* mat, double val, cudamat* target) { + int len = mat->size[0]*mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans != target->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kMaximumScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len); + + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int min_by_axis(cudamat* mat, cudamat* target, int axis) { + unsigned int h = mat->size[0], + w = mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans) + return ERROR_TRANSPOSED; + + if (axis == 0) { + if (target->size[0] != 1 || target->size[1] != mat->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kMinColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h); + } else { + if (target->size[1] != 1 || target->size[0] != mat->size[0]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kMinRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h); + } + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int max_by_axis(cudamat* mat, cudamat* target, int axis) { + unsigned int h = mat->size[0], + w = mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans) + return ERROR_TRANSPOSED; + + if (axis == 0) { + if (target->size[0] != 1 || target->size[1] != mat->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kMaxColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h); + } else { + if (target->size[1] != 1 || target->size[0] != mat->size[0]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kMaxRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h); + } + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int argmin_by_axis(cudamat* mat, cudamat* target, int axis) { + unsigned int h = mat->size[0], + w = mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans) + return ERROR_TRANSPOSED; + + if (axis == 0) { + if (target->size[0] != 1 || target->size[1] != mat->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kArgMinColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h); + } else { + if (target->size[1] != 1 || target->size[0] != mat->size[0]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kArgMinRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h); + } + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int argmax_by_axis(cudamat* mat, cudamat* target, int axis) { + unsigned int h = mat->size[0], + w = mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans) + return ERROR_TRANSPOSED; + + if (axis == 0) { + if (target->size[0] != 1 || target->size[1] != mat->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kArgMaxColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h); + } else { + if (target->size[1] != 1 || target->size[0] != mat->size[0]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kArgMaxRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h); + } + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int sign(cudamat* mat, cudamat* target) { + int len = mat->size[0]*mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->is_trans != target->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kSign<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int apply_sigmoid(cudamat* mat, cudamat* target) { + unsigned int len = mat->size[0] * mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kApplySigmoid<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int apply_tanh(cudamat* mat, cudamat* target) { + unsigned int len = mat->size[0] * mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kApplyTanh<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int apply_soft_threshold(cudamat* mat, double alpha, cudamat* target) { + unsigned int len = mat->size[0] * mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kApplySoftThreshold<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, target->data_device, len); + + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int apply_abs(cudamat* mat, cudamat* target) { + unsigned int len = mat->size[0] * mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kApplyAbs<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int apply_log_1_plus_exp(cudamat* mat, cudamat* target) { + unsigned int len = mat->size[0] * mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kApplyLog1PlusExp<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int apply_log(cudamat* mat, cudamat* target) { + unsigned int len = mat->size[0] * mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kLog<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int apply_exp(cudamat* mat, cudamat* target) { + unsigned int len = mat->size[0] * mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kExp<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int apply_gamma(cudamat* mat, cudamat* target) { + unsigned int len = mat->size[0] * mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kGamma<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int apply_lgamma(cudamat* mat, cudamat* target) { + unsigned int len = mat->size[0] * mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kLogGamma<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int apply_sqrt(cudamat* mat, cudamat* target) { + unsigned int len = mat->size[0] * mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kSqrt<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int apply_pow(cudamat* mat, double pow, cudamat* target) { + unsigned int len = mat->size[0] * mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kPow<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, pow, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int apply_pow_matrix(cudamat* mat, cudamat* pow, cudamat* target) { + unsigned int len = mat->size[0] * mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + if (mat->size[0] != pow->size[0] || mat->size[1] != pow->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kPowMatrix<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, pow->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int reciprocal(cudamat* mat, cudamat* target) { + unsigned int len = mat->size[0] * mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kReciprocal<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int dot(cudamat* mat1, cudamat* mat2, cudamat* target, double beta, double alpha) { + if (!mat1->on_device || !mat2->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (get_leading_dimension(mat1) != get_leading_dimension(target) || + get_nonleading_dimension(mat2) != get_nonleading_dimension(target) || + get_nonleading_dimension(mat1) != get_leading_dimension(mat2)) { + return ERROR_INCOMPATIBLE_DIMENSIONS; + } + int m = get_leading_dimension(mat1), + k = get_leading_dimension(mat2), + n = get_nonleading_dimension(mat2); + + // gemv if second matrix is a (column) vector + if (n == 1) { + cublasDgemv(get_transpose_char(mat1), mat1->size[0], mat1->size[1], + alpha, mat1->data_device, mat1->size[0], + mat2->data_device, 1, + beta, target->data_device, 1); + } + // gemv if first matrix is a (row) vector + else if (m == 1) { + cublasDgemv(mat2->is_trans ? 'n' : 't', mat2->size[0], mat2->size[1], + alpha, mat2->data_device, mat2->size[0], + mat1->data_device, 1, + beta, target->data_device, 1); + } + // gemm otherwise + else { + cublasDgemm(get_transpose_char(mat1), get_transpose_char(mat2), + m, n, k, + alpha, mat1->data_device, mat1->size[0], + mat2->data_device, mat2->size[0], + beta, target->data_device, target->size[0]); + } + + if (check_cublas_error()) + return CUBLAS_ERROR; + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + return 0; +} + +EXPORT double vdot(cudamat* mat1, cudamat* mat2, int* err_code) { + int len = mat1->size[0]*mat1->size[1]; + double res; + + if (!mat1->on_device || !mat2->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat1->is_trans != mat2->is_trans) { + *err_code = ERROR_TRANSPOSEDNESS; + return 0; + } + + if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1]) { + *err_code = ERROR_INCOMPATIBLE_DIMENSIONS; + return 0; + } + + res = cublasDdot(len, mat1->data_device, 1, mat2->data_device, 1); + + if (check_cublas_error()) { + *err_code = CUBLAS_ERROR; + return -1.; + } else { + *err_code = 0; + return res; + } +} + +/* Perform the operation mat1 = mat1 + alpha * mat2. mat1 and mat2 must + have the same transposedness. */ +EXPORT int add_mult(cudamat* mat1, cudamat* mat2, double alpha) { + int len = mat1->size[0]*mat1->size[1]; + + if (!mat1->on_device || !mat2->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat1->is_trans != mat2->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + cublasDaxpy(len, alpha, mat2->data_device, 1, mat1->data_device, 1); + + if (check_cublas_error()) + return CUBLAS_ERROR; + + return 0; +} + +EXPORT int add_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) { + int len = mat1->size[0]*mat1->size[1]; + + if (!mat1->on_device || !mat2->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat1->is_trans != mat2->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || + mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + if (mat1 == target) { + cublasDaxpy(len, 1, mat2->data_device, 1, mat1->data_device, 1); + + if (check_cublas_error()) + return CUBLAS_ERROR; + + } else { + kAdd<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + } + + return 0; +} + +EXPORT int subtract_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) { + int len = mat1->size[0]*mat1->size[1]; + + if (!mat1->on_device || !mat2->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat1->is_trans != mat2->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || + mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kSubtract<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int divide_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) { + int len = mat1->size[0]*mat1->size[1]; + + if (!mat1->on_device || !mat2->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat1->is_trans != mat2->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || + mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kDivide<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +/* Elementwise multiplication of 2 matrices */ +EXPORT int mult_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) { + int len = mat1->size[0]*mat1->size[1]; + + if (!mat1->on_device || !mat2->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat1->is_trans != mat2->is_trans) + return ERROR_TRANSPOSEDNESS; + + if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || + mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kMult<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int assign_scalar(cudamat* mat, double alpha) { + int len = mat->size[0]*mat->size[1]; + + if (!mat->on_device) + return ERROR_NOT_ON_DEVICE; + + kAssignScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int mult_by_scalar(cudamat* mat, double alpha, cudamat* target) { + int len = mat->size[0]*mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + if (mat == target) { + cublasDscal(len, alpha, mat->data_device, 1); + + if (check_cublas_error()) + return CUBLAS_ERROR; + + } else { + kMultScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + } + + return 0; +} + +EXPORT int divide_by_scalar(cudamat* mat, double alpha, cudamat* target) { + int len = mat->size[0]*mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kDivideScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int add_scalar(cudamat* mat, double alpha, cudamat* target) { + int len = mat->size[0]*mat->size[1]; + + if (!mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kAddScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT double euclid_norm(cudamat* mat, int* err_code) { + int len = mat->size[0]*mat->size[1]; + + if (!mat->on_device) { + *err_code = ERROR_NOT_ON_DEVICE; + return -1.; + } + + double res = cublasDnrm2(len, mat->data_device, 1); + + if (check_cublas_error()) { + *err_code = CUBLAS_ERROR; + return -1.; + } else { + *err_code = 0; + return res; + } +} + +EXPORT double manhattan_norm(cudamat* mat, int* err_code) { + int len = mat->size[0]*mat->size[1]; + + if (!mat->on_device) { + *err_code = ERROR_NOT_ON_DEVICE; + return -1.; + } + + double res = cublasDasum(len, mat->data_device, 1); + + if (check_cublas_error()) { + *err_code = CUBLAS_ERROR; + return -1.; + } else { + *err_code = 0; + return res; + } +} + +EXPORT int selectRows(cudamat* source, cudamat* target, cudamat* indices){ + const int nRetRows = indices->size[1]; + + if (nRetRows==0) return 0; + + dim3 gridDim((nRetRows+31)/32); + dim3 blockDim(32); + + kSelectRows<<<gridDim, blockDim>>>(source->data_device, target->data_device, indices->data_device, nRetRows, source->size[0], source->size[1]); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + else + return 0; +} + +EXPORT int setSelectedRows(cudamat* target, cudamat* source, cudamat* indices){ + const int nSetRows = indices->size[1]; + + if (nSetRows==0) + return 0; + + dim3 gridDim((nSetRows+31)/32); + dim3 blockDim(32); + + kSetSelectedRows<<<gridDim, blockDim>>>(target->data_device, source->data_device, indices->data_device, nSetRows, target->size[0], target->size[1]); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + else + return 0; +} + +EXPORT int where(cudamat* condition_mat, cudamat* if_mat, cudamat* else_mat, cudamat* target) { + unsigned int len = condition_mat->size[0] * condition_mat->size[1]; + + if (!condition_mat->on_device || !target->on_device) + return ERROR_NOT_ON_DEVICE; + + if (condition_mat->size[0] != target->size[0] || condition_mat->size[1] != target->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + if (condition_mat->size[0] != if_mat->size[0] || condition_mat->size[1] != if_mat->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + if (condition_mat->size[0] != else_mat->size[0] || condition_mat->size[1] != else_mat->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kWhere<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(condition_mat->data_device, + if_mat->data_device, else_mat->data_device, target->data_device, len); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +EXPORT int correlate(cudamat* source, cudamat* kernel, cudamat* dest) { + int len = source->size[0] * source->size[1]; + + if (!source->on_device || !kernel->on_device || !dest->on_device) + return ERROR_NOT_ON_DEVICE; + + if (source->size[0] != dest->size[0] || source->size[1] != dest->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + if (kernel->size[0] % 2 == 0 || kernel->size[1] % 2 == 0 || + kernel->size[0] > source->size[0] || kernel->size[1] > source->size[1]) + return ERROR_INCOMPATIBLE_DIMENSIONS; + + kCorrelate<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(source->data_device, + kernel->data_device, dest->data_device, source->size[1], source->size[0], + kernel->size[1], kernel->size[0]); + + if (SYNC_THREADS) + cudaThreadSynchronize(); + + if (checkCUDAError()) + return CUDA_ERROR; + + return 0; +} + +} |