diff options
Diffstat (limited to 'ot/gpu/cudamat/cudamat/cudamat.cu')
-rw-r--r-- | ot/gpu/cudamat/cudamat/cudamat.cu | 1633 |
1 files changed, 0 insertions, 1633 deletions
diff --git a/ot/gpu/cudamat/cudamat/cudamat.cu b/ot/gpu/cudamat/cudamat/cudamat.cu deleted file mode 100644 index 522f9cc..0000000 --- a/ot/gpu/cudamat/cudamat/cudamat.cu +++ /dev/null @@ -1,1633 +0,0 @@ -#include <stdio.h> -#include <errno.h> -#include <string.h> -#include <stdlib.h> -#include <cublas.h> -#include "cudamat_kernels.cuh" -#include "cudamat.cuh" - -extern "C" { - -/* ------------------------------ CUBLAS init/shutdown ------------------------------ */ - -inline bool check_cublas_error() { - cublasStatus status = cublasGetError(); - - return status != CUBLAS_STATUS_SUCCESS; -} - -inline bool checkCUDAError() { - cudaError_t err = cudaGetLastError(); - - if (cudaSuccess != err) - printf("%s\n", cudaGetErrorString( err)); - return cudaSuccess != err; -} - -EXPORT const char* get_last_cuda_error() { - cudaError_t err = cudaGetLastError(); - - return cudaGetErrorString( err); -} - -EXPORT const char* get_last_clib_error() { - return strerror(errno); -} - -EXPORT int cublas_init() { - cublasInit(); - if (check_cublas_error()) - return CUBLAS_ERROR; - else - return 0; -} - -EXPORT int cublas_shutdown() { - cublasShutdown(); - cudaThreadExit(); - - return 0; -} - - -EXPORT int cuda_set_device(int deviceId) { - cudaSetDevice(deviceId); - - if (checkCUDAError()) - return CUDA_ERROR; - else - return 0; -} - -EXPORT int init_random(rnd_struct* rnd_state, int seed, char* cudamatpath) { - unsigned int * host_mults; - host_mults = (unsigned int*)malloc(NUM_RND_STREAMS * sizeof(unsigned int)); - FILE * pFile; - - pFile = fopen (cudamatpath,"r"); - if (pFile == NULL) { - return ERROR_FILE_OPEN; - } - - for (int i = 0; i < NUM_RND_STREAMS; i++) { - if (fscanf (pFile, "%u", &host_mults[i]) != 1) { - return ERROR_FILE_SCAN; - } - } - fclose (pFile); - - cublasAlloc(NUM_RND_STREAMS, sizeof(unsigned int), (void**)&rnd_state->dev_mults); - cublasAlloc(NUM_RND_STREAMS, sizeof(unsigned long long), (void**)&rnd_state->dev_words); - cublasSetVector(NUM_RND_STREAMS, sizeof(unsigned int), host_mults, 1, rnd_state->dev_mults, 1); - //cudaMalloc((void **)&rnd_state->dev_mults, NUM_RND_STREAMS * sizeof(unsigned int)); - //cudaMalloc((void **)&rnd_state->dev_words, NUM_RND_STREAMS * sizeof(unsigned long long)); - //cudaMemcpy(rnd_state->dev_mults, host_mults, NUM_RND_STREAMS * sizeof(unsigned int), cudaMemcpyHostToDevice); - cudaThreadSynchronize(); - - kSeedRandom<<<NUM_RND_BLOCKS, NUM_RND_THREADS_PER_BLOCK>>>(rnd_state->dev_mults, rnd_state->dev_words, seed); - - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - else - return 0; -} - -/* ------------------------------ Utility routines ------------------------------ */ - -EXPORT int get_leading_dimension(cudamat* mat) { - return mat->is_trans ? mat->size[1] : mat->size[0]; -} - -EXPORT int get_nonleading_dimension(cudamat* mat) { - return mat->is_trans ? mat->size[0] : mat->size[1]; -} - -EXPORT void set_transpose(cudamat* mat, int is_trans) { - mat->is_trans = is_trans; -} - -inline char get_transpose_char(cudamat* mat) { - return mat->is_trans ? 't' : 'n'; -} - -EXPORT void cuda_sync_threads() { - cudaThreadSynchronize(); -} - -/* ------------------------------ Allocating/moving data ------------------------------ */ - -EXPORT int allocate_device_memory(cudamat* mat) { - int len = mat->size[0]*mat->size[1]; - - cublasStatus stat; - - stat = cublasAlloc(len, sizeof(mat->data_device[0]), (void**)&mat->data_device); - - if (stat != CUBLAS_STATUS_SUCCESS || check_cublas_error()) { - checkCUDAError(); - return CUBLAS_ERROR; - } - - mat->on_device = 1; - return 0; -} - -EXPORT int copy_to_host(cudamat* mat) { - int len = mat->size[0]*mat->size[1]; - - if (mat->on_device) { - cublasGetVector(len, sizeof(mat->data_host[0]), mat->data_device, 1, mat->data_host, 1); - - if (check_cublas_error()) - return CUBLAS_ERROR; - } else - return ERROR_NOT_ON_DEVICE; - - return 0; -} - -EXPORT int copy_to_device(cudamat* mat) { - int len = mat->size[0]*mat->size[1]; - int err_code = 0; - - //if (!mat->owns_data) - // return VIEW_ERROR; - - if (!mat->on_device) { - err_code = allocate_device_memory(mat); - if (err_code) - return err_code; - } - - cublasSetVector(len, sizeof(mat->data_host[0]), mat->data_host, 1, mat->data_device, 1); - - if (check_cublas_error()) - return CUBLAS_ERROR; - - return 0; -} - -EXPORT int copy_on_device(cudamat* mat1, cudamat* mat2) { - int len = mat1->size[0]*mat1->size[1]; - - if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - cublasDcopy(len, mat1->data_device, 1, mat2->data_device, 1); - - if (check_cublas_error()) - return CUBLAS_ERROR; - else - return 0; -} - -EXPORT int get_row_slice(cudamat* source, cudamat* target, unsigned int start, unsigned int end) { - int height = source->size[0]; - int width = source->size[1]; - - if ((end - start) != target->size[0] || source->size[1] != target->size[1] || start >= end || end > height) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - dim3 kernelBlockGrid((int)ceil((end - start)/32.), (int)ceil(width/32.), 1); - dim3 kernelBlockDim(32, 1, 1); - - kGetRowSlice<<<kernelBlockGrid,kernelBlockDim>>>(source->data_device, target->data_device, start, end, width, height); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - else - return 0; -} - -EXPORT int set_row_slice(cudamat* source, cudamat* target, unsigned int start, unsigned int end) { - int height = target->size[0]; - int width = target->size[1]; - - if ((end - start) != source->size[0] || source->size[1] != target->size[1] || start >= end || end > height) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - dim3 kernelBlockGrid((int)ceil((end - start)/32.), (int)ceil(width/32.), 1); - dim3 kernelBlockDim(32, 1, 1); - - kSetRowSlice<<<kernelBlockGrid,kernelBlockDim>>>(source->data_device, target->data_device, start, end, width, height); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - else - return 0; -} - -EXPORT int copy_transpose(cudamat* source, cudamat* target) { - unsigned int height = source->size[0]; - unsigned int width = source->size[1]; - - if (source->size[0] != target->size[1] || source->size[1] != target->size[0]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - // setup execution parameters - unsigned int grid_x = height / COPY_BLOCK_SIZE; - if (height % COPY_BLOCK_SIZE) - grid_x++; - - unsigned int grid_y = width / COPY_BLOCK_SIZE; - if (width % COPY_BLOCK_SIZE) - grid_y++; - - dim3 grid(grid_x, grid_y, 1); - dim3 threads(COPY_BLOCK_SIZE, COPY_BLOCK_SIZE, 1); - - kTranspose<<< grid, threads >>>(target->data_device, source->data_device, height, width); - - if (checkCUDAError()) - return CUDA_ERROR; - else - return 0; -} - -EXPORT int free_device_memory(cudamat* mat) { - if (mat->owns_data && mat->on_device) { - cublasStatus stat; - - stat = cublasFree(mat->data_device); - mat->on_device = 0; - - if (stat != CUBLAS_STATUS_SUCCESS || check_cublas_error()) - return CUBLAS_ERROR; - } - - return 0; -} - -EXPORT int reshape(cudamat* mat, unsigned int m, unsigned int n) { - if (mat->size[0] * mat->size[1] != m * n) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - mat->size[0] = m; - mat->size[1] = n; - - return 0; -} - -EXPORT int get_slice(cudamat* source, cudamat* target, unsigned int first_col, unsigned int last_col) { - if (source->is_trans) - return ERROR_TRANSPOSED; - - if (!source->on_device) - return ERROR_NOT_ON_DEVICE; - - if (last_col > source->size[1] || (first_col >= last_col)) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - int num_rows = source->size[0]; - - target->data_host = 0; - target->data_device = source->data_device + first_col * num_rows; - target->on_device = 1; - target->on_host = 0; - target->size[0] = source->size[0]; - target->size[1] = last_col - first_col; - target->is_trans = 0; - target->owns_data = 0; - - return 0; -} - -EXPORT int get_vector_slice(cudamat* source, cudamat* target, unsigned int first_ind, unsigned int last_ind) { - // source must be a vector - if (source->size[0] > 1 && source->size[1] > 1) - return ERROR_GENERIC; - - if (source->is_trans) - return ERROR_TRANSPOSED; - - if (!source->on_device) - return ERROR_NOT_ON_DEVICE; - - if (first_ind >= last_ind) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - int num_rows = source->size[0]; - - target->data_host = 0; - target->data_device = source->data_device + first_ind * num_rows; - target->on_device = 1; - target->on_host = 0; - target->is_trans = 0; - target->owns_data = 0; - - if (source->size[0] > 1) { - if (last_ind > source->size[0]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - target->size[0] = last_ind - first_ind; - target->size[1] = 1; - } else { - if (last_ind > source->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - target->size[0] = 1; - target->size[1] = last_ind - first_ind; - } - - return 0; -} - -/* ------------------------------ Initialization routines ------------------------------ */ - -EXPORT void init_from_array(cudamat* mat, double* data, int m, int n) { - mat->data_host = data; - mat->size[0] = m; - mat->size[1] = n; - mat->on_device = 0; - mat->on_host = 1; - mat->is_trans = 0; - mat->owns_data = 1; -} - -EXPORT int init_empty(cudamat* mat, int m, int n) { - mat->size[0] = m; - mat->size[1] = n; - mat->on_device = 0; - mat->on_host = 0; - mat->is_trans = 0; - mat->owns_data = 1; - - return allocate_device_memory(mat); -} - -/* ------------------------------ Random number generation ------------------------------ */ -EXPORT int fill_with_rand(rnd_struct* rnd_state, cudamat* mat) { - int len = mat->size[0] * mat->size[1]; - - if (!mat->on_device) - return ERROR_NOT_ON_DEVICE; - - kRandomUniform<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK>>>(rnd_state->dev_mults, rnd_state->dev_words, mat->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - else - return 0; -} - -EXPORT int fill_with_randn(rnd_struct* rnd_state, cudamat* mat) { - int len = mat->size[0] * mat->size[1]; - - if (!mat->on_device) - return ERROR_NOT_ON_DEVICE; - - kRandomGaussian<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK>>>(rnd_state->dev_mults, rnd_state->dev_words, mat->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - else - return 0; -} -/* ------------------------------ Algebraic operations ------------------------------ */ - -EXPORT int add_col_vec(cudamat* mat, cudamat* vec, cudamat* target) { - unsigned int h = mat->size[0], - w = mat->size[1]; - - if (!mat->on_device || !vec->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans) - return ERROR_TRANSPOSED; - - if (mat->size[0] != vec->size[0] || vec->size[1] != 1 || - mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kAddColVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) { - return CUDA_ERROR; - } - - return 0; -} - -EXPORT int add_col_mult(cudamat* mat, cudamat* vec, cudamat* target, double mult) { - unsigned int h = mat->size[0], - w = mat->size[1]; - - if (!mat->on_device || !vec->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans) - return ERROR_TRANSPOSED; - - if (mat->size[0] != vec->size[0] || vec->size[1] != 1 || - mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kAddColMult<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, mult, w, h); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int add_row_vec(cudamat* mat, cudamat* vec, cudamat* target) { - unsigned int h = mat->size[0], - w = mat->size[1]; - - if (!mat->on_device || !vec->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans) - return ERROR_TRANSPOSED; - - if (mat->size[1] != vec->size[1] || vec->size[0] != 1 || - mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kAddRowVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int mult_by_col_vec(cudamat* mat, cudamat* vec, cudamat* target) { - unsigned int h = mat->size[0], - w = mat->size[1]; - - if (!mat->on_device || !vec->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans) - return ERROR_TRANSPOSED; - - if (mat->size[0] != vec->size[0] || vec->size[1] != 1 || - mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kMultByColVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int mult_by_row_vec(cudamat* mat, cudamat* vec, cudamat* target) { - unsigned int h = mat->size[0], - w = mat->size[1]; - - if (!mat->on_device || !vec->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans) - return ERROR_TRANSPOSED; - - if (mat->size[1] != vec->size[1] || vec->size[0] != 1 || - mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kMultByRowVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int divide_by_col_vec(cudamat* mat, cudamat* vec, cudamat* target) { - unsigned int h = mat->size[0], - w = mat->size[1]; - - if (!mat->on_device || !vec->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans) - return ERROR_TRANSPOSED; - - if (mat->size[0] != vec->size[0] || vec->size[1] != 1 || - mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kDivByColVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int divide_by_row_vec(cudamat* mat, cudamat* vec, cudamat* target) { - unsigned int h = mat->size[0], - w = mat->size[1]; - - if (!mat->on_device || !vec->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans) - return ERROR_TRANSPOSED; - - if (mat->size[1] != vec->size[1] || vec->size[0] != 1 || - mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kDivByRowVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int less_than(cudamat* mat1, cudamat* mat2, cudamat* target) { - int len = mat1->size[0]*mat1->size[1]; - - if (!mat1->on_device || !mat2->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat1->is_trans != mat2->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || - mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kLessThan<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int less_than_scalar(cudamat* mat, double val, cudamat* target) { - int len = mat->size[0]*mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans != target->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kLessThanScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int greater_than(cudamat* mat1, cudamat* mat2, cudamat* target) { - int len = mat1->size[0]*mat1->size[1]; - - if (!mat1->on_device || !mat2->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat1->is_trans != mat2->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || - mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kGreaterThan<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int greater_than_scalar(cudamat* mat, double val, cudamat* target) { - int len = mat->size[0]*mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans != target->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kGreaterThanScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int equals(cudamat* mat1, cudamat* mat2, cudamat* target) { - int len = mat1->size[0]*mat1->size[1]; - - if (!mat1->on_device || !mat2->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat1->is_trans != mat2->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || - mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kEquals<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int equals_scalar(cudamat* mat, double val, cudamat* target) { - int len = mat->size[0]*mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans != target->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kEqualsScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int minimum(cudamat* mat1, cudamat* mat2, cudamat* target) { - int len = mat1->size[0]*mat1->size[1]; - - if (!mat1->on_device || !mat2->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat1->is_trans != mat2->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || - mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kMinimum<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); - - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int minimum_scalar(cudamat* mat, double val, cudamat* target) { - int len = mat->size[0]*mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans != target->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kMinimumScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len); - - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int maximum(cudamat* mat1, cudamat* mat2, cudamat* target) { - int len = mat1->size[0]*mat1->size[1]; - - if (!mat1->on_device || !mat2->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat1->is_trans != mat2->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || - mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kMaximum<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); - - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int maximum_scalar(cudamat* mat, double val, cudamat* target) { - int len = mat->size[0]*mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans != target->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kMaximumScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len); - - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int min_by_axis(cudamat* mat, cudamat* target, int axis) { - unsigned int h = mat->size[0], - w = mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans) - return ERROR_TRANSPOSED; - - if (axis == 0) { - if (target->size[0] != 1 || target->size[1] != mat->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kMinColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h); - } else { - if (target->size[1] != 1 || target->size[0] != mat->size[0]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kMinRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h); - } - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int max_by_axis(cudamat* mat, cudamat* target, int axis) { - unsigned int h = mat->size[0], - w = mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans) - return ERROR_TRANSPOSED; - - if (axis == 0) { - if (target->size[0] != 1 || target->size[1] != mat->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kMaxColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h); - } else { - if (target->size[1] != 1 || target->size[0] != mat->size[0]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kMaxRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h); - } - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int argmin_by_axis(cudamat* mat, cudamat* target, int axis) { - unsigned int h = mat->size[0], - w = mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans) - return ERROR_TRANSPOSED; - - if (axis == 0) { - if (target->size[0] != 1 || target->size[1] != mat->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kArgMinColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h); - } else { - if (target->size[1] != 1 || target->size[0] != mat->size[0]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kArgMinRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h); - } - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int argmax_by_axis(cudamat* mat, cudamat* target, int axis) { - unsigned int h = mat->size[0], - w = mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans) - return ERROR_TRANSPOSED; - - if (axis == 0) { - if (target->size[0] != 1 || target->size[1] != mat->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kArgMaxColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h); - } else { - if (target->size[1] != 1 || target->size[0] != mat->size[0]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kArgMaxRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h); - } - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int sign(cudamat* mat, cudamat* target) { - int len = mat->size[0]*mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->is_trans != target->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kSign<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int apply_sigmoid(cudamat* mat, cudamat* target) { - unsigned int len = mat->size[0] * mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kApplySigmoid<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int apply_tanh(cudamat* mat, cudamat* target) { - unsigned int len = mat->size[0] * mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kApplyTanh<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int apply_soft_threshold(cudamat* mat, double alpha, cudamat* target) { - unsigned int len = mat->size[0] * mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kApplySoftThreshold<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, target->data_device, len); - - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int apply_abs(cudamat* mat, cudamat* target) { - unsigned int len = mat->size[0] * mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kApplyAbs<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int apply_log_1_plus_exp(cudamat* mat, cudamat* target) { - unsigned int len = mat->size[0] * mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kApplyLog1PlusExp<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int apply_log(cudamat* mat, cudamat* target) { - unsigned int len = mat->size[0] * mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kLog<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int apply_exp(cudamat* mat, cudamat* target) { - unsigned int len = mat->size[0] * mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kExp<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int apply_gamma(cudamat* mat, cudamat* target) { - unsigned int len = mat->size[0] * mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kGamma<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int apply_lgamma(cudamat* mat, cudamat* target) { - unsigned int len = mat->size[0] * mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kLogGamma<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int apply_sqrt(cudamat* mat, cudamat* target) { - unsigned int len = mat->size[0] * mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kSqrt<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int apply_pow(cudamat* mat, double pow, cudamat* target) { - unsigned int len = mat->size[0] * mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kPow<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, pow, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int apply_pow_matrix(cudamat* mat, cudamat* pow, cudamat* target) { - unsigned int len = mat->size[0] * mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - if (mat->size[0] != pow->size[0] || mat->size[1] != pow->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kPowMatrix<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, pow->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int reciprocal(cudamat* mat, cudamat* target) { - unsigned int len = mat->size[0] * mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kReciprocal<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int dot(cudamat* mat1, cudamat* mat2, cudamat* target, double beta, double alpha) { - if (!mat1->on_device || !mat2->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (get_leading_dimension(mat1) != get_leading_dimension(target) || - get_nonleading_dimension(mat2) != get_nonleading_dimension(target) || - get_nonleading_dimension(mat1) != get_leading_dimension(mat2)) { - return ERROR_INCOMPATIBLE_DIMENSIONS; - } - int m = get_leading_dimension(mat1), - k = get_leading_dimension(mat2), - n = get_nonleading_dimension(mat2); - - // gemv if second matrix is a (column) vector - if (n == 1) { - cublasDgemv(get_transpose_char(mat1), mat1->size[0], mat1->size[1], - alpha, mat1->data_device, mat1->size[0], - mat2->data_device, 1, - beta, target->data_device, 1); - } - // gemv if first matrix is a (row) vector - else if (m == 1) { - cublasDgemv(mat2->is_trans ? 'n' : 't', mat2->size[0], mat2->size[1], - alpha, mat2->data_device, mat2->size[0], - mat1->data_device, 1, - beta, target->data_device, 1); - } - // gemm otherwise - else { - cublasDgemm(get_transpose_char(mat1), get_transpose_char(mat2), - m, n, k, - alpha, mat1->data_device, mat1->size[0], - mat2->data_device, mat2->size[0], - beta, target->data_device, target->size[0]); - } - - if (check_cublas_error()) - return CUBLAS_ERROR; - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - return 0; -} - -EXPORT double vdot(cudamat* mat1, cudamat* mat2, int* err_code) { - int len = mat1->size[0]*mat1->size[1]; - double res; - - if (!mat1->on_device || !mat2->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat1->is_trans != mat2->is_trans) { - *err_code = ERROR_TRANSPOSEDNESS; - return 0; - } - - if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1]) { - *err_code = ERROR_INCOMPATIBLE_DIMENSIONS; - return 0; - } - - res = cublasDdot(len, mat1->data_device, 1, mat2->data_device, 1); - - if (check_cublas_error()) { - *err_code = CUBLAS_ERROR; - return -1.; - } else { - *err_code = 0; - return res; - } -} - -/* Perform the operation mat1 = mat1 + alpha * mat2. mat1 and mat2 must - have the same transposedness. */ -EXPORT int add_mult(cudamat* mat1, cudamat* mat2, double alpha) { - int len = mat1->size[0]*mat1->size[1]; - - if (!mat1->on_device || !mat2->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat1->is_trans != mat2->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - cublasDaxpy(len, alpha, mat2->data_device, 1, mat1->data_device, 1); - - if (check_cublas_error()) - return CUBLAS_ERROR; - - return 0; -} - -EXPORT int add_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) { - int len = mat1->size[0]*mat1->size[1]; - - if (!mat1->on_device || !mat2->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat1->is_trans != mat2->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || - mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - if (mat1 == target) { - cublasDaxpy(len, 1, mat2->data_device, 1, mat1->data_device, 1); - - if (check_cublas_error()) - return CUBLAS_ERROR; - - } else { - kAdd<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - } - - return 0; -} - -EXPORT int subtract_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) { - int len = mat1->size[0]*mat1->size[1]; - - if (!mat1->on_device || !mat2->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat1->is_trans != mat2->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || - mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kSubtract<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int divide_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) { - int len = mat1->size[0]*mat1->size[1]; - - if (!mat1->on_device || !mat2->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat1->is_trans != mat2->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || - mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kDivide<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -/* Elementwise multiplication of 2 matrices */ -EXPORT int mult_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) { - int len = mat1->size[0]*mat1->size[1]; - - if (!mat1->on_device || !mat2->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat1->is_trans != mat2->is_trans) - return ERROR_TRANSPOSEDNESS; - - if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] || - mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kMult<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int assign_scalar(cudamat* mat, double alpha) { - int len = mat->size[0]*mat->size[1]; - - if (!mat->on_device) - return ERROR_NOT_ON_DEVICE; - - kAssignScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int mult_by_scalar(cudamat* mat, double alpha, cudamat* target) { - int len = mat->size[0]*mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - if (mat == target) { - cublasDscal(len, alpha, mat->data_device, 1); - - if (check_cublas_error()) - return CUBLAS_ERROR; - - } else { - kMultScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - } - - return 0; -} - -EXPORT int divide_by_scalar(cudamat* mat, double alpha, cudamat* target) { - int len = mat->size[0]*mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kDivideScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int add_scalar(cudamat* mat, double alpha, cudamat* target) { - int len = mat->size[0]*mat->size[1]; - - if (!mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kAddScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT double euclid_norm(cudamat* mat, int* err_code) { - int len = mat->size[0]*mat->size[1]; - - if (!mat->on_device) { - *err_code = ERROR_NOT_ON_DEVICE; - return -1.; - } - - double res = cublasDnrm2(len, mat->data_device, 1); - - if (check_cublas_error()) { - *err_code = CUBLAS_ERROR; - return -1.; - } else { - *err_code = 0; - return res; - } -} - -EXPORT double manhattan_norm(cudamat* mat, int* err_code) { - int len = mat->size[0]*mat->size[1]; - - if (!mat->on_device) { - *err_code = ERROR_NOT_ON_DEVICE; - return -1.; - } - - double res = cublasDasum(len, mat->data_device, 1); - - if (check_cublas_error()) { - *err_code = CUBLAS_ERROR; - return -1.; - } else { - *err_code = 0; - return res; - } -} - -EXPORT int selectRows(cudamat* source, cudamat* target, cudamat* indices){ - const int nRetRows = indices->size[1]; - - if (nRetRows==0) return 0; - - dim3 gridDim((nRetRows+31)/32); - dim3 blockDim(32); - - kSelectRows<<<gridDim, blockDim>>>(source->data_device, target->data_device, indices->data_device, nRetRows, source->size[0], source->size[1]); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - else - return 0; -} - -EXPORT int setSelectedRows(cudamat* target, cudamat* source, cudamat* indices){ - const int nSetRows = indices->size[1]; - - if (nSetRows==0) - return 0; - - dim3 gridDim((nSetRows+31)/32); - dim3 blockDim(32); - - kSetSelectedRows<<<gridDim, blockDim>>>(target->data_device, source->data_device, indices->data_device, nSetRows, target->size[0], target->size[1]); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - else - return 0; -} - -EXPORT int where(cudamat* condition_mat, cudamat* if_mat, cudamat* else_mat, cudamat* target) { - unsigned int len = condition_mat->size[0] * condition_mat->size[1]; - - if (!condition_mat->on_device || !target->on_device) - return ERROR_NOT_ON_DEVICE; - - if (condition_mat->size[0] != target->size[0] || condition_mat->size[1] != target->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - if (condition_mat->size[0] != if_mat->size[0] || condition_mat->size[1] != if_mat->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - if (condition_mat->size[0] != else_mat->size[0] || condition_mat->size[1] != else_mat->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kWhere<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(condition_mat->data_device, - if_mat->data_device, else_mat->data_device, target->data_device, len); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -EXPORT int correlate(cudamat* source, cudamat* kernel, cudamat* dest) { - int len = source->size[0] * source->size[1]; - - if (!source->on_device || !kernel->on_device || !dest->on_device) - return ERROR_NOT_ON_DEVICE; - - if (source->size[0] != dest->size[0] || source->size[1] != dest->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - if (kernel->size[0] % 2 == 0 || kernel->size[1] % 2 == 0 || - kernel->size[0] > source->size[0] || kernel->size[1] > source->size[1]) - return ERROR_INCOMPATIBLE_DIMENSIONS; - - kCorrelate<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(source->data_device, - kernel->data_device, dest->data_device, source->size[1], source->size[0], - kernel->size[1], kernel->size[0]); - - if (SYNC_THREADS) - cudaThreadSynchronize(); - - if (checkCUDAError()) - return CUDA_ERROR; - - return 0; -} - -} |