summaryrefslogtreecommitdiff
path: root/ot/gpu/cudamat/cudamat/cudamat.cu
diff options
context:
space:
mode:
Diffstat (limited to 'ot/gpu/cudamat/cudamat/cudamat.cu')
-rw-r--r--ot/gpu/cudamat/cudamat/cudamat.cu1633
1 files changed, 0 insertions, 1633 deletions
diff --git a/ot/gpu/cudamat/cudamat/cudamat.cu b/ot/gpu/cudamat/cudamat/cudamat.cu
deleted file mode 100644
index 522f9cc..0000000
--- a/ot/gpu/cudamat/cudamat/cudamat.cu
+++ /dev/null
@@ -1,1633 +0,0 @@
-#include <stdio.h>
-#include <errno.h>
-#include <string.h>
-#include <stdlib.h>
-#include <cublas.h>
-#include "cudamat_kernels.cuh"
-#include "cudamat.cuh"
-
-extern "C" {
-
-/* ------------------------------ CUBLAS init/shutdown ------------------------------ */
-
/* Returns true if the most recent cuBLAS call recorded an error status. */
inline bool check_cublas_error() {
    return cublasGetError() != CUBLAS_STATUS_SUCCESS;
}
-
/* Returns true if a CUDA error is pending, printing its description first.
   Note: cudaGetLastError() also clears the pending error. */
inline bool checkCUDAError() {
    cudaError_t err = cudaGetLastError();

    if (err == cudaSuccess)
        return false;

    printf("%s\n", cudaGetErrorString( err));
    return true;
}
-
/* Returns a human-readable description of the most recent CUDA error
   (and clears it, since cudaGetLastError() resets the error state). */
EXPORT const char* get_last_cuda_error() {
    return cudaGetErrorString(cudaGetLastError());
}
-
/* Returns the C library's description of the current errno value
   (used to report fopen/malloc failures to the Python wrapper). */
EXPORT const char* get_last_clib_error() {
    return strerror(errno);
}
-
/* Initializes cuBLAS. Returns 0 on success, CUBLAS_ERROR otherwise. */
EXPORT int cublas_init() {
    cublasInit();
    return check_cublas_error() ? CUBLAS_ERROR : 0;
}
-
/* Shuts down cuBLAS and tears down this thread's CUDA context.
   Always returns 0; errors from the teardown calls are not reported. */
EXPORT int cublas_shutdown() {
    cublasShutdown();
    cudaThreadExit();

    return 0;
}
-
-
/* Selects the CUDA device for the calling host thread.
   Returns 0 on success, CUDA_ERROR otherwise. */
EXPORT int cuda_set_device(int deviceId) {
    cudaSetDevice(deviceId);
    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* Loads NUM_RND_STREAMS multipliers from the text file at cudamatpath,
   uploads them to the device, and seeds the multiply-with-carry RNG.
   Returns 0 on success or ERROR_FILE_OPEN / ERROR_FILE_SCAN /
   ERROR_GENERIC (malloc failure) / CUDA_ERROR.

   Fixes over the original: host_mults was leaked on every path (including
   success), the file was left open on a scan failure, and a failed malloc
   was dereferenced. */
EXPORT int init_random(rnd_struct* rnd_state, int seed, char* cudamatpath) {
    unsigned int* host_mults = (unsigned int*)malloc(NUM_RND_STREAMS * sizeof(unsigned int));
    if (host_mults == NULL)
        return ERROR_GENERIC;

    FILE* pFile = fopen(cudamatpath, "r");
    if (pFile == NULL) {
        free(host_mults);
        return ERROR_FILE_OPEN;
    }

    for (int i = 0; i < NUM_RND_STREAMS; i++) {
        if (fscanf(pFile, "%u", &host_mults[i]) != 1) {
            fclose(pFile);
            free(host_mults);
            return ERROR_FILE_SCAN;
        }
    }
    fclose(pFile);

    cublasAlloc(NUM_RND_STREAMS, sizeof(unsigned int), (void**)&rnd_state->dev_mults);
    cublasAlloc(NUM_RND_STREAMS, sizeof(unsigned long long), (void**)&rnd_state->dev_words);
    cublasSetVector(NUM_RND_STREAMS, sizeof(unsigned int), host_mults, 1, rnd_state->dev_mults, 1);
    free(host_mults);  // device now owns a copy; release the host staging buffer
    cudaThreadSynchronize();

    kSeedRandom<<<NUM_RND_BLOCKS, NUM_RND_THREADS_PER_BLOCK>>>(rnd_state->dev_mults, rnd_state->dev_words, seed);

    cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;
    else
        return 0;
}
-
-/* ------------------------------ Utility routines ------------------------------ */
-
/* Leading (storage) dimension of mat, honoring the logical-transpose flag. */
EXPORT int get_leading_dimension(cudamat* mat) {
    if (mat->is_trans)
        return mat->size[1];
    return mat->size[0];
}
-
/* Non-leading dimension of mat, honoring the logical-transpose flag. */
EXPORT int get_nonleading_dimension(cudamat* mat) {
    if (mat->is_trans)
        return mat->size[0];
    return mat->size[1];
}
-
/* Sets the logical transposition flag; no data is moved or copied. */
EXPORT void set_transpose(cudamat* mat, int is_trans) {
    mat->is_trans = is_trans;
}
-
/* Maps the transpose flag to the cuBLAS op character ('t' or 'n'). */
inline char get_transpose_char(cudamat* mat) {
    if (mat->is_trans)
        return 't';
    return 'n';
}
-
/* Blocks the host until all previously launched device work completes. */
EXPORT void cuda_sync_threads() {
    cudaThreadSynchronize();
}
-
-/* ------------------------------ Allocating/moving data ------------------------------ */
-
/* Allocates device storage for mat (size[0]*size[1] elements) and marks it
   as resident on the device. Returns 0 on success or CUBLAS_ERROR.
   On failure the pending CUDA error is also printed/cleared. */
EXPORT int allocate_device_memory(cudamat* mat) {
    int len = mat->size[0]*mat->size[1];

    cublasStatus stat;

    stat = cublasAlloc(len, sizeof(mat->data_device[0]), (void**)&mat->data_device);

    if (stat != CUBLAS_STATUS_SUCCESS || check_cublas_error()) {
        checkCUDAError();  // clear/print any underlying CUDA error too
        return CUBLAS_ERROR;
    }

    mat->on_device = 1;
    return 0;
}
-
/* Copies mat's device buffer into its host buffer.
   Returns 0 on success, ERROR_NOT_ON_DEVICE or CUBLAS_ERROR otherwise. */
EXPORT int copy_to_host(cudamat* mat) {
    if (!mat->on_device)
        return ERROR_NOT_ON_DEVICE;

    int n = mat->size[0] * mat->size[1];
    cublasGetVector(n, sizeof(mat->data_host[0]), mat->data_device, 1, mat->data_host, 1);

    return check_cublas_error() ? CUBLAS_ERROR : 0;
}
-
/* Copies mat's host buffer to the device, allocating device storage first
   if needed. Returns 0 on success or an allocation/cuBLAS error code. */
EXPORT int copy_to_device(cudamat* mat) {
    if (!mat->on_device) {
        int err_code = allocate_device_memory(mat);
        if (err_code)
            return err_code;
    }

    int n = mat->size[0] * mat->size[1];
    cublasSetVector(n, sizeof(mat->data_host[0]), mat->data_host, 1, mat->data_device, 1);

    return check_cublas_error() ? CUBLAS_ERROR : 0;
}
-
/* Device-to-device copy: mat2 <- mat1. Shapes must match exactly.
   Returns 0, ERROR_NOT_ON_DEVICE, ERROR_INCOMPATIBLE_DIMENSIONS or
   CUBLAS_ERROR.

   Fix: reject host-only operands before handing their (uninitialized)
   device pointers to cuBLAS — every other routine in this API performs
   this on_device check. */
EXPORT int copy_on_device(cudamat* mat1, cudamat* mat2) {
    int len = mat1->size[0]*mat1->size[1];

    if (!mat1->on_device || !mat2->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    cublasDcopy(len, mat1->data_device, 1, mat2->data_device, 1);

    if (check_cublas_error())
        return CUBLAS_ERROR;
    else
        return 0;
}
-
/* Copies rows [start, end) of source into target (target must be
   (end-start) x width). Returns 0 or an error code.

   Fix: height/width are now unsigned, so `end > height` and the ceil()
   grid computations no longer mix signed and unsigned operands (the
   original triggered implicit sign-conversion in the comparison). */
EXPORT int get_row_slice(cudamat* source, cudamat* target, unsigned int start, unsigned int end) {
    unsigned int height = source->size[0];
    unsigned int width = source->size[1];

    if ((end - start) != (unsigned int)target->size[0] || source->size[1] != target->size[1] || start >= end || end > height)
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    // One 32-thread block column per 32 rows of the slice, per 32 columns.
    dim3 kernelBlockGrid((int)ceil((end - start)/32.), (int)ceil(width/32.), 1);
    dim3 kernelBlockDim(32, 1, 1);

    kGetRowSlice<<<kernelBlockGrid,kernelBlockDim>>>(source->data_device, target->data_device, start, end, width, height);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;
    else
        return 0;
}
-
/* Writes source into rows [start, end) of target (source must be
   (end-start) x width). Returns 0 or an error code.

   Fix: height/width are now unsigned, so `end > height` no longer mixes
   signed and unsigned operands (implicit sign-conversion in the original). */
EXPORT int set_row_slice(cudamat* source, cudamat* target, unsigned int start, unsigned int end) {
    unsigned int height = target->size[0];
    unsigned int width = target->size[1];

    if ((end - start) != (unsigned int)source->size[0] || source->size[1] != target->size[1] || start >= end || end > height)
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    dim3 kernelBlockGrid((int)ceil((end - start)/32.), (int)ceil(width/32.), 1);
    dim3 kernelBlockDim(32, 1, 1);

    kSetRowSlice<<<kernelBlockGrid,kernelBlockDim>>>(source->data_device, target->data_device, start, end, width, height);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;
    else
        return 0;
}
-
/* Writes the transpose of source into target (target must be width x
   height). Returns 0 or an error code.

   Fixes: (1) this was the only kernel wrapper in the file that never
   synchronized before checking for errors, so asynchronous kernel
   failures were reported by a later, unrelated call — add the standard
   SYNC_THREADS-guarded sync; (2) add the on_device guard used by the
   rest of the API. */
EXPORT int copy_transpose(cudamat* source, cudamat* target) {
    unsigned int height = source->size[0];
    unsigned int width = source->size[1];

    if (!source->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (source->size[0] != target->size[1] || source->size[1] != target->size[0])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    // setup execution parameters: ceil-div the matrix into square tiles
    unsigned int grid_x = height / COPY_BLOCK_SIZE;
    if (height % COPY_BLOCK_SIZE)
        grid_x++;

    unsigned int grid_y = width / COPY_BLOCK_SIZE;
    if (width % COPY_BLOCK_SIZE)
        grid_y++;

    dim3 grid(grid_x, grid_y, 1);
    dim3 threads(COPY_BLOCK_SIZE, COPY_BLOCK_SIZE, 1);

    kTranspose<<< grid, threads >>>(target->data_device, source->data_device, height, width);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;
    else
        return 0;
}
-
/* Releases mat's device buffer if this struct owns it and it is resident.
   Views (owns_data == 0) and host-only matrices are a no-op.
   Returns 0 on success, CUBLAS_ERROR if the free failed. */
EXPORT int free_device_memory(cudamat* mat) {
    if (!mat->owns_data || !mat->on_device)
        return 0;

    cublasStatus stat = cublasFree(mat->data_device);
    mat->on_device = 0;  // cleared even on failure, matching prior behavior

    if (stat != CUBLAS_STATUS_SUCCESS || check_cublas_error())
        return CUBLAS_ERROR;

    return 0;
}
-
/* Reinterprets mat as m x n in place (column-major storage unchanged).
   The total element count must match; no data is moved. */
EXPORT int reshape(cudamat* mat, unsigned int m, unsigned int n) {
    if (mat->size[0] * mat->size[1] != m * n)
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    mat->size[0] = m;
    mat->size[1] = n;

    return 0;
}
-
/* Points target at columns [first_col, last_col) of source without
   copying. target becomes a non-owning device view; its previous
   contents are overwritten. Returns 0 or an error code. */
EXPORT int get_slice(cudamat* source, cudamat* target, unsigned int first_col, unsigned int last_col) {
    if (source->is_trans)
        return ERROR_TRANSPOSED;
    if (!source->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (last_col > source->size[1] || first_col >= last_col)
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    // Column-major storage: a column range is just a pointer offset.
    int num_rows = source->size[0];
    target->data_device = source->data_device + first_col * num_rows;
    target->data_host = 0;
    target->on_device = 1;
    target->on_host = 0;
    target->size[0] = num_rows;
    target->size[1] = last_col - first_col;
    target->is_trans = 0;
    target->owns_data = 0;  // view: never frees the underlying buffer

    return 0;
}
-
/* Points target at elements [first_ind, last_ind) of a vector source
   (either a column or a row vector) without copying; target becomes a
   non-owning device view. Returns 0 or an error code.
   NOTE(review): target is partially written before the final bounds
   checks, so on ERROR_INCOMPATIBLE_DIMENSIONS target is left mutated —
   callers should treat target as undefined after a failure. */
EXPORT int get_vector_slice(cudamat* source, cudamat* target, unsigned int first_ind, unsigned int last_ind) {
    // source must be a vector
    if (source->size[0] > 1 && source->size[1] > 1)
        return ERROR_GENERIC;

    if (source->is_trans)
        return ERROR_TRANSPOSED;

    if (!source->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (first_ind >= last_ind)
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    int num_rows = source->size[0];

    target->data_host = 0;
    // For a column vector num_rows > 1 so this offsets by whole elements;
    // for a row vector num_rows == 1 and the offset is first_ind itself.
    target->data_device = source->data_device + first_ind * num_rows;
    target->on_device = 1;
    target->on_host = 0;
    target->is_trans = 0;
    target->owns_data = 0;

    if (source->size[0] > 1) {
        if (last_ind > source->size[0])
            return ERROR_INCOMPATIBLE_DIMENSIONS;

        target->size[0] = last_ind - first_ind;
        target->size[1] = 1;
    } else {
        if (last_ind > source->size[1])
            return ERROR_INCOMPATIBLE_DIMENSIONS;

        target->size[0] = 1;
        target->size[1] = last_ind - first_ind;
    }

    return 0;
}
-
-/* ------------------------------ Initialization routines ------------------------------ */
-
/* Wraps an existing host array (m x n, column-major assumed by the rest
   of this API) in a cudamat struct. No device memory is allocated; the
   struct is marked host-resident and owning. */
EXPORT void init_from_array(cudamat* mat, double* data, int m, int n) {
    mat->data_host = data;
    mat->size[0] = m;
    mat->size[1] = n;
    mat->on_device = 0;
    mat->on_host = 1;
    mat->is_trans = 0;
    mat->owns_data = 1;
}
-
/* Initializes mat as an m x n device-only matrix (contents uninitialized)
   and allocates its device buffer. Returns 0 or CUBLAS_ERROR. */
EXPORT int init_empty(cudamat* mat, int m, int n) {
    mat->size[0] = m;
    mat->size[1] = n;
    mat->on_device = 0;
    mat->on_host = 0;
    mat->is_trans = 0;
    mat->owns_data = 1;

    return allocate_device_memory(mat);
}
-
-/* ------------------------------ Random number generation ------------------------------ */
/* Fills mat in place with uniform samples from the multiply-with-carry
   RNG state initialized by init_random. Returns 0 or an error code. */
EXPORT int fill_with_rand(rnd_struct* rnd_state, cudamat* mat) {
    if (!mat->on_device)
        return ERROR_NOT_ON_DEVICE;

    int n = mat->size[0] * mat->size[1];
    kRandomUniform<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK>>>(rnd_state->dev_mults, rnd_state->dev_words, mat->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* Fills mat in place with Gaussian samples from the multiply-with-carry
   RNG state initialized by init_random. Returns 0 or an error code. */
EXPORT int fill_with_randn(rnd_struct* rnd_state, cudamat* mat) {
    if (!mat->on_device)
        return ERROR_NOT_ON_DEVICE;

    int n = mat->size[0] * mat->size[1];
    kRandomGaussian<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK>>>(rnd_state->dev_mults, rnd_state->dev_words, mat->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-/* ------------------------------ Algebraic operations ------------------------------ */
-
/* target = mat + vec, broadcasting the column vector vec across all
   columns of mat. Returns 0 or an error code.

   Fix: also require target to be on the device (matching greater_than /
   equals / minimum); launching the kernel with a host-side target pointer
   corrupts device memory. */
EXPORT int add_col_vec(cudamat* mat, cudamat* vec, cudamat* target) {
    unsigned int h = mat->size[0],
                 w = mat->size[1];

    if (!mat->on_device || !vec->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (mat->is_trans)
        return ERROR_TRANSPOSED;

    if (mat->size[0] != vec->size[0] || vec->size[1] != 1 ||
        mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kAddColVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError()) {
        return CUDA_ERROR;
    }

    return 0;
}
-
/* target = mat + mult * vec, broadcasting the scaled column vector vec
   across all columns of mat. Returns 0 or an error code.

   Fix: also require target to be on the device (matching greater_than /
   equals); a host-side target pointer would corrupt device memory. */
EXPORT int add_col_mult(cudamat* mat, cudamat* vec, cudamat* target, double mult) {
    unsigned int h = mat->size[0],
                 w = mat->size[1];

    if (!mat->on_device || !vec->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (mat->is_trans)
        return ERROR_TRANSPOSED;

    if (mat->size[0] != vec->size[0] || vec->size[1] != 1 ||
        mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kAddColMult<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, mult, w, h);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;

    return 0;
}
-
/* target = mat + vec, broadcasting the row vector vec across all rows of
   mat. Returns 0 or an error code.

   Fix: also require target to be on the device (matching greater_than /
   equals); a host-side target pointer would corrupt device memory. */
EXPORT int add_row_vec(cudamat* mat, cudamat* vec, cudamat* target) {
    unsigned int h = mat->size[0],
                 w = mat->size[1];

    if (!mat->on_device || !vec->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (mat->is_trans)
        return ERROR_TRANSPOSED;

    if (mat->size[1] != vec->size[1] || vec->size[0] != 1 ||
        mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kAddRowVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;

    return 0;
}
-
/* target = mat * vec element-wise, broadcasting the column vector vec
   across all columns of mat. Returns 0 or an error code.

   Fix: also require target to be on the device (matching greater_than /
   equals); a host-side target pointer would corrupt device memory. */
EXPORT int mult_by_col_vec(cudamat* mat, cudamat* vec, cudamat* target) {
    unsigned int h = mat->size[0],
                 w = mat->size[1];

    if (!mat->on_device || !vec->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (mat->is_trans)
        return ERROR_TRANSPOSED;

    if (mat->size[0] != vec->size[0] || vec->size[1] != 1 ||
        mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kMultByColVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;

    return 0;
}
-
/* target = mat * vec element-wise, broadcasting the row vector vec across
   all rows of mat. Returns 0 or an error code.

   Fix: also require target to be on the device (matching greater_than /
   equals); a host-side target pointer would corrupt device memory. */
EXPORT int mult_by_row_vec(cudamat* mat, cudamat* vec, cudamat* target) {
    unsigned int h = mat->size[0],
                 w = mat->size[1];

    if (!mat->on_device || !vec->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (mat->is_trans)
        return ERROR_TRANSPOSED;

    if (mat->size[1] != vec->size[1] || vec->size[0] != 1 ||
        mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kMultByRowVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;

    return 0;
}
-
/* target = mat / vec element-wise, broadcasting the column vector vec
   across all columns of mat. Returns 0 or an error code.

   Fix: also require target to be on the device (matching greater_than /
   equals); a host-side target pointer would corrupt device memory. */
EXPORT int divide_by_col_vec(cudamat* mat, cudamat* vec, cudamat* target) {
    unsigned int h = mat->size[0],
                 w = mat->size[1];

    if (!mat->on_device || !vec->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (mat->is_trans)
        return ERROR_TRANSPOSED;

    if (mat->size[0] != vec->size[0] || vec->size[1] != 1 ||
        mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kDivByColVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;

    return 0;
}
-
/* target = mat / vec element-wise, broadcasting the row vector vec across
   all rows of mat. Returns 0 or an error code.

   Fix: also require target to be on the device (matching greater_than /
   equals); a host-side target pointer would corrupt device memory. */
EXPORT int divide_by_row_vec(cudamat* mat, cudamat* vec, cudamat* target) {
    unsigned int h = mat->size[0],
                 w = mat->size[1];

    if (!mat->on_device || !vec->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (mat->is_trans)
        return ERROR_TRANSPOSED;

    if (mat->size[1] != vec->size[1] || vec->size[0] != 1 ||
        mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kDivByRowVector<<<NUM_VECTOR_OP_BLOCKS(w*h),NUM_VECTOR_OP_THREADS_PER_BLOCK(w*h)>>>(mat->data_device, vec->data_device, target->data_device, w, h);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;

    return 0;
}
-
/* target[i] = (mat1[i] < mat2[i]) element-wise. Returns 0 or an error
   code.

   Fix: also require target to be on the device — greater_than and equals
   already check all three operands; this one only checked the inputs. */
EXPORT int less_than(cudamat* mat1, cudamat* mat2, cudamat* target) {
    int len = mat1->size[0]*mat1->size[1];

    if (!mat1->on_device || !mat2->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (mat1->is_trans != mat2->is_trans)
        return ERROR_TRANSPOSEDNESS;

    if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
        mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kLessThan<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;

    return 0;
}
-
/* target[i] = (mat[i] < val) element-wise. Returns 0 or an error code. */
EXPORT int less_than_scalar(cudamat* mat, double val, cudamat* target) {
    int n = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->is_trans != target->is_trans)
        return ERROR_TRANSPOSEDNESS;
    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kLessThanScalar<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, val, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* target[i] = (mat1[i] > mat2[i]) element-wise. Returns 0 or an error
   code. All three operands must be device-resident with equal shapes. */
EXPORT int greater_than(cudamat* mat1, cudamat* mat2, cudamat* target) {
    int n = mat1->size[0] * mat1->size[1];

    if (!mat1->on_device || !mat2->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat1->is_trans != mat2->is_trans)
        return ERROR_TRANSPOSEDNESS;
    if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
        mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kGreaterThan<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat1->data_device, mat2->data_device, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* target[i] = (mat[i] > val) element-wise. Returns 0 or an error code. */
EXPORT int greater_than_scalar(cudamat* mat, double val, cudamat* target) {
    int n = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->is_trans != target->is_trans)
        return ERROR_TRANSPOSEDNESS;
    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kGreaterThanScalar<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, val, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* target[i] = (mat1[i] == mat2[i]) element-wise. Returns 0 or an error
   code. All three operands must be device-resident with equal shapes. */
EXPORT int equals(cudamat* mat1, cudamat* mat2, cudamat* target) {
    int n = mat1->size[0] * mat1->size[1];

    if (!mat1->on_device || !mat2->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat1->is_trans != mat2->is_trans)
        return ERROR_TRANSPOSEDNESS;
    if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
        mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kEquals<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat1->data_device, mat2->data_device, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* target[i] = (mat[i] == val) element-wise. Returns 0 or an error code. */
EXPORT int equals_scalar(cudamat* mat, double val, cudamat* target) {
    int n = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->is_trans != target->is_trans)
        return ERROR_TRANSPOSEDNESS;
    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kEqualsScalar<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, val, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* target[i] = min(mat1[i], mat2[i]) element-wise. Returns 0 or an error
   code.

   Fix: guard the synchronize with SYNC_THREADS like every other
   element-wise wrapper in this file — the unconditional sync defeated the
   build-time option and serialized the stream. */
EXPORT int minimum(cudamat* mat1, cudamat* mat2, cudamat* target) {
    int len = mat1->size[0]*mat1->size[1];

    if (!mat1->on_device || !mat2->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (mat1->is_trans != mat2->is_trans)
        return ERROR_TRANSPOSEDNESS;

    if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
        mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kMinimum<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;

    return 0;
}
-
/* target[i] = min(mat[i], val) element-wise. Returns 0 or an error code.

   Fix: guard the synchronize with SYNC_THREADS like the other wrappers;
   the unconditional sync defeated the build-time option. */
EXPORT int minimum_scalar(cudamat* mat, double val, cudamat* target) {
    int len = mat->size[0]*mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (mat->is_trans != target->is_trans)
        return ERROR_TRANSPOSEDNESS;

    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kMinimumScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;

    return 0;
}
-
/* target[i] = max(mat1[i], mat2[i]) element-wise. Returns 0 or an error
   code.

   Fix: guard the synchronize with SYNC_THREADS like the other wrappers;
   the unconditional sync defeated the build-time option. */
EXPORT int maximum(cudamat* mat1, cudamat* mat2, cudamat* target) {
    int len = mat1->size[0]*mat1->size[1];

    if (!mat1->on_device || !mat2->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (mat1->is_trans != mat2->is_trans)
        return ERROR_TRANSPOSEDNESS;

    if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
        mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kMaximum<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat1->data_device, mat2->data_device, target->data_device, len);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;

    return 0;
}
-
/* target[i] = max(mat[i], val) element-wise. Returns 0 or an error code.

   Fix: guard the synchronize with SYNC_THREADS like the other wrappers;
   the unconditional sync defeated the build-time option. */
EXPORT int maximum_scalar(cudamat* mat, double val, cudamat* target) {
    int len = mat->size[0]*mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (mat->is_trans != target->is_trans)
        return ERROR_TRANSPOSEDNESS;

    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kMaximumScalar<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, val, target->data_device, len);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;

    return 0;
}
-
/* Reduces mat by minimum along an axis: axis 0 -> 1 x w row of column
   minima, otherwise -> h x 1 column of row minima. Returns 0 or error. */
EXPORT int min_by_axis(cudamat* mat, cudamat* target, int axis) {
    unsigned int h = mat->size[0];
    unsigned int w = mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->is_trans)
        return ERROR_TRANSPOSED;

    if (axis == 0) {
        // column-wise reduction: one block of 32 threads per column
        if (target->size[0] != 1 || target->size[1] != mat->size[1])
            return ERROR_INCOMPATIBLE_DIMENSIONS;
        kMinColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h);
    } else {
        // row-wise reduction: one block of 32 threads per row
        if (target->size[1] != 1 || target->size[0] != mat->size[0])
            return ERROR_INCOMPATIBLE_DIMENSIONS;
        kMinRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h);
    }

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* Reduces mat by maximum along an axis: axis 0 -> 1 x w row of column
   maxima, otherwise -> h x 1 column of row maxima. Returns 0 or error. */
EXPORT int max_by_axis(cudamat* mat, cudamat* target, int axis) {
    unsigned int h = mat->size[0];
    unsigned int w = mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->is_trans)
        return ERROR_TRANSPOSED;

    if (axis == 0) {
        if (target->size[0] != 1 || target->size[1] != mat->size[1])
            return ERROR_INCOMPATIBLE_DIMENSIONS;
        kMaxColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h);
    } else {
        if (target->size[1] != 1 || target->size[0] != mat->size[0])
            return ERROR_INCOMPATIBLE_DIMENSIONS;
        kMaxRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h);
    }

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* Index-of-minimum reduction along an axis: axis 0 -> 1 x w row of
   per-column argmins, otherwise -> h x 1 column of per-row argmins. */
EXPORT int argmin_by_axis(cudamat* mat, cudamat* target, int axis) {
    unsigned int h = mat->size[0];
    unsigned int w = mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->is_trans)
        return ERROR_TRANSPOSED;

    if (axis == 0) {
        if (target->size[0] != 1 || target->size[1] != mat->size[1])
            return ERROR_INCOMPATIBLE_DIMENSIONS;
        kArgMinColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h);
    } else {
        if (target->size[1] != 1 || target->size[0] != mat->size[0])
            return ERROR_INCOMPATIBLE_DIMENSIONS;
        kArgMinRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h);
    }

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* Index-of-maximum reduction along an axis: axis 0 -> 1 x w row of
   per-column argmaxes, otherwise -> h x 1 column of per-row argmaxes. */
EXPORT int argmax_by_axis(cudamat* mat, cudamat* target, int axis) {
    unsigned int h = mat->size[0];
    unsigned int w = mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->is_trans)
        return ERROR_TRANSPOSED;

    if (axis == 0) {
        if (target->size[0] != 1 || target->size[1] != mat->size[1])
            return ERROR_INCOMPATIBLE_DIMENSIONS;
        kArgMaxColumnwise<<<w,32>>>(mat->data_device, target->data_device, w, h);
    } else {
        if (target->size[1] != 1 || target->size[0] != mat->size[0])
            return ERROR_INCOMPATIBLE_DIMENSIONS;
        kArgMaxRowwise<<<h,32>>>(mat->data_device, target->data_device, w, h);
    }

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* target[i] = sign(mat[i]) element-wise. Returns 0 or an error code. */
EXPORT int sign(cudamat* mat, cudamat* target) {
    int n = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->is_trans != target->is_trans)
        return ERROR_TRANSPOSEDNESS;
    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kSign<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* Applies the logistic sigmoid to mat element-wise, writing into target. */
EXPORT int apply_sigmoid(cudamat* mat, cudamat* target) {
    unsigned int n = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kApplySigmoid<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* Applies tanh to mat element-wise, writing into target. */
EXPORT int apply_tanh(cudamat* mat, cudamat* target) {
    unsigned int n = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kApplyTanh<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* Applies soft-thresholding with parameter alpha to mat element-wise,
   writing into target. Returns 0 or an error code.

   Fix: guard the synchronize with SYNC_THREADS like every other apply_*
   wrapper; the unconditional sync defeated the build-time option. */
EXPORT int apply_soft_threshold(cudamat* mat, double alpha, cudamat* target) {
    unsigned int len = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;

    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kApplySoftThreshold<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(mat->data_device, alpha, target->data_device, len);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    if (checkCUDAError())
        return CUDA_ERROR;

    return 0;
}
-
/* target[i] = |mat[i]| element-wise. Returns 0 or an error code. */
EXPORT int apply_abs(cudamat* mat, cudamat* target) {
    unsigned int n = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kApplyAbs<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* target[i] = log(1 + exp(mat[i])) (softplus) element-wise. */
EXPORT int apply_log_1_plus_exp(cudamat* mat, cudamat* target) {
    unsigned int n = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kApplyLog1PlusExp<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* target[i] = log(mat[i]) element-wise. Returns 0 or an error code. */
EXPORT int apply_log(cudamat* mat, cudamat* target) {
    unsigned int n = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kLog<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* target[i] = exp(mat[i]) element-wise. Returns 0 or an error code. */
EXPORT int apply_exp(cudamat* mat, cudamat* target) {
    unsigned int n = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kExp<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* Applies the kGamma kernel to mat element-wise, writing into target. */
EXPORT int apply_gamma(cudamat* mat, cudamat* target) {
    unsigned int n = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kGamma<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* Applies the kLogGamma kernel to mat element-wise, writing into target. */
EXPORT int apply_lgamma(cudamat* mat, cudamat* target) {
    unsigned int n = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kLogGamma<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* target[i] = sqrt(mat[i]) element-wise. Returns 0 or an error code. */
EXPORT int apply_sqrt(cudamat* mat, cudamat* target) {
    unsigned int n = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kSqrt<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* target[i] = mat[i] raised to the scalar exponent pow, element-wise. */
EXPORT int apply_pow(cudamat* mat, double pow, cudamat* target) {
    unsigned int n = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kPow<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, pow, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
/* target[i] = mat[i] raised to pow[i], element-wise; all three matrices
   must share the same shape. Returns 0 or an error code. */
EXPORT int apply_pow_matrix(cudamat* mat, cudamat* pow, cudamat* target) {
    unsigned int n = mat->size[0] * mat->size[1];

    if (!mat->on_device || !target->on_device)
        return ERROR_NOT_ON_DEVICE;
    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;
    if (mat->size[0] != pow->size[0] || mat->size[1] != pow->size[1])
        return ERROR_INCOMPATIBLE_DIMENSIONS;

    kPowMatrix<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, pow->data_device, target->data_device, n);

    if (SYNC_THREADS)
        cudaThreadSynchronize();

    return checkCUDAError() ? CUDA_ERROR : 0;
}
-
-/* Fills target with the kReciprocal kernel applied elementwise to mat
-   (1/x, per the kernel's name).  Shapes must match; both matrices must be
-   on the device.
-   Returns 0, ERROR_NOT_ON_DEVICE, ERROR_INCOMPATIBLE_DIMENSIONS, or
-   CUDA_ERROR. */
-EXPORT int reciprocal(cudamat* mat, cudamat* target) {
-    if (!mat->on_device || !target->on_device)
-        return ERROR_NOT_ON_DEVICE;
-
-    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
-        return ERROR_INCOMPATIBLE_DIMENSIONS;
-
-    const unsigned int n = mat->size[0] * mat->size[1];
-
-    kReciprocal<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, target->data_device, n);
-
-    if (SYNC_THREADS)
-        cudaThreadSynchronize();
-
-    return checkCUDAError() ? CUDA_ERROR : 0;
-}
-
-/* Matrix product target = alpha * mat1 * mat2 + beta * target, dispatched
-   to double-precision legacy cuBLAS (column-major storage).
-   "leading"/"nonleading" dimensions come from helpers defined elsewhere in
-   this file; presumably they fold in is_trans -- confirm there.
-   Returns 0, ERROR_NOT_ON_DEVICE, ERROR_INCOMPATIBLE_DIMENSIONS, or
-   CUBLAS_ERROR. */
-EXPORT int dot(cudamat* mat1, cudamat* mat2, cudamat* target, double beta, double alpha) {
- if (!mat1->on_device || !mat2->on_device || !target->on_device)
- return ERROR_NOT_ON_DEVICE;
-
- // Shape check: (m x k') * (k x n) requires k' == k and target m x n.
- if (get_leading_dimension(mat1) != get_leading_dimension(target) ||
- get_nonleading_dimension(mat2) != get_nonleading_dimension(target) ||
- get_nonleading_dimension(mat1) != get_leading_dimension(mat2)) {
- return ERROR_INCOMPATIBLE_DIMENSIONS;
- }
- int m = get_leading_dimension(mat1),
- k = get_leading_dimension(mat2),
- n = get_nonleading_dimension(mat2);
-
- // gemv if second matrix is a (column) vector
- if (n == 1) {
- // lda is the physical row count size[0] (storage order, not the
- // logical transposed view); the transpose char handles the view.
- cublasDgemv(get_transpose_char(mat1), mat1->size[0], mat1->size[1],
- alpha, mat1->data_device, mat1->size[0],
- mat2->data_device, 1,
- beta, target->data_device, 1);
- }
- // gemv if first matrix is a (row) vector
- else if (m == 1) {
- // Computes (mat2^T * mat1^T)^T by flipping mat2's transpose flag, so
- // the row vector mat1 can be fed to gemv as the x operand.
- cublasDgemv(mat2->is_trans ? 'n' : 't', mat2->size[0], mat2->size[1],
- alpha, mat2->data_device, mat2->size[0],
- mat1->data_device, 1,
- beta, target->data_device, 1);
- }
- // gemm otherwise
- else {
- cublasDgemm(get_transpose_char(mat1), get_transpose_char(mat2),
- m, n, k,
- alpha, mat1->data_device, mat1->size[0],
- mat2->data_device, mat2->size[0],
- beta, target->data_device, target->size[0]);
- }
-
- // Legacy cuBLAS reports errors through a global status, read here.
- if (check_cublas_error())
- return CUBLAS_ERROR;
-
- if (SYNC_THREADS)
- cudaThreadSynchronize();
-
- return 0;
-}
-
-/* Dot product of mat1 and mat2 viewed as flat vectors (cublasDdot).
-   On success sets *err_code = 0 and returns the product; on failure sets
-   *err_code and returns 0. (dimension/transpose/device errors) or -1.
-   (cuBLAS error).
-   Fix: the on-device check previously did `return ERROR_NOT_ON_DEVICE;`
-   from a double-returning function WITHOUT setting *err_code -- callers
-   inspecting err_code saw stale data, and the returned constant was
-   indistinguishable from a legitimate dot product.  The error is now
-   reported through *err_code like every other failure path here. */
-EXPORT double vdot(cudamat* mat1, cudamat* mat2, int* err_code) {
-    int len = mat1->size[0]*mat1->size[1];
-    double res;
-
-    if (!mat1->on_device || !mat2->on_device) {
-        *err_code = ERROR_NOT_ON_DEVICE;
-        return 0;
-    }
-
-    if (mat1->is_trans != mat2->is_trans) {
-        *err_code = ERROR_TRANSPOSEDNESS;
-        return 0;
-    }
-
-    if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1]) {
-        *err_code = ERROR_INCOMPATIBLE_DIMENSIONS;
-        return 0;
-    }
-
-    res = cublasDdot(len, mat1->data_device, 1, mat2->data_device, 1);
-
-    if (check_cublas_error()) {
-        *err_code = CUBLAS_ERROR;
-        return -1.;
-    } else {
-        *err_code = 0;
-        return res;
-    }
-}
-
-/* In-place axpy update: mat1 = mat1 + alpha * mat2 (cublasDaxpy).
-   Operands must be on the device, share a shape, and have the same
-   transposedness.
-   Returns 0, ERROR_NOT_ON_DEVICE, ERROR_TRANSPOSEDNESS,
-   ERROR_INCOMPATIBLE_DIMENSIONS, or CUBLAS_ERROR. */
-EXPORT int add_mult(cudamat* mat1, cudamat* mat2, double alpha) {
-    if (!mat1->on_device || !mat2->on_device)
-        return ERROR_NOT_ON_DEVICE;
-
-    if (mat1->is_trans != mat2->is_trans)
-        return ERROR_TRANSPOSEDNESS;
-
-    if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1])
-        return ERROR_INCOMPATIBLE_DIMENSIONS;
-
-    const int n = mat1->size[0] * mat1->size[1];
-
-    cublasDaxpy(n, alpha, mat2->data_device, 1, mat1->data_device, 1);
-
-    return check_cublas_error() ? CUBLAS_ERROR : 0;
-}
-
-/* Elementwise sum: target = mat1 + mat2.  When target aliases mat1 the
-   work is delegated to cublasDaxpy (unit scale); otherwise the kAdd kernel
-   writes into the separate target buffer.  All three matrices must share a
-   shape; mat1/mat2 must agree on transposedness.
-   Returns 0, ERROR_NOT_ON_DEVICE, ERROR_TRANSPOSEDNESS,
-   ERROR_INCOMPATIBLE_DIMENSIONS, CUBLAS_ERROR, or CUDA_ERROR. */
-EXPORT int add_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) {
-    if (!mat1->on_device || !mat2->on_device || !target->on_device)
-        return ERROR_NOT_ON_DEVICE;
-
-    if (mat1->is_trans != mat2->is_trans)
-        return ERROR_TRANSPOSEDNESS;
-
-    if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
-        mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
-        return ERROR_INCOMPATIBLE_DIMENSIONS;
-
-    const int n = mat1->size[0] * mat1->size[1];
-
-    if (mat1 == target) {
-        // In-place accumulate via cuBLAS.
-        cublasDaxpy(n, 1, mat2->data_device, 1, mat1->data_device, 1);
-
-        if (check_cublas_error())
-            return CUBLAS_ERROR;
-    } else {
-        kAdd<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat1->data_device, mat2->data_device, target->data_device, n);
-
-        if (SYNC_THREADS)
-            cudaThreadSynchronize();
-
-        if (checkCUDAError())
-            return CUDA_ERROR;
-    }
-
-    return 0;
-}
-
-/* Elementwise difference via the kSubtract kernel (target = mat1 - mat2,
-   per the kernel's name).  All three matrices must share a shape; mat1 and
-   mat2 must agree on transposedness.
-   Returns 0, ERROR_NOT_ON_DEVICE, ERROR_TRANSPOSEDNESS,
-   ERROR_INCOMPATIBLE_DIMENSIONS, or CUDA_ERROR. */
-EXPORT int subtract_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) {
-    if (!mat1->on_device || !mat2->on_device || !target->on_device)
-        return ERROR_NOT_ON_DEVICE;
-
-    if (mat1->is_trans != mat2->is_trans)
-        return ERROR_TRANSPOSEDNESS;
-
-    if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
-        mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
-        return ERROR_INCOMPATIBLE_DIMENSIONS;
-
-    const int n = mat1->size[0] * mat1->size[1];
-
-    kSubtract<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat1->data_device, mat2->data_device, target->data_device, n);
-
-    if (SYNC_THREADS)
-        cudaThreadSynchronize();
-
-    return checkCUDAError() ? CUDA_ERROR : 0;
-}
-
-/* Elementwise quotient via the kDivide kernel (target = mat1 / mat2, per
-   the kernel's name).  All three matrices must share a shape; mat1 and
-   mat2 must agree on transposedness.
-   Returns 0, ERROR_NOT_ON_DEVICE, ERROR_TRANSPOSEDNESS,
-   ERROR_INCOMPATIBLE_DIMENSIONS, or CUDA_ERROR. */
-EXPORT int divide_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) {
-    if (!mat1->on_device || !mat2->on_device || !target->on_device)
-        return ERROR_NOT_ON_DEVICE;
-
-    if (mat1->is_trans != mat2->is_trans)
-        return ERROR_TRANSPOSEDNESS;
-
-    if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
-        mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
-        return ERROR_INCOMPATIBLE_DIMENSIONS;
-
-    const int n = mat1->size[0] * mat1->size[1];
-
-    kDivide<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat1->data_device, mat2->data_device, target->data_device, n);
-
-    if (SYNC_THREADS)
-        cudaThreadSynchronize();
-
-    return checkCUDAError() ? CUDA_ERROR : 0;
-}
-
-/* Elementwise (Hadamard) product of two matrices via the kMult kernel:
-   target = mat1 * mat2.  All three matrices must share a shape; mat1 and
-   mat2 must agree on transposedness.
-   Returns 0, ERROR_NOT_ON_DEVICE, ERROR_TRANSPOSEDNESS,
-   ERROR_INCOMPATIBLE_DIMENSIONS, or CUDA_ERROR. */
-EXPORT int mult_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) {
-    if (!mat1->on_device || !mat2->on_device || !target->on_device)
-        return ERROR_NOT_ON_DEVICE;
-
-    if (mat1->is_trans != mat2->is_trans)
-        return ERROR_TRANSPOSEDNESS;
-
-    if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1] ||
-        mat1->size[0] != target->size[0] || mat1->size[1] != target->size[1])
-        return ERROR_INCOMPATIBLE_DIMENSIONS;
-
-    const int n = mat1->size[0] * mat1->size[1];
-
-    kMult<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat1->data_device, mat2->data_device, target->data_device, n);
-
-    if (SYNC_THREADS)
-        cudaThreadSynchronize();
-
-    return checkCUDAError() ? CUDA_ERROR : 0;
-}
-
-/* Overwrites every element of mat with the scalar alpha (kAssignScalar
-   kernel).  mat must already be on the device.
-   Returns 0, ERROR_NOT_ON_DEVICE, or CUDA_ERROR. */
-EXPORT int assign_scalar(cudamat* mat, double alpha) {
-    if (!mat->on_device)
-        return ERROR_NOT_ON_DEVICE;
-
-    const int n = mat->size[0] * mat->size[1];
-
-    kAssignScalar<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, alpha, n);
-
-    if (SYNC_THREADS)
-        cudaThreadSynchronize();
-
-    return checkCUDAError() ? CUDA_ERROR : 0;
-}
-
-/* Scalar scale: target = alpha * mat.  When target aliases mat the update
-   is done in place with cublasDscal; otherwise the kMultScalar kernel
-   writes into the separate target.  Shapes must match.
-   Returns 0, ERROR_NOT_ON_DEVICE, ERROR_INCOMPATIBLE_DIMENSIONS,
-   CUBLAS_ERROR, or CUDA_ERROR. */
-EXPORT int mult_by_scalar(cudamat* mat, double alpha, cudamat* target) {
-    if (!mat->on_device || !target->on_device)
-        return ERROR_NOT_ON_DEVICE;
-
-    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
-        return ERROR_INCOMPATIBLE_DIMENSIONS;
-
-    const int n = mat->size[0] * mat->size[1];
-
-    if (mat == target) {
-        // In-place scaling via cuBLAS.
-        cublasDscal(n, alpha, mat->data_device, 1);
-
-        if (check_cublas_error())
-            return CUBLAS_ERROR;
-    } else {
-        kMultScalar<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, alpha, target->data_device, n);
-
-        if (SYNC_THREADS)
-            cudaThreadSynchronize();
-
-        if (checkCUDAError())
-            return CUDA_ERROR;
-    }
-
-    return 0;
-}
-
-/* Scalar division via the kDivideScalar kernel (target = mat / alpha, per
-   the kernel's name).  Shapes must match; both matrices must be on the
-   device.
-   Returns 0, ERROR_NOT_ON_DEVICE, ERROR_INCOMPATIBLE_DIMENSIONS, or
-   CUDA_ERROR. */
-EXPORT int divide_by_scalar(cudamat* mat, double alpha, cudamat* target) {
-    if (!mat->on_device || !target->on_device)
-        return ERROR_NOT_ON_DEVICE;
-
-    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
-        return ERROR_INCOMPATIBLE_DIMENSIONS;
-
-    const int n = mat->size[0] * mat->size[1];
-
-    kDivideScalar<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, alpha, target->data_device, n);
-
-    if (SYNC_THREADS)
-        cudaThreadSynchronize();
-
-    return checkCUDAError() ? CUDA_ERROR : 0;
-}
-
-/* Scalar offset via the kAddScalar kernel (target = mat + alpha, per the
-   kernel's name).  Shapes must match; both matrices must be on the device.
-   Returns 0, ERROR_NOT_ON_DEVICE, ERROR_INCOMPATIBLE_DIMENSIONS, or
-   CUDA_ERROR. */
-EXPORT int add_scalar(cudamat* mat, double alpha, cudamat* target) {
-    if (!mat->on_device || !target->on_device)
-        return ERROR_NOT_ON_DEVICE;
-
-    if (mat->size[0] != target->size[0] || mat->size[1] != target->size[1])
-        return ERROR_INCOMPATIBLE_DIMENSIONS;
-
-    const int n = mat->size[0] * mat->size[1];
-
-    kAddScalar<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(mat->data_device, alpha, target->data_device, n);
-
-    if (SYNC_THREADS)
-        cudaThreadSynchronize();
-
-    return checkCUDAError() ? CUDA_ERROR : 0;
-}
-
-/* Euclidean (L2) norm of mat viewed as a flat vector (cublasDnrm2).
-   On success sets *err_code = 0 and returns the norm; on failure sets
-   *err_code (ERROR_NOT_ON_DEVICE or CUBLAS_ERROR) and returns -1. */
-EXPORT double euclid_norm(cudamat* mat, int* err_code) {
-    if (!mat->on_device) {
-        *err_code = ERROR_NOT_ON_DEVICE;
-        return -1.;
-    }
-
-    const double res = cublasDnrm2(mat->size[0] * mat->size[1], mat->data_device, 1);
-
-    if (check_cublas_error()) {
-        *err_code = CUBLAS_ERROR;
-        return -1.;
-    }
-
-    *err_code = 0;
-    return res;
-}
-
-/* Manhattan (L1) norm of mat viewed as a flat vector (cublasDasum).
-   On success sets *err_code = 0 and returns the norm; on failure sets
-   *err_code (ERROR_NOT_ON_DEVICE or CUBLAS_ERROR) and returns -1. */
-EXPORT double manhattan_norm(cudamat* mat, int* err_code) {
-    if (!mat->on_device) {
-        *err_code = ERROR_NOT_ON_DEVICE;
-        return -1.;
-    }
-
-    const double res = cublasDasum(mat->size[0] * mat->size[1], mat->data_device, 1);
-
-    if (check_cublas_error()) {
-        *err_code = CUBLAS_ERROR;
-        return -1.;
-    }
-
-    *err_code = 0;
-    return res;
-}
-
-/* Gathers the rows of source named by `indices` (indices->size[1] entries)
-   into target via the kSelectRows kernel, launched with 32-thread blocks
-   over one grid slot per 32 rows.
-   Fix: unlike every other entry point in this file, no operand was checked
-   for device residency, so a host-only matrix reached the kernel as an
-   invalid pointer (a deferred cudaErrorIllegalAddress).  Now returns
-   ERROR_NOT_ON_DEVICE up front, CUDA_ERROR on kernel failure, 0 otherwise. */
-EXPORT int selectRows(cudamat* source, cudamat* target, cudamat* indices){
-    if (!source->on_device || !target->on_device || !indices->on_device)
-        return ERROR_NOT_ON_DEVICE;
-
-    const int nRetRows = indices->size[1];
-
-    if (nRetRows==0) return 0;
-
-    dim3 gridDim((nRetRows+31)/32);
-    dim3 blockDim(32);
-
-    kSelectRows<<<gridDim, blockDim>>>(source->data_device, target->data_device, indices->data_device, nRetRows, source->size[0], source->size[1]);
-
-    if (SYNC_THREADS)
-        cudaThreadSynchronize();
-
-    if (checkCUDAError())
-        return CUDA_ERROR;
-    else
-        return 0;
-}
-
-/* Scatters the rows of source into target at the row positions named by
-   `indices` (indices->size[1] entries) via the kSetSelectedRows kernel,
-   launched with 32-thread blocks over one grid slot per 32 rows.
-   Fix: no operand was checked for device residency (every sibling wrapper
-   does this), so a host-only matrix reached the kernel as an invalid
-   pointer.  Now returns ERROR_NOT_ON_DEVICE up front, CUDA_ERROR on kernel
-   failure, 0 otherwise. */
-EXPORT int setSelectedRows(cudamat* target, cudamat* source, cudamat* indices){
-    if (!target->on_device || !source->on_device || !indices->on_device)
-        return ERROR_NOT_ON_DEVICE;
-
-    const int nSetRows = indices->size[1];
-
-    if (nSetRows==0)
-        return 0;
-
-    dim3 gridDim((nSetRows+31)/32);
-    dim3 blockDim(32);
-
-    kSetSelectedRows<<<gridDim, blockDim>>>(target->data_device, source->data_device, indices->data_device, nSetRows, target->size[0], target->size[1]);
-
-    if (SYNC_THREADS)
-        cudaThreadSynchronize();
-
-    if (checkCUDAError())
-        return CUDA_ERROR;
-    else
-        return 0;
-}
-
-/* Elementwise select via the kWhere kernel: for each element, target gets
-   the if_mat value where condition_mat is "true" and the else_mat value
-   otherwise (per the kernel's name; see cudamat_kernels.cuh).  All four
-   matrices must share one shape.
-   Fix: only condition_mat and target were checked for device residency;
-   a host-only if_mat or else_mat reached the kernel as an invalid pointer.
-   Returns 0, ERROR_NOT_ON_DEVICE, ERROR_INCOMPATIBLE_DIMENSIONS, or
-   CUDA_ERROR. */
-EXPORT int where(cudamat* condition_mat, cudamat* if_mat, cudamat* else_mat, cudamat* target) {
-    unsigned int len = condition_mat->size[0] * condition_mat->size[1];
-
-    if (!condition_mat->on_device || !if_mat->on_device ||
-        !else_mat->on_device || !target->on_device)
-        return ERROR_NOT_ON_DEVICE;
-
-    if (condition_mat->size[0] != target->size[0] || condition_mat->size[1] != target->size[1])
-        return ERROR_INCOMPATIBLE_DIMENSIONS;
-
-    if (condition_mat->size[0] != if_mat->size[0] || condition_mat->size[1] != if_mat->size[1])
-        return ERROR_INCOMPATIBLE_DIMENSIONS;
-
-    if (condition_mat->size[0] != else_mat->size[0] || condition_mat->size[1] != else_mat->size[1])
-        return ERROR_INCOMPATIBLE_DIMENSIONS;
-
-    kWhere<<<NUM_VECTOR_OP_BLOCKS(len),NUM_VECTOR_OP_THREADS_PER_BLOCK(len)>>>(condition_mat->data_device,
-        if_mat->data_device, else_mat->data_device, target->data_device, len);
-
-    if (SYNC_THREADS)
-        cudaThreadSynchronize();
-
-    if (checkCUDAError())
-        return CUDA_ERROR;
-
-    return 0;
-}
-
-/* 2-D correlation of source with a small kernel matrix via kCorrelate,
-   writing into dest (same shape as source).  The correlation kernel must
-   have an odd extent in both dimensions and fit inside source.  Dimensions
-   are passed to the device kernel as (width, height) pairs, i.e. size[1]
-   before size[0].
-   Returns 0, ERROR_NOT_ON_DEVICE, ERROR_INCOMPATIBLE_DIMENSIONS, or
-   CUDA_ERROR. */
-EXPORT int correlate(cudamat* source, cudamat* kernel, cudamat* dest) {
-    if (!source->on_device || !kernel->on_device || !dest->on_device)
-        return ERROR_NOT_ON_DEVICE;
-
-    if (source->size[0] != dest->size[0] || source->size[1] != dest->size[1])
-        return ERROR_INCOMPATIBLE_DIMENSIONS;
-
-    // Odd extents only, and the kernel may not exceed the source.
-    if (kernel->size[0] % 2 == 0 || kernel->size[1] % 2 == 0 ||
-        kernel->size[0] > source->size[0] || kernel->size[1] > source->size[1])
-        return ERROR_INCOMPATIBLE_DIMENSIONS;
-
-    const int n = source->size[0] * source->size[1];
-
-    kCorrelate<<<NUM_VECTOR_OP_BLOCKS(n),NUM_VECTOR_OP_THREADS_PER_BLOCK(n)>>>(source->data_device,
-        kernel->data_device, dest->data_device, source->size[1], source->size[0],
-        kernel->size[1], kernel->size[0]);
-
-    if (SYNC_THREADS)
-        cudaThreadSynchronize();
-
-    return checkCUDAError() ? CUDA_ERROR : 0;
-}
-
-}