diff options
Diffstat (limited to 'include')
-rw-r--r-- | include/clblast.h | 8 | ||||
-rw-r--r-- | include/clblast_c.h | 27 | ||||
-rw-r--r-- | include/clblast_cuda.h | 8 |
3 files changed, 43 insertions, 0 deletions
diff --git a/include/clblast.h b/include/clblast.h index ce64b37a..3e65f52a 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -636,6 +636,14 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event = nullptr); +// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/CCONVGEMM/ZCONVGEMM/HCONVGEMM +template <typename T> +StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, + const cl_mem im_buffer, const size_t im_offset, + const cl_mem kernel_buffer, const size_t kernel_offset, + cl_mem result_buffer, const size_t result_offset, + cl_command_queue* queue, cl_event* event = nullptr); + // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED template <typename T> StatusCode AxpyBatched(const size_t n, diff --git a/include/clblast_c.h b/include/clblast_c.h index 23a3afcc..918c25f6 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -1410,6 +1410,33 @@ CLBlastStatusCode PUBLIC_API CLBlastHim2col(const size_t channels, const size_t cl_mem col_buffer, const size_t col_offset, cl_command_queue* queue, cl_event* event); +// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/CCONVGEMM/ZCONVGEMM/HCONVGEMM +CLBlastStatusCode PUBLIC_API CLBlastSconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, + const cl_mem im_buffer, const size_t im_offset, + const cl_mem kernel_buffer, const size_t kernel_offset, + cl_mem result_buffer, const size_t result_offset, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastDconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, + const cl_mem im_buffer, const size_t im_offset, + const cl_mem kernel_buffer, const size_t kernel_offset, + cl_mem result_buffer, const size_t result_offset, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastCconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, + const cl_mem im_buffer, const size_t im_offset, + const cl_mem kernel_buffer, const size_t kernel_offset, + cl_mem result_buffer, const size_t result_offset, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastZconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, + const cl_mem im_buffer, const size_t im_offset, + const cl_mem kernel_buffer, const size_t kernel_offset, + cl_mem result_buffer, const size_t result_offset, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastHconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, + const cl_mem im_buffer, const size_t im_offset, + const cl_mem kernel_buffer, const size_t kernel_offset, + cl_mem result_buffer, const size_t result_offset, + cl_command_queue* queue, cl_event* event); + // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n, const float *alphas, diff --git a/include/clblast_cuda.h b/include/clblast_cuda.h index d82ee331..01044037 100644 --- a/include/clblast_cuda.h +++ b/include/clblast_cuda.h @@ -608,6 +608,14 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width CUdeviceptr col_buffer, const size_t col_offset, const CUcontext context, const CUdevice device); +// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/CCONVGEMM/ZCONVGEMM/HCONVGEMM +template <typename T> +StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, + const CUdeviceptr im_buffer, const size_t im_offset, + const CUdeviceptr kernel_buffer, const size_t kernel_offset, + CUdeviceptr result_buffer, const size_t result_offset, + const CUcontext context, const CUdevice device); + // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED template <typename T> StatusCode AxpyBatched(const size_t n, |