diff options
Diffstat (limited to 'include/clblast.h')
-rw-r--r-- | include/clblast.h | 77 |
1 files changed, 45 insertions, 32 deletions
diff --git a/include/clblast.h b/include/clblast.h index 5df0f605..c8596b39 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -68,8 +68,8 @@ enum class StatusCode { kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device - kInvalidVectorDot = -2043, // Vector dot is not a valid OpenCL buffer - kInsufficientMemoryDot = -2042, // Vector dot's OpenCL buffer is too small + kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer + kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small }; // Matrix layout and transpose types @@ -121,28 +121,28 @@ StatusCode Rotm(const size_t n, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event = nullptr); -// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP template <typename T> StatusCode Swap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL template <typename T> StatusCode Scal(const size_t n, const T alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY template <typename T> StatusCode Copy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY template <typename T> StatusCode Axpy(const size_t n, const T alpha, @@ -150,7 +150,7 @@ StatusCode Axpy(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Dot product of two vectors: SDOT/DDOT +// Dot product of two vectors: SDOT/DDOT/HDOT template <typename T> StatusCode Dot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, @@ -174,42 +174,42 @@ StatusCode Dotc(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2 +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 template <typename T> StatusCode Nrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM template <typename T> StatusCode Asum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM template <typename T> StatusCode Sum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template <typename T> StatusCode Amax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX template <typename T> StatusCode Max(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN template <typename T> StatusCode Min(const size_t n, cl_mem imin_buffer, const size_t imin_offset, @@ -220,7 +220,7 @@ StatusCode Min(const size_t n, // BLAS level-2 (matrix-vector) routines // ================================================================================================= -// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV template <typename T> StatusCode Gemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -231,7 +231,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV template <typename T> StatusCode Gbmv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, @@ -275,7 +275,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric matrix-vector multiplication: SSYMV/DSYMV +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV template <typename T> StatusCode Symv(const Layout layout, const Triangle triangle, const size_t n, @@ -286,7 +286,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV template <typename T> StatusCode Sbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k, @@ -297,7 +297,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV template <typename T> StatusCode Spmv(const Layout layout, const Triangle triangle, const size_t n, @@ -308,7 +308,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV template <typename T> StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, @@ -316,7 +316,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV template <typename T> StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, @@ -324,7 +324,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV template <typename T> StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, @@ -356,7 +356,7 @@ StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// General rank-1 matrix update: SGER/DGER +// General rank-1 matrix update: SGER/DGER/HGER template <typename T> StatusCode Ger(const Layout layout, const size_t m, const size_t n, @@ -424,7 +424,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric rank-1 matrix update: SSYR/DSYR +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR template <typename T> StatusCode Syr(const Layout layout, const Triangle triangle, const size_t n, @@ -433,7 +433,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric packed rank-1 matrix update: SSPR/DSPR +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR template <typename T> StatusCode Spr(const Layout layout, const Triangle triangle, const size_t n, @@ -442,7 +442,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric rank-2 matrix update: SSYR2/DSYR2 +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 template <typename T> StatusCode Syr2(const Layout layout, const Triangle triangle, const size_t n, @@ -452,7 +452,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 template <typename T> StatusCode Spr2(const Layout layout, const Triangle triangle, const size_t n, @@ -466,7 +466,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle, // BLAS level-3 (matrix-matrix) routines // ================================================================================================= -// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM template <typename T> StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, @@ -477,7 +477,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM template <typename T> StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, @@ -499,7 +499,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK template <typename T> StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, @@ -519,7 +519,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K template <typename T> StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, @@ -541,7 +541,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM template <typename T> StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, @@ -550,7 +550,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM template <typename T> StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, @@ -560,6 +560,19 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c cl_command_queue* queue, cl_event* event = nullptr); // ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +template <typename T> +StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// ================================================================================================= // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on // for the same device. This cache can be cleared to free up system memory or in case of debugging. |